Unroll the loop in h264_idct_add8_sse2(). This also lets us inline the scan8[] values directly in the
code and remove the loop setup. The function itself is 20% faster; decoding is 0.8% faster overall. See the "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on the mailing list (ML). Originally committed as revision 25171 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
d801f1c848
commit
4bca677494
@ -804,62 +804,53 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8
|
||||
jl .next2blocks
|
||||
REP_RET
|
||||
|
||||
h264_idct_add8_sse2_plane:
|
||||
.next2blocks
|
||||
movzx r0, byte [scan8+r5]
|
||||
movzx r0, word [r4+r0]
|
||||
%macro add8_sse2_cycle 2
|
||||
movzx r0, word [r4+%2]
|
||||
test r0, r0
|
||||
jz .try_dc
|
||||
jz .try%1dc
|
||||
%ifdef ARCH_X86_64
|
||||
mov r0d, dword [r1+r5*4]
|
||||
mov r0d, dword [r1+%1*8+64]
|
||||
add r0, [r10]
|
||||
%else
|
||||
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
|
||||
mov r0, r0m
|
||||
mov r0, [r0]
|
||||
add r0, dword [r1+r5*4]
|
||||
add r0, dword [r1+%1*8+64]
|
||||
%endif
|
||||
call x264_add8x4_idct_sse2
|
||||
add r5, 2
|
||||
add r2, 64
|
||||
test r5, 3
|
||||
jnz .next2blocks
|
||||
rep ret
|
||||
.try_dc
|
||||
jmp .cycle%1end
|
||||
.try%1dc
|
||||
movsx r0, word [r2 ]
|
||||
or r0w, word [r2+32]
|
||||
jz .skip2blocks
|
||||
jz .cycle%1end
|
||||
%ifdef ARCH_X86_64
|
||||
mov r0d, dword [r1+r5*4]
|
||||
mov r0d, dword [r1+%1*8+64]
|
||||
add r0, [r10]
|
||||
%else
|
||||
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
|
||||
mov r0, r0m
|
||||
mov r0, [r0]
|
||||
add r0, dword [r1+r5*4]
|
||||
add r0, dword [r1+%1*8+64]
|
||||
%endif
|
||||
call h264_idct_dc_add8_mmx2
|
||||
.skip2blocks
|
||||
add r5, 2
|
||||
.cycle%1end
|
||||
%if %1 < 3
|
||||
add r2, 64
|
||||
test r5, 3
|
||||
jnz .next2blocks
|
||||
rep ret
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
|
||||
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
||||
cglobal h264_idct_add8_sse2, 5, 7, 8
|
||||
mov r5, 16
|
||||
add r2, 512
|
||||
%ifdef PIC
|
||||
lea r11, [scan8_mem]
|
||||
%endif
|
||||
%ifdef ARCH_X86_64
|
||||
mov r10, r0
|
||||
%endif
|
||||
call h264_idct_add8_sse2_plane
|
||||
add8_sse2_cycle 0, 0x09
|
||||
add8_sse2_cycle 1, 0x11
|
||||
%ifdef ARCH_X86_64
|
||||
add r10, gprsize
|
||||
%else
|
||||
add r0mp, gprsize
|
||||
%endif
|
||||
call h264_idct_add8_sse2_plane
|
||||
add8_sse2_cycle 2, 0x21
|
||||
add8_sse2_cycle 3, 0x29
|
||||
RET
|
||||
|
Loading…
x
Reference in New Issue
Block a user