Use word-writing instead of dword-writing (with two cached but otherwise
unchanged bytes) in the horizontal simple loopfilter. This makes the filter quite a bit faster in itself (~30 cycles fewer on Core1), probably mostly because we no longer need a complex 4x4 transpose, but only a simple byte interleave. It also allows using pextrw on SSE4, which speeds it up even more (e.g. 25% faster on Core i7). Originally committed as revision 24638 to svn://svn.ffmpeg.org/ffmpeg/trunk.
This commit is contained in:
parent
ace7f813cd
commit
6341838f3c
@ -346,7 +346,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
|||||||
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
|
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
|
||||||
|
|
||||||
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
|
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
|
||||||
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
|
|
||||||
|
|
||||||
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
|
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
|
||||||
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
|
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
|
||||||
@ -358,6 +357,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
|||||||
if (mm_flags & FF_MM_SSE2) {
|
if (mm_flags & FF_MM_SSE2) {
|
||||||
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
|
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
|
||||||
|
|
||||||
|
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
|
||||||
|
|
||||||
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
|
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
|
||||||
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
|
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
|
||||||
|
|
||||||
@ -390,6 +391,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
|||||||
if (mm_flags & FF_MM_SSE4) {
|
if (mm_flags & FF_MM_SSE4) {
|
||||||
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
|
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
|
||||||
|
|
||||||
|
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
|
||||||
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
|
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
|
||||||
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
|
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
|
||||||
}
|
}
|
||||||
|
@ -1354,6 +1354,81 @@ cglobal vp8_luma_dc_wht_mmx, 2,3
|
|||||||
movd [%7+%9*2], m%4
|
movd [%7+%9*2], m%4
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
; write 4 or 8 words in the mmx/xmm registers as 8 lines
|
||||||
|
; 1 and 2 are the registers to write; these can be the same (for SSE2)
|
||||||
|
; for pre-SSE4:
|
||||||
|
; 3 is a general-purpose register that we will clobber
|
||||||
|
; for SSE4:
|
||||||
|
; 3 is a pointer to the destination's 5th line
|
||||||
|
; 4 is a pointer to the destination's 4th line
|
||||||
|
; 5/6 is -stride and +stride
|
||||||
|
%macro WRITE_2x4W 6
|
||||||
|
movd %3, %1
|
||||||
|
punpckhdq %1, %1
|
||||||
|
mov [%4+%5*4], %3w
|
||||||
|
shr %3, 16
|
||||||
|
add %4, %6
|
||||||
|
mov [%4+%5*4], %3w
|
||||||
|
|
||||||
|
movd %3, %1
|
||||||
|
add %4, %5
|
||||||
|
mov [%4+%5*2], %3w
|
||||||
|
shr %3, 16
|
||||||
|
mov [%4+%5 ], %3w
|
||||||
|
|
||||||
|
movd %3, %2
|
||||||
|
punpckhdq %2, %2
|
||||||
|
mov [%4 ], %3w
|
||||||
|
shr %3, 16
|
||||||
|
mov [%4+%6 ], %3w
|
||||||
|
|
||||||
|
movd %3, %2
|
||||||
|
add %4, %6
|
||||||
|
mov [%4+%6 ], %3w
|
||||||
|
shr %3, 16
|
||||||
|
mov [%4+%6*2], %3w
|
||||||
|
add %4, %5
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro WRITE_8W_SSE2 5
|
||||||
|
movd %2, %1
|
||||||
|
psrldq %1, 4
|
||||||
|
mov [%3+%4*4], %2w
|
||||||
|
shr %2, 16
|
||||||
|
add %3, %5
|
||||||
|
mov [%3+%4*4], %2w
|
||||||
|
|
||||||
|
movd %2, %1
|
||||||
|
psrldq %1, 4
|
||||||
|
add %3, %4
|
||||||
|
mov [%3+%4*2], %2w
|
||||||
|
shr %2, 16
|
||||||
|
mov [%3+%4 ], %2w
|
||||||
|
|
||||||
|
movd %2, %1
|
||||||
|
psrldq %1, 4
|
||||||
|
mov [%3 ], %2w
|
||||||
|
shr %2, 16
|
||||||
|
mov [%3+%5 ], %2w
|
||||||
|
|
||||||
|
movd %2, %1
|
||||||
|
add %3, %5
|
||||||
|
mov [%3+%5 ], %2w
|
||||||
|
shr %2, 16
|
||||||
|
mov [%3+%5*2], %2w
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro WRITE_8W_SSE4 5
|
||||||
|
pextrw [%3+%4*4], %1, 0
|
||||||
|
pextrw [%2+%4*4], %1, 1
|
||||||
|
pextrw [%3+%4*2], %1, 2
|
||||||
|
pextrw [%3+%4 ], %1, 3
|
||||||
|
pextrw [%3 ], %1, 4
|
||||||
|
pextrw [%2 ], %1, 5
|
||||||
|
pextrw [%2+%5 ], %1, 6
|
||||||
|
pextrw [%2+%5*2], %1, 7
|
||||||
|
%endmacro
|
||||||
|
|
||||||
%macro SPLATB_REG_MMX 2-3
|
%macro SPLATB_REG_MMX 2-3
|
||||||
movd %1, %2
|
movd %1, %2
|
||||||
punpcklbw %1, %1
|
punpcklbw %1, %1
|
||||||
@ -1381,10 +1456,6 @@ cglobal vp8_luma_dc_wht_mmx, 2,3
|
|||||||
|
|
||||||
%macro SIMPLE_LOOPFILTER 3
|
%macro SIMPLE_LOOPFILTER 3
|
||||||
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
|
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
|
||||||
%ifidn %2, h
|
|
||||||
mov r5, rsp ; backup stack pointer
|
|
||||||
and rsp, ~(mmsize-1) ; align stack
|
|
||||||
%endif
|
|
||||||
%if mmsize == 8 ; mmx/mmxext
|
%if mmsize == 8 ; mmx/mmxext
|
||||||
mov r3, 2
|
mov r3, 2
|
||||||
%endif
|
%endif
|
||||||
@ -1400,7 +1471,6 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
|
|||||||
neg r1
|
neg r1
|
||||||
%ifidn %2, h
|
%ifidn %2, h
|
||||||
lea r0, [r0+4*r2-2]
|
lea r0, [r0+4*r2-2]
|
||||||
sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1
|
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
%if mmsize == 8 ; mmx / mmxext
|
%if mmsize == 8 ; mmx / mmxext
|
||||||
@ -1421,9 +1491,6 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
|
|||||||
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
|
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
|
||||||
%endif
|
%endif
|
||||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||||
|
|
||||||
mova [rsp], m0 ; store p1
|
|
||||||
mova [rsp+mmsize], m3 ; store q1
|
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; simple_limit
|
; simple_limit
|
||||||
@ -1494,17 +1561,21 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
|
|||||||
mova [r0], m4
|
mova [r0], m4
|
||||||
mova [r0+r1], m6
|
mova [r0+r1], m6
|
||||||
%else ; h
|
%else ; h
|
||||||
mova m0, [rsp] ; p1
|
inc r0
|
||||||
SWAP 2, 4 ; p0
|
SBUTTERFLY bw, 6, 4, 0
|
||||||
SWAP 1, 6 ; q0
|
|
||||||
mova m3, [rsp+mmsize] ; q1
|
|
||||||
|
|
||||||
TRANSPOSE4x4B 0, 1, 2, 3, 4
|
|
||||||
%if mmsize == 16 ; sse2
|
%if mmsize == 16 ; sse2
|
||||||
add r3, r1 ; change from r4*8*stride to r0+8*stride
|
%ifidn %1, sse4
|
||||||
WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
|
inc r4
|
||||||
|
%endif
|
||||||
|
WRITE_8W m6, r4, r0, r1, r2
|
||||||
|
lea r4, [r3+r1+1]
|
||||||
|
%ifidn %1, sse4
|
||||||
|
inc r3
|
||||||
|
%endif
|
||||||
|
WRITE_8W m4, r3, r4, r1, r2
|
||||||
%else ; mmx/mmxext
|
%else ; mmx/mmxext
|
||||||
WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
|
WRITE_2x4W m6, m4, r4, r0, r1, r2
|
||||||
%endif
|
%endif
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
@ -1513,20 +1584,12 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
|
|||||||
%ifidn %2, v
|
%ifidn %2, v
|
||||||
add r0, 8 ; advance 8 cols = pixels
|
add r0, 8 ; advance 8 cols = pixels
|
||||||
%else ; h
|
%else ; h
|
||||||
lea r0, [r0+r2*8] ; advance 8 rows = lines
|
lea r0, [r0+r2*8-1] ; advance 8 rows = lines
|
||||||
%endif
|
%endif
|
||||||
dec r3
|
dec r3
|
||||||
jg .next8px
|
jg .next8px
|
||||||
%ifidn %2, v
|
|
||||||
REP_RET
|
REP_RET
|
||||||
%else ; h
|
|
||||||
mov rsp, r5 ; restore stack pointer
|
|
||||||
RET
|
|
||||||
%endif
|
|
||||||
%else ; sse2
|
%else ; sse2
|
||||||
%ifidn %2, h
|
|
||||||
mov rsp, r5 ; restore stack pointer
|
|
||||||
%endif
|
|
||||||
RET
|
RET
|
||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
@ -1534,17 +1597,20 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
|
|||||||
INIT_MMX
|
INIT_MMX
|
||||||
%define SPLATB_REG SPLATB_REG_MMX
|
%define SPLATB_REG SPLATB_REG_MMX
|
||||||
SIMPLE_LOOPFILTER mmx, v, 4
|
SIMPLE_LOOPFILTER mmx, v, 4
|
||||||
SIMPLE_LOOPFILTER mmx, h, 6
|
SIMPLE_LOOPFILTER mmx, h, 5
|
||||||
%define SPLATB_REG SPLATB_REG_MMXEXT
|
%define SPLATB_REG SPLATB_REG_MMXEXT
|
||||||
SIMPLE_LOOPFILTER mmxext, v, 4
|
SIMPLE_LOOPFILTER mmxext, v, 4
|
||||||
SIMPLE_LOOPFILTER mmxext, h, 6
|
SIMPLE_LOOPFILTER mmxext, h, 5
|
||||||
INIT_XMM
|
INIT_XMM
|
||||||
%define SPLATB_REG SPLATB_REG_SSE2
|
%define SPLATB_REG SPLATB_REG_SSE2
|
||||||
|
%define WRITE_8W WRITE_8W_SSE2
|
||||||
SIMPLE_LOOPFILTER sse2, v, 3
|
SIMPLE_LOOPFILTER sse2, v, 3
|
||||||
SIMPLE_LOOPFILTER sse2, h, 6
|
SIMPLE_LOOPFILTER sse2, h, 5
|
||||||
%define SPLATB_REG SPLATB_REG_SSSE3
|
%define SPLATB_REG SPLATB_REG_SSSE3
|
||||||
SIMPLE_LOOPFILTER ssse3, v, 3
|
SIMPLE_LOOPFILTER ssse3, v, 3
|
||||||
SIMPLE_LOOPFILTER ssse3, h, 6
|
SIMPLE_LOOPFILTER ssse3, h, 5
|
||||||
|
%define WRITE_8W WRITE_8W_SSE4
|
||||||
|
SIMPLE_LOOPFILTER sse4, h, 5
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
|
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
|
||||||
@ -2075,81 +2141,6 @@ INNER_LOOPFILTER ssse3, h, 6, 8, 13
|
|||||||
; int flimE, int flimI, int hev_thr);
|
; int flimE, int flimI, int hev_thr);
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
|
|
||||||
; write 4 or 8 words in the mmx/xmm registers as 8 lines
|
|
||||||
; 1 and 2 are the registers to write; these can be the same (for SSE2)
|
|
||||||
; for pre-SSE4:
|
|
||||||
; 3 is a general-purpose register that we will clobber
|
|
||||||
; for SSE4:
|
|
||||||
; 3 is a pointer to the destination's 5th line
|
|
||||||
; 4 is a pointer to the destination's 4th line
|
|
||||||
; 5/6 is -stride and +stride
|
|
||||||
%macro WRITE_2x4W 6
|
|
||||||
movd %3, %1
|
|
||||||
punpckhdq %1, %1
|
|
||||||
mov [%4+%5*4], %3w
|
|
||||||
shr %3, 16
|
|
||||||
add %4, %6
|
|
||||||
mov [%4+%5*4], %3w
|
|
||||||
|
|
||||||
movd %3, %1
|
|
||||||
add %4, %5
|
|
||||||
mov [%4+%5*2], %3w
|
|
||||||
shr %3, 16
|
|
||||||
mov [%4+%5 ], %3w
|
|
||||||
|
|
||||||
movd %3, %2
|
|
||||||
punpckhdq %2, %2
|
|
||||||
mov [%4 ], %3w
|
|
||||||
shr %3, 16
|
|
||||||
mov [%4+%6 ], %3w
|
|
||||||
|
|
||||||
movd %3, %2
|
|
||||||
add %4, %6
|
|
||||||
mov [%4+%6 ], %3w
|
|
||||||
shr %3, 16
|
|
||||||
mov [%4+%6*2], %3w
|
|
||||||
add %4, %5
|
|
||||||
%endmacro
|
|
||||||
|
|
||||||
%macro WRITE_8W_SSE2 5
|
|
||||||
movd %2, %1
|
|
||||||
psrldq %1, 4
|
|
||||||
mov [%3+%4*4], %2w
|
|
||||||
shr %2, 16
|
|
||||||
add %3, %5
|
|
||||||
mov [%3+%4*4], %2w
|
|
||||||
|
|
||||||
movd %2, %1
|
|
||||||
psrldq %1, 4
|
|
||||||
add %3, %4
|
|
||||||
mov [%3+%4*2], %2w
|
|
||||||
shr %2, 16
|
|
||||||
mov [%3+%4 ], %2w
|
|
||||||
|
|
||||||
movd %2, %1
|
|
||||||
psrldq %1, 4
|
|
||||||
mov [%3 ], %2w
|
|
||||||
shr %2, 16
|
|
||||||
mov [%3+%5 ], %2w
|
|
||||||
|
|
||||||
movd %2, %1
|
|
||||||
add %3, %5
|
|
||||||
mov [%3+%5 ], %2w
|
|
||||||
shr %2, 16
|
|
||||||
mov [%3+%5*2], %2w
|
|
||||||
%endmacro
|
|
||||||
|
|
||||||
%macro WRITE_8W_SSE4 5
|
|
||||||
pextrw [%3+%4*4], %1, 0
|
|
||||||
pextrw [%2+%4*4], %1, 1
|
|
||||||
pextrw [%3+%4*2], %1, 2
|
|
||||||
pextrw [%3+%4 ], %1, 3
|
|
||||||
pextrw [%3 ], %1, 4
|
|
||||||
pextrw [%2 ], %1, 5
|
|
||||||
pextrw [%2+%5 ], %1, 6
|
|
||||||
pextrw [%2+%5*2], %1, 7
|
|
||||||
%endmacro
|
|
||||||
|
|
||||||
%macro MBEDGE_LOOPFILTER 5
|
%macro MBEDGE_LOOPFILTER 5
|
||||||
%if %4 == 8 ; chroma
|
%if %4 == 8 ; chroma
|
||||||
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
|
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
|
||||||
|
Loading…
x
Reference in New Issue
Block a user