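vp9/x86: add AVX versions of the itxfm (inverse transform) and lpf (loop filter) functions.

Benchmarks, existing SSSE3 versions versus the new AVX versions (fewer decicycles is better):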
4412 decicycles in ff_vp9_loop_filter_h_16_16_ssse3, 4193462 runs, 842 skips
3600 decicycles in ff_vp9_loop_filter_h_16_16_avx, 4193621 runs, 683 skips

3010 decicycles in ff_vp9_loop_filter_v_16_16_ssse3, 4193528 runs, 776 skips
2678 decicycles in ff_vp9_loop_filter_v_16_16_avx, 4193742 runs, 562 skips

23025 decicycles in ff_vp9_idct_idct_32x32_add_ssse3, 2096871 runs, 281 skips
19943 decicycles in ff_vp9_idct_idct_32x32_add_avx, 2096815 runs, 337 skips

4675 decicycles in ff_vp9_idct_idct_16x16_add_ssse3, 4194018 runs, 286 skips
3980 decicycles in ff_vp9_idct_idct_16x16_add_avx, 4194022 runs, 282 skips

967 decicycles in ff_vp9_idct_idct_8x8_add_ssse3, 16776972 runs, 244 skips
887 decicycles in ff_vp9_idct_idct_8x8_add_avx, 16777002 runs, 214 skips
This commit is contained in:
Clément Bœsch 2014-01-14 08:09:48 +01:00 committed by Clément Bœsch
parent 53e6977c07
commit 8b4190da93
3 changed files with 42 additions and 4 deletions

View File

@ -159,11 +159,16 @@ filters_8tap_1d_fn3(avg)
void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_8x8_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); void ff_vp9_idct_idct_8x8_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_8x8_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_16x16_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); void ff_vp9_idct_idct_16x16_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_16x16_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_32x32_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); void ff_vp9_idct_idct_32x32_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_32x32_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H); void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
void ff_vp9_loop_filter_v_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H); void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
void ff_vp9_loop_filter_h_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
#endif /* HAVE_YASM */ #endif /* HAVE_YASM */
@ -231,6 +236,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
} }
} }
if (EXTERNAL_AVX(cpu_flags)) {
if (ARCH_X86_64) {
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx;
}
}
#undef init_fpel #undef init_fpel
#undef init_subpel1 #undef init_subpel1
#undef init_subpel2 #undef init_subpel2

View File

@ -289,7 +289,8 @@ cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
VP9_STORE_2X 10, 11, 6, 7, 4 VP9_STORE_2X 10, 11, 6, 7, 4
%endmacro %endmacro
INIT_XMM ssse3 %macro VP9_IDCT_IDCT_8x8_ADD_XMM 1
INIT_XMM %1
cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
mova m12, [pw_11585x2] ; often used mova m12, [pw_11585x2] ; often used
@ -376,6 +377,10 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
mova [blockq+112], m4 mova [blockq+112], m4
VP9_IDCT8_WRITEOUT VP9_IDCT8_WRITEOUT
RET RET
%endmacro
VP9_IDCT_IDCT_8x8_ADD_XMM ssse3
VP9_IDCT_IDCT_8x8_ADD_XMM avx
;--------------------------------------------------------------------------------------------- ;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); ; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@ -655,7 +660,8 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
mova [dstq+%7], m%4 mova [dstq+%7], m%4
%endmacro %endmacro
INIT_XMM ssse3 %macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
INIT_XMM %1
cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
; 2x2=eob=3, 4x4=eob=10 ; 2x2=eob=3, 4x4=eob=10
cmp eobd, 38 cmp eobd, 38
@ -724,6 +730,10 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
; use that to zero out block coefficients ; use that to zero out block coefficients
ZERO_BLOCK blockq, 32, 16, m0 ZERO_BLOCK blockq, 32, 16, m0
RET RET
%endmacro
VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
VP9_IDCT_IDCT_16x16_ADD_XMM avx
;--------------------------------------------------------------------------------------------- ;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); ; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@ -1102,7 +1112,8 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
%endif %endif
%endmacro %endmacro
INIT_XMM ssse3 %macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
INIT_XMM %1
cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
cmp eobd, 135 cmp eobd, 135
jg .idctfull jg .idctfull
@ -1213,5 +1224,9 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
; use that to zero out block coefficients ; use that to zero out block coefficients
ZERO_BLOCK blockq, 64, 32, m7 ZERO_BLOCK blockq, 64, 32, m7
RET RET
%endmacro
VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
VP9_IDCT_IDCT_32x32_ADD_XMM avx
%endif ; x86-64 %endif ; x86-64

View File

@ -655,12 +655,17 @@ SECTION .text
%endif %endif
%endmacro %endmacro
INIT_XMM ssse3 %macro LPF_16_16_VH 1
INIT_XMM %1
cglobal vp9_loop_filter_v_16_16, 5,8,16, dst, stride, E, I, H, mstride, dst1, dst2 cglobal vp9_loop_filter_v_16_16, 5,8,16, dst, stride, E, I, H, mstride, dst1, dst2
LPF_16_16 v LPF_16_16 v
RET RET
cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst1, dst2 cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst1, dst2
LPF_16_16 h LPF_16_16 h
RET RET
%endmacro
LPF_16_16_VH ssse3
LPF_16_16_VH avx
%endif ; x86-64 %endif ; x86-64