x86/hevc_idct: replace old and unused idct functions

Only 8-bit and 10-bit idct_dc() functions are included (adding others should be trivial).

Benchmarks on an Intel Core i5-4200U (cycles):

idct8x8_dc:    SSE2 22, MMXEXT 26, C 57
idct16x16_dc:  AVX2 27, SSE2 32, C 249
idct32x32_dc:  AVX2 62, SSE2 126, C 1375

Signed-off-by: James Almer <jamrial@gmail.com>
Reviewed-by: Mickaël Raulet <mraulet@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2014-07-26 04:47:14 -03:00 · 2014-07-26 04:47:14 -03:00 · 1ace9573dc
commit 1ace9573dc
parent 23480da0aa
5 changed files with 93 additions and 285 deletions
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@ -56,8 +56,6 @@ typedef struct HEVCDSPContext {
    void (*idct_dc[4])(int16_t *coeffs);
    void (*transform_dc_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
    void (*sao_band_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
                            struct SAOParams *sao, int *borders,
                            int width, int height, int c_idx);
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@ -1,6 +1,7 @@
 ; /*
-; * Provide SSE & MMX idct functions for HEVC decoding
+; * SIMD optimized idct functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
 ; * Copyright (c) 2014 James Almer
 ; *
 ; * This file is part of FFmpeg.
 ; *
@ -20,206 +21,86 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 SECTION_RODATA 32
 max_pixels_10:          times 16  dw ((1 << 10)-1)
 dc_add_10:              times 4 dd ((1 << 14-10) + 1)
 SECTION_TEXT 32
-;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
+; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
-
+; %1 = HxW
-%macro DC_ADD_INIT 2
+; %2 = number of loops
-    add              %1w, ((1 << 14-8) + 1)
+; %3 = bitdepth
-    sar              %1w, (15-8)
+%macro IDCT_DC 3 ; looped variant: %2 passes of 8 vector stores each
-    movd              m0, %1d
+cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
-    lea               %1, [%2*3]
+    movsx             tmpq, word [coeffq]       ; tmp = first 16-bit coefficient, sign-extended
-    SPLATW            m0, m0, 0
+    add               tmpw, ((1 << 14-%3) + 1)  ; bias added ahead of the shift; depends on bitdepth %3
-    pxor              m1, m1
+    sar               tmpw, (15-%3)             ; arithmetic shift right by (15 - bitdepth)
-    psubw             m1, m0
+    movd               xm0, tmpd                ; move the scalar result into the vector register
-    packuswb          m0, m0
+    SPLATW              m0, xm0                 ; replicate the low word into every word of m0
-    packuswb          m1, m1
+    DEFINE_ARGS coeff, cnt                      ; tmp is dead from here on; name the second GPR cnt
    mov               cntd, %2                  ; cnt = number of loop passes
 .loop:
    mova [coeffq+mmsize*0], m0                  ; 8 aligned stores = mmsize*8 bytes per pass
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
    add  coeffq, mmsize*8                       ; advance past the bytes just written
    dec  cntd
    jg  .loop                                   ; repeat while cnt is still positive
    RET
 %endmacro
-%macro DC_ADD_INIT_AVX2 2
+; %1 = HxW
-    add              %1w, ((1 << 14-8) + 1)
+; %2 = bitdepth
-    sar              %1w, (15-8)
+%macro IDCT_DC_NL 2 ; No loop
-    movd             xm0, %1d
+cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
-    vpbroadcastw      m0, xm0    ;SPLATW
+    movsx             tmpq, word [coeffq]
-    lea               %1, [%2*3]
+    add               tmpw, ((1 << 14-%2) + 1)
-    pxor              m1, m1
+    sar               tmpw, (15-%2)
-    psubw             m1, m0
+    movd                m0, tmpd
-    packuswb          m0, m0
+    SPLATW              m0, xm0
-    packuswb          m1, m1
+    mova [coeffq+mmsize*0], m0
-%endmacro
+    mova [coeffq+mmsize*1], m0
-
+    mova [coeffq+mmsize*2], m0
-%macro DC_ADD_OP 4
+    mova [coeffq+mmsize*3], m0
-    %1                m2, [%2     ]
+%if mmsize == 16
-    %1                m3, [%2+%3  ]
+    mova [coeffq+mmsize*4], m0
-    %1                m4, [%2+%3*2]
+    mova [coeffq+mmsize*5], m0
-    %1                m5, [%2+%4  ]
+    mova [coeffq+mmsize*6], m0
-    paddusb           m2, m0
+    mova [coeffq+mmsize*7], m0
    paddusb           m3, m0
    paddusb           m4, m0
    paddusb           m5, m0
    psubusb           m2, m1
    psubusb           m3, m1
    psubusb           m4, m1
    psubusb           m5, m1
    %1         [%2     ], m2
    %1         [%2+%3  ], m3
    %1         [%2+%3*2], m4
    %1         [%2+%4  ], m5
 %endmacro
 INIT_MMX mmxext
 ; void ff_hevc_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 %if ARCH_X86_64
 cglobal hevc_idct4_dc_add_8, 3, 4, 0
    movsx             r3, word [r1]
    DC_ADD_INIT       r3, r2
    DC_ADD_OP       movh, r0, r2, r3
    RET
 ; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct8_dc_add_8, 3, 4, 0
    movsx             r3, word [r1]
    DC_ADD_INIT       r3, r2
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    RET
 %else
 ; void ff_hevc_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct4_dc_add_8, 2, 3, 0
    movsx             r2, word [r1]
    mov               r1, r2m
    DC_ADD_INIT       r2, r1
    DC_ADD_OP       movh, r0, r1, r2
    RET
 ; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct8_dc_add_8, 2, 3, 0
    movsx             r2, word [r1]
    mov               r1, r2m
    DC_ADD_INIT       r2, r1
    DC_ADD_OP       mova, r0, r1, r2
    lea               r0, [r0+r1*4]
    DC_ADD_OP       mova, r0, r1, r2
    RET
 %endif
    RET
 %endmacro
 ; 8-bit
 INIT_MMX mmxext
 IDCT_DC_NL  4,      8
 IDCT_DC     8,  2,  8
 INIT_XMM sse2
-; void ff_hevc_idct16_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+IDCT_DC_NL  8,      8
-cglobal hevc_idct16_dc_add_8, 3, 4, 6
+IDCT_DC    16,  4,  8
-    movsx             r3, word [r1]
+IDCT_DC    32, 16,  8
    DC_ADD_INIT       r3, r2
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    RET
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-; void ff_hevc_idct32_dc_add_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+IDCT_DC    16,  2,  8
-cglobal hevc_idct32_dc_add_8, 3, 4, 6
+IDCT_DC    32,  8,  8
    movsx             r3, word [r1]
    DC_ADD_INIT_AVX2  r3, r2
    DC_ADD_OP       mova, r0, r2, r3,
 %rep 7
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
 %endrep
    RET
 %endif ;HAVE_AVX2_EXTERNAL
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
 %macro IDCT_DC_ADD_OP_10 3
    pxor              m5, m5
 %if avx_enabled
    paddw             m1, m0, [%1+0   ]
    paddw             m2, m0, [%1+%2  ]
    paddw             m3, m0, [%1+%2*2]
    paddw             m4, m0, [%1+%3  ]
 %else
    mova              m1, [%1+0   ]
    mova              m2, [%1+%2  ]
    mova              m3, [%1+%2*2]
    mova              m4, [%1+%3  ]
    paddw             m1, m0
    paddw             m2, m0
    paddw             m3, m0
    paddw             m4, m0
 %endif
    CLIPW             m1, m5, m6
    CLIPW             m2, m5, m6
    CLIPW             m3, m5, m6
    CLIPW             m4, m5, m6
    mova       [%1+0   ], m1
    mova       [%1+%2  ], m2
    mova       [%1+%2*2], m3
    mova       [%1+%3  ], m4
 %endmacro
 ; 10-bit
 INIT_MMX mmxext
-cglobal hevc_idct4_dc_add_10,3,3
+IDCT_DC_NL  4,     10
-    mov              r1w, [r1]
+IDCT_DC     8,  2, 10
    add              r1w, ((1 << 4) + 1)
    sar              r1w, 5
    movd              m0, r1d
    lea               r1, [r2*3]
    SPLATW            m0, m0, 0
    mova              m6, [max_pixels_10]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
 %macro IDCT8_DC_ADD 0
 cglobal hevc_idct8_dc_add_10,3,4,7
    mov              r1w, [r1]
    add              r1w, ((1 << 4) + 1)
    sar              r1w, 5
    movd              m0, r1d
    lea               r1, [r2*3]
    SPLATW            m0, m0, 0
    mova              m6, [max_pixels_10]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea               r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
 %endmacro
 INIT_XMM sse2
-IDCT8_DC_ADD
+IDCT_DC_NL  8,     10
-%if HAVE_AVX_EXTERNAL
+IDCT_DC    16,  4, 10
-INIT_XMM avx
+IDCT_DC    32, 16, 10
 IDCT8_DC_ADD
 %endif
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-cglobal hevc_idct16_dc_add_10,3,4,7
+IDCT_DC    16,  2, 10
-    mov              r1w, [r1]
+IDCT_DC    32,  8, 10
-    add              r1w, ((1 << 4) + 1)
+%endif ;HAVE_AVX2_EXTERNAL
    sar              r1w, 5
    movd             xm0, r1d
    lea               r1, [r2*3]
    vpbroadcastw      m0, xm0    ;SPLATW
    mova              m6, [max_pixels_10]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea               r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea               r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea               r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
 %endif ;HAVE_AVX_EXTERNAL
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@ -131,32 +131,4 @@ WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
 WEIGHTING_PROTOTYPES(12, sse4);
 ///////////////////////////////////////////////////////////////////////////////
 // IDCT
 ///////////////////////////////////////////////////////////////////////////////
 idct_dc_proto(4, 8,mmxext);
 idct_dc_proto(8, 8,mmxext);
 idct_dc_proto(16,8,  sse2);
 idct_dc_proto(32,8,  sse2);
 idct_dc_proto(32,8,  avx2);
 idct_dc_proto(4, 10,mmxext);
 idct_dc_proto(8, 10,  sse2);
 idct_dc_proto(16,10,  sse2);
 idct_dc_proto(32,10,  sse2);
 idct_dc_proto(8, 10,   avx);
 idct_dc_proto(16,10,   avx);
 idct_dc_proto(32,10,   avx);
 idct_dc_proto(16,10,  avx2);
 idct_dc_proto(32,10,  avx2);
 #endif // AVCODEC_X86_HEVCDSP_H
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@ -54,59 +54,17 @@ LFL_FUNCS(uint8_t,   8, ssse3)
 LFL_FUNCS(uint8_t,  10, ssse3)
 LFL_FUNCS(uint8_t,  12, ssse3)
-#if HAVE_SSE2_EXTERNAL
+#define IDCT_FUNCS(W, opt) \
-void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
-{
+void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs)
    ff_hevc_idct16_dc_add_8_sse2(dst, coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(dst+16, coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(dst+16*stride, coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(dst+16*stride+16, coeffs, stride);
 }
-void ff_hevc_idct16_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+IDCT_FUNCS(4x4,   mmxext);
-{
+IDCT_FUNCS(8x8,   mmxext);
-    ff_hevc_idct8_dc_add_10_sse2(dst, coeffs, stride);
+IDCT_FUNCS(8x8,   sse2);
-    ff_hevc_idct8_dc_add_10_sse2(dst+16, coeffs, stride);
+IDCT_FUNCS(16x16, sse2);
-    ff_hevc_idct8_dc_add_10_sse2(dst+8*stride, coeffs, stride);
+IDCT_FUNCS(32x32, sse2);
-    ff_hevc_idct8_dc_add_10_sse2(dst+8*stride+16, coeffs, stride);
+IDCT_FUNCS(16x16, avx2);
-}
+IDCT_FUNCS(32x32, avx2);
 void ff_hevc_idct32_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    ff_hevc_idct16_dc_add_10_sse2(dst, coeffs, stride);
    ff_hevc_idct16_dc_add_10_sse2(dst+32, coeffs, stride);
    ff_hevc_idct16_dc_add_10_sse2(dst+16*stride, coeffs, stride);
    ff_hevc_idct16_dc_add_10_sse2(dst+16*stride+32, coeffs, stride);
 }
 #endif //HAVE_SSE2_EXTERNAL
 #if HAVE_AVX_EXTERNAL
 void ff_hevc_idct16_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    ff_hevc_idct8_dc_add_10_avx(dst, coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(dst+16, coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(dst+8*stride, coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(dst+8*stride+16, coeffs, stride);
 }
 void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    ff_hevc_idct16_dc_add_10_avx(dst, coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx(dst+32, coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx(dst+16*stride, coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx(dst+16*stride+32, coeffs, stride);
 }
 #endif //HAVE_AVX_EXTERNAL
 #if HAVE_AVX2_EXTERNAL
 void ff_hevc_idct32_dc_add_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    ff_hevc_idct16_dc_add_10_avx2(dst, coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx2(dst+32, coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx2(dst+16*stride, coeffs, stride);
    ff_hevc_idct16_dc_add_10_avx2(dst+16*stride+32, coeffs, stride);
 }
 #endif //HAVE_AVX2_EXTERNAL
 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride,                            \
@ -504,8 +462,8 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
    if (bit_depth == 8) {
        if (EXTERNAL_MMXEXT(mm_flags)) {
-            c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_8_mmxext;
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
-            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_8_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
        }
        if (EXTERNAL_SSE2(mm_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
@ -515,8 +473,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
            }
-            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_8_sse2;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
        }
        if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
@ -535,12 +494,13 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
        }
        if (EXTERNAL_AVX2(mm_flags)) {
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_avx2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(mm_flags)) {
-            c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_10_mmxext;
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
-
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
        }
        if (EXTERNAL_SSE2(mm_flags)) {
            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
@ -550,9 +510,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
            }
-            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_sse2;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
-            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
        }
        if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@ -569,14 +529,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
        }
        if (EXTERNAL_AVX(mm_flags)) {
            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_avx;
            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx;
            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx;
        }
        if (EXTERNAL_AVX2(mm_flags)) {
-            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
        }
    } else if (bit_depth == 12) {
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@ -598,7 +598,9 @@
 %endmacro
 %macro SPLATW 2-3 0
-%if mmsize == 16
+%if cpuflag(avx2) && %3 == 0
    vpbroadcastw %1, %2
 %elif mmsize == 16
    pshuflw    %1, %2, (%3)*0x55
    punpcklqdq %1, %1
 %elif cpuflag(mmxext)