x86/vp9lpf: add ff_vp9_loop_filter_{h,v}_{48,84}_16_{sse2,ssse3,avx}().
Overall decode time improves from 5.40s to 5.30s with -threads 1 on ped1080p.webm (i7 920, ssse3).
This commit is contained in:
		@@ -187,6 +187,12 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
 | 
			
		||||
lpf_funcs(16, 16, sse2);
 | 
			
		||||
lpf_funcs(16, 16, ssse3);
 | 
			
		||||
lpf_funcs(16, 16, avx);
 | 
			
		||||
lpf_funcs(84, 16, sse2);
 | 
			
		||||
lpf_funcs(84, 16, ssse3);
 | 
			
		||||
lpf_funcs(84, 16, avx);
 | 
			
		||||
lpf_funcs(48, 16, sse2);
 | 
			
		||||
lpf_funcs(48, 16, ssse3);
 | 
			
		||||
lpf_funcs(48, 16, avx);
 | 
			
		||||
lpf_funcs(88, 16, sse2);
 | 
			
		||||
lpf_funcs(88, 16, ssse3);
 | 
			
		||||
lpf_funcs(88, 16, avx);
 | 
			
		||||
@@ -224,6 +230,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
 | 
			
		||||
    init_subpel2(idx, 0, 1, v, type, opt); \
 | 
			
		||||
    init_subpel2(idx, 1, 0, h, type, opt)
 | 
			
		||||
 | 
			
		||||
#define init_lpf(opt) do { \
 | 
			
		||||
    if (ARCH_X86_64) { \
 | 
			
		||||
        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
 | 
			
		||||
        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
 | 
			
		||||
        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
 | 
			
		||||
        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
 | 
			
		||||
        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
 | 
			
		||||
        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
 | 
			
		||||
        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
 | 
			
		||||
        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
 | 
			
		||||
    } \
 | 
			
		||||
} while (0)
 | 
			
		||||
 | 
			
		||||
    if (EXTERNAL_MMX(cpu_flags)) {
 | 
			
		||||
        init_fpel(4, 0,  4, put, mmx);
 | 
			
		||||
        init_fpel(3, 0,  8, put, mmx);
 | 
			
		||||
@@ -248,12 +267,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
 | 
			
		||||
        init_fpel(2, 1, 16, avg, sse2);
 | 
			
		||||
        init_fpel(1, 1, 32, avg, sse2);
 | 
			
		||||
        init_fpel(0, 1, 64, avg, sse2);
 | 
			
		||||
        if (ARCH_X86_64) {
 | 
			
		||||
            dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_sse2;
 | 
			
		||||
            dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_sse2;
 | 
			
		||||
            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
 | 
			
		||||
            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
 | 
			
		||||
        }
 | 
			
		||||
        init_lpf(sse2);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (EXTERNAL_SSSE3(cpu_flags)) {
 | 
			
		||||
@@ -276,11 +290,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
 | 
			
		||||
            dsp->itxfm_add[TX_32X32][ADST_DCT] =
 | 
			
		||||
            dsp->itxfm_add[TX_32X32][DCT_ADST] =
 | 
			
		||||
            dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
 | 
			
		||||
            dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_ssse3;
 | 
			
		||||
            dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_ssse3;
 | 
			
		||||
            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3;
 | 
			
		||||
            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3;
 | 
			
		||||
        }
 | 
			
		||||
        init_lpf(ssse3);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (EXTERNAL_AVX(cpu_flags)) {
 | 
			
		||||
@@ -297,11 +308,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
 | 
			
		||||
            dsp->itxfm_add[TX_32X32][ADST_DCT] =
 | 
			
		||||
            dsp->itxfm_add[TX_32X32][DCT_ADST] =
 | 
			
		||||
            dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
 | 
			
		||||
            dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_avx;
 | 
			
		||||
            dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_avx;
 | 
			
		||||
            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx;
 | 
			
		||||
            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx;
 | 
			
		||||
        }
 | 
			
		||||
        init_lpf(avx);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
#undef init_fpel
 | 
			
		||||
 
 | 
			
		||||
@@ -44,6 +44,11 @@ pw_8:   times  8 dw 8
 | 
			
		||||
mask_mix: times 8 db 0
 | 
			
		||||
          times 8 db 1
 | 
			
		||||
 | 
			
		||||
mask_mix84: times 8 db 0xff
 | 
			
		||||
            times 8 db 0x00
 | 
			
		||||
mask_mix48: times 8 db 0x00
 | 
			
		||||
            times 8 db 0xff
 | 
			
		||||
 | 
			
		||||
SECTION .text
 | 
			
		||||
 | 
			
		||||
; %1 = abs(%2-%3)
 | 
			
		||||
@@ -324,7 +329,7 @@ SECTION .text
 | 
			
		||||
    neg mstride3q
 | 
			
		||||
 | 
			
		||||
%ifidn %1, h
 | 
			
		||||
%if %2 == 88
 | 
			
		||||
%if %2 > 16
 | 
			
		||||
%define movx movh
 | 
			
		||||
    lea dstq, [dstq + 8*strideq - 4]
 | 
			
		||||
%else
 | 
			
		||||
@@ -372,7 +377,7 @@ SECTION .text
 | 
			
		||||
%define Q6 rsp + 224
 | 
			
		||||
%define Q7 rsp + 240
 | 
			
		||||
 | 
			
		||||
%if %2 != 88
 | 
			
		||||
%if %2 == 16
 | 
			
		||||
    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
 | 
			
		||||
    mova           [P7],  m0
 | 
			
		||||
    mova           [P6],  m1
 | 
			
		||||
@@ -389,7 +394,7 @@ SECTION .text
 | 
			
		||||
    mova           [Q1],  m9
 | 
			
		||||
    mova           [Q2], m10
 | 
			
		||||
    mova           [Q3], m11
 | 
			
		||||
%if %2 != 88
 | 
			
		||||
%if %2 == 16
 | 
			
		||||
    mova           [Q4], m12
 | 
			
		||||
    mova           [Q5], m13
 | 
			
		||||
    mova           [Q6], m14
 | 
			
		||||
@@ -404,7 +409,7 @@ SECTION .text
 | 
			
		||||
%endif
 | 
			
		||||
    SPLATB_REG          m2, I, m0                       ; I I I I ...
 | 
			
		||||
    SPLATB_REG          m3, E, m0                       ; E E E E ...
 | 
			
		||||
%elif %2 == 88
 | 
			
		||||
%else
 | 
			
		||||
%if cpuflag(ssse3)
 | 
			
		||||
    mova                m0, [mask_mix]
 | 
			
		||||
%endif
 | 
			
		||||
@@ -462,7 +467,7 @@ SECTION .text
 | 
			
		||||
    ABSSUB_CMP          m1, m9, m11, m6, m4, m5, m8     ; abs(p2 - p0) <= 1
 | 
			
		||||
    pand                m2, m1
 | 
			
		||||
    ABSSUB              m4, m10, m11, m5                ; abs(p1 - p0)
 | 
			
		||||
%if %2 != 88
 | 
			
		||||
%if %2 == 16
 | 
			
		||||
%if cpuflag(ssse3)
 | 
			
		||||
    pxor                m0, m0
 | 
			
		||||
%endif
 | 
			
		||||
@@ -490,8 +495,11 @@ SECTION .text
 | 
			
		||||
    pand                m2, m1
 | 
			
		||||
    ABSSUB_CMP          m1, m15, m12, m6, m4, m5, m8    ; abs(q3 - q0) <= 1
 | 
			
		||||
    pand                m2, m1                          ; flat8in final value
 | 
			
		||||
%if %2 == 84 || %2 == 48
 | 
			
		||||
    pand                m2, [mask_mix%2]
 | 
			
		||||
%endif
 | 
			
		||||
 | 
			
		||||
%if %2 != 88
 | 
			
		||||
%if %2 == 16
 | 
			
		||||
    ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
 | 
			
		||||
    ; calc flat8out mask
 | 
			
		||||
    mova                m8, [P7]
 | 
			
		||||
@@ -584,7 +592,7 @@ SECTION .text
 | 
			
		||||
    ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
 | 
			
		||||
    ; filter6()
 | 
			
		||||
    pxor                m0, m0
 | 
			
		||||
%if %2 == 88
 | 
			
		||||
%if %2 > 16
 | 
			
		||||
    pand                m3, m2
 | 
			
		||||
%else
 | 
			
		||||
    pand                m2, m3                          ;               mask(fm) & mask(in)
 | 
			
		||||
@@ -622,7 +630,7 @@ SECTION .text
 | 
			
		||||
    ; q5  +5  -p2 -q4 +q5 +q7                 .  q5   .               .
 | 
			
		||||
    ; q6  +6  -p1 -q5 +q6 +q7                     .  q6   .           .
 | 
			
		||||
 | 
			
		||||
%if %2 != 88
 | 
			
		||||
%if %2 == 16
 | 
			
		||||
    pand            m1, m2                                                              ; mask(out) & (mask(fm) & mask(in))
 | 
			
		||||
    mova            m2, [P7]
 | 
			
		||||
    mova            m3, [P6]
 | 
			
		||||
@@ -645,7 +653,7 @@ SECTION .text
 | 
			
		||||
%endif
 | 
			
		||||
 | 
			
		||||
%ifidn %1, h
 | 
			
		||||
%if %2 != 88
 | 
			
		||||
%if %2 == 16
 | 
			
		||||
    mova                    m0, [P7]
 | 
			
		||||
    mova                    m1, [P6]
 | 
			
		||||
    mova                    m2, [P5]
 | 
			
		||||
@@ -753,28 +761,23 @@ SECTION .text
 | 
			
		||||
    RET
 | 
			
		||||
%endmacro
 | 
			
		||||
 | 
			
		||||
%macro LPF_16_16_VH 1
 | 
			
		||||
INIT_XMM %1
 | 
			
		||||
cglobal vp9_loop_filter_v_16_16, 5,10,16,      dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
 | 
			
		||||
    LOOPFILTER v, 16
 | 
			
		||||
cglobal vp9_loop_filter_h_16_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
 | 
			
		||||
    LOOPFILTER h, 16
 | 
			
		||||
%macro LPF_16_VH 2
 | 
			
		||||
INIT_XMM %2
 | 
			
		||||
cglobal vp9_loop_filter_v_%1_16, 5,10,16,      dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
 | 
			
		||||
    LOOPFILTER v, %1
 | 
			
		||||
cglobal vp9_loop_filter_h_%1_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
 | 
			
		||||
    LOOPFILTER h, %1
 | 
			
		||||
%endmacro
 | 
			
		||||
 | 
			
		||||
%macro LPF_88_16_VH 1
 | 
			
		||||
INIT_XMM %1
 | 
			
		||||
cglobal vp9_loop_filter_v_88_16, 5,10,16,      dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
 | 
			
		||||
    LOOPFILTER v, 88
 | 
			
		||||
cglobal vp9_loop_filter_h_88_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
 | 
			
		||||
    LOOPFILTER h, 88
 | 
			
		||||
%macro LPF_16_VH_ALL_OPTS 1
 | 
			
		||||
LPF_16_VH %1, sse2
 | 
			
		||||
LPF_16_VH %1, ssse3
 | 
			
		||||
LPF_16_VH %1, avx
 | 
			
		||||
%endmacro
 | 
			
		||||
 | 
			
		||||
LPF_16_16_VH sse2
 | 
			
		||||
LPF_16_16_VH ssse3
 | 
			
		||||
LPF_16_16_VH avx
 | 
			
		||||
 | 
			
		||||
LPF_88_16_VH sse2
 | 
			
		||||
LPF_88_16_VH ssse3
 | 
			
		||||
LPF_88_16_VH avx
 | 
			
		||||
LPF_16_VH_ALL_OPTS 16
 | 
			
		||||
LPF_16_VH_ALL_OPTS 48
 | 
			
		||||
LPF_16_VH_ALL_OPTS 84
 | 
			
		||||
LPF_16_VH_ALL_OPTS 88
 | 
			
		||||
 | 
			
		||||
%endif ; x86-64
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user