From 3f31aff4dc5037cec1440f39aea8be54432a6aaa Mon Sep 17 00:00:00 2001
From: Sindre Aamås
Date: Tue, 19 Apr 2016 16:41:10 +0200
Subject: [PATCH] [Encoder] Add an SSE4.2 implementation of CavlcParamCal

Use a combination of table lookups and pshufb to convert coefficients
to zero run/level format. Two 16-entry lookup tables are used, for a
total of 192 bytes of tables. (The existing SSE2 version uses a table
of size 2048 bytes.)

Speedup is ~1.5x-3x compared with the SSE2 version on Haswell (the
speedup is greater for input with many trailing zeros).

The use of popcnt makes the routine require SSE4.2. The popcnt could
be replaced with a small LUT and accumulation, which would reduce the
requirement to SSSE3.
---
 codec/encoder/core/inc/set_mb_syn_cavlc.h   |   2 +
 codec/encoder/core/src/set_mb_syn_cavlc.cpp |   5 +
 codec/encoder/core/x86/coeff.asm            | 217 +++++++++++++++++++-
 test/encoder/EncUT_Cavlc.cpp                |   7 +
 4 files changed, 230 insertions(+), 1 deletion(-)

diff --git a/codec/encoder/core/inc/set_mb_syn_cavlc.h b/codec/encoder/core/inc/set_mb_syn_cavlc.h
index 4faffc2d..37735b9b 100644
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -80,6 +80,8 @@ int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, in
 #ifdef X86_ASM
 int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                             int32_t iEndIdx);
+int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
+                             int32_t iEndIdx);
 #endif
 
 #if defined(__cplusplus)
diff --git a/codec/encoder/core/src/set_mb_syn_cavlc.cpp b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
index 676a4cf0..c5d79edf 100644
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -278,6 +278,11 @@ void InitCoeffFunc (SWelsFuncPtrList* pFuncList, const uint32_t uiCpuFlag, int32
   if (uiCpuFlag & WELS_CPU_SSE2) {
     pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
   }
+#endif
+#ifdef X86_ASM
+  if (uiCpuFlag & WELS_CPU_SSE42) {
+    pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
+  }
 #endif
   if (iEntropyCodingModeFlag) {
     pFuncList->pfStashMBStatus = StashMBStatusCabac;
diff --git a/codec/encoder/core/x86/coeff.asm b/codec/encoder/core/x86/coeff.asm
index ccc9ded9..98d57b41 100644
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -42,10 +42,60 @@
 %include "asm_inc.asm"
 
+SECTION .rodata align=16
+
+align 16
+
+wels_shufb_rev:
+    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+; 4-bit table giving the number of preceding zeros for each set bit, as well
+; as for the next set bit to come. For the case where all 4 bits are set,
+; this requires 5 zeros. The 5th zero can either be read from beyond the
+; final table entry or implied by zero-initializing the location read into.
+wels_cavlc_param_cal_run_lut:
+    db 4, 0, 0, 0
+    db 0, 3, 0, 0
+    db 1, 2, 0, 0
+    db 0, 0, 2, 0
+    db 2, 1, 0, 0
+    db 0, 1, 1, 0
+    db 1, 0, 1, 0
+    db 0, 0, 0, 1
+    db 3, 0, 0, 0
+    db 0, 2, 0, 0
+    db 1, 1, 0, 0
+    db 0, 0, 1, 0
+    db 2, 0, 0, 0
+    db 0, 1, 0, 0
+    db 1, 0, 0, 0
+    db 0, 0, 0, 0
+; db 0
+; 4-bit table giving pshufb vectors that compact a 4-word vector by removing
+; the words that correspond to zero bits and concatenating in reverse order.
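+; For example, mask 0b0101 (where bit 0 maps to the highest-addressed word
+; of the quadruple) selects entry 5 below, 6, 7, 2, 3, ..., which gathers
+; word 3 and then word 1 into the low words of the destination.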
+wels_cavlc_param_cal_shufb_lut:
+    db 0, 0, 0, 0, 0, 0, 0, 0
+    db 6, 7, 0, 0, 0, 0, 0, 0
+    db 4, 5, 0, 0, 0, 0, 0, 0
+    db 6, 7, 4, 5, 0, 0, 0, 0
+    db 2, 3, 0, 0, 0, 0, 0, 0
+    db 6, 7, 2, 3, 0, 0, 0, 0
+    db 4, 5, 2, 3, 0, 0, 0, 0
+    db 6, 7, 4, 5, 2, 3, 0, 0
+    db 0, 1, 0, 0, 0, 0, 0, 0
+    db 6, 7, 0, 1, 0, 0, 0, 0
+    db 4, 5, 0, 1, 0, 0, 0, 0
+    db 6, 7, 4, 5, 0, 1, 0, 0
+    db 2, 3, 0, 1, 0, 0, 0, 0
+    db 6, 7, 2, 3, 0, 1, 0, 0
+    db 4, 5, 2, 3, 0, 1, 0, 0
+    db 6, 7, 4, 5, 2, 3, 0, 1
 
 %ifdef X86_32
 
-SECTION .rodata align=16
 
 align 16
 
 sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
@@ -312,12 +362,15 @@ byte_1pos_table:
         db 7,6,5,4,3,2,1,7, ;254
        db 7,6,5,4,3,2,1,8, ;255
 
+%endif ; X86_32
+
 ;***********************************************************************
 ; Code
 ;***********************************************************************
 
 SECTION .text
 
+%ifdef X86_32
 ;***********************************************************************
 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
@@ -457,3 +510,165 @@ WELS_EXTERN CavlcParamCal_sse2
     pop ebx
     ret
 %endif
+
+;***********************************************************************
+;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;***********************************************************************
+
+WELS_EXTERN CavlcParamCal_sse42
+%define p_coeff_level r0
+%define p_run r1
+%define p_level r2
+%define p_total_coeffs r3
+%define i_endidxd r4d
+
+%ifdef X86_32
+    push r5
+    push r6
+    %assign push_num 2
+    %define r_mask r5
+    %define r_maskd r5d
+    %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
+    %define p_run_lut wels_cavlc_param_cal_run_lut
+%elifdef WIN64
+    push rbx
+    %assign push_num 1
+    %define r_mask rbx
+    %define r_maskd ebx
+    %define p_shufb_lut r5
+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+    lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+%else
+    %assign push_num 0
+    %define r_mask rax
+    %define r_maskd eax
+    %define p_shufb_lut r5
+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+    lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+%endif
+
+    LOAD_5_PARA
+    PUSH_XMM 2
+
+    ; Free up rcx/ecx because only cl is accepted as a shift-amount operand.
+%ifidni r0b, cl
+    mov r6, r0
+    %undef p_coeff_level
+    %define p_coeff_level r6
+    %define r_tmp r0
+    %define r_tmpd r0d
+    %define r_tmpb r0b
+%elifidni r1b, cl
+    mov r6, r1
+    %undef p_run
+    %define p_run r6
+    %define r_tmp r1
+    %define r_tmpd r1d
+    %define r_tmpb r1b
+%elifidni r3b, cl
+    mov r6, r3
+    %undef p_total_coeffs
+    %define p_total_coeffs r6
+    %define r_tmp r3
+    %define r_tmpd r3d
+    %define r_tmpb r3b
+%else
+    %error "Unknown cl register."
+%endif
+
+    ; Acquire a bitmask indicating which words are non-zero.
+    ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes in size
+    ; if endIdx > 3. Otherwise, assume 8 bytes available. Assume that input
+    ; beyond endIdx is zero. These assumptions follow the previous implementations.
+    pxor xmm1, xmm1
+    cmp i_endidxd, 3
+    jg .load16
+    movq xmm0, [p_coeff_level]
+    packsswb xmm0, xmm1
+    jmp .load_done
+.load16:
+    movdqa xmm0, [p_coeff_level]
+    packsswb xmm0, [p_coeff_level + 16]
+.load_done:
+    movdqa [p_run], xmm1 ; Zero-initialize because we may read back implied zeros.
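+    ; Build the non-zero mask: compare bytes against zero, reverse the byte
+    ; order, and invert, so that bit i of the mask is set iff coefficient
+    ; 15 - i is non-zero (bit 0 corresponds to the last coefficient).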
+    pcmpeqb xmm0, xmm1
+    pshufb xmm0, [wels_shufb_rev]
+    pmovmskb r_maskd, xmm0
+    xor r_maskd, 0FFFFh
+    mov r_tmpd, i_endidxd
+%undef i_endidxd
+%define r_tmp2 r4
+%define r_tmp2d r4d
+    popcnt r_tmp2d, r_maskd
+    mov [p_total_coeffs], r_tmp2d
+%xdefine i_total_zeros p_total_coeffs
+%undef p_total_coeffs
+    mov i_total_zeros, r_tmp2
+    jz .done ; ZF is from the popcnt above; mov does not affect flags.
+    mov i_total_zeros, 16
+    sub i_total_zeros, r_tmp2
+    bsf r_tmpd, r_maskd ; Find the first set bit.
+    sub i_total_zeros, r_tmp
+    ; Skip trailing zeros.
+    ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
+    and r_tmpd, -4
+    shr r_maskd, r_tmpb
+    add r_tmpd, r_tmpd
+    sub p_coeff_level, r_tmp
+    ; Handle the first quadruple containing a non-zero value.
+    mov r_tmp, r_mask
+    and r_tmpd, 0Fh
+    movq xmm0, [p_coeff_level + 24]
+    movq xmm1, [p_shufb_lut + 8 * r_tmp]
+    pshufb xmm0, xmm1
+    mov r_tmp2d, [p_run_lut + 4 * r_tmp]
+    shr r_tmp2d, 8 ; Skip the initial zero run.
+    movlps [p_level], xmm0 ; Store levels for the first quadruple.
+    mov [p_run], r_tmp2d ; Store the accompanying zero runs thus far.
+    shr r_maskd, 4
+    jz .done
+.loop:
+    ; Increment pointers.
+    popcnt r_tmpd, r_tmpd ; Number of non-zero values handled in the previous quadruple.
+    lea p_level, [p_level + 2 * r_tmp]
+    add p_run, r_tmp
+    ; Handle the next quadruple.
+    mov r_tmp, r_mask
+    and r_tmpd, 0Fh
+    movq xmm0, [p_coeff_level + 16]
+    sub p_coeff_level, 8
+    movq xmm1, [p_shufb_lut + 8 * r_tmp]
+    pshufb xmm0, xmm1
+    movzx r_tmp2d, byte [p_run - 1]
+    add r_tmp2d, [p_run_lut + 4 * r_tmp] ; Add to the previous run and get any new runs.
+    movlps [p_level], xmm0 ; Store levels (potentially none).
+    mov [p_run - 1], r_tmp2d ; Update the previous run and store any new runs.
+    shr r_maskd, 4
+    jnz .loop
+.done:
+    mov retrq, i_total_zeros
+    POP_XMM
+    LOAD_5_PARA_POP
+%ifdef X86_32
+    pop r6
+    pop r5
+%elifdef WIN64
+    pop rbx
+%endif
+    ret
+%undef p_coeff_level
+%undef p_run
+%undef p_level
+%undef i_total_zeros
+%undef r_mask
+%undef r_maskd
+%undef r_tmp
+%undef r_tmpd
+%undef r_tmpb
+%undef r_tmp2
+%undef r_tmp2d
+%undef p_shufb_lut
+%undef p_run_lut
diff --git a/test/encoder/EncUT_Cavlc.cpp b/test/encoder/EncUT_Cavlc.cpp
index 04837d7f..5cf4158d 100644
--- a/test/encoder/EncUT_Cavlc.cpp
+++ b/test/encoder/EncUT_Cavlc.cpp
@@ -81,3 +81,10 @@ TEST (CavlcTest, CavlcParamCal_sse2) {
     TestCavlcParamCal (CavlcParamCal_sse2);
 }
 #endif
+
+#ifdef X86_ASM
+TEST (CavlcTest, CavlcParamCal_sse42) {
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
+    TestCavlcParamCal (CavlcParamCal_sse42);
+}
+#endif
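
For reference, the zero run/level conversion that both SIMD routines perform can be
sketched in scalar C. This is a minimal illustration of the semantics described in the
asm comments (levels emitted from the last non-zero coefficient backwards, each with
the run of zeros immediately preceding it in scan order, and the number of zeros in
front of the last non-zero coefficient as the return value); the function name is
hypothetical and this is not the project's actual CavlcParamCal_c:

#include <stdint.h>

/* Hypothetical scalar reference, for illustration only. */
static int32_t CavlcParamCalSketch (const int16_t* pCoffLevel, uint8_t* pRun,
                                    int16_t* pLevel, int32_t* pTotalCoeffs,
                                    int32_t iEndIdx) {
  int32_t iTotalZeros  = 0;
  int32_t iTotalCoeffs = 0;
  while (iEndIdx >= 0 && pCoffLevel[iEndIdx] == 0)
    --iEndIdx;                                     /* skip trailing zeros */
  while (iEndIdx >= 0) {
    int32_t iRun = 0;
    pLevel[iTotalCoeffs] = pCoffLevel[iEndIdx--];  /* levels in reverse order */
    while (iEndIdx >= 0 && pCoffLevel[iEndIdx] == 0) {
      ++iRun;                                      /* zeros preceding this level */
      --iEndIdx;
    }
    pRun[iTotalCoeffs++] = (uint8_t) iRun;
    iTotalZeros += iRun;
  }
  *pTotalCoeffs = iTotalCoeffs;
  return iTotalZeros;
}

The SSE4.2 version replaces the inner scans with the 4-bit mask lookups: for each
quadruple of coefficients, the shufb table compacts up to four levels at once, while
the run table supplies the corresponding zero runs, the first byte of each entry being
accumulated into the last run of the preceding quadruple.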