[Encoder] Add an SSE4.2 implementation of CavlcParamCal

Use a combination of table lookups and pshufb to convert coefficients
to zero run/level format. Two 16-entry lookup tables are used for a
total of 192 bytes worth of tables. (The existing SSE2 version uses a
table of size 2048 bytes.)

Speedup is ~1.5x-3x as compared with the SSE2 version on Haswell (the
speedup is greater for input with many trailing zeros).

The use of popcnt makes it require SSE4.2. This can be replaced with
a small LUT and accumulation which would reduce the requirement to
SSSE3.
This commit is contained in:
Sindre Aamås 2016-04-19 16:41:10 +02:00
parent 502b16925e
commit 3f31aff4dc
4 changed files with 224 additions and 1 deletions

View File

@ -80,6 +80,8 @@ int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, in
#ifdef X86_ASM
int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
/* SSE4.2 assembly variant of CavlcParamCal; declared with the same parameter list as CavlcParamCal_sse2. */
int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
#endif
#if defined(__cplusplus)

View File

@ -278,6 +278,11 @@ void InitCoeffFunc (SWelsFuncPtrList* pFuncList, const uint32_t uiCpuFlag, int32
if (uiCpuFlag & WELS_CPU_SSE2) {
pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
}
#endif
#ifdef X86_ASM
if (uiCpuFlag & WELS_CPU_SSE42) {
pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
}
#endif
if (iEntropyCodingModeFlag) {
pFuncList->pfStashMBStatus = StashMBStatusCabac;

View File

@ -42,10 +42,57 @@
%include "asm_inc.asm"
SECTION .rodata align=16
align 16
; pshufb control vector that reverses the byte order of a 16-byte vector
; (output byte i is taken from input byte 15 - i).
wels_shufb_rev:
db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
; 4-bit table giving number of preceding zeros for each set bit as well as the
; eventual next bit. For the case where all 4 bits are set, this requires 5
; zeros. The 5th zero can either be read from beyond the final table entry or
; implied via zero-initializing the location being read into.
;
; Layout: 16 entries of 4 bytes each, indexed by the 4-bit mask (entry n is at
; byte offset 4 * n). Within an entry:
;   byte 0     = number of clear bits below the lowest set bit (4 if the mask
;                is zero),
;   byte k > 0 = number of clear bits directly above the k-th lowest set bit,
;                counted up to the next set bit or the top of the nibble.
; Bytes that do not correspond to a set bit are zero.
wels_cavlc_param_cal_run_lut:
db 4, 0, 0, 0
db 0, 3, 0, 0
db 1, 2, 0, 0
db 0, 0, 2, 0
db 2, 1, 0, 0
db 0, 1, 1, 0
db 1, 0, 1, 0
db 0, 0, 0, 1
db 3, 0, 0, 0
db 0, 2, 0, 0
db 1, 1, 0, 0
db 0, 0, 1, 0
db 2, 0, 0, 0
db 0, 1, 0, 0
db 1, 0, 0, 0
db 0, 0, 0, 0
; The 5th zero for the all-bits-set entry is intentionally left out here (kept
; below as a disabled line); see the note above on how it is implied instead.
; db 0
; 4-bit table giving pshufb vectors for compacting 4-word vectors by removing
; the words that match zero bits and concatenating in reverse order.
;
; Layout: 16 entries of 8 bytes each, indexed by the 4-bit mask (entry n is at
; byte offset 8 * n). Mask bit 0 selects word 3 (bytes 6, 7), bit 1 selects
; word 2 (bytes 4, 5), bit 2 selects word 1 (bytes 2, 3) and bit 3 selects
; word 0 (bytes 0, 1). Selected words are emitted starting with the one for the
; lowest set bit; the remaining control bytes are padded with index 0.
wels_cavlc_param_cal_shufb_lut:
db 0, 0, 0, 0, 0, 0, 0, 0
db 6, 7, 0, 0, 0, 0, 0, 0
db 4, 5, 0, 0, 0, 0, 0, 0
db 6, 7, 4, 5, 0, 0, 0, 0
db 2, 3, 0, 0, 0, 0, 0, 0
db 6, 7, 2, 3, 0, 0, 0, 0
db 4, 5, 2, 3, 0, 0, 0, 0
db 6, 7, 4, 5, 2, 3, 0, 0
db 0, 1, 0, 0, 0, 0, 0, 0
db 6, 7, 0, 1, 0, 0, 0, 0
db 4, 5, 0, 1, 0, 0, 0, 0
db 6, 7, 4, 5, 0, 1, 0, 0
db 2, 3, 0, 1, 0, 0, 0, 0
db 6, 7, 2, 3, 0, 1, 0, 0
db 4, 5, 2, 3, 0, 1, 0, 0
db 6, 7, 4, 5, 2, 3, 0, 1
%ifdef X86_32
SECTION .rodata align=16
align 16
sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
@ -312,12 +359,15 @@ byte_1pos_table:
db 7,6,5,4,3,2,1,7, ;254
db 7,6,5,4,3,2,1,8, ;255
%endif ; X86_32
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
%ifdef X86_32
;***********************************************************************
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
@ -457,3 +507,162 @@ WELS_EXTERN CavlcParamCal_sse2
pop ebx
ret
%endif
;***********************************************************************
;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;
; Converts a block of up to 16 coefficients into run/level form, walking from
; the highest coefficient index towards index 0.
;
;   coffLevel    : input coefficients; see the load assumptions in the body.
;   run          : receives one zero-run byte per non-zero coefficient. Its
;                  first 16 bytes are cleared with an aligned 16-byte store.
;   Level        : receives the non-zero coefficients, highest index first.
;                  Written 8 bytes at a time, so the words following the last
;                  valid level may hold unspecified data.
;   total_coeffs : receives the number of non-zero coefficients.
;   endIdx       : only selects the load width (> 3: 16 words, else 4 words).
;
; Returns 16 - (number of non-zero coefficients) - (number of zero
; coefficients above the last non-zero one), or 0 if every coefficient is zero.
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse42
%define p_coeff_level r0
%define p_run r1
%define p_level r2
%define p_total_coeffs r3
%define i_endidxd r4d

; Per-target choice of the bitmask register and of how the lookup tables are
; addressed (absolute labels on X86_32, a base register otherwise).
%ifdef X86_32
    push r5
    push r6
    %assign push_num 2
    %define r_mask r5
    %define r_maskd r5d
    %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
    %define p_run_lut wels_cavlc_param_cal_run_lut
%elifdef WIN64
    push rbx
    %assign push_num 1
    %define r_mask rbx
    %define r_maskd ebx
    %define p_shufb_lut r5
    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
    lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
%else
    %assign push_num 0
    %define r_mask rax
    %define r_maskd eax
    %define p_shufb_lut r5
    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
    lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
%endif

    LOAD_5_PARA
    PUSH_XMM 2

    ; Free up rcx/ecx because only cl is accepted as shift amount operand.
    ; Whichever argument lives in rcx/ecx is moved to r6, and rcx/ecx becomes
    ; the scratch register r_tmp.
%ifidni r0b, cl
    mov r6, r0
    %undef p_coeff_level
    %define p_coeff_level r6
    %define r_tmp r0
    %define r_tmpd r0d
    %define r_tmpb r0b
%elifidni r1b, cl
    mov r6, r1
    %undef p_run
    %define p_run r6
    %define r_tmp r1
    %define r_tmpd r1d
    %define r_tmpb r1b
%elifidni r3b, cl
    mov r6, r3
    %undef p_total_coeffs
    %define p_total_coeffs r6
    %define r_tmp r3
    %define r_tmpd r3d
    %define r_tmpb r3b
%else
    %error "Unknown cl register."
%endif

    ; Acquire a bitmask indicating which words are non-zero.
    ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
    ; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.
    ; Assumptions are taken from previous implementations.
    pxor xmm1, xmm1
    cmp i_endidxd, 3
    jg .load16
    movq xmm0, [p_coeff_level]
    packsswb xmm0, xmm1
    jmp .load_done
.load16:
    movdqa xmm0, [p_coeff_level]
    packsswb xmm0, [p_coeff_level + 16]
.load_done:
    movdqa [p_run], xmm1                ; Zero-initialize because we may read back implied zeros.
    pcmpeqb xmm0, xmm1                  ; 0FFh for every zero coefficient.
    pshufb xmm0, [wels_shufb_rev]       ; Reverse so that mask bit 0 maps to coefficient 15.
    pmovmskb r_maskd, xmm0
    xor r_maskd, 0FFFFh                 ; Invert: set bits now mark non-zero coefficients.

    ; endIdx is not needed past this point; its register is reused as scratch.
%undef i_endidxd
%define r_tmp2 r4
%define r_tmp2d r4d
    popcnt r_tmp2d, r_maskd             ; Number of non-zero coefficients; ZF set if the mask is zero.
    mov [p_total_coeffs], r_tmp2d
    ; The total_coeffs pointer register is recycled to hold the return value.
%xdefine i_total_zeros p_total_coeffs
%undef p_total_coeffs
    mov i_total_zeros, r_tmp2           ; mov leaves the flags from popcnt intact.
    jz .done                            ; No non-zero coefficients: return 0.
    mov i_total_zeros, 16
    sub i_total_zeros, r_tmp2
    bsf r_tmpd, r_maskd                 ; Find first set bit.
    sub i_total_zeros, r_tmp

    ; Skip trailing zeros.
    ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
    and r_tmpd, -4
    shr r_maskd, r_tmpb
    add r_tmpd, r_tmpd                  ; Skipped coefficients -> skipped bytes.
    sub p_coeff_level, r_tmp

    ; Handle first quadruple containing a non-zero value.
    mov r_tmp, r_mask
    and r_tmpd, 0Fh
    movq xmm0, [p_coeff_level + 24]
    movq xmm1, [p_shufb_lut + 8 * r_tmp]
    pshufb xmm0, xmm1
    mov r_tmp2d, [p_run_lut + 4 * r_tmp]
    shr r_tmp2d, 8                      ; Skip initial zero run.
    movlps [p_level], xmm0              ; Store levels for the first quadruple.
    mov [p_run], r_tmp2d                ; Store accompanying zero runs thus far.
    shr r_maskd, 4
    jz .done

.loop:
    ; Increment pointers.
    popcnt r_tmpd, r_tmpd               ; Number of non-zero values handled.
    lea p_level, [p_level + 2 * r_tmp]
    add p_run, r_tmp
    ; Handle next quadruple.
    mov r_tmp, r_mask
    and r_tmpd, 0Fh
    movq xmm0, [p_coeff_level + 16]
    sub p_coeff_level, 8
    movq xmm1, [p_shufb_lut + 8 * r_tmp]
    pshufb xmm0, xmm1
    movzx r_tmp2d, byte [p_run - 1]
    add r_tmp2d, [p_run_lut + 4 * r_tmp] ; Add to previous run and get eventual new runs.
    movlps [p_level], xmm0              ; Store levels (potentially none).
    mov [p_run - 1], r_tmp2d            ; Update previous run and store eventual new runs.
    shr r_maskd, 4
    jnz .loop

.done:
    mov retrq, i_total_zeros
    ; Undo the prologue in reverse order.
    POP_XMM
    LOAD_5_PARA_POP
%ifdef X86_32
    pop r6
    pop r5
%elifdef WIN64
    pop rbx
%endif
    ret

%undef p_coeff_level
%undef p_run
%undef p_level
%undef i_total_zeros
%undef r_mask
%undef r_maskd
%undef r_tmp
%undef r_tmpd
%undef r_tmpb
%undef r_tmp2
%undef r_tmp2d
%undef p_shufb_lut
%undef p_run_lut

View File

@ -81,3 +81,10 @@ TEST (CavlcTest, CavlcParamCal_sse2) {
TestCavlcParamCal (CavlcParamCal_sse2);
}
#endif
#ifdef X86_ASM
TEST (CavlcTest, CavlcParamCal_sse42) {
  // Only exercise the SSE4.2 routine when the detected CPU flags include it.
  const bool bHasSse42 = (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42) != 0;
  if (!bHasSse42)
    return;
  TestCavlcParamCal (CavlcParamCal_sse42);
}
#endif