Merge pull request #2447 from saamas/encoder-cavlcparamcal-sse42
[Encoder] Add an SSE4.2 implementation of CavlcParamCal
This commit is contained in:
commit
e9dc97803d
@ -75,9 +75,13 @@ int32_t WriteBlockResidualCavlc (SWelsFuncPtrList* pFuncList, int16_t* pCoffLev
|
||||
extern "C" {
|
||||
#endif//__cplusplus
|
||||
|
||||
// CavlcParamCal_* variants share one signature so that any of them can be stored in
// SWelsFuncPtrList::pfCavlcParamCal.
// NOTE(review): judging by the unit-test reference model, each variant scans pCoffLevel from
// iEndIdx downwards, writes the non-zero levels to pLevel and the zero runs to pRun, stores the
// non-zero count in *pTotalCoeffs and returns the total number of zeros -- confirm against the
// implementations.
int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
|
||||
int32_t iEndIdx);
|
||||
#ifdef X86_ASM
|
||||
// Assembly variants; only declared when X86_ASM is defined.
int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
|
||||
int32_t iEndIdx);
|
||||
// SSE4.2 variant; callers must check WELS_CPU_SSE42 before using it.
int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
|
||||
int32_t iEndIdx);
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
|
@ -278,6 +278,11 @@ void InitCoeffFunc (SWelsFuncPtrList* pFuncList, const uint32_t uiCpuFlag, int32
|
||||
if (uiCpuFlag & WELS_CPU_SSE2) {
|
||||
pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
|
||||
}
|
||||
#endif
|
||||
#ifdef X86_ASM
|
||||
if (uiCpuFlag & WELS_CPU_SSE42) {
|
||||
pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
|
||||
}
|
||||
#endif
|
||||
if (iEntropyCodingModeFlag) {
|
||||
pFuncList->pfStashMBStatus = StashMBStatusCabac;
|
||||
|
@ -42,10 +42,57 @@
|
||||
|
||||
%include "asm_inc.asm"
|
||||
|
||||
SECTION .rodata align=16
|
||||
|
||||
align 16
|
||||
|
||||
wels_shufb_rev:
|
||||
db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
; 4-bit table giving number of preceding zeros for each set bit as well as the
|
||||
; eventual next bit. For the case where all 4 bits are set, this requires 5
|
||||
; zeros. The 5th zero can either be read from beyond the final table entry or
|
||||
; implied via zero-initializing the location being read into.
|
||||
wels_cavlc_param_cal_run_lut:
|
||||
db 4, 0, 0, 0
|
||||
db 0, 3, 0, 0
|
||||
db 1, 2, 0, 0
|
||||
db 0, 0, 2, 0
|
||||
db 2, 1, 0, 0
|
||||
db 0, 1, 1, 0
|
||||
db 1, 0, 1, 0
|
||||
db 0, 0, 0, 1
|
||||
db 3, 0, 0, 0
|
||||
db 0, 2, 0, 0
|
||||
db 1, 1, 0, 0
|
||||
db 0, 0, 1, 0
|
||||
db 2, 0, 0, 0
|
||||
db 0, 1, 0, 0
|
||||
db 1, 0, 0, 0
|
||||
db 0, 0, 0, 0
|
||||
; db 0
|
||||
; 4-bit table giving pshufb vectors for compacting 4-word vectors by removing
|
||||
; the words that match zero bits and concatenating in reverse order.
|
||||
wels_cavlc_param_cal_shufb_lut:
|
||||
db 0, 0, 0, 0, 0, 0, 0, 0
|
||||
db 6, 7, 0, 0, 0, 0, 0, 0
|
||||
db 4, 5, 0, 0, 0, 0, 0, 0
|
||||
db 6, 7, 4, 5, 0, 0, 0, 0
|
||||
db 2, 3, 0, 0, 0, 0, 0, 0
|
||||
db 6, 7, 2, 3, 0, 0, 0, 0
|
||||
db 4, 5, 2, 3, 0, 0, 0, 0
|
||||
db 6, 7, 4, 5, 2, 3, 0, 0
|
||||
db 0, 1, 0, 0, 0, 0, 0, 0
|
||||
db 6, 7, 0, 1, 0, 0, 0, 0
|
||||
db 4, 5, 0, 1, 0, 0, 0, 0
|
||||
db 6, 7, 4, 5, 0, 1, 0, 0
|
||||
db 2, 3, 0, 1, 0, 0, 0, 0
|
||||
db 6, 7, 2, 3, 0, 1, 0, 0
|
||||
db 4, 5, 2, 3, 0, 1, 0, 0
|
||||
db 6, 7, 4, 5, 2, 3, 0, 1
|
||||
|
||||
|
||||
%ifdef X86_32
|
||||
SECTION .rodata align=16
|
||||
|
||||
align 16
|
||||
sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
|
||||
@ -312,12 +359,15 @@ byte_1pos_table:
|
||||
db 7,6,5,4,3,2,1,7, ;254
|
||||
db 7,6,5,4,3,2,1,8, ;255
|
||||
|
||||
%endif ; X86_32
|
||||
|
||||
;***********************************************************************
|
||||
; Code
|
||||
;***********************************************************************
|
||||
SECTION .text
|
||||
|
||||
|
||||
%ifdef X86_32
|
||||
|
||||
;***********************************************************************
|
||||
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
|
||||
@ -457,3 +507,166 @@ WELS_EXTERN CavlcParamCal_sse2
|
||||
pop ebx
|
||||
ret
|
||||
%endif
|
||||
|
||||
;***********************************************************************
|
||||
;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;
; Builds a 16-bit mask of the non-zero coefficients, stores its population
; count to *total_coeffs, then walks the mask four coefficients at a time,
; starting from the quadruple that holds the highest-index non-zero
; coefficient. For every quadruple the non-zero words are compacted in
; reverse order into Level and the zero runs are written to run, both via
; 4-bit lookup tables.
; The return value is 16 - total_coeffs - (index of the lowest set mask bit),
; or 0 when no coefficient is non-zero.
; run is cleared with an aligned 16-byte store, so it must be 16-byte
; aligned and at least 16 bytes large.
|
||||
;***********************************************************************
|
||||
|
||||
WELS_EXTERN CavlcParamCal_sse42
|
||||
%define i_endidxd dword arg5d
|
||||
|
||||
; Per-ABI register aliases. Each branch names the same logical values
; (pointers, mask, temporaries) so that the code below is shared.
%ifdef X86_32
|
||||
push r3
|
||||
push r4
|
||||
push r5
|
||||
push r6
|
||||
%assign push_num 4
|
||||
%define p_total_coeffs r0
|
||||
%define r_tmp r1
|
||||
%define r_tmpd r1d
|
||||
%define r_tmpb r1b
|
||||
%define p_level r2
|
||||
%define p_coeff_level r3
|
||||
%define r_mask r5
|
||||
%define r_maskd r5d
|
||||
%define p_run r6
|
||||
%define p_shufb_lut wels_cavlc_param_cal_shufb_lut
|
||||
%define p_run_lut wels_cavlc_param_cal_run_lut
|
||||
mov p_coeff_level, arg1
|
||||
mov p_run, arg2
|
||||
mov p_level, arg3
|
||||
mov p_total_coeffs, arg4
|
||||
%elifdef WIN64
|
||||
push rbx
|
||||
%assign push_num 1
|
||||
%define p_coeff_level r0
|
||||
%define p_run r1
|
||||
%define p_level r2
|
||||
%define p_total_coeffs r3
|
||||
%define r_mask rbx
|
||||
%define r_maskd ebx
|
||||
%define p_shufb_lut r5
|
||||
; The run table is addressed relative to the shuffle table so that a single
; base register serves both lookups.
%define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
|
||||
lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
|
||||
; Free up rcx/ecx because only cl is accepted as shift amount operand.
|
||||
mov r6, r0
|
||||
%undef p_coeff_level
|
||||
%define p_coeff_level r6
|
||||
%define r_tmp r0
|
||||
%define r_tmpd r0d
|
||||
%define r_tmpb r0b
|
||||
%else
|
||||
%assign push_num 0
|
||||
%define p_coeff_level r0
|
||||
%define p_run r1
|
||||
%define p_level r2
|
||||
%define p_total_coeffs r3
|
||||
%define r_mask rax
|
||||
%define r_maskd eax
|
||||
%define p_shufb_lut r5
|
||||
%define i_total_zeros r6
|
||||
; Same relative addressing of the run table as in the WIN64 branch.
%define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
|
||||
lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
|
||||
%endif
|
||||
|
||||
; Acquire a bitmask indicating which words are non-zero.
|
||||
; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
|
||||
; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.
|
||||
; Assumptions are taken from previous implementations.
|
||||
pxor xmm1, xmm1
|
||||
cmp i_endidxd, 3
|
||||
jg .load16
|
||||
; endIdx <= 3: only four words are loaded; the remaining bytes are zero.
movq xmm0, [p_coeff_level]
|
||||
; Signed saturation narrows each word to a byte without turning a non-zero
; word into zero.
packsswb xmm0, xmm1
|
||||
jmp .load_done
|
||||
.load16:
|
||||
movdqa xmm0, [p_coeff_level]
|
||||
packsswb xmm0, [p_coeff_level + 16]
|
||||
.load_done:
|
||||
movdqa [p_run], xmm1 ; Zero-initialize because we may read back implied zeros.
|
||||
; Bytes equal to zero become 0FFh.
pcmpeqb xmm0, xmm1
|
||||
; Reverse the byte order so that mask bit 0 corresponds to packed byte 15.
pshufb xmm0, [wels_shufb_rev]
|
||||
pmovmskb r_maskd, xmm0
|
||||
; Invert the low 16 bits: set bits now mark non-zero coefficients.
xor r_maskd, 0FFFFh
|
||||
%undef i_endidxd
|
||||
%define r_tmp2 r4
|
||||
%define r_tmp2d r4d
|
||||
popcnt r_tmp2d, r_maskd
|
||||
mov [p_total_coeffs], r_tmp2d
|
||||
; Recycle p_total_coeffs.
|
||||
%ifidni p_total_coeffs, rcx
|
||||
%define r_tmp rcx
|
||||
%define r_tmpd ecx
|
||||
%define r_tmpb cl
|
||||
%else
|
||||
%xdefine i_total_zeros p_total_coeffs
|
||||
%endif
|
||||
%undef p_total_coeffs
|
||||
; mov does not modify flags, so the jz below still tests ZF from popcnt
; (set when the mask is zero). On that path the return value is the popcnt
; result, i.e. zero.
mov i_total_zeros, r_tmp2
|
||||
jz .done
|
||||
; total zeros = 16 - (non-zero count) - (index of the lowest set mask bit).
mov i_total_zeros, 16
|
||||
sub i_total_zeros, r_tmp2
|
||||
bsf r_tmpd, r_maskd ; Find first set bit.
|
||||
sub i_total_zeros, r_tmp
|
||||
; Skip trailing zeros.
|
||||
; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
|
||||
and r_tmpd, -4
|
||||
shr r_maskd, r_tmpb
|
||||
; Convert the skipped coefficient count to a byte offset (2 bytes per word).
add r_tmpd, r_tmpd
|
||||
sub p_coeff_level, r_tmp
|
||||
; Handle first quadruple containing a non-zero value.
|
||||
mov r_tmp, r_mask
|
||||
and r_tmpd, 0Fh
|
||||
movq xmm0, [p_coeff_level + 24]
|
||||
movq xmm1, [p_shufb_lut + 8 * r_tmp]
|
||||
pshufb xmm0, xmm1
|
||||
mov r_tmp2d, [p_run_lut + 4 * r_tmp]
|
||||
shr r_tmp2d, 8 ; Skip initial zero run.
|
||||
movlps [p_level], xmm0 ; Store levels for the first quadruple.
|
||||
mov [p_run], r_tmp2d ; Store accompanying zero runs thus far.
|
||||
shr r_maskd, 4
|
||||
jz .done
|
||||
.loop:
|
||||
; Increment pointers.
|
||||
popcnt r_tmpd, r_tmpd ; Number of non-zero values handled.
|
||||
lea p_level, [p_level + 2 * r_tmp]
|
||||
add p_run, r_tmp
|
||||
; Handle next quadruple.
|
||||
mov r_tmp, r_mask
|
||||
and r_tmpd, 0Fh
|
||||
movq xmm0, [p_coeff_level + 16]
|
||||
sub p_coeff_level, 8
|
||||
movq xmm1, [p_shufb_lut + 8 * r_tmp]
|
||||
pshufb xmm0, xmm1
|
||||
movzx r_tmp2d, byte [p_run - 1]
|
||||
add r_tmp2d, [p_run_lut + 4 * r_tmp] ; Add to previous run and get eventual new runs.
|
||||
movlps [p_level], xmm0 ; Store levels (potentially none).
|
||||
mov [p_run - 1], r_tmp2d ; Update previous run and store eventual new runs.
|
||||
shr r_maskd, 4
|
||||
jnz .loop
|
||||
.done:
|
||||
; Move the result into the return register unless it already lives there.
%ifnidni retrq, i_total_zeros
|
||||
mov retrq, i_total_zeros
|
||||
%endif
|
||||
%ifdef X86_32
|
||||
pop r6
|
||||
pop r5
|
||||
pop r4
|
||||
pop r3
|
||||
%elifdef WIN64
|
||||
pop rbx
|
||||
%endif
|
||||
ret
|
||||
; Drop the function-local aliases so they cannot leak into later code.
%undef p_coeff_level
|
||||
%undef p_run
|
||||
%undef p_level
|
||||
%undef i_total_zeros
|
||||
%undef r_mask
|
||||
%undef r_maskd
|
||||
%undef r_tmp
|
||||
%undef r_tmpd
|
||||
%undef r_tmpb
|
||||
%undef r_tmp2
|
||||
%undef r_tmp2d
|
||||
%undef p_shufb_lut
|
||||
%undef p_run_lut
|
||||
|
@ -390,6 +390,10 @@
|
||||
<Filter
|
||||
Name="encoder"
|
||||
>
|
||||
<File
|
||||
RelativePath="..\..\..\encoder\EncUT_Cavlc.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\..\..\encoder\EncUT_DecodeMbAux.cpp"
|
||||
>
|
||||
|
90
test/encoder/EncUT_Cavlc.cpp
Normal file
90
test/encoder/EncUT_Cavlc.cpp
Normal file
@ -0,0 +1,90 @@
|
||||
#include "cpu.h"
#include "macros.h"
#include "set_mb_syn_cavlc.h"
#include <gtest/gtest.h>
#include <cmath>
#include <cstddef>
#include <cstdlib>
|
||||
|
||||
using namespace WelsEnc;
|
||||
|
||||
namespace {
|
||||
|
||||
// Straightforward scalar model of the run/level extraction.
//
// Walks pCoffLevel from index iLastIndex down to 0. Zeros above the highest-index non-zero
// coefficient are ignored. Every non-zero coefficient is appended to pLevel, and the number of
// zeros found between it and the next non-zero coefficient (or index 0) is appended to pRun.
// *pTotalCoeff receives the number of non-zero coefficients; the sum of all stored runs is
// returned.
int32_t CavlcParamCal_ref (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeff,
                           int32_t iLastIndex) {
  int32_t iZeroSum = 0;
  int32_t iNumCoeffs = 0;
  int32_t iPos = iLastIndex;

  // Ignore the zeros that follow the last non-zero coefficient.
  for (; iPos >= 0; --iPos) {
    if (pCoffLevel[iPos] != 0)
      break;
  }

  for (; iPos >= 0;) {
    pLevel[iNumCoeffs] = pCoffLevel[iPos];
    --iPos;

    // Measure the zero run below the coefficient just stored.
    const int32_t iRunTop = iPos;
    for (; iPos >= 0 && pCoffLevel[iPos] == 0; --iPos) {
    }
    const int32_t iRunLength = iRunTop - iPos;

    iZeroSum += iRunLength;
    pRun[iNumCoeffs] = iRunLength;
    ++iNumCoeffs;
  }

  *pTotalCoeff = iNumCoeffs;
  return iZeroSum;
}
|
||||
|
||||
// Runs one randomized comparison of `func` against CavlcParamCal_ref.
//
// func       - implementation under test.
// endIdx     - index of the last coefficient passed to both implementations.
// allZero    - when true, every coefficient is zero.
// allNonZero - when true (and allZero is false), every generated coefficient is non-zero.
//
// The buffers handed to `func` are declared through ENFORCE_STACK_ALIGN_1D because the SSE4.2
// routine performs aligned 16-byte accesses on the coefficient and run buffers.
void TestCavlcParamCalWithEndIdx (PCavlcParamCalFunc func, int endIdx, bool allZero, bool allNonZero) {
  ENFORCE_STACK_ALIGN_1D(int16_t, coeffLevel, 16, 16);
  ENFORCE_STACK_ALIGN_1D(int16_t, level, 16, 16);
  ENFORCE_STACK_ALIGN_1D(uint8_t, run, 16, 16);
  uint8_t run_ref[16];
  int16_t level_ref[16];
  int32_t totalCoeffs = 0;
  int32_t totalCoeffs_ref = 0;
  for (int i = 0; i < 16; i++) {
    // RAND_MAX is only guaranteed to be at least 32767, so a single std::rand() value may carry
    // no more than 15 random bits. Bit 16 (used as the zero/non-zero selector below) would then
    // always be clear and the 16-bit payload would never have its top bit set. Combine two draws
    // into a 30-bit value so that every bit used below is random on every platform.
    const int rHigh = std::rand() & 0x7FFF;
    const int rLow = std::rand() & 0x7FFF;
    const int r = rHigh << 15 | rLow;
    // Coefficients beyond endIdx are only cleared for the 16-coefficient layouts; the
    // implementations assume such input is zero.
    if (allZero || (i > endIdx && endIdx > 7))
      coeffLevel[i] = 0;
    else if (allNonZero)
      coeffLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF;
    else
      coeffLevel[i] = (r >> 16 & 1) * ((r & 0xFFFF) - 0x8000);
  }
  const int32_t totalZeros_ref = CavlcParamCal_ref (coeffLevel, run_ref, level_ref, &totalCoeffs_ref, endIdx);
  const int32_t totalZeros = func (coeffLevel, run, level, &totalCoeffs, endIdx);
  ASSERT_EQ (totalCoeffs, totalCoeffs_ref);
  // The total-zeros result is only compared when at least one coefficient is non-zero.
  if (totalCoeffs > 0)
    ASSERT_EQ (totalZeros, totalZeros_ref);
  for (int i = 0; i < totalCoeffs_ref; i++)
    ASSERT_EQ (level[i], level_ref[i]);
  // NOTE(review): the run of the final coefficient is deliberately left unchecked here --
  // presumably implementations may differ for it; confirm.
  for (int i = 0; i < totalCoeffs_ref - 1; i++)
    ASSERT_EQ (run[i], run_ref[i]);
}
|
||||
|
||||
// Exercises `func` for each supported end index. For every end index the first round uses an
// all-zero block, the second an all-non-zero block, and the remaining rounds random blocks.
void TestCavlcParamCal (PCavlcParamCalFunc func) {
  const int kEndIndices[] = { 3, 14, 15 };
  const std::size_t kNumEndIndices = sizeof (kEndIndices) / sizeof (kEndIndices[0]);
  const int kRoundsPerEndIndex = 10000;

  for (std::size_t idx = 0; idx != kNumEndIndices; ++idx) {
    const int endIdx = kEndIndices[idx];
    for (int round = 0; round != kRoundsPerEndIndex; ++round) {
      const bool forceAllZero = (round == 0);
      const bool forceAllNonZero = (round == 1);
      TestCavlcParamCalWithEndIdx (func, endIdx, forceAllZero, forceAllNonZero);
    }
  }
}
|
||||
|
||||
} // anon ns.
|
||||
|
||||
// Checks CavlcParamCal_c against the reference model.
TEST (CavlcTest, CavlcParamCal_c) {
  const PCavlcParamCalFunc pfUnderTest = CavlcParamCal_c;
  TestCavlcParamCal (pfUnderTest);
}
|
||||
|
||||
#ifdef X86_32_ASM
// Checks CavlcParamCal_sse2 against the reference model; only built when X86_32_ASM is defined.
TEST (CavlcTest, CavlcParamCal_sse2) {
  const PCavlcParamCalFunc pfUnderTest = CavlcParamCal_sse2;
  TestCavlcParamCal (pfUnderTest);
}
#endif
|
||||
|
||||
#ifdef X86_ASM
// Checks CavlcParamCal_sse42 against the reference model. The body is a no-op (and the test
// passes) when the CPU does not report WELS_CPU_SSE42.
TEST (CavlcTest, CavlcParamCal_sse42) {
  if (! (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42))
    return;
  TestCavlcParamCal (CavlcParamCal_sse42);
}
#endif
|
@ -1,5 +1,6 @@
|
||||
ENCODER_UNITTEST_SRCDIR=test/encoder
|
||||
ENCODER_UNITTEST_CPP_SRCS=\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Cavlc.cpp\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\
|
||||
|
Loading…
x
Reference in New Issue
Block a user