Merge pull request #2447 from saamas/encoder-cavlcparamcal-sse42

[Encoder] Add an SSE4.2 implementation of CavlcParamCal
2016-04-28 09:08:44 +08:00 · 2016-04-28 09:08:44 +08:00 · e9dc97803d
commit e9dc97803d
parent 7d65687284 fb0b2b3f41
6 changed files with 318 additions and 1 deletions
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@ -75,9 +75,13 @@ int32_t  WriteBlockResidualCavlc (SWelsFuncPtrList* pFuncList, int16_t* pCoffLev
 extern "C" {
 #endif//__cplusplus

+// Plain C implementation of the CAVLC parameter calculation. It is declared in this
+// header; the encoder unit test (test/encoder/EncUT_Cavlc.cpp) calls it directly.
+int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
+                         int32_t iEndIdx);
 #ifdef  X86_ASM
 int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                            int32_t iEndIdx);
+// SSE4.2 implementation (codec/encoder/core/x86/coeff.asm). InitCoeffFunc installs it
+// when the CPU flags contain WELS_CPU_SSE42.
+int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
+                             int32_t iEndIdx);
 #endif

 #if defined(__cplusplus)
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@ -278,6 +278,11 @@ void InitCoeffFunc (SWelsFuncPtrList* pFuncList, const uint32_t uiCpuFlag, int32
  if (uiCpuFlag & WELS_CPU_SSE2) {
    pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
  }
+#endif
+#ifdef X86_ASM
+  if (uiCpuFlag & WELS_CPU_SSE42) {
+    pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
+  }
 #endif
  if (iEntropyCodingModeFlag) {
    pFuncList->pfStashMBStatus = StashMBStatusCabac;
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@ -42,10 +42,57 @@

 %include "asm_inc.asm"

+SECTION .rodata align=16
+
+align 16
+
+; pshufb control vector that reverses the byte order of a 16-byte register.
+wels_shufb_rev:
+    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+; 4-bit table giving the number of preceding zeros for each set bit as well as
+; for a possible next set bit. For the case where all 4 bits are set, this
+; requires 5 zeros. The 5th zero can either be read from beyond the final table
+; entry or be implied by zero-initializing the location being read into.
+wels_cavlc_param_cal_run_lut:
+    db 4, 0, 0, 0
+    db 0, 3, 0, 0
+    db 1, 2, 0, 0
+    db 0, 0, 2, 0
+    db 2, 1, 0, 0
+    db 0, 1, 1, 0
+    db 1, 0, 1, 0
+    db 0, 0, 0, 1
+    db 3, 0, 0, 0
+    db 0, 2, 0, 0
+    db 1, 1, 0, 0
+    db 0, 0, 1, 0
+    db 2, 0, 0, 0
+    db 0, 1, 0, 0
+    db 1, 0, 0, 0
+    db 0, 0, 0, 0
+;   db 0
+; 4-bit table giving pshufb vectors for compacting 4-word vectors by removing
+; the words that match zero bits and concatenating in reverse order.
+wels_cavlc_param_cal_shufb_lut:
+    db 0, 0, 0, 0, 0, 0, 0, 0
+    db 6, 7, 0, 0, 0, 0, 0, 0
+    db 4, 5, 0, 0, 0, 0, 0, 0
+    db 6, 7, 4, 5, 0, 0, 0, 0
+    db 2, 3, 0, 0, 0, 0, 0, 0
+    db 6, 7, 2, 3, 0, 0, 0, 0
+    db 4, 5, 2, 3, 0, 0, 0, 0
+    db 6, 7, 4, 5, 2, 3, 0, 0
+    db 0, 1, 0, 0, 0, 0, 0, 0
+    db 6, 7, 0, 1, 0, 0, 0, 0
+    db 4, 5, 0, 1, 0, 0, 0, 0
+    db 6, 7, 4, 5, 0, 1, 0, 0
+    db 2, 3, 0, 1, 0, 0, 0, 0
+    db 6, 7, 2, 3, 0, 1, 0, 0
+    db 4, 5, 2, 3, 0, 1, 0, 0
+    db 6, 7, 4, 5, 2, 3, 0, 1


 %ifdef X86_32
-SECTION .rodata align=16

 align 16
 sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
@ -312,12 +359,15 @@ byte_1pos_table:
    db 7,6,5,4,3,2,1,7, ;254
    db 7,6,5,4,3,2,1,8, ;255

+%endif ; X86_32
+
 ;***********************************************************************
 ; Code
 ;***********************************************************************
 SECTION .text


+%ifdef X86_32

 ;***********************************************************************
 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
@ -457,3 +507,166 @@ WELS_EXTERN CavlcParamCal_sse2
    pop ebx
    ret
 %endif
+
+;***********************************************************************
+;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;
+; Scans the coefficients from the highest index downwards. The non-zero
+; values are written to Level in that (reversed) order, the lengths of the
+; zero runs separating consecutive non-zero values are written to run, and
+; the number of non-zero values is written to *total_coeffs.
+; Returns 16 - (number of non-zero values) - (number of zeros above the
+; highest non-zero value), or 0 when every coefficient is zero.
+; run is cleared with an aligned 16-byte store, so it must be
+; 16-byte-aligned and at least 16 bytes long.
+; NOTE(review): the run that follows the last stored value is not extended
+; once the remaining mask is zero; the unit test does not compare it.
+;***********************************************************************
+
+WELS_EXTERN CavlcParamCal_sse42
+%define i_endidxd      dword arg5d
+
+; Per-platform register aliases and argument setup. The 32-bit build
+; addresses the lookup tables by symbol; the 64-bit builds load the address
+; of the shuffle table into a register and reach the run table through a
+; constant offset from it.
+%ifdef X86_32
+    push            r3
+    push            r4
+    push            r5
+    push            r6
+    %assign push_num 4
+    %define p_total_coeffs r0
+    %define r_tmp r1
+    %define r_tmpd r1d
+    %define r_tmpb r1b
+    %define p_level r2
+    %define p_coeff_level r3
+    %define r_mask  r5
+    %define r_maskd r5d
+    %define p_run r6
+    %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
+    %define p_run_lut   wels_cavlc_param_cal_run_lut
+    mov             p_coeff_level, arg1
+    mov             p_run, arg2
+    mov             p_level, arg3
+    mov             p_total_coeffs, arg4
+%elifdef WIN64
+    push            rbx
+    %assign push_num 1
+    %define p_coeff_level r0
+    %define p_run r1
+    %define p_level r2
+    %define p_total_coeffs r3
+    %define r_mask  rbx
+    %define r_maskd ebx
+    %define p_shufb_lut r5
+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+    ; Free up rcx/ecx because only cl is accepted as shift amount operand.
+    mov             r6, r0
+    %undef p_coeff_level
+    %define p_coeff_level r6
+    %define r_tmp r0
+    %define r_tmpd r0d
+    %define r_tmpb r0b
+%else
+    %assign push_num 0
+    %define p_coeff_level r0
+    %define p_run r1
+    %define p_level r2
+    %define p_total_coeffs r3
+    %define r_mask  rax
+    %define r_maskd eax
+    %define p_shufb_lut r5
+    %define i_total_zeros r6
+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+%endif
+
+    ; Acquire a bitmask indicating which words are non-zero.
+    ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
+    ; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.
+    ; Assumptions are taken from previous implementations.
+    pxor            xmm1, xmm1
+    cmp             i_endidxd, 3
+    jg              .load16
+    movq            xmm0, [p_coeff_level]
+    packsswb        xmm0, xmm1
+    jmp             .load_done
+.load16:
+    movdqa          xmm0, [p_coeff_level]
+    packsswb        xmm0, [p_coeff_level + 16]
+.load_done:
+    ; Signed saturation keeps non-zero words non-zero, so one byte per
+    ; coefficient is enough to tell zero from non-zero.
+    movdqa          [p_run], xmm1                           ; Zero-initialize because we may read back implied zeros.
+    pcmpeqb         xmm0, xmm1                              ; 0FFh for every zero coefficient.
+    pshufb          xmm0, [wels_shufb_rev]                  ; Reverse so that bit 0 belongs to coefficient 15.
+    pmovmskb        r_maskd, xmm0
+    xor             r_maskd, 0FFFFh                         ; Invert: set bits now mark non-zero coefficients.
+%undef i_endidxd
+%define r_tmp2  r4
+%define r_tmp2d r4d
+    popcnt          r_tmp2d, r_maskd
+    mov             [p_total_coeffs], r_tmp2d
+    ; Recycle p_total_coeffs.
+%ifidni p_total_coeffs, rcx
+    %define r_tmp rcx
+    %define r_tmpd ecx
+    %define r_tmpb cl
+%else
+    %xdefine i_total_zeros p_total_coeffs
+%endif
+%undef p_total_coeffs
+    ; Return value for the all-zero case. The two mov instructions since
+    ; popcnt do not modify flags, so jz still tests the popcnt result.
+    mov             i_total_zeros, r_tmp2
+    jz              .done
+    mov             i_total_zeros, 16
+    sub             i_total_zeros, r_tmp2
+    bsf             r_tmpd, r_maskd                         ; Find first set bit.
+    sub             i_total_zeros, r_tmp
+    ; Skip trailing zeros.
+    ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
+    and             r_tmpd, -4
+    shr             r_maskd, r_tmpb
+    add             r_tmpd, r_tmpd                          ; Coefficients skipped -> bytes skipped (2 bytes each).
+    sub             p_coeff_level, r_tmp
+    ; Handle first quadruple containing a non-zero value.
+    mov             r_tmp, r_mask
+    and             r_tmpd, 0Fh
+    movq            xmm0, [p_coeff_level + 24]
+    movq            xmm1, [p_shufb_lut + 8 * r_tmp]
+    pshufb          xmm0, xmm1
+    mov             r_tmp2d, [p_run_lut + 4 * r_tmp]
+    shr             r_tmp2d, 8                              ; Skip initial zero run.
+    movlps          [p_level], xmm0                         ; Store levels for the first quadruple.
+    mov             [p_run], r_tmp2d                        ; Store accompanying zero runs thus far.
+    shr             r_maskd, 4
+    jz              .done
+.loop:
+    ; Increment pointers.
+    popcnt          r_tmpd, r_tmpd                          ; Number of non-zero values handled.
+    lea             p_level, [p_level + 2 * r_tmp]
+    add             p_run, r_tmp
+    ; Handle next quadruple.
+    mov             r_tmp, r_mask
+    and             r_tmpd, 0Fh
+    movq            xmm0, [p_coeff_level + 16]
+    sub             p_coeff_level, 8                        ; Step down to the next lower quadruple.
+    movq            xmm1, [p_shufb_lut + 8 * r_tmp]
+    pshufb          xmm0, xmm1
+    movzx           r_tmp2d, byte [p_run - 1]
+    add             r_tmp2d, [p_run_lut + 4 * r_tmp]        ; Add to previous run and get any new runs.
+    movlps          [p_level], xmm0                         ; Store levels (potentially none).
+    mov             [p_run - 1], r_tmp2d                    ; Update previous run and store any new runs.
+    shr             r_maskd, 4
+    jnz             .loop
+.done:
+%ifnidni retrq, i_total_zeros
+    mov             retrq, i_total_zeros
+%endif
+%ifdef X86_32
+    pop             r6
+    pop             r5
+    pop             r4
+    pop             r3
+%elifdef WIN64
+    pop             rbx
+%endif
+    ret
+%undef p_coeff_level
+%undef p_run
+%undef p_level
+%undef i_total_zeros
+%undef r_mask
+%undef r_maskd
+%undef r_tmp
+%undef r_tmpd
+%undef r_tmpb
+%undef r_tmp2
+%undef r_tmp2d
+%undef p_shufb_lut
+%undef p_run_lut
--- a/test/build/win32/codec_ut/codec_unittest.vcproj
+++ b/test/build/win32/codec_ut/codec_unittest.vcproj
@ -390,6 +390,10 @@
 		<Filter
 			Name="encoder"
 			>
+			<File
+				RelativePath="..\..\..\encoder\EncUT_Cavlc.cpp"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\encoder\EncUT_DecodeMbAux.cpp"
 				>
--- a/test/encoder/EncUT_Cavlc.cpp
+++ b/test/encoder/EncUT_Cavlc.cpp
@ -0,0 +1,90 @@
+#include "cpu.h"
+#include "macros.h"
+#include "set_mb_syn_cavlc.h"
+#include <gtest/gtest.h>
+#include <cmath>
+#include <cstddef>
+
+using namespace WelsEnc;
+
+namespace {
+
+// Straightforward reference model that the implementations under test are compared against.
+// Walks pCoffLevel from iLastIndex down to index 0:
+//   - zeros at the top of that range are skipped and not counted,
+//   - every non-zero value is appended to pLevel,
+//   - the number of zeros directly below that value is appended to pRun.
+// Stores the number of non-zero values in *pTotalCoeff and returns the sum of all stored runs.
+int32_t CavlcParamCal_ref (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeff,
+                           int32_t iLastIndex) {
+  int32_t iTotalZeros = 0;
+  int32_t iTotalCoeffs = 0;
+
+  // Skip the zeros above the highest non-zero coefficient.
+  while (iLastIndex >= 0 && pCoffLevel[iLastIndex] == 0) {
+    -- iLastIndex;
+  }
+
+  // Here iLastIndex is either -1 or the index of a non-zero coefficient.
+  while (iLastIndex >= 0) {
+    int32_t iCountZero = 0;
+    pLevel[iTotalCoeffs] = pCoffLevel[iLastIndex--];
+
+    // Count the zeros between this coefficient and the next lower non-zero one.
+    while (iLastIndex >= 0 && pCoffLevel[iLastIndex] == 0) {
+      ++ iCountZero;
+      -- iLastIndex;
+    }
+    iTotalZeros += iCountZero;
+    pRun[iTotalCoeffs++] = iCountZero;
+  }
+  *pTotalCoeff = iTotalCoeffs;
+  return iTotalZeros;
+}
+
+// Runs func and the reference model on the same 16-coefficient block and checks that
+// both produce the same results.
+//   func       - implementation under test.
+//   endIdx     - last coefficient index handed to both implementations.
+//   allZero    - force every coefficient to zero.
+//   allNonZero - force every coefficient to a non-zero value.
+// When endIdx > 7 the coefficients above endIdx are zeroed; presumably this mirrors the
+// implementations' assumption that input beyond endIdx is zero (confirm against callers).
+void TestCavlcParamCalWithEndIdx (PCavlcParamCalFunc func, int endIdx, bool allZero, bool allNonZero) {
+  ENFORCE_STACK_ALIGN_1D(int16_t, coeffLevel, 16, 16);
+  ENFORCE_STACK_ALIGN_1D(int16_t, level, 16, 16);
+  ENFORCE_STACK_ALIGN_1D(uint8_t, run, 16, 16);
+  uint8_t run_ref[16];
+  int16_t level_ref[16];
+  int32_t totalCoeffs = 0;
+  int32_t totalCoeffs_ref = 0;
+  for (int i = 0; i < 16; i++) {
+    // std::rand() is only guaranteed to deliver 15 random bits (RAND_MAX may be 32767),
+    // but bits 0..16 are consumed below. With a 15-bit generator bit 16 is never set and
+    // every "random" block would be all-zero, so combine two draws into a 30-bit value.
+    const int rLow = std::rand() & 0x7FFF;
+    const int rHigh = std::rand() & 0x7FFF;
+    const int r = (rHigh << 15) | rLow;
+    if (allZero || (i > endIdx && endIdx > 7))
+      coeffLevel[i] = 0;
+    else if (allNonZero)
+      coeffLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF;
+    else
+      // Bit 16 decides zero / non-zero, bits 0..15 provide the value.
+      coeffLevel[i] = (r >> 16 & 1) * ((r & 0xFFFF) - 0x8000);
+  }
+  const int32_t totalZeros_ref = CavlcParamCal_ref (coeffLevel, run_ref, level_ref, &totalCoeffs_ref, endIdx);
+  const int32_t totalZeros = func (coeffLevel, run, level, &totalCoeffs, endIdx);
+  ASSERT_EQ (totalCoeffs, totalCoeffs_ref);
+  // The return value is only compared when at least one coefficient is non-zero.
+  if (totalCoeffs > 0)
+    ASSERT_EQ (totalZeros, totalZeros_ref);
+  for (int i = 0; i < totalCoeffs_ref; i++)
+    ASSERT_EQ (level[i], level_ref[i]);
+  // The run belonging to the last stored coefficient is not compared.
+  for (int i = 0; i < totalCoeffs_ref - 1; i++)
+    ASSERT_EQ (run[i], run_ref[i]);
+}
+
+// Exercises func for every tested endIdx. For each endIdx the first repetition uses an
+// all-zero block, the second an all-non-zero block, and the remaining ones random blocks.
+void TestCavlcParamCal (PCavlcParamCalFunc func) {
+  const int endIdxes[] = { 3, 14, 15 };
+  const int num_test_repetitions = 10000;
+  for (std::size_t i = 0; i < sizeof endIdxes / sizeof *endIdxes; i++) {
+    for (int count = 0; count < num_test_repetitions; count++)
+      TestCavlcParamCalWithEndIdx (func, endIdxes[i], count == 0, count == 1);
+  }
+}
+
+} // anon ns.
+
+// The C implementation is tested unconditionally.
+TEST (CavlcTest, CavlcParamCal_c) {
+  TestCavlcParamCal (CavlcParamCal_c);
+}
+
+// The SSE2 version is only tested when X86_32_ASM is defined.
+#ifdef X86_32_ASM
+TEST (CavlcTest, CavlcParamCal_sse2) {
+  TestCavlcParamCal (CavlcParamCal_sse2);
+}
+#endif
+
+// The SSE4.2 version is only exercised when the CPU reports SSE4.2 at run time.
+// On other CPUs this test passes without checking anything.
+#ifdef X86_ASM
+TEST (CavlcTest, CavlcParamCal_sse42) {
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
+    TestCavlcParamCal (CavlcParamCal_sse42);
+}
+#endif
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@ -1,5 +1,6 @@
 ENCODER_UNITTEST_SRCDIR=test/encoder
 ENCODER_UNITTEST_CPP_SRCS=\
+	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Cavlc.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\