From 3f31aff4dc5037cec1440f39aea8be54432a6aaa Mon Sep 17 00:00:00 2001
From: Sindre Aamås
Date: Tue, 19 Apr 2016 16:41:10 +0200
Subject: [PATCH] [Encoder] Add an SSE4.2 implementation of CavlcParamCal

Use a combination of table lookups and pshufb to convert coefficients
to zero run/level format. Two 16-entry lookup tables are used, for a
total of 192 bytes of tables. (The existing SSE2 version uses a table
of size 2048 bytes.)

Speedup is ~1.5x-3x compared with the SSE2 version on Haswell (the
speedup is greater for input with many trailing zeros).

The use of popcnt makes the routine require SSE4.2. The popcnt could
be replaced with a small LUT and accumulation, which would reduce the
requirement to SSSE3.
---
 codec/encoder/core/inc/set_mb_syn_cavlc.h   |   2 +
 codec/encoder/core/src/set_mb_syn_cavlc.cpp |   5 +
 codec/encoder/core/x86/coeff.asm            | 217 +++++++++++++++++++-
 test/encoder/EncUT_Cavlc.cpp                |   7 +
 4 files changed, 230 insertions(+), 1 deletion(-)

diff --git a/codec/encoder/core/inc/set_mb_syn_cavlc.h b/codec/encoder/core/inc/set_mb_syn_cavlc.h
index 4faffc2d..37735b9b 100644
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -80,6 +80,8 @@ int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, in
 #ifdef X86_ASM
 int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                             int32_t iEndIdx);
+int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
+                             int32_t iEndIdx);
 #endif
 
 #if defined(__cplusplus)
diff --git a/codec/encoder/core/src/set_mb_syn_cavlc.cpp b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
index 676a4cf0..c5d79edf 100644
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -278,6 +278,11 @@ void InitCoeffFunc (SWelsFuncPtrList* pFuncList, const uint32_t uiCpuFlag, int32
   if (uiCpuFlag & WELS_CPU_SSE2) {
     pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
   }
+#endif
+#ifdef X86_ASM
+  if (uiCpuFlag & WELS_CPU_SSE42) {
+    pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
+  }
 #endif
   if (iEntropyCodingModeFlag) {
     pFuncList->pfStashMBStatus = StashMBStatusCabac;
diff --git a/codec/encoder/core/x86/coeff.asm b/codec/encoder/core/x86/coeff.asm
index ccc9ded9..98d57b41 100644
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -42,10 +42,60 @@
 %include "asm_inc.asm"
 
+SECTION .rodata align=16
+
+align 16
+
+wels_shufb_rev:
+    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+; 4-bit table giving the number of preceding zeros for each set bit, as well
+; as for the next set bit to come. For the case where all 4 bits are set,
+; this requires 5 zeros. The 5th zero can either be read from beyond the
+; final table entry or implied by zero-initializing the location read into.
+wels_cavlc_param_cal_run_lut:
+    db 4, 0, 0, 0
+    db 0, 3, 0, 0
+    db 1, 2, 0, 0
+    db 0, 0, 2, 0
+    db 2, 1, 0, 0
+    db 0, 1, 1, 0
+    db 1, 0, 1, 0
+    db 0, 0, 0, 1
+    db 3, 0, 0, 0
+    db 0, 2, 0, 0
+    db 1, 1, 0, 0
+    db 0, 0, 1, 0
+    db 2, 0, 0, 0
+    db 0, 1, 0, 0
+    db 1, 0, 0, 0
+    db 0, 0, 0, 0
+; db 0
+; 4-bit table giving pshufb vectors that compact a 4-word vector by removing
+; the words that correspond to zero bits and concatenating in reverse order.
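+; For example, mask 0b0101 (where bit 0 maps to the highest-addressed word
+; of the quadruple) selects entry 5 below, 6, 7, 2, 3, ..., which gathers
+; word 3 and then word 1 into the low words of the destination.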
+wels_cavlc_param_cal_shufb_lut:
+    db 0, 0, 0, 0, 0, 0, 0, 0
+    db 6, 7, 0, 0, 0, 0, 0, 0
+    db 4, 5, 0, 0, 0, 0, 0, 0
+    db 6, 7, 4, 5, 0, 0, 0, 0
+    db 2, 3, 0, 0, 0, 0, 0, 0
+    db 6, 7, 2, 3, 0, 0, 0, 0
+    db 4, 5, 2, 3, 0, 0, 0, 0
+    db 6, 7, 4, 5, 2, 3, 0, 0
+    db 0, 1, 0, 0, 0, 0, 0, 0
+    db 6, 7, 0, 1, 0, 0, 0, 0
+    db 4, 5, 0, 1, 0, 0, 0, 0
+    db 6, 7, 4, 5, 0, 1, 0, 0
+    db 2, 3, 0, 1, 0, 0, 0, 0
+    db 6, 7, 2, 3, 0, 1, 0, 0
+    db 4, 5, 2, 3, 0, 1, 0, 0
+    db 6, 7, 4, 5, 2, 3, 0, 1
 
 %ifdef X86_32
 
-SECTION .rodata align=16
 
 align 16
 
 sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
@@ -312,12 +362,15 @@ byte_1pos_table:
         db 7,6,5,4,3,2,1,7, ;254
        db 7,6,5,4,3,2,1,8, ;255
 
+%endif ; X86_32
+
 ;***********************************************************************
 ; Code
 ;***********************************************************************
 
 SECTION .text
 
+%ifdef X86_32
 ;***********************************************************************
 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
@@ -457,3 +510,165 @@ WELS_EXTERN CavlcParamCal_sse2
     pop ebx
     ret
 %endif
+
+;***********************************************************************
+;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;***********************************************************************
+
+WELS_EXTERN CavlcParamCal_sse42
+%define p_coeff_level r0
+%define p_run r1
+%define p_level r2
+%define p_total_coeffs r3
+%define i_endidxd r4d
+
+%ifdef X86_32
+    push r5
+    push r6
+    %assign push_num 2
+    %define r_mask r5
+    %define r_maskd r5d
+    %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
+    %define p_run_lut wels_cavlc_param_cal_run_lut
+%elifdef WIN64
+    push rbx
+    %assign push_num 1
+    %define r_mask rbx
+    %define r_maskd ebx
+    %define p_shufb_lut r5
+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+    lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+%else
+    %assign push_num 0
+    %define r_mask rax
+    %define r_maskd eax
+    %define p_shufb_lut r5
+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+    lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+%endif
+
+    LOAD_5_PARA
+    PUSH_XMM 2
+
+    ; Free up rcx/ecx because only cl is accepted as a shift-amount operand.
+%ifidni r0b, cl
+    mov r6, r0
+    %undef p_coeff_level
+    %define p_coeff_level r6
+    %define r_tmp r0
+    %define r_tmpd r0d
+    %define r_tmpb r0b
+%elifidni r1b, cl
+    mov r6, r1
+    %undef p_run
+    %define p_run r6
+    %define r_tmp r1
+    %define r_tmpd r1d
+    %define r_tmpb r1b
+%elifidni r3b, cl
+    mov r6, r3
+    %undef p_total_coeffs
+    %define p_total_coeffs r6
+    %define r_tmp r3
+    %define r_tmpd r3d
+    %define r_tmpb r3b
+%else
+    %error "Unknown cl register."
+%endif
+
+    ; Acquire a bitmask indicating which words are non-zero.
+    ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes in size
+    ; if endIdx > 3. Otherwise, assume 8 bytes available. Assume that input
+    ; beyond endIdx is zero. These assumptions follow the previous implementations.
+    pxor xmm1, xmm1
+    cmp i_endidxd, 3
+    jg .load16
+    movq xmm0, [p_coeff_level]
+    packsswb xmm0, xmm1
+    jmp .load_done
+.load16:
+    movdqa xmm0, [p_coeff_level]
+    packsswb xmm0, [p_coeff_level + 16]
+.load_done:
+    movdqa [p_run], xmm1 ; Zero-initialize because we may read back implied zeros.
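+    ; Build the non-zero mask: compare bytes against zero, reverse the byte
+    ; order, and invert, so that bit i of the mask is set iff coefficient
+    ; 15 - i is non-zero (bit 0 corresponds to the last coefficient).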
+    pcmpeqb xmm0, xmm1
+    pshufb xmm0, [wels_shufb_rev]
+    pmovmskb r_maskd, xmm0
+    xor r_maskd, 0FFFFh
+    mov r_tmpd, i_endidxd
+%undef i_endidxd
+%define r_tmp2 r4
+%define r_tmp2d r4d
+    popcnt r_tmp2d, r_maskd
+    mov [p_total_coeffs], r_tmp2d
+%xdefine i_total_zeros p_total_coeffs
+%undef p_total_coeffs
+    mov i_total_zeros, r_tmp2
+    jz .done ; ZF is from the popcnt above; mov does not affect flags.
+    mov i_total_zeros, 16
+    sub i_total_zeros, r_tmp2
+    bsf r_tmpd, r_maskd ; Find the first set bit.
+    sub i_total_zeros, r_tmp
+    ; Skip trailing zeros.
+    ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
+    and r_tmpd, -4
+    shr r_maskd, r_tmpb
+    add r_tmpd, r_tmpd
+    sub p_coeff_level, r_tmp
+    ; Handle the first quadruple containing a non-zero value.
+    mov r_tmp, r_mask
+    and r_tmpd, 0Fh
+    movq xmm0, [p_coeff_level + 24]
+    movq xmm1, [p_shufb_lut + 8 * r_tmp]
+    pshufb xmm0, xmm1
+    mov r_tmp2d, [p_run_lut + 4 * r_tmp]
+    shr r_tmp2d, 8 ; Skip the initial zero run.
+    movlps [p_level], xmm0 ; Store levels for the first quadruple.
+    mov [p_run], r_tmp2d ; Store the accompanying zero runs thus far.
+    shr r_maskd, 4
+    jz .done
+.loop:
+    ; Increment pointers.
+    popcnt r_tmpd, r_tmpd ; Number of non-zero values handled in the previous quadruple.
+    lea p_level, [p_level + 2 * r_tmp]
+    add p_run, r_tmp
+    ; Handle the next quadruple.
+    mov r_tmp, r_mask
+    and r_tmpd, 0Fh
+    movq xmm0, [p_coeff_level + 16]
+    sub p_coeff_level, 8
+    movq xmm1, [p_shufb_lut + 8 * r_tmp]
+    pshufb xmm0, xmm1
+    movzx r_tmp2d, byte [p_run - 1]
+    add r_tmp2d, [p_run_lut + 4 * r_tmp] ; Add to the previous run and get any new runs.
+    movlps [p_level], xmm0 ; Store levels (potentially none).
+    mov [p_run - 1], r_tmp2d ; Update the previous run and store any new runs.
+    shr r_maskd, 4
+    jnz .loop
+.done:
+    mov retrq, i_total_zeros
+    POP_XMM
+    LOAD_5_PARA_POP
+%ifdef X86_32
+    pop r6
+    pop r5
+%elifdef WIN64
+    pop rbx
+%endif
+    ret
+%undef p_coeff_level
+%undef p_run
+%undef p_level
+%undef i_total_zeros
+%undef r_mask
+%undef r_maskd
+%undef r_tmp
+%undef r_tmpd
+%undef r_tmpb
+%undef r_tmp2
+%undef r_tmp2d
+%undef p_shufb_lut
+%undef p_run_lut
diff --git a/test/encoder/EncUT_Cavlc.cpp b/test/encoder/EncUT_Cavlc.cpp
index 04837d7f..5cf4158d 100644
--- a/test/encoder/EncUT_Cavlc.cpp
+++ b/test/encoder/EncUT_Cavlc.cpp
@@ -81,3 +81,10 @@ TEST (CavlcTest, CavlcParamCal_sse2) {
     TestCavlcParamCal (CavlcParamCal_sse2);
 }
 #endif
+
+#ifdef X86_ASM
+TEST (CavlcTest, CavlcParamCal_sse42) {
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
+    TestCavlcParamCal (CavlcParamCal_sse42);
+}
+#endif
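
For reference, the zero run/level conversion that both SIMD routines perform can be
sketched in scalar C. This is a minimal illustration of the semantics described in the
asm comments (levels emitted from the last non-zero coefficient backwards, each with
the run of zeros immediately preceding it in scan order, and the number of zeros in
front of the last non-zero coefficient as the return value); the function name is
hypothetical and this is not the project's actual CavlcParamCal_c:

#include <stdint.h>

/* Hypothetical scalar reference, for illustration only. */
static int32_t CavlcParamCalSketch (const int16_t* pCoffLevel, uint8_t* pRun,
                                    int16_t* pLevel, int32_t* pTotalCoeffs,
                                    int32_t iEndIdx) {
  int32_t iTotalZeros  = 0;
  int32_t iTotalCoeffs = 0;
  while (iEndIdx >= 0 && pCoffLevel[iEndIdx] == 0)
    --iEndIdx;                                     /* skip trailing zeros */
  while (iEndIdx >= 0) {
    int32_t iRun = 0;
    pLevel[iTotalCoeffs] = pCoffLevel[iEndIdx--];  /* levels in reverse order */
    while (iEndIdx >= 0 && pCoffLevel[iEndIdx] == 0) {
      ++iRun;                                      /* zeros preceding this level */
      --iEndIdx;
    }
    pRun[iTotalCoeffs++] = (uint8_t) iRun;
    iTotalZeros += iRun;
  }
  *pTotalCoeffs = iTotalCoeffs;
  return iTotalZeros;
}

The SSE4.2 version replaces the inner scans with the 4-bit mask lookups: for each
quadruple of coefficients, the shufb table compacts up to four levels at once, while
the run table supplies the corresponding zero runs, the first byte of each entry being
accumulated into the last run of the preceding quadruple.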