From 48a520915aba79d7ade9362bd138ae2992a4ba26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sindre=20Aam=C3=A5s?= <saamas@cisco.com>
Date: Mon, 7 Mar 2016 10:02:44 +0100
Subject: [PATCH] [Encoder/x86] Add AVX2 SATD routines

WelsSampleSatd16x16_avx2 (~2.31x speedup over SSE4.1 on Haswell).
WelsSampleSatd16x8_avx2  (~2.19x speedup over SSE4.1 on Haswell).
WelsSampleSatd8x16_avx2  (~1.68x speedup over SSE4.1 on Haswell).
WelsSampleSatd8x8_avx2   (~1.53x speedup over SSE4.1 on Haswell).
---
 codec/common/x86/asm_inc.asm      |   5 +
 codec/common/x86/satd_sad.asm     | 231 ++++++++++++++++++++++++++++++
 codec/encoder/core/inc/sample.h   |   5 +
 codec/encoder/core/src/sample.cpp |   7 +
 test/encoder/EncUT_Sample.cpp     |   5 +
 5 files changed, 253 insertions(+)

diff --git a/codec/common/x86/asm_inc.asm b/codec/common/x86/asm_inc.asm
index adcc0aae..b41996ec 100644
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -647,6 +647,11 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
     packuswb %1,%1
 %endmacro
 
+%macro WELS_DW1_VEX 1
+    vpcmpeqw %1, %1, %1                     ; a register always equals itself -> every bit set
+    vpsrlw   %1, %1, 15                     ; keep only the top bit of each word -> every 16-bit word = 1
+%endmacro
+
 %macro WELS_DW32_VEX 1
     vpcmpeqw %1, %1, %1
     vpsrlw   %1, %1, 15
diff --git a/codec/common/x86/satd_sad.asm b/codec/common/x86/satd_sad.asm
index dd2a22fa..d325cef8 100644
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -1498,6 +1498,237 @@ loop_get_satd_16x16_right:
 ;
 ;***********************************************************************
 
+;***********************************************************************
+;
+;Pixel_satd_wxh_avx2 BEGIN
+;
+;***********************************************************************
+
+; Difference of one 16-pixel row of srcA and srcB after the first horizontal stage: out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5
+%macro AVX2_LoadDiffSatd16x1 5
+    vbroadcasti128   %1, [%2]               ; 16 bytes of srcA copied into both 128-bit lanes
+    vpmaddubsw       %1, %1, %4             ; hadamard neighboring horizontal sums and differences
+    vbroadcasti128   %5, [%3]               ; 16 bytes of srcB, same layout, in the clobber register
+    vpmaddubsw       %5, %5, %4             ; hadamard neighboring horizontal sums and differences
+    vpsubw           %1, %1, %5             ; diff srcA srcB
+%endmacro
+
+; Two 8-pixel row differences, one row per 128-bit lane: out=%1 pSrcA=%2 pSrcA+4*iStride=%3 pSrcB=%4 pSrcB+4*iStride=%5 HSumSubDB1_128x2=%6 ymm_clobber=%7,%8
+%macro AVX2_LoadDiffSatd8x2 8
+    vpbroadcastq     %1, [%2]               ; 8 bytes of the first srcA row replicated into every qword
+    vpbroadcastq     %7, [%3]               ; 8 bytes of the second srcA row, likewise
+    vpblendd         %1, %1, %7, 11110000b  ; low lane keeps the first row, high lane takes the second
+    vpmaddubsw       %1, %1, %6             ; hadamard neighboring horizontal sums and differences
+    vpbroadcastq     %7, [%4]               ; first srcB row
+    vpbroadcastq     %8, [%5]               ; second srcB row
+    vpblendd         %7, %7, %8, 11110000b  ; same lane layout as for srcA
+    vpmaddubsw       %7, %7, %6             ; hadamard neighboring horizontal sums and differences
+    vpsubw           %1, %1, %7             ; diff srcA srcB
+%endmacro
+
+; In-place 4-point Hadamard butterfly across four registers (x0..x3 in, y0..y3 out, word-wise): in/out=%1,%2,%3,%4 clobber=%5
+%macro AVX2_HDMFour4x4 5
+    vpsubw           %5, %1, %4             ; s3 = x0 - x3
+    vpaddw           %1, %1, %4             ; s0 = x0 + x3
+    vpsubw           %4, %2, %3             ; s2 = x1 - x2
+    vpaddw           %2, %2, %3             ; s1 = x1 + x2
+    vpsubw           %3, %1, %2             ; y2 = s0 - s1
+    vpaddw           %1, %1, %2             ; y0 = s0 + s1
+    vpaddw           %2, %5, %4             ; y1 = s3 + s2
+    vpsubw           %4, %5, %4             ; y3 = s3 - s2
+%endmacro
+
+; Butterfly four row registers, then fold their absolute values into one register of word partial sums: out=%1 in=%1,%2,%3,%4 clobber=%5
+%macro AVX2_SatdFour4x4 5
+    AVX2_HDMFour4x4  %1, %2, %3, %4, %5
+    vpabsw           %1, %1                 ; absolute value of every butterfly output word
+    vpabsw           %2, %2
+    vpabsw           %3, %3
+    vpabsw           %4, %4
+    ; second stage of horizontal hadamard.
+    ; utilizes that |a + b| + |a - b| = 2 * max(|a|, |b|)
+    vpblendw         %5, %1, %2, 10101010b  ; even words from the first register, odd words from the second
+    vpslld           %2, %2, 16             ; second register: even words moved up into the odd slots
+    vpsrld           %1, %1, 16             ; first register: odd words moved down into the even slots
+    vpor             %2, %2, %1             ; the in-dword neighbour of every word chosen by the blend
+    vpmaxuw          %2, %2, %5             ; max of each adjacent word pair (half of the sum in the identity above)
+    vpblendw         %5, %3, %4, 10101010b  ; same pairing for the third and fourth registers
+    vpslld           %4, %4, 16
+    vpsrld           %3, %3, 16
+    vpor             %4, %4, %3
+    vpmaxuw          %3, %5, %4
+    vpaddw           %1, %2, %3             ; add both sets of maxima, still one sum per word
+%endmacro
+
+; Word partial sums for a strip of 4 rows x 16 pixels: out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_256=%8 ymm_clobber=%9,%10,%11,%12
+%macro AVX2_GetSatd16x4 12
+    AVX2_LoadDiffSatd16x1  %1, %2 + 0 * %3, %5 + 0 * %6, %8, %12   ; row 0
+    AVX2_LoadDiffSatd16x1  %9, %2 + 1 * %3, %5 + 1 * %6, %8, %12   ; row 1
+    AVX2_LoadDiffSatd16x1 %10, %2 + 2 * %3, %5 + 2 * %6, %8, %12   ; row 2
+    AVX2_LoadDiffSatd16x1 %11, %2 + 1 * %4, %5 + 1 * %7, %8, %12   ; row 3, addressed through the precomputed triple stride
+    AVX2_SatdFour4x4 %1, %9, %10, %11, %12                         ; registers passed in row order; pointers are left untouched
+%endmacro
+
+; Word partial sums for 8 rows x 8 pixels; NOTE both source pointers come back advanced by one row: out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_128x2=%8 ymm_clobber=%9,%10,%11,%12,%13
+%macro AVX2_GetSatd8x8 13
+    AVX2_LoadDiffSatd8x2  %1, %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13   ; rows 0 and 4
+    AVX2_LoadDiffSatd8x2 %10, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13   ; rows 2 and 6 (twice the triple stride)
+    add              %2, %3                 ; step srcA down one row so the same offsets reach the odd rows
+    add              %5, %6                 ; same for srcB
+    AVX2_LoadDiffSatd8x2  %9, %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13   ; rows 1 and 5
+    AVX2_LoadDiffSatd8x2 %11, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13   ; rows 3 and 7
+    AVX2_SatdFour4x4 %1, %9, %10, %11, %12                                                      ; row pairs in order (0,4) (1,5) (2,6) (3,7)
+%endmacro
+
+; Add up all sixteen words of a ymm register into a 32-bit destination; pass register names without the x/y prefix (e.g. mm6): d_out=%1 mm_in=%2 mm_clobber=%3
+%macro AVX2_SumWHorizon 3
+    WELS_DW1_VEX     y%3                    ; clobber register = 1 in every word
+    vpmaddwd         y%2, y%2, y%3          ; multiply by 1 and add adjacent words -> eight dword sums (words read as signed)
+    vextracti128     x%3, y%2, 1            ; upper 128-bit lane
+    vpaddd           x%2, x%2, x%3          ; four dword sums
+    vpunpckhqdq      x%3, x%2, x%2          ; upper qword brought down
+    vpaddd           x%2, x%2, x%3          ; two dword sums
+    vpsrldq          x%3, x%2, 4            ; second dword brought down
+    vpaddd           x%2, x%2, x%3          ; total now in the low dword
+    vmovd            %1, x%2                ; hand the total to the destination
+%endmacro
+
+;***********************************************************************
+; SATD of an 8x16 block: sets a loop count of 2 and jumps into the 8x8 loop.
+;int32_t WelsSampleSatd8x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd8x16_avx2
+    %assign push_num 0
+%ifdef X86_32
+    push r4                        ; popped again at the end of WelsSampleSatd8x8N_avx2
+    %assign push_num 1
+%endif
+    mov r4, 2                      ; loop cnt: two 8-row passes
+    jmp WelsSampleSatd8x8N_avx2    ; shared body sits inside WelsSampleSatd8x8_avx2
+
+;***********************************************************************
+; SATD of an 8x8 block; WelsSampleSatd8x8N_avx2 below is the loop shared with the 8x16 entry.
+;int32_t WelsSampleSatd8x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd8x8_avx2
+    %assign push_num 0
+%ifdef X86_32
+    push           r4              ; popped again in the epilogue below
+    %assign push_num 1
+%endif
+    mov            r4, 1           ; loop cnt
+                                   ; fall through
+WelsSampleSatd8x8N_avx2:           ; in: r4 = number of 8-row passes (and, on X86_32, r4 already pushed)
+%ifdef X86_32
+    push           r5
+    push           r6
+    %assign push_num push_num+2
+%endif
+    LOAD_4_PARA                    ; macro not shown here; presumably brings the four arguments into r0-r3 - confirm in asm_inc.asm
+    PUSH_XMM 8                     ; paired with POP_XMM below; ymm0-ymm7 are used in this routine
+    SIGN_EXTENSION r1, r1d         ; r1 is handed to the macro below as iStrideA
+    SIGN_EXTENSION r3, r3d         ; r3 is handed to the macro below as iStrideB
+
+    vbroadcasti128 ymm7, [HSumSubDB1]   ; the same 16-byte constant in both 128-bit lanes
+    lea            r5, [3 * r1]         ; 3 * iStrideA
+    lea            r6, [3 * r3]         ; 3 * iStrideB
+    vpxor          ymm6, ymm6, ymm6     ; word accumulator = 0
+.loop:
+    AVX2_GetSatd8x8 ymm0, r0, r1, r5, r2, r3, r6, ymm7, ymm1, ymm2, ymm3, ymm4, ymm5   ; leaves r0/r2 one row further down
+    vpaddw         ymm6, ymm6, ymm0     ; accumulate the per-word partial sums
+    sub            r4, 1
+    jbe            .loop_end            ; leave once the pass count is used up
+    add            r0, r5               ; 1 row (inside the macro) + 3 rows ...
+    add            r2, r6
+    lea            r0, [r0 + 4 * r1]    ; ... + 4 rows = start of the next 8 rows
+    lea            r2, [r2 + 4 * r3]
+    jmp            .loop
+.loop_end:
+    AVX2_SumWHorizon retrd, mm6, mm5    ; add up all words of ymm6 into retrd (ymm5 is scratch)
+    vzeroupper                          ; zero the upper ymm halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+%ifdef X86_32
+    pop            r6                   ; reverse of the push order: r4, r5, r6
+    pop            r5
+    pop            r4
+%endif
+    ret
+
+;***********************************************************************
+; SATD of a 16x16 block: sets a loop count of 4 and jumps into the 16x4 loop.
+;int32_t WelsSampleSatd16x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_avx2
+    %assign push_num 0
+%ifdef X86_32
+    push r4                        ; popped again at the end of WelsSampleSatd16x4N_avx2
+    %assign push_num 1
+%endif
+    mov r4, 4                      ; loop cnt: four 4-row strips
+    jmp WelsSampleSatd16x4N_avx2   ; shared body sits inside WelsSampleSatd16x8_avx2
+
+;***********************************************************************
+; SATD of a 16x8 block; WelsSampleSatd16x4N_avx2 below is the loop shared with the 16x16 entry.
+;int32_t WelsSampleSatd16x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x8_avx2
+    %assign push_num 0
+%ifdef X86_32
+    push r4                        ; popped again in the epilogue below
+    %assign push_num 1
+%endif
+    mov r4, 2                      ; loop cnt: two 4-row strips
+                                   ; fall through
+WelsSampleSatd16x4N_avx2:          ; in: r4 = number of 4-row strips (and, on X86_32, r4 already pushed)
+%ifdef X86_32
+    push r5
+    push r6
+    %assign push_num push_num+2
+%endif
+    LOAD_4_PARA                    ; macro not shown here; presumably brings the four arguments into r0-r3 - confirm in asm_inc.asm
+    PUSH_XMM 7                     ; paired with POP_XMM below; ymm0-ymm6 are used in this routine
+    SIGN_EXTENSION r1, r1d         ; r1 is handed to the macro below as iStrideA
+    SIGN_EXTENSION r3, r3d         ; r3 is handed to the macro below as iStrideB
+
+    vpbroadcastq xmm0, [HSumSubDB1]               ; first 8 bytes of the constant in both qwords of xmm0
+    vpbroadcastq ymm6, [HSumSubDB1 + 8]           ; second 8 bytes of the constant in all four qwords
+    vpblendd     ymm6, ymm0, ymm6, 11110000b      ; low lane = first half, high lane = second half
+    lea          r5, [3 * r1]                     ; 3 * iStrideA
+    lea          r6, [3 * r3]                     ; 3 * iStrideB
+    vpxor        ymm5, ymm5, ymm5                 ; word accumulator = 0
+.loop:
+    AVX2_GetSatd16x4 ymm0, r0, r1, r5, r2, r3, r6, ymm6, ymm1, ymm2, ymm3, ymm4   ; overwrites ymm0; pointers unchanged
+    vpaddw       ymm5, ymm5, ymm0                 ; accumulate the per-word partial sums
+    lea          r0, [r0 + 4 * r1]                ; next 4 rows of srcA
+    lea          r2, [r2 + 4 * r3]                ; next 4 rows of srcB
+    sub          r4, 1
+    ja           .loop                            ; repeat while strips remain
+    AVX2_SumWHorizon retrd, mm5, mm0              ; add up all words of ymm5 into retrd (ymm0 is scratch)
+    vzeroupper                                    ; zero the upper ymm halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+%ifdef X86_32
+    pop r6                                        ; reverse of the push order: r4, r5, r6
+    pop r5
+    pop r4
+%endif
+    ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_avx2 END
+;
+;***********************************************************************
+
 ;***********************************************************************
 ;
 ;Pixel_sad_wxh_sse2 BEGIN
diff --git a/codec/encoder/core/inc/sample.h b/codec/encoder/core/inc/sample.h
index 31a04fb5..7406e2fd 100644
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -82,6 +82,11 @@ int32_t WelsIntra16x16Combined3Sad_ssse3 (uint8_t*, int32_t, uint8_t*, int32_t,
 int32_t WelsIntraChroma8x8Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
     uint8_t*, uint8_t*);
 
+int32_t WelsSampleSatd8x8_avx2 (uint8_t*, int32_t, uint8_t*, int32_t);   // 8x8 SATD; asm in codec/common/x86/satd_sad.asm
+int32_t WelsSampleSatd8x16_avx2 (uint8_t*, int32_t, uint8_t*, int32_t);  // 8x16 SATD; asm in codec/common/x86/satd_sad.asm
+int32_t WelsSampleSatd16x8_avx2 (uint8_t*, int32_t, uint8_t*, int32_t);  // 16x8 SATD; asm in codec/common/x86/satd_sad.asm
+int32_t WelsSampleSatd16x16_avx2 (uint8_t*, int32_t, uint8_t*, int32_t); // 16x16 SATD; asm in codec/common/x86/satd_sad.asm
+
 #endif//X86_ASM
 
 #if defined (HAVE_NEON)
diff --git a/codec/encoder/core/src/sample.cpp b/codec/encoder/core/src/sample.cpp
index 6906dac1..75a01f12 100644
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -405,6 +405,13 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
     pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
   }
 
+  if (uiCpuFlag & WELS_CPU_AVX2) {
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_avx2;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8]  = WelsSampleSatd16x8_avx2;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16]  = WelsSampleSatd8x16_avx2;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8]   = WelsSampleSatd8x8_avx2;
+  }
+
 #endif //(X86_ASM)
 
 #if defined (HAVE_NEON)
diff --git a/test/encoder/EncUT_Sample.cpp b/test/encoder/EncUT_Sample.cpp
index 492358c7..66bc76d3 100644
--- a/test/encoder/EncUT_Sample.cpp
+++ b/test/encoder/EncUT_Sample.cpp
@@ -635,6 +635,11 @@ GENERATE_Sad8x8_UT (WelsSampleSatd8x8_sse41, WelsSampleSatd8x8_c, WELS_CPU_SSE41
 GENERATE_Sad8x16_UT (WelsSampleSatd8x16_sse41, WelsSampleSatd8x16_c, WELS_CPU_SSE41)
 GENERATE_Sad16x8_UT (WelsSampleSatd16x8_sse41, WelsSampleSatd16x8_c, WELS_CPU_SSE41)
 GENERATE_Sad16x16_UT (WelsSampleSatd16x16_sse41, WelsSampleSatd16x16_c, WELS_CPU_SSE41)
+
+GENERATE_Sad8x8_UT (WelsSampleSatd8x8_avx2, WelsSampleSatd8x8_c, WELS_CPU_AVX2)
+GENERATE_Sad8x16_UT (WelsSampleSatd8x16_avx2, WelsSampleSatd8x16_c, WELS_CPU_AVX2)
+GENERATE_Sad16x8_UT (WelsSampleSatd16x8_avx2, WelsSampleSatd16x8_c, WELS_CPU_AVX2)
+GENERATE_Sad16x16_UT (WelsSampleSatd16x16_avx2, WelsSampleSatd16x16_c, WELS_CPU_AVX2)
 #endif
 
 #ifdef HAVE_NEON