From 7d00e8bc42247ebbb3ae5bbd8e4274bf6f489290 Mon Sep 17 00:00:00 2001
From: Guangwei Wang <guangwwa@cisco.com>
Date: Fri, 15 Jul 2016 12:15:57 +0800
Subject: [PATCH] Add option to enable/disable AVX2

---
 build/arch.mk                                      | 14 ++++++++++++++
 codec/common/inc/cpu_core.h                        |  7 ++++++-
 codec/common/x86/dct.asm                           |  3 +++
 codec/common/x86/satd_sad.asm                      |  3 +++
 codec/decoder/core/inc/decode_mb_aux.h             |  2 ++
 codec/decoder/core/src/decoder.cpp                 |  3 +++
 codec/encoder/core/x86/quant.asm                   |  3 +++
 codec/processing/src/downsample/downsample.cpp     |  2 ++
 codec/processing/src/downsample/downsample.h       |  4 ++++
 .../processing/src/downsample/downsamplefuncs.cpp  |  2 ++
 codec/processing/src/x86/downsample_bilinear.asm   |  3 +++
 codec/processing/src/x86/vaa.asm                   |  4 ++++
 test/decoder/DecUT_IdctResAddPred.cpp              |  4 ++++
 test/processing/ProcessUT_DownSample.cpp           |  3 +++
 14 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/build/arch.mk b/build/arch.mk
index 846b69a0..e9eb2539 100644
--- a/build/arch.mk
+++ b/build/arch.mk
@@ -1,6 +1,18 @@
+# for x86: -DHAVE_AVX2 is passed only when USE_ASM is Yes and HAVE_AVX2 is true
+HAVE_AVX2 := true
+
 ifneq ($(filter %86 x86_64, $(ARCH)),)
 include $(SRC_PATH)build/x86-common.mk
+ifeq ($(USE_ASM), Yes)
+ifeq ($(HAVE_AVX2), true)
+CFLAGS += -DHAVE_AVX2
+CXXFLAGS += -DHAVE_AVX2
+ASMFLAGS += -DHAVE_AVX2
 endif
+endif
+endif
+
+#for arm
 ifneq ($(filter-out arm64, $(filter arm%, $(ARCH))),)
 ifeq ($(USE_ASM), Yes)
 ASM_ARCH = arm
@@ -8,6 +20,8 @@ ASMFLAGS += -I$(SRC_PATH)codec/common/arm/
 CFLAGS += -DHAVE_NEON
 endif
 endif
+
+#for arm64
 ifneq ($(filter arm64 aarch64, $(ARCH)),)
 ifeq ($(USE_ASM), Yes)
 ASM_ARCH = arm64
diff --git a/codec/common/inc/cpu_core.h b/codec/common/inc/cpu_core.h
index 2b30010f..8d96604a 100644
--- a/codec/common/inc/cpu_core.h
+++ b/codec/common/inc/cpu_core.h
@@ -56,7 +56,6 @@
 #define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
 
 /* CPU features application extensive */
-#define WELS_CPU_AVX        0x00000800  /* Advanced Vector eXtentions */
 #define WELS_CPU_FPU        0x00001000  /* x87-FPU on chip */
 #define WELS_CPU_HTT        0x00002000  /* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
                                            physical processor package is capable of supporting more than one logic processor
@@ -67,7 +66,13 @@
 #define WELS_CPU_MOVBE      0x00008000  /* MOVBE instruction */
 #define WELS_CPU_AES        0x00010000  /* AES instruction extensions */
 #define WELS_CPU_FMA        0x00020000  /* AVX VEX FMA instruction sets */
+#define WELS_CPU_AVX        0x00000800  /* Advanced Vector eXtensions */
+
+#ifdef HAVE_AVX2
 #define WELS_CPU_AVX2       0x00040000  /* AVX2 */
+#else
+#define WELS_CPU_AVX2       0x00000000  /* built without HAVE_AVX2: zero mask, so (flags & WELS_CPU_AVX2) is never true */
+#endif
 
 #define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
 #define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
diff --git a/codec/common/x86/dct.asm b/codec/common/x86/dct.asm
index 1941a726..dd97197b 100644
--- a/codec/common/x86/dct.asm
+++ b/codec/common/x86/dct.asm
@@ -678,6 +678,7 @@ WELS_EXTERN WelsIDctRecI16x16Dc_sse2
 ; AVX2 functions
 ;***********************************************************************
 
+%ifdef HAVE_AVX2
 ; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8
 %macro AVX2_LoadDiff16P 8
     vmovq         x%1, [%2         ]
@@ -1011,3 +1012,5 @@ WELS_EXTERN WelsIDctT4Rec_avx2
     POP_XMM
     LOAD_5_PARA_POP
     ret
+%endif
+
diff --git a/codec/common/x86/satd_sad.asm b/codec/common/x86/satd_sad.asm
index d325cef8..bd280874 100644
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -1504,6 +1504,7 @@ loop_get_satd_16x16_right:
 ;
 ;***********************************************************************
 
+%ifdef HAVE_AVX2
 ; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5
 %macro AVX2_LoadDiffSatd16x1 5
     vbroadcasti128   %1, [%2]
@@ -1723,6 +1724,8 @@ WelsSampleSatd16x4N_avx2:
 %endif
     ret
 
+%endif
+
 ;***********************************************************************
 ;
 ;Pixel_satd_wxh_avx2 END
diff --git a/codec/decoder/core/inc/decode_mb_aux.h b/codec/decoder/core/inc/decode_mb_aux.h
index 6a438ef5..e991f6e6 100644
--- a/codec/decoder/core/inc/decode_mb_aux.h
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -48,8 +48,10 @@ extern "C" {
 #if defined(X86_ASM)
 void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
 void IdctResAddPred_sse2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+#if defined(HAVE_AVX2)
 void IdctResAddPred_avx2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
 void IdctFourResAddPred_avx2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
+#endif
 #endif//X86_ASM
 
 #if defined(HAVE_NEON)
diff --git a/codec/decoder/core/src/decoder.cpp b/codec/decoder/core/src/decoder.cpp
index eeed6bb1..e8092396 100644
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -1005,11 +1005,14 @@ void InitPredFunc (PWelsDecoderContext pCtx, uint32_t uiCpuFlag) {
     pCtx->pGetIChromaPredFunc[C_PRED_DC_T]    = WelsDecoderIChromaPredDcTop_sse2;
     pCtx->pGetI4x4LumaPredFunc[I4_PRED_H]     = WelsDecoderI4x4LumaPredH_sse2;
   }
+#if defined(HAVE_AVX2)
   if (uiCpuFlag & WELS_CPU_AVX2) {
     pCtx->pIdctResAddPredFunc     = IdctResAddPred_avx2;
     pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_avx2;
   }
 #endif
+
+#endif
 }
 
 //reset decoder number related statistics info
diff --git a/codec/encoder/core/x86/quant.asm b/codec/encoder/core/x86/quant.asm
index b8d3fa8b..21b56b53 100644
--- a/codec/encoder/core/x86/quant.asm
+++ b/codec/encoder/core/x86/quant.asm
@@ -370,6 +370,7 @@ WELS_EXTERN WelsDequantIHadamard4x4_sse2
     ret
 
 
+%ifdef HAVE_AVX2
 ; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5
 %macro AVX2_Quant 5
     vpabsw          %2, %1
@@ -502,3 +503,5 @@ WELS_EXTERN WelsQuantFour4x4Max_avx2
     POP_XMM
     LOAD_4_PARA_POP
     ret
+%endif
+
diff --git a/codec/processing/src/downsample/downsample.cpp b/codec/processing/src/downsample/downsample.cpp
index 829de4c4..a0f26ae2 100644
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -107,10 +107,12 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int
     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;
     sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsamplerWrap_sse41;
   }
+#ifdef HAVE_AVX2
   if (iCpuFlag & WELS_CPU_AVX2) {
     sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_avx2;
     sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_avx2;
   }
+#endif
 #endif//X86_ASM
 
 #if defined(HAVE_NEON)
diff --git a/codec/processing/src/downsample/downsample.h b/codec/processing/src/downsample/downsample.h
index dd30ec52..ca5c4bd4 100644
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -99,8 +99,10 @@ GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
+#ifdef HAVE_AVX2
 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2;
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_avx2;
+#endif
 
 SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_ssse3;
 SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_sse4;
@@ -120,12 +122,14 @@ void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, in
 void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
     int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
     uint32_t uiScaleY);
+#ifdef HAVE_AVX2
 void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
     int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
     uint32_t uiScaleY);
 void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
     int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
     uint32_t uiScaleY);
+#endif
 
 WELSVP_EXTERN_C_END
 #endif
diff --git a/codec/processing/src/downsample/downsamplefuncs.cpp b/codec/processing/src/downsample/downsamplefuncs.cpp
index 9e92849a..ed5e7572 100644
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -284,8 +284,10 @@ DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
 DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
 DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
 DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
+#ifdef HAVE_AVX2
 DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
 DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
+#endif
 #endif //X86_ASM
 
 #ifdef HAVE_NEON
diff --git a/codec/processing/src/x86/downsample_bilinear.asm b/codec/processing/src/x86/downsample_bilinear.asm
index 7df67280..b39608d5 100644
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -3254,6 +3254,7 @@ WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
 %undef xmm_xfrac1_begin
 %undef xmm_xfrac_inc
 
+%ifdef HAVE_AVX2
 ; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
 %macro AVX2_BilinearIncXposuw 5
     vpaddusw        %5, %2, %4
@@ -4552,3 +4553,5 @@ WELS_EXTERN GeneralBilinearAccurateDownsampler_avx2
 %undef ymm_xfrac0_begin
 %undef ymm_xfrac1_begin
 %undef ymm_xfrac_inc
+%endif
+
diff --git a/codec/processing/src/x86/vaa.asm b/codec/processing/src/x86/vaa.asm
index e2b49e31..9b728576 100644
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -2088,6 +2088,7 @@ sqdiff_bgd_width_loop:
     %assign push_num push_num - stack_alloc_num
 %endmacro
 
+%ifdef HAVE_AVX2
 ; Max unsigned byte per quadword
 ; out=%1 in=%2 tmp=%3
 %macro AVX2_Maxubq 3
@@ -3557,3 +3558,6 @@ WELS_EXTERN VAACalcSadSsdBgd_avx2
 %undef           p_sd8x8
 %undef           p_mad8x8
     ret
+
+%endif
+
diff --git a/test/decoder/DecUT_IdctResAddPred.cpp b/test/decoder/DecUT_IdctResAddPred.cpp
index 7a053839..c9f33b49 100644
--- a/test/decoder/DecUT_IdctResAddPred.cpp
+++ b/test/decoder/DecUT_IdctResAddPred.cpp
@@ -53,6 +53,7 @@ void SetNonZeroCount_ref (int8_t* pNonZeroCount) {
 }
 
 #if defined(X86_ASM)
+#if defined(HAVE_AVX2)
 void IdctFourResAddPred_ref (uint8_t* pPred, int32_t iStride, int16_t* pRs) {
   IdctResAddPred_ref (pPred + 0 * iStride + 0, iStride, pRs + 0 * 16);
   IdctResAddPred_ref (pPred + 0 * iStride + 4, iStride, pRs + 1 * 16);
@@ -60,6 +61,7 @@ void IdctFourResAddPred_ref (uint8_t* pPred, int32_t iStride, int16_t* pRs) {
   IdctResAddPred_ref (pPred + 4 * iStride + 4, iStride, pRs + 3 * 16);
 }
 #endif
+#endif
 
 } // anon ns
 
@@ -138,9 +140,11 @@ GENERATE_IDCTRESADDPRED (IdctResAddPred_c, 0)
 #if defined(X86_ASM)
 GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx, WELS_CPU_MMXEXT)
 GENERATE_IDCTRESADDPRED (IdctResAddPred_sse2, WELS_CPU_SSE2)
+#if defined(HAVE_AVX2)
 GENERATE_IDCTRESADDPRED (IdctResAddPred_avx2, WELS_CPU_AVX2)
 GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_avx2, WELS_CPU_AVX2)
 #endif
+#endif
 
 #if defined(HAVE_NEON)
 GENERATE_IDCTRESADDPRED (IdctResAddPred_neon, WELS_CPU_NEON)
diff --git a/test/processing/ProcessUT_DownSample.cpp b/test/processing/ProcessUT_DownSample.cpp
index 21fd396b..faa3edbf 100644
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -372,12 +372,15 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3
                                         WELS_CPU_SSSE3)
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
                                         GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
+#ifdef HAVE_AVX2
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1,
                                         WELS_CPU_AVX2)
 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_avx2,
                                         GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_AVX2)
 #endif
 
+#endif
+
 #if defined(HAVE_NEON)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_neon, 1, WELS_CPU_NEON)
 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_neon, 1, WELS_CPU_NEON)