From 7d00e8bc42247ebbb3ae5bbd8e4274bf6f489290 Mon Sep 17 00:00:00 2001 From: Guangwei Wang Date: Fri, 15 Jul 2016 12:15:57 +0800 Subject: [PATCH] add option for enable/disable AVX2 --- build/arch.mk | 14 ++++++++++++++ codec/common/inc/cpu_core.h | 7 ++++++- codec/common/x86/dct.asm | 3 +++ codec/common/x86/satd_sad.asm | 3 +++ codec/decoder/core/inc/decode_mb_aux.h | 2 ++ codec/decoder/core/src/decoder.cpp | 3 +++ codec/encoder/core/x86/quant.asm | 3 +++ codec/processing/src/downsample/downsample.cpp | 2 ++ codec/processing/src/downsample/downsample.h | 4 ++++ .../processing/src/downsample/downsamplefuncs.cpp | 2 ++ codec/processing/src/x86/downsample_bilinear.asm | 3 +++ codec/processing/src/x86/vaa.asm | 4 ++++ test/decoder/DecUT_IdctResAddPred.cpp | 4 ++++ test/processing/ProcessUT_DownSample.cpp | 3 +++ 14 files changed, 56 insertions(+), 1 deletion(-) diff --git a/build/arch.mk b/build/arch.mk index 846b69a0..e9eb2539 100644 --- a/build/arch.mk +++ b/build/arch.mk @@ -1,6 +1,18 @@ +#for x86 +HAVE_AVX2 := true + ifneq ($(filter %86 x86_64, $(ARCH)),) include $(SRC_PATH)build/x86-common.mk +ifeq ($(USE_ASM), Yes) +ifeq ($(HAVE_AVX2), true) +CFLAGS += -DHAVE_AVX2 +CXXFLAGS += -DHAVE_AVX2 +ASMFLAGS += -DHAVE_AVX2 endif +endif +endif + +#for arm ifneq ($(filter-out arm64, $(filter arm%, $(ARCH))),) ifeq ($(USE_ASM), Yes) ASM_ARCH = arm @@ -8,6 +20,8 @@ ASMFLAGS += -I$(SRC_PATH)codec/common/arm/ CFLAGS += -DHAVE_NEON endif endif + +#for arm64 ifneq ($(filter arm64 aarch64, $(ARCH)),) ifeq ($(USE_ASM), Yes) ASM_ARCH = arm64 diff --git a/codec/common/inc/cpu_core.h b/codec/common/inc/cpu_core.h index 2b30010f..8d96604a 100644 --- a/codec/common/inc/cpu_core.h +++ b/codec/common/inc/cpu_core.h @@ -56,7 +56,6 @@ #define WELS_CPU_SSE42 0x00000400 /* sse 4.2 */ /* CPU features application extensive */ -#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtentions */ #define WELS_CPU_FPU 0x00001000 /* x87-FPU on chip */ #define WELS_CPU_HTT 0x00002000 /* Hyper-Threading Technology (HTT), Multi-threading enabled feature: physical processor package is capable of supporting more than one logic processor @@ -67,7 +66,13 @@ #define WELS_CPU_MOVBE 0x00008000 /* MOVBE instruction */ #define WELS_CPU_AES 0x00010000 /* AES instruction extensions */ #define WELS_CPU_FMA 0x00020000 /* AVX VEX FMA instruction sets */ +#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtentions */ + +#ifdef HAVE_AVX2 #define WELS_CPU_AVX2 0x00040000 /* AVX2 */ +#else +#define WELS_CPU_AVX2 0x00000000 /* !AVX2 */ +#endif #define WELS_CPU_CACHELINE_16 0x10000000 /* CacheLine Size 16 */ #define WELS_CPU_CACHELINE_32 0x20000000 /* CacheLine Size 32 */ diff --git a/codec/common/x86/dct.asm b/codec/common/x86/dct.asm index 1941a726..dd97197b 100644 --- a/codec/common/x86/dct.asm +++ b/codec/common/x86/dct.asm @@ -678,6 +678,7 @@ WELS_EXTERN WelsIDctRecI16x16Dc_sse2 ; AVX2 functions ;*********************************************************************** +%ifdef HAVE_AVX2 ; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8 %macro AVX2_LoadDiff16P 8 vmovq x%1, [%2 ] @@ -1011,3 +1012,5 @@ WELS_EXTERN WelsIDctT4Rec_avx2 POP_XMM LOAD_5_PARA_POP ret +%endif + diff --git a/codec/common/x86/satd_sad.asm b/codec/common/x86/satd_sad.asm index d325cef8..bd280874 100644 --- a/codec/common/x86/satd_sad.asm +++ b/codec/common/x86/satd_sad.asm @@ -1504,6 +1504,7 @@ loop_get_satd_16x16_right: ; ;*********************************************************************** +%ifdef HAVE_AVX2 ; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5 %macro AVX2_LoadDiffSatd16x1 5 vbroadcasti128 %1, [%2] @@ -1723,6 +1724,8 @@ WelsSampleSatd16x4N_avx2: %endif ret +%endif + ;*********************************************************************** ; ;Pixel_satd_wxh_avx2 END diff --git a/codec/decoder/core/inc/decode_mb_aux.h b/codec/decoder/core/inc/decode_mb_aux.h index 6a438ef5..e991f6e6 100644 --- a/codec/decoder/core/inc/decode_mb_aux.h +++ b/codec/decoder/core/inc/decode_mb_aux.h @@ -48,8 +48,10 @@ extern "C" { #if defined(X86_ASM) void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs); void IdctResAddPred_sse2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs); +#if defined(HAVE_AVX2) void IdctResAddPred_avx2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs); void IdctFourResAddPred_avx2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc); +#endif #endif//X86_ASM #if defined(HAVE_NEON) diff --git a/codec/decoder/core/src/decoder.cpp b/codec/decoder/core/src/decoder.cpp index eeed6bb1..e8092396 100644 --- a/codec/decoder/core/src/decoder.cpp +++ b/codec/decoder/core/src/decoder.cpp @@ -1005,11 +1005,14 @@ void InitPredFunc (PWelsDecoderContext pCtx, uint32_t uiCpuFlag) { pCtx->pGetIChromaPredFunc[C_PRED_DC_T] = WelsDecoderIChromaPredDcTop_sse2; pCtx->pGetI4x4LumaPredFunc[I4_PRED_H] = WelsDecoderI4x4LumaPredH_sse2; } +#if defined(HAVE_AVX2) if (uiCpuFlag & WELS_CPU_AVX2) { pCtx->pIdctResAddPredFunc = IdctResAddPred_avx2; pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_avx2; } #endif + +#endif } //reset decoder number related statistics info diff --git a/codec/encoder/core/x86/quant.asm b/codec/encoder/core/x86/quant.asm index b8d3fa8b..21b56b53 100644 --- a/codec/encoder/core/x86/quant.asm +++ b/codec/encoder/core/x86/quant.asm @@ -370,6 +370,7 @@ WELS_EXTERN WelsDequantIHadamard4x4_sse2 ret +%ifdef HAVE_AVX2 ; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5 %macro AVX2_Quant 5 vpabsw %2, %1 @@ -502,3 +503,5 @@ WELS_EXTERN WelsQuantFour4x4Max_avx2 POP_XMM LOAD_4_PARA_POP ret +%endif + diff --git a/codec/processing/src/downsample/downsample.cpp b/codec/processing/src/downsample/downsample.cpp index 829de4c4..a0f26ae2 100644 --- a/codec/processing/src/downsample/downsample.cpp +++ b/codec/processing/src/downsample/downsample.cpp @@ -107,10 +107,12 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4; sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41; } +#ifdef HAVE_AVX2 if (iCpuFlag & WELS_CPU_AVX2) { sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_avx2; sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_avx2; } +#endif #endif//X86_ASM #if defined(HAVE_NEON) diff --git a/codec/processing/src/downsample/downsample.h b/codec/processing/src/downsample/downsample.h index dd30ec52..ca5c4bd4 100644 --- a/codec/processing/src/downsample/downsample.h +++ b/codec/processing/src/downsample/downsample.h @@ -99,8 +99,10 @@ GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2; GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2; GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3; GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41; +#ifdef HAVE_AVX2 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2; GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_avx2; +#endif SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3; SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4; @@ -120,12 +122,14 @@ void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, in void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY); +#ifdef HAVE_AVX2 void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY); void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY); +#endif WELSVP_EXTERN_C_END #endif diff --git a/codec/processing/src/downsample/downsamplefuncs.cpp b/codec/processing/src/downsample/downsamplefuncs.cpp index 9e92849a..ed5e7572 100644 --- a/codec/processing/src/downsample/downsamplefuncs.cpp +++ b/codec/processing/src/downsample/downsamplefuncs.cpp @@ -284,8 +284,10 @@ DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2) DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2) DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3) DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41) +#ifdef HAVE_AVX2 DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2) DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2) +#endif #endif //X86_ASM #ifdef HAVE_NEON diff --git a/codec/processing/src/x86/downsample_bilinear.asm b/codec/processing/src/x86/downsample_bilinear.asm index 7df67280..b39608d5 100644 --- a/codec/processing/src/x86/downsample_bilinear.asm +++ b/codec/processing/src/x86/downsample_bilinear.asm @@ -3254,6 +3254,7 @@ WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41 %undef xmm_xfrac1_begin %undef xmm_xfrac_inc +%ifdef HAVE_AVX2 ; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5 %macro AVX2_BilinearIncXposuw 5 vpaddusw %5, %2, %4 @@ -4552,3 +4553,5 @@ WELS_EXTERN GeneralBilinearAccurateDownsampler_avx2 %undef ymm_xfrac0_begin %undef ymm_xfrac1_begin %undef ymm_xfrac_inc +%endif + diff --git a/codec/processing/src/x86/vaa.asm b/codec/processing/src/x86/vaa.asm index e2b49e31..9b728576 100644 --- a/codec/processing/src/x86/vaa.asm +++ b/codec/processing/src/x86/vaa.asm @@ -2088,6 +2088,7 @@ sqdiff_bgd_width_loop: %assign push_num push_num - stack_alloc_num %endmacro +%ifdef HAVE_AVX2 ; Max unsigned byte per quadword ; out=%1 in=%2 tmp=%3 %macro AVX2_Maxubq 3 @@ -3557,3 +3558,6 @@ WELS_EXTERN VAACalcSadSsdBgd_avx2 %undef p_sd8x8 %undef p_mad8x8 ret + +%endif + diff --git a/test/decoder/DecUT_IdctResAddPred.cpp b/test/decoder/DecUT_IdctResAddPred.cpp index 7a053839..c9f33b49 100644 --- a/test/decoder/DecUT_IdctResAddPred.cpp +++ b/test/decoder/DecUT_IdctResAddPred.cpp @@ -53,6 +53,7 @@ void SetNonZeroCount_ref (int8_t* pNonZeroCount) { } #if defined(X86_ASM) +#if defined(HAVE_AVX2) void IdctFourResAddPred_ref (uint8_t* pPred, int32_t iStride, int16_t* pRs) { IdctResAddPred_ref (pPred + 0 * iStride + 0, iStride, pRs + 0 * 16); IdctResAddPred_ref (pPred + 0 * iStride + 4, iStride, pRs + 1 * 16); @@ -60,6 +61,7 @@ void IdctFourResAddPred_ref (uint8_t* pPred, int32_t iStride, int16_t* pRs) { IdctResAddPred_ref (pPred + 4 * iStride + 4, iStride, pRs + 3 * 16); } #endif +#endif } // anon ns @@ -138,9 +140,11 @@ GENERATE_IDCTRESADDPRED (IdctResAddPred_c, 0) #if defined(X86_ASM) GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx, WELS_CPU_MMXEXT) GENERATE_IDCTRESADDPRED (IdctResAddPred_sse2, WELS_CPU_SSE2) +#if defined(HAVE_AVX2) GENERATE_IDCTRESADDPRED (IdctResAddPred_avx2, WELS_CPU_AVX2) GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_avx2, WELS_CPU_AVX2) #endif +#endif #if defined(HAVE_NEON) GENERATE_IDCTRESADDPRED (IdctResAddPred_neon, WELS_CPU_NEON) diff --git a/test/processing/ProcessUT_DownSample.cpp b/test/processing/ProcessUT_DownSample.cpp index 21fd396b..faa3edbf 100644 --- a/test/processing/ProcessUT_DownSample.cpp +++ b/test/processing/ProcessUT_DownSample.cpp @@ -372,12 +372,15 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3 WELS_CPU_SSSE3) GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41, GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41) +#ifdef HAVE_AVX2 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1, WELS_CPU_AVX2) GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_avx2, GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_AVX2) #endif +#endif + #if defined(HAVE_NEON) GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_neon, 1, WELS_CPU_NEON) GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsampler_neon, 1, WELS_CPU_NEON)