diff --git a/codec/common/x86/asm_inc.asm b/codec/common/x86/asm_inc.asm index b41996ec..7d66dd9c 100644 --- a/codec/common/x86/asm_inc.asm +++ b/codec/common/x86/asm_inc.asm @@ -657,3 +657,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non- vpsrlw %1, %1, 15 vpsllw %1, %1, 5 %endmacro + +%macro WELS_DW32767_VEX 1 ; fill every word of the operand with 7FFFh + vpcmpeqw %1, %1, %1 ; all bits set + vpsrlw %1, %1, 1 ; logical shift clears the top bit of each word +%endmacro diff --git a/codec/encoder/core/inc/encode_mb_aux.h b/codec/encoder/core/inc/encode_mb_aux.h index 3f95d761..b17adec3 100644 --- a/codec/encoder/core/inc/encode_mb_aux.h +++ b/codec/encoder/core/inc/encode_mb_aux.h @@ -106,6 +106,11 @@ void WelsQuant4x4Dc_sse2 (int16_t* pDct, int16_t iFF, int16_t iMF); void WelsQuantFour4x4_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF); void WelsQuantFour4x4Max_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax); +void WelsQuant4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF); +void WelsQuant4x4Dc_avx2 (int16_t* pDct, int16_t iFF, int16_t iMF); +void WelsQuantFour4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF); +void WelsQuantFour4x4Max_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax); + #endif #ifdef HAVE_NEON diff --git a/codec/encoder/core/src/encode_mb_aux.cpp b/codec/encoder/core/src/encode_mb_aux.cpp index 31ceb68a..86b68dcc 100644 --- a/codec/encoder/core/src/encode_mb_aux.cpp +++ b/codec/encoder/core/src/encode_mb_aux.cpp @@ -526,6 +526,11 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { if (uiCpuFlag & WELS_CPU_AVX2) { pFuncList->pfDctT4 = WelsDctT4_avx2; pFuncList->pfDctFourT4 = WelsDctFourT4_avx2; + + pFuncList->pfQuantization4x4 = WelsQuant4x4_avx2; + pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_avx2; + pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_avx2; + pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2; } //#endif//MACOS diff --git a/codec/encoder/core/x86/quant.asm 
b/codec/encoder/core/x86/quant.asm index 8d8cb5dd..b8d3fa8b 100644 --- a/codec/encoder/core/x86/quant.asm +++ b/codec/encoder/core/x86/quant.asm @@ -368,3 +368,137 @@ WELS_EXTERN WelsDequantIHadamard4x4_sse2 punpcklqdq xmm2, xmm3 MOVDQ [r0+16], xmm2 ret + + +; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5 +%macro AVX2_Quant 5 + vpabsw %2, %1 ; magnitude of each coefficient + vpor %1, %1, %5 ; ensure non-zero before vpsignw + vpaddusw %2, %2, %3 ; add ff with unsigned saturation + vpmulhuw %2, %2, %4 ; keep the high 16 bits of the unsigned product with mf + vpsignw %1, %2, %1 ; give the result the sign of the original coefficient (zero counts as positive after the vpor) +%endmacro + + +;*********************************************************************** +; void WelsQuant4x4_avx2(int16_t *pDct, int16_t* ff, int16_t *mf); +;*********************************************************************** + +WELS_EXTERN WelsQuant4x4_avx2 + %assign push_num 0 + LOAD_3_PARA + PUSH_XMM 5 + vbroadcasti128 ymm0, [r1] + vbroadcasti128 ymm1, [r2] + WELS_DW32767_VEX ymm2 + vmovdqu ymm3, [r0] + AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2 + vmovdqu [r0], ymm3 + vzeroupper + POP_XMM + ret + + +;*********************************************************************** +;void WelsQuant4x4Dc_avx2(int16_t *pDct, int16_t ff, int16_t mf); +;*********************************************************************** + +WELS_EXTERN WelsQuant4x4Dc_avx2 + %assign push_num 0 + LOAD_1_PARA + PUSH_XMM 5 +%ifidni r1, arg2 + vmovd xmm0, arg2d + vpbroadcastw ymm0, xmm0 +%else + vpbroadcastw ymm0, arg2 +%endif +%ifidni r2, arg3 + vmovd xmm1, arg3d + vpbroadcastw ymm1, xmm1 +%else + vpbroadcastw ymm1, arg3 +%endif + WELS_DW32767_VEX ymm2 + vmovdqu ymm3, [r0] + AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2 + vmovdqu [r0], ymm3 + vzeroupper + POP_XMM + ret + + +;*********************************************************************** +; void WelsQuantFour4x4_avx2(int16_t *pDct, int16_t* ff, int16_t *mf); +;*********************************************************************** + +WELS_EXTERN WelsQuantFour4x4_avx2 + %assign push_num 0 + LOAD_3_PARA + PUSH_XMM 6 + vbroadcasti128 ymm0, [r1] + vbroadcasti128 ymm1, [r2] + WELS_DW32767_VEX ymm4 + 
vmovdqu ymm3, [r0 + 0x00] + vmovdqu ymm5, [r0 + 0x20] + AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4 + vmovdqu [r0 + 0x00], ymm3 + AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4 + vmovdqu [r0 + 0x20], ymm5 + vmovdqu ymm3, [r0 + 0x40] + vmovdqu ymm5, [r0 + 0x60] + AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4 + vmovdqu [r0 + 0x40], ymm3 + AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4 + vmovdqu [r0 + 0x60], ymm5 + vzeroupper + POP_XMM + ret + + +;*********************************************************************** +; void WelsQuantFour4x4Max_avx2(int16_t *pDct, int16_t* ff, int16_t *mf, int16_t *max); +;*********************************************************************** + +WELS_EXTERN WelsQuantFour4x4Max_avx2 + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 7 + vbroadcasti128 ymm0, [r1] + vbroadcasti128 ymm1, [r2] + WELS_DW32767_VEX ymm6 + vmovdqu ymm4, [r0 + 0x00] + vmovdqu ymm5, [r0 + 0x20] + AVX2_Quant ymm4, ymm2, ymm0, ymm1, ymm6 + vmovdqu [r0 + 0x00], ymm4 + AVX2_Quant ymm5, ymm3, ymm0, ymm1, ymm6 + vmovdqu [r0 + 0x20], ymm5 + vperm2i128 ymm4, ymm2, ymm3, 00100000b + vperm2i128 ymm3, ymm2, ymm3, 00110001b + vpmaxsw ymm2, ymm4, ymm3 + vmovdqu ymm4, [r0 + 0x40] + vmovdqu ymm5, [r0 + 0x60] + AVX2_Quant ymm4, ymm3, ymm0, ymm1, ymm6 + vmovdqu [r0 + 0x40], ymm4 + AVX2_Quant ymm5, ymm4, ymm0, ymm1, ymm6 + vmovdqu [r0 + 0x60], ymm5 + vperm2i128 ymm5, ymm3, ymm4, 00100000b + vperm2i128 ymm4, ymm3, ymm4, 00110001b + vpmaxsw ymm3, ymm5, ymm4 + vpxor ymm2, ymm2, ymm6 ; flip bits so as to enable use of vphminposuw to find max value. + vpxor ymm3, ymm3, ymm6 ; flip bits so as to enable use of vphminposuw to find max value. + vextracti128 xmm4, ymm2, 1 + vextracti128 xmm5, ymm3, 1 + vphminposuw xmm2, xmm2 + vphminposuw xmm3, xmm3 + vphminposuw xmm4, xmm4 + vphminposuw xmm5, xmm5 + vpunpcklwd xmm2, xmm2, xmm4 + vpunpcklwd xmm3, xmm3, xmm5 + vpunpckldq xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm6 ; restore non-flipped values. + vmovq [r3], xmm2 ; store max values. 
+ vzeroupper + POP_XMM + LOAD_4_PARA_POP + ret diff --git a/test/encoder/EncUT_EncoderMbAux.cpp b/test/encoder/EncUT_EncoderMbAux.cpp index 28133c87..51ea5eeb 100644 --- a/test/encoder/EncUT_EncoderMbAux.cpp +++ b/test/encoder/EncUT_EncoderMbAux.cpp @@ -3,6 +3,8 @@ #include "ls_defines.h" #include "encode_mb_aux.h" #include "wels_common_basis.h" +#include <algorithm> +#include <cstddef> using namespace WelsEnc; @@ -292,41 +294,95 @@ TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) { #define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign) #define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16 #define WELS_NEW_QUANT(pDct,ff,mf) WELS_ABS_LC(NEW_QUANT(pDct, ff, mf)) -void WelsQuantFour4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max) { - int32_t i, j, k, sign; - int16_t max_abs; - for (k = 0; k < 4; k++) { - max_abs = 0; - for (i = 0; i < 16; i++) { - j = i & 0x07; - sign = WELS_SIGN (pDct[i]); - pDct[i] = NEW_QUANT (pDct[i], ff[j], mf[j]); - if (max_abs < pDct[i]) max_abs = pDct[i]; - pDct[i] = WELS_ABS_LC (pDct[i]); - } - pDct += 16; - max[k] = max_abs; +namespace { +int16_t WelsQuant4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf) { /* reference quantizer for one 4x4 block; returns the largest value seen before the sign is restored */ + int16_t max_abs = 0; + for (int i = 0; i < 16; i++) { + const int j = i & 0x07; + const int32_t sign = WELS_SIGN (pDct[i]); + pDct[i] = NEW_QUANT (pDct[i], ff[j], mf[j]); + max_abs = std::max(max_abs, pDct[i]); + pDct[i] = WELS_ABS_LC (pDct[i]); + } + return max_abs; +} +void WelsQuant4x4DcAnchor (int16_t* pDct, int16_t iFF, int16_t iMF) { + for (int i = 0; i < 16; i++) { + const int32_t sign = WELS_SIGN (pDct[i]); + pDct[i] = WELS_NEW_QUANT (pDct[i], iFF, iMF); } } -TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_c) { - int16_t ff[8], mf[8]; - int16_t iDctA[64], iMaxA[16]; - int16_t iDctC[64], iMaxC[16]; - for (int i = 0; i < 8; i++) { +void WelsQuantFour4x4Anchor (int16_t* pDct, int16_t* ff, int16_t* mf) { + for (int i = 0; i < 4; i++) + WelsQuant4x4MaxAnchor (pDct + 16 * i, ff, mf); +} +void WelsQuantFour4x4MaxAnchor 
(int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max) { + for (int i = 0; i < 4; i++) + max[i] = WelsQuant4x4MaxAnchor (pDct + 16 * i, ff, mf); +} +void TestWelsQuant4x4 (PQuantizationFunc func) { /* func must match WelsQuant4x4MaxAnchor on one random 4x4 block */ + const std::size_t f_size = 8; + const std::size_t dct_size = 16; + CMemoryAlign cMemoryAlign (0); + ALLOC_MEMORY (int16_t, ff, f_size); + ALLOC_MEMORY (int16_t, mf, f_size); + ALLOC_MEMORY (int16_t, iDctC, dct_size); + ALLOC_MEMORY (int16_t, iDctS, dct_size); + for (std::size_t i = 0; i < f_size; i++) { ff[i] = rand() & 32767; mf[i] = rand() & 32767; } - for (int i = 0; i < 64; i++) - iDctA[i] = iDctC[i] = (rand() & 65535) - 32767; - WelsQuantFour4x4MaxAnchor (iDctA, ff, mf, iMaxA); - WelsQuantFour4x4Max_c (iDctC, ff, mf, iMaxC); - for (int i = 0; i < 64; i++) - EXPECT_EQ (iDctA[i], iDctC[i]); - for (int i = 0; i < 4; i++) - EXPECT_EQ (iMaxA[i], iMaxC[i]); + for (std::size_t i = 0; i < dct_size; i++) + iDctC[i] = iDctS[i] = (rand() & 65535) - 32768; + WelsQuant4x4MaxAnchor (iDctC, ff, mf); + func (iDctS, ff, mf); + for (std::size_t i = 0; i < dct_size; i++) + EXPECT_EQ (iDctC[i], iDctS[i]); + FREE_MEMORY (ff); + FREE_MEMORY (mf); + FREE_MEMORY (iDctC); + FREE_MEMORY (iDctS); } -#ifdef X86_ASM -TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) { +void TestWelsQuant4x4Dc (PQuantizationDcFunc func) { /* func must match WelsQuant4x4DcAnchor on one random 4x4 block */ + const std::size_t dct_size = 16; + const int16_t ff = rand() & 32767; + const int16_t mf = rand() & 32767; + CMemoryAlign cMemoryAlign (0); + ALLOC_MEMORY (int16_t, iDctC, dct_size); + ALLOC_MEMORY (int16_t, iDctS, dct_size); + for (std::size_t i = 0; i < dct_size; i++) + iDctC[i] = iDctS[i] = (rand() & 65535) - 32768; + WelsQuant4x4DcAnchor (iDctC, ff, mf); + func (iDctS, ff, mf); + for (std::size_t i = 0; i < dct_size; i++) + EXPECT_EQ (iDctC[i], iDctS[i]); + FREE_MEMORY (iDctC); + FREE_MEMORY (iDctS); +} +void TestWelsQuantFour4x4 (PQuantizationFunc func) { /* func must match WelsQuantFour4x4Anchor on four random 4x4 blocks */ + const std::size_t f_size = 8; + const std::size_t dct_size = 4 * 16; + CMemoryAlign cMemoryAlign (0); + 
ALLOC_MEMORY (int16_t, ff, f_size); + ALLOC_MEMORY (int16_t, mf, f_size); + ALLOC_MEMORY (int16_t, iDctC, dct_size); + ALLOC_MEMORY (int16_t, iDctS, dct_size); + for (std::size_t i = 0; i < f_size; i++) { + ff[i] = rand() & 32767; + mf[i] = rand() & 32767; + } + for (std::size_t i = 0; i < dct_size; i++) + iDctC[i] = iDctS[i] = (rand() & 65535) - 32768; + WelsQuantFour4x4Anchor (iDctC, ff, mf); + func (iDctS, ff, mf); + for (std::size_t i = 0; i < dct_size; i++) + EXPECT_EQ (iDctC[i], iDctS[i]); + FREE_MEMORY (ff); + FREE_MEMORY (mf); + FREE_MEMORY (iDctC); + FREE_MEMORY (iDctS); +} +void TestWelsQuantFour4x4Max (PQuantizationMaxFunc func) { /* func is checked against WelsQuantFour4x4MaxAnchor */ CMemoryAlign cMemoryAlign (0); ALLOC_MEMORY (int16_t, ff, 8); ALLOC_MEMORY (int16_t, mf, 8); @@ -340,8 +396,8 @@ TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) { } for (int i = 0; i < 64; i++) iDctC[i] = iDctS[i] = (rand() & 65535) - 32767; - WelsQuantFour4x4Max_c (iDctC, ff, mf, iMaxC); - WelsQuantFour4x4Max_sse2 (iDctS, ff, mf, iMaxS); + WelsQuantFour4x4MaxAnchor (iDctC, ff, mf, iMaxC); + func (iDctS, ff, mf, iMaxS); for (int i = 0; i < 64; i++) EXPECT_EQ (iDctC[i], iDctS[i]); for (int i = 0; i < 4; i++) @@ -353,6 +409,48 @@ TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) { FREE_MEMORY (iMaxC); FREE_MEMORY (iMaxS); } +} // anon ns +TEST (EncodeMbAuxTest, WelsQuant4x4_c) { + TestWelsQuant4x4 (WelsQuant4x4_c); +} +TEST (EncodeMbAuxTest, WelsQuant4x4Dc_c) { + TestWelsQuant4x4Dc (WelsQuant4x4Dc_c); +} +TEST (EncodeMbAuxTest, WelsQuantFour4x4_c) { + TestWelsQuantFour4x4 (WelsQuantFour4x4_c); +} +TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_c) { + TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_c); +} +#ifdef X86_ASM +TEST (EncodeMbAuxTest, WelsQuant4x4_sse2) { + TestWelsQuant4x4 (WelsQuant4x4_sse2); +} +TEST (EncodeMbAuxTest, WelsQuant4x4Dc_sse2) { + TestWelsQuant4x4Dc (WelsQuant4x4Dc_sse2); +} +TEST (EncodeMbAuxTest, WelsQuantFour4x4_sse2) { + TestWelsQuantFour4x4 (WelsQuantFour4x4_sse2); +} +TEST (EncodeMbAuxTest, 
WelsQuantFour4x4Max_sse2) { + TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_sse2); +} +TEST (EncodeMbAuxTest, WelsQuant4x4_avx2) { + if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) + TestWelsQuant4x4 (WelsQuant4x4_avx2); +} +TEST (EncodeMbAuxTest, WelsQuant4x4Dc_avx2) { + if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) + TestWelsQuant4x4Dc (WelsQuant4x4Dc_avx2); +} +TEST (EncodeMbAuxTest, WelsQuantFour4x4_avx2) { + if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) + TestWelsQuantFour4x4 (WelsQuantFour4x4_avx2); +} +TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_avx2) { + if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) + TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_avx2); +} #endif int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) { int16_t pDct[4], s[4];