From d906dda2240b2c4b39687f7474a4d1607319681a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sindre=20Aam=C3=A5s?= Date: Tue, 19 Apr 2016 20:50:34 +0200 Subject: [PATCH 1/2] [UT] Improve GetNonZeroCount tests Reduce duplication. Test more combinations. Always test boundary cases. --- test/encoder/EncUT_EncoderMbAux.cpp | 43 +++++++++++++++++++---------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/test/encoder/EncUT_EncoderMbAux.cpp b/test/encoder/EncUT_EncoderMbAux.cpp index 28133c87..37b6a62c 100644 --- a/test/encoder/EncUT_EncoderMbAux.cpp +++ b/test/encoder/EncUT_EncoderMbAux.cpp @@ -267,26 +267,39 @@ GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8NotAligned_sse2); GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_sse2); GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_sse2); #endif -TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) { + +namespace { +/* Shared driver: feeds func 1000 blocks of 16 coefficients (run 0 all zero, run 1 all nonzero, the rest random) and expects it to return the number of nonzero entries. */ +void TestGetNoneZeroCount (PGetNoneZeroCountFunc func) { ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16); - int32_t result = 0; - for (int i = 0; i < 16; i++) { - pLevel[i] = (rand() & 0x07) - 4; - if (pLevel[i]) result ++; + const int num_test_runs = 1000; + for (int run = 0; run < num_test_runs; run++) { + const bool all_zero = run == 0; + const bool all_nonzero = run == 1; + int result = 0; + for (int i = 0; i < 16; i++) { + const int r = rand(); + if (all_zero) + pLevel[i] = 0; + else if (all_nonzero) /* a draw that maps to 0 is replaced by 0x7FFF */ + pLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF; + else /* gate on bit 14 and build the 16 value bits from two draws: rand() only guarantees 15 bits (RAND_MAX may be 0x7FFF) */ + pLevel[i] = (r >> 14 & 1) * ((((r & 0xFF) << 8) | (rand() & 0xFF)) - 0x8000); + result += pLevel[i] != 0; + } + const int32_t nnz = func (pLevel); + EXPECT_EQ (nnz, result); } - int32_t nnz = WelsGetNoneZeroCount_c (pLevel); - EXPECT_EQ (nnz, result); +} + +} // anon ns. 
+ +TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) { /* not guarded by X86_ASM, unlike the sse2 test below */ + TestGetNoneZeroCount (WelsGetNoneZeroCount_c); } #ifdef X86_ASM TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) { - ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16); - int32_t result = 0; - for (int i = 0; i < 16; i++) { - pLevel[i] = (rand() & 0x07) - 4; - if (pLevel[i]) result ++; - } - int32_t nnz = WelsGetNoneZeroCount_sse2 (pLevel); - EXPECT_EQ (nnz, result); + TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2); } #endif #define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign) From 4645bd26aa506fe5dd54dc230f3d36e446261360 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sindre=20Aam=C3=A5s?= Date: Tue, 19 Apr 2016 19:42:17 +0200 Subject: [PATCH 2/2] [Encoder] Add an SSE4.2 implementation of WelsGetNonZeroCount Avoid touching some cache lines by using popcnt instead of table lookups. Also gives a speedup of ~1.4x on Haswell as compared with SSE2. --- codec/encoder/core/inc/encode_mb_aux.h | 1 + codec/encoder/core/src/encode_mb_aux.cpp | 3 +++ codec/encoder/core/x86/score.asm | 14 ++++++++++++++ test/encoder/EncUT_EncoderMbAux.cpp | 4 ++++ 4 files changed, 22 insertions(+) diff --git a/codec/encoder/core/inc/encode_mb_aux.h b/codec/encoder/core/inc/encode_mb_aux.h index 3f95d761..6bd5b7f6 100644 --- a/codec/encoder/core/inc/encode_mb_aux.h +++ b/codec/encoder/core/inc/encode_mb_aux.h @@ -76,6 +76,7 @@ extern "C" { #ifdef X86_ASM int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel); +int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel); /* popcnt based; pLevel must hold 16 coefficients and be 16-byte aligned (the assembly uses aligned 128-bit loads) */ /**************************************************************************** * Scan and Score functions diff --git a/codec/encoder/core/src/encode_mb_aux.cpp b/codec/encoder/core/src/encode_mb_aux.cpp index 31ceb68a..b4c62587 100644 --- a/codec/encoder/core/src/encode_mb_aux.cpp +++ b/codec/encoder/core/src/encode_mb_aux.cpp @@ -523,6 +523,9 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { if (uiCpuFlag & WELS_CPU_SSSE3) { pFuncList->pfScan4x4 = 
WelsScan4x4DcAc_ssse3; } + if (uiCpuFlag & WELS_CPU_SSE42) { + pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse42; + } if (uiCpuFlag & WELS_CPU_AVX2) { pFuncList->pfDctT4 = WelsDctT4_avx2; pFuncList->pfDctFourT4 = WelsDctFourT4_avx2; diff --git a/codec/encoder/core/x86/score.asm b/codec/encoder/core/x86/score.asm index fa9651c9..0f372d20 100644 --- a/codec/encoder/core/x86/score.asm +++ b/codec/encoder/core/x86/score.asm @@ -337,3 +337,17 @@ WELS_EXTERN WelsGetNoneZeroCount_sse2 ;add al, [nozero_count_table+r1] ret +;*********************************************************************** +; int32_t WelsGetNoneZeroCount_sse42(int16_t* level); +;*********************************************************************** +WELS_EXTERN WelsGetNoneZeroCount_sse42 ; counts the nonzero words among the 16 words at level +	%assign push_num 0 + LOAD_1_PARA ; NOTE(review): presumably makes the single argument available in r0 (project macro), confirm + movdqa xmm0, [r0] ; words 0..7, aligned load + packsswb xmm0, [r0 + 16] ; pack with words 8..15 into 16 bytes; signed saturation keeps every nonzero word nonzero + pxor xmm1, xmm1 ; xmm1 = 0 + pcmpeqb xmm0, xmm1 ; byte becomes 0FFh where the coefficient is zero + pmovmskb retrd, xmm0 ; bit i is set when coefficient i is zero + xor retrd, 0FFFFh ; flip the 16 bits so they mark nonzero coefficients + popcnt retrd, retrd ; retrd = number of nonzero coefficients + ret diff --git a/test/encoder/EncUT_EncoderMbAux.cpp b/test/encoder/EncUT_EncoderMbAux.cpp index 37b6a62c..2af3c2c2 100644 --- a/test/encoder/EncUT_EncoderMbAux.cpp +++ b/test/encoder/EncUT_EncoderMbAux.cpp @@ -301,6 +301,10 @@ TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) { TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) { TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2); } +TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) { + if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42) /* only call the popcnt variant when the CPU reports SSE4.2; otherwise the body does nothing */ + TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42); +} #endif #define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign) #define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16