diff --git a/codec/encoder/core/inc/encode_mb_aux.h b/codec/encoder/core/inc/encode_mb_aux.h
index b17adec3..00fe6e9a 100644
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -76,6 +76,7 @@ extern "C" {
 
 #ifdef X86_ASM
 int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
+int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel); // popcnt-based variant, implemented in x86/score.asm
 
 /****************************************************************************
  * Scan and Score functions
diff --git a/codec/encoder/core/src/encode_mb_aux.cpp b/codec/encoder/core/src/encode_mb_aux.cpp
index 86b68dcc..ccd4332c 100644
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -523,6 +523,9 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
   if (uiCpuFlag & WELS_CPU_SSSE3) {
     pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
   }
+  if (uiCpuFlag & WELS_CPU_SSE42) { // NOTE(review): the routine executes popcnt, which has its own CPUID bit - confirm this flag implies it
+    pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse42; // replaces whatever was installed for this pointer above
+  }
   if (uiCpuFlag & WELS_CPU_AVX2) {
     pFuncList->pfDctT4 = WelsDctT4_avx2;
     pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
diff --git a/codec/encoder/core/x86/score.asm b/codec/encoder/core/x86/score.asm
index fa9651c9..0f372d20 100644
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -337,3 +337,17 @@ WELS_EXTERN WelsGetNoneZeroCount_sse2
     ;add al, [nozero_count_table+r1]
     ret
 
+;***********************************************************************
+; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
+;***********************************************************************
+WELS_EXTERN WelsGetNoneZeroCount_sse42
+    %assign push_num 0
+    LOAD_1_PARA
+    movdqa xmm0, [r0]           ; load words 0..7; r0 is presumably the level pointer set up by LOAD_1_PARA - TODO confirm
+    packsswb xmm0, [r0 + 16]    ; pack with words 8..15 into 16 signed-saturated bytes, so non-zero words stay non-zero (both loads need 16-byte alignment)
+    pxor xmm1, xmm1             ; xmm1 = all zero
+    pcmpeqb xmm0, xmm1          ; byte becomes 0FFh where the coefficient is zero
+    pmovmskb retrd, xmm0        ; 16-bit mask: one bit per zero coefficient
+    xor retrd, 0FFFFh           ; invert the low 16 bits: one bit per non-zero coefficient
+    popcnt retrd, retrd         ; return value = number of non-zero coefficients
+    ret
diff --git a/test/encoder/EncUT_EncoderMbAux.cpp b/test/encoder/EncUT_EncoderMbAux.cpp
index 51ea5eeb..0015478e 100644
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ 
b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -269,26 +269,51 @@ GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8NotAligned_sse2);
 GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_sse2);
 GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_sse2);
 #endif
-TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) {
+
+namespace {
+
+// Feeds `func` 1000 blocks of 16 coefficients and expects its return value
+// to equal the number of non-zero coefficients counted while the block is
+// being filled.
+void TestGetNoneZeroCount (PGetNoneZeroCountFunc func) {
   ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16);
-  int32_t result = 0;
-  for (int i = 0; i < 16; i++) {
-    pLevel[i] = (rand() & 0x07) - 4;
-    if (pLevel[i]) result ++;
+  const int num_test_runs = 1000;
+  for (int run = 0; run < num_test_runs; run++) {
+    const bool all_zero = run == 0;
+    const bool all_nonzero = run == 1;
+    int result = 0;
+    for (int i = 0; i < 16; i++) {
+      const int r = rand();
+      // Run 0: all zeros. Run 1: all non-zero (a zero draw becomes 0x7FFF).
+      // Other runs: bit 14 of r selects zero or a 16-bit value assembled from
+      // two draws. The mixed case reads only the low 15 bits of each draw
+      // because RAND_MAX is only guaranteed to be at least 0x7FFF.
+      if (all_zero)
+        pLevel[i] = 0;
+      else if (all_nonzero)
+        pLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF;
+      else
+        pLevel[i] = (r >> 14 & 1) * (((r & 0xFF) << 8 | (rand() & 0xFF)) - 0x8000);
+      result += pLevel[i] != 0;
+    }
+    const int32_t nnz = func (pLevel);
+    EXPECT_EQ (nnz, result);
   }
-  int32_t nnz = WelsGetNoneZeroCount_c (pLevel);
-  EXPECT_EQ (nnz, result);
+}
+
+} // anon ns.
+
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) {
+  TestGetNoneZeroCount (WelsGetNoneZeroCount_c);
 }
 #ifdef X86_ASM
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
-  ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16);
-  int32_t result = 0;
-  for (int i = 0; i < 16; i++) {
-    pLevel[i] = (rand() & 0x07) - 4;
-    if (pLevel[i]) result ++;
-  }
-  int32_t nnz = WelsGetNoneZeroCount_sse2 (pLevel);
-  EXPECT_EQ (nnz, result);
+  TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
+}
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
+  // Exercised only when the detected CPU flags include SSE4.2; otherwise this test does nothing.
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
+    TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);
 }
 #endif
 #define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)