Merge pull request #2448 from saamas/encoder-getnonzerocount-sse42
[Encoder] Add an SSE4.2 implementation of WelsGetNonZeroCount
This commit is contained in:
commit
c30cc41261
@ -76,6 +76,7 @@ extern "C" {
|
||||
#ifdef X86_ASM
|
||||
|
||||
int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
|
||||
int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
|
||||
|
||||
/****************************************************************************
|
||||
* Scan and Score functions
|
||||
|
@ -523,6 +523,9 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
if (uiCpuFlag & WELS_CPU_SSSE3) {
|
||||
pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
|
||||
}
|
||||
if (uiCpuFlag & WELS_CPU_SSE42) {
|
||||
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse42;
|
||||
}
|
||||
if (uiCpuFlag & WELS_CPU_AVX2) {
|
||||
pFuncList->pfDctT4 = WelsDctT4_avx2;
|
||||
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
|
||||
|
@ -337,3 +337,17 @@ WELS_EXTERN WelsGetNoneZeroCount_sse2
|
||||
;add al, [nozero_count_table+r1]
|
||||
ret
|
||||
|
||||
;***********************************************************************
|
||||
; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
;
; Counts how many of the 16 int16_t values (32 bytes) read from `level`
; are non-zero.
; Both memory accesses below are aligned forms (movdqa and a legacy-SSE
; memory operand), so the buffer has to be 16-byte aligned.
; NOTE(review): popcnt is reported by its own CPUID feature bit, separate
; from SSE4.2, while the dispatcher installs this routine on
; WELS_CPU_SSE42 alone -- confirm that flag also implies popcnt.
|
||||
;***********************************************************************
|
||||
WELS_EXTERN WelsGetNoneZeroCount_sse42
|
||||
; push_num is presumably consumed by the parameter-loading macro below
; (no registers are pushed here) -- TODO confirm against the macro file.
%assign push_num 0
|
||||
; Presumably makes the single argument available in r0; the macro is
; defined outside this file section.
LOAD_1_PARA
|
||||
; xmm0 = the 8 words at [r0] (aligned 16-byte load).
movdqa xmm0, [r0]
|
||||
; Narrow those 8 words plus the 8 words at [r0 + 16] to 16 signed bytes.
; The narrowing saturates, so a non-zero word can never become a zero
; byte: zero/non-zero is preserved for every element.
packsswb xmm0, [r0 + 16]
|
||||
; xmm1 = 0, the comparison reference.
pxor xmm1, xmm1
|
||||
; Each byte of xmm0 becomes 0FFh where the element was zero, 00h otherwise.
pcmpeqb xmm0, xmm1
|
||||
; Collect the top bit of every byte: a 16-bit mask with one bit set per
; zero element (retrd is presumably the 32-bit return register alias).
pmovmskb retrd, xmm0
|
||||
; Invert the low 16 bits so that set bits now mark the non-zero elements.
xor retrd, 0FFFFh
|
||||
; The number of set bits is the non-zero count, left in the return register.
popcnt retrd, retrd
|
||||
ret
|
||||
|
@ -269,26 +269,43 @@ GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8NotAligned_sse2);
|
||||
GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_sse2);
|
||||
GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_sse2);
|
||||
#endif
|
||||
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) {
|
||||
|
||||
namespace {
|
||||
|
||||
void TestGetNoneZeroCount (PGetNoneZeroCountFunc func) {
|
||||
ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16);
|
||||
int32_t result = 0;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
pLevel[i] = (rand() & 0x07) - 4;
|
||||
if (pLevel[i]) result ++;
|
||||
const int num_test_runs = 1000;
|
||||
for (int run = 0; run < num_test_runs; run++) {
|
||||
const bool all_zero = run == 0;
|
||||
const bool all_nonzero = run == 1;
|
||||
int result = 0;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
const int r = rand();
|
||||
if (all_zero)
|
||||
pLevel[i] = 0;
|
||||
else if (all_nonzero)
|
||||
pLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF;
|
||||
else
|
||||
pLevel[i] = (r >> 16 & 1) * ((r & 0xFFFF) - 0x8000);
|
||||
result += pLevel[i] != 0;
|
||||
}
|
||||
const int32_t nnz = func (pLevel);
|
||||
EXPECT_EQ (nnz, result);
|
||||
}
|
||||
int32_t nnz = WelsGetNoneZeroCount_c (pLevel);
|
||||
EXPECT_EQ (nnz, result);
|
||||
}
|
||||
|
||||
} // anon ns.
|
||||
|
||||
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) {
  // Run the shared checker against the plain C implementation.
  const PGetNoneZeroCountFunc pfImpl = WelsGetNoneZeroCount_c;
  TestGetNoneZeroCount (pfImpl);
}
|
||||
#ifdef X86_ASM
|
||||
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
  // Delegate to the shared checker, as the other WelsGetNoneZeroCount tests
  // do. The former inline check only covered a single vector with values in
  // [-4, 3], which the checker's all-zero, all-non-zero and full-range runs
  // already cover.
  TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
}
|
||||
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
  // Only exercise the SSE4.2 routine when the CPU reports the feature;
  // otherwise the test body does nothing and the test passes vacuously.
  const bool has_sse42 = (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42) != 0;
  if (!has_sse42) {
    return;
  }
  TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);
}
|
||||
#endif
|
||||
#define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
|
||||
|
Loading…
x
Reference in New Issue
Block a user