Merge pull request #2441 from saamas/encoder-add-avx2-4x4-quantization-routines
[Encoder] Add AVX2 4x4 quantization routines
This commit is contained in:
commit
7d65687284
@ -657,3 +657,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
|
||||
vpsrlw %1, %1, 15
|
||||
vpsllw %1, %1, 5
|
||||
%endmacro
|
||||
|
||||
; Fills every 16-bit word of register %1 with 7FFFh (32767).
; %1 is write-only: its previous contents do not matter.
%macro WELS_DW32767_VEX 1
|
||||
; Comparing a register with itself sets every bit, i.e. each word = FFFFh.
vpcmpeqw %1, %1, %1
|
||||
; Logical right shift by one clears the top bit of each word: FFFFh -> 7FFFh.
vpsrlw %1, %1, 1
|
||||
%endmacro
|
||||
|
@ -106,6 +106,11 @@ void WelsQuant4x4Dc_sse2 (int16_t* pDct, int16_t iFF, int16_t iMF);
|
||||
void WelsQuantFour4x4_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
||||
void WelsQuantFour4x4Max_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
|
||||
|
||||
void WelsQuant4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
||||
void WelsQuant4x4Dc_avx2 (int16_t* pDct, int16_t iFF, int16_t iMF);
|
||||
void WelsQuantFour4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
||||
void WelsQuantFour4x4Max_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
|
@ -526,6 +526,11 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
if (uiCpuFlag & WELS_CPU_AVX2) {
|
||||
pFuncList->pfDctT4 = WelsDctT4_avx2;
|
||||
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
|
||||
|
||||
pFuncList->pfQuantization4x4 = WelsQuant4x4_avx2;
|
||||
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_avx2;
|
||||
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_avx2;
|
||||
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2;
|
||||
}
|
||||
|
||||
//#endif//MACOS
|
||||
|
@ -368,3 +368,137 @@ WELS_EXTERN WelsDequantIHadamard4x4_sse2
|
||||
punpcklqdq xmm2, xmm3
|
||||
MOVDQ [r0+16], xmm2
|
||||
ret
|
||||
|
||||
|
||||
; Quantizes packed signed 16-bit coefficients.
;   %2 <- (unsigned-saturated(|%1| + %3) * %4) >> 16, computed on unsigned words
;   %1 <- %2 with the sign of the original %1 re-applied
; %1 and %2 are overwritten; %3, %4 and %5 are only read.
; %5 must hold 7FFFh in every word (see the vpor below).
; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5
|
||||
%macro AVX2_Quant 5
|
||||
; %2 = |data| per word.
vpabsw %2, %1
|
||||
; OR-ing in 7FFFh keeps the sign bit but makes every word non-zero, so the
; vpsignw below never zeroes a result whose input coefficient was 0.
vpor %1, %1, %5 ; ensure non-zero before vpsignw
|
||||
; Unsigned saturating add of the rounding offset.
vpaddusw %2, %2, %3
|
||||
; Keep the high 16 bits of the unsigned 16x16 product, i.e. (x * mf) >> 16.
vpmulhuw %2, %2, %4
|
||||
; Negate the magnitude wherever the (now non-zero) data word is negative.
vpsignw %1, %2, %1
|
||||
%endmacro
|
||||
|
||||
|
||||
;***********************************************************************
|
||||
; void WelsQuant4x4_avx2(int16_t *pDct, int16_t* ff, int16_t *mf);
|
||||
;***********************************************************************
|
||||
|
||||
; Quantizes the 16 int16 coefficients at [r0] in place using AVX2_Quant.
; NOTE(review): LOAD_3_PARA, PUSH_XMM and POP_XMM are defined outside this view;
; presumably they place pDct/ff/mf in r0/r1/r2 and preserve callee-saved XMM
; registers where the ABI needs it. Confirm against the macro definitions.
WELS_EXTERN WelsQuant4x4_avx2
|
||||
%assign push_num 0
|
||||
LOAD_3_PARA
|
||||
PUSH_XMM 5
|
||||
; Load 8 ff words (16 bytes) and replicate them into both 128-bit lanes, so the
; same 8 values apply to coefficients 0-7 and 8-15.
vbroadcasti128 ymm0, [r1]
|
||||
; Same for the 8 mf words.
vbroadcasti128 ymm1, [r2]
|
||||
; ymm2 = 7FFFh in every word, as required by AVX2_Quant.
WELS_DW32767_VEX ymm2
|
||||
; Unaligned load of all 16 coefficients (32 bytes).
vmovdqu ymm3, [r0]
|
||||
; ymm3 = signed quantized result; ymm4 is scratch (unsigned magnitudes).
AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2
|
||||
vmovdqu [r0], ymm3
|
||||
; Clear the upper YMM halves before returning to avoid AVX/SSE transition penalties.
vzeroupper
|
||||
POP_XMM
|
||||
ret
|
||||
|
||||
|
||||
;***********************************************************************
|
||||
;void WelsQuant4x4Dc_avx2(int16_t *pDct, int16_t ff, int16_t mf);
|
||||
;***********************************************************************
|
||||
|
||||
; Quantizes the 16 int16 coefficients at [r0] in place, applying one scalar ff
; (second argument) and one scalar mf (third argument) to every coefficient.
; NOTE(review): LOAD_1_PARA, PUSH_XMM, POP_XMM and the arg2/arg3 definitions are
; outside this view; presumably only r0 (pDct) is loaded by LOAD_1_PARA.
WELS_EXTERN WelsQuant4x4Dc_avx2
|
||||
%assign push_num 0
|
||||
LOAD_1_PARA
|
||||
PUSH_XMM 5
|
||||
; %ifidni is true when arg2 expands to the same text as r1. That presumably
; means the argument arrives in a register, so it is moved into an XMM register
; first. Otherwise arg2 is used directly as the vpbroadcastw source, presumably
; a memory operand - TODO confirm against the arg macro definitions.
%ifidni r1, arg2
|
||||
vmovd xmm0, arg2d
|
||||
; Replicate the low word (ff) into all 16 word positions of ymm0.
vpbroadcastw ymm0, xmm0
|
||||
%else
|
||||
vpbroadcastw ymm0, arg2
|
||||
%endif
|
||||
; Same register-versus-other selection for the third argument (mf).
%ifidni r2, arg3
|
||||
vmovd xmm1, arg3d
|
||||
vpbroadcastw ymm1, xmm1
|
||||
%else
|
||||
vpbroadcastw ymm1, arg3
|
||||
%endif
|
||||
; ymm2 = 7FFFh in every word, as required by AVX2_Quant.
WELS_DW32767_VEX ymm2
|
||||
; Unaligned load of all 16 coefficients (32 bytes).
vmovdqu ymm3, [r0]
|
||||
; ymm3 = signed quantized result; ymm4 is scratch (unsigned magnitudes).
AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2
|
||||
vmovdqu [r0], ymm3
|
||||
vzeroupper
|
||||
POP_XMM
|
||||
ret
|
||||
|
||||
|
||||
;***********************************************************************
|
||||
; void WelsQuantFour4x4_avx2(int16_t *pDct, int16_t* ff, int16_t *mf);
|
||||
;***********************************************************************
|
||||
|
||||
; Quantizes 64 int16 coefficients at [r0] in place, as four consecutive 32-byte
; groups (offsets 00h, 20h, 40h, 60h), all using the same 8 ff and 8 mf words.
; NOTE(review): LOAD_3_PARA, PUSH_XMM and POP_XMM are defined outside this view;
; presumably r0/r1/r2 = pDct/ff/mf. Confirm against the macro definitions.
WELS_EXTERN WelsQuantFour4x4_avx2
|
||||
%assign push_num 0
|
||||
LOAD_3_PARA
|
||||
PUSH_XMM 6
|
||||
; Replicate the 8 ff words and the 8 mf words into both 128-bit lanes.
vbroadcasti128 ymm0, [r1]
|
||||
vbroadcasti128 ymm1, [r2]
|
||||
; ymm4 = 7FFFh in every word, as required by AVX2_Quant.
WELS_DW32767_VEX ymm4
|
||||
; First pair of groups: ymm3/ymm5 hold data, ymm2 is the shared scratch register.
vmovdqu ymm3, [r0 + 0x00]
|
||||
vmovdqu ymm5, [r0 + 0x20]
|
||||
AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4
|
||||
vmovdqu [r0 + 0x00], ymm3
|
||||
AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
|
||||
vmovdqu [r0 + 0x20], ymm5
|
||||
; Second pair of groups, reusing the same registers.
vmovdqu ymm3, [r0 + 0x40]
|
||||
vmovdqu ymm5, [r0 + 0x60]
|
||||
AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4
|
||||
vmovdqu [r0 + 0x40], ymm3
|
||||
AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
|
||||
vmovdqu [r0 + 0x60], ymm5
|
||||
vzeroupper
|
||||
POP_XMM
|
||||
ret
|
||||
|
||||
|
||||
;***********************************************************************
|
||||
; void WelsQuantFour4x4Max_avx2(int16_t *pDct, int16_t* ff, int16_t *mf, int16_t *max);
|
||||
;***********************************************************************
|
||||
|
||||
; Quantizes 64 int16 coefficients at [r0] in place as four 32-byte groups and
; writes four int16 values to [r3]: the largest quantized magnitude of each
; group, in group order.
; NOTE(review): LOAD_4_PARA, LOAD_4_PARA_POP, PUSH_XMM and POP_XMM are defined
; outside this view; presumably r0..r3 = pDct/ff/mf/max. Confirm against the
; macro definitions.
WELS_EXTERN WelsQuantFour4x4Max_avx2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 7
|
||||
; Replicate the 8 ff words and the 8 mf words into both 128-bit lanes.
vbroadcasti128 ymm0, [r1]
|
||||
vbroadcasti128 ymm1, [r2]
|
||||
; ymm6 = 7FFFh in every word: used by AVX2_Quant and by the bit flips below.
WELS_DW32767_VEX ymm6
|
||||
; Groups 0 and 1: signed results go back to memory, unsigned magnitudes stay
; in ymm2 (group 0) and ymm3 (group 1).
vmovdqu ymm4, [r0 + 0x00]
|
||||
vmovdqu ymm5, [r0 + 0x20]
|
||||
AVX2_Quant ymm4, ymm2, ymm0, ymm1, ymm6
|
||||
vmovdqu [r0 + 0x00], ymm4
|
||||
AVX2_Quant ymm5, ymm3, ymm0, ymm1, ymm6
|
||||
vmovdqu [r0 + 0x20], ymm5
|
||||
; ymm4 = [group0 low half | group1 low half]
vperm2i128 ymm4, ymm2, ymm3, 00100000b
|
||||
; ymm3 = [group0 high half | group1 high half]
vperm2i128 ymm3, ymm2, ymm3, 00110001b
|
||||
; ymm2: low lane = 8 max candidates for group 0, high lane = same for group 1.
vpmaxsw ymm2, ymm4, ymm3
|
||||
; Groups 2 and 3: same scheme. ymm4 is reused as the magnitude output of
; group 3 only after its signed data has been stored.
vmovdqu ymm4, [r0 + 0x40]
|
||||
vmovdqu ymm5, [r0 + 0x60]
|
||||
AVX2_Quant ymm4, ymm3, ymm0, ymm1, ymm6
|
||||
vmovdqu [r0 + 0x40], ymm4
|
||||
AVX2_Quant ymm5, ymm4, ymm0, ymm1, ymm6
|
||||
vmovdqu [r0 + 0x60], ymm5
|
||||
vperm2i128 ymm5, ymm3, ymm4, 00100000b
|
||||
vperm2i128 ymm4, ymm3, ymm4, 00110001b
|
||||
; ymm3: low lane = 8 max candidates for group 2, high lane = same for group 3.
vpmaxsw ymm3, ymm5, ymm4
|
||||
vpxor ymm2, ymm2, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
|
||||
vpxor ymm3, ymm3, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
|
||||
; Pull the high lanes (groups 1 and 3) into their own XMM registers.
vextracti128 xmm4, ymm2, 1
|
||||
vextracti128 xmm5, ymm3, 1
|
||||
; Horizontal unsigned minimum of each 8-word vector; the minimum lands in
; word 0. xmm2/xmm4/xmm3/xmm5 correspond to groups 0/1/2/3.
vphminposuw xmm2, xmm2
|
||||
vphminposuw xmm3, xmm3
|
||||
vphminposuw xmm4, xmm4
|
||||
vphminposuw xmm5, xmm5
|
||||
; Interleave so the low four words of xmm2 become group 0, 1, 2, 3.
vpunpcklwd xmm2, xmm2, xmm4
|
||||
vpunpcklwd xmm3, xmm3, xmm5
|
||||
vpunpckldq xmm2, xmm2, xmm3
|
||||
vpxor xmm2, xmm2, xmm6 ; restore non-flipped values.
|
||||
vmovq [r3], xmm2 ; store max values.
|
||||
vzeroupper
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
@ -3,6 +3,8 @@
|
||||
#include "ls_defines.h"
|
||||
#include "encode_mb_aux.h"
|
||||
#include "wels_common_basis.h"
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
|
||||
using namespace WelsEnc;
|
||||
|
||||
@ -292,41 +294,95 @@ TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
|
||||
// Sign helper for the reference quantizers below. It relies on a variable
// named `sign` in the caller's scope: with sign == 0 the value is returned
// unchanged, with sign == -1 it is negated (two's-complement xor/subtract).
#define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
// Unsigned quantization step: ((ff + |pDct|) * mf) >> 16. The whole expansion
// is parenthesized so the shift cannot re-associate with neighbouring
// operators when the macro is used inside a larger expression.
#define NEW_QUANT(pDct, ff, mf) ((((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16)
// Quantizes the magnitude, then re-applies the sign selected by `sign`.
#define WELS_NEW_QUANT(pDct,ff,mf) WELS_ABS_LC(NEW_QUANT(pDct, ff, mf))
|
||||
void WelsQuantFour4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max) {
|
||||
int32_t i, j, k, sign;
|
||||
int16_t max_abs;
|
||||
for (k = 0; k < 4; k++) {
|
||||
max_abs = 0;
|
||||
for (i = 0; i < 16; i++) {
|
||||
j = i & 0x07;
|
||||
sign = WELS_SIGN (pDct[i]);
|
||||
pDct[i] = NEW_QUANT (pDct[i], ff[j], mf[j]);
|
||||
if (max_abs < pDct[i]) max_abs = pDct[i];
|
||||
pDct[i] = WELS_ABS_LC (pDct[i]);
|
||||
}
|
||||
pDct += 16;
|
||||
max[k] = max_abs;
|
||||
namespace {
|
||||
int16_t WelsQuant4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf) {
|
||||
int16_t max_abs = 0;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
const int j = i & 0x07;
|
||||
const int32_t sign = WELS_SIGN (pDct[i]);
|
||||
pDct[i] = NEW_QUANT (pDct[i], ff[j], mf[j]);
|
||||
max_abs = std::max(max_abs, pDct[i]);
|
||||
pDct[i] = WELS_ABS_LC (pDct[i]);
|
||||
}
|
||||
return max_abs;
|
||||
}
|
||||
void WelsQuant4x4DcAnchor (int16_t* pDct, int16_t iFF, int16_t iMF) {
|
||||
for (int i = 0; i < 16; i++) {
|
||||
const int32_t sign = WELS_SIGN (pDct[i]);
|
||||
pDct[i] = WELS_NEW_QUANT (pDct[i], iFF, iMF);
|
||||
}
|
||||
}
|
||||
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_c) {
|
||||
int16_t ff[8], mf[8];
|
||||
int16_t iDctA[64], iMaxA[16];
|
||||
int16_t iDctC[64], iMaxC[16];
|
||||
for (int i = 0; i < 8; i++) {
|
||||
void WelsQuantFour4x4Anchor (int16_t* pDct, int16_t* ff, int16_t* mf) {
|
||||
for (int i = 0; i < 4; i++)
|
||||
WelsQuant4x4MaxAnchor (pDct + 16 * i, ff, mf);
|
||||
}
|
||||
void WelsQuantFour4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max) {
|
||||
for (int i = 0; i < 4; i++)
|
||||
max[i] = WelsQuant4x4MaxAnchor (pDct + 16 * i, ff, mf);
|
||||
}
|
||||
void TestWelsQuant4x4 (PQuantizationFunc func) {
|
||||
const std::size_t f_size = 8;
|
||||
const std::size_t dct_size = 16;
|
||||
CMemoryAlign cMemoryAlign (0);
|
||||
ALLOC_MEMORY (int16_t, ff, f_size);
|
||||
ALLOC_MEMORY (int16_t, mf, f_size);
|
||||
ALLOC_MEMORY (int16_t, iDctC, dct_size);
|
||||
ALLOC_MEMORY (int16_t, iDctS, dct_size);
|
||||
for (std::size_t i = 0; i < f_size; i++) {
|
||||
ff[i] = rand() & 32767;
|
||||
mf[i] = rand() & 32767;
|
||||
}
|
||||
for (int i = 0; i < 64; i++)
|
||||
iDctA[i] = iDctC[i] = (rand() & 65535) - 32767;
|
||||
WelsQuantFour4x4MaxAnchor (iDctA, ff, mf, iMaxA);
|
||||
WelsQuantFour4x4Max_c (iDctC, ff, mf, iMaxC);
|
||||
for (int i = 0; i < 64; i++)
|
||||
EXPECT_EQ (iDctA[i], iDctC[i]);
|
||||
for (int i = 0; i < 4; i++)
|
||||
EXPECT_EQ (iMaxA[i], iMaxC[i]);
|
||||
for (std::size_t i = 0; i < dct_size; i++)
|
||||
iDctC[i] = iDctS[i] = (rand() & 65535) - 32768;
|
||||
WelsQuant4x4MaxAnchor (iDctC, ff, mf);
|
||||
func (iDctS, ff, mf);
|
||||
for (std::size_t i = 0; i < dct_size; i++)
|
||||
EXPECT_EQ (iDctC[i], iDctS[i]);
|
||||
FREE_MEMORY (ff);
|
||||
FREE_MEMORY (mf);
|
||||
FREE_MEMORY (iDctC);
|
||||
FREE_MEMORY (iDctS);
|
||||
}
|
||||
#ifdef X86_ASM
|
||||
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
|
||||
// Checks a DC quantization routine against WelsQuant4x4DcAnchor: both receive
// identical random coefficients and the same random scalar ff/mf, and every
// output coefficient must match.
void TestWelsQuant4x4Dc (PQuantizationDcFunc func) {
  const std::size_t dct_size = 16;
  const int16_t ff = rand() & 32767;
  const int16_t mf = rand() & 32767;
  CMemoryAlign cMemoryAlign (0);
  ALLOC_MEMORY (int16_t, iDctC, dct_size);
  ALLOC_MEMORY (int16_t, iDctS, dct_size);

  // Same random value into both buffers; covers the full int16_t range.
  std::size_t k = 0;
  while (k != dct_size) {
    const int16_t iCoef = (rand() & 65535) - 32768;
    iDctS[k] = iCoef;
    iDctC[k] = iCoef;
    ++k;
  }

  WelsQuant4x4DcAnchor (iDctC, ff, mf);
  func (iDctS, ff, mf);

  for (std::size_t i = 0; i < dct_size; i++) {
    EXPECT_EQ (iDctC[i], iDctS[i]);
  }

  FREE_MEMORY (iDctC);
  FREE_MEMORY (iDctS);
}
|
||||
// Checks a four-group quantization routine against WelsQuantFour4x4Anchor:
// both receive identical random coefficients and the same random ff/mf tables
// (8 entries each), and all 64 output coefficients must match.
void TestWelsQuantFour4x4 (PQuantizationFunc func) {
  const std::size_t f_size = 8;
  const std::size_t dct_size = 4 * 16;
  CMemoryAlign cMemoryAlign (0);
  ALLOC_MEMORY (int16_t, ff, f_size);
  ALLOC_MEMORY (int16_t, mf, f_size);
  ALLOC_MEMORY (int16_t, iDctC, dct_size);
  ALLOC_MEMORY (int16_t, iDctS, dct_size);

  // Table entries are limited to 0..32767.
  std::size_t k = 0;
  while (k != f_size) {
    ff[k] = rand() & 32767;
    mf[k] = rand() & 32767;
    ++k;
  }

  // Same random value into both buffers; covers the full int16_t range.
  for (k = 0; k != dct_size; ++k) {
    const int16_t iCoef = (rand() & 65535) - 32768;
    iDctS[k] = iCoef;
    iDctC[k] = iCoef;
  }

  WelsQuantFour4x4Anchor (iDctC, ff, mf);
  func (iDctS, ff, mf);

  for (std::size_t i = 0; i < dct_size; i++) {
    EXPECT_EQ (iDctC[i], iDctS[i]);
  }

  FREE_MEMORY (ff);
  FREE_MEMORY (mf);
  FREE_MEMORY (iDctC);
  FREE_MEMORY (iDctS);
}
|
||||
void TestWelsQuantFour4x4Max (PQuantizationMaxFunc func) {
|
||||
CMemoryAlign cMemoryAlign (0);
|
||||
ALLOC_MEMORY (int16_t, ff, 8);
|
||||
ALLOC_MEMORY (int16_t, mf, 8);
|
||||
@ -340,8 +396,8 @@ TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
|
||||
}
|
||||
for (int i = 0; i < 64; i++)
|
||||
iDctC[i] = iDctS[i] = (rand() & 65535) - 32767;
|
||||
WelsQuantFour4x4Max_c (iDctC, ff, mf, iMaxC);
|
||||
WelsQuantFour4x4Max_sse2 (iDctS, ff, mf, iMaxS);
|
||||
WelsQuantFour4x4MaxAnchor (iDctC, ff, mf, iMaxC);
|
||||
func (iDctS, ff, mf, iMaxS);
|
||||
for (int i = 0; i < 64; i++)
|
||||
EXPECT_EQ (iDctC[i], iDctS[i]);
|
||||
for (int i = 0; i < 4; i++)
|
||||
@ -353,6 +409,48 @@ TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
|
||||
FREE_MEMORY (iMaxC);
|
||||
FREE_MEMORY (iMaxS);
|
||||
}
|
||||
} // anon ns
|
||||
// Runs the TestWelsQuant4x4 helper on the C implementation WelsQuant4x4_c.
TEST (EncodeMbAuxTest, WelsQuant4x4_c) {
|
||||
TestWelsQuant4x4 (WelsQuant4x4_c);
|
||||
}
|
||||
// Runs the TestWelsQuant4x4Dc helper on the C implementation WelsQuant4x4Dc_c.
TEST (EncodeMbAuxTest, WelsQuant4x4Dc_c) {
|
||||
TestWelsQuant4x4Dc (WelsQuant4x4Dc_c);
|
||||
}
|
||||
// Runs the TestWelsQuantFour4x4 helper on the C implementation WelsQuantFour4x4_c.
TEST (EncodeMbAuxTest, WelsQuantFour4x4_c) {
|
||||
TestWelsQuantFour4x4 (WelsQuantFour4x4_c);
|
||||
}
|
||||
// Runs the TestWelsQuantFour4x4Max helper on the C implementation WelsQuantFour4x4Max_c.
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_c) {
|
||||
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_c);
|
||||
}
|
||||
#ifdef X86_ASM
|
||||
// Runs the TestWelsQuant4x4 helper on WelsQuant4x4_sse2 (no runtime CPU check here).
TEST (EncodeMbAuxTest, WelsQuant4x4_sse2) {
|
||||
TestWelsQuant4x4 (WelsQuant4x4_sse2);
|
||||
}
|
||||
// Runs the TestWelsQuant4x4Dc helper on WelsQuant4x4Dc_sse2 (no runtime CPU check here).
TEST (EncodeMbAuxTest, WelsQuant4x4Dc_sse2) {
|
||||
TestWelsQuant4x4Dc (WelsQuant4x4Dc_sse2);
|
||||
}
|
||||
// Runs the TestWelsQuantFour4x4 helper on WelsQuantFour4x4_sse2 (no runtime CPU check here).
TEST (EncodeMbAuxTest, WelsQuantFour4x4_sse2) {
|
||||
TestWelsQuantFour4x4 (WelsQuantFour4x4_sse2);
|
||||
}
|
||||
// Runs the TestWelsQuantFour4x4Max helper on WelsQuantFour4x4Max_sse2 (no runtime CPU check here).
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
|
||||
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_sse2);
|
||||
}
|
||||
// Exercises WelsQuant4x4_avx2 via TestWelsQuant4x4. When the CPU does not
// report AVX2 the test body does nothing (and therefore passes).
TEST (EncodeMbAuxTest, WelsQuant4x4_avx2) {
  const bool bHasAvx2 = (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) != 0;
  if (!bHasAvx2) {
    return;
  }
  TestWelsQuant4x4 (WelsQuant4x4_avx2);
}
|
||||
// Exercises WelsQuant4x4Dc_avx2 via TestWelsQuant4x4Dc. When the CPU does not
// report AVX2 the test body does nothing (and therefore passes).
TEST (EncodeMbAuxTest, WelsQuant4x4Dc_avx2) {
  const bool bHasAvx2 = (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) != 0;
  if (!bHasAvx2) {
    return;
  }
  TestWelsQuant4x4Dc (WelsQuant4x4Dc_avx2);
}
|
||||
// Exercises WelsQuantFour4x4_avx2 via TestWelsQuantFour4x4. When the CPU does
// not report AVX2 the test body does nothing (and therefore passes).
TEST (EncodeMbAuxTest, WelsQuantFour4x4_avx2) {
  const bool bHasAvx2 = (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) != 0;
  if (!bHasAvx2) {
    return;
  }
  TestWelsQuantFour4x4 (WelsQuantFour4x4_avx2);
}
|
||||
// Exercises WelsQuantFour4x4Max_avx2 via TestWelsQuantFour4x4Max. When the CPU
// does not report AVX2 the test body does nothing (and therefore passes).
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_avx2) {
  const bool bHasAvx2 = (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) != 0;
  if (!bHasAvx2) {
    return;
  }
  TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_avx2);
}
|
||||
#endif
|
||||
int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) {
|
||||
int16_t pDct[4], s[4];
|
||||
|
Loading…
x
Reference in New Issue
Block a user