Merge pull request #2441 from saamas/encoder-add-avx2-4x4-quantization-routines

[Encoder] Add AVX2 4x4 quantization routines
This commit is contained in:
ruil2 2016-04-28 09:08:31 +08:00
commit 7d65687284
5 changed files with 278 additions and 31 deletions

View File

@ -657,3 +657,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
vpsrlw %1, %1, 15
vpsllw %1, %1, 5
%endmacro
; Fill every 16-bit word of the vector register %1 with 32767 (7FFFh).
; %1 = destination register; its previous contents are ignored.
%macro WELS_DW32767_VEX 1
; Comparing a register with itself sets all bits of every word (FFFFh).
vpcmpeqw %1, %1, %1
; A logical right shift by one clears the top bit of each word: FFFFh -> 7FFFh.
vpsrlw %1, %1, 1
%endmacro

View File

@ -106,6 +106,11 @@ void WelsQuant4x4Dc_sse2 (int16_t* pDct, int16_t iFF, int16_t iMF);
void WelsQuantFour4x4_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuantFour4x4Max_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
// AVX2 assembly variants of the 4x4 quantization routines; each one modifies pDct in place.
// Quantizes a single 4x4 block (16 coefficients) with the per-coefficient tables pFF / pMF.
void WelsQuant4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
// Quantizes a single 4x4 block using one scalar iFF / iMF pair for all coefficients.
void WelsQuant4x4Dc_avx2 (int16_t* pDct, int16_t iFF, int16_t iMF);
// Quantizes four consecutive 4x4 blocks (64 coefficients) with the tables pFF / pMF.
void WelsQuantFour4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
// Same as WelsQuantFour4x4_avx2, and additionally writes one maximum magnitude per block to pMax[0..3].
void WelsQuantFour4x4Max_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
#endif
#ifdef HAVE_NEON

View File

@ -526,6 +526,11 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
if (uiCpuFlag & WELS_CPU_AVX2) {
pFuncList->pfDctT4 = WelsDctT4_avx2;
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
pFuncList->pfQuantization4x4 = WelsQuant4x4_avx2;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_avx2;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_avx2;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2;
}
//#endif//MACOS

View File

@ -368,3 +368,137 @@ WELS_EXTERN WelsDequantIHadamard4x4_sse2
punpcklqdq xmm2, xmm3
MOVDQ [r0+16], xmm2
ret
; Quantize the signed 16-bit words held in %1, word by word:
;   %2 <- ((|%1| + ff) * mf) >> 16     (unsigned magnitude)
;   %1 <- %2 with the original sign of %1 re-applied
; The addition saturates as unsigned 16-bit and the multiplication keeps the
; high 16 bits of the unsigned 32-bit product.
; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5
%macro AVX2_Quant 5
; Take the magnitudes first, while %1 still holds the unmodified input.
vpabsw %2, %1
; vpsignw zeroes every word whose control word is 0. OR-ing in 7FFFh keeps the
; sign bit of the input but guarantees the control word is never 0.
vpor %1, %1, %5 ; ensure non-zero before vpsignw
vpaddusw %2, %2, %3
vpmulhuw %2, %2, %4
; Negate the magnitude where the original coefficient was negative.
vpsignw %1, %2, %1
%endmacro
;***********************************************************************
; void WelsQuant4x4_avx2(int16_t *pDct, int16_t* ff, int16_t *mf);
;
; Quantizes one 4x4 block (16 words = 32 bytes at pDct) in place.
; Only the first 8 words (16 bytes) of ff and of mf are read; they are
; broadcast to both 128-bit lanes, so coefficient i uses ff[i & 7], mf[i & 7].
;***********************************************************************
WELS_EXTERN WelsQuant4x4_avx2
%assign push_num 0
; NOTE(review): LOAD_3_PARA / PUSH_XMM / POP_XMM are defined outside this view;
; presumably the shared ABI prologue/epilogue helpers. The code below uses
; r0 as pDct, r1 as ff and r2 as mf.
LOAD_3_PARA
PUSH_XMM 5
; ymm0 = ff[0..7] in both lanes, ymm1 = mf[0..7] in both lanes.
vbroadcasti128 ymm0, [r1]
vbroadcasti128 ymm1, [r2]
; ymm2 = 7FFFh in every word (helper constant for AVX2_Quant).
WELS_DW32767_VEX ymm2
; Unaligned load of all 16 coefficients; ymm4 is scratch for the magnitudes.
vmovdqu ymm3, [r0]
AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2
vmovdqu [r0], ymm3
; Clear the upper halves of the ymm registers before returning.
vzeroupper
POP_XMM
ret
;***********************************************************************
; void WelsQuant4x4Dc_avx2(int16_t *pDct, int16_t ff, int16_t mf);
;
; Quantizes one 4x4 block (16 words = 32 bytes at pDct) in place, using the
; same scalar ff / mf for every coefficient.
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_avx2
%assign push_num 0
; Only the pointer argument is loaded into r0; the two scalars are accessed
; through arg2 / arg3 below.
; NOTE(review): LOAD_1_PARA, PUSH_XMM, POP_XMM, arg2/arg3 and arg2d/arg3d are
; defined outside this view - confirm their per-ABI expansion.
LOAD_1_PARA
PUSH_XMM 5
; %ifidni is a case-insensitive textual comparison: if arg2 is the register r1,
; move it through its 32-bit form into xmm0 and broadcast the low word.
; Otherwise arg2 is used directly as the vpbroadcastw source (presumably a
; stack memory operand - confirm).
%ifidni r1, arg2
vmovd xmm0, arg2d
vpbroadcastw ymm0, xmm0
%else
vpbroadcastw ymm0, arg2
%endif
; Same handling for mf (third argument) into ymm1.
%ifidni r2, arg3
vmovd xmm1, arg3d
vpbroadcastw ymm1, xmm1
%else
vpbroadcastw ymm1, arg3
%endif
; ymm2 = 7FFFh in every word (helper constant for AVX2_Quant).
WELS_DW32767_VEX ymm2
; Unaligned load of all 16 coefficients; ymm4 is scratch for the magnitudes.
vmovdqu ymm3, [r0]
AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2
vmovdqu [r0], ymm3
; Clear the upper halves of the ymm registers before returning.
vzeroupper
POP_XMM
ret
;***********************************************************************
; void WelsQuantFour4x4_avx2(int16_t *pDct, int16_t* ff, int16_t *mf);
;
; Quantizes four consecutive 4x4 blocks (64 words = 128 bytes at pDct) in
; place. Only the first 8 words of ff and of mf are read; they are broadcast
; to both 128-bit lanes and reused for every block.
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_avx2
%assign push_num 0
; NOTE(review): LOAD_3_PARA / PUSH_XMM / POP_XMM are defined outside this view;
; presumably the shared ABI prologue/epilogue helpers. The code below uses
; r0 as pDct, r1 as ff and r2 as mf.
LOAD_3_PARA
PUSH_XMM 6
; ymm0 = ff[0..7] in both lanes, ymm1 = mf[0..7] in both lanes.
vbroadcasti128 ymm0, [r1]
vbroadcasti128 ymm1, [r2]
; ymm4 = 7FFFh in every word (helper constant for AVX2_Quant).
WELS_DW32767_VEX ymm4
; Blocks 0 and 1 (byte offsets 00h and 20h); ymm2 is scratch for the magnitudes.
vmovdqu ymm3, [r0 + 0x00]
vmovdqu ymm5, [r0 + 0x20]
AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4
vmovdqu [r0 + 0x00], ymm3
AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
vmovdqu [r0 + 0x20], ymm5
; Blocks 2 and 3 (byte offsets 40h and 60h).
vmovdqu ymm3, [r0 + 0x40]
vmovdqu ymm5, [r0 + 0x60]
AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4
vmovdqu [r0 + 0x40], ymm3
AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
vmovdqu [r0 + 0x60], ymm5
; Clear the upper halves of the ymm registers before returning.
vzeroupper
POP_XMM
ret
;***********************************************************************
; void WelsQuantFour4x4Max_avx2(int16_t *pDct, int16_t* ff, int16_t *mf, int16_t *max);
;
; Quantizes four consecutive 4x4 blocks (64 words = 128 bytes at pDct) in
; place and stores the largest quantized magnitude of each block to max[0..3].
; Only the first 8 words of ff and of mf are read (as 16-bit values); they are
; broadcast to both 128-bit lanes and reused for every block.
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_avx2
%assign push_num 0
; NOTE(review): LOAD_4_PARA / LOAD_4_PARA_POP / PUSH_XMM / POP_XMM are defined
; outside this view; presumably the shared ABI prologue/epilogue helpers. The
; code below uses r0 as pDct, r1 as ff, r2 as mf and r3 as max.
LOAD_4_PARA
PUSH_XMM 7
; ymm0 = ff[0..7] in both lanes, ymm1 = mf[0..7] in both lanes.
vbroadcasti128 ymm0, [r1]
vbroadcasti128 ymm1, [r2]
; ymm6 = 7FFFh in every word (used by AVX2_Quant and by the max search below).
WELS_DW32767_VEX ymm6
; Blocks 0 and 1: signed results are stored back, the unsigned magnitudes are
; kept in ymm2 (block 0) and ymm3 (block 1).
vmovdqu ymm4, [r0 + 0x00]
vmovdqu ymm5, [r0 + 0x20]
AVX2_Quant ymm4, ymm2, ymm0, ymm1, ymm6
vmovdqu [r0 + 0x00], ymm4
AVX2_Quant ymm5, ymm3, ymm0, ymm1, ymm6
vmovdqu [r0 + 0x20], ymm5
; Regroup so that each 128-bit lane belongs to one block:
;   ymm4 = [block 0 words 0-7  | block 1 words 0-7 ]
;   ymm3 = [block 0 words 8-15 | block 1 words 8-15]
; The word-wise max then leaves 8 candidates per block: ymm2 = [block 0 | block 1].
vperm2i128 ymm4, ymm2, ymm3, 00100000b
vperm2i128 ymm3, ymm2, ymm3, 00110001b
vpmaxsw ymm2, ymm4, ymm3
; Blocks 2 and 3: magnitudes are kept in ymm3 (block 2) and ymm4 (block 3).
vmovdqu ymm4, [r0 + 0x40]
vmovdqu ymm5, [r0 + 0x60]
AVX2_Quant ymm4, ymm3, ymm0, ymm1, ymm6
vmovdqu [r0 + 0x40], ymm4
AVX2_Quant ymm5, ymm4, ymm0, ymm1, ymm6
vmovdqu [r0 + 0x60], ymm5
; Same regrouping as above: ymm3 = [block 2 candidates | block 3 candidates].
vperm2i128 ymm5, ymm3, ymm4, 00100000b
vperm2i128 ymm4, ymm3, ymm4, 00110001b
vpmaxsw ymm3, ymm5, ymm4
; XOR with 7FFFh maps v to 7FFFh - v for v in 0..7FFFh, so the largest
; magnitude becomes the smallest word.
; NOTE(review): assumes the magnitudes never exceed 7FFFh - confirm for the
; mf range used by callers.
vpxor ymm2, ymm2, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
vpxor ymm3, ymm3, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
; Split out the upper lanes: xmm2/xmm4/xmm3/xmm5 = blocks 0/1/2/3.
vextracti128 xmm4, ymm2, 1
vextracti128 xmm5, ymm3, 1
; vphminposuw leaves the minimum unsigned word in word 0 of its destination.
vphminposuw xmm2, xmm2
vphminposuw xmm3, xmm3
vphminposuw xmm4, xmm4
vphminposuw xmm5, xmm5
; Interleave the four word-0 results into words 0..3 of xmm2, in block order.
vpunpcklwd xmm2, xmm2, xmm4
vpunpcklwd xmm3, xmm3, xmm5
vpunpckldq xmm2, xmm2, xmm3
vpxor xmm2, xmm2, xmm6 ; restore non-flipped values.
; Only the low 8 bytes (four words) are written.
vmovq [r3], xmm2 ; store max values.
; Clear the upper halves of the ymm registers before returning.
vzeroupper
POP_XMM
LOAD_4_PARA_POP
ret

View File

@ -3,6 +3,8 @@
#include "ls_defines.h"
#include "encode_mb_aux.h"
#include "wels_common_basis.h"
#include <algorithm>
#include <cstddef>
using namespace WelsEnc;
@ -292,41 +294,95 @@ TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
#define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
#define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16
#define WELS_NEW_QUANT(pDct,ff,mf) WELS_ABS_LC(NEW_QUANT(pDct, ff, mf))
void WelsQuantFour4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max) {
int32_t i, j, k, sign;
int16_t max_abs;
for (k = 0; k < 4; k++) {
max_abs = 0;
for (i = 0; i < 16; i++) {
j = i & 0x07;
sign = WELS_SIGN (pDct[i]);
namespace {
int16_t WelsQuant4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf) {
int16_t max_abs = 0;
for (int i = 0; i < 16; i++) {
const int j = i & 0x07;
const int32_t sign = WELS_SIGN (pDct[i]);
pDct[i] = NEW_QUANT (pDct[i], ff[j], mf[j]);
if (max_abs < pDct[i]) max_abs = pDct[i];
max_abs = std::max(max_abs, pDct[i]);
pDct[i] = WELS_ABS_LC (pDct[i]);
}
pDct += 16;
max[k] = max_abs;
return max_abs;
}
// Reference (plain C++) DC quantizer for one 4x4 block: each of the 16
// coefficients is quantized in place with the same scalar iFF / iMF pair.
void WelsQuant4x4DcAnchor (int16_t* pDct, int16_t iFF, int16_t iMF) {
  int16_t* const pEnd = pDct + 16;
  for (int16_t* pCoef = pDct; pCoef != pEnd; ++pCoef) {
    // The quantization macros read a local that must be named `sign`.
    const int32_t sign = WELS_SIGN (*pCoef);
    *pCoef = WELS_NEW_QUANT (*pCoef, iFF, iMF);
  }
}
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_c) {
int16_t ff[8], mf[8];
int16_t iDctA[64], iMaxA[16];
int16_t iDctC[64], iMaxC[16];
for (int i = 0; i < 8; i++) {
// Reference quantizer for four consecutive 4x4 blocks (64 coefficients),
// processed in place one block at a time; the per-block maximum is discarded.
void WelsQuantFour4x4Anchor (int16_t* pDct, int16_t* ff, int16_t* mf) {
  int16_t* pBlock = pDct;
  for (int iBlk = 0; iBlk < 4; ++iBlk, pBlock += 16) {
    WelsQuant4x4MaxAnchor (pBlock, ff, mf);
  }
}
// Reference quantizer for four consecutive 4x4 blocks (64 coefficients),
// processed in place; max[k] receives the value returned for block k.
void WelsQuantFour4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max) {
  int16_t* pBlock = pDct;
  for (int iBlk = 0; iBlk < 4; ++iBlk, pBlock += 16) {
    max[iBlk] = WelsQuant4x4MaxAnchor (pBlock, ff, mf);
  }
}
void TestWelsQuant4x4 (PQuantizationFunc func) {
const std::size_t f_size = 8;
const std::size_t dct_size = 16;
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, ff, f_size);
ALLOC_MEMORY (int16_t, mf, f_size);
ALLOC_MEMORY (int16_t, iDctC, dct_size);
ALLOC_MEMORY (int16_t, iDctS, dct_size);
for (std::size_t i = 0; i < f_size; i++) {
ff[i] = rand() & 32767;
mf[i] = rand() & 32767;
}
for (int i = 0; i < 64; i++)
iDctA[i] = iDctC[i] = (rand() & 65535) - 32767;
WelsQuantFour4x4MaxAnchor (iDctA, ff, mf, iMaxA);
WelsQuantFour4x4Max_c (iDctC, ff, mf, iMaxC);
for (int i = 0; i < 64; i++)
EXPECT_EQ (iDctA[i], iDctC[i]);
for (int i = 0; i < 4; i++)
EXPECT_EQ (iMaxA[i], iMaxC[i]);
for (std::size_t i = 0; i < dct_size; i++)
iDctC[i] = iDctS[i] = (rand() & 65535) - 32768;
WelsQuant4x4MaxAnchor (iDctC, ff, mf);
func (iDctS, ff, mf);
for (std::size_t i = 0; i < dct_size; i++)
EXPECT_EQ (iDctC[i], iDctS[i]);
FREE_MEMORY (ff);
FREE_MEMORY (mf);
FREE_MEMORY (iDctC);
FREE_MEMORY (iDctS);
}
#ifdef X86_ASM
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
// Checks a DC quantization implementation against WelsQuant4x4DcAnchor on one
// randomly filled 4x4 block: both start from identical input and every output
// coefficient must match.
// func: implementation under test.
void TestWelsQuant4x4Dc (PQuantizationDcFunc func) {
const std::size_t dct_size = 16;
// Scalar quantization parameters, each in 0..32767.
const int16_t ff = rand() & 32767;
const int16_t mf = rand() & 32767;
// NOTE(review): ALLOC_MEMORY / FREE_MEMORY are defined outside this view;
// presumably they allocate through the local cMemoryAlign - confirm.
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, iDctC, dct_size);
ALLOC_MEMORY (int16_t, iDctS, dct_size);
// Same random coefficients for both buffers, covering -32768..32767.
for (std::size_t i = 0; i < dct_size; i++)
iDctC[i] = iDctS[i] = (rand() & 65535) - 32768;
// iDctC holds the reference result, iDctS the result of the tested function.
WelsQuant4x4DcAnchor (iDctC, ff, mf);
func (iDctS, ff, mf);
for (std::size_t i = 0; i < dct_size; i++)
EXPECT_EQ (iDctC[i], iDctS[i]);
FREE_MEMORY (iDctC);
FREE_MEMORY (iDctS);
}
// Checks a four-block quantization implementation against
// WelsQuantFour4x4Anchor on 64 random coefficients: both start from identical
// input and every output coefficient must match.
// func: implementation under test.
void TestWelsQuantFour4x4 (PQuantizationFunc func) {
// 8 ff / mf entries; 4 blocks of 16 coefficients.
const std::size_t f_size = 8;
const std::size_t dct_size = 4 * 16;
// NOTE(review): ALLOC_MEMORY / FREE_MEMORY are defined outside this view;
// presumably they allocate through the local cMemoryAlign - confirm.
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, ff, f_size);
ALLOC_MEMORY (int16_t, mf, f_size);
ALLOC_MEMORY (int16_t, iDctC, dct_size);
ALLOC_MEMORY (int16_t, iDctS, dct_size);
// Random quantization tables, each entry in 0..32767.
for (std::size_t i = 0; i < f_size; i++) {
ff[i] = rand() & 32767;
mf[i] = rand() & 32767;
}
// Same random coefficients for both buffers, covering -32768..32767.
for (std::size_t i = 0; i < dct_size; i++)
iDctC[i] = iDctS[i] = (rand() & 65535) - 32768;
// iDctC holds the reference result, iDctS the result of the tested function.
WelsQuantFour4x4Anchor (iDctC, ff, mf);
func (iDctS, ff, mf);
for (std::size_t i = 0; i < dct_size; i++)
EXPECT_EQ (iDctC[i], iDctS[i]);
FREE_MEMORY (ff);
FREE_MEMORY (mf);
FREE_MEMORY (iDctC);
FREE_MEMORY (iDctS);
}
void TestWelsQuantFour4x4Max (PQuantizationMaxFunc func) {
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, ff, 8);
ALLOC_MEMORY (int16_t, mf, 8);
@ -340,8 +396,8 @@ TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
}
for (int i = 0; i < 64; i++)
iDctC[i] = iDctS[i] = (rand() & 65535) - 32767;
WelsQuantFour4x4Max_c (iDctC, ff, mf, iMaxC);
WelsQuantFour4x4Max_sse2 (iDctS, ff, mf, iMaxS);
WelsQuantFour4x4MaxAnchor (iDctC, ff, mf, iMaxC);
func (iDctS, ff, mf, iMaxS);
for (int i = 0; i < 64; i++)
EXPECT_EQ (iDctC[i], iDctS[i]);
for (int i = 0; i < 4; i++)
@ -353,6 +409,48 @@ TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
FREE_MEMORY (iMaxC);
FREE_MEMORY (iMaxS);
}
} // anon ns
// Portable C implementations: each test passes the routine to the matching
// shared Test* helper.
TEST (EncodeMbAuxTest, WelsQuant4x4_c) {
TestWelsQuant4x4 (WelsQuant4x4_c);
}
TEST (EncodeMbAuxTest, WelsQuant4x4Dc_c) {
TestWelsQuant4x4Dc (WelsQuant4x4Dc_c);
}
TEST (EncodeMbAuxTest, WelsQuantFour4x4_c) {
TestWelsQuantFour4x4 (WelsQuantFour4x4_c);
}
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_c) {
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_c);
}
// x86 assembly implementations, compiled only when X86_ASM is defined.
#ifdef X86_ASM
// SSE2 variants are run unconditionally.
TEST (EncodeMbAuxTest, WelsQuant4x4_sse2) {
TestWelsQuant4x4 (WelsQuant4x4_sse2);
}
TEST (EncodeMbAuxTest, WelsQuant4x4Dc_sse2) {
TestWelsQuant4x4Dc (WelsQuant4x4Dc_sse2);
}
TEST (EncodeMbAuxTest, WelsQuantFour4x4_sse2) {
TestWelsQuantFour4x4 (WelsQuantFour4x4_sse2);
}
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_sse2);
}
// AVX2 variants are only exercised when the WELS_CPU_AVX2 bit is set in the
// value returned by WelsCPUFeatureDetect; otherwise the test body does nothing
// and the test passes without checking anything.
TEST (EncodeMbAuxTest, WelsQuant4x4_avx2) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2)
TestWelsQuant4x4 (WelsQuant4x4_avx2);
}
TEST (EncodeMbAuxTest, WelsQuant4x4Dc_avx2) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2)
TestWelsQuant4x4Dc (WelsQuant4x4Dc_avx2);
}
TEST (EncodeMbAuxTest, WelsQuantFour4x4_avx2) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2)
TestWelsQuantFour4x4 (WelsQuantFour4x4_avx2);
}
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_avx2) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2)
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_avx2);
}
#endif
int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) {
int16_t pDct[4], s[4];