[Encoder] Add AVX2 4x4 quantization routines
WelsQuantFour4x4Max_avx2 (~2.06x speedup over SSE2); WelsQuantFour4x4_avx2 (~2.32x speedup over SSE2); WelsQuant4x4Dc_avx2 (~1.49x speedup over SSE2); WelsQuant4x4_avx2 (~1.42x speedup over SSE2).
This commit is contained in:
parent
1e83bec860
commit
bb49e23719
@ -657,3 +657,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
|
|||||||
vpsrlw %1, %1, 15
|
vpsrlw %1, %1, 15
|
||||||
vpsllw %1, %1, 5
|
vpsllw %1, %1, 5
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
%macro WELS_DW32767_VEX 1
|
||||||
|
vpcmpeqw %1, %1, %1 ; comparing a register with itself sets every bit (each word = 0FFFFh)
|
||||||
|
vpsrlw %1, %1, 1 ; logical shift right by 1: each word of %1 becomes 7FFFh (32767)
|
||||||
|
%endmacro
|
||||||
|
@ -106,6 +106,11 @@ void WelsQuant4x4Dc_sse2 (int16_t* pDct, int16_t iFF, int16_t iMF);
|
|||||||
void WelsQuantFour4x4_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
void WelsQuantFour4x4_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
||||||
void WelsQuantFour4x4Max_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
|
void WelsQuantFour4x4Max_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
|
||||||
|
|
||||||
|
void WelsQuant4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF); // AVX2: quantizes the 16 coefficients at pDct in place
|
||||||
|
void WelsQuant4x4Dc_avx2 (int16_t* pDct, int16_t iFF, int16_t iMF); // AVX2: same, with one scalar iFF/iMF applied to all 16 coefficients
|
||||||
|
void WelsQuantFour4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF); // AVX2: quantizes 64 coefficients (four 4x4 blocks) in place
|
||||||
|
void WelsQuantFour4x4Max_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax); // AVX2: as above, and writes one maximum per block (4 values) to pMax
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_NEON
|
#ifdef HAVE_NEON
|
||||||
|
@ -526,6 +526,11 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
|||||||
if (uiCpuFlag & WELS_CPU_AVX2) {
|
if (uiCpuFlag & WELS_CPU_AVX2) {
|
||||||
pFuncList->pfDctT4 = WelsDctT4_avx2;
|
pFuncList->pfDctT4 = WelsDctT4_avx2;
|
||||||
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
|
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
|
||||||
|
|
||||||
|
pFuncList->pfQuantization4x4 = WelsQuant4x4_avx2;
|
||||||
|
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_avx2;
|
||||||
|
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_avx2;
|
||||||
|
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2;
|
||||||
}
|
}
|
||||||
|
|
||||||
//#endif//MACOS
|
//#endif//MACOS
|
||||||
|
@ -368,3 +368,137 @@ WELS_EXTERN WelsDequantIHadamard4x4_sse2
|
|||||||
punpcklqdq xmm2, xmm3
|
punpcklqdq xmm2, xmm3
|
||||||
MOVDQ [r0+16], xmm2
|
MOVDQ [r0+16], xmm2
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
||||||
|
; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5
|
||||||
|
%macro AVX2_Quant 5
|
||||||
|
vpabsw %2, %1 ; abs_out = |data| per 16-bit word
|
||||||
|
vpor %1, %1, %5 ; ensure non-zero before vpsignw
|
||||||
|
vpaddusw %2, %2, %3 ; abs_out += ff (unsigned, saturating)
|
||||||
|
vpmulhuw %2, %2, %4 ; abs_out = high 16 bits of the unsigned product abs_out * mf
|
||||||
|
vpsignw %1, %2, %1 ; data = abs_out with the sign of the original data; the OR above kept the sign bit, so a zero input keeps +abs_out
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
|
||||||
|
;***********************************************************************
|
||||||
|
; void WelsQuant4x4_avx2(int16_t *pDct, const int16_t *ff, const int16_t *mf);
|
||||||
|
;***********************************************************************
|
||||||
|
|
||||||
|
WELS_EXTERN WelsQuant4x4_avx2 ; quantize the 16 int16 coefficients at [r0] in place
|
||||||
|
%assign push_num 0
|
||||||
|
LOAD_3_PARA ; NOTE(review): presumably maps the arguments to r0=pDct, r1=ff, r2=mf - confirm in the macro definition
|
||||||
|
PUSH_XMM 5 ; NOTE(review): presumably preserves callee-saved XMM registers where the ABI requires it - confirm
|
||||||
|
vbroadcasti128 ymm0, [r1] ; 8 ff words, duplicated into both 128-bit lanes
|
||||||
|
vbroadcasti128 ymm1, [r2] ; 8 mf words, duplicated into both 128-bit lanes
|
||||||
|
WELS_DW32767_VEX ymm2 ; 7FFFh constant passed as the fifth AVX2_Quant operand
|
||||||
|
vmovdqu ymm3, [r0] ; load all 16 coefficients (32 bytes, unaligned load)
|
||||||
|
AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2 ; ymm3 = quantized signed result, ymm4 = scratch (unsigned magnitudes)
|
||||||
|
vmovdqu [r0], ymm3 ; overwrite the input with the quantized coefficients
|
||||||
|
vzeroupper ; zero the upper halves of the ymm registers before returning
|
||||||
|
POP_XMM
|
||||||
|
ret
|
||||||
|
|
||||||
|
|
||||||
|
;***********************************************************************
|
||||||
|
;void WelsQuant4x4Dc_avx2(int16_t *pDct, int16_t ff, int16_t mf);
|
||||||
|
;***********************************************************************
|
||||||
|
|
||||||
|
WELS_EXTERN WelsQuant4x4Dc_avx2 ; quantize the 16 int16 coefficients at [r0] in place with one scalar ff and one scalar mf
|
||||||
|
%assign push_num 0
|
||||||
|
LOAD_1_PARA ; NOTE(review): presumably loads only pDct into r0; ff/mf are read through arg2/arg3 below - confirm
|
||||||
|
PUSH_XMM 5 ; NOTE(review): presumably preserves callee-saved XMM registers where the ABI requires it - confirm
|
||||||
|
%ifidni r1, arg2
|
||||||
|
vmovd xmm0, arg2d ; taken when arg2 is textually the register r1: move its low 32 bits into xmm0
|
||||||
|
vpbroadcastw ymm0, xmm0 ; replicate the low word (ff) into all 16 words
|
||||||
|
%else
|
||||||
|
vpbroadcastw ymm0, arg2 ; otherwise broadcast ff directly from arg2 (presumably a memory operand - confirm)
|
||||||
|
%endif
|
||||||
|
%ifidni r2, arg3
|
||||||
|
vmovd xmm1, arg3d ; same two paths for mf
|
||||||
|
vpbroadcastw ymm1, xmm1 ; replicate the low word (mf) into all 16 words
|
||||||
|
%else
|
||||||
|
vpbroadcastw ymm1, arg3 ; broadcast mf directly from arg3 (presumably a memory operand - confirm)
|
||||||
|
%endif
|
||||||
|
WELS_DW32767_VEX ymm2 ; 7FFFh constant passed as the fifth AVX2_Quant operand
|
||||||
|
vmovdqu ymm3, [r0] ; load all 16 coefficients (32 bytes, unaligned load)
|
||||||
|
AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2 ; ymm3 = quantized signed result, ymm4 = scratch (unsigned magnitudes)
|
||||||
|
vmovdqu [r0], ymm3 ; overwrite the input with the quantized coefficients
|
||||||
|
vzeroupper ; zero the upper halves of the ymm registers before returning
|
||||||
|
POP_XMM
|
||||||
|
ret
|
||||||
|
|
||||||
|
|
||||||
|
;***********************************************************************
|
||||||
|
; void WelsQuantFour4x4_avx2(int16_t *pDct, const int16_t *ff, const int16_t *mf);
|
||||||
|
;***********************************************************************
|
||||||
|
|
||||||
|
WELS_EXTERN WelsQuantFour4x4_avx2 ; quantize 64 int16 coefficients (four 32-byte groups at [r0]) in place
|
||||||
|
%assign push_num 0
|
||||||
|
LOAD_3_PARA ; NOTE(review): presumably maps the arguments to r0=pDct, r1=ff, r2=mf - confirm in the macro definition
|
||||||
|
PUSH_XMM 6 ; NOTE(review): presumably preserves callee-saved XMM registers where the ABI requires it - confirm
|
||||||
|
vbroadcasti128 ymm0, [r1] ; 8 ff words, duplicated into both 128-bit lanes
|
||||||
|
vbroadcasti128 ymm1, [r2] ; 8 mf words, duplicated into both 128-bit lanes
|
||||||
|
WELS_DW32767_VEX ymm4 ; 7FFFh constant passed as the fifth AVX2_Quant operand
|
||||||
|
vmovdqu ymm3, [r0 + 0x00] ; coefficients 0..15
|
||||||
|
vmovdqu ymm5, [r0 + 0x20] ; coefficients 16..31
|
||||||
|
AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4 ; ymm2 is scratch for every AVX2_Quant call in this function
|
||||||
|
vmovdqu [r0 + 0x00], ymm3 ; store quantized coefficients 0..15
|
||||||
|
AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
|
||||||
|
vmovdqu [r0 + 0x20], ymm5 ; store quantized coefficients 16..31
|
||||||
|
vmovdqu ymm3, [r0 + 0x40] ; coefficients 32..47
|
||||||
|
vmovdqu ymm5, [r0 + 0x60] ; coefficients 48..63
|
||||||
|
AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4
|
||||||
|
vmovdqu [r0 + 0x40], ymm3 ; store quantized coefficients 32..47
|
||||||
|
AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
|
||||||
|
vmovdqu [r0 + 0x60], ymm5 ; store quantized coefficients 48..63
|
||||||
|
vzeroupper ; zero the upper halves of the ymm registers before returning
|
||||||
|
POP_XMM
|
||||||
|
ret
|
||||||
|
|
||||||
|
|
||||||
|
;***********************************************************************
|
||||||
|
; void WelsQuantFour4x4Max_avx2(int16_t *pDct, const int16_t *ff, const int16_t *mf, int16_t *max);
|
||||||
|
;***********************************************************************
|
||||||
|
|
||||||
|
WELS_EXTERN WelsQuantFour4x4Max_avx2 ; quantize four 16-coefficient blocks at [r0] in place and store each block's largest magnitude at [r3]
|
||||||
|
%assign push_num 0
|
||||||
|
LOAD_4_PARA ; NOTE(review): presumably maps the arguments to r0=pDct, r1=ff, r2=mf, r3=max - confirm in the macro definition
|
||||||
|
PUSH_XMM 7 ; NOTE(review): presumably preserves callee-saved XMM registers where the ABI requires it - confirm
|
||||||
|
vbroadcasti128 ymm0, [r1] ; 8 ff words, duplicated into both 128-bit lanes
|
||||||
|
vbroadcasti128 ymm1, [r2] ; 8 mf words, duplicated into both 128-bit lanes
|
||||||
|
WELS_DW32767_VEX ymm6 ; 7FFFh constant: AVX2_Quant operand and the bit-flip mask used further down
|
||||||
|
vmovdqu ymm4, [r0 + 0x00] ; block 0 (coefficients 0..15)
|
||||||
|
vmovdqu ymm5, [r0 + 0x20] ; block 1 (coefficients 16..31)
|
||||||
|
AVX2_Quant ymm4, ymm2, ymm0, ymm1, ymm6 ; ymm2 keeps block 0's unsigned magnitudes
|
||||||
|
vmovdqu [r0 + 0x00], ymm4 ; store quantized block 0
|
||||||
|
AVX2_Quant ymm5, ymm3, ymm0, ymm1, ymm6 ; ymm3 keeps block 1's unsigned magnitudes
|
||||||
|
vmovdqu [r0 + 0x20], ymm5 ; store quantized block 1
|
||||||
|
vperm2i128 ymm4, ymm2, ymm3, 00100000b ; ymm4 = { low lane of block 0, low lane of block 1 }
|
||||||
|
vperm2i128 ymm3, ymm2, ymm3, 00110001b ; ymm3 = { high lane of block 0, high lane of block 1 }
|
||||||
|
vpmaxsw ymm2, ymm4, ymm3 ; per-word max of each block's two halves: low lane = block 0, high lane = block 1
|
||||||
|
vmovdqu ymm4, [r0 + 0x40] ; block 2 (coefficients 32..47)
|
||||||
|
vmovdqu ymm5, [r0 + 0x60] ; block 3 (coefficients 48..63)
|
||||||
|
AVX2_Quant ymm4, ymm3, ymm0, ymm1, ymm6 ; ymm3 keeps block 2's unsigned magnitudes
|
||||||
|
vmovdqu [r0 + 0x40], ymm4 ; store quantized block 2; ymm4 is free again after this
|
||||||
|
AVX2_Quant ymm5, ymm4, ymm0, ymm1, ymm6 ; ymm4 keeps block 3's unsigned magnitudes
|
||||||
|
vmovdqu [r0 + 0x60], ymm5 ; store quantized block 3
|
||||||
|
vperm2i128 ymm5, ymm3, ymm4, 00100000b ; ymm5 = { low lane of block 2, low lane of block 3 }
|
||||||
|
vperm2i128 ymm4, ymm3, ymm4, 00110001b ; ymm4 = { high lane of block 2, high lane of block 3 }
|
||||||
|
vpmaxsw ymm3, ymm5, ymm4 ; per-word max: low lane = block 2, high lane = block 3
|
||||||
|
vpxor ymm2, ymm2, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
|
||||||
|
vpxor ymm3, ymm3, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
|
||||||
|
vextracti128 xmm4, ymm2, 1 ; xmm4 = block 1 candidates (high lane of ymm2)
|
||||||
|
vextracti128 xmm5, ymm3, 1 ; xmm5 = block 3 candidates (high lane of ymm3)
|
||||||
|
vphminposuw xmm2, xmm2 ; word 0 = smallest flipped value of block 0, i.e. its flipped maximum
|
||||||
|
vphminposuw xmm3, xmm3 ; same for block 2
|
||||||
|
vphminposuw xmm4, xmm4 ; same for block 1
|
||||||
|
vphminposuw xmm5, xmm5 ; same for block 3
|
||||||
|
vpunpcklwd xmm2, xmm2, xmm4 ; words 0,1 = results for block 0, block 1
|
||||||
|
vpunpcklwd xmm3, xmm3, xmm5 ; words 0,1 = results for block 2, block 3
|
||||||
|
vpunpckldq xmm2, xmm2, xmm3 ; words 0..3 = results for blocks 0, 1, 2, 3 in order
|
||||||
|
vpxor xmm2, xmm2, xmm6 ; restore non-flipped values.
|
||||||
|
vmovq [r3], xmm2 ; store max values.
|
||||||
|
vzeroupper ; zero the upper halves of the ymm registers before returning
|
||||||
|
POP_XMM
|
||||||
|
LOAD_4_PARA_POP ; NOTE(review): presumably undoes whatever LOAD_4_PARA pushed - confirm in the macro definition
|
||||||
|
ret
|
||||||
|
@ -435,6 +435,22 @@ TEST (EncodeMbAuxTest, WelsQuantFour4x4_sse2) {
|
|||||||
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
|
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
|
||||||
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_sse2);
|
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_sse2);
|
||||||
}
|
}
|
||||||
|
TEST (EncodeMbAuxTest, WelsQuant4x4_avx2) {
|
||||||
|
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // only exercise the AVX2 routine when the CPU reports AVX2; otherwise the test body does nothing
|
||||||
|
TestWelsQuant4x4 (WelsQuant4x4_avx2);
|
||||||
|
}
|
||||||
|
TEST (EncodeMbAuxTest, WelsQuant4x4Dc_avx2) {
|
||||||
|
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // only exercise the AVX2 routine when the CPU reports AVX2; otherwise the test body does nothing
|
||||||
|
TestWelsQuant4x4Dc (WelsQuant4x4Dc_avx2);
|
||||||
|
}
|
||||||
|
TEST (EncodeMbAuxTest, WelsQuantFour4x4_avx2) {
|
||||||
|
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // only exercise the AVX2 routine when the CPU reports AVX2; otherwise the test body does nothing
|
||||||
|
TestWelsQuantFour4x4 (WelsQuantFour4x4_avx2);
|
||||||
|
}
|
||||||
|
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_avx2) {
|
||||||
|
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // only exercise the AVX2 routine when the CPU reports AVX2; otherwise the test body does nothing
|
||||||
|
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_avx2);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) {
|
int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) {
|
||||||
int16_t pDct[4], s[4];
|
int16_t pDct[4], s[4];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user