[Encoder] Add single-block SSE2 4x4 DCT/IDCT routines

We do four blocks at a time when possible, but need to handle
single blocks at a time for intra prediction.

~2.31x speedup over MMX for the DCT on Haswell.
~1.92x speedup over MMX for the IDCT on Haswell.
This commit is contained in:
Sindre Aamås 2016-01-27 13:18:51 +01:00
parent 7486de2844
commit f90960983c
7 changed files with 157 additions and 31 deletions

View File

@ -65,6 +65,7 @@ void WelsDequantFour4x4_sse2 (int16_t* pDct, const uint16_t* kpMF);
void WelsDequantIHadamard4x4_sse2 (int16_t* pRes, const uint16_t kuiMF);
void WelsIDctT4Rec_mmx (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctFourT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
int16_t* pDctDc);

View File

@ -89,6 +89,7 @@ int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
* DCT functions
****************************************************************************/
void WelsDctT4_mmx (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
void WelsDctT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
void WelsDctFourT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
void WelsDctFourT4_avx2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);

View File

@ -266,6 +266,7 @@ void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFl
pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_sse2;
pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_sse2;
pFuncList->pfIDctT4 = WelsIDctT4Rec_sse2;
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_sse2;
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
}

View File

@ -516,6 +516,7 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->pfScan4x4Ac = WelsScan4x4Ac_sse2;
pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2;
pFuncList->pfDctT4 = WelsDctT4_sse2;
pFuncList->pfDctFourT4 = WelsDctFourT4_sse2;
}
//#ifndef MACOS

View File

@ -75,6 +75,12 @@ wels_p0m8000p0m8000w_128:
times 4 dw 0, -8000h
wels_p1p1m1m1w_128:
times 2 dw 1, 1, -1, -1
; Per-word multipliers used with pmullw in SSE2_DCT_4x4P:
; the low four words are scaled by 1, the high four words by 2.
; NOTE(review): this is read as a 16-byte memory operand, so it has to stay
; 16-byte aligned -- confirm the alignment of the constants that precede it.
wels_4xp1w_4xp2w:
dw 1, 1, 1, 1
dw 2, 2, 2, 2
; Per-word multipliers used with pmulhw in SSE2_IDCT_4x4P:
; the low four words multiply by 0, the high four by -8000h, so that
; x + ((x * m) >> 16) yields x for the low words and x >> 1 for the high words.
; NOTE(review): loaded with movdqa, so it has to stay 16-byte aligned --
; confirm the alignment of the constants that precede it.
wels_4xp0w_4xm8000w:
dw 0, 0, 0, 0
dw -8000h, -8000h, -8000h, -8000h
align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
@ -281,6 +287,42 @@ WELS_EXTERN WelsIDctT4Rec_mmx
movq %4, %2
%endmacro
; Load 16 bytes (two rows of four 16-bit values) into an xmm register.
; dst=%1 (xmm register) addr=%2 (address expression; the macro adds the brackets)
; NOTE(review): MOVDQ is defined outside this excerpt -- presumably it selects
; an aligned or unaligned 128-bit move; confirm before relying on alignment.
%macro SSE2_Load2x4P 2
MOVDQ %1, [%2]
%endmacro
; Store 16 bytes (two rows of four 16-bit values) from an xmm register.
; addr=%1 (address expression; the macro adds the brackets) src=%2 (xmm register)
; NOTE(review): MOVDQ is defined outside this excerpt -- presumably it selects
; an aligned or unaligned 128-bit move; confirm before relying on alignment.
%macro SSE2_Store2x4P 2
MOVDQ [%1], %2
%endmacro
; Load two 4-pixel rows from each of two images and produce their difference
; as eight signed 16-bit words:
;   low four words  = pPixel1Line1[0..3] - pPixel2Line1[0..3]
;   high four words = pPixel1Line2[0..3] - pPixel2Line2[0..3]
; %2..%5 are address expressions (the macro adds the brackets).
; %6 must contain all zeros so that punpcklbw zero-extends bytes to words.
; out=%1 pPixel1Line1=%2 pPixel1Line2=%3 pPixel2Line1=%4 pPixel2Line2=%5 zero=%6 clobber=%7,%8
%macro SSE2_LoadDiff2x4P 8
movd %1, [%2] ; 4 bytes of image 1, first row
movd %7, [%3] ; 4 bytes of image 1, second row
punpckldq %1, %7 ; [row1 | row2] as 8 bytes in the low qword
punpcklbw %1, %6 ; widen to 8 unsigned words
movd %7, [%4] ; 4 bytes of image 2, first row (%7 is reused here)
movd %8, [%5] ; 4 bytes of image 2, second row
punpckldq %7, %8 ; [row1 | row2] as 8 bytes in the low qword
punpcklbw %7, %6 ; widen to 8 unsigned words
psubw %1, %7 ; image 1 - image 2, per word
%endmacro
; Reconstruct two 4-pixel rows from eight 16-bit residuals:
; add the rounding term, shift right by 6, add the prediction pixels,
; saturate to unsigned bytes and store 4 bytes to each destination row.
;   low four words of data  -> pRec1, predicted from pPred1
;   high four words of data -> pRec2, predicted from pPred2
; %1, %2, %4, %5 are address expressions (the macro adds the brackets).
; data (%3) is destroyed. zero (%7) must contain all zeros.
; dw32 (%6) is the rounding term added before the shift; callers in this file
; pass a register set up by WELS_DW32 (presumably 32 in every word -- confirm).
; pRec1=%1 pRec2=%2 data=%3 pPred1=%4 pPred2=%5 dw32=%6 zero=%7 clobber=%8,%9
%macro SSE2_StoreDiff2x4P 9
paddw %3, %6 ; add rounding term
psraw %3, 6 ; arithmetic >> 6
movd %8, [%4] ; 4 prediction bytes for the first row
movd %9, [%5] ; 4 prediction bytes for the second row
punpckldq %8, %9 ; [pred row1 | pred row2] as 8 bytes
punpcklbw %8, %7 ; widen prediction to 8 words
paddsw %3, %8 ; residual + prediction (signed saturating)
packuswb %3, %3 ; clamp to 0..255; 8 bytes in the low qword
movd [%1], %3 ; store first row (bytes 0..3)
psrlq %3, 32 ; bring bytes 4..7 down to the low dword
movd [%2], %3 ; store second row
%endmacro
%macro SSE2_Load8DC 6
movdqa %1, %6 ; %1 = dc0 dc1
paddw %1, %5
@ -353,6 +395,43 @@ WELS_EXTERN WelsIDctT4Rec_mmx
paddw %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
%endmacro
; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in 2 xmm registers.
; Uses scrambled input to save a negation.
; Notation: [a,b] means row a in the low qword and row b in the high qword,
; each row being four 16-bit words (one per column).
; Computed per column:
;   y0 = (x0+x3) + (x1+x2)      y1 = 2*(x0-x3) + (x1-x2)
;   y2 = (x0+x3) - (x1+x2)      y3 = (x0-x3) - 2*(x1-x2)
; The multiply by [1,1,1,1,2,2,2,2] doubles only the high-qword terms.
; [y0,y1]=%1 [y2,y3]=%2 [x1,x0]=%1 [x2,x3]=%2 clobber=%3
%macro SSE2_DCT_4x4P 3
movdqa %3, %1
psubw %1, %2 ; [x1-x2,x0-x3]
paddw %2, %3 ; [x1+x2,x0+x3]
movdqa %3, %2
punpckhqdq %2, %1 ; s03 = [x0+x3,x0-x3]
punpcklqdq %3, %1 ; s12 = [x1+x2,x1-x2]
movdqa %1, %2
pmullw %1, [wels_4xp1w_4xp2w] ; [s03[0],2*s03[1]]
paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
pmullw %3, [wels_4xp1w_4xp2w] ; [s12[0],2*s12[1]]
psubw %2, %3 ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]]
%endmacro
; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in 2 xmm registers.
; Output is scrambled to save a negation.
; Notation: [a,b] means row a in the low qword and row b in the high qword,
; each row being four 16-bit words (one per column).
; Computed per column:
;   y0 = (x0+x2) + (x1+(x3>>1))      y1 = (x0-x2) + ((x1>>1)-x3)
;   y2 = (x0-x2) - ((x1>>1)-x3)      y3 = (x0+x2) - (x1+(x3>>1))
; The halving is done without a shift: pmulhw by -8000h gives floor(-x/2),
; and adding x back yields x>>1; the 0 multiplier leaves the low qword as x.
; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4
%macro SSE2_IDCT_4x4P 4
movdqa %4, [wels_4xp0w_4xm8000w]
movdqa %3, %1
pmulhw %3, %4 ; x[0:1] * [0,-8000h] >> 16
pmulhw %4, %2 ; x[2:3] * [0,-8000h] >> 16
paddw %3, %1 ; [x[0],x[1]>>1]
paddw %4, %2 ; [x[2],x[3]>>1]
psubw %3, %2 ; [x[0]-x[2],(x[1]>>1)-x[3]]
paddw %1, %4 ; [x[2]+x[0],(x[3]>>1)+x[1]]
movdqa %2, %3
punpckhqdq %3, %1 ; s13 = [(x[1]>>1)-x[3],(x[3]>>1)+x[1]]
punpcklqdq %2, %1 ; s02 = [x[0]-x[2], x[2]+x[0]]
movdqa %1, %2
paddw %1, %3 ; [y1,y0] = [s02[0]+s13[0],s02[1]+s13[1]]
psubw %2, %3 ; [y2,y3] = [s02[0]-s13[0],s02[1]-s13[1]]
%endmacro
;***********************************************************************
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
@ -454,6 +533,58 @@ WELS_EXTERN WelsIDctFourT4Rec_sse2
; pop ebx
ret
;***********************************************************************
; void WelsDctT4_sse2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
;
; Forward 4x4 transform of a single block: takes the per-pixel difference
; pPixel1 - pPixel2 over 4 rows of 4 pixels, applies the horizontal pass
; (SSE2_DCT_HORIZONTAL) and the vertical pass (SSE2_DCT_4x4P), and writes
; 16 words to pDct.
; Register use as seen below: r0=pDct, r1=pPixel1, r2=iStride1, r3=pPixel2,
; r4=iStride2 (NOTE(review): mapping is done by LOAD_5_PARA, defined outside
; this excerpt -- confirm). Uses xmm0-xmm4; r1 and r3 are advanced by one row.
;***********************************************************************
WELS_EXTERN WelsDctT4_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 5
; strides arrive as 32-bit ints and are used in address arithmetic
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
WELS_Zero xmm2
; xmm0 = [row1,row0] of the difference (row 1 in the low qword), the
; scrambled order expected by SSE2_DCT_4x4P
SSE2_LoadDiff2x4P xmm0, r1+r2, r1, r3+r4, r3, xmm2, xmm3, xmm4
; step both pointers to row 1 so rows 2 and 3 are reachable as
; base+stride and base+2*stride
add r1, r2
add r3, r4
; xmm1 = [row2,row3] of the difference
SSE2_LoadDiff2x4P xmm1, r1+r2, r1+2*r2, r3+r4, r3+2*r4, xmm2, xmm3, xmm4
SSE2_DCT_HORIZONTAL xmm0, xmm3
SSE2_DCT_HORIZONTAL xmm1, xmm3
SSE2_DCT_4x4P xmm0, xmm1, xmm3
; xmm0 = [y0,y1], xmm1 = [y2,y3]: 32 bytes of output in total
SSE2_Store2x4P r0, xmm0
SSE2_Store2x4P r0+16, xmm1
POP_XMM
LOAD_5_PARA_POP
ret
;***********************************************************************
; void WelsIDctT4Rec_sse2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
;
; Inverse 4x4 transform and reconstruction of a single block: loads 16 words
; from pDct, applies the horizontal pass (SSE2_IDCT_HORIZONTAL) and the
; vertical pass (SSE2_IDCT_4x4P), then rounds, shifts by 6, adds the
; prediction pixels and stores 4 rows of 4 clamped bytes to pRec.
; Register use as seen below: r0=pRec, r1=iStride, r2=pPred, r3=iPredStride,
; r4=pDct (NOTE(review): mapping is done by LOAD_5_PARA, defined outside
; this excerpt -- confirm). Uses xmm0-xmm5; r0 and r2 are advanced by one row.
;***********************************************************************
WELS_EXTERN WelsIDctT4Rec_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 6
; strides arrive as 32-bit ints and are used in address arithmetic
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
; xmm0 = coefficients 0..7, xmm1 = coefficients 8..15
SSE2_Load2x4P xmm0, r4
SSE2_Load2x4P xmm1, r4+16
; xmm4 temporarily holds the sign mask for the horizontal pass
movdqa xmm4, [wels_p1m1m1p1w_128]
SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3
SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3
; result is scrambled: xmm0 = [y1,y0], xmm1 = [y2,y3]
SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3
; xmm4 is reused as the zero register, xmm5 as the rounding term
WELS_Zero xmm4
WELS_DW32 xmm5
; low qword of xmm0 is row 1, high qword is row 0, hence the swapped order
SSE2_StoreDiff2x4P r0+r1, r0, xmm0, r2+r3, r2, xmm5, xmm4, xmm2, xmm3
; step both pointers to row 1 so rows 2 and 3 are reachable as
; base+stride and base+2*stride
add r0, r1
add r2, r3
SSE2_StoreDiff2x4P r0+r1, r0+2*r1, xmm1, r2+r3, r2+2*r3, xmm5, xmm4, xmm2, xmm3
POP_XMM
LOAD_5_PARA_POP
ret
%macro SSE2_StoreDiff4x8p 8
SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]

View File

@ -203,7 +203,8 @@ void WelsIDctT4Anchor (uint8_t* p_dst, int16_t dct[16]) {
p_dst[i + iStridex3] = WelsClip1 (uiDst + (clip_t (tmp[i] - tmp[4 + i] + tmp[8 + i] - (tmp[12 + i] >> 1) + 32) >> 6));
}
}
TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) {
template<typename clip_t>
void TestIDctT4Rec (PIDctFunc func) {
int16_t iRefDct[16];
uint8_t iRefDst[16 * FDEC_STRIDE];
ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
@ -215,8 +216,8 @@ TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) {
iPred[i * FDEC_STRIDE + j] = iRefDst[i * FDEC_STRIDE + j] = rand() & 255;
}
}
WelsIDctT4Anchor<int32_t> (iRefDst, iRefDct);
WelsIDctT4Rec_c (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
WelsIDctT4Anchor<clip_t> (iRefDst, iRefDct);
func (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
int ok = -1;
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
@ -228,34 +229,15 @@ TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) {
}
EXPECT_EQ (ok, -1);
}
// Checks the C implementation through the shared TestIDctT4Rec helper,
// with the reference anchor instantiated for int32_t as its clip_t.
TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) {
TestIDctT4Rec<int32_t> (WelsIDctT4Rec_c);
}
#if defined(X86_ASM)
TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmx) {
int32_t iCpuCores = 0;
uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
if (uiCpuFeatureFlag & WELS_CPU_MMXEXT) {
ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
ENFORCE_STACK_ALIGN_1D (uint8_t, iRecC, 16 * FDEC_STRIDE, 16);
ENFORCE_STACK_ALIGN_1D (uint8_t, iRecM, 16 * FDEC_STRIDE, 16);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
iDct[i * 4 + j] = (rand() & ((1 << 12) - 1)) - (1 << 11);
iPred[i * FDEC_STRIDE + j] = rand() & 255;
}
}
WelsIDctT4Rec_c (iRecC, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
WelsIDctT4Rec_mmx (iRecM, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
int ok = -1;
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
if (iRecC[i * FDEC_STRIDE + j] != iRecM[i * FDEC_STRIDE + j]) {
ok = i * 4 + j;
break;
}
}
}
EXPECT_EQ (ok, -1);
}
TestIDctT4Rec<int16_t> (WelsIDctT4Rec_mmx);
}
// Checks the SSE2 implementation through the shared TestIDctT4Rec helper,
// with the reference anchor instantiated for int16_t as its clip_t.
// NOTE(review): no runtime CPU-feature check is made here -- presumably SSE2
// is assumed whenever X86_ASM is defined; confirm.
TEST (DecodeMbAuxTest, WelsIDctT4Rec_sse2) {
TestIDctT4Rec<int16_t> (WelsIDctT4Rec_sse2);
}
#endif
template<typename clip_t>

View File

@ -147,8 +147,10 @@ static void Sub8x8DctAnchor (int16_t iDct[4][4][4], uint8_t* pPix1, uint8_t* pPi
}
static void TestDctT4 (PDctFunc func) {
int16_t iDctRef[4][4];
uint8_t uiPix1[16 * FENC_STRIDE], uiPix2[16 * FDEC_STRIDE];
int16_t iDct[16];
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (uint8_t, uiPix1, 16 * FENC_STRIDE);
ALLOC_MEMORY (uint8_t, uiPix2, 16 * FDEC_STRIDE);
ALLOC_MEMORY (int16_t, iDct, 16);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
uiPix1[i * FENC_STRIDE + j] = rand() & 255;
@ -160,6 +162,9 @@ static void TestDctT4 (PDctFunc func) {
for (int i = 0; i < 4; i++)
for (int j = 0; j < 4; j++)
EXPECT_EQ (iDctRef[j][i], iDct[i * 4 + j]);
FREE_MEMORY (uiPix1);
FREE_MEMORY (uiPix2);
FREE_MEMORY (iDct);
}
static void TestDctFourT4 (PDctFunc func) {
int16_t iDctRef[4][4][4];
@ -195,6 +200,10 @@ TEST (EncodeMbAuxTest, WelsDctT4_mmx) {
TestDctT4 (WelsDctT4_mmx);
}
// Runs the shared single-block DCT check (TestDctT4) on the SSE2 routine.
TEST (EncodeMbAuxTest, WelsDctT4_sse2) {
TestDctT4 (WelsDctT4_sse2);
}
// Runs the shared four-block DCT check (TestDctFourT4) on the SSE2 routine.
TEST (EncodeMbAuxTest, WelsDctFourT4_sse2) {
TestDctFourT4 (WelsDctFourT4_sse2);
}