diff --git a/codec/encoder/core/inc/decode_mb_aux.h b/codec/encoder/core/inc/decode_mb_aux.h index d92ce963..662f8f0d 100644 --- a/codec/encoder/core/inc/decode_mb_aux.h +++ b/codec/encoder/core/inc/decode_mb_aux.h @@ -65,6 +65,7 @@ void WelsDequantFour4x4_sse2 (int16_t* pDct, const uint16_t* kpMF); void WelsDequantIHadamard4x4_sse2 (int16_t* pRes, const uint16_t kuiMF); void WelsIDctT4Rec_mmx (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); +void WelsIDctT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); /* SSE2 counterpart of WelsIDctT4Rec_mmx, declared with the identical parameter list */ void WelsIDctFourT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc); diff --git a/codec/encoder/core/inc/encode_mb_aux.h b/codec/encoder/core/inc/encode_mb_aux.h index 693fac97..06fcb581 100644 --- a/codec/encoder/core/inc/encode_mb_aux.h +++ b/codec/encoder/core/inc/encode_mb_aux.h @@ -89,6 +89,7 @@ int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct); * DCT functions ****************************************************************************/ void WelsDctT4_mmx (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); +void WelsDctT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); /* SSE2 counterpart of WelsDctT4_mmx, declared with the identical parameter list */ void WelsDctFourT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); void WelsDctFourT4_avx2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); diff --git a/codec/encoder/core/src/decode_mb_aux.cpp b/codec/encoder/core/src/decode_mb_aux.cpp index 8d3777b0..72d57881 100644 --- a/codec/encoder/core/src/decode_mb_aux.cpp +++ b/codec/encoder/core/src/decode_mb_aux.cpp @@ -266,6 +266,7 @@ void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFl 
pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_sse2; pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_sse2; + pFuncList->pfIDctT4 = WelsIDctT4Rec_sse2; /* single 4x4 IDCT now goes to the SSE2 routine, like the other *_sse2 entries assigned in this block */ pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_sse2; pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2; } diff --git a/codec/encoder/core/src/encode_mb_aux.cpp b/codec/encoder/core/src/encode_mb_aux.cpp index b24f9f87..4b01a5b5 100644 --- a/codec/encoder/core/src/encode_mb_aux.cpp +++ b/codec/encoder/core/src/encode_mb_aux.cpp @@ -516,6 +516,7 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { pFuncList->pfScan4x4Ac = WelsScan4x4Ac_sse2; pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2; + pFuncList->pfDctT4 = WelsDctT4_sse2; /* single 4x4 DCT now goes to the SSE2 routine, like the other *_sse2 entries assigned in this block */ pFuncList->pfDctFourT4 = WelsDctFourT4_sse2; } //#ifndef MACOS diff --git a/codec/encoder/core/x86/dct.asm b/codec/encoder/core/x86/dct.asm index 63d871c1..784c5556 100644 --- a/codec/encoder/core/x86/dct.asm +++ b/codec/encoder/core/x86/dct.asm @@ -75,6 +75,12 @@ wels_p0m8000p0m8000w_128: times 4 dw 0, -8000h wels_p1p1m1m1w_128: times 2 dw 1, 1, -1, -1 +wels_4xp1w_4xp2w: ; eight words: 1,1,1,1,2,2,2,2 (pmullw operand in SSE2_DCT_4x4P) + times 4 dw 1 + times 4 dw 2 +wels_4xp0w_4xm8000w: ; eight words: 0,0,0,0 then -8000h four times (loaded by SSE2_IDCT_4x4P for pmulhw) + times 4 dw 0 + times 4 dw -8000h align 16 SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16, @@ -281,6 +287,42 @@ WELS_EXTERN WelsIDctT4Rec_mmx movq %4, %2 %endmacro +%macro SSE2_Load2x4P 2 ; %1 = register, %2 = address; expands to one MOVDQ load + MOVDQ %1, [%2] +%endmacro + +%macro SSE2_Store2x4P 2 ; %1 = address, %2 = register; expands to one MOVDQ store + MOVDQ [%1], %2 +%endmacro + +; out=%1 pPixel1Line1=%2 pPixel1Line2=%3 pPixel2Line1=%4 pPixel2Line2=%5 zero=%6 clobber=%7,%8 +%macro SSE2_LoadDiff2x4P 8 + movd %1, [%2] + movd %7, [%3] + punpckldq %1, %7 ; %1 low dword = pPixel1Line1 bytes, next dword = pPixel1Line2 bytes + punpcklbw %1, %6 ; interleave with the zero register: eight bytes become eight words + movd %7, [%4] + movd %8, [%5] + punpckldq %7, %8 + punpcklbw %7, %6 ; same widening for the two pPixel2 lines + psubw %1, %7 ; out = pPixel1 words minus pPixel2 words +%endmacro + +; pRec1=%1 pRec2=%2 data=%3 pPred1=%4 pPred2=%5 dw32=%6 zero=%7 clobber=%8,%9 +%macro SSE2_StoreDiff2x4P 9 + paddw %3, %6 + psraw %3, 6 ; data = (data plus dw32) >> 6, arithmetic shift + movd %8, [%4] + movd %9, [%5] + punpckldq %8, %9 + punpcklbw %8, %7 ; widen the eight prediction bytes to words + paddsw %3, %8 ; add the prediction with signed saturation + packuswb %3, %3 ; pack the words to bytes with unsigned saturation + 
movd [%1], %3 + psrlq %3, 32 + movd [%2], %3 +%endmacro + %macro SSE2_Load8DC 6 movdqa %1, %6 ; %1 = dc0 dc1 paddw %1, %5 @@ -353,6 +395,43 @@ WELS_EXTERN WelsIDctT4Rec_mmx paddw %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...] %endmacro +; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in 2 xmm registers. +; Uses scrambled input to save a negation. +; [y0,y1]=%1 [y2,y3]=%2 [x1,x0]=%1 [x2,x3]=%2 clobber=%3 +%macro SSE2_DCT_4x4P 3 + movdqa %3, %1 ; %3 = copy of [x1,x0] so %1 can be overwritten by the subtraction + psubw %1, %2 ; [x1-x2,x0-x3] + paddw %2, %3 ; [x1+x2,x0+x3] + movdqa %3, %2 ; %3 = copy of the sums, needed because both qword shuffles read them + punpckhqdq %2, %1 ; s03 = [x0+x3,x0-x3] + punpcklqdq %3, %1 ; s12 = [x1+x2,x1-x2] + movdqa %1, %2 ; %1 = copy of s03, scaled next while %2 keeps the unscaled s03 + pmullw %1, [wels_4xp1w_4xp2w] ; [s03[0],2*s03[1]] + paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]] + pmullw %3, [wels_4xp1w_4xp2w] ; [s12[0],2*s12[1]] + psubw %2, %3 ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]] +%endmacro + +; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in 2 xmm registers. +; Output is scrambled to save a negation. 
+; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4 +%macro SSE2_IDCT_4x4P 4 + movdqa %4, [wels_4xp0w_4xm8000w] ; %4 = words 0,0,0,0 then -8000h four times + movdqa %3, %1 ; work on a copy so %1 still holds [x0,x1] for the additions below + pmulhw %3, %4 ; x[0:1] * [0,-8000h] >> 16 + pmulhw %4, %2 ; x[2:3] * [0,-8000h] >> 16 + paddw %3, %1 ; [x[0],x[1]>>1] + paddw %4, %2 ; [x[2],x[3]>>1] + psubw %3, %2 ; [x[0]-x[2],(x[1]>>1)-x[3]] + paddw %1, %4 ; [x[2]+x[0],(x[3]>>1)+x[1]] + movdqa %2, %3 ; copy, because both qword shuffles below need this value + punpckhqdq %3, %1 ; s13 = [(x[1]>>1)-x[3],(x[3]>>1)+x[1]] + punpcklqdq %2, %1 ; s02 = [x[0]-x[2], x[2]+x[0]] + movdqa %1, %2 ; %1 = copy of s02, so that sum and difference can both be formed + paddw %1, %3 ; [y1,y0] = [s02[0]+s13[0],s02[1]+s13[1]] + psubw %2, %3 ; [y2,y3] = [s02[0]-s13[0],s02[1]-s13[1]] +%endmacro + ;*********************************************************************** ; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 ) ;*********************************************************************** @@ -454,6 +533,58 @@ WELS_EXTERN WelsIDctFourT4Rec_sse2 ; pop ebx ret +;*********************************************************************** +; void WelsDctT4_sse2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) +;*********************************************************************** +WELS_EXTERN WelsDctT4_sse2 + %assign push_num 0 + LOAD_5_PARA ; NOTE(review): presumably binds r0..r4 to the five arguments in prototype order; the macro is defined elsewhere, confirm there + PUSH_XMM 5 ; xmm0..xmm4 are the registers referenced below + SIGN_EXTENSION r2, r2d + SIGN_EXTENSION r4, r4d + + WELS_Zero xmm2 ; xmm2 is handed to SSE2_LoadDiff2x4P as its zero operand + SSE2_LoadDiff2x4P xmm0, r1+r2, r1, r3+r4, r3, xmm2, xmm3, xmm4 ; xmm0 = difference words, second row in the low half and first row in the high half, i.e. the [x1,x0] order SSE2_DCT_4x4P expects + add r1, r2 ; step r1 forward by r2 (one row) + add r3, r4 ; step r3 forward by r4 (one row) + SSE2_LoadDiff2x4P xmm1, r1+r2, r1+2*r2, r3+r4, r3+2*r4, xmm2, xmm3, xmm4 ; xmm1 = difference words of the third and fourth rows, [x2,x3] + SSE2_DCT_HORIZONTAL xmm0, xmm3 + SSE2_DCT_HORIZONTAL xmm1, xmm3 + SSE2_DCT_4x4P xmm0, xmm1, xmm3 ; vertical pass over both registers, xmm3 is scratch + SSE2_Store2x4P r0, xmm0 ; [y0,y1] to the 16 bytes at r0 + SSE2_Store2x4P r0+16, xmm1 ; [y2,y3] to the following 16 bytes + + POP_XMM + LOAD_5_PARA_POP + ret + +;*********************************************************************** +; void WelsIDctT4Rec_sse2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct); +;*********************************************************************** 
+WELS_EXTERN WelsIDctT4Rec_sse2 + %assign push_num 0 + LOAD_5_PARA + PUSH_XMM 6 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + + SSE2_Load2x4P xmm0, r4 + SSE2_Load2x4P xmm1, r4+16 + movdqa xmm4, [wels_p1m1m1p1w_128] + SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3 + SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3 + SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3 + WELS_Zero xmm4 + WELS_DW32 xmm5 + SSE2_StoreDiff2x4P r0+r1, r0, xmm0, r2+r3, r2, xmm5, xmm4, xmm2, xmm3 + add r0, r1 + add r2, r3 + SSE2_StoreDiff2x4P r0+r1, r0+2*r1, xmm1, r2+r3, r2+2*r3, xmm5, xmm4, xmm2, xmm3 + + POP_XMM + LOAD_5_PARA_POP + ret + %macro SSE2_StoreDiff4x8p 8 SSE2_StoreDiff8p %1, %3, %4, [%5], [%6] SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8] diff --git a/test/encoder/EncUT_DecodeMbAux.cpp b/test/encoder/EncUT_DecodeMbAux.cpp index c18dae8e..21cc96a9 100644 --- a/test/encoder/EncUT_DecodeMbAux.cpp +++ b/test/encoder/EncUT_DecodeMbAux.cpp @@ -203,7 +203,8 @@ void WelsIDctT4Anchor (uint8_t* p_dst, int16_t dct[16]) { p_dst[i + iStridex3] = WelsClip1 (uiDst + (clip_t (tmp[i] - tmp[4 + i] + tmp[8 + i] - (tmp[12 + i] >> 1) + 32) >> 6)); } } -TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) { +template /* NOTE(review): no template parameter list follows this keyword, and the TestIDctT4Rec / WelsIDctT4Anchor calls below carry no template arguments; they look stripped together with their angle brackets, confirm against the original patch */ +void TestIDctT4Rec (PIDctFunc func) { /* shared test body; func is the IDCT-and-reconstruct routine under test, so the C, MMX and SSE2 cases reuse one implementation */ int16_t iRefDct[16]; uint8_t iRefDst[16 * FDEC_STRIDE]; ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16); @@ -215,8 +216,8 @@ TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) { iPred[i * FDEC_STRIDE + j] = iRefDst[i * FDEC_STRIDE + j] = rand() & 255; } } - WelsIDctT4Anchor (iRefDst, iRefDct); - WelsIDctT4Rec_c (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct); + WelsIDctT4Anchor (iRefDst, iRefDct); /* reference path, writes its result into iRefDst */ + func (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct); /* routine under test, replaces the hard-coded WelsIDctT4Rec_c call */ int ok = -1; for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { @@ -228,34 +229,15 @@ TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) { } EXPECT_EQ (ok, -1); } +TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) { /* same test name as before, now a thin wrapper over the shared body */ + TestIDctT4Rec (WelsIDctT4Rec_c); +} #if defined(X86_ASM) TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmx) { - int32_t iCpuCores = 0; - uint32_t 
uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); - if (uiCpuFeatureFlag & WELS_CPU_MMXEXT) { - ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16); - ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16); - ENFORCE_STACK_ALIGN_1D (uint8_t, iRecC, 16 * FDEC_STRIDE, 16); - ENFORCE_STACK_ALIGN_1D (uint8_t, iRecM, 16 * FDEC_STRIDE, 16); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - iDct[i * 4 + j] = (rand() & ((1 << 12) - 1)) - (1 << 11); - iPred[i * FDEC_STRIDE + j] = rand() & 255; - } - } - WelsIDctT4Rec_c (iRecC, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct); - WelsIDctT4Rec_mmx (iRecM, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct); - int ok = -1; - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - if (iRecC[i * FDEC_STRIDE + j] != iRecM[i * FDEC_STRIDE + j]) { - ok = i * 4 + j; - break; - } - } - } - EXPECT_EQ (ok, -1); - } + TestIDctT4Rec (WelsIDctT4Rec_mmx); /* NOTE(review): the removed body ran only when WelsCPUFeatureDetect reported WELS_CPU_MMXEXT; this call has no such runtime guard, only the surrounding X86_ASM preprocessor check */ +} +TEST (DecodeMbAuxTest, WelsIDctT4Rec_sse2) { /* new case: the SSE2 routine through the shared TestIDctT4Rec body, likewise without a runtime CPU-feature check */ + TestIDctT4Rec (WelsIDctT4Rec_sse2); } #endif template diff --git a/test/encoder/EncUT_EncoderMbAux.cpp b/test/encoder/EncUT_EncoderMbAux.cpp index 49b300b3..137179c1 100644 --- a/test/encoder/EncUT_EncoderMbAux.cpp +++ b/test/encoder/EncUT_EncoderMbAux.cpp @@ -147,8 +147,10 @@ static void Sub8x8DctAnchor (int16_t iDct[4][4][4], uint8_t* pPix1, uint8_t* pPi } static void TestDctT4 (PDctFunc func) { int16_t iDctRef[4][4]; - uint8_t uiPix1[16 * FENC_STRIDE], uiPix2[16 * FDEC_STRIDE]; - int16_t iDct[16]; + CMemoryAlign cMemoryAlign (0); /* NOTE(review): presumably the allocator object that ALLOC_MEMORY and FREE_MEMORY refer to; confirm in the macro definitions */ + ALLOC_MEMORY (uint8_t, uiPix1, 16 * FENC_STRIDE); /* the three buffers replace the plain stack arrays removed above, same names and element counts */ + ALLOC_MEMORY (uint8_t, uiPix2, 16 * FDEC_STRIDE); + ALLOC_MEMORY (int16_t, iDct, 16); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { uiPix1[i * FENC_STRIDE + j] = rand() & 255; @@ -160,6 +162,9 @@ static void TestDctT4 (PDctFunc func) { for (int i = 0; i < 4; i++) for (int j = 0; j < 4; j++) EXPECT_EQ (iDctRef[j][i], iDct[i * 4 + j]); + FREE_MEMORY (uiPix1); /* one FREE_MEMORY per ALLOC_MEMORY above */ + FREE_MEMORY (uiPix2); + FREE_MEMORY (iDct); } static void TestDctFourT4 
(PDctFunc func) { int16_t iDctRef[4][4][4]; @@ -195,6 +200,10 @@ TEST (EncodeMbAuxTest, WelsDctT4_mmx) { TestDctT4 (WelsDctT4_mmx); } +TEST (EncodeMbAuxTest, WelsDctT4_sse2) { /* mirrors the WelsDctT4_mmx case above, passing the SSE2 routine to the same TestDctT4 body */ + TestDctT4 (WelsDctT4_sse2); +} + TEST (EncodeMbAuxTest, WelsDctFourT4_sse2) { TestDctFourT4 (WelsDctFourT4_sse2); }