Merge pull request #2341 from saamas/encoder-x86-dct-opt

[Encoder] x86 DCT optimizations
This commit is contained in:
zhilwang 2016-01-28 10:33:34 +08:00
commit c420d72443
10 changed files with 386 additions and 100 deletions

View File

@ -1,4 +1,5 @@
language: cpp
dist: trusty
compiler:
- g++
@ -6,7 +7,7 @@ compiler:
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq nasm g++-4.6-multilib gcc-multilib libc6-dev-i386
- sudo apt-get install -qq nasm g++-multilib gcc-multilib libc6-dev-i386
install:
- make gmp-bootstrap

View File

@ -50,7 +50,7 @@ Processor Support
Building the Library
--------------------
NASM needed to be installed for assembly code: workable version 2.07 or above, nasm can downloaded from http://www.nasm.us/
NASM needed to be installed for assembly code: workable version 2.10 or above, nasm can downloaded from http://www.nasm.us/
For Mac OSX 64-bit NASM needed to be below version 2.11.08 as nasm 2.11.08 will introduce error when using RIP-relative addresses in Mac OSX 64-bit
To build the arm assembly for Windows Phone, gas-preprocessor is required. It can be downloaded from git://git.libav.org/gas-preprocessor.git

View File

@ -605,8 +605,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
packuswb %1,%1
%endmacro
%macro WELS_DW32_VEX 1
vpcmpeqw %1, %1, %1
vpsrlw %1, %1, 15
vpsllw %1, %1, 5
%endmacro

View File

@ -68,6 +68,7 @@ void WelsIDctT4Rec_mmx (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, in
void WelsIDctFourT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
int16_t* pDctDc);
void WelsIDctFourT4Rec_avx2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
#endif//X86_ASM
#ifdef HAVE_NEON

View File

@ -90,6 +90,7 @@ int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
****************************************************************************/
void WelsDctT4_mmx (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
void WelsDctFourT4_sse2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
void WelsDctFourT4_avx2 (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
/****************************************************************************
* HDM and Quant functions

View File

@ -269,6 +269,9 @@ void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFl
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_sse2;
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
}
if (uiCpuFlag & WELS_CPU_AVX2) {
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_avx2;
}
#endif//X86_ASM
#if defined(HAVE_NEON)

View File

@ -522,6 +522,9 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
if (uiCpuFlag & WELS_CPU_SSSE3) {
pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
}
if (uiCpuFlag & WELS_CPU_AVX2) {
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
}
//#endif//MACOS

View File

@ -42,12 +42,36 @@
%include "asm_inc.asm"
SECTION .rodata align=16
SECTION .rodata align=32
;***********************************************************************
; Constant
;***********************************************************************
align 32
wels_p1m1p1m1w_256:
times 8 dw 1, -1
wels_p1p2p1p2w_256:
times 8 dw 1, 2
wels_rev64w_256:
times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
wels_p1m1m1p1w_256:
times 4 dw 1, -1, -1, 1
wels_p1p1m1m1w_256:
times 4 dw 1, 1, -1, -1
align 16
wels_p1m1p1m1w_128:
times 4 dw 1, -1
wels_p1p2p1p2w_128:
times 4 dw 1, 2
wels_p1m1m1p1w_128:
times 2 dw 1, -1, -1, 1
wels_p0m8000p0m8000w_128:
times 4 dw 0, -8000h
wels_p1p1m1m1w_128:
times 2 dw 1, 1, -1, -1
align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 10, 13, 10, 13, 13, 16, 13, 16,
@ -62,7 +86,6 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 18, 23, 18, 23, 23, 29, 23, 29,
dw 18, 23, 18, 23, 23, 29, 23, 29
;***********************************************************************
; MMX functions
;***********************************************************************
@ -194,12 +217,14 @@ WELS_EXTERN WelsIDctT4Rec_mmx
; SSE2 functions
;***********************************************************************
%macro SSE2_Store4x8p 6
SSE2_XSawp qdq, %2, %3, %6
SSE2_XSawp qdq, %4, %5, %3
MOVDQ [%1+0x00], %2
MOVDQ [%1+0x10], %4
MOVDQ [%1+0x20], %6
MOVDQ [%1+0x30], %3
movlps [%1+0x00], %2
movhps [%1+0x20], %2
movlps [%1+0x08], %3
movhps [%1+0x28], %3
movlps [%1+0x10], %4
movhps [%1+0x30], %4
movlps [%1+0x18], %5
movhps [%1+0x38], %5
%endmacro
%macro SSE2_Load4x8p 6
@ -213,9 +238,9 @@ WELS_EXTERN WelsIDctT4Rec_mmx
%macro SSE2_SumSubMul2 3
movdqa %3, %1
paddw %1, %1
psllw %1, 1
paddw %1, %2
psubw %3, %2
psllw %2, 1
psubw %3, %2
%endmacro
@ -228,14 +253,20 @@ WELS_EXTERN WelsIDctT4Rec_mmx
psubw %4, %3
%endmacro
%macro SSE2_StoreDiff8p 6
paddw %1, %3
%macro SSE2_StoreDiff16p 9
paddw %1, %4
psraw %1, $06
movq %2, %6
punpcklbw %2, %4
paddsw %2, %1
packuswb %2, %2
movq %5, %2
movq %3, %7
punpcklbw %3, %5
paddsw %1, %3
paddw %2, %4
psraw %2, $06
movq %3, %9
punpcklbw %3, %5
paddsw %2, %3
packuswb %1, %2
movlps %6, %1
movhps %8, %1
%endmacro
%macro SSE2_StoreDiff8p 5
@ -284,6 +315,40 @@ WELS_EXTERN WelsIDctT4Rec_mmx
SSE2_SumSub %7, %4, %5
%endmacro
; Do 2 horizontal 4-pt DCTs in parallel packed as 8 words in an xmm register.
; out=%1 in=%1 clobber=%2
%macro SSE2_DCT_HORIZONTAL 2
pshuflw %2, %1, 1bh ; [x[3],x[2],x[1],x[0]] low qw
pmullw %1, [wels_p1m1p1m1w_128] ; [x[0],-x[1],x[2],-x[3], ...]
pshufhw %2, %2, 1bh ; [x[3],x[2],x[1],x[0]] high qw
paddw %1, %2 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
pshufd %2, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
pmullw %1, [wels_p1m1m1p1w_128] ; [s[0],-s[1],-s[2],s[3], ...]
pmullw %2, [wels_p1p2p1p2w_128] ; [s[2],2*s[3],s[0],2*s[1], ...]]
paddw %1, %2 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
%endmacro
; Do 2 horizontal 4-pt IDCTs in parallel packed as 8 words in an xmm register.
;
; Use a multiply by reciprocal to get -x>>1, and x+=-x>>1 to get x>>1, which
; avoids a cumbersome blend with SSE2 to get a vector with right-shifted odd
; elements.
;
; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4
%macro SSE2_IDCT_HORIZONTAL 4
movdqa %3, [wels_p0m8000p0m8000w_128]
pmulhw %3, %1 ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16
pshufd %4, %1, 0b1h ; [x[2],x[3],x[0],x[1], ...]
pmullw %4, %2 ; [x[2],-x[3],-x[0],x[1], ...]
paddw %1, %3 ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...]
paddw %1, %4 ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
pshuflw %3, %1, 1bh ; [s[3],s[2],s[1],s[0]] low qw
pmullw %1, [wels_p1p1m1m1w_128] ; [s[0],s[1],-s[2],-s[3], ...]
pshufhw %3, %3, 1bh ; [s[3],s[2],s[1],s[0]] high qw
pmullw %3, %2 ; [s[3],-s[2],-s[1],s[0], ...]
paddw %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
%endmacro
;***********************************************************************
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
@ -303,11 +368,12 @@ WELS_EXTERN WelsDctFourT4_sse2
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
SSE2_DCT_HORIZONTAL xmm2, xmm5
SSE2_DCT_HORIZONTAL xmm0, xmm5
SSE2_DCT_HORIZONTAL xmm3, xmm5
SSE2_DCT_HORIZONTAL xmm4, xmm5
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
SSE2_Store4x8p r0, xmm2, xmm0, xmm3, xmm4, xmm1
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
@ -321,12 +387,12 @@ WELS_EXTERN WelsDctFourT4_sse2
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
SSE2_DCT_HORIZONTAL xmm2, xmm5
SSE2_DCT_HORIZONTAL xmm0, xmm5
SSE2_DCT_HORIZONTAL xmm3, xmm5
SSE2_DCT_HORIZONTAL xmm4, xmm5
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
SSE2_Store4x8p r0+64, xmm2, xmm0, xmm3, xmm4, xmm1
POP_XMM
LOAD_5_PARA_POP
@ -345,40 +411,39 @@ WELS_EXTERN WelsIDctFourT4Rec_sse2
;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
movdqa xmm7, [wels_p1m1m1p1w_128]
SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6
SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
add r4, 64
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_Load4x8p r4+64, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
movdqa xmm7, [wels_p1m1m1p1w_128]
SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6
SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
POP_XMM
LOAD_5_PARA_POP
; pop esi
@ -502,3 +567,202 @@ WELS_EXTERN WelsHadamardT4Dc_sse2
POP_XMM
ret
;***********************************************************************
; AVX2 functions
;***********************************************************************
; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 zero=%6 clobber=%7,%8
%macro AVX2_LoadDiff16P 8
vmovq x%1, [%2 ]
vpbroadcastq y%7, [%2 + 4 * %3]
vpblendd y%1, y%1, y%7, 11110000b
vpunpcklbw y%1, y%1, y%6
vmovq x%7, [%4 ]
vpbroadcastq y%8, [%4 + 4 * %5]
vpblendd y%7, y%7, y%8, 11110000b
vpunpcklbw y%7, y%7, y%6
vpsubw y%1, y%1, y%7
%endmacro
; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 zero=%8 clobber=%9,%10
%macro AVX2_StoreDiff32P 10
vpaddw y%3, y%3, y%7
vpsraw y%3, y%3, 6
vmovq x%9, [%5 ]
vpbroadcastq y%10, [%5 + 4 * %6]
add %5, %6
vpblendd y%9, y%9, y%10, 11110000b
vpunpcklbw y%9, y%9, y%8
vpaddsw y%3, y%3, y%9
vpaddw y%4, y%4, y%7
vpsraw y%4, y%4, 6
vmovq x%9, [%5 ]
vpbroadcastq y%10, [%5 + 4 * %6]
vpblendd y%9, y%9, y%10, 11110000b
vpunpcklbw y%9, y%9, y%8
vpaddsw y%4, y%4, y%9
vpackuswb y%3, y%3, y%4
vextracti128 x%4, y%3, 1
vmovlps [%1 ], x%3
vmovlps [%1 + 4 * %2], x%4
add %1, %2
vmovhps [%1 ], x%3
vmovhps [%1 + 4 * %2], x%4
%endmacro
; out=%1,%2,%3,%4 pDct=%5 clobber=%6
%macro AVX2_Load4x16P 6
vmovdqa x%2, [%5+0x00]
vinserti128 y%2, y%2, [%5+0x40], 1
vmovdqa x%6, [%5+0x20]
vinserti128 y%6, y%6, [%5+0x60], 1
vpunpcklqdq y%1, y%2, y%6
vpunpckhqdq y%2, y%2, y%6
vmovdqa x%4, [%5+0x10]
vinserti128 y%4, y%4, [%5+0x50], 1
vmovdqa x%6, [%5+0x30]
vinserti128 y%6, y%6, [%5+0x70], 1
vpunpcklqdq y%3, y%4, y%6
vpunpckhqdq y%4, y%4, y%6
%endmacro
; pDct=%1 data=%1,%2,%3,%4 clobber=%5
%macro AVX2_Store4x16P 6
vpunpcklqdq y%6, y%2, y%3
vmovdqa [%1+0x00], x%6
vextracti128 [%1+0x40], y%6, 1
vpunpckhqdq y%6, y%2, y%3
vmovdqa [%1+0x20], x%6
vextracti128 [%1+0x60], y%6, 1
vpunpcklqdq y%6, y%4, y%5
vmovdqa [%1+0x10], x%6
vextracti128 [%1+0x50], y%6, 1
vpunpckhqdq y%6, y%4, y%5
vmovdqa [%1+0x30], x%6
vextracti128 [%1+0x70], y%6, 1
%endmacro
; 4-pt DCT
; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
%macro AVX2_DCT 5
vpsubw %5, %1, %4 ; s3 = x0 - x3
vpaddw %1, %1, %4 ; s0 = x0 + x3
vpsubw %4, %2, %3 ; s2 = x1 - x2
vpaddw %2, %2, %3 ; s1 = x1 + x2
vpsubw %3, %1, %2 ; y2 = s0 - s1
vpaddw %1, %1, %2 ; y0 = s0 + s1
vpsllw %2, %5, 1
vpaddw %2, %2, %4 ; y1 = 2 * s3 + s2
vpsllw %4, %4, 1
vpsubw %4, %5, %4 ; y3 = s3 - 2 * s2
%endmacro
; 4-pt IDCT
; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
%macro AVX2_IDCT 5
vpsraw %5, %2, 1
vpsubw %5, %5, %4 ; t3 = (x1 >> 1) - x3
vpsraw %4, %4, 1
vpaddw %4, %2, %4 ; t2 = x1 + (x3 >> 1)
vpaddw %2, %1, %3 ; t0 = x0 + x2
vpsubw %3, %1, %3 ; t1 = x0 - x2
vpaddw %1, %2, %4 ; y0 = t0 + t2
vpsubw %4, %2, %4 ; y3 = t0 - t2
vpaddw %2, %3, %5 ; y1 = t1 + t3
vpsubw %3, %3, %5 ; y2 = t1 - t3
%endmacro
; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
%macro AVX2_DCT_HORIZONTAL 3
vpsignw %3, %1, [wels_p1m1p1m1w_256] ; [x[0],-x[1],x[2],-x[3], ...]
vpshufb %1, %1, %2 ; [x[3],x[2],x[1],x[0], ...]
vpaddw %1, %1, %3 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
vpmullw %3, %1, [wels_p1m1m1p1w_256] ; [s[0],-s[1],-s[2],s[3], ...]
vpshufd %1, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
vpmullw %1, %1, [wels_p1p2p1p2w_256] ; [s[2],2*s[3],s[0],2*s[1], ...]
vpaddw %1, %1, %3 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
%endmacro
; Do 4 horizontal 4-pt IDCTs in parallel packed as 16 words in a ymm register.
; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
%macro AVX2_IDCT_HORIZONTAL 3
vpsraw %3, %1, 1 ; [x[0]>>1,x[1]>>1,x[2]>>1,x[3]>>1, ...]
vpblendw %3, %1, %3, 10101010b ; [x[0],x[1]>>1,x[2],x[3]>>1, ...]
vpshufd %1, %1, 0b1h ; [x[2],x[3],x[0],x[1], ...]
vpsignw %1, %1, [wels_p1m1m1p1w_256] ; [x[2],-x[3],-x[0],x[1], ...]
vpaddw %1, %3, %1 ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
vpshufb %3, %1, %2 ; [s[3],s[2],s[1],s[0], ...]
vpmullw %1, %1, [wels_p1p1m1m1w_256] ; [s[0],s[1],-s[2],-s[3], ...]
vpmullw %3, %3, [wels_p1m1m1p1w_256] ; [s[3],-s[2],-s[1],s[0], ...]
vpaddw %1, %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
%endmacro
;***********************************************************************
; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
;***********************************************************************
WELS_EXTERN WelsDctFourT4_avx2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 7
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
vpxor ymm6, ymm6, ymm6
;Load 4x16
AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
add r1, r2
add r3, r4
AVX2_LoadDiff16P mm1, r1, r2, r3, r4, mm6, mm4, mm5
add r1, r2
add r3, r4
AVX2_LoadDiff16P mm2, r1, r2, r3, r4, mm6, mm4, mm5
add r1, r2
add r3, r4
AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
vmovdqa ymm6, [wels_rev64w_256]
AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5
AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5
vzeroupper
POP_XMM
LOAD_5_PARA_POP
ret
;***********************************************************************
; void WelsIDctFourT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_avx2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
vmovdqa ymm6, [wels_rev64w_256]
AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
vpxor ymm6, ymm6, ymm6
WELS_DW32_VEX ymm7
AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
add r2, r3
add r0, r1
AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4
vzeroupper
POP_XMM
LOAD_5_PARA_POP
ret

View File

@ -179,6 +179,7 @@ TEST (DecodeMbAuxTest, WelsDequantIHadamard2x2Dc) {
EXPECT_TRUE (ok);
}
#define FDEC_STRIDE 32
template<typename clip_t>
void WelsIDctT4Anchor (uint8_t* p_dst, int16_t dct[16]) {
int16_t tmp[16];
int32_t iStridex2 = (FDEC_STRIDE << 1);
@ -193,13 +194,13 @@ void WelsIDctT4Anchor (uint8_t* p_dst, int16_t dct[16]) {
}
for (i = 0; i < 4; i++) {
uiDst = p_dst[i];
p_dst[i] = WelsClip1 (uiDst + ((tmp[i] + tmp[4 + i] + tmp[8 + i] + (tmp[12 + i] >> 1) + 32) >> 6));
p_dst[i] = WelsClip1 (uiDst + (clip_t (tmp[i] + tmp[4 + i] + tmp[8 + i] + (tmp[12 + i] >> 1) + 32) >> 6));
uiDst = p_dst[i + FDEC_STRIDE];
p_dst[i + FDEC_STRIDE] = WelsClip1 (uiDst + ((tmp[i] + (tmp[4 + i] >> 1) - tmp[8 + i] - tmp[12 + i] + 32) >> 6));
p_dst[i + FDEC_STRIDE] = WelsClip1 (uiDst + (clip_t (tmp[i] + (tmp[4 + i] >> 1) - tmp[8 + i] - tmp[12 + i] + 32) >> 6));
uiDst = p_dst[i + iStridex2];
p_dst[i + iStridex2] = WelsClip1 (uiDst + ((tmp[i] - (tmp[4 + i] >> 1) - tmp[8 + i] + tmp[12 + i] + 32) >> 6));
p_dst[i + iStridex2] = WelsClip1 (uiDst + (clip_t (tmp[i] - (tmp[4 + i] >> 1) - tmp[8 + i] + tmp[12 + i] + 32) >> 6));
uiDst = p_dst[i + iStridex3];
p_dst[i + iStridex3] = WelsClip1 (uiDst + ((tmp[i] - tmp[4 + i] + tmp[8 + i] - (tmp[12 + i] >> 1) + 32) >> 6));
p_dst[i + iStridex3] = WelsClip1 (uiDst + (clip_t (tmp[i] - tmp[4 + i] + tmp[8 + i] - (tmp[12 + i] >> 1) + 32) >> 6));
}
}
TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) {
@ -214,7 +215,7 @@ TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) {
iPred[i * FDEC_STRIDE + j] = iRefDst[i * FDEC_STRIDE + j] = rand() & 255;
}
}
WelsIDctT4Anchor (iRefDst, iRefDct);
WelsIDctT4Anchor<int32_t> (iRefDst, iRefDct);
WelsIDctT4Rec_c (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
int ok = -1;
for (int i = 0; i < 4; i++) {
@ -257,13 +258,15 @@ TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmx) {
}
}
#endif
template<typename clip_t>
void WelsIDctT8Anchor (uint8_t* p_dst, int16_t dct[4][16]) {
WelsIDctT4Anchor (&p_dst[0], dct[0]);
WelsIDctT4Anchor (&p_dst[4], dct[1]);
WelsIDctT4Anchor (&p_dst[4 * FDEC_STRIDE + 0], dct[2]);
WelsIDctT4Anchor (&p_dst[4 * FDEC_STRIDE + 4], dct[3]);
WelsIDctT4Anchor<clip_t> (&p_dst[0], dct[0]);
WelsIDctT4Anchor<clip_t> (&p_dst[4], dct[1]);
WelsIDctT4Anchor<clip_t> (&p_dst[4 * FDEC_STRIDE + 0], dct[2]);
WelsIDctT4Anchor<clip_t> (&p_dst[4 * FDEC_STRIDE + 4], dct[3]);
}
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_c) {
template<typename clip_t>
void TestIDctFourT4Rec (PIDctFunc func) {
int16_t iRefDct[4][16];
uint8_t iRefDst[16 * FDEC_STRIDE];
ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 64, 16);
@ -277,8 +280,8 @@ TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_c) {
for (int j = 0; j < 8; j++)
iPred[i * FDEC_STRIDE + j] = iRefDst[i * FDEC_STRIDE + j] = rand() & 255;
WelsIDctT8Anchor (iRefDst, iRefDct);
WelsIDctFourT4Rec_c (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
WelsIDctT8Anchor<clip_t> (iRefDst, iRefDct);
func (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
int ok = -1;
for (int i = 0; i < 8; i++) {
for (int j = 0; j < 8; j++) {
@ -290,6 +293,9 @@ TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_c) {
}
EXPECT_EQ (ok, -1);
}
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_c) {
TestIDctFourT4Rec<int32_t> (WelsIDctFourT4Rec_c);
}
void WelsIDctRecI16x4DcAnchor (uint8_t* p_dst, int16_t dct[4]) {
for (int i = 0; i < 4; i++, p_dst += FDEC_STRIDE) {
p_dst[0] = WelsClip1 (p_dst[0] + ((dct[0] + 32) >> 6));
@ -345,6 +351,13 @@ TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_c) {
EXPECT_EQ (ok, -1);
}
#if defined(X86_ASM)
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_sse2) {
TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_sse2);
}
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_avx2) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2)
TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_avx2);
}
TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_sse2) {
int32_t iCpuCores = 0;
uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);

View File

@ -1,4 +1,5 @@
#include <gtest/gtest.h>
#include "cpu.h"
#include "ls_defines.h"
#include "encode_mb_aux.h"
#include "wels_common_basis.h"
@ -144,64 +145,63 @@ static void Sub8x8DctAnchor (int16_t iDct[4][4][4], uint8_t* pPix1, uint8_t* pPi
Sub4x4DctAnchor (iDct[2], &pPix1[4 * FENC_STRIDE + 0], &pPix2[4 * FDEC_STRIDE + 0]);
Sub4x4DctAnchor (iDct[3], &pPix1[4 * FENC_STRIDE + 4], &pPix2[4 * FDEC_STRIDE + 4]);
}
TEST (EncodeMbAuxTest, WelsDctT4_c) {
static void TestDctT4 (PDctFunc func) {
int16_t iDctRef[4][4];
uint8_t uiPix1[16 * FENC_STRIDE], uiPix2[16 * FDEC_STRIDE];
int16_t iDct[16];
for (int i = 0; i < 4; i++)
for (int j = 0; j < 4; j++)
uiPix1[i * FENC_STRIDE + j] = uiPix2[i * FDEC_STRIDE + j] = rand() & 255;
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
uiPix1[i * FENC_STRIDE + j] = rand() & 255;
uiPix2[i * FDEC_STRIDE + j] = rand() & 255;
}
}
Sub4x4DctAnchor (iDctRef, uiPix1, uiPix2);
WelsDctT4_c (iDct, uiPix1, FENC_STRIDE, uiPix2, FDEC_STRIDE);
func (iDct, uiPix1, FENC_STRIDE, uiPix2, FDEC_STRIDE);
for (int i = 0; i < 4; i++)
for (int j = 0; j < 4; j++)
EXPECT_EQ (iDctRef[j][i], iDct[i * 4 + j]);
}
TEST (EncodeMbAuxTest, WelsDctFourT4_c) {
static void TestDctFourT4 (PDctFunc func) {
int16_t iDctRef[4][4][4];
uint8_t uiPix1[16 * FENC_STRIDE], uiPix2[16 * FDEC_STRIDE];
int16_t iDct[16 * 4];
for (int i = 0; i < 8; i++)
for (int j = 0; j < 8; j++)
uiPix1[i * FENC_STRIDE + j] = uiPix2[i * FDEC_STRIDE + j] = rand() & 255;
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (uint8_t, uiPix1, 16 * FENC_STRIDE);
ALLOC_MEMORY (uint8_t, uiPix2, 16 * FDEC_STRIDE);
ALLOC_MEMORY (int16_t, iDct, 16 * 4);
for (int i = 0; i < 8; i++) {
for (int j = 0; j < 8; j++) {
uiPix1[i * FENC_STRIDE + j] = rand() & 255;
uiPix2[i * FDEC_STRIDE + j] = rand() & 255;
}
}
Sub8x8DctAnchor (iDctRef, uiPix1, uiPix2);
WelsDctFourT4_c (iDct, uiPix1, FENC_STRIDE, uiPix2, FDEC_STRIDE);
func (iDct, uiPix1, FENC_STRIDE, uiPix2, FDEC_STRIDE);
for (int k = 0; k < 4; k++)
for (int i = 0; i < 4; i++)
for (int j = 0; j < 4; j++)
EXPECT_EQ (iDctRef[k][j][i], iDct[k * 16 + i * 4 + j]);
FREE_MEMORY (uiPix1);
FREE_MEMORY (uiPix2);
FREE_MEMORY (iDct);
}
TEST (EncodeMbAuxTest, WelsDctT4_c) {
TestDctT4 (WelsDctT4_c);
}
TEST (EncodeMbAuxTest, WelsDctFourT4_c) {
TestDctFourT4 (WelsDctFourT4_c);
}
#ifdef X86_ASM
TEST (EncodeMbAuxTest, WelsDctT4_mmx) {
int16_t iDctC[16], iDctM[16];
uint8_t uiPix1[16 * FENC_STRIDE], uiPix2[16 * FDEC_STRIDE];
for (int i = 0; i < 4; i++)
for (int j = 0; j < 4; j++)
uiPix1[i * FENC_STRIDE + j] = uiPix2[i * FDEC_STRIDE + j] = rand() & 255;
WelsDctT4_c (iDctC, uiPix1, FENC_STRIDE, uiPix2, FDEC_STRIDE);
WelsDctT4_mmx (iDctM, uiPix1, FENC_STRIDE, uiPix2, FDEC_STRIDE);
for (int i = 0; i < 16; i++)
EXPECT_EQ (iDctC[i], iDctM[i]);
TestDctT4 (WelsDctT4_mmx);
}
TEST (EncodeMbAuxTest, WelsDctFourT4_sse2) {
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (uint8_t, uiPix1, 16 * FENC_STRIDE);
ALLOC_MEMORY (uint8_t, uiPix2, 16 * FDEC_STRIDE);
ALLOC_MEMORY (int16_t, iDctC, 16 * 4);
ALLOC_MEMORY (int16_t, iDctS, 16 * 4);
for (int i = 0; i < 8; i++)
for (int j = 0; j < 8; j++)
uiPix1[i * FENC_STRIDE + j] = uiPix2[i * FDEC_STRIDE + j] = rand() & 255;
WelsDctFourT4_c (iDctC, uiPix1, FENC_STRIDE, uiPix2, FDEC_STRIDE);
WelsDctFourT4_sse2 (iDctS, uiPix1, FENC_STRIDE, uiPix2, FDEC_STRIDE);
for (int i = 0; i < 64; i++)
EXPECT_EQ (iDctC[i], iDctS[i]);
FREE_MEMORY (uiPix1);
FREE_MEMORY (uiPix2);
FREE_MEMORY (iDctC);
FREE_MEMORY (iDctS);
TestDctFourT4 (WelsDctFourT4_sse2);
}
TEST (EncodeMbAuxTest, WelsDctFourT4_avx2) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2)
TestDctFourT4 (WelsDctFourT4_avx2);
}
TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {