Rewrite vp8_short_walsh4x4_sse2()

This rewrite reflects the changes made in commit "Improve the
accuracy of forward walsh-hadamard transform". Since this function
is not called often, only a small encoder performance gain (~0.5%)
is seen.

Change-Id: Ie9df58a43028a11fd5b115c4bbe3141f7596578b
This commit is contained in:
Yunqing Wang 2010-10-21 10:26:50 -04:00
parent 4db2076594
commit fc94ffcea4
2 changed files with 121 additions and 79 deletions

View File

@ -21,94 +21,122 @@ sym(vp8_short_walsh4x4_sse2):
push rdi
; end prolog
mov rsi, arg(0)
mov rdi, arg(1)
mov rsi, arg(0) ; input
mov rdi, arg(1) ; output
movsxd rdx, dword ptr arg(2) ; pitch
movdqu xmm4, [rsi + 0] ;ip[4] ip[0]
movdqu xmm0, [rsi + 16] ;ip[12] ip[8]
; first for loop
movq xmm0, MMWORD PTR [rsi] ; load input
movq xmm1, MMWORD PTR [rsi + rdx]
lea rsi, [rsi + rdx*2]
movq xmm2, MMWORD PTR [rsi]
movq xmm3, MMWORD PTR [rsi + rdx]
pxor xmm7, xmm7
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm4 ;ip[4] ip[0]
punpcklwd xmm0, xmm1
punpcklwd xmm2, xmm3
paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa xmm1, xmm0
punpckldq xmm0, xmm2 ; ip[1] ip[0]
punpckhdq xmm1, xmm2 ; ip[3] ip[2]
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
psllw xmm0, 2 ; d1 a1
psllw xmm2, 2 ; c1 b1
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm2 ; b1 a1
punpckhqdq xmm1, xmm2 ; c1 d1
pxor xmm6, xmm6
movq xmm6, xmm0
pxor xmm7, xmm7
pcmpeqw xmm7, xmm6
paddw xmm7, [GLOBAL(c1)]
movdqa xmm2, xmm0
paddw xmm0, xmm1 ; b1+c1 a1+d1
psubw xmm2, xmm1 ; b1-c1 a1-d1
paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
; second for loop
; input: 13 9 5 1 12 8 4 0 (xmm0)
; 14 10 6 2 15 11 7 3 (xmm2)
; after shuffle:
; 13 5 9 1 12 4 8 0 (xmm0)
; 14 6 10 2 15 7 11 3 (xmm1)
pshuflw xmm3, xmm0, 0xd8
pshufhw xmm0, xmm3, 0xd8
pshuflw xmm3, xmm2, 0xd8
pshufhw xmm1, xmm3, 0xd8
movdqa xmm2, xmm0
pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
movdqa xmm3, xmm1
pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
movdqa xmm0, xmm4
punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
movdqa xmm1, xmm6
punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
movdqa xmm2, xmm0
paddd xmm0, xmm4 ; b21 b20 a21 a20
psubd xmm2, xmm4 ; c21 c20 d21 d20
movdqa xmm3, xmm1
paddd xmm1, xmm6 ; b23 b22 a23 a22
psubd xmm3, xmm6 ; c23 c22 d23 d22
pxor xmm4, xmm4
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm3 ;d1 a1
punpckhqdq xmm5, xmm3 ;c1 b1
pcmpgtd xmm4, xmm0
pcmpgtd xmm5, xmm2
pand xmm4, [GLOBAL(cd1)]
pand xmm5, [GLOBAL(cd1)]
movdqa xmm1, xmm5 ;c1 b1
paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm5 ;ip[4] ip[0]
pxor xmm6, xmm6
movdqa xmm7, xmm6
pcmpgtd xmm6, xmm1
pcmpgtd xmm7, xmm3
pand xmm6, [GLOBAL(cd1)]
pand xmm7, [GLOBAL(cd1)]
paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddd xmm0, xmm4
paddd xmm2, xmm5
paddd xmm0, [GLOBAL(cd3)]
paddd xmm2, [GLOBAL(cd3)]
paddd xmm1, xmm6
paddd xmm3, xmm7
paddd xmm1, [GLOBAL(cd3)]
paddd xmm3, [GLOBAL(cd3)]
movdqa xmm6, xmm5
punpcklqdq xmm5, xmm3 ;d1 a1
punpckhqdq xmm6, xmm3 ;c1 b1
psrad xmm0, 3
psrad xmm1, 3
psrad xmm2, 3
psrad xmm3, 3
movdqa xmm4, xmm0
punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
movdqa xmm5, xmm2
punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
movdqa xmm1, xmm6 ;c1 b1
paddw xmm6, xmm5 ;dl+cl a1+b1 aka op[4] op[0]
psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
movdqa xmm0, xmm6 ;aka b2 a2
movdqa xmm1, xmm5 ;aka d2 c2
pcmpgtw xmm0, xmm7
pcmpgtw xmm1, xmm7
psrlw xmm0, 15
psrlw xmm1, 15
paddw xmm6, xmm0
paddw xmm5, xmm1
psraw xmm6, 1
psraw xmm5, 1
; a2 = a1 + b1;
; b2 = c1 + d1;
; c2 = a1 - b1;
; d2 = d1 - c1;
; a2 += (a2>0);
; b2 += (b2>0);
; c2 += (c2>0);
; d2 += (d2>0);
; op[0] = (a2)>>1;
; op[4] = (b2)>>1;
; op[8] = (c2)>>1;
; op[12]= (d2)>>1;
movdqu [rdi + 0], xmm6
movdqu [rdi + 16], xmm5
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm2
; begin epilog
pop rdi
@ -116,3 +144,17 @@ sym(vp8_short_walsh4x4_sse2):
UNSHADOW_ARGS
pop rbp
ret
; Read-only, 16-byte-aligned constant vectors referenced through GLOBAL()
; by vp8_short_walsh4x4_sse2.
SECTION_RODATA
align 16
; c1: eight 16-bit words, each 1.
; Used as a paddw addend and as a pmaddwd multiplier; with pmaddwd every
; adjacent pair of signed words is summed into one 32-bit result.
c1:
dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
align 16
; cn1: eight 16-bit words alternating +1, -1 (0xffff is -1 as a signed word).
; Used as a pmaddwd multiplier; each 32-bit result is the low word of a pair
; minus the high word of that pair.
cn1:
dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
align 16
; cd1: four 32-bit dwords, each 1.
; Used with pand to reduce an all-ones pcmpgtd result to the value 1 per lane.
cd1:
dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
align 16
; cd3: four 32-bit dwords, each 3.
; Added with paddd to the 32-bit sums before they are shifted right
; arithmetically by 3 (psrad) -- presumably the rounding term of the
; reference C transform; confirm against vp8_short_walsh4x4_c.
cd3:
dd 0x00000003, 0x00000003, 0x00000003, 0x00000003

View File

@ -289,7 +289,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c ;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2 ;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;