[Common/x86] DeblockChromaEq4H_ssse3 optimizations

Use packed 8-bit operations rather than unpacking to 16-bit.

~5.80x speedup on Haswell (x86-64).
~1.69x speedup on Haswell (x86 32-bit).
This commit is contained in:
Sindre Aamås 2016-02-25 16:00:26 +01:00
parent 9909c306f1
commit a009153741

View File

@ -509,6 +509,32 @@ WELS_EXTERN DeblockLumaEq4V_ssse3
SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
%endmacro
; Eq4 chroma deblocking filter on packed unsigned bytes (one sample per byte).
; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9
; In:  %1..%4 = xmm registers holding p1, p0, q0, q1.
;      %5     = xmm register compared bytewise against |p0 - q0| (iAlpha).
;      %6     = operand read with movd; its low byte is broadcast as iBeta.
; Out: %2 = p0out, %3 = q0out.
; Note: in addition to %7,%8,%9 this macro overwrites %5 (reloaded with iBeta,
;       then passed to WELS_DB1) and %1 (ends up holding q0').
;       NOTE(review): %4 is only used as a source operand here; it stays intact
;       provided the helper macros do not modify their second operand - confirm.
%macro SSSE3_DeblockChromaEq4 9
movdqa %7, %3
SSE2_AbsDiffUB %7, %2, %8 ; |p0 - q0|
SSE2_CmpgeUB %7, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
; iAlpha is no longer needed from here on, so %5 is reused as a scratch register.
movdqa %8, %4
SSE2_AbsDiffUB %8, %3, %5 ; |q1 - q0|
movdqa %9, %1
SSE2_AbsDiffUB %9, %2, %5 ; |p1 - p0|
pmaxub %8, %9 ; max(|q1 - q0|, |p1 - p0|)
; pshufb with an all-zero control vector replicates byte 0 into every byte.
pxor %9, %9
movd %5, %6
pshufb %5, %9 ; iBeta
SSE2_CmpgeUB %8, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
por %7, %8 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
; NOTE(review): WELS_DB1 is defined elsewhere; presumably it fills %5 with
; bytes of value 1 for use as the third operand of SSE2_AvgbFloor1 - confirm.
WELS_DB1 %5
movdqa %8, %2
SSE2_AvgbFloor1 %8, %4, %5, %9 ; (p0 + q1) >> 1
pavgb %8, %1 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
; Keep a second copy of the (negated) mask for the q0 blend below.
; NOTE(review): presumably SSE2_Blend destroys its mask operand - confirm.
movdqa %9, %7
SSE2_Blend %2, %8, %7 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
; %7 is reused as the scratch operand; %1 (p1) is overwritten with q0'.
SSE2_AvgbFloor1 %1, %3, %5, %7 ; (q0 + p1) >> 1
pavgb %1, %4 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
SSE2_Blend %3, %1, %9 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
%endmacro
;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@ -572,42 +598,15 @@ WELS_EXTERN DeblockChromaEq4V_ssse3
movhps xmm0, [r1 + 0 * r2] ; q0 cr
movq xmm2, [r0 + 1 * r3] ; p0 cb
movhps xmm2, [r1 + 1 * r3] ; p0 cr
movdqa xmm4, xmm0
SSE2_AbsDiffUB xmm4, xmm2, xmm5 ; |p0 - q0|
SSE2_CmpgeUB xmm4, xmm7 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
movq xmm1, [r0 + 1 * r2] ; q1 cb
movhps xmm1, [r1 + 1 * r2] ; q1 cr
movq xmm3, [r0 + 2 * r3] ; p1 cb
movhps xmm3, [r1 + 2 * r3] ; p1 cr
movdqa xmm5, xmm1
SSE2_AbsDiffUB xmm5, xmm0, xmm7 ; |q1 - q0|
movdqa xmm6, xmm3
SSE2_AbsDiffUB xmm6, xmm2, xmm7 ; |p1 - p0|
pmaxub xmm5, xmm6 ; max(|q1 - q0|, |p1 - p0|)
SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6
pxor xmm6, xmm6
movd xmm7, arg5d
pshufb xmm7, xmm6 ; iBeta
SSE2_CmpgeUB xmm5, xmm7 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
por xmm4, xmm5 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
WELS_DB1 xmm7
movdqa xmm5, xmm2
SSE2_AvgbFloor1 xmm2, xmm1, xmm7, xmm6 ; (p0 + q1) >> 1
pavgb xmm2, xmm3 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
movdqa xmm6, xmm4
SSE2_Blend xmm5, xmm2, xmm4 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
SSE2_AvgbFloor1 xmm3, xmm0, xmm7, xmm4 ; (q0 + p1) >> 1
pavgb xmm3, xmm1 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
SSE2_Blend xmm0, xmm3, xmm6 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
movlps [r0 + 1 * r3], xmm5 ; store p0 cb
movhps [r1 + 1 * r3], xmm5 ; store p0 cr
movlps [r0 + 1 * r3], xmm2 ; store p0 cb
movhps [r1 + 1 * r3], xmm2 ; store p0 cr
movlps [r0 + 0 * r2], xmm0 ; store q0 cb
movhps [r1 + 0 * r2], xmm0 ; store q0 cr
@ -640,834 +639,36 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
ret
%ifdef WIN64
;***************************************************************************
; DeblockChromaEq4H_ssse3, WIN64 variant working on 16-bit lanes.
; Signature (as documented for this function elsewhere in this file):
;   void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
;                                int32_t iStride, int32_t iAlpha, int32_t iBeta)
; Register use as read from the code: rcx = pPixCb, rdx = pPixCr (copied to
; rdi), r8d = iStride, r9w = iAlpha; iBeta is read from the stack.
; Outline: gather 4 bytes (columns -2..+1) from 8 rows of each plane, transpose
; them into one register per column, widen to words, build the filter masks,
; compute the filtered p0/q0, pack, transpose back and write 4 bytes per row.
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
; Prologue: rbx is saved at [entry rsp + 20h], rdi is pushed, PUSH_XMM 16 is
; invoked (macro defined elsewhere) and 140h bytes of scratch are reserved.
mov rax,rsp
mov [rax+20h],rbx
push rdi
PUSH_XMM 16
sub rsp,140h
; Row pointers: rdi = pPixCr, r11 = pPixCb + 4*stride - 2,
; rbx = pPixCr + 4*stride - 2, r10 = stride, rdx = 3*stride.
; Each row dword is bounced through the stack scratch area; only the low
; dword of each movdqa load is consumed by the following punpckldq.
mov rdi,rdx
lea eax,[r8*4]
movsxd r10,eax
mov eax,[rcx-2]
mov [rsp+10h],eax
lea rbx,[r10+rdx-2]
lea r11,[r10+rcx-2]
movdqa xmm5,[rsp+10h]
movsxd r10,r8d
mov eax,[r10+rcx-2]
lea rdx,[r10+r10*2]
mov [rsp+20h],eax
mov eax,[rcx+r10*2-2]
mov [rsp+30h],eax
mov eax,[rdx+rcx-2]
movdqa xmm2,[rsp+20h]
mov [rsp+40h],eax
mov eax, [rdi-2]
movdqa xmm4,[rsp+30h]
mov [rsp+50h],eax
mov eax,[r10+rdi-2]
movdqa xmm3,[rsp+40h]
mov [rsp+60h],eax
mov eax,[rdi+r10*2-2]
punpckldq xmm5,[rsp+50h]
mov [rsp+70h],eax
mov eax, [rdx+rdi-2]
punpckldq xmm2, [rsp+60h]
mov [rsp+80h],eax
mov eax,[r11]
punpckldq xmm4, [rsp+70h]
mov [rsp+50h],eax
mov eax,[rbx]
punpckldq xmm3,[rsp+80h]
mov [rsp+60h],eax
mov eax,[r10+r11]
movdqa xmm0, [rsp+50h]
punpckldq xmm0, [rsp+60h]
punpcklqdq xmm5,xmm0
movdqa [rsp+50h],xmm0
mov [rsp+50h],eax
mov eax,[r10+rbx]
movdqa xmm0,[rsp+50h]
movdqa xmm1,xmm5
mov [rsp+60h],eax
mov eax,[r11+r10*2]
punpckldq xmm0, [rsp+60h]
punpcklqdq xmm2,xmm0
punpcklbw xmm1,xmm2
punpckhbw xmm5,xmm2
movdqa [rsp+50h],xmm0
mov [rsp+50h],eax
mov eax,[rbx+r10*2]
movdqa xmm0,[rsp+50h]
mov [rsp+60h],eax
mov eax, [rdx+r11]
movdqa xmm15,xmm1
punpckldq xmm0,[rsp+60h]
punpcklqdq xmm4,xmm0
movdqa [rsp+50h],xmm0
mov [rsp+50h],eax
mov eax, [rdx+rbx]
movdqa xmm0,[rsp+50h]
mov [rsp+60h],eax
punpckldq xmm0, [rsp+60h]
punpcklqdq xmm3,xmm0
; Byte/word/dword/qword interleaves transpose the gathered rows so that
; xmm11, xmm12, xmm14, xmm15 each hold one byte column (used below as
; p1, p0, q0, q1 respectively).
movdqa xmm0,xmm4
punpcklbw xmm0,xmm3
punpckhbw xmm4,xmm3
punpcklwd xmm15,xmm0
punpckhwd xmm1,xmm0
movdqa xmm0,xmm5
movdqa xmm12,xmm15
punpcklwd xmm0,xmm4
punpckhwd xmm5,xmm4
punpckldq xmm12,xmm0
punpckhdq xmm15,xmm0
movdqa xmm0,xmm1
movdqa xmm11,xmm12
punpckldq xmm0,xmm5
punpckhdq xmm1,xmm5
punpcklqdq xmm11,xmm0
punpckhqdq xmm12,xmm0
; iAlpha (r9w) and iBeta are broadcast to all word lanes (xmm13/xmm6 and xmm3);
; the four columns are zero-extended to words (xmm1 = 0), low and high halves
; in separate registers. The original xmm11 is kept at [rsp] for the store.
movsx eax,r9w
movdqa xmm14,xmm15
punpcklqdq xmm14,xmm1
punpckhqdq xmm15,xmm1
pxor xmm1,xmm1
movd xmm0,eax
movdqa xmm4,xmm12
movdqa xmm8,xmm11
movsx eax,word [rsp+170h + 160] ; iBeta
punpcklwd xmm0,xmm0
punpcklbw xmm4,xmm1
punpckhbw xmm12,xmm1
movdqa xmm9,xmm14
movdqa xmm7,xmm15
movdqa xmm10,xmm15
pshufd xmm13,xmm0,0
punpcklbw xmm9,xmm1
punpckhbw xmm14,xmm1
movdqa xmm6,xmm13
movd xmm0,eax
movdqa [rsp],xmm11
; eax = 2: rounding constant, broadcast into xmm3 further below.
mov eax,2
cwde
punpckhbw xmm11,xmm1
punpckhbw xmm10,xmm1
punpcklbw xmm7,xmm1
punpcklwd xmm0,xmm0
punpcklbw xmm8,xmm1
pshufd xmm3,xmm0,0
; Filter masks: xmm6 (low half) and xmm13 (high half) =
;   (iAlpha > |p0 - q0|) & (iBeta > |p1 - p0|) & (iBeta > |q1 - q0|).
; The sums 2*p1 + p0 + q1 (xmm1 low, xmm2 high) are accumulated in between.
movdqa xmm1,xmm8
movdqa xmm0,xmm4
psubw xmm0,xmm9
psubw xmm1,xmm4
movdqa xmm2,xmm3
pabsw xmm0,xmm0
pcmpgtw xmm6,xmm0
pabsw xmm0,xmm1
movdqa xmm1,xmm3
pcmpgtw xmm2,xmm0
pand xmm6,xmm2
movdqa xmm0,xmm7
movdqa xmm2,xmm3
psubw xmm0,xmm9
pabsw xmm0,xmm0
pcmpgtw xmm1,xmm0
pand xmm6,xmm1
movdqa xmm0,xmm12
movdqa xmm1,xmm11
psubw xmm0,xmm14
psubw xmm1,xmm12
movdqa xmm5,xmm6
pabsw xmm0,xmm0
pcmpgtw xmm13,xmm0
pabsw xmm0,xmm1
movdqa xmm1,xmm8
pcmpgtw xmm2,xmm0
paddw xmm1,xmm8
movdqa xmm0,xmm10
pand xmm13,xmm2
psubw xmm0,xmm14
paddw xmm1,xmm4
movdqa xmm2,xmm11
pabsw xmm0,xmm0
paddw xmm2,xmm11
paddw xmm1,xmm7
pcmpgtw xmm3,xmm0
paddw xmm2,xmm12
movd xmm0,eax
pand xmm13,xmm3
paddw xmm2,xmm10
punpcklwd xmm0,xmm0
pshufd xmm3,xmm0,0
; p0' = (2*p1 + p0 + q1 + 2) >> 2 and q0' = (2*q1 + q0 + p1 + 2) >> 2,
; selected per lane by the masks (pand/pandn/por) and packed back to bytes:
; xmm5 = new p0 column, xmm1 = new q0 column.
movdqa xmm0,xmm6
paddw xmm1,xmm3
pandn xmm0,xmm4
paddw xmm2,xmm3
psraw xmm1,2
pand xmm5,xmm1
por xmm5,xmm0
paddw xmm7,xmm7
paddw xmm10,xmm10
psraw xmm2,2
movdqa xmm1,xmm13
movdqa xmm0,xmm13
pandn xmm0,xmm12
pand xmm1,xmm2
paddw xmm7,xmm9
por xmm1,xmm0
paddw xmm10,xmm14
paddw xmm7,xmm8
movdqa xmm0,xmm13
packuswb xmm5,xmm1
paddw xmm7,xmm3
paddw xmm10,xmm11
movdqa xmm1,xmm6
paddw xmm10,xmm3
pandn xmm6,xmm9
psraw xmm7,2
pand xmm1,xmm7
psraw xmm10,2
pandn xmm13,xmm14
pand xmm0,xmm10
por xmm1,xmm6
; Reload the saved first column and interleave the four columns back into
; row order.
movdqa xmm6,[rsp]
movdqa xmm4,xmm6
por xmm0,xmm13
punpcklbw xmm4,xmm5
punpckhbw xmm6,xmm5
movdqa xmm3,xmm4
packuswb xmm1,xmm0
movdqa xmm0,xmm1
punpckhbw xmm1,xmm15
punpcklbw xmm0,xmm15
punpcklwd xmm3,xmm0
punpckhwd xmm4,xmm0
movdqa xmm0,xmm6
movdqa xmm2,xmm3
punpcklwd xmm0,xmm1
punpckhwd xmm6,xmm1
movdqa xmm1,xmm4
punpckldq xmm2,xmm0
punpckhdq xmm3,xmm0
punpckldq xmm1,xmm6
movdqa xmm0,xmm2
punpcklqdq xmm0,xmm1
punpckhdq xmm4,xmm6
punpckhqdq xmm2,xmm1
; Spill the row data to the stack scratch area and write one dword per row
; back through eax to the same addresses that were read above.
movdqa [rsp+10h],xmm0
movdqa [rsp+60h],xmm2
movdqa xmm0,xmm3
mov eax,[rsp+10h]
mov [rcx-2],eax
mov eax,[rsp+60h]
punpcklqdq xmm0,xmm4
punpckhqdq xmm3,xmm4
mov [r10+rcx-2],eax
movdqa [rsp+20h],xmm0
mov eax, [rsp+20h]
movdqa [rsp+70h],xmm3
mov [rcx+r10*2-2],eax
mov eax,[rsp+70h]
mov [rdx+rcx-2],eax
mov eax,[rsp+18h]
mov [r11],eax
mov eax,[rsp+68h]
mov [r10+r11],eax
mov eax,[rsp+28h]
mov [r11+r10*2],eax
mov eax,[rsp+78h]
mov [rdx+r11],eax
mov eax,[rsp+14h]
mov [rdi-2],eax
mov eax,[rsp+64h]
mov [r10+rdi-2],eax
mov eax,[rsp+24h]
mov [rdi+r10*2-2],eax
mov eax, [rsp+74h]
mov [rdx+rdi-2],eax
mov eax, [rsp+1Ch]
mov [rbx],eax
mov eax, [rsp+6Ch]
mov [r10+rbx],eax
mov eax,[rsp+2Ch]
mov [rbx+r10*2],eax
mov eax,[rsp+7Ch]
mov [rdx+rbx],eax
; Epilogue: release the scratch area, POP_XMM, then reload rbx from
; [rsp+28h] (the slot written in the prologue, rdi still being on the stack)
; and restore rdi.
lea rsp,[rsp+140h]
POP_XMM
mov rbx, [rsp+28h]
pop rdi
ret
%elifdef UNIX64
;***************************************************************************
; DeblockChromaEq4H_ssse3, UNIX64 variant working on 16-bit lanes.
; Signature (as documented for this function elsewhere in this file):
;   void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
;                                int32_t iStride, int32_t iAlpha, int32_t iBeta)
; The incoming registers (rdi, rsi, rdx, rcx, r8) are first moved so that
; rcx = pPixCb, rdx = rdi = pPixCr, r8 = iStride, r9 = iAlpha, rbp = iBeta;
; the rest of the body then follows the same gather / transpose / filter /
; scatter sequence using those registers.
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
; Prologue: save rbx, rbp, r12 and reserve 140h bytes of scratch.
; NOTE(review): r12 is not referenced anywhere else in this body; presumably
; the third push keeps rsp 16-byte aligned for the movdqa stack accesses -
; confirm. The value placed in rax here is overwritten before being used.
mov rax,rsp
push rbx
push rbp
push r12
mov rbp, r8
mov r8, rdx
mov r9, rcx
mov rcx, rdi
mov rdx, rsi
mov rdi, rdx
sub rsp,140h
; Row pointers: r11 = pPixCb + 4*stride - 2, rbx = pPixCr + 4*stride - 2,
; r10 = stride, rdx = 3*stride. Each row dword is bounced through the stack
; scratch area; only the low dword of each movdqa load is consumed by the
; following punpckldq.
lea eax,[r8*4]
movsxd r10,eax
mov eax,[rcx-2]
mov [rsp+10h],eax
lea rbx,[r10+rdx-2]
lea r11,[r10+rcx-2]
movdqa xmm5,[rsp+10h]
movsxd r10,r8d
mov eax,[r10+rcx-2]
lea rdx,[r10+r10*2]
mov [rsp+20h],eax
mov eax,[rcx+r10*2-2]
mov [rsp+30h],eax
mov eax,[rdx+rcx-2]
movdqa xmm2,[rsp+20h]
mov [rsp+40h],eax
mov eax, [rdi-2]
movdqa xmm4,[rsp+30h]
mov [rsp+50h],eax
mov eax,[r10+rdi-2]
movdqa xmm3,[rsp+40h]
mov [rsp+60h],eax
mov eax,[rdi+r10*2-2]
punpckldq xmm5,[rsp+50h]
mov [rsp+70h],eax
mov eax, [rdx+rdi-2]
punpckldq xmm2, [rsp+60h]
mov [rsp+80h],eax
mov eax,[r11]
punpckldq xmm4, [rsp+70h]
mov [rsp+50h],eax
mov eax,[rbx]
punpckldq xmm3,[rsp+80h]
mov [rsp+60h],eax
mov eax,[r10+r11]
movdqa xmm0, [rsp+50h]
punpckldq xmm0, [rsp+60h]
punpcklqdq xmm5,xmm0
movdqa [rsp+50h],xmm0
mov [rsp+50h],eax
mov eax,[r10+rbx]
movdqa xmm0,[rsp+50h]
movdqa xmm1,xmm5
mov [rsp+60h],eax
mov eax,[r11+r10*2]
punpckldq xmm0, [rsp+60h]
punpcklqdq xmm2,xmm0
punpcklbw xmm1,xmm2
punpckhbw xmm5,xmm2
movdqa [rsp+50h],xmm0
mov [rsp+50h],eax
mov eax,[rbx+r10*2]
movdqa xmm0,[rsp+50h]
mov [rsp+60h],eax
mov eax, [rdx+r11]
movdqa xmm15,xmm1
punpckldq xmm0,[rsp+60h]
punpcklqdq xmm4,xmm0
movdqa [rsp+50h],xmm0
mov [rsp+50h],eax
mov eax, [rdx+rbx]
movdqa xmm0,[rsp+50h]
mov [rsp+60h],eax
punpckldq xmm0, [rsp+60h]
punpcklqdq xmm3,xmm0
; Byte/word/dword/qword interleaves transpose the gathered rows so that
; xmm11, xmm12, xmm14, xmm15 each hold one byte column (used below as
; p1, p0, q0, q1 respectively).
movdqa xmm0,xmm4
punpcklbw xmm0,xmm3
punpckhbw xmm4,xmm3
punpcklwd xmm15,xmm0
punpckhwd xmm1,xmm0
movdqa xmm0,xmm5
movdqa xmm12,xmm15
punpcklwd xmm0,xmm4
punpckhwd xmm5,xmm4
punpckldq xmm12,xmm0
punpckhdq xmm15,xmm0
movdqa xmm0,xmm1
movdqa xmm11,xmm12
punpckldq xmm0,xmm5
punpckhdq xmm1,xmm5
punpcklqdq xmm11,xmm0
punpckhqdq xmm12,xmm0
; iAlpha (r9w) and iBeta (ebp) are broadcast to all word lanes (xmm13/xmm6
; and xmm3); the four columns are zero-extended to words (xmm1 = 0), low and
; high halves in separate registers. The original xmm11 is kept at [rsp].
movsx eax,r9w
movdqa xmm14,xmm15
punpcklqdq xmm14,xmm1
punpckhqdq xmm15,xmm1
pxor xmm1,xmm1
movd xmm0,eax
movdqa xmm4,xmm12
movdqa xmm8,xmm11
mov eax, ebp ; iBeta
punpcklwd xmm0,xmm0
punpcklbw xmm4,xmm1
punpckhbw xmm12,xmm1
movdqa xmm9,xmm14
movdqa xmm7,xmm15
movdqa xmm10,xmm15
pshufd xmm13,xmm0,0
punpcklbw xmm9,xmm1
punpckhbw xmm14,xmm1
movdqa xmm6,xmm13
movd xmm0,eax
movdqa [rsp],xmm11
; eax = 2: rounding constant, broadcast into xmm3 further below.
mov eax,2
cwde
punpckhbw xmm11,xmm1
punpckhbw xmm10,xmm1
punpcklbw xmm7,xmm1
punpcklwd xmm0,xmm0
punpcklbw xmm8,xmm1
pshufd xmm3,xmm0,0
; Filter masks: xmm6 (low half) and xmm13 (high half) =
;   (iAlpha > |p0 - q0|) & (iBeta > |p1 - p0|) & (iBeta > |q1 - q0|).
; The sums 2*p1 + p0 + q1 (xmm1 low, xmm2 high) are accumulated in between.
movdqa xmm1,xmm8
movdqa xmm0,xmm4
psubw xmm0,xmm9
psubw xmm1,xmm4
movdqa xmm2,xmm3
pabsw xmm0,xmm0
pcmpgtw xmm6,xmm0
pabsw xmm0,xmm1
movdqa xmm1,xmm3
pcmpgtw xmm2,xmm0
pand xmm6,xmm2
movdqa xmm0,xmm7
movdqa xmm2,xmm3
psubw xmm0,xmm9
pabsw xmm0,xmm0
pcmpgtw xmm1,xmm0
pand xmm6,xmm1
movdqa xmm0,xmm12
movdqa xmm1,xmm11
psubw xmm0,xmm14
psubw xmm1,xmm12
movdqa xmm5,xmm6
pabsw xmm0,xmm0
pcmpgtw xmm13,xmm0
pabsw xmm0,xmm1
movdqa xmm1,xmm8
pcmpgtw xmm2,xmm0
paddw xmm1,xmm8
movdqa xmm0,xmm10
pand xmm13,xmm2
psubw xmm0,xmm14
paddw xmm1,xmm4
movdqa xmm2,xmm11
pabsw xmm0,xmm0
paddw xmm2,xmm11
paddw xmm1,xmm7
pcmpgtw xmm3,xmm0
paddw xmm2,xmm12
movd xmm0,eax
pand xmm13,xmm3
paddw xmm2,xmm10
punpcklwd xmm0,xmm0
pshufd xmm3,xmm0,0
; p0' = (2*p1 + p0 + q1 + 2) >> 2 and q0' = (2*q1 + q0 + p1 + 2) >> 2,
; selected per lane by the masks (pand/pandn/por) and packed back to bytes:
; xmm5 = new p0 column, xmm1 = new q0 column.
movdqa xmm0,xmm6
paddw xmm1,xmm3
pandn xmm0,xmm4
paddw xmm2,xmm3
psraw xmm1,2
pand xmm5,xmm1
por xmm5,xmm0
paddw xmm7,xmm7
paddw xmm10,xmm10
psraw xmm2,2
movdqa xmm1,xmm13
movdqa xmm0,xmm13
pandn xmm0,xmm12
pand xmm1,xmm2
paddw xmm7,xmm9
por xmm1,xmm0
paddw xmm10,xmm14
paddw xmm7,xmm8
movdqa xmm0,xmm13
packuswb xmm5,xmm1
paddw xmm7,xmm3
paddw xmm10,xmm11
movdqa xmm1,xmm6
paddw xmm10,xmm3
pandn xmm6,xmm9
psraw xmm7,2
pand xmm1,xmm7
psraw xmm10,2
pandn xmm13,xmm14
pand xmm0,xmm10
por xmm1,xmm6
; Reload the saved first column and interleave the four columns back into
; row order.
movdqa xmm6,[rsp]
movdqa xmm4,xmm6
por xmm0,xmm13
punpcklbw xmm4,xmm5
punpckhbw xmm6,xmm5
movdqa xmm3,xmm4
packuswb xmm1,xmm0
movdqa xmm0,xmm1
punpckhbw xmm1,xmm15
punpcklbw xmm0,xmm15
punpcklwd xmm3,xmm0
punpckhwd xmm4,xmm0
movdqa xmm0,xmm6
movdqa xmm2,xmm3
punpcklwd xmm0,xmm1
punpckhwd xmm6,xmm1
movdqa xmm1,xmm4
punpckldq xmm2,xmm0
punpckhdq xmm3,xmm0
punpckldq xmm1,xmm6
movdqa xmm0,xmm2
punpcklqdq xmm0,xmm1
punpckhdq xmm4,xmm6
punpckhqdq xmm2,xmm1
; Spill the row data to the stack scratch area and write one dword per row
; back through eax to the same addresses that were read above.
movdqa [rsp+10h],xmm0
movdqa [rsp+60h],xmm2
movdqa xmm0,xmm3
mov eax,[rsp+10h]
mov [rcx-2],eax
mov eax,[rsp+60h]
punpcklqdq xmm0,xmm4
punpckhqdq xmm3,xmm4
mov [r10+rcx-2],eax
movdqa [rsp+20h],xmm0
mov eax, [rsp+20h]
movdqa [rsp+70h],xmm3
mov [rcx+r10*2-2],eax
mov eax,[rsp+70h]
mov [rdx+rcx-2],eax
mov eax,[rsp+18h]
mov [r11],eax
mov eax,[rsp+68h]
mov [r10+r11],eax
mov eax,[rsp+28h]
mov [r11+r10*2],eax
mov eax,[rsp+78h]
mov [rdx+r11],eax
mov eax,[rsp+14h]
mov [rdi-2],eax
mov eax,[rsp+64h]
mov [r10+rdi-2],eax
mov eax,[rsp+24h]
mov [rdi+r10*2-2],eax
mov eax, [rsp+74h]
mov [rdx+rdi-2],eax
mov eax, [rsp+1Ch]
mov [rbx],eax
mov eax, [rsp+6Ch]
mov [r10+rbx],eax
mov eax,[rsp+2Ch]
mov [rbx+r10*2],eax
mov eax,[rsp+7Ch]
mov [rdx+rbx],eax
; Epilogue: release the scratch area and restore r12, rbp, rbx.
; NOTE(review): the load of rbx from [r11+28h] looks redundant - rbx is
; overwritten by the pop rbx below.
lea r11,[rsp+140h]
mov rbx, [r11+28h]
mov rsp,r11
pop r12
pop rbp
pop rbx
ret
%elifdef X86_32
;***************************************************************************
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
;
; X86_32 variant working on 16-bit lanes. Arguments are read from the stack
; via ebp: [ebp+8] = pPixCb, [ebp+0Ch] = pPixCr, [ebp+10h] = iStride,
; [ebp+14h] = iAlpha, [ebp+18h] = iBeta.
; Outline: gather 4 bytes (columns -2..+1) from 8 rows of each plane, transpose
; into four column vectors kept in an aligned stack buffer, widen to words,
; build the filter masks, compute the new p0/q0 columns, transpose back and
; write 4 bytes per row.
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
; Prologue: frame pointer in ebp, esp rounded down to a 16-byte boundary and
; 0C8h bytes of locals reserved (esi and edi are pushed a little further down).
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
sub esp,0C8h
; ecx = pPixCb - 2, edx = pPixCr - 2, esi = 3*stride. The row pointers, the
; pointers advanced by 4*stride and the address of the column buffer are
; saved in stack locals (offsets shift by 4 after each push).
mov ecx,dword [ebp+8]
mov edx,dword [ebp+0Ch]
mov eax,dword [ebp+10h]
sub ecx,2
sub edx,2
push esi
lea esi,[eax+eax*2]
mov dword [esp+18h],ecx
mov dword [esp+4],edx
lea ecx,[ecx+eax*4]
lea edx,[edx+eax*4]
lea eax,[esp+7Ch]
push edi
mov dword [esp+14h],esi
mov dword [esp+18h],ecx
mov dword [esp+0Ch],edx
mov dword [esp+10h],eax
; Gather: one dword from each of rows 0..3 of both planes, then rows 4..7,
; combined so that xmm0..xmm3 each hold four row dwords.
mov esi,dword [esp+1Ch]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+14h]
movd xmm0,dword [esi]
movd xmm1,dword [esi+ecx]
movd xmm2,dword [esi+ecx*2]
movd xmm3,dword [esi+edx]
mov esi,dword [esp+8]
movd xmm4,dword [esi]
movd xmm5,dword [esi+ecx]
movd xmm6,dword [esi+ecx*2]
movd xmm7,dword [esi+edx]
punpckldq xmm0,xmm4
punpckldq xmm1,xmm5
punpckldq xmm2,xmm6
punpckldq xmm3,xmm7
mov esi,dword [esp+18h]
mov edi,dword [esp+0Ch]
movd xmm4,dword [esi]
movd xmm5,dword [edi]
punpckldq xmm4,xmm5
punpcklqdq xmm0,xmm4
movd xmm4,dword [esi+ecx]
movd xmm5,dword [edi+ecx]
punpckldq xmm4,xmm5
punpcklqdq xmm1,xmm4
movd xmm4,dword [esi+ecx*2]
movd xmm5,dword [edi+ecx*2]
punpckldq xmm4,xmm5
punpcklqdq xmm2,xmm4
movd xmm4,dword [esi+edx]
movd xmm5,dword [edi+edx]
punpckldq xmm4,xmm5
punpcklqdq xmm3,xmm4
; Transpose rows into columns (byte, word, dword, qword interleaves).
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Store the four columns in the stack buffer; the same memory is read back
; below as [esp+80h], [esp+90h], [esp+0A0h], [esp+0B0h] and used as
; p1, p0, q0, q1 respectively.
mov edi,dword [esp+10h]
movdqa [edi],xmm0
movdqa [edi+10h],xmm5
movdqa [edi+20h],xmm1
movdqa [edi+30h],xmm6
; Broadcast iAlpha (xmm1) and iBeta (xmm2) to all word lanes.
movsx ecx,word [ebp+14h]
movsx edx,word [ebp+18h]
movdqa xmm6,[esp+80h]
movdqa xmm4,[esp+90h]
movdqa xmm5,[esp+0A0h]
movdqa xmm7,[esp+0B0h]
pxor xmm0,xmm0
movd xmm1,ecx
movdqa xmm2,xmm1
punpcklwd xmm2,xmm1
pshufd xmm1,xmm2,0
movd xmm2,edx
movdqa xmm3,xmm2
punpcklwd xmm3,xmm2
pshufd xmm2,xmm3,0
; Zero-extend the columns to words. High halves are spilled:
; [esp+60h] = p1, [esp+30h] = p0, [esp+40h] = q0, [esp+70h] = q1.
; Low halves: xmm3 = p1, xmm4 = p0, xmm5 = q0, [esp+50h] = q1.
movdqa xmm3,xmm6
punpckhbw xmm6,xmm0
movdqa [esp+60h],xmm6
movdqa xmm6,[esp+90h]
punpckhbw xmm6,xmm0
movdqa [esp+30h],xmm6
movdqa xmm6,[esp+0A0h]
punpckhbw xmm6,xmm0
movdqa [esp+40h],xmm6
movdqa xmm6,[esp+0B0h]
punpckhbw xmm6,xmm0
movdqa [esp+70h],xmm6
punpcklbw xmm7,xmm0
punpcklbw xmm4,xmm0
punpcklbw xmm5,xmm0
punpcklbw xmm3,xmm0
movdqa [esp+50h],xmm7
; Filter masks: xmm0 (low half) and xmm1 (high half) =
;   (iAlpha > |p0 - q0|) & (iBeta > |p1 - p0|) & (iBeta > |q1 - q0|).
movdqa xmm6,xmm4
psubw xmm6,xmm5
pabsw xmm6,xmm6
movdqa xmm0,xmm1
pcmpgtw xmm0,xmm6
movdqa xmm6,xmm3
psubw xmm6,xmm4
pabsw xmm6,xmm6
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+50h]
psubw xmm6,xmm5
pabsw xmm6,xmm6
pand xmm0,xmm7
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+30h]
psubw xmm6,[esp+40h]
pabsw xmm6,xmm6
pcmpgtw xmm1,xmm6
movdqa xmm6,[esp+60h]
psubw xmm6,[esp+30h]
pabsw xmm6,xmm6
pand xmm0,xmm7
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+70h]
psubw xmm6,[esp+40h]
pabsw xmm6,xmm6
pand xmm1,xmm7
pcmpgtw xmm2,xmm6
pand xmm1,xmm2
; Rounding constant 2 broadcast to all word lanes, kept at [esp+20h].
mov eax,2
movsx ecx,ax
movd xmm2,ecx
movdqa xmm6,xmm2
punpcklwd xmm6,xmm2
pshufd xmm2,xmm6,0
movdqa [esp+20h],xmm2
; p0' = (2*p1 + p0 + q1 + 2) >> 2, selected by the masks, packed to bytes
; and written over the p0 column at [esp+90h].
movdqa xmm2,xmm3
paddw xmm2,xmm3
paddw xmm2,xmm4
paddw xmm2,[esp+50h]
paddw xmm2,[esp+20h]
psraw xmm2,2
movdqa xmm6,xmm0
pand xmm6,xmm2
movdqa xmm2,xmm0
pandn xmm2,xmm4
por xmm6,xmm2
movdqa xmm2,[esp+60h]
movdqa xmm7,xmm2
paddw xmm7,xmm2
paddw xmm7,[esp+30h]
paddw xmm7,[esp+70h]
paddw xmm7,[esp+20h]
movdqa xmm4,xmm1
movdqa xmm2,xmm1
pandn xmm2,[esp+30h]
psraw xmm7,2
pand xmm4,xmm7
por xmm4,xmm2
movdqa xmm2,[esp+50h]
packuswb xmm6,xmm4
movdqa [esp+90h],xmm6
; q0' = (2*q1 + q0 + p1 + 2) >> 2, selected by the masks, packed to bytes
; and written over the q0 column at [esp+0A0h].
movdqa xmm6,xmm2
paddw xmm6,xmm2
movdqa xmm2,[esp+20h]
paddw xmm6,xmm5
paddw xmm6,xmm3
movdqa xmm4,xmm0
pandn xmm0,xmm5
paddw xmm6,xmm2
psraw xmm6,2
pand xmm4,xmm6
por xmm4,xmm0
movdqa xmm0,[esp+70h]
movdqa xmm5,xmm0
paddw xmm5,xmm0
movdqa xmm0,[esp+40h]
paddw xmm5,xmm0
paddw xmm5,[esp+60h]
movdqa xmm3,xmm1
paddw xmm5,xmm2
psraw xmm5,2
pand xmm3,xmm5
pandn xmm1,xmm0
por xmm3,xmm1
packuswb xmm4,xmm3
movdqa [esp+0A0h],xmm4
; Reload the four columns from the buffer and transpose them back into rows.
mov esi,dword [esp+10h]
movdqa xmm0,[esi]
movdqa xmm1,[esi+10h]
movdqa xmm2,[esi+20h]
movdqa xmm3,[esi+30h]
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Scatter: write the low dword of each row register, then shift the register
; right by 4 bytes (psrldq) and write to the next group of row pointers.
mov esi,dword [esp+1Ch]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+14h]
mov edi,dword [esp+8]
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov esi,dword [esp+18h]
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov edi,dword [esp+0Ch]
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
; Epilogue: restore edi/esi, then undo the stack realignment via ebp.
pop edi
pop esi
mov esp,ebp
pop ebp
ret
; Macro-based body for the horizontal Eq4 chroma deblock: load four byte
; columns from the Cb/Cr rows, run SSSE3_DeblockChromaEq4 on packed bytes and
; store the two modified columns back.
; NOTE(review): LOAD_4_PARA, PUSH_XMM, SIGN_EXTENSION, the argN/rN aliases and
; the SSE2_LoadCbCr_4x16H / SSE2_StoreCbCr_4x16H macros are defined elsewhere;
; the remarks on them below are inferred from their names and call sites.
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
; Broadcast the low byte of arg4d to every byte of xmm7 (pshufb with an
; all-zero control vector).
movd xmm7, arg4d
pxor xmm0, xmm0
pshufb xmm7, xmm0 ; iAlpha
lea r3, [3 * r2 - 1] ; 3 * iStride - 1
; After the load, the registers are handed to the filter macro as
; p1 = xmm0, p0 = xmm1, q0 = xmm4, q1 = xmm5; xmm2, xmm3, xmm6 are scratch.
SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6
; Only the p0 (xmm1) and q0 (xmm4) registers are passed on to the store macro.
; Both branches invoke it with identical arguments; the X86_32 branch
; additionally preserves r4 and r5 around it (presumably because the store
; macro uses them as scratch - confirm).
%ifdef X86_32
push r4
push r5
SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
pop r5
pop r4
%else
SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
%endif
POP_XMM
LOAD_4_PARA_POP
ret
;********************************************************************************