[Common/x86] DeblockChromaEq4H_ssse3 optimizations
Use packed 8-bit operations rather than unpack to 16-bit. ~5.80x speedup on Haswell (x86-64). ~1.69x speedup on Haswell (x86 32-bit).
This commit is contained in:
parent
9909c306f1
commit
a009153741
@ -509,6 +509,32 @@ WELS_EXTERN DeblockLumaEq4V_ssse3
|
||||
SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
|
||||
%endmacro
|
||||
|
||||
; SSSE3_DeblockChromaEq4: chroma edge filter computed entirely on packed
; unsigned bytes (no widening to 16-bit words).
;
; Operands:
;   %1 = p1, %2 = p0, %3 = q0, %4 = q1   xmm registers of packed pixels
;   %5 = iAlpha in an xmm register (used as a byte-wise threshold; presumably
;        already splatted to every byte by the caller -- confirm at call sites)
;   %6 = iBeta as a movd source operand; it is splatted to every byte here
;   %7, %8, %9 = xmm scratch
;
; Results: %2 = p0out, %3 = q0out.
; Side effects beyond the listed scratch registers: %1 is overwritten with q0'
; and %5 is reused (scratch, then iBeta, then the WELS_DB1 constant), so neither
; p1 nor iAlpha survives the macro. %4 and %6 are not written by any instruction
; in this macro body (helper macros are assumed to write only their first and
; scratch operands -- TODO confirm against their definitions).
; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9
|
||||
%macro SSSE3_DeblockChromaEq4 9
|
||||
; --- Build the "do not filter" mask in %7 ---------------------------------
movdqa %7, %3
|
||||
SSE2_AbsDiffUB %7, %2, %8 ; |p0 - q0|
|
||||
SSE2_CmpgeUB %7, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
|
||||
movdqa %8, %4
|
||||
; iAlpha is not needed after the compare above, so %5 is handed to the helper
; as its third operand (apparently a scratch register -- confirm).
SSE2_AbsDiffUB %8, %3, %5 ; |q1 - q0|
|
||||
movdqa %9, %1
|
||||
SSE2_AbsDiffUB %9, %2, %5 ; |p1 - p0|
|
||||
; One compare against iBeta covers both differences by testing their maximum.
pmaxub %8, %9 ; max(|q1 - q0|, |p1 - p0|)
|
||||
; pshufb with an all-zero control register copies byte 0 of %5 to all 16 bytes.
pxor %9, %9
|
||||
movd %5, %6
|
||||
pshufb %5, %9 ; iBeta
|
||||
SSE2_CmpgeUB %8, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
|
||||
por %7, %8 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
|
||||
; --- Filtered p0 ----------------------------------------------------------
; %5 is reloaded by WELS_DB1 (presumably a vector of byte 1s used by
; SSE2_AvgbFloor1 to turn pavgb's round-up into a floor -- confirm).
WELS_DB1 %5
|
||||
movdqa %8, %2
|
||||
SSE2_AvgbFloor1 %8, %4, %5, %9 ; (p0 + q1) >> 1
|
||||
pavgb %8, %1 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
|
||||
; Keep a second copy of the mask in %9 for the q0 blend: %7 is passed to the
; blend below and then reused as scratch by the next SSE2_AvgbFloor1.
movdqa %9, %7
|
||||
SSE2_Blend %2, %8, %7 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
|
||||
; --- Filtered q0 (computed in place in %1, destroying p1) -------------------
SSE2_AvgbFloor1 %1, %3, %5, %7 ; (q0 + p1) >> 1
|
||||
pavgb %1, %4 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
|
||||
SSE2_Blend %3, %1, %9 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
|
||||
%endmacro
|
||||
|
||||
|
||||
;******************************************************************************
|
||||
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
|
||||
@ -572,42 +598,15 @@ WELS_EXTERN DeblockChromaEq4V_ssse3
|
||||
movhps xmm0, [r1 + 0 * r2] ; q0 cr
|
||||
movq xmm2, [r0 + 1 * r3] ; p0 cb
|
||||
movhps xmm2, [r1 + 1 * r3] ; p0 cr
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
SSE2_AbsDiffUB xmm4, xmm2, xmm5 ; |p0 - q0|
|
||||
SSE2_CmpgeUB xmm4, xmm7 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
|
||||
|
||||
movq xmm1, [r0 + 1 * r2] ; q1 cb
|
||||
movhps xmm1, [r1 + 1 * r2] ; q1 cr
|
||||
movq xmm3, [r0 + 2 * r3] ; p1 cb
|
||||
movhps xmm3, [r1 + 2 * r3] ; p1 cr
|
||||
|
||||
movdqa xmm5, xmm1
|
||||
SSE2_AbsDiffUB xmm5, xmm0, xmm7 ; |q1 - q0|
|
||||
movdqa xmm6, xmm3
|
||||
SSE2_AbsDiffUB xmm6, xmm2, xmm7 ; |p1 - p0|
|
||||
pmaxub xmm5, xmm6 ; max(|q1 - q0|, |p1 - p0|)
|
||||
SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6
|
||||
|
||||
pxor xmm6, xmm6
|
||||
movd xmm7, arg5d
|
||||
pshufb xmm7, xmm6 ; iBeta
|
||||
|
||||
SSE2_CmpgeUB xmm5, xmm7 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
|
||||
por xmm4, xmm5 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
|
||||
|
||||
WELS_DB1 xmm7
|
||||
movdqa xmm5, xmm2
|
||||
SSE2_AvgbFloor1 xmm2, xmm1, xmm7, xmm6 ; (p0 + q1) >> 1
|
||||
pavgb xmm2, xmm3 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
|
||||
movdqa xmm6, xmm4
|
||||
SSE2_Blend xmm5, xmm2, xmm4 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
|
||||
|
||||
SSE2_AvgbFloor1 xmm3, xmm0, xmm7, xmm4 ; (q0 + p1) >> 1
|
||||
pavgb xmm3, xmm1 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
|
||||
SSE2_Blend xmm0, xmm3, xmm6 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
|
||||
|
||||
movlps [r0 + 1 * r3], xmm5 ; store p0 cb
|
||||
movhps [r1 + 1 * r3], xmm5 ; store p0 cr
|
||||
movlps [r0 + 1 * r3], xmm2 ; store p0 cb
|
||||
movhps [r1 + 1 * r3], xmm2 ; store p0 cr
|
||||
movlps [r0 + 0 * r2], xmm0 ; store q0 cb
|
||||
movhps [r1 + 0 * r2], xmm0 ; store q0 cr
|
||||
|
||||
@ -640,834 +639,36 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
|
||||
ret
|
||||
|
||||
|
||||
%ifdef WIN64
|
||||
|
||||
|
||||
WELS_EXTERN DeblockChromaEq4H_ssse3
|
||||
mov rax,rsp
|
||||
mov [rax+20h],rbx
|
||||
push rdi
|
||||
PUSH_XMM 16
|
||||
sub rsp,140h
|
||||
mov rdi,rdx
|
||||
lea eax,[r8*4]
|
||||
movsxd r10,eax
|
||||
mov eax,[rcx-2]
|
||||
mov [rsp+10h],eax
|
||||
lea rbx,[r10+rdx-2]
|
||||
lea r11,[r10+rcx-2]
|
||||
movdqa xmm5,[rsp+10h]
|
||||
movsxd r10,r8d
|
||||
mov eax,[r10+rcx-2]
|
||||
lea rdx,[r10+r10*2]
|
||||
mov [rsp+20h],eax
|
||||
mov eax,[rcx+r10*2-2]
|
||||
mov [rsp+30h],eax
|
||||
mov eax,[rdx+rcx-2]
|
||||
movdqa xmm2,[rsp+20h]
|
||||
mov [rsp+40h],eax
|
||||
mov eax, [rdi-2]
|
||||
movdqa xmm4,[rsp+30h]
|
||||
mov [rsp+50h],eax
|
||||
mov eax,[r10+rdi-2]
|
||||
movdqa xmm3,[rsp+40h]
|
||||
mov [rsp+60h],eax
|
||||
mov eax,[rdi+r10*2-2]
|
||||
punpckldq xmm5,[rsp+50h]
|
||||
mov [rsp+70h],eax
|
||||
mov eax, [rdx+rdi-2]
|
||||
punpckldq xmm2, [rsp+60h]
|
||||
mov [rsp+80h],eax
|
||||
mov eax,[r11]
|
||||
punpckldq xmm4, [rsp+70h]
|
||||
mov [rsp+50h],eax
|
||||
mov eax,[rbx]
|
||||
punpckldq xmm3,[rsp+80h]
|
||||
mov [rsp+60h],eax
|
||||
mov eax,[r10+r11]
|
||||
movdqa xmm0, [rsp+50h]
|
||||
punpckldq xmm0, [rsp+60h]
|
||||
punpcklqdq xmm5,xmm0
|
||||
movdqa [rsp+50h],xmm0
|
||||
mov [rsp+50h],eax
|
||||
mov eax,[r10+rbx]
|
||||
movdqa xmm0,[rsp+50h]
|
||||
movdqa xmm1,xmm5
|
||||
mov [rsp+60h],eax
|
||||
mov eax,[r11+r10*2]
|
||||
punpckldq xmm0, [rsp+60h]
|
||||
punpcklqdq xmm2,xmm0
|
||||
punpcklbw xmm1,xmm2
|
||||
punpckhbw xmm5,xmm2
|
||||
movdqa [rsp+50h],xmm0
|
||||
mov [rsp+50h],eax
|
||||
mov eax,[rbx+r10*2]
|
||||
movdqa xmm0,[rsp+50h]
|
||||
mov [rsp+60h],eax
|
||||
mov eax, [rdx+r11]
|
||||
movdqa xmm15,xmm1
|
||||
punpckldq xmm0,[rsp+60h]
|
||||
punpcklqdq xmm4,xmm0
|
||||
movdqa [rsp+50h],xmm0
|
||||
mov [rsp+50h],eax
|
||||
mov eax, [rdx+rbx]
|
||||
movdqa xmm0,[rsp+50h]
|
||||
mov [rsp+60h],eax
|
||||
punpckldq xmm0, [rsp+60h]
|
||||
punpcklqdq xmm3,xmm0
|
||||
movdqa xmm0,xmm4
|
||||
punpcklbw xmm0,xmm3
|
||||
punpckhbw xmm4,xmm3
|
||||
punpcklwd xmm15,xmm0
|
||||
punpckhwd xmm1,xmm0
|
||||
movdqa xmm0,xmm5
|
||||
movdqa xmm12,xmm15
|
||||
punpcklwd xmm0,xmm4
|
||||
punpckhwd xmm5,xmm4
|
||||
punpckldq xmm12,xmm0
|
||||
punpckhdq xmm15,xmm0
|
||||
movdqa xmm0,xmm1
|
||||
movdqa xmm11,xmm12
|
||||
punpckldq xmm0,xmm5
|
||||
punpckhdq xmm1,xmm5
|
||||
punpcklqdq xmm11,xmm0
|
||||
punpckhqdq xmm12,xmm0
|
||||
movsx eax,r9w
|
||||
movdqa xmm14,xmm15
|
||||
punpcklqdq xmm14,xmm1
|
||||
punpckhqdq xmm15,xmm1
|
||||
pxor xmm1,xmm1
|
||||
movd xmm0,eax
|
||||
movdqa xmm4,xmm12
|
||||
movdqa xmm8,xmm11
|
||||
movsx eax,word [rsp+170h + 160] ; iBeta
|
||||
punpcklwd xmm0,xmm0
|
||||
punpcklbw xmm4,xmm1
|
||||
punpckhbw xmm12,xmm1
|
||||
movdqa xmm9,xmm14
|
||||
movdqa xmm7,xmm15
|
||||
movdqa xmm10,xmm15
|
||||
pshufd xmm13,xmm0,0
|
||||
punpcklbw xmm9,xmm1
|
||||
punpckhbw xmm14,xmm1
|
||||
movdqa xmm6,xmm13
|
||||
movd xmm0,eax
|
||||
movdqa [rsp],xmm11
|
||||
mov eax,2
|
||||
cwde
|
||||
punpckhbw xmm11,xmm1
|
||||
punpckhbw xmm10,xmm1
|
||||
punpcklbw xmm7,xmm1
|
||||
punpcklwd xmm0,xmm0
|
||||
punpcklbw xmm8,xmm1
|
||||
pshufd xmm3,xmm0,0
|
||||
movdqa xmm1,xmm8
|
||||
movdqa xmm0,xmm4
|
||||
psubw xmm0,xmm9
|
||||
psubw xmm1,xmm4
|
||||
movdqa xmm2,xmm3
|
||||
pabsw xmm0,xmm0
|
||||
pcmpgtw xmm6,xmm0
|
||||
pabsw xmm0,xmm1
|
||||
movdqa xmm1,xmm3
|
||||
pcmpgtw xmm2,xmm0
|
||||
pand xmm6,xmm2
|
||||
movdqa xmm0,xmm7
|
||||
movdqa xmm2,xmm3
|
||||
psubw xmm0,xmm9
|
||||
pabsw xmm0,xmm0
|
||||
pcmpgtw xmm1,xmm0
|
||||
pand xmm6,xmm1
|
||||
movdqa xmm0,xmm12
|
||||
movdqa xmm1,xmm11
|
||||
psubw xmm0,xmm14
|
||||
psubw xmm1,xmm12
|
||||
movdqa xmm5,xmm6
|
||||
pabsw xmm0,xmm0
|
||||
pcmpgtw xmm13,xmm0
|
||||
pabsw xmm0,xmm1
|
||||
movdqa xmm1,xmm8
|
||||
pcmpgtw xmm2,xmm0
|
||||
paddw xmm1,xmm8
|
||||
movdqa xmm0,xmm10
|
||||
pand xmm13,xmm2
|
||||
psubw xmm0,xmm14
|
||||
paddw xmm1,xmm4
|
||||
movdqa xmm2,xmm11
|
||||
pabsw xmm0,xmm0
|
||||
paddw xmm2,xmm11
|
||||
paddw xmm1,xmm7
|
||||
pcmpgtw xmm3,xmm0
|
||||
paddw xmm2,xmm12
|
||||
movd xmm0,eax
|
||||
pand xmm13,xmm3
|
||||
paddw xmm2,xmm10
|
||||
punpcklwd xmm0,xmm0
|
||||
pshufd xmm3,xmm0,0
|
||||
movdqa xmm0,xmm6
|
||||
paddw xmm1,xmm3
|
||||
pandn xmm0,xmm4
|
||||
paddw xmm2,xmm3
|
||||
psraw xmm1,2
|
||||
pand xmm5,xmm1
|
||||
por xmm5,xmm0
|
||||
paddw xmm7,xmm7
|
||||
paddw xmm10,xmm10
|
||||
psraw xmm2,2
|
||||
movdqa xmm1,xmm13
|
||||
movdqa xmm0,xmm13
|
||||
pandn xmm0,xmm12
|
||||
pand xmm1,xmm2
|
||||
paddw xmm7,xmm9
|
||||
por xmm1,xmm0
|
||||
paddw xmm10,xmm14
|
||||
paddw xmm7,xmm8
|
||||
movdqa xmm0,xmm13
|
||||
packuswb xmm5,xmm1
|
||||
paddw xmm7,xmm3
|
||||
paddw xmm10,xmm11
|
||||
movdqa xmm1,xmm6
|
||||
paddw xmm10,xmm3
|
||||
pandn xmm6,xmm9
|
||||
psraw xmm7,2
|
||||
pand xmm1,xmm7
|
||||
psraw xmm10,2
|
||||
pandn xmm13,xmm14
|
||||
pand xmm0,xmm10
|
||||
por xmm1,xmm6
|
||||
movdqa xmm6,[rsp]
|
||||
movdqa xmm4,xmm6
|
||||
por xmm0,xmm13
|
||||
punpcklbw xmm4,xmm5
|
||||
punpckhbw xmm6,xmm5
|
||||
movdqa xmm3,xmm4
|
||||
packuswb xmm1,xmm0
|
||||
movdqa xmm0,xmm1
|
||||
punpckhbw xmm1,xmm15
|
||||
punpcklbw xmm0,xmm15
|
||||
punpcklwd xmm3,xmm0
|
||||
punpckhwd xmm4,xmm0
|
||||
movdqa xmm0,xmm6
|
||||
movdqa xmm2,xmm3
|
||||
punpcklwd xmm0,xmm1
|
||||
punpckhwd xmm6,xmm1
|
||||
movdqa xmm1,xmm4
|
||||
punpckldq xmm2,xmm0
|
||||
punpckhdq xmm3,xmm0
|
||||
punpckldq xmm1,xmm6
|
||||
movdqa xmm0,xmm2
|
||||
punpcklqdq xmm0,xmm1
|
||||
punpckhdq xmm4,xmm6
|
||||
punpckhqdq xmm2,xmm1
|
||||
movdqa [rsp+10h],xmm0
|
||||
movdqa [rsp+60h],xmm2
|
||||
movdqa xmm0,xmm3
|
||||
mov eax,[rsp+10h]
|
||||
mov [rcx-2],eax
|
||||
mov eax,[rsp+60h]
|
||||
punpcklqdq xmm0,xmm4
|
||||
punpckhqdq xmm3,xmm4
|
||||
mov [r10+rcx-2],eax
|
||||
movdqa [rsp+20h],xmm0
|
||||
mov eax, [rsp+20h]
|
||||
movdqa [rsp+70h],xmm3
|
||||
mov [rcx+r10*2-2],eax
|
||||
mov eax,[rsp+70h]
|
||||
mov [rdx+rcx-2],eax
|
||||
mov eax,[rsp+18h]
|
||||
mov [r11],eax
|
||||
mov eax,[rsp+68h]
|
||||
mov [r10+r11],eax
|
||||
mov eax,[rsp+28h]
|
||||
mov [r11+r10*2],eax
|
||||
mov eax,[rsp+78h]
|
||||
mov [rdx+r11],eax
|
||||
mov eax,[rsp+14h]
|
||||
mov [rdi-2],eax
|
||||
mov eax,[rsp+64h]
|
||||
mov [r10+rdi-2],eax
|
||||
mov eax,[rsp+24h]
|
||||
mov [rdi+r10*2-2],eax
|
||||
mov eax, [rsp+74h]
|
||||
mov [rdx+rdi-2],eax
|
||||
mov eax, [rsp+1Ch]
|
||||
mov [rbx],eax
|
||||
mov eax, [rsp+6Ch]
|
||||
mov [r10+rbx],eax
|
||||
mov eax,[rsp+2Ch]
|
||||
mov [rbx+r10*2],eax
|
||||
mov eax,[rsp+7Ch]
|
||||
mov [rdx+rbx],eax
|
||||
lea rsp,[rsp+140h]
|
||||
POP_XMM
|
||||
mov rbx, [rsp+28h]
|
||||
pop rdi
|
||||
ret
|
||||
|
||||
|
||||
|
||||
%elifdef UNIX64
|
||||
|
||||
|
||||
WELS_EXTERN DeblockChromaEq4H_ssse3
|
||||
mov rax,rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push r12
|
||||
|
||||
mov rbp, r8
|
||||
mov r8, rdx
|
||||
mov r9, rcx
|
||||
mov rcx, rdi
|
||||
mov rdx, rsi
|
||||
mov rdi, rdx
|
||||
|
||||
sub rsp,140h
|
||||
lea eax,[r8*4]
|
||||
movsxd r10,eax
|
||||
mov eax,[rcx-2]
|
||||
mov [rsp+10h],eax
|
||||
lea rbx,[r10+rdx-2]
|
||||
lea r11,[r10+rcx-2]
|
||||
|
||||
movdqa xmm5,[rsp+10h]
|
||||
movsxd r10,r8d
|
||||
mov eax,[r10+rcx-2]
|
||||
lea rdx,[r10+r10*2]
|
||||
mov [rsp+20h],eax
|
||||
mov eax,[rcx+r10*2-2]
|
||||
mov [rsp+30h],eax
|
||||
mov eax,[rdx+rcx-2]
|
||||
movdqa xmm2,[rsp+20h]
|
||||
mov [rsp+40h],eax
|
||||
mov eax, [rdi-2]
|
||||
movdqa xmm4,[rsp+30h]
|
||||
mov [rsp+50h],eax
|
||||
mov eax,[r10+rdi-2]
|
||||
movdqa xmm3,[rsp+40h]
|
||||
mov [rsp+60h],eax
|
||||
mov eax,[rdi+r10*2-2]
|
||||
punpckldq xmm5,[rsp+50h]
|
||||
mov [rsp+70h],eax
|
||||
mov eax, [rdx+rdi-2]
|
||||
punpckldq xmm2, [rsp+60h]
|
||||
mov [rsp+80h],eax
|
||||
mov eax,[r11]
|
||||
punpckldq xmm4, [rsp+70h]
|
||||
mov [rsp+50h],eax
|
||||
mov eax,[rbx]
|
||||
punpckldq xmm3,[rsp+80h]
|
||||
mov [rsp+60h],eax
|
||||
mov eax,[r10+r11]
|
||||
movdqa xmm0, [rsp+50h]
|
||||
punpckldq xmm0, [rsp+60h]
|
||||
punpcklqdq xmm5,xmm0
|
||||
movdqa [rsp+50h],xmm0
|
||||
mov [rsp+50h],eax
|
||||
mov eax,[r10+rbx]
|
||||
movdqa xmm0,[rsp+50h]
|
||||
movdqa xmm1,xmm5
|
||||
mov [rsp+60h],eax
|
||||
mov eax,[r11+r10*2]
|
||||
punpckldq xmm0, [rsp+60h]
|
||||
punpcklqdq xmm2,xmm0
|
||||
punpcklbw xmm1,xmm2
|
||||
punpckhbw xmm5,xmm2
|
||||
movdqa [rsp+50h],xmm0
|
||||
mov [rsp+50h],eax
|
||||
mov eax,[rbx+r10*2]
|
||||
movdqa xmm0,[rsp+50h]
|
||||
mov [rsp+60h],eax
|
||||
mov eax, [rdx+r11]
|
||||
movdqa xmm15,xmm1
|
||||
punpckldq xmm0,[rsp+60h]
|
||||
punpcklqdq xmm4,xmm0
|
||||
movdqa [rsp+50h],xmm0
|
||||
mov [rsp+50h],eax
|
||||
mov eax, [rdx+rbx]
|
||||
movdqa xmm0,[rsp+50h]
|
||||
mov [rsp+60h],eax
|
||||
punpckldq xmm0, [rsp+60h]
|
||||
punpcklqdq xmm3,xmm0
|
||||
movdqa xmm0,xmm4
|
||||
punpcklbw xmm0,xmm3
|
||||
punpckhbw xmm4,xmm3
|
||||
punpcklwd xmm15,xmm0
|
||||
punpckhwd xmm1,xmm0
|
||||
movdqa xmm0,xmm5
|
||||
movdqa xmm12,xmm15
|
||||
punpcklwd xmm0,xmm4
|
||||
punpckhwd xmm5,xmm4
|
||||
punpckldq xmm12,xmm0
|
||||
punpckhdq xmm15,xmm0
|
||||
movdqa xmm0,xmm1
|
||||
movdqa xmm11,xmm12
|
||||
punpckldq xmm0,xmm5
|
||||
punpckhdq xmm1,xmm5
|
||||
punpcklqdq xmm11,xmm0
|
||||
punpckhqdq xmm12,xmm0
|
||||
movsx eax,r9w
|
||||
movdqa xmm14,xmm15
|
||||
punpcklqdq xmm14,xmm1
|
||||
punpckhqdq xmm15,xmm1
|
||||
pxor xmm1,xmm1
|
||||
movd xmm0,eax
|
||||
movdqa xmm4,xmm12
|
||||
movdqa xmm8,xmm11
|
||||
mov eax, ebp ; iBeta
|
||||
punpcklwd xmm0,xmm0
|
||||
punpcklbw xmm4,xmm1
|
||||
punpckhbw xmm12,xmm1
|
||||
movdqa xmm9,xmm14
|
||||
movdqa xmm7,xmm15
|
||||
movdqa xmm10,xmm15
|
||||
pshufd xmm13,xmm0,0
|
||||
punpcklbw xmm9,xmm1
|
||||
punpckhbw xmm14,xmm1
|
||||
movdqa xmm6,xmm13
|
||||
movd xmm0,eax
|
||||
movdqa [rsp],xmm11
|
||||
mov eax,2
|
||||
cwde
|
||||
punpckhbw xmm11,xmm1
|
||||
punpckhbw xmm10,xmm1
|
||||
punpcklbw xmm7,xmm1
|
||||
punpcklwd xmm0,xmm0
|
||||
punpcklbw xmm8,xmm1
|
||||
pshufd xmm3,xmm0,0
|
||||
movdqa xmm1,xmm8
|
||||
movdqa xmm0,xmm4
|
||||
psubw xmm0,xmm9
|
||||
psubw xmm1,xmm4
|
||||
movdqa xmm2,xmm3
|
||||
pabsw xmm0,xmm0
|
||||
pcmpgtw xmm6,xmm0
|
||||
pabsw xmm0,xmm1
|
||||
movdqa xmm1,xmm3
|
||||
pcmpgtw xmm2,xmm0
|
||||
pand xmm6,xmm2
|
||||
movdqa xmm0,xmm7
|
||||
movdqa xmm2,xmm3
|
||||
psubw xmm0,xmm9
|
||||
pabsw xmm0,xmm0
|
||||
pcmpgtw xmm1,xmm0
|
||||
pand xmm6,xmm1
|
||||
movdqa xmm0,xmm12
|
||||
movdqa xmm1,xmm11
|
||||
psubw xmm0,xmm14
|
||||
psubw xmm1,xmm12
|
||||
movdqa xmm5,xmm6
|
||||
pabsw xmm0,xmm0
|
||||
pcmpgtw xmm13,xmm0
|
||||
pabsw xmm0,xmm1
|
||||
movdqa xmm1,xmm8
|
||||
pcmpgtw xmm2,xmm0
|
||||
paddw xmm1,xmm8
|
||||
movdqa xmm0,xmm10
|
||||
pand xmm13,xmm2
|
||||
psubw xmm0,xmm14
|
||||
paddw xmm1,xmm4
|
||||
movdqa xmm2,xmm11
|
||||
pabsw xmm0,xmm0
|
||||
paddw xmm2,xmm11
|
||||
paddw xmm1,xmm7
|
||||
pcmpgtw xmm3,xmm0
|
||||
paddw xmm2,xmm12
|
||||
movd xmm0,eax
|
||||
pand xmm13,xmm3
|
||||
paddw xmm2,xmm10
|
||||
punpcklwd xmm0,xmm0
|
||||
pshufd xmm3,xmm0,0
|
||||
movdqa xmm0,xmm6
|
||||
paddw xmm1,xmm3
|
||||
pandn xmm0,xmm4
|
||||
paddw xmm2,xmm3
|
||||
psraw xmm1,2
|
||||
pand xmm5,xmm1
|
||||
por xmm5,xmm0
|
||||
paddw xmm7,xmm7
|
||||
paddw xmm10,xmm10
|
||||
psraw xmm2,2
|
||||
movdqa xmm1,xmm13
|
||||
movdqa xmm0,xmm13
|
||||
pandn xmm0,xmm12
|
||||
pand xmm1,xmm2
|
||||
paddw xmm7,xmm9
|
||||
por xmm1,xmm0
|
||||
paddw xmm10,xmm14
|
||||
paddw xmm7,xmm8
|
||||
movdqa xmm0,xmm13
|
||||
packuswb xmm5,xmm1
|
||||
paddw xmm7,xmm3
|
||||
paddw xmm10,xmm11
|
||||
movdqa xmm1,xmm6
|
||||
paddw xmm10,xmm3
|
||||
pandn xmm6,xmm9
|
||||
psraw xmm7,2
|
||||
pand xmm1,xmm7
|
||||
psraw xmm10,2
|
||||
pandn xmm13,xmm14
|
||||
pand xmm0,xmm10
|
||||
por xmm1,xmm6
|
||||
movdqa xmm6,[rsp]
|
||||
movdqa xmm4,xmm6
|
||||
por xmm0,xmm13
|
||||
punpcklbw xmm4,xmm5
|
||||
punpckhbw xmm6,xmm5
|
||||
movdqa xmm3,xmm4
|
||||
packuswb xmm1,xmm0
|
||||
movdqa xmm0,xmm1
|
||||
punpckhbw xmm1,xmm15
|
||||
punpcklbw xmm0,xmm15
|
||||
punpcklwd xmm3,xmm0
|
||||
punpckhwd xmm4,xmm0
|
||||
movdqa xmm0,xmm6
|
||||
movdqa xmm2,xmm3
|
||||
punpcklwd xmm0,xmm1
|
||||
punpckhwd xmm6,xmm1
|
||||
movdqa xmm1,xmm4
|
||||
punpckldq xmm2,xmm0
|
||||
punpckhdq xmm3,xmm0
|
||||
punpckldq xmm1,xmm6
|
||||
movdqa xmm0,xmm2
|
||||
punpcklqdq xmm0,xmm1
|
||||
punpckhdq xmm4,xmm6
|
||||
punpckhqdq xmm2,xmm1
|
||||
movdqa [rsp+10h],xmm0
|
||||
movdqa [rsp+60h],xmm2
|
||||
movdqa xmm0,xmm3
|
||||
mov eax,[rsp+10h]
|
||||
mov [rcx-2],eax
|
||||
mov eax,[rsp+60h]
|
||||
punpcklqdq xmm0,xmm4
|
||||
punpckhqdq xmm3,xmm4
|
||||
mov [r10+rcx-2],eax
|
||||
movdqa [rsp+20h],xmm0
|
||||
mov eax, [rsp+20h]
|
||||
movdqa [rsp+70h],xmm3
|
||||
mov [rcx+r10*2-2],eax
|
||||
mov eax,[rsp+70h]
|
||||
mov [rdx+rcx-2],eax
|
||||
mov eax,[rsp+18h]
|
||||
mov [r11],eax
|
||||
mov eax,[rsp+68h]
|
||||
mov [r10+r11],eax
|
||||
mov eax,[rsp+28h]
|
||||
mov [r11+r10*2],eax
|
||||
mov eax,[rsp+78h]
|
||||
mov [rdx+r11],eax
|
||||
mov eax,[rsp+14h]
|
||||
mov [rdi-2],eax
|
||||
mov eax,[rsp+64h]
|
||||
mov [r10+rdi-2],eax
|
||||
mov eax,[rsp+24h]
|
||||
mov [rdi+r10*2-2],eax
|
||||
mov eax, [rsp+74h]
|
||||
mov [rdx+rdi-2],eax
|
||||
mov eax, [rsp+1Ch]
|
||||
mov [rbx],eax
|
||||
mov eax, [rsp+6Ch]
|
||||
mov [r10+rbx],eax
|
||||
mov eax,[rsp+2Ch]
|
||||
mov [rbx+r10*2],eax
|
||||
mov eax,[rsp+7Ch]
|
||||
mov [rdx+rbx],eax
|
||||
lea r11,[rsp+140h]
|
||||
mov rbx, [r11+28h]
|
||||
mov rsp,r11
|
||||
pop r12
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
||||
|
||||
|
||||
|
||||
%elifdef X86_32
|
||||
|
||||
;***************************************************************************
|
||||
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
|
||||
; int32_t iAlpha, int32_t iBeta)
|
||||
;***************************************************************************
|
||||
|
||||
WELS_EXTERN DeblockChromaEq4H_ssse3
|
||||
push ebp
|
||||
mov ebp,esp
|
||||
and esp,0FFFFFFF0h
|
||||
sub esp,0C8h
|
||||
mov ecx,dword [ebp+8]
|
||||
mov edx,dword [ebp+0Ch]
|
||||
mov eax,dword [ebp+10h]
|
||||
sub ecx,2
|
||||
sub edx,2
|
||||
push esi
|
||||
lea esi,[eax+eax*2]
|
||||
mov dword [esp+18h],ecx
|
||||
mov dword [esp+4],edx
|
||||
lea ecx,[ecx+eax*4]
|
||||
lea edx,[edx+eax*4]
|
||||
lea eax,[esp+7Ch]
|
||||
push edi
|
||||
mov dword [esp+14h],esi
|
||||
mov dword [esp+18h],ecx
|
||||
mov dword [esp+0Ch],edx
|
||||
mov dword [esp+10h],eax
|
||||
mov esi,dword [esp+1Ch]
|
||||
mov ecx,dword [ebp+10h]
|
||||
mov edx,dword [esp+14h]
|
||||
movd xmm0,dword [esi]
|
||||
movd xmm1,dword [esi+ecx]
|
||||
movd xmm2,dword [esi+ecx*2]
|
||||
movd xmm3,dword [esi+edx]
|
||||
mov esi,dword [esp+8]
|
||||
movd xmm4,dword [esi]
|
||||
movd xmm5,dword [esi+ecx]
|
||||
movd xmm6,dword [esi+ecx*2]
|
||||
movd xmm7,dword [esi+edx]
|
||||
punpckldq xmm0,xmm4
|
||||
punpckldq xmm1,xmm5
|
||||
punpckldq xmm2,xmm6
|
||||
punpckldq xmm3,xmm7
|
||||
mov esi,dword [esp+18h]
|
||||
mov edi,dword [esp+0Ch]
|
||||
movd xmm4,dword [esi]
|
||||
movd xmm5,dword [edi]
|
||||
punpckldq xmm4,xmm5
|
||||
punpcklqdq xmm0,xmm4
|
||||
movd xmm4,dword [esi+ecx]
|
||||
movd xmm5,dword [edi+ecx]
|
||||
punpckldq xmm4,xmm5
|
||||
punpcklqdq xmm1,xmm4
|
||||
movd xmm4,dword [esi+ecx*2]
|
||||
movd xmm5,dword [edi+ecx*2]
|
||||
punpckldq xmm4,xmm5
|
||||
punpcklqdq xmm2,xmm4
|
||||
movd xmm4,dword [esi+edx]
|
||||
movd xmm5,dword [edi+edx]
|
||||
punpckldq xmm4,xmm5
|
||||
punpcklqdq xmm3,xmm4
|
||||
movdqa xmm6,xmm0
|
||||
punpcklbw xmm0,xmm1
|
||||
punpckhbw xmm6,xmm1
|
||||
movdqa xmm7,xmm2
|
||||
punpcklbw xmm2,xmm3
|
||||
punpckhbw xmm7,xmm3
|
||||
movdqa xmm4,xmm0
|
||||
movdqa xmm5,xmm6
|
||||
punpcklwd xmm0,xmm2
|
||||
punpckhwd xmm4,xmm2
|
||||
punpcklwd xmm6,xmm7
|
||||
punpckhwd xmm5,xmm7
|
||||
movdqa xmm1,xmm0
|
||||
movdqa xmm2,xmm4
|
||||
punpckldq xmm0,xmm6
|
||||
punpckhdq xmm1,xmm6
|
||||
punpckldq xmm4,xmm5
|
||||
punpckhdq xmm2,xmm5
|
||||
movdqa xmm5,xmm0
|
||||
movdqa xmm6,xmm1
|
||||
punpcklqdq xmm0,xmm4
|
||||
punpckhqdq xmm5,xmm4
|
||||
punpcklqdq xmm1,xmm2
|
||||
punpckhqdq xmm6,xmm2
|
||||
mov edi,dword [esp+10h]
|
||||
movdqa [edi],xmm0
|
||||
movdqa [edi+10h],xmm5
|
||||
movdqa [edi+20h],xmm1
|
||||
movdqa [edi+30h],xmm6
|
||||
movsx ecx,word [ebp+14h]
|
||||
movsx edx,word [ebp+18h]
|
||||
movdqa xmm6,[esp+80h]
|
||||
movdqa xmm4,[esp+90h]
|
||||
movdqa xmm5,[esp+0A0h]
|
||||
movdqa xmm7,[esp+0B0h]
|
||||
pxor xmm0,xmm0
|
||||
movd xmm1,ecx
|
||||
movdqa xmm2,xmm1
|
||||
punpcklwd xmm2,xmm1
|
||||
pshufd xmm1,xmm2,0
|
||||
movd xmm2,edx
|
||||
movdqa xmm3,xmm2
|
||||
punpcklwd xmm3,xmm2
|
||||
pshufd xmm2,xmm3,0
|
||||
movdqa xmm3,xmm6
|
||||
punpckhbw xmm6,xmm0
|
||||
movdqa [esp+60h],xmm6
|
||||
movdqa xmm6,[esp+90h]
|
||||
punpckhbw xmm6,xmm0
|
||||
movdqa [esp+30h],xmm6
|
||||
movdqa xmm6,[esp+0A0h]
|
||||
punpckhbw xmm6,xmm0
|
||||
movdqa [esp+40h],xmm6
|
||||
movdqa xmm6,[esp+0B0h]
|
||||
punpckhbw xmm6,xmm0
|
||||
movdqa [esp+70h],xmm6
|
||||
punpcklbw xmm7,xmm0
|
||||
punpcklbw xmm4,xmm0
|
||||
punpcklbw xmm5,xmm0
|
||||
punpcklbw xmm3,xmm0
|
||||
movdqa [esp+50h],xmm7
|
||||
movdqa xmm6,xmm4
|
||||
psubw xmm6,xmm5
|
||||
pabsw xmm6,xmm6
|
||||
movdqa xmm0,xmm1
|
||||
pcmpgtw xmm0,xmm6
|
||||
movdqa xmm6,xmm3
|
||||
psubw xmm6,xmm4
|
||||
pabsw xmm6,xmm6
|
||||
movdqa xmm7,xmm2
|
||||
pcmpgtw xmm7,xmm6
|
||||
movdqa xmm6,[esp+50h]
|
||||
psubw xmm6,xmm5
|
||||
pabsw xmm6,xmm6
|
||||
pand xmm0,xmm7
|
||||
movdqa xmm7,xmm2
|
||||
pcmpgtw xmm7,xmm6
|
||||
movdqa xmm6,[esp+30h]
|
||||
psubw xmm6,[esp+40h]
|
||||
pabsw xmm6,xmm6
|
||||
pcmpgtw xmm1,xmm6
|
||||
movdqa xmm6,[esp+60h]
|
||||
psubw xmm6,[esp+30h]
|
||||
pabsw xmm6,xmm6
|
||||
pand xmm0,xmm7
|
||||
movdqa xmm7,xmm2
|
||||
pcmpgtw xmm7,xmm6
|
||||
movdqa xmm6,[esp+70h]
|
||||
psubw xmm6,[esp+40h]
|
||||
pabsw xmm6,xmm6
|
||||
pand xmm1,xmm7
|
||||
pcmpgtw xmm2,xmm6
|
||||
pand xmm1,xmm2
|
||||
mov eax,2
|
||||
movsx ecx,ax
|
||||
movd xmm2,ecx
|
||||
movdqa xmm6,xmm2
|
||||
punpcklwd xmm6,xmm2
|
||||
pshufd xmm2,xmm6,0
|
||||
movdqa [esp+20h],xmm2
|
||||
movdqa xmm2,xmm3
|
||||
paddw xmm2,xmm3
|
||||
paddw xmm2,xmm4
|
||||
paddw xmm2,[esp+50h]
|
||||
paddw xmm2,[esp+20h]
|
||||
psraw xmm2,2
|
||||
movdqa xmm6,xmm0
|
||||
pand xmm6,xmm2
|
||||
movdqa xmm2,xmm0
|
||||
pandn xmm2,xmm4
|
||||
por xmm6,xmm2
|
||||
movdqa xmm2,[esp+60h]
|
||||
movdqa xmm7,xmm2
|
||||
paddw xmm7,xmm2
|
||||
paddw xmm7,[esp+30h]
|
||||
paddw xmm7,[esp+70h]
|
||||
paddw xmm7,[esp+20h]
|
||||
movdqa xmm4,xmm1
|
||||
movdqa xmm2,xmm1
|
||||
pandn xmm2,[esp+30h]
|
||||
psraw xmm7,2
|
||||
pand xmm4,xmm7
|
||||
por xmm4,xmm2
|
||||
movdqa xmm2,[esp+50h]
|
||||
packuswb xmm6,xmm4
|
||||
movdqa [esp+90h],xmm6
|
||||
movdqa xmm6,xmm2
|
||||
paddw xmm6,xmm2
|
||||
movdqa xmm2,[esp+20h]
|
||||
paddw xmm6,xmm5
|
||||
paddw xmm6,xmm3
|
||||
movdqa xmm4,xmm0
|
||||
pandn xmm0,xmm5
|
||||
paddw xmm6,xmm2
|
||||
psraw xmm6,2
|
||||
pand xmm4,xmm6
|
||||
por xmm4,xmm0
|
||||
movdqa xmm0,[esp+70h]
|
||||
movdqa xmm5,xmm0
|
||||
paddw xmm5,xmm0
|
||||
movdqa xmm0,[esp+40h]
|
||||
paddw xmm5,xmm0
|
||||
paddw xmm5,[esp+60h]
|
||||
movdqa xmm3,xmm1
|
||||
paddw xmm5,xmm2
|
||||
psraw xmm5,2
|
||||
pand xmm3,xmm5
|
||||
pandn xmm1,xmm0
|
||||
por xmm3,xmm1
|
||||
packuswb xmm4,xmm3
|
||||
movdqa [esp+0A0h],xmm4
|
||||
mov esi,dword [esp+10h]
|
||||
movdqa xmm0,[esi]
|
||||
movdqa xmm1,[esi+10h]
|
||||
movdqa xmm2,[esi+20h]
|
||||
movdqa xmm3,[esi+30h]
|
||||
movdqa xmm6,xmm0
|
||||
punpcklbw xmm0,xmm1
|
||||
punpckhbw xmm6,xmm1
|
||||
movdqa xmm7,xmm2
|
||||
punpcklbw xmm2,xmm3
|
||||
punpckhbw xmm7,xmm3
|
||||
movdqa xmm4,xmm0
|
||||
movdqa xmm5,xmm6
|
||||
punpcklwd xmm0,xmm2
|
||||
punpckhwd xmm4,xmm2
|
||||
punpcklwd xmm6,xmm7
|
||||
punpckhwd xmm5,xmm7
|
||||
movdqa xmm1,xmm0
|
||||
movdqa xmm2,xmm4
|
||||
punpckldq xmm0,xmm6
|
||||
punpckhdq xmm1,xmm6
|
||||
punpckldq xmm4,xmm5
|
||||
punpckhdq xmm2,xmm5
|
||||
movdqa xmm5,xmm0
|
||||
movdqa xmm6,xmm1
|
||||
punpcklqdq xmm0,xmm4
|
||||
punpckhqdq xmm5,xmm4
|
||||
punpcklqdq xmm1,xmm2
|
||||
punpckhqdq xmm6,xmm2
|
||||
mov esi,dword [esp+1Ch]
|
||||
mov ecx,dword [ebp+10h]
|
||||
mov edx,dword [esp+14h]
|
||||
mov edi,dword [esp+8]
|
||||
movd dword [esi],xmm0
|
||||
movd dword [esi+ecx],xmm5
|
||||
movd dword [esi+ecx*2],xmm1
|
||||
movd dword [esi+edx],xmm6
|
||||
psrldq xmm0,4
|
||||
psrldq xmm5,4
|
||||
psrldq xmm1,4
|
||||
psrldq xmm6,4
|
||||
mov esi,dword [esp+18h]
|
||||
movd dword [edi],xmm0
|
||||
movd dword [edi+ecx],xmm5
|
||||
movd dword [edi+ecx*2],xmm1
|
||||
movd dword [edi+edx],xmm6
|
||||
psrldq xmm0,4
|
||||
psrldq xmm5,4
|
||||
psrldq xmm1,4
|
||||
psrldq xmm6,4
|
||||
movd dword [esi],xmm0
|
||||
movd dword [esi+ecx],xmm5
|
||||
movd dword [esi+ecx*2],xmm1
|
||||
movd dword [esi+edx],xmm6
|
||||
psrldq xmm0,4
|
||||
psrldq xmm5,4
|
||||
psrldq xmm1,4
|
||||
psrldq xmm6,4
|
||||
mov edi,dword [esp+0Ch]
|
||||
movd dword [edi],xmm0
|
||||
movd dword [edi+ecx],xmm5
|
||||
movd dword [edi+ecx*2],xmm1
|
||||
movd dword [edi+edx],xmm6
|
||||
pop edi
|
||||
pop esi
|
||||
mov esp,ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; Body of the horizontal-edge chroma Eq4 filter, written once against the
; project's r0..r5 / argNd register aliases rather than per-ABI code.
; Outline: splat iAlpha, load Cb/Cr pixels around the edge, run the packed-byte
; filter macro, store the two modified pixel columns back.
; push_num is reset to 0 here (presumably the stack-push counter consumed by
; the LOAD_*_PARA / argN macros -- confirm in the shared asm include).
%assign push_num 0
|
||||
; Project macros, definitions not visible here: presumably load the first four
; arguments into r0-r3 and preserve xmm registers where the ABI requires it.
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
; Presumably widens the 32-bit iStride in r2d to full register width -- confirm.
SIGN_EXTENSION r2, r2d
|
||||
; Splat the low byte of iAlpha (arg 4) to all 16 bytes of xmm7: pshufb with an
; all-zero control register replicates byte 0. This reads arg4d before r3 is
; repurposed by the lea below.
movd xmm7, arg4d
|
||||
pxor xmm0, xmm0
|
||||
pshufb xmm7, xmm0 ; iAlpha
|
||||
lea r3, [3 * r2 - 1] ; 3 * iStride - 1
|
||||
|
||||
; Load macro (definition not visible): its first four xmm operands are passed
; straight to the filter below as p1, p0, q0, q1, so it presumably gathers and
; transposes the Cb and Cr columns into xmm0, xmm1, xmm4, xmm5 -- confirm.
SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
|
||||
; Operand order is p1, p0, q0, q1, iAlpha, iBeta, then three scratch registers;
; filtered p0 comes back in xmm1 and filtered q0 in xmm4.
SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6
|
||||
; Both branches issue the identical store; the 32-bit build additionally saves
; and restores r4/r5, which the store macro receives as scratch operands
; (presumably because they map to callee-saved registers there -- confirm).
%ifdef X86_32
|
||||
push r4
|
||||
push r5
|
||||
SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
|
||||
pop r5
|
||||
pop r4
|
||||
%else
|
||||
SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
|
||||
%endif
|
||||
|
||||
; Undo PUSH_XMM and LOAD_4_PARA in reverse order before returning.
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
|
||||
;********************************************************************************
|
||||
|
Loading…
Reference in New Issue
Block a user