diff --git a/codec/common/x86/deblock.asm b/codec/common/x86/deblock.asm index 608c96f0..1192f4b0 100644 --- a/codec/common/x86/deblock.asm +++ b/codec/common/x86/deblock.asm @@ -509,6 +509,32 @@ WELS_EXTERN DeblockLumaEq4V_ssse3 SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9 %endmacro +; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9; outputs: p0out in %2, q0out in %3; inputs %1 (p1) and %5 (iAlpha) are also overwritten +%macro SSSE3_DeblockChromaEq4 9 + movdqa %7, %3 + SSE2_AbsDiffUB %7, %2, %8 ; |p0 - q0| + SSE2_CmpgeUB %7, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha + movdqa %8, %4 + SSE2_AbsDiffUB %8, %3, %5 ; |q1 - q0| + movdqa %9, %1 + SSE2_AbsDiffUB %9, %2, %5 ; |p1 - p0| + pmaxub %8, %9 ; max(|q1 - q0|, |p1 - p0|) + pxor %9, %9 ; all-zero shuffle control for the pshufb below + movd %5, %6 ; iAlpha is not needed past this point, %5 now carries iBeta + pshufb %5, %9 ; iBeta + SSE2_CmpgeUB %8, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta + por %7, %8 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0 + WELS_DB1 %5 ; NOTE(review): presumably fills %5 with the byte constant 1 consumed by SSE2_AvgbFloor1 - confirm against the WELS_DB1 definition + movdqa %8, %2 ; work on a copy so p0 in %2 is still available for the blend + SSE2_AvgbFloor1 %8, %4, %5, %9 ; (p0 + q1) >> 1 + pavgb %8, %1 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1 + movdqa %9, %7 ; keep a copy of the mask in %9 for the q0 blend, %7 is handed to the macros below + SSE2_Blend %2, %8, %7 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0 + SSE2_AvgbFloor1 %1, %3, %5, %7 ; (q0 + p1) >> 1 + pavgb %1, %4 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1 + SSE2_Blend %3, %1, %9 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? 
q0' : q0 +%endmacro + ;****************************************************************************** ; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, @@ -572,42 +598,15 @@ WELS_EXTERN DeblockChromaEq4V_ssse3 movhps xmm0, [r1 + 0 * r2] ; q0 cr movq xmm2, [r0 + 1 * r3] ; p0 cb movhps xmm2, [r1 + 1 * r3] ; p0 cr - - movdqa xmm4, xmm0 - SSE2_AbsDiffUB xmm4, xmm2, xmm5 ; |p0 - q0| - SSE2_CmpgeUB xmm4, xmm7 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha - movq xmm1, [r0 + 1 * r2] ; q1 cb movhps xmm1, [r1 + 1 * r2] ; q1 cr movq xmm3, [r0 + 2 * r3] ; p1 cb movhps xmm3, [r1 + 2 * r3] ; p1 cr - movdqa xmm5, xmm1 - SSE2_AbsDiffUB xmm5, xmm0, xmm7 ; |q1 - q0| - movdqa xmm6, xmm3 - SSE2_AbsDiffUB xmm6, xmm2, xmm7 ; |p1 - p0| - pmaxub xmm5, xmm6 ; max(|q1 - q0|, |p1 - p0|) + SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6 ; p0 result is stored from xmm2 and q0 result from xmm0 below - pxor xmm6, xmm6 - movd xmm7, arg5d - pshufb xmm7, xmm6 ; iBeta - - SSE2_CmpgeUB xmm5, xmm7 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta - por xmm4, xmm5 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0 - - WELS_DB1 xmm7 - movdqa xmm5, xmm2 - SSE2_AvgbFloor1 xmm2, xmm1, xmm7, xmm6 ; (p0 + q1) >> 1 - pavgb xmm2, xmm3 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1 - movdqa xmm6, xmm4 - SSE2_Blend xmm5, xmm2, xmm4 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0 - - SSE2_AvgbFloor1 xmm3, xmm0, xmm7, xmm4 ; (q0 + p1) >> 1 - pavgb xmm3, xmm1 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1 - SSE2_Blend xmm0, xmm3, xmm6 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? 
q0' : q0 - - movlps [r0 + 1 * r3], xmm5 ; store p0 cb - movhps [r1 + 1 * r3], xmm5 ; store p0 cr + movlps [r0 + 1 * r3], xmm2 ; store p0 cb + movhps [r1 + 1 * r3], xmm2 ; store p0 cr movlps [r0 + 0 * r2], xmm0 ; store q0 cb movhps [r1 + 0 * r2], xmm0 ; store q0 cr @@ -640,834 +639,36 @@ WELS_EXTERN DeblockChromaLt4H_ssse3 ret -%ifdef WIN64 - - -WELS_EXTERN DeblockChromaEq4H_ssse3 - mov rax,rsp - mov [rax+20h],rbx - push rdi - PUSH_XMM 16 - sub rsp,140h - mov rdi,rdx - lea eax,[r8*4] - movsxd r10,eax - mov eax,[rcx-2] - mov [rsp+10h],eax - lea rbx,[r10+rdx-2] - lea r11,[r10+rcx-2] - movdqa xmm5,[rsp+10h] - movsxd r10,r8d - mov eax,[r10+rcx-2] - lea rdx,[r10+r10*2] - mov [rsp+20h],eax - mov eax,[rcx+r10*2-2] - mov [rsp+30h],eax - mov eax,[rdx+rcx-2] - movdqa xmm2,[rsp+20h] - mov [rsp+40h],eax - mov eax, [rdi-2] - movdqa xmm4,[rsp+30h] - mov [rsp+50h],eax - mov eax,[r10+rdi-2] - movdqa xmm3,[rsp+40h] - mov [rsp+60h],eax - mov eax,[rdi+r10*2-2] - punpckldq xmm5,[rsp+50h] - mov [rsp+70h],eax - mov eax, [rdx+rdi-2] - punpckldq xmm2, [rsp+60h] - mov [rsp+80h],eax - mov eax,[r11] - punpckldq xmm4, [rsp+70h] - mov [rsp+50h],eax - mov eax,[rbx] - punpckldq xmm3,[rsp+80h] - mov [rsp+60h],eax - mov eax,[r10+r11] - movdqa xmm0, [rsp+50h] - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm5,xmm0 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax,[r10+rbx] - movdqa xmm0,[rsp+50h] - movdqa xmm1,xmm5 - mov [rsp+60h],eax - mov eax,[r11+r10*2] - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm2,xmm0 - punpcklbw xmm1,xmm2 - punpckhbw xmm5,xmm2 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax,[rbx+r10*2] - movdqa xmm0,[rsp+50h] - mov [rsp+60h],eax - mov eax, [rdx+r11] - movdqa xmm15,xmm1 - punpckldq xmm0,[rsp+60h] - punpcklqdq xmm4,xmm0 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax, [rdx+rbx] - movdqa xmm0,[rsp+50h] - mov [rsp+60h],eax - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm3,xmm0 - movdqa xmm0,xmm4 - punpcklbw xmm0,xmm3 - punpckhbw xmm4,xmm3 - punpcklwd xmm15,xmm0 - punpckhwd 
xmm1,xmm0 - movdqa xmm0,xmm5 - movdqa xmm12,xmm15 - punpcklwd xmm0,xmm4 - punpckhwd xmm5,xmm4 - punpckldq xmm12,xmm0 - punpckhdq xmm15,xmm0 - movdqa xmm0,xmm1 - movdqa xmm11,xmm12 - punpckldq xmm0,xmm5 - punpckhdq xmm1,xmm5 - punpcklqdq xmm11,xmm0 - punpckhqdq xmm12,xmm0 - movsx eax,r9w - movdqa xmm14,xmm15 - punpcklqdq xmm14,xmm1 - punpckhqdq xmm15,xmm1 - pxor xmm1,xmm1 - movd xmm0,eax - movdqa xmm4,xmm12 - movdqa xmm8,xmm11 - movsx eax,word [rsp+170h + 160] ; iBeta - punpcklwd xmm0,xmm0 - punpcklbw xmm4,xmm1 - punpckhbw xmm12,xmm1 - movdqa xmm9,xmm14 - movdqa xmm7,xmm15 - movdqa xmm10,xmm15 - pshufd xmm13,xmm0,0 - punpcklbw xmm9,xmm1 - punpckhbw xmm14,xmm1 - movdqa xmm6,xmm13 - movd xmm0,eax - movdqa [rsp],xmm11 - mov eax,2 - cwde - punpckhbw xmm11,xmm1 - punpckhbw xmm10,xmm1 - punpcklbw xmm7,xmm1 - punpcklwd xmm0,xmm0 - punpcklbw xmm8,xmm1 - pshufd xmm3,xmm0,0 - movdqa xmm1,xmm8 - movdqa xmm0,xmm4 - psubw xmm0,xmm9 - psubw xmm1,xmm4 - movdqa xmm2,xmm3 - pabsw xmm0,xmm0 - pcmpgtw xmm6,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm3 - pcmpgtw xmm2,xmm0 - pand xmm6,xmm2 - movdqa xmm0,xmm7 - movdqa xmm2,xmm3 - psubw xmm0,xmm9 - pabsw xmm0,xmm0 - pcmpgtw xmm1,xmm0 - pand xmm6,xmm1 - movdqa xmm0,xmm12 - movdqa xmm1,xmm11 - psubw xmm0,xmm14 - psubw xmm1,xmm12 - movdqa xmm5,xmm6 - pabsw xmm0,xmm0 - pcmpgtw xmm13,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm8 - pcmpgtw xmm2,xmm0 - paddw xmm1,xmm8 - movdqa xmm0,xmm10 - pand xmm13,xmm2 - psubw xmm0,xmm14 - paddw xmm1,xmm4 - movdqa xmm2,xmm11 - pabsw xmm0,xmm0 - paddw xmm2,xmm11 - paddw xmm1,xmm7 - pcmpgtw xmm3,xmm0 - paddw xmm2,xmm12 - movd xmm0,eax - pand xmm13,xmm3 - paddw xmm2,xmm10 - punpcklwd xmm0,xmm0 - pshufd xmm3,xmm0,0 - movdqa xmm0,xmm6 - paddw xmm1,xmm3 - pandn xmm0,xmm4 - paddw xmm2,xmm3 - psraw xmm1,2 - pand xmm5,xmm1 - por xmm5,xmm0 - paddw xmm7,xmm7 - paddw xmm10,xmm10 - psraw xmm2,2 - movdqa xmm1,xmm13 - movdqa xmm0,xmm13 - pandn xmm0,xmm12 - pand xmm1,xmm2 - paddw xmm7,xmm9 - por xmm1,xmm0 - paddw xmm10,xmm14 - paddw 
xmm7,xmm8 - movdqa xmm0,xmm13 - packuswb xmm5,xmm1 - paddw xmm7,xmm3 - paddw xmm10,xmm11 - movdqa xmm1,xmm6 - paddw xmm10,xmm3 - pandn xmm6,xmm9 - psraw xmm7,2 - pand xmm1,xmm7 - psraw xmm10,2 - pandn xmm13,xmm14 - pand xmm0,xmm10 - por xmm1,xmm6 - movdqa xmm6,[rsp] - movdqa xmm4,xmm6 - por xmm0,xmm13 - punpcklbw xmm4,xmm5 - punpckhbw xmm6,xmm5 - movdqa xmm3,xmm4 - packuswb xmm1,xmm0 - movdqa xmm0,xmm1 - punpckhbw xmm1,xmm15 - punpcklbw xmm0,xmm15 - punpcklwd xmm3,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm0,xmm6 - movdqa xmm2,xmm3 - punpcklwd xmm0,xmm1 - punpckhwd xmm6,xmm1 - movdqa xmm1,xmm4 - punpckldq xmm2,xmm0 - punpckhdq xmm3,xmm0 - punpckldq xmm1,xmm6 - movdqa xmm0,xmm2 - punpcklqdq xmm0,xmm1 - punpckhdq xmm4,xmm6 - punpckhqdq xmm2,xmm1 - movdqa [rsp+10h],xmm0 - movdqa [rsp+60h],xmm2 - movdqa xmm0,xmm3 - mov eax,[rsp+10h] - mov [rcx-2],eax - mov eax,[rsp+60h] - punpcklqdq xmm0,xmm4 - punpckhqdq xmm3,xmm4 - mov [r10+rcx-2],eax - movdqa [rsp+20h],xmm0 - mov eax, [rsp+20h] - movdqa [rsp+70h],xmm3 - mov [rcx+r10*2-2],eax - mov eax,[rsp+70h] - mov [rdx+rcx-2],eax - mov eax,[rsp+18h] - mov [r11],eax - mov eax,[rsp+68h] - mov [r10+r11],eax - mov eax,[rsp+28h] - mov [r11+r10*2],eax - mov eax,[rsp+78h] - mov [rdx+r11],eax - mov eax,[rsp+14h] - mov [rdi-2],eax - mov eax,[rsp+64h] - mov [r10+rdi-2],eax - mov eax,[rsp+24h] - mov [rdi+r10*2-2],eax - mov eax, [rsp+74h] - mov [rdx+rdi-2],eax - mov eax, [rsp+1Ch] - mov [rbx],eax - mov eax, [rsp+6Ch] - mov [r10+rbx],eax - mov eax,[rsp+2Ch] - mov [rbx+r10*2],eax - mov eax,[rsp+7Ch] - mov [rdx+rbx],eax - lea rsp,[rsp+140h] - POP_XMM - mov rbx, [rsp+28h] - pop rdi - ret - - - -%elifdef UNIX64 - - -WELS_EXTERN DeblockChromaEq4H_ssse3 - mov rax,rsp - push rbx - push rbp - push r12 - - mov rbp, r8 - mov r8, rdx - mov r9, rcx - mov rcx, rdi - mov rdx, rsi - mov rdi, rdx - - sub rsp,140h - lea eax,[r8*4] - movsxd r10,eax - mov eax,[rcx-2] - mov [rsp+10h],eax - lea rbx,[r10+rdx-2] - lea r11,[r10+rcx-2] - - movdqa xmm5,[rsp+10h] - movsxd 
r10,r8d - mov eax,[r10+rcx-2] - lea rdx,[r10+r10*2] - mov [rsp+20h],eax - mov eax,[rcx+r10*2-2] - mov [rsp+30h],eax - mov eax,[rdx+rcx-2] - movdqa xmm2,[rsp+20h] - mov [rsp+40h],eax - mov eax, [rdi-2] - movdqa xmm4,[rsp+30h] - mov [rsp+50h],eax - mov eax,[r10+rdi-2] - movdqa xmm3,[rsp+40h] - mov [rsp+60h],eax - mov eax,[rdi+r10*2-2] - punpckldq xmm5,[rsp+50h] - mov [rsp+70h],eax - mov eax, [rdx+rdi-2] - punpckldq xmm2, [rsp+60h] - mov [rsp+80h],eax - mov eax,[r11] - punpckldq xmm4, [rsp+70h] - mov [rsp+50h],eax - mov eax,[rbx] - punpckldq xmm3,[rsp+80h] - mov [rsp+60h],eax - mov eax,[r10+r11] - movdqa xmm0, [rsp+50h] - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm5,xmm0 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax,[r10+rbx] - movdqa xmm0,[rsp+50h] - movdqa xmm1,xmm5 - mov [rsp+60h],eax - mov eax,[r11+r10*2] - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm2,xmm0 - punpcklbw xmm1,xmm2 - punpckhbw xmm5,xmm2 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax,[rbx+r10*2] - movdqa xmm0,[rsp+50h] - mov [rsp+60h],eax - mov eax, [rdx+r11] - movdqa xmm15,xmm1 - punpckldq xmm0,[rsp+60h] - punpcklqdq xmm4,xmm0 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax, [rdx+rbx] - movdqa xmm0,[rsp+50h] - mov [rsp+60h],eax - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm3,xmm0 - movdqa xmm0,xmm4 - punpcklbw xmm0,xmm3 - punpckhbw xmm4,xmm3 - punpcklwd xmm15,xmm0 - punpckhwd xmm1,xmm0 - movdqa xmm0,xmm5 - movdqa xmm12,xmm15 - punpcklwd xmm0,xmm4 - punpckhwd xmm5,xmm4 - punpckldq xmm12,xmm0 - punpckhdq xmm15,xmm0 - movdqa xmm0,xmm1 - movdqa xmm11,xmm12 - punpckldq xmm0,xmm5 - punpckhdq xmm1,xmm5 - punpcklqdq xmm11,xmm0 - punpckhqdq xmm12,xmm0 - movsx eax,r9w - movdqa xmm14,xmm15 - punpcklqdq xmm14,xmm1 - punpckhqdq xmm15,xmm1 - pxor xmm1,xmm1 - movd xmm0,eax - movdqa xmm4,xmm12 - movdqa xmm8,xmm11 - mov eax, ebp ; iBeta - punpcklwd xmm0,xmm0 - punpcklbw xmm4,xmm1 - punpckhbw xmm12,xmm1 - movdqa xmm9,xmm14 - movdqa xmm7,xmm15 - movdqa xmm10,xmm15 - pshufd xmm13,xmm0,0 - punpcklbw 
xmm9,xmm1 - punpckhbw xmm14,xmm1 - movdqa xmm6,xmm13 - movd xmm0,eax - movdqa [rsp],xmm11 - mov eax,2 - cwde - punpckhbw xmm11,xmm1 - punpckhbw xmm10,xmm1 - punpcklbw xmm7,xmm1 - punpcklwd xmm0,xmm0 - punpcklbw xmm8,xmm1 - pshufd xmm3,xmm0,0 - movdqa xmm1,xmm8 - movdqa xmm0,xmm4 - psubw xmm0,xmm9 - psubw xmm1,xmm4 - movdqa xmm2,xmm3 - pabsw xmm0,xmm0 - pcmpgtw xmm6,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm3 - pcmpgtw xmm2,xmm0 - pand xmm6,xmm2 - movdqa xmm0,xmm7 - movdqa xmm2,xmm3 - psubw xmm0,xmm9 - pabsw xmm0,xmm0 - pcmpgtw xmm1,xmm0 - pand xmm6,xmm1 - movdqa xmm0,xmm12 - movdqa xmm1,xmm11 - psubw xmm0,xmm14 - psubw xmm1,xmm12 - movdqa xmm5,xmm6 - pabsw xmm0,xmm0 - pcmpgtw xmm13,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm8 - pcmpgtw xmm2,xmm0 - paddw xmm1,xmm8 - movdqa xmm0,xmm10 - pand xmm13,xmm2 - psubw xmm0,xmm14 - paddw xmm1,xmm4 - movdqa xmm2,xmm11 - pabsw xmm0,xmm0 - paddw xmm2,xmm11 - paddw xmm1,xmm7 - pcmpgtw xmm3,xmm0 - paddw xmm2,xmm12 - movd xmm0,eax - pand xmm13,xmm3 - paddw xmm2,xmm10 - punpcklwd xmm0,xmm0 - pshufd xmm3,xmm0,0 - movdqa xmm0,xmm6 - paddw xmm1,xmm3 - pandn xmm0,xmm4 - paddw xmm2,xmm3 - psraw xmm1,2 - pand xmm5,xmm1 - por xmm5,xmm0 - paddw xmm7,xmm7 - paddw xmm10,xmm10 - psraw xmm2,2 - movdqa xmm1,xmm13 - movdqa xmm0,xmm13 - pandn xmm0,xmm12 - pand xmm1,xmm2 - paddw xmm7,xmm9 - por xmm1,xmm0 - paddw xmm10,xmm14 - paddw xmm7,xmm8 - movdqa xmm0,xmm13 - packuswb xmm5,xmm1 - paddw xmm7,xmm3 - paddw xmm10,xmm11 - movdqa xmm1,xmm6 - paddw xmm10,xmm3 - pandn xmm6,xmm9 - psraw xmm7,2 - pand xmm1,xmm7 - psraw xmm10,2 - pandn xmm13,xmm14 - pand xmm0,xmm10 - por xmm1,xmm6 - movdqa xmm6,[rsp] - movdqa xmm4,xmm6 - por xmm0,xmm13 - punpcklbw xmm4,xmm5 - punpckhbw xmm6,xmm5 - movdqa xmm3,xmm4 - packuswb xmm1,xmm0 - movdqa xmm0,xmm1 - punpckhbw xmm1,xmm15 - punpcklbw xmm0,xmm15 - punpcklwd xmm3,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm0,xmm6 - movdqa xmm2,xmm3 - punpcklwd xmm0,xmm1 - punpckhwd xmm6,xmm1 - movdqa xmm1,xmm4 - punpckldq xmm2,xmm0 - punpckhdq 
xmm3,xmm0 - punpckldq xmm1,xmm6 - movdqa xmm0,xmm2 - punpcklqdq xmm0,xmm1 - punpckhdq xmm4,xmm6 - punpckhqdq xmm2,xmm1 - movdqa [rsp+10h],xmm0 - movdqa [rsp+60h],xmm2 - movdqa xmm0,xmm3 - mov eax,[rsp+10h] - mov [rcx-2],eax - mov eax,[rsp+60h] - punpcklqdq xmm0,xmm4 - punpckhqdq xmm3,xmm4 - mov [r10+rcx-2],eax - movdqa [rsp+20h],xmm0 - mov eax, [rsp+20h] - movdqa [rsp+70h],xmm3 - mov [rcx+r10*2-2],eax - mov eax,[rsp+70h] - mov [rdx+rcx-2],eax - mov eax,[rsp+18h] - mov [r11],eax - mov eax,[rsp+68h] - mov [r10+r11],eax - mov eax,[rsp+28h] - mov [r11+r10*2],eax - mov eax,[rsp+78h] - mov [rdx+r11],eax - mov eax,[rsp+14h] - mov [rdi-2],eax - mov eax,[rsp+64h] - mov [r10+rdi-2],eax - mov eax,[rsp+24h] - mov [rdi+r10*2-2],eax - mov eax, [rsp+74h] - mov [rdx+rdi-2],eax - mov eax, [rsp+1Ch] - mov [rbx],eax - mov eax, [rsp+6Ch] - mov [r10+rbx],eax - mov eax,[rsp+2Ch] - mov [rbx+r10*2],eax - mov eax,[rsp+7Ch] - mov [rdx+rbx],eax - lea r11,[rsp+140h] - mov rbx, [r11+28h] - mov rsp,r11 - pop r12 - pop rbp - pop rbx - ret - - - -%elifdef X86_32 - ;*************************************************************************** ; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, ; int32_t iAlpha, int32_t iBeta) ;*************************************************************************** WELS_EXTERN DeblockChromaEq4H_ssse3 - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,0C8h - mov ecx,dword [ebp+8] - mov edx,dword [ebp+0Ch] - mov eax,dword [ebp+10h] - sub ecx,2 - sub edx,2 - push esi - lea esi,[eax+eax*2] - mov dword [esp+18h],ecx - mov dword [esp+4],edx - lea ecx,[ecx+eax*4] - lea edx,[edx+eax*4] - lea eax,[esp+7Ch] - push edi - mov dword [esp+14h],esi - mov dword [esp+18h],ecx - mov dword [esp+0Ch],edx - mov dword [esp+10h],eax - mov esi,dword [esp+1Ch] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+14h] - movd xmm0,dword [esi] - movd xmm1,dword [esi+ecx] - movd xmm2,dword [esi+ecx*2] - movd xmm3,dword [esi+edx] - mov esi,dword [esp+8] - movd 
xmm4,dword [esi] - movd xmm5,dword [esi+ecx] - movd xmm6,dword [esi+ecx*2] - movd xmm7,dword [esi+edx] - punpckldq xmm0,xmm4 - punpckldq xmm1,xmm5 - punpckldq xmm2,xmm6 - punpckldq xmm3,xmm7 - mov esi,dword [esp+18h] - mov edi,dword [esp+0Ch] - movd xmm4,dword [esi] - movd xmm5,dword [edi] - punpckldq xmm4,xmm5 - punpcklqdq xmm0,xmm4 - movd xmm4,dword [esi+ecx] - movd xmm5,dword [edi+ecx] - punpckldq xmm4,xmm5 - punpcklqdq xmm1,xmm4 - movd xmm4,dword [esi+ecx*2] - movd xmm5,dword [edi+ecx*2] - punpckldq xmm4,xmm5 - punpcklqdq xmm2,xmm4 - movd xmm4,dword [esi+edx] - movd xmm5,dword [edi+edx] - punpckldq xmm4,xmm5 - punpcklqdq xmm3,xmm4 - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov edi,dword [esp+10h] - movdqa [edi],xmm0 - movdqa [edi+10h],xmm5 - movdqa [edi+20h],xmm1 - movdqa [edi+30h],xmm6 - movsx ecx,word [ebp+14h] - movsx edx,word [ebp+18h] - movdqa xmm6,[esp+80h] - movdqa xmm4,[esp+90h] - movdqa xmm5,[esp+0A0h] - movdqa xmm7,[esp+0B0h] - pxor xmm0,xmm0 - movd xmm1,ecx - movdqa xmm2,xmm1 - punpcklwd xmm2,xmm1 - pshufd xmm1,xmm2,0 - movd xmm2,edx - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm2,xmm3,0 - movdqa xmm3,xmm6 - punpckhbw xmm6,xmm0 - movdqa [esp+60h],xmm6 - movdqa xmm6,[esp+90h] - punpckhbw xmm6,xmm0 - movdqa [esp+30h],xmm6 - movdqa xmm6,[esp+0A0h] - punpckhbw xmm6,xmm0 - movdqa [esp+40h],xmm6 - movdqa xmm6,[esp+0B0h] - punpckhbw xmm6,xmm0 - movdqa [esp+70h],xmm6 - punpcklbw xmm7,xmm0 - punpcklbw xmm4,xmm0 - punpcklbw xmm5,xmm0 - punpcklbw xmm3,xmm0 - movdqa [esp+50h],xmm7 - movdqa 
xmm6,xmm4 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - movdqa xmm0,xmm1 - pcmpgtw xmm0,xmm6 - movdqa xmm6,xmm3 - psubw xmm6,xmm4 - pabsw xmm6,xmm6 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pand xmm0,xmm7 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+30h] - psubw xmm6,[esp+40h] - pabsw xmm6,xmm6 - pcmpgtw xmm1,xmm6 - movdqa xmm6,[esp+60h] - psubw xmm6,[esp+30h] - pabsw xmm6,xmm6 - pand xmm0,xmm7 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+70h] - psubw xmm6,[esp+40h] - pabsw xmm6,xmm6 - pand xmm1,xmm7 - pcmpgtw xmm2,xmm6 - pand xmm1,xmm2 - mov eax,2 - movsx ecx,ax - movd xmm2,ecx - movdqa xmm6,xmm2 - punpcklwd xmm6,xmm2 - pshufd xmm2,xmm6,0 - movdqa [esp+20h],xmm2 - movdqa xmm2,xmm3 - paddw xmm2,xmm3 - paddw xmm2,xmm4 - paddw xmm2,[esp+50h] - paddw xmm2,[esp+20h] - psraw xmm2,2 - movdqa xmm6,xmm0 - pand xmm6,xmm2 - movdqa xmm2,xmm0 - pandn xmm2,xmm4 - por xmm6,xmm2 - movdqa xmm2,[esp+60h] - movdqa xmm7,xmm2 - paddw xmm7,xmm2 - paddw xmm7,[esp+30h] - paddw xmm7,[esp+70h] - paddw xmm7,[esp+20h] - movdqa xmm4,xmm1 - movdqa xmm2,xmm1 - pandn xmm2,[esp+30h] - psraw xmm7,2 - pand xmm4,xmm7 - por xmm4,xmm2 - movdqa xmm2,[esp+50h] - packuswb xmm6,xmm4 - movdqa [esp+90h],xmm6 - movdqa xmm6,xmm2 - paddw xmm6,xmm2 - movdqa xmm2,[esp+20h] - paddw xmm6,xmm5 - paddw xmm6,xmm3 - movdqa xmm4,xmm0 - pandn xmm0,xmm5 - paddw xmm6,xmm2 - psraw xmm6,2 - pand xmm4,xmm6 - por xmm4,xmm0 - movdqa xmm0,[esp+70h] - movdqa xmm5,xmm0 - paddw xmm5,xmm0 - movdqa xmm0,[esp+40h] - paddw xmm5,xmm0 - paddw xmm5,[esp+60h] - movdqa xmm3,xmm1 - paddw xmm5,xmm2 - psraw xmm5,2 - pand xmm3,xmm5 - pandn xmm1,xmm0 - por xmm3,xmm1 - packuswb xmm4,xmm3 - movdqa [esp+0A0h],xmm4 - mov esi,dword [esp+10h] - movdqa xmm0,[esi] - movdqa xmm1,[esi+10h] - movdqa xmm2,[esi+20h] - movdqa xmm3,[esi+30h] - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 
- movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov esi,dword [esp+1Ch] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+14h] - mov edi,dword [esp+8] - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov esi,dword [esp+18h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov edi,dword [esp+0Ch] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - pop edi - pop esi - mov esp,ebp - pop ebp - ret - + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r2, r2d + movd xmm7, arg4d + pxor xmm0, xmm0 + pshufb xmm7, xmm0 ; iAlpha + lea r3, [3 * r2 - 1] ; 3 * iStride - 1 + SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6 ; xmm0/xmm1/xmm4/xmm5 are passed on as p1/p0/q0/q1 + SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6 ; iBeta is read from arg5d; xmm1 (p0) and xmm4 (q0) feed the store below +%ifdef X86_32 + push r4 ; NOTE(review): r4/r5 are saved only on X86_32, presumably because the store macro uses them as scratch and they must be preserved there - confirm + push r5 + SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 + pop r5 + pop r4 +%else + SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 %endif + POP_XMM + LOAD_4_PARA_POP + ret ;********************************************************************************