;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  deblock.asm
;*
;*  Abstract
;*      edge loop
;*
;*  History
;*      08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
BITS 32

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

%ifdef FORMAT_COFF
SECTION .rodata pData
%else
SECTION .rodata align=16
%endif

SECTION .text
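
; Overview (informal): the routines in this section implement the H.264
; in-loop deblocking filter for one edge segment.  A sample pair is only
; filtered when, roughly,
;     |p0 - q0| < iAlpha,   |p1 - p0| < iBeta,   |q1 - q0| < iBeta,
; which the SSE2 code below evaluates with psubw/pabsw followed by pcmpgtw
; against broadcast copies of iAlpha and iBeta, combining the masks with pand.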
|
|
|
|
;********************************************************************************
; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta)
;********************************************************************************
WELS_EXTERN DeblockChromaEq4V_sse2

ALIGN 16
DeblockChromaEq4V_sse2:
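; Strong (bS == 4) chroma filter, handling Cb and Cr in one pass: rows
; p1,p0,q0,q1 are loaded at offsets -2*iStride .. +iStride from pPixCb/pPixCr,
; widened to 16-bit, and where the alpha/beta tests pass, p0 and q0 are
; replaced by (2*p1 + p0 + q1 + 2) >> 2 and (2*q1 + q0 + p1 + 2) >> 2.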
|
|
push ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF0h
|
|
sub esp,68h
|
|
mov edx,[ebp+10h] ; iStride
|
|
mov eax,[ebp+8] ; pPixCb
|
|
mov ecx,[ebp+0Ch] ; pPixCr
|
|
movq xmm4,[ecx]
|
|
movq xmm5,[edx+ecx]
|
|
push esi
|
|
push edi
|
|
lea esi,[edx+edx]
|
|
mov edi,eax
|
|
sub edi,esi
|
|
movq xmm1,[edi]
|
|
mov edi,ecx
|
|
sub edi,esi
|
|
movq xmm2,[edi]
|
|
punpcklqdq xmm1,xmm2
|
|
mov esi,eax
|
|
sub esi,edx
|
|
movq xmm2,[esi]
|
|
mov edi,ecx
|
|
sub edi,edx
|
|
movq xmm3,[edi]
|
|
punpcklqdq xmm2,xmm3
|
|
movq xmm3,[eax]
|
|
punpcklqdq xmm3,xmm4
|
|
movq xmm4,[edx+eax]
|
|
mov edx, [ebp + 14h]
|
|
punpcklqdq xmm4,xmm5
|
|
movd xmm5,edx
|
|
mov edx, [ebp + 18h]
|
|
pxor xmm0,xmm0
|
|
movdqa xmm6,xmm5
|
|
punpcklwd xmm6,xmm5
|
|
pshufd xmm5,xmm6,0
|
|
movd xmm6,edx
|
|
movdqa xmm7,xmm6
|
|
punpcklwd xmm7,xmm6
|
|
pshufd xmm6,xmm7,0
|
|
movdqa xmm7,xmm1
|
|
punpckhbw xmm1,xmm0
|
|
punpcklbw xmm7,xmm0
|
|
movdqa [esp+40h],xmm1
|
|
movdqa [esp+60h],xmm7
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm7,xmm0
|
|
movdqa [esp+10h],xmm7
|
|
movdqa xmm7,xmm3
|
|
punpcklbw xmm7,xmm0
|
|
punpckhbw xmm3,xmm0
|
|
movdqa [esp+50h],xmm7
|
|
movdqa xmm7,xmm4
|
|
punpckhbw xmm4,xmm0
|
|
punpckhbw xmm2,xmm0
|
|
punpcklbw xmm7,xmm0
|
|
movdqa [esp+30h],xmm3
|
|
movdqa xmm3,[esp+10h]
|
|
movdqa xmm1,xmm3
|
|
psubw xmm1,[esp+50h]
|
|
pabsw xmm1,xmm1
|
|
movdqa [esp+20h],xmm4
|
|
movdqa xmm0,xmm5
|
|
pcmpgtw xmm0,xmm1
|
|
movdqa xmm1,[esp+60h]
|
|
psubw xmm1,xmm3
|
|
pabsw xmm1,xmm1
|
|
movdqa xmm4,xmm6
|
|
pcmpgtw xmm4,xmm1
|
|
pand xmm0,xmm4
|
|
movdqa xmm1,xmm7
|
|
psubw xmm1,[esp+50h]
|
|
pabsw xmm1,xmm1
|
|
movdqa xmm4,xmm6
|
|
pcmpgtw xmm4,xmm1
|
|
movdqa xmm1,xmm2
|
|
psubw xmm1,[esp+30h]
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm5,xmm1
|
|
movdqa xmm1,[esp+40h]
|
|
pand xmm0,xmm4
|
|
psubw xmm1,xmm2
|
|
pabsw xmm1,xmm1
|
|
movdqa xmm4,xmm6
|
|
pcmpgtw xmm4,xmm1
|
|
movdqa xmm1,[esp+20h]
|
|
psubw xmm1,[esp+30h]
|
|
pand xmm5,xmm4
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm6,xmm1
|
|
pand xmm5,xmm6
|
|
mov edx,2
|
|
movsx edx,dx
|
|
movd xmm1,edx
|
|
movdqa xmm4,xmm1
|
|
punpcklwd xmm4,xmm1
|
|
pshufd xmm1,xmm4,0
|
|
movdqa xmm4,[esp+60h]
|
|
movdqa xmm6,xmm4
|
|
paddw xmm6,xmm4
|
|
paddw xmm6,xmm3
|
|
paddw xmm6,xmm7
|
|
movdqa [esp+10h],xmm1
|
|
paddw xmm6,[esp+10h]
|
|
psraw xmm6,2
|
|
movdqa xmm4,xmm0
|
|
pandn xmm4,xmm3
|
|
movdqa xmm3,[esp+40h]
|
|
movdqa xmm1,xmm0
|
|
pand xmm1,xmm6
|
|
por xmm1,xmm4
|
|
movdqa xmm6,xmm3
|
|
paddw xmm6,xmm3
|
|
movdqa xmm3,[esp+10h]
|
|
paddw xmm6,xmm2
|
|
paddw xmm6,[esp+20h]
|
|
paddw xmm6,xmm3
|
|
psraw xmm6,2
|
|
movdqa xmm4,xmm5
|
|
pand xmm4,xmm6
|
|
movdqa xmm6,xmm5
|
|
pandn xmm6,xmm2
|
|
por xmm4,xmm6
|
|
packuswb xmm1,xmm4
|
|
movdqa xmm4,[esp+50h]
|
|
movdqa xmm6,xmm7
|
|
paddw xmm6,xmm7
|
|
paddw xmm6,xmm4
|
|
paddw xmm6,[esp+60h]
|
|
paddw xmm6,xmm3
|
|
psraw xmm6,2
|
|
movdqa xmm2,xmm0
|
|
pand xmm2,xmm6
|
|
pandn xmm0,xmm4
|
|
por xmm2,xmm0
|
|
movdqa xmm0,[esp+20h]
|
|
movdqa xmm6,xmm0
|
|
paddw xmm6,xmm0
|
|
movdqa xmm0,[esp+30h]
|
|
paddw xmm6,xmm0
|
|
paddw xmm6,[esp+40h]
|
|
movdqa xmm4,xmm5
|
|
paddw xmm6,xmm3
|
|
movq [esi],xmm1
|
|
psraw xmm6,2
|
|
pand xmm4,xmm6
|
|
pandn xmm5,xmm0
|
|
por xmm4,xmm5
|
|
packuswb xmm2,xmm4
|
|
movq [eax],xmm2
|
|
psrldq xmm1,8
|
|
movq [edi],xmm1
|
|
pop edi
|
|
psrldq xmm2,8
|
|
movq [ecx],xmm2
|
|
pop esi
|
|
mov esp,ebp
|
|
pop ebp
|
|
ret
|
|
|
|
;******************************************************************************
; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4V_sse2

DeblockChromaLt4V_sse2:
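; Normal (bS < 4) chroma filter for Cb and Cr together: the four tc values
; from pTC are broadcast per 4-sample group, the delta
;     d = clip3( -tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 )
; is built with psllw/psraw and pmaxsw/pminsw, and d is added to p0 and
; subtracted from q0 wherever the alpha/beta and tc masks pass.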
|
|
push ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF0h
|
|
sub esp,0E4h
|
|
push ebx
|
|
push esi
|
|
mov esi, [ebp+1Ch] ; pTC
|
|
movsx ebx, byte [esi+2]
|
|
push edi
|
|
movsx di,byte [esi+3]
|
|
mov word [esp+0Ch],bx
|
|
movsx bx,byte [esi+1]
|
|
movsx esi,byte [esi]
|
|
mov word [esp+0Eh],si
|
|
movzx esi,di
|
|
movd xmm1,esi
|
|
movzx esi,di
|
|
movd xmm2,esi
|
|
mov si,word [esp+0Ch]
|
|
mov edx, [ebp + 10h]
|
|
mov eax, [ebp + 08h]
|
|
movzx edi,si
|
|
movzx esi,si
|
|
mov ecx, [ebp + 0Ch]
|
|
movd xmm4,esi
|
|
movzx esi,bx
|
|
movd xmm5,esi
|
|
movd xmm3,edi
|
|
movzx esi,bx
|
|
movd xmm6,esi
|
|
mov si,word [esp+0Eh]
|
|
movzx edi,si
|
|
movzx esi,si
|
|
punpcklwd xmm6,xmm2
|
|
pxor xmm0,xmm0
|
|
movdqa [esp+40h],xmm0
|
|
movd xmm7,edi
|
|
movd xmm0,esi
|
|
lea esi,[edx+edx]
|
|
mov edi,eax
|
|
sub edi,esi
|
|
punpcklwd xmm5,xmm1
|
|
movdqa xmm1,[esp+40h]
|
|
punpcklwd xmm0,xmm4
|
|
movq xmm4,[edx+ecx]
|
|
punpcklwd xmm7,xmm3
|
|
movq xmm3,[eax]
|
|
punpcklwd xmm0,xmm6
|
|
movq xmm6,[edi]
|
|
punpcklwd xmm7,xmm5
|
|
punpcklwd xmm0,xmm7
|
|
mov edi,ecx
|
|
sub edi,esi
|
|
movdqa xmm2,xmm1
|
|
psubw xmm2,xmm0
|
|
movdqa [esp+60h],xmm2
|
|
movq xmm2, [edi]
|
|
punpcklqdq xmm6,xmm2
|
|
mov esi,eax
|
|
sub esi,edx
|
|
movq xmm7,[esi]
|
|
mov edi,ecx
|
|
sub edi,edx
|
|
movq xmm2,[edi]
|
|
punpcklqdq xmm7,xmm2
|
|
movq xmm2,[ecx]
|
|
punpcklqdq xmm3,xmm2
|
|
movq xmm2,[edx+eax]
|
|
movsx edx,word [ebp + 14h]
|
|
punpcklqdq xmm2,xmm4
|
|
movdqa [esp+0E0h],xmm2
|
|
movd xmm2,edx
|
|
movsx edx,word [ebp + 18h]
|
|
movdqa xmm4,xmm2
|
|
punpcklwd xmm4,xmm2
|
|
movd xmm2,edx
|
|
movdqa xmm5,xmm2
|
|
punpcklwd xmm5,xmm2
|
|
pshufd xmm2,xmm5,0
|
|
movdqa [esp+50h],xmm2
|
|
movdqa xmm2,xmm6
|
|
punpcklbw xmm2,xmm1
|
|
movdqa [esp+0D0h],xmm3
|
|
pshufd xmm4,xmm4,0
|
|
movdqa [esp+30h],xmm2
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+80h],xmm6
|
|
movdqa xmm6,[esp+0D0h]
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+70h],xmm6
|
|
movdqa xmm6, [esp+0E0h]
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+90h],xmm6
|
|
movdqa xmm5, [esp+0E0h]
|
|
movdqa xmm2,xmm7
|
|
punpckhbw xmm7,xmm1
|
|
punpcklbw xmm5,xmm1
|
|
movdqa [esp+0A0h],xmm7
|
|
punpcklbw xmm3,xmm1
|
|
mov edx,4
|
|
punpcklbw xmm2,xmm1
|
|
movsx edx,dx
|
|
movd xmm6,edx
|
|
movdqa xmm7,xmm6
|
|
punpcklwd xmm7,xmm6
|
|
pshufd xmm6,xmm7,0
|
|
movdqa xmm7,[esp+30h]
|
|
movdqa [esp+20h],xmm6
|
|
psubw xmm7,xmm5
|
|
movdqa xmm6,xmm0
|
|
pcmpgtw xmm6,xmm1
|
|
movdqa xmm1,[esp+60h]
|
|
movdqa [esp+40h],xmm6
|
|
movdqa xmm6,xmm3
|
|
psubw xmm6,xmm2
|
|
psllw xmm6,2
|
|
paddw xmm6,xmm7
|
|
paddw xmm6, [esp+20h]
|
|
movdqa xmm7, [esp+50h]
|
|
psraw xmm6,3
|
|
pmaxsw xmm1,xmm6
|
|
movdqa [esp+10h],xmm0
|
|
movdqa xmm6, [esp+10h]
|
|
pminsw xmm6,xmm1
|
|
movdqa [esp+10h],xmm6
|
|
movdqa xmm1,xmm2
|
|
psubw xmm1,xmm3
|
|
pabsw xmm1,xmm1
|
|
movdqa xmm6,xmm4
|
|
pcmpgtw xmm6,xmm1
|
|
movdqa xmm1, [esp+30h]
|
|
psubw xmm1,xmm2
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm7,xmm1
|
|
movdqa xmm1,[esp+50h]
|
|
pand xmm6,xmm7
|
|
movdqa xmm7,[esp+50h]
|
|
psubw xmm5,xmm3
|
|
pabsw xmm5,xmm5
|
|
pcmpgtw xmm1,xmm5
|
|
movdqa xmm5,[esp+80h]
|
|
psubw xmm5,[esp+90h]
|
|
pand xmm6,xmm1
|
|
pand xmm6,[esp+40h]
|
|
movdqa xmm1,[esp+10h]
|
|
pand xmm1,xmm6
|
|
movdqa xmm6,[esp+70h]
|
|
movdqa [esp+30h],xmm1
|
|
movdqa xmm1,[esp+0A0h]
|
|
psubw xmm6,xmm1
|
|
psllw xmm6,2
|
|
paddw xmm6,xmm5
|
|
paddw xmm6,[esp+20h]
|
|
movdqa xmm5,[esp+60h]
|
|
psraw xmm6,3
|
|
pmaxsw xmm5,xmm6
|
|
pminsw xmm0,xmm5
|
|
movdqa xmm5,[esp+70h]
|
|
movdqa xmm6,xmm1
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm4,xmm6
|
|
movdqa xmm6,[esp+80h]
|
|
psubw xmm6,xmm1
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6,[esp+90h]
|
|
pand xmm4,xmm7
|
|
movdqa xmm7,[esp+50h]
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm7,xmm6
|
|
pand xmm4,xmm7
|
|
pand xmm4,[esp+40h]
|
|
pand xmm0,xmm4
|
|
movdqa xmm4,[esp+30h]
|
|
paddw xmm2,xmm4
|
|
paddw xmm1,xmm0
|
|
packuswb xmm2,xmm1
|
|
movq [esi],xmm2
|
|
psubw xmm3,xmm4
|
|
psubw xmm5,xmm0
|
|
packuswb xmm3,xmm5
|
|
movq [eax],xmm3
|
|
psrldq xmm2,8
|
|
movq [edi],xmm2
|
|
pop edi
|
|
pop esi
|
|
psrldq xmm3,8
|
|
movq [ecx],xmm3
|
|
pop ebx
|
|
mov esp,ebp
|
|
pop ebp
|
|
ret
|
|
|
|
;***************************************************************************
; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta)
;***************************************************************************

WELS_EXTERN DeblockChromaEq4H_sse2

ALIGN 16

DeblockChromaEq4H_sse2:
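; Same strong chroma filter as DeblockChromaEq4V_sse2, but for the other edge
; orientation: pPixCb/pPixCr are stepped back by 2 samples, 4-byte groups are
; gathered row by row with movd, transposed in registers into p1/p0/q0/q1
; vectors, filtered, then transposed back and scattered out with movd stores.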
|
|
push ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF0h
|
|
sub esp,0C8h
|
|
mov ecx,dword [ebp+8]
|
|
mov edx,dword [ebp+0Ch]
|
|
mov eax,dword [ebp+10h]
|
|
sub ecx,2
|
|
sub edx,2
|
|
push esi
|
|
lea esi,[eax+eax*2]
|
|
mov dword [esp+18h],ecx
|
|
mov dword [esp+4],edx
|
|
lea ecx,[ecx+eax*4]
|
|
lea edx,[edx+eax*4]
|
|
lea eax,[esp+7Ch]
|
|
push edi
|
|
mov dword [esp+14h],esi
|
|
mov dword [esp+18h],ecx
|
|
mov dword [esp+0Ch],edx
|
|
mov dword [esp+10h],eax
|
|
mov esi,dword [esp+1Ch]
|
|
mov ecx,dword [ebp+10h]
|
|
mov edx,dword [esp+14h]
|
|
movd xmm0,dword [esi]
|
|
movd xmm1,dword [esi+ecx]
|
|
movd xmm2,dword [esi+ecx*2]
|
|
movd xmm3,dword [esi+edx]
|
|
mov esi,dword [esp+8]
|
|
movd xmm4,dword [esi]
|
|
movd xmm5,dword [esi+ecx]
|
|
movd xmm6,dword [esi+ecx*2]
|
|
movd xmm7,dword [esi+edx]
|
|
punpckldq xmm0,xmm4
|
|
punpckldq xmm1,xmm5
|
|
punpckldq xmm2,xmm6
|
|
punpckldq xmm3,xmm7
|
|
mov esi,dword [esp+18h]
|
|
mov edi,dword [esp+0Ch]
|
|
movd xmm4,dword [esi]
|
|
movd xmm5,dword [edi]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm0,xmm4
|
|
movd xmm4,dword [esi+ecx]
|
|
movd xmm5,dword [edi+ecx]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm1,xmm4
|
|
movd xmm4,dword [esi+ecx*2]
|
|
movd xmm5,dword [edi+ecx*2]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm2,xmm4
|
|
movd xmm4,dword [esi+edx]
|
|
movd xmm5,dword [edi+edx]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm3,xmm4
|
|
movdqa xmm6,xmm0
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm2,xmm3
|
|
punpckhbw xmm7,xmm3
|
|
movdqa xmm4,xmm0
|
|
movdqa xmm5,xmm6
|
|
punpcklwd xmm0,xmm2
|
|
punpckhwd xmm4,xmm2
|
|
punpcklwd xmm6,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,xmm4
|
|
punpckldq xmm0,xmm6
|
|
punpckhdq xmm1,xmm6
|
|
punpckldq xmm4,xmm5
|
|
punpckhdq xmm2,xmm5
|
|
movdqa xmm5,xmm0
|
|
movdqa xmm6,xmm1
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm5,xmm4
|
|
punpcklqdq xmm1,xmm2
|
|
punpckhqdq xmm6,xmm2
|
|
mov edi,dword [esp+10h]
|
|
movdqa [edi],xmm0
|
|
movdqa [edi+10h],xmm5
|
|
movdqa [edi+20h],xmm1
|
|
movdqa [edi+30h],xmm6
|
|
movsx ecx,word [ebp+14h]
|
|
movsx edx,word [ebp+18h]
|
|
movdqa xmm6,[esp+80h]
|
|
movdqa xmm4,[esp+90h]
|
|
movdqa xmm5,[esp+0A0h]
|
|
movdqa xmm7,[esp+0B0h]
|
|
pxor xmm0,xmm0
|
|
movd xmm1,ecx
|
|
movdqa xmm2,xmm1
|
|
punpcklwd xmm2,xmm1
|
|
pshufd xmm1,xmm2,0
|
|
movd xmm2,edx
|
|
movdqa xmm3,xmm2
|
|
punpcklwd xmm3,xmm2
|
|
pshufd xmm2,xmm3,0
|
|
movdqa xmm3,xmm6
|
|
punpckhbw xmm6,xmm0
|
|
movdqa [esp+60h],xmm6
|
|
movdqa xmm6,[esp+90h]
|
|
punpckhbw xmm6,xmm0
|
|
movdqa [esp+30h],xmm6
|
|
movdqa xmm6,[esp+0A0h]
|
|
punpckhbw xmm6,xmm0
|
|
movdqa [esp+40h],xmm6
|
|
movdqa xmm6,[esp+0B0h]
|
|
punpckhbw xmm6,xmm0
|
|
movdqa [esp+70h],xmm6
|
|
punpcklbw xmm7,xmm0
|
|
punpcklbw xmm4,xmm0
|
|
punpcklbw xmm5,xmm0
|
|
punpcklbw xmm3,xmm0
|
|
movdqa [esp+50h],xmm7
|
|
movdqa xmm6,xmm4
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
movdqa xmm0,xmm1
|
|
pcmpgtw xmm0,xmm6
|
|
movdqa xmm6,xmm3
|
|
psubw xmm6,xmm4
|
|
pabsw xmm6,xmm6
|
|
movdqa xmm7,xmm2
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6,[esp+50h]
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pand xmm0,xmm7
|
|
movdqa xmm7,xmm2
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6,[esp+30h]
|
|
psubw xmm6,[esp+40h]
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm1,xmm6
|
|
movdqa xmm6,[esp+60h]
|
|
psubw xmm6,[esp+30h]
|
|
pabsw xmm6,xmm6
|
|
pand xmm0,xmm7
|
|
movdqa xmm7,xmm2
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6,[esp+70h]
|
|
psubw xmm6,[esp+40h]
|
|
pabsw xmm6,xmm6
|
|
pand xmm1,xmm7
|
|
pcmpgtw xmm2,xmm6
|
|
pand xmm1,xmm2
|
|
mov eax,2
|
|
movsx ecx,ax
|
|
movd xmm2,ecx
|
|
movdqa xmm6,xmm2
|
|
punpcklwd xmm6,xmm2
|
|
pshufd xmm2,xmm6,0
|
|
movdqa [esp+20h],xmm2
|
|
movdqa xmm2,xmm3
|
|
paddw xmm2,xmm3
|
|
paddw xmm2,xmm4
|
|
paddw xmm2,[esp+50h]
|
|
paddw xmm2,[esp+20h]
|
|
psraw xmm2,2
|
|
movdqa xmm6,xmm0
|
|
pand xmm6,xmm2
|
|
movdqa xmm2,xmm0
|
|
pandn xmm2,xmm4
|
|
por xmm6,xmm2
|
|
movdqa xmm2,[esp+60h]
|
|
movdqa xmm7,xmm2
|
|
paddw xmm7,xmm2
|
|
paddw xmm7,[esp+30h]
|
|
paddw xmm7,[esp+70h]
|
|
paddw xmm7,[esp+20h]
|
|
movdqa xmm4,xmm1
|
|
movdqa xmm2,xmm1
|
|
pandn xmm2,[esp+30h]
|
|
psraw xmm7,2
|
|
pand xmm4,xmm7
|
|
por xmm4,xmm2
|
|
movdqa xmm2,[esp+50h]
|
|
packuswb xmm6,xmm4
|
|
movdqa [esp+90h],xmm6
|
|
movdqa xmm6,xmm2
|
|
paddw xmm6,xmm2
|
|
movdqa xmm2,[esp+20h]
|
|
paddw xmm6,xmm5
|
|
paddw xmm6,xmm3
|
|
movdqa xmm4,xmm0
|
|
pandn xmm0,xmm5
|
|
paddw xmm6,xmm2
|
|
psraw xmm6,2
|
|
pand xmm4,xmm6
|
|
por xmm4,xmm0
|
|
movdqa xmm0,[esp+70h]
|
|
movdqa xmm5,xmm0
|
|
paddw xmm5,xmm0
|
|
movdqa xmm0,[esp+40h]
|
|
paddw xmm5,xmm0
|
|
paddw xmm5,[esp+60h]
|
|
movdqa xmm3,xmm1
|
|
paddw xmm5,xmm2
|
|
psraw xmm5,2
|
|
pand xmm3,xmm5
|
|
pandn xmm1,xmm0
|
|
por xmm3,xmm1
|
|
packuswb xmm4,xmm3
|
|
movdqa [esp+0A0h],xmm4
|
|
mov esi,dword [esp+10h]
|
|
movdqa xmm0,[esi]
|
|
movdqa xmm1,[esi+10h]
|
|
movdqa xmm2,[esi+20h]
|
|
movdqa xmm3,[esi+30h]
|
|
movdqa xmm6,xmm0
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm2,xmm3
|
|
punpckhbw xmm7,xmm3
|
|
movdqa xmm4,xmm0
|
|
movdqa xmm5,xmm6
|
|
punpcklwd xmm0,xmm2
|
|
punpckhwd xmm4,xmm2
|
|
punpcklwd xmm6,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,xmm4
|
|
punpckldq xmm0,xmm6
|
|
punpckhdq xmm1,xmm6
|
|
punpckldq xmm4,xmm5
|
|
punpckhdq xmm2,xmm5
|
|
movdqa xmm5,xmm0
|
|
movdqa xmm6,xmm1
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm5,xmm4
|
|
punpcklqdq xmm1,xmm2
|
|
punpckhqdq xmm6,xmm2
|
|
mov esi,dword [esp+1Ch]
|
|
mov ecx,dword [ebp+10h]
|
|
mov edx,dword [esp+14h]
|
|
mov edi,dword [esp+8]
|
|
movd dword [esi],xmm0
|
|
movd dword [esi+ecx],xmm5
|
|
movd dword [esi+ecx*2],xmm1
|
|
movd dword [esi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
mov esi,dword [esp+18h]
|
|
movd dword [edi],xmm0
|
|
movd dword [edi+ecx],xmm5
|
|
movd dword [edi+ecx*2],xmm1
|
|
movd dword [edi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
movd dword [esi],xmm0
|
|
movd dword [esi+ecx],xmm5
|
|
movd dword [esi+ecx*2],xmm1
|
|
movd dword [esi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
mov edi,dword [esp+0Ch]
|
|
movd dword [edi],xmm0
|
|
movd dword [edi+ecx],xmm5
|
|
movd dword [edi+ecx*2],xmm1
|
|
movd dword [edi+edx],xmm6
|
|
pop edi
|
|
pop esi
|
|
mov esp,ebp
|
|
pop ebp
|
|
ret
|
|
|
|
;*******************************************************************************
; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4H_sse2

ALIGN 16

DeblockChromaLt4H_sse2:
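; bS < 4 chroma filter for the transposed-edge case: the same movd gather and
; in-register transpose as DeblockChromaEq4H_sse2, followed by the tc-clipped
; delta used in DeblockChromaLt4V_sse2, then the inverse transpose and the
; per-dword scatter back into the two chroma planes.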
|
|
push ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF0h
|
|
sub esp,108h
|
|
mov ecx,dword [ebp+8]
|
|
mov edx,dword [ebp+0Ch]
|
|
mov eax,dword [ebp+10h]
|
|
sub ecx,2
|
|
sub edx,2
|
|
push esi
|
|
lea esi,[eax+eax*2]
|
|
mov dword [esp+10h],ecx
|
|
mov dword [esp+4],edx
|
|
lea ecx,[ecx+eax*4]
|
|
lea edx,[edx+eax*4]
|
|
lea eax,[esp+6Ch]
|
|
push edi
|
|
mov dword [esp+0Ch],esi
|
|
mov dword [esp+18h],ecx
|
|
mov dword [esp+10h],edx
|
|
mov dword [esp+1Ch],eax
|
|
mov esi,dword [esp+14h]
|
|
mov ecx,dword [ebp+10h]
|
|
mov edx,dword [esp+0Ch]
|
|
movd xmm0,dword [esi]
|
|
movd xmm1,dword [esi+ecx]
|
|
movd xmm2,dword [esi+ecx*2]
|
|
movd xmm3,dword [esi+edx]
|
|
mov esi,dword [esp+8]
|
|
movd xmm4,dword [esi]
|
|
movd xmm5,dword [esi+ecx]
|
|
movd xmm6,dword [esi+ecx*2]
|
|
movd xmm7,dword [esi+edx]
|
|
punpckldq xmm0,xmm4
|
|
punpckldq xmm1,xmm5
|
|
punpckldq xmm2,xmm6
|
|
punpckldq xmm3,xmm7
|
|
mov esi,dword [esp+18h]
|
|
mov edi,dword [esp+10h]
|
|
movd xmm4,dword [esi]
|
|
movd xmm5,dword [edi]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm0,xmm4
|
|
movd xmm4,dword [esi+ecx]
|
|
movd xmm5,dword [edi+ecx]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm1,xmm4
|
|
movd xmm4,dword [esi+ecx*2]
|
|
movd xmm5,dword [edi+ecx*2]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm2,xmm4
|
|
movd xmm4,dword [esi+edx]
|
|
movd xmm5,dword [edi+edx]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm3,xmm4
|
|
movdqa xmm6,xmm0
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm2,xmm3
|
|
punpckhbw xmm7,xmm3
|
|
movdqa xmm4,xmm0
|
|
movdqa xmm5,xmm6
|
|
punpcklwd xmm0,xmm2
|
|
punpckhwd xmm4,xmm2
|
|
punpcklwd xmm6,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,xmm4
|
|
punpckldq xmm0,xmm6
|
|
punpckhdq xmm1,xmm6
|
|
punpckldq xmm4,xmm5
|
|
punpckhdq xmm2,xmm5
|
|
movdqa xmm5,xmm0
|
|
movdqa xmm6,xmm1
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm5,xmm4
|
|
punpcklqdq xmm1,xmm2
|
|
punpckhqdq xmm6,xmm2
|
|
mov edi,dword [esp+1Ch]
|
|
movdqa [edi],xmm0
|
|
movdqa [edi+10h],xmm5
|
|
movdqa [edi+20h],xmm1
|
|
movdqa [edi+30h],xmm6
|
|
mov eax,dword [ebp+1Ch]
|
|
movsx cx,byte [eax+3]
|
|
movsx dx,byte [eax+2]
|
|
movsx si,byte [eax+1]
|
|
movsx ax,byte [eax]
|
|
movzx edi,cx
|
|
movzx ecx,cx
|
|
movd xmm2,ecx
|
|
movzx ecx,dx
|
|
movzx edx,dx
|
|
movd xmm3,ecx
|
|
movd xmm4,edx
|
|
movzx ecx,si
|
|
movzx edx,si
|
|
movd xmm5,ecx
|
|
pxor xmm0,xmm0
|
|
movd xmm6,edx
|
|
movzx ecx,ax
|
|
movdqa [esp+60h],xmm0
|
|
movzx edx,ax
|
|
movsx eax,word [ebp+14h]
|
|
punpcklwd xmm6,xmm2
|
|
movd xmm1,edi
|
|
movd xmm7,ecx
|
|
movsx ecx,word [ebp+18h]
|
|
movd xmm0,edx
|
|
punpcklwd xmm7,xmm3
|
|
punpcklwd xmm5,xmm1
|
|
movdqa xmm1,[esp+60h]
|
|
punpcklwd xmm7,xmm5
|
|
movdqa xmm5,[esp+0A0h]
|
|
punpcklwd xmm0,xmm4
|
|
punpcklwd xmm0,xmm6
|
|
movdqa xmm6, [esp+70h]
|
|
punpcklwd xmm0,xmm7
|
|
movdqa xmm7,[esp+80h]
|
|
movdqa xmm2,xmm1
|
|
psubw xmm2,xmm0
|
|
movdqa [esp+0D0h],xmm2
|
|
movd xmm2,eax
|
|
movdqa xmm3,xmm2
|
|
punpcklwd xmm3,xmm2
|
|
pshufd xmm4,xmm3,0
|
|
movd xmm2,ecx
|
|
movdqa xmm3,xmm2
|
|
punpcklwd xmm3,xmm2
|
|
pshufd xmm2,xmm3,0
|
|
movdqa xmm3, [esp+90h]
|
|
movdqa [esp+50h],xmm2
|
|
movdqa xmm2,xmm6
|
|
punpcklbw xmm2,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+40h],xmm2
|
|
movdqa [esp+0B0h],xmm6
|
|
movdqa xmm6,[esp+90h]
|
|
movdqa xmm2,xmm7
|
|
punpckhbw xmm7,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
punpcklbw xmm2,xmm1
|
|
punpcklbw xmm3,xmm1
|
|
punpcklbw xmm5,xmm1
|
|
movdqa [esp+0F0h],xmm7
|
|
movdqa [esp+0C0h],xmm6
|
|
movdqa xmm6, [esp+0A0h]
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+0E0h],xmm6
|
|
mov edx,4
|
|
movsx eax,dx
|
|
movd xmm6,eax
|
|
movdqa xmm7,xmm6
|
|
punpcklwd xmm7,xmm6
|
|
pshufd xmm6,xmm7,0
|
|
movdqa [esp+30h],xmm6
|
|
movdqa xmm7, [esp+40h]
|
|
psubw xmm7,xmm5
|
|
movdqa xmm6,xmm0
|
|
pcmpgtw xmm6,xmm1
|
|
movdqa [esp+60h],xmm6
|
|
movdqa xmm1, [esp+0D0h]
|
|
movdqa xmm6,xmm3
|
|
psubw xmm6,xmm2
|
|
psllw xmm6,2
|
|
paddw xmm6,xmm7
|
|
paddw xmm6,[esp+30h]
|
|
psraw xmm6,3
|
|
pmaxsw xmm1,xmm6
|
|
movdqa xmm7,[esp+50h]
|
|
movdqa [esp+20h],xmm0
|
|
movdqa xmm6, [esp+20h]
|
|
pminsw xmm6,xmm1
|
|
movdqa [esp+20h],xmm6
|
|
movdqa xmm6,xmm4
|
|
movdqa xmm1,xmm2
|
|
psubw xmm1,xmm3
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm6,xmm1
|
|
movdqa xmm1, [esp+40h]
|
|
psubw xmm1,xmm2
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm7,xmm1
|
|
movdqa xmm1, [esp+50h]
|
|
pand xmm6,xmm7
|
|
movdqa xmm7, [esp+50h]
|
|
psubw xmm5,xmm3
|
|
pabsw xmm5,xmm5
|
|
pcmpgtw xmm1,xmm5
|
|
movdqa xmm5, [esp+0B0h]
|
|
psubw xmm5,[esp+0E0h]
|
|
pand xmm6,xmm1
|
|
pand xmm6, [esp+60h]
|
|
movdqa xmm1, [esp+20h]
|
|
pand xmm1,xmm6
|
|
movdqa xmm6, [esp+0C0h]
|
|
movdqa [esp+40h],xmm1
|
|
movdqa xmm1, [esp+0F0h]
|
|
psubw xmm6,xmm1
|
|
psllw xmm6,2
|
|
paddw xmm6,xmm5
|
|
paddw xmm6, [esp+30h]
|
|
movdqa xmm5, [esp+0D0h]
|
|
psraw xmm6,3
|
|
pmaxsw xmm5,xmm6
|
|
pminsw xmm0,xmm5
|
|
movdqa xmm5,[esp+0C0h]
|
|
movdqa xmm6,xmm1
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm4,xmm6
|
|
movdqa xmm6,[esp+0B0h]
|
|
psubw xmm6,xmm1
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6, [esp+0E0h]
|
|
pand xmm4,xmm7
|
|
movdqa xmm7, [esp+50h]
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm7,xmm6
|
|
pand xmm4,xmm7
|
|
pand xmm4,[esp+60h]
|
|
pand xmm0,xmm4
|
|
movdqa xmm4, [esp+40h]
|
|
paddw xmm2,xmm4
|
|
paddw xmm1,xmm0
|
|
psubw xmm3,xmm4
|
|
psubw xmm5,xmm0
|
|
packuswb xmm2,xmm1
|
|
packuswb xmm3,xmm5
|
|
movdqa [esp+80h],xmm2
|
|
movdqa [esp+90h],xmm3
|
|
mov esi,dword [esp+1Ch]
|
|
movdqa xmm0, [esi]
|
|
movdqa xmm1, [esi+10h]
|
|
movdqa xmm2, [esi+20h]
|
|
movdqa xmm3, [esi+30h]
|
|
movdqa xmm6,xmm0
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm2,xmm3
|
|
punpckhbw xmm7,xmm3
|
|
movdqa xmm4,xmm0
|
|
movdqa xmm5,xmm6
|
|
punpcklwd xmm0,xmm2
|
|
punpckhwd xmm4,xmm2
|
|
punpcklwd xmm6,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,xmm4
|
|
punpckldq xmm0,xmm6
|
|
punpckhdq xmm1,xmm6
|
|
punpckldq xmm4,xmm5
|
|
punpckhdq xmm2,xmm5
|
|
movdqa xmm5,xmm0
|
|
movdqa xmm6,xmm1
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm5,xmm4
|
|
punpcklqdq xmm1,xmm2
|
|
punpckhqdq xmm6,xmm2
|
|
mov esi,dword [esp+14h]
|
|
mov ecx,dword [ebp+10h]
|
|
mov edx,dword [esp+0Ch]
|
|
mov edi,dword [esp+8]
|
|
movd dword [esi],xmm0
|
|
movd dword [esi+ecx],xmm5
|
|
movd dword [esi+ecx*2],xmm1
|
|
movd dword [esi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
mov esi,dword [esp+18h]
|
|
movd dword [edi],xmm0
|
|
movd dword [edi+ecx],xmm5
|
|
movd dword [edi+ecx*2],xmm1
|
|
movd dword [edi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
movd dword [esi],xmm0
|
|
movd dword [esi+ecx],xmm5
|
|
movd dword [esi+ecx*2],xmm1
|
|
movd dword [esi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
mov edi,dword [esp+10h]
|
|
movd dword [edi],xmm0
|
|
movd dword [edi+ecx],xmm5
|
|
movd dword [edi+ecx*2],xmm1
|
|
movd dword [edi+edx],xmm6
|
|
pop edi
|
|
pop esi
|
|
mov esp,ebp
|
|
pop ebp
|
|
ret
|
|
|
|
|
|
|
|
;*******************************************************************************
; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                           int32_t iBeta, int8_t * pTC)
;*******************************************************************************

WELS_EXTERN DeblockLumaLt4V_sse2

ALIGN 16

DeblockLumaLt4V_sse2:
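; Luma bS < 4 filter for one 16-sample edge: six rows (p2..q2) around pPix are
; widened to 16-bit, the four tc0 bytes from pTC are broadcast per 4-sample
; group, and the usual clipped delta is added to p0 and subtracted from q0.
; Where |p2 - p0| (resp. |q2 - q0|) is below iBeta, p1 (resp. q1) also gets a
; tc0-clipped correction, and the p0/q0 clipping threshold grows by one for
; each side condition that holds.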
|
|
push ebp
|
|
mov ebp, esp
|
|
and esp, -16 ; fffffff0H
|
|
sub esp, 420 ; 000001a4H
|
|
mov eax, dword [ebp+8]
|
|
mov ecx, dword [ebp+12]
|
|
|
|
pxor xmm0, xmm0
|
|
push ebx
|
|
mov edx, dword [ebp+24]
|
|
movdqa [esp+424-384], xmm0
|
|
push esi
|
|
|
|
lea esi, [ecx+ecx*2]
|
|
push edi
|
|
mov edi, eax
|
|
sub edi, esi
|
|
movdqa xmm0, [edi]
|
|
|
|
lea esi, [ecx+ecx]
|
|
movdqa [esp+432-208], xmm0
|
|
mov edi, eax
|
|
sub edi, esi
|
|
movdqa xmm0, [edi]
|
|
movdqa [esp+448-208], xmm0
|
|
|
|
mov ebx, eax
|
|
sub ebx, ecx
|
|
movdqa xmm0, [ebx]
|
|
movdqa [esp+464-208], xmm0
|
|
|
|
movdqa xmm0, [eax]
|
|
|
|
add ecx, eax
|
|
movdqa [esp+480-208], xmm0
|
|
movdqa xmm0, [ecx]
|
|
mov dword [esp+432-404], ecx
|
|
|
|
movsx ecx, word [ebp+16]
|
|
movdqa [esp+496-208], xmm0
|
|
movdqa xmm0, [esi+eax]
|
|
|
|
movsx si, byte [edx]
|
|
movdqa [esp+512-208], xmm0
|
|
movd xmm0, ecx
|
|
movsx ecx, word [ebp+20]
|
|
movdqa xmm1, xmm0
|
|
punpcklwd xmm1, xmm0
|
|
pshufd xmm0, xmm1, 0
|
|
movdqa [esp+432-112], xmm0
|
|
movd xmm0, ecx
|
|
movsx cx, byte [edx+1]
|
|
movdqa xmm1, xmm0
|
|
punpcklwd xmm1, xmm0
|
|
mov dword [esp+432-408], ebx
|
|
movzx ebx, cx
|
|
pshufd xmm0, xmm1, 0
|
|
movd xmm1, ebx
|
|
movzx ebx, cx
|
|
movd xmm2, ebx
|
|
movzx ebx, cx
|
|
movzx ecx, cx
|
|
movd xmm4, ecx
|
|
movzx ecx, si
|
|
movd xmm5, ecx
|
|
movzx ecx, si
|
|
movd xmm6, ecx
|
|
movzx ecx, si
|
|
movd xmm7, ecx
|
|
movzx ecx, si
|
|
movdqa [esp+432-336], xmm0
|
|
movd xmm0, ecx
|
|
|
|
movsx cx, byte [edx+3]
|
|
movsx dx, byte [edx+2]
|
|
movd xmm3, ebx
|
|
punpcklwd xmm0, xmm4
|
|
movzx esi, cx
|
|
punpcklwd xmm6, xmm2
|
|
punpcklwd xmm5, xmm1
|
|
punpcklwd xmm0, xmm6
|
|
punpcklwd xmm7, xmm3
|
|
punpcklwd xmm7, xmm5
|
|
punpcklwd xmm0, xmm7
|
|
movdqa [esp+432-400], xmm0
|
|
movd xmm0, esi
|
|
movzx esi, cx
|
|
movd xmm2, esi
|
|
movzx esi, cx
|
|
movzx ecx, cx
|
|
movd xmm4, ecx
|
|
movzx ecx, dx
|
|
movd xmm3, esi
|
|
movd xmm5, ecx
|
|
punpcklwd xmm5, xmm0
|
|
|
|
movdqa xmm0, [esp+432-384]
|
|
movzx ecx, dx
|
|
movd xmm6, ecx
|
|
movzx ecx, dx
|
|
movzx edx, dx
|
|
punpcklwd xmm6, xmm2
|
|
movd xmm7, ecx
|
|
movd xmm1, edx
|
|
|
|
movdqa xmm2, [esp+448-208]
|
|
punpcklbw xmm2, xmm0
|
|
|
|
mov ecx, 4
|
|
movsx edx, cx
|
|
punpcklwd xmm7, xmm3
|
|
punpcklwd xmm7, xmm5
|
|
movdqa xmm5, [esp+496-208]
|
|
movdqa xmm3, [esp+464-208]
|
|
punpcklbw xmm5, xmm0
|
|
movdqa [esp+432-240], xmm5
|
|
movdqa xmm5, [esp+512-208]
|
|
punpcklbw xmm5, xmm0
|
|
movdqa [esp+432-352], xmm5
|
|
punpcklwd xmm1, xmm4
|
|
movdqa xmm4, [esp+432-208]
|
|
punpcklwd xmm1, xmm6
|
|
movdqa xmm6, [esp+480-208]
|
|
punpcklwd xmm1, xmm7
|
|
punpcklbw xmm6, xmm0
|
|
punpcklbw xmm3, xmm0
|
|
punpcklbw xmm4, xmm0
|
|
movdqa xmm7, xmm3
|
|
psubw xmm7, xmm4
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+432-272], xmm4
|
|
movdqa xmm4, [esp+432-336]
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-288], xmm5
|
|
movdqa xmm7, xmm6
|
|
psubw xmm7, [esp+432-352]
|
|
pabsw xmm7, xmm7
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-256], xmm5
|
|
movdqa xmm5, xmm3
|
|
pavgw xmm5, xmm6
|
|
movdqa [esp+432-304], xmm5
|
|
movdqa xmm5, [esp+432-400]
|
|
psubw xmm5, [esp+432-288]
|
|
psubw xmm5, [esp+432-256]
|
|
movdqa [esp+432-224], xmm5
|
|
movdqa xmm5, xmm6
|
|
psubw xmm5, xmm3
|
|
movdqa [esp+432-32], xmm6
|
|
psubw xmm6, [esp+432-240]
|
|
movdqa xmm7, xmm5
|
|
movdqa [esp+432-384], xmm5
|
|
movdqa xmm5, [esp+432-112]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm5, xmm7
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm4
|
|
pcmpgtw xmm7, xmm6
|
|
|
|
pand xmm5, xmm7
|
|
movdqa xmm6, xmm3
|
|
psubw xmm6, xmm2
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm4
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-400]
|
|
pand xmm5, xmm7
|
|
movdqa xmm7, xmm6
|
|
pcmpeqw xmm6, xmm0
|
|
pcmpgtw xmm7, xmm0
|
|
por xmm7, xmm6
|
|
pand xmm5, xmm7
|
|
movdqa [esp+432-320], xmm5
|
|
movd xmm5, edx
|
|
movdqa xmm6, xmm5
|
|
punpcklwd xmm6, xmm5
|
|
pshufd xmm5, xmm6, 0
|
|
movdqa [esp+432-336], xmm5
|
|
movdqa xmm5, [esp+432-224]
|
|
movdqa [esp+432-368], xmm5
|
|
movdqa xmm6, xmm0
|
|
psubw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-384]
|
|
psllw xmm5, 2
|
|
movdqa xmm7, xmm2
|
|
psubw xmm7, [esp+432-240]
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+432-336]
|
|
movdqa xmm5, [esp+432-368]
|
|
psraw xmm7, 3
|
|
pmaxsw xmm6, xmm7
|
|
pminsw xmm5, xmm6
|
|
|
|
pand xmm5, [esp+432-320]
|
|
movdqa xmm6, [esp+432-400]
|
|
movdqa [esp+432-64], xmm5
|
|
movdqa [esp+432-384], xmm6
|
|
movdqa xmm5, xmm0
|
|
psubw xmm5, xmm6
|
|
movdqa [esp+432-368], xmm5
|
|
movdqa xmm6, xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
paddw xmm5, [esp+432-304]
|
|
movdqa xmm7, xmm2
|
|
paddw xmm7, xmm2
|
|
psubw xmm5, xmm7
|
|
psraw xmm5, 1
|
|
pmaxsw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-384]
|
|
pminsw xmm5, xmm6
|
|
|
|
pand xmm5, [esp+432-320]
|
|
pand xmm5, [esp+432-288]
|
|
movdqa xmm6, [esp+432-240]
|
|
movdqa [esp+432-96], xmm5
|
|
movdqa xmm5, [esp+432-352]
|
|
paddw xmm5, [esp+432-304]
|
|
movdqa xmm7, xmm6
|
|
paddw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-368]
|
|
psubw xmm5, xmm7
|
|
|
|
movdqa xmm7, [esp+496-208]
|
|
psraw xmm5, 1
|
|
pmaxsw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-400]
|
|
pminsw xmm5, xmm6
|
|
pand xmm5, [esp+432-320]
|
|
pand xmm5, [esp+432-256]
|
|
movdqa xmm6, [esp+448-208]
|
|
punpckhbw xmm7, xmm0
|
|
movdqa [esp+432-352], xmm7
|
|
|
|
movdqa xmm7, [esp+512-208]
|
|
punpckhbw xmm6, xmm0
|
|
movdqa [esp+432-48], xmm5
|
|
movdqa xmm5, [esp+432-208]
|
|
movdqa [esp+432-368], xmm6
|
|
movdqa xmm6, [esp+464-208]
|
|
punpckhbw xmm7, xmm0
|
|
punpckhbw xmm5, xmm0
|
|
movdqa [esp+432-384], xmm7
|
|
punpckhbw xmm6, xmm0
|
|
movdqa [esp+432-400], xmm6
|
|
|
|
movdqa xmm7, [esp+432-400]
|
|
movdqa xmm6, [esp+480-208]
|
|
psubw xmm7, xmm5
|
|
movdqa [esp+432-16], xmm5
|
|
pabsw xmm7, xmm7
|
|
punpckhbw xmm6, xmm0
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-288], xmm5
|
|
|
|
movdqa xmm7, xmm6
|
|
psubw xmm7, [esp+432-384]
|
|
pabsw xmm7, xmm7
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-256], xmm5
|
|
|
|
movdqa xmm5, [esp+432-400]
|
|
movdqa [esp+432-80], xmm6
|
|
pavgw xmm5, xmm6
|
|
movdqa [esp+432-304], xmm5
|
|
|
|
movdqa xmm5, xmm1
|
|
psubw xmm5, [esp+432-288]
|
|
psubw xmm5, [esp+432-256]
|
|
movdqa [esp+432-224], xmm5
|
|
movdqa xmm5, xmm6
|
|
psubw xmm5, [esp+432-400]
|
|
psubw xmm6, [esp+432-352]
|
|
movdqa [esp+432-272], xmm5
|
|
movdqa xmm7, xmm5
|
|
movdqa xmm5, [esp+432-112]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa xmm7, xmm4
|
|
pabsw xmm6, xmm6
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-368]
|
|
|
|
pand xmm5, xmm7
|
|
movdqa xmm7, [esp+432-400]
|
|
psubw xmm7, xmm6
|
|
psubw xmm6, [esp+432-352]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
pand xmm5, xmm4
|
|
|
|
paddw xmm2, [esp+432-96]
|
|
movdqa xmm4, xmm1
|
|
pcmpgtw xmm4, xmm0
|
|
movdqa xmm7, xmm1
|
|
pcmpeqw xmm7, xmm0
|
|
por xmm4, xmm7
|
|
pand xmm5, xmm4
|
|
movdqa xmm4, [esp+432-224]
|
|
movdqa [esp+432-320], xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
movdqa xmm7, xmm0
|
|
psubw xmm7, xmm4
|
|
psubw xmm0, xmm1
|
|
psllw xmm5, 2
|
|
paddw xmm6, xmm5
|
|
paddw xmm6, [esp+432-336]
|
|
movdqa xmm5, [esp+432-368]
|
|
movdqa [esp+432-336], xmm0
|
|
psraw xmm6, 3
|
|
pmaxsw xmm7, xmm6
|
|
pminsw xmm4, xmm7
|
|
pand xmm4, [esp+432-320]
|
|
movdqa xmm6, xmm0
|
|
movdqa xmm0, [esp+432-16]
|
|
paddw xmm0, [esp+432-304]
|
|
movdqa [esp+432-272], xmm4
|
|
movdqa xmm4, [esp+432-368]
|
|
paddw xmm4, xmm4
|
|
psubw xmm0, xmm4
|
|
|
|
movdqa xmm4, [esp+432-64]
|
|
psraw xmm0, 1
|
|
pmaxsw xmm6, xmm0
|
|
movdqa xmm0, [esp+432-400]
|
|
movdqa xmm7, xmm1
|
|
pminsw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-320]
|
|
pand xmm7, xmm6
|
|
pand xmm7, [esp+432-288]
|
|
paddw xmm5, xmm7
|
|
packuswb xmm2, xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
paddw xmm0, xmm5
|
|
paddw xmm3, xmm4
|
|
packuswb xmm3, xmm0
|
|
|
|
movdqa xmm0, [esp+432-32]
|
|
psubw xmm0, xmm4
|
|
movdqa xmm4, [esp+432-80]
|
|
psubw xmm4, xmm5
|
|
|
|
movdqa xmm5, [esp+432-240]
|
|
paddw xmm5, [esp+432-48]
|
|
packuswb xmm0, xmm4
|
|
movdqa xmm4, [esp+432-384]
|
|
paddw xmm4, [esp+432-304]
|
|
movdqa [esp+480-208], xmm0
|
|
movdqa xmm0, [esp+432-352]
|
|
movdqa xmm7, xmm0
|
|
paddw xmm0, xmm0
|
|
|
|
mov ecx, dword [esp+432-408]
|
|
|
|
mov edx, dword [esp+432-404]
|
|
psubw xmm4, xmm0
|
|
movdqa xmm0, [esp+432-336]
|
|
movdqa [edi], xmm2
|
|
psraw xmm4, 1
|
|
pmaxsw xmm0, xmm4
|
|
pminsw xmm1, xmm0
|
|
movdqa xmm0, [esp+480-208]
|
|
|
|
pop edi
|
|
pand xmm1, xmm6
|
|
pand xmm1, [esp+428-256]
|
|
movdqa [ecx], xmm3
|
|
paddw xmm7, xmm1
|
|
pop esi
|
|
packuswb xmm5, xmm7
|
|
movdqa [eax], xmm0
|
|
movdqa [edx], xmm5
|
|
pop ebx
|
|
mov esp, ebp
|
|
pop ebp
|
|
ret
|
|
|
|
|
|
;*******************************************************************************
; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                           int32_t iBeta)
;*******************************************************************************

WELS_EXTERN DeblockLumaEq4V_sse2

ALIGN 16

DeblockLumaEq4V_sse2:
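; Luma bS == 4 (strong) filter: eight rows p3..q3 around pPix are examined.
; Where the extra |p0 - q0| < (iAlpha >> 2) + 2 test holds together with the
; |p2 - p0| / |q2 - q0| < iBeta tests, the long strong-filter taps are applied
; to p0..p2 / q0..q2; otherwise p0 and q0 only get the short
; (2*p1 + p0 + q1 + 2) >> 2 smoothing.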
|
|
|
|
push ebp
|
|
mov ebp, esp
|
|
and esp, -16 ; fffffff0H
|
|
sub esp, 628 ; 00000274H
|
|
mov eax, dword [ebp+8]
|
|
mov ecx, dword [ebp+12]
|
|
push ebx
|
|
push esi
|
|
|
|
lea edx, [ecx*4]
|
|
pxor xmm0, xmm0
|
|
movdqa xmm2, xmm0
|
|
|
|
movdqa xmm0, [ecx+eax]
|
|
mov esi, eax
|
|
sub esi, edx
|
|
movdqa xmm3, [esi]
|
|
movdqa xmm5, [eax]
|
|
push edi
|
|
lea edi, [ecx+ecx]
|
|
lea ebx, [ecx+ecx*2]
|
|
mov dword [esp+640-600], edi
|
|
mov esi, eax
|
|
sub esi, edi
|
|
movdqa xmm1, [esi]
|
|
movdqa [esp+720-272], xmm0
|
|
mov edi, eax
|
|
sub edi, ecx
|
|
movdqa xmm4, [edi]
|
|
add ecx, eax
|
|
mov dword [esp+640-596], ecx
|
|
|
|
mov ecx, dword [esp+640-600]
|
|
movdqa xmm0, [ecx+eax]
|
|
movdqa [esp+736-272], xmm0
|
|
|
|
movdqa xmm0, [eax+ebx]
|
|
mov edx, eax
|
|
sub edx, ebx
|
|
|
|
movsx ebx, word [ebp+16]
|
|
movdqa xmm6, [edx]
|
|
add ecx, eax
|
|
movdqa [esp+752-272], xmm0
|
|
movd xmm0, ebx
|
|
|
|
movsx ebx, word [ebp+20]
|
|
movdqa xmm7, xmm0
|
|
punpcklwd xmm7, xmm0
|
|
pshufd xmm0, xmm7, 0
|
|
movdqa [esp+640-320], xmm0
|
|
movd xmm0, ebx
|
|
movdqa xmm7, xmm0
|
|
punpcklwd xmm7, xmm0
|
|
pshufd xmm0, xmm7, 0
|
|
|
|
movdqa xmm7, [esp+736-272]
|
|
punpcklbw xmm7, xmm2
|
|
movdqa [esp+640-416], xmm7
|
|
movdqa [esp+640-512], xmm0
|
|
movdqa xmm0, xmm1
|
|
movdqa [esp+672-272], xmm1
|
|
movdqa xmm1, xmm4
|
|
movdqa [esp+704-272], xmm5
|
|
punpcklbw xmm5, xmm2
|
|
punpcklbw xmm1, xmm2
|
|
|
|
movdqa xmm7, xmm5
|
|
psubw xmm7, xmm1
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+640-560], xmm7
|
|
punpcklbw xmm0, xmm2
|
|
movdqa [esp+688-272], xmm4
|
|
movdqa xmm4, [esp+720-272]
|
|
movdqa [esp+640-480], xmm0
|
|
|
|
movdqa xmm7, xmm1
|
|
psubw xmm7, xmm0
|
|
|
|
movdqa xmm0, [esp+640-512]
|
|
pabsw xmm7, xmm7
|
|
punpcklbw xmm4, xmm2
|
|
pcmpgtw xmm0, xmm7
|
|
movdqa [esp+640-384], xmm4
|
|
movdqa xmm7, xmm5
|
|
psubw xmm7, xmm4
|
|
movdqa xmm4, [esp+640-512]
|
|
movdqa [esp+656-272], xmm6
|
|
punpcklbw xmm6, xmm2
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+640-48], xmm2
|
|
movdqa [esp+640-368], xmm6
|
|
movdqa [esp+640-144], xmm1
|
|
movdqa [esp+640-400], xmm5
|
|
pcmpgtw xmm4, xmm7
|
|
pand xmm0, xmm4
|
|
movdqa xmm4, [esp+640-320]
|
|
pcmpgtw xmm4, [esp+640-560]
|
|
pand xmm0, xmm4
|
|
|
|
mov ebx, 2
|
|
movsx ebx, bx
|
|
movd xmm4, ebx
|
|
movdqa xmm7, xmm4
|
|
punpcklwd xmm7, xmm4
|
|
movdqa xmm4, [esp+640-320]
|
|
psraw xmm4, 2
|
|
pshufd xmm7, xmm7, 0
|
|
paddw xmm4, xmm7
|
|
movdqa [esp+640-576], xmm4
|
|
pcmpgtw xmm4, [esp+640-560]
|
|
movdqa [esp+640-560], xmm4
|
|
|
|
movdqa xmm4, [esp+640-512]
|
|
movdqa [esp+640-624], xmm7
|
|
movdqa xmm7, xmm1
|
|
psubw xmm7, xmm6
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
|
|
pand xmm4, [esp+640-560]
|
|
movdqa [esp+640-544], xmm4
|
|
movdqa xmm4, [esp+640-512]
|
|
movdqa xmm7, xmm5
|
|
psubw xmm7, [esp+640-416]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
|
|
pand xmm4, [esp+640-560]
|
|
movdqa [esp+640-560], xmm4
|
|
|
|
movdqa xmm4, [esp+640-544]
|
|
pandn xmm4, xmm6
|
|
movdqa [esp+640-16], xmm4
|
|
mov ebx, 4
|
|
movsx ebx, bx
|
|
movd xmm4, ebx
|
|
movdqa xmm7, xmm4
|
|
punpcklwd xmm7, xmm4
|
|
movdqa xmm4, xmm3
|
|
punpcklbw xmm4, xmm2
|
|
psllw xmm4, 1
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, [esp+640-480]
|
|
|
|
movdqa xmm6, [esp+640-560]
|
|
pshufd xmm7, xmm7, 0
|
|
paddw xmm4, xmm1
|
|
movdqa [esp+640-592], xmm7
|
|
paddw xmm4, xmm5
|
|
paddw xmm4, xmm7
|
|
movdqa xmm7, [esp+640-416]
|
|
pandn xmm6, xmm7
|
|
movdqa [esp+640-80], xmm6
|
|
movdqa xmm6, [esp+752-272]
|
|
punpcklbw xmm6, xmm2
|
|
psllw xmm6, 1
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-384]
|
|
|
|
movdqa xmm7, [esp+640-480]
|
|
paddw xmm6, xmm5
|
|
paddw xmm6, xmm1
|
|
paddw xmm6, [esp+640-592]
|
|
psraw xmm6, 3
|
|
pand xmm6, [esp+640-560]
|
|
movdqa [esp+640-112], xmm6
|
|
movdqa xmm6, [esp+640-544]
|
|
pandn xmm6, xmm7
|
|
movdqa [esp+640-336], xmm6
|
|
movdqa xmm6, [esp+640-544]
|
|
movdqa [esp+640-528], xmm6
|
|
movdqa xmm6, [esp+640-368]
|
|
paddw xmm6, xmm7
|
|
movdqa xmm7, xmm1
|
|
psraw xmm4, 3
|
|
pand xmm4, [esp+640-544]
|
|
paddw xmm7, xmm5
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-624]
|
|
movdqa xmm7, [esp+640-528]
|
|
|
|
paddw xmm5, xmm1
|
|
psraw xmm6, 2
|
|
pand xmm7, xmm6
|
|
|
|
movdqa xmm6, [esp+640-384]
|
|
movdqa [esp+640-64], xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
pandn xmm7, xmm6
|
|
movdqa [esp+640-304], xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
movdqa [esp+640-528], xmm7
|
|
movdqa xmm7, [esp+640-416]
|
|
paddw xmm7, xmm6
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+640-624]
|
|
movdqa xmm5, [esp+640-528]
|
|
psraw xmm7, 2
|
|
pand xmm5, xmm7
|
|
movdqa [esp+640-32], xmm5
|
|
|
|
movdqa xmm5, [esp+640-544]
|
|
movdqa [esp+640-528], xmm5
|
|
movdqa xmm5, [esp+640-480]
|
|
movdqa xmm7, xmm5
|
|
paddw xmm7, xmm5
|
|
movdqa xmm5, xmm1
|
|
paddw xmm5, xmm6
|
|
paddw xmm6, [esp+640-592]
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+640-624]
|
|
movdqa xmm5, [esp+640-528]
|
|
psraw xmm7, 2
|
|
pandn xmm5, xmm7
|
|
movdqa xmm7, [esp+640-480]
|
|
paddw xmm7, xmm1
|
|
paddw xmm7, [esp+640-400]
|
|
movdqa xmm1, [esp+640-544]
|
|
movdqa [esp+640-352], xmm5
|
|
movdqa xmm5, [esp+640-368]
|
|
psllw xmm7, 1
|
|
paddw xmm7, xmm6
|
|
paddw xmm5, xmm7
|
|
|
|
movdqa xmm7, [esp+640-400]
|
|
psraw xmm5, 3
|
|
pand xmm1, xmm5
|
|
movdqa xmm5, [esp+640-480]
|
|
movdqa [esp+640-96], xmm1
|
|
movdqa xmm1, [esp+640-560]
|
|
movdqa [esp+640-528], xmm1
|
|
movdqa xmm1, [esp+640-384]
|
|
movdqa xmm6, xmm1
|
|
paddw xmm6, xmm1
|
|
paddw xmm1, [esp+640-400]
|
|
paddw xmm1, [esp+640-144]
|
|
paddw xmm7, xmm5
|
|
paddw xmm5, [esp+640-592]
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-624]
|
|
movdqa xmm7, [esp+640-528]
|
|
psraw xmm6, 2
|
|
psllw xmm1, 1
|
|
paddw xmm1, xmm5
|
|
|
|
movdqa xmm5, [esp+656-272]
|
|
pandn xmm7, xmm6
|
|
movdqa xmm6, [esp+640-416]
|
|
paddw xmm6, xmm1
|
|
movdqa xmm1, [esp+640-560]
|
|
psraw xmm6, 3
|
|
pand xmm1, xmm6
|
|
|
|
movdqa xmm6, [esp+704-272]
|
|
movdqa [esp+640-128], xmm1
|
|
movdqa xmm1, [esp+672-272]
|
|
punpckhbw xmm1, xmm2
|
|
movdqa [esp+640-448], xmm1
|
|
movdqa xmm1, [esp+688-272]
|
|
punpckhbw xmm1, xmm2
|
|
punpckhbw xmm6, xmm2
|
|
movdqa [esp+640-288], xmm7
|
|
punpckhbw xmm5, xmm2
|
|
movdqa [esp+640-496], xmm1
|
|
movdqa [esp+640-432], xmm6
|
|
|
|
movdqa xmm7, [esp+720-272]
|
|
punpckhbw xmm7, xmm2
|
|
movdqa [esp+640-464], xmm7
|
|
|
|
movdqa xmm7, [esp+736-272]
|
|
punpckhbw xmm7, xmm2
|
|
movdqa [esp+640-528], xmm7
|
|
|
|
movdqa xmm7, xmm6
|
|
|
|
psubw xmm6, [esp+640-464]
|
|
psubw xmm7, xmm1
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+640-560], xmm7
|
|
por xmm4, [esp+640-16]
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm1
|
|
psubw xmm7, [esp+640-448]
|
|
|
|
movdqa xmm1, [esp+640-512]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm1, xmm7
|
|
movdqa xmm7, [esp+640-512]
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+640-320]
|
|
pand xmm1, xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
pcmpgtw xmm6, xmm7
|
|
pand xmm1, xmm6
|
|
|
|
movdqa xmm6, [esp+640-576]
|
|
pcmpgtw xmm6, xmm7
|
|
|
|
movdqa xmm7, [esp+640-496]
|
|
punpckhbw xmm3, xmm2
|
|
movdqa [esp+640-560], xmm6
|
|
movdqa xmm6, [esp+640-512]
|
|
psubw xmm7, xmm5
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm6, xmm7
|
|
|
|
pand xmm6, [esp+640-560]
|
|
movdqa xmm7, [esp+640-432]
|
|
psubw xmm7, [esp+640-528]
|
|
|
|
psllw xmm3, 1
|
|
movdqa [esp+640-544], xmm6
|
|
movdqa xmm6, [esp+640-512]
|
|
|
|
movdqa xmm2, [esp+640-544]
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, [esp+640-448]
|
|
paddw xmm3, [esp+640-496]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm6, xmm7
|
|
pand xmm6, [esp+640-560]
|
|
movdqa [esp+640-560], xmm6
|
|
|
|
movdqa xmm6, xmm0
|
|
pand xmm6, xmm4
|
|
movdqa xmm4, xmm0
|
|
pandn xmm4, [esp+640-368]
|
|
por xmm6, xmm4
|
|
movdqa xmm4, [esp+640-432]
|
|
paddw xmm3, xmm4
|
|
paddw xmm3, [esp+640-592]
|
|
psraw xmm3, 3
|
|
pand xmm3, xmm2
|
|
pandn xmm2, xmm5
|
|
por xmm3, xmm2
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm3
|
|
movdqa xmm3, [esp+640-64]
|
|
por xmm3, [esp+640-336]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm5
|
|
por xmm7, xmm2
|
|
|
|
movdqa xmm2, xmm0
|
|
pand xmm2, xmm3
|
|
movdqa xmm3, xmm0
|
|
pandn xmm3, [esp+640-480]
|
|
por xmm2, xmm3
|
|
packuswb xmm6, xmm7
|
|
movdqa [esp+640-336], xmm2
|
|
movdqa [esp+656-272], xmm6
|
|
movdqa xmm6, [esp+640-544]
|
|
movdqa xmm2, xmm5
|
|
paddw xmm2, [esp+640-448]
|
|
movdqa xmm3, xmm1
|
|
movdqa xmm7, [esp+640-496]
|
|
paddw xmm7, xmm4
|
|
paddw xmm2, xmm7
|
|
paddw xmm2, [esp+640-624]
|
|
movdqa xmm7, [esp+640-544]
|
|
psraw xmm2, 2
|
|
pand xmm6, xmm2
|
|
movdqa xmm2, [esp+640-448]
|
|
pandn xmm7, xmm2
|
|
por xmm6, xmm7
|
|
pand xmm3, xmm6
|
|
movdqa xmm6, xmm1
|
|
pandn xmm6, xmm2
|
|
paddw xmm2, [esp+640-496]
|
|
paddw xmm2, xmm4
|
|
por xmm3, xmm6
|
|
movdqa xmm6, [esp+640-336]
|
|
packuswb xmm6, xmm3
|
|
psllw xmm2, 1
|
|
movdqa [esp+672-272], xmm6
|
|
movdqa xmm6, [esp+640-96]
|
|
por xmm6, [esp+640-352]
|
|
|
|
movdqa xmm3, xmm0
|
|
pand xmm3, xmm6
|
|
movdqa xmm6, xmm0
|
|
pandn xmm6, [esp+640-144]
|
|
por xmm3, xmm6
|
|
movdqa xmm6, [esp+640-544]
|
|
movdqa [esp+640-352], xmm3
|
|
movdqa xmm3, [esp+640-464]
|
|
paddw xmm3, [esp+640-592]
|
|
paddw xmm2, xmm3
|
|
movdqa xmm3, [esp+640-448]
|
|
paddw xmm5, xmm2
|
|
movdqa xmm2, [esp+640-496]
|
|
psraw xmm5, 3
|
|
pand xmm6, xmm5
|
|
movdqa xmm5, [esp+640-464]
|
|
paddw xmm2, xmm5
|
|
paddw xmm5, [esp+640-432]
|
|
movdqa xmm4, xmm3
|
|
paddw xmm4, xmm3
|
|
paddw xmm4, xmm2
|
|
paddw xmm4, [esp+640-624]
|
|
movdqa xmm2, [esp+640-544]
|
|
paddw xmm3, [esp+640-592]
|
|
psraw xmm4, 2
|
|
pandn xmm2, xmm4
|
|
por xmm6, xmm2
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm6
|
|
movdqa xmm6, [esp+640-496]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm6
|
|
por xmm7, xmm2
|
|
movdqa xmm2, [esp+640-352]
|
|
packuswb xmm2, xmm7
|
|
movdqa [esp+688-272], xmm2
|
|
movdqa xmm2, [esp+640-128]
|
|
por xmm2, [esp+640-288]
|
|
|
|
movdqa xmm4, xmm0
|
|
pand xmm4, xmm2
|
|
paddw xmm5, xmm6
|
|
movdqa xmm2, xmm0
|
|
pandn xmm2, [esp+640-400]
|
|
por xmm4, xmm2
|
|
movdqa xmm2, [esp+640-528]
|
|
psllw xmm5, 1
|
|
paddw xmm5, xmm3
|
|
movdqa xmm3, [esp+640-560]
|
|
paddw xmm2, xmm5
|
|
psraw xmm2, 3
|
|
movdqa [esp+640-288], xmm4
|
|
movdqa xmm4, [esp+640-560]
|
|
pand xmm4, xmm2
|
|
movdqa xmm2, [esp+640-464]
|
|
movdqa xmm5, xmm2
|
|
paddw xmm5, xmm2
|
|
movdqa xmm2, [esp+640-432]
|
|
paddw xmm2, [esp+640-448]
|
|
movdqa xmm7, xmm1
|
|
paddw xmm5, xmm2
|
|
paddw xmm5, [esp+640-624]
|
|
movdqa xmm6, [esp+640-560]
|
|
psraw xmm5, 2
|
|
pandn xmm3, xmm5
|
|
por xmm4, xmm3
|
|
movdqa xmm3, [esp+640-32]
|
|
por xmm3, [esp+640-304]
|
|
pand xmm7, xmm4
|
|
movdqa xmm4, [esp+640-432]
|
|
movdqa xmm5, [esp+640-464]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm4
|
|
paddw xmm4, [esp+640-496]
|
|
por xmm7, xmm2
|
|
movdqa xmm2, [esp+640-288]
|
|
packuswb xmm2, xmm7
|
|
movdqa [esp+704-272], xmm2
|
|
|
|
movdqa xmm2, xmm0
|
|
pand xmm2, xmm3
|
|
movdqa xmm3, xmm0
|
|
pandn xmm3, [esp+640-384]
|
|
por xmm2, xmm3
|
|
movdqa [esp+640-304], xmm2
|
|
movdqa xmm2, [esp+640-528]
|
|
movdqa xmm3, xmm2
|
|
paddw xmm3, [esp+640-464]
|
|
paddw xmm3, xmm4
|
|
paddw xmm3, [esp+640-624]
|
|
psraw xmm3, 2
|
|
pand xmm6, xmm3
|
|
movdqa xmm3, [esp+640-560]
|
|
movdqa xmm4, xmm3
|
|
pandn xmm4, xmm5
|
|
por xmm6, xmm4
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm6
|
|
movdqa xmm6, [esp+640-304]
|
|
movdqa xmm4, xmm1
|
|
pandn xmm4, xmm5
|
|
por xmm7, xmm4
|
|
|
|
movdqa xmm4, xmm0
|
|
pandn xmm0, [esp+640-416]
|
|
packuswb xmm6, xmm7
|
|
movdqa xmm7, [esp+640-112]
|
|
por xmm7, [esp+640-80]
|
|
pand xmm4, xmm7
|
|
por xmm4, xmm0
|
|
movdqa xmm0, [esp+752-272]
|
|
punpckhbw xmm0, [esp+640-48]
|
|
psllw xmm0, 1
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm5
|
|
paddw xmm0, [esp+640-432]
|
|
paddw xmm0, [esp+640-496]
|
|
paddw xmm0, [esp+640-592]
|
|
psraw xmm0, 3
|
|
pand xmm0, xmm3
|
|
movdqa xmm7, xmm1
|
|
pandn xmm3, xmm2
|
|
por xmm0, xmm3
|
|
pand xmm7, xmm0
|
|
|
|
movdqa xmm0, [esp+656-272]
|
|
movdqa [edx], xmm0
|
|
|
|
movdqa xmm0, [esp+672-272]
|
|
|
|
mov edx, dword [esp+640-596]
|
|
movdqa [esi], xmm0
|
|
movdqa xmm0, [esp+688-272]
|
|
movdqa [edi], xmm0
|
|
movdqa xmm0, [esp+704-272]
|
|
|
|
pop edi
|
|
pandn xmm1, xmm2
|
|
movdqa [eax], xmm0
|
|
por xmm7, xmm1
|
|
pop esi
|
|
packuswb xmm4, xmm7
|
|
movdqa [edx], xmm6
|
|
movdqa [ecx], xmm4
|
|
pop ebx
|
|
mov esp, ebp
|
|
pop ebp
|
|
ret
|
|
|
|
|
|
;********************************************************************************
;
; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;********************************************************************************

WELS_EXTERN DeblockLumaTransposeH2V_sse2

ALIGN 16

DeblockLumaTransposeH2V_sse2:
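; Transpose helper: reads 8 bytes from each of 16 consecutive rows of pPixY
; (rows 0..7 via eax, rows 8..15 via eax + 8*iStride), transposes the block
; with SSE2_TransTwo8x8B, and stores the eight resulting 16-byte rows
; contiguously at pDst, so the row-oriented luma filters above can be reused
; for the other edge direction.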
|
|
push ebp
|
|
push ebx
|
|
mov ebp, esp
|
|
and esp,0FFFFFFF0h
|
|
sub esp, 10h
|
|
|
|
mov eax, [ebp + 0Ch]
|
|
mov ecx, [ebp + 10h]
|
|
lea edx, [eax + ecx * 8]
|
|
lea ebx, [ecx*3]
|
|
|
|
movq xmm0, [eax]
|
|
movq xmm7, [edx]
|
|
punpcklqdq xmm0, xmm7
|
|
movq xmm1, [eax + ecx]
|
|
movq xmm7, [edx + ecx]
|
|
punpcklqdq xmm1, xmm7
|
|
movq xmm2, [eax + ecx*2]
|
|
movq xmm7, [edx + ecx*2]
|
|
punpcklqdq xmm2, xmm7
|
|
movq xmm3, [eax + ebx]
|
|
movq xmm7, [edx + ebx]
|
|
punpcklqdq xmm3, xmm7
|
|
|
|
lea eax, [eax + ecx * 4]
|
|
lea edx, [edx + ecx * 4]
|
|
movq xmm4, [eax]
|
|
movq xmm7, [edx]
|
|
punpcklqdq xmm4, xmm7
|
|
movq xmm5, [eax + ecx]
|
|
movq xmm7, [edx + ecx]
|
|
punpcklqdq xmm5, xmm7
|
|
movq xmm6, [eax + ecx*2]
|
|
movq xmm7, [edx + ecx*2]
|
|
punpcklqdq xmm6, xmm7
|
|
|
|
movdqa [esp], xmm0
|
|
movq xmm7, [eax + ebx]
|
|
movq xmm0, [edx + ebx]
|
|
punpcklqdq xmm7, xmm0
|
|
movdqa xmm0, [esp]
|
|
|
|
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
|
|
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
|
|
|
|
mov eax, [ebp + 14h]
|
|
movdqa [eax], xmm4
|
|
movdqa [eax + 10h], xmm2
|
|
movdqa [eax + 20h], xmm3
|
|
movdqa [eax + 30h], xmm7
|
|
movdqa [eax + 40h], xmm5
|
|
movdqa [eax + 50h], xmm1
|
|
movdqa [eax + 60h], xmm6
|
|
movdqa [eax + 70h], xmm0
|
|
|
|
mov esp, ebp
|
|
pop ebx
|
|
pop ebp
|
|
ret
|
|
|
|
|
|
|
|
;*******************************************************************************************
;
; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;*******************************************************************************************

WELS_EXTERN DeblockLumaTransposeV2H_sse2

ALIGN 16

DeblockLumaTransposeV2H_sse2:
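; Inverse of DeblockLumaTransposeH2V_sse2: loads the eight 16-byte rows of the
; filtered block from pSrc, transposes them back with SSE2_TransTwo8x8B, and
; scatters the 8-byte halves into 16 rows of pPixY with movq stores and
; psrldq shifts.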
|
|
push ebp
|
|
mov ebp, esp
|
|
|
|
and esp, 0FFFFFFF0h
|
|
sub esp, 10h
|
|
|
|
mov eax, [ebp + 10h]
|
|
mov ecx, [ebp + 0Ch]
|
|
mov edx, [ebp + 08h]
|
|
|
|
movdqa xmm0, [eax]
|
|
movdqa xmm1, [eax + 10h]
|
|
movdqa xmm2, [eax + 20h]
|
|
movdqa xmm3, [eax + 30h]
|
|
movdqa xmm4, [eax + 40h]
|
|
movdqa xmm5, [eax + 50h]
|
|
movdqa xmm6, [eax + 60h]
|
|
movdqa xmm7, [eax + 70h]
|
|
|
|
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
|
|
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
|
|
|
|
lea eax, [ecx * 3]
|
|
|
|
movq [edx], xmm4
|
|
movq [edx + ecx], xmm2
|
|
movq [edx + ecx*2], xmm3
|
|
movq [edx + eax], xmm7
|
|
|
|
lea edx, [edx + ecx*4]
|
|
movq [edx], xmm5
|
|
movq [edx + ecx], xmm1
|
|
movq [edx + ecx*2], xmm6
|
|
movq [edx + eax], xmm0
|
|
|
|
psrldq xmm4, 8
|
|
psrldq xmm2, 8
|
|
psrldq xmm3, 8
|
|
psrldq xmm7, 8
|
|
psrldq xmm5, 8
|
|
psrldq xmm1, 8
|
|
psrldq xmm6, 8
|
|
psrldq xmm0, 8
|
|
|
|
lea edx, [edx + ecx*4]
|
|
movq [edx], xmm4
|
|
movq [edx + ecx], xmm2
|
|
movq [edx + ecx*2], xmm3
|
|
movq [edx + eax], xmm7
|
|
|
|
lea edx, [edx + ecx*4]
|
|
movq [edx], xmm5
|
|
movq [edx + ecx], xmm1
|
|
movq [edx + ecx*2], xmm6
|
|
movq [edx + eax], xmm0
|
|
|
|
|
|
mov esp, ebp
|
|
pop ebp
|
|
ret