; openh264/codec/encoder/core/asm/deblock.asm
;
; 2113 lines
; 53 KiB
; NASM
; Raw Normal View History
;
; 2013-12-09 13:51:09 +01:00
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* deblock.asm
;*
;* Abstract
;* edge loop
;*
;* History
;* 08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
; Everything in this file is assembled as 32-bit code; the routines below
; read their arguments from the stack relative to ebp.
BITS 32
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
; Read-only data section: declared with the "pData" qualifier when
; FORMAT_COFF is defined, otherwise with 16-byte alignment. No data is
; emitted into it in the part of the file shown here.
%ifdef FORMAT_COFF
SECTION .rodata pData
%else
SECTION .rodata align=16
%endif
; All routines that follow are placed in the code section.
SECTION .text
;********************************************************************************
; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
;********************************************************************************
; Filters the horizontal chroma edge lying between the row above pPix and the
; row at pPix, for the Cb and Cr planes in one pass (8 pixels per plane).
;
; Row naming used in the comments below:
;   p1 = [pPix - 2*iStride]   p0 = [pPix - iStride]
;   q0 = [pPix]               q1 = [pPix + iStride]
; Every 16-byte vector carries Cb in its low 8 bytes and Cr in its high 8.
;
; Per pixel, when |p0-q0| < iAlpha and |p1-p0| < iBeta and |q1-q0| < iBeta:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2
; otherwise p0 / q0 are written back unchanged. p1 and q1 are only read.
;
; Arguments are taken from the stack ([ebp+8] .. [ebp+18h]); esi and edi are
; saved and restored. Only the low 16 bits of iAlpha / iBeta are used.
; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
; this routine appears to require SSSE3 - confirm the CPU-feature dispatch.
WELS_EXTERN DeblockChromaEq4V_sse2
ALIGN 16
DeblockChromaEq4V_sse2:
; Frame: esp is rounded down to 16 and 68h bytes are reserved; together with
; the two pushes below (8 bytes) esp is 16-byte aligned again for movdqa.
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
sub esp,68h
mov edx,[ebp+10h] ; iStride
mov eax,[ebp+8] ; pPixCb
mov ecx,[ebp+0Ch] ; pPixCr
; xmm4 = q0 of Cr, xmm5 = q1 of Cr (8 bytes each).
movq xmm4,[ecx]
movq xmm5,[edx+ecx]
push esi
push edi
; xmm1 = p1 (Cb | Cr); esi temporarily holds 2*iStride.
lea esi,[edx+edx]
mov edi,eax
sub edi,esi
movq xmm1,[edi]
mov edi,ecx
sub edi,esi
movq xmm2,[edi]
punpcklqdq xmm1,xmm2
; xmm2 = p0 (Cb | Cr). esi = pPixCb - iStride and edi = pPixCr - iStride are
; kept until the end, where they are the store addresses for p0'.
mov esi,eax
sub esi,edx
movq xmm2,[esi]
mov edi,ecx
sub edi,edx
movq xmm3,[edi]
punpcklqdq xmm2,xmm3
; xmm3 = q0 (Cb | Cr), xmm4 = q1 (Cb | Cr).
movq xmm3,[eax]
punpcklqdq xmm3,xmm4
movq xmm4,[edx+eax]
; edx is reused for iAlpha, then iBeta. The low word of each is broadcast to
; all eight word lanes: xmm5 = iAlpha, xmm6 = iBeta. xmm0 = 0.
mov edx, [ebp + 14h]
punpcklqdq xmm4,xmm5
movd xmm5,edx
mov edx, [ebp + 18h]
pxor xmm0,xmm0
movdqa xmm6,xmm5
punpcklwd xmm6,xmm5
pshufd xmm5,xmm6,0
movd xmm6,edx
movdqa xmm7,xmm6
punpcklwd xmm7,xmm6
pshufd xmm6,xmm7,0
; Zero-extend the pixel bytes to words. Spill slots after this section:
;   [esp+60h] = p1 Cb   [esp+40h] = p1 Cr   [esp+10h] = p0 Cb
;   [esp+50h] = q0 Cb   [esp+30h] = q0 Cr   [esp+20h] = q1 Cr
; Held in registers: xmm2 = p0 Cr, xmm7 = q1 Cb.
movdqa xmm7,xmm1
punpckhbw xmm1,xmm0
punpcklbw xmm7,xmm0
movdqa [esp+40h],xmm1
movdqa [esp+60h],xmm7
movdqa xmm7,xmm2
punpcklbw xmm7,xmm0
movdqa [esp+10h],xmm7
movdqa xmm7,xmm3
punpcklbw xmm7,xmm0
punpckhbw xmm3,xmm0
movdqa [esp+50h],xmm7
movdqa xmm7,xmm4
punpckhbw xmm4,xmm0
punpckhbw xmm2,xmm0
punpcklbw xmm7,xmm0
movdqa [esp+30h],xmm3
; Build the per-pixel filter masks (signed word compares):
;   xmm0 (Cb) and xmm5 (Cr) =
;     (iAlpha > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|)
; xmm3 keeps p0 Cb for the rest of the routine.
movdqa xmm3,[esp+10h]
movdqa xmm1,xmm3
psubw xmm1,[esp+50h]
pabsw xmm1,xmm1
movdqa [esp+20h],xmm4
movdqa xmm0,xmm5
pcmpgtw xmm0,xmm1
movdqa xmm1,[esp+60h]
psubw xmm1,xmm3
pabsw xmm1,xmm1
movdqa xmm4,xmm6
pcmpgtw xmm4,xmm1
pand xmm0,xmm4
movdqa xmm1,xmm7
psubw xmm1,[esp+50h]
pabsw xmm1,xmm1
movdqa xmm4,xmm6
pcmpgtw xmm4,xmm1
movdqa xmm1,xmm2
psubw xmm1,[esp+30h]
pabsw xmm1,xmm1
pcmpgtw xmm5,xmm1
movdqa xmm1,[esp+40h]
pand xmm0,xmm4
psubw xmm1,xmm2
pabsw xmm1,xmm1
movdqa xmm4,xmm6
pcmpgtw xmm4,xmm1
movdqa xmm1,[esp+20h]
psubw xmm1,[esp+30h]
pand xmm5,xmm4
pabsw xmm1,xmm1
pcmpgtw xmm6,xmm1
pand xmm5,xmm6
; Rounding constant 2 in every word lane; it is parked in [esp+10h], whose
; previous content (p0 Cb) is still available in xmm3.
mov edx,2
movsx edx,dx
movd xmm1,edx
movdqa xmm4,xmm1
punpcklwd xmm4,xmm1
pshufd xmm1,xmm4,0
; p0' for Cb: xmm6 = (2*p1 + p0 + q1 + 2) >> 2, then blended with the
; original p0 through the Cb mask into xmm1.
movdqa xmm4,[esp+60h]
movdqa xmm6,xmm4
paddw xmm6,xmm4
paddw xmm6,xmm3
paddw xmm6,xmm7
movdqa [esp+10h],xmm1
paddw xmm6,[esp+10h]
psraw xmm6,2
movdqa xmm4,xmm0
pandn xmm4,xmm3
movdqa xmm3,[esp+40h]
movdqa xmm1,xmm0
pand xmm1,xmm6
por xmm1,xmm4
; Same for Cr (result in xmm4); xmm3 becomes the constant 2 from here on.
; packuswb then leaves p0' as bytes in xmm1 (Cb low half, Cr high half).
movdqa xmm6,xmm3
paddw xmm6,xmm3
movdqa xmm3,[esp+10h]
paddw xmm6,xmm2
paddw xmm6,[esp+20h]
paddw xmm6,xmm3
psraw xmm6,2
movdqa xmm4,xmm5
pand xmm4,xmm6
movdqa xmm6,xmm5
pandn xmm6,xmm2
por xmm4,xmm6
packuswb xmm1,xmm4
; q0' for Cb: (2*q1 + q0 + p1 + 2) >> 2, blended with q0 into xmm2.
movdqa xmm4,[esp+50h]
movdqa xmm6,xmm7
paddw xmm6,xmm7
paddw xmm6,xmm4
paddw xmm6,[esp+60h]
paddw xmm6,xmm3
psraw xmm6,2
movdqa xmm2,xmm0
pand xmm2,xmm6
pandn xmm0,xmm4
por xmm2,xmm0
; q0' for Cr (result in xmm4). The Cb p0' row is stored along the way.
movdqa xmm0,[esp+20h]
movdqa xmm6,xmm0
paddw xmm6,xmm0
movdqa xmm0,[esp+30h]
paddw xmm6,xmm0
paddw xmm6,[esp+40h]
movdqa xmm4,xmm5
paddw xmm6,xmm3
movq [esi],xmm1
psraw xmm6,2
pand xmm4,xmm6
pandn xmm5,xmm0
por xmm4,xmm5
packuswb xmm2,xmm4
; Store the remaining rows: q0' Cb at pPixCb, p0' Cr at pPixCr - iStride,
; q0' Cr at pPixCr (high halves are shifted down before the 8-byte store).
movq [eax],xmm2
psrldq xmm1,8
movq [edi],xmm1
pop edi
psrldq xmm2,8
movq [ecx],xmm2
pop esi
mov esp,ebp
pop ebp
ret
;******************************************************************************
; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
; tc-clamped filter for the horizontal chroma edge lying between the row
; above pPix and the row at pPix; Cb and Cr are processed in one pass
; (8 pixels per plane).
;
; Row naming used in the comments below:
;   p1 = [pPix - 2*iStride]   p0 = [pPix - iStride]
;   q0 = [pPix]               q1 = [pPix + iStride]
; Every 16-byte vector carries Cb in its low 8 bytes and Cr in its high 8.
;
; pTC points at four signed bytes tc0..tc3. They are expanded to the word
; vector [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3], i.e. one value per two adjacent
; pixels; the same vector is used for the Cb and the Cr half.
;
; Per pixel, when tc > 0 and |p0-q0| < iAlpha and |p1-p0| < iBeta and
; |q1-q0| < iBeta:
;   delta = clamp((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
;   p0'   = p0 + delta
;   q0'   = q0 - delta          (both saturated to 0..255 by packuswb)
; otherwise delta is 0 and the pixel is written back unchanged.
;
; Arguments are taken from the stack ([ebp+8] .. [ebp+1Ch]); ebx, esi and
; edi are saved and restored. Only the low 16 bits of iAlpha / iBeta are
; used.
; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
; this routine appears to require SSSE3 - confirm the CPU-feature dispatch.
WELS_EXTERN DeblockChromaLt4V_sse2
; Align the entry point like the other routines in this file. The padding
; sits after the previous routine's ret, so it is never executed.
ALIGN 16
DeblockChromaLt4V_sse2:
; Frame: esp is rounded down to 16 and 0E4h bytes are reserved; with the
; three pushes (12 bytes) esp is 16-byte aligned again for movdqa.
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
sub esp,0E4h
push ebx
push esi
mov esi, [ebp+1Ch] ; pTC
; Sign-extend the four tc bytes: di = tc3, [esp+0Ch] = tc2, bx = tc1,
; [esp+0Eh] = tc0 (the two word slots sit just above the saved registers).
movsx ebx, byte [esi+2]
push edi
movsx di,byte [esi+3]
mov word [esp+0Ch],bx
movsx bx,byte [esi+1]
movsx esi,byte [esi]
mov word [esp+0Eh],si
; Copy each tc value into the low word of several xmm registers, then
; interleave them (punpcklwd) until xmm0 = [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3].
; The argument loads (edx = iStride, eax = pPixCb, ecx = pPixCr) and the
; first pixel-row loads are scheduled in between.
movzx esi,di
movd xmm1,esi
movzx esi,di
movd xmm2,esi
mov si,word [esp+0Ch]
mov edx, [ebp + 10h]
mov eax, [ebp + 08h]
movzx edi,si
movzx esi,si
mov ecx, [ebp + 0Ch]
movd xmm4,esi
movzx esi,bx
movd xmm5,esi
movd xmm3,edi
movzx esi,bx
movd xmm6,esi
mov si,word [esp+0Eh]
movzx edi,si
movzx esi,si
punpcklwd xmm6,xmm2
; [esp+40h] holds an all-zero vector for now (reloaded into xmm1 below).
pxor xmm0,xmm0
movdqa [esp+40h],xmm0
movd xmm7,edi
movd xmm0,esi
lea esi,[edx+edx]
mov edi,eax
sub edi,esi
punpcklwd xmm5,xmm1
movdqa xmm1,[esp+40h]
punpcklwd xmm0,xmm4
; xmm4 = q1 Cr, xmm3 = q0 Cb, xmm6 = p1 Cb (8 bytes each).
movq xmm4,[edx+ecx]
punpcklwd xmm7,xmm3
movq xmm3,[eax]
punpcklwd xmm0,xmm6
movq xmm6,[edi]
punpcklwd xmm7,xmm5
punpcklwd xmm0,xmm7
mov edi,ecx
sub edi,esi
; [esp+60h] = -tc (0 - tc per word lane).
movdqa xmm2,xmm1
psubw xmm2,xmm0
movdqa [esp+60h],xmm2
; xmm6 = p1 (Cb | Cr).
movq xmm2, [edi]
punpcklqdq xmm6,xmm2
; xmm7 = p0 (Cb | Cr). esi = pPixCb - iStride and edi = pPixCr - iStride are
; kept until the end, where they are the store addresses for p0'.
mov esi,eax
sub esi,edx
movq xmm7,[esi]
mov edi,ecx
sub edi,edx
movq xmm2,[edi]
punpcklqdq xmm7,xmm2
; xmm3 = q0 (Cb | Cr); q1 (Cb | Cr) is spilled to [esp+0E0h].
movq xmm2,[ecx]
punpcklqdq xmm3,xmm2
movq xmm2,[edx+eax]
movsx edx,word [ebp + 14h]
punpcklqdq xmm2,xmm4
movdqa [esp+0E0h],xmm2
; Broadcast iAlpha into xmm4 (completed by the pshufd further down) and
; iBeta into [esp+50h].
movd xmm2,edx
movsx edx,word [ebp + 18h]
movdqa xmm4,xmm2
punpcklwd xmm4,xmm2
movd xmm2,edx
movdqa xmm5,xmm2
punpcklwd xmm5,xmm2
pshufd xmm2,xmm5,0
movdqa [esp+50h],xmm2
; Zero-extend the pixel bytes to words (xmm1 = 0). Afterwards:
;   [esp+30h] = p1 Cb   [esp+80h] = p1 Cr
;   xmm2      = p0 Cb   [esp+0A0h] = p0 Cr
;   xmm3      = q0 Cb   [esp+70h] = q0 Cr
;   xmm5      = q1 Cb   [esp+90h] = q1 Cr
movdqa xmm2,xmm6
punpcklbw xmm2,xmm1
movdqa [esp+0D0h],xmm3
pshufd xmm4,xmm4,0
movdqa [esp+30h],xmm2
punpckhbw xmm6,xmm1
movdqa [esp+80h],xmm6
movdqa xmm6,[esp+0D0h]
punpckhbw xmm6,xmm1
movdqa [esp+70h],xmm6
movdqa xmm6, [esp+0E0h]
punpckhbw xmm6,xmm1
movdqa [esp+90h],xmm6
movdqa xmm5, [esp+0E0h]
movdqa xmm2,xmm7
punpckhbw xmm7,xmm1
punpcklbw xmm5,xmm1
movdqa [esp+0A0h],xmm7
punpcklbw xmm3,xmm1
; Rounding constant 4 in every word lane -> [esp+20h].
mov edx,4
punpcklbw xmm2,xmm1
movsx edx,dx
movd xmm6,edx
movdqa xmm7,xmm6
punpcklwd xmm7,xmm6
pshufd xmm6,xmm7,0
movdqa xmm7,[esp+30h]
movdqa [esp+20h],xmm6
; Cb delta: xmm7 = p1 - q1; [esp+40h] = (tc > 0) mask, shared with Cr;
; xmm6 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, clamped to [-tc, tc] and
; left in [esp+10h].
psubw xmm7,xmm5
movdqa xmm6,xmm0
pcmpgtw xmm6,xmm1
movdqa xmm1,[esp+60h]
movdqa [esp+40h],xmm6
movdqa xmm6,xmm3
psubw xmm6,xmm2
psllw xmm6,2
paddw xmm6,xmm7
paddw xmm6, [esp+20h]
movdqa xmm7, [esp+50h]
psraw xmm6,3
pmaxsw xmm1,xmm6
movdqa [esp+10h],xmm0
movdqa xmm6, [esp+10h]
pminsw xmm6,xmm1
movdqa [esp+10h],xmm6
; Cb mask: (iAlpha > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|)
; & (tc > 0). The masked Cb delta ends up in [esp+30h].
movdqa xmm1,xmm2
psubw xmm1,xmm3
pabsw xmm1,xmm1
movdqa xmm6,xmm4
pcmpgtw xmm6,xmm1
movdqa xmm1, [esp+30h]
psubw xmm1,xmm2
pabsw xmm1,xmm1
pcmpgtw xmm7,xmm1
movdqa xmm1,[esp+50h]
pand xmm6,xmm7
movdqa xmm7,[esp+50h]
psubw xmm5,xmm3
pabsw xmm5,xmm5
pcmpgtw xmm1,xmm5
movdqa xmm5,[esp+80h]
psubw xmm5,[esp+90h]
pand xmm6,xmm1
pand xmm6,[esp+40h]
movdqa xmm1,[esp+10h]
pand xmm1,xmm6
movdqa xmm6,[esp+70h]
movdqa [esp+30h],xmm1
; Cr delta, computed the same way; xmm1 = p0 Cr, xmm5 = q0 Cr and the
; clamped delta is left in xmm0.
movdqa xmm1,[esp+0A0h]
psubw xmm6,xmm1
psllw xmm6,2
paddw xmm6,xmm5
paddw xmm6,[esp+20h]
movdqa xmm5,[esp+60h]
psraw xmm6,3
pmaxsw xmm5,xmm6
pminsw xmm0,xmm5
; Cr mask (same four conditions), applied to the delta in xmm0.
movdqa xmm5,[esp+70h]
movdqa xmm6,xmm1
psubw xmm6,xmm5
pabsw xmm6,xmm6
pcmpgtw xmm4,xmm6
movdqa xmm6,[esp+80h]
psubw xmm6,xmm1
pabsw xmm6,xmm6
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+90h]
pand xmm4,xmm7
movdqa xmm7,[esp+50h]
psubw xmm6,xmm5
pabsw xmm6,xmm6
pcmpgtw xmm7,xmm6
pand xmm4,xmm7
pand xmm4,[esp+40h]
pand xmm0,xmm4
; p0' = p0 + delta, packed to bytes (Cb | Cr); Cb half stored at
; pPixCb - iStride.
movdqa xmm4,[esp+30h]
paddw xmm2,xmm4
paddw xmm1,xmm0
packuswb xmm2,xmm1
movq [esi],xmm2
; q0' = q0 - delta, packed to bytes (Cb | Cr); Cb half stored at pPixCb.
psubw xmm3,xmm4
psubw xmm5,xmm0
packuswb xmm3,xmm5
movq [eax],xmm3
; Cr halves: p0' at pPixCr - iStride, q0' at pPixCr.
psrldq xmm2,8
movq [edi],xmm2
pop edi
pop esi
psrldq xmm3,8
movq [ecx],xmm3
pop ebx
mov esp,ebp
pop ebp
ret
;***************************************************************************
; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
;***************************************************************************
; Filters the vertical chroma edge immediately left of pPix, for the Cb and
; Cr planes in one pass (8 rows per plane).
;
; For every row the four bytes at [pPix - 2 .. pPix + 1] are read; after a
; byte transpose they form the column vectors
;   p1 = column at -2, p0 = column at -1, q0 = column at 0, q1 = column at +1
; each holding the 8 Cb rows in its low 8 bytes and the 8 Cr rows in its
; high 8 bytes. The four vectors live in a 64-byte stack buffer.
;
; Per pixel, when |p0-q0| < iAlpha and |p1-p0| < iBeta and |q1-q0| < iBeta:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2
; otherwise p0 / q0 keep their value. The buffer is then transposed back and
; all four bytes of every row are stored again (p1 and q1 unchanged).
;
; Arguments are taken from the stack ([ebp+8] .. [ebp+18h]); esi and edi are
; saved and restored. Only the low 16 bits of iAlpha / iBeta are used.
; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
; this routine appears to require SSSE3 - confirm the CPU-feature dispatch.
WELS_EXTERN DeblockChromaEq4H_sse2
ALIGN 16
DeblockChromaEq4H_sse2:
; Frame: esp is rounded down to 16 and 0C8h bytes are reserved; with the two
; pushes (8 bytes) esp is 16-byte aligned again for movdqa.
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
sub esp,0C8h
mov ecx,dword [ebp+8]
mov edx,dword [ebp+0Ch]
mov eax,dword [ebp+10h]
sub ecx,2
sub edx,2
push esi
; The two stores below happen before "push edi", so their slots are 4 bytes
; further from esp afterwards. Scalar slots once both pushes are done:
;   [esp+1Ch] = pPixCb - 2               [esp+8]   = pPixCr - 2
;   [esp+18h] = pPixCb - 2 + 4*iStride   [esp+0Ch] = pPixCr - 2 + 4*iStride
;   [esp+14h] = 3*iStride                [esp+10h] = address of the buffer
; The buffer is the 64 bytes at [esp+80h] .. [esp+0BFh] (16-byte aligned).
lea esi,[eax+eax*2]
mov dword [esp+18h],ecx
mov dword [esp+4],edx
lea ecx,[ecx+eax*4]
lea edx,[edx+eax*4]
lea eax,[esp+7Ch]
push edi
mov dword [esp+14h],esi
mov dword [esp+18h],ecx
mov dword [esp+0Ch],edx
mov dword [esp+10h],eax
; Gather 4 bytes per row. xmm0..xmm3 take rows 0..3 (and later rows 4..7),
; each laid out as [Cb row n | Cr row n | Cb row n+4 | Cr row n+4].
mov esi,dword [esp+1Ch]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+14h]
movd xmm0,dword [esi]
movd xmm1,dword [esi+ecx]
movd xmm2,dword [esi+ecx*2]
movd xmm3,dword [esi+edx]
mov esi,dword [esp+8]
movd xmm4,dword [esi]
movd xmm5,dword [esi+ecx]
movd xmm6,dword [esi+ecx*2]
movd xmm7,dword [esi+edx]
punpckldq xmm0,xmm4
punpckldq xmm1,xmm5
punpckldq xmm2,xmm6
punpckldq xmm3,xmm7
mov esi,dword [esp+18h]
mov edi,dword [esp+0Ch]
movd xmm4,dword [esi]
movd xmm5,dword [edi]
punpckldq xmm4,xmm5
punpcklqdq xmm0,xmm4
movd xmm4,dword [esi+ecx]
movd xmm5,dword [edi+ecx]
punpckldq xmm4,xmm5
punpcklqdq xmm1,xmm4
movd xmm4,dword [esi+ecx*2]
movd xmm5,dword [edi+ecx*2]
punpckldq xmm4,xmm5
punpcklqdq xmm2,xmm4
movd xmm4,dword [esi+edx]
movd xmm5,dword [edi+edx]
punpckldq xmm4,xmm5
punpcklqdq xmm3,xmm4
; Byte transpose (byte, word, dword, qword interleaves). Result:
;   xmm0 = p1, xmm5 = p0, xmm1 = q0, xmm6 = q1   (each Cb rows | Cr rows)
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Park the four column vectors in the buffer: +0 p1, +10h p0, +20h q0,
; +30h q1 (the same memory as [esp+80h] .. [esp+0B0h]).
mov edi,dword [esp+10h]
movdqa [edi],xmm0
movdqa [edi+10h],xmm5
movdqa [edi+20h],xmm1
movdqa [edi+30h],xmm6
; ecx = iAlpha, edx = iBeta (low 16 bits, sign-extended); broadcast to all
; word lanes: xmm1 = iAlpha, xmm2 = iBeta. xmm0 = 0.
movsx ecx,word [ebp+14h]
movsx edx,word [ebp+18h]
movdqa xmm6,[esp+80h]
movdqa xmm4,[esp+90h]
movdqa xmm5,[esp+0A0h]
movdqa xmm7,[esp+0B0h]
pxor xmm0,xmm0
movd xmm1,ecx
movdqa xmm2,xmm1
punpcklwd xmm2,xmm1
pshufd xmm1,xmm2,0
movd xmm2,edx
movdqa xmm3,xmm2
punpcklwd xmm3,xmm2
pshufd xmm2,xmm3,0
; Zero-extend the pixel bytes to words. Afterwards:
;   xmm3 = p1 Cb   [esp+60h] = p1 Cr
;   xmm4 = p0 Cb   [esp+30h] = p0 Cr
;   xmm5 = q0 Cb   [esp+40h] = q0 Cr
;   [esp+50h] = q1 Cb   [esp+70h] = q1 Cr
movdqa xmm3,xmm6
punpckhbw xmm6,xmm0
movdqa [esp+60h],xmm6
movdqa xmm6,[esp+90h]
punpckhbw xmm6,xmm0
movdqa [esp+30h],xmm6
movdqa xmm6,[esp+0A0h]
punpckhbw xmm6,xmm0
movdqa [esp+40h],xmm6
movdqa xmm6,[esp+0B0h]
punpckhbw xmm6,xmm0
movdqa [esp+70h],xmm6
punpcklbw xmm7,xmm0
punpcklbw xmm4,xmm0
punpcklbw xmm5,xmm0
punpcklbw xmm3,xmm0
movdqa [esp+50h],xmm7
; Build the per-pixel filter masks (signed word compares):
;   xmm0 (Cb) and xmm1 (Cr) =
;     (iAlpha > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|)
movdqa xmm6,xmm4
psubw xmm6,xmm5
pabsw xmm6,xmm6
movdqa xmm0,xmm1
pcmpgtw xmm0,xmm6
movdqa xmm6,xmm3
psubw xmm6,xmm4
pabsw xmm6,xmm6
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+50h]
psubw xmm6,xmm5
pabsw xmm6,xmm6
pand xmm0,xmm7
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+30h]
psubw xmm6,[esp+40h]
pabsw xmm6,xmm6
pcmpgtw xmm1,xmm6
movdqa xmm6,[esp+60h]
psubw xmm6,[esp+30h]
pabsw xmm6,xmm6
pand xmm0,xmm7
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+70h]
psubw xmm6,[esp+40h]
pabsw xmm6,xmm6
pand xmm1,xmm7
pcmpgtw xmm2,xmm6
pand xmm1,xmm2
; Rounding constant 2 in every word lane -> [esp+20h].
mov eax,2
movsx ecx,ax
movd xmm2,ecx
movdqa xmm6,xmm2
punpcklwd xmm6,xmm2
pshufd xmm2,xmm6,0
movdqa [esp+20h],xmm2
; p0' for Cb: (2*p1 + p0 + q1 + 2) >> 2, blended with p0 through the Cb mask
; into xmm6.
movdqa xmm2,xmm3
paddw xmm2,xmm3
paddw xmm2,xmm4
paddw xmm2,[esp+50h]
paddw xmm2,[esp+20h]
psraw xmm2,2
movdqa xmm6,xmm0
pand xmm6,xmm2
movdqa xmm2,xmm0
pandn xmm2,xmm4
por xmm6,xmm2
; Same for Cr (xmm4); the packed p0' vector replaces p0 in the buffer.
movdqa xmm2,[esp+60h]
movdqa xmm7,xmm2
paddw xmm7,xmm2
paddw xmm7,[esp+30h]
paddw xmm7,[esp+70h]
paddw xmm7,[esp+20h]
movdqa xmm4,xmm1
movdqa xmm2,xmm1
pandn xmm2,[esp+30h]
psraw xmm7,2
pand xmm4,xmm7
por xmm4,xmm2
movdqa xmm2,[esp+50h]
packuswb xmm6,xmm4
movdqa [esp+90h],xmm6
; q0' for Cb: (2*q1 + q0 + p1 + 2) >> 2, blended with q0 into xmm4.
movdqa xmm6,xmm2
paddw xmm6,xmm2
movdqa xmm2,[esp+20h]
paddw xmm6,xmm5
paddw xmm6,xmm3
movdqa xmm4,xmm0
pandn xmm0,xmm5
paddw xmm6,xmm2
psraw xmm6,2
pand xmm4,xmm6
por xmm4,xmm0
; Same for Cr (xmm3); the packed q0' vector replaces q0 in the buffer.
movdqa xmm0,[esp+70h]
movdqa xmm5,xmm0
paddw xmm5,xmm0
movdqa xmm0,[esp+40h]
paddw xmm5,xmm0
paddw xmm5,[esp+60h]
movdqa xmm3,xmm1
paddw xmm5,xmm2
psraw xmm5,2
pand xmm3,xmm5
pandn xmm1,xmm0
por xmm3,xmm1
packuswb xmm4,xmm3
movdqa [esp+0A0h],xmm4
; Reload the four column vectors and transpose them back into row order.
; Result: xmm0, xmm5, xmm1, xmm6 = rows 0..3, each laid out as
; [Cb row n | Cr row n | Cb row n+4 | Cr row n+4].
mov esi,dword [esp+10h]
movdqa xmm0,[esi]
movdqa xmm1,[esi+10h]
movdqa xmm2,[esi+20h]
movdqa xmm3,[esi+30h]
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Scatter 4 bytes per row, shifting each register down by one dword between
; groups: Cb rows 0..3, Cr rows 0..3, Cb rows 4..7, Cr rows 4..7.
mov esi,dword [esp+1Ch]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+14h]
mov edi,dword [esp+8]
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov esi,dword [esp+18h]
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov edi,dword [esp+0Ch]
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
pop edi
pop esi
mov esp,ebp
pop ebp
ret
;*******************************************************************************
; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
; tc-clamped filter for the vertical chroma edge immediately left of pPix;
; Cb and Cr are processed in one pass (8 rows per plane).
;
; For every row the four bytes at [pPix - 2 .. pPix + 1] are read; after a
; byte transpose they form the column vectors
;   p1 = column at -2, p0 = column at -1, q0 = column at 0, q1 = column at +1
; each holding the 8 Cb rows in its low 8 bytes and the 8 Cr rows in its
; high 8 bytes. The four vectors live in a 64-byte stack buffer.
;
; pTC points at four signed bytes tc0..tc3, expanded to the word vector
; [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3] (one value per two adjacent rows); the
; same vector is used for the Cb and the Cr half.
;
; Per pixel, when tc > 0 and |p0-q0| < iAlpha and |p1-p0| < iBeta and
; |q1-q0| < iBeta:
;   delta = clamp((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
;   p0'   = p0 + delta
;   q0'   = q0 - delta          (both saturated to 0..255 by packuswb)
; otherwise delta is 0. The buffer is then transposed back and all four
; bytes of every row are stored again (p1 and q1 unchanged).
;
; Arguments are taken from the stack ([ebp+8] .. [ebp+1Ch]); esi and edi are
; saved and restored. Only the low 16 bits of iAlpha / iBeta are used.
; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
; this routine appears to require SSSE3 - confirm the CPU-feature dispatch.
WELS_EXTERN DeblockChromaLt4H_sse2
ALIGN 16
DeblockChromaLt4H_sse2:
; Frame: esp is rounded down to 16 and 108h bytes are reserved; with the two
; pushes (8 bytes) esp is 16-byte aligned again for movdqa.
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
sub esp,108h
mov ecx,dword [ebp+8]
mov edx,dword [ebp+0Ch]
mov eax,dword [ebp+10h]
sub ecx,2
sub edx,2
push esi
; The two stores below happen before "push edi", so their slots are 4 bytes
; further from esp afterwards. Scalar slots once both pushes are done:
;   [esp+14h] = pPixCb - 2               [esp+8]   = pPixCr - 2
;   [esp+18h] = pPixCb - 2 + 4*iStride   [esp+10h] = pPixCr - 2 + 4*iStride
;   [esp+0Ch] = 3*iStride                [esp+1Ch] = address of the buffer
; The buffer is the 64 bytes at [esp+70h] .. [esp+0AFh] (16-byte aligned).
lea esi,[eax+eax*2]
mov dword [esp+10h],ecx
mov dword [esp+4],edx
lea ecx,[ecx+eax*4]
lea edx,[edx+eax*4]
lea eax,[esp+6Ch]
push edi
mov dword [esp+0Ch],esi
mov dword [esp+18h],ecx
mov dword [esp+10h],edx
mov dword [esp+1Ch],eax
; Gather 4 bytes per row. xmm0..xmm3 take rows 0..3 (and later rows 4..7),
; each laid out as [Cb row n | Cr row n | Cb row n+4 | Cr row n+4].
mov esi,dword [esp+14h]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+0Ch]
movd xmm0,dword [esi]
movd xmm1,dword [esi+ecx]
movd xmm2,dword [esi+ecx*2]
movd xmm3,dword [esi+edx]
mov esi,dword [esp+8]
movd xmm4,dword [esi]
movd xmm5,dword [esi+ecx]
movd xmm6,dword [esi+ecx*2]
movd xmm7,dword [esi+edx]
punpckldq xmm0,xmm4
punpckldq xmm1,xmm5
punpckldq xmm2,xmm6
punpckldq xmm3,xmm7
mov esi,dword [esp+18h]
mov edi,dword [esp+10h]
movd xmm4,dword [esi]
movd xmm5,dword [edi]
punpckldq xmm4,xmm5
punpcklqdq xmm0,xmm4
movd xmm4,dword [esi+ecx]
movd xmm5,dword [edi+ecx]
punpckldq xmm4,xmm5
punpcklqdq xmm1,xmm4
movd xmm4,dword [esi+ecx*2]
movd xmm5,dword [edi+ecx*2]
punpckldq xmm4,xmm5
punpcklqdq xmm2,xmm4
movd xmm4,dword [esi+edx]
movd xmm5,dword [edi+edx]
punpckldq xmm4,xmm5
punpcklqdq xmm3,xmm4
; Byte transpose (byte, word, dword, qword interleaves). Result:
;   xmm0 = p1, xmm5 = p0, xmm1 = q0, xmm6 = q1   (each Cb rows | Cr rows)
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Park the four column vectors in the buffer: +0 p1, +10h p0, +20h q0,
; +30h q1 (the same memory as [esp+70h] .. [esp+0A0h]).
mov edi,dword [esp+1Ch]
movdqa [edi],xmm0
movdqa [edi+10h],xmm5
movdqa [edi+20h],xmm1
movdqa [edi+30h],xmm6
; Sign-extend the four tc bytes (cx = tc3, dx = tc2, si = tc1, ax = tc0),
; copy them into the low word of several xmm registers and interleave
; (punpcklwd) until xmm0 = [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3].
; eax / ecx are then reused for iAlpha / iBeta; [esp+60h] is an all-zero
; vector for now, and the pixel vectors are reloaded from the buffer in
; between (xmm5 = q1, xmm6 = p1, xmm7 = p0).
mov eax,dword [ebp+1Ch]
movsx cx,byte [eax+3]
movsx dx,byte [eax+2]
movsx si,byte [eax+1]
movsx ax,byte [eax]
movzx edi,cx
movzx ecx,cx
movd xmm2,ecx
movzx ecx,dx
movzx edx,dx
movd xmm3,ecx
movd xmm4,edx
movzx ecx,si
movzx edx,si
movd xmm5,ecx
pxor xmm0,xmm0
movd xmm6,edx
movzx ecx,ax
movdqa [esp+60h],xmm0
movzx edx,ax
movsx eax,word [ebp+14h]
punpcklwd xmm6,xmm2
movd xmm1,edi
movd xmm7,ecx
movsx ecx,word [ebp+18h]
movd xmm0,edx
punpcklwd xmm7,xmm3
punpcklwd xmm5,xmm1
movdqa xmm1,[esp+60h]
punpcklwd xmm7,xmm5
movdqa xmm5,[esp+0A0h]
punpcklwd xmm0,xmm4
punpcklwd xmm0,xmm6
movdqa xmm6, [esp+70h]
punpcklwd xmm0,xmm7
movdqa xmm7,[esp+80h]
; [esp+0D0h] = -tc (0 - tc per word lane).
movdqa xmm2,xmm1
psubw xmm2,xmm0
movdqa [esp+0D0h],xmm2
; Broadcast iAlpha into xmm4 and iBeta into [esp+50h].
movd xmm2,eax
movdqa xmm3,xmm2
punpcklwd xmm3,xmm2
pshufd xmm4,xmm3,0
movd xmm2,ecx
movdqa xmm3,xmm2
punpcklwd xmm3,xmm2
pshufd xmm2,xmm3,0
movdqa xmm3, [esp+90h]
movdqa [esp+50h],xmm2
; Zero-extend the pixel bytes to words (xmm1 = 0). Afterwards:
;   [esp+40h] = p1 Cb   [esp+0B0h] = p1 Cr
;   xmm2      = p0 Cb   [esp+0F0h] = p0 Cr
;   xmm3      = q0 Cb   [esp+0C0h] = q0 Cr
;   xmm5      = q1 Cb   [esp+0E0h] = q1 Cr
movdqa xmm2,xmm6
punpcklbw xmm2,xmm1
punpckhbw xmm6,xmm1
movdqa [esp+40h],xmm2
movdqa [esp+0B0h],xmm6
movdqa xmm6,[esp+90h]
movdqa xmm2,xmm7
punpckhbw xmm7,xmm1
punpckhbw xmm6,xmm1
punpcklbw xmm2,xmm1
punpcklbw xmm3,xmm1
punpcklbw xmm5,xmm1
movdqa [esp+0F0h],xmm7
movdqa [esp+0C0h],xmm6
movdqa xmm6, [esp+0A0h]
punpckhbw xmm6,xmm1
movdqa [esp+0E0h],xmm6
; Rounding constant 4 in every word lane -> [esp+30h].
mov edx,4
movsx eax,dx
movd xmm6,eax
movdqa xmm7,xmm6
punpcklwd xmm7,xmm6
pshufd xmm6,xmm7,0
movdqa [esp+30h],xmm6
; Cb delta: xmm7 = p1 - q1; [esp+60h] = (tc > 0) mask, shared with Cr;
; xmm6 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, clamped to [-tc, tc] and
; left in [esp+20h].
movdqa xmm7, [esp+40h]
psubw xmm7,xmm5
movdqa xmm6,xmm0
pcmpgtw xmm6,xmm1
movdqa [esp+60h],xmm6
movdqa xmm1, [esp+0D0h]
movdqa xmm6,xmm3
psubw xmm6,xmm2
psllw xmm6,2
paddw xmm6,xmm7
paddw xmm6,[esp+30h]
psraw xmm6,3
pmaxsw xmm1,xmm6
movdqa xmm7,[esp+50h]
movdqa [esp+20h],xmm0
movdqa xmm6, [esp+20h]
pminsw xmm6,xmm1
movdqa [esp+20h],xmm6
; Cb mask: (iAlpha > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|)
; & (tc > 0). The masked Cb delta ends up in [esp+40h].
movdqa xmm6,xmm4
movdqa xmm1,xmm2
psubw xmm1,xmm3
pabsw xmm1,xmm1
pcmpgtw xmm6,xmm1
movdqa xmm1, [esp+40h]
psubw xmm1,xmm2
pabsw xmm1,xmm1
pcmpgtw xmm7,xmm1
movdqa xmm1, [esp+50h]
pand xmm6,xmm7
movdqa xmm7, [esp+50h]
psubw xmm5,xmm3
pabsw xmm5,xmm5
pcmpgtw xmm1,xmm5
movdqa xmm5, [esp+0B0h]
psubw xmm5,[esp+0E0h]
pand xmm6,xmm1
pand xmm6, [esp+60h]
movdqa xmm1, [esp+20h]
pand xmm1,xmm6
movdqa xmm6, [esp+0C0h]
movdqa [esp+40h],xmm1
; Cr delta, computed the same way; xmm1 = p0 Cr, xmm5 = q0 Cr and the
; clamped delta is left in xmm0.
movdqa xmm1, [esp+0F0h]
psubw xmm6,xmm1
psllw xmm6,2
paddw xmm6,xmm5
paddw xmm6, [esp+30h]
movdqa xmm5, [esp+0D0h]
psraw xmm6,3
pmaxsw xmm5,xmm6
pminsw xmm0,xmm5
; Cr mask (same four conditions), applied to the delta in xmm0.
movdqa xmm5,[esp+0C0h]
movdqa xmm6,xmm1
psubw xmm6,xmm5
pabsw xmm6,xmm6
pcmpgtw xmm4,xmm6
movdqa xmm6,[esp+0B0h]
psubw xmm6,xmm1
pabsw xmm6,xmm6
pcmpgtw xmm7,xmm6
movdqa xmm6, [esp+0E0h]
pand xmm4,xmm7
movdqa xmm7, [esp+50h]
psubw xmm6,xmm5
pabsw xmm6,xmm6
pcmpgtw xmm7,xmm6
pand xmm4,xmm7
pand xmm4,[esp+60h]
pand xmm0,xmm4
; p0' = p0 + delta and q0' = q0 - delta, packed to bytes (Cb | Cr); they
; replace p0 and q0 in the buffer.
movdqa xmm4, [esp+40h]
paddw xmm2,xmm4
paddw xmm1,xmm0
psubw xmm3,xmm4
psubw xmm5,xmm0
packuswb xmm2,xmm1
packuswb xmm3,xmm5
movdqa [esp+80h],xmm2
movdqa [esp+90h],xmm3
; Reload the four column vectors and transpose them back into row order.
; Result: xmm0, xmm5, xmm1, xmm6 = rows 0..3, each laid out as
; [Cb row n | Cr row n | Cb row n+4 | Cr row n+4].
mov esi,dword [esp+1Ch]
movdqa xmm0, [esi]
movdqa xmm1, [esi+10h]
movdqa xmm2, [esi+20h]
movdqa xmm3, [esi+30h]
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Scatter 4 bytes per row, shifting each register down by one dword between
; groups: Cb rows 0..3, Cr rows 0..3, Cb rows 4..7, Cr rows 4..7.
mov esi,dword [esp+14h]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+0Ch]
mov edi,dword [esp+8]
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov esi,dword [esp+18h]
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov edi,dword [esp+10h]
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
pop edi
pop esi
mov esp,ebp
pop ebp
ret
;*******************************************************************************
; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta, int8_t * pTC)
;*******************************************************************************
; tc-clamped filter for the horizontal luma edge lying between the row above
; pPix and the row at pPix, 16 pixels wide.
;
; Row naming used in the comments below (16 bytes each):
;   p2 = [pPix - 3*iStride]  p1 = [pPix - 2*iStride]  p0 = [pPix - iStride]
;   q0 = [pPix]              q1 = [pPix + iStride]    q2 = [pPix + 2*iStride]
; All six rows are read and the four rows p1, p0, q0, q1 are written with
; movdqa, so every one of those addresses must be 16-byte aligned.
;
; pTC points at four signed bytes tc0..tc3; each one applies to four
; adjacent pixels (tc0/tc1 for the low 8 pixels, tc2/tc3 for the high 8).
; The 16 pixels are processed as two halves of eight 16-bit lanes.
;
; Per pixel, with
;   ap   = |p2 - p0| < iBeta          aq = |q2 - q0| < iBeta
;   mask = |p0-q0| < iAlpha and |p1-p0| < iBeta and |q1-q0| < iBeta
;          and tc >= 0
;   avg  = (p0 + q0 + 1) >> 1
;   tc'  = tc + (ap ? 1 : 0) + (aq ? 1 : 0)
; the routine computes
;   delta = clamp((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc', tc')
;   p0'   = p0 + (mask ? delta : 0)
;   q0'   = q0 - (mask ? delta : 0)
;   p1'   = p1 + ((mask and ap) ? clamp((p2 + avg - 2*p1) >> 1, -tc, tc) : 0)
;   q1'   = q1 + ((mask and aq) ? clamp((q2 + avg - 2*q1) >> 1, -tc, tc) : 0)
; and packs the results back to bytes with unsigned saturation.
;
; Arguments are taken from the stack ([ebp+8] .. [ebp+24]); ebx, esi and edi
; are saved and restored. Only the low 16 bits of iAlpha / iBeta are used.
; Stack operands are written as [esp+432-k]; 432 = 420 reserved bytes plus
; the three pushes, so k is the distance below the aligned frame top.
; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
; this routine appears to require SSSE3 - confirm the CPU-feature dispatch.
WELS_EXTERN DeblockLumaLt4V_sse2
ALIGN 16
DeblockLumaLt4V_sse2:
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
sub esp, 420 ; 000001a4H
; eax = pPix, ecx = iStride, edx = pTC. A zero vector is stored in the slot
; that is addressed as [esp+432-384] once esi and edi have been pushed.
mov eax, dword [ebp+8]
mov ecx, dword [ebp+12]
pxor xmm0, xmm0
push ebx
mov edx, dword [ebp+24]
movdqa [esp+424-384], xmm0
push esi
lea esi, [ecx+ecx*2]
push edi
; Copy the six pixel rows to the stack:
;   [esp+432-208] = p2   [esp+448-208] = p1   [esp+464-208] = p0
;   [esp+480-208] = q0   [esp+496-208] = q1   [esp+512-208] = q2
; Pointers kept for the stores at the end: edi = pPix - 2*iStride,
; [esp+432-408] = pPix - iStride, [esp+432-404] = pPix + iStride.
mov edi, eax
sub edi, esi
movdqa xmm0, [edi]
lea esi, [ecx+ecx]
movdqa [esp+432-208], xmm0
mov edi, eax
sub edi, esi
movdqa xmm0, [edi]
movdqa [esp+448-208], xmm0
mov ebx, eax
sub ebx, ecx
movdqa xmm0, [ebx]
movdqa [esp+464-208], xmm0
movdqa xmm0, [eax]
add ecx, eax
movdqa [esp+480-208], xmm0
movdqa xmm0, [ecx]
mov dword [esp+432-404], ecx
movsx ecx, word [ebp+16]
movdqa [esp+496-208], xmm0
movdqa xmm0, [esi+eax]
movsx si, byte [edx]
movdqa [esp+512-208], xmm0
; Broadcast iAlpha -> [esp+432-112] and iBeta -> [esp+432-336].
; si = tc0 and cx = tc1 are picked up in between.
movd xmm0, ecx
movsx ecx, word [ebp+20]
movdqa xmm1, xmm0
punpcklwd xmm1, xmm0
pshufd xmm0, xmm1, 0
movdqa [esp+432-112], xmm0
movd xmm0, ecx
movsx cx, byte [edx+1]
movdqa xmm1, xmm0
punpcklwd xmm1, xmm0
mov dword [esp+432-408], ebx
movzx ebx, cx
pshufd xmm0, xmm1, 0
; Build the tc vector for the low 8 pixels by repeated punpcklwd:
;   [esp+432-400] = [tc0,tc0,tc0,tc0,tc1,tc1,tc1,tc1]
movd xmm1, ebx
movzx ebx, cx
movd xmm2, ebx
movzx ebx, cx
movzx ecx, cx
movd xmm4, ecx
movzx ecx, si
movd xmm5, ecx
movzx ecx, si
movd xmm6, ecx
movzx ecx, si
movd xmm7, ecx
movzx ecx, si
movdqa [esp+432-336], xmm0
movd xmm0, ecx
movsx cx, byte [edx+3]
movsx dx, byte [edx+2]
movd xmm3, ebx
punpcklwd xmm0, xmm4
movzx esi, cx
punpcklwd xmm6, xmm2
punpcklwd xmm5, xmm1
punpcklwd xmm0, xmm6
punpcklwd xmm7, xmm3
punpcklwd xmm7, xmm5
punpcklwd xmm0, xmm7
movdqa [esp+432-400], xmm0
; Same for the high 8 pixels (cx = tc3, dx = tc2):
;   xmm1 = [tc2,tc2,tc2,tc2,tc3,tc3,tc3,tc3]
; xmm0 is reloaded with the zero vector, edx becomes the constant 4, and
; the low halves of the pixel rows are zero-extended to words in between:
;   xmm2 = p1   xmm3 = p0   xmm6 = q0   xmm4 = p2 (also [esp+432-272])
;   [esp+432-240] = q1   [esp+432-352] = q2
movd xmm0, esi
movzx esi, cx
movd xmm2, esi
movzx esi, cx
movzx ecx, cx
movd xmm4, ecx
movzx ecx, dx
movd xmm3, esi
movd xmm5, ecx
punpcklwd xmm5, xmm0
movdqa xmm0, [esp+432-384]
movzx ecx, dx
movd xmm6, ecx
movzx ecx, dx
movzx edx, dx
punpcklwd xmm6, xmm2
movd xmm7, ecx
movd xmm1, edx
movdqa xmm2, [esp+448-208]
punpcklbw xmm2, xmm0
mov ecx, 4
movsx edx, cx
punpcklwd xmm7, xmm3
punpcklwd xmm7, xmm5
movdqa xmm5, [esp+496-208]
movdqa xmm3, [esp+464-208]
punpcklbw xmm5, xmm0
movdqa [esp+432-240], xmm5
movdqa xmm5, [esp+512-208]
punpcklbw xmm5, xmm0
movdqa [esp+432-352], xmm5
punpcklwd xmm1, xmm4
movdqa xmm4, [esp+432-208]
punpcklwd xmm1, xmm6
movdqa xmm6, [esp+480-208]
punpcklwd xmm1, xmm7
punpcklbw xmm6, xmm0
punpcklbw xmm3, xmm0
punpcklbw xmm4, xmm0
; ---- Low 8 pixels ----
; ap = iBeta > |p0 - p2| -> [esp+432-288]; xmm4 holds iBeta from here on.
movdqa xmm7, xmm3
psubw xmm7, xmm4
pabsw xmm7, xmm7
movdqa [esp+432-272], xmm4
movdqa xmm4, [esp+432-336]
movdqa xmm5, xmm4
pcmpgtw xmm5, xmm7
movdqa [esp+432-288], xmm5
; aq = iBeta > |q0 - q2| -> [esp+432-256].
movdqa xmm7, xmm6
psubw xmm7, [esp+432-352]
pabsw xmm7, xmm7
movdqa xmm5, xmm4
pcmpgtw xmm5, xmm7
movdqa [esp+432-256], xmm5
; avg = (p0 + q0 + 1) >> 1 -> [esp+432-304].
movdqa xmm5, xmm3
pavgw xmm5, xmm6
movdqa [esp+432-304], xmm5
; tc' = tc - ap - aq -> [esp+432-224]; the masks are 0 or -1 per lane, so
; each true condition adds 1.
movdqa xmm5, [esp+432-400]
psubw xmm5, [esp+432-288]
psubw xmm5, [esp+432-256]
movdqa [esp+432-224], xmm5
; [esp+432-384] = q0 - p0 (reusing the zero slot; xmm0 still holds zero),
; [esp+432-32] = q0. Then the edge mask:
;   (iAlpha > |q0-p0|) & (iBeta > |q0-q1|) & (iBeta > |p0-p1|) & (tc >= 0)
; -> [esp+432-320]. "tc >= 0" is formed as (tc == 0) | (tc > 0).
movdqa xmm5, xmm6
psubw xmm5, xmm3
movdqa [esp+432-32], xmm6
psubw xmm6, [esp+432-240]
movdqa xmm7, xmm5
movdqa [esp+432-384], xmm5
movdqa xmm5, [esp+432-112]
pabsw xmm7, xmm7
pcmpgtw xmm5, xmm7
pabsw xmm6, xmm6
movdqa xmm7, xmm4
pcmpgtw xmm7, xmm6
pand xmm5, xmm7
movdqa xmm6, xmm3
psubw xmm6, xmm2
pabsw xmm6, xmm6
movdqa xmm7, xmm4
pcmpgtw xmm7, xmm6
movdqa xmm6, [esp+432-400]
pand xmm5, xmm7
movdqa xmm7, xmm6
pcmpeqw xmm6, xmm0
pcmpgtw xmm7, xmm0
por xmm7, xmm6
pand xmm5, xmm7
movdqa [esp+432-320], xmm5
; Rounding constant 4 in every word lane -> [esp+432-336] (iBeta stays
; available in xmm4).
movd xmm5, edx
movdqa xmm6, xmm5
punpcklwd xmm6, xmm5
pshufd xmm5, xmm6, 0
movdqa [esp+432-336], xmm5
; delta = clamp((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc', tc') & mask
; -> [esp+432-64].
movdqa xmm5, [esp+432-224]
movdqa [esp+432-368], xmm5
movdqa xmm6, xmm0
psubw xmm6, xmm5
movdqa xmm5, [esp+432-384]
psllw xmm5, 2
movdqa xmm7, xmm2
psubw xmm7, [esp+432-240]
paddw xmm7, xmm5
paddw xmm7, [esp+432-336]
movdqa xmm5, [esp+432-368]
psraw xmm7, 3
pmaxsw xmm6, xmm7
pminsw xmm5, xmm6
pand xmm5, [esp+432-320]
movdqa xmm6, [esp+432-400]
movdqa [esp+432-64], xmm5
; p1 adjustment = clamp((p2 + avg - 2*p1) >> 1, -tc, tc) & mask & ap
; -> [esp+432-96]. [esp+432-384] now holds tc and [esp+432-368] holds -tc.
movdqa [esp+432-384], xmm6
movdqa xmm5, xmm0
psubw xmm5, xmm6
movdqa [esp+432-368], xmm5
movdqa xmm6, xmm5
movdqa xmm5, [esp+432-272]
paddw xmm5, [esp+432-304]
movdqa xmm7, xmm2
paddw xmm7, xmm2
psubw xmm5, xmm7
psraw xmm5, 1
pmaxsw xmm6, xmm5
movdqa xmm5, [esp+432-384]
pminsw xmm5, xmm6
pand xmm5, [esp+432-320]
pand xmm5, [esp+432-288]
movdqa xmm6, [esp+432-240]
movdqa [esp+432-96], xmm5
; q1 adjustment = clamp((q2 + avg - 2*q1) >> 1, -tc, tc) & mask & aq
; -> [esp+432-48].
movdqa xmm5, [esp+432-352]
paddw xmm5, [esp+432-304]
movdqa xmm7, xmm6
paddw xmm7, xmm6
movdqa xmm6, [esp+432-368]
psubw xmm5, xmm7
movdqa xmm7, [esp+496-208]
psraw xmm5, 1
pmaxsw xmm6, xmm5
movdqa xmm5, [esp+432-400]
pminsw xmm5, xmm6
pand xmm5, [esp+432-320]
pand xmm5, [esp+432-256]
; ---- High 8 pixels ----
; Zero-extend the high halves of the rows; several slots are reused:
;   [esp+432-352] = q1   [esp+432-384] = q2   [esp+432-368] = p1
;   [esp+432-400] = p0   [esp+432-16]  = p2   [esp+432-80]  = q0
movdqa xmm6, [esp+448-208]
punpckhbw xmm7, xmm0
movdqa [esp+432-352], xmm7
movdqa xmm7, [esp+512-208]
punpckhbw xmm6, xmm0
movdqa [esp+432-48], xmm5
movdqa xmm5, [esp+432-208]
movdqa [esp+432-368], xmm6
movdqa xmm6, [esp+464-208]
punpckhbw xmm7, xmm0
punpckhbw xmm5, xmm0
movdqa [esp+432-384], xmm7
punpckhbw xmm6, xmm0
movdqa [esp+432-400], xmm6
; ap -> [esp+432-288], aq -> [esp+432-256], avg -> [esp+432-304],
; tc' = tc - ap - aq -> [esp+432-224] (tc for this half is xmm1).
movdqa xmm7, [esp+432-400]
movdqa xmm6, [esp+480-208]
psubw xmm7, xmm5
movdqa [esp+432-16], xmm5
pabsw xmm7, xmm7
punpckhbw xmm6, xmm0
movdqa xmm5, xmm4
pcmpgtw xmm5, xmm7
movdqa [esp+432-288], xmm5
movdqa xmm7, xmm6
psubw xmm7, [esp+432-384]
pabsw xmm7, xmm7
movdqa xmm5, xmm4
pcmpgtw xmm5, xmm7
movdqa [esp+432-256], xmm5
movdqa xmm5, [esp+432-400]
movdqa [esp+432-80], xmm6
pavgw xmm5, xmm6
movdqa [esp+432-304], xmm5
movdqa xmm5, xmm1
psubw xmm5, [esp+432-288]
psubw xmm5, [esp+432-256]
movdqa [esp+432-224], xmm5
; [esp+432-272] = q0 - p0; edge mask for this half -> [esp+432-320].
; The low-half p1' (p1 + adjustment) is formed in xmm2 along the way, and
; xmm6 becomes p1 - q1 for the delta below.
movdqa xmm5, xmm6
psubw xmm5, [esp+432-400]
psubw xmm6, [esp+432-352]
movdqa [esp+432-272], xmm5
movdqa xmm7, xmm5
movdqa xmm5, [esp+432-112]
pabsw xmm7, xmm7
pcmpgtw xmm5, xmm7
movdqa xmm7, xmm4
pabsw xmm6, xmm6
pcmpgtw xmm7, xmm6
movdqa xmm6, [esp+432-368]
pand xmm5, xmm7
movdqa xmm7, [esp+432-400]
psubw xmm7, xmm6
psubw xmm6, [esp+432-352]
pabsw xmm7, xmm7
pcmpgtw xmm4, xmm7
pand xmm5, xmm4
paddw xmm2, [esp+432-96]
movdqa xmm4, xmm1
pcmpgtw xmm4, xmm0
movdqa xmm7, xmm1
pcmpeqw xmm7, xmm0
por xmm4, xmm7
pand xmm5, xmm4
; delta for this half -> [esp+432-272]; [esp+432-336] is changed from the
; constant 4 to -tc after its last use as the rounding term.
movdqa xmm4, [esp+432-224]
movdqa [esp+432-320], xmm5
movdqa xmm5, [esp+432-272]
movdqa xmm7, xmm0
psubw xmm7, xmm4
psubw xmm0, xmm1
psllw xmm5, 2
paddw xmm6, xmm5
paddw xmm6, [esp+432-336]
movdqa xmm5, [esp+432-368]
movdqa [esp+432-336], xmm0
psraw xmm6, 3
pmaxsw xmm7, xmm6
pminsw xmm4, xmm7
pand xmm4, [esp+432-320]
; p1 adjustment for this half, added to p1 in xmm5; xmm2 = packed p1'.
movdqa xmm6, xmm0
movdqa xmm0, [esp+432-16]
paddw xmm0, [esp+432-304]
movdqa [esp+432-272], xmm4
movdqa xmm4, [esp+432-368]
paddw xmm4, xmm4
psubw xmm0, xmm4
movdqa xmm4, [esp+432-64]
psraw xmm0, 1
pmaxsw xmm6, xmm0
movdqa xmm0, [esp+432-400]
movdqa xmm7, xmm1
pminsw xmm7, xmm6
movdqa xmm6, [esp+432-320]
pand xmm7, xmm6
pand xmm7, [esp+432-288]
paddw xmm5, xmm7
packuswb xmm2, xmm5
; xmm3 = packed p0' = p0 + delta (low half | high half).
movdqa xmm5, [esp+432-272]
paddw xmm0, xmm5
paddw xmm3, xmm4
packuswb xmm3, xmm0
; Packed q0' = q0 - delta -> [esp+480-208]; xmm5 = low-half q1'.
movdqa xmm0, [esp+432-32]
psubw xmm0, xmm4
movdqa xmm4, [esp+432-80]
psubw xmm4, xmm5
movdqa xmm5, [esp+432-240]
paddw xmm5, [esp+432-48]
packuswb xmm0, xmm4
; q1 adjustment for the high half (xmm1), interleaved with the stores:
;   [edi] = p1', [ecx] = p0', [eax] = q0', [edx] = q1'
; ecx / edx are reloaded with pPix - iStride / pPix + iStride.
movdqa xmm4, [esp+432-384]
paddw xmm4, [esp+432-304]
movdqa [esp+480-208], xmm0
movdqa xmm0, [esp+432-352]
movdqa xmm7, xmm0
paddw xmm0, xmm0
mov ecx, dword [esp+432-408]
mov edx, dword [esp+432-404]
psubw xmm4, xmm0
movdqa xmm0, [esp+432-336]
movdqa [edi], xmm2
psraw xmm4, 1
pmaxsw xmm0, xmm4
pminsw xmm1, xmm0
movdqa xmm0, [esp+480-208]
pop edi
pand xmm1, xmm6
; esp has moved up by 4 after "pop edi", so 428-256 addresses the same aq
; slot that was [esp+432-256] before the pop.
pand xmm1, [esp+428-256]
movdqa [ecx], xmm3
paddw xmm7, xmm1
pop esi
packuswb xmm5, xmm7
movdqa [eax], xmm0
movdqa [edx], xmm5
pop ebx
mov esp, ebp
pop ebp
ret
;*******************************************************************************
; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta)
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_sse2
ALIGN 16
DeblockLumaEq4V_sse2:
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
sub esp, 628 ; 00000274H
mov eax, dword [ebp+8]
mov ecx, dword [ebp+12]
push ebx
push esi
lea edx, [ecx*4]
pxor xmm0, xmm0
movdqa xmm2, xmm0
movdqa xmm0, [ecx+eax]
mov esi, eax
sub esi, edx
movdqa xmm3, [esi]
movdqa xmm5, [eax]
push edi
lea edi, [ecx+ecx]
lea ebx, [ecx+ecx*2]
mov dword [esp+640-600], edi
mov esi, eax
sub esi, edi
movdqa xmm1, [esi]
movdqa [esp+720-272], xmm0
mov edi, eax
sub edi, ecx
movdqa xmm4, [edi]
add ecx, eax
mov dword [esp+640-596], ecx
mov ecx, dword [esp+640-600]
movdqa xmm0, [ecx+eax]
movdqa [esp+736-272], xmm0
movdqa xmm0, [eax+ebx]
mov edx, eax
sub edx, ebx
movsx ebx, word [ebp+16]
movdqa xmm6, [edx]
add ecx, eax
movdqa [esp+752-272], xmm0
movd xmm0, ebx
movsx ebx, word [ebp+20]
movdqa xmm7, xmm0
punpcklwd xmm7, xmm0
pshufd xmm0, xmm7, 0
movdqa [esp+640-320], xmm0
movd xmm0, ebx
movdqa xmm7, xmm0
punpcklwd xmm7, xmm0
pshufd xmm0, xmm7, 0
movdqa xmm7, [esp+736-272]
punpcklbw xmm7, xmm2
movdqa [esp+640-416], xmm7
movdqa [esp+640-512], xmm0
movdqa xmm0, xmm1
movdqa [esp+672-272], xmm1
movdqa xmm1, xmm4
movdqa [esp+704-272], xmm5
punpcklbw xmm5, xmm2
punpcklbw xmm1, xmm2
movdqa xmm7, xmm5
psubw xmm7, xmm1
pabsw xmm7, xmm7
movdqa [esp+640-560], xmm7
punpcklbw xmm0, xmm2
movdqa [esp+688-272], xmm4
movdqa xmm4, [esp+720-272]
movdqa [esp+640-480], xmm0
movdqa xmm7, xmm1
psubw xmm7, xmm0
movdqa xmm0, [esp+640-512]
pabsw xmm7, xmm7
punpcklbw xmm4, xmm2
pcmpgtw xmm0, xmm7
movdqa [esp+640-384], xmm4
movdqa xmm7, xmm5
psubw xmm7, xmm4
movdqa xmm4, [esp+640-512]
movdqa [esp+656-272], xmm6
punpcklbw xmm6, xmm2
pabsw xmm7, xmm7
movdqa [esp+640-48], xmm2
movdqa [esp+640-368], xmm6
movdqa [esp+640-144], xmm1
movdqa [esp+640-400], xmm5
pcmpgtw xmm4, xmm7
pand xmm0, xmm4
movdqa xmm4, [esp+640-320]
pcmpgtw xmm4, [esp+640-560]
pand xmm0, xmm4
mov ebx, 2
movsx ebx, bx
movd xmm4, ebx
movdqa xmm7, xmm4
punpcklwd xmm7, xmm4
movdqa xmm4, [esp+640-320]
psraw xmm4, 2
pshufd xmm7, xmm7, 0
paddw xmm4, xmm7
movdqa [esp+640-576], xmm4
pcmpgtw xmm4, [esp+640-560]
movdqa [esp+640-560], xmm4
movdqa xmm4, [esp+640-512]
movdqa [esp+640-624], xmm7
movdqa xmm7, xmm1
psubw xmm7, xmm6
pabsw xmm7, xmm7
pcmpgtw xmm4, xmm7
pand xmm4, [esp+640-560]
movdqa [esp+640-544], xmm4
movdqa xmm4, [esp+640-512]
movdqa xmm7, xmm5
psubw xmm7, [esp+640-416]
pabsw xmm7, xmm7
pcmpgtw xmm4, xmm7
pand xmm4, [esp+640-560]
movdqa [esp+640-560], xmm4
movdqa xmm4, [esp+640-544]
pandn xmm4, xmm6
movdqa [esp+640-16], xmm4
mov ebx, 4
movsx ebx, bx
movd xmm4, ebx
movdqa xmm7, xmm4
punpcklwd xmm7, xmm4
movdqa xmm4, xmm3
punpcklbw xmm4, xmm2
psllw xmm4, 1
paddw xmm4, xmm6
paddw xmm4, xmm6
paddw xmm4, xmm6
paddw xmm4, [esp+640-480]
movdqa xmm6, [esp+640-560]
pshufd xmm7, xmm7, 0
paddw xmm4, xmm1
movdqa [esp+640-592], xmm7
paddw xmm4, xmm5
paddw xmm4, xmm7
movdqa xmm7, [esp+640-416]
pandn xmm6, xmm7
movdqa [esp+640-80], xmm6
movdqa xmm6, [esp+752-272]
punpcklbw xmm6, xmm2
psllw xmm6, 1
paddw xmm6, xmm7
paddw xmm6, xmm7
paddw xmm6, xmm7
paddw xmm6, [esp+640-384]
movdqa xmm7, [esp+640-480]
paddw xmm6, xmm5
paddw xmm6, xmm1
paddw xmm6, [esp+640-592]
psraw xmm6, 3
pand xmm6, [esp+640-560]
movdqa [esp+640-112], xmm6
movdqa xmm6, [esp+640-544]
pandn xmm6, xmm7
movdqa [esp+640-336], xmm6
movdqa xmm6, [esp+640-544]
movdqa [esp+640-528], xmm6
movdqa xmm6, [esp+640-368]
paddw xmm6, xmm7
movdqa xmm7, xmm1
psraw xmm4, 3
pand xmm4, [esp+640-544]
paddw xmm7, xmm5
paddw xmm6, xmm7
paddw xmm6, [esp+640-624]
movdqa xmm7, [esp+640-528]
paddw xmm5, xmm1
psraw xmm6, 2
pand xmm7, xmm6
movdqa xmm6, [esp+640-384]
movdqa [esp+640-64], xmm7
movdqa xmm7, [esp+640-560]
pandn xmm7, xmm6
movdqa [esp+640-304], xmm7
movdqa xmm7, [esp+640-560]
movdqa [esp+640-528], xmm7
movdqa xmm7, [esp+640-416]
paddw xmm7, xmm6
paddw xmm7, xmm5
paddw xmm7, [esp+640-624]
movdqa xmm5, [esp+640-528]
psraw xmm7, 2
pand xmm5, xmm7
movdqa [esp+640-32], xmm5
movdqa xmm5, [esp+640-544]
movdqa [esp+640-528], xmm5
movdqa xmm5, [esp+640-480]
movdqa xmm7, xmm5
paddw xmm7, xmm5
movdqa xmm5, xmm1
paddw xmm5, xmm6
paddw xmm6, [esp+640-592]
paddw xmm7, xmm5
paddw xmm7, [esp+640-624]
movdqa xmm5, [esp+640-528]
psraw xmm7, 2
pandn xmm5, xmm7
movdqa xmm7, [esp+640-480]
paddw xmm7, xmm1
paddw xmm7, [esp+640-400]
movdqa xmm1, [esp+640-544]
movdqa [esp+640-352], xmm5
movdqa xmm5, [esp+640-368]
psllw xmm7, 1
paddw xmm7, xmm6
paddw xmm5, xmm7
movdqa xmm7, [esp+640-400]
psraw xmm5, 3
pand xmm1, xmm5
movdqa xmm5, [esp+640-480]
movdqa [esp+640-96], xmm1
movdqa xmm1, [esp+640-560]
movdqa [esp+640-528], xmm1
movdqa xmm1, [esp+640-384]
movdqa xmm6, xmm1
paddw xmm6, xmm1
paddw xmm1, [esp+640-400]
paddw xmm1, [esp+640-144]
paddw xmm7, xmm5
paddw xmm5, [esp+640-592]
paddw xmm6, xmm7
paddw xmm6, [esp+640-624]
movdqa xmm7, [esp+640-528]
psraw xmm6, 2
psllw xmm1, 1
paddw xmm1, xmm5
movdqa xmm5, [esp+656-272]
pandn xmm7, xmm6
movdqa xmm6, [esp+640-416]
paddw xmm6, xmm1
movdqa xmm1, [esp+640-560]
psraw xmm6, 3
pand xmm1, xmm6
movdqa xmm6, [esp+704-272]
movdqa [esp+640-128], xmm1
movdqa xmm1, [esp+672-272]
punpckhbw xmm1, xmm2
movdqa [esp+640-448], xmm1
movdqa xmm1, [esp+688-272]
punpckhbw xmm1, xmm2
punpckhbw xmm6, xmm2
movdqa [esp+640-288], xmm7
punpckhbw xmm5, xmm2
movdqa [esp+640-496], xmm1
movdqa [esp+640-432], xmm6
movdqa xmm7, [esp+720-272]
punpckhbw xmm7, xmm2
movdqa [esp+640-464], xmm7
movdqa xmm7, [esp+736-272]
punpckhbw xmm7, xmm2
movdqa [esp+640-528], xmm7
movdqa xmm7, xmm6
psubw xmm6, [esp+640-464]
psubw xmm7, xmm1
pabsw xmm7, xmm7
movdqa [esp+640-560], xmm7
por xmm4, [esp+640-16]
pabsw xmm6, xmm6
movdqa xmm7, xmm1
psubw xmm7, [esp+640-448]
movdqa xmm1, [esp+640-512]
pabsw xmm7, xmm7
pcmpgtw xmm1, xmm7
movdqa xmm7, [esp+640-512]
pcmpgtw xmm7, xmm6
movdqa xmm6, [esp+640-320]
pand xmm1, xmm7
movdqa xmm7, [esp+640-560]
pcmpgtw xmm6, xmm7
pand xmm1, xmm6
movdqa xmm6, [esp+640-576]
pcmpgtw xmm6, xmm7
movdqa xmm7, [esp+640-496]
punpckhbw xmm3, xmm2
movdqa [esp+640-560], xmm6
movdqa xmm6, [esp+640-512]
psubw xmm7, xmm5
pabsw xmm7, xmm7
pcmpgtw xmm6, xmm7
pand xmm6, [esp+640-560]
movdqa xmm7, [esp+640-432]
psubw xmm7, [esp+640-528]
psllw xmm3, 1
movdqa [esp+640-544], xmm6
movdqa xmm6, [esp+640-512]
movdqa xmm2, [esp+640-544]
paddw xmm3, xmm5
paddw xmm3, xmm5
paddw xmm3, xmm5
paddw xmm3, [esp+640-448]
paddw xmm3, [esp+640-496]
pabsw xmm7, xmm7
pcmpgtw xmm6, xmm7
pand xmm6, [esp+640-560]
movdqa [esp+640-560], xmm6
movdqa xmm6, xmm0
pand xmm6, xmm4
movdqa xmm4, xmm0
pandn xmm4, [esp+640-368]
por xmm6, xmm4
movdqa xmm4, [esp+640-432]
paddw xmm3, xmm4
paddw xmm3, [esp+640-592]
psraw xmm3, 3
pand xmm3, xmm2
pandn xmm2, xmm5
por xmm3, xmm2
movdqa xmm7, xmm1
pand xmm7, xmm3
movdqa xmm3, [esp+640-64]
por xmm3, [esp+640-336]
movdqa xmm2, xmm1
pandn xmm2, xmm5
por xmm7, xmm2
movdqa xmm2, xmm0
pand xmm2, xmm3
movdqa xmm3, xmm0
pandn xmm3, [esp+640-480]
por xmm2, xmm3
packuswb xmm6, xmm7
movdqa [esp+640-336], xmm2
movdqa [esp+656-272], xmm6
movdqa xmm6, [esp+640-544]
movdqa xmm2, xmm5
paddw xmm2, [esp+640-448]
movdqa xmm3, xmm1
movdqa xmm7, [esp+640-496]
paddw xmm7, xmm4
paddw xmm2, xmm7
paddw xmm2, [esp+640-624]
movdqa xmm7, [esp+640-544]
psraw xmm2, 2
pand xmm6, xmm2
movdqa xmm2, [esp+640-448]
pandn xmm7, xmm2
por xmm6, xmm7
pand xmm3, xmm6
movdqa xmm6, xmm1
pandn xmm6, xmm2
paddw xmm2, [esp+640-496]
paddw xmm2, xmm4
por xmm3, xmm6
movdqa xmm6, [esp+640-336]
packuswb xmm6, xmm3
psllw xmm2, 1
movdqa [esp+672-272], xmm6
movdqa xmm6, [esp+640-96]
por xmm6, [esp+640-352]
movdqa xmm3, xmm0
pand xmm3, xmm6
movdqa xmm6, xmm0
pandn xmm6, [esp+640-144]
por xmm3, xmm6
movdqa xmm6, [esp+640-544]
movdqa [esp+640-352], xmm3
movdqa xmm3, [esp+640-464]
paddw xmm3, [esp+640-592]
paddw xmm2, xmm3
movdqa xmm3, [esp+640-448]
paddw xmm5, xmm2
movdqa xmm2, [esp+640-496]
psraw xmm5, 3
pand xmm6, xmm5
movdqa xmm5, [esp+640-464]
paddw xmm2, xmm5
paddw xmm5, [esp+640-432]
movdqa xmm4, xmm3
paddw xmm4, xmm3
paddw xmm4, xmm2
paddw xmm4, [esp+640-624]
movdqa xmm2, [esp+640-544]
paddw xmm3, [esp+640-592]
psraw xmm4, 2
pandn xmm2, xmm4
por xmm6, xmm2
movdqa xmm7, xmm1
pand xmm7, xmm6
movdqa xmm6, [esp+640-496]
movdqa xmm2, xmm1
pandn xmm2, xmm6
por xmm7, xmm2
movdqa xmm2, [esp+640-352]
packuswb xmm2, xmm7
movdqa [esp+688-272], xmm2
movdqa xmm2, [esp+640-128]
por xmm2, [esp+640-288]
movdqa xmm4, xmm0
pand xmm4, xmm2
paddw xmm5, xmm6
movdqa xmm2, xmm0
pandn xmm2, [esp+640-400]
por xmm4, xmm2
movdqa xmm2, [esp+640-528]
psllw xmm5, 1
paddw xmm5, xmm3
movdqa xmm3, [esp+640-560]
paddw xmm2, xmm5
psraw xmm2, 3
movdqa [esp+640-288], xmm4
movdqa xmm4, [esp+640-560]
pand xmm4, xmm2
movdqa xmm2, [esp+640-464]
movdqa xmm5, xmm2
paddw xmm5, xmm2
movdqa xmm2, [esp+640-432]
paddw xmm2, [esp+640-448]
movdqa xmm7, xmm1
paddw xmm5, xmm2
paddw xmm5, [esp+640-624]
movdqa xmm6, [esp+640-560]
psraw xmm5, 2
pandn xmm3, xmm5
por xmm4, xmm3
movdqa xmm3, [esp+640-32]
por xmm3, [esp+640-304]
pand xmm7, xmm4
movdqa xmm4, [esp+640-432]
movdqa xmm5, [esp+640-464]
movdqa xmm2, xmm1
pandn xmm2, xmm4
paddw xmm4, [esp+640-496]
por xmm7, xmm2
movdqa xmm2, [esp+640-288]
packuswb xmm2, xmm7
movdqa [esp+704-272], xmm2
movdqa xmm2, xmm0
pand xmm2, xmm3
movdqa xmm3, xmm0
pandn xmm3, [esp+640-384]
por xmm2, xmm3
movdqa [esp+640-304], xmm2
movdqa xmm2, [esp+640-528]
movdqa xmm3, xmm2
paddw xmm3, [esp+640-464]
paddw xmm3, xmm4
paddw xmm3, [esp+640-624]
psraw xmm3, 2
pand xmm6, xmm3
movdqa xmm3, [esp+640-560]
movdqa xmm4, xmm3
pandn xmm4, xmm5
por xmm6, xmm4
movdqa xmm7, xmm1
pand xmm7, xmm6
movdqa xmm6, [esp+640-304]
movdqa xmm4, xmm1
pandn xmm4, xmm5
por xmm7, xmm4
movdqa xmm4, xmm0
pandn xmm0, [esp+640-416]
packuswb xmm6, xmm7
movdqa xmm7, [esp+640-112]
por xmm7, [esp+640-80]
pand xmm4, xmm7
por xmm4, xmm0
movdqa xmm0, [esp+752-272]
punpckhbw xmm0, [esp+640-48]
psllw xmm0, 1
paddw xmm0, xmm2
paddw xmm0, xmm2
paddw xmm0, xmm2
paddw xmm0, xmm5
paddw xmm0, [esp+640-432]
paddw xmm0, [esp+640-496]
paddw xmm0, [esp+640-592]
psraw xmm0, 3
pand xmm0, xmm3
movdqa xmm7, xmm1
pandn xmm3, xmm2
por xmm0, xmm3
pand xmm7, xmm0
movdqa xmm0, [esp+656-272]
movdqa [edx], xmm0
movdqa xmm0, [esp+672-272]
mov edx, dword [esp+640-596]
movdqa [esi], xmm0
movdqa xmm0, [esp+688-272]
movdqa [edi], xmm0
movdqa xmm0, [esp+704-272]
pop edi
pandn xmm1, xmm2
movdqa [eax], xmm0
por xmm7, xmm1
pop esi
packuswb xmm4, xmm7
movdqa [edx], xmm6
movdqa [ecx], xmm4
pop ebx
mov esp, ebp
pop ebp
ret
;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;   Gathers the first 8 bytes of 16 consecutive rows (row k lives at
;   pPixY + k * iStride), feeds them to SSE2_TransTwo8x8B and writes the eight
;   resulting 16-byte vectors back to back at pDst (128 bytes in total).
;
;   In   : all three arguments are read from the stack.
;   Out  : nothing is returned; pDst is stored with movdqa, so it has to be
;          16-byte aligned.
;   Regs : ebx and ebp are saved and restored; eax, ecx, edx and xmm0-xmm7
;          are overwritten.
;
;********************************************************************************
WELS_EXTERN DeblockLumaTransposeH2V_sse2
ALIGN 16
DeblockLumaTransposeH2V_sse2:
    ; ---- prologue: keep ebx, remember the entry esp in ebp and carve out
    ;      one 16-byte aligned scratch slot at [esp] ----
    push        ebp
    push        ebx
    mov         ebp, esp
    and         esp, -16
    sub         esp, 16

    ; two registers were pushed above, so the first argument is at [ebp + 12]
    mov         ecx, [ebp + 16]             ; ecx = iStride
    mov         eax, [ebp + 12]             ; eax = pPixY            (row 0)
    lea         ebx, [ecx + ecx*2]          ; ebx = 3 * iStride
    lea         edx, [eax + ecx*8]          ; edx = pPixY + 8*stride (row 8)

    ; ---- every xmmN gets row N in its low qword and row N+8 in its high
    ;      qword; xmm7 is the staging register for the high half ----
    movq        xmm0, [eax]                 ; row 0
    movq        xmm7, [edx]                 ; row 8
    punpcklqdq  xmm0, xmm7

    movq        xmm1, [eax + ecx]           ; row 1
    movq        xmm7, [edx + ecx]           ; row 9
    punpcklqdq  xmm1, xmm7

    movq        xmm2, [eax + ecx*2]         ; row 2
    movq        xmm7, [edx + ecx*2]         ; row 10
    punpcklqdq  xmm2, xmm7

    movq        xmm3, [eax + ebx]           ; row 3
    movq        xmm7, [edx + ebx]           ; row 11
    punpcklqdq  xmm3, xmm7

    lea         eax, [eax + ecx*4]          ; eax -> row 4
    lea         edx, [edx + ecx*4]          ; edx -> row 12

    movq        xmm4, [eax]                 ; row 4
    movq        xmm7, [edx]                 ; row 12
    punpcklqdq  xmm4, xmm7

    movq        xmm5, [eax + ecx]           ; row 5
    movq        xmm7, [edx + ecx]           ; row 13
    punpcklqdq  xmm5, xmm7

    movq        xmm6, [eax + ecx*2]         ; row 6
    movq        xmm7, [edx + ecx*2]         ; row 14
    punpcklqdq  xmm6, xmm7

    ; xmm7 itself is the destination of the last pair, so xmm0 is parked in
    ; the scratch slot and borrowed as the staging register instead
    movdqa      [esp], xmm0
    movq        xmm7, [eax + ebx]           ; row 7
    movq        xmm0, [edx + ebx]           ; row 15
    punpcklqdq  xmm7, xmm0
    movdqa      xmm0, [esp]

    SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
    ; The macro (defined in asm_inc.asm) documents its outputs as landing in
    ; arguments 5, 3, 4, 8, 6, 2, 7, 1 -- that is xmm4, xmm2, xmm3, xmm7,
    ; xmm5, xmm1, xmm6, xmm0 -- which is the order they are stored in below.
    mov         eax, [ebp + 20]             ; eax = pDst
    movdqa      [eax],       xmm4
    movdqa      [eax + 16],  xmm2
    movdqa      [eax + 32],  xmm3
    movdqa      [eax + 48],  xmm7
    movdqa      [eax + 64],  xmm5
    movdqa      [eax + 80],  xmm1
    movdqa      [eax + 96],  xmm6
    movdqa      [eax + 112], xmm0

    ; ---- epilogue: drop the aligned scratch area, restore saved registers ----
    mov         esp, ebp
    pop         ebx
    pop         ebp
    ret
;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;   Loads eight 16-byte vectors from pSrc, feeds them to SSE2_TransTwo8x8B and
;   scatters the result as 8 bytes into each of 16 consecutive rows (row k lives
;   at pPixY + k * iStride): the low qwords of the eight result registers fill
;   rows 0-7, their high qwords fill rows 8-15.
;
;   In   : all three arguments are read from the stack.
;   Out  : nothing is returned; pSrc is read with movdqa, so it has to be
;          16-byte aligned.
;   Regs : ebp is saved and restored; eax, ecx, edx and xmm0-xmm7 are
;          overwritten.
;
;*******************************************************************************************
WELS_EXTERN DeblockLumaTransposeV2H_sse2
ALIGN 16
DeblockLumaTransposeV2H_sse2:
    ; ---- prologue: remember the entry esp in ebp and carve out one 16-byte
    ;      aligned scratch slot at [esp] for the transpose macro ----
    push        ebp
    mov         ebp, esp
    and         esp, -16
    sub         esp, 16

    mov         edx, [ebp + 8]              ; edx = pPixY (row 0)
    mov         ecx, [ebp + 12]             ; ecx = iStride
    mov         eax, [ebp + 16]             ; eax = pSrc

    movdqa      xmm0, [eax]
    movdqa      xmm1, [eax + 16]
    movdqa      xmm2, [eax + 32]
    movdqa      xmm3, [eax + 48]
    movdqa      xmm4, [eax + 64]
    movdqa      xmm5, [eax + 80]
    movdqa      xmm6, [eax + 96]
    movdqa      xmm7, [eax + 112]

    SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
    ; The macro (defined in asm_inc.asm) documents its outputs as landing in
    ; arguments 5, 3, 4, 8, 6, 2, 7, 1 -- that is xmm4, xmm2, xmm3, xmm7,
    ; xmm5, xmm1, xmm6, xmm0 -- which is the order they are written out below.
    lea         eax, [ecx + ecx*2]          ; eax = 3 * iStride (pSrc no longer needed)

    ; ---- rows 0-3: low qwords ----
    movq        [edx],         xmm4
    movq        [edx + ecx],   xmm2
    movq        [edx + ecx*2], xmm3
    movq        [edx + eax],   xmm7

    ; ---- rows 4-7: low qwords ----
    lea         edx, [edx + ecx*4]
    movq        [edx],         xmm5
    movq        [edx + ecx],   xmm1
    movq        [edx + ecx*2], xmm6
    movq        [edx + eax],   xmm0

    ; ---- rows 8-11: shift each high qword down right before storing it ----
    lea         edx, [edx + ecx*4]
    psrldq      xmm4, 8
    movq        [edx],         xmm4
    psrldq      xmm2, 8
    movq        [edx + ecx],   xmm2
    psrldq      xmm3, 8
    movq        [edx + ecx*2], xmm3
    psrldq      xmm7, 8
    movq        [edx + eax],   xmm7

    ; ---- rows 12-15: remaining high qwords ----
    lea         edx, [edx + ecx*4]
    psrldq      xmm5, 8
    movq        [edx],         xmm5
    psrldq      xmm1, 8
    movq        [edx + ecx],   xmm1
    psrldq      xmm6, 8
    movq        [edx + ecx*2], xmm6
    psrldq      xmm0, 8
    movq        [edx + eax],   xmm0

    ; ---- epilogue: drop the aligned scratch area, restore ebp ----
    mov         esp, ebp
    pop         ebp
    ret