; File: openh264/codec/common/deblock.asm  (NASM; 5326 lines, 127 KiB in the exported view)
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* deblock.asm
;*
;* Abstract
;* edge loop
;*
;* History
;* 08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Read-only constants
;*******************************************************************************
%ifdef FORMAT_COFF
SECTION .rodata pData
%else
SECTION .rodata align=16
%endif

ALIGN 16
; Eight 16-bit lanes, each holding 4. The luma "Lt4" filter adds this vector
; to its word accumulator immediately before an arithmetic shift right by 3.
FOUR_16B_SSE2:
    times 8 dw 4

;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
%ifdef WIN64
2014-01-03 07:49:45 +01:00
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockLumaLt4V_ssse3  (WIN64 build, Microsoft x64 calling convention)
;
; Inputs as consumed by the code below:
;   rcx             pixel pointer. Six 16-byte rows are loaded with movdqa
;                   from rcx-3*stride .. rcx+2*stride, so every row must be
;                   16-byte aligned.
;   edx             row stride in bytes (sign-extended to 64 bits)
;   r8d             16-bit threshold, broadcast to xmm4; compared against
;                   |row(0) - row(-1)|
;   r9d             16-bit threshold, broadcast to xmm0; compared against the
;                   remaining row differences
;                   (presumably iAlpha / iBeta -- confirm against the C prototype)
;   [rsp+28h]       pTC: pointer to 4 signed bytes; each byte is replicated
;                   over 4 consecutive columns as the per-column clip limit
;
; Output: rows at rcx-2*stride, rcx-stride, rcx and rcx+stride are rewritten.
;
; Frame: 1B0h bytes below the saved rbp, addressed through rbp = rsp+20h.
;   [rbp+000h..090h]  spill slots for xmm6-xmm15 (callee-saved on WIN64)
;   [rbp+0A0h..170h]  temporaries
;   [rbp+180h]        saved r12
;
; Fix over the previous revision: xmm6-xmm15 were spilled in the prologue but
; never reloaded, so the caller's non-volatile vector registers came back
; clobbered. They are now restored right before the frame is released.
;*******************************************************************************
WELS_EXTERN DeblockLumaLt4V_ssse3
DeblockLumaLt4V_ssse3:
    push        rbp
    mov         r11,[rsp + 16 + 20h]    ; pTC
    sub         rsp,1B0h
    lea         rbp,[rsp+20h]
    movd        xmm4,r8d
    movd        xmm2,r9d
    mov         qword [rbp+180h],r12    ; r12 is callee-saved
    mov         r10,rcx
    movsxd      r12,edx                 ; r12 = stride
    add         edx,edx
    movsxd      rdx,edx                 ; rdx = 2*stride
    sub         r10,r12                 ; r10 = rcx - stride
    movsx       r8d,byte [r11]
    pxor        xmm3,xmm3
    punpcklwd   xmm2,xmm2
    movaps      [rbp+50h],xmm14         ; spill xmm14
    lea         rax,[r12+r12*2]
    movdqa      xmm14,[rdx+rcx]         ; row +2
    neg         rax                     ; rax = -3*stride
    pshufd      xmm0,xmm2,0
    movd        xmm2,r8d
    movsx       edx,byte [r11+1]
    movsx       r8d,byte [r11+2]
    movsx       r11d,byte [r11+3]
    movaps      [rbp+70h],xmm12         ; spill xmm12
    movd        xmm1,edx
    movaps      [rbp+80h],xmm11         ; spill xmm11
    movd        xmm12,r8d
    movd        xmm11,r11d
    movdqa      xmm5,[rax+rcx]          ; row -3
    lea         rax,[r12+r12]
    punpcklwd   xmm12,xmm12
    neg         rax                     ; rax = -2*stride
    punpcklwd   xmm11,xmm11
    movaps      [rbp],xmm8              ; spill xmm8
    movdqa      xmm8,[r10]              ; row -1
    punpcklwd   xmm2,xmm2
    punpcklwd   xmm1,xmm1
    punpcklqdq  xmm12,xmm12
    punpcklqdq  xmm11,xmm11
    punpcklqdq  xmm2,xmm2
    punpcklqdq  xmm1,xmm1
    shufps      xmm12,xmm11,88h
    movdqa      xmm11,xmm8
    movaps      [rbp+30h],xmm9          ; spill xmm9
    movdqa      xmm9,[rcx]              ; row 0
    shufps      xmm2,xmm1,88h
    movdqa      xmm1,xmm5
    punpcklbw   xmm11,xmm3
    movaps      [rbp+20h],xmm6          ; spill xmm6
    movaps      [rbp+60h],xmm13         ; spill xmm13
    movdqa      xmm13,xmm11
    movaps      [rbp+90h],xmm10         ; spill xmm10
    movdqa      xmm10,xmm9
    movdqa      xmm6,[rax+rcx]          ; row -2
    punpcklbw   xmm1,xmm3
    movaps      [rbp+0A0h],xmm12        ; clip limits for the high 8 columns
    psubw       xmm13,xmm1
    movaps      [rbp+40h],xmm15         ; spill xmm15
    movdqa      xmm15,xmm14
    movaps      [rbp+10h],xmm7          ; spill xmm7
    movdqa      xmm7,xmm6
    punpcklbw   xmm10,xmm3
    movdqa      xmm12,[r12+rcx]         ; row +1
    punpcklbw   xmm7,xmm3
    punpcklbw   xmm12,xmm3
    punpcklbw   xmm15,xmm3
    pabsw       xmm3,xmm13
    movdqa      xmm13,xmm10
    psubw       xmm13,xmm15
    movdqa      [rbp+0F0h],xmm15
    pabsw       xmm15,xmm13
    movdqa      xmm13,xmm11
    movdqa      [rbp+0B0h],xmm1
    movdqa      xmm1,xmm0
    pavgw       xmm13,xmm10
    pcmpgtw     xmm1,xmm3
    movdqa      [rbp+120h],xmm13
    movaps      xmm13,xmm2
    punpcklwd   xmm4,xmm4
    movdqa      xmm3,xmm0
    movdqa      [rbp+100h],xmm1
    psubw       xmm13,xmm1
    movdqa      xmm1,xmm10
    pcmpgtw     xmm3,xmm15
    pshufd      xmm4,xmm4,0
    psubw       xmm1,xmm11
    movdqa      [rbp+0D0h],xmm10
    psubw       xmm13,xmm3
    movdqa      [rbp+110h],xmm3
    pabsw       xmm15,xmm1
    movdqa      xmm3,xmm4
    psubw       xmm10,xmm12
    pcmpgtw     xmm3,xmm15
    pabsw       xmm15,xmm10
    movdqa      xmm10,xmm0
    psllw       xmm1,2
    movdqa      [rbp+0C0h],xmm11
    psubw       xmm11,xmm7
    pcmpgtw     xmm10,xmm15
    pabsw       xmm11,xmm11
    movdqa      xmm15,xmm0
    pand        xmm3,xmm10
    pcmpgtw     xmm15,xmm11
    movaps      xmm11,xmm2
    pxor        xmm10,xmm10
    pand        xmm3,xmm15
    pcmpgtw     xmm11,xmm10
    pcmpeqw     xmm10,xmm2
    por         xmm11,xmm10             ; lanes whose clip limit is >= 0
    pand        xmm3,xmm11
    movdqa      xmm11,xmm7
    psubw       xmm11,xmm12
    pxor        xmm15,xmm15
    paddw       xmm11,xmm1
    psubw       xmm15,xmm13
    movdqa      [rbp+0E0h],xmm12
    paddw       xmm11,[FOUR_16B_SSE2]
    pxor        xmm12,xmm12
    psraw       xmm11,3
    punpckhbw   xmm8,xmm12
    pmaxsw      xmm15,xmm11
    punpckhbw   xmm5,xmm12
    movdqa      xmm11,xmm8
    pminsw      xmm13,xmm15
    psubw       xmm11,xmm5
    punpckhbw   xmm9,xmm12
    pand        xmm13,xmm3
    movdqa      [rbp+130h],xmm13
    pabsw       xmm13,xmm11
    punpckhbw   xmm14,xmm12
    movdqa      xmm11,xmm9
    psubw       xmm11,xmm14
    movdqa      xmm15,xmm0
    movdqa      [rbp+140h],xmm14
    pabsw       xmm14,xmm11
    movdqa      xmm11,xmm8
    pcmpgtw     xmm15,xmm14
    movdqa      xmm1,[r12+rcx]
    pavgw       xmm11,xmm9
    movdqa      [rbp+170h],xmm11
    movdqa      xmm10,xmm9
    punpckhbw   xmm6,xmm12
    psubw       xmm10,xmm8
    punpckhbw   xmm1,xmm12
    movdqa      xmm12,xmm0
    movaps      xmm11,[rbp+0A0h]
    pcmpgtw     xmm12,xmm13
    movaps      xmm13,xmm11
    psubw       xmm13,xmm12
    movdqa      [rbp+160h],xmm15
    psubw       xmm13,xmm15
    movdqa      xmm15,xmm9
    psubw       xmm15,xmm1
    movdqa      [rbp+150h],xmm12
    pabsw       xmm12,xmm10
    pabsw       xmm14,xmm15
    movdqa      xmm15,xmm8
    pcmpgtw     xmm4,xmm12
    movdqa      xmm12,xmm0
    psubw       xmm15,xmm6
    pcmpgtw     xmm12,xmm14
    pabsw       xmm14,xmm15
    psllw       xmm10,2
    pcmpgtw     xmm0,xmm14
    movdqa      xmm14,xmm6
    psubw       xmm14,xmm1
    pand        xmm4,xmm12
    paddw       xmm14,xmm10
    pand        xmm4,xmm0
    paddw       xmm14,[FOUR_16B_SSE2]
    pxor        xmm15,xmm15
    movaps      xmm12,xmm11
    psubw       xmm15,xmm13
    pxor        xmm0,xmm0
    psraw       xmm14,3
    pcmpgtw     xmm12,xmm0
    pcmpeqw     xmm0,xmm11
    pmaxsw      xmm15,xmm14
    por         xmm12,xmm0
    movdqa      xmm0,[rbp+120h]
    pminsw      xmm13,xmm15
    movdqa      xmm15,[rbp+0B0h]
    movdqa      xmm10,xmm7
    pand        xmm4,xmm12
    paddw       xmm15,xmm0
    pxor        xmm12,xmm12
    paddw       xmm10,xmm7
    movdqa      xmm14,xmm12
    psubw       xmm15,xmm10
    psubw       xmm14,xmm2
    psraw       xmm15,1
    pmaxsw      xmm15,xmm14
    movdqa      xmm10,xmm6
    pminsw      xmm15,xmm2
    paddw       xmm10,xmm6
    pand        xmm15,xmm3
    psubw       xmm12,xmm11
    pand        xmm15,[rbp+100h]
    pand        xmm13,xmm4
    paddw       xmm7,xmm15
    paddw       xmm8,xmm13
    movdqa      xmm15,[rbp+170h]
    psubw       xmm9,xmm13
    paddw       xmm5,xmm15
    psubw       xmm5,xmm10
    psraw       xmm5,1
    pmaxsw      xmm5,xmm12
    pminsw      xmm5,xmm11
    pand        xmm5,xmm4
    pand        xmm5,[rbp+150h]
    paddw       xmm6,xmm5
    movdqa      xmm5,[rbp+0C0h]
    packuswb    xmm7,xmm6
    movdqa      xmm6,[rbp+130h]
    paddw       xmm5,xmm6
    packuswb    xmm5,xmm8
    movdqa      xmm8,[rbp+0D0h]
    psubw       xmm8,xmm6
    movdqa      xmm6,[rbp+0F0h]
    paddw       xmm6,xmm0
    movdqa      xmm0,[rbp+0E0h]
    packuswb    xmm8,xmm9
    movdqa      xmm9,xmm0
    paddw       xmm9,xmm0
    psubw       xmm6,xmm9
    psraw       xmm6,1
    pmaxsw      xmm14,xmm6
    pminsw      xmm2,xmm14
    pand        xmm2,xmm3
    pand        xmm2,[rbp+110h]
    paddw       xmm0,xmm2
    movdqa      xmm2,[rbp+140h]
    paddw       xmm2,xmm15
    movdqa      xmm15,xmm1
    paddw       xmm15,xmm1
    psubw       xmm2,xmm15
    psraw       xmm2,1
    pmaxsw      xmm12,xmm2
    pminsw      xmm11,xmm12
    pand        xmm11,xmm4
    pand        xmm11,[rbp+160h]
    paddw       xmm1,xmm11
    movdqa      [rax+rcx],xmm7          ; row -2
    movdqa      [r10],xmm5              ; row -1
    packuswb    xmm0,xmm1
    movdqa      [rcx],xmm8              ; row 0
    movdqa      [r12+rcx],xmm0          ; row +1
    ; Reload the WIN64 callee-saved vector registers from their spill slots.
    movaps      xmm6,[rbp+20h]
    movaps      xmm7,[rbp+10h]
    movaps      xmm8,[rbp]
    movaps      xmm9,[rbp+30h]
    movaps      xmm10,[rbp+90h]
    movaps      xmm11,[rbp+80h]
    movaps      xmm12,[rbp+70h]
    movaps      xmm13,[rbp+60h]
    movaps      xmm14,[rbp+50h]
    movaps      xmm15,[rbp+40h]
    mov         r12,qword [rbp+180h]
    lea         rsp,[rbp+190h]          ; release the 1B0h-byte frame
    pop         rbp
    ret
2014-01-03 07:49:45 +01:00
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockLumaEq4V_ssse3  (WIN64 build, Microsoft x64 calling convention)
;
; Inputs as consumed by the code below:
;   rcx     pixel pointer (kept in rbp). Eight 16-byte rows are loaded with
;           movdqa from rcx-4*stride .. rcx+3*stride, so rows must be
;           16-byte aligned.
;   edx     row stride in bytes (sign-extended into r10)
;   r8w     16-bit threshold, broadcast to xmm11; xmm10 holds (it >> 2) + 2
;   r9w     16-bit threshold, broadcast to xmm7
;           (presumably iAlpha / iBeta -- confirm against the C prototype)
;
; Output: the six rows rcx-3*stride .. rcx+2*stride are rewritten.
;
; Frame: rbx, rbp, rsi, rdi are pushed, then 1D8h bytes are reserved.
;   [rsp+000h..12Fh]         temporaries
;   [rax-0C8h..rax-38h]      spill slots for xmm15..xmm6 (rax = entry rsp);
;                            the same slots are reloaded through r11 in the
;                            epilogue.
;
; This revision keeps the instruction stream unchanged; it only removes stray
; non-NASM text that had been embedded in the routine and adds documentation.
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
DeblockLumaEq4V_ssse3:
    mov         rax,rsp
    push        rbx
    push        rbp
    push        rsi
    push        rdi
    sub         rsp,1D8h
    movaps      [rax-38h],xmm6          ; spill callee-saved xmm6-xmm15
    movaps      [rax-48h],xmm7
    movaps      [rax-58h],xmm8
    pxor        xmm1,xmm1
    movsxd      r10,edx                 ; r10 = stride
    mov         rbp,rcx
    mov         r11d,r8d
    mov         rdx,rcx
    mov         rdi,rbp
    mov         rbx,rbp
    movdqa      xmm5,[rbp]              ; row 0
    movaps      [rax-68h],xmm9
    movaps      [rax-78h],xmm10
    punpcklbw   xmm5,xmm1
    movaps      [rax-88h],xmm11
    movaps      [rax-98h],xmm12
    movaps      [rax-0A8h],xmm13
    movaps      [rax-0B8h],xmm14
    movdqa      xmm14,[r10+rbp]         ; row +1
    movaps      [rax-0C8h],xmm15
    lea         eax,[r10*4]
    movsxd      r8,eax
    lea         eax,[r10+r10*2]
    movsxd      rcx,eax                 ; rcx = 3*stride
    lea         eax,[r10+r10]
    sub         rdx,r8                  ; rdx = pix - 4*stride
    punpcklbw   xmm14,xmm1
    movdqa      [rsp+90h],xmm5
    movdqa      [rsp+30h],xmm14
    movsxd      rsi,eax                 ; rsi = 2*stride
    movsx       eax,r11w
    sub         rdi,rcx                 ; rdi = pix - 3*stride
    sub         rbx,rsi                 ; rbx = pix - 2*stride
    mov         r8,rbp
    sub         r8,r10                  ; r8  = pix - stride
    movd        xmm0,eax
    movsx       eax,r9w
    movdqa      xmm12,[rdi]
    movdqa      xmm6,[rsi+rbp]
    movdqa      xmm13,[rbx]
    punpcklwd   xmm0,xmm0
    pshufd      xmm11,xmm0,0
    punpcklbw   xmm13,xmm1
    punpcklbw   xmm6,xmm1
    movdqa      xmm8,[r8]
    movd        xmm0,eax
    movdqa      xmm10,xmm11
    mov         eax,2
    punpcklbw   xmm8,xmm1
    punpcklbw   xmm12,xmm1
    cwde
    punpcklwd   xmm0,xmm0
    psraw       xmm10,2
    movdqa      xmm1,xmm8
    movdqa      [rsp+0F0h],xmm13
    movdqa      [rsp+0B0h],xmm8
    pshufd      xmm7,xmm0,0
    psubw       xmm1,xmm13
    movdqa      xmm0,xmm5
    movdqa      xmm4,xmm7
    movdqa      xmm2,xmm7
    psubw       xmm0,xmm8
    pabsw       xmm3,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm5
    movdqa      [rsp+40h],xmm7
    movdqa      [rsp+60h],xmm6
    pcmpgtw     xmm4,xmm0
    psubw       xmm1,xmm14
    pabsw       xmm0,xmm1
    pcmpgtw     xmm2,xmm0
    pand        xmm4,xmm2
    movdqa      xmm0,xmm11
    pcmpgtw     xmm0,xmm3
    pand        xmm4,xmm0
    movd        xmm0,eax
    movdqa      [rsp+20h],xmm4
    punpcklwd   xmm0,xmm0
    pshufd      xmm2,xmm0,0             ; eight words of 2
    paddw       xmm10,xmm2
    movdqa      [rsp+0A0h],xmm2
    movdqa      xmm15,xmm7
    pxor        xmm4,xmm4
    movdqa      xmm0,xmm8
    psubw       xmm0,xmm12
    mov         eax,4
    pabsw       xmm0,xmm0
    movdqa      xmm1,xmm10
    cwde
    pcmpgtw     xmm15,xmm0
    pcmpgtw     xmm1,xmm3
    movdqa      xmm3,xmm7
    movdqa      xmm7,[rdx]              ; row -4
    movdqa      xmm0,xmm5
    psubw       xmm0,xmm6
    pand        xmm15,xmm1
    punpcklbw   xmm7,xmm4
    movdqa      xmm9,xmm15
    pabsw       xmm0,xmm0
    psllw       xmm7,1
    pandn       xmm9,xmm12
    pcmpgtw     xmm3,xmm0
    paddw       xmm7,xmm12
    movd        xmm0,eax
    pand        xmm3,xmm1
    paddw       xmm7,xmm12
    punpcklwd   xmm0,xmm0
    paddw       xmm7,xmm12
    pshufd      xmm1,xmm0,0             ; eight words of 4
    paddw       xmm7,xmm13
    movdqa      xmm0,xmm3
    pandn       xmm0,xmm6
    paddw       xmm7,xmm8
    movdqa      [rsp+70h],xmm1
    paddw       xmm7,xmm5
    movdqa      [rsp+120h],xmm0
    movdqa      xmm0,[rcx+rbp]          ; row +3
    punpcklbw   xmm0,xmm4
    paddw       xmm7,xmm1
    movdqa      xmm4,xmm15
    psllw       xmm0,1
    psraw       xmm7,3
    paddw       xmm0,xmm6
    pand        xmm7,xmm15
    paddw       xmm0,xmm6
    paddw       xmm0,xmm6
    paddw       xmm0,xmm14
    movdqa      xmm6,xmm15
    paddw       xmm0,xmm5
    pandn       xmm6,xmm13
    paddw       xmm0,xmm8
    paddw       xmm0,xmm1
    psraw       xmm0,3
    movdqa      xmm1,xmm12
    paddw       xmm1,xmm13
    pand        xmm0,xmm3
    movdqa      [rsp+100h],xmm0
    movdqa      xmm0,xmm8
    paddw       xmm0,xmm5
    paddw       xmm1,xmm0
    movdqa      xmm0,xmm3
    paddw       xmm1,xmm2
    psraw       xmm1,2
    pandn       xmm0,xmm14
    pand        xmm4,xmm1
    movdqa      [rsp+0E0h],xmm0
    movdqa      xmm0,xmm5
    paddw       xmm0,xmm8
    movdqa      xmm1,[rsp+60h]
    paddw       xmm1,xmm14
    movdqa      xmm14,xmm3
    paddw       xmm1,xmm0
    movdqa      xmm0,xmm8
    paddw       xmm0,[rsp+30h]
    paddw       xmm1,xmm2
    psraw       xmm1,2
    pand        xmm14,xmm1
    movdqa      xmm1,xmm13
    paddw       xmm1,xmm13
    paddw       xmm1,xmm0
    paddw       xmm1,xmm2
    psraw       xmm1,2
    movdqa      xmm0,[rsp+30h]
    movdqa      xmm2,xmm13
    movdqa      xmm5,xmm15
    paddw       xmm0,[rsp+70h]
    pandn       xmm5,xmm1
    paddw       xmm2,xmm8
    movdqa      xmm8,[rsp+90h]
    movdqa      xmm1,xmm12
    paddw       xmm2,xmm8
    psllw       xmm2,1
    paddw       xmm2,xmm0
    paddw       xmm1,xmm2
    movdqa      xmm0,xmm8
    movdqa      xmm8,xmm3
    movdqa      xmm2,[rsp+30h]
    paddw       xmm0,xmm13
    psraw       xmm1,3
    pand        xmm15,xmm1
    movdqa      xmm1,xmm2
    paddw       xmm1,xmm2
    paddw       xmm2,[rsp+90h]
    paddw       xmm2,[rsp+0B0h]
    paddw       xmm1,xmm0
    movdqa      xmm0,xmm13
    movdqa      xmm13,[r8]
    paddw       xmm0,[rsp+70h]
    paddw       xmm1,[rsp+0A0h]
    psllw       xmm2,1
    paddw       xmm2,xmm0
    psraw       xmm1,2
    movdqa      xmm0,[rdi]
    pandn       xmm8,xmm1
    movdqa      xmm1,[rsp+60h]
    paddw       xmm1,xmm2
    movdqa      xmm2,[rbx]
    psraw       xmm1,3
    pand        xmm3,xmm1
    movdqa      xmm1,[rbp]
    movdqa      [rsp+0D0h],xmm3
    ; ---- high 8 columns: the same rows are reloaded and unpacked with punpckhbw
    pxor        xmm3,xmm3
    punpckhbw   xmm0,xmm3
    punpckhbw   xmm1,xmm3
    punpckhbw   xmm13,xmm3
    movdqa      [rsp+0C0h],xmm0
    movdqa      xmm0,[r10+rbp]
    movdqa      [rsp],xmm1
    punpckhbw   xmm0,xmm3
    punpckhbw   xmm2,xmm3
    movdqa      [rsp+80h],xmm0
    movdqa      xmm0,[rsi+rbp]
    movdqa      [rsp+10h],xmm13
    punpckhbw   xmm0,xmm3
    movdqa      [rsp+50h],xmm0
    movdqa      xmm0,xmm1
    movdqa      xmm1,xmm13
    psubw       xmm0,xmm13
    psubw       xmm1,xmm2
    pabsw       xmm3,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,[rsp]
    movdqa      xmm13,[rsp+40h]
    movdqa      [rsp+110h],xmm2
    psubw       xmm1,[rsp+80h]
    pcmpgtw     xmm13,xmm0
    pcmpgtw     xmm11,xmm3
    pabsw       xmm0,xmm1
    pcmpgtw     xmm10,xmm3
    movdqa      xmm1,[rsp+40h]
    movdqa      xmm2,xmm1
    movdqa      xmm3,xmm1
    pcmpgtw     xmm2,xmm0
    movdqa      xmm0,[rsp+10h]
    pand        xmm13,xmm2
    pand        xmm13,xmm11
    movdqa      xmm11,[rsp+0C0h]
    psubw       xmm0,xmm11
    pabsw       xmm0,xmm0
    pcmpgtw     xmm3,xmm0
    pand        xmm3,xmm10
    movdqa      xmm0,[rsp]
    psubw       xmm0,[rsp+50h]
    movdqa      xmm2,[rdx]
    pabsw       xmm0,xmm0
    por         xmm7,xmm9
    movdqa      xmm9,[rsp+20h]
    pcmpgtw     xmm1,xmm0
    pand        xmm9,xmm7
    movdqa      xmm7,[rsp+20h]
    movdqa      xmm0,xmm7
    pandn       xmm0,xmm12
    movdqa      xmm12,[rsp+110h]
    pand        xmm1,xmm10
    movdqa      xmm10,[rsp+70h]
    movdqa      [rsp+40h],xmm1
    movdqa      xmm1,xmm13
    por         xmm9,xmm0
    pxor        xmm0,xmm0
    por         xmm4,xmm6
    movdqa      xmm6,xmm7
    punpckhbw   xmm2,xmm0
    por         xmm15,xmm5
    movdqa      xmm5,[rsp+20h]
    movdqa      xmm0,xmm3
    psllw       xmm2,1
    pandn       xmm0,xmm11
    pand        xmm6,xmm4
    movdqa      xmm4,[rsp]
    paddw       xmm2,xmm11
    pand        xmm5,xmm15
    movdqa      xmm15,[rsp+20h]
    paddw       xmm2,xmm11
    paddw       xmm2,xmm11
    paddw       xmm2,xmm12
    paddw       xmm2,[rsp+10h]
    paddw       xmm2,[rsp]
    paddw       xmm2,xmm10
    psraw       xmm2,3
    pand        xmm2,xmm3
    por         xmm2,xmm0
    pand        xmm1,xmm2
    movdqa      xmm0,xmm13
    movdqa      xmm2,xmm11
    pandn       xmm0,xmm11
    paddw       xmm2,xmm12
    por         xmm1,xmm0
    packuswb    xmm9,xmm1
    movdqa      xmm0,xmm7
    movdqa      xmm7,[rsp+0A0h]
    pandn       xmm0,[rsp+0F0h]
    movdqa      xmm1,xmm3
    por         xmm6,xmm0
    movdqa      xmm0,[rsp+10h]
    paddw       xmm0,xmm4
    paddw       xmm2,xmm0
    paddw       xmm2,xmm7
    movdqa      xmm0,xmm3
    pandn       xmm0,xmm12
    psraw       xmm2,2
    pand        xmm1,xmm2
    por         xmm1,xmm0
    movdqa      xmm2,xmm13
    movdqa      xmm0,xmm13
    pand        xmm2,xmm1
    pandn       xmm0,xmm12
    movdqa      xmm1,xmm12
    paddw       xmm1,[rsp+10h]
    por         xmm2,xmm0
    movdqa      xmm0,xmm15
    pandn       xmm0,[rsp+0B0h]
    paddw       xmm1,xmm4
    packuswb    xmm6,xmm2
    movdqa      xmm2,xmm3
    psllw       xmm1,1
    por         xmm5,xmm0
    movdqa      xmm0,[rsp+80h]
    paddw       xmm0,xmm10
    paddw       xmm1,xmm0
    paddw       xmm11,xmm1
    psraw       xmm11,3
    movdqa      xmm1,xmm12
    pand        xmm2,xmm11
    paddw       xmm1,xmm12
    movdqa      xmm11,[rsp+80h]
    movdqa      xmm0,[rsp+10h]
    por         xmm14,[rsp+0E0h]
    paddw       xmm0,xmm11
    movdqa      xmm4,xmm15
    paddw       xmm1,xmm0
    movdqa      xmm0,xmm13
    paddw       xmm1,xmm7
    psraw       xmm1,2
    pandn       xmm3,xmm1
    por         xmm2,xmm3
    movdqa      xmm1,xmm13
    movdqa      xmm3,[rsp+10h]
    pandn       xmm0,xmm3
    pand        xmm1,xmm2
    movdqa      xmm2,xmm11
    paddw       xmm2,[rsp]
    por         xmm1,xmm0
    movdqa      xmm0,[rsp+0D0h]
    por         xmm0,xmm8
    paddw       xmm2,xmm3
    packuswb    xmm5,xmm1
    movdqa      xmm8,[rsp+40h]
    movdqa      xmm1,[rsp+50h]
    movdqa      xmm3,xmm8
    pand        xmm4,xmm0
    psllw       xmm2,1
    movdqa      xmm0,xmm15
    pandn       xmm0,[rsp+90h]
    por         xmm4,xmm0
    movdqa      xmm0,xmm12
    paddw       xmm0,xmm10
    paddw       xmm2,xmm0
    paddw       xmm1,xmm2
    movdqa      xmm0,[rsp]
    movdqa      xmm2,xmm11
    paddw       xmm0,xmm12
    movdqa      xmm12,[rsp]
    paddw       xmm2,xmm11
    paddw       xmm2,xmm0
    psraw       xmm1,3
    movdqa      xmm0,xmm8
    pand        xmm3,xmm1
    paddw       xmm2,xmm7
    movdqa      xmm1,xmm13
    psraw       xmm2,2
    pandn       xmm0,xmm2
    por         xmm3,xmm0
    movdqa      xmm2,[rsp+50h]
    movdqa      xmm0,xmm13
    pandn       xmm0,xmm12
    pand        xmm1,xmm3
    paddw       xmm2,xmm11
    movdqa      xmm3,xmm15
    por         xmm1,xmm0
    pand        xmm3,xmm14
    movdqa      xmm14,[rsp+10h]
    movdqa      xmm0,xmm15
    pandn       xmm0,[rsp+30h]
    packuswb    xmm4,xmm1
    movdqa      xmm1,xmm8
    por         xmm3,xmm0
    movdqa      xmm0,xmm12
    paddw       xmm0,xmm14
    paddw       xmm2,xmm0
    paddw       xmm2,xmm7
    movdqa      xmm0,xmm8
    pandn       xmm0,xmm11
    psraw       xmm2,2
    pand        xmm1,xmm2
    por         xmm1,xmm0
    movdqa      xmm2,xmm13
    movdqa      xmm0,xmm13
    pandn       xmm0,xmm11
    pand        xmm2,xmm1
    movdqa      xmm1,xmm15
    por         xmm2,xmm0
    packuswb    xmm3,xmm2
    movdqa      xmm0,[rsp+100h]
    por         xmm0,[rsp+120h]
    pand        xmm1,xmm0
    movdqa      xmm2,[rcx+rbp]
    movdqa      xmm7,[rsp+50h]
    pandn       xmm15,[rsp+60h]
    lea         r11,[rsp+1D8h]          ; r11 = rsp value just after the pushes
    pxor        xmm0,xmm0
    por         xmm1,xmm15
    movaps      xmm15,[r11-0A8h]        ; restore xmm15
    movdqa      [rdi],xmm9              ; row -3
    movaps      xmm9,[r11-48h]          ; restore xmm9
    punpckhbw   xmm2,xmm0
    psllw       xmm2,1
    paddw       xmm2,xmm7
    paddw       xmm2,xmm7
    movdqa      [rbx],xmm6              ; row -2
    movaps      xmm6,[r11-18h]          ; restore xmm6
    paddw       xmm2,xmm7
    paddw       xmm2,xmm11
    movaps      xmm11,[r11-68h]         ; restore xmm11
    paddw       xmm2,xmm12
    movaps      xmm12,[r11-78h]         ; restore xmm12
    paddw       xmm2,xmm14
    paddw       xmm2,xmm10
    psraw       xmm2,3
    movaps      xmm10,[r11-58h]         ; restore xmm10
    movaps      xmm14,[r11-98h]         ; restore xmm14
    movdqa      xmm0,xmm13
    pand        xmm2,xmm8
    pandn       xmm8,xmm7
    pandn       xmm13,xmm7
    por         xmm2,xmm8
    movaps      xmm7,[r11-28h]          ; restore xmm7
    movaps      xmm8,[r11-38h]          ; restore xmm8
    movdqa      [r8],xmm5               ; row -1
    pand        xmm0,xmm2
    por         xmm0,xmm13
    packuswb    xmm1,xmm0
    movaps      xmm13,[r11-88h]         ; restore xmm13
    movdqa      [rbp],xmm4              ; row 0
    movdqa      [r10+rbp],xmm3          ; row +1
    movdqa      [rsi+rbp],xmm1          ; row +2
    mov         rsp,r11
    pop         rdi
    pop         rsi
    pop         rbp
    pop         rbx
    ret
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockChromaLt4V_ssse3  (WIN64 build, Microsoft x64 calling convention)
;
; Inputs as consumed by the code below:
;   rcx          first plane pointer  (kept in rbx)
;   rdx          second plane pointer (kept in rdi)
;                (presumably Cb / Cr -- confirm against the C prototype)
;   r8d          row stride in bytes
;   r9w          16-bit threshold, broadcast to xmm10
;   5th argument iBeta (read as a word from the caller's stack area)
;   6th argument pTC: pointer to 4 signed bytes, each duplicated into two
;                adjacent word lanes at [rsp+0..0Fh]
;
; For each plane, 8 bytes are read (movq) from rows -2, -1, 0 and +1 relative
; to the plane pointer; rows -1 and 0 are rewritten.
;
; Frame: rbx and rdi are pushed, then 0C8h bytes are reserved.
;   [rsp+00h..1Fh]   temporaries
;   [rsp+20h..0BFh]  spill slots for xmm15..xmm6
;
; Fix over the previous revision: the body reloaded xmm6-xmm15 from the spill
; slots but nothing ever stored them there, so the caller's callee-saved
; vector registers were replaced with uninitialised stack contents. The
; matching stores are now issued right after the frame is allocated.
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4V_ssse3
ALIGN 16
DeblockChromaLt4V_ssse3:
    mov         rax,rsp
    push        rbx
    push        rdi
    sub         rsp,0C8h
    ; Spill the WIN64 callee-saved vector registers. The offsets match the
    ; reloads further down (r11 = rsp+0C8h there).
    movaps      [rsp+0B0h],xmm6         ; = [r11-18h]
    movaps      [rsp+0A0h],xmm7         ; = [r11-28h]
    movaps      [rsp+90h],xmm8          ; = [r11-38h]
    movaps      [rsp+80h],xmm9          ; = [r11-48h]
    movaps      [rsp+70h],xmm10         ; = [r11-58h]
    movaps      [rsp+60h],xmm11         ; = [r11-68h]
    movaps      [rsp+50h],xmm12         ; = [r11-78h]
    movaps      [rsp+40h],xmm13
    movaps      [rsp+30h],xmm14
    movaps      [rsp+20h],xmm15
    mov         r10,qword [rax + 30h]   ; pTC
    pxor        xmm1,xmm1
    mov         rbx,rcx
    movsxd      r11,r8d                 ; r11 = stride
    movsx       ecx,byte [r10]
    movsx       r8d,byte [r10+2]
    mov         rdi,rdx
    movq        xmm2,[rbx]
    movq        xmm9,[r11+rbx]
    movsx       edx,byte [r10+1]
    mov         word [rsp+2],cx
    mov         word [rsp],cx
    movsx       eax,byte [r10+3]
    mov         word [rsp+6],dx
    mov         word [rsp+4],dx
    movdqa      xmm11,xmm1
    mov         word [rsp+0Eh],ax
    mov         word [rsp+0Ch],ax
    lea         eax,[r11+r11]
    movsxd      rcx,eax                 ; rcx = 2*stride
    mov         rax,rbx
    mov         rdx,rdi
    sub         rax,rcx
    mov         word [rsp+0Ah],r8w
    mov         word [rsp+8],r8w
    movdqa      xmm6,[rsp]              ; clip limits as 8 words
    movdqa      xmm7,xmm6
    movq        xmm13,[rax]
    mov         rax,rdi
    sub         rax,rcx
    mov         rcx,rbx
    pcmpgtw     xmm7,xmm1
    psubw       xmm11,xmm6
    sub         rcx,r11                 ; rcx = first plane, row -1
    sub         rdx,r11                 ; rdx = second plane, row -1
    movq        xmm0,[rax]
    movsx       eax,r9w
    movq        xmm15,[rcx]
    punpcklqdq  xmm13,xmm0
    movq        xmm0,[rdx]
    movdqa      xmm4,xmm13
    punpcklqdq  xmm15,xmm0
    movq        xmm0,[rdi]
    punpcklbw   xmm4,xmm1
    movdqa      xmm12,xmm15
    punpcklqdq  xmm2,xmm0
    movq        xmm0,[r11+rdi]
    punpcklbw   xmm12,xmm1
    movdqa      xmm14,xmm2
    punpcklqdq  xmm9,xmm0
    punpckhbw   xmm2,xmm1
    punpcklbw   xmm14,xmm1
    movd        xmm0,eax
    movsx       eax,word [rsp + 0C8h + 38h] ; iBeta
    punpckhbw   xmm13,xmm1
    punpckhbw   xmm15,xmm1
    movdqa      xmm3,xmm9
    movdqa      [rsp+10h],xmm2
    punpcklwd   xmm0,xmm0
    punpckhbw   xmm9,xmm1
    punpcklbw   xmm3,xmm1
    movdqa      xmm1,xmm14
    pshufd      xmm10,xmm0,0
    movd        xmm0,eax
    mov         eax,4
    cwde
    punpcklwd   xmm0,xmm0
    pshufd      xmm8,xmm0,0
    movd        xmm0,eax
    punpcklwd   xmm0,xmm0
    pshufd      xmm5,xmm0,0             ; eight words of 4
    psubw       xmm1,xmm12
    movdqa      xmm2,xmm10
    lea         r11,[rsp+0C8h]          ; r11 = rsp value just after the pushes
    psllw       xmm1,2
    movdqa      xmm0,xmm4
    psubw       xmm4,xmm12
    psubw       xmm0,xmm3
    psubw       xmm3,xmm14
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm11
    psraw       xmm1,3
    pmaxsw      xmm0,xmm1
    pminsw      xmm6,xmm0
    movdqa      xmm1,xmm8
    movdqa      xmm0,xmm12
    psubw       xmm0,xmm14
    pabsw       xmm0,xmm0
    pcmpgtw     xmm2,xmm0
    pabsw       xmm0,xmm4
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm3
    movdqa      xmm3,[rsp]
    pand        xmm2,xmm1
    movdqa      xmm1,xmm8
    pcmpgtw     xmm1,xmm0
    movdqa      xmm0,xmm13
    pand        xmm2,xmm1
    psubw       xmm0,xmm9
    psubw       xmm13,xmm15
    pand        xmm2,xmm7
    pand        xmm6,xmm2
    paddw       xmm12,xmm6
    psubw       xmm14,xmm6
    movdqa      xmm2,[rsp+10h]
    movaps      xmm6,[r11-18h]          ; restore xmm6
    movdqa      xmm1,xmm2
    psubw       xmm1,xmm15
    psubw       xmm9,xmm2
    psllw       xmm1,2
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm15
    psubw       xmm0,xmm2
    psraw       xmm1,3
    pmaxsw      xmm11,xmm1
    pabsw       xmm0,xmm0
    movdqa      xmm1,xmm8
    pcmpgtw     xmm10,xmm0
    pabsw       xmm0,xmm13
    pminsw      xmm3,xmm11
    movaps      xmm11,[r11-68h]         ; restore xmm11
    movaps      xmm13,[rsp+40h]         ; restore xmm13
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm9
    movaps      xmm9,[r11-48h]          ; restore xmm9
    pand        xmm10,xmm1
    pcmpgtw     xmm8,xmm0
    pand        xmm10,xmm8
    pand        xmm10,xmm7
    movaps      xmm8,[r11-38h]          ; restore xmm8
    movaps      xmm7,[r11-28h]          ; restore xmm7
    pand        xmm3,xmm10
    paddw       xmm15,xmm3
    psubw       xmm2,xmm3
    movaps      xmm10,[r11-58h]         ; restore xmm10
    packuswb    xmm12,xmm15
    movaps      xmm15,[rsp+20h]         ; restore xmm15
    packuswb    xmm14,xmm2
    movq        [rcx],xmm12             ; first plane, row -1
    movq        [rbx],xmm14             ; first plane, row 0
    psrldq      xmm12,8
    psrldq      xmm14,8
    movq        [rdx],xmm12             ; second plane, row -1
    movaps      xmm12,[r11-78h]         ; restore xmm12
    movq        [rdi],xmm14             ; second plane, row 0
    movaps      xmm14,[rsp+30h]         ; restore xmm14
    mov         rsp,r11
    pop         rdi
    pop         rbx
    ret
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockChromaEq4V_ssse3  (WIN64 build, Microsoft x64 calling convention)
;
; Inputs as consumed by the code below:
;   rcx          first plane pointer  (kept in r11)
;   rdx          second plane pointer (kept in rbx)
;                (presumably Cb / Cr -- confirm against the C prototype)
;   r8d          row stride in bytes
;   r9w          16-bit threshold, broadcast to xmm11
;   5th argument iBeta (read as a word from the caller's stack area)
;
; For each plane, 8 bytes are read (movq) from rows -2, -1, 0 and +1 relative
; to the plane pointer; rows -1 and 0 are rewritten.
;
; Frame: rbx is pushed, then 90h bytes are reserved and used only as spill
; slots for xmm14..xmm6 at [rsp+00h..8Fh]. xmm15 is not touched.
;
; Fix over the previous revision: the body reloaded xmm6-xmm14 from the spill
; slots but nothing ever stored them there, so the caller's callee-saved
; vector registers were replaced with uninitialised stack contents. The
; matching stores are now issued right after the frame is allocated.
;*******************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3
ALIGN 16
DeblockChromaEq4V_ssse3:
    mov         rax,rsp
    push        rbx
    sub         rsp,90h
    ; Spill the WIN64 callee-saved vector registers. The offsets match the
    ; reloads further down (r11 = rsp+90h there).
    movaps      [rsp+80h],xmm6          ; = [r11-10h]
    movaps      [rsp+70h],xmm7
    movaps      [rsp+60h],xmm8          ; = [r11-30h]
    movaps      [rsp+50h],xmm9          ; = [r11-40h]
    movaps      [rsp+40h],xmm10         ; = [r11-50h]
    movaps      [rsp+30h],xmm11         ; = [r11-60h]
    movaps      [rsp+20h],xmm12         ; = [r11-70h]
    movaps      [rsp+10h],xmm13         ; = [r11-80h]
    movaps      [rsp],xmm14
    pxor        xmm1,xmm1
    mov         r11,rcx
    mov         rbx,rdx
    mov         r10d,r9d
    movq        xmm13,[r11]
    lea         eax,[r8+r8]
    movsxd      r9,eax                  ; r9 = 2*stride
    mov         rax,rcx
    sub         rax,r9
    movq        xmm14,[rax]
    mov         rax,rdx
    sub         rax,r9
    movq        xmm0,[rax]
    movsxd      rax,r8d                 ; rax = stride
    sub         rcx,rax                 ; rcx = first plane, row -1
    sub         rdx,rax                 ; rdx = second plane, row -1
    movq        xmm12,[rax+r11]
    movq        xmm10,[rcx]
    punpcklqdq  xmm14,xmm0
    movdqa      xmm8,xmm14
    movq        xmm0,[rdx]
    punpcklbw   xmm8,xmm1
    punpckhbw   xmm14,xmm1
    punpcklqdq  xmm10,xmm0
    movq        xmm0,[rbx]
    movdqa      xmm5,xmm10
    punpcklqdq  xmm13,xmm0
    movq        xmm0,[rax+rbx]
    punpcklbw   xmm5,xmm1
    movsx       eax,r10w
    movdqa      xmm9,xmm13
    punpcklqdq  xmm12,xmm0
    punpcklbw   xmm9,xmm1
    punpckhbw   xmm10,xmm1
    movd        xmm0,eax
    movsx       eax,word [rsp + 90h + 8h + 28h] ; iBeta
    punpckhbw   xmm13,xmm1
    movdqa      xmm7,xmm12
    punpcklwd   xmm0,xmm0
    punpckhbw   xmm12,xmm1
    pshufd      xmm11,xmm0,0
    punpcklbw   xmm7,xmm1
    movd        xmm0,eax
    movdqa      xmm1,xmm8
    psubw       xmm1,xmm5
    punpcklwd   xmm0,xmm0
    movdqa      xmm6,xmm11
    pshufd      xmm3,xmm0,0
    movdqa      xmm0,xmm5
    psubw       xmm0,xmm9
    movdqa      xmm2,xmm3
    pabsw       xmm0,xmm0
    pcmpgtw     xmm6,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm3
    pcmpgtw     xmm2,xmm0
    pand        xmm6,xmm2
    movdqa      xmm0,xmm7
    movdqa      xmm2,xmm3
    psubw       xmm0,xmm9
    pabsw       xmm0,xmm0
    pcmpgtw     xmm1,xmm0
    pand        xmm6,xmm1
    movdqa      xmm0,xmm10
    movdqa      xmm1,xmm14
    psubw       xmm0,xmm13
    psubw       xmm1,xmm10
    pabsw       xmm0,xmm0
    pcmpgtw     xmm11,xmm0
    pabsw       xmm0,xmm1
    pcmpgtw     xmm2,xmm0
    pand        xmm11,xmm2
    movdqa      xmm0,xmm12
    movdqa      xmm4,xmm6
    movdqa      xmm1,xmm8
    mov         eax,2
    cwde
    paddw       xmm1,xmm8
    psubw       xmm0,xmm13
    paddw       xmm1,xmm5
    pabsw       xmm0,xmm0
    movdqa      xmm2,xmm14
    paddw       xmm1,xmm7
    pcmpgtw     xmm3,xmm0
    paddw       xmm2,xmm14
    movd        xmm0,eax
    pand        xmm11,xmm3
    paddw       xmm7,xmm7
    paddw       xmm2,xmm10
    punpcklwd   xmm0,xmm0
    paddw       xmm2,xmm12
    paddw       xmm12,xmm12
    pshufd      xmm3,xmm0,0             ; eight words of 2
    paddw       xmm7,xmm9
    paddw       xmm12,xmm13
    movdqa      xmm0,xmm6
    paddw       xmm1,xmm3
    pandn       xmm0,xmm5
    paddw       xmm7,xmm8
    psraw       xmm1,2
    paddw       xmm12,xmm14
    paddw       xmm7,xmm3
    movaps      xmm14,[rsp]             ; restore xmm14
    pand        xmm4,xmm1
    paddw       xmm12,xmm3
    psraw       xmm7,2
    movdqa      xmm1,xmm11
    por         xmm4,xmm0
    psraw       xmm12,2
    paddw       xmm2,xmm3
    movdqa      xmm0,xmm11
    pandn       xmm0,xmm10
    psraw       xmm2,2
    pand        xmm1,xmm2
    por         xmm1,xmm0
    packuswb    xmm4,xmm1
    movdqa      xmm0,xmm11
    movdqa      xmm1,xmm6
    pand        xmm1,xmm7
    movaps      xmm7,[rsp+70h]          ; restore xmm7
    movq        [rcx],xmm4              ; first plane, row -1
    pandn       xmm6,xmm9
    pandn       xmm11,xmm13
    pand        xmm0,xmm12
    por         xmm1,xmm6
    por         xmm0,xmm11
    psrldq      xmm4,8
    packuswb    xmm1,xmm0
    movq        [r11],xmm1              ; first plane, row 0
    psrldq      xmm1,8
    movq        [rdx],xmm4              ; second plane, row -1
    lea         r11,[rsp+90h]           ; r11 = rsp value just after the push
    movaps      xmm6,[r11-10h]          ; restore xmm6, xmm8-xmm13
    movaps      xmm8,[r11-30h]
    movaps      xmm9,[r11-40h]
    movq        [rbx],xmm1              ; second plane, row 0
    movaps      xmm10,[r11-50h]
    movaps      xmm11,[r11-60h]
    movaps      xmm12,[r11-70h]
    movaps      xmm13,[r11-80h]
    mov         rsp,r11
    pop         rbx
    ret
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockChromaEq4H_ssse3  (WIN64 build, Microsoft x64 calling convention)
;
; Inputs as consumed by the code below:
;   rcx          first plane pointer
;   rdx          second plane pointer (kept in rdi)
;                (presumably Cb / Cr -- confirm against the C prototype)
;   r8d          row stride in bytes
;   r9w          16-bit threshold, broadcast to xmm13
;   5th argument iBeta (read as a word from the caller's stack area)
;
; For each plane, 4 bytes starting at (pointer - 2) are read from 8
; consecutive rows, the 4-byte columns are transposed into row vectors,
; filtered, transposed back and written to the same 4-byte locations.
;
; Frame: rbx is saved in the caller-provided home area at [entry rsp+20h],
; rdi is pushed, then 140h bytes are reserved.
;   [rsp+000h..08Fh]  temporaries used for gathering / scattering
;   [rsp+0A0h..13Fh]  spill slots for xmm15..xmm6
;
; Fix over the previous revision: xmm6-xmm15 (callee-saved on WIN64) were
; overwritten without being saved or restored. They are now spilled after the
; frame is set up and reloaded before it is released; the frame already had
; room above the temporaries, so no stack offsets change.
;*******************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
DeblockChromaEq4H_ssse3:
    mov         rax,rsp
    mov         [rax+20h],rbx           ; save rbx in the caller's home area
    push        rdi
    sub         rsp,140h
    ; Spill the WIN64 callee-saved vector registers.
    movaps      [rsp+130h],xmm6
    movaps      [rsp+120h],xmm7
    movaps      [rsp+110h],xmm8
    movaps      [rsp+100h],xmm9
    movaps      [rsp+0F0h],xmm10
    movaps      [rsp+0E0h],xmm11
    movaps      [rsp+0D0h],xmm12
    movaps      [rsp+0C0h],xmm13
    movaps      [rsp+0B0h],xmm14
    movaps      [rsp+0A0h],xmm15
    mov         rdi,rdx
    lea         eax,[r8*4]
    movsxd      r10,eax
    mov         eax,[rcx-2]
    mov         [rsp+10h],eax
    lea         rbx,[r10+rdx-2]         ; rbx = second plane + 4*stride - 2
    lea         r11,[r10+rcx-2]         ; r11 = first plane  + 4*stride - 2
    movdqa      xmm5,[rsp+10h]
    movsxd      r10,r8d                 ; r10 = stride
    mov         eax,[r10+rcx-2]
    lea         rdx,[r10+r10*2]         ; rdx = 3*stride
    mov         [rsp+20h],eax
    mov         eax,[rcx+r10*2-2]
    mov         [rsp+30h],eax
    mov         eax,[rdx+rcx-2]
    movdqa      xmm2,[rsp+20h]
    mov         [rsp+40h],eax
    mov         eax,[rdi-2]
    movdqa      xmm4,[rsp+30h]
    mov         [rsp+50h],eax
    mov         eax,[r10+rdi-2]
    movdqa      xmm3,[rsp+40h]
    mov         [rsp+60h],eax
    mov         eax,[rdi+r10*2-2]
    punpckldq   xmm5,[rsp+50h]
    mov         [rsp+70h],eax
    mov         eax,[rdx+rdi-2]
    punpckldq   xmm2,[rsp+60h]
    mov         [rsp+80h],eax
    mov         eax,[r11]
    punpckldq   xmm4,[rsp+70h]
    mov         [rsp+50h],eax
    mov         eax,[rbx]
    punpckldq   xmm3,[rsp+80h]
    mov         [rsp+60h],eax
    mov         eax,[r10+r11]
    movdqa      xmm0,[rsp+50h]
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm5,xmm0
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[r10+rbx]
    movdqa      xmm0,[rsp+50h]
    movdqa      xmm1,xmm5
    mov         [rsp+60h],eax
    mov         eax,[r11+r10*2]
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm2,xmm0
    punpcklbw   xmm1,xmm2
    punpckhbw   xmm5,xmm2
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[rbx+r10*2]
    movdqa      xmm0,[rsp+50h]
    mov         [rsp+60h],eax
    mov         eax,[rdx+r11]
    movdqa      xmm15,xmm1
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm4,xmm0
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[rdx+rbx]
    movdqa      xmm0,[rsp+50h]
    mov         [rsp+60h],eax
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm3,xmm0
    ; ---- transpose the gathered 4-byte columns into row vectors
    movdqa      xmm0,xmm4
    punpcklbw   xmm0,xmm3
    punpckhbw   xmm4,xmm3
    punpcklwd   xmm15,xmm0
    punpckhwd   xmm1,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm12,xmm15
    punpcklwd   xmm0,xmm4
    punpckhwd   xmm5,xmm4
    punpckldq   xmm12,xmm0
    punpckhdq   xmm15,xmm0
    movdqa      xmm0,xmm1
    movdqa      xmm11,xmm12
    punpckldq   xmm0,xmm5
    punpckhdq   xmm1,xmm5
    punpcklqdq  xmm11,xmm0
    punpckhqdq  xmm12,xmm0
    movsx       eax,r9w
    movdqa      xmm14,xmm15
    punpcklqdq  xmm14,xmm1
    punpckhqdq  xmm15,xmm1
    pxor        xmm1,xmm1
    movd        xmm0,eax
    movdqa      xmm4,xmm12
    movdqa      xmm8,xmm11
    movsx       eax,word [rsp+170h]     ; iBeta
    punpcklwd   xmm0,xmm0
    punpcklbw   xmm4,xmm1
    punpckhbw   xmm12,xmm1
    movdqa      xmm9,xmm14
    movdqa      xmm7,xmm15
    movdqa      xmm10,xmm15
    pshufd      xmm13,xmm0,0
    punpcklbw   xmm9,xmm1
    punpckhbw   xmm14,xmm1
    movdqa      xmm6,xmm13
    movd        xmm0,eax
    movdqa      [rsp],xmm11
    mov         eax,2
    cwde
    punpckhbw   xmm11,xmm1
    punpckhbw   xmm10,xmm1
    punpcklbw   xmm7,xmm1
    punpcklwd   xmm0,xmm0
    punpcklbw   xmm8,xmm1
    pshufd      xmm3,xmm0,0
    movdqa      xmm1,xmm8
    movdqa      xmm0,xmm4
    psubw       xmm0,xmm9
    psubw       xmm1,xmm4
    movdqa      xmm2,xmm3
    pabsw       xmm0,xmm0
    pcmpgtw     xmm6,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm3
    pcmpgtw     xmm2,xmm0
    pand        xmm6,xmm2
    movdqa      xmm0,xmm7
    movdqa      xmm2,xmm3
    psubw       xmm0,xmm9
    pabsw       xmm0,xmm0
    pcmpgtw     xmm1,xmm0
    pand        xmm6,xmm1
    movdqa      xmm0,xmm12
    movdqa      xmm1,xmm11
    psubw       xmm0,xmm14
    psubw       xmm1,xmm12
    movdqa      xmm5,xmm6
    pabsw       xmm0,xmm0
    pcmpgtw     xmm13,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm8
    pcmpgtw     xmm2,xmm0
    paddw       xmm1,xmm8
    movdqa      xmm0,xmm10
    pand        xmm13,xmm2
    psubw       xmm0,xmm14
    paddw       xmm1,xmm4
    movdqa      xmm2,xmm11
    pabsw       xmm0,xmm0
    paddw       xmm2,xmm11
    paddw       xmm1,xmm7
    pcmpgtw     xmm3,xmm0
    paddw       xmm2,xmm12
    movd        xmm0,eax
    pand        xmm13,xmm3
    paddw       xmm2,xmm10
    punpcklwd   xmm0,xmm0
    pshufd      xmm3,xmm0,0             ; eight words of 2
    movdqa      xmm0,xmm6
    paddw       xmm1,xmm3
    pandn       xmm0,xmm4
    paddw       xmm2,xmm3
    psraw       xmm1,2
    pand        xmm5,xmm1
    por         xmm5,xmm0
    paddw       xmm7,xmm7
    paddw       xmm10,xmm10
    psraw       xmm2,2
    movdqa      xmm1,xmm13
    movdqa      xmm0,xmm13
    pandn       xmm0,xmm12
    pand        xmm1,xmm2
    paddw       xmm7,xmm9
    por         xmm1,xmm0
    paddw       xmm10,xmm14
    paddw       xmm7,xmm8
    movdqa      xmm0,xmm13
    packuswb    xmm5,xmm1
    paddw       xmm7,xmm3
    paddw       xmm10,xmm11
    movdqa      xmm1,xmm6
    paddw       xmm10,xmm3
    pandn       xmm6,xmm9
    psraw       xmm7,2
    pand        xmm1,xmm7
    psraw       xmm10,2
    pandn       xmm13,xmm14
    pand        xmm0,xmm10
    por         xmm1,xmm6
    ; ---- transpose back to per-row 4-byte groups
    movdqa      xmm6,[rsp]
    movdqa      xmm4,xmm6
    por         xmm0,xmm13
    punpcklbw   xmm4,xmm5
    punpckhbw   xmm6,xmm5
    movdqa      xmm3,xmm4
    packuswb    xmm1,xmm0
    movdqa      xmm0,xmm1
    punpckhbw   xmm1,xmm15
    punpcklbw   xmm0,xmm15
    punpcklwd   xmm3,xmm0
    punpckhwd   xmm4,xmm0
    movdqa      xmm0,xmm6
    movdqa      xmm2,xmm3
    punpcklwd   xmm0,xmm1
    punpckhwd   xmm6,xmm1
    movdqa      xmm1,xmm4
    punpckldq   xmm2,xmm0
    punpckhdq   xmm3,xmm0
    punpckldq   xmm1,xmm6
    movdqa      xmm0,xmm2
    punpcklqdq  xmm0,xmm1
    punpckhdq   xmm4,xmm6
    punpckhqdq  xmm2,xmm1
    movdqa      [rsp+10h],xmm0
    movdqa      [rsp+60h],xmm2
    movdqa      xmm0,xmm3
    mov         eax,[rsp+10h]
    mov         [rcx-2],eax
    mov         eax,[rsp+60h]
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm3,xmm4
    mov         [r10+rcx-2],eax
    movdqa      [rsp+20h],xmm0
    mov         eax,[rsp+20h]
    movdqa      [rsp+70h],xmm3
    mov         [rcx+r10*2-2],eax
    mov         eax,[rsp+70h]
    mov         [rdx+rcx-2],eax
    mov         eax,[rsp+18h]
    mov         [r11],eax
    mov         eax,[rsp+68h]
    mov         [r10+r11],eax
    mov         eax,[rsp+28h]
    mov         [r11+r10*2],eax
    mov         eax,[rsp+78h]
    mov         [rdx+r11],eax
    mov         eax,[rsp+14h]
    mov         [rdi-2],eax
    mov         eax,[rsp+64h]
    mov         [r10+rdi-2],eax
    mov         eax,[rsp+24h]
    mov         [rdi+r10*2-2],eax
    mov         eax,[rsp+74h]
    mov         [rdx+rdi-2],eax
    mov         eax,[rsp+1Ch]
    mov         [rbx],eax
    mov         eax,[rsp+6Ch]
    mov         [r10+rbx],eax
    mov         eax,[rsp+2Ch]
    mov         [rbx+r10*2],eax
    mov         eax,[rsp+7Ch]
    mov         [rdx+rbx],eax
    ; Reload the WIN64 callee-saved vector registers.
    movaps      xmm6,[rsp+130h]
    movaps      xmm7,[rsp+120h]
    movaps      xmm8,[rsp+110h]
    movaps      xmm9,[rsp+100h]
    movaps      xmm10,[rsp+0F0h]
    movaps      xmm11,[rsp+0E0h]
    movaps      xmm12,[rsp+0D0h]
    movaps      xmm13,[rsp+0C0h]
    movaps      xmm14,[rsp+0B0h]
    movaps      xmm15,[rsp+0A0h]
    lea         r11,[rsp+140h]
    mov         rbx,[r11+28h]           ; reload rbx from the caller's home area
    mov         rsp,r11
    pop         rdi
    ret
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockChromaLt4H_ssse3  (WIN64 build, Microsoft x64 calling convention)
;
; Inputs as consumed by the code below:
;   rcx          first plane pointer  (kept in rdi)
;   rdx          second plane pointer (kept in r12)
;                (presumably Cb / Cr -- confirm against the C prototype)
;   r8d          row stride in bytes  (kept in rsi)
;   r9w          16-bit threshold, broadcast to xmm12
;   5th argument iBeta (read as a word from the caller's stack area)
;   6th argument pTC: pointer to 4 signed bytes, each duplicated into two
;                adjacent word lanes at [rsp+0..0Fh]
;
; For each plane, 4 bytes starting at (pointer - 2) are read from 8
; consecutive rows, the 4-byte columns are transposed into row vectors,
; filtered, transposed back and written to the same 4-byte locations.
;
; Frame: rbx, rbp, rsi, rdi, r12 are pushed, then 170h bytes are reserved.
;   [rsp+000h..0BFh]  temporaries
;   [rsp+0D0h..16Fh]  spill slots for xmm15..xmm6
;
; Fix over the previous revision: xmm6-xmm15 (callee-saved on WIN64) were
; overwritten without being saved or restored. They are now spilled after the
; frame is set up and reloaded before it is released; the frame already had
; room above the temporaries, so no stack offsets change.
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
DeblockChromaLt4H_ssse3:
    mov         rax,rsp
    push        rbx
    push        rbp
    push        rsi
    push        rdi
    push        r12
    sub         rsp,170h
    ; Spill the WIN64 callee-saved vector registers.
    movaps      [rsp+160h],xmm6
    movaps      [rsp+150h],xmm7
    movaps      [rsp+140h],xmm8
    movaps      [rsp+130h],xmm9
    movaps      [rsp+120h],xmm10
    movaps      [rsp+110h],xmm11
    movaps      [rsp+100h],xmm12
    movaps      [rsp+0F0h],xmm13
    movaps      [rsp+0E0h],xmm14
    movaps      [rsp+0D0h],xmm15
    movsxd      rsi,r8d                 ; rsi = stride
    lea         eax,[r8*4]
    mov         r11d,r9d
    movsxd      r10,eax
    mov         eax,[rcx-2]
    mov         r12,rdx
    mov         [rsp+40h],eax
    mov         eax,[rsi+rcx-2]
    lea         rbx,[r10+rcx-2]         ; rbx = first plane + 4*stride - 2
    movdqa      xmm5,[rsp+40h]
    mov         [rsp+50h],eax
    mov         eax,[rcx+rsi*2-2]
    lea         rbp,[r10+rdx-2]         ; rbp = second plane + 4*stride - 2
    movdqa      xmm2,[rsp+50h]
    mov         [rsp+60h],eax
    lea         r10,[rsi+rsi*2]         ; r10 = 3*stride
    mov         rdi,rcx
    mov         eax,[r10+rcx-2]
    movdqa      xmm4,[rsp+60h]
    mov         [rsp+70h],eax
    mov         eax,[rdx-2]
    mov         [rsp+80h],eax
    mov         eax,[rsi+rdx-2]
    movdqa      xmm3,[rsp+70h]
    mov         [rsp+90h],eax
    mov         eax,[rdx+rsi*2-2]
    punpckldq   xmm5,[rsp+80h]
    mov         [rsp+0A0h],eax
    mov         eax,[r10+rdx-2]
    punpckldq   xmm2,[rsp+90h]
    mov         [rsp+0B0h],eax
    mov         eax,[rbx]
    punpckldq   xmm4,[rsp+0A0h]
    mov         [rsp+80h],eax
    mov         eax,[rbp]
    punpckldq   xmm3,[rsp+0B0h]
    mov         [rsp+90h],eax
    mov         eax,[rsi+rbx]
    movdqa      xmm0,[rsp+80h]
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm5,xmm0
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax,[rsi+rbp]
    movdqa      xmm0,[rsp+80h]
    movdqa      xmm1,xmm5
    mov         [rsp+90h],eax
    mov         eax,[rbx+rsi*2]
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm2,xmm0
    punpcklbw   xmm1,xmm2
    punpckhbw   xmm5,xmm2
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax,[rbp+rsi*2]
    movdqa      xmm0,[rsp+80h]
    mov         [rsp+90h],eax
    mov         eax,[r10+rbx]
    movdqa      xmm7,xmm1
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm4,xmm0
    movdqa      [rsp+80h],xmm0
    mov         [rsp+80h],eax
    mov         eax,[r10+rbp]
    movdqa      xmm0,[rsp+80h]
    mov         [rsp+90h],eax
    punpckldq   xmm0,[rsp+90h]
    punpcklqdq  xmm3,xmm0
    ; ---- transpose the gathered 4-byte columns into row vectors
    movdqa      xmm0,xmm4
    punpcklbw   xmm0,xmm3
    punpckhbw   xmm4,xmm3
    punpcklwd   xmm7,xmm0
    punpckhwd   xmm1,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm6,xmm7
    punpcklwd   xmm0,xmm4
    punpckhwd   xmm5,xmm4
    punpckldq   xmm6,xmm0
    punpckhdq   xmm7,xmm0
    movdqa      xmm0,xmm1
    punpckldq   xmm0,xmm5
    mov         rax,[rsp+1C8h]          ; pTC
    punpckhdq   xmm1,xmm5
    movdqa      xmm9,xmm6
    punpckhqdq  xmm6,xmm0
    punpcklqdq  xmm9,xmm0
    movdqa      xmm2,xmm7
    movdqa      xmm13,xmm6
    movdqa      xmm4,xmm9
    movdqa      [rsp+10h],xmm9
    punpcklqdq  xmm2,xmm1
    punpckhqdq  xmm7,xmm1
    pxor        xmm1,xmm1
    movsx       ecx,byte [rax+3]
    movsx       edx,byte [rax+2]
    movsx       r8d,byte [rax+1]
    movsx       r9d,byte [rax]
    movdqa      xmm10,xmm1
    movdqa      xmm15,xmm2
    punpckhbw   xmm2,xmm1
    punpckhbw   xmm6,xmm1
    punpcklbw   xmm4,xmm1
    movsx       eax,r11w
    mov         word [rsp+0Eh],cx
    mov         word [rsp+0Ch],cx
    movdqa      xmm3,xmm7
    movdqa      xmm8,xmm7
    movdqa      [rsp+20h],xmm7
    punpcklbw   xmm15,xmm1
    punpcklbw   xmm13,xmm1
    punpcklbw   xmm3,xmm1
    mov         word [rsp+0Ah],dx
    mov         word [rsp+8],dx
    mov         word [rsp+6],r8w
    movd        xmm0,eax
    movdqa      [rsp+30h],xmm6
    punpckhbw   xmm9,xmm1
    punpckhbw   xmm8,xmm1
    punpcklwd   xmm0,xmm0
    movsx       eax,word [rsp+1C0h]     ; iBeta
    mov         word [rsp+4],r8w
    mov         word [rsp+2],r9w
    pshufd      xmm12,xmm0,0
    mov         word [rsp],r9w
    movd        xmm0,eax
    mov         eax,4
    cwde
    movdqa      xmm14,[rsp]             ; clip limits as 8 words
    movdqa      [rsp],xmm2
    movdqa      xmm2,xmm12
    punpcklwd   xmm0,xmm0
    pshufd      xmm11,xmm0,0
    psubw       xmm10,xmm14
    movd        xmm0,eax
    movdqa      xmm7,xmm14
    movdqa      xmm6,xmm14
    pcmpgtw     xmm7,xmm1
    punpcklwd   xmm0,xmm0
    pshufd      xmm5,xmm0,0             ; eight words of 4
    movdqa      xmm0,xmm4
    movdqa      xmm1,xmm15
    psubw       xmm4,xmm13
    psubw       xmm0,xmm3
    psubw       xmm1,xmm13
    psubw       xmm3,xmm15
    psllw       xmm1,2
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm10
    psraw       xmm1,3
    pmaxsw      xmm0,xmm1
    pminsw      xmm6,xmm0
    movdqa      xmm1,xmm11
    movdqa      xmm0,xmm13
    psubw       xmm0,xmm15
    pabsw       xmm0,xmm0
    pcmpgtw     xmm2,xmm0
    pabsw       xmm0,xmm4
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm3
    pand        xmm2,xmm1
    movdqa      xmm1,xmm11
    movdqa      xmm3,[rsp+30h]
    pcmpgtw     xmm1,xmm0
    movdqa      xmm0,xmm9
    pand        xmm2,xmm1
    psubw       xmm0,xmm8
    psubw       xmm9,xmm3
    pand        xmm2,xmm7
    pand        xmm6,xmm2
    psubw       xmm15,xmm6
    paddw       xmm13,xmm6
    movdqa      xmm2,[rsp]
    movdqa      xmm1,xmm2
    psubw       xmm1,xmm3
    psubw       xmm8,xmm2
    psllw       xmm1,2
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm3
    movdqa      xmm5,[rsp+10h]
    psubw       xmm0,xmm2
    psraw       xmm1,3
    movdqa      xmm4,xmm5
    pabsw       xmm0,xmm0
    pmaxsw      xmm10,xmm1
    movdqa      xmm1,xmm11
    pcmpgtw     xmm12,xmm0
    pabsw       xmm0,xmm9
    pminsw      xmm14,xmm10
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm8
    pcmpgtw     xmm11,xmm0
    pand        xmm12,xmm1
    movdqa      xmm1,[rsp+20h]
    pand        xmm12,xmm11
    pand        xmm12,xmm7
    pand        xmm14,xmm12
    paddw       xmm3,xmm14
    psubw       xmm2,xmm14
    packuswb    xmm13,xmm3
    packuswb    xmm15,xmm2
    ; ---- transpose back to per-row 4-byte groups
    punpcklbw   xmm4,xmm13
    punpckhbw   xmm5,xmm13
    movdqa      xmm0,xmm15
    punpcklbw   xmm0,xmm1
    punpckhbw   xmm15,xmm1
    movdqa      xmm3,xmm4
    punpcklwd   xmm3,xmm0
    punpckhwd   xmm4,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm2,xmm3
    movdqa      xmm1,xmm4
    punpcklwd   xmm0,xmm15
    punpckhwd   xmm5,xmm15
    punpckldq   xmm2,xmm0
    punpckhdq   xmm3,xmm0
    punpckldq   xmm1,xmm5
    movdqa      xmm0,xmm2
    punpcklqdq  xmm0,xmm1
    punpckhdq   xmm4,xmm5
    punpckhqdq  xmm2,xmm1
    movdqa      [rsp+40h],xmm0
    movdqa      xmm0,xmm3
    movdqa      [rsp+90h],xmm2
    mov         eax,[rsp+40h]
    mov         [rdi-2],eax
    mov         eax,[rsp+90h]
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm3,xmm4
    mov         [rsi+rdi-2],eax
    movdqa      [rsp+50h],xmm0
    mov         eax,[rsp+50h]
    movdqa      [rsp+0A0h],xmm3
    mov         [rdi+rsi*2-2],eax
    mov         eax,[rsp+0A0h]
    mov         [r10+rdi-2],eax
    mov         eax,[rsp+48h]
    mov         [rbx],eax
    mov         eax,[rsp+98h]
    mov         [rsi+rbx],eax
    mov         eax,[rsp+58h]
    mov         [rbx+rsi*2],eax
    mov         eax,[rsp+0A8h]
    mov         [r10+rbx],eax
    mov         eax,[rsp+44h]
    mov         [r12-2],eax
    mov         eax,[rsp+94h]
    mov         [rsi+r12-2],eax
    mov         eax,[rsp+54h]
    mov         [r12+rsi*2-2],eax
    mov         eax,[rsp+0A4h]
    mov         [r10+r12-2],eax
    mov         eax,[rsp+4Ch]
    mov         [rbp],eax
    mov         eax,[rsp+9Ch]
    mov         [rsi+rbp],eax
    mov         eax,[rsp+5Ch]
    mov         [rbp+rsi*2],eax
    mov         eax,[rsp+0ACh]
    mov         [r10+rbp],eax
    ; Reload the WIN64 callee-saved vector registers.
    movaps      xmm6,[rsp+160h]
    movaps      xmm7,[rsp+150h]
    movaps      xmm8,[rsp+140h]
    movaps      xmm9,[rsp+130h]
    movaps      xmm10,[rsp+120h]
    movaps      xmm11,[rsp+110h]
    movaps      xmm12,[rsp+100h]
    movaps      xmm13,[rsp+0F0h]
    movaps      xmm14,[rsp+0E0h]
    movaps      xmm15,[rsp+0D0h]
    lea         r11,[rsp+170h]
    mov         rsp,r11
    pop         r12
    pop         rdi
    pop         rsi
    pop         rbp
    pop         rbx
    ret
2014-01-03 07:49:45 +01:00
%elifdef UNIX64
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockLumaLt4V_ssse3                      (UNIX64 / System V AMD64 build)
;
; Filters one 16-pixel-wide luma edge lying between two rows, using a
; per-4-pixel clipping value.
;
; In:   rdi = pointer to the first row below the edge (q0 row)
;       esi = row stride in bytes (sign-extended)
;       edx = threshold compared against |p0 - q0|  (broadcast into xmm4)
;       ecx = threshold compared against |p1-p0|, |q1-q0|, |p2-p0|, |q2-q0|
;             (broadcast into xmm0)
;       r8  = pTC, four signed bytes, one per group of four pixels
;       presumably the C prototype is (pPix, iStride, iAlpha, iBeta, pTC)
;       -- confirm against the C header.
; Out:  rows p1, p0, q0, q1 are rewritten in place.
; Rows are accessed with movdqa, so every touched row address must be
; 16-byte aligned.
;
; Frame: push rbp + 1B0h bytes; rbp = rsp+20h is the (16-byte aligned) base
; of the locals.  r12 is callee-saved and is kept at [rbp+180h].
; XMM registers are all caller-saved under System V, so unlike the Win64
; variant nothing is spilled for the caller's benefit.
;*******************************************************************************
WELS_EXTERN DeblockLumaLt4V_ssse3
DeblockLumaLt4V_ssse3:
    push rbp
    mov r11,r8 ; pTC
    sub rsp,1B0h
    lea rbp,[rsp+20h]
    movd xmm4,edx
    movd xmm2,ecx
    mov qword [rbp+180h],r12            ; preserve callee-saved r12
    mov r10,rdi
    movsxd r12,esi                      ; r12 = stride
    add rsi,rsi
    movsxd rdx,esi                      ; rdx = 2*stride
    sub r10,r12                         ; r10 = p0 row
    movsx r8d,byte [r11]                ; tc[0]
    pxor xmm3,xmm3
    punpcklwd xmm2,xmm2
    lea rax,[r12+r12*2]
    movdqa xmm14,[rdx+rdi]              ; q2 row
    neg rax                             ; rax = -3*stride
    pshufd xmm0,xmm2,0                  ; xmm0 = ecx threshold in all 8 words
    movd xmm2,r8d
    movsx rsi,byte [r11+1]              ; tc[1]
    movsx r8d,byte [r11+2]              ; tc[2]
    movsx r11d,byte [r11+3]             ; tc[3]
    movd xmm1,esi
    movd xmm12,r8d
    movd xmm11,r11d
    movdqa xmm5, [rax+rdi]              ; p2 row
    lea rax,[r12+r12]
    punpcklwd xmm12,xmm12
    neg rax                             ; rax = -2*stride (p1 row offset)
    punpcklwd xmm11,xmm11
    movdqa xmm8, [r10]                  ; p0 row
    punpcklwd xmm2,xmm2
    punpcklwd xmm1,xmm1
    punpcklqdq xmm12,xmm12
    punpcklqdq xmm11,xmm11
    punpcklqdq xmm2,xmm2
    punpcklqdq xmm1,xmm1
    shufps xmm12,xmm11,88h              ; xmm12 = tc[2] x4, tc[3] x4 (pixels 8-15)
    movdqa xmm11,xmm8
    movdqa xmm9,[rdi]                   ; q0 row
    shufps xmm2,xmm1,88h                ; xmm2  = tc[0] x4, tc[1] x4 (pixels 0-7)
    movdqa xmm1,xmm5
    punpcklbw xmm11,xmm3
    movdqa xmm13,xmm11
    movdqa xmm10,xmm9
    movdqa xmm6,[rax+rdi]               ; p1 row
    punpcklbw xmm1,xmm3
    movaps [rbp+0A0h],xmm12             ; keep high-half tc for the second pass
    psubw xmm13,xmm1
    movdqa xmm15,xmm14
    movdqa xmm7,xmm6
    punpcklbw xmm10,xmm3
    movdqa xmm12,[r12+rdi]              ; q1 row
    ; ---- low 8 pixels -------------------------------------------------------
    punpcklbw xmm7,xmm3
    punpcklbw xmm12,xmm3
    punpcklbw xmm15,xmm3
    pabsw xmm3,xmm13
    movdqa xmm13,xmm10
    psubw xmm13,xmm15
    movdqa [rbp+0F0h],xmm15
    pabsw xmm15,xmm13
    movdqa xmm13,xmm11
    movdqa [rbp+0B0h],xmm1
    movdqa xmm1,xmm0
    pavgw xmm13,xmm10
    pcmpgtw xmm1,xmm3
    movdqa [rbp+120h],xmm13
    movaps xmm13,xmm2
    punpcklwd xmm4,xmm4
    movdqa xmm3,xmm0
    movdqa [rbp+100h],xmm1
    psubw xmm13,xmm1
    movdqa xmm1,xmm10
    pcmpgtw xmm3,xmm15
    pshufd xmm4,xmm4,0                  ; xmm4 = edx threshold in all 8 words
    psubw xmm1,xmm11
    movdqa [rbp+0D0h],xmm10
    psubw xmm13,xmm3
    movdqa [rbp+110h],xmm3
    pabsw xmm15,xmm1
    movdqa xmm3,xmm4
    psubw xmm10,xmm12
    pcmpgtw xmm3,xmm15
    pabsw xmm15,xmm10
    movdqa xmm10,xmm0
    psllw xmm1,2
    movdqa [rbp+0C0h],xmm11
    psubw xmm11,xmm7
    pcmpgtw xmm10,xmm15
    pabsw xmm11,xmm11
    movdqa xmm15,xmm0
    pand xmm3,xmm10
    pcmpgtw xmm15,xmm11
    movaps xmm11,xmm2
    pxor xmm10,xmm10
    pand xmm3,xmm15
    pcmpgtw xmm11,xmm10
    pcmpeqw xmm10,xmm2
    por xmm11,xmm10                     ; tc >= 0 mask
    pand xmm3,xmm11
    movdqa xmm11,xmm7
    psubw xmm11,xmm12
    pxor xmm15,xmm15
    paddw xmm11,xmm1
    psubw xmm15,xmm13
    movdqa [rbp+0E0h],xmm12
    paddw xmm11,[FOUR_16B_SSE2]
    pxor xmm12,xmm12
    psraw xmm11,3
    ; ---- high 8 pixels ------------------------------------------------------
    punpckhbw xmm8,xmm12
    pmaxsw xmm15,xmm11
    punpckhbw xmm5,xmm12
    movdqa xmm11,xmm8
    pminsw xmm13,xmm15
    psubw xmm11,xmm5
    punpckhbw xmm9,xmm12
    pand xmm13,xmm3
    movdqa [rbp+130h],xmm13
    pabsw xmm13,xmm11
    punpckhbw xmm14,xmm12
    movdqa xmm11,xmm9
    psubw xmm11,xmm14
    movdqa xmm15,xmm0
    movdqa [rbp+140h],xmm14
    pabsw xmm14,xmm11
    movdqa xmm11,xmm8
    pcmpgtw xmm15,xmm14
    movdqa xmm1,[r12+rdi]
    pavgw xmm11,xmm9
    movdqa [rbp+170h],xmm11
    movdqa xmm10,xmm9
    punpckhbw xmm6,xmm12
    psubw xmm10,xmm8
    punpckhbw xmm1,xmm12
    movdqa xmm12,xmm0
    movaps xmm11,[rbp+0A0h]             ; tc for pixels 8-15
    pcmpgtw xmm12,xmm13
    movaps xmm13,xmm11
    psubw xmm13,xmm12
    movdqa [rbp+160h],xmm15
    psubw xmm13,xmm15
    movdqa xmm15,xmm9
    psubw xmm15,xmm1
    movdqa [rbp+150h],xmm12
    pabsw xmm12,xmm10
    pabsw xmm14,xmm15
    movdqa xmm15,xmm8
    pcmpgtw xmm4,xmm12
    movdqa xmm12,xmm0
    psubw xmm15,xmm6
    pcmpgtw xmm12,xmm14
    pabsw xmm14,xmm15
    psllw xmm10,2
    pcmpgtw xmm0,xmm14
    movdqa xmm14,xmm6
    psubw xmm14,xmm1
    pand xmm4,xmm12
    paddw xmm14,xmm10
    pand xmm4,xmm0
    paddw xmm14,[FOUR_16B_SSE2]
    pxor xmm15,xmm15
    movaps xmm12,xmm11
    psubw xmm15,xmm13
    pxor xmm0,xmm0
    psraw xmm14,3
    pcmpgtw xmm12,xmm0
    pcmpeqw xmm0,xmm11
    pmaxsw xmm15,xmm14
    por xmm12,xmm0
    movdqa xmm0,[rbp+120h]
    pminsw xmm13,xmm15
    movdqa xmm15,[rbp+0B0h]
    movdqa xmm10,xmm7
    pand xmm4,xmm12
    paddw xmm15,xmm0
    pxor xmm12,xmm12
    paddw xmm10,xmm7
    movdqa xmm14,xmm12
    psubw xmm15,xmm10
    psubw xmm14,xmm2
    psraw xmm15,1
    pmaxsw xmm15,xmm14
    movdqa xmm10,xmm6
    pminsw xmm15,xmm2
    paddw xmm10,xmm6
    pand xmm15,xmm3
    psubw xmm12,xmm11
    pand xmm15,[rbp+100h]
    pand xmm13,xmm4
    paddw xmm7,xmm15
    paddw xmm8,xmm13
    movdqa xmm15,[rbp+170h]
    psubw xmm9,xmm13
    paddw xmm5,xmm15
    psubw xmm5,xmm10
    psraw xmm5,1
    pmaxsw xmm5,xmm12
    pminsw xmm5,xmm11
    pand xmm5,xmm4
    pand xmm5,[rbp+150h]
    paddw xmm6,xmm5
    movdqa xmm5,[rbp+0C0h]
    packuswb xmm7,xmm6                  ; xmm7 = new p1 row
    movdqa xmm6,[rbp+130h]
    paddw xmm5,xmm6
    packuswb xmm5,xmm8                  ; xmm5 = new p0 row
    movdqa xmm8,[rbp+0D0h]
    psubw xmm8,xmm6
    movdqa xmm6,[rbp+0F0h]
    paddw xmm6,xmm0
    movdqa xmm0,[rbp+0E0h]
    packuswb xmm8,xmm9                  ; xmm8 = new q0 row
    movdqa xmm9,xmm0
    paddw xmm9,xmm0
    psubw xmm6,xmm9
    psraw xmm6,1
    pmaxsw xmm14,xmm6
    pminsw xmm2,xmm14
    pand xmm2,xmm3
    pand xmm2,[rbp+110h]
    paddw xmm0,xmm2
    movdqa xmm2,[rbp+140h]
    paddw xmm2,xmm15
    movdqa xmm15,xmm1
    paddw xmm15,xmm1
    psubw xmm2,xmm15
    psraw xmm2,1
    pmaxsw xmm12,xmm2
    pminsw xmm11,xmm12
    pand xmm11,xmm4
    pand xmm11,[rbp+160h]
    paddw xmm1,xmm11
    movdqa [rax+rdi],xmm7               ; store p1
    movdqa [r10],xmm5                   ; store p0
    packuswb xmm0,xmm1                  ; xmm0 = new q1 row
    movdqa [rdi],xmm8                   ; store q0
    movdqa [r12+rdi],xmm0               ; store q1
    mov r12,qword [rbp+180h]            ; restore callee-saved r12
    lea rsp,[rbp+190h]                  ; = original rsp after "push rbp"
    pop rbp
    ret
2014-01-03 07:49:45 +01:00
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockLumaEq4V_ssse3                      (UNIX64 / System V AMD64 build)
;
; Filters one 16-pixel-wide luma edge lying between two rows; rewrites three
; rows on each side of the edge.
;
; In:   rdi = pointer to the first row below the edge (q0 row)
;       esi = row stride in bytes (sign-extended)
;       edx = threshold compared against |p0 - q0| (low 16 bits broadcast)
;       ecx = threshold compared against the neighbour differences
;             (low 16 bits broadcast)
;       presumably the C prototype is (pPix, iStride, iAlpha, iBeta)
;       -- confirm against the C header.
; Out:  rows p2, p1, p0, q0, q1, q2 are rewritten in place.
; Rows are accessed with movdqa, so every touched row address must be
; 16-byte aligned.
;
; Frame: two pushes + 1D8h bytes leaves rsp 16-byte aligned; locals live at
; [rsp+00h .. rsp+12Fh].  XMM registers are caller-saved under System V, so
; no XMM save/restore is performed.
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
DeblockLumaEq4V_ssse3:
    push rbx
    push rbp
    ; Re-map the System V argument registers onto the registers the body uses.
    mov r8, rdx                         ; r8  = third argument (edx threshold)
    mov r9, rcx                         ; r9  = fourth argument (ecx threshold)
    mov rcx, rdi                        ; rcx = pixel pointer
    mov rdx, rsi                        ; rdx = stride
    sub rsp,1D8h
    pxor xmm1,xmm1
    movsxd r10,edx                      ; r10 = stride
    mov rbp,rcx                         ; rbp = q0 row
    mov r11d,r8d
    mov rdx,rcx
    mov rdi,rbp
    mov rbx,rbp
    movdqa xmm5,[rbp]
    punpcklbw xmm5,xmm1
    movdqa xmm14,[r10+rbp]
    lea eax,[r10*4]
    movsxd r8,eax
    lea eax,[r10+r10*2]
    movsxd rcx,eax                      ; rcx = 3*stride
    lea eax,[r10+r10]
    sub rdx,r8                          ; rdx = p3 row
    punpcklbw xmm14,xmm1
    movdqa [rsp+90h],xmm5
    movdqa [rsp+30h],xmm14
    movsxd rsi,eax                      ; rsi = 2*stride
    movsx eax,r11w
    sub rdi,rcx                         ; rdi = p2 row
    sub rbx,rsi                         ; rbx = p1 row
    mov r8,rbp
    sub r8,r10                          ; r8  = p0 row
    movd xmm0,eax
    movsx eax,r9w
    movdqa xmm12,[rdi]
    movdqa xmm6, [rsi+rbp]
    movdqa xmm13,[rbx]
    punpcklwd xmm0,xmm0
    pshufd xmm11,xmm0,0                 ; xmm11 = edx threshold in all words
    punpcklbw xmm13,xmm1
    punpcklbw xmm6,xmm1
    movdqa xmm8,[r8]
    movd xmm0,eax
    movdqa xmm10,xmm11
    mov eax,2
    punpcklbw xmm8,xmm1
    punpcklbw xmm12,xmm1
    cwde
    punpcklwd xmm0,xmm0
    psraw xmm10,2
    movdqa xmm1,xmm8
    movdqa [rsp+0F0h],xmm13
    movdqa [rsp+0B0h],xmm8
    pshufd xmm7,xmm0,0                  ; xmm7 = ecx threshold in all words
    psubw xmm1,xmm13
    movdqa xmm0,xmm5
    movdqa xmm4,xmm7
    movdqa xmm2,xmm7
    psubw xmm0,xmm8
    pabsw xmm3,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,xmm5
    movdqa [rsp+40h],xmm7
    movdqa [rsp+60h],xmm6
    pcmpgtw xmm4,xmm0
    psubw xmm1,xmm14
    pabsw xmm0,xmm1
    pcmpgtw xmm2,xmm0
    pand xmm4,xmm2
    movdqa xmm0,xmm11
    pcmpgtw xmm0,xmm3
    pand xmm4,xmm0
    movd xmm0,eax
    movdqa [rsp+20h],xmm4
    punpcklwd xmm0,xmm0
    pshufd xmm2,xmm0,0                  ; xmm2 = 2 in all words
    paddw xmm10,xmm2                    ; xmm10 = (edx threshold >> 2) + 2
    movdqa [rsp+0A0h],xmm2
    movdqa xmm15,xmm7
    pxor xmm4,xmm4
    movdqa xmm0,xmm8
    psubw xmm0,xmm12
    mov eax,4
    pabsw xmm0,xmm0
    movdqa xmm1,xmm10
    cwde
    pcmpgtw xmm15,xmm0
    pcmpgtw xmm1,xmm3
    movdqa xmm3,xmm7
    movdqa xmm7,[rdx]
    movdqa xmm0,xmm5
    psubw xmm0,xmm6
    pand xmm15,xmm1
    punpcklbw xmm7,xmm4
    movdqa xmm9,xmm15
    pabsw xmm0,xmm0
    psllw xmm7,1
    pandn xmm9,xmm12
    pcmpgtw xmm3,xmm0
    paddw xmm7,xmm12
    movd xmm0,eax
    pand xmm3,xmm1
    paddw xmm7,xmm12
    punpcklwd xmm0,xmm0
    paddw xmm7,xmm12
    pshufd xmm1,xmm0,0                  ; xmm1 = 4 in all words
    paddw xmm7,xmm13
    movdqa xmm0,xmm3
    pandn xmm0,xmm6
    paddw xmm7,xmm8
    movdqa [rsp+70h],xmm1
    paddw xmm7,xmm5
    movdqa [rsp+120h],xmm0
    movdqa xmm0,[rcx+rbp]
    punpcklbw xmm0,xmm4
    paddw xmm7,xmm1
    movdqa xmm4,xmm15
    psllw xmm0,1
    psraw xmm7,3
    paddw xmm0,xmm6
    pand xmm7,xmm15
    paddw xmm0,xmm6
    paddw xmm0,xmm6
    paddw xmm0,xmm14
    movdqa xmm6,xmm15
    paddw xmm0,xmm5
    pandn xmm6,xmm13
    paddw xmm0,xmm8
    paddw xmm0,xmm1
    psraw xmm0,3
    movdqa xmm1,xmm12
    paddw xmm1,xmm13
    pand xmm0,xmm3
    movdqa [rsp+100h],xmm0
    movdqa xmm0,xmm8
    paddw xmm0,xmm5
    paddw xmm1,xmm0
    movdqa xmm0,xmm3
    paddw xmm1,xmm2
    psraw xmm1,2
    pandn xmm0,xmm14
    pand xmm4,xmm1
    movdqa [rsp+0E0h],xmm0
    movdqa xmm0,xmm5
    paddw xmm0,xmm8
    movdqa xmm1,[rsp+60h]
    paddw xmm1,xmm14
    movdqa xmm14,xmm3
    paddw xmm1,xmm0
    movdqa xmm0,xmm8
    paddw xmm0,[rsp+30h]
    paddw xmm1,xmm2
    psraw xmm1,2
    pand xmm14,xmm1
    movdqa xmm1,xmm13
    paddw xmm1,xmm13
    paddw xmm1,xmm0
    paddw xmm1,xmm2
    psraw xmm1,2
    movdqa xmm0,[rsp+30h]
    movdqa xmm2,xmm13
    movdqa xmm5,xmm15
    paddw xmm0,[rsp+70h]
    pandn xmm5,xmm1
    paddw xmm2,xmm8
    movdqa xmm8,[rsp+90h]
    movdqa xmm1,xmm12
    paddw xmm2,xmm8
    psllw xmm2,1
    paddw xmm2,xmm0
    paddw xmm1,xmm2
    movdqa xmm0,xmm8
    movdqa xmm8,xmm3
    movdqa xmm2,[rsp+30h]
    paddw xmm0,xmm13
    psraw xmm1,3
    pand xmm15,xmm1
    movdqa xmm1,xmm2
    paddw xmm1,xmm2
    paddw xmm2,[rsp+90h]
    paddw xmm2,[rsp+0B0h]
    paddw xmm1,xmm0
    movdqa xmm0,xmm13
    movdqa xmm13,[r8]
    paddw xmm0, [rsp+70h]
    paddw xmm1, [rsp+0A0h]
    psllw xmm2,1
    paddw xmm2,xmm0
    psraw xmm1,2
    movdqa xmm0, [rdi]
    pandn xmm8,xmm1
    movdqa xmm1, [rsp+60h]
    paddw xmm1,xmm2
    movdqa xmm2, [rbx]
    psraw xmm1,3
    pand xmm3,xmm1
    movdqa xmm1, [rbp]
    movdqa [rsp+0D0h],xmm3
    ; ---- high 8 pixels -------------------------------------------------------
    pxor xmm3,xmm3
    punpckhbw xmm0,xmm3
    punpckhbw xmm1,xmm3
    punpckhbw xmm13,xmm3
    movdqa [rsp+0C0h],xmm0
    movdqa xmm0,[r10+rbp]
    movdqa [rsp],xmm1
    punpckhbw xmm0,xmm3
    punpckhbw xmm2,xmm3
    movdqa [rsp+80h],xmm0
    movdqa xmm0,[rsi+rbp]
    movdqa [rsp+10h],xmm13
    punpckhbw xmm0,xmm3
    movdqa [rsp+50h],xmm0
    movdqa xmm0,xmm1
    movdqa xmm1,xmm13
    psubw xmm0,xmm13
    psubw xmm1,xmm2
    pabsw xmm3,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,[rsp]
    movdqa xmm13,[rsp+40h]
    movdqa [rsp+110h],xmm2
    psubw xmm1, [rsp+80h]
    pcmpgtw xmm13,xmm0
    pcmpgtw xmm11,xmm3
    pabsw xmm0,xmm1
    pcmpgtw xmm10,xmm3
    movdqa xmm1, [rsp+40h]
    movdqa xmm2,xmm1
    movdqa xmm3,xmm1
    pcmpgtw xmm2,xmm0
    movdqa xmm0, [rsp+10h]
    pand xmm13,xmm2
    pand xmm13,xmm11
    movdqa xmm11,[rsp+0C0h]
    psubw xmm0,xmm11
    pabsw xmm0,xmm0
    pcmpgtw xmm3,xmm0
    pand xmm3,xmm10
    movdqa xmm0,[rsp]
    psubw xmm0,[rsp+50h]
    movdqa xmm2,[rdx]
    pabsw xmm0,xmm0
    por xmm7,xmm9
    movdqa xmm9,[rsp+20h]
    pcmpgtw xmm1,xmm0
    pand xmm9,xmm7
    movdqa xmm7,[rsp+20h]
    movdqa xmm0,xmm7
    pandn xmm0,xmm12
    movdqa xmm12,[rsp+110h]
    pand xmm1,xmm10
    movdqa xmm10,[rsp+70h]
    movdqa [rsp+40h],xmm1
    movdqa xmm1,xmm13
    por xmm9,xmm0
    pxor xmm0,xmm0
    por xmm4,xmm6
    movdqa xmm6,xmm7
    punpckhbw xmm2,xmm0
    por xmm15,xmm5
    movdqa xmm5,[rsp+20h]
    movdqa xmm0,xmm3
    psllw xmm2,1
    pandn xmm0,xmm11
    pand xmm6,xmm4
    movdqa xmm4,[rsp]
    paddw xmm2,xmm11
    pand xmm5,xmm15
    movdqa xmm15,[rsp+20h]
    paddw xmm2,xmm11
    paddw xmm2,xmm11
    paddw xmm2,xmm12
    paddw xmm2,[rsp+10h]
    paddw xmm2,[rsp]
    paddw xmm2,xmm10
    psraw xmm2,3
    pand xmm2,xmm3
    por xmm2,xmm0
    pand xmm1,xmm2
    movdqa xmm0,xmm13
    movdqa xmm2,xmm11
    pandn xmm0,xmm11
    paddw xmm2,xmm12
    por xmm1,xmm0
    packuswb xmm9,xmm1                  ; xmm9 = new p2 row
    movdqa xmm0,xmm7
    movdqa xmm7,[rsp+0A0h]
    pandn xmm0,[rsp+0F0h]
    movdqa xmm1,xmm3
    por xmm6,xmm0
    movdqa xmm0,[rsp+10h]
    paddw xmm0,xmm4
    paddw xmm2,xmm0
    paddw xmm2,xmm7
    movdqa xmm0,xmm3
    pandn xmm0,xmm12
    psraw xmm2,2
    pand xmm1,xmm2
    por xmm1,xmm0
    movdqa xmm2,xmm13
    movdqa xmm0,xmm13
    pand xmm2,xmm1
    pandn xmm0,xmm12
    movdqa xmm1,xmm12
    paddw xmm1,[rsp+10h]
    por xmm2,xmm0
    movdqa xmm0,xmm15
    pandn xmm0,[rsp+0B0h]
    paddw xmm1,xmm4
    packuswb xmm6,xmm2                  ; xmm6 = new p1 row
    movdqa xmm2,xmm3
    psllw xmm1,1
    por xmm5,xmm0
    movdqa xmm0,[rsp+80h]
    paddw xmm0,xmm10
    paddw xmm1,xmm0
    paddw xmm11,xmm1
    psraw xmm11,3
    movdqa xmm1,xmm12
    pand xmm2,xmm11
    paddw xmm1,xmm12
    movdqa xmm11,[rsp+80h]
    movdqa xmm0, [rsp+10h]
    por xmm14,[rsp+0E0h]
    paddw xmm0,xmm11
    movdqa xmm4,xmm15
    paddw xmm1,xmm0
    movdqa xmm0,xmm13
    paddw xmm1,xmm7
    psraw xmm1,2
    pandn xmm3,xmm1
    por xmm2,xmm3
    movdqa xmm1,xmm13
    movdqa xmm3,[rsp+10h]
    pandn xmm0,xmm3
    pand xmm1,xmm2
    movdqa xmm2,xmm11
    paddw xmm2,[rsp]
    por xmm1,xmm0
    movdqa xmm0,[rsp+0D0h]
    por xmm0,xmm8
    paddw xmm2,xmm3
    packuswb xmm5,xmm1                  ; xmm5 = new p0 row
    movdqa xmm8,[rsp+40h]
    movdqa xmm1,[rsp+50h]
    movdqa xmm3,xmm8
    pand xmm4,xmm0
    psllw xmm2,1
    movdqa xmm0,xmm15
    pandn xmm0,[rsp+90h]
    por xmm4,xmm0
    movdqa xmm0,xmm12
    paddw xmm0,xmm10
    paddw xmm2,xmm0
    paddw xmm1,xmm2
    movdqa xmm0,[rsp]
    movdqa xmm2,xmm11
    paddw xmm0,xmm12
    movdqa xmm12,[rsp]
    paddw xmm2,xmm11
    paddw xmm2,xmm0
    psraw xmm1,3
    movdqa xmm0,xmm8
    pand xmm3,xmm1
    paddw xmm2,xmm7
    movdqa xmm1,xmm13
    psraw xmm2,2
    pandn xmm0,xmm2
    por xmm3,xmm0
    movdqa xmm2,[rsp+50h]
    movdqa xmm0,xmm13
    pandn xmm0,xmm12
    pand xmm1,xmm3
    paddw xmm2,xmm11
    movdqa xmm3,xmm15
    por xmm1,xmm0
    pand xmm3,xmm14
    movdqa xmm14,[rsp+10h]
    movdqa xmm0,xmm15
    pandn xmm0,[rsp+30h]
    packuswb xmm4,xmm1                  ; xmm4 = new q0 row
    movdqa xmm1,xmm8
    por xmm3,xmm0
    movdqa xmm0,xmm12
    paddw xmm0,xmm14
    paddw xmm2,xmm0
    paddw xmm2,xmm7
    movdqa xmm0,xmm8
    pandn xmm0,xmm11
    psraw xmm2,2
    pand xmm1,xmm2
    por xmm1,xmm0
    movdqa xmm2,xmm13
    movdqa xmm0,xmm13
    pandn xmm0,xmm11
    pand xmm2,xmm1
    movdqa xmm1,xmm15
    por xmm2,xmm0
    packuswb xmm3,xmm2                  ; xmm3 = new q1 row
    movdqa xmm0,[rsp+100h]
    por xmm0,[rsp+120h]
    pand xmm1,xmm0
    movdqa xmm2,[rcx+rbp]
    movdqa xmm7,[rsp+50h]
    pandn xmm15,[rsp+60h]
    pxor xmm0,xmm0
    por xmm1,xmm15
    movdqa [rdi],xmm9                   ; store p2
    punpckhbw xmm2,xmm0
    psllw xmm2,1
    paddw xmm2,xmm7
    paddw xmm2,xmm7
    movdqa [rbx],xmm6                   ; store p1
    paddw xmm2,xmm7
    paddw xmm2,xmm11
    paddw xmm2,xmm12
    paddw xmm2,xmm14
    paddw xmm2,xmm10
    psraw xmm2,3
    movdqa xmm0,xmm13
    pand xmm2,xmm8
    pandn xmm8,xmm7
    pandn xmm13,xmm7
    por xmm2,xmm8
    movdqa [r8],xmm5                    ; store p0
    pand xmm0,xmm2
    por xmm0,xmm13
    packuswb xmm1,xmm0                  ; xmm1 = new q2 row
    movdqa [rbp],xmm4                   ; store q0
    movdqa [r10+rbp],xmm3               ; store q1
    movdqa [rsi+rbp],xmm1               ; store q2
    add rsp,1D8h
    pop rbp
    pop rbx
    ret
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockChromaLt4V_ssse3                    (UNIX64 / System V AMD64 build)
;
; Filters one 8-pixel-wide edge between two rows in each of two planes at
; once (first plane in the low 8 bytes, second plane in the high 8 bytes of
; each XMM register), with a per-2-pixel clipping value.
;
; In:   rdi = first plane,  pointer to the row below the edge (q0)
;       rsi = second plane, pointer to the row below the edge (q0)
;       edx = row stride in bytes (sign-extended)
;       ecx = threshold compared against |p0 - q0| (low 16 bits broadcast)
;       r8d = iBeta
;       r9  = pointer to four signed bytes; each is used for two pixels
;       presumably the C prototype is
;       (pPixCb, pPixCr, iStride, iAlpha, iBeta, pTC) -- confirm against the
;       C header.
; Out:  rows p0 and q0 of both planes are rewritten (8 bytes each).
;
; Frame: two pushes + 0C8h bytes leaves rsp 16-byte aligned.
;   [rsp+00h] = clip vector  tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 (words)
;   [rsp+10h] = q0 of the second plane, widened to words
; XMM registers are caller-saved under System V, so nothing is restored.
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4V_ssse3
ALIGN 16
DeblockChromaLt4V_ssse3:
    push rbx
    push rbp
    ; Re-map the System V argument registers onto the registers the body uses.
    mov r10, rdx                        ; r10 = stride
    mov r11, rcx                        ; r11 = fourth argument (alpha threshold)
    mov rcx, rdi                        ; rcx = first plane
    mov rdx, rsi                        ; rdx = second plane
    mov rsi, r10                        ; rsi = stride
    mov r10, r9                         ; r10 = clip-value pointer
    mov rbp, r8                         ; rbp = iBeta
    mov r8, rsi                         ; r8  = stride
    mov r9, r11                         ; r9  = alpha threshold
    sub rsp,0C8h
    pxor xmm1,xmm1
    mov rbx,rcx                         ; rbx = first plane, q0 row
    movsxd r11,r8d                      ; r11 = stride
    movsx ecx,byte [r10]
    movsx r8d,byte [r10+2]
    mov rdi,rdx                         ; rdi = second plane, q0 row
    movq xmm2,[rbx]
    movq xmm9,[r11+rbx]
    movsx edx,byte [r10+1]
    mov word [rsp+2],cx
    mov word [rsp],cx
    movsx eax,byte [r10+3]
    mov word [rsp+6],dx
    mov word [rsp+4],dx
    movdqa xmm11,xmm1
    mov word [rsp+0Eh],ax
    mov word [rsp+0Ch],ax
    lea eax,[r11+r11]
    movsxd rcx,eax                      ; rcx = 2*stride
    mov rax,rbx
    mov rdx,rdi
    sub rax,rcx
    mov word [rsp+0Ah],r8w
    mov word [rsp+8],r8w
    movdqa xmm6,[rsp]                   ; xmm6 = clip vector
    movdqa xmm7,xmm6
    movq xmm13, [rax]                   ; p1, first plane
    mov rax,rdi
    sub rax,rcx
    mov rcx,rbx
    pcmpgtw xmm7,xmm1                   ; xmm7 = (clip > 0) mask
    psubw xmm11,xmm6                    ; xmm11 = -clip
    sub rcx,r11                         ; rcx = first plane, p0 row
    sub rdx,r11                         ; rdx = second plane, p0 row
    movq xmm0,[rax]                     ; p1, second plane
    movsx eax,r9w
    movq xmm15,[rcx]
    punpcklqdq xmm13,xmm0
    movq xmm0, [rdx]
    movdqa xmm4,xmm13
    punpcklqdq xmm15,xmm0
    movq xmm0, [rdi]
    punpcklbw xmm4,xmm1
    movdqa xmm12,xmm15
    punpcklqdq xmm2,xmm0
    movq xmm0, [r11+rdi]
    punpcklbw xmm12,xmm1
    movdqa xmm14,xmm2
    punpcklqdq xmm9,xmm0
    punpckhbw xmm2,xmm1
    punpcklbw xmm14,xmm1
    movd xmm0,eax
    mov eax, ebp ; iBeta
    punpckhbw xmm13,xmm1
    punpckhbw xmm15,xmm1
    movdqa xmm3,xmm9
    movdqa [rsp+10h],xmm2
    punpcklwd xmm0,xmm0
    punpckhbw xmm9,xmm1
    punpcklbw xmm3,xmm1
    movdqa xmm1,xmm14
    pshufd xmm10,xmm0,0                 ; xmm10 = alpha threshold in all words
    movd xmm0,eax
    mov eax,4
    cwde
    punpcklwd xmm0,xmm0
    pshufd xmm8,xmm0,0                  ; xmm8 = iBeta in all words
    movd xmm0,eax
    punpcklwd xmm0,xmm0
    pshufd xmm5,xmm0,0                  ; xmm5 = 4 in all words
    ; ---- first plane (low halves) -------------------------------------------
    psubw xmm1,xmm12
    movdqa xmm2,xmm10
    psllw xmm1,2
    movdqa xmm0,xmm4
    psubw xmm4,xmm12
    psubw xmm0,xmm3
    psubw xmm3,xmm14
    paddw xmm1,xmm0
    paddw xmm1,xmm5
    movdqa xmm0,xmm11
    psraw xmm1,3
    pmaxsw xmm0,xmm1
    pminsw xmm6,xmm0                    ; delta clipped to [-clip, clip]
    movdqa xmm1,xmm8
    movdqa xmm0,xmm12
    psubw xmm0,xmm14
    pabsw xmm0,xmm0
    pcmpgtw xmm2,xmm0
    pabsw xmm0,xmm4
    pcmpgtw xmm1,xmm0
    pabsw xmm0,xmm3
    movdqa xmm3,[rsp]
    pand xmm2,xmm1
    movdqa xmm1,xmm8
    pcmpgtw xmm1,xmm0
    movdqa xmm0,xmm13
    pand xmm2,xmm1
    psubw xmm0,xmm9
    psubw xmm13,xmm15
    pand xmm2,xmm7
    pand xmm6,xmm2
    paddw xmm12,xmm6                    ; p0 += delta
    psubw xmm14,xmm6                    ; q0 -= delta
    ; ---- second plane (high halves) -----------------------------------------
    movdqa xmm2,[rsp+10h]
    movdqa xmm1,xmm2
    psubw xmm1,xmm15
    psubw xmm9,xmm2
    psllw xmm1,2
    paddw xmm1,xmm0
    paddw xmm1,xmm5
    movdqa xmm0,xmm15
    psubw xmm0,xmm2
    psraw xmm1,3
    pmaxsw xmm11,xmm1
    pabsw xmm0,xmm0
    movdqa xmm1,xmm8
    pcmpgtw xmm10,xmm0
    pabsw xmm0,xmm13
    pminsw xmm3,xmm11
    pcmpgtw xmm1,xmm0
    pabsw xmm0,xmm9
    pand xmm10,xmm1
    pcmpgtw xmm8,xmm0
    pand xmm10,xmm8
    pand xmm10,xmm7
    pand xmm3,xmm10
    paddw xmm15,xmm3
    psubw xmm2,xmm3
    packuswb xmm12,xmm15                ; new p0: first plane | second plane
    packuswb xmm14,xmm2                 ; new q0: first plane | second plane
    movq [rcx],xmm12
    movq [rbx],xmm14
    psrldq xmm12,8
    psrldq xmm14,8
    movq [rdx],xmm12
    movq [rdi],xmm14
    add rsp,0C8h
    pop rbp
    pop rbx
    ret
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockChromaEq4V_ssse3                    (UNIX64 / System V AMD64 build)
;
; Filters one 8-pixel-wide edge between two rows in each of two planes at
; once: the first plane occupies the low 8 bytes and the second plane the
; high 8 bytes of every row register.
;
; In:   rdi = first plane,  pointer to the row below the edge (q0)
;       rsi = second plane, pointer to the row below the edge (q0)
;       edx = row stride in bytes
;       ecx = threshold compared against |p0 - q0| (low 16 bits broadcast)
;       r8d = iBeta
; Out:  where all three threshold tests pass,
;         p0 = (2*p1 + p0 + q1 + 2) >> 2
;         q0 = (2*q1 + q0 + p1 + 2) >> 2
;       other pixels are written back unchanged.
;
; NOTE(review): the bare date/time lines inside this routine are not NASM
; syntax; they look like residue from a blame view and must be removed
; before this file can be assembled.
; NOTE(review): the copy of rsp in rax is never read and the 90h-byte frame
; is never accessed (its only user is the commented-out movaps below).
;*******************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3
DeblockChromaEq4V_ssse3:
mov rax,rsp
push rbx
2014-01-03 07:49:45 +01:00
push rbp
; Re-map the System V argument registers onto the registers the body uses.
mov rbp, r8                             ; rbp = iBeta
mov r8, rdx                             ; r8  = stride
mov r9, rcx                             ; r9  = alpha threshold
mov rcx, rdi                            ; rcx = first plane
mov rdx, rsi                            ; rdx = second plane
sub rsp,90h
pxor xmm1,xmm1
mov r11,rcx                             ; r11 = first plane, q0 row
mov rbx,rdx                             ; rbx = second plane, q0 row
mov r10d,r9d
movq xmm13,[r11]
lea eax,[r8+r8]
movsxd r9,eax                           ; r9 = 2*stride
mov rax,rcx
sub rax,r9
movq xmm14,[rax]                        ; p1, first plane
mov rax,rdx
sub rax,r9
movq xmm0,[rax]                         ; p1, second plane
movsxd rax,r8d                          ; rax = stride
sub rcx,rax                             ; rcx = first plane, p0 row
sub rdx,rax                             ; rdx = second plane, p0 row
movq xmm12,[rax+r11]
movq xmm10,[rcx]
punpcklqdq xmm14,xmm0
movdqa xmm8,xmm14
movq xmm0,[rdx]
punpcklbw xmm8,xmm1
punpckhbw xmm14,xmm1
punpcklqdq xmm10,xmm0
movq xmm0,[rbx]
movdqa xmm5,xmm10
punpcklqdq xmm13,xmm0
movq xmm0, [rax+rbx]
punpcklbw xmm5,xmm1
movsx eax,r10w
movdqa xmm9,xmm13
punpcklqdq xmm12,xmm0
punpcklbw xmm9,xmm1
punpckhbw xmm10,xmm1
movd xmm0,eax
2014-01-03 07:49:45 +01:00
mov eax, ebp ; iBeta
punpckhbw xmm13,xmm1
movdqa xmm7,xmm12
punpcklwd xmm0,xmm0
punpckhbw xmm12,xmm1
pshufd xmm11,xmm0,0                     ; xmm11 = alpha threshold in all words
punpcklbw xmm7,xmm1
movd xmm0,eax
movdqa xmm1,xmm8
psubw xmm1,xmm5
punpcklwd xmm0,xmm0
movdqa xmm6,xmm11
pshufd xmm3,xmm0,0                      ; xmm3 = iBeta in all words
movdqa xmm0,xmm5
psubw xmm0,xmm9
movdqa xmm2,xmm3
pabsw xmm0,xmm0
pcmpgtw xmm6,xmm0
pabsw xmm0,xmm1
movdqa xmm1,xmm3
pcmpgtw xmm2,xmm0
pand xmm6,xmm2
movdqa xmm0,xmm7
movdqa xmm2,xmm3
psubw xmm0,xmm9
pabsw xmm0,xmm0
pcmpgtw xmm1,xmm0
pand xmm6,xmm1                          ; xmm6 = filter mask, first plane
movdqa xmm0,xmm10
movdqa xmm1,xmm14
psubw xmm0,xmm13
psubw xmm1,xmm10
pabsw xmm0,xmm0
pcmpgtw xmm11,xmm0
pabsw xmm0,xmm1
pcmpgtw xmm2,xmm0
pand xmm11,xmm2
movdqa xmm0,xmm12
movdqa xmm4,xmm6
movdqa xmm1,xmm8
mov eax,2
cwde
paddw xmm1,xmm8
psubw xmm0,xmm13
paddw xmm1,xmm5
pabsw xmm0,xmm0
movdqa xmm2,xmm14
paddw xmm1,xmm7
pcmpgtw xmm3,xmm0
paddw xmm2,xmm14
movd xmm0,eax
pand xmm11,xmm3                         ; xmm11 = filter mask, second plane
paddw xmm7,xmm7
paddw xmm2,xmm10
punpcklwd xmm0,xmm0
paddw xmm2,xmm12
paddw xmm12,xmm12
pshufd xmm3,xmm0,0                      ; xmm3 = 2 in all words (rounding term)
paddw xmm7,xmm9
paddw xmm12,xmm13
movdqa xmm0,xmm6
paddw xmm1,xmm3
pandn xmm0,xmm5
paddw xmm7,xmm8
psraw xmm1,2
paddw xmm12,xmm14
paddw xmm7,xmm3
;movaps xmm14,[rsp]
pand xmm4,xmm1
paddw xmm12,xmm3
psraw xmm7,2
movdqa xmm1,xmm11
por xmm4,xmm0
psraw xmm12,2
paddw xmm2,xmm3
movdqa xmm0,xmm11
pandn xmm0,xmm10
psraw xmm2,2
pand xmm1,xmm2
por xmm1,xmm0
packuswb xmm4,xmm1                      ; xmm4 = new p0: first | second plane
movdqa xmm0,xmm11
movdqa xmm1,xmm6
pand xmm1,xmm7
movq [rcx],xmm4                         ; store p0, first plane
pandn xmm6,xmm9
pandn xmm11,xmm13
pand xmm0,xmm12
por xmm1,xmm6
por xmm0,xmm11
psrldq xmm4,8
packuswb xmm1,xmm0                      ; xmm1 = new q0: first | second plane
movq [r11],xmm1                         ; store q0, first plane
psrldq xmm1,8
movq [rdx],xmm4                         ; store p0, second plane
lea r11,[rsp+90h]
movq [rbx],xmm1                         ; store q0, second plane
mov rsp,r11
2014-01-03 07:49:45 +01:00
pop rbp
pop rbx
2014-01-03 07:49:45 +01:00
ret
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockChromaEq4H_ssse3                    (UNIX64 / System V AMD64 build)
;
; Filters a vertical edge (between two columns) over 8 rows in each of two
; planes.  Four bytes starting two columns left of the edge are read from
; every row, transposed into column vectors, filtered, transposed back and
; written to the same locations.
;
; In:   rdi = first plane,  pointer to the first pixel right of the edge
;       rsi = second plane, pointer to the first pixel right of the edge
;       edx = row stride in bytes
;       ecx = threshold compared against |p0 - q0| (low 16 bits broadcast)
;       r8d = iBeta
; Out:  4 bytes at [row - 2] are rewritten for 8 rows of each plane; of
;       these only p0 and q0 can change value.
;
; Frame: three pushes + 140h bytes leaves rsp 16-byte aligned (r12 is not
; otherwise used by this routine).  [rsp+10h..8Fh] is scratch for the
; gather/scatter, [rsp] holds the untouched p1 column.
;*******************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
DeblockChromaEq4H_ssse3:
    push rbx
    push rbp
    push r12
    ; Re-map the System V argument registers onto the registers the body uses.
    mov rbp, r8                         ; rbp = iBeta
    mov r8, rdx                         ; r8  = stride
    mov r9, rcx                         ; r9  = alpha threshold
    mov rcx, rdi                        ; rcx = first plane
    mov rdx, rsi                        ; rdx = second plane
    mov rdi, rdx                        ; rdi = second plane
    sub rsp,140h
    ; ---- gather 4 bytes from each of the 16 rows ----------------------------
    lea eax,[r8*4]
    movsxd r10,eax
    mov eax,[rcx-2]
    mov [rsp+10h],eax
    lea rbx,[r10+rdx-2]                 ; rbx = second plane, row 4, column -2
    lea r11,[r10+rcx-2]                 ; r11 = first plane,  row 4, column -2
    movdqa xmm5,[rsp+10h]
    movsxd r10,r8d                      ; r10 = stride
    mov eax,[r10+rcx-2]
    lea rdx,[r10+r10*2]                 ; rdx = 3*stride
    mov [rsp+20h],eax
    mov eax,[rcx+r10*2-2]
    mov [rsp+30h],eax
    mov eax,[rdx+rcx-2]
    movdqa xmm2,[rsp+20h]
    mov [rsp+40h],eax
    mov eax, [rdi-2]
    movdqa xmm4,[rsp+30h]
    mov [rsp+50h],eax
    mov eax,[r10+rdi-2]
    movdqa xmm3,[rsp+40h]
    mov [rsp+60h],eax
    mov eax,[rdi+r10*2-2]
    punpckldq xmm5,[rsp+50h]
    mov [rsp+70h],eax
    mov eax, [rdx+rdi-2]
    punpckldq xmm2, [rsp+60h]
    mov [rsp+80h],eax
    mov eax,[r11]
    punpckldq xmm4, [rsp+70h]
    mov [rsp+50h],eax
    mov eax,[rbx]
    punpckldq xmm3,[rsp+80h]
    mov [rsp+60h],eax
    mov eax,[r10+r11]
    movdqa xmm0, [rsp+50h]
    punpckldq xmm0, [rsp+60h]
    punpcklqdq xmm5,xmm0
    movdqa [rsp+50h],xmm0
    mov [rsp+50h],eax
    mov eax,[r10+rbx]
    movdqa xmm0,[rsp+50h]
    movdqa xmm1,xmm5
    mov [rsp+60h],eax
    mov eax,[r11+r10*2]
    punpckldq xmm0, [rsp+60h]
    punpcklqdq xmm2,xmm0
    punpcklbw xmm1,xmm2
    punpckhbw xmm5,xmm2
    movdqa [rsp+50h],xmm0
    mov [rsp+50h],eax
    mov eax,[rbx+r10*2]
    movdqa xmm0,[rsp+50h]
    mov [rsp+60h],eax
    mov eax, [rdx+r11]
    movdqa xmm15,xmm1
    punpckldq xmm0,[rsp+60h]
    punpcklqdq xmm4,xmm0
    movdqa [rsp+50h],xmm0
    mov [rsp+50h],eax
    mov eax, [rdx+rbx]
    movdqa xmm0,[rsp+50h]
    mov [rsp+60h],eax
    punpckldq xmm0, [rsp+60h]
    punpcklqdq xmm3,xmm0
    ; ---- transpose rows into the four column vectors ------------------------
    movdqa xmm0,xmm4
    punpcklbw xmm0,xmm3
    punpckhbw xmm4,xmm3
    punpcklwd xmm15,xmm0
    punpckhwd xmm1,xmm0
    movdqa xmm0,xmm5
    movdqa xmm12,xmm15
    punpcklwd xmm0,xmm4
    punpckhwd xmm5,xmm4
    punpckldq xmm12,xmm0
    punpckhdq xmm15,xmm0
    movdqa xmm0,xmm1
    movdqa xmm11,xmm12
    punpckldq xmm0,xmm5
    punpckhdq xmm1,xmm5
    punpcklqdq xmm11,xmm0               ; xmm11 = p1 column
    punpckhqdq xmm12,xmm0               ; xmm12 = p0 column
    movsx eax,r9w
    movdqa xmm14,xmm15
    punpcklqdq xmm14,xmm1               ; xmm14 = q0 column
    punpckhqdq xmm15,xmm1               ; xmm15 = q1 column
    pxor xmm1,xmm1
    movd xmm0,eax
    movdqa xmm4,xmm12
    movdqa xmm8,xmm11
    mov eax, ebp ; iBeta
    punpcklwd xmm0,xmm0
    punpcklbw xmm4,xmm1
    punpckhbw xmm12,xmm1
    movdqa xmm9,xmm14
    movdqa xmm7,xmm15
    movdqa xmm10,xmm15
    pshufd xmm13,xmm0,0                 ; xmm13 = alpha threshold in all words
    punpcklbw xmm9,xmm1
    punpckhbw xmm14,xmm1
    movdqa xmm6,xmm13
    movd xmm0,eax
    movdqa [rsp],xmm11                  ; keep p1 bytes for the write-back
    mov eax,2
    cwde
    punpckhbw xmm11,xmm1
    punpckhbw xmm10,xmm1
    punpcklbw xmm7,xmm1
    punpcklwd xmm0,xmm0
    punpcklbw xmm8,xmm1
    pshufd xmm3,xmm0,0                  ; xmm3 = iBeta in all words
    ; ---- threshold masks and filter -----------------------------------------
    movdqa xmm1,xmm8
    movdqa xmm0,xmm4
    psubw xmm0,xmm9
    psubw xmm1,xmm4
    movdqa xmm2,xmm3
    pabsw xmm0,xmm0
    pcmpgtw xmm6,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,xmm3
    pcmpgtw xmm2,xmm0
    pand xmm6,xmm2
    movdqa xmm0,xmm7
    movdqa xmm2,xmm3
    psubw xmm0,xmm9
    pabsw xmm0,xmm0
    pcmpgtw xmm1,xmm0
    pand xmm6,xmm1
    movdqa xmm0,xmm12
    movdqa xmm1,xmm11
    psubw xmm0,xmm14
    psubw xmm1,xmm12
    movdqa xmm5,xmm6
    pabsw xmm0,xmm0
    pcmpgtw xmm13,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,xmm8
    pcmpgtw xmm2,xmm0
    paddw xmm1,xmm8
    movdqa xmm0,xmm10
    pand xmm13,xmm2
    psubw xmm0,xmm14
    paddw xmm1,xmm4
    movdqa xmm2,xmm11
    pabsw xmm0,xmm0
    paddw xmm2,xmm11
    paddw xmm1,xmm7
    pcmpgtw xmm3,xmm0
    paddw xmm2,xmm12
    movd xmm0,eax
    pand xmm13,xmm3
    paddw xmm2,xmm10
    punpcklwd xmm0,xmm0
    pshufd xmm3,xmm0,0                  ; xmm3 = 2 in all words (rounding term)
    movdqa xmm0,xmm6
    paddw xmm1,xmm3
    pandn xmm0,xmm4
    paddw xmm2,xmm3
    psraw xmm1,2
    pand xmm5,xmm1
    por xmm5,xmm0
    paddw xmm7,xmm7
    paddw xmm10,xmm10
    psraw xmm2,2
    movdqa xmm1,xmm13
    movdqa xmm0,xmm13
    pandn xmm0,xmm12
    pand xmm1,xmm2
    paddw xmm7,xmm9
    por xmm1,xmm0
    paddw xmm10,xmm14
    paddw xmm7,xmm8
    movdqa xmm0,xmm13
    packuswb xmm5,xmm1                  ; xmm5 = new p0 column
    paddw xmm7,xmm3
    paddw xmm10,xmm11
    movdqa xmm1,xmm6
    paddw xmm10,xmm3
    pandn xmm6,xmm9
    psraw xmm7,2
    pand xmm1,xmm7
    psraw xmm10,2
    pandn xmm13,xmm14
    pand xmm0,xmm10
    por xmm1,xmm6
    ; ---- transpose back and scatter -----------------------------------------
    movdqa xmm6,[rsp]
    movdqa xmm4,xmm6
    por xmm0,xmm13
    punpcklbw xmm4,xmm5
    punpckhbw xmm6,xmm5
    movdqa xmm3,xmm4
    packuswb xmm1,xmm0                  ; xmm1 = new q0 column
    movdqa xmm0,xmm1
    punpckhbw xmm1,xmm15
    punpcklbw xmm0,xmm15
    punpcklwd xmm3,xmm0
    punpckhwd xmm4,xmm0
    movdqa xmm0,xmm6
    movdqa xmm2,xmm3
    punpcklwd xmm0,xmm1
    punpckhwd xmm6,xmm1
    movdqa xmm1,xmm4
    punpckldq xmm2,xmm0
    punpckhdq xmm3,xmm0
    punpckldq xmm1,xmm6
    movdqa xmm0,xmm2
    punpcklqdq xmm0,xmm1
    punpckhdq xmm4,xmm6
    punpckhqdq xmm2,xmm1
    movdqa [rsp+10h],xmm0
    movdqa [rsp+60h],xmm2
    movdqa xmm0,xmm3
    mov eax,[rsp+10h]
    mov [rcx-2],eax
    mov eax,[rsp+60h]
    punpcklqdq xmm0,xmm4
    punpckhqdq xmm3,xmm4
    mov [r10+rcx-2],eax
    movdqa [rsp+20h],xmm0
    mov eax, [rsp+20h]
    movdqa [rsp+70h],xmm3
    mov [rcx+r10*2-2],eax
    mov eax,[rsp+70h]
    mov [rdx+rcx-2],eax
    mov eax,[rsp+18h]
    mov [r11],eax
    mov eax,[rsp+68h]
    mov [r10+r11],eax
    mov eax,[rsp+28h]
    mov [r11+r10*2],eax
    mov eax,[rsp+78h]
    mov [rdx+r11],eax
    mov eax,[rsp+14h]
    mov [rdi-2],eax
    mov eax,[rsp+64h]
    mov [r10+rdi-2],eax
    mov eax,[rsp+24h]
    mov [rdi+r10*2-2],eax
    mov eax, [rsp+74h]
    mov [rdx+rdi-2],eax
    mov eax, [rsp+1Ch]
    mov [rbx],eax
    mov eax, [rsp+6Ch]
    mov [r10+rbx],eax
    mov eax,[rsp+2Ch]
    mov [rbx+r10*2],eax
    mov eax,[rsp+7Ch]
    mov [rdx+rbx],eax
    add rsp,140h
    pop r12
    pop rbp
    pop rbx
    ret
2014-01-16 08:57:22 +01:00
;*******************************************************************************
; DeblockChromaLt4H_ssse3                    (UNIX64 / System V AMD64 build)
;
; Filters a vertical edge (between two columns) over 8 rows in each of two
; planes, with a per-row-pair clipping value.  Four bytes starting two
; columns left of the edge are read from every row, transposed into column
; vectors, filtered, transposed back and written to the same locations.
;
; In:   rdi, rsi = the two plane pointers (first pixel right of the edge)
;       edx = row stride in bytes
;       ecx = threshold compared against |p0 - q0| (low 16 bits broadcast)
;       r8d = iBeta
;       r9  = pTC, four signed bytes
;       presumably the C prototype is
;       (pPixCb, pPixCr, iStride, iAlpha, iBeta, pTC) -- confirm against the
;       C header.
; Out:  4 bytes at [row - 2] are rewritten for 8 rows of each plane; of
;       these only p0 and q0 can change value.
;
; Frame: five pushes + 170h bytes leaves rsp 16-byte aligned.
;
; NOTE(review): the bare date/time lines inside this routine are not NASM
; syntax; they look like residue from a blame view and must be removed
; before this file can be assembled.
; NOTE(review): the copy of rsp in rax is never read.
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
2014-01-03 07:49:45 +01:00
ALIGN 16
2014-01-16 08:57:22 +01:00
DeblockChromaLt4H_ssse3:
mov rax,rsp
push rbx
push rbp
push r12
2014-01-03 07:49:45 +01:00
push r13
push r14
sub rsp,170h
2014-01-03 07:49:45 +01:00
; Re-map the System V argument registers onto the registers the body uses.
mov r13, r8                             ; r13 = iBeta
mov r14, r9                             ; r14 = pTC
mov r8, rdx                             ; r8  = stride
mov r9, rcx                             ; r9  = alpha threshold
mov rdx, rdi                            ; rdx = first pointer argument
mov rcx, rsi                            ; rcx = second pointer argument
; ---- gather 4 bytes from each of the 16 rows --------------------------------
movsxd rsi,r8d                          ; rsi = stride
lea eax,[r8*4]
mov r11d,r9d
movsxd r10,eax
mov eax, [rcx-2]
mov r12,rdx                             ; r12 = plane held in rdx
mov [rsp+40h],eax
mov eax, [rsi+rcx-2]
lea rbx,[r10+rcx-2]                     ; rbx = rcx plane, row 4, column -2
movdqa xmm5,[rsp+40h]
mov [rsp+50h],eax
mov eax, [rcx+rsi*2-2]
lea rbp,[r10+rdx-2]                     ; rbp = rdx plane, row 4, column -2
movdqa xmm2, [rsp+50h]
mov [rsp+60h],eax
lea r10,[rsi+rsi*2]                     ; r10 = 3*stride
mov rdi,rcx                             ; rdi = plane held in rcx
mov eax,[r10+rcx-2]
movdqa xmm4,[rsp+60h]
mov [rsp+70h],eax
mov eax,[rdx-2]
mov [rsp+80h],eax
mov eax, [rsi+rdx-2]
movdqa xmm3,[rsp+70h]
mov [rsp+90h],eax
mov eax,[rdx+rsi*2-2]
punpckldq xmm5,[rsp+80h]
mov [rsp+0A0h],eax
mov eax, [r10+rdx-2]
punpckldq xmm2,[rsp+90h]
mov [rsp+0B0h],eax
mov eax, [rbx]
punpckldq xmm4,[rsp+0A0h]
mov [rsp+80h],eax
mov eax,[rbp]
punpckldq xmm3,[rsp+0B0h]
mov [rsp+90h],eax
mov eax,[rsi+rbx]
movdqa xmm0,[rsp+80h]
punpckldq xmm0,[rsp+90h]
punpcklqdq xmm5,xmm0
movdqa [rsp+80h],xmm0
mov [rsp+80h],eax
mov eax,[rsi+rbp]
movdqa xmm0,[rsp+80h]
movdqa xmm1,xmm5
mov [rsp+90h],eax
mov eax,[rbx+rsi*2]
punpckldq xmm0,[rsp+90h]
punpcklqdq xmm2,xmm0
punpcklbw xmm1,xmm2
punpckhbw xmm5,xmm2
movdqa [rsp+80h],xmm0
mov [rsp+80h],eax
mov eax,[rbp+rsi*2]
movdqa xmm0, [rsp+80h]
mov [rsp+90h],eax
mov eax,[r10+rbx]
movdqa xmm7,xmm1
punpckldq xmm0,[rsp+90h]
punpcklqdq xmm4,xmm0
movdqa [rsp+80h],xmm0
mov [rsp+80h],eax
mov eax, [r10+rbp]
movdqa xmm0,[rsp+80h]
mov [rsp+90h],eax
punpckldq xmm0,[rsp+90h]
punpcklqdq xmm3,xmm0
; ---- transpose rows into the four column vectors ----------------------------
movdqa xmm0,xmm4
punpcklbw xmm0,xmm3
punpckhbw xmm4,xmm3
punpcklwd xmm7,xmm0
punpckhwd xmm1,xmm0
movdqa xmm0,xmm5
movdqa xmm6,xmm7
punpcklwd xmm0,xmm4
punpckhwd xmm5,xmm4
punpckldq xmm6,xmm0
punpckhdq xmm7,xmm0
movdqa xmm0,xmm1
punpckldq xmm0,xmm5
2014-01-03 07:49:45 +01:00
mov rax, r14 ; pTC
punpckhdq xmm1,xmm5
movdqa xmm9,xmm6
punpckhqdq xmm6,xmm0                    ; xmm6 = p0 column
punpcklqdq xmm9,xmm0                    ; xmm9 = p1 column
movdqa xmm2,xmm7
movdqa xmm13,xmm6
movdqa xmm4,xmm9
movdqa [rsp+10h],xmm9                   ; keep p1 bytes for the write-back
punpcklqdq xmm2,xmm1                    ; xmm2 = q0 column
punpckhqdq xmm7,xmm1                    ; xmm7 = q1 column
pxor xmm1,xmm1
movsx ecx,byte [rax+3]
movsx edx,byte [rax+2]
movsx r8d,byte [rax+1]
movsx r9d,byte [rax]
movdqa xmm10,xmm1
movdqa xmm15,xmm2
punpckhbw xmm2,xmm1
punpckhbw xmm6,xmm1
punpcklbw xmm4,xmm1
movsx eax,r11w
; Build the clip vector tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 (words) at [rsp].
mov word [rsp+0Eh],cx
mov word [rsp+0Ch],cx
movdqa xmm3,xmm7
movdqa xmm8,xmm7
movdqa [rsp+20h],xmm7                   ; keep q1 bytes for the write-back
punpcklbw xmm15,xmm1
punpcklbw xmm13,xmm1
punpcklbw xmm3,xmm1
mov word [rsp+0Ah],dx
mov word [rsp+8],dx
mov word [rsp+6],r8w
movd xmm0,eax
movdqa [rsp+30h],xmm6
punpckhbw xmm9,xmm1
punpckhbw xmm8,xmm1
punpcklwd xmm0,xmm0
2014-01-03 07:49:45 +01:00
mov eax, r13d ; iBeta
mov word [rsp+4],r8w
mov word [rsp+2],r9w
pshufd xmm12,xmm0,0                     ; xmm12 = alpha threshold in all words
mov word [rsp],r9w
movd xmm0,eax
mov eax,4
cwde
movdqa xmm14, [rsp]                     ; xmm14 = clip vector
movdqa [rsp],xmm2                       ; slot reused for the high half of q0
movdqa xmm2,xmm12
punpcklwd xmm0,xmm0
pshufd xmm11,xmm0,0                     ; xmm11 = iBeta in all words
psubw xmm10,xmm14                       ; xmm10 = -clip
movd xmm0,eax
movdqa xmm7,xmm14
movdqa xmm6,xmm14
pcmpgtw xmm7,xmm1                       ; xmm7 = (clip > 0) mask
punpcklwd xmm0,xmm0
pshufd xmm5,xmm0,0                      ; xmm5 = 4 in all words
; ---- low halves -------------------------------------------------------------
movdqa xmm0,xmm4
movdqa xmm1,xmm15
psubw xmm4,xmm13
psubw xmm0,xmm3
psubw xmm1,xmm13
psubw xmm3,xmm15
psllw xmm1,2
paddw xmm1,xmm0
paddw xmm1,xmm5
movdqa xmm0,xmm10
psraw xmm1,3
pmaxsw xmm0,xmm1
pminsw xmm6,xmm0                        ; delta clipped to [-clip, clip]
movdqa xmm1,xmm11
movdqa xmm0,xmm13
psubw xmm0,xmm15
pabsw xmm0,xmm0
pcmpgtw xmm2,xmm0
pabsw xmm0,xmm4
pcmpgtw xmm1,xmm0
pabsw xmm0,xmm3
pand xmm2,xmm1
movdqa xmm1,xmm11
movdqa xmm3,[rsp+30h]
pcmpgtw xmm1,xmm0
movdqa xmm0,xmm9
pand xmm2,xmm1
psubw xmm0,xmm8
psubw xmm9,xmm3
pand xmm2,xmm7
pand xmm6,xmm2
psubw xmm15,xmm6                        ; q0 -= delta
paddw xmm13,xmm6                        ; p0 += delta
; ---- high halves ------------------------------------------------------------
movdqa xmm2,[rsp]
movdqa xmm1,xmm2
psubw xmm1,xmm3
psubw xmm8,xmm2
psllw xmm1,2
paddw xmm1,xmm0
paddw xmm1,xmm5
movdqa xmm0,xmm3
movdqa xmm5,[rsp+10h]
psubw xmm0,xmm2
psraw xmm1,3
movdqa xmm4,xmm5
pabsw xmm0,xmm0
pmaxsw xmm10,xmm1
movdqa xmm1,xmm11
pcmpgtw xmm12,xmm0
pabsw xmm0,xmm9
pminsw xmm14,xmm10
pcmpgtw xmm1,xmm0
pabsw xmm0,xmm8
pcmpgtw xmm11,xmm0
pand xmm12,xmm1
movdqa xmm1,[rsp+20h]
pand xmm12,xmm11
pand xmm12,xmm7
pand xmm14,xmm12
paddw xmm3,xmm14
psubw xmm2,xmm14
packuswb xmm13,xmm3                     ; xmm13 = new p0 column
packuswb xmm15,xmm2                     ; xmm15 = new q0 column
; ---- transpose back and scatter ---------------------------------------------
punpcklbw xmm4,xmm13
punpckhbw xmm5,xmm13
movdqa xmm0,xmm15
punpcklbw xmm0,xmm1
punpckhbw xmm15,xmm1
movdqa xmm3,xmm4
punpcklwd xmm3,xmm0
punpckhwd xmm4,xmm0
movdqa xmm0,xmm5
movdqa xmm2,xmm3
movdqa xmm1,xmm4
punpcklwd xmm0,xmm15
punpckhwd xmm5,xmm15
punpckldq xmm2,xmm0
punpckhdq xmm3,xmm0
punpckldq xmm1,xmm5
movdqa xmm0,xmm2
punpcklqdq xmm0,xmm1
punpckhdq xmm4,xmm5
punpckhqdq xmm2,xmm1
movdqa [rsp+40h],xmm0
movdqa xmm0,xmm3
movdqa [rsp+90h],xmm2
mov eax,[rsp+40h]
mov [rdi-2],eax
mov eax, [rsp+90h]
punpcklqdq xmm0,xmm4
punpckhqdq xmm3,xmm4
mov [rsi+rdi-2],eax
movdqa [rsp+50h],xmm0
mov eax,[rsp+50h]
movdqa [rsp+0A0h],xmm3
mov [rdi+rsi*2-2],eax
mov eax,[rsp+0A0h]
mov [r10+rdi-2],eax
mov eax,[rsp+48h]
mov [rbx],eax
mov eax,[rsp+98h]
mov [rsi+rbx],eax
mov eax,[rsp+58h]
mov [rbx+rsi*2],eax
mov eax, [rsp+0A8h]
mov [r10+rbx],eax
mov eax, [rsp+44h]
mov [r12-2],eax
mov eax,[rsp+94h]
mov [rsi+r12-2],eax
mov eax,[rsp+54h]
mov [r12+rsi*2-2],eax
mov eax, [rsp+0A4h]
mov [r10+r12-2],eax
mov eax,[rsp+4Ch]
mov [rbp],eax
mov eax,[rsp+9Ch]
mov [rsi+rbp],eax
mov eax, [rsp+5Ch]
mov [rbp+rsi*2],eax
mov eax,[rsp+0ACh]
mov [r10+rbp],eax
lea r11,[rsp+170h]
mov rsp,r11                             ; drop the 170h-byte frame
2014-01-03 07:49:45 +01:00
pop r14
pop r13
pop r12
pop rbp
pop rbx
ret
2014-01-03 07:49:45 +01:00
%elifdef X86_32
;********************************************************************************
2014-01-16 08:57:22 +01:00
; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
2014-01-03 07:49:45 +01:00
; int32_t iAlpha, int32_t iBeta)
;********************************************************************************
2014-01-16 08:57:22 +01:00
; X86_32 (cdecl) build.  Arguments are read from the caller's stack:
;   [ebp+08h] pPixCb, [ebp+0Ch] pPixCr, [ebp+10h] iStride,
;   [ebp+14h] iAlpha, [ebp+18h] iBeta
; The Cb rows occupy the low 8 bytes and the Cr rows the high 8 bytes of
; each row register.  Where all three threshold tests pass,
;   p0 = (2*p1 + p0 + q1 + 2) >> 2
;   q0 = (2*q1 + q0 + p1 + 2) >> 2
; and other pixels are written back unchanged.
; Frame: esp is aligned down to 16, then "sub esp,68h" plus the two pushes
; (70h in total) keep it 16-byte aligned for the movdqa locals at
; [esp+10h .. esp+6Fh].
; NOTE(review): the bare date/time lines inside this routine are not NASM
; syntax; they look like residue from a blame view and must be removed
; before this file can be assembled.
WELS_EXTERN DeblockChromaEq4V_ssse3
2014-01-03 07:49:45 +01:00
ALIGN 16
2014-01-16 08:57:22 +01:00
DeblockChromaEq4V_ssse3:
2014-01-03 07:49:45 +01:00
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
sub esp,68h
mov edx,[ebp+10h] ; iStride
mov eax,[ebp+8] ; pPixCb
mov ecx,[ebp+0Ch] ; pPixCr
movq xmm4,[ecx]                         ; q0, Cr
movq xmm5,[edx+ecx]                     ; q1, Cr
push esi
push edi
lea esi,[edx+edx]
mov edi,eax
sub edi,esi
movq xmm1,[edi]                         ; p1, Cb
mov edi,ecx
sub edi,esi
movq xmm2,[edi]                         ; p1, Cr
punpcklqdq xmm1,xmm2
mov esi,eax
sub esi,edx                             ; esi = Cb p0 row (kept for the store)
movq xmm2,[esi]
mov edi,ecx
sub edi,edx                             ; edi = Cr p0 row (kept for the store)
movq xmm3,[edi]
punpcklqdq xmm2,xmm3
movq xmm3,[eax]
punpcklqdq xmm3,xmm4
movq xmm4,[edx+eax]
mov edx, [ebp + 14h]                    ; iAlpha
punpcklqdq xmm4,xmm5
movd xmm5,edx
mov edx, [ebp + 18h]                    ; iBeta
pxor xmm0,xmm0
movdqa xmm6,xmm5
punpcklwd xmm6,xmm5
pshufd xmm5,xmm6,0                      ; xmm5 = iAlpha in all words
movd xmm6,edx
movdqa xmm7,xmm6
punpcklwd xmm7,xmm6
pshufd xmm6,xmm7,0                      ; xmm6 = iBeta in all words
; Widen every row to words: low half = Cb, high half = Cr.
movdqa xmm7,xmm1
punpckhbw xmm1,xmm0
punpcklbw xmm7,xmm0
movdqa [esp+40h],xmm1                   ; p1, Cr
movdqa [esp+60h],xmm7                   ; p1, Cb
movdqa xmm7,xmm2
punpcklbw xmm7,xmm0
movdqa [esp+10h],xmm7                   ; p0, Cb
movdqa xmm7,xmm3
punpcklbw xmm7,xmm0
punpckhbw xmm3,xmm0
movdqa [esp+50h],xmm7                   ; q0, Cb
movdqa xmm7,xmm4
punpckhbw xmm4,xmm0
punpckhbw xmm2,xmm0
punpcklbw xmm7,xmm0
movdqa [esp+30h],xmm3                   ; q0, Cr
movdqa xmm3,[esp+10h]
; Threshold masks: xmm0 for the Cb half, xmm5 for the Cr half.
movdqa xmm1,xmm3
psubw xmm1,[esp+50h]
pabsw xmm1,xmm1
movdqa [esp+20h],xmm4                   ; q1, Cr
movdqa xmm0,xmm5
pcmpgtw xmm0,xmm1
movdqa xmm1,[esp+60h]
psubw xmm1,xmm3
pabsw xmm1,xmm1
movdqa xmm4,xmm6
pcmpgtw xmm4,xmm1
pand xmm0,xmm4
movdqa xmm1,xmm7
psubw xmm1,[esp+50h]
pabsw xmm1,xmm1
movdqa xmm4,xmm6
pcmpgtw xmm4,xmm1
movdqa xmm1,xmm2
psubw xmm1,[esp+30h]
pabsw xmm1,xmm1
pcmpgtw xmm5,xmm1
movdqa xmm1,[esp+40h]
pand xmm0,xmm4
psubw xmm1,xmm2
pabsw xmm1,xmm1
movdqa xmm4,xmm6
pcmpgtw xmm4,xmm1
movdqa xmm1,[esp+20h]
psubw xmm1,[esp+30h]
pand xmm5,xmm4
pabsw xmm1,xmm1
pcmpgtw xmm6,xmm1
pand xmm5,xmm6
mov edx,2
movsx edx,dx
movd xmm1,edx
movdqa xmm4,xmm1
punpcklwd xmm4,xmm1
pshufd xmm1,xmm4,0                      ; xmm1 = 2 in all words (rounding term)
; New p0, Cb half.
movdqa xmm4,[esp+60h]
movdqa xmm6,xmm4
paddw xmm6,xmm4
paddw xmm6,xmm3
paddw xmm6,xmm7
movdqa [esp+10h],xmm1                   ; slot reused to hold the rounding term
paddw xmm6,[esp+10h]
psraw xmm6,2
movdqa xmm4,xmm0
pandn xmm4,xmm3
movdqa xmm3,[esp+40h]
movdqa xmm1,xmm0
pand xmm1,xmm6
por xmm1,xmm4
; New p0, Cr half.
movdqa xmm6,xmm3
paddw xmm6,xmm3
movdqa xmm3,[esp+10h]
paddw xmm6,xmm2
paddw xmm6,[esp+20h]
paddw xmm6,xmm3
psraw xmm6,2
movdqa xmm4,xmm5
pand xmm4,xmm6
movdqa xmm6,xmm5
pandn xmm6,xmm2
por xmm4,xmm6
packuswb xmm1,xmm4                      ; xmm1 = new p0: Cb | Cr
; New q0, Cb half.
movdqa xmm4,[esp+50h]
movdqa xmm6,xmm7
paddw xmm6,xmm7
paddw xmm6,xmm4
paddw xmm6,[esp+60h]
paddw xmm6,xmm3
psraw xmm6,2
movdqa xmm2,xmm0
pand xmm2,xmm6
pandn xmm0,xmm4
por xmm2,xmm0
; New q0, Cr half.
movdqa xmm0,[esp+20h]
movdqa xmm6,xmm0
paddw xmm6,xmm0
movdqa xmm0,[esp+30h]
paddw xmm6,xmm0
paddw xmm6,[esp+40h]
movdqa xmm4,xmm5
paddw xmm6,xmm3
movq [esi],xmm1                         ; store p0, Cb
psraw xmm6,2
pand xmm4,xmm6
pandn xmm5,xmm0
por xmm4,xmm5
packuswb xmm2,xmm4                      ; xmm2 = new q0: Cb | Cr
movq [eax],xmm2                         ; store q0, Cb
psrldq xmm1,8
movq [edi],xmm1                         ; store p0, Cr
pop edi
psrldq xmm2,8
movq [ecx],xmm2                         ; store q0, Cr
pop esi
mov esp,ebp
pop ebp
ret
;******************************************************************************
2014-01-16 08:57:22 +01:00
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
2014-01-03 07:49:45 +01:00
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
2014-01-16 08:57:22 +01:00
WELS_EXTERN DeblockChromaLt4V_ssse3
2014-01-03 07:49:45 +01:00
2014-01-16 08:57:22 +01:00
DeblockChromaLt4V_ssse3:
2014-01-03 07:49:45 +01:00
; Chroma deblocking across the edge between row -1 and row 0, with the
; correction clipped by the pTC table. Cb and Cr are filtered together:
; eight Cb pixels occupy the low qword of each vector, eight Cr pixels the
; high qword.
;
; Stack arguments (32-bit, addressed through ebp):
;   [ebp+08h] pPixCb   [ebp+0Ch] pPixCr   [ebp+10h] iStride
;   [ebp+14h] iAlpha   [ebp+18h] iBeta    [ebp+1Ch] pTC (4 signed bytes)
; Only the low 16 bits of iAlpha and iBeta are read (movsx from word).
;
; Per pixel, with p1,p0 above the edge and q0,q1 below it:
;   delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
;   applied only where |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta, tc > 0
;   p0 += delta, q0 -= delta, both saturated to 0..255 by packuswb.
; Each tc[i] covers two adjacent pixels.
;
; ebx, esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten. Nothing is returned in a register.
push ebp
mov ebp,esp
; Align esp to 16 so the movdqa spills are legal: 0E4h plus the three
; 4-byte pushes is a multiple of 16.
and esp,0FFFFFFF0h
sub esp,0E4h
push ebx
push esi
mov esi, [ebp+1Ch] ; pTC
; Read the four tc bytes sign-extended; tc[2] and tc[0] are parked in the
; stack words at [esp+0Ch] and [esp+0Eh].
movsx ebx, byte [esi+2]
push edi
movsx di,byte [esi+3]
mov word [esp+0Ch],bx
movsx bx,byte [esi+1]
movsx esi,byte [esi]
mov word [esp+0Eh],si
; Copy each tc value (as a 16-bit quantity) into xmm registers; the
; punpcklwd chain below interleaves them into the per-pixel tc vector.
; edx = iStride, eax = pPixCb, ecx = pPixCr are loaded along the way.
movzx esi,di
movd xmm1,esi
movzx esi,di
movd xmm2,esi
mov si,word [esp+0Ch]
mov edx, [ebp + 10h]
mov eax, [ebp + 08h]
movzx edi,si
movzx esi,si
mov ecx, [ebp + 0Ch]
movd xmm4,esi
movzx esi,bx
movd xmm5,esi
movd xmm3,edi
movzx esi,bx
movd xmm6,esi
mov si,word [esp+0Eh]
movzx edi,si
movzx esi,si
punpcklwd xmm6,xmm2
; [esp+40h] holds an all-zero vector for now; it is reloaded into xmm1 and
; used as the zero operand for byte->word unpacking.
pxor xmm0,xmm0
movdqa [esp+40h],xmm0
movd xmm7,edi
movd xmm0,esi
; esi = 2*iStride, edi = pPixCb - 2*iStride (Cb p1 row).
lea esi,[edx+edx]
mov edi,eax
sub edi,esi
punpcklwd xmm5,xmm1
movdqa xmm1,[esp+40h]
punpcklwd xmm0,xmm4
; Pixel loads are interleaved with the tc shuffle: xmm4 = Cr q1 row,
; xmm3 = Cb q0 row, xmm6 = Cb p1 row (8 bytes each).
movq xmm4,[edx+ecx]
punpcklwd xmm7,xmm3
movq xmm3,[eax]
punpcklwd xmm0,xmm6
movq xmm6,[edi]
punpcklwd xmm7,xmm5
; xmm0 = words [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3].
punpcklwd xmm0,xmm7
mov edi,ecx
sub edi,esi
; [esp+60h] = -tc (lower clamp bound).
movdqa xmm2,xmm1
psubw xmm2,xmm0
movdqa [esp+60h],xmm2
; xmm6 = [Cb p1 | Cr p1].
movq xmm2, [edi]
punpcklqdq xmm6,xmm2
; esi = pPixCb - iStride, edi = pPixCr - iStride (p0 rows, also the store
; destinations at the end). xmm7 = [Cb p0 | Cr p0].
mov esi,eax
sub esi,edx
movq xmm7,[esi]
mov edi,ecx
sub edi,edx
movq xmm2,[edi]
punpcklqdq xmm7,xmm2
; xmm3 = [Cb q0 | Cr q0].
movq xmm2,[ecx]
punpcklqdq xmm3,xmm2
; [esp+0E0h] = [Cb q1 | Cr q1].
movq xmm2,[edx+eax]
movsx edx,word [ebp + 14h]
punpcklqdq xmm2,xmm4
movdqa [esp+0E0h],xmm2
; Broadcast alpha into every word of xmm4 (completed by the pshufd below)
; and beta into every word of [esp+50h].
movd xmm2,edx
movsx edx,word [ebp + 18h]
movdqa xmm4,xmm2
punpcklwd xmm4,xmm2
movd xmm2,edx
movdqa xmm5,xmm2
punpcklwd xmm5,xmm2
pshufd xmm2,xmm5,0
movdqa [esp+50h],xmm2
; Widen bytes to words against the zero register xmm1. Low halves (Cb) stay
; mostly in registers, high halves (Cr) are spilled:
;   [esp+30h] = p1 lo   [esp+80h] = p1 hi   [esp+70h] = q0 hi
;   [esp+90h] = q1 hi   [esp+0A0h] = p0 hi
;   xmm2 = p0 lo, xmm3 = q0 lo, xmm5 = q1 lo
movdqa xmm2,xmm6
punpcklbw xmm2,xmm1
movdqa [esp+0D0h],xmm3
pshufd xmm4,xmm4,0
movdqa [esp+30h],xmm2
punpckhbw xmm6,xmm1
movdqa [esp+80h],xmm6
movdqa xmm6,[esp+0D0h]
punpckhbw xmm6,xmm1
movdqa [esp+70h],xmm6
movdqa xmm6, [esp+0E0h]
punpckhbw xmm6,xmm1
movdqa [esp+90h],xmm6
movdqa xmm5, [esp+0E0h]
movdqa xmm2,xmm7
punpckhbw xmm7,xmm1
punpcklbw xmm5,xmm1
movdqa [esp+0A0h],xmm7
punpcklbw xmm3,xmm1
; Broadcast the rounding constant 4 into [esp+20h].
mov edx,4
punpcklbw xmm2,xmm1
movsx edx,dx
movd xmm6,edx
movdqa xmm7,xmm6
punpcklwd xmm7,xmm6
pshufd xmm6,xmm7,0
movdqa xmm7,[esp+30h]
movdqa [esp+20h],xmm6
; ---- low half (Cb) ----
; xmm7 = p1 - q1.
psubw xmm7,xmm5
; [esp+40h] = mask (tc > 0); this overwrites the zero vector stored earlier.
movdqa xmm6,xmm0
pcmpgtw xmm6,xmm1
movdqa xmm1,[esp+60h]
movdqa [esp+40h],xmm6
; xmm6 = ((q0 - p0) << 2) + (p1 - q1) + 4, then >> 3.
movdqa xmm6,xmm3
psubw xmm6,xmm2
psllw xmm6,2
paddw xmm6,xmm7
paddw xmm6, [esp+20h]
movdqa xmm7, [esp+50h]
psraw xmm6,3
; Clamp to [-tc, tc]; the result is kept in [esp+10h].
pmaxsw xmm1,xmm6
movdqa [esp+10h],xmm0
movdqa xmm6, [esp+10h]
pminsw xmm6,xmm1
movdqa [esp+10h],xmm6
; xmm6 = alpha > |p0 - q0|.
movdqa xmm1,xmm2
psubw xmm1,xmm3
pabsw xmm1,xmm1
movdqa xmm6,xmm4
pcmpgtw xmm6,xmm1
; AND with beta > |p1 - p0|.
movdqa xmm1, [esp+30h]
psubw xmm1,xmm2
pabsw xmm1,xmm1
pcmpgtw xmm7,xmm1
movdqa xmm1,[esp+50h]
pand xmm6,xmm7
movdqa xmm7,[esp+50h]
; AND with beta > |q1 - q0|.
psubw xmm5,xmm3
pabsw xmm5,xmm5
pcmpgtw xmm1,xmm5
; xmm5 = p1 - q1 for the high half (prepared early).
movdqa xmm5,[esp+80h]
psubw xmm5,[esp+90h]
pand xmm6,xmm1
pand xmm6,[esp+40h]
; [esp+30h] = masked delta for the low half.
movdqa xmm1,[esp+10h]
pand xmm1,xmm6
movdqa xmm6,[esp+70h]
movdqa [esp+30h],xmm1
; ---- high half (Cr): same computation, xmm1 = p0 hi, xmm5 later = q0 hi ----
movdqa xmm1,[esp+0A0h]
psubw xmm6,xmm1
psllw xmm6,2
paddw xmm6,xmm5
paddw xmm6,[esp+20h]
movdqa xmm5,[esp+60h]
psraw xmm6,3
; xmm0 = clamp(delta, -tc, tc).
pmaxsw xmm5,xmm6
pminsw xmm0,xmm5
movdqa xmm5,[esp+70h]
; xmm4 = alpha > |p0 - q0|.
movdqa xmm6,xmm1
psubw xmm6,xmm5
pabsw xmm6,xmm6
pcmpgtw xmm4,xmm6
; AND beta > |p1 - p0|.
movdqa xmm6,[esp+80h]
psubw xmm6,xmm1
pabsw xmm6,xmm6
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+90h]
pand xmm4,xmm7
movdqa xmm7,[esp+50h]
; AND beta > |q1 - q0|, AND tc > 0.
psubw xmm6,xmm5
pabsw xmm6,xmm6
pcmpgtw xmm7,xmm6
pand xmm4,xmm7
pand xmm4,[esp+40h]
pand xmm0,xmm4
; p0 += delta, pack with unsigned saturation, store the Cb half.
movdqa xmm4,[esp+30h]
paddw xmm2,xmm4
paddw xmm1,xmm0
packuswb xmm2,xmm1
movq [esi],xmm2
; q0 -= delta, pack, store the Cb half.
psubw xmm3,xmm4
psubw xmm5,xmm0
packuswb xmm3,xmm5
movq [eax],xmm3
; Shift the Cr halves down and store them to the Cr rows.
psrldq xmm2,8
movq [edi],xmm2
pop edi
pop esi
psrldq xmm3,8
movq [ecx],xmm3
pop ebx
mov esp,ebp
pop ebp
ret
;***************************************************************************
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
2014-01-03 07:49:45 +01:00
ALIGN 16
2014-01-16 08:57:22 +01:00
DeblockChromaEq4H_ssse3:
2014-01-03 07:49:45 +01:00
; Chroma deblocking across the edge between column -1 and column 0, without a
; tc table. Eight rows of Cb and eight rows of Cr are handled in one pass.
;
; Stack arguments (32-bit, addressed through ebp):
;   [ebp+08h] pPixCb   [ebp+0Ch] pPixCr   [ebp+10h] iStride
;   [ebp+14h] iAlpha   [ebp+18h] iBeta
; Only the low 16 bits of iAlpha and iBeta are read.
;
; Method: read the 4 bytes at pPix-2 .. pPix+1 of every row, transpose them
; into four 16-byte columns (p1, p0, q0, q1) in a stack buffer, filter, then
; transpose back and write the 4 bytes of every row again.
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2
;   applied only where |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta.
;
; esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten.
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
sub esp,0C8h
mov ecx,dword [ebp+8]
mov edx,dword [ebp+0Ch]
mov eax,dword [ebp+10h]
; ecx = pPixCb - 2, edx = pPixCr - 2: first of the four bytes read per row.
sub ecx,2
sub edx,2
push esi
; esi = 3*iStride.
lea esi,[eax+eax*2]
; NOTE: esp-relative offsets written before "push edi" below are 4 higher
; afterwards ([esp+18h] -> [esp+1Ch], [esp+4] -> [esp+8]).
mov dword [esp+18h],ecx
mov dword [esp+4],edx
; Pointers to row 4 of each plane.
lea ecx,[ecx+eax*4]
lea edx,[edx+eax*4]
; eax = address of the 64-byte column buffer (esp+80h after the next push).
lea eax,[esp+7Ch]
push edi
; Slot layout from here on:
;   [esp+08h] pPixCr-2          [esp+0Ch] pPixCr-2 + 4*stride
;   [esp+10h] column buffer ptr [esp+14h] 3*stride
;   [esp+18h] pPixCb-2 + 4*stride   [esp+1Ch] pPixCb-2
mov dword [esp+14h],esi
mov dword [esp+18h],ecx
mov dword [esp+0Ch],edx
mov dword [esp+10h],eax
mov esi,dword [esp+1Ch]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+14h]
; Cb rows 0..3 -> xmm0..xmm3 (4 bytes each).
movd xmm0,dword [esi]
movd xmm1,dword [esi+ecx]
movd xmm2,dword [esi+ecx*2]
movd xmm3,dword [esi+edx]
; Cr rows 0..3 -> xmm4..xmm7, then merged as dword 1 of xmm0..xmm3.
mov esi,dword [esp+8]
movd xmm4,dword [esi]
movd xmm5,dword [esi+ecx]
movd xmm6,dword [esi+ecx*2]
movd xmm7,dword [esi+edx]
punpckldq xmm0,xmm4
punpckldq xmm1,xmm5
punpckldq xmm2,xmm6
punpckldq xmm3,xmm7
; Rows 4..7 of Cb (esi) and Cr (edi) fill the high qwords, giving
; xmmN = [Cb row N | Cr row N | Cb row N+4 | Cr row N+4].
mov esi,dword [esp+18h]
mov edi,dword [esp+0Ch]
movd xmm4,dword [esi]
movd xmm5,dword [edi]
punpckldq xmm4,xmm5
punpcklqdq xmm0,xmm4
movd xmm4,dword [esi+ecx]
movd xmm5,dword [edi+ecx]
punpckldq xmm4,xmm5
punpcklqdq xmm1,xmm4
movd xmm4,dword [esi+ecx*2]
movd xmm5,dword [edi+ecx*2]
punpckldq xmm4,xmm5
punpcklqdq xmm2,xmm4
movd xmm4,dword [esi+edx]
movd xmm5,dword [edi+edx]
punpckldq xmm4,xmm5
punpcklqdq xmm3,xmm4
; Byte transpose via byte/word/dword/qword unpacks. Result:
;   xmm0 = column 0 (p1), xmm5 = column 1 (p0),
;   xmm1 = column 2 (q0), xmm6 = column 3 (q1),
; each holding 8 Cb pixels in the low qword and 8 Cr pixels in the high qword.
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Spill the columns: buffer+00h = p1, +10h = p0, +20h = q0, +30h = q1
; (the buffer is esp+80h .. esp+0BFh).
mov edi,dword [esp+10h]
movdqa [edi],xmm0
movdqa [edi+10h],xmm5
movdqa [edi+20h],xmm1
movdqa [edi+30h],xmm6
movsx ecx,word [ebp+14h]
movsx edx,word [ebp+18h]
movdqa xmm6,[esp+80h]
movdqa xmm4,[esp+90h]
movdqa xmm5,[esp+0A0h]
movdqa xmm7,[esp+0B0h]
pxor xmm0,xmm0
; xmm1 = alpha in every word, xmm2 = beta in every word.
movd xmm1,ecx
movdqa xmm2,xmm1
punpcklwd xmm2,xmm1
pshufd xmm1,xmm2,0
movd xmm2,edx
movdqa xmm3,xmm2
punpcklwd xmm3,xmm2
pshufd xmm2,xmm3,0
; Widen bytes to words. High halves are spilled:
;   [esp+60h] = p1 hi  [esp+30h] = p0 hi  [esp+40h] = q0 hi  [esp+70h] = q1 hi
; Low halves: xmm3 = p1, xmm4 = p0, xmm5 = q0, [esp+50h] = q1.
movdqa xmm3,xmm6
punpckhbw xmm6,xmm0
movdqa [esp+60h],xmm6
movdqa xmm6,[esp+90h]
punpckhbw xmm6,xmm0
movdqa [esp+30h],xmm6
movdqa xmm6,[esp+0A0h]
punpckhbw xmm6,xmm0
movdqa [esp+40h],xmm6
movdqa xmm6,[esp+0B0h]
punpckhbw xmm6,xmm0
movdqa [esp+70h],xmm6
punpcklbw xmm7,xmm0
punpcklbw xmm4,xmm0
punpcklbw xmm5,xmm0
punpcklbw xmm3,xmm0
movdqa [esp+50h],xmm7
; Low-half mask in xmm0: alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|.
movdqa xmm6,xmm4
psubw xmm6,xmm5
pabsw xmm6,xmm6
movdqa xmm0,xmm1
pcmpgtw xmm0,xmm6
movdqa xmm6,xmm3
psubw xmm6,xmm4
pabsw xmm6,xmm6
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+50h]
psubw xmm6,xmm5
pabsw xmm6,xmm6
pand xmm0,xmm7
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
; High-half mask in xmm1, same three conditions on the spilled values.
movdqa xmm6,[esp+30h]
psubw xmm6,[esp+40h]
pabsw xmm6,xmm6
pcmpgtw xmm1,xmm6
movdqa xmm6,[esp+60h]
psubw xmm6,[esp+30h]
pabsw xmm6,xmm6
pand xmm0,xmm7
movdqa xmm7,xmm2
pcmpgtw xmm7,xmm6
movdqa xmm6,[esp+70h]
psubw xmm6,[esp+40h]
pabsw xmm6,xmm6
pand xmm1,xmm7
pcmpgtw xmm2,xmm6
pand xmm1,xmm2
; [esp+20h] = rounding constant 2 in every word.
mov eax,2
movsx ecx,ax
movd xmm2,ecx
movdqa xmm6,xmm2
punpcklwd xmm6,xmm2
pshufd xmm2,xmm6,0
movdqa [esp+20h],xmm2
; Low half: p0' = (2*p1 + p0 + q1 + 2) >> 2, selected where the mask is set,
; otherwise the original p0.
movdqa xmm2,xmm3
paddw xmm2,xmm3
paddw xmm2,xmm4
paddw xmm2,[esp+50h]
paddw xmm2,[esp+20h]
psraw xmm2,2
movdqa xmm6,xmm0
pand xmm6,xmm2
movdqa xmm2,xmm0
pandn xmm2,xmm4
por xmm6,xmm2
; High half p0', same formula and select.
movdqa xmm2,[esp+60h]
movdqa xmm7,xmm2
paddw xmm7,xmm2
paddw xmm7,[esp+30h]
paddw xmm7,[esp+70h]
paddw xmm7,[esp+20h]
movdqa xmm4,xmm1
movdqa xmm2,xmm1
pandn xmm2,[esp+30h]
psraw xmm7,2
pand xmm4,xmm7
por xmm4,xmm2
movdqa xmm2,[esp+50h]
; Pack the new p0 column back to bytes and overwrite it in the buffer.
packuswb xmm6,xmm4
movdqa [esp+90h],xmm6
; Low half: q0' = (2*q1 + q0 + p1 + 2) >> 2 with the same select.
movdqa xmm6,xmm2
paddw xmm6,xmm2
movdqa xmm2,[esp+20h]
paddw xmm6,xmm5
paddw xmm6,xmm3
movdqa xmm4,xmm0
pandn xmm0,xmm5
paddw xmm6,xmm2
psraw xmm6,2
pand xmm4,xmm6
por xmm4,xmm0
; High half q0'.
movdqa xmm0,[esp+70h]
movdqa xmm5,xmm0
paddw xmm5,xmm0
movdqa xmm0,[esp+40h]
paddw xmm5,xmm0
paddw xmm5,[esp+60h]
movdqa xmm3,xmm1
paddw xmm5,xmm2
psraw xmm5,2
pand xmm3,xmm5
pandn xmm1,xmm0
por xmm3,xmm1
; Pack the new q0 column and overwrite it in the buffer.
packuswb xmm4,xmm3
movdqa [esp+0A0h],xmm4
; Reload the four columns (p1 and q1 unchanged) and transpose back to rows
; with the same unpack sequence used on load.
mov esi,dword [esp+10h]
movdqa xmm0,[esi]
movdqa xmm1,[esi+10h]
movdqa xmm2,[esi+20h]
movdqa xmm3,[esi+30h]
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Now xmm0/xmm5/xmm1/xmm6 hold rows 0/1/2/3 in dword 0 (Cb), dword 1 (Cr),
; and rows 4/5/6/7 in dword 2 (Cb), dword 3 (Cr).
mov esi,dword [esp+1Ch]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+14h]
mov edi,dword [esp+8]
; Cb rows 0..3.
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov esi,dword [esp+18h]
; Cr rows 0..3.
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
; Cb rows 4..7.
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov edi,dword [esp+0Ch]
; Cr rows 4..7.
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
pop edi
pop esi
mov esp,ebp
pop ebp
ret
;*******************************************************************************
; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
2014-01-03 07:49:45 +01:00
ALIGN 16
2014-01-16 08:57:22 +01:00
DeblockChromaLt4H_ssse3:
2014-01-03 07:49:45 +01:00
; Chroma deblocking across the edge between column -1 and column 0, with the
; correction clipped by the pTC table. Eight rows of Cb and eight rows of Cr
; are handled in one pass.
;
; Stack arguments (32-bit, addressed through ebp):
;   [ebp+08h] pPixCb   [ebp+0Ch] pPixCr   [ebp+10h] iStride
;   [ebp+14h] iAlpha   [ebp+18h] iBeta    [ebp+1Ch] pTC (4 signed bytes)
; Only the low 16 bits of iAlpha and iBeta are read.
;
; Method: read the 4 bytes at pPix-2 .. pPix+1 of every row, transpose them
; into four 16-byte columns (p1, p0, q0, q1) in a stack buffer, filter, then
; transpose back and write the 4 bytes of every row again.
;   delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
;   applied only where |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta, tc > 0
;   p0 += delta, q0 -= delta (saturated to 0..255 by packuswb).
; Each tc[i] covers two adjacent rows.
;
; esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten.
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
sub esp,108h
mov ecx,dword [ebp+8]
mov edx,dword [ebp+0Ch]
mov eax,dword [ebp+10h]
; ecx = pPixCb - 2, edx = pPixCr - 2: first of the four bytes read per row.
sub ecx,2
sub edx,2
push esi
; esi = 3*iStride.
lea esi,[eax+eax*2]
; NOTE: esp-relative offsets written before "push edi" below are 4 higher
; afterwards ([esp+10h] -> [esp+14h], [esp+4] -> [esp+8]).
mov dword [esp+10h],ecx
mov dword [esp+4],edx
; Pointers to row 4 of each plane.
lea ecx,[ecx+eax*4]
lea edx,[edx+eax*4]
; eax = address of the 64-byte column buffer (esp+70h after the next push).
lea eax,[esp+6Ch]
push edi
; Slot layout from here on:
;   [esp+08h] pPixCr-2          [esp+0Ch] 3*stride
;   [esp+10h] pPixCr-2 + 4*stride   [esp+14h] pPixCb-2
;   [esp+18h] pPixCb-2 + 4*stride   [esp+1Ch] column buffer ptr
mov dword [esp+0Ch],esi
mov dword [esp+18h],ecx
mov dword [esp+10h],edx
mov dword [esp+1Ch],eax
mov esi,dword [esp+14h]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+0Ch]
; Cb rows 0..3 -> xmm0..xmm3 (4 bytes each).
movd xmm0,dword [esi]
movd xmm1,dword [esi+ecx]
movd xmm2,dword [esi+ecx*2]
movd xmm3,dword [esi+edx]
; Cr rows 0..3 -> xmm4..xmm7, then merged as dword 1 of xmm0..xmm3.
mov esi,dword [esp+8]
movd xmm4,dword [esi]
movd xmm5,dword [esi+ecx]
movd xmm6,dword [esi+ecx*2]
movd xmm7,dword [esi+edx]
punpckldq xmm0,xmm4
punpckldq xmm1,xmm5
punpckldq xmm2,xmm6
punpckldq xmm3,xmm7
; Rows 4..7 of Cb (esi) and Cr (edi) fill the high qwords, giving
; xmmN = [Cb row N | Cr row N | Cb row N+4 | Cr row N+4].
mov esi,dword [esp+18h]
mov edi,dword [esp+10h]
movd xmm4,dword [esi]
movd xmm5,dword [edi]
punpckldq xmm4,xmm5
punpcklqdq xmm0,xmm4
movd xmm4,dword [esi+ecx]
movd xmm5,dword [edi+ecx]
punpckldq xmm4,xmm5
punpcklqdq xmm1,xmm4
movd xmm4,dword [esi+ecx*2]
movd xmm5,dword [edi+ecx*2]
punpckldq xmm4,xmm5
punpcklqdq xmm2,xmm4
movd xmm4,dword [esi+edx]
movd xmm5,dword [edi+edx]
punpckldq xmm4,xmm5
punpcklqdq xmm3,xmm4
; Byte transpose via byte/word/dword/qword unpacks. Result:
;   xmm0 = column 0 (p1), xmm5 = column 1 (p0),
;   xmm1 = column 2 (q0), xmm6 = column 3 (q1),
; each holding 8 Cb pixels in the low qword and 8 Cr pixels in the high qword.
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Spill the columns: buffer+00h = p1, +10h = p0, +20h = q0, +30h = q1
; (the buffer is esp+70h .. esp+0AFh).
mov edi,dword [esp+1Ch]
movdqa [edi],xmm0
movdqa [edi+10h],xmm5
movdqa [edi+20h],xmm1
movdqa [edi+30h],xmm6
; Read the four signed tc bytes (cx = tc[3], dx = tc[2], si = tc[1],
; ax = tc[0]) and interleave 16-bit copies into
; xmm0 = words [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3]. Loads of alpha, beta and
; the spilled columns are interleaved with the shuffle.
mov eax,dword [ebp+1Ch]
movsx cx,byte [eax+3]
movsx dx,byte [eax+2]
movsx si,byte [eax+1]
movsx ax,byte [eax]
movzx edi,cx
movzx ecx,cx
movd xmm2,ecx
movzx ecx,dx
movzx edx,dx
movd xmm3,ecx
movd xmm4,edx
movzx ecx,si
movzx edx,si
movd xmm5,ecx
pxor xmm0,xmm0
movd xmm6,edx
movzx ecx,ax
; [esp+60h] holds zero for now; it is reloaded into xmm1 as the zero operand
; for unpacking and later overwritten with the tc > 0 mask.
movdqa [esp+60h],xmm0
movzx edx,ax
movsx eax,word [ebp+14h]
punpcklwd xmm6,xmm2
movd xmm1,edi
movd xmm7,ecx
movsx ecx,word [ebp+18h]
movd xmm0,edx
punpcklwd xmm7,xmm3
punpcklwd xmm5,xmm1
movdqa xmm1,[esp+60h]
punpcklwd xmm7,xmm5
movdqa xmm5,[esp+0A0h]
punpcklwd xmm0,xmm4
punpcklwd xmm0,xmm6
movdqa xmm6, [esp+70h]
punpcklwd xmm0,xmm7
movdqa xmm7,[esp+80h]
; [esp+0D0h] = -tc (lower clamp bound).
movdqa xmm2,xmm1
psubw xmm2,xmm0
movdqa [esp+0D0h],xmm2
; xmm4 = alpha in every word; [esp+50h] = beta in every word.
movd xmm2,eax
movdqa xmm3,xmm2
punpcklwd xmm3,xmm2
pshufd xmm4,xmm3,0
movd xmm2,ecx
movdqa xmm3,xmm2
punpcklwd xmm3,xmm2
pshufd xmm2,xmm3,0
movdqa xmm3, [esp+90h]
movdqa [esp+50h],xmm2
; Widen bytes to words against the zero register xmm1:
;   [esp+40h] = p1 lo   [esp+0B0h] = p1 hi   [esp+0F0h] = p0 hi
;   [esp+0C0h] = q0 hi  [esp+0E0h] = q1 hi
;   xmm2 = p0 lo, xmm3 = q0 lo, xmm5 = q1 lo
movdqa xmm2,xmm6
punpcklbw xmm2,xmm1
punpckhbw xmm6,xmm1
movdqa [esp+40h],xmm2
movdqa [esp+0B0h],xmm6
movdqa xmm6,[esp+90h]
movdqa xmm2,xmm7
punpckhbw xmm7,xmm1
punpckhbw xmm6,xmm1
punpcklbw xmm2,xmm1
punpcklbw xmm3,xmm1
punpcklbw xmm5,xmm1
movdqa [esp+0F0h],xmm7
movdqa [esp+0C0h],xmm6
movdqa xmm6, [esp+0A0h]
punpckhbw xmm6,xmm1
movdqa [esp+0E0h],xmm6
; [esp+30h] = rounding constant 4 in every word.
mov edx,4
movsx eax,dx
movd xmm6,eax
movdqa xmm7,xmm6
punpcklwd xmm7,xmm6
pshufd xmm6,xmm7,0
movdqa [esp+30h],xmm6
; ---- low half (Cb) ----
; xmm7 = p1 - q1.
movdqa xmm7, [esp+40h]
psubw xmm7,xmm5
; [esp+60h] = mask (tc > 0).
movdqa xmm6,xmm0
pcmpgtw xmm6,xmm1
movdqa [esp+60h],xmm6
movdqa xmm1, [esp+0D0h]
; xmm6 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3.
movdqa xmm6,xmm3
psubw xmm6,xmm2
psllw xmm6,2
paddw xmm6,xmm7
paddw xmm6,[esp+30h]
psraw xmm6,3
; Clamp to [-tc, tc]; the result is kept in [esp+20h].
pmaxsw xmm1,xmm6
movdqa xmm7,[esp+50h]
movdqa [esp+20h],xmm0
movdqa xmm6, [esp+20h]
pminsw xmm6,xmm1
movdqa [esp+20h],xmm6
; xmm6 = alpha > |p0 - q0|.
movdqa xmm6,xmm4
movdqa xmm1,xmm2
psubw xmm1,xmm3
pabsw xmm1,xmm1
pcmpgtw xmm6,xmm1
; AND beta > |p1 - p0|.
movdqa xmm1, [esp+40h]
psubw xmm1,xmm2
pabsw xmm1,xmm1
pcmpgtw xmm7,xmm1
movdqa xmm1, [esp+50h]
pand xmm6,xmm7
movdqa xmm7, [esp+50h]
; AND beta > |q1 - q0|.
psubw xmm5,xmm3
pabsw xmm5,xmm5
pcmpgtw xmm1,xmm5
; xmm5 = p1 - q1 for the high half (prepared early).
movdqa xmm5, [esp+0B0h]
psubw xmm5,[esp+0E0h]
pand xmm6,xmm1
pand xmm6, [esp+60h]
; [esp+40h] = masked delta for the low half.
movdqa xmm1, [esp+20h]
pand xmm1,xmm6
movdqa xmm6, [esp+0C0h]
movdqa [esp+40h],xmm1
; ---- high half (Cr): same computation, xmm1 = p0 hi, xmm5 later = q0 hi ----
movdqa xmm1, [esp+0F0h]
psubw xmm6,xmm1
psllw xmm6,2
paddw xmm6,xmm5
paddw xmm6, [esp+30h]
movdqa xmm5, [esp+0D0h]
psraw xmm6,3
; xmm0 = clamp(delta, -tc, tc).
pmaxsw xmm5,xmm6
pminsw xmm0,xmm5
movdqa xmm5,[esp+0C0h]
; xmm4 = alpha > |p0 - q0|.
movdqa xmm6,xmm1
psubw xmm6,xmm5
pabsw xmm6,xmm6
pcmpgtw xmm4,xmm6
; AND beta > |p1 - p0|.
movdqa xmm6,[esp+0B0h]
psubw xmm6,xmm1
pabsw xmm6,xmm6
pcmpgtw xmm7,xmm6
movdqa xmm6, [esp+0E0h]
pand xmm4,xmm7
movdqa xmm7, [esp+50h]
; AND beta > |q1 - q0|, AND tc > 0.
psubw xmm6,xmm5
pabsw xmm6,xmm6
pcmpgtw xmm7,xmm6
pand xmm4,xmm7
pand xmm4,[esp+60h]
pand xmm0,xmm4
; p0 += delta, q0 -= delta; pack both with unsigned saturation and overwrite
; the p0 and q0 columns in the buffer.
movdqa xmm4, [esp+40h]
paddw xmm2,xmm4
paddw xmm1,xmm0
psubw xmm3,xmm4
psubw xmm5,xmm0
packuswb xmm2,xmm1
packuswb xmm3,xmm5
movdqa [esp+80h],xmm2
movdqa [esp+90h],xmm3
; Reload the four columns (p1 and q1 unchanged) and transpose back to rows
; with the same unpack sequence used on load.
mov esi,dword [esp+1Ch]
movdqa xmm0, [esi]
movdqa xmm1, [esi+10h]
movdqa xmm2, [esi+20h]
movdqa xmm3, [esi+30h]
movdqa xmm6,xmm0
punpcklbw xmm0,xmm1
punpckhbw xmm6,xmm1
movdqa xmm7,xmm2
punpcklbw xmm2,xmm3
punpckhbw xmm7,xmm3
movdqa xmm4,xmm0
movdqa xmm5,xmm6
punpcklwd xmm0,xmm2
punpckhwd xmm4,xmm2
punpcklwd xmm6,xmm7
punpckhwd xmm5,xmm7
movdqa xmm1,xmm0
movdqa xmm2,xmm4
punpckldq xmm0,xmm6
punpckhdq xmm1,xmm6
punpckldq xmm4,xmm5
punpckhdq xmm2,xmm5
movdqa xmm5,xmm0
movdqa xmm6,xmm1
punpcklqdq xmm0,xmm4
punpckhqdq xmm5,xmm4
punpcklqdq xmm1,xmm2
punpckhqdq xmm6,xmm2
; Now xmm0/xmm5/xmm1/xmm6 hold rows 0/1/2/3 in dword 0 (Cb), dword 1 (Cr),
; and rows 4/5/6/7 in dword 2 (Cb), dword 3 (Cr).
mov esi,dword [esp+14h]
mov ecx,dword [ebp+10h]
mov edx,dword [esp+0Ch]
mov edi,dword [esp+8]
; Cb rows 0..3.
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov esi,dword [esp+18h]
; Cr rows 0..3.
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
; Cb rows 4..7.
movd dword [esi],xmm0
movd dword [esi+ecx],xmm5
movd dword [esi+ecx*2],xmm1
movd dword [esi+edx],xmm6
psrldq xmm0,4
psrldq xmm5,4
psrldq xmm1,4
psrldq xmm6,4
mov edi,dword [esp+10h]
; Cr rows 4..7.
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1
movd dword [edi+edx],xmm6
pop edi
pop esi
mov esp,ebp
pop ebp
ret
;*******************************************************************************
; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta, int8_t * pTC)
;*******************************************************************************
WELS_EXTERN DeblockLumaLt4V_ssse3
2014-01-03 07:49:45 +01:00
ALIGN 16
2014-01-16 08:57:22 +01:00
DeblockLumaLt4V_ssse3:
2014-01-03 07:49:45 +01:00
; Luma deblocking of 16 pixels across the edge between row -1 and row 0, with
; corrections clipped by the pTC table.
;
; Stack arguments (32-bit, addressed through ebp):
;   [ebp+8] pPix   [ebp+12] iStride   [ebp+16] iAlpha   [ebp+20] iBeta
;   [ebp+24] pTC (4 signed bytes; each tc[i] covers 4 adjacent pixels)
; Only the low 16 bits of iAlpha and iBeta are read. All six pixel rows are
; accessed with movdqa, so every row address (pPix + k*iStride, k = -3..2)
; must be 16-byte aligned.
;
; Rows p2,p1,p0 lie above the edge and q0,q1,q2 below. Per pixel:
;   mask = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta && tc >= 0
;   ap   = |p2-p0| < beta,   aq = |q2-q0| < beta
;   tc'  = tc + (ap ? 1 : 0) + (aq ? 1 : 0)
;   delta = clamp(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc', tc') & mask
;   p0 += delta, q0 -= delta
;   p1 += clamp((p2 + avg(p0,q0) - 2*p1) >> 1, -tc, tc) & mask & ap
;   q1 += clamp((q2 + avg(p0,q0) - 2*q1) >> 1, -tc, tc) & mask & aq
;   where avg(a,b) = (a + b + 1) >> 1 (pavgw); results saturate to 0..255.
;
; Stack offsets are written as esp+N-K, where N grows by 4 with each register
; currently pushed (424 after one push, 432 after three, 428 after pop edi).
; ebx, esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten.
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
sub esp, 420 ; 000001a4H
mov eax, dword [ebp+8]
mov ecx, dword [ebp+12]
pxor xmm0, xmm0
push ebx
mov edx, dword [ebp+24]
; [esp+432-384] (written here as 424-384 with one push done) = zero vector.
movdqa [esp+424-384], xmm0
push esi
lea esi, [ecx+ecx*2]
push edi
; Copy the six rows into the frame as raw bytes:
;   [esp+432-208] p2  [esp+448-208] p1  [esp+464-208] p0
;   [esp+480-208] q0  [esp+496-208] q1  [esp+512-208] q2
mov edi, eax
sub edi, esi
movdqa xmm0, [edi]
lea esi, [ecx+ecx]
movdqa [esp+432-208], xmm0
; edi = pPix - 2*stride (p1 row); kept for the final p1 store.
mov edi, eax
sub edi, esi
movdqa xmm0, [edi]
movdqa [esp+448-208], xmm0
; ebx = pPix - stride (p0 row).
mov ebx, eax
sub ebx, ecx
movdqa xmm0, [ebx]
movdqa [esp+464-208], xmm0
movdqa xmm0, [eax]
; ecx = pPix + stride (q1 row); saved at [esp+432-404].
add ecx, eax
movdqa [esp+480-208], xmm0
movdqa xmm0, [ecx]
mov dword [esp+432-404], ecx
movsx ecx, word [ebp+16]
movdqa [esp+496-208], xmm0
movdqa xmm0, [esi+eax]
movsx si, byte [edx]
movdqa [esp+512-208], xmm0
; [esp+432-112] = alpha in every word.
movd xmm0, ecx
movsx ecx, word [ebp+20]
movdqa xmm1, xmm0
punpcklwd xmm1, xmm0
pshufd xmm0, xmm1, 0
movdqa [esp+432-112], xmm0
; xmm0 = beta in every word (stored to [esp+432-336] a little further down).
movd xmm0, ecx
movsx cx, byte [edx+1]
movdqa xmm1, xmm0
punpcklwd xmm1, xmm0
; Save the p0 row pointer at [esp+432-408].
mov dword [esp+432-408], ebx
; si = tc[0], cx = tc[1]: make four scalar copies of each and interleave them
; into [esp+432-400] = words [tc0 x4, tc1 x4] (tc for the low 8 pixels).
movzx ebx, cx
pshufd xmm0, xmm1, 0
movd xmm1, ebx
movzx ebx, cx
movd xmm2, ebx
movzx ebx, cx
movzx ecx, cx
movd xmm4, ecx
movzx ecx, si
movd xmm5, ecx
movzx ecx, si
movd xmm6, ecx
movzx ecx, si
movd xmm7, ecx
movzx ecx, si
movdqa [esp+432-336], xmm0
movd xmm0, ecx
movsx cx, byte [edx+3]
movsx dx, byte [edx+2]
movd xmm3, ebx
punpcklwd xmm0, xmm4
movzx esi, cx
punpcklwd xmm6, xmm2
punpcklwd xmm5, xmm1
punpcklwd xmm0, xmm6
punpcklwd xmm7, xmm3
punpcklwd xmm7, xmm5
punpcklwd xmm0, xmm7
movdqa [esp+432-400], xmm0
; Same for cx = tc[3], dx = tc[2]: xmm1 ends up as words [tc2 x4, tc3 x4]
; (tc for the high 8 pixels) and stays live until the end.
movd xmm0, esi
movzx esi, cx
movd xmm2, esi
movzx esi, cx
movzx ecx, cx
movd xmm4, ecx
movzx ecx, dx
movd xmm3, esi
movd xmm5, ecx
punpcklwd xmm5, xmm0
; xmm0 = zero from here until the high-half clamp section.
movdqa xmm0, [esp+432-384]
movzx ecx, dx
movd xmm6, ecx
movzx ecx, dx
movzx edx, dx
punpcklwd xmm6, xmm2
movd xmm7, ecx
movd xmm1, edx
; ---- low 8 pixels: widen to words ----
;   xmm2 = p1, xmm3 = p0, xmm6 = q0, xmm4 = p2 (also [esp+432-272]),
;   [esp+432-240] = q1, [esp+432-352] = q2
movdqa xmm2, [esp+448-208]
punpcklbw xmm2, xmm0
; edx = 4, broadcast later as the rounding constant.
mov ecx, 4
movsx edx, cx
punpcklwd xmm7, xmm3
punpcklwd xmm7, xmm5
movdqa xmm5, [esp+496-208]
movdqa xmm3, [esp+464-208]
punpcklbw xmm5, xmm0
movdqa [esp+432-240], xmm5
movdqa xmm5, [esp+512-208]
punpcklbw xmm5, xmm0
movdqa [esp+432-352], xmm5
punpcklwd xmm1, xmm4
movdqa xmm4, [esp+432-208]
punpcklwd xmm1, xmm6
movdqa xmm6, [esp+480-208]
punpcklwd xmm1, xmm7
punpcklbw xmm6, xmm0
punpcklbw xmm3, xmm0
punpcklbw xmm4, xmm0
; [esp+432-288] = ap mask: beta > |p0 - p2|. xmm4 becomes beta.
movdqa xmm7, xmm3
psubw xmm7, xmm4
pabsw xmm7, xmm7
movdqa [esp+432-272], xmm4
movdqa xmm4, [esp+432-336]
movdqa xmm5, xmm4
pcmpgtw xmm5, xmm7
movdqa [esp+432-288], xmm5
; [esp+432-256] = aq mask: beta > |q0 - q2|.
movdqa xmm7, xmm6
psubw xmm7, [esp+432-352]
pabsw xmm7, xmm7
movdqa xmm5, xmm4
pcmpgtw xmm5, xmm7
movdqa [esp+432-256], xmm5
; [esp+432-304] = avg(p0, q0).
movdqa xmm5, xmm3
pavgw xmm5, xmm6
movdqa [esp+432-304], xmm5
; [esp+432-224] = tc' = tc - ap - aq (each mask lane is 0 or -1, so this
; adds 1 for every condition that holds).
movdqa xmm5, [esp+432-400]
psubw xmm5, [esp+432-288]
psubw xmm5, [esp+432-256]
movdqa [esp+432-224], xmm5
; [esp+432-384] = q0 - p0; [esp+432-32] = q0; xmm6 = q0 - q1.
movdqa xmm5, xmm6
psubw xmm5, xmm3
movdqa [esp+432-32], xmm6
psubw xmm6, [esp+432-240]
movdqa xmm7, xmm5
movdqa [esp+432-384], xmm5
; Filter mask: alpha > |q0-p0|, beta > |q0-q1|, beta > |p0-p1|.
movdqa xmm5, [esp+432-112]
pabsw xmm7, xmm7
pcmpgtw xmm5, xmm7
pabsw xmm6, xmm6
movdqa xmm7, xmm4
pcmpgtw xmm7, xmm6
pand xmm5, xmm7
movdqa xmm6, xmm3
psubw xmm6, xmm2
pabsw xmm6, xmm6
movdqa xmm7, xmm4
pcmpgtw xmm7, xmm6
movdqa xmm6, [esp+432-400]
pand xmm5, xmm7
; AND with (tc == 0 | tc > 0), i.e. tc >= 0. Result -> [esp+432-320].
movdqa xmm7, xmm6
pcmpeqw xmm6, xmm0
pcmpgtw xmm7, xmm0
por xmm7, xmm6
pand xmm5, xmm7
movdqa [esp+432-320], xmm5
; [esp+432-336] = 4 in every word (replaces beta, which stays in xmm4).
movd xmm5, edx
movdqa xmm6, xmm5
punpcklwd xmm6, xmm5
pshufd xmm5, xmm6, 0
movdqa [esp+432-336], xmm5
; delta = clamp(((q0-p0) << 2) + (p1 - q1) + 4) >> 3, -tc', tc') & mask
; -> [esp+432-64].
movdqa xmm5, [esp+432-224]
movdqa [esp+432-368], xmm5
movdqa xmm6, xmm0
psubw xmm6, xmm5
movdqa xmm5, [esp+432-384]
psllw xmm5, 2
movdqa xmm7, xmm2
psubw xmm7, [esp+432-240]
paddw xmm7, xmm5
paddw xmm7, [esp+432-336]
movdqa xmm5, [esp+432-368]
psraw xmm7, 3
pmaxsw xmm6, xmm7
pminsw xmm5, xmm6
pand xmm5, [esp+432-320]
movdqa xmm6, [esp+432-400]
movdqa [esp+432-64], xmm5
; [esp+432-384] = tc, [esp+432-368] = -tc (bounds for the p1/q1 updates).
movdqa [esp+432-384], xmm6
movdqa xmm5, xmm0
psubw xmm5, xmm6
movdqa [esp+432-368], xmm5
movdqa xmm6, xmm5
; p1 adjustment = clamp((p2 + avg - 2*p1) >> 1, -tc, tc) & mask & ap
; -> [esp+432-96].
movdqa xmm5, [esp+432-272]
paddw xmm5, [esp+432-304]
movdqa xmm7, xmm2
paddw xmm7, xmm2
psubw xmm5, xmm7
psraw xmm5, 1
pmaxsw xmm6, xmm5
movdqa xmm5, [esp+432-384]
pminsw xmm5, xmm6
pand xmm5, [esp+432-320]
pand xmm5, [esp+432-288]
movdqa xmm6, [esp+432-240]
movdqa [esp+432-96], xmm5
; q1 adjustment = clamp((q2 + avg - 2*q1) >> 1, -tc, tc) & mask & aq
; -> [esp+432-48].
movdqa xmm5, [esp+432-352]
paddw xmm5, [esp+432-304]
movdqa xmm7, xmm6
paddw xmm7, xmm6
movdqa xmm6, [esp+432-368]
psubw xmm5, xmm7
movdqa xmm7, [esp+496-208]
psraw xmm5, 1
pmaxsw xmm6, xmm5
movdqa xmm5, [esp+432-400]
pminsw xmm5, xmm6
pand xmm5, [esp+432-320]
pand xmm5, [esp+432-256]
; ---- high 8 pixels: widen to words; several slots are reused ----
;   [esp+432-352] = q1   [esp+432-368] = p1   [esp+432-384] = q2
;   [esp+432-400] = p0   [esp+432-16] = p2    [esp+432-80] = q0
movdqa xmm6, [esp+448-208]
punpckhbw xmm7, xmm0
movdqa [esp+432-352], xmm7
movdqa xmm7, [esp+512-208]
punpckhbw xmm6, xmm0
movdqa [esp+432-48], xmm5
movdqa xmm5, [esp+432-208]
movdqa [esp+432-368], xmm6
movdqa xmm6, [esp+464-208]
punpckhbw xmm7, xmm0
punpckhbw xmm5, xmm0
movdqa [esp+432-384], xmm7
punpckhbw xmm6, xmm0
movdqa [esp+432-400], xmm6
; ap mask (high) -> [esp+432-288].
movdqa xmm7, [esp+432-400]
movdqa xmm6, [esp+480-208]
psubw xmm7, xmm5
movdqa [esp+432-16], xmm5
pabsw xmm7, xmm7
punpckhbw xmm6, xmm0
movdqa xmm5, xmm4
pcmpgtw xmm5, xmm7
movdqa [esp+432-288], xmm5
; aq mask (high) -> [esp+432-256].
movdqa xmm7, xmm6
psubw xmm7, [esp+432-384]
pabsw xmm7, xmm7
movdqa xmm5, xmm4
pcmpgtw xmm5, xmm7
movdqa [esp+432-256], xmm5
; avg(p0, q0) (high) -> [esp+432-304].
movdqa xmm5, [esp+432-400]
movdqa [esp+432-80], xmm6
pavgw xmm5, xmm6
movdqa [esp+432-304], xmm5
; tc' (high) = tc - ap - aq -> [esp+432-224].
movdqa xmm5, xmm1
psubw xmm5, [esp+432-288]
psubw xmm5, [esp+432-256]
movdqa [esp+432-224], xmm5
; [esp+432-272] = q0 - p0; xmm6 = q0 - q1.
movdqa xmm5, xmm6
psubw xmm5, [esp+432-400]
psubw xmm6, [esp+432-352]
movdqa [esp+432-272], xmm5
; Filter mask (high): alpha > |q0-p0|, beta > |q0-q1|, beta > |p0-p1|.
movdqa xmm7, xmm5
movdqa xmm5, [esp+432-112]
pabsw xmm7, xmm7
pcmpgtw xmm5, xmm7
movdqa xmm7, xmm4
pabsw xmm6, xmm6
pcmpgtw xmm7, xmm6
movdqa xmm6, [esp+432-368]
pand xmm5, xmm7
movdqa xmm7, [esp+432-400]
psubw xmm7, xmm6
; xmm6 = p1 - q1 (used in the delta below).
psubw xmm6, [esp+432-352]
pabsw xmm7, xmm7
pcmpgtw xmm4, xmm7
pand xmm5, xmm4
; Low half: p1 += p1 adjustment.
paddw xmm2, [esp+432-96]
; AND with tc >= 0 (high) -> [esp+432-320].
movdqa xmm4, xmm1
pcmpgtw xmm4, xmm0
movdqa xmm7, xmm1
pcmpeqw xmm7, xmm0
por xmm4, xmm7
pand xmm5, xmm4
movdqa xmm4, [esp+432-224]
movdqa [esp+432-320], xmm5
movdqa xmm5, [esp+432-272]
; xmm7 = -tc'. xmm0 stops being zero here: it becomes -tc (high), which is
; also saved to [esp+432-336] after the constant 4 has been consumed.
movdqa xmm7, xmm0
psubw xmm7, xmm4
psubw xmm0, xmm1
psllw xmm5, 2
paddw xmm6, xmm5
paddw xmm6, [esp+432-336]
movdqa xmm5, [esp+432-368]
movdqa [esp+432-336], xmm0
psraw xmm6, 3
; delta (high) = clamp(..., -tc', tc') & mask -> [esp+432-272].
pmaxsw xmm7, xmm6
pminsw xmm4, xmm7
pand xmm4, [esp+432-320]
movdqa xmm6, xmm0
; p1 adjustment (high) = clamp((p2 + avg - 2*p1) >> 1, -tc, tc) & mask & ap.
movdqa xmm0, [esp+432-16]
paddw xmm0, [esp+432-304]
movdqa [esp+432-272], xmm4
movdqa xmm4, [esp+432-368]
paddw xmm4, xmm4
psubw xmm0, xmm4
movdqa xmm4, [esp+432-64]
psraw xmm0, 1
pmaxsw xmm6, xmm0
movdqa xmm0, [esp+432-400]
movdqa xmm7, xmm1
pminsw xmm7, xmm6
movdqa xmm6, [esp+432-320]
pand xmm7, xmm6
pand xmm7, [esp+432-288]
; xmm2 = new p1 row (bytes).
paddw xmm5, xmm7
packuswb xmm2, xmm5
; xmm3 = new p0 row: p0 + delta, both halves.
movdqa xmm5, [esp+432-272]
paddw xmm0, xmm5
paddw xmm3, xmm4
packuswb xmm3, xmm0
; New q0 row: q0 - delta, both halves -> [esp+480-208].
movdqa xmm0, [esp+432-32]
psubw xmm0, xmm4
movdqa xmm4, [esp+432-80]
psubw xmm4, xmm5
; xmm5 = q1 (low) + q1 adjustment (low).
movdqa xmm5, [esp+432-240]
paddw xmm5, [esp+432-48]
packuswb xmm0, xmm4
; q1 adjustment (high) = clamp((q2 + avg - 2*q1) >> 1, -tc, tc) & mask & aq.
movdqa xmm4, [esp+432-384]
paddw xmm4, [esp+432-304]
movdqa [esp+480-208], xmm0
movdqa xmm0, [esp+432-352]
movdqa xmm7, xmm0
paddw xmm0, xmm0
; ecx = p0 row pointer, edx = q1 row pointer (saved during the load phase).
mov ecx, dword [esp+432-408]
mov edx, dword [esp+432-404]
psubw xmm4, xmm0
movdqa xmm0, [esp+432-336]
; Store the new p1 row (edi = pPix - 2*stride).
movdqa [edi], xmm2
psraw xmm4, 1
pmaxsw xmm0, xmm4
pminsw xmm1, xmm0
movdqa xmm0, [esp+480-208]
pop edi
pand xmm1, xmm6
; Same aq slot as [esp+432-256]; the base is 428 because edi was just popped.
pand xmm1, [esp+428-256]
; Store the new p0 row.
movdqa [ecx], xmm3
paddw xmm7, xmm1
pop esi
; xmm5 = new q1 row; store q0 and q1.
packuswb xmm5, xmm7
movdqa [eax], xmm0
movdqa [edx], xmm5
pop ebx
mov esp, ebp
pop ebp
ret
;*******************************************************************************
; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta)
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_ssse3
2014-01-03 07:49:45 +01:00
ALIGN 16
2014-01-16 08:57:22 +01:00
DeblockLumaEq4V_ssse3:
2014-01-03 07:49:45 +01:00
; Luma deblocking of 16 pixels across the edge between row -1 and row 0,
; without a tc table. Up to three rows on each side of the edge are rewritten.
;
; Stack arguments (32-bit, addressed through ebp):
;   [ebp+8] pPix   [ebp+12] iStride   [ebp+16] iAlpha   [ebp+20] iBeta
; Only the low 16 bits of iAlpha and iBeta are read. All eight pixel rows are
; accessed with movdqa, so every row address (pPix + k*iStride, k = -4..3)
; must be 16-byte aligned.
;
; Rows p3,p2,p1,p0 lie above the edge and q0,q1,q2,q3 below. Per pixel:
;   mask  = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
;   small = |p0-q0| < (alpha >> 2) + 2
;   pmask = small && |p2-p0| < beta,   qmask = small && |q2-q0| < beta
; Where mask && pmask:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
; Where mask && !pmask:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2, p1 and p2 unchanged.
; The q side is the mirror image, using qmask. Where mask is clear nothing
; changes. Every select is done as (m & a) | (~m & b) with pand/pandn/por.
;
; Stack offsets are written as esp+N-K; the three saved registers are pushed
; before the first frame access, so the frame is addressed from esp+640-K and
; the saved rows from esp+(656..752)-272.
; ebx, esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten.
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
sub esp, 628 ; 00000274H
mov eax, dword [ebp+8]
mov ecx, dword [ebp+12]
push ebx
push esi
lea edx, [ecx*4]
; xmm2 = zero, used for byte->word unpacking throughout the low-half phase.
pxor xmm0, xmm0
movdqa xmm2, xmm0
; Load the rows. Pointers kept for the final stores:
;   edx = p2 row, esi = p1 row, edi = p0 row, eax = q0 row,
;   [esp+640-596] = q1 row, ecx = q2 row.
movdqa xmm0, [ecx+eax]
mov esi, eax
sub esi, edx
; xmm3 = p3 row, xmm5 = q0 row.
movdqa xmm3, [esi]
movdqa xmm5, [eax]
push edi
lea edi, [ecx+ecx]
lea ebx, [ecx+ecx*2]
mov dword [esp+640-600], edi
mov esi, eax
sub esi, edi
; xmm1 = p1 row; [esp+720-272] = q1 row.
movdqa xmm1, [esi]
movdqa [esp+720-272], xmm0
mov edi, eax
sub edi, ecx
; xmm4 = p0 row.
movdqa xmm4, [edi]
add ecx, eax
mov dword [esp+640-596], ecx
mov ecx, dword [esp+640-600]
; [esp+736-272] = q2 row; [esp+752-272] = q3 row; xmm6 = p2 row.
movdqa xmm0, [ecx+eax]
movdqa [esp+736-272], xmm0
movdqa xmm0, [eax+ebx]
mov edx, eax
sub edx, ebx
movsx ebx, word [ebp+16]
movdqa xmm6, [edx]
add ecx, eax
movdqa [esp+752-272], xmm0
; [esp+640-320] = alpha in every word; [esp+640-512] = beta in every word.
movd xmm0, ebx
movsx ebx, word [ebp+20]
movdqa xmm7, xmm0
punpcklwd xmm7, xmm0
pshufd xmm0, xmm7, 0
movdqa [esp+640-320], xmm0
movd xmm0, ebx
movdqa xmm7, xmm0
punpcklwd xmm7, xmm0
pshufd xmm0, xmm7, 0
; ---- low 8 pixels: widen to words and keep byte copies of each row ----
;   words: [esp+640-416] q2, [esp+640-480] p1, [esp+640-384] q1,
;          [esp+640-368] p2, [esp+640-144] p0, [esp+640-400] q0
;   bytes: [esp+656-272] p2, [esp+672-272] p1, [esp+688-272] p0,
;          [esp+704-272] q0
;   xmm1 = p0, xmm5 = q0, xmm6 = p2 (words)
movdqa xmm7, [esp+736-272]
punpcklbw xmm7, xmm2
movdqa [esp+640-416], xmm7
movdqa [esp+640-512], xmm0
movdqa xmm0, xmm1
movdqa [esp+672-272], xmm1
movdqa xmm1, xmm4
movdqa [esp+704-272], xmm5
punpcklbw xmm5, xmm2
punpcklbw xmm1, xmm2
; [esp+640-560] = |q0 - p0| (temporary).
movdqa xmm7, xmm5
psubw xmm7, xmm1
pabsw xmm7, xmm7
movdqa [esp+640-560], xmm7
punpcklbw xmm0, xmm2
movdqa [esp+688-272], xmm4
movdqa xmm4, [esp+720-272]
movdqa [esp+640-480], xmm0
; xmm0 = beta > |p0 - p1|.
movdqa xmm7, xmm1
psubw xmm7, xmm0
movdqa xmm0, [esp+640-512]
pabsw xmm7, xmm7
punpcklbw xmm4, xmm2
pcmpgtw xmm0, xmm7
movdqa [esp+640-384], xmm4
; AND beta > |q0 - q1|.
movdqa xmm7, xmm5
psubw xmm7, xmm4
movdqa xmm4, [esp+640-512]
movdqa [esp+656-272], xmm6
punpcklbw xmm6, xmm2
pabsw xmm7, xmm7
; [esp+640-48] = zero vector, needed later once xmm2 has been reused.
movdqa [esp+640-48], xmm2
movdqa [esp+640-368], xmm6
movdqa [esp+640-144], xmm1
movdqa [esp+640-400], xmm5
pcmpgtw xmm4, xmm7
pand xmm0, xmm4
; AND alpha > |q0 - p0|. xmm0 = mask (low) for the rest of the function.
movdqa xmm4, [esp+640-320]
pcmpgtw xmm4, [esp+640-560]
pand xmm0, xmm4
; [esp+640-624] = 2 in every word; [esp+640-576] = (alpha >> 2) + 2.
mov ebx, 2
movsx ebx, bx
movd xmm4, ebx
movdqa xmm7, xmm4
punpcklwd xmm7, xmm4
movdqa xmm4, [esp+640-320]
psraw xmm4, 2
pshufd xmm7, xmm7, 0
paddw xmm4, xmm7
movdqa [esp+640-576], xmm4
; [esp+640-560] = small: (alpha >> 2) + 2 > |q0 - p0|.
pcmpgtw xmm4, [esp+640-560]
movdqa [esp+640-560], xmm4
movdqa xmm4, [esp+640-512]
movdqa [esp+640-624], xmm7
; [esp+640-544] = pmask = small & (beta > |p0 - p2|).
movdqa xmm7, xmm1
psubw xmm7, xmm6
pabsw xmm7, xmm7
pcmpgtw xmm4, xmm7
pand xmm4, [esp+640-560]
movdqa [esp+640-544], xmm4
; [esp+640-560] = qmask = small & (beta > |q0 - q2|).
movdqa xmm4, [esp+640-512]
movdqa xmm7, xmm5
psubw xmm7, [esp+640-416]
pabsw xmm7, xmm7
pcmpgtw xmm4, xmm7
pand xmm4, [esp+640-560]
movdqa [esp+640-560], xmm4
; [esp+640-16] = ~pmask & p2 (p2 where the strong branch is not taken).
movdqa xmm4, [esp+640-544]
pandn xmm4, xmm6
movdqa [esp+640-16], xmm4
; [esp+640-592] = 4 in every word.
mov ebx, 4
movsx ebx, bx
movd xmm4, ebx
movdqa xmm7, xmm4
punpcklwd xmm7, xmm4
; xmm4 = 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted and masked further down).
movdqa xmm4, xmm3
punpcklbw xmm4, xmm2
psllw xmm4, 1
paddw xmm4, xmm6
paddw xmm4, xmm6
paddw xmm4, xmm6
paddw xmm4, [esp+640-480]
movdqa xmm6, [esp+640-560]
pshufd xmm7, xmm7, 0
paddw xmm4, xmm1
movdqa [esp+640-592], xmm7
paddw xmm4, xmm5
paddw xmm4, xmm7
; [esp+640-80] = ~qmask & q2.
movdqa xmm7, [esp+640-416]
pandn xmm6, xmm7
movdqa [esp+640-80], xmm6
; [esp+640-112] = qmask & ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3).
movdqa xmm6, [esp+752-272]
punpcklbw xmm6, xmm2
psllw xmm6, 1
paddw xmm6, xmm7
paddw xmm6, xmm7
paddw xmm6, xmm7
paddw xmm6, [esp+640-384]
movdqa xmm7, [esp+640-480]
paddw xmm6, xmm5
paddw xmm6, xmm1
paddw xmm6, [esp+640-592]
psraw xmm6, 3
pand xmm6, [esp+640-560]
movdqa [esp+640-112], xmm6
; [esp+640-336] = ~pmask & p1.
movdqa xmm6, [esp+640-544]
pandn xmm6, xmm7
movdqa [esp+640-336], xmm6
; [esp+640-528] is a scratch copy of whichever mask is used next.
movdqa xmm6, [esp+640-544]
movdqa [esp+640-528], xmm6
; xmm4 = pmask & strong p2'; [esp+640-64] = pmask & ((p2+p1+p0+q0+2) >> 2).
movdqa xmm6, [esp+640-368]
paddw xmm6, xmm7
movdqa xmm7, xmm1
psraw xmm4, 3
pand xmm4, [esp+640-544]
paddw xmm7, xmm5
paddw xmm6, xmm7
paddw xmm6, [esp+640-624]
movdqa xmm7, [esp+640-528]
; xmm5 = q0 + p0.
paddw xmm5, xmm1
psraw xmm6, 2
pand xmm7, xmm6
movdqa xmm6, [esp+640-384]
movdqa [esp+640-64], xmm7
; [esp+640-304] = ~qmask & q1.
movdqa xmm7, [esp+640-560]
pandn xmm7, xmm6
movdqa [esp+640-304], xmm7
movdqa xmm7, [esp+640-560]
movdqa [esp+640-528], xmm7
; [esp+640-32] = qmask & ((q2 + q1 + q0 + p0 + 2) >> 2).
movdqa xmm7, [esp+640-416]
paddw xmm7, xmm6
paddw xmm7, xmm5
paddw xmm7, [esp+640-624]
movdqa xmm5, [esp+640-528]
psraw xmm7, 2
pand xmm5, xmm7
movdqa [esp+640-32], xmm5
movdqa xmm5, [esp+640-544]
movdqa [esp+640-528], xmm5
; [esp+640-352] = ~pmask & ((2*p1 + p0 + q1 + 2) >> 2).
; xmm6 becomes q1 + 4 on the way.
movdqa xmm5, [esp+640-480]
movdqa xmm7, xmm5
paddw xmm7, xmm5
movdqa xmm5, xmm1
paddw xmm5, xmm6
paddw xmm6, [esp+640-592]
paddw xmm7, xmm5
paddw xmm7, [esp+640-624]
movdqa xmm5, [esp+640-528]
psraw xmm7, 2
pandn xmm5, xmm7
; [esp+640-96] = pmask & ((p2 + 2*(p1 + p0 + q0) + q1 + 4) >> 3).
movdqa xmm7, [esp+640-480]
paddw xmm7, xmm1
paddw xmm7, [esp+640-400]
movdqa xmm1, [esp+640-544]
movdqa [esp+640-352], xmm5
movdqa xmm5, [esp+640-368]
psllw xmm7, 1
paddw xmm7, xmm6
paddw xmm5, xmm7
movdqa xmm7, [esp+640-400]
psraw xmm5, 3
pand xmm1, xmm5
movdqa xmm5, [esp+640-480]
movdqa [esp+640-96], xmm1
movdqa xmm1, [esp+640-560]
movdqa [esp+640-528], xmm1
; [esp+640-288] = ~qmask & ((2*q1 + q0 + p1 + 2) >> 2);
; [esp+640-128] = qmask & ((q2 + 2*(q1 + q0 + p0) + p1 + 4) >> 3).
movdqa xmm1, [esp+640-384]
movdqa xmm6, xmm1
paddw xmm6, xmm1
paddw xmm1, [esp+640-400]
paddw xmm1, [esp+640-144]
paddw xmm7, xmm5
paddw xmm5, [esp+640-592]
paddw xmm6, xmm7
paddw xmm6, [esp+640-624]
movdqa xmm7, [esp+640-528]
psraw xmm6, 2
psllw xmm1, 1
paddw xmm1, xmm5
movdqa xmm5, [esp+656-272]
pandn xmm7, xmm6
movdqa xmm6, [esp+640-416]
paddw xmm6, xmm1
movdqa xmm1, [esp+640-560]
psraw xmm6, 3
pand xmm1, xmm6
movdqa xmm6, [esp+704-272]
movdqa [esp+640-128], xmm1
; ---- high 8 pixels: widen to words ----
;   [esp+640-448] p1, [esp+640-496] p0, [esp+640-432] q0,
;   [esp+640-464] q1, [esp+640-528] q2, xmm5 = p2, xmm3 = p3 (below)
movdqa xmm1, [esp+672-272]
punpckhbw xmm1, xmm2
movdqa [esp+640-448], xmm1
movdqa xmm1, [esp+688-272]
punpckhbw xmm1, xmm2
punpckhbw xmm6, xmm2
movdqa [esp+640-288], xmm7
punpckhbw xmm5, xmm2
movdqa [esp+640-496], xmm1
movdqa [esp+640-432], xmm6
movdqa xmm7, [esp+720-272]
punpckhbw xmm7, xmm2
movdqa [esp+640-464], xmm7
movdqa xmm7, [esp+736-272]
punpckhbw xmm7, xmm2
movdqa [esp+640-528], xmm7
; [esp+640-560] = |q0 - p0| (high, temporary); xmm6 = |q0 - q1|.
movdqa xmm7, xmm6
psubw xmm6, [esp+640-464]
psubw xmm7, xmm1
pabsw xmm7, xmm7
movdqa [esp+640-560], xmm7
; Low half: xmm4 = (pmask & strong p2') | (~pmask & p2).
por xmm4, [esp+640-16]
pabsw xmm6, xmm6
; xmm1 = mask (high): beta > |p0-p1|, beta > |q0-q1|, alpha > |q0-p0|.
movdqa xmm7, xmm1
psubw xmm7, [esp+640-448]
movdqa xmm1, [esp+640-512]
pabsw xmm7, xmm7
pcmpgtw xmm1, xmm7
movdqa xmm7, [esp+640-512]
pcmpgtw xmm7, xmm6
movdqa xmm6, [esp+640-320]
pand xmm1, xmm7
movdqa xmm7, [esp+640-560]
pcmpgtw xmm6, xmm7
pand xmm1, xmm6
; small (high) -> [esp+640-560]; pmask (high) -> [esp+640-544].
movdqa xmm6, [esp+640-576]
pcmpgtw xmm6, xmm7
movdqa xmm7, [esp+640-496]
punpckhbw xmm3, xmm2
movdqa [esp+640-560], xmm6
movdqa xmm6, [esp+640-512]
psubw xmm7, xmm5
pabsw xmm7, xmm7
pcmpgtw xmm6, xmm7
pand xmm6, [esp+640-560]
movdqa xmm7, [esp+640-432]
psubw xmm7, [esp+640-528]
psllw xmm3, 1
movdqa [esp+640-544], xmm6
movdqa xmm6, [esp+640-512]
; xmm2 stops being zero here (it now carries pmask); the zero vector remains
; available at [esp+640-48].
movdqa xmm2, [esp+640-544]
; xmm3 accumulates 2*p3 + 3*p2 + p1 + p0 (q0 and 4 are added below).
paddw xmm3, xmm5
paddw xmm3, xmm5
paddw xmm3, xmm5
paddw xmm3, [esp+640-448]
paddw xmm3, [esp+640-496]
; qmask (high) -> [esp+640-560].
pabsw xmm7, xmm7
pcmpgtw xmm6, xmm7
pand xmm6, [esp+640-560]
movdqa [esp+640-560], xmm6
; xmm6 = final p2 (low) = (mask & candidate) | (~mask & p2).
movdqa xmm6, xmm0
pand xmm6, xmm4
movdqa xmm4, xmm0
pandn xmm4, [esp+640-368]
por xmm6, xmm4
; High p2 candidate, then final p2 (high) in xmm7.
movdqa xmm4, [esp+640-432]
paddw xmm3, xmm4
paddw xmm3, [esp+640-592]
psraw xmm3, 3
pand xmm3, xmm2
pandn xmm2, xmm5
por xmm3, xmm2
movdqa xmm7, xmm1
pand xmm7, xmm3
; Low p1 candidate = strong p1' | (~pmask & p1).
movdqa xmm3, [esp+640-64]
por xmm3, [esp+640-336]
movdqa xmm2, xmm1
pandn xmm2, xmm5
por xmm7, xmm2
; Final p1 (low) -> [esp+640-336].
movdqa xmm2, xmm0
pand xmm2, xmm3
movdqa xmm3, xmm0
pandn xmm3, [esp+640-480]
por xmm2, xmm3
; New p2 row (bytes) -> [esp+656-272].
packuswb xmm6, xmm7
movdqa [esp+640-336], xmm2
movdqa [esp+656-272], xmm6
; High p1: candidate = pmask ? (p2+p1+p0+q0+2) >> 2 : p1, then mask select.
movdqa xmm6, [esp+640-544]
movdqa xmm2, xmm5
paddw xmm2, [esp+640-448]
movdqa xmm3, xmm1
movdqa xmm7, [esp+640-496]
paddw xmm7, xmm4
paddw xmm2, xmm7
paddw xmm2, [esp+640-624]
movdqa xmm7, [esp+640-544]
psraw xmm2, 2
pand xmm6, xmm2
movdqa xmm2, [esp+640-448]
pandn xmm7, xmm2
por xmm6, xmm7
pand xmm3, xmm6
movdqa xmm6, xmm1
pandn xmm6, xmm2
; xmm2 = p1 + p0 + q0 (doubled below for the strong p0').
paddw xmm2, [esp+640-496]
paddw xmm2, xmm4
por xmm3, xmm6
; New p1 row (bytes) -> [esp+672-272].
movdqa xmm6, [esp+640-336]
packuswb xmm6, xmm3
psllw xmm2, 1
movdqa [esp+672-272], xmm6
; Final p0 (low) = mask ? (strong p0' | weak p0') : p0 -> [esp+640-352].
movdqa xmm6, [esp+640-96]
por xmm6, [esp+640-352]
movdqa xmm3, xmm0
pand xmm3, xmm6
movdqa xmm6, xmm0
pandn xmm6, [esp+640-144]
por xmm3, xmm6
movdqa xmm6, [esp+640-544]
movdqa [esp+640-352], xmm3
; High strong p0' = (p2 + 2*(p1+p0+q0) + q1 + 4) >> 3, masked by pmask.
movdqa xmm3, [esp+640-464]
paddw xmm3, [esp+640-592]
paddw xmm2, xmm3
movdqa xmm3, [esp+640-448]
paddw xmm5, xmm2
movdqa xmm2, [esp+640-496]
psraw xmm5, 3
pand xmm6, xmm5
; High weak p0' = (2*p1 + p0 + q1 + 2) >> 2, masked by ~pmask.
; xmm5 becomes q1 + q0 and xmm3 becomes p1 + 4 for later use.
movdqa xmm5, [esp+640-464]
paddw xmm2, xmm5
paddw xmm5, [esp+640-432]
movdqa xmm4, xmm3
paddw xmm4, xmm3
paddw xmm4, xmm2
paddw xmm4, [esp+640-624]
movdqa xmm2, [esp+640-544]
paddw xmm3, [esp+640-592]
psraw xmm4, 2
pandn xmm2, xmm4
por xmm6, xmm2
; Final p0 (high), then the new p0 row (bytes) -> [esp+688-272].
movdqa xmm7, xmm1
pand xmm7, xmm6
movdqa xmm6, [esp+640-496]
movdqa xmm2, xmm1
pandn xmm2, xmm6
por xmm7, xmm2
movdqa xmm2, [esp+640-352]
packuswb xmm2, xmm7
movdqa [esp+688-272], xmm2
; Final q0 (low) = mask ? (strong q0' | weak q0') : q0 -> [esp+640-288].
movdqa xmm2, [esp+640-128]
por xmm2, [esp+640-288]
movdqa xmm4, xmm0
pand xmm4, xmm2
; xmm5 = q1 + q0 + p0 (high).
paddw xmm5, xmm6
movdqa xmm2, xmm0
pandn xmm2, [esp+640-400]
por xmm4, xmm2
; High strong q0' = (q2 + 2*(q1+q0+p0) + p1 + 4) >> 3, masked by qmask.
movdqa xmm2, [esp+640-528]
psllw xmm5, 1
paddw xmm5, xmm3
movdqa xmm3, [esp+640-560]
paddw xmm2, xmm5
psraw xmm2, 3
movdqa [esp+640-288], xmm4
movdqa xmm4, [esp+640-560]
pand xmm4, xmm2
; High weak q0' = (2*q1 + q0 + p1 + 2) >> 2, masked by ~qmask.
movdqa xmm2, [esp+640-464]
movdqa xmm5, xmm2
paddw xmm5, xmm2
movdqa xmm2, [esp+640-432]
paddw xmm2, [esp+640-448]
movdqa xmm7, xmm1
paddw xmm5, xmm2
paddw xmm5, [esp+640-624]
movdqa xmm6, [esp+640-560]
psraw xmm5, 2
pandn xmm3, xmm5
por xmm4, xmm3
; Low q1 candidate = strong q1' | (~qmask & q1).
movdqa xmm3, [esp+640-32]
por xmm3, [esp+640-304]
; Final q0 (high), then the new q0 row (bytes) -> [esp+704-272].
pand xmm7, xmm4
movdqa xmm4, [esp+640-432]
movdqa xmm5, [esp+640-464]
movdqa xmm2, xmm1
pandn xmm2, xmm4
; xmm4 = q0 + p0 (high).
paddw xmm4, [esp+640-496]
por xmm7, xmm2
movdqa xmm2, [esp+640-288]
packuswb xmm2, xmm7
movdqa [esp+704-272], xmm2
; Final q1 (low) -> [esp+640-304].
movdqa xmm2, xmm0
pand xmm2, xmm3
movdqa xmm3, xmm0
pandn xmm3, [esp+640-384]
por xmm2, xmm3
movdqa [esp+640-304], xmm2
; High q1: candidate = qmask ? (q2+q1+q0+p0+2) >> 2 : q1, then mask select.
movdqa xmm2, [esp+640-528]
movdqa xmm3, xmm2
paddw xmm3, [esp+640-464]
paddw xmm3, xmm4
paddw xmm3, [esp+640-624]
psraw xmm3, 2
pand xmm6, xmm3
movdqa xmm3, [esp+640-560]
movdqa xmm4, xmm3
pandn xmm4, xmm5
por xmm6, xmm4
movdqa xmm7, xmm1
pand xmm7, xmm6
movdqa xmm6, [esp+640-304]
movdqa xmm4, xmm1
pandn xmm4, xmm5
por xmm7, xmm4
; Final q2 (low) in xmm4; xmm6 = new q1 row (bytes).
movdqa xmm4, xmm0
pandn xmm0, [esp+640-416]
packuswb xmm6, xmm7
movdqa xmm7, [esp+640-112]
por xmm7, [esp+640-80]
pand xmm4, xmm7
por xmm4, xmm0
; High strong q2' = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3; q3 is widened
; against the zero vector saved at [esp+640-48].
movdqa xmm0, [esp+752-272]
punpckhbw xmm0, [esp+640-48]
psllw xmm0, 1
paddw xmm0, xmm2
paddw xmm0, xmm2
paddw xmm0, xmm2
paddw xmm0, xmm5
paddw xmm0, [esp+640-432]
paddw xmm0, [esp+640-496]
paddw xmm0, [esp+640-592]
psraw xmm0, 3
pand xmm0, xmm3
movdqa xmm7, xmm1
pandn xmm3, xmm2
por xmm0, xmm3
pand xmm7, xmm0
; Write the rows back: p2 (edx), p1 (esi), p0 (edi), q0 (eax),
; q1 (edx reloaded from [esp+640-596]) and q2 (ecx).
movdqa xmm0, [esp+656-272]
movdqa [edx], xmm0
movdqa xmm0, [esp+672-272]
mov edx, dword [esp+640-596]
movdqa [esi], xmm0
movdqa xmm0, [esp+688-272]
movdqa [edi], xmm0
movdqa xmm0, [esp+704-272]
; No esp-relative access follows the pops, so the offsets need no adjustment.
pop edi
pandn xmm1, xmm2
movdqa [eax], xmm0
por xmm7, xmm1
pop esi
packuswb xmm4, xmm7
movdqa [edx], xmm6
movdqa [ecx], xmm4
pop ebx
mov esp, ebp
pop ebp
ret
%endif
;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;   Reads 8 bytes from each of 16 consecutive rows starting at pPixY (row
;   pitch iStride), runs them through SSE2_TransTwo8x8B and writes the result
;   to pDst as eight 16-byte rows (128 bytes in total).
;
;   pPixY   (r0) : source pixels; only read with movq, so no alignment is needed
;   iStride (r1) : distance in bytes between two source rows
;   pDst    (r2) : output buffer; written with movdqa, so it must be 16-byte
;                  aligned
;
;   r3, r4 and r5 are saved and restored.  The stack pointer (r7) is aligned
;   down to 16 bytes to provide the scratch slot the transpose macro needs and
;   is restored from r5 before returning.
;   On Windows x64 xmm6/xmm7 are callee-saved, so they are preserved in the
;   same aligned scratch area.
;   NOTE(review): WIN64 is assumed to be the symbol asm_inc.asm uses for the
;   Windows x64 target - confirm against the build flags.
;
;********************************************************************************
WELS_EXTERN DeblockLumaTransposeH2V_sse2
ALIGN  16
DeblockLumaTransposeH2V_sse2:
    push     r3
    push     r4
    push     r5

%assign   push_num   3                      ; three pushes above; LOAD_3_PARA is told so
    LOAD_3_PARA                             ; r0 = pPixY, r1 = iStride, r2 = pDst

    SIGN_EXTENSION   r1, r1d                ; iStride is int32_t; widen before address math

    ; Carve a 16-byte aligned scratch slot out of the stack.
    mov      r5,    r7                      ; r5 = original stack pointer
    mov      r3,    r7
    and      r3,    0Fh
    sub      r7,    r3                      ; round the stack pointer down to 16
    sub      r7,    10h                     ; [r7] = 16-byte aligned scratch
%ifdef WIN64
    ; xmm6/xmm7 are non-volatile in the Windows x64 ABI.
    sub      r7,    20h
    movdqa   [r7 + 10h], xmm6
    movdqa   [r7 + 20h], xmm7
%endif

    lea      r3,    [r0 + r1 * 8]           ; r3 = row 8 of the source block
    lea      r4,    [r1 * 3]                ; r4 = 3 * iStride

    ; xmmN = row N (low qword) | row N + 8 (high qword); xmm7 is the temp.
    movq     xmm0,  [r0]
    movq     xmm7,  [r3]
    punpcklqdq  xmm0,  xmm7
    movq     xmm1,  [r0 + r1]
    movq     xmm7,  [r3 + r1]
    punpcklqdq  xmm1,  xmm7
    movq     xmm2,  [r0 + r1*2]
    movq     xmm7,  [r3 + r1*2]
    punpcklqdq  xmm2,  xmm7
    movq     xmm3,  [r0 + r4]
    movq     xmm7,  [r3 + r4]
    punpcklqdq  xmm3,  xmm7

    lea      r0,    [r0 + r1 * 4]           ; advance both halves by four rows
    lea      r3,    [r3 + r1 * 4]
    movq     xmm4,  [r0]
    movq     xmm7,  [r3]
    punpcklqdq  xmm4,  xmm7
    movq     xmm5,  [r0 + r1]
    movq     xmm7,  [r3 + r1]
    punpcklqdq  xmm5,  xmm7
    movq     xmm6,  [r0 + r1*2]
    movq     xmm7,  [r3 + r1*2]
    punpcklqdq  xmm6,  xmm7

    ; All eight registers are needed for data, so park xmm0 in the scratch
    ; slot while it serves as the temporary for the last row pair.
    movdqa   [r7],  xmm0
    movq     xmm7,  [r0 + r4]
    movq     xmm0,  [r3 + r4]
    punpcklqdq  xmm7,  xmm0
    movdqa   xmm0,  [r7]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
    ; i.e. the output rows are in xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0

    movdqa  [r2],       xmm4
    movdqa  [r2 + 10h], xmm2
    movdqa  [r2 + 20h], xmm3
    movdqa  [r2 + 30h], xmm7
    movdqa  [r2 + 40h], xmm5
    movdqa  [r2 + 50h], xmm1
    movdqa  [r2 + 60h], xmm6
    movdqa  [r2 + 70h], xmm0

%ifdef WIN64
    movdqa   xmm6,  [r7 + 10h]
    movdqa   xmm7,  [r7 + 20h]
%endif
    mov      r7,    r5                      ; drop the scratch area
    pop      r5
    pop      r4
    pop      r3
    ret
;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;   Loads eight 16-byte rows from pSrc, runs them through SSE2_TransTwo8x8B
;   and stores the result to pPixY as 16 rows of 8 bytes (row pitch iStride):
;   the low qwords of the eight output registers go to rows 0..7, the high
;   qwords to rows 8..15.
;
;   pPixY   (r0) : destination pixels; only written with movq, so no alignment
;                  is needed
;   iStride (r1) : distance in bytes between two destination rows
;   pSrc    (r2) : 128-byte input buffer; read with movdqa, so it must be
;                  16-byte aligned
;
;   r3 and r4 are saved and restored.  The stack pointer (r7) is aligned down
;   to 16 bytes to provide the scratch slot the transpose macro needs and is
;   restored from r4 before returning.
;   On Windows x64 xmm6/xmm7 are callee-saved, so they are preserved in the
;   same aligned scratch area.
;   NOTE(review): WIN64 is assumed to be the symbol asm_inc.asm uses for the
;   Windows x64 target - confirm against the build flags.
;
;*******************************************************************************************
WELS_EXTERN DeblockLumaTransposeV2H_sse2
ALIGN  16
DeblockLumaTransposeV2H_sse2:
    push     r3
    push     r4

%assign   push_num   2                      ; two pushes above; LOAD_3_PARA is told so
    LOAD_3_PARA                             ; r0 = pPixY, r1 = iStride, r2 = pSrc
    SIGN_EXTENSION   r1, r1d                ; iStride is int32_t; widen before address math

    ; Carve a 16-byte aligned scratch slot out of the stack.
    mov      r4,    r7                      ; r4 = original stack pointer
    mov      r3,    r7
    and      r3,    0Fh
    sub      r7,    r3                      ; round the stack pointer down to 16
    sub      r7,    10h                     ; [r7] = 16-byte aligned scratch
%ifdef WIN64
    ; xmm6/xmm7 are non-volatile in the Windows x64 ABI.
    sub      r7,    20h
    movdqa   [r7 + 10h], xmm6
    movdqa   [r7 + 20h], xmm7
%endif

    movdqa   xmm0,  [r2]
    movdqa   xmm1,  [r2 + 10h]
    movdqa   xmm2,  [r2 + 20h]
    movdqa   xmm3,  [r2 + 30h]
    movdqa   xmm4,  [r2 + 40h]
    movdqa   xmm5,  [r2 + 50h]
    movdqa   xmm6,  [r2 + 60h]
    movdqa   xmm7,  [r2 + 70h]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
    ; i.e. the output rows are in xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0

    lea      r2,    [r1 * 3]                ; pSrc is no longer needed: r2 = 3 * iStride

    ; Rows 0..7: low 8 bytes of each output register.
    movq     [r0],          xmm4
    movq     [r0 + r1],     xmm2
    movq     [r0 + r1*2],   xmm3
    movq     [r0 + r2],     xmm7
    lea      r0,    [r0 + r1*4]
    movq     [r0],          xmm5
    movq     [r0 + r1],     xmm1
    movq     [r0 + r1*2],   xmm6
    movq     [r0 + r2],     xmm0

    ; Bring the high 8 bytes of every register down into the low qword.
    psrldq   xmm4,  8
    psrldq   xmm2,  8
    psrldq   xmm3,  8
    psrldq   xmm7,  8
    psrldq   xmm5,  8
    psrldq   xmm1,  8
    psrldq   xmm6,  8
    psrldq   xmm0,  8

    ; Rows 8..15: former high 8 bytes.
    lea      r0,    [r0 + r1*4]
    movq     [r0],          xmm4
    movq     [r0 + r1],     xmm2
    movq     [r0 + r1*2],   xmm3
    movq     [r0 + r2],     xmm7
    lea      r0,    [r0 + r1*4]
    movq     [r0],          xmm5
    movq     [r0 + r1],     xmm1
    movq     [r0 + r1*2],   xmm6
    movq     [r0 + r2],     xmm0

%ifdef WIN64
    movdqa   xmm6,  [r7 + 10h]
    movdqa   xmm7,  [r7 + 20h]
%endif
    mov      r7,    r4                      ; drop the scratch area
    pop      r4
    pop      r3
    ret