5326 lines
127 KiB
NASM
5326 lines
127 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* deblock.asm
|
|
;*
|
|
;* Abstract
|
|
;* edge loop
|
|
;*
|
|
;* History
|
|
;* 08/07/2009 Created
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
%include "asm_inc.asm"
|
|
|
|
;*******************************************************************************
|
|
; Macros and other preprocessor constants
|
|
;*******************************************************************************
|
|
|
|
; Read-only constants. COFF output uses the "pData" section attribute,
; every other object format asks for 16-byte section alignment directly.
%ifdef FORMAT_COFF
SECTION .rodata pData
%else
SECTION .rodata align=16
%endif

ALIGN 16
; Eight 16-bit words of 4; added as a memory operand before a "psraw ..,3"
; in the luma filter, so it must stay 16-byte aligned.
FOUR_16B_SSE2:
    times 8 dw 4
|
|
|
|
|
|
SECTION .text
|
|
|
|
%ifdef WIN64
|
|
|
|
|
|
;-----------------------------------------------------------------------
; DeblockLumaLt4V_ssse3  (Win64 build)
;
; Register / stack inputs as consumed by the code below:
;   rcx           base pointer; 16-byte rows are read at rcx-3*stride,
;                 rcx-2*stride, rcx-stride, rcx, rcx+stride, rcx+2*stride
;   edx           row stride in bytes (sign-extended to 64 bit)
;   r8d           threshold broadcast into xmm4 (presumably iAlpha - confirm)
;   r9d           threshold broadcast into xmm0 (presumably iBeta  - confirm)
;   [rsp+28h]     pTC: pointer to four signed bytes
; Writes the rows rcx-2*stride, rcx-stride, rcx and rcx+stride.
; All row accesses use movdqa, so every row address must be 16-byte aligned.
;
; Frame: push rbp + 1B0h bytes, rbp = rsp+20h.
;   [rbp+000h..09Fh]  spill area for xmm6-xmm15 (Win64 callee-saved)
;   [rbp+0A0h..17Fh]  filter temporaries
;   [rbp+180h]        saved r12
; The callee-saved XMM registers are reloaded from the spill area before
; returning; the frame layout itself is unchanged.
;-----------------------------------------------------------------------
WELS_EXTERN DeblockLumaLt4V_ssse3

DeblockLumaLt4V_ssse3:
    push        rbp
    mov         r11,[rsp + 16 + 20h] ; pTC
    sub         rsp,1B0h
    lea         rbp,[rsp+20h]
    movd        xmm4,r8d
    movd        xmm2,r9d
    mov         qword [rbp+180h],r12     ; r12 is callee-saved
    mov         r10,rcx
    movsxd      r12,edx                  ; r12 = stride
    add         edx,edx
    movsxd      rdx,edx                  ; rdx = 2*stride
    sub         r10,r12                  ; r10 = rcx - stride
    movsx       r8d,byte [r11]           ; tc[0]
    pxor        xmm3,xmm3
    punpcklwd   xmm2,xmm2
    movaps      [rbp+50h],xmm14          ; spill xmm14
    lea         rax,[r12+r12*2]
    movdqa      xmm14,[rdx+rcx]          ; row rcx + 2*stride
    neg         rax                      ; rax = -3*stride
    pshufd      xmm0,xmm2,0              ; xmm0 = r9d threshold in every word
    movd        xmm2,r8d
    movsx       edx,byte [r11+1]         ; tc[1]
    movsx       r8d,byte [r11+2]         ; tc[2]
    movsx       r11d,byte [r11+3]        ; tc[3]
    movaps      [rbp+70h],xmm12          ; spill xmm12
    movd        xmm1,edx
    movaps      [rbp+80h],xmm11          ; spill xmm11
    movd        xmm12,r8d
    movd        xmm11,r11d
    movdqa      xmm5,[rax+rcx]           ; row rcx - 3*stride
    lea         rax,[r12+r12]
    punpcklwd   xmm12,xmm12
    neg         rax                      ; rax = -2*stride
    punpcklwd   xmm11,xmm11
    movaps      [rbp],xmm8               ; spill xmm8
    movdqa      xmm8,[r10]               ; row rcx - stride
    punpcklwd   xmm2,xmm2
    punpcklwd   xmm1,xmm1
    punpcklqdq  xmm12,xmm12
    punpcklqdq  xmm11,xmm11
    punpcklqdq  xmm2,xmm2
    punpcklqdq  xmm1,xmm1
    shufps      xmm12,xmm11,88h          ; xmm12 = tc[2] x4 | tc[3] x4 (words)
    movdqa      xmm11,xmm8
    movaps      [rbp+30h],xmm9           ; spill xmm9
    movdqa      xmm9,[rcx]               ; row rcx
    shufps      xmm2,xmm1,88h            ; xmm2 = tc[0] x4 | tc[1] x4 (words)
    movdqa      xmm1,xmm5
    punpcklbw   xmm11,xmm3
    movaps      [rbp+20h],xmm6           ; spill xmm6
    movaps      [rbp+60h],xmm13          ; spill xmm13
    movdqa      xmm13,xmm11
    movaps      [rbp+90h],xmm10          ; spill xmm10
    movdqa      xmm10,xmm9
    movdqa      xmm6,[rax+rcx]           ; row rcx - 2*stride
    punpcklbw   xmm1,xmm3
    movaps      [rbp+0A0h],xmm12         ; keep tc words for the high 8 pixels
    psubw       xmm13,xmm1
    movaps      [rbp+40h],xmm15          ; spill xmm15
    movdqa      xmm15,xmm14
    movaps      [rbp+10h],xmm7           ; spill xmm7
    movdqa      xmm7,xmm6
    punpcklbw   xmm10,xmm3
    movdqa      xmm12,[r12+rcx]          ; row rcx + stride
    punpcklbw   xmm7,xmm3
    punpcklbw   xmm12,xmm3
    punpcklbw   xmm15,xmm3

    ; ---- low 8 pixels: masks and clipped deltas ----
    pabsw       xmm3,xmm13
    movdqa      xmm13,xmm10
    psubw       xmm13,xmm15
    movdqa      [rbp+0F0h],xmm15
    pabsw       xmm15,xmm13
    movdqa      xmm13,xmm11
    movdqa      [rbp+0B0h],xmm1
    movdqa      xmm1,xmm0
    pavgw       xmm13,xmm10
    pcmpgtw     xmm1,xmm3
    movdqa      [rbp+120h],xmm13
    movaps      xmm13,xmm2
    punpcklwd   xmm4,xmm4
    movdqa      xmm3,xmm0
    movdqa      [rbp+100h],xmm1
    psubw       xmm13,xmm1
    movdqa      xmm1,xmm10
    pcmpgtw     xmm3,xmm15
    pshufd      xmm4,xmm4,0              ; xmm4 = r8d threshold in every word
    psubw       xmm1,xmm11
    movdqa      [rbp+0D0h],xmm10
    psubw       xmm13,xmm3
    movdqa      [rbp+110h],xmm3
    pabsw       xmm15,xmm1
    movdqa      xmm3,xmm4
    psubw       xmm10,xmm12
    pcmpgtw     xmm3,xmm15
    pabsw       xmm15,xmm10
    movdqa      xmm10,xmm0
    psllw       xmm1,2
    movdqa      [rbp+0C0h],xmm11
    psubw       xmm11,xmm7
    pcmpgtw     xmm10,xmm15
    pabsw       xmm11,xmm11
    movdqa      xmm15,xmm0
    pand        xmm3,xmm10
    pcmpgtw     xmm15,xmm11
    movaps      xmm11,xmm2
    pxor        xmm10,xmm10
    pand        xmm3,xmm15
    pcmpgtw     xmm11,xmm10
    pcmpeqw     xmm10,xmm2
    por         xmm11,xmm10              ; lanes where tc >= 0
    pand        xmm3,xmm11
    movdqa      xmm11,xmm7
    psubw       xmm11,xmm12
    pxor        xmm15,xmm15
    paddw       xmm11,xmm1
    psubw       xmm15,xmm13
    movdqa      [rbp+0E0h],xmm12
    paddw       xmm11,[FOUR_16B_SSE2]
    pxor        xmm12,xmm12
    psraw       xmm11,3
    punpckhbw   xmm8,xmm12
    pmaxsw      xmm15,xmm11
    punpckhbw   xmm5,xmm12
    movdqa      xmm11,xmm8
    pminsw      xmm13,xmm15
    psubw       xmm11,xmm5
    punpckhbw   xmm9,xmm12
    pand        xmm13,xmm3
    movdqa      [rbp+130h],xmm13

    ; ---- high 8 pixels: masks and clipped deltas ----
    pabsw       xmm13,xmm11
    punpckhbw   xmm14,xmm12
    movdqa      xmm11,xmm9
    psubw       xmm11,xmm14
    movdqa      xmm15,xmm0
    movdqa      [rbp+140h],xmm14
    pabsw       xmm14,xmm11
    movdqa      xmm11,xmm8
    pcmpgtw     xmm15,xmm14
    movdqa      xmm1,[r12+rcx]           ; row rcx + stride (for the high half)
    pavgw       xmm11,xmm9
    movdqa      [rbp+170h],xmm11
    movdqa      xmm10,xmm9
    punpckhbw   xmm6,xmm12
    psubw       xmm10,xmm8
    punpckhbw   xmm1,xmm12
    movdqa      xmm12,xmm0
    movaps      xmm11,[rbp+0A0h]
    pcmpgtw     xmm12,xmm13
    movaps      xmm13,xmm11
    psubw       xmm13,xmm12
    movdqa      [rbp+160h],xmm15
    psubw       xmm13,xmm15
    movdqa      xmm15,xmm9
    psubw       xmm15,xmm1
    movdqa      [rbp+150h],xmm12
    pabsw       xmm12,xmm10
    pabsw       xmm14,xmm15
    movdqa      xmm15,xmm8
    pcmpgtw     xmm4,xmm12
    movdqa      xmm12,xmm0
    psubw       xmm15,xmm6
    pcmpgtw     xmm12,xmm14
    pabsw       xmm14,xmm15
    psllw       xmm10,2
    pcmpgtw     xmm0,xmm14
    movdqa      xmm14,xmm6
    psubw       xmm14,xmm1
    pand        xmm4,xmm12
    paddw       xmm14,xmm10
    pand        xmm4,xmm0
    paddw       xmm14,[FOUR_16B_SSE2]
    pxor        xmm15,xmm15
    movaps      xmm12,xmm11
    psubw       xmm15,xmm13
    pxor        xmm0,xmm0
    psraw       xmm14,3
    pcmpgtw     xmm12,xmm0
    pcmpeqw     xmm0,xmm11
    pmaxsw      xmm15,xmm14
    por         xmm12,xmm0
    movdqa      xmm0,[rbp+120h]
    pminsw      xmm13,xmm15
    movdqa      xmm15,[rbp+0B0h]
    movdqa      xmm10,xmm7
    pand        xmm4,xmm12
    paddw       xmm15,xmm0
    pxor        xmm12,xmm12
    paddw       xmm10,xmm7
    movdqa      xmm14,xmm12
    psubw       xmm15,xmm10
    psubw       xmm14,xmm2
    psraw       xmm15,1
    pmaxsw      xmm15,xmm14
    movdqa      xmm10,xmm6
    pminsw      xmm15,xmm2
    paddw       xmm10,xmm6
    pand        xmm15,xmm3
    psubw       xmm12,xmm11
    pand        xmm15,[rbp+100h]
    pand        xmm13,xmm4
    paddw       xmm7,xmm15
    paddw       xmm8,xmm13
    movdqa      xmm15,[rbp+170h]
    psubw       xmm9,xmm13
    paddw       xmm5,xmm15
    psubw       xmm5,xmm10
    psraw       xmm5,1
    pmaxsw      xmm5,xmm12
    pminsw      xmm5,xmm11
    pand        xmm5,xmm4
    pand        xmm5,[rbp+150h]
    paddw       xmm6,xmm5
    movdqa      xmm5,[rbp+0C0h]
    packuswb    xmm7,xmm6                ; xmm7 = new row rcx - 2*stride
    movdqa      xmm6,[rbp+130h]
    paddw       xmm5,xmm6
    packuswb    xmm5,xmm8                ; xmm5 = new row rcx - stride
    movdqa      xmm8,[rbp+0D0h]
    psubw       xmm8,xmm6
    movdqa      xmm6,[rbp+0F0h]
    paddw       xmm6,xmm0
    movdqa      xmm0,[rbp+0E0h]
    packuswb    xmm8,xmm9                ; xmm8 = new row rcx
    movdqa      xmm9,xmm0
    paddw       xmm9,xmm0
    psubw       xmm6,xmm9
    psraw       xmm6,1
    pmaxsw      xmm14,xmm6
    pminsw      xmm2,xmm14
    pand        xmm2,xmm3
    pand        xmm2,[rbp+110h]
    paddw       xmm0,xmm2
    movdqa      xmm2,[rbp+140h]
    paddw       xmm2,xmm15
    movdqa      xmm15,xmm1
    paddw       xmm15,xmm1
    psubw       xmm2,xmm15
    psraw       xmm2,1
    pmaxsw      xmm12,xmm2
    pminsw      xmm11,xmm12
    pand        xmm11,xmm4
    pand        xmm11,[rbp+160h]
    paddw       xmm1,xmm11

    ; ---- write back the four modified rows ----
    movdqa      [rax+rcx],xmm7
    movdqa      [r10],xmm5
    packuswb    xmm0,xmm1                ; xmm0 = new row rcx + stride
    movdqa      [rcx],xmm8
    movdqa      [r12+rcx],xmm0

    ; ---- restore Win64 callee-saved XMM registers from their spill slots ----
    movaps      xmm8,[rbp]
    movaps      xmm7,[rbp+10h]
    movaps      xmm6,[rbp+20h]
    movaps      xmm9,[rbp+30h]
    movaps      xmm15,[rbp+40h]
    movaps      xmm14,[rbp+50h]
    movaps      xmm13,[rbp+60h]
    movaps      xmm12,[rbp+70h]
    movaps      xmm11,[rbp+80h]
    movaps      xmm10,[rbp+90h]

    mov         r12,qword [rbp+180h]
    lea         rsp,[rbp+190h]
    pop         rbp
    ret
|
|
|
|
|
|
;-----------------------------------------------------------------------
; DeblockLumaEq4V_ssse3  (Win64 build)
;
; Register inputs as consumed by the code below:
;   rcx   base pointer; 16-byte rows are read at rcx-4*stride .. rcx+3*stride
;   edx   row stride in bytes (sign-extended)
;   r8d   threshold, low 16 bits broadcast into xmm11 (presumably iAlpha)
;   r9d   threshold, low 16 bits broadcast into xmm7  (presumably iBeta)
; Writes the rows rcx-3*stride .. rcx+2*stride.
; All row accesses use movdqa, so every row address must be 16-byte aligned.
;
; Frame: four pushes + 1D8h bytes.
;   [rsp+000h..12Fh]            filter temporaries
;   [entry_rsp-0C8h..-29h]      spill area for xmm6-xmm15, written through rax
;                               in the prologue and reloaded through r11
;                               (= entry_rsp-20h) in the epilogue
;-----------------------------------------------------------------------
WELS_EXTERN DeblockLumaEq4V_ssse3

ALIGN 16
DeblockLumaEq4V_ssse3:
    mov         rax,rsp                  ; rax = rsp at entry
    push        rbx
    push        rbp
    push        rsi
    push        rdi
    sub         rsp,1D8h
    movaps      [rax-38h],xmm6           ; spill callee-saved xmm6..xmm8
    movaps      [rax-48h],xmm7
    movaps      [rax-58h],xmm8
    pxor        xmm1,xmm1
    movsxd      r10,edx                  ; r10 = stride
    mov         rbp,rcx                  ; rbp = base row pointer
    mov         r11d,r8d
    mov         rdx,rcx
    mov         rdi,rbp
    mov         rbx,rbp
    movdqa      xmm5,[rbp]               ; row rbp
    movaps      [rax-68h],xmm9           ; spill callee-saved xmm9..xmm15
    movaps      [rax-78h],xmm10
    punpcklbw   xmm5,xmm1
    movaps      [rax-88h],xmm11
    movaps      [rax-98h],xmm12
    movaps      [rax-0A8h],xmm13
    movaps      [rax-0B8h],xmm14
    movdqa      xmm14,[r10+rbp]          ; row rbp + stride
    movaps      [rax-0C8h],xmm15
    lea         eax,[r10*4]
    movsxd      r8,eax                   ; r8 = 4*stride
    lea         eax,[r10+r10*2]
    movsxd      rcx,eax                  ; rcx = 3*stride
    lea         eax,[r10+r10]
    sub         rdx,r8                   ; rdx = rbp - 4*stride
    punpcklbw   xmm14,xmm1
    movdqa      [rsp+90h],xmm5
    movdqa      [rsp+30h],xmm14
    movsxd      rsi,eax                  ; rsi = 2*stride
    movsx       eax,r11w
    sub         rdi,rcx                  ; rdi = rbp - 3*stride
    sub         rbx,rsi                  ; rbx = rbp - 2*stride
    mov         r8,rbp
    sub         r8,r10                   ; r8  = rbp - stride
    movd        xmm0,eax
    movsx       eax,r9w
    movdqa      xmm12,[rdi]
    movdqa      xmm6,[rsi+rbp]           ; row rbp + 2*stride
    movdqa      xmm13,[rbx]
    punpcklwd   xmm0,xmm0
    pshufd      xmm11,xmm0,0             ; xmm11 = r8w threshold in every word
    punpcklbw   xmm13,xmm1
    punpcklbw   xmm6,xmm1
    movdqa      xmm8,[r8]
    movd        xmm0,eax
    movdqa      xmm10,xmm11
    mov         eax,2
    punpcklbw   xmm8,xmm1
    punpcklbw   xmm12,xmm1
    cwde
    punpcklwd   xmm0,xmm0
    psraw       xmm10,2
    movdqa      xmm1,xmm8
    movdqa      [rsp+0F0h],xmm13
    movdqa      [rsp+0B0h],xmm8
    pshufd      xmm7,xmm0,0              ; xmm7 = r9w threshold in every word
    psubw       xmm1,xmm13
    movdqa      xmm0,xmm5
    movdqa      xmm4,xmm7
    movdqa      xmm2,xmm7
    psubw       xmm0,xmm8
    pabsw       xmm3,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm5
    movdqa      [rsp+40h],xmm7
    movdqa      [rsp+60h],xmm6
    pcmpgtw     xmm4,xmm0
    psubw       xmm1,xmm14
    pabsw       xmm0,xmm1
    pcmpgtw     xmm2,xmm0
    pand        xmm4,xmm2
    movdqa      xmm0,xmm11
    pcmpgtw     xmm0,xmm3
    pand        xmm4,xmm0
    movd        xmm0,eax
    movdqa      [rsp+20h],xmm4           ; filter-enable mask, low 8 pixels
    punpcklwd   xmm0,xmm0
    pshufd      xmm2,xmm0,0              ; xmm2 = 2 in every word
    paddw       xmm10,xmm2               ; xmm10 = (xmm11 >> 2) + 2
    movdqa      [rsp+0A0h],xmm2
    movdqa      xmm15,xmm7
    pxor        xmm4,xmm4
    movdqa      xmm0,xmm8
    psubw       xmm0,xmm12
    mov         eax,4
    pabsw       xmm0,xmm0
    movdqa      xmm1,xmm10
    cwde
    pcmpgtw     xmm15,xmm0
    pcmpgtw     xmm1,xmm3
    movdqa      xmm3,xmm7
    movdqa      xmm7,[rdx]               ; row rbp - 4*stride
    movdqa      xmm0,xmm5
    psubw       xmm0,xmm6
    pand        xmm15,xmm1
    punpcklbw   xmm7,xmm4
    movdqa      xmm9,xmm15
    pabsw       xmm0,xmm0
    psllw       xmm7,1
    pandn       xmm9,xmm12
    pcmpgtw     xmm3,xmm0
    paddw       xmm7,xmm12
    movd        xmm0,eax
    pand        xmm3,xmm1
    paddw       xmm7,xmm12
    punpcklwd   xmm0,xmm0
    paddw       xmm7,xmm12
    pshufd      xmm1,xmm0,0              ; xmm1 = 4 in every word
    paddw       xmm7,xmm13
    movdqa      xmm0,xmm3
    pandn       xmm0,xmm6
    paddw       xmm7,xmm8
    movdqa      [rsp+70h],xmm1
    paddw       xmm7,xmm5
    movdqa      [rsp+120h],xmm0
    movdqa      xmm0,[rcx+rbp]           ; row rbp + 3*stride
    punpcklbw   xmm0,xmm4
    paddw       xmm7,xmm1
    movdqa      xmm4,xmm15
    psllw       xmm0,1
    psraw       xmm7,3
    paddw       xmm0,xmm6
    pand        xmm7,xmm15
    paddw       xmm0,xmm6
    paddw       xmm0,xmm6
    paddw       xmm0,xmm14
    movdqa      xmm6,xmm15
    paddw       xmm0,xmm5
    pandn       xmm6,xmm13
    paddw       xmm0,xmm8
    paddw       xmm0,xmm1
    psraw       xmm0,3
    movdqa      xmm1,xmm12
    paddw       xmm1,xmm13
    pand        xmm0,xmm3
    movdqa      [rsp+100h],xmm0
    movdqa      xmm0,xmm8
    paddw       xmm0,xmm5
    paddw       xmm1,xmm0
    movdqa      xmm0,xmm3
    paddw       xmm1,xmm2
    psraw       xmm1,2
    pandn       xmm0,xmm14
    pand        xmm4,xmm1
    movdqa      [rsp+0E0h],xmm0
    movdqa      xmm0,xmm5
    paddw       xmm0,xmm8
    movdqa      xmm1,[rsp+60h]
    paddw       xmm1,xmm14
    movdqa      xmm14,xmm3
    paddw       xmm1,xmm0
    movdqa      xmm0,xmm8
    paddw       xmm0,[rsp+30h]
    paddw       xmm1,xmm2
    psraw       xmm1,2
    pand        xmm14,xmm1
    movdqa      xmm1,xmm13
    paddw       xmm1,xmm13
    paddw       xmm1,xmm0
    paddw       xmm1,xmm2
    psraw       xmm1,2
    movdqa      xmm0,[rsp+30h]
    movdqa      xmm2,xmm13
    movdqa      xmm5,xmm15
    paddw       xmm0,[rsp+70h]
    pandn       xmm5,xmm1
    paddw       xmm2,xmm8
    movdqa      xmm8,[rsp+90h]
    movdqa      xmm1,xmm12
    paddw       xmm2,xmm8
    psllw       xmm2,1
    paddw       xmm2,xmm0
    paddw       xmm1,xmm2
    movdqa      xmm0,xmm8
    movdqa      xmm8,xmm3
    movdqa      xmm2,[rsp+30h]
    paddw       xmm0,xmm13
    psraw       xmm1,3
    pand        xmm15,xmm1
    movdqa      xmm1,xmm2
    paddw       xmm1,xmm2
    paddw       xmm2,[rsp+90h]
    paddw       xmm2,[rsp+0B0h]
    paddw       xmm1,xmm0
    movdqa      xmm0,xmm13
    movdqa      xmm13,[r8]               ; reload rows for the high 8 pixels
    paddw       xmm0,[rsp+70h]
    paddw       xmm1,[rsp+0A0h]
    psllw       xmm2,1
    paddw       xmm2,xmm0
    psraw       xmm1,2
    movdqa      xmm0,[rdi]
    pandn       xmm8,xmm1
    movdqa      xmm1,[rsp+60h]
    paddw       xmm1,xmm2
    movdqa      xmm2,[rbx]
    psraw       xmm1,3
    pand        xmm3,xmm1
    movdqa      xmm1,[rbp]
    movdqa      [rsp+0D0h],xmm3
    pxor        xmm3,xmm3
    punpckhbw   xmm0,xmm3
    punpckhbw   xmm1,xmm3
    punpckhbw   xmm13,xmm3
    movdqa      [rsp+0C0h],xmm0
    movdqa      xmm0,[r10+rbp]
    movdqa      [rsp],xmm1
    punpckhbw   xmm0,xmm3
    punpckhbw   xmm2,xmm3
    movdqa      [rsp+80h],xmm0
    movdqa      xmm0,[rsi+rbp]
    movdqa      [rsp+10h],xmm13
    punpckhbw   xmm0,xmm3
    movdqa      [rsp+50h],xmm0
    movdqa      xmm0,xmm1
    movdqa      xmm1,xmm13
    psubw       xmm0,xmm13
    psubw       xmm1,xmm2
    pabsw       xmm3,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,[rsp]
    movdqa      xmm13,[rsp+40h]
    movdqa      [rsp+110h],xmm2
    psubw       xmm1,[rsp+80h]
    pcmpgtw     xmm13,xmm0
    pcmpgtw     xmm11,xmm3
    pabsw       xmm0,xmm1
    pcmpgtw     xmm10,xmm3
    movdqa      xmm1,[rsp+40h]
    movdqa      xmm2,xmm1
    movdqa      xmm3,xmm1
    pcmpgtw     xmm2,xmm0
    movdqa      xmm0,[rsp+10h]
    pand        xmm13,xmm2
    pand        xmm13,xmm11              ; filter-enable mask, high 8 pixels
    movdqa      xmm11,[rsp+0C0h]
    psubw       xmm0,xmm11
    pabsw       xmm0,xmm0
    pcmpgtw     xmm3,xmm0
    pand        xmm3,xmm10
    movdqa      xmm0,[rsp]
    psubw       xmm0,[rsp+50h]
    movdqa      xmm2,[rdx]
    pabsw       xmm0,xmm0
    por         xmm7,xmm9
    movdqa      xmm9,[rsp+20h]
    pcmpgtw     xmm1,xmm0
    pand        xmm9,xmm7
    movdqa      xmm7,[rsp+20h]
    movdqa      xmm0,xmm7
    pandn       xmm0,xmm12
    movdqa      xmm12,[rsp+110h]
    pand        xmm1,xmm10
    movdqa      xmm10,[rsp+70h]
    movdqa      [rsp+40h],xmm1
    movdqa      xmm1,xmm13
    por         xmm9,xmm0
    pxor        xmm0,xmm0
    por         xmm4,xmm6
    movdqa      xmm6,xmm7
    punpckhbw   xmm2,xmm0
    por         xmm15,xmm5
    movdqa      xmm5,[rsp+20h]
    movdqa      xmm0,xmm3
    psllw       xmm2,1
    pandn       xmm0,xmm11
    pand        xmm6,xmm4
    movdqa      xmm4,[rsp]
    paddw       xmm2,xmm11
    pand        xmm5,xmm15
    movdqa      xmm15,[rsp+20h]
    paddw       xmm2,xmm11
    paddw       xmm2,xmm11
    paddw       xmm2,xmm12
    paddw       xmm2,[rsp+10h]
    paddw       xmm2,[rsp]
    paddw       xmm2,xmm10
    psraw       xmm2,3
    pand        xmm2,xmm3
    por         xmm2,xmm0
    pand        xmm1,xmm2
    movdqa      xmm0,xmm13
    movdqa      xmm2,xmm11
    pandn       xmm0,xmm11
    paddw       xmm2,xmm12
    por         xmm1,xmm0
    packuswb    xmm9,xmm1                ; xmm9 = new row rbp - 3*stride
    movdqa      xmm0,xmm7
    movdqa      xmm7,[rsp+0A0h]
    pandn       xmm0,[rsp+0F0h]
    movdqa      xmm1,xmm3
    por         xmm6,xmm0
    movdqa      xmm0,[rsp+10h]
    paddw       xmm0,xmm4
    paddw       xmm2,xmm0
    paddw       xmm2,xmm7
    movdqa      xmm0,xmm3
    pandn       xmm0,xmm12
    psraw       xmm2,2
    pand        xmm1,xmm2
    por         xmm1,xmm0
    movdqa      xmm2,xmm13
    movdqa      xmm0,xmm13
    pand        xmm2,xmm1
    pandn       xmm0,xmm12
    movdqa      xmm1,xmm12
    paddw       xmm1,[rsp+10h]
    por         xmm2,xmm0
    movdqa      xmm0,xmm15
    pandn       xmm0,[rsp+0B0h]
    paddw       xmm1,xmm4
    packuswb    xmm6,xmm2                ; xmm6 = new row rbp - 2*stride
    movdqa      xmm2,xmm3
    psllw       xmm1,1
    por         xmm5,xmm0
    movdqa      xmm0,[rsp+80h]
    paddw       xmm0,xmm10
    paddw       xmm1,xmm0
    paddw       xmm11,xmm1
    psraw       xmm11,3
    movdqa      xmm1,xmm12
    pand        xmm2,xmm11
    paddw       xmm1,xmm12
    movdqa      xmm11,[rsp+80h]
    movdqa      xmm0,[rsp+10h]
    por         xmm14,[rsp+0E0h]
    paddw       xmm0,xmm11
    movdqa      xmm4,xmm15
    paddw       xmm1,xmm0
    movdqa      xmm0,xmm13
    paddw       xmm1,xmm7
    psraw       xmm1,2
    pandn       xmm3,xmm1
    por         xmm2,xmm3
    movdqa      xmm1,xmm13
    movdqa      xmm3,[rsp+10h]
    pandn       xmm0,xmm3
    pand        xmm1,xmm2
    movdqa      xmm2,xmm11
    paddw       xmm2,[rsp]
    por         xmm1,xmm0
    movdqa      xmm0,[rsp+0D0h]
    por         xmm0,xmm8
    paddw       xmm2,xmm3
    packuswb    xmm5,xmm1                ; xmm5 = new row rbp - stride
    movdqa      xmm8,[rsp+40h]
    movdqa      xmm1,[rsp+50h]
    movdqa      xmm3,xmm8
    pand        xmm4,xmm0
    psllw       xmm2,1
    movdqa      xmm0,xmm15
    pandn       xmm0,[rsp+90h]
    por         xmm4,xmm0
    movdqa      xmm0,xmm12
    paddw       xmm0,xmm10
    paddw       xmm2,xmm0
    paddw       xmm1,xmm2
    movdqa      xmm0,[rsp]
    movdqa      xmm2,xmm11
    paddw       xmm0,xmm12
    movdqa      xmm12,[rsp]
    paddw       xmm2,xmm11
    paddw       xmm2,xmm0
    psraw       xmm1,3
    movdqa      xmm0,xmm8
    pand        xmm3,xmm1
    paddw       xmm2,xmm7
    movdqa      xmm1,xmm13
    psraw       xmm2,2
    pandn       xmm0,xmm2
    por         xmm3,xmm0
    movdqa      xmm2,[rsp+50h]
    movdqa      xmm0,xmm13
    pandn       xmm0,xmm12
    pand        xmm1,xmm3
    paddw       xmm2,xmm11
    movdqa      xmm3,xmm15
    por         xmm1,xmm0
    pand        xmm3,xmm14
    movdqa      xmm14,[rsp+10h]
    movdqa      xmm0,xmm15
    pandn       xmm0,[rsp+30h]
    packuswb    xmm4,xmm1                ; xmm4 = new row rbp
    movdqa      xmm1,xmm8
    por         xmm3,xmm0
    movdqa      xmm0,xmm12
    paddw       xmm0,xmm14
    paddw       xmm2,xmm0
    paddw       xmm2,xmm7
    movdqa      xmm0,xmm8
    pandn       xmm0,xmm11
    psraw       xmm2,2
    pand        xmm1,xmm2
    por         xmm1,xmm0
    movdqa      xmm2,xmm13
    movdqa      xmm0,xmm13
    pandn       xmm0,xmm11
    pand        xmm2,xmm1
    movdqa      xmm1,xmm15
    por         xmm2,xmm0
    packuswb    xmm3,xmm2                ; xmm3 = new row rbp + stride
    movdqa      xmm0,[rsp+100h]
    por         xmm0,[rsp+120h]
    pand        xmm1,xmm0
    movdqa      xmm2,[rcx+rbp]
    movdqa      xmm7,[rsp+50h]
    pandn       xmm15,[rsp+60h]

    ; ---- epilogue interleaved with the last row: r11 = entry_rsp - 20h ----
    lea         r11,[rsp+1D8h]
    pxor        xmm0,xmm0
    por         xmm1,xmm15
    movaps      xmm15,[r11-0A8h]         ; restore xmm15
    movdqa      [rdi],xmm9
    movaps      xmm9,[r11-48h]           ; restore xmm9
    punpckhbw   xmm2,xmm0
    psllw       xmm2,1
    paddw       xmm2,xmm7
    paddw       xmm2,xmm7
    movdqa      [rbx],xmm6
    movaps      xmm6,[r11-18h]           ; restore xmm6
    paddw       xmm2,xmm7
    paddw       xmm2,xmm11
    movaps      xmm11,[r11-68h]          ; restore xmm11
    paddw       xmm2,xmm12
    movaps      xmm12,[r11-78h]          ; restore xmm12
    paddw       xmm2,xmm14
    paddw       xmm2,xmm10
    psraw       xmm2,3
    movaps      xmm10,[r11-58h]          ; restore xmm10
    movaps      xmm14,[r11-98h]          ; restore xmm14
    movdqa      xmm0,xmm13
    pand        xmm2,xmm8
    pandn       xmm8,xmm7
    pandn       xmm13,xmm7
    por         xmm2,xmm8
    movaps      xmm7,[r11-28h]           ; restore xmm7
    movaps      xmm8,[r11-38h]           ; restore xmm8
    movdqa      [r8],xmm5
    pand        xmm0,xmm2
    por         xmm0,xmm13
    packuswb    xmm1,xmm0                ; xmm1 = new row rbp + 2*stride
    movaps      xmm13,[r11-88h]          ; restore xmm13
    movdqa      [rbp],xmm4
    movdqa      [r10+rbp],xmm3
    movdqa      [rsi+rbp],xmm1
    mov         rsp,r11
    pop         rdi
    pop         rsi
    pop         rbp
    pop         rbx
    ret
|
|
|
|
|
|
;-----------------------------------------------------------------------
; DeblockChromaLt4V_ssse3  (Win64 build)
;
; Register / stack inputs as consumed by the code below:
;   rcx           first plane pointer  (kept in rbx)
;   rdx           second plane pointer (kept in rdi)
;   r8d           row stride in bytes (sign-extended)
;   r9d           threshold, low 16 bits broadcast into xmm10
;                 (presumably iAlpha - confirm)
;   [rsp+28h]     iBeta, low 16 bits broadcast into xmm8
;   [rsp+30h]     pTC: pointer to four signed bytes
; For each plane, 8-byte rows are read at ptr-2*stride, ptr-stride, ptr and
; ptr+stride; the rows ptr-stride and ptr are written back.
;
; Frame: two pushes + 0C8h bytes (rsp 16-byte aligned afterwards).
;   [rsp+00h..0Fh]   tc bytes widened to eight words
;   [rsp+10h..1Fh]   temporary
;   [rsp+20h..0BFh]  spill area for xmm15..xmm6 (Win64 callee-saved)
; The spill slots are written in the prologue; the existing reloads spread
; through the filter tail restore each register after its last use.
;-----------------------------------------------------------------------
WELS_EXTERN DeblockChromaLt4V_ssse3

ALIGN 16
DeblockChromaLt4V_ssse3:
    mov         rax,rsp                  ; rax = rsp at entry
    push        rbx
    push        rdi
    sub         rsp,0C8h

    ; ---- save Win64 callee-saved XMM registers into the slots that the
    ;      reloads further down read from ----
    movaps      [rsp+0B0h],xmm6
    movaps      [rsp+0A0h],xmm7
    movaps      [rsp+90h],xmm8
    movaps      [rsp+80h],xmm9
    movaps      [rsp+70h],xmm10
    movaps      [rsp+60h],xmm11
    movaps      [rsp+50h],xmm12
    movaps      [rsp+40h],xmm13
    movaps      [rsp+30h],xmm14
    movaps      [rsp+20h],xmm15

    mov         r10,qword [rax + 30h] ; pTC
    pxor        xmm1,xmm1
    mov         rbx,rcx
    movsxd      r11,r8d                  ; r11 = stride
    movsx       ecx,byte [r10]           ; tc[0]
    movsx       r8d,byte [r10+2]         ; tc[2]
    mov         rdi,rdx
    movq        xmm2,[rbx]
    movq        xmm9,[r11+rbx]
    movsx       edx,byte [r10+1]         ; tc[1]
    mov         word [rsp+2],cx
    mov         word [rsp],cx
    movsx       eax,byte [r10+3]         ; tc[3]
    mov         word [rsp+6],dx
    mov         word [rsp+4],dx
    movdqa      xmm11,xmm1
    mov         word [rsp+0Eh],ax
    mov         word [rsp+0Ch],ax
    lea         eax,[r11+r11]
    movsxd      rcx,eax                  ; rcx = 2*stride
    mov         rax,rbx
    mov         rdx,rdi
    sub         rax,rcx
    mov         word [rsp+0Ah],r8w
    mov         word [rsp+8],r8w
    movdqa      xmm6,[rsp]               ; xmm6 = tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3
    movdqa      xmm7,xmm6
    movq        xmm13,[rax]
    mov         rax,rdi
    sub         rax,rcx
    mov         rcx,rbx
    pcmpgtw     xmm7,xmm1                ; lanes where tc > 0
    psubw       xmm11,xmm6               ; xmm11 = -tc
    sub         rcx,r11                  ; rcx = first plane  - stride
    sub         rdx,r11                  ; rdx = second plane - stride
    movq        xmm0,[rax]
    movsx       eax,r9w
    movq        xmm15,[rcx]
    punpcklqdq  xmm13,xmm0
    movq        xmm0,[rdx]
    movdqa      xmm4,xmm13
    punpcklqdq  xmm15,xmm0
    movq        xmm0,[rdi]
    punpcklbw   xmm4,xmm1
    movdqa      xmm12,xmm15
    punpcklqdq  xmm2,xmm0
    movq        xmm0,[r11+rdi]
    punpcklbw   xmm12,xmm1
    movdqa      xmm14,xmm2
    punpcklqdq  xmm9,xmm0
    punpckhbw   xmm2,xmm1
    punpcklbw   xmm14,xmm1
    movd        xmm0,eax
    movsx       eax,word [rsp + 0C8h + 38h] ; iBeta
    punpckhbw   xmm13,xmm1
    punpckhbw   xmm15,xmm1
    movdqa      xmm3,xmm9
    movdqa      [rsp+10h],xmm2
    punpcklwd   xmm0,xmm0
    punpckhbw   xmm9,xmm1
    punpcklbw   xmm3,xmm1
    movdqa      xmm1,xmm14
    pshufd      xmm10,xmm0,0
    movd        xmm0,eax
    mov         eax,4
    cwde
    punpcklwd   xmm0,xmm0
    pshufd      xmm8,xmm0,0
    movd        xmm0,eax
    punpcklwd   xmm0,xmm0
    pshufd      xmm5,xmm0,0              ; xmm5 = 4 in every word
    psubw       xmm1,xmm12
    movdqa      xmm2,xmm10
    lea         r11,[rsp+0C8h]           ; r11 = frame top, base for the reloads
    psllw       xmm1,2
    movdqa      xmm0,xmm4
    psubw       xmm4,xmm12
    psubw       xmm0,xmm3
    psubw       xmm3,xmm14
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm11
    psraw       xmm1,3
    pmaxsw      xmm0,xmm1
    pminsw      xmm6,xmm0
    movdqa      xmm1,xmm8
    movdqa      xmm0,xmm12
    psubw       xmm0,xmm14
    pabsw       xmm0,xmm0
    pcmpgtw     xmm2,xmm0
    pabsw       xmm0,xmm4
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm3
    movdqa      xmm3,[rsp]
    pand        xmm2,xmm1
    movdqa      xmm1,xmm8
    pcmpgtw     xmm1,xmm0
    movdqa      xmm0,xmm13
    pand        xmm2,xmm1
    psubw       xmm0,xmm9
    psubw       xmm13,xmm15
    pand        xmm2,xmm7
    pand        xmm6,xmm2
    paddw       xmm12,xmm6
    psubw       xmm14,xmm6
    movdqa      xmm2,[rsp+10h]
    movaps      xmm6,[r11-18h]           ; restore xmm6
    movdqa      xmm1,xmm2
    psubw       xmm1,xmm15
    psubw       xmm9,xmm2
    psllw       xmm1,2
    paddw       xmm1,xmm0
    paddw       xmm1,xmm5
    movdqa      xmm0,xmm15
    psubw       xmm0,xmm2
    psraw       xmm1,3
    pmaxsw      xmm11,xmm1
    pabsw       xmm0,xmm0
    movdqa      xmm1,xmm8
    pcmpgtw     xmm10,xmm0
    pabsw       xmm0,xmm13
    pminsw      xmm3,xmm11
    movaps      xmm11,[r11-68h]          ; restore xmm11
    movaps      xmm13,[rsp+40h]          ; restore xmm13
    pcmpgtw     xmm1,xmm0
    pabsw       xmm0,xmm9
    movaps      xmm9,[r11-48h]           ; restore xmm9
    pand        xmm10,xmm1
    pcmpgtw     xmm8,xmm0
    pand        xmm10,xmm8
    pand        xmm10,xmm7
    movaps      xmm8,[r11-38h]           ; restore xmm8
    movaps      xmm7,[r11-28h]           ; restore xmm7
    pand        xmm3,xmm10
    paddw       xmm15,xmm3
    psubw       xmm2,xmm3
    movaps      xmm10,[r11-58h]          ; restore xmm10
    packuswb    xmm12,xmm15
    movaps      xmm15,[rsp+20h]          ; restore xmm15
    packuswb    xmm14,xmm2
    movq        [rcx],xmm12              ; first plane,  row ptr - stride
    movq        [rbx],xmm14              ; first plane,  row ptr
    psrldq      xmm12,8
    psrldq      xmm14,8
    movq        [rdx],xmm12              ; second plane, row ptr - stride
    movaps      xmm12,[r11-78h]          ; restore xmm12
    movq        [rdi],xmm14              ; second plane, row ptr
    movaps      xmm14,[rsp+30h]          ; restore xmm14
    mov         rsp,r11
    pop         rdi
    pop         rbx
    ret
|
|
|
|
|
|
;-----------------------------------------------------------------------
; DeblockChromaEq4V_ssse3  (Win64 build)
;
; Register / stack inputs as consumed by the code below:
;   rcx           first plane pointer  (kept in r11)
;   rdx           second plane pointer (kept in rbx)
;   r8d           row stride in bytes (sign-extended)
;   r9d           threshold, low 16 bits broadcast into xmm11
;                 (presumably iAlpha - confirm)
;   [rsp+28h]     iBeta, low 16 bits broadcast into xmm3
; For each plane, 8-byte rows are read at ptr-2*stride, ptr-stride, ptr and
; ptr+stride; the rows ptr-stride and ptr are written back.
;
; Frame: one push + 90h bytes (rsp 16-byte aligned afterwards), used only as
; the spill area for xmm14..xmm6 at [rsp+00h..8Fh]. The slots are written in
; the prologue; the existing reloads restore each register after its last use.
;-----------------------------------------------------------------------
WELS_EXTERN DeblockChromaEq4V_ssse3
ALIGN 16
DeblockChromaEq4V_ssse3:
    mov         rax,rsp
    push        rbx
    sub         rsp,90h

    ; ---- save Win64 callee-saved XMM registers into the slots that the
    ;      reloads further down read from ----
    movaps      [rsp+80h],xmm6
    movaps      [rsp+70h],xmm7
    movaps      [rsp+60h],xmm8
    movaps      [rsp+50h],xmm9
    movaps      [rsp+40h],xmm10
    movaps      [rsp+30h],xmm11
    movaps      [rsp+20h],xmm12
    movaps      [rsp+10h],xmm13
    movaps      [rsp],xmm14

    pxor        xmm1,xmm1
    mov         r11,rcx
    mov         rbx,rdx
    mov         r10d,r9d
    movq        xmm13,[r11]
    lea         eax,[r8+r8]
    movsxd      r9,eax                   ; r9 = 2*stride
    mov         rax,rcx
    sub         rax,r9
    movq        xmm14,[rax]
    mov         rax,rdx
    sub         rax,r9
    movq        xmm0,[rax]
    movsxd      rax,r8d                  ; rax = stride
    sub         rcx,rax                  ; rcx = first plane  - stride
    sub         rdx,rax                  ; rdx = second plane - stride
    movq        xmm12,[rax+r11]
    movq        xmm10,[rcx]
    punpcklqdq  xmm14,xmm0
    movdqa      xmm8,xmm14
    movq        xmm0,[rdx]
    punpcklbw   xmm8,xmm1
    punpckhbw   xmm14,xmm1
    punpcklqdq  xmm10,xmm0
    movq        xmm0,[rbx]
    movdqa      xmm5,xmm10
    punpcklqdq  xmm13,xmm0
    movq        xmm0,[rax+rbx]
    punpcklbw   xmm5,xmm1
    movsx       eax,r10w
    movdqa      xmm9,xmm13
    punpcklqdq  xmm12,xmm0
    punpcklbw   xmm9,xmm1
    punpckhbw   xmm10,xmm1
    movd        xmm0,eax
    movsx       eax,word [rsp + 90h + 8h + 28h] ; iBeta
    punpckhbw   xmm13,xmm1
    movdqa      xmm7,xmm12
    punpcklwd   xmm0,xmm0
    punpckhbw   xmm12,xmm1
    pshufd      xmm11,xmm0,0
    punpcklbw   xmm7,xmm1
    movd        xmm0,eax
    movdqa      xmm1,xmm8
    psubw       xmm1,xmm5
    punpcklwd   xmm0,xmm0
    movdqa      xmm6,xmm11
    pshufd      xmm3,xmm0,0
    movdqa      xmm0,xmm5
    psubw       xmm0,xmm9
    movdqa      xmm2,xmm3
    pabsw       xmm0,xmm0
    pcmpgtw     xmm6,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm3
    pcmpgtw     xmm2,xmm0
    pand        xmm6,xmm2
    movdqa      xmm0,xmm7
    movdqa      xmm2,xmm3
    psubw       xmm0,xmm9
    pabsw       xmm0,xmm0
    pcmpgtw     xmm1,xmm0
    pand        xmm6,xmm1                ; filter-enable mask, first plane
    movdqa      xmm0,xmm10
    movdqa      xmm1,xmm14
    psubw       xmm0,xmm13
    psubw       xmm1,xmm10
    pabsw       xmm0,xmm0
    pcmpgtw     xmm11,xmm0
    pabsw       xmm0,xmm1
    pcmpgtw     xmm2,xmm0
    pand        xmm11,xmm2
    movdqa      xmm0,xmm12
    movdqa      xmm4,xmm6
    movdqa      xmm1,xmm8
    mov         eax,2
    cwde
    paddw       xmm1,xmm8
    psubw       xmm0,xmm13
    paddw       xmm1,xmm5
    pabsw       xmm0,xmm0
    movdqa      xmm2,xmm14
    paddw       xmm1,xmm7
    pcmpgtw     xmm3,xmm0
    paddw       xmm2,xmm14
    movd        xmm0,eax
    pand        xmm11,xmm3               ; filter-enable mask, second plane
    paddw       xmm7,xmm7
    paddw       xmm2,xmm10
    punpcklwd   xmm0,xmm0
    paddw       xmm2,xmm12
    paddw       xmm12,xmm12
    pshufd      xmm3,xmm0,0              ; xmm3 = 2 in every word
    paddw       xmm7,xmm9
    paddw       xmm12,xmm13
    movdqa      xmm0,xmm6
    paddw       xmm1,xmm3
    pandn       xmm0,xmm5
    paddw       xmm7,xmm8
    psraw       xmm1,2
    paddw       xmm12,xmm14
    paddw       xmm7,xmm3
    movaps      xmm14,[rsp]              ; restore xmm14
    pand        xmm4,xmm1
    paddw       xmm12,xmm3
    psraw       xmm7,2
    movdqa      xmm1,xmm11
    por         xmm4,xmm0
    psraw       xmm12,2
    paddw       xmm2,xmm3
    movdqa      xmm0,xmm11
    pandn       xmm0,xmm10
    psraw       xmm2,2
    pand        xmm1,xmm2
    por         xmm1,xmm0
    packuswb    xmm4,xmm1
    movdqa      xmm0,xmm11
    movdqa      xmm1,xmm6
    pand        xmm1,xmm7
    movaps      xmm7,[rsp+70h]           ; restore xmm7
    movq        [rcx],xmm4               ; first plane, row ptr - stride
    pandn       xmm6,xmm9
    pandn       xmm11,xmm13
    pand        xmm0,xmm12
    por         xmm1,xmm6
    por         xmm0,xmm11
    psrldq      xmm4,8
    packuswb    xmm1,xmm0
    movq        [r11],xmm1               ; first plane, row ptr
    psrldq      xmm1,8
    movq        [rdx],xmm4               ; second plane, row ptr - stride
    lea         r11,[rsp+90h]            ; r11 = frame top
    movaps      xmm6,[r11-10h]           ; restore xmm6, xmm8..xmm13
    movaps      xmm8,[r11-30h]
    movaps      xmm9,[r11-40h]
    movq        [rbx],xmm1               ; second plane, row ptr
    movaps      xmm10,[r11-50h]
    movaps      xmm11,[r11-60h]
    movaps      xmm12,[r11-70h]
    movaps      xmm13,[r11-80h]
    mov         rsp,r11
    pop         rbx
    ret
|
|
|
|
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; DeblockChromaEq4H_ssse3  (Win64 build)
;
; Register / stack inputs as consumed by the code below:
;   rcx           first plane pointer
;   rdx           second plane pointer (kept in rdi)
;   r8d           row stride in bytes (sign-extended)
;   r9d           threshold, low 16 bits broadcast into xmm13
;                 (presumably iAlpha - confirm)
;   [rsp+28h]     iBeta, low 16 bits broadcast into xmm3
; For each plane, eight rows of 4 bytes starting at ptr-2 are gathered,
; transposed, filtered, transposed back and stored to the same addresses.
;
; Frame: one push + 140h bytes (rsp 16-byte aligned afterwards).
;   [rsp+000h..08Fh]   gather/scatter scratch
;   [rsp+0A0h..13Fh]   spill area for xmm15..xmm6 (Win64 callee-saved)
;   rbx is stored in the caller-provided home slot at entry_rsp+20h.
; The spill area reuses previously unused frame space, so the frame size and
; the iBeta offset ([rsp+170h]) are unchanged.
;-----------------------------------------------------------------------
WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
DeblockChromaEq4H_ssse3:
    mov         rax,rsp
    mov         [rax+20h],rbx            ; keep rbx in the caller's home slot
    push        rdi
    sub         rsp,140h

    ; ---- save Win64 callee-saved XMM registers ----
    movaps      [rsp+130h],xmm6
    movaps      [rsp+120h],xmm7
    movaps      [rsp+110h],xmm8
    movaps      [rsp+100h],xmm9
    movaps      [rsp+0F0h],xmm10
    movaps      [rsp+0E0h],xmm11
    movaps      [rsp+0D0h],xmm12
    movaps      [rsp+0C0h],xmm13
    movaps      [rsp+0B0h],xmm14
    movaps      [rsp+0A0h],xmm15

    ; ---- gather eight 4-byte rows per plane (rows 0-3 via rcx/rdi,
    ;      rows 4-7 via r11/rbx) ----
    mov         rdi,rdx
    lea         eax,[r8*4]
    movsxd      r10,eax                  ; r10 = 4*stride (temporarily)
    mov         eax,[rcx-2]
    mov         [rsp+10h],eax
    lea         rbx,[r10+rdx-2]          ; rbx = second plane + 4*stride - 2
    lea         r11,[r10+rcx-2]          ; r11 = first plane  + 4*stride - 2
    movdqa      xmm5,[rsp+10h]
    movsxd      r10,r8d                  ; r10 = stride
    mov         eax,[r10+rcx-2]
    lea         rdx,[r10+r10*2]          ; rdx = 3*stride
    mov         [rsp+20h],eax
    mov         eax,[rcx+r10*2-2]
    mov         [rsp+30h],eax
    mov         eax,[rdx+rcx-2]
    movdqa      xmm2,[rsp+20h]
    mov         [rsp+40h],eax
    mov         eax,[rdi-2]
    movdqa      xmm4,[rsp+30h]
    mov         [rsp+50h],eax
    mov         eax,[r10+rdi-2]
    movdqa      xmm3,[rsp+40h]
    mov         [rsp+60h],eax
    mov         eax,[rdi+r10*2-2]
    punpckldq   xmm5,[rsp+50h]
    mov         [rsp+70h],eax
    mov         eax,[rdx+rdi-2]
    punpckldq   xmm2,[rsp+60h]
    mov         [rsp+80h],eax
    mov         eax,[r11]
    punpckldq   xmm4,[rsp+70h]
    mov         [rsp+50h],eax
    mov         eax,[rbx]
    punpckldq   xmm3,[rsp+80h]
    mov         [rsp+60h],eax
    mov         eax,[r10+r11]
    movdqa      xmm0,[rsp+50h]
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm5,xmm0
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[r10+rbx]
    movdqa      xmm0,[rsp+50h]
    movdqa      xmm1,xmm5
    mov         [rsp+60h],eax
    mov         eax,[r11+r10*2]
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm2,xmm0
    punpcklbw   xmm1,xmm2
    punpckhbw   xmm5,xmm2
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[rbx+r10*2]
    movdqa      xmm0,[rsp+50h]
    mov         [rsp+60h],eax
    mov         eax,[rdx+r11]
    movdqa      xmm15,xmm1
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm4,xmm0
    movdqa      [rsp+50h],xmm0
    mov         [rsp+50h],eax
    mov         eax,[rdx+rbx]
    movdqa      xmm0,[rsp+50h]
    mov         [rsp+60h],eax
    punpckldq   xmm0,[rsp+60h]
    punpcklqdq  xmm3,xmm0

    ; ---- byte transpose: rows -> columns ----
    movdqa      xmm0,xmm4
    punpcklbw   xmm0,xmm3
    punpckhbw   xmm4,xmm3
    punpcklwd   xmm15,xmm0
    punpckhwd   xmm1,xmm0
    movdqa      xmm0,xmm5
    movdqa      xmm12,xmm15
    punpcklwd   xmm0,xmm4
    punpckhwd   xmm5,xmm4
    punpckldq   xmm12,xmm0
    punpckhdq   xmm15,xmm0
    movdqa      xmm0,xmm1
    movdqa      xmm11,xmm12
    punpckldq   xmm0,xmm5
    punpckhdq   xmm1,xmm5
    punpcklqdq  xmm11,xmm0
    punpckhqdq  xmm12,xmm0
    movsx       eax,r9w
    movdqa      xmm14,xmm15
    punpcklqdq  xmm14,xmm1
    punpckhqdq  xmm15,xmm1

    ; ---- widen to words, build masks, filter ----
    pxor        xmm1,xmm1
    movd        xmm0,eax
    movdqa      xmm4,xmm12
    movdqa      xmm8,xmm11
    movsx       eax,word [rsp+170h] ; iBeta
    punpcklwd   xmm0,xmm0
    punpcklbw   xmm4,xmm1
    punpckhbw   xmm12,xmm1
    movdqa      xmm9,xmm14
    movdqa      xmm7,xmm15
    movdqa      xmm10,xmm15
    pshufd      xmm13,xmm0,0
    punpcklbw   xmm9,xmm1
    punpckhbw   xmm14,xmm1
    movdqa      xmm6,xmm13
    movd        xmm0,eax
    movdqa      [rsp],xmm11
    mov         eax,2
    cwde
    punpckhbw   xmm11,xmm1
    punpckhbw   xmm10,xmm1
    punpcklbw   xmm7,xmm1
    punpcklwd   xmm0,xmm0
    punpcklbw   xmm8,xmm1
    pshufd      xmm3,xmm0,0
    movdqa      xmm1,xmm8
    movdqa      xmm0,xmm4
    psubw       xmm0,xmm9
    psubw       xmm1,xmm4
    movdqa      xmm2,xmm3
    pabsw       xmm0,xmm0
    pcmpgtw     xmm6,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm3
    pcmpgtw     xmm2,xmm0
    pand        xmm6,xmm2
    movdqa      xmm0,xmm7
    movdqa      xmm2,xmm3
    psubw       xmm0,xmm9
    pabsw       xmm0,xmm0
    pcmpgtw     xmm1,xmm0
    pand        xmm6,xmm1
    movdqa      xmm0,xmm12
    movdqa      xmm1,xmm11
    psubw       xmm0,xmm14
    psubw       xmm1,xmm12
    movdqa      xmm5,xmm6
    pabsw       xmm0,xmm0
    pcmpgtw     xmm13,xmm0
    pabsw       xmm0,xmm1
    movdqa      xmm1,xmm8
    pcmpgtw     xmm2,xmm0
    paddw       xmm1,xmm8
    movdqa      xmm0,xmm10
    pand        xmm13,xmm2
    psubw       xmm0,xmm14
    paddw       xmm1,xmm4
    movdqa      xmm2,xmm11
    pabsw       xmm0,xmm0
    paddw       xmm2,xmm11
    paddw       xmm1,xmm7
    pcmpgtw     xmm3,xmm0
    paddw       xmm2,xmm12
    movd        xmm0,eax
    pand        xmm13,xmm3
    paddw       xmm2,xmm10
    punpcklwd   xmm0,xmm0
    pshufd      xmm3,xmm0,0              ; xmm3 = 2 in every word
    movdqa      xmm0,xmm6
    paddw       xmm1,xmm3
    pandn       xmm0,xmm4
    paddw       xmm2,xmm3
    psraw       xmm1,2
    pand        xmm5,xmm1
    por         xmm5,xmm0
    paddw       xmm7,xmm7
    paddw       xmm10,xmm10
    psraw       xmm2,2
    movdqa      xmm1,xmm13
    movdqa      xmm0,xmm13
    pandn       xmm0,xmm12
    pand        xmm1,xmm2
    paddw       xmm7,xmm9
    por         xmm1,xmm0
    paddw       xmm10,xmm14
    paddw       xmm7,xmm8
    movdqa      xmm0,xmm13
    packuswb    xmm5,xmm1
    paddw       xmm7,xmm3
    paddw       xmm10,xmm11
    movdqa      xmm1,xmm6
    paddw       xmm10,xmm3
    pandn       xmm6,xmm9
    psraw       xmm7,2
    pand        xmm1,xmm7
    psraw       xmm10,2
    pandn       xmm13,xmm14
    pand        xmm0,xmm10
    por         xmm1,xmm6

    ; ---- byte transpose back: columns -> rows ----
    movdqa      xmm6,[rsp]
    movdqa      xmm4,xmm6
    por         xmm0,xmm13
    punpcklbw   xmm4,xmm5
    punpckhbw   xmm6,xmm5
    movdqa      xmm3,xmm4
    packuswb    xmm1,xmm0
    movdqa      xmm0,xmm1
    punpckhbw   xmm1,xmm15
    punpcklbw   xmm0,xmm15
    punpcklwd   xmm3,xmm0
    punpckhwd   xmm4,xmm0
    movdqa      xmm0,xmm6
    movdqa      xmm2,xmm3
    punpcklwd   xmm0,xmm1
    punpckhwd   xmm6,xmm1
    movdqa      xmm1,xmm4
    punpckldq   xmm2,xmm0
    punpckhdq   xmm3,xmm0
    punpckldq   xmm1,xmm6
    movdqa      xmm0,xmm2
    punpcklqdq  xmm0,xmm1
    punpckhdq   xmm4,xmm6
    punpckhqdq  xmm2,xmm1

    ; ---- scatter the 4-byte rows back to both planes ----
    movdqa      [rsp+10h],xmm0
    movdqa      [rsp+60h],xmm2
    movdqa      xmm0,xmm3
    mov         eax,[rsp+10h]
    mov         [rcx-2],eax
    mov         eax,[rsp+60h]
    punpcklqdq  xmm0,xmm4
    punpckhqdq  xmm3,xmm4
    mov         [r10+rcx-2],eax
    movdqa      [rsp+20h],xmm0
    mov         eax,[rsp+20h]
    movdqa      [rsp+70h],xmm3
    mov         [rcx+r10*2-2],eax
    mov         eax,[rsp+70h]
    mov         [rdx+rcx-2],eax
    mov         eax,[rsp+18h]
    mov         [r11],eax
    mov         eax,[rsp+68h]
    mov         [r10+r11],eax
    mov         eax,[rsp+28h]
    mov         [r11+r10*2],eax
    mov         eax,[rsp+78h]
    mov         [rdx+r11],eax
    mov         eax,[rsp+14h]
    mov         [rdi-2],eax
    mov         eax,[rsp+64h]
    mov         [r10+rdi-2],eax
    mov         eax,[rsp+24h]
    mov         [rdi+r10*2-2],eax
    mov         eax,[rsp+74h]
    mov         [rdx+rdi-2],eax
    mov         eax,[rsp+1Ch]
    mov         [rbx],eax
    mov         eax,[rsp+6Ch]
    mov         [r10+rbx],eax
    mov         eax,[rsp+2Ch]
    mov         [rbx+r10*2],eax
    mov         eax,[rsp+7Ch]
    mov         [rdx+rbx],eax

    ; ---- restore Win64 callee-saved XMM registers ----
    movaps      xmm6,[rsp+130h]
    movaps      xmm7,[rsp+120h]
    movaps      xmm8,[rsp+110h]
    movaps      xmm9,[rsp+100h]
    movaps      xmm10,[rsp+0F0h]
    movaps      xmm11,[rsp+0E0h]
    movaps      xmm12,[rsp+0D0h]
    movaps      xmm13,[rsp+0C0h]
    movaps      xmm14,[rsp+0B0h]
    movaps      xmm15,[rsp+0A0h]

    lea         r11,[rsp+140h]
    mov         rbx,[r11+28h]            ; reload rbx from the home slot
    mov         rsp,r11
    pop         rdi
    ret
|
|
|
|
|
|
|
|
;*******************************************************************************
; DeblockChromaLt4H_ssse3  (WIN64 build, Microsoft x64 calling convention)
;
; Inputs as consumed by the code below:
;   rcx, rdx        two pixel-plane pointers (presumably pPixCb / pPixCr --
;                   confirm against the C prototype)
;   r8d             row stride (sign-extended)
;   r9w             threshold compared against |b(-1) - b(0)|
;   5th stack arg   iBeta (read as a signed word)
;   6th stack arg   pTC   (pointer to 4 signed bytes)
;
; For 8 rows of each plane, the 4 bytes at offsets -2..+1 around the pointer
; are gathered, transposed into four 16-byte column vectors, filtered, then
; transposed back and stored 4 bytes per row.
; Per 16-bit lane (b(k) = byte at offset k):
;   delta = clamp(((b(0)-b(-1))*4 + (b(-2)-b(1)) + 4) >> 3, -tc, tc)
;   b(-1) += delta ; b(0) -= delta
; applied only where |b(-1)-b(0)| < r9w, |b(-2)-b(-1)| < iBeta,
; |b(1)-b(0)| < iBeta and tc > 0.
;
; Stack frame (rsp is 16-byte aligned after the prologue):
;   [rsp+000h..0BFh]  scratch for tc vector, spills and the transposes
;   [rsp+0C0h..15Fh]  save area for xmm6-xmm15 (callee-saved on Win64)
;   [rsp+1C0h]        iBeta,  [rsp+1C8h]  pTC
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
DeblockChromaLt4H_ssse3:
    mov rax,rsp
    push rbx
    push rbp
    push rsi
    push rdi
    push r12
    sub rsp,170h

    ; xmm6-xmm15 are non-volatile in the Microsoft x64 ABI and are all
    ; overwritten below, so preserve them in the unused part of the frame.
    movaps [rsp+0C0h],xmm6
    movaps [rsp+0D0h],xmm7
    movaps [rsp+0E0h],xmm8
    movaps [rsp+0F0h],xmm9
    movaps [rsp+100h],xmm10
    movaps [rsp+110h],xmm11
    movaps [rsp+120h],xmm12
    movaps [rsp+130h],xmm13
    movaps [rsp+140h],xmm14
    movaps [rsp+150h],xmm15

    ; ---- gather 4 bytes from each of 8 rows of both planes ------------------
    ; rsi = stride, r10 = 4*stride then 3*stride, rdi = plane 1, r12 = plane 2,
    ; rbx / rbp = row 4 of plane 1 / plane 2 (already offset by -2).
    movsxd rsi,r8d
    lea eax,[r8*4]
    mov r11d,r9d
    movsxd r10,eax
    mov eax,[rcx-2]
    mov r12,rdx
    mov [rsp+40h],eax
    mov eax,[rsi+rcx-2]
    lea rbx,[r10+rcx-2]
    movdqa xmm5,[rsp+40h]
    mov [rsp+50h],eax
    mov eax,[rcx+rsi*2-2]
    lea rbp,[r10+rdx-2]
    movdqa xmm2,[rsp+50h]
    mov [rsp+60h],eax
    lea r10,[rsi+rsi*2]
    mov rdi,rcx
    mov eax,[r10+rcx-2]
    movdqa xmm4,[rsp+60h]
    mov [rsp+70h],eax
    mov eax,[rdx-2]
    mov [rsp+80h],eax
    mov eax,[rsi+rdx-2]
    movdqa xmm3,[rsp+70h]
    mov [rsp+90h],eax
    mov eax,[rdx+rsi*2-2]
    punpckldq xmm5,[rsp+80h]
    mov [rsp+0A0h],eax
    mov eax,[r10+rdx-2]
    punpckldq xmm2,[rsp+90h]
    mov [rsp+0B0h],eax
    mov eax,[rbx]
    punpckldq xmm4,[rsp+0A0h]
    mov [rsp+80h],eax
    mov eax,[rbp]
    punpckldq xmm3,[rsp+0B0h]
    mov [rsp+90h],eax
    mov eax,[rsi+rbx]
    movdqa xmm0,[rsp+80h]
    punpckldq xmm0,[rsp+90h]
    punpcklqdq xmm5,xmm0
    movdqa [rsp+80h],xmm0
    mov [rsp+80h],eax
    mov eax,[rsi+rbp]
    movdqa xmm0,[rsp+80h]
    movdqa xmm1,xmm5
    mov [rsp+90h],eax
    mov eax,[rbx+rsi*2]
    punpckldq xmm0,[rsp+90h]
    punpcklqdq xmm2,xmm0
    punpcklbw xmm1,xmm2
    punpckhbw xmm5,xmm2
    movdqa [rsp+80h],xmm0
    mov [rsp+80h],eax
    mov eax,[rbp+rsi*2]
    movdqa xmm0,[rsp+80h]
    mov [rsp+90h],eax
    mov eax,[r10+rbx]
    movdqa xmm7,xmm1
    punpckldq xmm0,[rsp+90h]
    punpcklqdq xmm4,xmm0
    movdqa [rsp+80h],xmm0
    mov [rsp+80h],eax
    mov eax,[r10+rbp]
    movdqa xmm0,[rsp+80h]
    mov [rsp+90h],eax
    punpckldq xmm0,[rsp+90h]
    punpcklqdq xmm3,xmm0

    ; ---- transpose rows into per-offset column vectors ----------------------
    movdqa xmm0,xmm4
    punpcklbw xmm0,xmm3
    punpckhbw xmm4,xmm3
    punpcklwd xmm7,xmm0
    punpckhwd xmm1,xmm0
    movdqa xmm0,xmm5
    movdqa xmm6,xmm7
    punpcklwd xmm0,xmm4
    punpckhwd xmm5,xmm4
    punpckldq xmm6,xmm0
    punpckhdq xmm7,xmm0
    movdqa xmm0,xmm1
    punpckldq xmm0,xmm5
    mov rax,[rsp+1C8h] ; pTC
    punpckhdq xmm1,xmm5
    movdqa xmm9,xmm6
    punpckhqdq xmm6,xmm0
    punpcklqdq xmm9,xmm0
    movdqa xmm2,xmm7
    movdqa xmm13,xmm6
    movdqa xmm4,xmm9
    movdqa [rsp+10h],xmm9
    punpcklqdq xmm2,xmm1
    punpckhqdq xmm7,xmm1
    pxor xmm1,xmm1

    ; ---- build the tc vector (each tc byte duplicated into two words) and
    ;      widen the columns to 16-bit lanes ---------------------------------
    movsx ecx,byte [rax+3]
    movsx edx,byte [rax+2]
    movsx r8d,byte [rax+1]
    movsx r9d,byte [rax]
    movdqa xmm10,xmm1
    movdqa xmm15,xmm2
    punpckhbw xmm2,xmm1
    punpckhbw xmm6,xmm1
    punpcklbw xmm4,xmm1
    movsx eax,r11w
    mov word [rsp+0Eh],cx
    mov word [rsp+0Ch],cx
    movdqa xmm3,xmm7
    movdqa xmm8,xmm7
    movdqa [rsp+20h],xmm7
    punpcklbw xmm15,xmm1
    punpcklbw xmm13,xmm1
    punpcklbw xmm3,xmm1
    mov word [rsp+0Ah],dx
    mov word [rsp+8],dx
    mov word [rsp+6],r8w
    movd xmm0,eax
    movdqa [rsp+30h],xmm6
    punpckhbw xmm9,xmm1
    punpckhbw xmm8,xmm1
    punpcklwd xmm0,xmm0
    movsx eax,word [rsp+1C0h] ; iBeta
    mov word [rsp+4],r8w
    mov word [rsp+2],r9w
    pshufd xmm12,xmm0,0
    mov word [rsp],r9w
    movd xmm0,eax
    mov eax,4
    cwde
    movdqa xmm14,[rsp]
    movdqa [rsp],xmm2
    movdqa xmm2,xmm12
    punpcklwd xmm0,xmm0
    pshufd xmm11,xmm0,0
    psubw xmm10,xmm14
    movd xmm0,eax
    movdqa xmm7,xmm14
    movdqa xmm6,xmm14
    pcmpgtw xmm7,xmm1
    punpcklwd xmm0,xmm0
    pshufd xmm5,xmm0,0

    ; ---- filter, low halves of the columns ----------------------------------
    movdqa xmm0,xmm4
    movdqa xmm1,xmm15
    psubw xmm4,xmm13
    psubw xmm0,xmm3
    psubw xmm1,xmm13
    psubw xmm3,xmm15
    psllw xmm1,2
    paddw xmm1,xmm0
    paddw xmm1,xmm5
    movdqa xmm0,xmm10
    psraw xmm1,3
    pmaxsw xmm0,xmm1
    pminsw xmm6,xmm0
    movdqa xmm1,xmm11
    movdqa xmm0,xmm13
    psubw xmm0,xmm15
    pabsw xmm0,xmm0
    pcmpgtw xmm2,xmm0
    pabsw xmm0,xmm4
    pcmpgtw xmm1,xmm0
    pabsw xmm0,xmm3
    pand xmm2,xmm1
    movdqa xmm1,xmm11
    movdqa xmm3,[rsp+30h]
    pcmpgtw xmm1,xmm0
    movdqa xmm0,xmm9
    pand xmm2,xmm1
    psubw xmm0,xmm8
    psubw xmm9,xmm3
    pand xmm2,xmm7
    pand xmm6,xmm2
    psubw xmm15,xmm6
    paddw xmm13,xmm6

    ; ---- filter, high halves of the columns ---------------------------------
    movdqa xmm2,[rsp]
    movdqa xmm1,xmm2
    psubw xmm1,xmm3
    psubw xmm8,xmm2
    psllw xmm1,2
    paddw xmm1,xmm0
    paddw xmm1,xmm5
    movdqa xmm0,xmm3
    movdqa xmm5,[rsp+10h]
    psubw xmm0,xmm2
    psraw xmm1,3
    movdqa xmm4,xmm5
    pabsw xmm0,xmm0
    pmaxsw xmm10,xmm1
    movdqa xmm1,xmm11
    pcmpgtw xmm12,xmm0
    pabsw xmm0,xmm9
    pminsw xmm14,xmm10
    pcmpgtw xmm1,xmm0
    pabsw xmm0,xmm8
    pcmpgtw xmm11,xmm0
    pand xmm12,xmm1
    movdqa xmm1,[rsp+20h]
    pand xmm12,xmm11
    pand xmm12,xmm7
    pand xmm14,xmm12
    paddw xmm3,xmm14
    psubw xmm2,xmm14

    ; ---- pack to bytes and transpose back to row order ----------------------
    packuswb xmm13,xmm3
    packuswb xmm15,xmm2
    punpcklbw xmm4,xmm13
    punpckhbw xmm5,xmm13
    movdqa xmm0,xmm15
    punpcklbw xmm0,xmm1
    punpckhbw xmm15,xmm1
    movdqa xmm3,xmm4
    punpcklwd xmm3,xmm0
    punpckhwd xmm4,xmm0
    movdqa xmm0,xmm5
    movdqa xmm2,xmm3
    movdqa xmm1,xmm4
    punpcklwd xmm0,xmm15
    punpckhwd xmm5,xmm15
    punpckldq xmm2,xmm0
    punpckhdq xmm3,xmm0
    punpckldq xmm1,xmm5
    movdqa xmm0,xmm2
    punpcklqdq xmm0,xmm1
    punpckhdq xmm4,xmm5
    punpckhqdq xmm2,xmm1

    ; ---- scatter 4 bytes back to each of the 16 rows ------------------------
    movdqa [rsp+40h],xmm0
    movdqa xmm0,xmm3
    movdqa [rsp+90h],xmm2
    mov eax,[rsp+40h]
    mov [rdi-2],eax
    mov eax,[rsp+90h]
    punpcklqdq xmm0,xmm4
    punpckhqdq xmm3,xmm4
    mov [rsi+rdi-2],eax
    movdqa [rsp+50h],xmm0
    mov eax,[rsp+50h]
    movdqa [rsp+0A0h],xmm3
    mov [rdi+rsi*2-2],eax
    mov eax,[rsp+0A0h]
    mov [r10+rdi-2],eax
    mov eax,[rsp+48h]
    mov [rbx],eax
    mov eax,[rsp+98h]
    mov [rsi+rbx],eax
    mov eax,[rsp+58h]
    mov [rbx+rsi*2],eax
    mov eax,[rsp+0A8h]
    mov [r10+rbx],eax
    mov eax,[rsp+44h]
    mov [r12-2],eax
    mov eax,[rsp+94h]
    mov [rsi+r12-2],eax
    mov eax,[rsp+54h]
    mov [r12+rsi*2-2],eax
    mov eax,[rsp+0A4h]
    mov [r10+r12-2],eax
    mov eax,[rsp+4Ch]
    mov [rbp],eax
    mov eax,[rsp+9Ch]
    mov [rsi+rbp],eax
    mov eax,[rsp+5Ch]
    mov [rbp+rsi*2],eax
    mov eax,[rsp+0ACh]
    mov [r10+rbp],eax

    ; ---- restore the non-volatile xmm registers and unwind ------------------
    movaps xmm6,[rsp+0C0h]
    movaps xmm7,[rsp+0D0h]
    movaps xmm8,[rsp+0E0h]
    movaps xmm9,[rsp+0F0h]
    movaps xmm10,[rsp+100h]
    movaps xmm11,[rsp+110h]
    movaps xmm12,[rsp+120h]
    movaps xmm13,[rsp+130h]
    movaps xmm14,[rsp+140h]
    movaps xmm15,[rsp+150h]
    lea r11,[rsp+170h]
    mov rsp,r11
    pop r12
    pop rdi
    pop rsi
    pop rbp
    pop rbx
    ret
|
|
|
|
|
|
|
|
%elifdef UNIX64
|
|
|
|
|
|
;*******************************************************************************
; DeblockLumaLt4V_ssse3  (UNIX64 build, System V AMD64 calling convention)
;
; Inputs as consumed by the code below:
;   rdi   pixel pointer; rows are addressed at rdi + k*stride for k = -3..+2
;   esi   row stride (sign-extended)
;   edx   threshold compared against |row(0) - row(-1)| (low word broadcast)
;   ecx   threshold compared against the other row differences (low word)
;   r8    pTC, pointer to 4 signed bytes; each is broadcast over 4 lanes
;
; All six rows are loaded with movdqa and rows -2..+1 are stored with movdqa,
; so every accessed row address must be 16-byte aligned.
; Lanes are processed as two groups of eight 16-bit values (low bytes first,
; then high bytes).  Filtering is enabled only where tc >= 0.
;
; Stack: rbp = rsp+20h is used as the base of the local area; r12 is the only
; callee-saved register touched besides rbp and is kept at [rbp+180h].
; No xmm register needs preserving under this ABI, so nothing is spilled for
; the caller's sake ([rbp+000h..09Fh] is unused).
;*******************************************************************************
WELS_EXTERN DeblockLumaLt4V_ssse3
ALIGN 16
DeblockLumaLt4V_ssse3:
    push rbp
    mov r11,r8 ; pTC
    sub rsp,1B0h
    lea rbp,[rsp+20h]
    movd xmm4,edx
    movd xmm2,ecx
    mov qword [rbp+180h],r12
    mov r10,rdi
    movsxd r12,esi               ; r12 = stride
    add rsi,rsi
    movsxd rdx,esi               ; rdx = 2*stride
    sub r10,r12                  ; r10 = row -1
    movsx r8d,byte [r11]
    pxor xmm3,xmm3
    punpcklwd xmm2,xmm2
    lea rax,[r12+r12*2]
    movdqa xmm14,[rdx+rdi]       ; row +2
    neg rax                      ; rax = -3*stride
    pshufd xmm0,xmm2,0           ; xmm0 = ecx threshold in every lane
    movd xmm2,r8d
    movsx rsi,byte [r11+1]
    movsx r8d,byte [r11+2]
    movsx r11d,byte [r11+3]
    movd xmm1,esi
    movd xmm12,r8d
    movd xmm11,r11d
    movdqa xmm5,[rax+rdi]        ; row -3
    lea rax,[r12+r12]
    punpcklwd xmm12,xmm12
    neg rax                      ; rax = -2*stride (kept until the final stores)
    punpcklwd xmm11,xmm11
    movdqa xmm8,[r10]            ; row -1
    punpcklwd xmm2,xmm2
    punpcklwd xmm1,xmm1
    punpcklqdq xmm12,xmm12
    punpcklqdq xmm11,xmm11
    punpcklqdq xmm2,xmm2
    punpcklqdq xmm1,xmm1
    shufps xmm12,xmm11,88h       ; tc[2] x4 | tc[3] x4
    movdqa xmm11,xmm8
    movdqa xmm9,[rdi]            ; row 0
    shufps xmm2,xmm1,88h         ; tc[0] x4 | tc[1] x4
    movdqa xmm1,xmm5
    punpcklbw xmm11,xmm3
    movdqa xmm13,xmm11
    movdqa xmm10,xmm9
    movdqa xmm6,[rax+rdi]        ; row -2
    punpcklbw xmm1,xmm3
    movaps [rbp+0A0h],xmm12      ; tc for the high eight lanes, reloaded later
    psubw xmm13,xmm1
    movdqa xmm15,xmm14
    movdqa xmm7,xmm6

    ; ---- low eight lanes -----------------------------------------------------
    punpcklbw xmm10,xmm3
    movdqa xmm12,[r12+rdi]       ; row +1
    punpcklbw xmm7,xmm3
    punpcklbw xmm12,xmm3
    punpcklbw xmm15,xmm3
    pabsw xmm3,xmm13
    movdqa xmm13,xmm10
    psubw xmm13,xmm15
    movdqa [rbp+0F0h],xmm15
    pabsw xmm15,xmm13
    movdqa xmm13,xmm11
    movdqa [rbp+0B0h],xmm1
    movdqa xmm1,xmm0
    pavgw xmm13,xmm10
    pcmpgtw xmm1,xmm3
    movdqa [rbp+120h],xmm13
    movaps xmm13,xmm2
    punpcklwd xmm4,xmm4
    movdqa xmm3,xmm0
    movdqa [rbp+100h],xmm1
    psubw xmm13,xmm1             ; tc += 1 where the row(-3) test passed (mask is -1)
    movdqa xmm1,xmm10
    pcmpgtw xmm3,xmm15
    pshufd xmm4,xmm4,0           ; xmm4 = edx threshold in every lane
    psubw xmm1,xmm11
    movdqa [rbp+0D0h],xmm10
    psubw xmm13,xmm3             ; tc += 1 where the row(+2) test passed
    movdqa [rbp+110h],xmm3
    pabsw xmm15,xmm1
    movdqa xmm3,xmm4
    psubw xmm10,xmm12
    pcmpgtw xmm3,xmm15
    pabsw xmm15,xmm10
    movdqa xmm10,xmm0
    psllw xmm1,2
    movdqa [rbp+0C0h],xmm11
    psubw xmm11,xmm7
    pcmpgtw xmm10,xmm15
    pabsw xmm11,xmm11
    movdqa xmm15,xmm0
    pand xmm3,xmm10
    pcmpgtw xmm15,xmm11
    movaps xmm11,xmm2
    pxor xmm10,xmm10
    pand xmm3,xmm15
    pcmpgtw xmm11,xmm10
    pcmpeqw xmm10,xmm2
    por xmm11,xmm10              ; tc >= 0
    pand xmm3,xmm11
    movdqa xmm11,xmm7
    psubw xmm11,xmm12
    pxor xmm15,xmm15
    paddw xmm11,xmm1
    psubw xmm15,xmm13
    movdqa [rbp+0E0h],xmm12
    paddw xmm11,[FOUR_16B_SSE2]
    pxor xmm12,xmm12
    psraw xmm11,3
    punpckhbw xmm8,xmm12
    pmaxsw xmm15,xmm11
    punpckhbw xmm5,xmm12
    movdqa xmm11,xmm8
    pminsw xmm13,xmm15
    psubw xmm11,xmm5
    punpckhbw xmm9,xmm12
    pand xmm13,xmm3
    movdqa [rbp+130h],xmm13      ; masked delta for the low lanes

    ; ---- high eight lanes ----------------------------------------------------
    pabsw xmm13,xmm11
    punpckhbw xmm14,xmm12
    movdqa xmm11,xmm9
    psubw xmm11,xmm14
    movdqa xmm15,xmm0
    movdqa [rbp+140h],xmm14
    pabsw xmm14,xmm11
    movdqa xmm11,xmm8
    pcmpgtw xmm15,xmm14
    movdqa xmm1,[r12+rdi]
    pavgw xmm11,xmm9
    movdqa [rbp+170h],xmm11
    movdqa xmm10,xmm9
    punpckhbw xmm6,xmm12
    psubw xmm10,xmm8
    punpckhbw xmm1,xmm12
    movdqa xmm12,xmm0
    movaps xmm11,[rbp+0A0h]
    pcmpgtw xmm12,xmm13
    movaps xmm13,xmm11
    psubw xmm13,xmm12
    movdqa [rbp+160h],xmm15
    psubw xmm13,xmm15
    movdqa xmm15,xmm9
    psubw xmm15,xmm1
    movdqa [rbp+150h],xmm12
    pabsw xmm12,xmm10
    pabsw xmm14,xmm15
    movdqa xmm15,xmm8
    pcmpgtw xmm4,xmm12
    movdqa xmm12,xmm0
    psubw xmm15,xmm6
    pcmpgtw xmm12,xmm14
    pabsw xmm14,xmm15
    psllw xmm10,2
    pcmpgtw xmm0,xmm14
    movdqa xmm14,xmm6
    psubw xmm14,xmm1
    pand xmm4,xmm12
    paddw xmm14,xmm10
    pand xmm4,xmm0
    paddw xmm14,[FOUR_16B_SSE2]
    pxor xmm15,xmm15
    movaps xmm12,xmm11
    psubw xmm15,xmm13
    pxor xmm0,xmm0
    psraw xmm14,3
    pcmpgtw xmm12,xmm0
    pcmpeqw xmm0,xmm11
    pmaxsw xmm15,xmm14
    por xmm12,xmm0
    movdqa xmm0,[rbp+120h]
    pminsw xmm13,xmm15

    ; ---- outer-row corrections and final packing ----------------------------
    movdqa xmm15,[rbp+0B0h]
    movdqa xmm10,xmm7
    pand xmm4,xmm12
    paddw xmm15,xmm0
    pxor xmm12,xmm12
    paddw xmm10,xmm7
    movdqa xmm14,xmm12
    psubw xmm15,xmm10
    psubw xmm14,xmm2
    psraw xmm15,1
    pmaxsw xmm15,xmm14
    movdqa xmm10,xmm6
    pminsw xmm15,xmm2
    paddw xmm10,xmm6
    pand xmm15,xmm3
    psubw xmm12,xmm11
    pand xmm15,[rbp+100h]
    pand xmm13,xmm4
    paddw xmm7,xmm15
    paddw xmm8,xmm13
    movdqa xmm15,[rbp+170h]
    psubw xmm9,xmm13
    paddw xmm5,xmm15
    psubw xmm5,xmm10
    psraw xmm5,1
    pmaxsw xmm5,xmm12
    pminsw xmm5,xmm11
    pand xmm5,xmm4
    pand xmm5,[rbp+150h]
    paddw xmm6,xmm5
    movdqa xmm5,[rbp+0C0h]
    packuswb xmm7,xmm6
    movdqa xmm6,[rbp+130h]
    paddw xmm5,xmm6
    packuswb xmm5,xmm8
    movdqa xmm8,[rbp+0D0h]
    psubw xmm8,xmm6
    movdqa xmm6,[rbp+0F0h]
    paddw xmm6,xmm0
    movdqa xmm0,[rbp+0E0h]
    packuswb xmm8,xmm9
    movdqa xmm9,xmm0
    paddw xmm9,xmm0
    psubw xmm6,xmm9
    psraw xmm6,1
    pmaxsw xmm14,xmm6
    pminsw xmm2,xmm14
    pand xmm2,xmm3
    pand xmm2,[rbp+110h]
    paddw xmm0,xmm2
    movdqa xmm2,[rbp+140h]
    paddw xmm2,xmm15
    movdqa xmm15,xmm1
    paddw xmm15,xmm1
    psubw xmm2,xmm15
    psraw xmm2,1
    pmaxsw xmm12,xmm2
    pminsw xmm11,xmm12
    pand xmm11,xmm4
    pand xmm11,[rbp+160h]
    paddw xmm1,xmm11
    movdqa [rax+rdi],xmm7        ; row -2
    movdqa [r10],xmm5            ; row -1
    packuswb xmm0,xmm1
    movdqa [rdi],xmm8            ; row 0
    movdqa [r12+rdi],xmm0        ; row +1
    mov r12,qword [rbp+180h]
    lea rsp,[rbp+190h]
    pop rbp
    ret
|
|
|
|
|
|
;*******************************************************************************
; DeblockLumaEq4V_ssse3  (UNIX64 build, System V AMD64 calling convention)
;
; Inputs as consumed by the code below:
;   rdi   pixel pointer; rows are addressed at rdi + k*stride for k = -4..+3
;   esi   row stride (sign-extended)
;   dx    threshold compared against |row(0) - row(-1)| (signed word)
;   cx    threshold compared against the neighbouring row differences
;
; The arguments are first moved into the registers the body was written for
; (rcx = pixels, rdx = stride, r8 = first threshold, r9 = second threshold).
; All eight rows are loaded with movdqa and rows -3..+2 are stored with
; movdqa, so every accessed row address must be 16-byte aligned.
; Lanes are processed as two groups of eight 16-bit values.
;
; Stack: rbx and rbp are saved with push; [rsp+000h..12Fh] holds 16-byte
; temporaries.  No xmm register needs preserving under this ABI.
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
DeblockLumaEq4V_ssse3:
    push rbx
    push rbp
    mov r8,rdx
    mov r9,rcx
    mov rcx,rdi
    mov rdx,rsi
    sub rsp,1D8h
    pxor xmm1,xmm1
    movsxd r10,edx               ; r10 = stride
    mov rbp,rcx                  ; rbp = row 0
    mov r11d,r8d
    mov rdx,rcx
    mov rdi,rbp
    mov rbx,rbp
    movdqa xmm5,[rbp]
    punpcklbw xmm5,xmm1
    movdqa xmm14,[r10+rbp]       ; row +1
    lea eax,[r10*4]
    movsxd r8,eax
    lea eax,[r10+r10*2]
    movsxd rcx,eax               ; rcx = 3*stride
    lea eax,[r10+r10]
    sub rdx,r8                   ; rdx = row -4
    punpcklbw xmm14,xmm1
    movdqa [rsp+90h],xmm5
    movdqa [rsp+30h],xmm14
    movsxd rsi,eax               ; rsi = 2*stride
    movsx eax,r11w
    sub rdi,rcx                  ; rdi = row -3
    sub rbx,rsi                  ; rbx = row -2
    mov r8,rbp
    sub r8,r10                   ; r8 = row -1
    movd xmm0,eax
    movsx eax,r9w
    movdqa xmm12,[rdi]
    movdqa xmm6,[rsi+rbp]        ; row +2
    movdqa xmm13,[rbx]
    punpcklwd xmm0,xmm0
    pshufd xmm11,xmm0,0          ; first threshold in every lane
    punpcklbw xmm13,xmm1
    punpcklbw xmm6,xmm1
    movdqa xmm8,[r8]
    movd xmm0,eax
    movdqa xmm10,xmm11
    mov eax,2
    punpcklbw xmm8,xmm1
    punpcklbw xmm12,xmm1
    cwde
    punpcklwd xmm0,xmm0
    psraw xmm10,2
    movdqa xmm1,xmm8
    movdqa [rsp+0F0h],xmm13
    movdqa [rsp+0B0h],xmm8
    pshufd xmm7,xmm0,0           ; second threshold in every lane
    psubw xmm1,xmm13
    movdqa xmm0,xmm5
    movdqa xmm4,xmm7
    movdqa xmm2,xmm7
    psubw xmm0,xmm8
    pabsw xmm3,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,xmm5
    movdqa [rsp+40h],xmm7
    movdqa [rsp+60h],xmm6
    pcmpgtw xmm4,xmm0
    psubw xmm1,xmm14
    pabsw xmm0,xmm1
    pcmpgtw xmm2,xmm0
    pand xmm4,xmm2
    movdqa xmm0,xmm11
    pcmpgtw xmm0,xmm3
    pand xmm4,xmm0
    movd xmm0,eax
    movdqa [rsp+20h],xmm4        ; overall filter mask, low lanes
    punpcklwd xmm0,xmm0
    pshufd xmm2,xmm0,0           ; rounding constant 2
    paddw xmm10,xmm2             ; (first threshold >> 2) + 2
    movdqa [rsp+0A0h],xmm2
    movdqa xmm15,xmm7
    pxor xmm4,xmm4
    movdqa xmm0,xmm8
    psubw xmm0,xmm12
    mov eax,4
    pabsw xmm0,xmm0
    movdqa xmm1,xmm10
    cwde
    pcmpgtw xmm15,xmm0
    pcmpgtw xmm1,xmm3
    movdqa xmm3,xmm7
    movdqa xmm7,[rdx]
    movdqa xmm0,xmm5
    psubw xmm0,xmm6
    pand xmm15,xmm1
    punpcklbw xmm7,xmm4
    movdqa xmm9,xmm15
    pabsw xmm0,xmm0
    psllw xmm7,1
    pandn xmm9,xmm12
    pcmpgtw xmm3,xmm0
    paddw xmm7,xmm12
    movd xmm0,eax
    pand xmm3,xmm1
    paddw xmm7,xmm12
    punpcklwd xmm0,xmm0
    paddw xmm7,xmm12
    pshufd xmm1,xmm0,0           ; rounding constant 4
    paddw xmm7,xmm13
    movdqa xmm0,xmm3
    pandn xmm0,xmm6
    paddw xmm7,xmm8
    movdqa [rsp+70h],xmm1
    paddw xmm7,xmm5
    movdqa [rsp+120h],xmm0
    movdqa xmm0,[rcx+rbp]        ; row +3
    punpcklbw xmm0,xmm4
    paddw xmm7,xmm1
    movdqa xmm4,xmm15
    psllw xmm0,1
    psraw xmm7,3
    paddw xmm0,xmm6
    pand xmm7,xmm15
    paddw xmm0,xmm6
    paddw xmm0,xmm6
    paddw xmm0,xmm14
    movdqa xmm6,xmm15
    paddw xmm0,xmm5
    pandn xmm6,xmm13
    paddw xmm0,xmm8
    paddw xmm0,xmm1
    psraw xmm0,3
    movdqa xmm1,xmm12
    paddw xmm1,xmm13
    pand xmm0,xmm3
    movdqa [rsp+100h],xmm0
    movdqa xmm0,xmm8
    paddw xmm0,xmm5
    paddw xmm1,xmm0
    movdqa xmm0,xmm3
    paddw xmm1,xmm2
    psraw xmm1,2
    pandn xmm0,xmm14
    pand xmm4,xmm1
    movdqa [rsp+0E0h],xmm0
    movdqa xmm0,xmm5
    paddw xmm0,xmm8
    movdqa xmm1,[rsp+60h]
    paddw xmm1,xmm14
    movdqa xmm14,xmm3
    paddw xmm1,xmm0
    movdqa xmm0,xmm8
    paddw xmm0,[rsp+30h]
    paddw xmm1,xmm2
    psraw xmm1,2
    pand xmm14,xmm1
    movdqa xmm1,xmm13
    paddw xmm1,xmm13
    paddw xmm1,xmm0
    paddw xmm1,xmm2
    psraw xmm1,2
    movdqa xmm0,[rsp+30h]
    movdqa xmm2,xmm13
    movdqa xmm5,xmm15
    paddw xmm0,[rsp+70h]
    pandn xmm5,xmm1
    paddw xmm2,xmm8
    movdqa xmm8,[rsp+90h]
    movdqa xmm1,xmm12
    paddw xmm2,xmm8
    psllw xmm2,1
    paddw xmm2,xmm0
    paddw xmm1,xmm2
    movdqa xmm0,xmm8
    movdqa xmm8,xmm3
    movdqa xmm2,[rsp+30h]
    paddw xmm0,xmm13
    psraw xmm1,3
    pand xmm15,xmm1
    movdqa xmm1,xmm2
    paddw xmm1,xmm2
    paddw xmm2,[rsp+90h]
    paddw xmm2,[rsp+0B0h]
    paddw xmm1,xmm0
    movdqa xmm0,xmm13
    movdqa xmm13,[r8]
    paddw xmm0,[rsp+70h]
    paddw xmm1,[rsp+0A0h]
    psllw xmm2,1
    paddw xmm2,xmm0
    psraw xmm1,2
    movdqa xmm0,[rdi]
    pandn xmm8,xmm1
    movdqa xmm1,[rsp+60h]
    paddw xmm1,xmm2
    movdqa xmm2,[rbx]
    psraw xmm1,3
    pand xmm3,xmm1
    movdqa xmm1,[rbp]
    movdqa [rsp+0D0h],xmm3

    ; ---- high eight lanes: reload the rows and widen the upper bytes --------
    pxor xmm3,xmm3
    punpckhbw xmm0,xmm3
    punpckhbw xmm1,xmm3
    punpckhbw xmm13,xmm3
    movdqa [rsp+0C0h],xmm0
    movdqa xmm0,[r10+rbp]
    movdqa [rsp],xmm1
    punpckhbw xmm0,xmm3
    punpckhbw xmm2,xmm3
    movdqa [rsp+80h],xmm0
    movdqa xmm0,[rsi+rbp]
    movdqa [rsp+10h],xmm13
    punpckhbw xmm0,xmm3
    movdqa [rsp+50h],xmm0
    movdqa xmm0,xmm1
    movdqa xmm1,xmm13
    psubw xmm0,xmm13
    psubw xmm1,xmm2
    pabsw xmm3,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,[rsp]
    movdqa xmm13,[rsp+40h]
    movdqa [rsp+110h],xmm2
    psubw xmm1,[rsp+80h]
    pcmpgtw xmm13,xmm0
    pcmpgtw xmm11,xmm3
    pabsw xmm0,xmm1
    pcmpgtw xmm10,xmm3
    movdqa xmm1,[rsp+40h]
    movdqa xmm2,xmm1
    movdqa xmm3,xmm1
    pcmpgtw xmm2,xmm0
    movdqa xmm0,[rsp+10h]
    pand xmm13,xmm2
    pand xmm13,xmm11             ; overall filter mask, high lanes
    movdqa xmm11,[rsp+0C0h]
    psubw xmm0,xmm11
    pabsw xmm0,xmm0
    pcmpgtw xmm3,xmm0
    pand xmm3,xmm10
    movdqa xmm0,[rsp]
    psubw xmm0,[rsp+50h]
    movdqa xmm2,[rdx]
    pabsw xmm0,xmm0
    por xmm7,xmm9
    movdqa xmm9,[rsp+20h]
    pcmpgtw xmm1,xmm0
    pand xmm9,xmm7
    movdqa xmm7,[rsp+20h]
    movdqa xmm0,xmm7
    pandn xmm0,xmm12
    movdqa xmm12,[rsp+110h]
    pand xmm1,xmm10
    movdqa xmm10,[rsp+70h]
    movdqa [rsp+40h],xmm1
    movdqa xmm1,xmm13
    por xmm9,xmm0
    pxor xmm0,xmm0
    por xmm4,xmm6
    movdqa xmm6,xmm7
    punpckhbw xmm2,xmm0
    por xmm15,xmm5
    movdqa xmm5,[rsp+20h]
    movdqa xmm0,xmm3
    psllw xmm2,1
    pandn xmm0,xmm11
    pand xmm6,xmm4
    movdqa xmm4,[rsp]
    paddw xmm2,xmm11
    pand xmm5,xmm15
    movdqa xmm15,[rsp+20h]
    paddw xmm2,xmm11
    paddw xmm2,xmm11
    paddw xmm2,xmm12
    paddw xmm2,[rsp+10h]
    paddw xmm2,[rsp]
    paddw xmm2,xmm10
    psraw xmm2,3
    pand xmm2,xmm3
    por xmm2,xmm0
    pand xmm1,xmm2
    movdqa xmm0,xmm13
    movdqa xmm2,xmm11
    pandn xmm0,xmm11
    paddw xmm2,xmm12
    por xmm1,xmm0
    packuswb xmm9,xmm1           ; xmm9 = new row -3
    movdqa xmm0,xmm7
    movdqa xmm7,[rsp+0A0h]
    pandn xmm0,[rsp+0F0h]
    movdqa xmm1,xmm3
    por xmm6,xmm0
    movdqa xmm0,[rsp+10h]
    paddw xmm0,xmm4
    paddw xmm2,xmm0
    paddw xmm2,xmm7
    movdqa xmm0,xmm3
    pandn xmm0,xmm12
    psraw xmm2,2
    pand xmm1,xmm2
    por xmm1,xmm0
    movdqa xmm2,xmm13
    movdqa xmm0,xmm13
    pand xmm2,xmm1
    pandn xmm0,xmm12
    movdqa xmm1,xmm12
    paddw xmm1,[rsp+10h]
    por xmm2,xmm0
    movdqa xmm0,xmm15
    pandn xmm0,[rsp+0B0h]
    paddw xmm1,xmm4
    packuswb xmm6,xmm2           ; xmm6 = new row -2
    movdqa xmm2,xmm3
    psllw xmm1,1
    por xmm5,xmm0
    movdqa xmm0,[rsp+80h]
    paddw xmm0,xmm10
    paddw xmm1,xmm0
    paddw xmm11,xmm1
    psraw xmm11,3
    movdqa xmm1,xmm12
    pand xmm2,xmm11
    paddw xmm1,xmm12
    movdqa xmm11,[rsp+80h]
    movdqa xmm0,[rsp+10h]
    por xmm14,[rsp+0E0h]
    paddw xmm0,xmm11
    movdqa xmm4,xmm15
    paddw xmm1,xmm0
    movdqa xmm0,xmm13
    paddw xmm1,xmm7
    psraw xmm1,2
    pandn xmm3,xmm1
    por xmm2,xmm3
    movdqa xmm1,xmm13
    movdqa xmm3,[rsp+10h]
    pandn xmm0,xmm3
    pand xmm1,xmm2
    movdqa xmm2,xmm11
    paddw xmm2,[rsp]
    por xmm1,xmm0
    movdqa xmm0,[rsp+0D0h]
    por xmm0,xmm8
    paddw xmm2,xmm3
    packuswb xmm5,xmm1           ; xmm5 = new row -1
    movdqa xmm8,[rsp+40h]
    movdqa xmm1,[rsp+50h]
    movdqa xmm3,xmm8
    pand xmm4,xmm0
    psllw xmm2,1
    movdqa xmm0,xmm15
    pandn xmm0,[rsp+90h]
    por xmm4,xmm0
    movdqa xmm0,xmm12
    paddw xmm0,xmm10
    paddw xmm2,xmm0
    paddw xmm1,xmm2
    movdqa xmm0,[rsp]
    movdqa xmm2,xmm11
    paddw xmm0,xmm12
    movdqa xmm12,[rsp]
    paddw xmm2,xmm11
    paddw xmm2,xmm0
    psraw xmm1,3
    movdqa xmm0,xmm8
    pand xmm3,xmm1
    paddw xmm2,xmm7
    movdqa xmm1,xmm13
    psraw xmm2,2
    pandn xmm0,xmm2
    por xmm3,xmm0
    movdqa xmm2,[rsp+50h]
    movdqa xmm0,xmm13
    pandn xmm0,xmm12
    pand xmm1,xmm3
    paddw xmm2,xmm11
    movdqa xmm3,xmm15
    por xmm1,xmm0
    pand xmm3,xmm14
    movdqa xmm14,[rsp+10h]
    movdqa xmm0,xmm15
    pandn xmm0,[rsp+30h]
    packuswb xmm4,xmm1           ; xmm4 = new row 0
    movdqa xmm1,xmm8
    por xmm3,xmm0
    movdqa xmm0,xmm12
    paddw xmm0,xmm14
    paddw xmm2,xmm0
    paddw xmm2,xmm7
    movdqa xmm0,xmm8
    pandn xmm0,xmm11
    psraw xmm2,2
    pand xmm1,xmm2
    por xmm1,xmm0
    movdqa xmm2,xmm13
    movdqa xmm0,xmm13
    pandn xmm0,xmm11
    pand xmm2,xmm1
    movdqa xmm1,xmm15
    por xmm2,xmm0
    packuswb xmm3,xmm2           ; xmm3 = new row +1
    movdqa xmm0,[rsp+100h]
    por xmm0,[rsp+120h]
    pand xmm1,xmm0
    movdqa xmm2,[rcx+rbp]
    movdqa xmm7,[rsp+50h]
    pandn xmm15,[rsp+60h]
    lea r11,[rsp+1D8h]           ; r11 = rsp value expected by the pops below
    pxor xmm0,xmm0
    por xmm1,xmm15
    movdqa [rdi],xmm9            ; store row -3
    punpckhbw xmm2,xmm0
    psllw xmm2,1
    paddw xmm2,xmm7
    paddw xmm2,xmm7
    movdqa [rbx],xmm6            ; store row -2
    paddw xmm2,xmm7
    paddw xmm2,xmm11
    paddw xmm2,xmm12
    paddw xmm2,xmm14
    paddw xmm2,xmm10
    psraw xmm2,3
    movdqa xmm0,xmm13
    pand xmm2,xmm8
    pandn xmm8,xmm7
    pandn xmm13,xmm7
    por xmm2,xmm8
    movdqa [r8],xmm5             ; store row -1
    pand xmm0,xmm2
    por xmm0,xmm13
    packuswb xmm1,xmm0           ; xmm1 = new row +2
    movdqa [rbp],xmm4            ; store row 0
    movdqa [r10+rbp],xmm3        ; store row +1
    movdqa [rsi+rbp],xmm1        ; store row +2
    mov rsp,r11
    pop rbp
    pop rbx
    ret
|
|
|
|
;*******************************************************************************
; DeblockChromaLt4V_ssse3  (UNIX64 build, System V AMD64 calling convention)
;
; Inputs as consumed by the code below:
;   rdi, rsi   two pixel-plane pointers (presumably pPixCb / pPixCr --
;              confirm against the C prototype)
;   edx        row stride (sign-extended)
;   cx         threshold compared against |row(-1) - row(0)| (signed word)
;   r8d        iBeta (low word used)
;   r9         pTC, pointer to 4 signed bytes
;
; Eight bytes are read from rows -2, -1, 0 and +1 of each plane; the first
; plane occupies the low eight lanes and the second plane the high eight.
; Per 16-bit lane:
;   delta = clamp(((row0 - row-1)*4 + (row-2 - row+1) + 4) >> 3, -tc, tc)
;   row-1 += delta ; row0 -= delta
; applied only where the three threshold tests pass and tc > 0.  Rows -1 and 0
; of both planes are written back (8 bytes each).
;
; Stack: [rsp+00h] holds the tc vector (each tc byte duplicated into two
; words), [rsp+10h] one 16-byte spill.  No xmm register needs preserving under
; this ABI.
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4V_ssse3
ALIGN 16
DeblockChromaLt4V_ssse3:
    push rbx
    push rbp
    ; Shuffle the System V arguments into the registers the body expects:
    ; rcx/rdx = planes, r8 = stride, r9 = first threshold, rbp = iBeta, r10 = pTC.
    mov r10,rdx
    mov r11,rcx
    mov rcx,rdi
    mov rdx,rsi
    mov rsi,r10
    mov r10,r9
    mov rbp,r8
    mov r8,rsi
    mov r9,r11
    sub rsp,0C8h
    pxor xmm1,xmm1
    mov rbx,rcx                  ; rbx = plane 1, row 0
    movsxd r11,r8d               ; r11 = stride
    movsx ecx,byte [r10]
    movsx r8d,byte [r10+2]
    mov rdi,rdx                  ; rdi = plane 2, row 0
    movq xmm2,[rbx]
    movq xmm9,[r11+rbx]
    movsx edx,byte [r10+1]
    mov word [rsp+2],cx
    mov word [rsp],cx
    movsx eax,byte [r10+3]
    mov word [rsp+6],dx
    mov word [rsp+4],dx
    movdqa xmm11,xmm1
    mov word [rsp+0Eh],ax
    mov word [rsp+0Ch],ax
    lea eax,[r11+r11]
    movsxd rcx,eax               ; rcx = 2*stride
    mov rax,rbx
    mov rdx,rdi
    sub rax,rcx
    mov word [rsp+0Ah],r8w
    mov word [rsp+8],r8w
    movdqa xmm6,[rsp]            ; xmm6 = tc vector
    movdqa xmm7,xmm6
    movq xmm13,[rax]             ; plane 1, row -2
    mov rax,rdi
    sub rax,rcx
    mov rcx,rbx
    pcmpgtw xmm7,xmm1            ; xmm7 = (tc > 0)
    psubw xmm11,xmm6             ; xmm11 = -tc
    sub rcx,r11                  ; rcx = plane 1, row -1
    sub rdx,r11                  ; rdx = plane 2, row -1
    movq xmm0,[rax]
    movsx eax,r9w
    movq xmm15,[rcx]
    punpcklqdq xmm13,xmm0
    movq xmm0,[rdx]
    movdqa xmm4,xmm13
    punpcklqdq xmm15,xmm0
    movq xmm0,[rdi]
    punpcklbw xmm4,xmm1
    movdqa xmm12,xmm15
    punpcklqdq xmm2,xmm0
    movq xmm0,[r11+rdi]
    punpcklbw xmm12,xmm1
    movdqa xmm14,xmm2
    punpcklqdq xmm9,xmm0
    punpckhbw xmm2,xmm1
    punpcklbw xmm14,xmm1
    movd xmm0,eax
    mov eax,ebp ; iBeta
    punpckhbw xmm13,xmm1
    punpckhbw xmm15,xmm1
    movdqa xmm3,xmm9
    movdqa [rsp+10h],xmm2
    punpcklwd xmm0,xmm0
    punpckhbw xmm9,xmm1
    punpcklbw xmm3,xmm1
    movdqa xmm1,xmm14
    pshufd xmm10,xmm0,0          ; first threshold in every lane
    movd xmm0,eax
    mov eax,4
    cwde
    punpcklwd xmm0,xmm0
    pshufd xmm8,xmm0,0           ; iBeta in every lane
    movd xmm0,eax
    punpcklwd xmm0,xmm0
    pshufd xmm5,xmm0,0           ; rounding constant 4

    ; ---- plane 1 (low lanes) -------------------------------------------------
    psubw xmm1,xmm12
    movdqa xmm2,xmm10
    lea r11,[rsp+0C8h]           ; r11 = rsp value expected by the pops below
    psllw xmm1,2
    movdqa xmm0,xmm4
    psubw xmm4,xmm12
    psubw xmm0,xmm3
    psubw xmm3,xmm14
    paddw xmm1,xmm0
    paddw xmm1,xmm5
    movdqa xmm0,xmm11
    psraw xmm1,3
    pmaxsw xmm0,xmm1
    pminsw xmm6,xmm0
    movdqa xmm1,xmm8
    movdqa xmm0,xmm12
    psubw xmm0,xmm14
    pabsw xmm0,xmm0
    pcmpgtw xmm2,xmm0
    pabsw xmm0,xmm4
    pcmpgtw xmm1,xmm0
    pabsw xmm0,xmm3
    movdqa xmm3,[rsp]
    pand xmm2,xmm1
    movdqa xmm1,xmm8
    pcmpgtw xmm1,xmm0
    movdqa xmm0,xmm13
    pand xmm2,xmm1
    psubw xmm0,xmm9
    psubw xmm13,xmm15
    pand xmm2,xmm7
    pand xmm6,xmm2
    paddw xmm12,xmm6
    psubw xmm14,xmm6

    ; ---- plane 2 (high lanes) ------------------------------------------------
    movdqa xmm2,[rsp+10h]
    movdqa xmm1,xmm2
    psubw xmm1,xmm15
    psubw xmm9,xmm2
    psllw xmm1,2
    paddw xmm1,xmm0
    paddw xmm1,xmm5
    movdqa xmm0,xmm15
    psubw xmm0,xmm2
    psraw xmm1,3
    pmaxsw xmm11,xmm1
    pabsw xmm0,xmm0
    movdqa xmm1,xmm8
    pcmpgtw xmm10,xmm0
    pabsw xmm0,xmm13
    pminsw xmm3,xmm11
    pcmpgtw xmm1,xmm0
    pabsw xmm0,xmm9
    pand xmm10,xmm1
    pcmpgtw xmm8,xmm0
    pand xmm10,xmm8
    pand xmm10,xmm7
    pand xmm3,xmm10
    paddw xmm15,xmm3
    psubw xmm2,xmm3

    ; ---- pack and store rows -1 and 0 of both planes ------------------------
    packuswb xmm12,xmm15
    packuswb xmm14,xmm2
    movq [rcx],xmm12
    movq [rbx],xmm14
    psrldq xmm12,8
    psrldq xmm14,8
    movq [rdx],xmm12
    movq [rdi],xmm14
    mov rsp,r11
    pop rbp
    pop rbx
    ret
|
|
|
|
;*******************************************************************************
; DeblockChromaEq4V_ssse3  (UNIX64 build, System V AMD64 calling convention)
;
; Inputs as consumed by the code below:
;   rdi, rsi   two pixel-plane pointers (presumably pPixCb / pPixCr --
;              confirm against the C prototype)
;   edx        row stride (sign-extended)
;   cx         threshold compared against |row(-1) - row(0)| (signed word)
;   r8d        iBeta (low word used)
;
; Eight bytes are read from rows -2, -1, 0 and +1 of each plane; the first
; plane occupies the low eight lanes and the second plane the high eight.
; Where all three threshold tests pass:
;   row-1 = (2*row-2 + row-1 + row+1 + 2) >> 2
;   row0  = (2*row+1 + row0  + row-2 + 2) >> 2
; otherwise the original bytes are kept.  Rows -1 and 0 of both planes are
; written back (8 bytes each).
;
; Only registers are used for temporaries, so no stack frame is allocated
; beyond the two pushes.  No xmm register needs preserving under this ABI.
;*******************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3
ALIGN 16
DeblockChromaEq4V_ssse3:
    push rbx
    push rbp

    ; Move the System V arguments into the registers the body expects:
    ; rcx/rdx = planes, r8 = stride, r9 = first threshold, rbp = iBeta.
    mov rbp,r8
    mov r8,rdx
    mov r9,rcx
    mov rcx,rdi
    mov rdx,rsi

    pxor xmm1,xmm1
    mov r11,rcx                  ; r11 = plane 1, row 0
    mov rbx,rdx                  ; rbx = plane 2, row 0
    mov r10d,r9d
    movq xmm13,[r11]
    lea eax,[r8+r8]
    movsxd r9,eax                ; r9 = 2*stride
    mov rax,rcx
    sub rax,r9
    movq xmm14,[rax]             ; plane 1, row -2
    mov rax,rdx
    sub rax,r9
    movq xmm0,[rax]              ; plane 2, row -2
    movsxd rax,r8d               ; rax = stride
    sub rcx,rax                  ; rcx = plane 1, row -1
    sub rdx,rax                  ; rdx = plane 2, row -1
    movq xmm12,[rax+r11]
    movq xmm10,[rcx]
    punpcklqdq xmm14,xmm0
    movdqa xmm8,xmm14
    movq xmm0,[rdx]
    punpcklbw xmm8,xmm1
    punpckhbw xmm14,xmm1
    punpcklqdq xmm10,xmm0
    movq xmm0,[rbx]
    movdqa xmm5,xmm10
    punpcklqdq xmm13,xmm0
    movq xmm0,[rax+rbx]
    punpcklbw xmm5,xmm1
    movsx eax,r10w
    movdqa xmm9,xmm13
    punpcklqdq xmm12,xmm0
    punpcklbw xmm9,xmm1
    punpckhbw xmm10,xmm1
    movd xmm0,eax
    mov eax,ebp ; iBeta
    punpckhbw xmm13,xmm1
    movdqa xmm7,xmm12
    punpcklwd xmm0,xmm0
    punpckhbw xmm12,xmm1
    pshufd xmm11,xmm0,0          ; first threshold in every lane
    punpcklbw xmm7,xmm1
    movd xmm0,eax
    movdqa xmm1,xmm8
    psubw xmm1,xmm5
    punpcklwd xmm0,xmm0
    movdqa xmm6,xmm11
    pshufd xmm3,xmm0,0           ; iBeta in every lane

    ; ---- threshold masks: xmm6 = plane 1, xmm11 = plane 2 -------------------
    movdqa xmm0,xmm5
    psubw xmm0,xmm9
    movdqa xmm2,xmm3
    pabsw xmm0,xmm0
    pcmpgtw xmm6,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,xmm3
    pcmpgtw xmm2,xmm0
    pand xmm6,xmm2
    movdqa xmm0,xmm7
    movdqa xmm2,xmm3
    psubw xmm0,xmm9
    pabsw xmm0,xmm0
    pcmpgtw xmm1,xmm0
    pand xmm6,xmm1
    movdqa xmm0,xmm10
    movdqa xmm1,xmm14
    psubw xmm0,xmm13
    psubw xmm1,xmm10
    pabsw xmm0,xmm0
    pcmpgtw xmm11,xmm0
    pabsw xmm0,xmm1
    pcmpgtw xmm2,xmm0
    pand xmm11,xmm2
    movdqa xmm0,xmm12
    movdqa xmm4,xmm6
    movdqa xmm1,xmm8
    mov eax,2
    cwde
    paddw xmm1,xmm8
    psubw xmm0,xmm13
    paddw xmm1,xmm5
    pabsw xmm0,xmm0
    movdqa xmm2,xmm14
    paddw xmm1,xmm7
    pcmpgtw xmm3,xmm0
    paddw xmm2,xmm14
    movd xmm0,eax
    pand xmm11,xmm3

    ; ---- filtered values, then blend with the originals through the masks ---
    paddw xmm7,xmm7
    paddw xmm2,xmm10
    punpcklwd xmm0,xmm0
    paddw xmm2,xmm12
    paddw xmm12,xmm12
    pshufd xmm3,xmm0,0           ; rounding constant 2
    paddw xmm7,xmm9
    paddw xmm12,xmm13
    movdqa xmm0,xmm6
    paddw xmm1,xmm3
    pandn xmm0,xmm5
    paddw xmm7,xmm8
    psraw xmm1,2
    paddw xmm12,xmm14
    paddw xmm7,xmm3
    pand xmm4,xmm1
    paddw xmm12,xmm3
    psraw xmm7,2
    movdqa xmm1,xmm11
    por xmm4,xmm0
    psraw xmm12,2
    paddw xmm2,xmm3
    movdqa xmm0,xmm11
    pandn xmm0,xmm10
    psraw xmm2,2
    pand xmm1,xmm2
    por xmm1,xmm0
    packuswb xmm4,xmm1           ; xmm4 = new row -1 (plane 1 | plane 2)
    movdqa xmm0,xmm11
    movdqa xmm1,xmm6
    pand xmm1,xmm7
    movq [rcx],xmm4
    pandn xmm6,xmm9
    pandn xmm11,xmm13
    pand xmm0,xmm12
    por xmm1,xmm6
    por xmm0,xmm11
    psrldq xmm4,8
    packuswb xmm1,xmm0           ; xmm1 = new row 0 (plane 1 | plane 2)
    movq [r11],xmm1
    psrldq xmm1,8
    movq [rdx],xmm4
    movq [rbx],xmm1
    pop rbp
    pop rbx
    ret
|
|
|
|
;*******************************************************************************
; DeblockChromaEq4H_ssse3  (UNIX64 build, System V AMD64 calling convention)
;
; Inputs as consumed by the code below:
;   rdi, rsi   two pixel-plane pointers (presumably pPixCb / pPixCr --
;              confirm against the C prototype)
;   edx        row stride (sign-extended)
;   cx         threshold compared against |b(-1) - b(0)| (signed word)
;   r8d        iBeta (low word used)
;
; For 8 rows of each plane, the 4 bytes at offsets -2..+1 around the pointer
; are gathered, transposed into four 16-byte column vectors, filtered, then
; transposed back and stored 4 bytes per row.  Where all three threshold
; tests pass (b(k) = byte at offset k):
;   b(-1) = (2*b(-2) + b(-1) + b(1)  + 2) >> 2
;   b(0)  = (2*b(1)  + b(0)  + b(-2) + 2) >> 2
;
; Stack: rbx, rbp and r12 are pushed (three pushes keep rsp 16-byte aligned
; for the movdqa scratch accesses); [rsp+00h..8Fh] is scratch for the
; transposes.  No xmm register needs preserving under this ABI.
;*******************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
DeblockChromaEq4H_ssse3:
    push rbx
    push rbp
    push r12

    ; Move the System V arguments into the registers the body expects:
    ; rcx = plane 1, rdi = plane 2, r8 = stride, r9 = first threshold, rbp = iBeta.
    mov rbp,r8
    mov r8,rdx
    mov r9,rcx
    mov rcx,rdi
    mov rdx,rsi
    mov rdi,rdx

    sub rsp,140h

    ; ---- gather 4 bytes from each of 8 rows of both planes ------------------
    ; r11 / rbx = row 4 of plane 1 / plane 2 (already offset by -2),
    ; r10 = stride, rdx = 3*stride.
    lea eax,[r8*4]
    movsxd r10,eax
    mov eax,[rcx-2]
    mov [rsp+10h],eax
    lea rbx,[r10+rdx-2]
    lea r11,[r10+rcx-2]
    movdqa xmm5,[rsp+10h]
    movsxd r10,r8d
    mov eax,[r10+rcx-2]
    lea rdx,[r10+r10*2]
    mov [rsp+20h],eax
    mov eax,[rcx+r10*2-2]
    mov [rsp+30h],eax
    mov eax,[rdx+rcx-2]
    movdqa xmm2,[rsp+20h]
    mov [rsp+40h],eax
    mov eax,[rdi-2]
    movdqa xmm4,[rsp+30h]
    mov [rsp+50h],eax
    mov eax,[r10+rdi-2]
    movdqa xmm3,[rsp+40h]
    mov [rsp+60h],eax
    mov eax,[rdi+r10*2-2]
    punpckldq xmm5,[rsp+50h]
    mov [rsp+70h],eax
    mov eax,[rdx+rdi-2]
    punpckldq xmm2,[rsp+60h]
    mov [rsp+80h],eax
    mov eax,[r11]
    punpckldq xmm4,[rsp+70h]
    mov [rsp+50h],eax
    mov eax,[rbx]
    punpckldq xmm3,[rsp+80h]
    mov [rsp+60h],eax
    mov eax,[r10+r11]
    movdqa xmm0,[rsp+50h]
    punpckldq xmm0,[rsp+60h]
    punpcklqdq xmm5,xmm0
    movdqa [rsp+50h],xmm0
    mov [rsp+50h],eax
    mov eax,[r10+rbx]
    movdqa xmm0,[rsp+50h]
    movdqa xmm1,xmm5
    mov [rsp+60h],eax
    mov eax,[r11+r10*2]
    punpckldq xmm0,[rsp+60h]
    punpcklqdq xmm2,xmm0
    punpcklbw xmm1,xmm2
    punpckhbw xmm5,xmm2
    movdqa [rsp+50h],xmm0
    mov [rsp+50h],eax
    mov eax,[rbx+r10*2]
    movdqa xmm0,[rsp+50h]
    mov [rsp+60h],eax
    mov eax,[rdx+r11]
    movdqa xmm15,xmm1
    punpckldq xmm0,[rsp+60h]
    punpcklqdq xmm4,xmm0
    movdqa [rsp+50h],xmm0
    mov [rsp+50h],eax
    mov eax,[rdx+rbx]
    movdqa xmm0,[rsp+50h]
    mov [rsp+60h],eax
    punpckldq xmm0,[rsp+60h]
    punpcklqdq xmm3,xmm0

    ; ---- transpose rows into per-offset column vectors ----------------------
    movdqa xmm0,xmm4
    punpcklbw xmm0,xmm3
    punpckhbw xmm4,xmm3
    punpcklwd xmm15,xmm0
    punpckhwd xmm1,xmm0
    movdqa xmm0,xmm5
    movdqa xmm12,xmm15
    punpcklwd xmm0,xmm4
    punpckhwd xmm5,xmm4
    punpckldq xmm12,xmm0
    punpckhdq xmm15,xmm0
    movdqa xmm0,xmm1
    movdqa xmm11,xmm12
    punpckldq xmm0,xmm5
    punpckhdq xmm1,xmm5
    punpcklqdq xmm11,xmm0
    punpckhqdq xmm12,xmm0
    movsx eax,r9w
    movdqa xmm14,xmm15
    punpcklqdq xmm14,xmm1
    punpckhqdq xmm15,xmm1
    pxor xmm1,xmm1
    movd xmm0,eax
    movdqa xmm4,xmm12
    movdqa xmm8,xmm11
    mov eax,ebp ; iBeta
    punpcklwd xmm0,xmm0
    punpcklbw xmm4,xmm1
    punpckhbw xmm12,xmm1
    movdqa xmm9,xmm14
    movdqa xmm7,xmm15
    movdqa xmm10,xmm15
    pshufd xmm13,xmm0,0          ; first threshold in every lane
    punpcklbw xmm9,xmm1
    punpckhbw xmm14,xmm1
    movdqa xmm6,xmm13
    movd xmm0,eax
    movdqa [rsp],xmm11           ; keep the untouched offset -2 column
    mov eax,2
    cwde
    punpckhbw xmm11,xmm1
    punpckhbw xmm10,xmm1
    punpcklbw xmm7,xmm1
    punpcklwd xmm0,xmm0
    punpcklbw xmm8,xmm1
    pshufd xmm3,xmm0,0           ; iBeta in every lane

    ; ---- threshold masks: xmm6 = low lanes, xmm13 = high lanes --------------
    movdqa xmm1,xmm8
    movdqa xmm0,xmm4
    psubw xmm0,xmm9
    psubw xmm1,xmm4
    movdqa xmm2,xmm3
    pabsw xmm0,xmm0
    pcmpgtw xmm6,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,xmm3
    pcmpgtw xmm2,xmm0
    pand xmm6,xmm2
    movdqa xmm0,xmm7
    movdqa xmm2,xmm3
    psubw xmm0,xmm9
    pabsw xmm0,xmm0
    pcmpgtw xmm1,xmm0
    pand xmm6,xmm1
    movdqa xmm0,xmm12
    movdqa xmm1,xmm11
    psubw xmm0,xmm14
    psubw xmm1,xmm12
    movdqa xmm5,xmm6
    pabsw xmm0,xmm0
    pcmpgtw xmm13,xmm0
    pabsw xmm0,xmm1
    movdqa xmm1,xmm8
    pcmpgtw xmm2,xmm0
    paddw xmm1,xmm8
    movdqa xmm0,xmm10
    pand xmm13,xmm2
    psubw xmm0,xmm14
    paddw xmm1,xmm4
    movdqa xmm2,xmm11
    pabsw xmm0,xmm0
    paddw xmm2,xmm11
    paddw xmm1,xmm7
    pcmpgtw xmm3,xmm0
    paddw xmm2,xmm12
    movd xmm0,eax
    pand xmm13,xmm3

    ; ---- filtered values, then blend with the originals through the masks ---
    paddw xmm2,xmm10
    punpcklwd xmm0,xmm0
    pshufd xmm3,xmm0,0           ; rounding constant 2
    movdqa xmm0,xmm6
    paddw xmm1,xmm3
    pandn xmm0,xmm4
    paddw xmm2,xmm3
    psraw xmm1,2
    pand xmm5,xmm1
    por xmm5,xmm0
    paddw xmm7,xmm7
    paddw xmm10,xmm10
    psraw xmm2,2
    movdqa xmm1,xmm13
    movdqa xmm0,xmm13
    pandn xmm0,xmm12
    pand xmm1,xmm2
    paddw xmm7,xmm9
    por xmm1,xmm0
    paddw xmm10,xmm14
    paddw xmm7,xmm8
    movdqa xmm0,xmm13
    packuswb xmm5,xmm1
    paddw xmm7,xmm3
    paddw xmm10,xmm11
    movdqa xmm1,xmm6
    paddw xmm10,xmm3
    pandn xmm6,xmm9
    psraw xmm7,2
    pand xmm1,xmm7
    psraw xmm10,2
    pandn xmm13,xmm14
    pand xmm0,xmm10
    por xmm1,xmm6

    ; ---- transpose back to row order ----------------------------------------
    movdqa xmm6,[rsp]
    movdqa xmm4,xmm6
    por xmm0,xmm13
    punpcklbw xmm4,xmm5
    punpckhbw xmm6,xmm5
    movdqa xmm3,xmm4
    packuswb xmm1,xmm0
    movdqa xmm0,xmm1
    punpckhbw xmm1,xmm15
    punpcklbw xmm0,xmm15
    punpcklwd xmm3,xmm0
    punpckhwd xmm4,xmm0
    movdqa xmm0,xmm6
    movdqa xmm2,xmm3
    punpcklwd xmm0,xmm1
    punpckhwd xmm6,xmm1
    movdqa xmm1,xmm4
    punpckldq xmm2,xmm0
    punpckhdq xmm3,xmm0
    punpckldq xmm1,xmm6
    movdqa xmm0,xmm2
    punpcklqdq xmm0,xmm1
    punpckhdq xmm4,xmm6
    punpckhqdq xmm2,xmm1

    ; ---- scatter 4 bytes back to each of the 16 rows ------------------------
    movdqa [rsp+10h],xmm0
    movdqa [rsp+60h],xmm2
    movdqa xmm0,xmm3
    mov eax,[rsp+10h]
    mov [rcx-2],eax
    mov eax,[rsp+60h]
    punpcklqdq xmm0,xmm4
    punpckhqdq xmm3,xmm4
    mov [r10+rcx-2],eax
    movdqa [rsp+20h],xmm0
    mov eax,[rsp+20h]
    movdqa [rsp+70h],xmm3
    mov [rcx+r10*2-2],eax
    mov eax,[rsp+70h]
    mov [rdx+rcx-2],eax
    mov eax,[rsp+18h]
    mov [r11],eax
    mov eax,[rsp+68h]
    mov [r10+r11],eax
    mov eax,[rsp+28h]
    mov [r11+r10*2],eax
    mov eax,[rsp+78h]
    mov [rdx+r11],eax
    mov eax,[rsp+14h]
    mov [rdi-2],eax
    mov eax,[rsp+64h]
    mov [r10+rdi-2],eax
    mov eax,[rsp+24h]
    mov [rdi+r10*2-2],eax
    mov eax,[rsp+74h]
    mov [rdx+rdi-2],eax
    mov eax,[rsp+1Ch]
    mov [rbx],eax
    mov eax,[rsp+6Ch]
    mov [r10+rbx],eax
    mov eax,[rsp+2Ch]
    mov [rbx+r10*2],eax
    mov eax,[rsp+7Ch]
    mov [rdx+rbx],eax

    ; rbx is restored by the pop below; nothing has to be reloaded from above
    ; the frame.
    lea r11,[rsp+140h]
    mov rsp,r11
    pop r12
    pop rbp
    pop rbx
    ret
|
|
|
|
|
|
;*******************************************************************************
;  DeblockChromaLt4H_ssse3 -- x86-64 variant, arguments in rdi, rsi, rdx, rcx,
;  r8, r9.  The in-line notes below mark r8 as iBeta and r9 as pTC, which is
;  the argument order of the prototype documented for the X86_32 variant:
;      void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
;              int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
;  Outline
;    1. Read 4 bytes starting at (ptr - 2) from 8 consecutive rows of each of
;       the two planes and transpose them into four 16-byte column vectors
;       c0..c3 (low 8 bytes: plane of the 2nd argument, high 8 bytes: plane of
;       the 1st argument).
;    2. Per 16-bit lane:
;           delta = clamp(((c2 - c1) * 4 + (c0 - c3) + 4) >> 3, -tc, tc)
;       applied as c1 += delta, c2 -= delta only where
;           |c1 - c2| < alpha, |c0 - c1| < beta, |c3 - c2| < beta and tc > 0.
;       tc is built from pTC[0..3], each value used for two adjacent lanes.
;    3. Transpose back and write 4 bytes per row to the same addresses.
;
;  Stack: five pushes plus 170h bytes of locals.  Assuming rsp = 8 (mod 16) at
;  entry, rsp is 16-byte aligned afterwards, as the movdqa slots require.
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
DeblockChromaLt4H_ssse3:
    ; ---- prologue ------------------------------------------------------------
    mov         rax, rsp                ; rax is rewritten below before it is read
    push        rbx
    push        rbp
    push        r12
    push        r13
    push        r14
    sub         rsp, 170h

    ; ---- move the incoming arguments into the registers used by the body -----
    mov         r13, r8                 ; r13 = 5th argument (iBeta)
    mov         r14, r9                 ; r14 = 6th argument (pTC)
    mov         r8, rdx                 ; r8  = 3rd argument (stride)
    mov         r9, rcx                 ; r9  = 4th argument (alpha)
    mov         rdx, rdi                ; rdx = 1st argument (plane pointer)
    mov         rcx, rsi                ; rcx = 2nd argument (plane pointer)

    ; ---- gather: one dword per row, rows 0..3 via rcx/rdx, rows 4..7 via rbx/rbp
    movsxd      rsi, r8d                ; rsi = stride, sign-extended
    lea         eax, [r8*4]
    mov         r11d, r9d               ; r11d = alpha
    movsxd      r10, eax                ; r10 = 4 * stride
    mov         eax, [rcx-2]
    mov         r12, rdx                ; r12 = copy of the rdx plane pointer
    mov         [rsp+40h], eax
    mov         eax, [rsi+rcx-2]
    lea         rbx, [r10+rcx-2]        ; rbx = rcx plane, row 4, column -2
    movdqa      xmm5, [rsp+40h]         ; only the low dword is meaningful
    mov         [rsp+50h], eax
    mov         eax, [rcx+rsi*2-2]
    lea         rbp, [r10+rdx-2]        ; rbp = rdx plane, row 4, column -2
    movdqa      xmm2, [rsp+50h]
    mov         [rsp+60h], eax
    lea         r10, [rsi+rsi*2]        ; r10 = 3 * stride from here on
    mov         rdi, rcx                ; rdi = copy of the rcx plane pointer
    mov         eax, [r10+rcx-2]
    movdqa      xmm4, [rsp+60h]
    mov         [rsp+70h], eax
    mov         eax, [rdx-2]
    mov         [rsp+80h], eax
    mov         eax, [rsi+rdx-2]
    movdqa      xmm3, [rsp+70h]
    mov         [rsp+90h], eax
    mov         eax, [rdx+rsi*2-2]
    punpckldq   xmm5, [rsp+80h]         ; row 0: rcx plane | rdx plane
    mov         [rsp+0A0h], eax
    mov         eax, [r10+rdx-2]
    punpckldq   xmm2, [rsp+90h]         ; row 1
    mov         [rsp+0B0h], eax
    mov         eax, [rbx]
    punpckldq   xmm4, [rsp+0A0h]        ; row 2
    mov         [rsp+80h], eax
    mov         eax, [rbp]
    punpckldq   xmm3, [rsp+0B0h]        ; row 3
    mov         [rsp+90h], eax
    mov         eax, [rsi+rbx]
    movdqa      xmm0, [rsp+80h]
    punpckldq   xmm0, [rsp+90h]         ; row 4 of both planes
    punpcklqdq  xmm5, xmm0              ; xmm5 = rows 0 and 4
    movdqa      [rsp+80h], xmm0         ; slot is reused; low dword replaced next
    mov         [rsp+80h], eax
    mov         eax, [rsi+rbp]
    movdqa      xmm0, [rsp+80h]
    movdqa      xmm1, xmm5
    mov         [rsp+90h], eax
    mov         eax, [rbx+rsi*2]
    punpckldq   xmm0, [rsp+90h]         ; row 5 of both planes
    punpcklqdq  xmm2, xmm0              ; xmm2 = rows 1 and 5
    punpcklbw   xmm1, xmm2
    punpckhbw   xmm5, xmm2
    movdqa      [rsp+80h], xmm0
    mov         [rsp+80h], eax
    mov         eax, [rbp+rsi*2]
    movdqa      xmm0, [rsp+80h]
    mov         [rsp+90h], eax
    mov         eax, [r10+rbx]
    movdqa      xmm7, xmm1
    punpckldq   xmm0, [rsp+90h]         ; row 6 of both planes
    punpcklqdq  xmm4, xmm0              ; xmm4 = rows 2 and 6
    movdqa      [rsp+80h], xmm0
    mov         [rsp+80h], eax
    mov         eax, [r10+rbp]
    movdqa      xmm0, [rsp+80h]
    mov         [rsp+90h], eax
    punpckldq   xmm0, [rsp+90h]         ; row 7 of both planes
    punpcklqdq  xmm3, xmm0              ; xmm3 = rows 3 and 7

    ; ---- transpose rows into columns -----------------------------------------
    movdqa      xmm0, xmm4
    punpcklbw   xmm0, xmm3
    punpckhbw   xmm4, xmm3
    punpcklwd   xmm7, xmm0
    punpckhwd   xmm1, xmm0
    movdqa      xmm0, xmm5
    movdqa      xmm6, xmm7
    punpcklwd   xmm0, xmm4
    punpckhwd   xmm5, xmm4
    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    movdqa      xmm0, xmm1
    punpckldq   xmm0, xmm5
    mov         rax, r14                ; pTC
    punpckhdq   xmm1, xmm5
    movdqa      xmm9, xmm6
    punpckhqdq  xmm6, xmm0              ; xmm6 = c1 bytes
    punpcklqdq  xmm9, xmm0              ; xmm9 = c0 bytes
    movdqa      xmm2, xmm7
    movdqa      xmm13, xmm6
    movdqa      xmm4, xmm9
    movdqa      [rsp+10h], xmm9         ; keep c0 bytes for the write-back transpose
    punpcklqdq  xmm2, xmm1              ; xmm2 = c2 bytes
    punpckhqdq  xmm7, xmm1              ; xmm7 = c3 bytes

    ; ---- widen to words; build the tc, alpha, beta and rounding vectors -------
    pxor        xmm1, xmm1
    movsx       ecx, byte [rax+3]       ; pTC[3]
    movsx       edx, byte [rax+2]       ; pTC[2]
    movsx       r8d, byte [rax+1]       ; pTC[1]
    movsx       r9d, byte [rax]         ; pTC[0]
    movdqa      xmm10, xmm1
    movdqa      xmm15, xmm2
    punpckhbw   xmm2, xmm1              ; c2, high half
    punpckhbw   xmm6, xmm1              ; c1, high half
    punpcklbw   xmm4, xmm1              ; c0, low half
    movsx       eax, r11w               ; alpha (low 16 bits, sign-extended)
    mov         word [rsp+0Eh], cx
    mov         word [rsp+0Ch], cx
    movdqa      xmm3, xmm7
    movdqa      xmm8, xmm7
    movdqa      [rsp+20h], xmm7         ; keep c3 bytes for the write-back transpose
    punpcklbw   xmm15, xmm1             ; c2, low half
    punpcklbw   xmm13, xmm1             ; c1, low half
    punpcklbw   xmm3, xmm1              ; c3, low half
    mov         word [rsp+0Ah], dx
    mov         word [rsp+8], dx
    mov         word [rsp+6], r8w
    movd        xmm0, eax
    movdqa      [rsp+30h], xmm6         ; spill c1 high half
    punpckhbw   xmm9, xmm1              ; c0, high half
    punpckhbw   xmm8, xmm1              ; c3, high half
    punpcklwd   xmm0, xmm0
    mov         eax, r13d               ; iBeta
    mov         word [rsp+4], r8w
    mov         word [rsp+2], r9w
    pshufd      xmm12, xmm0, 0          ; xmm12 = alpha in every word
    mov         word [rsp], r9w
    movd        xmm0, eax
    mov         eax, 4
    cwde                                ; sign-extends ax into eax; eax already holds 4
    movdqa      xmm14, [rsp]            ; xmm14 = tc words: t0,t0,t1,t1,t2,t2,t3,t3
    movdqa      [rsp], xmm2             ; spill c2 high half over the tc staging slot
    movdqa      xmm2, xmm12
    punpcklwd   xmm0, xmm0
    pshufd      xmm11, xmm0, 0          ; xmm11 = beta in every word
    psubw       xmm10, xmm14            ; xmm10 = -tc
    movd        xmm0, eax
    movdqa      xmm7, xmm14
    movdqa      xmm6, xmm14
    pcmpgtw     xmm7, xmm1              ; xmm7 = (tc > 0) mask
    punpcklwd   xmm0, xmm0
    pshufd      xmm5, xmm0, 0           ; xmm5 = 4 in every word

    ; ---- low half: delta, clamp, mask, apply ----------------------------------
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm15
    psubw       xmm4, xmm13             ; c0 - c1
    psubw       xmm0, xmm3              ; c0 - c3
    psubw       xmm1, xmm13             ; c2 - c1
    psubw       xmm3, xmm15             ; c3 - c2
    psllw       xmm1, 2
    paddw       xmm1, xmm0
    paddw       xmm1, xmm5
    movdqa      xmm0, xmm10
    psraw       xmm1, 3
    pmaxsw      xmm0, xmm1              ; max(-tc, delta)
    pminsw      xmm6, xmm0              ; min(tc, ...)
    movdqa      xmm1, xmm11
    movdqa      xmm0, xmm13
    psubw       xmm0, xmm15
    pabsw       xmm0, xmm0
    pcmpgtw     xmm2, xmm0              ; alpha > |c1 - c2|
    pabsw       xmm0, xmm4
    pcmpgtw     xmm1, xmm0              ; beta > |c0 - c1|
    pabsw       xmm0, xmm3
    pand        xmm2, xmm1
    movdqa      xmm1, xmm11
    movdqa      xmm3, [rsp+30h]         ; c1 high half
    pcmpgtw     xmm1, xmm0              ; beta > |c3 - c2|
    movdqa      xmm0, xmm9
    pand        xmm2, xmm1
    psubw       xmm0, xmm8              ; high half: c0 - c3
    psubw       xmm9, xmm3              ; high half: c0 - c1
    pand        xmm2, xmm7
    pand        xmm6, xmm2              ; masked delta, low half
    psubw       xmm15, xmm6             ; c2 -= delta
    paddw       xmm13, xmm6             ; c1 += delta

    ; ---- high half: same computation ------------------------------------------
    movdqa      xmm2, [rsp]             ; c2 high half
    movdqa      xmm1, xmm2
    psubw       xmm1, xmm3              ; c2 - c1
    psubw       xmm8, xmm2              ; c3 - c2
    psllw       xmm1, 2
    paddw       xmm1, xmm0
    paddw       xmm1, xmm5
    movdqa      xmm0, xmm3
    movdqa      xmm5, [rsp+10h]         ; reload c0 bytes
    psubw       xmm0, xmm2
    psraw       xmm1, 3
    movdqa      xmm4, xmm5
    pabsw       xmm0, xmm0
    pmaxsw      xmm10, xmm1             ; max(-tc, delta)
    movdqa      xmm1, xmm11
    pcmpgtw     xmm12, xmm0             ; alpha > |c1 - c2|
    pabsw       xmm0, xmm9
    pminsw      xmm14, xmm10            ; min(tc, ...)
    pcmpgtw     xmm1, xmm0              ; beta > |c0 - c1|
    pabsw       xmm0, xmm8
    pcmpgtw     xmm11, xmm0             ; beta > |c3 - c2|
    pand        xmm12, xmm1
    movdqa      xmm1, [rsp+20h]         ; reload c3 bytes
    pand        xmm12, xmm11
    pand        xmm12, xmm7
    pand        xmm14, xmm12            ; masked delta, high half
    paddw       xmm3, xmm14             ; c1 += delta
    psubw       xmm2, xmm14             ; c2 -= delta

    ; ---- repack to bytes and transpose columns back into rows ------------------
    packuswb    xmm13, xmm3             ; new c1 bytes
    packuswb    xmm15, xmm2             ; new c2 bytes
    punpcklbw   xmm4, xmm13
    punpckhbw   xmm5, xmm13
    movdqa      xmm0, xmm15
    punpcklbw   xmm0, xmm1
    punpckhbw   xmm15, xmm1
    movdqa      xmm3, xmm4
    punpcklwd   xmm3, xmm0
    punpckhwd   xmm4, xmm0
    movdqa      xmm0, xmm5
    movdqa      xmm2, xmm3
    movdqa      xmm1, xmm4
    punpcklwd   xmm0, xmm15
    punpckhwd   xmm5, xmm15
    punpckldq   xmm2, xmm0
    punpckhdq   xmm3, xmm0
    punpckldq   xmm1, xmm5
    movdqa      xmm0, xmm2
    punpcklqdq  xmm0, xmm1
    punpckhdq   xmm4, xmm5
    punpckhqdq  xmm2, xmm1

    ; ---- scatter: each stack slot holds one dword for four different rows ------
    movdqa      [rsp+40h], xmm0         ; rows 0 / 4
    movdqa      xmm0, xmm3
    movdqa      [rsp+90h], xmm2         ; rows 1 / 5
    mov         eax, [rsp+40h]
    mov         [rdi-2], eax
    mov         eax, [rsp+90h]
    punpcklqdq  xmm0, xmm4
    punpckhqdq  xmm3, xmm4
    mov         [rsi+rdi-2], eax
    movdqa      [rsp+50h], xmm0         ; rows 2 / 6
    mov         eax, [rsp+50h]
    movdqa      [rsp+0A0h], xmm3        ; rows 3 / 7
    mov         [rdi+rsi*2-2], eax
    mov         eax, [rsp+0A0h]
    mov         [r10+rdi-2], eax
    mov         eax, [rsp+48h]          ; rows 4..7 of the rdi plane via rbx
    mov         [rbx], eax
    mov         eax, [rsp+98h]
    mov         [rsi+rbx], eax
    mov         eax, [rsp+58h]
    mov         [rbx+rsi*2], eax
    mov         eax, [rsp+0A8h]
    mov         [r10+rbx], eax
    mov         eax, [rsp+44h]          ; rows 0..3 of the r12 plane
    mov         [r12-2], eax
    mov         eax, [rsp+94h]
    mov         [rsi+r12-2], eax
    mov         eax, [rsp+54h]
    mov         [r12+rsi*2-2], eax
    mov         eax, [rsp+0A4h]
    mov         [r10+r12-2], eax
    mov         eax, [rsp+4Ch]          ; rows 4..7 of the r12 plane via rbp
    mov         [rbp], eax
    mov         eax, [rsp+9Ch]
    mov         [rsi+rbp], eax
    mov         eax, [rsp+5Ch]
    mov         [rbp+rsi*2], eax
    mov         eax, [rsp+0ACh]
    mov         [r10+rbp], eax

    ; ---- epilogue ---------------------------------------------------------------
    lea         r11, [rsp+170h]
    mov         rsp, r11
    pop         r14
    pop         r13
    pop         r12
    pop         rbp
    pop         rbx
    ret
|
|
|
|
|
|
|
|
%elifdef X86_32
|
|
|
|
;********************************************************************************
;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                               int32_t iAlpha, int32_t iBeta)
;
;  X86_32 variant, stack arguments addressed through ebp.
;
;  For each plane, 8 bytes are read from the rows at -2, -1, 0 and +1 strides
;  relative to the pointer (called p1, p0, q0, q1 in the comments below).
;  Cb occupies the low 8 bytes of each vector and Cr the high 8 bytes.
;  Per 16-bit lane, where
;        |p0 - q0| < alpha,  |p1 - p0| < beta  and  |q1 - q0| < beta
;  the routine stores
;        p0' = (2*p1 + p0 + q1 + 2) >> 2
;        q0' = (2*q1 + q0 + p1 + 2) >> 2
;  and otherwise writes p0 / q0 back unchanged.
;
;  Stack: esp is rounded down to 16, then lowered by 68h plus two pushes
;  (70h in total), so every movdqa slot used below is 16-byte aligned.
;********************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3

ALIGN 16
DeblockChromaEq4V_ssse3:
    push        ebp
    mov         ebp, esp
    and         esp, 0FFFFFFF0h
    sub         esp, 68h

    ; ---- load the four rows of both planes ------------------------------------
    mov         edx, [ebp+10h]          ; iStride
    mov         eax, [ebp+8]            ; pPixCb
    mov         ecx, [ebp+0Ch]          ; pPixCr
    movq        xmm4, [ecx]             ; Cr q0
    movq        xmm5, [edx+ecx]         ; Cr q1
    push        esi
    push        edi
    lea         esi, [edx+edx]          ; 2 * stride
    mov         edi, eax
    sub         edi, esi
    movq        xmm1, [edi]             ; Cb p1
    mov         edi, ecx
    sub         edi, esi
    movq        xmm2, [edi]             ; Cr p1
    punpcklqdq  xmm1, xmm2              ; xmm1 = p1 (Cb | Cr)
    mov         esi, eax
    sub         esi, edx                ; esi = pPixCb - stride, kept for the store
    movq        xmm2, [esi]             ; Cb p0
    mov         edi, ecx
    sub         edi, edx                ; edi = pPixCr - stride, kept for the store
    movq        xmm3, [edi]             ; Cr p0
    punpcklqdq  xmm2, xmm3              ; xmm2 = p0 (Cb | Cr)
    movq        xmm3, [eax]             ; Cb q0
    punpcklqdq  xmm3, xmm4              ; xmm3 = q0 (Cb | Cr)
    movq        xmm4, [edx+eax]         ; Cb q1

    ; ---- broadcast alpha and beta (low 16 bits of each argument) ---------------
    mov         edx, [ebp+14h]          ; iAlpha
    punpcklqdq  xmm4, xmm5              ; xmm4 = q1 (Cb | Cr)
    movd        xmm5, edx
    mov         edx, [ebp+18h]          ; iBeta
    pxor        xmm0, xmm0
    movdqa      xmm6, xmm5
    punpcklwd   xmm6, xmm5
    pshufd      xmm5, xmm6, 0           ; xmm5 = alpha in every word
    movd        xmm6, edx
    movdqa      xmm7, xmm6
    punpcklwd   xmm7, xmm6
    pshufd      xmm6, xmm7, 0           ; xmm6 = beta in every word

    ; ---- widen bytes to words; spill what does not fit in xmm0..xmm7 -----------
    movdqa      xmm7, xmm1
    punpckhbw   xmm1, xmm0
    punpcklbw   xmm7, xmm0
    movdqa      [esp+40h], xmm1         ; p1 high
    movdqa      [esp+60h], xmm7         ; p1 low
    movdqa      xmm7, xmm2
    punpcklbw   xmm7, xmm0
    movdqa      [esp+10h], xmm7         ; p0 low
    movdqa      xmm7, xmm3
    punpcklbw   xmm7, xmm0
    punpckhbw   xmm3, xmm0
    movdqa      [esp+50h], xmm7         ; q0 low
    movdqa      xmm7, xmm4
    punpckhbw   xmm4, xmm0
    punpckhbw   xmm2, xmm0              ; xmm2 = p0 high
    punpcklbw   xmm7, xmm0              ; xmm7 = q1 low
    movdqa      [esp+30h], xmm3         ; q0 high

    ; ---- low-half mask in xmm0 --------------------------------------------------
    movdqa      xmm3, [esp+10h]         ; xmm3 = p0 low
    movdqa      xmm1, xmm3
    psubw       xmm1, [esp+50h]
    pabsw       xmm1, xmm1
    movdqa      [esp+20h], xmm4         ; q1 high
    movdqa      xmm0, xmm5
    pcmpgtw     xmm0, xmm1              ; alpha > |p0 - q0|
    movdqa      xmm1, [esp+60h]
    psubw       xmm1, xmm3
    pabsw       xmm1, xmm1
    movdqa      xmm4, xmm6
    pcmpgtw     xmm4, xmm1              ; beta > |p1 - p0|
    pand        xmm0, xmm4
    movdqa      xmm1, xmm7
    psubw       xmm1, [esp+50h]
    pabsw       xmm1, xmm1
    movdqa      xmm4, xmm6
    pcmpgtw     xmm4, xmm1              ; beta > |q1 - q0|

    ; ---- high-half mask in xmm5 -------------------------------------------------
    movdqa      xmm1, xmm2
    psubw       xmm1, [esp+30h]
    pabsw       xmm1, xmm1
    pcmpgtw     xmm5, xmm1              ; alpha > |p0 - q0|
    movdqa      xmm1, [esp+40h]
    pand        xmm0, xmm4              ; low-half mask complete
    psubw       xmm1, xmm2
    pabsw       xmm1, xmm1
    movdqa      xmm4, xmm6
    pcmpgtw     xmm4, xmm1              ; beta > |p1 - p0|
    movdqa      xmm1, [esp+20h]
    psubw       xmm1, [esp+30h]
    pand        xmm5, xmm4
    pabsw       xmm1, xmm1
    pcmpgtw     xmm6, xmm1              ; beta > |q1 - q0|
    pand        xmm5, xmm6              ; high-half mask complete

    ; ---- rounding constant 2 in every word --------------------------------------
    mov         edx, 2
    movsx       edx, dx
    movd        xmm1, edx
    movdqa      xmm4, xmm1
    punpcklwd   xmm4, xmm1
    pshufd      xmm1, xmm4, 0

    ; ---- p0' low = (2*p1 + p0 + q1 + 2) >> 2, blended with p0 -------------------
    movdqa      xmm4, [esp+60h]
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm4
    paddw       xmm6, xmm3
    paddw       xmm6, xmm7
    movdqa      [esp+10h], xmm1         ; slot now holds the rounding constant
    paddw       xmm6, [esp+10h]
    psraw       xmm6, 2
    movdqa      xmm4, xmm0
    pandn       xmm4, xmm3              ; unfiltered p0 where the mask is clear
    movdqa      xmm3, [esp+40h]
    movdqa      xmm1, xmm0
    pand        xmm1, xmm6
    por         xmm1, xmm4

    ; ---- p0' high ---------------------------------------------------------------
    movdqa      xmm6, xmm3
    paddw       xmm6, xmm3
    movdqa      xmm3, [esp+10h]         ; xmm3 = rounding constant from here on
    paddw       xmm6, xmm2
    paddw       xmm6, [esp+20h]
    paddw       xmm6, xmm3
    psraw       xmm6, 2
    movdqa      xmm4, xmm5
    pand        xmm4, xmm6
    movdqa      xmm6, xmm5
    pandn       xmm6, xmm2
    por         xmm4, xmm6
    packuswb    xmm1, xmm4              ; xmm1 = p0' bytes (Cb | Cr)

    ; ---- q0' low = (2*q1 + q0 + p1 + 2) >> 2, blended with q0 -------------------
    movdqa      xmm4, [esp+50h]
    movdqa      xmm6, xmm7
    paddw       xmm6, xmm7
    paddw       xmm6, xmm4
    paddw       xmm6, [esp+60h]
    paddw       xmm6, xmm3
    psraw       xmm6, 2
    movdqa      xmm2, xmm0
    pand        xmm2, xmm6
    pandn       xmm0, xmm4
    por         xmm2, xmm0

    ; ---- q0' high ---------------------------------------------------------------
    movdqa      xmm0, [esp+20h]
    movdqa      xmm6, xmm0
    paddw       xmm6, xmm0
    movdqa      xmm0, [esp+30h]
    paddw       xmm6, xmm0
    paddw       xmm6, [esp+40h]
    movdqa      xmm4, xmm5
    paddw       xmm6, xmm3
    movq        [esi], xmm1             ; Cb row -1
    psraw       xmm6, 2
    pand        xmm4, xmm6
    pandn       xmm5, xmm0
    por         xmm4, xmm5
    packuswb    xmm2, xmm4              ; xmm2 = q0' bytes (Cb | Cr)

    ; ---- remaining stores and epilogue ------------------------------------------
    movq        [eax], xmm2             ; Cb row 0
    psrldq      xmm1, 8
    movq        [edi], xmm1             ; Cr row -1
    pop         edi
    psrldq      xmm2, 8
    movq        [ecx], xmm2             ; Cr row 0
    pop         esi
    mov         esp, ebp
    pop         ebp
    ret
|
|
|
|
;******************************************************************************
;  void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                               int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
;  X86_32 variant, stack arguments addressed through ebp.
;
;  For each plane, 8 bytes are read from the rows at -2, -1, 0 and +1 strides
;  relative to the pointer (called p1, p0, q0, q1 below).  Cb occupies the low
;  8 bytes of each vector and Cr the high 8 bytes.  Per 16-bit lane:
;        delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
;        p0 += delta,  q0 -= delta
;  applied only where |p0 - q0| < alpha, |p1 - p0| < beta, |q1 - q0| < beta
;  and tc > 0.  tc is built from pTC[0..3], each value used for two adjacent
;  lanes.
;
;  Stack: esp is rounded down to 16, then lowered by 0E4h plus three pushes
;  (0F0h in total), so every movdqa slot used below is 16-byte aligned.
;
;  The entry label is 16-byte aligned like the other routines in this file.
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4V_ssse3

ALIGN 16
DeblockChromaLt4V_ssse3:
    push        ebp
    mov         ebp, esp
    and         esp, 0FFFFFFF0h
    sub         esp, 0E4h
    push        ebx
    push        esi

    ; ---- fetch pTC[0..3] and start building the tc vector -----------------------
    mov         esi, [ebp+1Ch]          ; pTC
    movsx       ebx, byte [esi+2]       ; pTC[2]
    push        edi
    movsx       di, byte [esi+3]        ; pTC[3]
    mov         word [esp+0Ch], bx      ; park pTC[2]
    movsx       bx, byte [esi+1]        ; pTC[1]
    movsx       esi, byte [esi]         ; pTC[0]
    mov         word [esp+0Eh], si      ; park pTC[0]
    movzx       esi, di
    movd        xmm1, esi               ; t3
    movzx       esi, di
    movd        xmm2, esi               ; t3
    mov         si, word [esp+0Ch]
    mov         edx, [ebp+10h]          ; iStride
    mov         eax, [ebp+08h]          ; pPixCb
    movzx       edi, si
    movzx       esi, si
    mov         ecx, [ebp+0Ch]          ; pPixCr
    movd        xmm4, esi               ; t2
    movzx       esi, bx
    movd        xmm5, esi               ; t1
    movd        xmm3, edi               ; t2
    movzx       esi, bx
    movd        xmm6, esi               ; t1
    mov         si, word [esp+0Eh]
    movzx       edi, si
    movzx       esi, si
    punpcklwd   xmm6, xmm2
    pxor        xmm0, xmm0
    movdqa      [esp+40h], xmm0         ; zero vector kept in memory
    movd        xmm7, edi               ; t0
    movd        xmm0, esi               ; t0
    lea         esi, [edx+edx]          ; 2 * stride
    mov         edi, eax
    sub         edi, esi                ; edi = pPixCb - 2*stride
    punpcklwd   xmm5, xmm1
    movdqa      xmm1, [esp+40h]         ; xmm1 = 0
    punpcklwd   xmm0, xmm4
    movq        xmm4, [edx+ecx]         ; Cr q1
    punpcklwd   xmm7, xmm3
    movq        xmm3, [eax]             ; Cb q0
    punpcklwd   xmm0, xmm6
    movq        xmm6, [edi]             ; Cb p1
    punpcklwd   xmm7, xmm5
    punpcklwd   xmm0, xmm7              ; xmm0 = tc words: t0,t0,t1,t1,t2,t2,t3,t3
    mov         edi, ecx
    sub         edi, esi                ; edi = pPixCr - 2*stride
    movdqa      xmm2, xmm1
    psubw       xmm2, xmm0
    movdqa      [esp+60h], xmm2         ; -tc

    ; ---- load the remaining rows ------------------------------------------------
    movq        xmm2, [edi]             ; Cr p1
    punpcklqdq  xmm6, xmm2              ; xmm6 = p1 (Cb | Cr)
    mov         esi, eax
    sub         esi, edx                ; esi = pPixCb - stride, kept for the store
    movq        xmm7, [esi]             ; Cb p0
    mov         edi, ecx
    sub         edi, edx                ; edi = pPixCr - stride, kept for the store
    movq        xmm2, [edi]             ; Cr p0
    punpcklqdq  xmm7, xmm2              ; xmm7 = p0 (Cb | Cr)
    movq        xmm2, [ecx]             ; Cr q0
    punpcklqdq  xmm3, xmm2              ; xmm3 = q0 (Cb | Cr)
    movq        xmm2, [edx+eax]         ; Cb q1

    ; ---- broadcast alpha and beta -----------------------------------------------
    movsx       edx, word [ebp+14h]     ; iAlpha (low 16 bits)
    punpcklqdq  xmm2, xmm4              ; xmm2 = q1 (Cb | Cr)
    movdqa      [esp+0E0h], xmm2        ; q1 bytes
    movd        xmm2, edx
    movsx       edx, word [ebp+18h]     ; iBeta (low 16 bits)
    movdqa      xmm4, xmm2
    punpcklwd   xmm4, xmm2
    movd        xmm2, edx
    movdqa      xmm5, xmm2
    punpcklwd   xmm5, xmm2
    pshufd      xmm2, xmm5, 0
    movdqa      [esp+50h], xmm2         ; beta in every word

    ; ---- widen bytes to words ---------------------------------------------------
    movdqa      xmm2, xmm6
    punpcklbw   xmm2, xmm1
    movdqa      [esp+0D0h], xmm3        ; q0 bytes
    pshufd      xmm4, xmm4, 0           ; xmm4 = alpha in every word
    movdqa      [esp+30h], xmm2         ; p1 low
    punpckhbw   xmm6, xmm1
    movdqa      [esp+80h], xmm6         ; p1 high
    movdqa      xmm6, [esp+0D0h]
    punpckhbw   xmm6, xmm1
    movdqa      [esp+70h], xmm6         ; q0 high
    movdqa      xmm6, [esp+0E0h]
    punpckhbw   xmm6, xmm1
    movdqa      [esp+90h], xmm6         ; q1 high
    movdqa      xmm5, [esp+0E0h]
    movdqa      xmm2, xmm7
    punpckhbw   xmm7, xmm1
    punpcklbw   xmm5, xmm1              ; xmm5 = q1 low
    movdqa      [esp+0A0h], xmm7        ; p0 high
    punpcklbw   xmm3, xmm1              ; xmm3 = q0 low
    mov         edx, 4
    punpcklbw   xmm2, xmm1              ; xmm2 = p0 low
    movsx       edx, dx
    movd        xmm6, edx
    movdqa      xmm7, xmm6
    punpcklwd   xmm7, xmm6
    pshufd      xmm6, xmm7, 0
    movdqa      xmm7, [esp+30h]
    movdqa      [esp+20h], xmm6         ; rounding constant 4 in every word

    ; ---- low half: delta, clamp, mask -------------------------------------------
    psubw       xmm7, xmm5              ; p1 - q1
    movdqa      xmm6, xmm0
    pcmpgtw     xmm6, xmm1              ; tc > 0
    movdqa      xmm1, [esp+60h]         ; -tc
    movdqa      [esp+40h], xmm6         ; (tc > 0) mask
    movdqa      xmm6, xmm3
    psubw       xmm6, xmm2              ; q0 - p0
    psllw       xmm6, 2
    paddw       xmm6, xmm7
    paddw       xmm6, [esp+20h]
    movdqa      xmm7, [esp+50h]
    psraw       xmm6, 3
    pmaxsw      xmm1, xmm6              ; max(-tc, delta)
    movdqa      [esp+10h], xmm0
    movdqa      xmm6, [esp+10h]
    pminsw      xmm6, xmm1              ; min(tc, ...)
    movdqa      [esp+10h], xmm6         ; clamped delta, low half
    movdqa      xmm1, xmm2
    psubw       xmm1, xmm3
    pabsw       xmm1, xmm1
    movdqa      xmm6, xmm4
    pcmpgtw     xmm6, xmm1              ; alpha > |p0 - q0|
    movdqa      xmm1, [esp+30h]
    psubw       xmm1, xmm2
    pabsw       xmm1, xmm1
    pcmpgtw     xmm7, xmm1              ; beta > |p1 - p0|
    movdqa      xmm1, [esp+50h]
    pand        xmm6, xmm7
    movdqa      xmm7, [esp+50h]
    psubw       xmm5, xmm3
    pabsw       xmm5, xmm5
    pcmpgtw     xmm1, xmm5              ; beta > |q1 - q0|
    movdqa      xmm5, [esp+80h]
    psubw       xmm5, [esp+90h]         ; high half: p1 - q1
    pand        xmm6, xmm1
    pand        xmm6, [esp+40h]
    movdqa      xmm1, [esp+10h]
    pand        xmm1, xmm6
    movdqa      xmm6, [esp+70h]
    movdqa      [esp+30h], xmm1         ; masked delta, low half

    ; ---- high half: delta, clamp, mask ------------------------------------------
    movdqa      xmm1, [esp+0A0h]        ; xmm1 = p0 high
    psubw       xmm6, xmm1              ; q0 - p0
    psllw       xmm6, 2
    paddw       xmm6, xmm5
    paddw       xmm6, [esp+20h]
    movdqa      xmm5, [esp+60h]
    psraw       xmm6, 3
    pmaxsw      xmm5, xmm6              ; max(-tc, delta)
    pminsw      xmm0, xmm5              ; min(tc, ...)
    movdqa      xmm5, [esp+70h]         ; xmm5 = q0 high
    movdqa      xmm6, xmm1
    psubw       xmm6, xmm5
    pabsw       xmm6, xmm6
    pcmpgtw     xmm4, xmm6              ; alpha > |p0 - q0|
    movdqa      xmm6, [esp+80h]
    psubw       xmm6, xmm1
    pabsw       xmm6, xmm6
    pcmpgtw     xmm7, xmm6              ; beta > |p1 - p0|
    movdqa      xmm6, [esp+90h]
    pand        xmm4, xmm7
    movdqa      xmm7, [esp+50h]
    psubw       xmm6, xmm5
    pabsw       xmm6, xmm6
    pcmpgtw     xmm7, xmm6              ; beta > |q1 - q0|
    pand        xmm4, xmm7
    pand        xmm4, [esp+40h]
    pand        xmm0, xmm4              ; masked delta, high half

    ; ---- apply, pack and store --------------------------------------------------
    movdqa      xmm4, [esp+30h]
    paddw       xmm2, xmm4              ; p0 low  += delta
    paddw       xmm1, xmm0              ; p0 high += delta
    packuswb    xmm2, xmm1
    movq        [esi], xmm2             ; Cb row -1
    psubw       xmm3, xmm4              ; q0 low  -= delta
    psubw       xmm5, xmm0              ; q0 high -= delta
    packuswb    xmm3, xmm5
    movq        [eax], xmm3             ; Cb row 0
    psrldq      xmm2, 8
    movq        [edi], xmm2             ; Cr row -1
    pop         edi
    pop         esi
    psrldq      xmm3, 8
    movq        [ecx], xmm3             ; Cr row 0
    pop         ebx
    mov         esp, ebp
    pop         ebp
    ret
|
|
|
|
;***************************************************************************
;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                               int32_t iAlpha, int32_t iBeta)
;
;  X86_32 variant, stack arguments addressed through ebp.
;
;  Reads 4 bytes starting at (ptr - 2) from 8 consecutive rows of each plane,
;  transposes them into four 16-byte column vectors (called p1, p0, q0, q1
;  below) held in a 40h-byte stack buffer, filters p0 and q0, transposes back
;  and writes 4 bytes per row to the same addresses.  Per 16-bit lane, where
;        |p0 - q0| < alpha,  |p1 - p0| < beta  and  |q1 - q0| < beta
;  the routine stores
;        p0' = (2*p1 + p0 + q1 + 2) >> 2
;        q0' = (2*q1 + q0 + p1 + 2) >> 2
;
;  Stack slots after both pushes (esp is then 16-byte aligned):
;        [esp+08h] pPixCr - 2              [esp+0Ch] pPixCr - 2 + 4*stride
;        [esp+10h] address of the buffer   [esp+14h] 3*stride
;        [esp+18h] pPixCb - 2 + 4*stride   [esp+1Ch] pPixCb - 2
;        [esp+80h..0BFh] column buffer: p1, p0, q0, q1
;***************************************************************************

WELS_EXTERN DeblockChromaEq4H_ssse3

ALIGN 16

DeblockChromaEq4H_ssse3:
    push        ebp
    mov         ebp, esp
    and         esp, 0FFFFFFF0h
    sub         esp, 0C8h

    ; ---- compute and park the row pointers --------------------------------------
    mov         ecx, dword [ebp+8]      ; pPixCb
    mov         edx, dword [ebp+0Ch]    ; pPixCr
    mov         eax, dword [ebp+10h]    ; iStride
    sub         ecx, 2
    sub         edx, 2
    push        esi
    lea         esi, [eax+eax*2]        ; 3 * stride
    mov         dword [esp+18h], ecx    ; becomes [esp+1Ch] after push edi
    mov         dword [esp+4], edx      ; becomes [esp+08h] after push edi
    lea         ecx, [ecx+eax*4]
    lea         edx, [edx+eax*4]
    lea         eax, [esp+7Ch]          ; becomes esp+80h after push edi
    push        edi
    mov         dword [esp+14h], esi
    mov         dword [esp+18h], ecx
    mov         dword [esp+0Ch], edx
    mov         dword [esp+10h], eax

    ; ---- gather one dword per row -----------------------------------------------
    mov         esi, dword [esp+1Ch]    ; Cb rows 0..3
    mov         ecx, dword [ebp+10h]    ; stride
    mov         edx, dword [esp+14h]    ; 3 * stride
    movd        xmm0, dword [esi]
    movd        xmm1, dword [esi+ecx]
    movd        xmm2, dword [esi+ecx*2]
    movd        xmm3, dword [esi+edx]
    mov         esi, dword [esp+8]      ; Cr rows 0..3
    movd        xmm4, dword [esi]
    movd        xmm5, dword [esi+ecx]
    movd        xmm6, dword [esi+ecx*2]
    movd        xmm7, dword [esi+edx]
    punpckldq   xmm0, xmm4
    punpckldq   xmm1, xmm5
    punpckldq   xmm2, xmm6
    punpckldq   xmm3, xmm7
    mov         esi, dword [esp+18h]    ; Cb rows 4..7
    mov         edi, dword [esp+0Ch]    ; Cr rows 4..7
    movd        xmm4, dword [esi]
    movd        xmm5, dword [edi]
    punpckldq   xmm4, xmm5
    punpcklqdq  xmm0, xmm4              ; rows 0 / 4
    movd        xmm4, dword [esi+ecx]
    movd        xmm5, dword [edi+ecx]
    punpckldq   xmm4, xmm5
    punpcklqdq  xmm1, xmm4              ; rows 1 / 5
    movd        xmm4, dword [esi+ecx*2]
    movd        xmm5, dword [edi+ecx*2]
    punpckldq   xmm4, xmm5
    punpcklqdq  xmm2, xmm4              ; rows 2 / 6
    movd        xmm4, dword [esi+edx]
    movd        xmm5, dword [edi+edx]
    punpckldq   xmm4, xmm5
    punpcklqdq  xmm3, xmm4              ; rows 3 / 7

    ; ---- transpose rows into columns --------------------------------------------
    movdqa      xmm6, xmm0
    punpcklbw   xmm0, xmm1
    punpckhbw   xmm6, xmm1
    movdqa      xmm7, xmm2
    punpcklbw   xmm2, xmm3
    punpckhbw   xmm7, xmm3
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm6
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm4, xmm2
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm5, xmm7
    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm4
    punpckldq   xmm0, xmm6
    punpckhdq   xmm1, xmm6
    punpckldq   xmm4, xmm5
    punpckhdq   xmm2, xmm5
    movdqa      xmm5, xmm0
    movdqa      xmm6, xmm1
    punpcklqdq  xmm0, xmm4
    punpckhqdq  xmm5, xmm4
    punpcklqdq  xmm1, xmm2
    punpckhqdq  xmm6, xmm2
    mov         edi, dword [esp+10h]    ; column buffer
    movdqa      [edi], xmm0             ; p1
    movdqa      [edi+10h], xmm5         ; p0
    movdqa      [edi+20h], xmm1         ; q0
    movdqa      [edi+30h], xmm6         ; q1

    ; ---- broadcast alpha / beta, widen the columns to words ----------------------
    movsx       ecx, word [ebp+14h]     ; iAlpha (low 16 bits)
    movsx       edx, word [ebp+18h]     ; iBeta  (low 16 bits)
    movdqa      xmm6, [esp+80h]         ; p1 bytes
    movdqa      xmm4, [esp+90h]         ; p0 bytes
    movdqa      xmm5, [esp+0A0h]        ; q0 bytes
    movdqa      xmm7, [esp+0B0h]        ; q1 bytes
    pxor        xmm0, xmm0
    movd        xmm1, ecx
    movdqa      xmm2, xmm1
    punpcklwd   xmm2, xmm1
    pshufd      xmm1, xmm2, 0           ; xmm1 = alpha in every word
    movd        xmm2, edx
    movdqa      xmm3, xmm2
    punpcklwd   xmm3, xmm2
    pshufd      xmm2, xmm3, 0           ; xmm2 = beta in every word
    movdqa      xmm3, xmm6
    punpckhbw   xmm6, xmm0
    movdqa      [esp+60h], xmm6         ; p1 high
    movdqa      xmm6, [esp+90h]
    punpckhbw   xmm6, xmm0
    movdqa      [esp+30h], xmm6         ; p0 high
    movdqa      xmm6, [esp+0A0h]
    punpckhbw   xmm6, xmm0
    movdqa      [esp+40h], xmm6         ; q0 high
    movdqa      xmm6, [esp+0B0h]
    punpckhbw   xmm6, xmm0
    movdqa      [esp+70h], xmm6         ; q1 high
    punpcklbw   xmm7, xmm0
    punpcklbw   xmm4, xmm0              ; xmm4 = p0 low
    punpcklbw   xmm5, xmm0              ; xmm5 = q0 low
    punpcklbw   xmm3, xmm0              ; xmm3 = p1 low
    movdqa      [esp+50h], xmm7         ; q1 low

    ; ---- low-half mask in xmm0 --------------------------------------------------
    movdqa      xmm6, xmm4
    psubw       xmm6, xmm5
    pabsw       xmm6, xmm6
    movdqa      xmm0, xmm1
    pcmpgtw     xmm0, xmm6              ; alpha > |p0 - q0|
    movdqa      xmm6, xmm3
    psubw       xmm6, xmm4
    pabsw       xmm6, xmm6
    movdqa      xmm7, xmm2
    pcmpgtw     xmm7, xmm6              ; beta > |p1 - p0|
    movdqa      xmm6, [esp+50h]
    psubw       xmm6, xmm5
    pabsw       xmm6, xmm6
    pand        xmm0, xmm7
    movdqa      xmm7, xmm2
    pcmpgtw     xmm7, xmm6              ; beta > |q1 - q0|

    ; ---- high-half mask in xmm1 -------------------------------------------------
    movdqa      xmm6, [esp+30h]
    psubw       xmm6, [esp+40h]
    pabsw       xmm6, xmm6
    pcmpgtw     xmm1, xmm6              ; alpha > |p0 - q0|
    movdqa      xmm6, [esp+60h]
    psubw       xmm6, [esp+30h]
    pabsw       xmm6, xmm6
    pand        xmm0, xmm7              ; low-half mask complete
    movdqa      xmm7, xmm2
    pcmpgtw     xmm7, xmm6              ; beta > |p1 - p0|
    movdqa      xmm6, [esp+70h]
    psubw       xmm6, [esp+40h]
    pabsw       xmm6, xmm6
    pand        xmm1, xmm7
    pcmpgtw     xmm2, xmm6              ; beta > |q1 - q0|
    pand        xmm1, xmm2              ; high-half mask complete

    ; ---- rounding constant 2 in every word --------------------------------------
    mov         eax, 2
    movsx       ecx, ax
    movd        xmm2, ecx
    movdqa      xmm6, xmm2
    punpcklwd   xmm6, xmm2
    pshufd      xmm2, xmm6, 0
    movdqa      [esp+20h], xmm2

    ; ---- p0' = (2*p1 + p0 + q1 + 2) >> 2, blended with p0 -----------------------
    movdqa      xmm2, xmm3
    paddw       xmm2, xmm3
    paddw       xmm2, xmm4
    paddw       xmm2, [esp+50h]
    paddw       xmm2, [esp+20h]
    psraw       xmm2, 2
    movdqa      xmm6, xmm0
    pand        xmm6, xmm2
    movdqa      xmm2, xmm0
    pandn       xmm2, xmm4
    por         xmm6, xmm2              ; p0' low
    movdqa      xmm2, [esp+60h]
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm2
    paddw       xmm7, [esp+30h]
    paddw       xmm7, [esp+70h]
    paddw       xmm7, [esp+20h]
    movdqa      xmm4, xmm1
    movdqa      xmm2, xmm1
    pandn       xmm2, [esp+30h]
    psraw       xmm7, 2
    pand        xmm4, xmm7
    por         xmm4, xmm2              ; p0' high
    movdqa      xmm2, [esp+50h]
    packuswb    xmm6, xmm4
    movdqa      [esp+90h], xmm6         ; overwrite p0 in the column buffer

    ; ---- q0' = (2*q1 + q0 + p1 + 2) >> 2, blended with q0 -----------------------
    movdqa      xmm6, xmm2
    paddw       xmm6, xmm2
    movdqa      xmm2, [esp+20h]
    paddw       xmm6, xmm5
    paddw       xmm6, xmm3
    movdqa      xmm4, xmm0
    pandn       xmm0, xmm5
    paddw       xmm6, xmm2
    psraw       xmm6, 2
    pand        xmm4, xmm6
    por         xmm4, xmm0              ; q0' low
    movdqa      xmm0, [esp+70h]
    movdqa      xmm5, xmm0
    paddw       xmm5, xmm0
    movdqa      xmm0, [esp+40h]
    paddw       xmm5, xmm0
    paddw       xmm5, [esp+60h]
    movdqa      xmm3, xmm1
    paddw       xmm5, xmm2
    psraw       xmm5, 2
    pand        xmm3, xmm5
    pandn       xmm1, xmm0
    por         xmm3, xmm1              ; q0' high
    packuswb    xmm4, xmm3
    movdqa      [esp+0A0h], xmm4        ; overwrite q0 in the column buffer

    ; ---- transpose the column buffer back into rows ------------------------------
    mov         esi, dword [esp+10h]
    movdqa      xmm0, [esi]
    movdqa      xmm1, [esi+10h]
    movdqa      xmm2, [esi+20h]
    movdqa      xmm3, [esi+30h]
    movdqa      xmm6, xmm0
    punpcklbw   xmm0, xmm1
    punpckhbw   xmm6, xmm1
    movdqa      xmm7, xmm2
    punpcklbw   xmm2, xmm3
    punpckhbw   xmm7, xmm3
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm6
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm4, xmm2
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm5, xmm7
    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm4
    punpckldq   xmm0, xmm6
    punpckhdq   xmm1, xmm6
    punpckldq   xmm4, xmm5
    punpckhdq   xmm2, xmm5
    movdqa      xmm5, xmm0
    movdqa      xmm6, xmm1
    punpcklqdq  xmm0, xmm4
    punpckhqdq  xmm5, xmm4
    punpcklqdq  xmm1, xmm2
    punpckhqdq  xmm6, xmm2

    ; ---- scatter: successive dwords go to Cb 0..3, Cr 0..3, Cb 4..7, Cr 4..7 -----
    mov         esi, dword [esp+1Ch]
    mov         ecx, dword [ebp+10h]
    mov         edx, dword [esp+14h]
    mov         edi, dword [esp+8]
    movd        dword [esi], xmm0
    movd        dword [esi+ecx], xmm5
    movd        dword [esi+ecx*2], xmm1
    movd        dword [esi+edx], xmm6
    psrldq      xmm0, 4
    psrldq      xmm5, 4
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    mov         esi, dword [esp+18h]
    movd        dword [edi], xmm0
    movd        dword [edi+ecx], xmm5
    movd        dword [edi+ecx*2], xmm1
    movd        dword [edi+edx], xmm6
    psrldq      xmm0, 4
    psrldq      xmm5, 4
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    movd        dword [esi], xmm0
    movd        dword [esi+ecx], xmm5
    movd        dword [esi+ecx*2], xmm1
    movd        dword [esi+edx], xmm6
    psrldq      xmm0, 4
    psrldq      xmm5, 4
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    mov         edi, dword [esp+0Ch]
    movd        dword [edi], xmm0
    movd        dword [edi+ecx], xmm5
    movd        dword [edi+ecx*2], xmm1
    movd        dword [edi+edx], xmm6

    ; ---- epilogue ---------------------------------------------------------------
    pop         edi
    pop         esi
    mov         esp, ebp
    pop         ebp
    ret
|
|
|
|
;*******************************************************************************
;  void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                               int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
;  X86_32 variant, stack arguments addressed through ebp.
;
;  Reads 4 bytes starting at (ptr - 2) from 8 consecutive rows of each plane,
;  transposes them into four 16-byte column vectors (called p1, p0, q0, q1
;  below) held in a 40h-byte stack buffer, filters p0 and q0, transposes back
;  and writes 4 bytes per row to the same addresses.  Per 16-bit lane:
;        delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
;        p0 += delta,  q0 -= delta
;  applied only where |p0 - q0| < alpha, |p1 - p0| < beta, |q1 - q0| < beta
;  and tc > 0.  tc is built from pTC[0..3], each value used for two adjacent
;  lanes.
;
;  Stack slots after both pushes (esp is then 16-byte aligned):
;        [esp+08h] pPixCr - 2              [esp+0Ch] 3*stride
;        [esp+10h] pPixCr - 2 + 4*stride   [esp+14h] pPixCb - 2
;        [esp+18h] pPixCb - 2 + 4*stride   [esp+1Ch] address of the buffer
;        [esp+70h..0AFh] column buffer: p1, p0, q0, q1
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4H_ssse3

ALIGN 16

DeblockChromaLt4H_ssse3:
    push        ebp
    mov         ebp, esp
    and         esp, 0FFFFFFF0h
    sub         esp, 108h

    ; ---- compute and park the row pointers --------------------------------------
    mov         ecx, dword [ebp+8]      ; pPixCb
    mov         edx, dword [ebp+0Ch]    ; pPixCr
    mov         eax, dword [ebp+10h]    ; iStride
    sub         ecx, 2
    sub         edx, 2
    push        esi
    lea         esi, [eax+eax*2]        ; 3 * stride
    mov         dword [esp+10h], ecx    ; becomes [esp+14h] after push edi
    mov         dword [esp+4], edx      ; becomes [esp+08h] after push edi
    lea         ecx, [ecx+eax*4]
    lea         edx, [edx+eax*4]
    lea         eax, [esp+6Ch]          ; becomes esp+70h after push edi
    push        edi
    mov         dword [esp+0Ch], esi
    mov         dword [esp+18h], ecx
    mov         dword [esp+10h], edx
    mov         dword [esp+1Ch], eax

    ; ---- gather one dword per row -----------------------------------------------
    mov         esi, dword [esp+14h]    ; Cb rows 0..3
    mov         ecx, dword [ebp+10h]    ; stride
    mov         edx, dword [esp+0Ch]    ; 3 * stride
    movd        xmm0, dword [esi]
    movd        xmm1, dword [esi+ecx]
    movd        xmm2, dword [esi+ecx*2]
    movd        xmm3, dword [esi+edx]
    mov         esi, dword [esp+8]      ; Cr rows 0..3
    movd        xmm4, dword [esi]
    movd        xmm5, dword [esi+ecx]
    movd        xmm6, dword [esi+ecx*2]
    movd        xmm7, dword [esi+edx]
    punpckldq   xmm0, xmm4
    punpckldq   xmm1, xmm5
    punpckldq   xmm2, xmm6
    punpckldq   xmm3, xmm7
    mov         esi, dword [esp+18h]    ; Cb rows 4..7
    mov         edi, dword [esp+10h]    ; Cr rows 4..7
    movd        xmm4, dword [esi]
    movd        xmm5, dword [edi]
    punpckldq   xmm4, xmm5
    punpcklqdq  xmm0, xmm4              ; rows 0 / 4
    movd        xmm4, dword [esi+ecx]
    movd        xmm5, dword [edi+ecx]
    punpckldq   xmm4, xmm5
    punpcklqdq  xmm1, xmm4              ; rows 1 / 5
    movd        xmm4, dword [esi+ecx*2]
    movd        xmm5, dword [edi+ecx*2]
    punpckldq   xmm4, xmm5
    punpcklqdq  xmm2, xmm4              ; rows 2 / 6
    movd        xmm4, dword [esi+edx]
    movd        xmm5, dword [edi+edx]
    punpckldq   xmm4, xmm5
    punpcklqdq  xmm3, xmm4              ; rows 3 / 7

    ; ---- transpose rows into columns --------------------------------------------
    movdqa      xmm6, xmm0
    punpcklbw   xmm0, xmm1
    punpckhbw   xmm6, xmm1
    movdqa      xmm7, xmm2
    punpcklbw   xmm2, xmm3
    punpckhbw   xmm7, xmm3
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm6
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm4, xmm2
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm5, xmm7
    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm4
    punpckldq   xmm0, xmm6
    punpckhdq   xmm1, xmm6
    punpckldq   xmm4, xmm5
    punpckhdq   xmm2, xmm5
    movdqa      xmm5, xmm0
    movdqa      xmm6, xmm1
    punpcklqdq  xmm0, xmm4
    punpckhqdq  xmm5, xmm4
    punpcklqdq  xmm1, xmm2
    punpckhqdq  xmm6, xmm2
    mov         edi, dword [esp+1Ch]    ; column buffer
    movdqa      [edi], xmm0             ; p1
    movdqa      [edi+10h], xmm5         ; p0
    movdqa      [edi+20h], xmm1         ; q0
    movdqa      [edi+30h], xmm6         ; q1

    ; ---- build the tc vector from pTC[0..3] -------------------------------------
    mov         eax, dword [ebp+1Ch]    ; pTC
    movsx       cx, byte [eax+3]        ; pTC[3]
    movsx       dx, byte [eax+2]        ; pTC[2]
    movsx       si, byte [eax+1]        ; pTC[1]
    movsx       ax, byte [eax]          ; pTC[0]
    movzx       edi, cx
    movzx       ecx, cx
    movd        xmm2, ecx               ; t3
    movzx       ecx, dx
    movzx       edx, dx
    movd        xmm3, ecx               ; t2
    movd        xmm4, edx               ; t2
    movzx       ecx, si
    movzx       edx, si
    movd        xmm5, ecx               ; t1
    pxor        xmm0, xmm0
    movd        xmm6, edx               ; t1
    movzx       ecx, ax
    movdqa      [esp+60h], xmm0         ; zero vector kept in memory
    movzx       edx, ax
    movsx       eax, word [ebp+14h]     ; iAlpha (low 16 bits)
    punpcklwd   xmm6, xmm2
    movd        xmm1, edi               ; t3
    movd        xmm7, ecx               ; t0
    movsx       ecx, word [ebp+18h]     ; iBeta (low 16 bits)
    movd        xmm0, edx               ; t0
    punpcklwd   xmm7, xmm3
    punpcklwd   xmm5, xmm1
    movdqa      xmm1, [esp+60h]         ; xmm1 = 0
    punpcklwd   xmm7, xmm5
    movdqa      xmm5, [esp+0A0h]        ; q1 bytes
    punpcklwd   xmm0, xmm4
    punpcklwd   xmm0, xmm6
    movdqa      xmm6, [esp+70h]         ; p1 bytes
    punpcklwd   xmm0, xmm7              ; xmm0 = tc words: t0,t0,t1,t1,t2,t2,t3,t3
    movdqa      xmm7, [esp+80h]         ; p0 bytes
    movdqa      xmm2, xmm1
    psubw       xmm2, xmm0
    movdqa      [esp+0D0h], xmm2        ; -tc

    ; ---- broadcast alpha and beta -----------------------------------------------
    movd        xmm2, eax
    movdqa      xmm3, xmm2
    punpcklwd   xmm3, xmm2
    pshufd      xmm4, xmm3, 0           ; xmm4 = alpha in every word
    movd        xmm2, ecx
    movdqa      xmm3, xmm2
    punpcklwd   xmm3, xmm2
    pshufd      xmm2, xmm3, 0
    movdqa      xmm3, [esp+90h]         ; q0 bytes
    movdqa      [esp+50h], xmm2         ; beta in every word

    ; ---- widen the columns to words ---------------------------------------------
    movdqa      xmm2, xmm6
    punpcklbw   xmm2, xmm1
    punpckhbw   xmm6, xmm1
    movdqa      [esp+40h], xmm2         ; p1 low
    movdqa      [esp+0B0h], xmm6        ; p1 high
    movdqa      xmm6, [esp+90h]
    movdqa      xmm2, xmm7
    punpckhbw   xmm7, xmm1
    punpckhbw   xmm6, xmm1
    punpcklbw   xmm2, xmm1              ; xmm2 = p0 low
    punpcklbw   xmm3, xmm1              ; xmm3 = q0 low
    punpcklbw   xmm5, xmm1              ; xmm5 = q1 low
    movdqa      [esp+0F0h], xmm7        ; p0 high
    movdqa      [esp+0C0h], xmm6        ; q0 high
    movdqa      xmm6, [esp+0A0h]
    punpckhbw   xmm6, xmm1
    movdqa      [esp+0E0h], xmm6        ; q1 high

    ; ---- rounding constant 4 in every word --------------------------------------
    mov         edx, 4
    movsx       eax, dx
    movd        xmm6, eax
    movdqa      xmm7, xmm6
    punpcklwd   xmm7, xmm6
    pshufd      xmm6, xmm7, 0
    movdqa      [esp+30h], xmm6

    ; ---- low half: delta, clamp, mask -------------------------------------------
    movdqa      xmm7, [esp+40h]
    psubw       xmm7, xmm5              ; p1 - q1
    movdqa      xmm6, xmm0
    pcmpgtw     xmm6, xmm1              ; tc > 0
    movdqa      [esp+60h], xmm6         ; (tc > 0) mask
    movdqa      xmm1, [esp+0D0h]        ; -tc
    movdqa      xmm6, xmm3
    psubw       xmm6, xmm2              ; q0 - p0
    psllw       xmm6, 2
    paddw       xmm6, xmm7
    paddw       xmm6, [esp+30h]
    psraw       xmm6, 3
    pmaxsw      xmm1, xmm6              ; max(-tc, delta)
    movdqa      xmm7, [esp+50h]
    movdqa      [esp+20h], xmm0
    movdqa      xmm6, [esp+20h]
    pminsw      xmm6, xmm1              ; min(tc, ...)
    movdqa      [esp+20h], xmm6         ; clamped delta, low half
    movdqa      xmm6, xmm4
    movdqa      xmm1, xmm2
    psubw       xmm1, xmm3
    pabsw       xmm1, xmm1
    pcmpgtw     xmm6, xmm1              ; alpha > |p0 - q0|
    movdqa      xmm1, [esp+40h]
    psubw       xmm1, xmm2
    pabsw       xmm1, xmm1
    pcmpgtw     xmm7, xmm1              ; beta > |p1 - p0|
    movdqa      xmm1, [esp+50h]
    pand        xmm6, xmm7
    movdqa      xmm7, [esp+50h]
    psubw       xmm5, xmm3
    pabsw       xmm5, xmm5
    pcmpgtw     xmm1, xmm5              ; beta > |q1 - q0|
    movdqa      xmm5, [esp+0B0h]
    psubw       xmm5, [esp+0E0h]        ; high half: p1 - q1
    pand        xmm6, xmm1
    pand        xmm6, [esp+60h]
    movdqa      xmm1, [esp+20h]
    pand        xmm1, xmm6
    movdqa      xmm6, [esp+0C0h]
    movdqa      [esp+40h], xmm1         ; masked delta, low half

    ; ---- high half: delta, clamp, mask ------------------------------------------
    movdqa      xmm1, [esp+0F0h]        ; xmm1 = p0 high
    psubw       xmm6, xmm1              ; q0 - p0
    psllw       xmm6, 2
    paddw       xmm6, xmm5
    paddw       xmm6, [esp+30h]
    movdqa      xmm5, [esp+0D0h]
    psraw       xmm6, 3
    pmaxsw      xmm5, xmm6              ; max(-tc, delta)
    pminsw      xmm0, xmm5              ; min(tc, ...)
    movdqa      xmm5, [esp+0C0h]        ; xmm5 = q0 high
    movdqa      xmm6, xmm1
    psubw       xmm6, xmm5
    pabsw       xmm6, xmm6
    pcmpgtw     xmm4, xmm6              ; alpha > |p0 - q0|
    movdqa      xmm6, [esp+0B0h]
    psubw       xmm6, xmm1
    pabsw       xmm6, xmm6
    pcmpgtw     xmm7, xmm6              ; beta > |p1 - p0|
    movdqa      xmm6, [esp+0E0h]
    pand        xmm4, xmm7
    movdqa      xmm7, [esp+50h]
    psubw       xmm6, xmm5
    pabsw       xmm6, xmm6
    pcmpgtw     xmm7, xmm6              ; beta > |q1 - q0|
    pand        xmm4, xmm7
    pand        xmm4, [esp+60h]
    pand        xmm0, xmm4              ; masked delta, high half

    ; ---- apply, pack, write back into the column buffer -------------------------
    movdqa      xmm4, [esp+40h]
    paddw       xmm2, xmm4              ; p0 low  += delta
    paddw       xmm1, xmm0              ; p0 high += delta
    psubw       xmm3, xmm4              ; q0 low  -= delta
    psubw       xmm5, xmm0              ; q0 high -= delta
    packuswb    xmm2, xmm1
    packuswb    xmm3, xmm5
    movdqa      [esp+80h], xmm2         ; overwrite p0 in the column buffer
    movdqa      [esp+90h], xmm3         ; overwrite q0 in the column buffer

    ; ---- transpose the column buffer back into rows ------------------------------
    mov         esi, dword [esp+1Ch]
    movdqa      xmm0, [esi]
    movdqa      xmm1, [esi+10h]
    movdqa      xmm2, [esi+20h]
    movdqa      xmm3, [esi+30h]
    movdqa      xmm6, xmm0
    punpcklbw   xmm0, xmm1
    punpckhbw   xmm6, xmm1
    movdqa      xmm7, xmm2
    punpcklbw   xmm2, xmm3
    punpckhbw   xmm7, xmm3
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm6
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm4, xmm2
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm5, xmm7
    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm4
    punpckldq   xmm0, xmm6
    punpckhdq   xmm1, xmm6
    punpckldq   xmm4, xmm5
    punpckhdq   xmm2, xmm5
    movdqa      xmm5, xmm0
    movdqa      xmm6, xmm1
    punpcklqdq  xmm0, xmm4
    punpckhqdq  xmm5, xmm4
    punpcklqdq  xmm1, xmm2
    punpckhqdq  xmm6, xmm2

    ; ---- scatter: successive dwords go to Cb 0..3, Cr 0..3, Cb 4..7, Cr 4..7 -----
    mov         esi, dword [esp+14h]
    mov         ecx, dword [ebp+10h]
    mov         edx, dword [esp+0Ch]
    mov         edi, dword [esp+8]
    movd        dword [esi], xmm0
    movd        dword [esi+ecx], xmm5
    movd        dword [esi+ecx*2], xmm1
    movd        dword [esi+edx], xmm6
    psrldq      xmm0, 4
    psrldq      xmm5, 4
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    mov         esi, dword [esp+18h]
    movd        dword [edi], xmm0
    movd        dword [edi+ecx], xmm5
    movd        dword [edi+ecx*2], xmm1
    movd        dword [edi+edx], xmm6
    psrldq      xmm0, 4
    psrldq      xmm5, 4
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    movd        dword [esi], xmm0
    movd        dword [esi+ecx], xmm5
    movd        dword [esi+ecx*2], xmm1
    movd        dword [esi+edx], xmm6
    psrldq      xmm0, 4
    psrldq      xmm5, 4
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    mov         edi, dword [esp+10h]
    movd        dword [edi], xmm0
    movd        dword [edi+ecx], xmm5
    movd        dword [edi+ecx*2], xmm1
    movd        dword [edi+edx], xmm6

    ; ---- epilogue ---------------------------------------------------------------
    pop         edi
    pop         esi
    mov         esp, ebp
    pop         ebp
    ret
|
|
|
|
|
|
|
|
;*******************************************************************************
|
|
; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
|
|
; int32_t iBeta, int8_t * pTC)
|
|
;*******************************************************************************
|
|
|
|
|
|
WELS_EXTERN DeblockLumaLt4V_ssse3
|
|
|
|
ALIGN 16
|
|
|
|
DeblockLumaLt4V_ssse3:
|
|
push ebp
|
|
mov ebp, esp
|
|
and esp, -16 ; fffffff0H
|
|
sub esp, 420 ; 000001a4H
|
|
mov eax, dword [ebp+8]
|
|
mov ecx, dword [ebp+12]
|
|
|
|
pxor xmm0, xmm0
|
|
push ebx
|
|
mov edx, dword [ebp+24]
|
|
movdqa [esp+424-384], xmm0
|
|
push esi
|
|
|
|
lea esi, [ecx+ecx*2]
|
|
push edi
|
|
mov edi, eax
|
|
sub edi, esi
|
|
movdqa xmm0, [edi]
|
|
|
|
lea esi, [ecx+ecx]
|
|
movdqa [esp+432-208], xmm0
|
|
mov edi, eax
|
|
sub edi, esi
|
|
movdqa xmm0, [edi]
|
|
movdqa [esp+448-208], xmm0
|
|
|
|
mov ebx, eax
|
|
sub ebx, ecx
|
|
movdqa xmm0, [ebx]
|
|
movdqa [esp+464-208], xmm0
|
|
|
|
movdqa xmm0, [eax]
|
|
|
|
add ecx, eax
|
|
movdqa [esp+480-208], xmm0
|
|
movdqa xmm0, [ecx]
|
|
mov dword [esp+432-404], ecx
|
|
|
|
movsx ecx, word [ebp+16]
|
|
movdqa [esp+496-208], xmm0
|
|
movdqa xmm0, [esi+eax]
|
|
|
|
movsx si, byte [edx]
|
|
movdqa [esp+512-208], xmm0
|
|
movd xmm0, ecx
|
|
movsx ecx, word [ebp+20]
|
|
movdqa xmm1, xmm0
|
|
punpcklwd xmm1, xmm0
|
|
pshufd xmm0, xmm1, 0
|
|
movdqa [esp+432-112], xmm0
|
|
movd xmm0, ecx
|
|
movsx cx, byte [edx+1]
|
|
movdqa xmm1, xmm0
|
|
punpcklwd xmm1, xmm0
|
|
mov dword [esp+432-408], ebx
|
|
movzx ebx, cx
|
|
pshufd xmm0, xmm1, 0
|
|
movd xmm1, ebx
|
|
movzx ebx, cx
|
|
movd xmm2, ebx
|
|
movzx ebx, cx
|
|
movzx ecx, cx
|
|
movd xmm4, ecx
|
|
movzx ecx, si
|
|
movd xmm5, ecx
|
|
movzx ecx, si
|
|
movd xmm6, ecx
|
|
movzx ecx, si
|
|
movd xmm7, ecx
|
|
movzx ecx, si
|
|
movdqa [esp+432-336], xmm0
|
|
movd xmm0, ecx
|
|
|
|
movsx cx, byte [edx+3]
|
|
movsx dx, byte [edx+2]
|
|
movd xmm3, ebx
|
|
punpcklwd xmm0, xmm4
|
|
movzx esi, cx
|
|
punpcklwd xmm6, xmm2
|
|
punpcklwd xmm5, xmm1
|
|
punpcklwd xmm0, xmm6
|
|
punpcklwd xmm7, xmm3
|
|
punpcklwd xmm7, xmm5
|
|
punpcklwd xmm0, xmm7
|
|
movdqa [esp+432-400], xmm0
|
|
movd xmm0, esi
|
|
movzx esi, cx
|
|
movd xmm2, esi
|
|
movzx esi, cx
|
|
movzx ecx, cx
|
|
movd xmm4, ecx
|
|
movzx ecx, dx
|
|
movd xmm3, esi
|
|
movd xmm5, ecx
|
|
punpcklwd xmm5, xmm0
|
|
|
|
movdqa xmm0, [esp+432-384]
|
|
movzx ecx, dx
|
|
movd xmm6, ecx
|
|
movzx ecx, dx
|
|
movzx edx, dx
|
|
punpcklwd xmm6, xmm2
|
|
movd xmm7, ecx
|
|
movd xmm1, edx
|
|
|
|
movdqa xmm2, [esp+448-208]
|
|
punpcklbw xmm2, xmm0
|
|
|
|
mov ecx, 4
|
|
movsx edx, cx
|
|
punpcklwd xmm7, xmm3
|
|
punpcklwd xmm7, xmm5
|
|
movdqa xmm5, [esp+496-208]
|
|
movdqa xmm3, [esp+464-208]
|
|
punpcklbw xmm5, xmm0
|
|
movdqa [esp+432-240], xmm5
|
|
movdqa xmm5, [esp+512-208]
|
|
punpcklbw xmm5, xmm0
|
|
movdqa [esp+432-352], xmm5
|
|
punpcklwd xmm1, xmm4
|
|
movdqa xmm4, [esp+432-208]
|
|
punpcklwd xmm1, xmm6
|
|
movdqa xmm6, [esp+480-208]
|
|
punpcklwd xmm1, xmm7
|
|
punpcklbw xmm6, xmm0
|
|
punpcklbw xmm3, xmm0
|
|
punpcklbw xmm4, xmm0
|
|
movdqa xmm7, xmm3
|
|
psubw xmm7, xmm4
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+432-272], xmm4
|
|
movdqa xmm4, [esp+432-336]
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-288], xmm5
|
|
movdqa xmm7, xmm6
|
|
psubw xmm7, [esp+432-352]
|
|
pabsw xmm7, xmm7
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-256], xmm5
|
|
movdqa xmm5, xmm3
|
|
pavgw xmm5, xmm6
|
|
movdqa [esp+432-304], xmm5
|
|
movdqa xmm5, [esp+432-400]
|
|
psubw xmm5, [esp+432-288]
|
|
psubw xmm5, [esp+432-256]
|
|
movdqa [esp+432-224], xmm5
|
|
movdqa xmm5, xmm6
|
|
psubw xmm5, xmm3
|
|
movdqa [esp+432-32], xmm6
|
|
psubw xmm6, [esp+432-240]
|
|
movdqa xmm7, xmm5
|
|
movdqa [esp+432-384], xmm5
|
|
movdqa xmm5, [esp+432-112]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm5, xmm7
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm4
|
|
pcmpgtw xmm7, xmm6
|
|
|
|
pand xmm5, xmm7
|
|
movdqa xmm6, xmm3
|
|
psubw xmm6, xmm2
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm4
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-400]
|
|
pand xmm5, xmm7
|
|
movdqa xmm7, xmm6
|
|
pcmpeqw xmm6, xmm0
|
|
pcmpgtw xmm7, xmm0
|
|
por xmm7, xmm6
|
|
pand xmm5, xmm7
|
|
movdqa [esp+432-320], xmm5
|
|
movd xmm5, edx
|
|
movdqa xmm6, xmm5
|
|
punpcklwd xmm6, xmm5
|
|
pshufd xmm5, xmm6, 0
|
|
movdqa [esp+432-336], xmm5
|
|
movdqa xmm5, [esp+432-224]
|
|
movdqa [esp+432-368], xmm5
|
|
movdqa xmm6, xmm0
|
|
psubw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-384]
|
|
psllw xmm5, 2
|
|
movdqa xmm7, xmm2
|
|
psubw xmm7, [esp+432-240]
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+432-336]
|
|
movdqa xmm5, [esp+432-368]
|
|
psraw xmm7, 3
|
|
pmaxsw xmm6, xmm7
|
|
pminsw xmm5, xmm6
|
|
|
|
pand xmm5, [esp+432-320]
|
|
movdqa xmm6, [esp+432-400]
|
|
movdqa [esp+432-64], xmm5
|
|
movdqa [esp+432-384], xmm6
|
|
movdqa xmm5, xmm0
|
|
psubw xmm5, xmm6
|
|
movdqa [esp+432-368], xmm5
|
|
movdqa xmm6, xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
paddw xmm5, [esp+432-304]
|
|
movdqa xmm7, xmm2
|
|
paddw xmm7, xmm2
|
|
psubw xmm5, xmm7
|
|
psraw xmm5, 1
|
|
pmaxsw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-384]
|
|
pminsw xmm5, xmm6
|
|
|
|
pand xmm5, [esp+432-320]
|
|
pand xmm5, [esp+432-288]
|
|
movdqa xmm6, [esp+432-240]
|
|
movdqa [esp+432-96], xmm5
|
|
movdqa xmm5, [esp+432-352]
|
|
paddw xmm5, [esp+432-304]
|
|
movdqa xmm7, xmm6
|
|
paddw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-368]
|
|
psubw xmm5, xmm7
|
|
|
|
movdqa xmm7, [esp+496-208]
|
|
psraw xmm5, 1
|
|
pmaxsw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-400]
|
|
pminsw xmm5, xmm6
|
|
pand xmm5, [esp+432-320]
|
|
pand xmm5, [esp+432-256]
|
|
movdqa xmm6, [esp+448-208]
|
|
punpckhbw xmm7, xmm0
|
|
movdqa [esp+432-352], xmm7
|
|
|
|
movdqa xmm7, [esp+512-208]
|
|
punpckhbw xmm6, xmm0
|
|
movdqa [esp+432-48], xmm5
|
|
movdqa xmm5, [esp+432-208]
|
|
movdqa [esp+432-368], xmm6
|
|
movdqa xmm6, [esp+464-208]
|
|
punpckhbw xmm7, xmm0
|
|
punpckhbw xmm5, xmm0
|
|
movdqa [esp+432-384], xmm7
|
|
punpckhbw xmm6, xmm0
|
|
movdqa [esp+432-400], xmm6
|
|
|
|
movdqa xmm7, [esp+432-400]
|
|
movdqa xmm6, [esp+480-208]
|
|
psubw xmm7, xmm5
|
|
movdqa [esp+432-16], xmm5
|
|
pabsw xmm7, xmm7
|
|
punpckhbw xmm6, xmm0
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-288], xmm5
|
|
|
|
movdqa xmm7, xmm6
|
|
psubw xmm7, [esp+432-384]
|
|
pabsw xmm7, xmm7
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-256], xmm5
|
|
|
|
movdqa xmm5, [esp+432-400]
|
|
movdqa [esp+432-80], xmm6
|
|
pavgw xmm5, xmm6
|
|
movdqa [esp+432-304], xmm5
|
|
|
|
movdqa xmm5, xmm1
|
|
psubw xmm5, [esp+432-288]
|
|
psubw xmm5, [esp+432-256]
|
|
movdqa [esp+432-224], xmm5
|
|
movdqa xmm5, xmm6
|
|
psubw xmm5, [esp+432-400]
|
|
psubw xmm6, [esp+432-352]
|
|
movdqa [esp+432-272], xmm5
|
|
movdqa xmm7, xmm5
|
|
movdqa xmm5, [esp+432-112]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa xmm7, xmm4
|
|
pabsw xmm6, xmm6
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-368]
|
|
|
|
pand xmm5, xmm7
|
|
movdqa xmm7, [esp+432-400]
|
|
psubw xmm7, xmm6
|
|
psubw xmm6, [esp+432-352]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
pand xmm5, xmm4
|
|
|
|
paddw xmm2, [esp+432-96]
|
|
movdqa xmm4, xmm1
|
|
pcmpgtw xmm4, xmm0
|
|
movdqa xmm7, xmm1
|
|
pcmpeqw xmm7, xmm0
|
|
por xmm4, xmm7
|
|
pand xmm5, xmm4
|
|
movdqa xmm4, [esp+432-224]
|
|
movdqa [esp+432-320], xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
movdqa xmm7, xmm0
|
|
psubw xmm7, xmm4
|
|
psubw xmm0, xmm1
|
|
psllw xmm5, 2
|
|
paddw xmm6, xmm5
|
|
paddw xmm6, [esp+432-336]
|
|
movdqa xmm5, [esp+432-368]
|
|
movdqa [esp+432-336], xmm0
|
|
psraw xmm6, 3
|
|
pmaxsw xmm7, xmm6
|
|
pminsw xmm4, xmm7
|
|
pand xmm4, [esp+432-320]
|
|
movdqa xmm6, xmm0
|
|
movdqa xmm0, [esp+432-16]
|
|
paddw xmm0, [esp+432-304]
|
|
movdqa [esp+432-272], xmm4
|
|
movdqa xmm4, [esp+432-368]
|
|
paddw xmm4, xmm4
|
|
psubw xmm0, xmm4
|
|
|
|
movdqa xmm4, [esp+432-64]
|
|
psraw xmm0, 1
|
|
pmaxsw xmm6, xmm0
|
|
movdqa xmm0, [esp+432-400]
|
|
movdqa xmm7, xmm1
|
|
pminsw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-320]
|
|
pand xmm7, xmm6
|
|
pand xmm7, [esp+432-288]
|
|
paddw xmm5, xmm7
|
|
packuswb xmm2, xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
paddw xmm0, xmm5
|
|
paddw xmm3, xmm4
|
|
packuswb xmm3, xmm0
|
|
|
|
movdqa xmm0, [esp+432-32]
|
|
psubw xmm0, xmm4
|
|
movdqa xmm4, [esp+432-80]
|
|
psubw xmm4, xmm5
|
|
|
|
movdqa xmm5, [esp+432-240]
|
|
paddw xmm5, [esp+432-48]
|
|
packuswb xmm0, xmm4
|
|
movdqa xmm4, [esp+432-384]
|
|
paddw xmm4, [esp+432-304]
|
|
movdqa [esp+480-208], xmm0
|
|
movdqa xmm0, [esp+432-352]
|
|
movdqa xmm7, xmm0
|
|
paddw xmm0, xmm0
|
|
|
|
mov ecx, dword [esp+432-408]
|
|
|
|
mov edx, dword [esp+432-404]
|
|
psubw xmm4, xmm0
|
|
movdqa xmm0, [esp+432-336]
|
|
movdqa [edi], xmm2
|
|
psraw xmm4, 1
|
|
pmaxsw xmm0, xmm4
|
|
pminsw xmm1, xmm0
|
|
movdqa xmm0, [esp+480-208]
|
|
|
|
pop edi
|
|
pand xmm1, xmm6
|
|
pand xmm1, [esp+428-256]
|
|
movdqa [ecx], xmm3
|
|
paddw xmm7, xmm1
|
|
pop esi
|
|
packuswb xmm5, xmm7
|
|
movdqa [eax], xmm0
|
|
movdqa [edx], xmm5
|
|
pop ebx
|
|
mov esp, ebp
|
|
pop ebp
|
|
ret
|
|
|
|
|
|
;*******************************************************************************
|
|
; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
|
|
; int32_t iBeta)
|
|
;
; 32-bit stack-argument entry point: pPix = [ebp+8], iStride = [ebp+12],
; iAlpha = [ebp+16], iBeta = [ebp+20]. Only the low 16 bits of iAlpha and
; iBeta are read (movsx ..., word).
;
; Filters 16 pixel columns across the edge between row pPix - iStride and row
; pPix. Eight 16-byte rows are read and six are written back, all with movdqa,
; so every row address has to be 16-byte aligned.
; Row names used in the comments below (labels of these comments only):
;   p3 = pPix - 4*iStride, p2 = pPix - 3*iStride, p1 = pPix - 2*iStride,
;   p0 = pPix - iStride,   q0 = pPix,             q1 = pPix + iStride,
;   q2 = pPix + 2*iStride, q3 = pPix + 3*iStride
;
; Per 16-bit lane, as computed by the code:
;   enable = alpha > |p0-q0|  and  beta > |p1-p0|  and  beta > |q1-q0|
;   small  = ((alpha >> 2) + 2) > |p0-q0|
;   ap     = small and beta > |p2-p0|
;   aq     = small and beta > |q2-q0|
;   ap set:    p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;              p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;              p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;   ap clear:  p0' = (2*p1 + p0 + q1 + 2) >> 2, p1 and p2 unchanged
;   The q side mirrors the p side with aq, q0..q3 and p0/p1 swapped in.
;   Lanes with enable clear keep their original value.
; NOTE(review): these formulas look like the H.264 strong (bS == 4) luma
; filter - confirm against the C reference implementation.
;
; The 16 columns are processed as two halves of 8 words (punpcklbw for the low
; half, punpckhbw for the high half) and re-packed with packuswb.
;
; Stack slots (all relative to esp after the three pushes):
;   [esp+656-272] .. [esp+752-272]  raw byte rows p2, p1, p0, q0, q1, q2, q3;
;                                   the first four are reused for the outputs
;   [esp+640-320] alpha   [esp+640-512] beta   [esp+640-576] (alpha >> 2) + 2
;   [esp+640-624] word 2  [esp+640-592] word 4 [esp+640-48]  zero
;   [esp+640-544] ap mask [esp+640-560] |p0-q0|, then small, then aq mask
;   [esp+640-600] 2*iStride                    [esp+640-596] address of q1
;
; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and xmm0-xmm7
; are overwritten.
;*******************************************************************************
|
|
|
|
WELS_EXTERN DeblockLumaEq4V_ssse3
|
|
|
|
ALIGN 16
|
|
|
|
DeblockLumaEq4V_ssse3:
|
|
|
|
; Frame: esp is rounded down to 16 and 628 bytes are reserved; together with
; the 12 bytes of ebx/esi/edi pushes that is 640 bytes, so the 16-byte slots
; addressed as [esp+640-N] / [esp+...-272] below are 16-byte aligned.
push ebp
|
|
mov ebp, esp
|
|
and esp, -16 ; fffffff0H
|
|
sub esp, 628 ; 00000274H
|
|
; eax = pPix, ecx = iStride
mov eax, dword [ebp+8]
|
|
mov ecx, dword [ebp+12]
|
|
push ebx
|
|
push esi
|
|
|
|
; edx = 4*iStride; xmm2 = 0, used as the zero operand of the unpacks
lea edx, [ecx*4]
|
|
pxor xmm0, xmm0
|
|
movdqa xmm2, xmm0
|
|
|
|
; Load the rows: xmm0 = q1, xmm3 = p3, xmm5 = q0, xmm1 = p1, xmm4 = p0, then
; q2, q3 and xmm6 = p2. Addresses kept for the final stores: edx -> p2,
; esi -> p1, edi -> p0, eax -> q0, [esp+640-596] -> q1, ecx -> q2.
movdqa xmm0, [ecx+eax]
|
|
mov esi, eax
|
|
sub esi, edx
|
|
movdqa xmm3, [esi]
|
|
movdqa xmm5, [eax]
|
|
push edi
|
|
lea edi, [ecx+ecx]
|
|
lea ebx, [ecx+ecx*2]
|
|
mov dword [esp+640-600], edi
|
|
mov esi, eax
|
|
sub esi, edi
|
|
movdqa xmm1, [esi]
|
|
movdqa [esp+720-272], xmm0
|
|
mov edi, eax
|
|
sub edi, ecx
|
|
movdqa xmm4, [edi]
|
|
add ecx, eax
|
|
mov dword [esp+640-596], ecx
|
|
|
|
mov ecx, dword [esp+640-600]
|
|
movdqa xmm0, [ecx+eax]
|
|
movdqa [esp+736-272], xmm0
|
|
|
|
movdqa xmm0, [eax+ebx]
|
|
mov edx, eax
|
|
sub edx, ebx
|
|
|
|
; Broadcast the low word of iAlpha to 8 words -> [esp+640-320], then the low
; word of iBeta -> xmm0 (stored to [esp+640-512] a few lines further down).
movsx ebx, word [ebp+16]
|
|
movdqa xmm6, [edx]
|
|
add ecx, eax
|
|
movdqa [esp+752-272], xmm0
|
|
movd xmm0, ebx
|
|
|
|
movsx ebx, word [ebp+20]
|
|
movdqa xmm7, xmm0
|
|
punpcklwd xmm7, xmm0
|
|
pshufd xmm0, xmm7, 0
|
|
movdqa [esp+640-320], xmm0
|
|
movd xmm0, ebx
|
|
movdqa xmm7, xmm0
|
|
punpcklwd xmm7, xmm0
|
|
pshufd xmm0, xmm7, 0
|
|
|
|
; Low 8 columns: widen bytes to words (q2 -> [esp+640-416], q0 -> xmm5,
; p0 -> xmm1, p1 -> [esp+640-480], q1 -> [esp+640-384], p2 -> xmm6 and
; [esp+640-368]) while parking the raw byte rows for the high half.
movdqa xmm7, [esp+736-272]
|
|
punpcklbw xmm7, xmm2
|
|
movdqa [esp+640-416], xmm7
|
|
movdqa [esp+640-512], xmm0
|
|
movdqa xmm0, xmm1
|
|
movdqa [esp+672-272], xmm1
|
|
movdqa xmm1, xmm4
|
|
movdqa [esp+704-272], xmm5
|
|
punpcklbw xmm5, xmm2
|
|
punpcklbw xmm1, xmm2
|
|
|
|
; [esp+640-560] = |q0 - p0|
movdqa xmm7, xmm5
|
|
psubw xmm7, xmm1
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+640-560], xmm7
|
|
punpcklbw xmm0, xmm2
|
|
movdqa [esp+688-272], xmm4
|
|
movdqa xmm4, [esp+720-272]
|
|
movdqa [esp+640-480], xmm0
|
|
|
|
; enable (low half) is accumulated in xmm0:
; beta > |p0-p1|, beta > |q0-q1|, alpha > |q0-p0|
movdqa xmm7, xmm1
|
|
psubw xmm7, xmm0
|
|
|
|
movdqa xmm0, [esp+640-512]
|
|
pabsw xmm7, xmm7
|
|
punpcklbw xmm4, xmm2
|
|
pcmpgtw xmm0, xmm7
|
|
movdqa [esp+640-384], xmm4
|
|
movdqa xmm7, xmm5
|
|
psubw xmm7, xmm4
|
|
movdqa xmm4, [esp+640-512]
|
|
movdqa [esp+656-272], xmm6
|
|
punpcklbw xmm6, xmm2
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+640-48], xmm2
|
|
movdqa [esp+640-368], xmm6
|
|
movdqa [esp+640-144], xmm1
|
|
movdqa [esp+640-400], xmm5
|
|
pcmpgtw xmm4, xmm7
|
|
pand xmm0, xmm4
|
|
movdqa xmm4, [esp+640-320]
|
|
pcmpgtw xmm4, [esp+640-560]
|
|
pand xmm0, xmm4
|
|
|
|
; Word constant 2 -> [esp+640-624]; (alpha >> 2) + 2 -> [esp+640-576];
; [esp+640-560] is overwritten with the mask ((alpha >> 2) + 2) > |q0-p0|.
mov ebx, 2
|
|
movsx ebx, bx
|
|
movd xmm4, ebx
|
|
movdqa xmm7, xmm4
|
|
punpcklwd xmm7, xmm4
|
|
movdqa xmm4, [esp+640-320]
|
|
psraw xmm4, 2
|
|
pshufd xmm7, xmm7, 0
|
|
paddw xmm4, xmm7
|
|
movdqa [esp+640-576], xmm4
|
|
pcmpgtw xmm4, [esp+640-560]
|
|
movdqa [esp+640-560], xmm4
|
|
|
|
; ap (low half) = beta > |p0-p2|, ANDed with the mask above -> [esp+640-544]
movdqa xmm4, [esp+640-512]
|
|
movdqa [esp+640-624], xmm7
|
|
movdqa xmm7, xmm1
|
|
psubw xmm7, xmm6
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
|
|
pand xmm4, [esp+640-560]
|
|
movdqa [esp+640-544], xmm4
|
|
; aq (low half) = beta > |q0-q2|, ANDed with the same mask -> [esp+640-560]
movdqa xmm4, [esp+640-512]
|
|
movdqa xmm7, xmm5
|
|
psubw xmm7, [esp+640-416]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
|
|
pand xmm4, [esp+640-560]
|
|
movdqa [esp+640-560], xmm4
|
|
|
|
; [esp+640-16] = p2 where ap is clear. Then xmm4 accumulates
; 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted by 3 and masked with ap further
; down); the word constant 4 is stored to [esp+640-592] on the way.
movdqa xmm4, [esp+640-544]
|
|
pandn xmm4, xmm6
|
|
movdqa [esp+640-16], xmm4
|
|
mov ebx, 4
|
|
movsx ebx, bx
|
|
movd xmm4, ebx
|
|
movdqa xmm7, xmm4
|
|
punpcklwd xmm7, xmm4
|
|
movdqa xmm4, xmm3
|
|
punpcklbw xmm4, xmm2
|
|
psllw xmm4, 1
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, [esp+640-480]
|
|
|
|
; Interleaved with the end of that sum:
; [esp+640-80]  = q2 where aq is clear
; [esp+640-112] = aq & ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3)
movdqa xmm6, [esp+640-560]
|
|
pshufd xmm7, xmm7, 0
|
|
paddw xmm4, xmm1
|
|
movdqa [esp+640-592], xmm7
|
|
paddw xmm4, xmm5
|
|
paddw xmm4, xmm7
|
|
movdqa xmm7, [esp+640-416]
|
|
pandn xmm6, xmm7
|
|
movdqa [esp+640-80], xmm6
|
|
movdqa xmm6, [esp+752-272]
|
|
punpcklbw xmm6, xmm2
|
|
psllw xmm6, 1
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-384]
|
|
|
|
movdqa xmm7, [esp+640-480]
|
|
paddw xmm6, xmm5
|
|
paddw xmm6, xmm1
|
|
paddw xmm6, [esp+640-592]
|
|
psraw xmm6, 3
|
|
pand xmm6, [esp+640-560]
|
|
movdqa [esp+640-112], xmm6
|
|
; [esp+640-336] = p1 where ap is clear
; [esp+640-64]  = ap & ((p2 + p1 + p0 + q0 + 2) >> 2)
; (xmm4 is also finished here: ap & (p2 sum >> 3))
movdqa xmm6, [esp+640-544]
|
|
pandn xmm6, xmm7
|
|
movdqa [esp+640-336], xmm6
|
|
movdqa xmm6, [esp+640-544]
|
|
movdqa [esp+640-528], xmm6
|
|
movdqa xmm6, [esp+640-368]
|
|
paddw xmm6, xmm7
|
|
movdqa xmm7, xmm1
|
|
psraw xmm4, 3
|
|
pand xmm4, [esp+640-544]
|
|
paddw xmm7, xmm5
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-624]
|
|
movdqa xmm7, [esp+640-528]
|
|
|
|
paddw xmm5, xmm1
|
|
psraw xmm6, 2
|
|
pand xmm7, xmm6
|
|
|
|
; [esp+640-304] = q1 where aq is clear
; [esp+640-32]  = aq & ((q2 + q1 + q0 + p0 + 2) >> 2)
movdqa xmm6, [esp+640-384]
|
|
movdqa [esp+640-64], xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
pandn xmm7, xmm6
|
|
movdqa [esp+640-304], xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
movdqa [esp+640-528], xmm7
|
|
movdqa xmm7, [esp+640-416]
|
|
paddw xmm7, xmm6
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+640-624]
|
|
movdqa xmm5, [esp+640-528]
|
|
psraw xmm7, 2
|
|
pand xmm5, xmm7
|
|
movdqa [esp+640-32], xmm5
|
|
|
|
; p0, low half:
; [esp+640-352] = ~ap & ((2*p1 + p0 + q1 + 2) >> 2)
; [esp+640-96]  =  ap & ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
movdqa xmm5, [esp+640-544]
|
|
movdqa [esp+640-528], xmm5
|
|
movdqa xmm5, [esp+640-480]
|
|
movdqa xmm7, xmm5
|
|
paddw xmm7, xmm5
|
|
movdqa xmm5, xmm1
|
|
paddw xmm5, xmm6
|
|
paddw xmm6, [esp+640-592]
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+640-624]
|
|
movdqa xmm5, [esp+640-528]
|
|
psraw xmm7, 2
|
|
pandn xmm5, xmm7
|
|
movdqa xmm7, [esp+640-480]
|
|
paddw xmm7, xmm1
|
|
paddw xmm7, [esp+640-400]
|
|
movdqa xmm1, [esp+640-544]
|
|
movdqa [esp+640-352], xmm5
|
|
movdqa xmm5, [esp+640-368]
|
|
psllw xmm7, 1
|
|
paddw xmm7, xmm6
|
|
paddw xmm5, xmm7
|
|
|
|
movdqa xmm7, [esp+640-400]
|
|
psraw xmm5, 3
|
|
pand xmm1, xmm5
|
|
movdqa xmm5, [esp+640-480]
|
|
movdqa [esp+640-96], xmm1
|
|
; q0, low half:
; [esp+640-288] = ~aq & ((2*q1 + q0 + p1 + 2) >> 2)
; [esp+640-128] =  aq & ((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3)
movdqa xmm1, [esp+640-560]
|
|
movdqa [esp+640-528], xmm1
|
|
movdqa xmm1, [esp+640-384]
|
|
movdqa xmm6, xmm1
|
|
paddw xmm6, xmm1
|
|
paddw xmm1, [esp+640-400]
|
|
paddw xmm1, [esp+640-144]
|
|
paddw xmm7, xmm5
|
|
paddw xmm5, [esp+640-592]
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-624]
|
|
movdqa xmm7, [esp+640-528]
|
|
psraw xmm6, 2
|
|
psllw xmm1, 1
|
|
paddw xmm1, xmm5
|
|
|
|
movdqa xmm5, [esp+656-272]
|
|
pandn xmm7, xmm6
|
|
movdqa xmm6, [esp+640-416]
|
|
paddw xmm6, xmm1
|
|
movdqa xmm1, [esp+640-560]
|
|
psraw xmm6, 3
|
|
pand xmm1, xmm6
|
|
|
|
; High 8 columns: widen the saved byte rows with punpckhbw.
; p2 stays in xmm5, p1 -> [esp+640-448], p0 -> [esp+640-496] (and xmm1),
; q0 -> [esp+640-432] (and xmm6), q1 -> [esp+640-464], q2 -> [esp+640-528].
movdqa xmm6, [esp+704-272]
|
|
movdqa [esp+640-128], xmm1
|
|
movdqa xmm1, [esp+672-272]
|
|
punpckhbw xmm1, xmm2
|
|
movdqa [esp+640-448], xmm1
|
|
movdqa xmm1, [esp+688-272]
|
|
punpckhbw xmm1, xmm2
|
|
punpckhbw xmm6, xmm2
|
|
movdqa [esp+640-288], xmm7
|
|
punpckhbw xmm5, xmm2
|
|
movdqa [esp+640-496], xmm1
|
|
movdqa [esp+640-432], xmm6
|
|
|
|
movdqa xmm7, [esp+720-272]
|
|
punpckhbw xmm7, xmm2
|
|
movdqa [esp+640-464], xmm7
|
|
|
|
movdqa xmm7, [esp+736-272]
|
|
punpckhbw xmm7, xmm2
|
|
movdqa [esp+640-528], xmm7
|
|
|
|
; Recompute the three masks for the high half: enable ends up in xmm1,
; ap in [esp+640-544] and aq in [esp+640-560] (the low-half masks in those
; two slots are overwritten; the low-half enable mask is still in xmm0).
; The "por xmm4, [esp+640-16]" below completes the low-half p2 candidate:
; (ap & p2') | (~ap & p2).
movdqa xmm7, xmm6
|
|
|
|
psubw xmm6, [esp+640-464]
|
|
psubw xmm7, xmm1
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+640-560], xmm7
|
|
por xmm4, [esp+640-16]
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm1
|
|
psubw xmm7, [esp+640-448]
|
|
|
|
movdqa xmm1, [esp+640-512]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm1, xmm7
|
|
movdqa xmm7, [esp+640-512]
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+640-320]
|
|
pand xmm1, xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
pcmpgtw xmm6, xmm7
|
|
pand xmm1, xmm6
|
|
|
|
movdqa xmm6, [esp+640-576]
|
|
pcmpgtw xmm6, xmm7
|
|
|
|
; xmm3 (p3 bytes) is widened to its high half here, the last use of xmm2 as
; zero; after this xmm2 is reloaded with the ap mask. The zero vector saved
; at [esp+640-48] serves the final q3 unpack.
movdqa xmm7, [esp+640-496]
|
|
punpckhbw xmm3, xmm2
|
|
movdqa [esp+640-560], xmm6
|
|
movdqa xmm6, [esp+640-512]
|
|
psubw xmm7, xmm5
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm6, xmm7
|
|
|
|
pand xmm6, [esp+640-560]
|
|
movdqa xmm7, [esp+640-432]
|
|
psubw xmm7, [esp+640-528]
|
|
|
|
psllw xmm3, 1
|
|
movdqa [esp+640-544], xmm6
|
|
movdqa xmm6, [esp+640-512]
|
|
|
|
movdqa xmm2, [esp+640-544]
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, [esp+640-448]
|
|
paddw xmm3, [esp+640-496]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm6, xmm7
|
|
pand xmm6, [esp+640-560]
|
|
movdqa [esp+640-560], xmm6
|
|
|
|
; p2 output: for each half, (enable & candidate) | (~enable & original).
; Low half in xmm6, high half in xmm7, packed to bytes -> [esp+656-272].
; The low-half p1 result is blended along the way and parked in [esp+640-336].
movdqa xmm6, xmm0
|
|
pand xmm6, xmm4
|
|
movdqa xmm4, xmm0
|
|
pandn xmm4, [esp+640-368]
|
|
por xmm6, xmm4
|
|
movdqa xmm4, [esp+640-432]
|
|
paddw xmm3, xmm4
|
|
paddw xmm3, [esp+640-592]
|
|
psraw xmm3, 3
|
|
pand xmm3, xmm2
|
|
pandn xmm2, xmm5
|
|
por xmm3, xmm2
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm3
|
|
movdqa xmm3, [esp+640-64]
|
|
por xmm3, [esp+640-336]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm5
|
|
por xmm7, xmm2
|
|
|
|
movdqa xmm2, xmm0
|
|
pand xmm2, xmm3
|
|
movdqa xmm3, xmm0
|
|
pandn xmm3, [esp+640-480]
|
|
por xmm2, xmm3
|
|
packuswb xmm6, xmm7
|
|
movdqa [esp+640-336], xmm2
|
|
movdqa [esp+656-272], xmm6
|
|
; p1 output: high half computed and blended here, packed with the low half
; from [esp+640-336] -> [esp+672-272]
movdqa xmm6, [esp+640-544]
|
|
movdqa xmm2, xmm5
|
|
paddw xmm2, [esp+640-448]
|
|
movdqa xmm3, xmm1
|
|
movdqa xmm7, [esp+640-496]
|
|
paddw xmm7, xmm4
|
|
paddw xmm2, xmm7
|
|
paddw xmm2, [esp+640-624]
|
|
movdqa xmm7, [esp+640-544]
|
|
psraw xmm2, 2
|
|
pand xmm6, xmm2
|
|
movdqa xmm2, [esp+640-448]
|
|
pandn xmm7, xmm2
|
|
por xmm6, xmm7
|
|
pand xmm3, xmm6
|
|
movdqa xmm6, xmm1
|
|
pandn xmm6, xmm2
|
|
paddw xmm2, [esp+640-496]
|
|
paddw xmm2, xmm4
|
|
por xmm3, xmm6
|
|
movdqa xmm6, [esp+640-336]
|
|
packuswb xmm6, xmm3
|
|
psllw xmm2, 1
|
|
movdqa [esp+672-272], xmm6
|
|
; p0 output: low half blended into [esp+640-352], high half computed and
; blended in xmm7, packed -> [esp+688-272]
movdqa xmm6, [esp+640-96]
|
|
por xmm6, [esp+640-352]
|
|
|
|
movdqa xmm3, xmm0
|
|
pand xmm3, xmm6
|
|
movdqa xmm6, xmm0
|
|
pandn xmm6, [esp+640-144]
|
|
por xmm3, xmm6
|
|
movdqa xmm6, [esp+640-544]
|
|
movdqa [esp+640-352], xmm3
|
|
movdqa xmm3, [esp+640-464]
|
|
paddw xmm3, [esp+640-592]
|
|
paddw xmm2, xmm3
|
|
movdqa xmm3, [esp+640-448]
|
|
paddw xmm5, xmm2
|
|
movdqa xmm2, [esp+640-496]
|
|
psraw xmm5, 3
|
|
pand xmm6, xmm5
|
|
movdqa xmm5, [esp+640-464]
|
|
paddw xmm2, xmm5
|
|
paddw xmm5, [esp+640-432]
|
|
movdqa xmm4, xmm3
|
|
paddw xmm4, xmm3
|
|
paddw xmm4, xmm2
|
|
paddw xmm4, [esp+640-624]
|
|
movdqa xmm2, [esp+640-544]
|
|
paddw xmm3, [esp+640-592]
|
|
psraw xmm4, 2
|
|
pandn xmm2, xmm4
|
|
por xmm6, xmm2
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm6
|
|
movdqa xmm6, [esp+640-496]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm6
|
|
por xmm7, xmm2
|
|
movdqa xmm2, [esp+640-352]
|
|
packuswb xmm2, xmm7
|
|
movdqa [esp+688-272], xmm2
|
|
; q0 output: low half blended into [esp+640-288], high half computed and
; blended in xmm7, packed -> [esp+704-272]
movdqa xmm2, [esp+640-128]
|
|
por xmm2, [esp+640-288]
|
|
|
|
movdqa xmm4, xmm0
|
|
pand xmm4, xmm2
|
|
paddw xmm5, xmm6
|
|
movdqa xmm2, xmm0
|
|
pandn xmm2, [esp+640-400]
|
|
por xmm4, xmm2
|
|
movdqa xmm2, [esp+640-528]
|
|
psllw xmm5, 1
|
|
paddw xmm5, xmm3
|
|
movdqa xmm3, [esp+640-560]
|
|
paddw xmm2, xmm5
|
|
psraw xmm2, 3
|
|
movdqa [esp+640-288], xmm4
|
|
movdqa xmm4, [esp+640-560]
|
|
pand xmm4, xmm2
|
|
movdqa xmm2, [esp+640-464]
|
|
movdqa xmm5, xmm2
|
|
paddw xmm5, xmm2
|
|
movdqa xmm2, [esp+640-432]
|
|
paddw xmm2, [esp+640-448]
|
|
movdqa xmm7, xmm1
|
|
paddw xmm5, xmm2
|
|
paddw xmm5, [esp+640-624]
|
|
movdqa xmm6, [esp+640-560]
|
|
psraw xmm5, 2
|
|
pandn xmm3, xmm5
|
|
por xmm4, xmm3
|
|
movdqa xmm3, [esp+640-32]
|
|
por xmm3, [esp+640-304]
|
|
pand xmm7, xmm4
|
|
movdqa xmm4, [esp+640-432]
|
|
movdqa xmm5, [esp+640-464]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm4
|
|
paddw xmm4, [esp+640-496]
|
|
por xmm7, xmm2
|
|
movdqa xmm2, [esp+640-288]
|
|
packuswb xmm2, xmm7
|
|
movdqa [esp+704-272], xmm2
|
|
|
|
; q1 output: low half blended into [esp+640-304], high half in xmm7; the
; packed result stays in xmm6 until it is stored.
movdqa xmm2, xmm0
|
|
pand xmm2, xmm3
|
|
movdqa xmm3, xmm0
|
|
pandn xmm3, [esp+640-384]
|
|
por xmm2, xmm3
|
|
movdqa [esp+640-304], xmm2
|
|
movdqa xmm2, [esp+640-528]
|
|
movdqa xmm3, xmm2
|
|
paddw xmm3, [esp+640-464]
|
|
paddw xmm3, xmm4
|
|
paddw xmm3, [esp+640-624]
|
|
psraw xmm3, 2
|
|
pand xmm6, xmm3
|
|
movdqa xmm3, [esp+640-560]
|
|
movdqa xmm4, xmm3
|
|
pandn xmm4, xmm5
|
|
por xmm6, xmm4
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm6
|
|
movdqa xmm6, [esp+640-304]
|
|
movdqa xmm4, xmm1
|
|
pandn xmm4, xmm5
|
|
por xmm7, xmm4
|
|
|
|
; q2 output: low half blended in xmm4; high half is
; aq & ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) | ~aq & q2, blended in xmm7.
; The final pandn/por/packuswb are interleaved with the stores below.
movdqa xmm4, xmm0
|
|
pandn xmm0, [esp+640-416]
|
|
packuswb xmm6, xmm7
|
|
movdqa xmm7, [esp+640-112]
|
|
por xmm7, [esp+640-80]
|
|
pand xmm4, xmm7
|
|
por xmm4, xmm0
|
|
movdqa xmm0, [esp+752-272]
|
|
punpckhbw xmm0, [esp+640-48]
|
|
psllw xmm0, 1
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm5
|
|
paddw xmm0, [esp+640-432]
|
|
paddw xmm0, [esp+640-496]
|
|
paddw xmm0, [esp+640-592]
|
|
psraw xmm0, 3
|
|
pand xmm0, xmm3
|
|
movdqa xmm7, xmm1
|
|
pandn xmm3, xmm2
|
|
por xmm0, xmm3
|
|
pand xmm7, xmm0
|
|
|
|
; Store the six rows: [edx] p2, [esi] p1, [edi] p0, [eax] q0, then edx is
; reloaded with the q1 address for xmm6, and [ecx] receives q2 (xmm4).
; No [esp+...] operand is used after the first pop.
movdqa xmm0, [esp+656-272]
|
|
movdqa [edx], xmm0
|
|
|
|
movdqa xmm0, [esp+672-272]
|
|
|
|
mov edx, dword [esp+640-596]
|
|
movdqa [esi], xmm0
|
|
movdqa xmm0, [esp+688-272]
|
|
movdqa [edi], xmm0
|
|
movdqa xmm0, [esp+704-272]
|
|
|
|
pop edi
|
|
pandn xmm1, xmm2
|
|
movdqa [eax], xmm0
|
|
por xmm7, xmm1
|
|
pop esi
|
|
packuswb xmm4, xmm7
|
|
movdqa [edx], xmm6
|
|
movdqa [ecx], xmm4
|
|
pop ebx
|
|
; Restore the caller's stack pointer and frame pointer.
mov esp, ebp
|
|
pop ebp
|
|
ret
|
|
|
|
%endif
|
|
|
|
|
|
|
|
;********************************************************************************
;
; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
; Reads 8 bytes (movq) from each of 16 consecutive rows starting at r0 and
; spaced r1 bytes apart, pairs row k with row k+8 in one XMM register, runs
; the eight registers through SSE2_TransTwo8x8B and writes eight 16-byte
; vectors to r2 .. r2+112 with movdqa (so r2 must be 16-byte aligned).
;
; Presumably LOAD_3_PARA leaves pPixY in r0, iStride in r1 and pDst in r2 -
; that matches how the registers are used below; confirm in asm_inc.asm.
;
; r3, r4 and r5 are saved and restored. A 16-byte aligned scratch slot is
; reserved below the incoming stack pointer, which is kept in r5 meanwhile.
;
;********************************************************************************

WELS_EXTERN DeblockLumaTransposeH2V_sse2

ALIGN 16

DeblockLumaTransposeH2V_sse2:
    push            r3
    push            r4
    push            r5

%assign push_num 3
    LOAD_3_PARA

    SIGN_EXTENTION  r1, r1d

    ; r7 = (r7 rounded down to a multiple of 16) - 16
    mov             r5, r7                      ; remember the entry stack pointer
    mov             r3, r7
    and             r3, -16
    lea             r7, [r3 - 16]

    lea             r4, [r1 + r1 * 2]           ; r4 = 3 * stride
    lea             r3, [r0 + r1 * 8]           ; r3 -> row 8

    ; rows 0..3 (low qword) paired with rows 8..11 (high qword)
    movq            xmm0, [r0]
    movq            xmm7, [r3]
    punpcklqdq      xmm0, xmm7

    movq            xmm1, [r0 + r1]
    movq            xmm7, [r3 + r1]
    punpcklqdq      xmm1, xmm7

    movq            xmm2, [r0 + r1 * 2]
    movq            xmm7, [r3 + r1 * 2]
    punpcklqdq      xmm2, xmm7

    movq            xmm3, [r0 + r4]
    movq            xmm7, [r3 + r4]
    punpcklqdq      xmm3, xmm7

    ; rows 4..6 paired with rows 12..14
    lea             r0, [r0 + r1 * 4]
    lea             r3, [r3 + r1 * 4]

    movq            xmm4, [r0]
    movq            xmm7, [r3]
    punpcklqdq      xmm4, xmm7

    movq            xmm5, [r0 + r1]
    movq            xmm7, [r3 + r1]
    punpcklqdq      xmm5, xmm7

    movq            xmm6, [r0 + r1 * 2]
    movq            xmm7, [r3 + r1 * 2]
    punpcklqdq      xmm6, xmm7

    ; rows 7 / 15: all eight registers are live, so xmm0 is spilled to the
    ; scratch slot and borrowed as the temporary for the high qword.
    movdqa          [r7], xmm0
    movq            xmm7, [r0 + r4]
    movq            xmm0, [r3 + r4]
    punpcklqdq      xmm7, xmm0
    movdqa          xmm0, [r7]

    SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ; Register order after the macro, per the original note:
    ; pOut: m5, m3, m4, m8, m6, m2, m7, m1

    movdqa          [r2],       xmm4
    movdqa          [r2 + 16],  xmm2
    movdqa          [r2 + 32],  xmm3
    movdqa          [r2 + 48],  xmm7
    movdqa          [r2 + 64],  xmm5
    movdqa          [r2 + 80],  xmm1
    movdqa          [r2 + 96],  xmm6
    movdqa          [r2 + 112], xmm0

    mov             r7, r5                      ; release the scratch slot
    pop             r5
    pop             r4
    pop             r3
    ret
|
|
|
|
|
|
;*******************************************************************************************
;
; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
; Loads eight 16-byte vectors from r2 .. r2+112 with movdqa (so r2 must be
; 16-byte aligned), runs them through SSE2_TransTwo8x8B and writes 8 bytes
; (movq) to each of 16 consecutive rows starting at r0 and spaced r1 bytes
; apart: the low qwords of the results go to rows 0..7, the high qwords to
; rows 8..15.
;
; Presumably LOAD_3_PARA leaves pPixY in r0, iStride in r1 and pSrc in r2 -
; that matches how the registers are used below; confirm in asm_inc.asm.
;
; r3 and r4 are saved and restored. A 16-byte aligned scratch slot for the
; transpose macro is reserved below the incoming stack pointer, which is kept
; in r4 meanwhile.
;
;*******************************************************************************************

WELS_EXTERN DeblockLumaTransposeV2H_sse2

ALIGN 16

DeblockLumaTransposeV2H_sse2:
    push            r3
    push            r4

%assign push_num 2
    LOAD_3_PARA

    SIGN_EXTENTION  r1, r1d

    ; r7 = (r7 rounded down to a multiple of 16) - 16
    mov             r4, r7                      ; remember the entry stack pointer
    mov             r3, r7
    and             r3, -16
    lea             r7, [r3 - 16]

    movdqa          xmm0, [r2]
    movdqa          xmm1, [r2 + 16]
    movdqa          xmm2, [r2 + 32]
    movdqa          xmm3, [r2 + 48]
    movdqa          xmm4, [r2 + 64]
    movdqa          xmm5, [r2 + 80]
    movdqa          xmm6, [r2 + 96]
    movdqa          xmm7, [r2 + 112]

    SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ; Register order after the macro, per the original note:
    ; pOut: m5, m3, m4, m8, m6, m2, m7, m1

    lea             r2, [r1 + r1 * 2]           ; r2 = 3 * stride (source pointer no longer needed)

    ; rows 0..3: low qwords; each register is then shifted so that its high
    ; qword is ready for the second pass
    movq            [r0],           xmm4
    psrldq          xmm4, 8
    movq            [r0 + r1],      xmm2
    psrldq          xmm2, 8
    movq            [r0 + r1 * 2],  xmm3
    psrldq          xmm3, 8
    movq            [r0 + r2],      xmm7
    psrldq          xmm7, 8

    ; rows 4..7
    lea             r0, [r0 + r1 * 4]
    movq            [r0],           xmm5
    psrldq          xmm5, 8
    movq            [r0 + r1],      xmm1
    psrldq          xmm1, 8
    movq            [r0 + r1 * 2],  xmm6
    psrldq          xmm6, 8
    movq            [r0 + r2],      xmm0
    psrldq          xmm0, 8

    ; rows 8..11: former high qwords
    lea             r0, [r0 + r1 * 4]
    movq            [r0],           xmm4
    movq            [r0 + r1],      xmm2
    movq            [r0 + r1 * 2],  xmm3
    movq            [r0 + r2],      xmm7

    ; rows 12..15
    lea             r0, [r0 + r1 * 4]
    movq            [r0],           xmm5
    movq            [r0 + r1],      xmm1
    movq            [r0 + r1 * 2],  xmm6
    movq            [r0 + r2],      xmm0

    mov             r7, r4                      ; release the scratch slot
    pop             r4
    pop             r3
    ret
|
|
|