57f6bcc4b0
Previously the assembly sources had mixed indentation consisting of both spaces and tabs, making it quite hard to read unless the right tab size was used in the editor. Tabs have been interpreted as 4 spaces in most cases, matching the surrounding code.
5279 lines
138 KiB
NASM
5279 lines
138 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* deblock.asm
|
|
;*
|
|
;* Abstract
|
|
;* edge loop
|
|
;*
|
|
;* History
|
|
;* 08/07/2009 Created
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
%include "asm_inc.asm"
|
|
|
|
;*******************************************************************************
|
|
; Macros and other preprocessor constants
|
|
;*******************************************************************************
|
|
|
|
SECTION .rodata align=16
|
|
|
|
ALIGN 16
|
|
; Eight 16-bit words, each 4. Added to the word-wise filter sum right before
; the arithmetic shift right by 3 in DeblockLumaLt4V_ssse3 (rounding term).
FOUR_16B_SSE2: dw 4, 4, 4, 4, 4, 4, 4, 4
|
|
|
|
|
|
SECTION .text
|
|
|
|
%ifdef WIN64
|
|
|
|
|
|
;*******************************************************************************
; DeblockLumaLt4V_ssse3 (WIN64 variant)
;
; Filters 16 pixel columns across the edge between row [rcx - stride] and
; row [rcx], with a clipped delta whose limits come from four signed bytes
; at pTC (one byte per group of 4 columns).
;
; In:  rcx = pointer to the first row after the edge; every row is accessed
;            with movdqa, so rows must be 16-byte aligned
;      edx = row stride in bytes (sign-extended to 64 bits)
;      r8d = threshold compared with |q0 - p0| (presumably iAlpha)
;      r9d = threshold compared with the other row differences
;            (presumably iBeta)
;      [rsp + 28h] at entry = pTC
;
; Row names used in the comments below:
;      p2 = rcx - 3*stride, p1 = rcx - 2*stride, p0 = rcx - stride,
;      q0 = rcx,            q1 = rcx + stride,   q2 = rcx + 2*stride
; "lo"/"hi" mean columns 0..7 / 8..15 widened to 16-bit words.
;
; Writes rows p1, p0, q0 and q1. r12 is saved/restored in the local frame;
; rbp is pushed/popped; xmm registers are handled by PUSH_XMM/POP_XMM.
; The movaps stores of xmm6..xmm15 to [rbp+0h..90h] are never read back in
; this function.
;*******************************************************************************
WELS_EXTERN DeblockLumaLt4V_ssse3
|
|
push rbp
|
|
mov r11,[rsp + 16 + 20h] ; pTC (5th argument; rsp already moved by the push above)
|
|
PUSH_XMM 16
|
|
sub rsp,1B0h
|
|
lea rbp,[rsp+20h] ; rbp = base of the local spill area
|
|
movd xmm4,r8d ; 3rd argument, broadcast later (L: pshufd xmm4,xmm4,0)
|
|
movd xmm2,r9d ; 4th argument, broadcast into xmm0 below
|
|
mov qword [rbp+180h],r12 ; save r12, restored before returning
|
|
mov r10,rcx
|
|
movsxd r12,edx ; r12 = stride
|
|
add edx,edx
|
|
movsxd rdx,edx ; rdx = 2*stride
|
|
sub r10,r12 ; r10 = address of row p0
|
|
movsx r8d,byte [r11] ; pTC[0]
|
|
pxor xmm3,xmm3 ; zero, used to widen bytes to words
|
|
punpcklwd xmm2,xmm2
|
|
movaps [rbp+50h],xmm14
|
|
lea rax,[r12+r12*2]
|
|
movdqa xmm14,[rdx+rcx] ; row q2
|
|
neg rax ; rax = -3*stride
|
|
pshufd xmm0,xmm2,0 ; xmm0 = low word of r9d in all 8 lanes
|
|
movd xmm2,r8d
|
|
movsx edx,byte [r11+1] ; pTC[1]
|
|
movsx r8d,byte [r11+2] ; pTC[2]
|
|
movsx r11d,byte [r11+3] ; pTC[3]
|
|
movaps [rbp+70h],xmm12
|
|
movd xmm1,edx
|
|
movaps [rbp+80h],xmm11
|
|
movd xmm12,r8d
|
|
movd xmm11,r11d
|
|
movdqa xmm5, [rax+rcx] ; row p2
|
|
lea rax,[r12+r12]
|
|
punpcklwd xmm12,xmm12
|
|
neg rax ; rax = -2*stride (kept until the final store of p1)
|
|
punpcklwd xmm11,xmm11
|
|
movaps [rbp],xmm8
|
|
movdqa xmm8, [r10] ; row p0
|
|
punpcklwd xmm2,xmm2
|
|
punpcklwd xmm1,xmm1
|
|
punpcklqdq xmm12,xmm12
|
|
punpcklqdq xmm11,xmm11
|
|
punpcklqdq xmm2,xmm2
|
|
punpcklqdq xmm1,xmm1
|
|
shufps xmm12,xmm11,88h ; xmm12 = pTC[2] x4 words, pTC[3] x4 words (hi half)
|
|
movdqa xmm11,xmm8
|
|
movaps [rbp+30h],xmm9
|
|
movdqa xmm9,[rcx] ; row q0
|
|
shufps xmm2,xmm1,88h ; xmm2 = pTC[0] x4 words, pTC[1] x4 words (lo half)
|
|
movdqa xmm1,xmm5
|
|
punpcklbw xmm11,xmm3 ; p0 lo
|
|
movaps [rbp+20h],xmm6
|
|
movaps [rbp+60h],xmm13
|
|
movdqa xmm13,xmm11
|
|
movaps [rbp+90h],xmm10
|
|
movdqa xmm10,xmm9
|
|
movdqa xmm6,[rax+rcx] ; row p1
|
|
punpcklbw xmm1,xmm3 ; p2 lo
|
|
movaps [rbp+0A0h],xmm12 ; spill hi-half tc vector, reloaded into xmm11 later
|
|
psubw xmm13,xmm1
|
|
movaps [rbp+40h],xmm15
|
|
movdqa xmm15,xmm14
|
|
movaps [rbp+10h],xmm7
|
|
movdqa xmm7,xmm6
|
|
punpcklbw xmm10,xmm3 ; q0 lo
|
|
movdqa xmm12,[r12+rcx] ; row q1
|
|
punpcklbw xmm7,xmm3 ; p1 lo
|
|
punpcklbw xmm12,xmm3 ; q1 lo
|
|
punpcklbw xmm15,xmm3 ; q2 lo
|
|
pabsw xmm3,xmm13 ; |p0 - p2| lo
|
|
movdqa xmm13,xmm10
|
|
psubw xmm13,xmm15
|
|
movdqa [rbp+0F0h],xmm15 ; spill q2 lo
|
|
pabsw xmm15,xmm13 ; |q0 - q2| lo
|
|
movdqa xmm13,xmm11
|
|
movdqa [rbp+0B0h],xmm1 ; spill p2 lo
|
|
movdqa xmm1,xmm0
|
|
pavgw xmm13,xmm10
|
|
pcmpgtw xmm1,xmm3 ; mask: r9 threshold > |p0 - p2|
|
|
movdqa [rbp+120h],xmm13 ; spill (p0 + q0 + 1) >> 1, lo
|
|
movaps xmm13,xmm2
|
|
punpcklwd xmm4,xmm4
|
|
movdqa xmm3,xmm0
|
|
movdqa [rbp+100h],xmm1
|
|
psubw xmm13,xmm1 ; tc + 1 where the p-side mask is set (mask is -1)
|
|
movdqa xmm1,xmm10
|
|
pcmpgtw xmm3,xmm15 ; mask: r9 threshold > |q0 - q2|
|
|
pshufd xmm4,xmm4,0 ; xmm4 = low word of r8d in all 8 lanes
|
|
psubw xmm1,xmm11 ; q0 - p0, lo
|
|
movdqa [rbp+0D0h],xmm10 ; spill q0 lo
|
|
psubw xmm13,xmm3 ; tc + 1 more where the q-side mask is set
|
|
movdqa [rbp+110h],xmm3
|
|
pabsw xmm15,xmm1
|
|
movdqa xmm3,xmm4
|
|
psubw xmm10,xmm12
|
|
pcmpgtw xmm3,xmm15 ; mask: r8 threshold > |q0 - p0|
|
|
pabsw xmm15,xmm10 ; |q0 - q1|
|
|
movdqa xmm10,xmm0
|
|
psllw xmm1,2 ; (q0 - p0) * 4
|
|
movdqa [rbp+0C0h],xmm11 ; spill p0 lo
|
|
psubw xmm11,xmm7
|
|
pcmpgtw xmm10,xmm15
|
|
pabsw xmm11,xmm11 ; |p0 - p1|
|
|
movdqa xmm15,xmm0
|
|
pand xmm3,xmm10
|
|
pcmpgtw xmm15,xmm11
|
|
movaps xmm11,xmm2
|
|
pxor xmm10,xmm10
|
|
pand xmm3,xmm15
|
|
pcmpgtw xmm11,xmm10
|
|
pcmpeqw xmm10,xmm2
|
|
por xmm11,xmm10 ; mask: tc >= 0
|
|
pand xmm3,xmm11 ; xmm3 = combined filter-enable mask, lo
|
|
movdqa xmm11,xmm7
|
|
psubw xmm11,xmm12 ; p1 - q1
|
|
pxor xmm15,xmm15
|
|
paddw xmm11,xmm1
|
|
psubw xmm15,xmm13 ; negative clip limit
|
|
movdqa [rbp+0E0h],xmm12 ; spill q1 lo
|
|
paddw xmm11,[FOUR_16B_SSE2]
|
|
pxor xmm12,xmm12
|
|
psraw xmm11,3 ; ((q0 - p0)*4 + (p1 - q1) + 4) >> 3
|
|
punpckhbw xmm8,xmm12 ; p0 hi
|
|
pmaxsw xmm15,xmm11
|
|
punpckhbw xmm5,xmm12 ; p2 hi
|
|
movdqa xmm11,xmm8
|
|
pminsw xmm13,xmm15 ; delta clipped to [-limit, +limit]
|
|
psubw xmm11,xmm5
|
|
punpckhbw xmm9,xmm12 ; q0 hi
|
|
pand xmm13,xmm3
|
|
movdqa [rbp+130h],xmm13 ; spill masked delta, lo
|
|
pabsw xmm13,xmm11
|
|
punpckhbw xmm14,xmm12 ; q2 hi
|
|
movdqa xmm11,xmm9
|
|
psubw xmm11,xmm14
|
|
movdqa xmm15,xmm0
|
|
movdqa [rbp+140h],xmm14
|
|
pabsw xmm14,xmm11
|
|
movdqa xmm11,xmm8
|
|
pcmpgtw xmm15,xmm14
|
|
movdqa xmm1,[r12+rcx] ; row q1 again, for the hi half
|
|
pavgw xmm11,xmm9
|
|
movdqa [rbp+170h],xmm11 ; spill (p0 + q0 + 1) >> 1, hi
|
|
movdqa xmm10,xmm9
|
|
punpckhbw xmm6,xmm12 ; p1 hi
|
|
psubw xmm10,xmm8
|
|
punpckhbw xmm1,xmm12 ; q1 hi
|
|
movdqa xmm12,xmm0
|
|
movaps xmm11,[rbp+0A0h] ; reload hi-half tc vector
|
|
pcmpgtw xmm12,xmm13
|
|
movaps xmm13,xmm11
|
|
psubw xmm13,xmm12
|
|
movdqa [rbp+160h],xmm15
|
|
psubw xmm13,xmm15
|
|
movdqa xmm15,xmm9
|
|
psubw xmm15,xmm1
|
|
movdqa [rbp+150h],xmm12
|
|
pabsw xmm12,xmm10
|
|
pabsw xmm14,xmm15
|
|
movdqa xmm15,xmm8
|
|
pcmpgtw xmm4,xmm12
|
|
movdqa xmm12,xmm0
|
|
psubw xmm15,xmm6
|
|
pcmpgtw xmm12,xmm14
|
|
pabsw xmm14,xmm15
|
|
psllw xmm10,2
|
|
pcmpgtw xmm0,xmm14
|
|
movdqa xmm14,xmm6
|
|
psubw xmm14,xmm1
|
|
pand xmm4,xmm12
|
|
paddw xmm14,xmm10
|
|
pand xmm4,xmm0
|
|
paddw xmm14,[FOUR_16B_SSE2]
|
|
pxor xmm15,xmm15
|
|
movaps xmm12,xmm11
|
|
psubw xmm15,xmm13
|
|
pxor xmm0,xmm0
|
|
psraw xmm14,3
|
|
pcmpgtw xmm12,xmm0
|
|
pcmpeqw xmm0,xmm11
|
|
pmaxsw xmm15,xmm14
|
|
por xmm12,xmm0
|
|
movdqa xmm0,[rbp+120h]
|
|
pminsw xmm13,xmm15
|
|
movdqa xmm15,[rbp+0B0h]
|
|
movdqa xmm10,xmm7
|
|
pand xmm4,xmm12 ; xmm4 = combined filter-enable mask, hi
|
|
paddw xmm15,xmm0
|
|
pxor xmm12,xmm12
|
|
paddw xmm10,xmm7
|
|
movdqa xmm14,xmm12
|
|
psubw xmm15,xmm10
|
|
psubw xmm14,xmm2 ; -tc, lo
|
|
psraw xmm15,1 ; (p2 + avg(p0,q0) - 2*p1) >> 1, lo
|
|
pmaxsw xmm15,xmm14
|
|
movdqa xmm10,xmm6
|
|
pminsw xmm15,xmm2
|
|
paddw xmm10,xmm6
|
|
pand xmm15,xmm3
|
|
psubw xmm12,xmm11 ; -tc, hi
|
|
pand xmm15,[rbp+100h]
|
|
pand xmm13,xmm4
|
|
paddw xmm7,xmm15 ; new p1 lo
|
|
paddw xmm8,xmm13 ; new p0 hi
|
|
movdqa xmm15,[rbp+170h]
|
|
psubw xmm9,xmm13 ; new q0 hi
|
|
paddw xmm5,xmm15
|
|
psubw xmm5,xmm10
|
|
psraw xmm5,1
|
|
pmaxsw xmm5,xmm12
|
|
pminsw xmm5,xmm11
|
|
pand xmm5,xmm4
|
|
pand xmm5,[rbp+150h]
|
|
paddw xmm6,xmm5 ; new p1 hi
|
|
movdqa xmm5,[rbp+0C0h]
|
|
packuswb xmm7,xmm6 ; xmm7 = new p1 row, saturated to bytes
|
|
movdqa xmm6,[rbp+130h]
|
|
paddw xmm5,xmm6
|
|
packuswb xmm5,xmm8 ; xmm5 = new p0 row
|
|
movdqa xmm8,[rbp+0D0h]
|
|
psubw xmm8,xmm6
|
|
movdqa xmm6,[rbp+0F0h]
|
|
paddw xmm6,xmm0
|
|
movdqa xmm0,[rbp+0E0h]
|
|
packuswb xmm8,xmm9 ; xmm8 = new q0 row
|
|
movdqa xmm9,xmm0
|
|
paddw xmm9,xmm0
|
|
psubw xmm6,xmm9
|
|
psraw xmm6,1
|
|
pmaxsw xmm14,xmm6
|
|
pminsw xmm2,xmm14
|
|
pand xmm2,xmm3
|
|
pand xmm2,[rbp+110h]
|
|
paddw xmm0,xmm2 ; new q1 lo
|
|
movdqa xmm2,[rbp+140h]
|
|
paddw xmm2,xmm15
|
|
movdqa xmm15,xmm1
|
|
paddw xmm15,xmm1
|
|
psubw xmm2,xmm15
|
|
psraw xmm2,1
|
|
pmaxsw xmm12,xmm2
|
|
pminsw xmm11,xmm12
|
|
pand xmm11,xmm4
|
|
pand xmm11,[rbp+160h]
|
|
paddw xmm1,xmm11 ; new q1 hi
|
|
movdqa [rax+rcx],xmm7 ; store row p1
|
|
movdqa [r10],xmm5 ; store row p0
|
|
packuswb xmm0,xmm1 ; xmm0 = new q1 row
|
|
movdqa [rcx],xmm8 ; store row q0
|
|
movdqa [r12+rcx],xmm0 ; store row q1
|
|
mov r12,qword [rbp+180h] ; restore r12
|
|
lea rsp,[rbp+190h] ; undo "sub rsp,1B0h" (rbp = rsp + 20h)
|
|
POP_XMM
|
|
pop rbp
|
|
ret
|
|
|
|
|
|
;*******************************************************************************
; DeblockLumaEq4V_ssse3 (WIN64 variant)
;
; Filters 16 pixel columns across the edge between row [rcx - stride] and
; row [rcx]; up to three rows on each side of the edge are rewritten.
;
; In:  rcx = pointer to the first row after the edge; rows are accessed with
;            movdqa, so they must be 16-byte aligned
;      edx = row stride in bytes (sign-extended)
;      r8d = threshold compared with |q0 - p0| (presumably iAlpha); a second
;            limit (r8 >> 2) + 2 is derived from it
;      r9d = threshold compared with the other row differences
;            (presumably iBeta)
;
; Row names used in the comments below:
;      p3..p0 = rcx - 4*stride .. rcx - stride,  q0..q3 = rcx .. rcx + 3*stride
; "lo"/"hi" mean columns 0..7 / 8..15 widened to 16-bit words.
;
; Reads rows p3..q3, writes rows p2, p1, p0, q0, q1, q2.
; rbx/rbp/rsi/rdi are pushed and popped; xmm6..xmm15 are saved by hand at
; negative offsets from the entry rsp (held in rax) and restored through r11
; in the epilogue.
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_ssse3
|
|
mov rax,rsp ; rax = entry rsp, base for the xmm save slots
|
|
push rbx
|
|
push rbp
|
|
push rsi
|
|
push rdi
|
|
sub rsp,1D8h
|
|
movaps [rax-38h],xmm6 ; save xmm6 (restored from [r11-18h])
|
|
movaps [rax-48h],xmm7
|
|
movaps [rax-58h],xmm8
|
|
pxor xmm1,xmm1 ; zero, used to widen bytes to words
|
|
movsxd r10,edx ; r10 = stride
|
|
mov rbp,rcx ; rbp = row q0
|
|
mov r11d,r8d ; keep the 3rd argument; r8 is reused as a pointer
|
|
mov rdx,rcx
|
|
mov rdi,rbp
|
|
mov rbx,rbp
|
|
movdqa xmm5,[rbp] ; row q0
|
|
movaps [rax-68h],xmm9
|
|
movaps [rax-78h],xmm10
|
|
punpcklbw xmm5,xmm1 ; q0 lo
|
|
movaps [rax-88h],xmm11
|
|
movaps [rax-98h],xmm12
|
|
movaps [rax-0A8h],xmm13
|
|
movaps [rax-0B8h],xmm14
|
|
movdqa xmm14,[r10+rbp] ; row q1
|
|
movaps [rax-0C8h],xmm15
|
|
lea eax,[r10*4]
|
|
movsxd r8,eax ; r8 = 4*stride
|
|
lea eax,[r10+r10*2]
|
|
movsxd rcx,eax ; rcx = 3*stride
|
|
lea eax,[r10+r10]
|
|
sub rdx,r8 ; rdx = row p3
|
|
punpcklbw xmm14,xmm1 ; q1 lo
|
|
movdqa [rsp+90h],xmm5 ; spill q0 lo
|
|
movdqa [rsp+30h],xmm14 ; spill q1 lo
|
|
movsxd rsi,eax ; rsi = 2*stride
|
|
movsx eax,r11w
|
|
sub rdi,rcx ; rdi = row p2
|
|
sub rbx,rsi ; rbx = row p1
|
|
mov r8,rbp
|
|
sub r8,r10 ; r8 = row p0
|
|
movd xmm0,eax
|
|
movsx eax,r9w
|
|
movdqa xmm12,[rdi] ; row p2
|
|
movdqa xmm6, [rsi+rbp] ; row q2
|
|
movdqa xmm13,[rbx] ; row p1
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm11,xmm0,0 ; xmm11 = low word of the 3rd argument in all lanes
|
|
punpcklbw xmm13,xmm1 ; p1 lo
|
|
punpcklbw xmm6,xmm1 ; q2 lo
|
|
movdqa xmm8,[r8] ; row p0
|
|
movd xmm0,eax
|
|
movdqa xmm10,xmm11
|
|
mov eax,2
|
|
punpcklbw xmm8,xmm1 ; p0 lo
|
|
punpcklbw xmm12,xmm1 ; p2 lo
|
|
cwde
|
|
punpcklwd xmm0,xmm0
|
|
psraw xmm10,2
|
|
movdqa xmm1,xmm8
|
|
movdqa [rsp+0F0h],xmm13 ; spill p1 lo
|
|
movdqa [rsp+0B0h],xmm8 ; spill p0 lo
|
|
pshufd xmm7,xmm0,0 ; xmm7 = low word of r9 in all lanes
|
|
psubw xmm1,xmm13
|
|
movdqa xmm0,xmm5
|
|
movdqa xmm4,xmm7
|
|
movdqa xmm2,xmm7
|
|
psubw xmm0,xmm8
|
|
pabsw xmm3,xmm0 ; |q0 - p0| lo
|
|
pabsw xmm0,xmm1 ; |p0 - p1| lo
|
|
movdqa xmm1,xmm5
|
|
movdqa [rsp+40h],xmm7 ; spill the broadcast r9 threshold
|
|
movdqa [rsp+60h],xmm6 ; spill q2 lo
|
|
pcmpgtw xmm4,xmm0
|
|
psubw xmm1,xmm14
|
|
pabsw xmm0,xmm1 ; |q0 - q1| lo
|
|
pcmpgtw xmm2,xmm0
|
|
pand xmm4,xmm2
|
|
movdqa xmm0,xmm11
|
|
pcmpgtw xmm0,xmm3
|
|
pand xmm4,xmm0
|
|
movd xmm0,eax
|
|
movdqa [rsp+20h],xmm4 ; spill filter-enable mask, lo
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm2,xmm0,0 ; xmm2 = 2 in all lanes
|
|
paddw xmm10,xmm2 ; xmm10 = (3rd argument >> 2) + 2
|
|
movdqa [rsp+0A0h],xmm2 ; spill the constant 2 vector
|
|
movdqa xmm15,xmm7
|
|
pxor xmm4,xmm4
|
|
movdqa xmm0,xmm8
|
|
psubw xmm0,xmm12
|
|
mov eax,4
|
|
pabsw xmm0,xmm0 ; |p0 - p2| lo
|
|
movdqa xmm1,xmm10
|
|
cwde
|
|
pcmpgtw xmm15,xmm0
|
|
pcmpgtw xmm1,xmm3
|
|
movdqa xmm3,xmm7
|
|
movdqa xmm7,[rdx] ; row p3
|
|
movdqa xmm0,xmm5
|
|
psubw xmm0,xmm6
|
|
pand xmm15,xmm1
|
|
punpcklbw xmm7,xmm4 ; p3 lo
|
|
movdqa xmm9,xmm15
|
|
pabsw xmm0,xmm0 ; |q0 - q2| lo
|
|
psllw xmm7,1
|
|
pandn xmm9,xmm12
|
|
pcmpgtw xmm3,xmm0
|
|
paddw xmm7,xmm12
|
|
movd xmm0,eax
|
|
pand xmm3,xmm1
|
|
paddw xmm7,xmm12
|
|
punpcklwd xmm0,xmm0
|
|
paddw xmm7,xmm12
|
|
pshufd xmm1,xmm0,0 ; xmm1 = 4 in all lanes
|
|
paddw xmm7,xmm13
|
|
movdqa xmm0,xmm3
|
|
pandn xmm0,xmm6
|
|
paddw xmm7,xmm8
|
|
movdqa [rsp+70h],xmm1 ; spill the constant 4 vector
|
|
paddw xmm7,xmm5
|
|
movdqa [rsp+120h],xmm0
|
|
movdqa xmm0,[rcx+rbp] ; row q3
|
|
punpcklbw xmm0,xmm4 ; q3 lo
|
|
paddw xmm7,xmm1
|
|
movdqa xmm4,xmm15
|
|
psllw xmm0,1
|
|
psraw xmm7,3 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
|
|
paddw xmm0,xmm6
|
|
pand xmm7,xmm15
|
|
paddw xmm0,xmm6
|
|
paddw xmm0,xmm6
|
|
paddw xmm0,xmm14
|
|
movdqa xmm6,xmm15
|
|
paddw xmm0,xmm5
|
|
pandn xmm6,xmm13
|
|
paddw xmm0,xmm8
|
|
paddw xmm0,xmm1
|
|
psraw xmm0,3 ; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
|
|
movdqa xmm1,xmm12
|
|
paddw xmm1,xmm13
|
|
pand xmm0,xmm3
|
|
movdqa [rsp+100h],xmm0
|
|
movdqa xmm0,xmm8
|
|
paddw xmm0,xmm5
|
|
paddw xmm1,xmm0
|
|
movdqa xmm0,xmm3
|
|
paddw xmm1,xmm2
|
|
psraw xmm1,2 ; (p2 + p1 + p0 + q0 + 2) >> 2
|
|
pandn xmm0,xmm14
|
|
pand xmm4,xmm1
|
|
movdqa [rsp+0E0h],xmm0
|
|
movdqa xmm0,xmm5
|
|
paddw xmm0,xmm8
|
|
movdqa xmm1,[rsp+60h]
|
|
paddw xmm1,xmm14
|
|
movdqa xmm14,xmm3
|
|
paddw xmm1,xmm0
|
|
movdqa xmm0,xmm8
|
|
paddw xmm0,[rsp+30h]
|
|
paddw xmm1,xmm2
|
|
psraw xmm1,2 ; (q2 + q1 + q0 + p0 + 2) >> 2
|
|
pand xmm14,xmm1
|
|
movdqa xmm1,xmm13
|
|
paddw xmm1,xmm13
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm2
|
|
psraw xmm1,2 ; (2*p1 + p0 + q1 + 2) >> 2
|
|
movdqa xmm0,[rsp+30h]
|
|
movdqa xmm2,xmm13
|
|
movdqa xmm5,xmm15
|
|
paddw xmm0,[rsp+70h]
|
|
pandn xmm5,xmm1
|
|
paddw xmm2,xmm8
|
|
movdqa xmm8,[rsp+90h]
|
|
movdqa xmm1,xmm12
|
|
paddw xmm2,xmm8
|
|
psllw xmm2,1
|
|
paddw xmm2,xmm0
|
|
paddw xmm1,xmm2
|
|
movdqa xmm0,xmm8
|
|
movdqa xmm8,xmm3
|
|
movdqa xmm2,[rsp+30h]
|
|
paddw xmm0,xmm13
|
|
psraw xmm1,3
|
|
pand xmm15,xmm1
|
|
movdqa xmm1,xmm2
|
|
paddw xmm1,xmm2
|
|
paddw xmm2,[rsp+90h]
|
|
paddw xmm2,[rsp+0B0h]
|
|
paddw xmm1,xmm0
|
|
movdqa xmm0,xmm13
|
|
movdqa xmm13,[r8] ; row p0 again, for the hi half
|
|
paddw xmm0, [rsp+70h]
|
|
paddw xmm1, [rsp+0A0h]
|
|
psllw xmm2,1
|
|
paddw xmm2,xmm0
|
|
psraw xmm1,2
|
|
movdqa xmm0, [rdi] ; row p2 again, for the hi half
|
|
pandn xmm8,xmm1
|
|
movdqa xmm1, [rsp+60h]
|
|
paddw xmm1,xmm2
|
|
movdqa xmm2, [rbx] ; row p1 again, for the hi half
|
|
psraw xmm1,3
|
|
pand xmm3,xmm1
|
|
movdqa xmm1, [rbp] ; row q0 again, for the hi half
|
|
movdqa [rsp+0D0h],xmm3
|
|
pxor xmm3,xmm3
|
|
punpckhbw xmm0,xmm3 ; p2 hi
|
|
punpckhbw xmm1,xmm3 ; q0 hi
|
|
punpckhbw xmm13,xmm3 ; p0 hi
|
|
movdqa [rsp+0C0h],xmm0
|
|
movdqa xmm0,[r10+rbp] ; row q1 again
|
|
movdqa [rsp],xmm1
|
|
punpckhbw xmm0,xmm3 ; q1 hi
|
|
punpckhbw xmm2,xmm3 ; p1 hi
|
|
movdqa [rsp+80h],xmm0
|
|
movdqa xmm0,[rsi+rbp] ; row q2 again
|
|
movdqa [rsp+10h],xmm13
|
|
punpckhbw xmm0,xmm3 ; q2 hi
|
|
movdqa [rsp+50h],xmm0
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm1,xmm13
|
|
psubw xmm0,xmm13
|
|
psubw xmm1,xmm2
|
|
pabsw xmm3,xmm0
|
|
pabsw xmm0,xmm1
|
|
movdqa xmm1,[rsp]
|
|
movdqa xmm13,[rsp+40h]
|
|
movdqa [rsp+110h],xmm2
|
|
psubw xmm1, [rsp+80h]
|
|
pcmpgtw xmm13,xmm0
|
|
pcmpgtw xmm11,xmm3
|
|
pabsw xmm0,xmm1
|
|
pcmpgtw xmm10,xmm3
|
|
movdqa xmm1, [rsp+40h]
|
|
movdqa xmm2,xmm1
|
|
movdqa xmm3,xmm1
|
|
pcmpgtw xmm2,xmm0
|
|
movdqa xmm0, [rsp+10h]
|
|
pand xmm13,xmm2
|
|
pand xmm13,xmm11 ; xmm13 = filter-enable mask, hi
|
|
movdqa xmm11,[rsp+0C0h]
|
|
psubw xmm0,xmm11
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm3,xmm0
|
|
pand xmm3,xmm10
|
|
movdqa xmm0,[rsp]
|
|
psubw xmm0,[rsp+50h]
|
|
movdqa xmm2,[rdx] ; row p3 again, for the hi half
|
|
pabsw xmm0,xmm0
|
|
por xmm7,xmm9
|
|
movdqa xmm9,[rsp+20h]
|
|
pcmpgtw xmm1,xmm0
|
|
pand xmm9,xmm7
|
|
movdqa xmm7,[rsp+20h]
|
|
movdqa xmm0,xmm7
|
|
pandn xmm0,xmm12
|
|
movdqa xmm12,[rsp+110h]
|
|
pand xmm1,xmm10
|
|
movdqa xmm10,[rsp+70h]
|
|
movdqa [rsp+40h],xmm1 ; slot reused: now holds a hi-half q-side mask
|
|
movdqa xmm1,xmm13
|
|
por xmm9,xmm0
|
|
pxor xmm0,xmm0
|
|
por xmm4,xmm6
|
|
movdqa xmm6,xmm7
|
|
punpckhbw xmm2,xmm0 ; p3 hi
|
|
por xmm15,xmm5
|
|
movdqa xmm5,[rsp+20h]
|
|
movdqa xmm0,xmm3
|
|
psllw xmm2,1
|
|
pandn xmm0,xmm11
|
|
pand xmm6,xmm4
|
|
movdqa xmm4,[rsp]
|
|
paddw xmm2,xmm11
|
|
pand xmm5,xmm15
|
|
movdqa xmm15,[rsp+20h]
|
|
paddw xmm2,xmm11
|
|
paddw xmm2,xmm11
|
|
paddw xmm2,xmm12
|
|
paddw xmm2,[rsp+10h]
|
|
paddw xmm2,[rsp]
|
|
paddw xmm2,xmm10
|
|
psraw xmm2,3
|
|
pand xmm2,xmm3
|
|
por xmm2,xmm0
|
|
pand xmm1,xmm2
|
|
movdqa xmm0,xmm13
|
|
movdqa xmm2,xmm11
|
|
pandn xmm0,xmm11
|
|
paddw xmm2,xmm12
|
|
por xmm1,xmm0
|
|
packuswb xmm9,xmm1 ; xmm9 = new p2 row
|
|
movdqa xmm0,xmm7
|
|
movdqa xmm7,[rsp+0A0h]
|
|
pandn xmm0,[rsp+0F0h]
|
|
movdqa xmm1,xmm3
|
|
por xmm6,xmm0
|
|
movdqa xmm0,[rsp+10h]
|
|
paddw xmm0,xmm4
|
|
paddw xmm2,xmm0
|
|
paddw xmm2,xmm7
|
|
movdqa xmm0,xmm3
|
|
pandn xmm0,xmm12
|
|
psraw xmm2,2
|
|
pand xmm1,xmm2
|
|
por xmm1,xmm0
|
|
movdqa xmm2,xmm13
|
|
movdqa xmm0,xmm13
|
|
pand xmm2,xmm1
|
|
pandn xmm0,xmm12
|
|
movdqa xmm1,xmm12
|
|
paddw xmm1,[rsp+10h]
|
|
por xmm2,xmm0
|
|
movdqa xmm0,xmm15
|
|
pandn xmm0,[rsp+0B0h]
|
|
paddw xmm1,xmm4
|
|
packuswb xmm6,xmm2 ; xmm6 = new p1 row
|
|
movdqa xmm2,xmm3
|
|
psllw xmm1,1
|
|
por xmm5,xmm0
|
|
movdqa xmm0,[rsp+80h]
|
|
paddw xmm0,xmm10
|
|
paddw xmm1,xmm0
|
|
paddw xmm11,xmm1
|
|
psraw xmm11,3
|
|
movdqa xmm1,xmm12
|
|
pand xmm2,xmm11
|
|
paddw xmm1,xmm12
|
|
movdqa xmm11,[rsp+80h]
|
|
movdqa xmm0, [rsp+10h]
|
|
por xmm14,[rsp+0E0h]
|
|
paddw xmm0,xmm11
|
|
movdqa xmm4,xmm15
|
|
paddw xmm1,xmm0
|
|
movdqa xmm0,xmm13
|
|
paddw xmm1,xmm7
|
|
psraw xmm1,2
|
|
pandn xmm3,xmm1
|
|
por xmm2,xmm3
|
|
movdqa xmm1,xmm13
|
|
movdqa xmm3,[rsp+10h]
|
|
pandn xmm0,xmm3
|
|
pand xmm1,xmm2
|
|
movdqa xmm2,xmm11
|
|
paddw xmm2,[rsp]
|
|
por xmm1,xmm0
|
|
movdqa xmm0,[rsp+0D0h]
|
|
por xmm0,xmm8
|
|
paddw xmm2,xmm3
|
|
packuswb xmm5,xmm1 ; xmm5 = new p0 row
|
|
movdqa xmm8,[rsp+40h]
|
|
movdqa xmm1,[rsp+50h]
|
|
movdqa xmm3,xmm8
|
|
pand xmm4,xmm0
|
|
psllw xmm2,1
|
|
movdqa xmm0,xmm15
|
|
pandn xmm0,[rsp+90h]
|
|
por xmm4,xmm0
|
|
movdqa xmm0,xmm12
|
|
paddw xmm0,xmm10
|
|
paddw xmm2,xmm0
|
|
paddw xmm1,xmm2
|
|
movdqa xmm0,[rsp]
|
|
movdqa xmm2,xmm11
|
|
paddw xmm0,xmm12
|
|
movdqa xmm12,[rsp]
|
|
paddw xmm2,xmm11
|
|
paddw xmm2,xmm0
|
|
psraw xmm1,3
|
|
movdqa xmm0,xmm8
|
|
pand xmm3,xmm1
|
|
paddw xmm2,xmm7
|
|
movdqa xmm1,xmm13
|
|
psraw xmm2,2
|
|
pandn xmm0,xmm2
|
|
por xmm3,xmm0
|
|
movdqa xmm2,[rsp+50h]
|
|
movdqa xmm0,xmm13
|
|
pandn xmm0,xmm12
|
|
pand xmm1,xmm3
|
|
paddw xmm2,xmm11
|
|
movdqa xmm3,xmm15
|
|
por xmm1,xmm0
|
|
pand xmm3,xmm14
|
|
movdqa xmm14,[rsp+10h]
|
|
movdqa xmm0,xmm15
|
|
pandn xmm0,[rsp+30h]
|
|
packuswb xmm4,xmm1 ; xmm4 = new q0 row
|
|
movdqa xmm1,xmm8
|
|
por xmm3,xmm0
|
|
movdqa xmm0,xmm12
|
|
paddw xmm0,xmm14
|
|
paddw xmm2,xmm0
|
|
paddw xmm2,xmm7
|
|
movdqa xmm0,xmm8
|
|
pandn xmm0,xmm11
|
|
psraw xmm2,2
|
|
pand xmm1,xmm2
|
|
por xmm1,xmm0
|
|
movdqa xmm2,xmm13
|
|
movdqa xmm0,xmm13
|
|
pandn xmm0,xmm11
|
|
pand xmm2,xmm1
|
|
movdqa xmm1,xmm15
|
|
por xmm2,xmm0
|
|
packuswb xmm3,xmm2 ; xmm3 = new q1 row
|
|
movdqa xmm0,[rsp+100h]
|
|
por xmm0,[rsp+120h]
|
|
pand xmm1,xmm0
|
|
movdqa xmm2,[rcx+rbp] ; row q3 again, for the hi half
|
|
movdqa xmm7,[rsp+50h]
|
|
pandn xmm15,[rsp+60h]
|
|
lea r11,[rsp+1D8h] ; r11 = rsp value right after the four pushes
|
|
pxor xmm0,xmm0
|
|
por xmm1,xmm15
|
|
movaps xmm15,[r11-0A8h] ; restore xmm15 (saved at [rax-0C8h])
|
|
movdqa [rdi],xmm9 ; store row p2
|
|
movaps xmm9,[r11-48h] ; restore xmm9
|
|
punpckhbw xmm2,xmm0 ; q3 hi
|
|
psllw xmm2,1
|
|
paddw xmm2,xmm7
|
|
paddw xmm2,xmm7
|
|
movdqa [rbx],xmm6 ; store row p1
|
|
movaps xmm6,[r11-18h] ; restore xmm6
|
|
paddw xmm2,xmm7
|
|
paddw xmm2,xmm11
|
|
movaps xmm11,[r11-68h] ; restore xmm11
|
|
paddw xmm2,xmm12
|
|
movaps xmm12,[r11-78h] ; restore xmm12
|
|
paddw xmm2,xmm14
|
|
paddw xmm2,xmm10
|
|
psraw xmm2,3
|
|
movaps xmm10,[r11-58h] ; restore xmm10
|
|
movaps xmm14,[r11-98h] ; restore xmm14
|
|
movdqa xmm0,xmm13
|
|
pand xmm2,xmm8
|
|
pandn xmm8,xmm7
|
|
pandn xmm13,xmm7
|
|
por xmm2,xmm8
|
|
movaps xmm7,[r11-28h] ; restore xmm7
|
|
movaps xmm8,[r11-38h] ; restore xmm8
|
|
movdqa [r8],xmm5 ; store row p0
|
|
pand xmm0,xmm2
|
|
por xmm0,xmm13
|
|
packuswb xmm1,xmm0 ; xmm1 = new q2 row
|
|
movaps xmm13,[r11-88h] ; restore xmm13
|
|
movdqa [rbp],xmm4 ; store row q0
|
|
movdqa [r10+rbp],xmm3 ; store row q1
|
|
movdqa [rsi+rbp],xmm1 ; store row q2
|
|
mov rsp,r11 ; drop the local frame
|
|
pop rdi
|
|
pop rsi
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
|
|
|
|
;*******************************************************************************
; DeblockChromaLt4V_ssse3 (WIN64 variant)
;
; Filters 8 pixel columns of two separate planes at once across the edge
; between row [ptr - stride] and row [ptr]. The first plane occupies the low
; 8 bytes of each vector and the second plane the high 8 bytes. The delta is
; clipped with four signed bytes at pTC (one byte per 2 columns; the same
; four values are used for both planes).
;
; In:  rcx = first plane, pointer to the row after the edge (presumably Cb)
;      rdx = second plane, pointer to the row after the edge (presumably Cr)
;      r8d = row stride in bytes (sign-extended)
;      r9w = threshold compared with |p0 - q0| (presumably iAlpha)
;      [rsp + 28h] at entry = iBeta (read as a word)
;      [rsp + 30h] at entry = pTC
;
; Row names: p1 = ptr - 2*stride, p0 = ptr - stride, q0 = ptr, q1 = ptr + stride.
; Reads p1..q1, writes p0 and q0 of both planes (8 bytes each, via movq).
;
; NOTE(review): the "movaps xmmN,[r11-..]" / "movaps xmmN,[rsp+..]" loads of
; xmm6..xmm15 read stack slots that this function never writes; they look
; like leftovers from a hand-rolled register restore. The values are
; presumably replaced by POP_XMM - confirm against asm_inc.asm.
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4V_ssse3
|
|
mov rax,rsp ; rax = entry rsp, used to reach the stack arguments
|
|
push rbx
|
|
push rdi
|
|
PUSH_XMM 16
|
|
sub rsp,0C8h
|
|
mov r10,qword [rax + 30h] ; pTC
|
|
pxor xmm1,xmm1 ; zero, used to widen bytes to words
|
|
mov rbx,rcx ; rbx = first plane, row q0
|
|
movsxd r11,r8d ; r11 = stride
|
|
movsx ecx,byte [r10] ; pTC[0]
|
|
movsx r8d,byte [r10+2] ; pTC[2]
|
|
mov rdi,rdx ; rdi = second plane, row q0
|
|
movq xmm2,[rbx] ; first plane q0
|
|
movq xmm9,[r11+rbx] ; first plane q1
|
|
movsx edx,byte [r10+1] ; pTC[1]
|
|
mov word [rsp+2],cx ; build tc word vector at [rsp]: tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3
|
|
mov word [rsp],cx
|
|
movsx eax,byte [r10+3] ; pTC[3]
|
|
mov word [rsp+6],dx
|
|
mov word [rsp+4],dx
|
|
movdqa xmm11,xmm1
|
|
mov word [rsp+0Eh],ax
|
|
mov word [rsp+0Ch],ax
|
|
lea eax,[r11+r11]
|
|
movsxd rcx,eax ; rcx = 2*stride
|
|
mov rax,rbx
|
|
mov rdx,rdi
|
|
sub rax,rcx ; rax = first plane, row p1
|
|
mov word [rsp+0Ah],r8w
|
|
mov word [rsp+8],r8w
|
|
movdqa xmm6,[rsp] ; xmm6 = tc word vector
|
|
movdqa xmm7,xmm6
|
|
movq xmm13, [rax] ; first plane p1
|
|
mov rax,rdi
|
|
sub rax,rcx ; rax = second plane, row p1
|
|
mov rcx,rbx
|
|
pcmpgtw xmm7,xmm1 ; mask: tc > 0
|
|
psubw xmm11,xmm6 ; xmm11 = -tc
|
|
sub rcx,r11 ; rcx = first plane, row p0
|
|
sub rdx,r11 ; rdx = second plane, row p0
|
|
movq xmm0,[rax] ; second plane p1
|
|
movsx eax,r9w
|
|
movq xmm15,[rcx] ; first plane p0
|
|
punpcklqdq xmm13,xmm0 ; p1: first plane | second plane
|
|
movq xmm0, [rdx] ; second plane p0
|
|
movdqa xmm4,xmm13
|
|
punpcklqdq xmm15,xmm0 ; p0: first plane | second plane
|
|
movq xmm0, [rdi] ; second plane q0
|
|
punpcklbw xmm4,xmm1 ; p1, first plane, as words
|
|
movdqa xmm12,xmm15
|
|
punpcklqdq xmm2,xmm0 ; q0: first plane | second plane
|
|
movq xmm0, [r11+rdi] ; second plane q1
|
|
punpcklbw xmm12,xmm1 ; p0, first plane, as words
|
|
movdqa xmm14,xmm2
|
|
punpcklqdq xmm9,xmm0 ; q1: first plane | second plane
|
|
punpckhbw xmm2,xmm1 ; q0, second plane, as words
|
|
punpcklbw xmm14,xmm1 ; q0, first plane, as words
|
|
movd xmm0,eax
|
|
movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta (0C8h locals + 160 from PUSH_XMM + 2 pushes + 28h)
|
|
punpckhbw xmm13,xmm1 ; p1, second plane, as words
|
|
punpckhbw xmm15,xmm1 ; p0, second plane, as words
|
|
movdqa xmm3,xmm9
|
|
movdqa [rsp+10h],xmm2 ; spill q0 of the second plane
|
|
punpcklwd xmm0,xmm0
|
|
punpckhbw xmm9,xmm1 ; q1, second plane, as words
|
|
punpcklbw xmm3,xmm1 ; q1, first plane, as words
|
|
movdqa xmm1,xmm14
|
|
pshufd xmm10,xmm0,0 ; xmm10 = r9w in all lanes
|
|
movd xmm0,eax
|
|
mov eax,4
|
|
cwde
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm8,xmm0,0 ; xmm8 = iBeta in all lanes
|
|
movd xmm0,eax
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm5,xmm0,0 ; xmm5 = 4 in all lanes (rounding term)
|
|
psubw xmm1,xmm12 ; q0 - p0
|
|
movdqa xmm2,xmm10
|
|
lea r11,[rsp+0C8h] ; r11 = top of the local area, used by the epilogue
|
|
psllw xmm1,2
|
|
movdqa xmm0,xmm4
|
|
psubw xmm4,xmm12
|
|
psubw xmm0,xmm3 ; p1 - q1
|
|
psubw xmm3,xmm14
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm5
|
|
movdqa xmm0,xmm11
|
|
psraw xmm1,3 ; ((q0 - p0)*4 + (p1 - q1) + 4) >> 3
|
|
pmaxsw xmm0,xmm1
|
|
pminsw xmm6,xmm0 ; delta clipped to [-tc, tc], first plane
|
|
movdqa xmm1,xmm8
|
|
movdqa xmm0,xmm12
|
|
psubw xmm0,xmm14
|
|
pabsw xmm0,xmm0 ; |p0 - q0|
|
|
pcmpgtw xmm2,xmm0
|
|
pabsw xmm0,xmm4 ; |p1 - p0|
|
|
pcmpgtw xmm1,xmm0
|
|
pabsw xmm0,xmm3 ; |q1 - q0|
|
|
movdqa xmm3,[rsp] ; reload tc word vector for the second plane
|
|
pand xmm2,xmm1
|
|
movdqa xmm1,xmm8
|
|
pcmpgtw xmm1,xmm0
|
|
movdqa xmm0,xmm13
|
|
pand xmm2,xmm1
|
|
psubw xmm0,xmm9
|
|
psubw xmm13,xmm15
|
|
pand xmm2,xmm7
|
|
pand xmm6,xmm2 ; masked delta, first plane
|
|
paddw xmm12,xmm6 ; new p0, first plane
|
|
psubw xmm14,xmm6 ; new q0, first plane
|
|
movdqa xmm2,[rsp+10h]
|
|
movaps xmm6,[r11-18h] ; NOTE(review): slot never written here - see header
|
|
movdqa xmm1,xmm2
|
|
psubw xmm1,xmm15
|
|
psubw xmm9,xmm2
|
|
psllw xmm1,2
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm5
|
|
movdqa xmm0,xmm15
|
|
psubw xmm0,xmm2
|
|
psraw xmm1,3
|
|
pmaxsw xmm11,xmm1
|
|
pabsw xmm0,xmm0
|
|
movdqa xmm1,xmm8
|
|
pcmpgtw xmm10,xmm0
|
|
pabsw xmm0,xmm13
|
|
pminsw xmm3,xmm11 ; delta clipped to [-tc, tc], second plane
|
|
movaps xmm11,[r11-68h] ; NOTE(review): slot never written here - see header
|
|
movaps xmm13,[rsp+40h] ; NOTE(review): slot never written here - see header
|
|
pcmpgtw xmm1,xmm0
|
|
pabsw xmm0,xmm9
|
|
movaps xmm9, [r11-48h] ; NOTE(review): slot never written here - see header
|
|
pand xmm10,xmm1
|
|
pcmpgtw xmm8,xmm0
|
|
pand xmm10,xmm8
|
|
pand xmm10,xmm7
|
|
movaps xmm8,[r11-38h] ; NOTE(review): slot never written here - see header
|
|
movaps xmm7,[r11-28h] ; NOTE(review): slot never written here - see header
|
|
pand xmm3,xmm10 ; masked delta, second plane
|
|
paddw xmm15,xmm3 ; new p0, second plane
|
|
psubw xmm2,xmm3 ; new q0, second plane
|
|
movaps xmm10,[r11-58h] ; NOTE(review): slot never written here - see header
|
|
packuswb xmm12,xmm15 ; new p0: first plane | second plane
|
|
movaps xmm15,[rsp+20h] ; NOTE(review): slot never written here - see header
|
|
packuswb xmm14,xmm2 ; new q0: first plane | second plane
|
|
movq [rcx],xmm12 ; store first plane p0
|
|
movq [rbx],xmm14 ; store first plane q0
|
|
psrldq xmm12,8
|
|
psrldq xmm14,8
|
|
movq [rdx],xmm12 ; store second plane p0
|
|
movaps xmm12,[r11-78h] ; NOTE(review): slot never written here - see header
|
|
movq [rdi],xmm14 ; store second plane q0
|
|
movaps xmm14,[rsp+30h] ; NOTE(review): slot never written here - see header
|
|
mov rsp,r11 ; drop the local frame
|
|
POP_XMM
|
|
pop rdi
|
|
pop rbx
|
|
ret
|
|
|
|
|
|
;*******************************************************************************
; DeblockChromaEq4V_ssse3 (WIN64 variant)
;
; Filters 8 pixel columns of two separate planes at once across the edge
; between row [ptr - stride] and row [ptr]. The first plane occupies the low
; 8 bytes of each vector and the second plane the high 8 bytes. Where the
; threshold tests pass:
;      p0' = (2*p1 + p0 + q1 + 2) >> 2
;      q0' = (2*q1 + q0 + p1 + 2) >> 2
;
; In:  rcx = first plane, pointer to the row after the edge (presumably Cb)
;      rdx = second plane, pointer to the row after the edge (presumably Cr)
;      r8d = row stride in bytes
;      r9w = threshold compared with |p0 - q0| (presumably iAlpha)
;      [rsp + 28h] at entry = iBeta (read as a word)
;
; Row names: p1 = ptr - 2*stride, p0 = ptr - stride, q0 = ptr, q1 = ptr + stride.
; Reads p1..q1, writes p0 and q0 of both planes (8 bytes each, via movq).
;
; NOTE(review): this function never stores to its local stack area, so the
; "movaps xmmN,[rsp+..]" / "movaps xmmN,[r11-..]" loads of xmm6..xmm14 read
; slots it did not write; they look like leftovers from a hand-rolled
; register restore. The values are presumably replaced by POP_XMM - confirm
; against asm_inc.asm.
;*******************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3
|
|
mov rax,rsp
|
|
push rbx
|
|
PUSH_XMM 15
|
|
sub rsp,90h
|
|
pxor xmm1,xmm1 ; zero, used to widen bytes to words
|
|
mov r11,rcx ; r11 = first plane, row q0
|
|
mov rbx,rdx ; rbx = second plane, row q0
|
|
mov r10d,r9d ; keep the 4th argument; r9 is reused below
|
|
movq xmm13,[r11] ; first plane q0
|
|
lea eax,[r8+r8]
|
|
movsxd r9,eax ; r9 = 2*stride
|
|
mov rax,rcx
|
|
sub rax,r9
|
|
movq xmm14,[rax] ; first plane p1
|
|
mov rax,rdx
|
|
sub rax,r9
|
|
movq xmm0,[rax] ; second plane p1
|
|
movsxd rax,r8d ; rax = stride
|
|
sub rcx,rax ; rcx = first plane, row p0
|
|
sub rdx,rax ; rdx = second plane, row p0
|
|
movq xmm12,[rax+r11] ; first plane q1
|
|
movq xmm10,[rcx] ; first plane p0
|
|
punpcklqdq xmm14,xmm0 ; p1: first plane | second plane
|
|
movdqa xmm8,xmm14
|
|
movq xmm0,[rdx] ; second plane p0
|
|
punpcklbw xmm8,xmm1 ; p1, first plane, as words
|
|
punpckhbw xmm14,xmm1 ; p1, second plane, as words
|
|
punpcklqdq xmm10,xmm0 ; p0: first plane | second plane
|
|
movq xmm0,[rbx] ; second plane q0
|
|
movdqa xmm5,xmm10
|
|
punpcklqdq xmm13,xmm0 ; q0: first plane | second plane
|
|
movq xmm0, [rax+rbx] ; second plane q1
|
|
punpcklbw xmm5,xmm1 ; p0, first plane, as words
|
|
movsx eax,r10w
|
|
movdqa xmm9,xmm13
|
|
punpcklqdq xmm12,xmm0 ; q1: first plane | second plane
|
|
punpcklbw xmm9,xmm1 ; q0, first plane, as words
|
|
punpckhbw xmm10,xmm1 ; p0, second plane, as words
|
|
movd xmm0,eax
|
|
movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta (90h locals + 1 push + 28h + 144 from PUSH_XMM)
|
|
punpckhbw xmm13,xmm1 ; q0, second plane, as words
|
|
movdqa xmm7,xmm12
|
|
punpcklwd xmm0,xmm0
|
|
punpckhbw xmm12,xmm1 ; q1, second plane, as words
|
|
pshufd xmm11,xmm0,0 ; xmm11 = r9w (copied to r10w) in all lanes
|
|
punpcklbw xmm7,xmm1 ; q1, first plane, as words
|
|
movd xmm0,eax
|
|
movdqa xmm1,xmm8
|
|
psubw xmm1,xmm5
|
|
punpcklwd xmm0,xmm0
|
|
movdqa xmm6,xmm11
|
|
pshufd xmm3,xmm0,0 ; xmm3 = iBeta in all lanes
|
|
movdqa xmm0,xmm5
|
|
psubw xmm0,xmm9
|
|
movdqa xmm2,xmm3
|
|
pabsw xmm0,xmm0 ; |p0 - q0|, first plane
|
|
pcmpgtw xmm6,xmm0
|
|
pabsw xmm0,xmm1 ; |p1 - p0|, first plane
|
|
movdqa xmm1,xmm3
|
|
pcmpgtw xmm2,xmm0
|
|
pand xmm6,xmm2
|
|
movdqa xmm0,xmm7
|
|
movdqa xmm2,xmm3
|
|
psubw xmm0,xmm9
|
|
pabsw xmm0,xmm0 ; |q1 - q0|, first plane
|
|
pcmpgtw xmm1,xmm0
|
|
pand xmm6,xmm1 ; xmm6 = filter-enable mask, first plane
|
|
movdqa xmm0,xmm10
|
|
movdqa xmm1,xmm14
|
|
psubw xmm0,xmm13
|
|
psubw xmm1,xmm10
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm11,xmm0
|
|
pabsw xmm0,xmm1
|
|
pcmpgtw xmm2,xmm0
|
|
pand xmm11,xmm2
|
|
movdqa xmm0,xmm12
|
|
movdqa xmm4,xmm6
|
|
movdqa xmm1,xmm8
|
|
mov eax,2
|
|
cwde
|
|
paddw xmm1,xmm8
|
|
psubw xmm0,xmm13
|
|
paddw xmm1,xmm5
|
|
pabsw xmm0,xmm0
|
|
movdqa xmm2,xmm14
|
|
paddw xmm1,xmm7 ; 2*p1 + p0 + q1, first plane
|
|
pcmpgtw xmm3,xmm0
|
|
paddw xmm2,xmm14
|
|
movd xmm0,eax
|
|
pand xmm11,xmm3 ; xmm11 = filter-enable mask, second plane
|
|
paddw xmm7,xmm7
|
|
paddw xmm2,xmm10
|
|
punpcklwd xmm0,xmm0
|
|
paddw xmm2,xmm12 ; 2*p1 + p0 + q1, second plane
|
|
paddw xmm12,xmm12
|
|
pshufd xmm3,xmm0,0 ; xmm3 = 2 in all lanes (rounding term)
|
|
paddw xmm7,xmm9
|
|
paddw xmm12,xmm13
|
|
movdqa xmm0,xmm6
|
|
paddw xmm1,xmm3
|
|
pandn xmm0,xmm5
|
|
paddw xmm7,xmm8 ; 2*q1 + q0 + p1, first plane
|
|
psraw xmm1,2
|
|
paddw xmm12,xmm14 ; 2*q1 + q0 + p1, second plane
|
|
paddw xmm7,xmm3
|
|
movaps xmm14,[rsp] ; NOTE(review): slot never written here - see header
|
|
pand xmm4,xmm1
|
|
paddw xmm12,xmm3
|
|
psraw xmm7,2
|
|
movdqa xmm1,xmm11
|
|
por xmm4,xmm0 ; p0 result, first plane (filtered or original)
|
|
psraw xmm12,2
|
|
paddw xmm2,xmm3
|
|
movdqa xmm0,xmm11
|
|
pandn xmm0,xmm10
|
|
psraw xmm2,2
|
|
pand xmm1,xmm2
|
|
por xmm1,xmm0
|
|
packuswb xmm4,xmm1 ; new p0: first plane | second plane
|
|
movdqa xmm0,xmm11
|
|
movdqa xmm1,xmm6
|
|
pand xmm1,xmm7
|
|
movaps xmm7,[rsp+70h] ; NOTE(review): slot never written here - see header
|
|
movq [rcx],xmm4 ; store first plane p0
|
|
pandn xmm6,xmm9
|
|
pandn xmm11,xmm13
|
|
pand xmm0,xmm12
|
|
por xmm1,xmm6
|
|
por xmm0,xmm11
|
|
psrldq xmm4,8
|
|
packuswb xmm1,xmm0 ; new q0: first plane | second plane
|
|
movq [r11],xmm1 ; store first plane q0
|
|
psrldq xmm1,8
|
|
movq [rdx],xmm4 ; store second plane p0
|
|
lea r11,[rsp+90h] ; r11 = top of the local area
|
|
movaps xmm6,[r11-10h] ; NOTE(review): slot never written here - see header
|
|
movaps xmm8,[r11-30h]
|
|
movaps xmm9,[r11-40h]
|
|
movq [rbx],xmm1 ; store second plane q0
|
|
movaps xmm10,[r11-50h]
|
|
movaps xmm11,[r11-60h]
|
|
movaps xmm12,[r11-70h]
|
|
movaps xmm13,[r11-80h]
|
|
mov rsp,r11 ; drop the local frame
|
|
POP_XMM
|
|
pop rbx
|
|
ret
|
|
|
|
|
|
|
|
|
|
|
|
;*******************************************************************************
; DeblockChromaEq4H_ssse3 (WIN64 variant)
;
; Filters the edge between column [ptr - 1] and column [ptr] over 8 rows of
; two separate planes. For every row, the 4 bytes at [ptr - 2] (columns
; p1, p0, q0, q1) are gathered through stack slots, the 16 dwords are
; transposed into one vector per column, the same p0/q0 update as the
; vertical variant is applied
;      p0' = (2*p1 + p0 + q1 + 2) >> 2
;      q0' = (2*q1 + q0 + p1 + 2) >> 2
; and the result is transposed back and written out 4 bytes per row.
;
; In:  rcx = first plane, pointer to the pixel right of the edge, row 0
;            (presumably Cb)
;      rdx = second plane, same position (presumably Cr)
;      r8d = row stride in bytes
;      r9w = threshold compared with |p0 - q0| (presumably iAlpha)
;      [rsp + 28h] at entry = iBeta (read as a word)
;
; rbx is saved in the caller-provided slot at entry rsp + 20h and reloaded
; from there in the epilogue; rdi is pushed/popped.
; Dword order inside each gathered vector: first plane row r, second plane
; row r, first plane row r+4, second plane row r+4.
;*******************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
|
|
mov rax,rsp
|
|
mov [rax+20h],rbx ; save rbx at entry rsp + 20h instead of pushing it
|
|
push rdi
|
|
PUSH_XMM 16
|
|
sub rsp,140h
|
|
mov rdi,rdx ; rdi = second plane
|
|
lea eax,[r8*4]
|
|
movsxd r10,eax ; r10 = 4*stride (temporarily)
|
|
mov eax,[rcx-2] ; first plane, row 0
|
|
mov [rsp+10h],eax
|
|
lea rbx,[r10+rdx-2] ; rbx = second plane, row 4, column -2
|
|
lea r11,[r10+rcx-2] ; r11 = first plane, row 4, column -2
|
|
movdqa xmm5,[rsp+10h] ; only the low dword is meaningful
|
|
movsxd r10,r8d ; r10 = stride
|
|
mov eax,[r10+rcx-2] ; first plane, row 1
|
|
lea rdx,[r10+r10*2] ; rdx = 3*stride
|
|
mov [rsp+20h],eax
|
|
mov eax,[rcx+r10*2-2] ; first plane, row 2
|
|
mov [rsp+30h],eax
|
|
mov eax,[rdx+rcx-2] ; first plane, row 3
|
|
movdqa xmm2,[rsp+20h]
|
|
mov [rsp+40h],eax
|
|
mov eax, [rdi-2] ; second plane, row 0
|
|
movdqa xmm4,[rsp+30h]
|
|
mov [rsp+50h],eax
|
|
mov eax,[r10+rdi-2] ; second plane, row 1
|
|
movdqa xmm3,[rsp+40h]
|
|
mov [rsp+60h],eax
|
|
mov eax,[rdi+r10*2-2] ; second plane, row 2
|
|
punpckldq xmm5,[rsp+50h]
|
|
mov [rsp+70h],eax
|
|
mov eax, [rdx+rdi-2] ; second plane, row 3
|
|
punpckldq xmm2, [rsp+60h]
|
|
mov [rsp+80h],eax
|
|
mov eax,[r11] ; first plane, row 4
|
|
punpckldq xmm4, [rsp+70h]
|
|
mov [rsp+50h],eax
|
|
mov eax,[rbx] ; second plane, row 4
|
|
punpckldq xmm3,[rsp+80h]
|
|
mov [rsp+60h],eax
|
|
mov eax,[r10+r11] ; first plane, row 5
|
|
movdqa xmm0, [rsp+50h]
|
|
punpckldq xmm0, [rsp+60h]
|
|
punpcklqdq xmm5,xmm0 ; xmm5 = rows 0 and 4 of both planes
|
|
movdqa [rsp+50h],xmm0
|
|
mov [rsp+50h],eax
|
|
mov eax,[r10+rbx] ; second plane, row 5
|
|
movdqa xmm0,[rsp+50h]
|
|
movdqa xmm1,xmm5
|
|
mov [rsp+60h],eax
|
|
mov eax,[r11+r10*2] ; first plane, row 6
|
|
punpckldq xmm0, [rsp+60h]
|
|
punpcklqdq xmm2,xmm0 ; xmm2 = rows 1 and 5 of both planes
|
|
punpcklbw xmm1,xmm2
|
|
punpckhbw xmm5,xmm2
|
|
movdqa [rsp+50h],xmm0
|
|
mov [rsp+50h],eax
|
|
mov eax,[rbx+r10*2] ; second plane, row 6
|
|
movdqa xmm0,[rsp+50h]
|
|
mov [rsp+60h],eax
|
|
mov eax, [rdx+r11] ; first plane, row 7
|
|
movdqa xmm15,xmm1
|
|
punpckldq xmm0,[rsp+60h]
|
|
punpcklqdq xmm4,xmm0 ; xmm4 = rows 2 and 6 of both planes
|
|
movdqa [rsp+50h],xmm0
|
|
mov [rsp+50h],eax
|
|
mov eax, [rdx+rbx] ; second plane, row 7
|
|
movdqa xmm0,[rsp+50h]
|
|
mov [rsp+60h],eax
|
|
punpckldq xmm0, [rsp+60h]
|
|
punpcklqdq xmm3,xmm0 ; xmm3 = rows 3 and 7 of both planes
|
|
movdqa xmm0,xmm4
|
|
punpcklbw xmm0,xmm3
|
|
punpckhbw xmm4,xmm3
|
|
punpcklwd xmm15,xmm0
|
|
punpckhwd xmm1,xmm0
|
|
movdqa xmm0,xmm5
|
|
movdqa xmm12,xmm15
|
|
punpcklwd xmm0,xmm4
|
|
punpckhwd xmm5,xmm4
|
|
punpckldq xmm12,xmm0
|
|
punpckhdq xmm15,xmm0
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm11,xmm12
|
|
punpckldq xmm0,xmm5
|
|
punpckhdq xmm1,xmm5
|
|
punpcklqdq xmm11,xmm0 ; xmm11 = column p1 (16 bytes)
|
|
punpckhqdq xmm12,xmm0 ; xmm12 = column p0
|
|
movsx eax,r9w
|
|
movdqa xmm14,xmm15
|
|
punpcklqdq xmm14,xmm1 ; xmm14 = column q0
|
|
punpckhqdq xmm15,xmm1 ; xmm15 = column q1
|
|
pxor xmm1,xmm1 ; zero, used to widen bytes to words
|
|
movd xmm0,eax
|
|
movdqa xmm4,xmm12
|
|
movdqa xmm8,xmm11
|
|
movsx eax,word [rsp+170h + 160] ; iBeta (140h locals + 1 push + 28h + 160 from PUSH_XMM)
|
|
punpcklwd xmm0,xmm0
|
|
punpcklbw xmm4,xmm1 ; p0 lo
|
|
punpckhbw xmm12,xmm1 ; p0 hi
|
|
movdqa xmm9,xmm14
|
|
movdqa xmm7,xmm15
|
|
movdqa xmm10,xmm15
|
|
pshufd xmm13,xmm0,0 ; xmm13 = r9w in all lanes
|
|
punpcklbw xmm9,xmm1 ; q0 lo
|
|
punpckhbw xmm14,xmm1 ; q0 hi
|
|
movdqa xmm6,xmm13
|
|
movd xmm0,eax
|
|
movdqa [rsp],xmm11 ; spill the untouched p1 column for the write-back
|
|
mov eax,2
|
|
cwde
|
|
punpckhbw xmm11,xmm1 ; p1 hi
|
|
punpckhbw xmm10,xmm1 ; q1 hi
|
|
punpcklbw xmm7,xmm1 ; q1 lo
|
|
punpcklwd xmm0,xmm0
|
|
punpcklbw xmm8,xmm1 ; p1 lo
|
|
pshufd xmm3,xmm0,0 ; xmm3 = iBeta in all lanes
|
|
movdqa xmm1,xmm8
|
|
movdqa xmm0,xmm4
|
|
psubw xmm0,xmm9
|
|
psubw xmm1,xmm4
|
|
movdqa xmm2,xmm3
|
|
pabsw xmm0,xmm0 ; |p0 - q0| lo
|
|
pcmpgtw xmm6,xmm0
|
|
pabsw xmm0,xmm1 ; |p1 - p0| lo
|
|
movdqa xmm1,xmm3
|
|
pcmpgtw xmm2,xmm0
|
|
pand xmm6,xmm2
|
|
movdqa xmm0,xmm7
|
|
movdqa xmm2,xmm3
|
|
psubw xmm0,xmm9
|
|
pabsw xmm0,xmm0 ; |q1 - q0| lo
|
|
pcmpgtw xmm1,xmm0
|
|
pand xmm6,xmm1 ; xmm6 = filter-enable mask, lo
|
|
movdqa xmm0,xmm12
|
|
movdqa xmm1,xmm11
|
|
psubw xmm0,xmm14
|
|
psubw xmm1,xmm12
|
|
movdqa xmm5,xmm6
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm13,xmm0
|
|
pabsw xmm0,xmm1
|
|
movdqa xmm1,xmm8
|
|
pcmpgtw xmm2,xmm0
|
|
paddw xmm1,xmm8
|
|
movdqa xmm0,xmm10
|
|
pand xmm13,xmm2
|
|
psubw xmm0,xmm14
|
|
paddw xmm1,xmm4
|
|
movdqa xmm2,xmm11
|
|
pabsw xmm0,xmm0
|
|
paddw xmm2,xmm11
|
|
paddw xmm1,xmm7 ; 2*p1 + p0 + q1, lo
|
|
pcmpgtw xmm3,xmm0
|
|
paddw xmm2,xmm12
|
|
movd xmm0,eax
|
|
pand xmm13,xmm3 ; xmm13 = filter-enable mask, hi
|
|
paddw xmm2,xmm10 ; 2*p1 + p0 + q1, hi
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm3,xmm0,0 ; xmm3 = 2 in all lanes (rounding term)
|
|
movdqa xmm0,xmm6
|
|
paddw xmm1,xmm3
|
|
pandn xmm0,xmm4
|
|
paddw xmm2,xmm3
|
|
psraw xmm1,2
|
|
pand xmm5,xmm1
|
|
por xmm5,xmm0
|
|
paddw xmm7,xmm7
|
|
paddw xmm10,xmm10
|
|
psraw xmm2,2
|
|
movdqa xmm1,xmm13
|
|
movdqa xmm0,xmm13
|
|
pandn xmm0,xmm12
|
|
pand xmm1,xmm2
|
|
paddw xmm7,xmm9
|
|
por xmm1,xmm0
|
|
paddw xmm10,xmm14
|
|
paddw xmm7,xmm8 ; 2*q1 + q0 + p1, lo
|
|
movdqa xmm0,xmm13
|
|
packuswb xmm5,xmm1 ; xmm5 = new p0 column
|
|
paddw xmm7,xmm3
|
|
paddw xmm10,xmm11 ; 2*q1 + q0 + p1, hi
|
|
movdqa xmm1,xmm6
|
|
paddw xmm10,xmm3
|
|
pandn xmm6,xmm9
|
|
psraw xmm7,2
|
|
pand xmm1,xmm7
|
|
psraw xmm10,2
|
|
pandn xmm13,xmm14
|
|
pand xmm0,xmm10
|
|
por xmm1,xmm6
|
|
movdqa xmm6,[rsp] ; reload the original p1 column
|
|
movdqa xmm4,xmm6
|
|
por xmm0,xmm13
|
|
punpcklbw xmm4,xmm5
|
|
punpckhbw xmm6,xmm5
|
|
movdqa xmm3,xmm4
|
|
packuswb xmm1,xmm0 ; xmm1 = new q0 column
|
|
movdqa xmm0,xmm1
|
|
punpckhbw xmm1,xmm15
|
|
punpcklbw xmm0,xmm15
|
|
punpcklwd xmm3,xmm0
|
|
punpckhwd xmm4,xmm0
|
|
movdqa xmm0,xmm6
|
|
movdqa xmm2,xmm3
|
|
punpcklwd xmm0,xmm1
|
|
punpckhwd xmm6,xmm1
|
|
movdqa xmm1,xmm4
|
|
punpckldq xmm2,xmm0
|
|
punpckhdq xmm3,xmm0
|
|
punpckldq xmm1,xmm6
|
|
movdqa xmm0,xmm2
|
|
punpcklqdq xmm0,xmm1
|
|
punpckhdq xmm4,xmm6
|
|
punpckhqdq xmm2,xmm1
|
|
movdqa [rsp+10h],xmm0 ; rows 0 and 4 of both planes
|
|
movdqa [rsp+60h],xmm2 ; rows 1 and 5 of both planes
|
|
movdqa xmm0,xmm3
|
|
mov eax,[rsp+10h]
|
|
mov [rcx-2],eax ; store first plane, row 0
|
|
mov eax,[rsp+60h]
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm3,xmm4
|
|
mov [r10+rcx-2],eax ; store first plane, row 1
|
|
movdqa [rsp+20h],xmm0 ; rows 2 and 6 of both planes
|
|
mov eax, [rsp+20h]
|
|
movdqa [rsp+70h],xmm3 ; rows 3 and 7 of both planes
|
|
mov [rcx+r10*2-2],eax ; store first plane, row 2
|
|
mov eax,[rsp+70h]
|
|
mov [rdx+rcx-2],eax ; store first plane, row 3
|
|
mov eax,[rsp+18h]
|
|
mov [r11],eax ; store first plane, row 4
|
|
mov eax,[rsp+68h]
|
|
mov [r10+r11],eax ; store first plane, row 5
|
|
mov eax,[rsp+28h]
|
|
mov [r11+r10*2],eax ; store first plane, row 6
|
|
mov eax,[rsp+78h]
|
|
mov [rdx+r11],eax ; store first plane, row 7
|
|
mov eax,[rsp+14h]
|
|
mov [rdi-2],eax ; store second plane, row 0
|
|
mov eax,[rsp+64h]
|
|
mov [r10+rdi-2],eax ; store second plane, row 1
|
|
mov eax,[rsp+24h]
|
|
mov [rdi+r10*2-2],eax ; store second plane, row 2
|
|
mov eax, [rsp+74h]
|
|
mov [rdx+rdi-2],eax ; store second plane, row 3
|
|
mov eax, [rsp+1Ch]
|
|
mov [rbx],eax ; store second plane, row 4
|
|
mov eax, [rsp+6Ch]
|
|
mov [r10+rbx],eax ; store second plane, row 5
|
|
mov eax,[rsp+2Ch]
|
|
mov [rbx+r10*2],eax ; store second plane, row 6
|
|
mov eax,[rsp+7Ch]
|
|
mov [rdx+rbx],eax ; store second plane, row 7
|
|
lea rsp,[rsp+140h] ; drop the local frame
|
|
POP_XMM
|
|
mov rbx, [rsp+28h] ; reload rbx from entry rsp + 20h (rdi is still pushed)
|
|
pop rdi
|
|
ret
|
|
|
|
|
|
|
|
; DeblockChromaLt4H_ssse3 (variant placed before the "%elifdef UNIX64" branch)
;
; Filters across a vertical edge in two byte planes at the same time.
; Inputs, as they are consumed by the code below:
;   rcx  = pointer into the first plane  (copied to rdi)
;   rdx  = pointer into the second plane (copied to r12)
;   r8d  = row stride in bytes (sign-extended into rsi)
;   r9w  = 16-bit threshold compared against |P0-Q0|
;          (presumably iAlpha -- confirm against the C prototype)
;   stack: iBeta and pTC, read at the offsets already annotated below
;
; Naming used in these comments: for every row the four bytes at
; [ptr-2], [ptr-1], [ptr], [ptr+1] are called P1, P0, Q0, Q1.
;
; Outline:
;   1. Gather those 4 bytes from rows 0..7 of each plane through stack slots.
;   2. Byte/word/dword/qword unpacks transpose the data so that one register
;      holds one column: low 8 bytes = first plane, high 8 bytes = second.
;   3. On 16-bit lanes compute
;        delta = min(max(((Q0-P0)<<2) + (P1-Q1) + 4) >> 3, -tc), tc)
;      and zero it unless thr(r9w) > |P0-Q0|, iBeta > |P1-P0|,
;      iBeta > |Q1-Q0| and tc > 0.  Then P0 += delta, Q0 -= delta.
;   4. Pack to bytes (unsigned saturation), transpose back, and store
;      4 bytes per row to the same 16 addresses that were read.
;
; PUSH_XMM / POP_XMM are macros defined outside this section; presumably
; they save and restore xmm registers -- verify against their definition.
WELS_EXTERN DeblockChromaLt4H_ssse3
|
|
mov rax,rsp
|
|
push rbx
|
|
push rbp
|
|
push rsi
|
|
push rdi
|
|
push r12
|
|
PUSH_XMM 16
|
|
sub rsp,170h ; local scratch area addressed through rsp
|
|
|
|
movsxd rsi,r8d ; rsi = stride
|
|
lea eax,[r8*4]
|
|
mov r11d,r9d ; keep the 4th argument; broadcast into xmm12 later
|
|
movsxd r10,eax ; r10 = 4*stride (temporarily)
|
|
mov eax, [rcx-2] ; first plane, row 0
|
|
mov r12,rdx ; r12 = second plane pointer
|
|
mov [rsp+40h],eax
|
|
mov eax, [rsi+rcx-2] ; first plane, row 1
|
|
lea rbx,[r10+rcx-2] ; rbx = first plane, row 4, column -2
|
|
movdqa xmm5,[rsp+40h] ; only the low dword of each such load is meaningful
|
|
mov [rsp+50h],eax
|
|
mov eax, [rcx+rsi*2-2] ; first plane, row 2
|
|
lea rbp,[r10+rdx-2] ; rbp = second plane, row 4, column -2
|
|
movdqa xmm2, [rsp+50h]
|
|
mov [rsp+60h],eax
|
|
lea r10,[rsi+rsi*2] ; r10 = 3*stride from here on
|
|
mov rdi,rcx ; rdi = first plane pointer
|
|
mov eax,[r10+rcx-2] ; first plane, row 3
|
|
movdqa xmm4,[rsp+60h]
|
|
mov [rsp+70h],eax
|
|
mov eax,[rdx-2] ; second plane, row 0
|
|
mov [rsp+80h],eax
|
|
mov eax, [rsi+rdx-2] ; second plane, row 1
|
|
movdqa xmm3,[rsp+70h]
|
|
mov [rsp+90h],eax
|
|
mov eax,[rdx+rsi*2-2] ; second plane, row 2
|
|
punpckldq xmm5,[rsp+80h] ; pair row 0 of both planes
|
|
mov [rsp+0A0h],eax
|
|
mov eax, [r10+rdx-2] ; second plane, row 3
|
|
punpckldq xmm2,[rsp+90h] ; pair row 1 of both planes
|
|
mov [rsp+0B0h],eax
|
|
mov eax, [rbx] ; first plane, row 4
|
|
punpckldq xmm4,[rsp+0A0h] ; pair row 2 of both planes
|
|
mov [rsp+80h],eax
|
|
mov eax,[rbp] ; second plane, row 4
|
|
punpckldq xmm3,[rsp+0B0h] ; pair row 3 of both planes
|
|
mov [rsp+90h],eax
|
|
mov eax,[rsi+rbx] ; first plane, row 5
|
|
movdqa xmm0,[rsp+80h]
|
|
punpckldq xmm0,[rsp+90h]
|
|
punpcklqdq xmm5,xmm0 ; xmm5 = rows 0 and 4 of both planes
|
|
movdqa [rsp+80h],xmm0
|
|
mov [rsp+80h],eax
|
|
mov eax,[rsi+rbp] ; second plane, row 5
|
|
movdqa xmm0,[rsp+80h]
|
|
movdqa xmm1,xmm5
|
|
mov [rsp+90h],eax
|
|
mov eax,[rbx+rsi*2] ; first plane, row 6
|
|
punpckldq xmm0,[rsp+90h]
|
|
punpcklqdq xmm2,xmm0 ; xmm2 = rows 1 and 5 of both planes
|
|
punpcklbw xmm1,xmm2 ; start of the byte transpose
|
|
punpckhbw xmm5,xmm2
|
|
movdqa [rsp+80h],xmm0
|
|
mov [rsp+80h],eax
|
|
mov eax,[rbp+rsi*2] ; second plane, row 6
|
|
movdqa xmm0, [rsp+80h]
|
|
mov [rsp+90h],eax
|
|
mov eax,[r10+rbx] ; first plane, row 7
|
|
movdqa xmm7,xmm1
|
|
punpckldq xmm0,[rsp+90h]
|
|
punpcklqdq xmm4,xmm0 ; xmm4 = rows 2 and 6 of both planes
|
|
movdqa [rsp+80h],xmm0
|
|
mov [rsp+80h],eax
|
|
mov eax, [r10+rbp] ; second plane, row 7
|
|
movdqa xmm0,[rsp+80h]
|
|
mov [rsp+90h],eax
|
|
punpckldq xmm0,[rsp+90h]
|
|
punpcklqdq xmm3,xmm0 ; xmm3 = rows 3 and 7 of both planes
|
|
movdqa xmm0,xmm4
|
|
punpcklbw xmm0,xmm3
|
|
punpckhbw xmm4,xmm3
|
|
punpcklwd xmm7,xmm0
|
|
punpckhwd xmm1,xmm0
|
|
movdqa xmm0,xmm5
|
|
movdqa xmm6,xmm7
|
|
punpcklwd xmm0,xmm4
|
|
punpckhwd xmm5,xmm4
|
|
punpckldq xmm6,xmm0
|
|
punpckhdq xmm7,xmm0
|
|
movdqa xmm0,xmm1
|
|
punpckldq xmm0,xmm5
|
|
mov rax, [rsp+1C8h+160] ; pTC
|
|
punpckhdq xmm1,xmm5
|
|
movdqa xmm9,xmm6
|
|
punpckhqdq xmm6,xmm0 ; xmm6 = P0 column (both planes)
|
|
punpcklqdq xmm9,xmm0 ; xmm9 = P1 column (both planes)
|
|
movdqa xmm2,xmm7
|
|
movdqa xmm13,xmm6
|
|
movdqa xmm4,xmm9
|
|
movdqa [rsp+10h],xmm9 ; keep the P1 bytes for the transpose back
|
|
punpcklqdq xmm2,xmm1 ; xmm2 = Q0 column (both planes)
|
|
punpckhqdq xmm7,xmm1 ; xmm7 = Q1 column (both planes)
|
|
pxor xmm1,xmm1 ; xmm1 = 0, used for byte->word widening
|
|
movsx ecx,byte [rax+3] ; the four pTC bytes are sign-extended
|
|
movsx edx,byte [rax+2]
|
|
movsx r8d,byte [rax+1]
|
|
movsx r9d,byte [rax]
|
|
movdqa xmm10,xmm1
|
|
movdqa xmm15,xmm2
|
|
punpckhbw xmm2,xmm1 ; Q0, high half, as words
|
|
punpckhbw xmm6,xmm1 ; P0, high half, as words
|
|
punpcklbw xmm4,xmm1 ; P1, low half, as words
|
|
movsx eax,r11w
|
|
mov word [rsp+0Eh],cx ; build the tc vector: each pTC byte fills two lanes
|
|
mov word [rsp+0Ch],cx
|
|
movdqa xmm3,xmm7
|
|
movdqa xmm8,xmm7
|
|
movdqa [rsp+20h],xmm7 ; keep the Q1 bytes for the transpose back
|
|
punpcklbw xmm15,xmm1 ; Q0, low half, as words
|
|
punpcklbw xmm13,xmm1 ; P0, low half, as words
|
|
punpcklbw xmm3,xmm1 ; Q1, low half, as words
|
|
mov word [rsp+0Ah],dx
|
|
mov word [rsp+8],dx
|
|
mov word [rsp+6],r8w
|
|
movd xmm0,eax
|
|
movdqa [rsp+30h],xmm6
|
|
punpckhbw xmm9,xmm1 ; P1, high half, as words
|
|
punpckhbw xmm8,xmm1 ; Q1, high half, as words
|
|
punpcklwd xmm0,xmm0
|
|
movsx eax,word [rsp+1C0h+160] ; iBeta
|
|
mov word [rsp+4],r8w
|
|
mov word [rsp+2],r9w
|
|
pshufd xmm12,xmm0,0 ; xmm12 = r9w threshold in all 8 lanes
|
|
mov word [rsp],r9w
|
|
movd xmm0,eax
|
|
mov eax,4
|
|
cwde
|
|
movdqa xmm14, [rsp] ; xmm14 = tc vector
|
|
movdqa [rsp],xmm2 ; slot reused for Q0 high half
|
|
movdqa xmm2,xmm12
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm11,xmm0,0 ; xmm11 = iBeta in all 8 lanes
|
|
psubw xmm10,xmm14 ; xmm10 = -tc
|
|
movd xmm0,eax
|
|
movdqa xmm7,xmm14
|
|
movdqa xmm6,xmm14
|
|
pcmpgtw xmm7,xmm1 ; xmm7 = mask of lanes with tc > 0
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm5,xmm0,0 ; xmm5 = 4 in all 8 lanes (rounding term)
|
|
movdqa xmm0,xmm4
|
|
movdqa xmm1,xmm15
|
|
psubw xmm4,xmm13 ; P1 - P0
|
|
psubw xmm0,xmm3 ; P1 - Q1
|
|
psubw xmm1,xmm13 ; Q0 - P0
|
|
psubw xmm3,xmm15 ; Q1 - Q0
|
|
psllw xmm1,2
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm5
|
|
movdqa xmm0,xmm10
|
|
psraw xmm1,3
|
|
pmaxsw xmm0,xmm1 ; clamp below by -tc
|
|
pminsw xmm6,xmm0 ; clamp above by tc -> delta (low half)
|
|
movdqa xmm1,xmm11
|
|
movdqa xmm0,xmm13
|
|
psubw xmm0,xmm15
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm2,xmm0 ; threshold(r9w) > |P0-Q0|
|
|
pabsw xmm0,xmm4
|
|
pcmpgtw xmm1,xmm0 ; iBeta > |P1-P0|
|
|
pabsw xmm0,xmm3
|
|
pand xmm2,xmm1
|
|
movdqa xmm1,xmm11
|
|
movdqa xmm3,[rsp+30h] ; P0, high half
|
|
pcmpgtw xmm1,xmm0 ; iBeta > |Q1-Q0|
|
|
movdqa xmm0,xmm9
|
|
pand xmm2,xmm1
|
|
psubw xmm0,xmm8
|
|
psubw xmm9,xmm3
|
|
pand xmm2,xmm7
|
|
pand xmm6,xmm2 ; delta is zeroed on lanes that fail any test
|
|
psubw xmm15,xmm6 ; Q0 -= delta (low half)
|
|
paddw xmm13,xmm6 ; P0 += delta (low half)
|
|
movdqa xmm2,[rsp] ; Q0, high half
|
|
movdqa xmm1,xmm2
|
|
psubw xmm1,xmm3
|
|
psubw xmm8,xmm2
|
|
psllw xmm1,2
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm5
|
|
movdqa xmm0,xmm3
|
|
movdqa xmm5,[rsp+10h] ; reload the P1 bytes
|
|
psubw xmm0,xmm2
|
|
psraw xmm1,3
|
|
movdqa xmm4,xmm5
|
|
pabsw xmm0,xmm0
|
|
pmaxsw xmm10,xmm1
|
|
movdqa xmm1,xmm11
|
|
pcmpgtw xmm12,xmm0
|
|
pabsw xmm0,xmm9
|
|
pminsw xmm14,xmm10 ; delta (high half)
|
|
pcmpgtw xmm1,xmm0
|
|
pabsw xmm0,xmm8
|
|
pcmpgtw xmm11,xmm0
|
|
pand xmm12,xmm1
|
|
movdqa xmm1,[rsp+20h] ; reload the Q1 bytes
|
|
pand xmm12,xmm11
|
|
pand xmm12,xmm7
|
|
pand xmm14,xmm12
|
|
paddw xmm3,xmm14 ; P0 += delta (high half)
|
|
psubw xmm2,xmm14 ; Q0 -= delta (high half)
|
|
packuswb xmm13,xmm3 ; new P0 column as bytes
|
|
packuswb xmm15,xmm2 ; new Q0 column as bytes
|
|
punpcklbw xmm4,xmm13 ; transpose back: interleave P1 with new P0
|
|
punpckhbw xmm5,xmm13
|
|
movdqa xmm0,xmm15
|
|
punpcklbw xmm0,xmm1 ; interleave new Q0 with Q1
|
|
punpckhbw xmm15,xmm1
|
|
movdqa xmm3,xmm4
|
|
punpcklwd xmm3,xmm0
|
|
punpckhwd xmm4,xmm0
|
|
movdqa xmm0,xmm5
|
|
movdqa xmm2,xmm3
|
|
movdqa xmm1,xmm4
|
|
punpcklwd xmm0,xmm15
|
|
punpckhwd xmm5,xmm15
|
|
punpckldq xmm2,xmm0
|
|
punpckhdq xmm3,xmm0
|
|
punpckldq xmm1,xmm5
|
|
movdqa xmm0,xmm2
|
|
punpcklqdq xmm0,xmm1
|
|
punpckhdq xmm4,xmm5
|
|
punpckhqdq xmm2,xmm1
|
|
movdqa [rsp+40h],xmm0 ; rows 0 and 4 (both planes)
|
|
movdqa xmm0,xmm3
|
|
movdqa [rsp+90h],xmm2 ; rows 1 and 5 (both planes)
|
|
mov eax,[rsp+40h]
|
|
mov [rdi-2],eax ; first plane, row 0
|
|
mov eax, [rsp+90h]
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm3,xmm4
|
|
mov [rsi+rdi-2],eax ; first plane, row 1
|
|
movdqa [rsp+50h],xmm0 ; rows 2 and 6 (both planes)
|
|
mov eax,[rsp+50h]
|
|
movdqa [rsp+0A0h],xmm3 ; rows 3 and 7 (both planes)
|
|
mov [rdi+rsi*2-2],eax ; first plane, row 2
|
|
mov eax,[rsp+0A0h]
|
|
mov [r10+rdi-2],eax ; first plane, row 3
|
|
mov eax,[rsp+48h]
|
|
mov [rbx],eax ; first plane, row 4
|
|
mov eax,[rsp+98h]
|
|
mov [rsi+rbx],eax ; first plane, row 5
|
|
mov eax,[rsp+58h]
|
|
mov [rbx+rsi*2],eax ; first plane, row 6
|
|
mov eax, [rsp+0A8h]
|
|
mov [r10+rbx],eax ; first plane, row 7
|
|
mov eax, [rsp+44h]
|
|
mov [r12-2],eax ; second plane, row 0
|
|
mov eax,[rsp+94h]
|
|
mov [rsi+r12-2],eax ; second plane, row 1
|
|
mov eax,[rsp+54h]
|
|
mov [r12+rsi*2-2],eax ; second plane, row 2
|
|
mov eax, [rsp+0A4h]
|
|
mov [r10+r12-2],eax ; second plane, row 3
|
|
mov eax,[rsp+4Ch]
|
|
mov [rbp],eax ; second plane, row 4
|
|
mov eax,[rsp+9Ch]
|
|
mov [rsi+rbp],eax ; second plane, row 5
|
|
mov eax, [rsp+5Ch]
|
|
mov [rbp+rsi*2],eax ; second plane, row 6
|
|
mov eax,[rsp+0ACh]
|
|
mov [r10+rbp],eax ; second plane, row 7
|
|
lea r11,[rsp+170h] ; undo "sub rsp,170h"
|
|
mov rsp,r11
|
|
POP_XMM
|
|
pop r12
|
|
pop rdi
|
|
pop rsi
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
|
|
|
|
|
|
%elifdef UNIX64
|
|
|
|
|
|
; DeblockLumaLt4V_ssse3 (UNIX64 branch)
;
; Filters 16 pixels across a horizontal edge of one byte plane.
; Inputs, as they are consumed by the code below:
;   rdi = pointer to the first row below the edge; rows are accessed with
;         movdqa, so every row address must be 16-byte aligned
;   esi = row stride in bytes
;   edx = threshold compared against |Q0-P0| (presumably iAlpha -- confirm)
;   ecx = threshold compared against the other differences
;         (presumably iBeta -- confirm)
;   r8  = pTC, four signed bytes; each one is applied to 4 adjacent pixels
;
; Naming used in these comments (s = stride):
;   P2=[rdi-3s] P1=[rdi-2s] P0=[rdi-s] Q0=[rdi] Q1=[rdi+s] Q2=[rdi+2s]
;
; Per 16-bit lane, low 8 pixels first and then the high 8 pixels:
;   tc'   = tc + (beta > |P0-P2|) + (beta > |Q0-Q2|)
;   delta = min(max((((Q0-P0)<<2) + (P1-Q1) + 4) >> 3, -tc'), tc')
;   mask  = alpha > |Q0-P0|, beta > |Q0-Q1|, beta > |P0-P1|, tc >= 0
;   P0 += delta & mask,  Q0 -= delta & mask
;   P1 += clamp((P2 + avg(P0,Q0) - 2*P1) >> 1, -tc, tc) where beta > |P0-P2|
;   Q1 += clamp((Q2 + avg(P0,Q0) - 2*Q1) >> 1, -tc, tc) where beta > |Q0-Q2|
; Rows P1, P0, Q0 and Q1 are written back; P2 and Q2 are only read.
;
; FOUR_16B_SSE2 is defined outside this section; presumably eight words of 4.
;
; NOTE(review): xmm6..xmm15 are stored to [rbp+0h..90h] but none of those
; slots is read again inside this function; only r12 is restored at the end.
WELS_EXTERN DeblockLumaLt4V_ssse3
|
|
push rbp
|
|
mov r11,r8 ; pTC
|
|
sub rsp,1B0h
|
|
lea rbp,[rsp+20h] ; rbp = base for all locals below
|
|
movd xmm4,edx ; 3rd argument, broadcast later
|
|
movd xmm2,ecx ; 4th argument, broadcast into xmm0 below
|
|
mov qword [rbp+180h],r12 ; r12 is reloaded from here before returning
|
|
mov r10,rdi
|
|
movsxd r12,esi ; r12 = stride
|
|
add rsi,rsi
|
|
movsxd rdx,esi ; rdx = 2*stride
|
|
sub r10,r12 ; r10 = address of row P0
|
|
movsx r8d,byte [r11]
|
|
pxor xmm3,xmm3 ; xmm3 = 0 for byte->word widening
|
|
punpcklwd xmm2,xmm2
|
|
movaps [rbp+50h],xmm14
|
|
lea rax,[r12+r12*2]
|
|
movdqa xmm14,[rdx+rdi] ; row Q2
|
|
neg rax ; rax = -3*stride
|
|
pshufd xmm0,xmm2,0 ; xmm0 = 4th argument in all 8 lanes
|
|
movd xmm2,r8d
|
|
movsx rsi,byte [r11+1]
|
|
movsx r8d,byte [r11+2]
|
|
movsx r11d,byte [r11+3]
|
|
movaps [rbp+70h],xmm12
|
|
movd xmm1,esi
|
|
movaps [rbp+80h],xmm11
|
|
movd xmm12,r8d
|
|
movd xmm11,r11d
|
|
movdqa xmm5, [rax+rdi] ; row P2
|
|
lea rax,[r12+r12]
|
|
punpcklwd xmm12,xmm12
|
|
neg rax ; rax = -2*stride from here on
|
|
punpcklwd xmm11,xmm11
|
|
movaps [rbp],xmm8
|
|
movdqa xmm8, [r10] ; row P0
|
|
punpcklwd xmm2,xmm2
|
|
punpcklwd xmm1,xmm1
|
|
punpcklqdq xmm12,xmm12
|
|
punpcklqdq xmm11,xmm11
|
|
punpcklqdq xmm2,xmm2
|
|
punpcklqdq xmm1,xmm1
|
|
shufps xmm12,xmm11,88h ; xmm12 = tc for pixels 8..15 (pTC[2] x4, pTC[3] x4)
|
|
movdqa xmm11,xmm8
|
|
movaps [rbp+30h],xmm9
|
|
movdqa xmm9,[rdi] ; row Q0
|
|
shufps xmm2,xmm1,88h ; xmm2 = tc for pixels 0..7 (pTC[0] x4, pTC[1] x4)
|
|
movdqa xmm1,xmm5
|
|
punpcklbw xmm11,xmm3 ; P0, low half, as words
|
|
movaps [rbp+20h],xmm6
|
|
movaps [rbp+60h],xmm13
|
|
movdqa xmm13,xmm11
|
|
movaps [rbp+90h],xmm10
|
|
movdqa xmm10,xmm9
|
|
movdqa xmm6,[rax+rdi] ; row P1
|
|
punpcklbw xmm1,xmm3 ; P2, low half, as words
|
|
movaps [rbp+0A0h],xmm12 ; tc for the high half, reloaded into xmm11 later
|
|
psubw xmm13,xmm1 ; P0 - P2
|
|
movaps [rbp+40h],xmm15
|
|
movdqa xmm15,xmm14
|
|
movaps [rbp+10h],xmm7
|
|
movdqa xmm7,xmm6
|
|
punpcklbw xmm10,xmm3 ; Q0, low half, as words
|
|
movdqa xmm12,[r12+rdi] ; row Q1
|
|
punpcklbw xmm7,xmm3 ; P1, low half, as words
|
|
punpcklbw xmm12,xmm3 ; Q1, low half, as words
|
|
punpcklbw xmm15,xmm3 ; Q2, low half, as words
|
|
pabsw xmm3,xmm13
|
|
movdqa xmm13,xmm10
|
|
psubw xmm13,xmm15 ; Q0 - Q2
|
|
movdqa [rbp+0F0h],xmm15
|
|
pabsw xmm15,xmm13
|
|
movdqa xmm13,xmm11
|
|
movdqa [rbp+0B0h],xmm1
|
|
movdqa xmm1,xmm0
|
|
pavgw xmm13,xmm10 ; avg(P0,Q0), rounded up
|
|
pcmpgtw xmm1,xmm3 ; beta > |P0-P2|
|
|
movdqa [rbp+120h],xmm13
|
|
movaps xmm13,xmm2
|
|
punpcklwd xmm4,xmm4
|
|
movdqa xmm3,xmm0
|
|
movdqa [rbp+100h],xmm1
|
|
psubw xmm13,xmm1 ; subtracting an all-ones mask adds 1 to tc
|
|
movdqa xmm1,xmm10
|
|
pcmpgtw xmm3,xmm15 ; beta > |Q0-Q2|
|
|
pshufd xmm4,xmm4,0 ; xmm4 = 3rd argument in all 8 lanes
|
|
psubw xmm1,xmm11 ; Q0 - P0
|
|
movdqa [rbp+0D0h],xmm10
|
|
psubw xmm13,xmm3 ; xmm13 = tc'
|
|
movdqa [rbp+110h],xmm3
|
|
pabsw xmm15,xmm1
|
|
movdqa xmm3,xmm4
|
|
psubw xmm10,xmm12 ; Q0 - Q1
|
|
pcmpgtw xmm3,xmm15 ; alpha > |Q0-P0|
|
|
pabsw xmm15,xmm10
|
|
movdqa xmm10,xmm0
|
|
psllw xmm1,2 ; (Q0-P0) << 2
|
|
movdqa [rbp+0C0h],xmm11
|
|
psubw xmm11,xmm7 ; P0 - P1
|
|
pcmpgtw xmm10,xmm15 ; beta > |Q0-Q1|
|
|
pabsw xmm11,xmm11
|
|
movdqa xmm15,xmm0
|
|
pand xmm3,xmm10
|
|
pcmpgtw xmm15,xmm11 ; beta > |P0-P1|
|
|
movaps xmm11,xmm2
|
|
pxor xmm10,xmm10
|
|
pand xmm3,xmm15
|
|
pcmpgtw xmm11,xmm10 ; tc > 0
|
|
pcmpeqw xmm10,xmm2 ; tc == 0
|
|
por xmm11,xmm10 ; tc >= 0
|
|
pand xmm3,xmm11 ; xmm3 = filter mask for the low half
|
|
movdqa xmm11,xmm7
|
|
psubw xmm11,xmm12 ; P1 - Q1
|
|
pxor xmm15,xmm15
|
|
paddw xmm11,xmm1
|
|
psubw xmm15,xmm13 ; -tc'
|
|
movdqa [rbp+0E0h],xmm12
|
|
paddw xmm11,[FOUR_16B_SSE2]
|
|
pxor xmm12,xmm12 ; xmm12 = 0 for widening the high halves
|
|
psraw xmm11,3
|
|
punpckhbw xmm8,xmm12 ; P0, high half, as words
|
|
pmaxsw xmm15,xmm11
|
|
punpckhbw xmm5,xmm12 ; P2, high half, as words
|
|
movdqa xmm11,xmm8
|
|
pminsw xmm13,xmm15 ; delta, low half, clamped to [-tc', tc']
|
|
psubw xmm11,xmm5
|
|
punpckhbw xmm9,xmm12 ; Q0, high half, as words
|
|
pand xmm13,xmm3
|
|
movdqa [rbp+130h],xmm13 ; masked delta for the low half
|
|
pabsw xmm13,xmm11
|
|
punpckhbw xmm14,xmm12 ; Q2, high half, as words
|
|
movdqa xmm11,xmm9
|
|
psubw xmm11,xmm14
|
|
movdqa xmm15,xmm0
|
|
movdqa [rbp+140h],xmm14
|
|
pabsw xmm14,xmm11
|
|
movdqa xmm11,xmm8
|
|
pcmpgtw xmm15,xmm14 ; beta > |Q0-Q2| (high half)
|
|
movdqa xmm1,[r12+rdi] ; row Q1 again, for the high half
|
|
pavgw xmm11,xmm9
|
|
movdqa [rbp+170h],xmm11 ; avg(P0,Q0), high half
|
|
movdqa xmm10,xmm9
|
|
punpckhbw xmm6,xmm12 ; P1, high half, as words
|
|
psubw xmm10,xmm8
|
|
punpckhbw xmm1,xmm12 ; Q1, high half, as words
|
|
movdqa xmm12,xmm0
|
|
movaps xmm11,[rbp+0A0h] ; tc for the high half
|
|
pcmpgtw xmm12,xmm13 ; beta > |P0-P2| (high half)
|
|
movaps xmm13,xmm11
|
|
psubw xmm13,xmm12
|
|
movdqa [rbp+160h],xmm15
|
|
psubw xmm13,xmm15 ; tc' for the high half
|
|
movdqa xmm15,xmm9
|
|
psubw xmm15,xmm1
|
|
movdqa [rbp+150h],xmm12
|
|
pabsw xmm12,xmm10
|
|
pabsw xmm14,xmm15
|
|
movdqa xmm15,xmm8
|
|
pcmpgtw xmm4,xmm12 ; alpha > |Q0-P0| (high half)
|
|
movdqa xmm12,xmm0
|
|
psubw xmm15,xmm6
|
|
pcmpgtw xmm12,xmm14
|
|
pabsw xmm14,xmm15
|
|
psllw xmm10,2
|
|
pcmpgtw xmm0,xmm14
|
|
movdqa xmm14,xmm6
|
|
psubw xmm14,xmm1
|
|
pand xmm4,xmm12
|
|
paddw xmm14,xmm10
|
|
pand xmm4,xmm0
|
|
paddw xmm14,[FOUR_16B_SSE2]
|
|
pxor xmm15,xmm15
|
|
movaps xmm12,xmm11
|
|
psubw xmm15,xmm13
|
|
pxor xmm0,xmm0
|
|
psraw xmm14,3
|
|
pcmpgtw xmm12,xmm0
|
|
pcmpeqw xmm0,xmm11
|
|
pmaxsw xmm15,xmm14
|
|
por xmm12,xmm0
|
|
movdqa xmm0,[rbp+120h] ; avg(P0,Q0), low half
|
|
pminsw xmm13,xmm15 ; delta, high half
|
|
movdqa xmm15,[rbp+0B0h] ; P2, low half
|
|
movdqa xmm10,xmm7
|
|
pand xmm4,xmm12 ; xmm4 = filter mask for the high half
|
|
paddw xmm15,xmm0
|
|
pxor xmm12,xmm12
|
|
paddw xmm10,xmm7 ; 2*P1
|
|
movdqa xmm14,xmm12
|
|
psubw xmm15,xmm10
|
|
psubw xmm14,xmm2 ; -tc, low half
|
|
psraw xmm15,1
|
|
pmaxsw xmm15,xmm14
|
|
movdqa xmm10,xmm6
|
|
pminsw xmm15,xmm2
|
|
paddw xmm10,xmm6
|
|
pand xmm15,xmm3
|
|
psubw xmm12,xmm11 ; -tc, high half
|
|
pand xmm15,[rbp+100h]
|
|
pand xmm13,xmm4
|
|
paddw xmm7,xmm15 ; new P1, low half
|
|
paddw xmm8,xmm13 ; new P0, high half
|
|
movdqa xmm15,[rbp+170h]
|
|
psubw xmm9,xmm13 ; new Q0, high half
|
|
paddw xmm5,xmm15
|
|
psubw xmm5,xmm10
|
|
psraw xmm5,1
|
|
pmaxsw xmm5,xmm12
|
|
pminsw xmm5,xmm11
|
|
pand xmm5,xmm4
|
|
pand xmm5,[rbp+150h]
|
|
paddw xmm6,xmm5 ; new P1, high half
|
|
movdqa xmm5,[rbp+0C0h]
|
|
packuswb xmm7,xmm6 ; new P1 row as bytes
|
|
movdqa xmm6,[rbp+130h]
|
|
paddw xmm5,xmm6 ; new P0, low half
|
|
packuswb xmm5,xmm8 ; new P0 row as bytes
|
|
movdqa xmm8,[rbp+0D0h]
|
|
psubw xmm8,xmm6 ; new Q0, low half
|
|
movdqa xmm6,[rbp+0F0h]
|
|
paddw xmm6,xmm0
|
|
movdqa xmm0,[rbp+0E0h]
|
|
packuswb xmm8,xmm9 ; new Q0 row as bytes
|
|
movdqa xmm9,xmm0
|
|
paddw xmm9,xmm0 ; 2*Q1
|
|
psubw xmm6,xmm9
|
|
psraw xmm6,1
|
|
pmaxsw xmm14,xmm6
|
|
pminsw xmm2,xmm14
|
|
pand xmm2,xmm3
|
|
pand xmm2,[rbp+110h]
|
|
paddw xmm0,xmm2 ; new Q1, low half
|
|
movdqa xmm2,[rbp+140h]
|
|
paddw xmm2,xmm15
|
|
movdqa xmm15,xmm1
|
|
paddw xmm15,xmm1
|
|
psubw xmm2,xmm15
|
|
psraw xmm2,1
|
|
pmaxsw xmm12,xmm2
|
|
pminsw xmm11,xmm12
|
|
pand xmm11,xmm4
|
|
pand xmm11,[rbp+160h]
|
|
paddw xmm1,xmm11 ; new Q1, high half
|
|
movdqa [rax+rdi],xmm7 ; store row P1
|
|
movdqa [r10],xmm5 ; store row P0
|
|
packuswb xmm0,xmm1 ; new Q1 row as bytes
|
|
movdqa [rdi],xmm8 ; store row Q0
|
|
movdqa [r12+rdi],xmm0 ; store row Q1
|
|
mov r12,qword [rbp+180h]
|
|
lea rsp,[rbp+190h] ; rbp+190h = rsp+1B0h, undoing the sub above
|
|
pop rbp
|
|
ret
|
|
|
|
|
|
; DeblockLumaEq4V_ssse3 (UNIX64 branch)
;
; Filters 16 pixels across a horizontal edge of one byte plane, rewriting up
; to three rows on each side of the edge.
; The first instructions move the UNIX64 argument registers into the ones
; the rest of the body expects:
;   rdi -> rcx -> rbp : pointer to the first row below the edge; rows are
;                       accessed with movdqa, so they must be 16-byte aligned
;   esi -> edx -> r10 : row stride in bytes (sign-extended)
;   edx -> r8d -> r11w: threshold compared against |Q0-P0|
;                       (presumably iAlpha -- confirm)
;   ecx -> r9w        : threshold compared against the other differences
;                       (presumably iBeta -- confirm)
;
; Naming used in these comments (s = stride):
;   P3=[rbp-4s] P2=[rbp-3s] P1=[rbp-2s] P0=[rbp-s]
;   Q0=[rbp]    Q1=[rbp+s]  Q2=[rbp+2s] Q3=[rbp+3s]
;
; Masks built per 16-bit lane (low 8 pixels first, then the high 8):
;   edge  = beta > |P0-P1|  and  beta > |Q0-Q1|  and  alpha > |Q0-P0|
;   small = ((alpha >> 2) + 2) > |Q0-P0|
;   sideP = beta > |P0-P2| and small,   sideQ = beta > |Q0-Q2| and small
; Candidate values are rounded sums of neighbouring rows shifted right by 2
; or 3; pand/pandn/por select between candidates and the original pixels.
; Rows P2, P1, P0, Q0, Q1, Q2 are stored; P3 and Q3 are only read.
;
; NOTE(review): xmm6..xmm15 are stored relative to rax (rsp at entry) and
; reloaded relative to r11 = rsp+1D8h.  Two 8-byte pushes precede the
; "sub rsp,1D8h", so r11 = rax-10h and each reload ([r11-18h] for xmm6, ...)
; addresses a slot 10h above the matching store ([rax-38h] = [r11-28h], ...).
; In addition [rax-0C8h] is the same address as the local [rsp+120h].
; This looks harmless if xmm6..xmm15 are caller-saved under the UNIX64 ABI
; -- confirm before relying on these registers being preserved.
WELS_EXTERN DeblockLumaEq4V_ssse3
|
|
mov rax,rsp ; rax = rsp at entry, base for the xmm stores below
|
|
push rbx
|
|
push rbp
|
|
mov r8, rdx
|
|
mov r9, rcx
|
|
mov rcx, rdi
|
|
mov rdx, rsi
|
|
sub rsp,1D8h
|
|
movaps [rax-38h],xmm6
|
|
movaps [rax-48h],xmm7
|
|
movaps [rax-58h],xmm8
|
|
pxor xmm1,xmm1 ; xmm1 = 0 for byte->word widening
|
|
movsxd r10,edx ; r10 = stride
|
|
mov rbp,rcx ; rbp = row Q0
|
|
mov r11d,r8d
|
|
mov rdx,rcx
|
|
mov rdi,rbp
|
|
mov rbx,rbp
|
|
movdqa xmm5,[rbp] ; row Q0
|
|
movaps [rax-68h],xmm9
|
|
movaps [rax-78h],xmm10
|
|
punpcklbw xmm5,xmm1 ; Q0, low half, as words
|
|
movaps [rax-88h],xmm11
|
|
movaps [rax-98h],xmm12
|
|
movaps [rax-0A8h],xmm13
|
|
movaps [rax-0B8h],xmm14
|
|
movdqa xmm14,[r10+rbp] ; row Q1
|
|
movaps [rax-0C8h],xmm15
|
|
lea eax,[r10*4] ; rax is no longer the entry rsp from here on
|
|
movsxd r8,eax
|
|
lea eax,[r10+r10*2]
|
|
movsxd rcx,eax ; rcx = 3*stride
|
|
lea eax,[r10+r10]
|
|
sub rdx,r8 ; rdx = row P3
|
|
punpcklbw xmm14,xmm1 ; Q1, low half, as words
|
|
movdqa [rsp+90h],xmm5
|
|
movdqa [rsp+30h],xmm14
|
|
movsxd rsi,eax ; rsi = 2*stride
|
|
movsx eax,r11w
|
|
sub rdi,rcx ; rdi = row P2
|
|
sub rbx,rsi ; rbx = row P1
|
|
mov r8,rbp
|
|
sub r8,r10 ; r8 = row P0
|
|
movd xmm0,eax
|
|
movsx eax,r9w
|
|
movdqa xmm12,[rdi] ; row P2
|
|
movdqa xmm6, [rsi+rbp] ; row Q2
|
|
movdqa xmm13,[rbx] ; row P1
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm11,xmm0,0 ; xmm11 = r11w threshold in all 8 lanes
|
|
punpcklbw xmm13,xmm1
|
|
punpcklbw xmm6,xmm1
|
|
movdqa xmm8,[r8] ; row P0
|
|
movd xmm0,eax
|
|
movdqa xmm10,xmm11
|
|
mov eax,2
|
|
punpcklbw xmm8,xmm1
|
|
punpcklbw xmm12,xmm1
|
|
cwde
|
|
punpcklwd xmm0,xmm0
|
|
psraw xmm10,2
|
|
movdqa xmm1,xmm8
|
|
movdqa [rsp+0F0h],xmm13
|
|
movdqa [rsp+0B0h],xmm8
|
|
pshufd xmm7,xmm0,0 ; xmm7 = r9w threshold in all 8 lanes
|
|
psubw xmm1,xmm13 ; P0 - P1
|
|
movdqa xmm0,xmm5
|
|
movdqa xmm4,xmm7
|
|
movdqa xmm2,xmm7
|
|
psubw xmm0,xmm8 ; Q0 - P0
|
|
pabsw xmm3,xmm0 ; xmm3 = |Q0-P0|
|
|
pabsw xmm0,xmm1
|
|
movdqa xmm1,xmm5
|
|
movdqa [rsp+40h],xmm7
|
|
movdqa [rsp+60h],xmm6
|
|
pcmpgtw xmm4,xmm0 ; beta > |P0-P1|
|
|
psubw xmm1,xmm14 ; Q0 - Q1
|
|
pabsw xmm0,xmm1
|
|
pcmpgtw xmm2,xmm0 ; beta > |Q0-Q1|
|
|
pand xmm4,xmm2
|
|
movdqa xmm0,xmm11
|
|
pcmpgtw xmm0,xmm3 ; alpha > |Q0-P0|
|
|
pand xmm4,xmm0
|
|
movd xmm0,eax
|
|
movdqa [rsp+20h],xmm4 ; "edge" mask for the low half
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm2,xmm0,0 ; xmm2 = 2 in all 8 lanes
|
|
paddw xmm10,xmm2 ; xmm10 = (alpha >> 2) + 2
|
|
movdqa [rsp+0A0h],xmm2
|
|
movdqa xmm15,xmm7
|
|
pxor xmm4,xmm4
|
|
movdqa xmm0,xmm8
|
|
psubw xmm0,xmm12 ; P0 - P2
|
|
mov eax,4
|
|
pabsw xmm0,xmm0
|
|
movdqa xmm1,xmm10
|
|
cwde
|
|
pcmpgtw xmm15,xmm0 ; beta > |P0-P2|
|
|
pcmpgtw xmm1,xmm3 ; "small" test
|
|
movdqa xmm3,xmm7
|
|
movdqa xmm7,[rdx] ; row P3
|
|
movdqa xmm0,xmm5
|
|
psubw xmm0,xmm6 ; Q0 - Q2
|
|
pand xmm15,xmm1 ; xmm15 = sideP mask, low half
|
|
punpcklbw xmm7,xmm4
|
|
movdqa xmm9,xmm15
|
|
pabsw xmm0,xmm0
|
|
psllw xmm7,1
|
|
pandn xmm9,xmm12
|
|
pcmpgtw xmm3,xmm0 ; beta > |Q0-Q2|
|
|
paddw xmm7,xmm12
|
|
movd xmm0,eax
|
|
pand xmm3,xmm1 ; xmm3 = sideQ mask, low half
|
|
paddw xmm7,xmm12
|
|
punpcklwd xmm0,xmm0
|
|
paddw xmm7,xmm12
|
|
pshufd xmm1,xmm0,0 ; xmm1 = 4 in all 8 lanes
|
|
paddw xmm7,xmm13
|
|
movdqa xmm0,xmm3
|
|
pandn xmm0,xmm6
|
|
paddw xmm7,xmm8
|
|
movdqa [rsp+70h],xmm1
|
|
paddw xmm7,xmm5
|
|
movdqa [rsp+120h],xmm0
|
|
movdqa xmm0,[rcx+rbp] ; row Q3
|
|
punpcklbw xmm0,xmm4
|
|
paddw xmm7,xmm1
|
|
movdqa xmm4,xmm15
|
|
psllw xmm0,1
|
|
psraw xmm7,3 ; (2*P3 + 3*P2 + P1 + P0 + Q0 + 4) >> 3
|
|
paddw xmm0,xmm6
|
|
pand xmm7,xmm15
|
|
paddw xmm0,xmm6
|
|
paddw xmm0,xmm6
|
|
paddw xmm0,xmm14
|
|
movdqa xmm6,xmm15
|
|
paddw xmm0,xmm5
|
|
pandn xmm6,xmm13
|
|
paddw xmm0,xmm8
|
|
paddw xmm0,xmm1
|
|
psraw xmm0,3 ; (2*Q3 + 3*Q2 + Q1 + Q0 + P0 + 4) >> 3
|
|
movdqa xmm1,xmm12
|
|
paddw xmm1,xmm13
|
|
pand xmm0,xmm3
|
|
movdqa [rsp+100h],xmm0
|
|
movdqa xmm0,xmm8
|
|
paddw xmm0,xmm5
|
|
paddw xmm1,xmm0
|
|
movdqa xmm0,xmm3
|
|
paddw xmm1,xmm2
|
|
psraw xmm1,2 ; (P2 + P1 + P0 + Q0 + 2) >> 2
|
|
pandn xmm0,xmm14
|
|
pand xmm4,xmm1
|
|
movdqa [rsp+0E0h],xmm0
|
|
movdqa xmm0,xmm5
|
|
paddw xmm0,xmm8
|
|
movdqa xmm1,[rsp+60h]
|
|
paddw xmm1,xmm14
|
|
movdqa xmm14,xmm3
|
|
paddw xmm1,xmm0
|
|
movdqa xmm0,xmm8
|
|
paddw xmm0,[rsp+30h]
|
|
paddw xmm1,xmm2
|
|
psraw xmm1,2 ; (Q2 + Q1 + Q0 + P0 + 2) >> 2
|
|
pand xmm14,xmm1
|
|
movdqa xmm1,xmm13
|
|
paddw xmm1,xmm13
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm2
|
|
psraw xmm1,2 ; (2*P1 + P0 + Q1 + 2) >> 2
|
|
movdqa xmm0,[rsp+30h]
|
|
movdqa xmm2,xmm13
|
|
movdqa xmm5,xmm15
|
|
paddw xmm0,[rsp+70h]
|
|
pandn xmm5,xmm1
|
|
paddw xmm2,xmm8
|
|
movdqa xmm8,[rsp+90h]
|
|
movdqa xmm1,xmm12
|
|
paddw xmm2,xmm8
|
|
psllw xmm2,1
|
|
paddw xmm2,xmm0
|
|
paddw xmm1,xmm2
|
|
movdqa xmm0,xmm8
|
|
movdqa xmm8,xmm3
|
|
movdqa xmm2,[rsp+30h]
|
|
paddw xmm0,xmm13
|
|
psraw xmm1,3 ; (P2 + 2*P1 + 2*P0 + 2*Q0 + Q1 + 4) >> 3
|
|
pand xmm15,xmm1
|
|
movdqa xmm1,xmm2
|
|
paddw xmm1,xmm2
|
|
paddw xmm2,[rsp+90h]
|
|
paddw xmm2,[rsp+0B0h]
|
|
paddw xmm1,xmm0
|
|
movdqa xmm0,xmm13
|
|
movdqa xmm13,[r8] ; row P0 again, for the high half
|
|
paddw xmm0, [rsp+70h]
|
|
paddw xmm1, [rsp+0A0h]
|
|
psllw xmm2,1
|
|
paddw xmm2,xmm0
|
|
psraw xmm1,2 ; (2*Q1 + Q0 + P1 + 2) >> 2
|
|
movdqa xmm0, [rdi] ; row P2 again, for the high half
|
|
pandn xmm8,xmm1
|
|
movdqa xmm1, [rsp+60h]
|
|
paddw xmm1,xmm2
|
|
movdqa xmm2, [rbx] ; row P1 again, for the high half
|
|
psraw xmm1,3 ; (Q2 + 2*Q1 + 2*Q0 + 2*P0 + P1 + 4) >> 3
|
|
pand xmm3,xmm1
|
|
movdqa xmm1, [rbp] ; row Q0 again, for the high half
|
|
movdqa [rsp+0D0h],xmm3
|
|
pxor xmm3,xmm3
|
|
punpckhbw xmm0,xmm3
|
|
punpckhbw xmm1,xmm3
|
|
punpckhbw xmm13,xmm3
|
|
movdqa [rsp+0C0h],xmm0 ; P2, high half
|
|
movdqa xmm0,[r10+rbp] ; row Q1 again
|
|
movdqa [rsp],xmm1 ; Q0, high half
|
|
punpckhbw xmm0,xmm3
|
|
punpckhbw xmm2,xmm3
|
|
movdqa [rsp+80h],xmm0 ; Q1, high half
|
|
movdqa xmm0,[rsi+rbp] ; row Q2 again
|
|
movdqa [rsp+10h],xmm13 ; P0, high half
|
|
punpckhbw xmm0,xmm3
|
|
movdqa [rsp+50h],xmm0 ; Q2, high half
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm1,xmm13
|
|
psubw xmm0,xmm13
|
|
psubw xmm1,xmm2
|
|
pabsw xmm3,xmm0 ; |Q0-P0|, high half
|
|
pabsw xmm0,xmm1
|
|
movdqa xmm1,[rsp]
|
|
movdqa xmm13,[rsp+40h] ; beta vector
|
|
movdqa [rsp+110h],xmm2 ; P1, high half
|
|
psubw xmm1, [rsp+80h]
|
|
pcmpgtw xmm13,xmm0
|
|
pcmpgtw xmm11,xmm3
|
|
pabsw xmm0,xmm1
|
|
pcmpgtw xmm10,xmm3 ; "small" test, high half
|
|
movdqa xmm1, [rsp+40h]
|
|
movdqa xmm2,xmm1
|
|
movdqa xmm3,xmm1
|
|
pcmpgtw xmm2,xmm0
|
|
movdqa xmm0, [rsp+10h]
|
|
pand xmm13,xmm2
|
|
pand xmm13,xmm11 ; xmm13 = "edge" mask for the high half
|
|
movdqa xmm11,[rsp+0C0h]
|
|
psubw xmm0,xmm11
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm3,xmm0
|
|
pand xmm3,xmm10 ; xmm3 = sideP mask, high half
|
|
movdqa xmm0,[rsp]
|
|
psubw xmm0,[rsp+50h]
|
|
movdqa xmm2,[rdx] ; row P3 again, for the high half
|
|
pabsw xmm0,xmm0
|
|
por xmm7,xmm9
|
|
movdqa xmm9,[rsp+20h]
|
|
pcmpgtw xmm1,xmm0
|
|
pand xmm9,xmm7
|
|
movdqa xmm7,[rsp+20h]
|
|
movdqa xmm0,xmm7
|
|
pandn xmm0,xmm12
|
|
movdqa xmm12,[rsp+110h]
|
|
pand xmm1,xmm10
|
|
movdqa xmm10,[rsp+70h]
|
|
movdqa [rsp+40h],xmm1 ; slot reused: sideQ mask, high half
|
|
movdqa xmm1,xmm13
|
|
por xmm9,xmm0 ; xmm9 = new P2, low half
|
|
pxor xmm0,xmm0
|
|
por xmm4,xmm6
|
|
movdqa xmm6,xmm7
|
|
punpckhbw xmm2,xmm0
|
|
por xmm15,xmm5
|
|
movdqa xmm5,[rsp+20h]
|
|
movdqa xmm0,xmm3
|
|
psllw xmm2,1
|
|
pandn xmm0,xmm11
|
|
pand xmm6,xmm4
|
|
movdqa xmm4,[rsp]
|
|
paddw xmm2,xmm11
|
|
pand xmm5,xmm15
|
|
movdqa xmm15,[rsp+20h]
|
|
paddw xmm2,xmm11
|
|
paddw xmm2,xmm11
|
|
paddw xmm2,xmm12
|
|
paddw xmm2,[rsp+10h]
|
|
paddw xmm2,[rsp]
|
|
paddw xmm2,xmm10
|
|
psraw xmm2,3
|
|
pand xmm2,xmm3
|
|
por xmm2,xmm0
|
|
pand xmm1,xmm2
|
|
movdqa xmm0,xmm13
|
|
movdqa xmm2,xmm11
|
|
pandn xmm0,xmm11
|
|
paddw xmm2,xmm12
|
|
por xmm1,xmm0
|
|
packuswb xmm9,xmm1 ; new row P2 as bytes
|
|
movdqa xmm0,xmm7
|
|
movdqa xmm7,[rsp+0A0h]
|
|
pandn xmm0,[rsp+0F0h]
|
|
movdqa xmm1,xmm3
|
|
por xmm6,xmm0
|
|
movdqa xmm0,[rsp+10h]
|
|
paddw xmm0,xmm4
|
|
paddw xmm2,xmm0
|
|
paddw xmm2,xmm7
|
|
movdqa xmm0,xmm3
|
|
pandn xmm0,xmm12
|
|
psraw xmm2,2
|
|
pand xmm1,xmm2
|
|
por xmm1,xmm0
|
|
movdqa xmm2,xmm13
|
|
movdqa xmm0,xmm13
|
|
pand xmm2,xmm1
|
|
pandn xmm0,xmm12
|
|
movdqa xmm1,xmm12
|
|
paddw xmm1,[rsp+10h]
|
|
por xmm2,xmm0
|
|
movdqa xmm0,xmm15
|
|
pandn xmm0,[rsp+0B0h]
|
|
paddw xmm1,xmm4
|
|
packuswb xmm6,xmm2 ; new row P1 as bytes
|
|
movdqa xmm2,xmm3
|
|
psllw xmm1,1
|
|
por xmm5,xmm0
|
|
movdqa xmm0,[rsp+80h]
|
|
paddw xmm0,xmm10
|
|
paddw xmm1,xmm0
|
|
paddw xmm11,xmm1
|
|
psraw xmm11,3
|
|
movdqa xmm1,xmm12
|
|
pand xmm2,xmm11
|
|
paddw xmm1,xmm12
|
|
movdqa xmm11,[rsp+80h]
|
|
movdqa xmm0, [rsp+10h]
|
|
por xmm14,[rsp+0E0h]
|
|
paddw xmm0,xmm11
|
|
movdqa xmm4,xmm15
|
|
paddw xmm1,xmm0
|
|
movdqa xmm0,xmm13
|
|
paddw xmm1,xmm7
|
|
psraw xmm1,2
|
|
pandn xmm3,xmm1
|
|
por xmm2,xmm3
|
|
movdqa xmm1,xmm13
|
|
movdqa xmm3,[rsp+10h]
|
|
pandn xmm0,xmm3
|
|
pand xmm1,xmm2
|
|
movdqa xmm2,xmm11
|
|
paddw xmm2,[rsp]
|
|
por xmm1,xmm0
|
|
movdqa xmm0,[rsp+0D0h]
|
|
por xmm0,xmm8
|
|
paddw xmm2,xmm3
|
|
packuswb xmm5,xmm1 ; new row P0 as bytes
|
|
movdqa xmm8,[rsp+40h]
|
|
movdqa xmm1,[rsp+50h]
|
|
movdqa xmm3,xmm8
|
|
pand xmm4,xmm0
|
|
psllw xmm2,1
|
|
movdqa xmm0,xmm15
|
|
pandn xmm0,[rsp+90h]
|
|
por xmm4,xmm0
|
|
movdqa xmm0,xmm12
|
|
paddw xmm0,xmm10
|
|
paddw xmm2,xmm0
|
|
paddw xmm1,xmm2
|
|
movdqa xmm0,[rsp]
|
|
movdqa xmm2,xmm11
|
|
paddw xmm0,xmm12
|
|
movdqa xmm12,[rsp]
|
|
paddw xmm2,xmm11
|
|
paddw xmm2,xmm0
|
|
psraw xmm1,3
|
|
movdqa xmm0,xmm8
|
|
pand xmm3,xmm1
|
|
paddw xmm2,xmm7
|
|
movdqa xmm1,xmm13
|
|
psraw xmm2,2
|
|
pandn xmm0,xmm2
|
|
por xmm3,xmm0
|
|
movdqa xmm2,[rsp+50h]
|
|
movdqa xmm0,xmm13
|
|
pandn xmm0,xmm12
|
|
pand xmm1,xmm3
|
|
paddw xmm2,xmm11
|
|
movdqa xmm3,xmm15
|
|
por xmm1,xmm0
|
|
pand xmm3,xmm14
|
|
movdqa xmm14,[rsp+10h]
|
|
movdqa xmm0,xmm15
|
|
pandn xmm0,[rsp+30h]
|
|
packuswb xmm4,xmm1 ; new row Q0 as bytes
|
|
movdqa xmm1,xmm8
|
|
por xmm3,xmm0
|
|
movdqa xmm0,xmm12
|
|
paddw xmm0,xmm14
|
|
paddw xmm2,xmm0
|
|
paddw xmm2,xmm7
|
|
movdqa xmm0,xmm8
|
|
pandn xmm0,xmm11
|
|
psraw xmm2,2
|
|
pand xmm1,xmm2
|
|
por xmm1,xmm0
|
|
movdqa xmm2,xmm13
|
|
movdqa xmm0,xmm13
|
|
pandn xmm0,xmm11
|
|
pand xmm2,xmm1
|
|
movdqa xmm1,xmm15
|
|
por xmm2,xmm0
|
|
packuswb xmm3,xmm2 ; new row Q1 as bytes
|
|
movdqa xmm0,[rsp+100h]
|
|
por xmm0,[rsp+120h]
|
|
pand xmm1,xmm0
|
|
movdqa xmm2,[rcx+rbp] ; row Q3 again, for the high half
|
|
movdqa xmm7,[rsp+50h]
|
|
pandn xmm15,[rsp+60h]
|
|
lea r11,[rsp+1D8h] ; r11 = rsp as it was right after the two pushes
|
|
pxor xmm0,xmm0
|
|
por xmm1,xmm15
|
|
movaps xmm15,[r11-0A8h] ; xmm reloads are interleaved with the tail of the filter
|
|
movdqa [rdi],xmm9 ; store row P2
|
|
movaps xmm9,[r11-48h]
|
|
punpckhbw xmm2,xmm0
|
|
psllw xmm2,1
|
|
paddw xmm2,xmm7
|
|
paddw xmm2,xmm7
|
|
movdqa [rbx],xmm6 ; store row P1
|
|
movaps xmm6,[r11-18h]
|
|
paddw xmm2,xmm7
|
|
paddw xmm2,xmm11
|
|
movaps xmm11,[r11-68h]
|
|
paddw xmm2,xmm12
|
|
movaps xmm12,[r11-78h]
|
|
paddw xmm2,xmm14
|
|
paddw xmm2,xmm10
|
|
psraw xmm2,3
|
|
movaps xmm10,[r11-58h]
|
|
movaps xmm14,[r11-98h]
|
|
movdqa xmm0,xmm13
|
|
pand xmm2,xmm8
|
|
pandn xmm8,xmm7
|
|
pandn xmm13,xmm7
|
|
por xmm2,xmm8
|
|
movaps xmm7,[r11-28h]
|
|
movaps xmm8,[r11-38h]
|
|
movdqa [r8],xmm5 ; store row P0
|
|
pand xmm0,xmm2
|
|
por xmm0,xmm13
|
|
packuswb xmm1,xmm0 ; new row Q2 as bytes
|
|
movaps xmm13,[r11-88h]
|
|
movdqa [rbp],xmm4 ; store row Q0
|
|
movdqa [r10+rbp],xmm3 ; store row Q1
|
|
movdqa [rsi+rbp],xmm1 ; store row Q2
|
|
mov rsp,r11
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
|
|
; DeblockChromaLt4V_ssse3 (UNIX64 branch)
;
; Filters 8 pixels across a horizontal edge in each of two byte planes.
; The first block of moves shuffles the UNIX64 argument registers into the
; ones the rest of the body expects:
;   rdi -> rcx -> rbx : pointer into the first plane (row below the edge)
;   rsi -> rdx -> rdi : pointer into the second plane (row below the edge)
;   edx -> r8d -> r11 : row stride in bytes (sign-extended)
;   ecx -> r9w        : threshold compared against |P0-Q0|
;                       (presumably iAlpha -- confirm)
;   r8  -> ebp        : iBeta (per the existing comment)
;   r9  -> r10        : pointer to four signed bytes, the tc values
;                       (presumably pTC -- confirm)
;
; Naming used in these comments (s = stride, per plane):
;   P1=[ptr-2s] P0=[ptr-s] Q0=[ptr] Q1=[ptr+s]
; Each row is loaded with movq (8 bytes); the first plane ends up in the low
; qword and the second plane in the high qword of the same register.
;
; Per 16-bit lane:
;   delta = min(max((((Q0-P0)<<2) + (P1-Q1) + 4) >> 3, -tc), tc)
;   delta is zeroed unless thr(r9w) > |P0-Q0|, iBeta > |P1-P0|,
;   iBeta > |Q1-Q0| and tc > 0.
;   P0 += delta, Q0 -= delta; rows P0 and Q0 of both planes are stored.
; Each tc byte is written to two adjacent 16-bit lanes.
;
; NOTE(review): the loads into xmm6..xmm15 from [r11-..] and [rsp+20h..40h]
; near the end read stack slots for which no store is visible in this
; function.  Presumably harmless if those registers are caller-saved under
; the UNIX64 ABI -- confirm.
WELS_EXTERN DeblockChromaLt4V_ssse3
|
|
mov rax,rsp
|
|
push rbx
|
|
push rbp
|
|
mov r10, rdx
|
|
mov r11, rcx
|
|
mov rcx, rdi
|
|
mov rdx, rsi
|
|
mov rsi, r10
|
|
mov r10, r9
|
|
mov rbp, r8
|
|
mov r8, rsi
|
|
mov r9, r11
|
|
sub rsp,0C8h
|
|
pxor xmm1,xmm1 ; xmm1 = 0 for byte->word widening
|
|
mov rbx,rcx ; rbx = first plane, row Q0
|
|
movsxd r11,r8d ; r11 = stride
|
|
movsx ecx,byte [r10] ; the four tc bytes are sign-extended
|
|
movsx r8d,byte [r10+2]
|
|
mov rdi,rdx ; rdi = second plane, row Q0
|
|
movq xmm2,[rbx] ; Q0, first plane
|
|
movq xmm9,[r11+rbx] ; Q1, first plane
|
|
movsx edx,byte [r10+1]
|
|
mov word [rsp+2],cx ; build the tc vector at [rsp]
|
|
mov word [rsp],cx
|
|
movsx eax,byte [r10+3]
|
|
mov word [rsp+6],dx
|
|
mov word [rsp+4],dx
|
|
movdqa xmm11,xmm1
|
|
mov word [rsp+0Eh],ax
|
|
mov word [rsp+0Ch],ax
|
|
lea eax,[r11+r11]
|
|
movsxd rcx,eax ; rcx = 2*stride
|
|
mov rax,rbx
|
|
mov rdx,rdi
|
|
sub rax,rcx ; rax = first plane, row P1
|
|
mov word [rsp+0Ah],r8w
|
|
mov word [rsp+8],r8w
|
|
movdqa xmm6,[rsp] ; xmm6 = tc vector
|
|
movdqa xmm7,xmm6
|
|
movq xmm13, [rax] ; P1, first plane
|
|
mov rax,rdi
|
|
sub rax,rcx ; rax = second plane, row P1
|
|
mov rcx,rbx
|
|
pcmpgtw xmm7,xmm1 ; xmm7 = mask of lanes with tc > 0
|
|
psubw xmm11,xmm6 ; xmm11 = -tc
|
|
sub rcx,r11 ; rcx = first plane, row P0
|
|
sub rdx,r11 ; rdx = second plane, row P0
|
|
movq xmm0,[rax] ; P1, second plane
|
|
movsx eax,r9w
|
|
movq xmm15,[rcx] ; P0, first plane
|
|
punpcklqdq xmm13,xmm0 ; xmm13 = P1 of both planes
|
|
movq xmm0, [rdx] ; P0, second plane
|
|
movdqa xmm4,xmm13
|
|
punpcklqdq xmm15,xmm0 ; xmm15 = P0 of both planes
|
|
movq xmm0, [rdi] ; Q0, second plane
|
|
punpcklbw xmm4,xmm1
|
|
movdqa xmm12,xmm15
|
|
punpcklqdq xmm2,xmm0 ; xmm2 = Q0 of both planes
|
|
movq xmm0, [r11+rdi] ; Q1, second plane
|
|
punpcklbw xmm12,xmm1
|
|
movdqa xmm14,xmm2
|
|
punpcklqdq xmm9,xmm0 ; xmm9 = Q1 of both planes
|
|
punpckhbw xmm2,xmm1
|
|
punpcklbw xmm14,xmm1
|
|
movd xmm0,eax
|
|
mov eax, ebp ; iBeta
|
|
punpckhbw xmm13,xmm1
|
|
punpckhbw xmm15,xmm1
|
|
movdqa xmm3,xmm9
|
|
movdqa [rsp+10h],xmm2 ; Q0 of the second plane, as words
|
|
punpcklwd xmm0,xmm0
|
|
punpckhbw xmm9,xmm1
|
|
punpcklbw xmm3,xmm1
|
|
movdqa xmm1,xmm14
|
|
pshufd xmm10,xmm0,0 ; xmm10 = r9w threshold in all 8 lanes
|
|
movd xmm0,eax
|
|
mov eax,4
|
|
cwde
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm8,xmm0,0 ; xmm8 = iBeta in all 8 lanes
|
|
movd xmm0,eax
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm5,xmm0,0 ; xmm5 = 4 in all 8 lanes (rounding term)
|
|
psubw xmm1,xmm12 ; Q0 - P0
|
|
movdqa xmm2,xmm10
|
|
lea r11,[rsp+0C8h] ; r11 now holds the rsp value to restore at exit
|
|
psllw xmm1,2
|
|
movdqa xmm0,xmm4
|
|
psubw xmm4,xmm12 ; P1 - P0
|
|
psubw xmm0,xmm3 ; P1 - Q1
|
|
psubw xmm3,xmm14 ; Q1 - Q0
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm5
|
|
movdqa xmm0,xmm11
|
|
psraw xmm1,3
|
|
pmaxsw xmm0,xmm1 ; clamp below by -tc
|
|
pminsw xmm6,xmm0 ; clamp above by tc -> delta, first plane
|
|
movdqa xmm1,xmm8
|
|
movdqa xmm0,xmm12
|
|
psubw xmm0,xmm14
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm2,xmm0 ; threshold(r9w) > |P0-Q0|
|
|
pabsw xmm0,xmm4
|
|
pcmpgtw xmm1,xmm0 ; iBeta > |P1-P0|
|
|
pabsw xmm0,xmm3
|
|
movdqa xmm3,[rsp] ; tc vector again, for the second plane
|
|
pand xmm2,xmm1
|
|
movdqa xmm1,xmm8
|
|
pcmpgtw xmm1,xmm0 ; iBeta > |Q1-Q0|
|
|
movdqa xmm0,xmm13
|
|
pand xmm2,xmm1
|
|
psubw xmm0,xmm9
|
|
psubw xmm13,xmm15
|
|
pand xmm2,xmm7
|
|
pand xmm6,xmm2 ; delta is zeroed on lanes that fail any test
|
|
paddw xmm12,xmm6 ; P0 += delta, first plane
|
|
psubw xmm14,xmm6 ; Q0 -= delta, first plane
|
|
movdqa xmm2,[rsp+10h]
|
|
movaps xmm6,[r11-18h]
|
|
movdqa xmm1,xmm2
|
|
psubw xmm1,xmm15
|
|
psubw xmm9,xmm2
|
|
psllw xmm1,2
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm5
|
|
movdqa xmm0,xmm15
|
|
psubw xmm0,xmm2
|
|
psraw xmm1,3
|
|
pmaxsw xmm11,xmm1
|
|
pabsw xmm0,xmm0
|
|
movdqa xmm1,xmm8
|
|
pcmpgtw xmm10,xmm0
|
|
pabsw xmm0,xmm13
|
|
pminsw xmm3,xmm11 ; delta, second plane
|
|
movaps xmm11,[r11-68h]
|
|
movaps xmm13,[rsp+40h]
|
|
pcmpgtw xmm1,xmm0
|
|
pabsw xmm0,xmm9
|
|
movaps xmm9, [r11-48h]
|
|
pand xmm10,xmm1
|
|
pcmpgtw xmm8,xmm0
|
|
pand xmm10,xmm8
|
|
pand xmm10,xmm7
|
|
movaps xmm8,[r11-38h]
|
|
movaps xmm7,[r11-28h]
|
|
pand xmm3,xmm10
|
|
paddw xmm15,xmm3 ; P0 += delta, second plane
|
|
psubw xmm2,xmm3 ; Q0 -= delta, second plane
|
|
movaps xmm10,[r11-58h]
|
|
packuswb xmm12,xmm15 ; new P0: first plane low, second plane high
|
|
movaps xmm15,[rsp+20h]
|
|
packuswb xmm14,xmm2 ; new Q0: first plane low, second plane high
|
|
movq [rcx],xmm12 ; store P0, first plane
|
|
movq [rbx],xmm14 ; store Q0, first plane
|
|
psrldq xmm12,8
|
|
psrldq xmm14,8
|
|
movq [rdx],xmm12 ; store P0, second plane
|
|
movaps xmm12,[r11-78h]
|
|
movq [rdi],xmm14 ; store Q0, second plane
|
|
movaps xmm14,[rsp+30h]
|
|
mov rsp,r11
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
|
|
; DeblockChromaEq4V_ssse3 (UNIX64 branch)
;
; Filters 8 pixels across a horizontal edge in each of two byte planes.
; The first block of moves shuffles the UNIX64 argument registers into the
; ones the rest of the body expects:
;   rdi -> rcx -> r11 : pointer into the first plane (row below the edge)
;   rsi -> rdx -> rbx : pointer into the second plane (row below the edge)
;   edx -> r8d        : row stride in bytes
;   ecx -> r9d -> r10w: threshold compared against |P0-Q0|
;                       (presumably iAlpha -- confirm)
;   r8  -> ebp        : iBeta (per the existing comment)
;
; Naming used in these comments (s = stride, per plane):
;   P1=[ptr-2s] P0=[ptr-s] Q0=[ptr] Q1=[ptr+s]
; Each row is loaded with movq (8 bytes); the first plane ends up in the low
; qword and the second plane in the high qword of the same register.
;
; Per 16-bit lane, where thr > |P0-Q0|, iBeta > |P1-P0| and iBeta > |Q1-Q0|:
;   P0 = (2*P1 + P0 + Q1 + 2) >> 2
;   Q0 = (2*Q1 + Q0 + P1 + 2) >> 2
; Other lanes keep their original P0/Q0.  Rows P0 and Q0 of both planes are
; stored.  The 90h bytes reserved on the stack are not accessed.
WELS_EXTERN DeblockChromaEq4V_ssse3
|
|
mov rax,rsp
|
|
push rbx
|
|
push rbp
|
|
|
|
mov rbp, r8
|
|
mov r8, rdx
|
|
mov r9, rcx
|
|
mov rcx, rdi
|
|
mov rdx, rsi
|
|
|
|
sub rsp,90h
|
|
pxor xmm1,xmm1 ; xmm1 = 0 for byte->word widening
|
|
mov r11,rcx ; r11 = first plane, row Q0
|
|
mov rbx,rdx ; rbx = second plane, row Q0
|
|
mov r10d,r9d
|
|
movq xmm13,[r11] ; Q0, first plane
|
|
lea eax,[r8+r8]
|
|
movsxd r9,eax ; r9 = 2*stride
|
|
mov rax,rcx
|
|
sub rax,r9
|
|
movq xmm14,[rax] ; P1, first plane
|
|
mov rax,rdx
|
|
sub rax,r9
|
|
movq xmm0,[rax] ; P1, second plane
|
|
movsxd rax,r8d ; rax = stride
|
|
sub rcx,rax ; rcx = first plane, row P0
|
|
sub rdx,rax ; rdx = second plane, row P0
|
|
movq xmm12,[rax+r11] ; Q1, first plane
|
|
movq xmm10,[rcx] ; P0, first plane
|
|
punpcklqdq xmm14,xmm0 ; xmm14 = P1 of both planes
|
|
movdqa xmm8,xmm14
|
|
movq xmm0,[rdx] ; P0, second plane
|
|
punpcklbw xmm8,xmm1
|
|
punpckhbw xmm14,xmm1
|
|
punpcklqdq xmm10,xmm0 ; xmm10 = P0 of both planes
|
|
movq xmm0,[rbx] ; Q0, second plane
|
|
movdqa xmm5,xmm10
|
|
punpcklqdq xmm13,xmm0 ; xmm13 = Q0 of both planes
|
|
movq xmm0, [rax+rbx] ; Q1, second plane
|
|
punpcklbw xmm5,xmm1
|
|
movsx eax,r10w
|
|
movdqa xmm9,xmm13
|
|
punpcklqdq xmm12,xmm0 ; xmm12 = Q1 of both planes
|
|
punpcklbw xmm9,xmm1
|
|
punpckhbw xmm10,xmm1
|
|
movd xmm0,eax
|
|
mov eax, ebp ; iBeta
|
|
punpckhbw xmm13,xmm1
|
|
movdqa xmm7,xmm12
|
|
punpcklwd xmm0,xmm0
|
|
punpckhbw xmm12,xmm1
|
|
pshufd xmm11,xmm0,0 ; xmm11 = r10w threshold in all 8 lanes
|
|
punpcklbw xmm7,xmm1
|
|
movd xmm0,eax
|
|
movdqa xmm1,xmm8
|
|
psubw xmm1,xmm5 ; P1 - P0, first plane
|
|
punpcklwd xmm0,xmm0
|
|
movdqa xmm6,xmm11
|
|
pshufd xmm3,xmm0,0 ; xmm3 = iBeta in all 8 lanes
|
|
movdqa xmm0,xmm5
|
|
psubw xmm0,xmm9 ; P0 - Q0, first plane
|
|
movdqa xmm2,xmm3
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm6,xmm0 ; threshold > |P0-Q0|
|
|
pabsw xmm0,xmm1
|
|
movdqa xmm1,xmm3
|
|
pcmpgtw xmm2,xmm0 ; iBeta > |P1-P0|
|
|
pand xmm6,xmm2
|
|
movdqa xmm0,xmm7
|
|
movdqa xmm2,xmm3
|
|
psubw xmm0,xmm9
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm1,xmm0 ; iBeta > |Q1-Q0|
|
|
pand xmm6,xmm1 ; xmm6 = filter mask, first plane
|
|
movdqa xmm0,xmm10
|
|
movdqa xmm1,xmm14
|
|
psubw xmm0,xmm13
|
|
psubw xmm1,xmm10
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm11,xmm0
|
|
pabsw xmm0,xmm1
|
|
pcmpgtw xmm2,xmm0
|
|
pand xmm11,xmm2
|
|
movdqa xmm0,xmm12
|
|
movdqa xmm4,xmm6
|
|
movdqa xmm1,xmm8
|
|
mov eax,2
|
|
cwde
|
|
paddw xmm1,xmm8 ; 2*P1, first plane
|
|
psubw xmm0,xmm13
|
|
paddw xmm1,xmm5
|
|
pabsw xmm0,xmm0
|
|
movdqa xmm2,xmm14
|
|
paddw xmm1,xmm7 ; 2*P1 + P0 + Q1
|
|
pcmpgtw xmm3,xmm0
|
|
paddw xmm2,xmm14
|
|
movd xmm0,eax
|
|
pand xmm11,xmm3 ; xmm11 = filter mask, second plane
|
|
paddw xmm7,xmm7 ; 2*Q1, first plane
|
|
paddw xmm2,xmm10
|
|
punpcklwd xmm0,xmm0
|
|
paddw xmm2,xmm12
|
|
paddw xmm12,xmm12
|
|
pshufd xmm3,xmm0,0 ; xmm3 = 2 in all 8 lanes (rounding term)
|
|
paddw xmm7,xmm9
|
|
paddw xmm12,xmm13
|
|
movdqa xmm0,xmm6
|
|
paddw xmm1,xmm3
|
|
pandn xmm0,xmm5 ; original P0 where the mask is clear
|
|
paddw xmm7,xmm8 ; 2*Q1 + Q0 + P1
|
|
psraw xmm1,2
|
|
paddw xmm12,xmm14
|
|
paddw xmm7,xmm3
|
|
;movaps xmm14,[rsp]
|
|
pand xmm4,xmm1
|
|
paddw xmm12,xmm3
|
|
psraw xmm7,2
|
|
movdqa xmm1,xmm11
|
|
por xmm4,xmm0 ; new P0, first plane
|
|
psraw xmm12,2
|
|
paddw xmm2,xmm3
|
|
movdqa xmm0,xmm11
|
|
pandn xmm0,xmm10
|
|
psraw xmm2,2
|
|
pand xmm1,xmm2
|
|
por xmm1,xmm0 ; new P0, second plane
|
|
packuswb xmm4,xmm1 ; new P0 as bytes: first plane low, second high
|
|
movdqa xmm0,xmm11
|
|
movdqa xmm1,xmm6
|
|
pand xmm1,xmm7
|
|
movq [rcx],xmm4 ; store P0, first plane
|
|
pandn xmm6,xmm9
|
|
pandn xmm11,xmm13
|
|
pand xmm0,xmm12
|
|
por xmm1,xmm6 ; new Q0, first plane
|
|
por xmm0,xmm11 ; new Q0, second plane
|
|
psrldq xmm4,8
|
|
packuswb xmm1,xmm0 ; new Q0 as bytes: first plane low, second high
|
|
movq [r11],xmm1 ; store Q0, first plane
|
|
psrldq xmm1,8
|
|
movq [rdx],xmm4 ; store P0, second plane
|
|
lea r11,[rsp+90h] ; undo "sub rsp,90h"
|
|
movq [rbx],xmm1 ; store Q0, second plane
|
|
mov rsp,r11
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
|
|
; DeblockChromaEq4H_ssse3 (UNIX64 branch)
;
; Filters across a vertical edge in two byte planes at the same time.
; The first block of moves shuffles the UNIX64 argument registers into the
; ones the rest of the body expects:
;   rdi -> rcx        : pointer into the first plane
;   rsi -> rdx -> rdi : pointer into the second plane
;   edx -> r8d        : row stride in bytes
;   ecx -> r9w        : threshold compared against |P0-Q0|
;                       (presumably iAlpha -- confirm)
;   r8  -> ebp        : iBeta (per the existing comment)
;
; Naming used in these comments: for every row the four bytes at
; [ptr-2], [ptr-1], [ptr], [ptr+1] are called P1, P0, Q0, Q1.
;
; Outline:
;   1. Gather those 4 bytes from rows 0..7 of each plane through stack slots.
;   2. Unpack instructions transpose the data so that one register holds one
;      column: low 8 bytes = first plane, high 8 bytes = second plane.
;   3. Per 16-bit lane, where thr > |P0-Q0|, iBeta > |P1-P0|, iBeta > |Q1-Q0|:
;        P0 = (2*P1 + P0 + Q1 + 2) >> 2
;        Q0 = (2*Q1 + Q0 + P1 + 2) >> 2
;   4. Pack to bytes, transpose back, and store 4 bytes per row to the same
;      16 addresses that were read.
;
; NOTE(review): r12 is pushed and popped but not otherwise used here, and
; the "mov rbx, [r11+28h]" before the epilogue is immediately superseded by
; "pop rbx".
WELS_EXTERN DeblockChromaEq4H_ssse3
|
|
mov rax,rsp
|
|
push rbx
|
|
push rbp
|
|
push r12
|
|
|
|
mov rbp, r8
|
|
mov r8, rdx
|
|
mov r9, rcx
|
|
mov rcx, rdi
|
|
mov rdx, rsi
|
|
mov rdi, rdx
|
|
|
|
sub rsp,140h
|
|
lea eax,[r8*4]
|
|
movsxd r10,eax ; r10 = 4*stride (temporarily)
|
|
mov eax,[rcx-2] ; first plane, row 0
|
|
mov [rsp+10h],eax
|
|
lea rbx,[r10+rdx-2] ; rbx = second plane, row 4, column -2
|
|
lea r11,[r10+rcx-2] ; r11 = first plane, row 4, column -2
|
|
|
|
movdqa xmm5,[rsp+10h] ; only the low dword of each such load is meaningful
|
|
movsxd r10,r8d ; r10 = stride from here on
|
|
mov eax,[r10+rcx-2] ; first plane, row 1
|
|
lea rdx,[r10+r10*2] ; rdx = 3*stride
|
|
mov [rsp+20h],eax
|
|
mov eax,[rcx+r10*2-2] ; first plane, row 2
|
|
mov [rsp+30h],eax
|
|
mov eax,[rdx+rcx-2] ; first plane, row 3
|
|
movdqa xmm2,[rsp+20h]
|
|
mov [rsp+40h],eax
|
|
mov eax, [rdi-2] ; second plane, row 0
|
|
movdqa xmm4,[rsp+30h]
|
|
mov [rsp+50h],eax
|
|
mov eax,[r10+rdi-2] ; second plane, row 1
|
|
movdqa xmm3,[rsp+40h]
|
|
mov [rsp+60h],eax
|
|
mov eax,[rdi+r10*2-2] ; second plane, row 2
|
|
punpckldq xmm5,[rsp+50h] ; pair row 0 of both planes
|
|
mov [rsp+70h],eax
|
|
mov eax, [rdx+rdi-2] ; second plane, row 3
|
|
punpckldq xmm2, [rsp+60h] ; pair row 1 of both planes
|
|
mov [rsp+80h],eax
|
|
mov eax,[r11] ; first plane, row 4
|
|
punpckldq xmm4, [rsp+70h] ; pair row 2 of both planes
|
|
mov [rsp+50h],eax
|
|
mov eax,[rbx] ; second plane, row 4
|
|
punpckldq xmm3,[rsp+80h] ; pair row 3 of both planes
|
|
mov [rsp+60h],eax
|
|
mov eax,[r10+r11] ; first plane, row 5
|
|
movdqa xmm0, [rsp+50h]
|
|
punpckldq xmm0, [rsp+60h]
|
|
punpcklqdq xmm5,xmm0 ; xmm5 = rows 0 and 4 of both planes
|
|
movdqa [rsp+50h],xmm0
|
|
mov [rsp+50h],eax
|
|
mov eax,[r10+rbx] ; second plane, row 5
|
|
movdqa xmm0,[rsp+50h]
|
|
movdqa xmm1,xmm5
|
|
mov [rsp+60h],eax
|
|
mov eax,[r11+r10*2] ; first plane, row 6
|
|
punpckldq xmm0, [rsp+60h]
|
|
punpcklqdq xmm2,xmm0 ; xmm2 = rows 1 and 5 of both planes
|
|
punpcklbw xmm1,xmm2 ; start of the byte transpose
|
|
punpckhbw xmm5,xmm2
|
|
movdqa [rsp+50h],xmm0
|
|
mov [rsp+50h],eax
|
|
mov eax,[rbx+r10*2] ; second plane, row 6
|
|
movdqa xmm0,[rsp+50h]
|
|
mov [rsp+60h],eax
|
|
mov eax, [rdx+r11] ; first plane, row 7
|
|
movdqa xmm15,xmm1
|
|
punpckldq xmm0,[rsp+60h]
|
|
punpcklqdq xmm4,xmm0 ; xmm4 = rows 2 and 6 of both planes
|
|
movdqa [rsp+50h],xmm0
|
|
mov [rsp+50h],eax
|
|
mov eax, [rdx+rbx] ; second plane, row 7
|
|
movdqa xmm0,[rsp+50h]
|
|
mov [rsp+60h],eax
|
|
punpckldq xmm0, [rsp+60h]
|
|
punpcklqdq xmm3,xmm0 ; xmm3 = rows 3 and 7 of both planes
|
|
movdqa xmm0,xmm4
|
|
punpcklbw xmm0,xmm3
|
|
punpckhbw xmm4,xmm3
|
|
punpcklwd xmm15,xmm0
|
|
punpckhwd xmm1,xmm0
|
|
movdqa xmm0,xmm5
|
|
movdqa xmm12,xmm15
|
|
punpcklwd xmm0,xmm4
|
|
punpckhwd xmm5,xmm4
|
|
punpckldq xmm12,xmm0
|
|
punpckhdq xmm15,xmm0
|
|
movdqa xmm0,xmm1
|
|
movdqa xmm11,xmm12
|
|
punpckldq xmm0,xmm5
|
|
punpckhdq xmm1,xmm5
|
|
punpcklqdq xmm11,xmm0 ; xmm11 = P1 column (both planes)
|
|
punpckhqdq xmm12,xmm0 ; xmm12 = P0 column (both planes)
|
|
movsx eax,r9w
|
|
movdqa xmm14,xmm15
|
|
punpcklqdq xmm14,xmm1 ; xmm14 = Q0 column (both planes)
|
|
punpckhqdq xmm15,xmm1 ; xmm15 = Q1 column (both planes)
|
|
pxor xmm1,xmm1 ; xmm1 = 0 for byte->word widening
|
|
movd xmm0,eax
|
|
movdqa xmm4,xmm12
|
|
movdqa xmm8,xmm11
|
|
mov eax, ebp ; iBeta
|
|
punpcklwd xmm0,xmm0
|
|
punpcklbw xmm4,xmm1 ; P0, first plane, as words
|
|
punpckhbw xmm12,xmm1 ; P0, second plane, as words
|
|
movdqa xmm9,xmm14
|
|
movdqa xmm7,xmm15
|
|
movdqa xmm10,xmm15
|
|
pshufd xmm13,xmm0,0 ; xmm13 = r9w threshold in all 8 lanes
|
|
punpcklbw xmm9,xmm1 ; Q0, first plane, as words
|
|
punpckhbw xmm14,xmm1 ; Q0, second plane, as words
|
|
movdqa xmm6,xmm13
|
|
movd xmm0,eax
|
|
movdqa [rsp],xmm11 ; keep the P1 bytes for the transpose back
|
|
mov eax,2
|
|
cwde
|
|
punpckhbw xmm11,xmm1 ; P1, second plane, as words
|
|
punpckhbw xmm10,xmm1 ; Q1, second plane, as words
|
|
punpcklbw xmm7,xmm1 ; Q1, first plane, as words
|
|
punpcklwd xmm0,xmm0
|
|
punpcklbw xmm8,xmm1 ; P1, first plane, as words
|
|
pshufd xmm3,xmm0,0 ; xmm3 = iBeta in all 8 lanes
|
|
movdqa xmm1,xmm8
|
|
movdqa xmm0,xmm4
|
|
psubw xmm0,xmm9 ; P0 - Q0
|
|
psubw xmm1,xmm4 ; P1 - P0
|
|
movdqa xmm2,xmm3
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm6,xmm0 ; threshold > |P0-Q0|
|
|
pabsw xmm0,xmm1
|
|
movdqa xmm1,xmm3
|
|
pcmpgtw xmm2,xmm0 ; iBeta > |P1-P0|
|
|
pand xmm6,xmm2
|
|
movdqa xmm0,xmm7
|
|
movdqa xmm2,xmm3
|
|
psubw xmm0,xmm9
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm1,xmm0 ; iBeta > |Q1-Q0|
|
|
pand xmm6,xmm1 ; xmm6 = filter mask, first plane
|
|
movdqa xmm0,xmm12
|
|
movdqa xmm1,xmm11
|
|
psubw xmm0,xmm14
|
|
psubw xmm1,xmm12
|
|
movdqa xmm5,xmm6
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm13,xmm0
|
|
pabsw xmm0,xmm1
|
|
movdqa xmm1,xmm8
|
|
pcmpgtw xmm2,xmm0
|
|
paddw xmm1,xmm8 ; 2*P1, first plane
|
|
movdqa xmm0,xmm10
|
|
pand xmm13,xmm2
|
|
psubw xmm0,xmm14
|
|
paddw xmm1,xmm4
|
|
movdqa xmm2,xmm11
|
|
pabsw xmm0,xmm0
|
|
paddw xmm2,xmm11
|
|
paddw xmm1,xmm7 ; 2*P1 + P0 + Q1
|
|
pcmpgtw xmm3,xmm0
|
|
paddw xmm2,xmm12
|
|
movd xmm0,eax
|
|
pand xmm13,xmm3 ; xmm13 = filter mask, second plane
|
|
paddw xmm2,xmm10
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm3,xmm0,0 ; xmm3 = 2 in all 8 lanes (rounding term)
|
|
movdqa xmm0,xmm6
|
|
paddw xmm1,xmm3
|
|
pandn xmm0,xmm4 ; original P0 where the mask is clear
|
|
paddw xmm2,xmm3
|
|
psraw xmm1,2
|
|
pand xmm5,xmm1
|
|
por xmm5,xmm0 ; new P0, first plane
|
|
paddw xmm7,xmm7 ; 2*Q1, first plane
|
|
paddw xmm10,xmm10
|
|
psraw xmm2,2
|
|
movdqa xmm1,xmm13
|
|
movdqa xmm0,xmm13
|
|
pandn xmm0,xmm12
|
|
pand xmm1,xmm2
|
|
paddw xmm7,xmm9
|
|
por xmm1,xmm0 ; new P0, second plane
|
|
paddw xmm10,xmm14
|
|
paddw xmm7,xmm8 ; 2*Q1 + Q0 + P1
|
|
movdqa xmm0,xmm13
|
|
packuswb xmm5,xmm1 ; new P0 column as bytes
|
|
paddw xmm7,xmm3
|
|
paddw xmm10,xmm11
|
|
movdqa xmm1,xmm6
|
|
paddw xmm10,xmm3
|
|
pandn xmm6,xmm9
|
|
psraw xmm7,2
|
|
pand xmm1,xmm7
|
|
psraw xmm10,2
|
|
pandn xmm13,xmm14
|
|
pand xmm0,xmm10
|
|
por xmm1,xmm6 ; new Q0, first plane
|
|
movdqa xmm6,[rsp] ; reload the P1 bytes
|
|
movdqa xmm4,xmm6
|
|
por xmm0,xmm13 ; new Q0, second plane
|
|
punpcklbw xmm4,xmm5 ; transpose back: interleave P1 with new P0
|
|
punpckhbw xmm6,xmm5
|
|
movdqa xmm3,xmm4
|
|
packuswb xmm1,xmm0 ; new Q0 column as bytes
|
|
movdqa xmm0,xmm1
|
|
punpckhbw xmm1,xmm15 ; interleave new Q0 with Q1
|
|
punpcklbw xmm0,xmm15
|
|
punpcklwd xmm3,xmm0
|
|
punpckhwd xmm4,xmm0
|
|
movdqa xmm0,xmm6
|
|
movdqa xmm2,xmm3
|
|
punpcklwd xmm0,xmm1
|
|
punpckhwd xmm6,xmm1
|
|
movdqa xmm1,xmm4
|
|
punpckldq xmm2,xmm0
|
|
punpckhdq xmm3,xmm0
|
|
punpckldq xmm1,xmm6
|
|
movdqa xmm0,xmm2
|
|
punpcklqdq xmm0,xmm1
|
|
punpckhdq xmm4,xmm6
|
|
punpckhqdq xmm2,xmm1
|
|
movdqa [rsp+10h],xmm0 ; rows 0 and 4 (both planes)
|
|
movdqa [rsp+60h],xmm2 ; rows 1 and 5 (both planes)
|
|
movdqa xmm0,xmm3
|
|
mov eax,[rsp+10h]
|
|
mov [rcx-2],eax ; first plane, row 0
|
|
mov eax,[rsp+60h]
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm3,xmm4
|
|
mov [r10+rcx-2],eax ; first plane, row 1
|
|
movdqa [rsp+20h],xmm0 ; rows 2 and 6 (both planes)
|
|
mov eax, [rsp+20h]
|
|
movdqa [rsp+70h],xmm3 ; rows 3 and 7 (both planes)
|
|
mov [rcx+r10*2-2],eax ; first plane, row 2
|
|
mov eax,[rsp+70h]
|
|
mov [rdx+rcx-2],eax ; first plane, row 3
|
|
mov eax,[rsp+18h]
|
|
mov [r11],eax ; first plane, row 4
|
|
mov eax,[rsp+68h]
|
|
mov [r10+r11],eax ; first plane, row 5
|
|
mov eax,[rsp+28h]
|
|
mov [r11+r10*2],eax ; first plane, row 6
|
|
mov eax,[rsp+78h]
|
|
mov [rdx+r11],eax ; first plane, row 7
|
|
mov eax,[rsp+14h]
|
|
mov [rdi-2],eax ; second plane, row 0
|
|
mov eax,[rsp+64h]
|
|
mov [r10+rdi-2],eax ; second plane, row 1
|
|
mov eax,[rsp+24h]
|
|
mov [rdi+r10*2-2],eax ; second plane, row 2
|
|
mov eax, [rsp+74h]
|
|
mov [rdx+rdi-2],eax ; second plane, row 3
|
|
mov eax, [rsp+1Ch]
|
|
mov [rbx],eax ; second plane, row 4
|
|
mov eax, [rsp+6Ch]
|
|
mov [r10+rbx],eax ; second plane, row 5
|
|
mov eax,[rsp+2Ch]
|
|
mov [rbx+r10*2],eax ; second plane, row 6
|
|
mov eax,[rsp+7Ch]
|
|
mov [rdx+rbx],eax ; second plane, row 7
|
|
lea r11,[rsp+140h] ; undo "sub rsp,140h"
|
|
mov rbx, [r11+28h] ; value is replaced by "pop rbx" below
|
|
mov rsp,r11
|
|
pop r12
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
|
|
|
|
; DeblockChromaLt4H_ssse3 (64-bit variant of the routine).
; The six incoming registers rdi, rsi, rdx, rcx, r8, r9 are first copied to
; rdx, rcx, r8, r9, r13, r14 and the body then works on those copies:
;   rcx / rdx : the two pixel pointers
;   r8d       : row stride (sign-extended into rsi)
;   r9d       : threshold broadcast into xmm12 (presumably iAlpha -- confirm
;               against the C prototype)
;   r13d      : iBeta, r14 : pTC (as labelled further down)
; For each pointer, 4 bytes starting at offset -2 are read from 8 consecutive
; rows, transposed into four 16-byte column vectors, the two middle columns
; are filtered, and the result is transposed back and stored to the same rows.
; rbx, rbp, r12, r13, r14 are saved on entry and restored before ret;
; 170h bytes of stack are used as scratch.
WELS_EXTERN DeblockChromaLt4H_ssse3
|
|
; rax is overwritten below before it is ever read again
mov rax,rsp
|
|
push rbx
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
; five pushes + return address keep rsp 16-byte aligned for the movdqa stack accesses
sub rsp,170h
|
|
|
|
; remap the incoming argument registers (see the header comment)
mov r13, r8
|
|
mov r14, r9
|
|
mov r8, rdx
|
|
mov r9, rcx
|
|
mov rdx, rdi
|
|
mov rcx, rsi
|
|
|
|
; rsi = stride, r10 = 4*stride, r11d = copy of r9d, r12 = copy of rdx
movsxd rsi,r8d
|
|
lea eax,[r8*4]
|
|
mov r11d,r9d
|
|
movsxd r10,eax
|
|
; Gather phase: every 4-byte row chunk is bounced through a stack slot so it
; can be picked up by movdqa / punpckldq; only the low dword of a slot matters.
mov eax, [rcx-2]
|
|
mov r12,rdx
|
|
mov [rsp+40h],eax
|
|
mov eax, [rsi+rcx-2]
|
|
; rbx = rcx + 4*stride - 2 (rows 4..7 of the first pointer)
lea rbx,[r10+rcx-2]
|
|
movdqa xmm5,[rsp+40h]
|
|
mov [rsp+50h],eax
|
|
mov eax, [rcx+rsi*2-2]
|
|
; rbp = rdx + 4*stride - 2 (rows 4..7 of the second pointer)
lea rbp,[r10+rdx-2]
|
|
movdqa xmm2, [rsp+50h]
|
|
mov [rsp+60h],eax
|
|
; r10 is repurposed: r10 = 3*stride
lea r10,[rsi+rsi*2]
|
|
; rdi = copy of rcx, used for the stores at the end
mov rdi,rcx
|
|
mov eax,[r10+rcx-2]
|
|
movdqa xmm4,[rsp+60h]
|
|
mov [rsp+70h],eax
|
|
mov eax,[rdx-2]
|
|
mov [rsp+80h],eax
|
|
mov eax, [rsi+rdx-2]
|
|
movdqa xmm3,[rsp+70h]
|
|
mov [rsp+90h],eax
|
|
mov eax,[rdx+rsi*2-2]
|
|
punpckldq xmm5,[rsp+80h]
|
|
mov [rsp+0A0h],eax
|
|
mov eax, [r10+rdx-2]
|
|
punpckldq xmm2,[rsp+90h]
|
|
mov [rsp+0B0h],eax
|
|
; rows 4..7: slots 80h / 90h are reused for each pair of rows
mov eax, [rbx]
|
|
punpckldq xmm4,[rsp+0A0h]
|
|
mov [rsp+80h],eax
|
|
mov eax,[rbp]
|
|
punpckldq xmm3,[rsp+0B0h]
|
|
mov [rsp+90h],eax
|
|
mov eax,[rsi+rbx]
|
|
movdqa xmm0,[rsp+80h]
|
|
punpckldq xmm0,[rsp+90h]
|
|
; xmm5 = row 0 of ptr1 | row 0 of ptr2 | row 4 of ptr1 | row 4 of ptr2
punpcklqdq xmm5,xmm0
|
|
movdqa [rsp+80h],xmm0
|
|
mov [rsp+80h],eax
|
|
mov eax,[rsi+rbp]
|
|
movdqa xmm0,[rsp+80h]
|
|
movdqa xmm1,xmm5
|
|
mov [rsp+90h],eax
|
|
mov eax,[rbx+rsi*2]
|
|
punpckldq xmm0,[rsp+90h]
|
|
; xmm2 = rows 1 / 5, same layout
punpcklqdq xmm2,xmm0
|
|
; start of the byte transpose (rows -> columns)
punpcklbw xmm1,xmm2
|
|
punpckhbw xmm5,xmm2
|
|
movdqa [rsp+80h],xmm0
|
|
mov [rsp+80h],eax
|
|
mov eax,[rbp+rsi*2]
|
|
movdqa xmm0, [rsp+80h]
|
|
mov [rsp+90h],eax
|
|
mov eax,[r10+rbx]
|
|
movdqa xmm7,xmm1
|
|
punpckldq xmm0,[rsp+90h]
|
|
; xmm4 = rows 2 / 6
punpcklqdq xmm4,xmm0
|
|
movdqa [rsp+80h],xmm0
|
|
mov [rsp+80h],eax
|
|
mov eax, [r10+rbp]
|
|
movdqa xmm0,[rsp+80h]
|
|
mov [rsp+90h],eax
|
|
punpckldq xmm0,[rsp+90h]
|
|
; xmm3 = rows 3 / 7
punpcklqdq xmm3,xmm0
|
|
movdqa xmm0,xmm4
|
|
punpcklbw xmm0,xmm3
|
|
punpckhbw xmm4,xmm3
|
|
punpcklwd xmm7,xmm0
|
|
punpckhwd xmm1,xmm0
|
|
movdqa xmm0,xmm5
|
|
movdqa xmm6,xmm7
|
|
punpcklwd xmm0,xmm4
|
|
punpckhwd xmm5,xmm4
|
|
punpckldq xmm6,xmm0
|
|
punpckhdq xmm7,xmm0
|
|
movdqa xmm0,xmm1
|
|
punpckldq xmm0,xmm5
|
|
mov rax, r14 ; pTC
|
|
punpckhdq xmm1,xmm5
|
|
movdqa xmm9,xmm6
|
|
; after the qword interleaves:
;   xmm9 = byte column 0 (offset -2), xmm6 = column 1 (offset -1),
;   xmm2 = column 2 (offset 0),       xmm7 = column 3 (offset +1);
; low 8 bytes come from the first pointer's rows, high 8 from the second's
punpckhqdq xmm6,xmm0
|
|
punpcklqdq xmm9,xmm0
|
|
movdqa xmm2,xmm7
|
|
movdqa xmm13,xmm6
|
|
movdqa xmm4,xmm9
|
|
; [rsp+10h] keeps the untouched column 0 for the transpose back
movdqa [rsp+10h],xmm9
|
|
punpcklqdq xmm2,xmm1
|
|
punpckhqdq xmm7,xmm1
|
|
; xmm1 = 0, used for byte -> word zero extension
pxor xmm1,xmm1
|
|
; sign-extend the four tc bytes pTC[3], pTC[2], pTC[1], pTC[0]
movsx ecx,byte [rax+3]
|
|
movsx edx,byte [rax+2]
|
|
movsx r8d,byte [rax+1]
|
|
movsx r9d,byte [rax]
|
|
movdqa xmm10,xmm1
|
|
movdqa xmm15,xmm2
|
|
punpckhbw xmm2,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
punpcklbw xmm4,xmm1
|
|
; eax = low word of r11d; broadcast into xmm12 below
movsx eax,r11w
|
|
; build an 8-word vector at [rsp]: each tc value is written to two adjacent words
mov word [rsp+0Eh],cx
|
|
mov word [rsp+0Ch],cx
|
|
movdqa xmm3,xmm7
|
|
movdqa xmm8,xmm7
|
|
; [rsp+20h] keeps the untouched column 3 for the transpose back
movdqa [rsp+20h],xmm7
|
|
punpcklbw xmm15,xmm1
|
|
punpcklbw xmm13,xmm1
|
|
punpcklbw xmm3,xmm1
|
|
mov word [rsp+0Ah],dx
|
|
mov word [rsp+8],dx
|
|
mov word [rsp+6],r8w
|
|
movd xmm0,eax
|
|
; [rsp+30h] = high half of column 1 as words
movdqa [rsp+30h],xmm6
|
|
punpckhbw xmm9,xmm1
|
|
punpckhbw xmm8,xmm1
|
|
punpcklwd xmm0,xmm0
|
|
mov eax, r13d ; iBeta
|
|
mov word [rsp+4],r8w
|
|
mov word [rsp+2],r9w
|
|
; xmm12 = threshold from r11w in all 8 words
pshufd xmm12,xmm0,0
|
|
mov word [rsp],r9w
|
|
movd xmm0,eax
|
|
; constant 4, broadcast into xmm5 and added before the >>3 below
mov eax,4
|
|
cwde
|
|
; xmm14 = tc vector; the slot at [rsp] is then reused for the high half of column 2
movdqa xmm14, [rsp]
|
|
movdqa [rsp],xmm2
|
|
movdqa xmm2,xmm12
|
|
punpcklwd xmm0,xmm0
|
|
; xmm11 = iBeta in all 8 words
pshufd xmm11,xmm0,0
|
|
; xmm10 = 0 - tc
psubw xmm10,xmm14
|
|
movd xmm0,eax
|
|
movdqa xmm7,xmm14
|
|
movdqa xmm6,xmm14
|
|
; xmm7 = per-lane mask (tc > 0)
pcmpgtw xmm7,xmm1
|
|
punpcklwd xmm0,xmm0
|
|
pshufd xmm5,xmm0,0
|
|
; Low 8 lanes: xmm4 = col0, xmm13 = col1, xmm15 = col2, xmm3 = col3 (as words)
; delta = min(tc, max(-tc, (((col2 - col1) << 2) + (col0 - col3) + 4) >> 3))
movdqa xmm0,xmm4
|
|
movdqa xmm1,xmm15
|
|
psubw xmm4,xmm13
|
|
psubw xmm0,xmm3
|
|
psubw xmm1,xmm13
|
|
psubw xmm3,xmm15
|
|
psllw xmm1,2
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm5
|
|
movdqa xmm0,xmm10
|
|
psraw xmm1,3
|
|
pmaxsw xmm0,xmm1
|
|
pminsw xmm6,xmm0
|
|
; lane mask: |col1-col2| < xmm12  and  |col0-col1| < xmm11  and  |col3-col2| < xmm11  and  tc > 0
movdqa xmm1,xmm11
|
|
movdqa xmm0,xmm13
|
|
psubw xmm0,xmm15
|
|
pabsw xmm0,xmm0
|
|
pcmpgtw xmm2,xmm0
|
|
pabsw xmm0,xmm4
|
|
pcmpgtw xmm1,xmm0
|
|
pabsw xmm0,xmm3
|
|
pand xmm2,xmm1
|
|
movdqa xmm1,xmm11
|
|
movdqa xmm3,[rsp+30h]
|
|
pcmpgtw xmm1,xmm0
|
|
movdqa xmm0,xmm9
|
|
pand xmm2,xmm1
|
|
psubw xmm0,xmm8
|
|
psubw xmm9,xmm3
|
|
pand xmm2,xmm7
|
|
; masked delta: col2 -= delta, col1 += delta (low lanes)
pand xmm6,xmm2
|
|
psubw xmm15,xmm6
|
|
paddw xmm13,xmm6
|
|
; High 8 lanes: xmm9 = col0, xmm3 = col1, xmm2 = col2, xmm8 = col3; same computation
movdqa xmm2,[rsp]
|
|
movdqa xmm1,xmm2
|
|
psubw xmm1,xmm3
|
|
psubw xmm8,xmm2
|
|
psllw xmm1,2
|
|
paddw xmm1,xmm0
|
|
paddw xmm1,xmm5
|
|
movdqa xmm0,xmm3
|
|
movdqa xmm5,[rsp+10h]
|
|
psubw xmm0,xmm2
|
|
psraw xmm1,3
|
|
movdqa xmm4,xmm5
|
|
pabsw xmm0,xmm0
|
|
pmaxsw xmm10,xmm1
|
|
movdqa xmm1,xmm11
|
|
pcmpgtw xmm12,xmm0
|
|
pabsw xmm0,xmm9
|
|
pminsw xmm14,xmm10
|
|
pcmpgtw xmm1,xmm0
|
|
pabsw xmm0,xmm8
|
|
pcmpgtw xmm11,xmm0
|
|
pand xmm12,xmm1
|
|
movdqa xmm1,[rsp+20h]
|
|
pand xmm12,xmm11
|
|
pand xmm12,xmm7
|
|
pand xmm14,xmm12
|
|
paddw xmm3,xmm14
|
|
psubw xmm2,xmm14
|
|
; repack words to bytes with unsigned saturation: xmm13 = new col1, xmm15 = new col2
packuswb xmm13,xmm3
|
|
packuswb xmm15,xmm2
|
|
; transpose back (columns -> rows) together with the saved col0 / col3
punpcklbw xmm4,xmm13
|
|
punpckhbw xmm5,xmm13
|
|
movdqa xmm0,xmm15
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm15,xmm1
|
|
movdqa xmm3,xmm4
|
|
punpcklwd xmm3,xmm0
|
|
punpckhwd xmm4,xmm0
|
|
movdqa xmm0,xmm5
|
|
movdqa xmm2,xmm3
|
|
movdqa xmm1,xmm4
|
|
punpcklwd xmm0,xmm15
|
|
punpckhwd xmm5,xmm15
|
|
punpckldq xmm2,xmm0
|
|
punpckhdq xmm3,xmm0
|
|
punpckldq xmm1,xmm5
|
|
movdqa xmm0,xmm2
|
|
punpcklqdq xmm0,xmm1
|
|
punpckhdq xmm4,xmm5
|
|
punpckhqdq xmm2,xmm1
|
|
; Scatter phase: row vectors are spilled to 40h/90h/50h/0A0h and written back
; one dword per row through eax
movdqa [rsp+40h],xmm0
|
|
movdqa xmm0,xmm3
|
|
movdqa [rsp+90h],xmm2
|
|
mov eax,[rsp+40h]
|
|
mov [rdi-2],eax
|
|
mov eax, [rsp+90h]
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm3,xmm4
|
|
mov [rsi+rdi-2],eax
|
|
movdqa [rsp+50h],xmm0
|
|
mov eax,[rsp+50h]
|
|
movdqa [rsp+0A0h],xmm3
|
|
mov [rdi+rsi*2-2],eax
|
|
mov eax,[rsp+0A0h]
|
|
mov [r10+rdi-2],eax
|
|
; rows 4..7 of the first pointer (rbx already includes the -2 offset)
mov eax,[rsp+48h]
|
|
mov [rbx],eax
|
|
mov eax,[rsp+98h]
|
|
mov [rsi+rbx],eax
|
|
mov eax,[rsp+58h]
|
|
mov [rbx+rsi*2],eax
|
|
mov eax, [rsp+0A8h]
|
|
mov [r10+rbx],eax
|
|
; rows 0..3 of the second pointer (r12)
mov eax, [rsp+44h]
|
|
mov [r12-2],eax
|
|
mov eax,[rsp+94h]
|
|
mov [rsi+r12-2],eax
|
|
mov eax,[rsp+54h]
|
|
mov [r12+rsi*2-2],eax
|
|
mov eax, [rsp+0A4h]
|
|
mov [r10+r12-2],eax
|
|
; rows 4..7 of the second pointer (rbp already includes the -2 offset)
mov eax,[rsp+4Ch]
|
|
mov [rbp],eax
|
|
mov eax,[rsp+9Ch]
|
|
mov [rsi+rbp],eax
|
|
mov eax, [rsp+5Ch]
|
|
mov [rbp+rsi*2],eax
|
|
mov eax,[rsp+0ACh]
|
|
mov [r10+rbp],eax
|
|
; epilogue: release the 170h scratch area and restore the saved registers
lea r11,[rsp+170h]
|
|
mov rsp,r11
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
|
|
|
|
|
|
%elifdef X86_32
|
|
|
|
;********************************************************************************
|
|
; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
|
|
; int32_t iAlpha, int32_t iBeta)
|
|
;********************************************************************************
|
|
; DeblockChromaEq4V_ssse3 (x86-32 variant).
; Stack arguments read through ebp: [ebp+8] = pPixCb, [ebp+0Ch] = pPixCr,
; [ebp+10h] = iStride, [ebp+14h] and [ebp+18h] = the two thresholds whose low
; words are broadcast into xmm5 and xmm6 (iAlpha and iBeta by argument order).
; Four rows of 8 bytes are read at -2*stride, -stride, 0 and +stride from each
; pointer; the Cb row occupies the low qword and the Cr row the high qword.
; The rows at -stride and 0 are replaced where the threshold mask holds and
; written back; the outer two rows are only read.
; esi and edi are saved and restored; esp is realigned to 16 bytes.
WELS_EXTERN DeblockChromaEq4V_ssse3
|
|
push ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF0h
|
|
; 68h plus the two pushes below (8 bytes) keeps esp 16-byte aligned for movdqa
sub esp,68h
|
|
mov edx,[ebp+10h] ; iStride
|
|
mov eax,[ebp+8] ; pPixCb
|
|
mov ecx,[ebp+0Ch] ; pPixCr
|
|
; xmm4 = Cr row 0, xmm5 = Cr row +1 (merged into xmm3 / xmm4 below)
movq xmm4,[ecx]
|
|
movq xmm5,[edx+ecx]
|
|
push esi
|
|
push edi
|
|
; xmm1 = row -2 (Cb | Cr)
lea esi,[edx+edx]
|
|
mov edi,eax
|
|
sub edi,esi
|
|
movq xmm1,[edi]
|
|
mov edi,ecx
|
|
sub edi,esi
|
|
movq xmm2,[edi]
|
|
punpcklqdq xmm1,xmm2
|
|
; xmm2 = row -1 (Cb | Cr); esi = pPixCb - stride and edi = pPixCr - stride
; stay live until the stores at the end
mov esi,eax
|
|
sub esi,edx
|
|
movq xmm2,[esi]
|
|
mov edi,ecx
|
|
sub edi,edx
|
|
movq xmm3,[edi]
|
|
punpcklqdq xmm2,xmm3
|
|
; xmm3 = row 0, xmm4 = row +1
movq xmm3,[eax]
|
|
punpcklqdq xmm3,xmm4
|
|
movq xmm4,[edx+eax]
|
|
mov edx, [ebp + 14h]
|
|
punpcklqdq xmm4,xmm5
|
|
movd xmm5,edx
|
|
mov edx, [ebp + 18h]
|
|
; xmm0 = 0 for byte -> word zero extension
pxor xmm0,xmm0
|
|
; xmm5 = low word of [ebp+14h] in all 8 words
movdqa xmm6,xmm5
|
|
punpcklwd xmm6,xmm5
|
|
pshufd xmm5,xmm6,0
|
|
; xmm6 = low word of [ebp+18h] in all 8 words
movd xmm6,edx
|
|
movdqa xmm7,xmm6
|
|
punpcklwd xmm7,xmm6
|
|
pshufd xmm6,xmm7,0
|
|
; Widen the rows to words and spill them:
;   [esp+60h] = row -2 low,  [esp+40h] = row -2 high
;   [esp+10h] = row -1 low,  xmm2      = row -1 high
;   [esp+50h] = row  0 low,  [esp+30h] = row  0 high
;   xmm7      = row +1 low,  [esp+20h] = row +1 high
movdqa xmm7,xmm1
|
|
punpckhbw xmm1,xmm0
|
|
punpcklbw xmm7,xmm0
|
|
movdqa [esp+40h],xmm1
|
|
movdqa [esp+60h],xmm7
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm7,xmm0
|
|
movdqa [esp+10h],xmm7
|
|
movdqa xmm7,xmm3
|
|
punpcklbw xmm7,xmm0
|
|
punpckhbw xmm3,xmm0
|
|
movdqa [esp+50h],xmm7
|
|
movdqa xmm7,xmm4
|
|
punpckhbw xmm4,xmm0
|
|
punpckhbw xmm2,xmm0
|
|
punpcklbw xmm7,xmm0
|
|
movdqa [esp+30h],xmm3
|
|
movdqa xmm3,[esp+10h]
|
|
; Low-half mask in xmm0:
;   |row-1 - row0| < xmm5  and  |row-2 - row-1| < xmm6  and  |row+1 - row0| < xmm6
movdqa xmm1,xmm3
|
|
psubw xmm1,[esp+50h]
|
|
pabsw xmm1,xmm1
|
|
movdqa [esp+20h],xmm4
|
|
movdqa xmm0,xmm5
|
|
pcmpgtw xmm0,xmm1
|
|
movdqa xmm1,[esp+60h]
|
|
psubw xmm1,xmm3
|
|
pabsw xmm1,xmm1
|
|
movdqa xmm4,xmm6
|
|
pcmpgtw xmm4,xmm1
|
|
pand xmm0,xmm4
|
|
movdqa xmm1,xmm7
|
|
psubw xmm1,[esp+50h]
|
|
pabsw xmm1,xmm1
|
|
movdqa xmm4,xmm6
|
|
pcmpgtw xmm4,xmm1
|
|
; High-half mask in xmm5, same three comparisons
movdqa xmm1,xmm2
|
|
psubw xmm1,[esp+30h]
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm5,xmm1
|
|
movdqa xmm1,[esp+40h]
|
|
pand xmm0,xmm4
|
|
psubw xmm1,xmm2
|
|
pabsw xmm1,xmm1
|
|
movdqa xmm4,xmm6
|
|
pcmpgtw xmm4,xmm1
|
|
movdqa xmm1,[esp+20h]
|
|
psubw xmm1,[esp+30h]
|
|
pand xmm5,xmm4
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm6,xmm1
|
|
pand xmm5,xmm6
|
|
; rounding constant 2, broadcast to all words and parked at [esp+10h]
mov edx,2
|
|
movsx edx,dx
|
|
movd xmm1,edx
|
|
movdqa xmm4,xmm1
|
|
punpcklwd xmm4,xmm1
|
|
pshufd xmm1,xmm4,0
|
|
; new row -1 (low) = (2*row-2 + row-1 + row+1 + 2) >> 2, selected by mask xmm0
movdqa xmm4,[esp+60h]
|
|
movdqa xmm6,xmm4
|
|
paddw xmm6,xmm4
|
|
paddw xmm6,xmm3
|
|
paddw xmm6,xmm7
|
|
movdqa [esp+10h],xmm1
|
|
paddw xmm6,[esp+10h]
|
|
psraw xmm6,2
|
|
movdqa xmm4,xmm0
|
|
pandn xmm4,xmm3
|
|
movdqa xmm3,[esp+40h]
|
|
movdqa xmm1,xmm0
|
|
pand xmm1,xmm6
|
|
por xmm1,xmm4
|
|
; new row -1 (high), selected by mask xmm5
movdqa xmm6,xmm3
|
|
paddw xmm6,xmm3
|
|
movdqa xmm3,[esp+10h]
|
|
paddw xmm6,xmm2
|
|
paddw xmm6,[esp+20h]
|
|
paddw xmm6,xmm3
|
|
psraw xmm6,2
|
|
movdqa xmm4,xmm5
|
|
pand xmm4,xmm6
|
|
movdqa xmm6,xmm5
|
|
pandn xmm6,xmm2
|
|
por xmm4,xmm6
|
|
; xmm1 = new row -1 as bytes (Cb low qword, Cr high qword)
packuswb xmm1,xmm4
|
|
; new row 0 (low) = (2*row+1 + row0 + row-2 + 2) >> 2, selected by mask xmm0
movdqa xmm4,[esp+50h]
|
|
movdqa xmm6,xmm7
|
|
paddw xmm6,xmm7
|
|
paddw xmm6,xmm4
|
|
paddw xmm6,[esp+60h]
|
|
paddw xmm6,xmm3
|
|
psraw xmm6,2
|
|
movdqa xmm2,xmm0
|
|
pand xmm2,xmm6
|
|
pandn xmm0,xmm4
|
|
por xmm2,xmm0
|
|
; new row 0 (high), selected by mask xmm5
movdqa xmm0,[esp+20h]
|
|
movdqa xmm6,xmm0
|
|
paddw xmm6,xmm0
|
|
movdqa xmm0,[esp+30h]
|
|
paddw xmm6,xmm0
|
|
paddw xmm6,[esp+40h]
|
|
movdqa xmm4,xmm5
|
|
paddw xmm6,xmm3
|
|
; store Cb row -1
movq [esi],xmm1
|
|
psraw xmm6,2
|
|
pand xmm4,xmm6
|
|
pandn xmm5,xmm0
|
|
por xmm4,xmm5
|
|
packuswb xmm2,xmm4
|
|
; store Cb row 0, then the Cr halves after shifting the high qword down
movq [eax],xmm2
|
|
psrldq xmm1,8
|
|
movq [edi],xmm1
|
|
pop edi
|
|
psrldq xmm2,8
|
|
movq [ecx],xmm2
|
|
pop esi
|
|
mov esp,ebp
|
|
pop ebp
|
|
ret
|
|
|
|
;******************************************************************************
|
|
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
|
|
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
|
|
;*******************************************************************************
|
|
|
|
; DeblockChromaLt4V_ssse3 (x86-32 variant).
; Stack arguments read through ebp: [ebp+8] and [ebp+0Ch] = the two pixel
; pointers, [ebp+10h] = stride, [ebp+14h] / [ebp+18h] = thresholds (read as
; sign-extended words), [ebp+1Ch] = pTC (four signed bytes).
; Four rows of 8 bytes (-2*stride, -stride, 0, +stride) are loaded from each
; pointer (first pointer in the low qword, second in the high qword); a
; tc-clamped delta is added to row -1 and subtracted from row 0 where the
; threshold mask holds, and those two rows are stored back.
; ebx, esi and edi are saved and restored; esp is realigned to 16 bytes.
WELS_EXTERN DeblockChromaLt4V_ssse3
|
|
push ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF0h
|
|
; 0E4h plus the three pushes below (12 bytes) = 0F0h, so esp ends up 16-byte aligned
sub esp,0E4h
|
|
push ebx
|
|
push esi
|
|
mov esi, [ebp+1Ch] ; pTC
|
|
; Read the four tc bytes: di = tc[3], [esp+0Ch] = tc[2], bx = tc[1], [esp+0Eh] = tc[0]
movsx ebx, byte [esi+2]
|
|
push edi
|
|
movsx di,byte [esi+3]
|
|
mov word [esp+0Ch],bx
|
|
movsx bx,byte [esi+1]
|
|
movsx esi,byte [esi]
|
|
mov word [esp+0Eh],si
|
|
; Each tc value is placed into two xmm registers; the punpcklwd chain below
; merges them into xmm0 = {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}
movzx esi,di
|
|
movd xmm1,esi
|
|
movzx esi,di
|
|
movd xmm2,esi
|
|
mov si,word [esp+0Ch]
|
|
mov edx, [ebp + 10h]
|
|
mov eax, [ebp + 08h]
|
|
movzx edi,si
|
|
movzx esi,si
|
|
mov ecx, [ebp + 0Ch]
|
|
movd xmm4,esi
|
|
movzx esi,bx
|
|
movd xmm5,esi
|
|
movd xmm3,edi
|
|
movzx esi,bx
|
|
movd xmm6,esi
|
|
mov si,word [esp+0Eh]
|
|
movzx edi,si
|
|
movzx esi,si
|
|
punpcklwd xmm6,xmm2
|
|
; [esp+40h] = zero vector, reloaded into xmm1 below
pxor xmm0,xmm0
|
|
movdqa [esp+40h],xmm0
|
|
movd xmm7,edi
|
|
movd xmm0,esi
|
|
; esi = 2*stride, edi = first pointer - 2*stride
lea esi,[edx+edx]
|
|
mov edi,eax
|
|
sub edi,esi
|
|
punpcklwd xmm5,xmm1
|
|
movdqa xmm1,[esp+40h]
|
|
punpcklwd xmm0,xmm4
|
|
; xmm4 = second pointer row +1, xmm3 = first pointer row 0, xmm6 = first pointer row -2
movq xmm4,[edx+ecx]
|
|
punpcklwd xmm7,xmm3
|
|
movq xmm3,[eax]
|
|
punpcklwd xmm0,xmm6
|
|
movq xmm6,[edi]
|
|
punpcklwd xmm7,xmm5
|
|
; xmm0 = tc vector (each tc value in two adjacent words)
punpcklwd xmm0,xmm7
|
|
mov edi,ecx
|
|
sub edi,esi
|
|
; [esp+60h] = 0 - tc
movdqa xmm2,xmm1
|
|
psubw xmm2,xmm0
|
|
movdqa [esp+60h],xmm2
|
|
movq xmm2, [edi]
|
|
; xmm6 = row -2 (both planes)
punpcklqdq xmm6,xmm2
|
|
; esi = first pointer - stride, edi = second pointer - stride; both stay live
; until the stores at the end
mov esi,eax
|
|
sub esi,edx
|
|
movq xmm7,[esi]
|
|
mov edi,ecx
|
|
sub edi,edx
|
|
movq xmm2,[edi]
|
|
; xmm7 = row -1, xmm3 = row 0
punpcklqdq xmm7,xmm2
|
|
movq xmm2,[ecx]
|
|
punpcklqdq xmm3,xmm2
|
|
movq xmm2,[edx+eax]
|
|
movsx edx,word [ebp + 14h]
|
|
; [esp+0E0h] = row +1
punpcklqdq xmm2,xmm4
|
|
movdqa [esp+0E0h],xmm2
|
|
movd xmm2,edx
|
|
movsx edx,word [ebp + 18h]
|
|
movdqa xmm4,xmm2
|
|
punpcklwd xmm4,xmm2
|
|
movd xmm2,edx
|
|
movdqa xmm5,xmm2
|
|
punpcklwd xmm5,xmm2
|
|
; [esp+50h] = word [ebp+18h] in all 8 words
pshufd xmm2,xmm5,0
|
|
movdqa [esp+50h],xmm2
|
|
; Widen rows to words:
;   [esp+30h] = row -2 low,  [esp+80h]  = row -2 high
;   xmm2      = row -1 low,  [esp+0A0h] = row -1 high
;   xmm3      = row  0 low,  [esp+70h]  = row  0 high
;   xmm5      = row +1 low,  [esp+90h]  = row +1 high
movdqa xmm2,xmm6
|
|
punpcklbw xmm2,xmm1
|
|
movdqa [esp+0D0h],xmm3
|
|
; xmm4 = word [ebp+14h] in all 8 words
pshufd xmm4,xmm4,0
|
|
movdqa [esp+30h],xmm2
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+80h],xmm6
|
|
movdqa xmm6,[esp+0D0h]
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+70h],xmm6
|
|
movdqa xmm6, [esp+0E0h]
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+90h],xmm6
|
|
movdqa xmm5, [esp+0E0h]
|
|
movdqa xmm2,xmm7
|
|
punpckhbw xmm7,xmm1
|
|
punpcklbw xmm5,xmm1
|
|
movdqa [esp+0A0h],xmm7
|
|
punpcklbw xmm3,xmm1
|
|
; rounding constant 4, broadcast and parked at [esp+20h]
mov edx,4
|
|
punpcklbw xmm2,xmm1
|
|
movsx edx,dx
|
|
movd xmm6,edx
|
|
movdqa xmm7,xmm6
|
|
punpcklwd xmm7,xmm6
|
|
pshufd xmm6,xmm7,0
|
|
movdqa xmm7,[esp+30h]
|
|
movdqa [esp+20h],xmm6
|
|
; xmm7 = row-2 - row+1 (low half)
psubw xmm7,xmm5
|
|
; [esp+40h] = per-lane mask (tc > 0)
movdqa xmm6,xmm0
|
|
pcmpgtw xmm6,xmm1
|
|
movdqa xmm1,[esp+60h]
|
|
movdqa [esp+40h],xmm6
|
|
; low half: delta = min(tc, max(-tc, (((row0 - row-1) << 2) + (row-2 - row+1) + 4) >> 3))
movdqa xmm6,xmm3
|
|
psubw xmm6,xmm2
|
|
psllw xmm6,2
|
|
paddw xmm6,xmm7
|
|
paddw xmm6, [esp+20h]
|
|
movdqa xmm7, [esp+50h]
|
|
psraw xmm6,3
|
|
pmaxsw xmm1,xmm6
|
|
movdqa [esp+10h],xmm0
|
|
movdqa xmm6, [esp+10h]
|
|
pminsw xmm6,xmm1
|
|
movdqa [esp+10h],xmm6
|
|
; low-half mask: |row-1 - row0| < xmm4, |row-2 - row-1| < [esp+50h],
; |row+1 - row0| < [esp+50h], and tc > 0
movdqa xmm1,xmm2
|
|
psubw xmm1,xmm3
|
|
pabsw xmm1,xmm1
|
|
movdqa xmm6,xmm4
|
|
pcmpgtw xmm6,xmm1
|
|
movdqa xmm1, [esp+30h]
|
|
psubw xmm1,xmm2
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm7,xmm1
|
|
movdqa xmm1,[esp+50h]
|
|
pand xmm6,xmm7
|
|
movdqa xmm7,[esp+50h]
|
|
psubw xmm5,xmm3
|
|
pabsw xmm5,xmm5
|
|
pcmpgtw xmm1,xmm5
|
|
movdqa xmm5,[esp+80h]
|
|
psubw xmm5,[esp+90h]
|
|
pand xmm6,xmm1
|
|
pand xmm6,[esp+40h]
|
|
movdqa xmm1,[esp+10h]
|
|
; [esp+30h] = masked low-half delta
pand xmm1,xmm6
|
|
movdqa xmm6,[esp+70h]
|
|
movdqa [esp+30h],xmm1
|
|
; high half: same delta computation, clamped into xmm0
movdqa xmm1,[esp+0A0h]
|
|
psubw xmm6,xmm1
|
|
psllw xmm6,2
|
|
paddw xmm6,xmm5
|
|
paddw xmm6,[esp+20h]
|
|
movdqa xmm5,[esp+60h]
|
|
psraw xmm6,3
|
|
pmaxsw xmm5,xmm6
|
|
pminsw xmm0,xmm5
|
|
; high-half mask in xmm4
movdqa xmm5,[esp+70h]
|
|
movdqa xmm6,xmm1
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm4,xmm6
|
|
movdqa xmm6,[esp+80h]
|
|
psubw xmm6,xmm1
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6,[esp+90h]
|
|
pand xmm4,xmm7
|
|
movdqa xmm7,[esp+50h]
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm7,xmm6
|
|
pand xmm4,xmm7
|
|
pand xmm4,[esp+40h]
|
|
pand xmm0,xmm4
|
|
; row -1 += delta, repacked to bytes and stored
movdqa xmm4,[esp+30h]
|
|
paddw xmm2,xmm4
|
|
paddw xmm1,xmm0
|
|
packuswb xmm2,xmm1
|
|
movq [esi],xmm2
|
|
; row 0 -= delta, repacked to bytes and stored
psubw xmm3,xmm4
|
|
psubw xmm5,xmm0
|
|
packuswb xmm3,xmm5
|
|
movq [eax],xmm3
|
|
; the high qwords go to the second pointer's rows
psrldq xmm2,8
|
|
movq [edi],xmm2
|
|
pop edi
|
|
pop esi
|
|
psrldq xmm3,8
|
|
movq [ecx],xmm3
|
|
pop ebx
|
|
mov esp,ebp
|
|
pop ebp
|
|
ret
|
|
|
|
;***************************************************************************
|
|
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
|
|
; int32_t iAlpha, int32_t iBeta)
|
|
;***************************************************************************
|
|
|
|
; DeblockChromaEq4H_ssse3 (x86-32 variant).
; Stack arguments read through ebp: [ebp+8] and [ebp+0Ch] = the two pixel
; pointers, [ebp+10h] = stride, [ebp+14h] / [ebp+18h] = thresholds (read as
; sign-extended words).
; For each pointer, 4 bytes starting at offset -2 are read from 8 consecutive
; rows and transposed into four 16-byte column vectors held at
; [esp+80h..0B0h]; columns 1 and 2 are replaced where the threshold mask
; holds, everything is transposed back and 4 bytes per row are stored.
; Local dword slots, as addressed once both pushes are done:
;   [esp+8]   = ptr2 - 2             [esp+0Ch] = ptr2 - 2 + 4*stride
;   [esp+10h] = address of the column scratch area (esp+80h)
;   [esp+14h] = 3*stride
;   [esp+18h] = ptr1 - 2 + 4*stride  [esp+1Ch] = ptr1 - 2
; esi and edi are saved and restored; esp is realigned to 16 bytes.
WELS_EXTERN DeblockChromaEq4H_ssse3
|
|
push ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF0h
|
|
; 0C8h plus the two pushes (8 bytes) = 0D0h keeps esp 16-byte aligned afterwards
sub esp,0C8h
|
|
mov ecx,dword [ebp+8]
|
|
mov edx,dword [ebp+0Ch]
|
|
mov eax,dword [ebp+10h]
|
|
sub ecx,2
|
|
sub edx,2
|
|
push esi
|
|
lea esi,[eax+eax*2]
|
|
; these two stores happen before "push edi", so the slots are later seen at +4
mov dword [esp+18h],ecx
|
|
mov dword [esp+4],edx
|
|
lea ecx,[ecx+eax*4]
|
|
lea edx,[edx+eax*4]
|
|
; esp+7Ch here equals esp+80h once edi has been pushed
lea eax,[esp+7Ch]
|
|
push edi
|
|
mov dword [esp+14h],esi
|
|
mov dword [esp+18h],ecx
|
|
mov dword [esp+0Ch],edx
|
|
mov dword [esp+10h],eax
|
|
; Gather rows 0..3: xmm0..xmm3 from ptr1, xmm4..xmm7 from ptr2 (ecx = stride, edx = 3*stride)
mov esi,dword [esp+1Ch]
|
|
mov ecx,dword [ebp+10h]
|
|
mov edx,dword [esp+14h]
|
|
movd xmm0,dword [esi]
|
|
movd xmm1,dword [esi+ecx]
|
|
movd xmm2,dword [esi+ecx*2]
|
|
movd xmm3,dword [esi+edx]
|
|
mov esi,dword [esp+8]
|
|
movd xmm4,dword [esi]
|
|
movd xmm5,dword [esi+ecx]
|
|
movd xmm6,dword [esi+ecx*2]
|
|
movd xmm7,dword [esi+edx]
|
|
punpckldq xmm0,xmm4
|
|
punpckldq xmm1,xmm5
|
|
punpckldq xmm2,xmm6
|
|
punpckldq xmm3,xmm7
|
|
; Gather rows 4..7 into the high qwords of xmm0..xmm3
mov esi,dword [esp+18h]
|
|
mov edi,dword [esp+0Ch]
|
|
movd xmm4,dword [esi]
|
|
movd xmm5,dword [edi]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm0,xmm4
|
|
movd xmm4,dword [esi+ecx]
|
|
movd xmm5,dword [edi+ecx]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm1,xmm4
|
|
movd xmm4,dword [esi+ecx*2]
|
|
movd xmm5,dword [edi+ecx*2]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm2,xmm4
|
|
movd xmm4,dword [esi+edx]
|
|
movd xmm5,dword [edi+edx]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm3,xmm4
|
|
; Transpose rows -> columns with byte / word / dword / qword interleaves
movdqa xmm6,xmm0
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm2,xmm3
|
|
punpckhbw xmm7,xmm3
|
|
movdqa xmm4,xmm0
|
|
movdqa xmm5,xmm6
|
|
punpcklwd xmm0,xmm2
|
|
punpckhwd xmm4,xmm2
|
|
punpcklwd xmm6,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,xmm4
|
|
punpckldq xmm0,xmm6
|
|
punpckhdq xmm1,xmm6
|
|
punpckldq xmm4,xmm5
|
|
punpckhdq xmm2,xmm5
|
|
movdqa xmm5,xmm0
|
|
movdqa xmm6,xmm1
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm5,xmm4
|
|
punpcklqdq xmm1,xmm2
|
|
punpckhqdq xmm6,xmm2
|
|
; Spill columns: [edi] = col0, [edi+10h] = col1, [edi+20h] = col2, [edi+30h] = col3
; (edi = esp+80h; ptr1 rows in the low qword, ptr2 rows in the high qword)
mov edi,dword [esp+10h]
|
|
movdqa [edi],xmm0
|
|
movdqa [edi+10h],xmm5
|
|
movdqa [edi+20h],xmm1
|
|
movdqa [edi+30h],xmm6
|
|
movsx ecx,word [ebp+14h]
|
|
movsx edx,word [ebp+18h]
|
|
movdqa xmm6,[esp+80h]
|
|
movdqa xmm4,[esp+90h]
|
|
movdqa xmm5,[esp+0A0h]
|
|
movdqa xmm7,[esp+0B0h]
|
|
pxor xmm0,xmm0
|
|
; xmm1 = word [ebp+14h] in all words, xmm2 = word [ebp+18h] in all words
movd xmm1,ecx
|
|
movdqa xmm2,xmm1
|
|
punpcklwd xmm2,xmm1
|
|
pshufd xmm1,xmm2,0
|
|
movd xmm2,edx
|
|
movdqa xmm3,xmm2
|
|
punpcklwd xmm3,xmm2
|
|
pshufd xmm2,xmm3,0
|
|
; Widen to words:
;   low halves : xmm3 = col0, xmm4 = col1, xmm5 = col2, [esp+50h] = col3
;   high halves: [esp+60h] = col0, [esp+30h] = col1, [esp+40h] = col2, [esp+70h] = col3
movdqa xmm3,xmm6
|
|
punpckhbw xmm6,xmm0
|
|
movdqa [esp+60h],xmm6
|
|
movdqa xmm6,[esp+90h]
|
|
punpckhbw xmm6,xmm0
|
|
movdqa [esp+30h],xmm6
|
|
movdqa xmm6,[esp+0A0h]
|
|
punpckhbw xmm6,xmm0
|
|
movdqa [esp+40h],xmm6
|
|
movdqa xmm6,[esp+0B0h]
|
|
punpckhbw xmm6,xmm0
|
|
movdqa [esp+70h],xmm6
|
|
punpcklbw xmm7,xmm0
|
|
punpcklbw xmm4,xmm0
|
|
punpcklbw xmm5,xmm0
|
|
punpcklbw xmm3,xmm0
|
|
movdqa [esp+50h],xmm7
|
|
; Low-half mask in xmm0:
;   |col1 - col2| < xmm1  and  |col0 - col1| < xmm2  and  |col3 - col2| < xmm2
movdqa xmm6,xmm4
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
movdqa xmm0,xmm1
|
|
pcmpgtw xmm0,xmm6
|
|
movdqa xmm6,xmm3
|
|
psubw xmm6,xmm4
|
|
pabsw xmm6,xmm6
|
|
movdqa xmm7,xmm2
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6,[esp+50h]
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pand xmm0,xmm7
|
|
movdqa xmm7,xmm2
|
|
pcmpgtw xmm7,xmm6
|
|
; High-half mask in xmm1, same three comparisons
movdqa xmm6,[esp+30h]
|
|
psubw xmm6,[esp+40h]
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm1,xmm6
|
|
movdqa xmm6,[esp+60h]
|
|
psubw xmm6,[esp+30h]
|
|
pabsw xmm6,xmm6
|
|
pand xmm0,xmm7
|
|
movdqa xmm7,xmm2
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6,[esp+70h]
|
|
psubw xmm6,[esp+40h]
|
|
pabsw xmm6,xmm6
|
|
pand xmm1,xmm7
|
|
pcmpgtw xmm2,xmm6
|
|
pand xmm1,xmm2
|
|
; rounding constant 2, broadcast and parked at [esp+20h]
mov eax,2
|
|
movsx ecx,ax
|
|
movd xmm2,ecx
|
|
movdqa xmm6,xmm2
|
|
punpcklwd xmm6,xmm2
|
|
pshufd xmm2,xmm6,0
|
|
movdqa [esp+20h],xmm2
|
|
; new col1 (low) = (2*col0 + col1 + col3 + 2) >> 2, selected by mask xmm0
movdqa xmm2,xmm3
|
|
paddw xmm2,xmm3
|
|
paddw xmm2,xmm4
|
|
paddw xmm2,[esp+50h]
|
|
paddw xmm2,[esp+20h]
|
|
psraw xmm2,2
|
|
movdqa xmm6,xmm0
|
|
pand xmm6,xmm2
|
|
movdqa xmm2,xmm0
|
|
pandn xmm2,xmm4
|
|
por xmm6,xmm2
|
|
; new col1 (high), selected by mask xmm1
movdqa xmm2,[esp+60h]
|
|
movdqa xmm7,xmm2
|
|
paddw xmm7,xmm2
|
|
paddw xmm7,[esp+30h]
|
|
paddw xmm7,[esp+70h]
|
|
paddw xmm7,[esp+20h]
|
|
movdqa xmm4,xmm1
|
|
movdqa xmm2,xmm1
|
|
pandn xmm2,[esp+30h]
|
|
psraw xmm7,2
|
|
pand xmm4,xmm7
|
|
por xmm4,xmm2
|
|
movdqa xmm2,[esp+50h]
|
|
; write the new col1 back to its scratch slot
packuswb xmm6,xmm4
|
|
movdqa [esp+90h],xmm6
|
|
; new col2 (low) = (2*col3 + col2 + col0 + 2) >> 2, selected by mask xmm0
movdqa xmm6,xmm2
|
|
paddw xmm6,xmm2
|
|
movdqa xmm2,[esp+20h]
|
|
paddw xmm6,xmm5
|
|
paddw xmm6,xmm3
|
|
movdqa xmm4,xmm0
|
|
pandn xmm0,xmm5
|
|
paddw xmm6,xmm2
|
|
psraw xmm6,2
|
|
pand xmm4,xmm6
|
|
por xmm4,xmm0
|
|
; new col2 (high), selected by mask xmm1
movdqa xmm0,[esp+70h]
|
|
movdqa xmm5,xmm0
|
|
paddw xmm5,xmm0
|
|
movdqa xmm0,[esp+40h]
|
|
paddw xmm5,xmm0
|
|
paddw xmm5,[esp+60h]
|
|
movdqa xmm3,xmm1
|
|
paddw xmm5,xmm2
|
|
psraw xmm5,2
|
|
pand xmm3,xmm5
|
|
pandn xmm1,xmm0
|
|
por xmm3,xmm1
|
|
; write the new col2 back to its scratch slot
packuswb xmm4,xmm3
|
|
movdqa [esp+0A0h],xmm4
|
|
; Reload the four columns and transpose columns -> rows
mov esi,dword [esp+10h]
|
|
movdqa xmm0,[esi]
|
|
movdqa xmm1,[esi+10h]
|
|
movdqa xmm2,[esi+20h]
|
|
movdqa xmm3,[esi+30h]
|
|
movdqa xmm6,xmm0
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm2,xmm3
|
|
punpckhbw xmm7,xmm3
|
|
movdqa xmm4,xmm0
|
|
movdqa xmm5,xmm6
|
|
punpcklwd xmm0,xmm2
|
|
punpckhwd xmm4,xmm2
|
|
punpcklwd xmm6,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,xmm4
|
|
punpckldq xmm0,xmm6
|
|
punpckhdq xmm1,xmm6
|
|
punpckldq xmm4,xmm5
|
|
punpckhdq xmm2,xmm5
|
|
movdqa xmm5,xmm0
|
|
movdqa xmm6,xmm1
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm5,xmm4
|
|
punpcklqdq xmm1,xmm2
|
|
punpckhqdq xmm6,xmm2
|
|
; Scatter: xmm0 / xmm5 / xmm1 / xmm6 hold rows n, n+1, n+2, n+3; each dword is
; stored and the register shifted right by 4 bytes for the next destination
mov esi,dword [esp+1Ch]
|
|
mov ecx,dword [ebp+10h]
|
|
mov edx,dword [esp+14h]
|
|
mov edi,dword [esp+8]
|
|
; ptr1 rows 0..3
movd dword [esi],xmm0
|
|
movd dword [esi+ecx],xmm5
|
|
movd dword [esi+ecx*2],xmm1
|
|
movd dword [esi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
mov esi,dword [esp+18h]
|
|
; ptr2 rows 0..3
movd dword [edi],xmm0
|
|
movd dword [edi+ecx],xmm5
|
|
movd dword [edi+ecx*2],xmm1
|
|
movd dword [edi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
; ptr1 rows 4..7
movd dword [esi],xmm0
|
|
movd dword [esi+ecx],xmm5
|
|
movd dword [esi+ecx*2],xmm1
|
|
movd dword [esi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
; ptr2 rows 4..7
mov edi,dword [esp+0Ch]
|
|
movd dword [edi],xmm0
|
|
movd dword [edi+ecx],xmm5
|
|
movd dword [edi+ecx*2],xmm1
|
|
movd dword [edi+edx],xmm6
|
|
pop edi
|
|
pop esi
|
|
mov esp,ebp
|
|
pop ebp
|
|
ret
|
|
|
|
;*******************************************************************************
|
|
; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
|
|
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
|
|
;*******************************************************************************
|
|
|
|
; DeblockChromaLt4H_ssse3 (x86-32 variant).
; Stack arguments read through ebp: [ebp+8] and [ebp+0Ch] = the two pixel
; pointers, [ebp+10h] = stride, [ebp+14h] / [ebp+18h] = thresholds (read as
; sign-extended words), [ebp+1Ch] = pointer to four signed tc bytes.
; For each pointer, 4 bytes starting at offset -2 are read from 8 consecutive
; rows and transposed into four column vectors held at [esp+70h..0A0h];
; a tc-clamped delta is added to column 1 and subtracted from column 2 where
; the threshold mask holds, then everything is transposed back and stored.
; Local dword slots, as addressed once both pushes are done:
;   [esp+8]   = ptr2 - 2             [esp+0Ch] = 3*stride
;   [esp+10h] = ptr2 - 2 + 4*stride  [esp+14h] = ptr1 - 2
;   [esp+18h] = ptr1 - 2 + 4*stride
;   [esp+1Ch] = address of the column scratch area (esp+70h)
; esi and edi are saved and restored; esp is realigned to 16 bytes.
WELS_EXTERN DeblockChromaLt4H_ssse3
|
|
push ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF0h
|
|
; 108h plus the two pushes (8 bytes) = 110h keeps esp 16-byte aligned afterwards
sub esp,108h
|
|
mov ecx,dword [ebp+8]
|
|
mov edx,dword [ebp+0Ch]
|
|
mov eax,dword [ebp+10h]
|
|
sub ecx,2
|
|
sub edx,2
|
|
push esi
|
|
lea esi,[eax+eax*2]
|
|
; these two stores happen before "push edi", so the slots are later seen at +4
mov dword [esp+10h],ecx
|
|
mov dword [esp+4],edx
|
|
lea ecx,[ecx+eax*4]
|
|
lea edx,[edx+eax*4]
|
|
; esp+6Ch here equals esp+70h once edi has been pushed
lea eax,[esp+6Ch]
|
|
push edi
|
|
mov dword [esp+0Ch],esi
|
|
mov dword [esp+18h],ecx
|
|
mov dword [esp+10h],edx
|
|
mov dword [esp+1Ch],eax
|
|
; Gather rows 0..3: xmm0..xmm3 from ptr1, xmm4..xmm7 from ptr2 (ecx = stride, edx = 3*stride)
mov esi,dword [esp+14h]
|
|
mov ecx,dword [ebp+10h]
|
|
mov edx,dword [esp+0Ch]
|
|
movd xmm0,dword [esi]
|
|
movd xmm1,dword [esi+ecx]
|
|
movd xmm2,dword [esi+ecx*2]
|
|
movd xmm3,dword [esi+edx]
|
|
mov esi,dword [esp+8]
|
|
movd xmm4,dword [esi]
|
|
movd xmm5,dword [esi+ecx]
|
|
movd xmm6,dword [esi+ecx*2]
|
|
movd xmm7,dword [esi+edx]
|
|
punpckldq xmm0,xmm4
|
|
punpckldq xmm1,xmm5
|
|
punpckldq xmm2,xmm6
|
|
punpckldq xmm3,xmm7
|
|
; Gather rows 4..7 into the high qwords of xmm0..xmm3
mov esi,dword [esp+18h]
|
|
mov edi,dword [esp+10h]
|
|
movd xmm4,dword [esi]
|
|
movd xmm5,dword [edi]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm0,xmm4
|
|
movd xmm4,dword [esi+ecx]
|
|
movd xmm5,dword [edi+ecx]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm1,xmm4
|
|
movd xmm4,dword [esi+ecx*2]
|
|
movd xmm5,dword [edi+ecx*2]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm2,xmm4
|
|
movd xmm4,dword [esi+edx]
|
|
movd xmm5,dword [edi+edx]
|
|
punpckldq xmm4,xmm5
|
|
punpcklqdq xmm3,xmm4
|
|
; Transpose rows -> columns with byte / word / dword / qword interleaves
movdqa xmm6,xmm0
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm2,xmm3
|
|
punpckhbw xmm7,xmm3
|
|
movdqa xmm4,xmm0
|
|
movdqa xmm5,xmm6
|
|
punpcklwd xmm0,xmm2
|
|
punpckhwd xmm4,xmm2
|
|
punpcklwd xmm6,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,xmm4
|
|
punpckldq xmm0,xmm6
|
|
punpckhdq xmm1,xmm6
|
|
punpckldq xmm4,xmm5
|
|
punpckhdq xmm2,xmm5
|
|
movdqa xmm5,xmm0
|
|
movdqa xmm6,xmm1
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm5,xmm4
|
|
punpcklqdq xmm1,xmm2
|
|
punpckhqdq xmm6,xmm2
|
|
; Spill columns: [edi] = col0, [edi+10h] = col1, [edi+20h] = col2, [edi+30h] = col3
; (edi = esp+70h; ptr1 rows in the low qword, ptr2 rows in the high qword)
mov edi,dword [esp+1Ch]
|
|
movdqa [edi],xmm0
|
|
movdqa [edi+10h],xmm5
|
|
movdqa [edi+20h],xmm1
|
|
movdqa [edi+30h],xmm6
|
|
; Read the four tc bytes: cx = tc[3], dx = tc[2], si = tc[1], ax = tc[0]
mov eax,dword [ebp+1Ch]
|
|
movsx cx,byte [eax+3]
|
|
movsx dx,byte [eax+2]
|
|
movsx si,byte [eax+1]
|
|
movsx ax,byte [eax]
|
|
; Each tc value is placed into two xmm registers; the punpcklwd chain below
; merges them into xmm0 = {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}
movzx edi,cx
|
|
movzx ecx,cx
|
|
movd xmm2,ecx
|
|
movzx ecx,dx
|
|
movzx edx,dx
|
|
movd xmm3,ecx
|
|
movd xmm4,edx
|
|
movzx ecx,si
|
|
movzx edx,si
|
|
movd xmm5,ecx
|
|
pxor xmm0,xmm0
|
|
movd xmm6,edx
|
|
movzx ecx,ax
|
|
; [esp+60h] = zero vector, reloaded into xmm1 below
movdqa [esp+60h],xmm0
|
|
movzx edx,ax
|
|
movsx eax,word [ebp+14h]
|
|
punpcklwd xmm6,xmm2
|
|
movd xmm1,edi
|
|
movd xmm7,ecx
|
|
movsx ecx,word [ebp+18h]
|
|
movd xmm0,edx
|
|
punpcklwd xmm7,xmm3
|
|
punpcklwd xmm5,xmm1
|
|
movdqa xmm1,[esp+60h]
|
|
punpcklwd xmm7,xmm5
|
|
; xmm5 = col3, xmm6 = col0, xmm7 = col1 (reloaded from the scratch area)
movdqa xmm5,[esp+0A0h]
|
|
punpcklwd xmm0,xmm4
|
|
punpcklwd xmm0,xmm6
|
|
movdqa xmm6, [esp+70h]
|
|
; xmm0 = tc vector (each tc value in two adjacent words)
punpcklwd xmm0,xmm7
|
|
movdqa xmm7,[esp+80h]
|
|
; [esp+0D0h] = 0 - tc
movdqa xmm2,xmm1
|
|
psubw xmm2,xmm0
|
|
movdqa [esp+0D0h],xmm2
|
|
; xmm4 = word [ebp+14h] in all words, [esp+50h] = word [ebp+18h] in all words
movd xmm2,eax
|
|
movdqa xmm3,xmm2
|
|
punpcklwd xmm3,xmm2
|
|
pshufd xmm4,xmm3,0
|
|
movd xmm2,ecx
|
|
movdqa xmm3,xmm2
|
|
punpcklwd xmm3,xmm2
|
|
pshufd xmm2,xmm3,0
|
|
movdqa xmm3, [esp+90h]
|
|
movdqa [esp+50h],xmm2
|
|
; Widen to words:
;   low halves : [esp+40h] = col0, xmm2 = col1, xmm3 = col2, xmm5 = col3
;   high halves: [esp+0B0h] = col0, [esp+0F0h] = col1, [esp+0C0h] = col2, [esp+0E0h] = col3
movdqa xmm2,xmm6
|
|
punpcklbw xmm2,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+40h],xmm2
|
|
movdqa [esp+0B0h],xmm6
|
|
movdqa xmm6,[esp+90h]
|
|
movdqa xmm2,xmm7
|
|
punpckhbw xmm7,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
punpcklbw xmm2,xmm1
|
|
punpcklbw xmm3,xmm1
|
|
punpcklbw xmm5,xmm1
|
|
movdqa [esp+0F0h],xmm7
|
|
movdqa [esp+0C0h],xmm6
|
|
movdqa xmm6, [esp+0A0h]
|
|
punpckhbw xmm6,xmm1
|
|
movdqa [esp+0E0h],xmm6
|
|
; rounding constant 4, broadcast and parked at [esp+30h]
mov edx,4
|
|
movsx eax,dx
|
|
movd xmm6,eax
|
|
movdqa xmm7,xmm6
|
|
punpcklwd xmm7,xmm6
|
|
pshufd xmm6,xmm7,0
|
|
movdqa [esp+30h],xmm6
|
|
; xmm7 = col0 - col3 (low half)
movdqa xmm7, [esp+40h]
|
|
psubw xmm7,xmm5
|
|
; [esp+60h] = per-lane mask (tc > 0)
movdqa xmm6,xmm0
|
|
pcmpgtw xmm6,xmm1
|
|
movdqa [esp+60h],xmm6
|
|
movdqa xmm1, [esp+0D0h]
|
|
; low half: delta = min(tc, max(-tc, (((col2 - col1) << 2) + (col0 - col3) + 4) >> 3))
movdqa xmm6,xmm3
|
|
psubw xmm6,xmm2
|
|
psllw xmm6,2
|
|
paddw xmm6,xmm7
|
|
paddw xmm6,[esp+30h]
|
|
psraw xmm6,3
|
|
pmaxsw xmm1,xmm6
|
|
movdqa xmm7,[esp+50h]
|
|
movdqa [esp+20h],xmm0
|
|
movdqa xmm6, [esp+20h]
|
|
pminsw xmm6,xmm1
|
|
movdqa [esp+20h],xmm6
|
|
; low-half mask: |col1 - col2| < xmm4, |col0 - col1| < [esp+50h],
; |col3 - col2| < [esp+50h], and tc > 0
movdqa xmm6,xmm4
|
|
movdqa xmm1,xmm2
|
|
psubw xmm1,xmm3
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm6,xmm1
|
|
movdqa xmm1, [esp+40h]
|
|
psubw xmm1,xmm2
|
|
pabsw xmm1,xmm1
|
|
pcmpgtw xmm7,xmm1
|
|
movdqa xmm1, [esp+50h]
|
|
pand xmm6,xmm7
|
|
movdqa xmm7, [esp+50h]
|
|
psubw xmm5,xmm3
|
|
pabsw xmm5,xmm5
|
|
pcmpgtw xmm1,xmm5
|
|
movdqa xmm5, [esp+0B0h]
|
|
psubw xmm5,[esp+0E0h]
|
|
pand xmm6,xmm1
|
|
pand xmm6, [esp+60h]
|
|
movdqa xmm1, [esp+20h]
|
|
; [esp+40h] = masked low-half delta
pand xmm1,xmm6
|
|
movdqa xmm6, [esp+0C0h]
|
|
movdqa [esp+40h],xmm1
|
|
; high half: same delta computation, clamped into xmm0
movdqa xmm1, [esp+0F0h]
|
|
psubw xmm6,xmm1
|
|
psllw xmm6,2
|
|
paddw xmm6,xmm5
|
|
paddw xmm6, [esp+30h]
|
|
movdqa xmm5, [esp+0D0h]
|
|
psraw xmm6,3
|
|
pmaxsw xmm5,xmm6
|
|
pminsw xmm0,xmm5
|
|
; high-half mask in xmm4
movdqa xmm5,[esp+0C0h]
|
|
movdqa xmm6,xmm1
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm4,xmm6
|
|
movdqa xmm6,[esp+0B0h]
|
|
psubw xmm6,xmm1
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm7,xmm6
|
|
movdqa xmm6, [esp+0E0h]
|
|
pand xmm4,xmm7
|
|
movdqa xmm7, [esp+50h]
|
|
psubw xmm6,xmm5
|
|
pabsw xmm6,xmm6
|
|
pcmpgtw xmm7,xmm6
|
|
pand xmm4,xmm7
|
|
pand xmm4,[esp+60h]
|
|
pand xmm0,xmm4
|
|
; col1 += delta, col2 -= delta; repack to bytes and write back to the scratch slots
movdqa xmm4, [esp+40h]
|
|
paddw xmm2,xmm4
|
|
paddw xmm1,xmm0
|
|
psubw xmm3,xmm4
|
|
psubw xmm5,xmm0
|
|
packuswb xmm2,xmm1
|
|
packuswb xmm3,xmm5
|
|
movdqa [esp+80h],xmm2
|
|
movdqa [esp+90h],xmm3
|
|
; Reload the four columns and transpose columns -> rows
mov esi,dword [esp+1Ch]
|
|
movdqa xmm0, [esi]
|
|
movdqa xmm1, [esi+10h]
|
|
movdqa xmm2, [esi+20h]
|
|
movdqa xmm3, [esi+30h]
|
|
movdqa xmm6,xmm0
|
|
punpcklbw xmm0,xmm1
|
|
punpckhbw xmm6,xmm1
|
|
movdqa xmm7,xmm2
|
|
punpcklbw xmm2,xmm3
|
|
punpckhbw xmm7,xmm3
|
|
movdqa xmm4,xmm0
|
|
movdqa xmm5,xmm6
|
|
punpcklwd xmm0,xmm2
|
|
punpckhwd xmm4,xmm2
|
|
punpcklwd xmm6,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
movdqa xmm1,xmm0
|
|
movdqa xmm2,xmm4
|
|
punpckldq xmm0,xmm6
|
|
punpckhdq xmm1,xmm6
|
|
punpckldq xmm4,xmm5
|
|
punpckhdq xmm2,xmm5
|
|
movdqa xmm5,xmm0
|
|
movdqa xmm6,xmm1
|
|
punpcklqdq xmm0,xmm4
|
|
punpckhqdq xmm5,xmm4
|
|
punpcklqdq xmm1,xmm2
|
|
punpckhqdq xmm6,xmm2
|
|
; Scatter: xmm0 / xmm5 / xmm1 / xmm6 hold rows n, n+1, n+2, n+3; each dword is
; stored and the register shifted right by 4 bytes for the next destination
mov esi,dword [esp+14h]
|
|
mov ecx,dword [ebp+10h]
|
|
mov edx,dword [esp+0Ch]
|
|
mov edi,dword [esp+8]
|
|
; ptr1 rows 0..3
movd dword [esi],xmm0
|
|
movd dword [esi+ecx],xmm5
|
|
movd dword [esi+ecx*2],xmm1
|
|
movd dword [esi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
mov esi,dword [esp+18h]
|
|
; ptr2 rows 0..3
movd dword [edi],xmm0
|
|
movd dword [edi+ecx],xmm5
|
|
movd dword [edi+ecx*2],xmm1
|
|
movd dword [edi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
; ptr1 rows 4..7
movd dword [esi],xmm0
|
|
movd dword [esi+ecx],xmm5
|
|
movd dword [esi+ecx*2],xmm1
|
|
movd dword [esi+edx],xmm6
|
|
psrldq xmm0,4
|
|
psrldq xmm5,4
|
|
psrldq xmm1,4
|
|
psrldq xmm6,4
|
|
; ptr2 rows 4..7
mov edi,dword [esp+10h]
|
|
movd dword [edi],xmm0
|
|
movd dword [edi+ecx],xmm5
|
|
movd dword [edi+ecx*2],xmm1
|
|
movd dword [edi+edx],xmm6
|
|
pop edi
|
|
pop esi
|
|
mov esp,ebp
|
|
pop ebp
|
|
ret
|
|
|
|
|
|
|
|
;*******************************************************************************
|
|
; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
|
|
; int32_t iBeta, int8_t * pTC)
|
|
;*******************************************************************************
|
|
|
|
|
|
WELS_EXTERN DeblockLumaLt4V_ssse3
|
|
push ebp
|
|
mov ebp, esp
|
|
and esp, -16 ; fffffff0H
|
|
sub esp, 420 ; 000001a4H
|
|
mov eax, dword [ebp+8]
|
|
mov ecx, dword [ebp+12]
|
|
|
|
pxor xmm0, xmm0
|
|
push ebx
|
|
mov edx, dword [ebp+24]
|
|
movdqa [esp+424-384], xmm0
|
|
push esi
|
|
|
|
lea esi, [ecx+ecx*2]
|
|
push edi
|
|
mov edi, eax
|
|
sub edi, esi
|
|
movdqa xmm0, [edi]
|
|
|
|
lea esi, [ecx+ecx]
|
|
movdqa [esp+432-208], xmm0
|
|
mov edi, eax
|
|
sub edi, esi
|
|
movdqa xmm0, [edi]
|
|
movdqa [esp+448-208], xmm0
|
|
|
|
mov ebx, eax
|
|
sub ebx, ecx
|
|
movdqa xmm0, [ebx]
|
|
movdqa [esp+464-208], xmm0
|
|
|
|
movdqa xmm0, [eax]
|
|
|
|
add ecx, eax
|
|
movdqa [esp+480-208], xmm0
|
|
movdqa xmm0, [ecx]
|
|
mov dword [esp+432-404], ecx
|
|
|
|
movsx ecx, word [ebp+16]
|
|
movdqa [esp+496-208], xmm0
|
|
movdqa xmm0, [esi+eax]
|
|
|
|
movsx si, byte [edx]
|
|
movdqa [esp+512-208], xmm0
|
|
movd xmm0, ecx
|
|
movsx ecx, word [ebp+20]
|
|
movdqa xmm1, xmm0
|
|
punpcklwd xmm1, xmm0
|
|
pshufd xmm0, xmm1, 0
|
|
movdqa [esp+432-112], xmm0
|
|
movd xmm0, ecx
|
|
movsx cx, byte [edx+1]
|
|
movdqa xmm1, xmm0
|
|
punpcklwd xmm1, xmm0
|
|
mov dword [esp+432-408], ebx
|
|
movzx ebx, cx
|
|
pshufd xmm0, xmm1, 0
|
|
movd xmm1, ebx
|
|
movzx ebx, cx
|
|
movd xmm2, ebx
|
|
movzx ebx, cx
|
|
movzx ecx, cx
|
|
movd xmm4, ecx
|
|
movzx ecx, si
|
|
movd xmm5, ecx
|
|
movzx ecx, si
|
|
movd xmm6, ecx
|
|
movzx ecx, si
|
|
movd xmm7, ecx
|
|
movzx ecx, si
|
|
movdqa [esp+432-336], xmm0
|
|
movd xmm0, ecx
|
|
|
|
movsx cx, byte [edx+3]
|
|
movsx dx, byte [edx+2]
|
|
movd xmm3, ebx
|
|
punpcklwd xmm0, xmm4
|
|
movzx esi, cx
|
|
punpcklwd xmm6, xmm2
|
|
punpcklwd xmm5, xmm1
|
|
punpcklwd xmm0, xmm6
|
|
punpcklwd xmm7, xmm3
|
|
punpcklwd xmm7, xmm5
|
|
punpcklwd xmm0, xmm7
|
|
movdqa [esp+432-400], xmm0
|
|
movd xmm0, esi
|
|
movzx esi, cx
|
|
movd xmm2, esi
|
|
movzx esi, cx
|
|
movzx ecx, cx
|
|
movd xmm4, ecx
|
|
movzx ecx, dx
|
|
movd xmm3, esi
|
|
movd xmm5, ecx
|
|
punpcklwd xmm5, xmm0
|
|
|
|
movdqa xmm0, [esp+432-384]
|
|
movzx ecx, dx
|
|
movd xmm6, ecx
|
|
movzx ecx, dx
|
|
movzx edx, dx
|
|
punpcklwd xmm6, xmm2
|
|
movd xmm7, ecx
|
|
movd xmm1, edx
|
|
|
|
movdqa xmm2, [esp+448-208]
|
|
punpcklbw xmm2, xmm0
|
|
|
|
mov ecx, 4
|
|
movsx edx, cx
|
|
punpcklwd xmm7, xmm3
|
|
punpcklwd xmm7, xmm5
|
|
movdqa xmm5, [esp+496-208]
|
|
movdqa xmm3, [esp+464-208]
|
|
punpcklbw xmm5, xmm0
|
|
movdqa [esp+432-240], xmm5
|
|
movdqa xmm5, [esp+512-208]
|
|
punpcklbw xmm5, xmm0
|
|
movdqa [esp+432-352], xmm5
|
|
punpcklwd xmm1, xmm4
|
|
movdqa xmm4, [esp+432-208]
|
|
punpcklwd xmm1, xmm6
|
|
movdqa xmm6, [esp+480-208]
|
|
punpcklwd xmm1, xmm7
|
|
punpcklbw xmm6, xmm0
|
|
punpcklbw xmm3, xmm0
|
|
punpcklbw xmm4, xmm0
|
|
movdqa xmm7, xmm3
|
|
psubw xmm7, xmm4
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+432-272], xmm4
|
|
movdqa xmm4, [esp+432-336]
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-288], xmm5
|
|
movdqa xmm7, xmm6
|
|
psubw xmm7, [esp+432-352]
|
|
pabsw xmm7, xmm7
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-256], xmm5
|
|
movdqa xmm5, xmm3
|
|
pavgw xmm5, xmm6
|
|
movdqa [esp+432-304], xmm5
|
|
movdqa xmm5, [esp+432-400]
|
|
psubw xmm5, [esp+432-288]
|
|
psubw xmm5, [esp+432-256]
|
|
movdqa [esp+432-224], xmm5
|
|
movdqa xmm5, xmm6
|
|
psubw xmm5, xmm3
|
|
movdqa [esp+432-32], xmm6
|
|
psubw xmm6, [esp+432-240]
|
|
movdqa xmm7, xmm5
|
|
movdqa [esp+432-384], xmm5
|
|
movdqa xmm5, [esp+432-112]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm5, xmm7
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm4
|
|
pcmpgtw xmm7, xmm6
|
|
|
|
pand xmm5, xmm7
|
|
movdqa xmm6, xmm3
|
|
psubw xmm6, xmm2
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm4
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-400]
|
|
pand xmm5, xmm7
|
|
movdqa xmm7, xmm6
|
|
pcmpeqw xmm6, xmm0
|
|
pcmpgtw xmm7, xmm0
|
|
por xmm7, xmm6
|
|
pand xmm5, xmm7
|
|
movdqa [esp+432-320], xmm5
|
|
movd xmm5, edx
|
|
movdqa xmm6, xmm5
|
|
punpcklwd xmm6, xmm5
|
|
pshufd xmm5, xmm6, 0
|
|
movdqa [esp+432-336], xmm5
|
|
movdqa xmm5, [esp+432-224]
|
|
movdqa [esp+432-368], xmm5
|
|
movdqa xmm6, xmm0
|
|
psubw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-384]
|
|
psllw xmm5, 2
|
|
movdqa xmm7, xmm2
|
|
psubw xmm7, [esp+432-240]
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+432-336]
|
|
movdqa xmm5, [esp+432-368]
|
|
psraw xmm7, 3
|
|
pmaxsw xmm6, xmm7
|
|
pminsw xmm5, xmm6
|
|
|
|
pand xmm5, [esp+432-320]
|
|
movdqa xmm6, [esp+432-400]
|
|
movdqa [esp+432-64], xmm5
|
|
movdqa [esp+432-384], xmm6
|
|
movdqa xmm5, xmm0
|
|
psubw xmm5, xmm6
|
|
movdqa [esp+432-368], xmm5
|
|
movdqa xmm6, xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
paddw xmm5, [esp+432-304]
|
|
movdqa xmm7, xmm2
|
|
paddw xmm7, xmm2
|
|
psubw xmm5, xmm7
|
|
psraw xmm5, 1
|
|
pmaxsw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-384]
|
|
pminsw xmm5, xmm6
|
|
|
|
pand xmm5, [esp+432-320]
|
|
pand xmm5, [esp+432-288]
|
|
movdqa xmm6, [esp+432-240]
|
|
movdqa [esp+432-96], xmm5
|
|
movdqa xmm5, [esp+432-352]
|
|
paddw xmm5, [esp+432-304]
|
|
movdqa xmm7, xmm6
|
|
paddw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-368]
|
|
psubw xmm5, xmm7
|
|
|
|
movdqa xmm7, [esp+496-208]
|
|
psraw xmm5, 1
|
|
pmaxsw xmm6, xmm5
|
|
movdqa xmm5, [esp+432-400]
|
|
pminsw xmm5, xmm6
|
|
pand xmm5, [esp+432-320]
|
|
pand xmm5, [esp+432-256]
|
|
movdqa xmm6, [esp+448-208]
|
|
punpckhbw xmm7, xmm0
|
|
movdqa [esp+432-352], xmm7
|
|
|
|
movdqa xmm7, [esp+512-208]
|
|
punpckhbw xmm6, xmm0
|
|
movdqa [esp+432-48], xmm5
|
|
movdqa xmm5, [esp+432-208]
|
|
movdqa [esp+432-368], xmm6
|
|
movdqa xmm6, [esp+464-208]
|
|
punpckhbw xmm7, xmm0
|
|
punpckhbw xmm5, xmm0
|
|
movdqa [esp+432-384], xmm7
|
|
punpckhbw xmm6, xmm0
|
|
movdqa [esp+432-400], xmm6
|
|
|
|
movdqa xmm7, [esp+432-400]
|
|
movdqa xmm6, [esp+480-208]
|
|
psubw xmm7, xmm5
|
|
movdqa [esp+432-16], xmm5
|
|
pabsw xmm7, xmm7
|
|
punpckhbw xmm6, xmm0
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-288], xmm5
|
|
|
|
movdqa xmm7, xmm6
|
|
psubw xmm7, [esp+432-384]
|
|
pabsw xmm7, xmm7
|
|
movdqa xmm5, xmm4
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa [esp+432-256], xmm5
|
|
|
|
movdqa xmm5, [esp+432-400]
|
|
movdqa [esp+432-80], xmm6
|
|
pavgw xmm5, xmm6
|
|
movdqa [esp+432-304], xmm5
|
|
|
|
movdqa xmm5, xmm1
|
|
psubw xmm5, [esp+432-288]
|
|
psubw xmm5, [esp+432-256]
|
|
movdqa [esp+432-224], xmm5
|
|
movdqa xmm5, xmm6
|
|
psubw xmm5, [esp+432-400]
|
|
psubw xmm6, [esp+432-352]
|
|
movdqa [esp+432-272], xmm5
|
|
movdqa xmm7, xmm5
|
|
movdqa xmm5, [esp+432-112]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm5, xmm7
|
|
movdqa xmm7, xmm4
|
|
pabsw xmm6, xmm6
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-368]
|
|
|
|
pand xmm5, xmm7
|
|
movdqa xmm7, [esp+432-400]
|
|
psubw xmm7, xmm6
|
|
psubw xmm6, [esp+432-352]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
pand xmm5, xmm4
|
|
|
|
paddw xmm2, [esp+432-96]
|
|
movdqa xmm4, xmm1
|
|
pcmpgtw xmm4, xmm0
|
|
movdqa xmm7, xmm1
|
|
pcmpeqw xmm7, xmm0
|
|
por xmm4, xmm7
|
|
pand xmm5, xmm4
|
|
movdqa xmm4, [esp+432-224]
|
|
movdqa [esp+432-320], xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
movdqa xmm7, xmm0
|
|
psubw xmm7, xmm4
|
|
psubw xmm0, xmm1
|
|
psllw xmm5, 2
|
|
paddw xmm6, xmm5
|
|
paddw xmm6, [esp+432-336]
|
|
movdqa xmm5, [esp+432-368]
|
|
movdqa [esp+432-336], xmm0
|
|
psraw xmm6, 3
|
|
pmaxsw xmm7, xmm6
|
|
pminsw xmm4, xmm7
|
|
pand xmm4, [esp+432-320]
|
|
movdqa xmm6, xmm0
|
|
movdqa xmm0, [esp+432-16]
|
|
paddw xmm0, [esp+432-304]
|
|
movdqa [esp+432-272], xmm4
|
|
movdqa xmm4, [esp+432-368]
|
|
paddw xmm4, xmm4
|
|
psubw xmm0, xmm4
|
|
|
|
movdqa xmm4, [esp+432-64]
|
|
psraw xmm0, 1
|
|
pmaxsw xmm6, xmm0
|
|
movdqa xmm0, [esp+432-400]
|
|
movdqa xmm7, xmm1
|
|
pminsw xmm7, xmm6
|
|
movdqa xmm6, [esp+432-320]
|
|
pand xmm7, xmm6
|
|
pand xmm7, [esp+432-288]
|
|
paddw xmm5, xmm7
|
|
packuswb xmm2, xmm5
|
|
movdqa xmm5, [esp+432-272]
|
|
paddw xmm0, xmm5
|
|
paddw xmm3, xmm4
|
|
packuswb xmm3, xmm0
|
|
|
|
movdqa xmm0, [esp+432-32]
|
|
psubw xmm0, xmm4
|
|
movdqa xmm4, [esp+432-80]
|
|
psubw xmm4, xmm5
|
|
|
|
movdqa xmm5, [esp+432-240]
|
|
paddw xmm5, [esp+432-48]
|
|
packuswb xmm0, xmm4
|
|
movdqa xmm4, [esp+432-384]
|
|
paddw xmm4, [esp+432-304]
|
|
movdqa [esp+480-208], xmm0
|
|
movdqa xmm0, [esp+432-352]
|
|
movdqa xmm7, xmm0
|
|
paddw xmm0, xmm0
|
|
|
|
mov ecx, dword [esp+432-408]
|
|
|
|
mov edx, dword [esp+432-404]
|
|
psubw xmm4, xmm0
|
|
movdqa xmm0, [esp+432-336]
|
|
movdqa [edi], xmm2
|
|
psraw xmm4, 1
|
|
pmaxsw xmm0, xmm4
|
|
pminsw xmm1, xmm0
|
|
movdqa xmm0, [esp+480-208]
|
|
|
|
pop edi
|
|
pand xmm1, xmm6
|
|
pand xmm1, [esp+428-256]
|
|
movdqa [ecx], xmm3
|
|
paddw xmm7, xmm1
|
|
pop esi
|
|
packuswb xmm5, xmm7
|
|
movdqa [eax], xmm0
|
|
movdqa [edx], xmm5
|
|
pop ebx
|
|
mov esp, ebp
|
|
pop ebp
|
|
ret
|
|
|
|
|
|
;*******************************************************************************
|
|
; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
|
|
; int32_t iBeta)
|
|
;*******************************************************************************
|
|
|
|
|
|
; 32-bit x86 routine with an ebp-based frame. Stack arguments:
;   [ebp+8]  pPix, [ebp+12] iStride, [ebp+16] iAlpha, [ebp+20] iBeta
; Only the low 16 bits of iAlpha and iBeta are used (both are read with
; "movsx ..., word").
;
; Notation used in the comments below: row(k) = the 16 bytes at pPix + k*iStride.
; The routine loads row(-4) .. row(3) and stores new values for row(-3) .. row(2).
; NOTE(review): in H.264 deblocking terms these rows are presumably p3..p0 and
; q0..q3 of a luma edge filtered with bS == 4; the code itself does not name them.
;
; Every row is accessed with movdqa, so each row address must be 16-byte aligned.
; Rows are widened from bytes to 16-bit lanes and handled in two passes: the low
; 8 bytes (punpcklbw) first, then the high 8 bytes (punpckhbw). The two halves are
; merged again with packuswb before being stored.
;
; ebp, ebx, esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten.
WELS_EXTERN DeblockLumaEq4V_ssse3
|
|
|
|
push ebp
|
|
mov ebp, esp
|
|
; Align the frame to 16 bytes and reserve 628 bytes. Together with the three
; 4-byte pushes below (ebx, esi, edi) esp ends up 640 bytes under the aligned
; address, so the 16-byte stack slots used once edi has been pushed are aligned.
and esp, -16 ; fffffff0H
|
|
sub esp, 628 ; 00000274H
|
|
; eax = pPix, ecx = iStride
mov eax, dword [ebp+8]
|
|
mov ecx, dword [ebp+12]
|
|
push ebx
|
|
push esi
|
|
|
|
; edx = 4*iStride
lea edx, [ecx*4]
|
|
; xmm2 = 0, used as the zero operand when unpacking bytes to words
pxor xmm0, xmm0
|
|
movdqa xmm2, xmm0
|
|
|
|
; xmm0 = row(1)
movdqa xmm0, [ecx+eax]
|
|
; xmm3 = row(-4)
mov esi, eax
|
|
sub esi, edx
|
|
movdqa xmm3, [esi]
|
|
; xmm5 = row(0)
movdqa xmm5, [eax]
|
|
push edi
|
|
; edi = 2*iStride (also spilled to the stack), ebx = 3*iStride
lea edi, [ecx+ecx]
|
|
lea ebx, [ecx+ecx*2]
|
|
mov dword [esp+640-600], edi
|
|
; esi = &row(-2) (kept until the final stores), xmm1 = row(-2)
mov esi, eax
|
|
sub esi, edi
|
|
movdqa xmm1, [esi]
|
|
; save row(1)
movdqa [esp+720-272], xmm0
|
|
; edi = &row(-1) (kept until the final stores), xmm4 = row(-1)
mov edi, eax
|
|
sub edi, ecx
|
|
movdqa xmm4, [edi]
|
|
; spill &row(1) for the final stores
add ecx, eax
|
|
mov dword [esp+640-596], ecx
|
|
|
|
; ecx = 2*iStride; load and save row(2)
mov ecx, dword [esp+640-600]
|
|
movdqa xmm0, [ecx+eax]
|
|
movdqa [esp+736-272], xmm0
|
|
|
|
; xmm0 = row(3); edx = &row(-3) (kept until the final stores)
movdqa xmm0, [eax+ebx]
|
|
mov edx, eax
|
|
sub edx, ebx
|
|
|
|
; ebx = sign-extended low word of iAlpha
movsx ebx, word [ebp+16]
|
|
; xmm6 = row(-3)
movdqa xmm6, [edx]
|
|
; ecx = &row(2) (kept until the final stores)
add ecx, eax
|
|
; save row(3)
movdqa [esp+752-272], xmm0
|
|
; broadcast alpha to all eight 16-bit lanes -> [esp+640-320]
movd xmm0, ebx
|
|
|
|
; ebx = sign-extended low word of iBeta
movsx ebx, word [ebp+20]
|
|
movdqa xmm7, xmm0
|
|
punpcklwd xmm7, xmm0
|
|
pshufd xmm0, xmm7, 0
|
|
movdqa [esp+640-320], xmm0
|
|
; broadcast beta to all eight 16-bit lanes (stored to [esp+640-512] below)
movd xmm0, ebx
|
|
movdqa xmm7, xmm0
|
|
punpcklwd xmm7, xmm0
|
|
pshufd xmm0, xmm7, 0
|
|
|
|
; ---- first pass: low 8 bytes of every row, widened to 16-bit lanes ----
; [esp+640-416] = row(2) low half
movdqa xmm7, [esp+736-272]
|
|
punpcklbw xmm7, xmm2
|
|
movdqa [esp+640-416], xmm7
|
|
movdqa [esp+640-512], xmm0
|
|
; keep the original bytes of row(-2), row(0), row(-1), row(-3) in stack slots;
; they are re-read for the high-half pass and the slots later receive the results
movdqa xmm0, xmm1
|
|
movdqa [esp+672-272], xmm1
|
|
movdqa xmm1, xmm4
|
|
movdqa [esp+704-272], xmm5
|
|
; xmm5 = row(0) low, xmm1 = row(-1) low
punpcklbw xmm5, xmm2
|
|
punpcklbw xmm1, xmm2
|
|
|
|
; [esp+640-560] = |row(0) - row(-1)|
movdqa xmm7, xmm5
|
|
psubw xmm7, xmm1
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+640-560], xmm7
|
|
; [esp+640-480] = row(-2) low
punpcklbw xmm0, xmm2
|
|
movdqa [esp+688-272], xmm4
|
|
movdqa xmm4, [esp+720-272]
|
|
movdqa [esp+640-480], xmm0
|
|
|
|
; xmm0 = beta > |row(-1) - row(-2)|
movdqa xmm7, xmm1
|
|
psubw xmm7, xmm0
|
|
|
|
movdqa xmm0, [esp+640-512]
|
|
pabsw xmm7, xmm7
|
|
; [esp+640-384] = row(1) low
punpcklbw xmm4, xmm2
|
|
pcmpgtw xmm0, xmm7
|
|
movdqa [esp+640-384], xmm4
|
|
; xmm4 = beta > |row(0) - row(1)|
movdqa xmm7, xmm5
|
|
psubw xmm7, xmm4
|
|
movdqa xmm4, [esp+640-512]
|
|
movdqa [esp+656-272], xmm6
|
|
; [esp+640-368] = row(-3) low
punpcklbw xmm6, xmm2
|
|
pabsw xmm7, xmm7
|
|
; keep the zero vector, row(-1) low and row(0) low for later use
movdqa [esp+640-48], xmm2
|
|
movdqa [esp+640-368], xmm6
|
|
movdqa [esp+640-144], xmm1
|
|
movdqa [esp+640-400], xmm5
|
|
pcmpgtw xmm4, xmm7
|
|
pand xmm0, xmm4
|
|
; xmm0 = low-half "filter this lane" mask:
;   (beta > |row(-1)-row(-2)|) & (beta > |row(0)-row(1)|) & (alpha > |row(0)-row(-1)|)
movdqa xmm4, [esp+640-320]
|
|
pcmpgtw xmm4, [esp+640-560]
|
|
pand xmm0, xmm4
|
|
|
|
; broadcast the constant 2 (stored to [esp+640-624] below) and build
; [esp+640-576] = (alpha >> 2) + 2
mov ebx, 2
|
|
movsx ebx, bx
|
|
movd xmm4, ebx
|
|
movdqa xmm7, xmm4
|
|
punpcklwd xmm7, xmm4
|
|
movdqa xmm4, [esp+640-320]
|
|
psraw xmm4, 2
|
|
pshufd xmm7, xmm7, 0
|
|
paddw xmm4, xmm7
|
|
movdqa [esp+640-576], xmm4
|
|
; [esp+640-560] = ((alpha >> 2) + 2) > |row(0) - row(-1)|
pcmpgtw xmm4, [esp+640-560]
|
|
movdqa [esp+640-560], xmm4
|
|
|
|
; [esp+640-544] = (beta > |row(-1) - row(-3)|) & the mask above
movdqa xmm4, [esp+640-512]
|
|
movdqa [esp+640-624], xmm7
|
|
movdqa xmm7, xmm1
|
|
psubw xmm7, xmm6
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
|
|
pand xmm4, [esp+640-560]
|
|
movdqa [esp+640-544], xmm4
|
|
; [esp+640-560] = (beta > |row(0) - row(2)|) & the mask above
movdqa xmm4, [esp+640-512]
|
|
movdqa xmm7, xmm5
|
|
psubw xmm7, [esp+640-416]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm4, xmm7
|
|
|
|
pand xmm4, [esp+640-560]
|
|
movdqa [esp+640-560], xmm4
|
|
|
|
; [esp+640-16] = row(-3) low where the [esp+640-544] mask is clear
movdqa xmm4, [esp+640-544]
|
|
pandn xmm4, xmm6
|
|
movdqa [esp+640-16], xmm4
|
|
; broadcast the constant 4 (stored to [esp+640-592] below)
mov ebx, 4
|
|
movsx ebx, bx
|
|
movd xmm4, ebx
|
|
movdqa xmm7, xmm4
|
|
punpcklwd xmm7, xmm4
|
|
; xmm4 = 2*row(-4) + 3*row(-3) + row(-2) + row(-1) + row(0) + 4 (low half);
; it is shifted right by 3 and masked further down
movdqa xmm4, xmm3
|
|
punpcklbw xmm4, xmm2
|
|
psllw xmm4, 1
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, xmm6
|
|
paddw xmm4, [esp+640-480]
|
|
|
|
movdqa xmm6, [esp+640-560]
|
|
pshufd xmm7, xmm7, 0
|
|
paddw xmm4, xmm1
|
|
movdqa [esp+640-592], xmm7
|
|
paddw xmm4, xmm5
|
|
paddw xmm4, xmm7
|
|
; [esp+640-80] = row(2) low where the [esp+640-560] mask is clear
movdqa xmm7, [esp+640-416]
|
|
pandn xmm6, xmm7
|
|
movdqa [esp+640-80], xmm6
|
|
; [esp+640-112] = ((2*row(3) + 3*row(2) + row(1) + row(0) + row(-1) + 4) >> 3)
;                 & [esp+640-560] mask (low half)
movdqa xmm6, [esp+752-272]
|
|
punpcklbw xmm6, xmm2
|
|
psllw xmm6, 1
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-384]
|
|
|
|
movdqa xmm7, [esp+640-480]
|
|
paddw xmm6, xmm5
|
|
paddw xmm6, xmm1
|
|
paddw xmm6, [esp+640-592]
|
|
psraw xmm6, 3
|
|
pand xmm6, [esp+640-560]
|
|
movdqa [esp+640-112], xmm6
|
|
; The remaining low-half candidates follow the same pattern: a rounded sum of
; neighbouring rows, shifted by 2 or 3, kept where a mask is set (pand) or
; clear (pandn), and parked in a stack slot until the final selection.
movdqa xmm6, [esp+640-544]
|
|
pandn xmm6, xmm7
|
|
movdqa [esp+640-336], xmm6
|
|
movdqa xmm6, [esp+640-544]
|
|
movdqa [esp+640-528], xmm6
|
|
movdqa xmm6, [esp+640-368]
|
|
paddw xmm6, xmm7
|
|
movdqa xmm7, xmm1
|
|
psraw xmm4, 3
|
|
pand xmm4, [esp+640-544]
|
|
paddw xmm7, xmm5
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-624]
|
|
movdqa xmm7, [esp+640-528]
|
|
|
|
paddw xmm5, xmm1
|
|
psraw xmm6, 2
|
|
pand xmm7, xmm6
|
|
|
|
movdqa xmm6, [esp+640-384]
|
|
movdqa [esp+640-64], xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
pandn xmm7, xmm6
|
|
movdqa [esp+640-304], xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
movdqa [esp+640-528], xmm7
|
|
movdqa xmm7, [esp+640-416]
|
|
paddw xmm7, xmm6
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+640-624]
|
|
movdqa xmm5, [esp+640-528]
|
|
psraw xmm7, 2
|
|
pand xmm5, xmm7
|
|
movdqa [esp+640-32], xmm5
|
|
|
|
movdqa xmm5, [esp+640-544]
|
|
movdqa [esp+640-528], xmm5
|
|
movdqa xmm5, [esp+640-480]
|
|
movdqa xmm7, xmm5
|
|
paddw xmm7, xmm5
|
|
movdqa xmm5, xmm1
|
|
paddw xmm5, xmm6
|
|
paddw xmm6, [esp+640-592]
|
|
paddw xmm7, xmm5
|
|
paddw xmm7, [esp+640-624]
|
|
movdqa xmm5, [esp+640-528]
|
|
psraw xmm7, 2
|
|
pandn xmm5, xmm7
|
|
movdqa xmm7, [esp+640-480]
|
|
paddw xmm7, xmm1
|
|
paddw xmm7, [esp+640-400]
|
|
movdqa xmm1, [esp+640-544]
|
|
movdqa [esp+640-352], xmm5
|
|
movdqa xmm5, [esp+640-368]
|
|
psllw xmm7, 1
|
|
paddw xmm7, xmm6
|
|
paddw xmm5, xmm7
|
|
|
|
movdqa xmm7, [esp+640-400]
|
|
psraw xmm5, 3
|
|
pand xmm1, xmm5
|
|
movdqa xmm5, [esp+640-480]
|
|
movdqa [esp+640-96], xmm1
|
|
movdqa xmm1, [esp+640-560]
|
|
movdqa [esp+640-528], xmm1
|
|
movdqa xmm1, [esp+640-384]
|
|
movdqa xmm6, xmm1
|
|
paddw xmm6, xmm1
|
|
paddw xmm1, [esp+640-400]
|
|
paddw xmm1, [esp+640-144]
|
|
paddw xmm7, xmm5
|
|
paddw xmm5, [esp+640-592]
|
|
paddw xmm6, xmm7
|
|
paddw xmm6, [esp+640-624]
|
|
movdqa xmm7, [esp+640-528]
|
|
psraw xmm6, 2
|
|
psllw xmm1, 1
|
|
paddw xmm1, xmm5
|
|
|
|
movdqa xmm5, [esp+656-272]
|
|
pandn xmm7, xmm6
|
|
movdqa xmm6, [esp+640-416]
|
|
paddw xmm6, xmm1
|
|
movdqa xmm1, [esp+640-560]
|
|
psraw xmm6, 3
|
|
pand xmm1, xmm6
|
|
|
|
movdqa xmm6, [esp+704-272]
|
|
movdqa [esp+640-128], xmm1
|
|
; ---- second pass: high 8 bytes of every row (punpckhbw with xmm2 = 0) ----
; [esp+640-448] = row(-2) high, [esp+640-496] = row(-1) high,
; [esp+640-432] = row(0) high,  xmm5 = row(-3) high
movdqa xmm1, [esp+672-272]
|
|
punpckhbw xmm1, xmm2
|
|
movdqa [esp+640-448], xmm1
|
|
movdqa xmm1, [esp+688-272]
|
|
punpckhbw xmm1, xmm2
|
|
punpckhbw xmm6, xmm2
|
|
movdqa [esp+640-288], xmm7
|
|
punpckhbw xmm5, xmm2
|
|
movdqa [esp+640-496], xmm1
|
|
movdqa [esp+640-432], xmm6
|
|
|
|
; [esp+640-464] = row(1) high
movdqa xmm7, [esp+720-272]
|
|
punpckhbw xmm7, xmm2
|
|
movdqa [esp+640-464], xmm7
|
|
|
|
; [esp+640-528] = row(2) high
movdqa xmm7, [esp+736-272]
|
|
punpckhbw xmm7, xmm2
|
|
movdqa [esp+640-528], xmm7
|
|
|
|
; xmm1 (built below) = high-half "filter this lane" mask, same three
; comparisons as xmm0 for the low half
movdqa xmm7, xmm6
|
|
|
|
psubw xmm6, [esp+640-464]
|
|
psubw xmm7, xmm1
|
|
pabsw xmm7, xmm7
|
|
movdqa [esp+640-560], xmm7
|
|
por xmm4, [esp+640-16]
|
|
pabsw xmm6, xmm6
|
|
movdqa xmm7, xmm1
|
|
psubw xmm7, [esp+640-448]
|
|
|
|
movdqa xmm1, [esp+640-512]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm1, xmm7
|
|
movdqa xmm7, [esp+640-512]
|
|
pcmpgtw xmm7, xmm6
|
|
movdqa xmm6, [esp+640-320]
|
|
pand xmm1, xmm7
|
|
movdqa xmm7, [esp+640-560]
|
|
pcmpgtw xmm6, xmm7
|
|
pand xmm1, xmm6
|
|
|
|
; high-half versions of the ((alpha >> 2) + 2) and beta masks; they reuse
; the [esp+640-560] and [esp+640-544] slots of the low-half pass
movdqa xmm6, [esp+640-576]
|
|
pcmpgtw xmm6, xmm7
|
|
|
|
movdqa xmm7, [esp+640-496]
|
|
; xmm3 = row(-4) high
punpckhbw xmm3, xmm2
|
|
movdqa [esp+640-560], xmm6
|
|
movdqa xmm6, [esp+640-512]
|
|
psubw xmm7, xmm5
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm6, xmm7
|
|
|
|
pand xmm6, [esp+640-560]
|
|
movdqa xmm7, [esp+640-432]
|
|
psubw xmm7, [esp+640-528]
|
|
|
|
psllw xmm3, 1
|
|
movdqa [esp+640-544], xmm6
|
|
movdqa xmm6, [esp+640-512]
|
|
|
|
; note: xmm2 stops being the zero vector here (a copy is kept at [esp+640-48])
movdqa xmm2, [esp+640-544]
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, xmm5
|
|
paddw xmm3, [esp+640-448]
|
|
paddw xmm3, [esp+640-496]
|
|
pabsw xmm7, xmm7
|
|
pcmpgtw xmm6, xmm7
|
|
pand xmm6, [esp+640-560]
|
|
movdqa [esp+640-560], xmm6
|
|
|
|
; ---- new row(-3): per lane, take the filtered value where the xmm0 / xmm1
; mask is set and the original sample otherwise, then pack both halves ----
movdqa xmm6, xmm0
|
|
pand xmm6, xmm4
|
|
movdqa xmm4, xmm0
|
|
pandn xmm4, [esp+640-368]
|
|
por xmm6, xmm4
|
|
movdqa xmm4, [esp+640-432]
|
|
paddw xmm3, xmm4
|
|
paddw xmm3, [esp+640-592]
|
|
psraw xmm3, 3
|
|
pand xmm3, xmm2
|
|
pandn xmm2, xmm5
|
|
por xmm3, xmm2
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm3
|
|
movdqa xmm3, [esp+640-64]
|
|
por xmm3, [esp+640-336]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm5
|
|
por xmm7, xmm2
|
|
|
|
movdqa xmm2, xmm0
|
|
pand xmm2, xmm3
|
|
movdqa xmm3, xmm0
|
|
pandn xmm3, [esp+640-480]
|
|
por xmm2, xmm3
|
|
packuswb xmm6, xmm7
|
|
movdqa [esp+640-336], xmm2
|
|
; [esp+656-272] now holds the new row(-3)
movdqa [esp+656-272], xmm6
|
|
; ---- new row(-2) ----
movdqa xmm6, [esp+640-544]
|
|
movdqa xmm2, xmm5
|
|
paddw xmm2, [esp+640-448]
|
|
movdqa xmm3, xmm1
|
|
movdqa xmm7, [esp+640-496]
|
|
paddw xmm7, xmm4
|
|
paddw xmm2, xmm7
|
|
paddw xmm2, [esp+640-624]
|
|
movdqa xmm7, [esp+640-544]
|
|
psraw xmm2, 2
|
|
pand xmm6, xmm2
|
|
movdqa xmm2, [esp+640-448]
|
|
pandn xmm7, xmm2
|
|
por xmm6, xmm7
|
|
pand xmm3, xmm6
|
|
movdqa xmm6, xmm1
|
|
pandn xmm6, xmm2
|
|
paddw xmm2, [esp+640-496]
|
|
paddw xmm2, xmm4
|
|
por xmm3, xmm6
|
|
movdqa xmm6, [esp+640-336]
|
|
packuswb xmm6, xmm3
|
|
psllw xmm2, 1
|
|
; [esp+672-272] now holds the new row(-2)
movdqa [esp+672-272], xmm6
|
|
; ---- new row(-1) ----
movdqa xmm6, [esp+640-96]
|
|
por xmm6, [esp+640-352]
|
|
|
|
movdqa xmm3, xmm0
|
|
pand xmm3, xmm6
|
|
movdqa xmm6, xmm0
|
|
pandn xmm6, [esp+640-144]
|
|
por xmm3, xmm6
|
|
movdqa xmm6, [esp+640-544]
|
|
movdqa [esp+640-352], xmm3
|
|
movdqa xmm3, [esp+640-464]
|
|
paddw xmm3, [esp+640-592]
|
|
paddw xmm2, xmm3
|
|
movdqa xmm3, [esp+640-448]
|
|
paddw xmm5, xmm2
|
|
movdqa xmm2, [esp+640-496]
|
|
psraw xmm5, 3
|
|
pand xmm6, xmm5
|
|
movdqa xmm5, [esp+640-464]
|
|
paddw xmm2, xmm5
|
|
paddw xmm5, [esp+640-432]
|
|
movdqa xmm4, xmm3
|
|
paddw xmm4, xmm3
|
|
paddw xmm4, xmm2
|
|
paddw xmm4, [esp+640-624]
|
|
movdqa xmm2, [esp+640-544]
|
|
paddw xmm3, [esp+640-592]
|
|
psraw xmm4, 2
|
|
pandn xmm2, xmm4
|
|
por xmm6, xmm2
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm6
|
|
movdqa xmm6, [esp+640-496]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm6
|
|
por xmm7, xmm2
|
|
movdqa xmm2, [esp+640-352]
|
|
packuswb xmm2, xmm7
|
|
; [esp+688-272] now holds the new row(-1)
movdqa [esp+688-272], xmm2
|
|
; ---- new row(0) ----
movdqa xmm2, [esp+640-128]
|
|
por xmm2, [esp+640-288]
|
|
|
|
movdqa xmm4, xmm0
|
|
pand xmm4, xmm2
|
|
paddw xmm5, xmm6
|
|
movdqa xmm2, xmm0
|
|
pandn xmm2, [esp+640-400]
|
|
por xmm4, xmm2
|
|
movdqa xmm2, [esp+640-528]
|
|
psllw xmm5, 1
|
|
paddw xmm5, xmm3
|
|
movdqa xmm3, [esp+640-560]
|
|
paddw xmm2, xmm5
|
|
psraw xmm2, 3
|
|
movdqa [esp+640-288], xmm4
|
|
movdqa xmm4, [esp+640-560]
|
|
pand xmm4, xmm2
|
|
movdqa xmm2, [esp+640-464]
|
|
movdqa xmm5, xmm2
|
|
paddw xmm5, xmm2
|
|
movdqa xmm2, [esp+640-432]
|
|
paddw xmm2, [esp+640-448]
|
|
movdqa xmm7, xmm1
|
|
paddw xmm5, xmm2
|
|
paddw xmm5, [esp+640-624]
|
|
movdqa xmm6, [esp+640-560]
|
|
psraw xmm5, 2
|
|
pandn xmm3, xmm5
|
|
por xmm4, xmm3
|
|
movdqa xmm3, [esp+640-32]
|
|
por xmm3, [esp+640-304]
|
|
pand xmm7, xmm4
|
|
movdqa xmm4, [esp+640-432]
|
|
movdqa xmm5, [esp+640-464]
|
|
movdqa xmm2, xmm1
|
|
pandn xmm2, xmm4
|
|
paddw xmm4, [esp+640-496]
|
|
por xmm7, xmm2
|
|
movdqa xmm2, [esp+640-288]
|
|
packuswb xmm2, xmm7
|
|
; [esp+704-272] now holds the new row(0)
movdqa [esp+704-272], xmm2
|
|
|
|
; ---- new row(1): result stays in xmm6 after the packuswb below ----
movdqa xmm2, xmm0
|
|
pand xmm2, xmm3
|
|
movdqa xmm3, xmm0
|
|
pandn xmm3, [esp+640-384]
|
|
por xmm2, xmm3
|
|
movdqa [esp+640-304], xmm2
|
|
movdqa xmm2, [esp+640-528]
|
|
movdqa xmm3, xmm2
|
|
paddw xmm3, [esp+640-464]
|
|
paddw xmm3, xmm4
|
|
paddw xmm3, [esp+640-624]
|
|
psraw xmm3, 2
|
|
pand xmm6, xmm3
|
|
movdqa xmm3, [esp+640-560]
|
|
movdqa xmm4, xmm3
|
|
pandn xmm4, xmm5
|
|
por xmm6, xmm4
|
|
movdqa xmm7, xmm1
|
|
pand xmm7, xmm6
|
|
movdqa xmm6, [esp+640-304]
|
|
movdqa xmm4, xmm1
|
|
pandn xmm4, xmm5
|
|
por xmm7, xmm4
|
|
|
|
; ---- new row(2): result ends up in xmm4 after the final packuswb ----
movdqa xmm4, xmm0
|
|
pandn xmm0, [esp+640-416]
|
|
packuswb xmm6, xmm7
|
|
movdqa xmm7, [esp+640-112]
|
|
por xmm7, [esp+640-80]
|
|
pand xmm4, xmm7
|
|
por xmm4, xmm0
|
|
; high half: (2*row(3) + 3*row(2) + row(1) + row(0) + row(-1) + 4) >> 3
movdqa xmm0, [esp+752-272]
|
|
punpckhbw xmm0, [esp+640-48]
|
|
psllw xmm0, 1
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm2
|
|
paddw xmm0, xmm5
|
|
paddw xmm0, [esp+640-432]
|
|
paddw xmm0, [esp+640-496]
|
|
paddw xmm0, [esp+640-592]
|
|
psraw xmm0, 3
|
|
pand xmm0, xmm3
|
|
movdqa xmm7, xmm1
|
|
pandn xmm3, xmm2
|
|
por xmm0, xmm3
|
|
pand xmm7, xmm0
|
|
|
|
; ---- write back: edx = &row(-3), esi = &row(-2), edi = &row(-1), eax = &row(0) ----
movdqa xmm0, [esp+656-272]
|
|
movdqa [edx], xmm0
|
|
|
|
movdqa xmm0, [esp+672-272]
|
|
|
|
; edx = &row(1), reloaded from the spill slot
mov edx, dword [esp+640-596]
|
|
movdqa [esi], xmm0
|
|
movdqa xmm0, [esp+688-272]
|
|
movdqa [edi], xmm0
|
|
movdqa xmm0, [esp+704-272]
|
|
|
|
; no esp-relative accesses occur after this pop, so the 640-based offsets
; are no longer needed
pop edi
|
|
pandn xmm1, xmm2
|
|
movdqa [eax], xmm0
|
|
por xmm7, xmm1
|
|
pop esi
|
|
packuswb xmm4, xmm7
|
|
; store new row(1) (xmm6) and new row(2) (xmm4; ecx = &row(2))
movdqa [edx], xmm6
|
|
movdqa [ecx], xmm4
|
|
pop ebx
|
|
; drop the aligned frame and restore the caller's ebp
mov esp, ebp
|
|
pop ebp
|
|
ret
|
|
|
|
%endif
|
|
|
|
|
|
|
|
;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;   Gathers 16 source rows of 8 bytes each (row k at pPixY + k * iStride),
;   runs them through SSE2_TransTwo8x8B and writes the eight resulting 16-byte
;   vectors to pDst[0x00 .. 0x7F].
;
;   Register roles after LOAD_3_PARA (as implied by the signature above and the
;   way the registers are used; the macro itself is defined elsewhere):
;       r0 = pPixY, r1 = iStride (sign-extended), r2 = pDst
;       r3 = pointer to source row 8, r4 = 3 * iStride, r5 = saved stack pointer
;
;   pDst is written with movdqa and therefore has to be 16-byte aligned.
;   A 16-byte aligned scratch slot is carved out below r7 for the macro.
;
;********************************************************************************

WELS_EXTERN DeblockLumaTransposeH2V_sse2
    push            r3
    push            r4
    push            r5

%assign push_num 3
    LOAD_3_PARA
    PUSH_XMM        8

    SIGN_EXTENSION  r1, r1d

    ; r5 remembers the incoming stack pointer; r7 is rounded down to a
    ; 16-byte boundary and lowered by another 16 bytes to form the scratch slot
    mov             r5, r7
    mov             r3, r7
    and             r3, 15
    sub             r7, r3
    sub             r7, 16

    lea             r4, [r1 + r1 * 2]           ; r4 = 3 * stride
    lea             r3, [r0 + r1 * 8]           ; r3 = &row 8

    ; xmmN = { row N (low qword), row N + 8 (high qword) }, N = 0 .. 3
    movq            xmm0, [r0]
    movq            xmm7, [r3]
    punpcklqdq      xmm0, xmm7
    movq            xmm1, [r0 + r1]
    movq            xmm7, [r3 + r1]
    punpcklqdq      xmm1, xmm7
    movq            xmm2, [r0 + r1 * 2]
    movq            xmm7, [r3 + r1 * 2]
    punpcklqdq      xmm2, xmm7
    movq            xmm3, [r0 + r4]
    movq            xmm7, [r3 + r4]
    punpcklqdq      xmm3, xmm7

    ; advance both row pointers by four rows and repeat for N = 4 .. 6
    lea             r0, [r0 + r1 * 4]
    lea             r3, [r3 + r1 * 4]
    movq            xmm4, [r0]
    movq            xmm7, [r3]
    punpcklqdq      xmm4, xmm7
    movq            xmm5, [r0 + r1]
    movq            xmm7, [r3 + r1]
    punpcklqdq      xmm5, xmm7
    movq            xmm6, [r0 + r1 * 2]
    movq            xmm7, [r3 + r1 * 2]
    punpcklqdq      xmm6, xmm7

    ; all eight registers are needed for N = 7, so xmm0 is parked in the
    ; scratch slot while it serves as the temporary for the high qword
    movdqa          [r7], xmm0
    movq            xmm7, [r0 + r4]
    movq            xmm0, [r3 + r4]
    punpcklqdq      xmm7, xmm0
    movdqa          xmm0, [r7]

    SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    ; store the outputs in the register order given by the pOut note above
    movdqa          [r2 + 0x00], xmm4
    movdqa          [r2 + 0x10], xmm2
    movdqa          [r2 + 0x20], xmm3
    movdqa          [r2 + 0x30], xmm7
    movdqa          [r2 + 0x40], xmm5
    movdqa          [r2 + 0x50], xmm1
    movdqa          [r2 + 0x60], xmm6
    movdqa          [r2 + 0x70], xmm0

    ; undo the stack adjustment and restore saved registers
    mov             r7, r5
    POP_XMM
    pop             r5
    pop             r4
    pop             r3
    ret
|
|
|
|
|
|
;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;   Loads eight 16-byte vectors from pSrc[0x00 .. 0x7F], runs them through
;   SSE2_TransTwo8x8B and scatters the result as 16 rows of 8 bytes each to
;   pPixY (row k at pPixY + k * iStride). The low qwords of the eight output
;   registers become rows 0 .. 7, the high qwords rows 8 .. 15.
;
;   Register roles after LOAD_3_PARA (as implied by the signature above and the
;   way the registers are used; the macro itself is defined elsewhere):
;       r0 = pPixY, r1 = iStride (sign-extended), r2 = pSrc (later 3 * iStride)
;       r3 = scratch for the alignment computation, r4 = saved stack pointer
;
;   pSrc is read with movdqa and therefore has to be 16-byte aligned.
;
;*******************************************************************************************

WELS_EXTERN DeblockLumaTransposeV2H_sse2
    push            r3
    push            r4

%assign push_num 2
    LOAD_3_PARA
    PUSH_XMM        8

    SIGN_EXTENSION  r1, r1d

    ; r4 remembers the incoming stack pointer; r7 is rounded down to a
    ; 16-byte boundary and lowered by another 16 bytes to form the scratch
    ; slot handed to the transpose macro
    mov             r4, r7
    mov             r3, r7
    and             r3, 15
    sub             r7, r3
    sub             r7, 16

    movdqa          xmm0, [r2 + 0x00]
    movdqa          xmm1, [r2 + 0x10]
    movdqa          xmm2, [r2 + 0x20]
    movdqa          xmm3, [r2 + 0x30]
    movdqa          xmm4, [r2 + 0x40]
    movdqa          xmm5, [r2 + 0x50]
    movdqa          xmm6, [r2 + 0x60]
    movdqa          xmm7, [r2 + 0x70]

    SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    lea             r2, [r1 + r1 * 2]           ; pSrc no longer needed: r2 = 3 * stride

    ; rows 0 .. 3: low qwords; each register is shifted right by 8 bytes right
    ; after its low half is stored so that its high qword is ready for rows 8+
    movq            [r0], xmm4
    psrldq          xmm4, 8
    movq            [r0 + r1], xmm2
    psrldq          xmm2, 8
    movq            [r0 + r1 * 2], xmm3
    psrldq          xmm3, 8
    movq            [r0 + r2], xmm7
    psrldq          xmm7, 8

    ; rows 4 .. 7
    lea             r0, [r0 + r1 * 4]
    movq            [r0], xmm5
    psrldq          xmm5, 8
    movq            [r0 + r1], xmm1
    psrldq          xmm1, 8
    movq            [r0 + r1 * 2], xmm6
    psrldq          xmm6, 8
    movq            [r0 + r2], xmm0
    psrldq          xmm0, 8

    ; rows 8 .. 11: former high qwords
    lea             r0, [r0 + r1 * 4]
    movq            [r0], xmm4
    movq            [r0 + r1], xmm2
    movq            [r0 + r1 * 2], xmm3
    movq            [r0 + r2], xmm7

    ; rows 12 .. 15
    lea             r0, [r0 + r1 * 4]
    movq            [r0], xmm5
    movq            [r0 + r1], xmm1
    movq            [r0 + r1 * 2], xmm6
    movq            [r0 + r2], xmm0

    ; undo the stack adjustment and restore saved registers
    mov             r7, r4
    POP_XMM
    pop             r4
    pop             r3
    ret
|
|
|