;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  deblock.asm
;*
;*  Abstract
;*      edge loop
;*
;*  History
;*      08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

SECTION .rodata align=16

ALIGN 16
FOUR_16B_SSE2: dw 4, 4, 4, 4, 4, 4, 4, 4

ALIGN 16
WELS_DB1_16:
    times 16 db 1
WELS_DB127_16:
    times 16 db 127
WELS_DB96_16:
    times 16 db 96
WELS_SHUFB0000111122223333:
    times 4 db 0
    times 4 db 1
    times 4 db 2
    times 4 db 3


SECTION .text

; Unsigned byte absolute difference.
; a=%1 b=%2 clobber=%3
; Subtract once in each direction with saturation and return the maximum.
%macro SSE2_AbsDiffUB 3
    movdqa   %3, %2
    psubusb  %3, %1
    psubusb  %1, %2
    por      %1, %3
%endmacro
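; Illustrative use of the macro above (comment-only sketch; register choice is
; arbitrary):
;     SSE2_AbsDiffUB xmm0, xmm1, xmm2    ; xmm0 = |xmm0 - xmm1| per byte, xmm2 clobbered
; Per byte this is max(a - b, 0) | max(b - a, 0); one of the two saturating
; differences is always zero, so the por yields the absolute difference.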

; Unsigned byte compare less than.
; lhs=%1 rhs^0x7f=%2 0x7f=%3
; No unsigned byte lt/gt compare instruction available; xor by 0x7f and use a
; signed compare. Some other options do exist. This one allows modifying the lhs
; without mov and uses a bitwise op which can be executed on most ports on
; common architectures.
%macro SSE2_CmpltUB 3
    pxor     %1, %3
    pcmpgtb  %1, %2
%endmacro
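; Why the 0x7f trick works (comment-only note): interpreting bytes as signed,
; x ^ 0x7f equals 127 - x for any x in [0, 255], so with K = 0x7f
;     (lhs ^ K) >signed (rhs ^ K)   <=>   lhs <unsigned rhs
; which is why callers pass the rhs pre-xored with WELS_DB127_16 and reuse the
; same constant as %3.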

; Unsigned byte compare greater than or equal.
%macro SSE2_CmpgeUB 2
    pminub   %1, %2
    pcmpeqb  %1, %2
%endmacro
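; Illustrative use (comment-only sketch): per byte, min(a, b) == b exactly when
; a >= b, so
;     SSE2_CmpgeUB xmm0, xmm1            ; xmm0 = (xmm0 >= xmm1) ? 0xff : 0x00
; Note that %1 is overwritten with the resulting mask.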

; Clip unsigned bytes to ref +/- diff.
; data=%1 ref=%2 maxdiff_from_ref=%3 clobber=%4
%macro SSE2_ClipUB 4
    movdqa   %4, %2
    psubusb  %4, %3
    paddusb  %3, %2
    pmaxub   %1, %4
    pminub   %1, %3
%endmacro
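; Illustrative scalar equivalent (comment-only sketch), per byte:
;     lo   = max(ref - maxdiff, 0)       ; psubusb
;     hi   = min(ref + maxdiff, 255)     ; paddusb
;     data = min(max(data, lo), hi)      ; pmaxub + pminub
; i.e. data is clamped to [ref - maxdiff, ref + maxdiff], saturated to the byte
; range. %3 is overwritten with hi and %4 with lo.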

; (a + b + 1 - c) >> 1
; a=%1 b=%2 c=%3 [out:a^b&c]=%4
%macro SSE2_AvgbFloor1 4
    movdqa   %4, %1
    pxor     %4, %2
    pavgb    %1, %2
    pand     %4, %3
    psubb    %1, %4
%endmacro
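; Comment-only note on the rounding: pavgb computes (a + b + 1) >> 1, rounding
; up. The bit it rounds on is (a ^ b) & 1, so subtracting (a ^ b) & c with a
; per-byte c of 1 (WELS_DB1_16) turns it into the floor average (a + b) >> 1,
; while c = 0 keeps the rounded-up value. %4 returns (a ^ b) & c, which callers
; below reuse to carry the dropped low-order bit into a later average.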

; (a + b + carry) >> 1
; a=%1 b=%2 carry-1=%3
%macro SSE2_AvgbFloor2 3
    pxor     %1, %3
    pxor     %2, %3
    pavgb    %1, %2
    pxor     %1, %3
%endmacro
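; Comment-only note: %3 must be a bytewise constant of either 0x00 or 0xff.
; Because x ^ 0xff = 255 - x, averaging the complements and complementing the
; result converts pavgb's round-up into a round-down, so the macro yields
; (a + b + 1) >> 1 when %3 = 0x00 and (a + b) >> 1 when %3 = 0xff, hence the
; "carry-1" naming of the parameter.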

; a = (a & m) | (b & ~m)
; a=%1 b=%2 m=%3
%macro SSE2_Blend 3
    pand     %1, %3
    pandn    %3, %2
    por      %1, %3
%endmacro
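; Illustrative use (comment-only sketch): m is expected to be a bytewise
; all-0x00/all-0xff mask, e.g. a pcmpeqb/pcmpgtb result; afterwards %1 holds
; the old %1 wherever m was 0xff and %2 wherever m was 0x00, and the mask in
; %3 is destroyed by the pandn.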

; Compute
; p0 = clip(p0 + clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
; q0 = clip(q0 - clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
; 16-wide parallel in packed byte representation in xmm registers.
;
; p1=%1 p0=%2 q0=%3 q1=%4 iTc=%5 FFh=%6 xmmclobber=%7,%8
%macro SSE2_DeblockP0Q0_Lt4 8
    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
    ; Bias so that unsigned saturation can be used.
    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
    ; subtracted from the biased value.
    movdqa   %7, %2
    psubusb  %7, %3                 ; clip(p0 - q0, 0, 255)
    ; ((p1 - q1) >> 2) + 0xc0
    pxor     %4, %6                 ; q1 ^ 0xff aka -q1 - 1 & 0xff
    pavgb    %1, %4                 ; (((p1 - q1 + 0x100) >> 1)
    pavgb    %1, %6                 ;  + 0x100) >> 1
    psubusb  %1, %7                 ; -= clip(p0 - q0, 0, 255) saturate.
    movdqa   %8, %3
    psubusb  %8, %2                 ; (clip(q0 - p0, 0, 255)
    pavgb    %8, %1                 ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1

    ; Unbias and split into a non-negative and a non-positive part.
    ; Clip each part to iTc via minub.
    ; Add/subtract each part to/from p0/q0 and clip.
    movdqa   %6, [WELS_DB96_16]
    psubusb  %6, %8
    psubusb  %8, [WELS_DB96_16]
    pminub   %6, %5
    pminub   %8, %5
    psubusb  %2, %6
    paddusb  %2, %8                 ; p0
    paddusb  %3, %6
    psubusb  %3, %8                 ; q0
%endmacro
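; Comment-only scalar reference for a single position, restating the formula
; given above this macro (not re-derived from the H.264 spec text):
;     d   = clip3(-iTc, iTc, (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1)
;     p0' = clip(p0 + d, 0, 255)
;     q0' = clip(q0 - d, 0, 255)
; The SIMD code never forms the signed d; it keeps its non-negative and
; non-positive halves separately so that every step stays within unsigned
; saturating byte arithmetic.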


;*******************************************************************************
;   void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                              int32_t iBeta, int8_t * pTC)
;*******************************************************************************

WELS_EXTERN DeblockLumaLt4V_ssse3
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    movd     xmm1, arg3d
    movd     xmm2, arg4d
    pxor     xmm3, xmm3
    pxor     xmm1, [WELS_DB127_16]
    pxor     xmm2, [WELS_DB127_16]
    pshufb   xmm1, xmm3             ; iAlpha ^ 0x7f
    pshufb   xmm2, xmm3             ; iBeta ^ 0x7f
    mov      r2, r1                 ; iStride
    neg      r1                     ; -iStride
    lea      r3, [r0 + r1]          ; pPix - iStride

    ; Compute masks to enable/disable deblocking.
    MOVDQ    xmm6, [r3 + 0 * r1]    ; p0
    MOVDQ    xmm7, [r3 + 1 * r1]    ; p1
    MOVDQ    xmm0, [r0 + 0 * r2]    ; q0
    movdqa   xmm4, xmm6
    SSE2_AbsDiffUB xmm6, xmm0, xmm3                ; |p0 - q0|
    SSE2_CmpltUB   xmm6, xmm1, [WELS_DB127_16]     ; bDeltaP0Q0 = |p0 - q0| < iAlpha
    MOVDQ    xmm1, [r0 + 1 * r2]    ; q1
    SSE2_AbsDiffUB xmm7, xmm4, xmm3                ; |p1 - p0|
    SSE2_AbsDiffUB xmm0, xmm1, xmm3                ; |q1 - q0|
    pmaxub   xmm7, xmm0                            ; max(|p1 - p0|, |q1 - q0|)
    SSE2_CmpltUB   xmm7, xmm2, [WELS_DB127_16]     ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
    pand     xmm6, xmm7                            ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
    MOVDQ    xmm7, [r3 + 2 * r1]    ; p2
    movdqa   xmm0, xmm7
    SSE2_AbsDiffUB xmm7, xmm4, xmm3                ; |p2 - p0|
    SSE2_CmpltUB   xmm7, xmm2, [WELS_DB127_16]     ; bDeltaP2P0 = |p2 - p0| < iBeta
    MOVDQ    xmm5, [r0 + 2 * r2]    ; q2
    MOVDQ    xmm3, [r0 + 0 * r2]    ; q0
    movdqa   xmm1, xmm5
    SSE2_AbsDiffUB xmm5, xmm3, xmm4                ; |q2 - q0|
    SSE2_CmpltUB   xmm5, xmm2, [WELS_DB127_16]     ; bDeltaQ2Q0 = |q2 - q0| < iBeta

    pavgb    xmm3, [r3 + 0 * r1]
    pcmpeqw  xmm2, xmm2             ; FFh
    pxor     xmm3, xmm2
    ; (p2 + ((p0 + q0 + 1) >> 1)) >> 1
    pxor     xmm0, xmm2
    pavgb    xmm0, xmm3
    pxor     xmm0, xmm2
    ; (q2 + ((p0 + q0 + 1) >> 1)) >> 1
    pxor     xmm1, xmm2
    pavgb    xmm1, xmm3
    pxor     xmm1, xmm2

    movd     xmm3, [r4]
    pshufb   xmm3, [WELS_SHUFB0000111122223333]    ; iTc
    movdqa   xmm4, xmm3             ; iTc0 = iTc
    pcmpgtb  xmm3, xmm2             ; iTc > -1 ? 0xff : 0x00
    pand     xmm6, xmm3             ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
    movdqa   xmm3, xmm4
    psubb    xmm3, xmm7             ; iTc -= bDeltaP2P0 ? -1 : 0
    psubb    xmm3, xmm5             ; iTc -= bDeltaQ2Q0 ? -1 : 0
    pand     xmm3, xmm6             ; iTc &= bDeltaP0Q0P1P0Q1Q0 ? 0xff : 0
    pand     xmm7, xmm6             ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
    pand     xmm5, xmm6             ; bDeltaQ2Q0 &= bDeltaP0Q0P1P0Q1Q0
    pand     xmm7, xmm4             ; iTc0 & (bDeltaP2P0 ? 0xff : 0)
    pand     xmm5, xmm4             ; iTc0 & (bDeltaQ2Q0 ? 0xff : 0)

    MOVDQ    xmm4, [r3 + 1 * r1]
    SSE2_ClipUB xmm0, xmm4, xmm7, xmm6             ; clip p1.
    MOVDQ    xmm6, [r0 + 1 * r2]
    MOVDQ    [r3 + 1 * r1], xmm0    ; store p1.
    SSE2_ClipUB xmm1, xmm6, xmm5, xmm7             ; clip q1.
    MOVDQ    [r0 + 1 * r2], xmm1    ; store q1.

    MOVDQ    xmm1, [r3 + 0 * r1]    ; p0
    MOVDQ    xmm0, [r0 + 0 * r2]    ; q0
    SSE2_DeblockP0Q0_Lt4 xmm4, xmm1, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7
    MOVDQ    [r3 + 0 * r1], xmm1    ; store p0.
    MOVDQ    [r0 + 0 * r2], xmm0    ; store q0.

    POP_XMM
    LOAD_5_PARA_POP
    ret


; Deblock 3x16 luma pixels for the eq4 case.
;
; Compose 8-bit averages from pavgbs. I.e. (p1 + p0 + p2 + q0 + 2) >> 2 can be
; written as (((p1 + p0) >> 1) + ((p2 + q0 + (p1 ^ p0 & 1)) >> 1) + 1) >> 1,
; which maps to 3 pavgbs.
;
; pPix=%1 iStride=%2 [in:q0,out:p0]=%3 [in:q1,out:p1]=%4 bDeltaP0Q0P1P0Q1Q0=%5 bDeltaP2P0=%6 clobber=%7,%8,%9,%10 preserve_p0p1=%11 db1=%12
%macro SSE2_DeblockLumaEq4_3x16P 12
    movdqa   %7, %3
    movdqa   %8, %6
    MOVDQ    %10, [%1 + 1 * %2]     ; p1
    SSE2_Blend %7, %10, %8          ; t0 = bDeltaP2P0 ? q0 : p1
    movdqa   %8, %6
    MOVDQ    %9, [%1 + 2 * %2]      ; p2
    SSE2_Blend %9, %4, %8           ; t1 = bDeltaP2P0 ? p2 : q1
    SSE2_AvgbFloor1 %4, %9, %12, %8      ; t1 = (t1 + q1) >> 1
    SSE2_AvgbFloor1 %10, [%1], %12, %8   ; (p0 + p1) >> 1, p0 ^ p1
    pxor     %8, %12
    SSE2_AvgbFloor1 %7, %4, %8, %9       ; (t0 + t1 + (p0 ^ p1 & 1)) >> 1
    MOVDQ    %9, [%1 + 2 * %2]      ; p2
    SSE2_AvgbFloor1 %3, %9, %8, %4       ; (p2 + q0 + (p0 ^ p1 & 1)) >> 1
    pavgb    %7, %10                ; p0' = (p0 + p1 + t0 + t1 + 2) >> 2
    movdqa   %8, %10
    pxor     %8, %3                 ; (p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1
    pand     %8, %12                ; & 1
    pavgb    %10, %3                ; p1' = (p0 + p1 + p2 + q0 + 2) >> 2
    pand     %6, %5                 ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
%if %11
    MOVDQ    %3, [%1 + 0 * %2]      ; p0
    movdqa   %4, %5
    SSE2_Blend %7, %3, %4           ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%else
    SSE2_Blend %7, [%1 + 0 * %2], %5     ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%endif
    MOVDQ    [%1 + 0 * %2], %7      ; store p0
    add      %1, %2
    movdqa   %7, %10
    psubb    %10, %8                ; (p0 + p1 + p2 + q0) >> 2
    psubb    %8, %12
    MOVDQ    %4, [%1 + (3 - 1) * %2]     ; p3
    SSE2_AvgbFloor2 %4, %9, %8           ; (p2 + p3 + ((p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 & 1)) >> 1
    pavgb    %10, %4                ; p2' = (((p0 + p1 + p2 + q0) >> 1) + p2 + p3 + 2) >> 2
    movdqa   %8, %6
    SSE2_Blend %10, [%1 + (2 - 1) * %2], %8   ; p2out = bDeltaP2P0 ? p2' : p2
    MOVDQ    [%1 + (2 - 1) * %2], %10         ; store p2
%if %11
    MOVDQ    %4, [%1 + (1 - 1) * %2]     ; p1
    SSE2_Blend %7, %4, %6                ; p1out = bDeltaP2P0 ? p1' : p1
%else
    SSE2_Blend %7, [%1 + (1 - 1) * %2], %6    ; p1out = bDeltaP2P0 ? p1' : p1
%endif
    MOVDQ    [%1 + (1 - 1) * %2], %7     ; store p1
%endmacro
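; Comment-only numeric check of the pavgb decomposition described above, for
; one arbitrary byte position: with p1=3, p0=4, p2=5, q0=6,
;     (p1 + p0 + p2 + q0 + 2) >> 2                      = 20 >> 2 = 5
;     (p1 + p0) >> 1 = 3,  (p1 ^ p0) & 1 = 1,
;     (p2 + q0 + 1) >> 1 = 6,  (3 + 6 + 1) >> 1         = 5
; so the three-pavgb form matches the direct four-term average.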


;*******************************************************************************
;   void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                              int32_t iBeta)
;*******************************************************************************

WELS_EXTERN DeblockLumaEq4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 10
    SIGN_EXTENSION r1, r1d
    movd     xmm1, arg3d
    movd     xmm2, arg4d
    shr      r2, 2
    add      r2, 1
    movd     xmm3, r2d
    pxor     xmm4, xmm4
    pxor     xmm1, [WELS_DB127_16]
    pxor     xmm2, [WELS_DB127_16]
    pshufb   xmm1, xmm4             ; iAlpha ^ 0x7f
    pshufb   xmm2, xmm4             ; iBeta ^ 0x7f
    pshufb   xmm3, xmm4             ; (iAlpha >> 2) + 1
    mov      r2, r1                 ; iStride
    neg      r1                     ; -iStride
    lea      r3, [r0 + r1]          ; pPix - iStride

    ; Compute masks to enable/disable filtering.
    MOVDQ    xmm7, [r3 + 1 * r1]    ; p1
    MOVDQ    xmm6, [r3 + 0 * r1]    ; p0
    MOVDQ    xmm0, [r0 + 0 * r2]    ; q0
    movdqa   xmm4, xmm6
    SSE2_AbsDiffUB xmm6, xmm0, xmm5                ; |p0 - q0|
    SSE2_CmpgeUB   xmm3, xmm6                      ; |p0 - q0| < (iAlpha >> 2) + 2
    SSE2_CmpltUB   xmm6, xmm1, [WELS_DB127_16]     ; bDeltaP0Q0 = |p0 - q0| < iAlpha
    MOVDQ    xmm1, [r0 + 1 * r2]    ; q1
    SSE2_AbsDiffUB xmm7, xmm4, xmm5                ; |p1 - p0|
    SSE2_AbsDiffUB xmm0, xmm1, xmm5                ; |q1 - q0|
    pmaxub   xmm7, xmm0                            ; max(|p1 - p0|, |q1 - q0|)
    SSE2_CmpltUB   xmm7, xmm2, [WELS_DB127_16]     ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
    pand     xmm6, xmm7                            ; & bDeltaP0Q0

    MOVDQ    xmm7, [r3 + 2 * r1]    ; p2
    SSE2_AbsDiffUB xmm7, xmm4, xmm5                ; |p2 - p0|
    SSE2_CmpltUB   xmm7, xmm2, [WELS_DB127_16]     ; bDeltaP2P0 = |p2 - p0| < iBeta
    pand     xmm7, xmm3                            ; &= |p0 - q0| < (iAlpha >> 2) + 2

    MOVDQ    xmm0, [r0 + 0 * r2]    ; q0
    MOVDQ    xmm5, [r0 + 2 * r2]    ; q2
    SSE2_AbsDiffUB xmm5, xmm0, xmm4                ; |q2 - q0|
    SSE2_CmpltUB   xmm5, xmm2, [WELS_DB127_16]     ; bDeltaQ2Q0 = |q2 - q0| < iBeta
    pand     xmm5, xmm3                            ; &= |p0 - q0| < (iAlpha >> 2) + 2

%ifdef X86_32
    ; Push xmm5 to free up one register. Align stack so as to ensure that failed
    ; store forwarding penalty cannot occur (up to ~50 cycles for 128-bit on IVB).
    mov      r2, esp
    sub      esp, 16
    and      esp, -16
    movdqa   [esp], xmm5
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
    movdqa   xmm5, [esp]
    mov      esp, r2
    neg      r1
    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
%else
    movdqa   xmm9, [WELS_DB1_16]
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
    SSE2_DeblockLumaEq4_3x16P r0, r2, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, xmm9
%endif

    POP_XMM
    LOAD_4_PARA_POP
    ret


; [out:p1,p0,q0,q1]=%1,%2,%3,%4 pPixCb=%5 pPixCr=%6 iStride=%7 3*iStride-1=%8 xmmclobber=%9,%10,%11
%macro SSE2_LoadCbCr_4x16H 11
    movd     %1, [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 0
    movd     %2, [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 2
    punpcklbw %1, %2                ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 0,2
    movd     %2, [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 4
    movd     %9, [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 6
    punpcklbw %2, %9                ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 4,6
    punpcklwd %1, %2                ; [p1 x4, p0 x4, q0 x4, q1 x4] cb line 0,2,4,6
    movd     %2, [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 0
    movd     %9, [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 2
    punpcklbw %2, %9                ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 0,2
    movd     %9, [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 4
    movd     %10, [%6 + 2 * %8]     ; [p1,p0,q0,q1] cr line 6
    punpcklbw %9, %10               ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 4,6
    punpcklwd %2, %9                ; [p1 x4, p0 x4, q0 x4, q1 x4] cr line 0,2,4,6
    add      %5, %7                 ; pPixCb += iStride
    add      %6, %7                 ; pPixCr += iStride
    movd     %9, [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 1
    movd     %10, [%5 + 2 * %7 - 2] ; [p1,p0,q0,q1] cb line 3
    punpcklbw %9, %10               ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 1,3
    movd     %10, [%5 + 4 * %7 - 2] ; [p1,p0,q0,q1] cb line 5
    movd     %3, [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 7
    punpcklbw %10, %3               ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 5,7
    punpcklwd %9, %10               ; [p1 x4, p0 x4, q0 x4, q1 x4] cb line 1,3,5,7
    movd     %10, [%6 + 0 * %7 - 2] ; [p1,p0,q0,q1] cr line 1
    movd     %3, [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 3
    punpcklbw %10, %3               ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 1,3
    movd     %3, [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 5
    movd     %4, [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 7
    punpcklbw %3, %4                ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 5,7
    punpcklwd %10, %3               ; [p1 x4, p0 x4, q0 x4, q1 x4] cr line 1,3,5,7
    movdqa   %3, %1
    punpckldq %1, %2                ; [p1 x8, p0 x8] cb/cr line 0,2,4,6
    punpckhdq %3, %2                ; [q0 x8, q1 x8] cb/cr line 0,2,4,6
    movdqa   %11, %9
    punpckldq %9, %10               ; [p1 x8, p0 x8] cb/cr line 1,3,5,7
    punpckhdq %11, %10              ; [q0 x8, q1 x8] cb/cr line 1,3,5,7
    movdqa   %2, %1
    punpcklqdq %1, %9               ; [p1 x16] cb/cr line 0,2,4,6,1,3,5,7
    punpckhqdq %2, %9               ; [p0 x16] cb/cr line 0,2,4,6,1,3,5,7
    movdqa   %4, %3
    punpcklqdq %3, %11              ; [q0 x16] cb/cr line 0,2,4,6,1,3,5,7
    punpckhqdq %4, %11              ; [q1 x16] cb/cr line 0,2,4,6,1,3,5,7
%endmacro
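; Illustrative use (comment-only sketch; this is how the macro is invoked in
; DeblockChromaLt4H_ssse3 further below):
;     SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
; Afterwards %1..%4 hold p1, p0, q0 and q1, each as 16 bytes gathered from the
; 8 cb rows and 8 cr rows in the interleaved row order noted in the comments
; above, i.e. the 4x16 edge has been transposed so a vertical chroma edge can
; be filtered with the horizontal-edge code. Note that %5/%6 are advanced by
; one row (iStride) as a side effect.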

; pPixCb+iStride=%1 pPixCr+iStride=%2 iStride=%3 3*iStride-1=%4 p0=%5 q0=%6 rclobber=%7 dwclobber={%8,%9} xmmclobber=%10
%macro SSE2_StoreCbCr_4x16H 10
    movdqa   %10, %5
    punpcklbw %10, %6               ; [p0,q0] pairs, cb/cr line 0,2,4,6
    punpckhbw %5, %6                ; [p0,q0] pairs, cb/cr line 1,3,5,7
    mov      %7, r7                 ; preserve stack pointer
    and      r7, -16                ; align stack pointer
    sub      r7, 32                 ; allocate stack space
    movdqa   [r7],      %10         ; store [p0,q0] pairs for cb/cr line 0,2,4,6 on the stack
    movdqa   [r7 + 16], %5          ; store [p0,q0] pairs for cb/cr line 1,3,5,7 on the stack
    mov      %8, [r7 + 16]          ; [p0,q0,p0,q0] cb line 1,3
    mov      [%1 + 0 * %3 - 1], %9  ; store [p0,q0] cb line 1
    shr      %8, 16                 ; [p0,q0] cb line 3
    mov      [%1 + 2 * %3 - 1], %9  ; store [p0,q0] cb line 3
    mov      %8, [r7 + 20]          ; [p0,q0,p0,q0] cb line 5,7
    mov      [%1 + 4 * %3 - 1], %9  ; store [p0,q0] cb line 5
    shr      %8, 16                 ; [p0,q0] cb line 7
    mov      [%1 + 2 * %4 + 1], %9  ; store [p0,q0] cb line 7
    mov      %8, [r7 + 24]          ; [p0,q0,p0,q0] cr line 1,3
    mov      [%2 + 0 * %3 - 1], %9  ; store [p0,q0] cr line 1
    shr      %8, 16                 ; [p0,q0] cr line 3
    mov      [%2 + 2 * %3 - 1], %9  ; store [p0,q0] cr line 3
    mov      %8, [r7 + 28]          ; [p0,q0,p0,q0] cr line 5,7
    mov      [%2 + 4 * %3 - 1], %9  ; store [p0,q0] cr line 5
    shr      %8, 16                 ; [p0,q0] cr line 7
    mov      [%2 + 2 * %4 + 1], %9  ; store [p0,q0] cr line 7
    sub      %1, %3                 ; pPixCb -= iStride
    sub      %2, %3                 ; pPixCr -= iStride
    mov      %8, [r7]               ; [p0,q0,p0,q0] cb line 0,2
    mov      [%1 + 0 * %3 - 1], %9  ; store [p0,q0] cb line 0
    shr      %8, 16                 ; [p0,q0] cb line 2
    mov      [%1 + 2 * %3 - 1], %9  ; store [p0,q0] cb line 2
    mov      %8, [r7 + 4]           ; [p0,q0,p0,q0] cb line 4,6
    mov      [%1 + 4 * %3 - 1], %9  ; store [p0,q0] cb line 4
    shr      %8, 16                 ; [p0,q0] cb line 6
    mov      [%1 + 2 * %4 + 1], %9  ; store [p0,q0] cb line 6
    mov      %8, [r7 + 8]           ; [p0,q0,p0,q0] cr line 0,2
    mov      [%2 + 0 * %3 - 1], %9  ; store [p0,q0] cr line 0
    shr      %8, 16                 ; [p0,q0] cr line 2
    mov      [%2 + 2 * %3 - 1], %9  ; store [p0,q0] cr line 2
    mov      %8, [r7 + 12]          ; [p0,q0,p0,q0] cr line 4,6
    mov      [%2 + 4 * %3 - 1], %9  ; store [p0,q0] cr line 4
    shr      %8, 16                 ; [p0,q0] cr line 6
    mov      [%2 + 2 * %4 + 1], %9  ; store [p0,q0] cr line 6
    mov      r7, %7                 ; restore stack pointer
%endmacro

; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 pTC=%7 xmmclobber=%8,%9,%10 interleaveTC=%11
%macro SSSE3_DeblockChromaLt4 11
    movdqa   %8, %3
    SSE2_AbsDiffUB %8, %2, %9       ; |p0 - q0|
    SSE2_CmpgeUB   %8, %5           ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
    movdqa   %9, %4
    SSE2_AbsDiffUB %9, %3, %5       ; |q1 - q0|
    movdqa   %10, %1
    SSE2_AbsDiffUB %10, %2, %5      ; |p1 - p0|
    pmaxub   %9, %10                ; max(|q1 - q0|, |p1 - p0|)
    pxor     %10, %10
    movd     %5, %6
    pshufb   %5, %10                ; iBeta
    SSE2_CmpgeUB %9, %5             ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
    por      %8, %9                 ; | !bDeltaP0Q0
    movd     %5, [%7]
%if %11
    punpckldq %5, %5
    punpcklbw %5, %5                ; iTc
%else
    pshufd   %5, %5, 0              ; iTc
%endif
    pcmpeqw  %10, %10               ; FFh
    movdqa   %9, %5
    pcmpgtb  %9, %10                ; iTc > -1 ? FFh : 00h
    pandn    %8, %5                 ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
    pand     %8, %9                 ; &= (iTc > -1 ? FFh : 00h)
    SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
%endmacro
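; Comment-only outline of the macro above: chroma bS<4 filtering updates only
; p0/q0 (no p1/q1 update and no p2/q2 terms), so iTc is taken directly from
; pTC and the same SSE2_DeblockP0Q0_Lt4 kernel is reused:
;     if (|p0-q0| < iAlpha && |p1-p0| < iBeta && |q1-q0| < iBeta && iTc >= 0)
;         apply the p0/q0 update with clipping range [-iTc, iTc]
; The masks are built negated (>= comparisons) and folded into iTc with pandn,
; so positions where filtering is disabled simply get a clipping range of zero.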


;*******************************************************************************
;   void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0             ; iAlpha
    mov      r3, r2
    neg      r3                     ; -iStride

    movq     xmm0, [r0 + 0 * r2]    ; q0 cb
    movhps   xmm0, [r1 + 0 * r2]    ; q0 cr
    movq     xmm2, [r0 + 1 * r3]    ; p0 cb
    movhps   xmm2, [r1 + 1 * r3]    ; p0 cr
    movq     xmm1, [r0 + 1 * r2]    ; q1 cb
    movhps   xmm1, [r1 + 1 * r2]    ; q1 cr
    movq     xmm3, [r0 + 2 * r3]    ; p1 cb
    movhps   xmm3, [r1 + 2 * r3]    ; p1 cr

%ifidni arg6, r5
    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, arg6, xmm4, xmm5, xmm6, 1
%else
    mov      r2, arg6
    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, r2, xmm4, xmm5, xmm6, 1
%endif

    movlps   [r0 + 1 * r3], xmm2    ; store p0 cb
    movhps   [r1 + 1 * r3], xmm2    ; store p0 cr
    movlps   [r0], xmm0             ; store q0 cb
    movhps   [r1], xmm0             ; store q0 cr

    POP_XMM
    LOAD_4_PARA_POP
    ret


;*******************************************************************************
;   void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta)
;*******************************************************************************

WELS_EXTERN DeblockChromaEq4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0             ; iAlpha
    mov      r3, r2
    neg      r3                     ; -iStride

    movq     xmm0, [r0 + 0 * r2]    ; q0 cb
    movhps   xmm0, [r1 + 0 * r2]    ; q0 cr
    movq     xmm2, [r0 + 1 * r3]    ; p0 cb
    movhps   xmm2, [r1 + 1 * r3]    ; p0 cr

    movdqa   xmm4, xmm0
    SSE2_AbsDiffUB xmm4, xmm2, xmm5 ; |p0 - q0|
    SSE2_CmpgeUB   xmm4, xmm7       ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha

    movq     xmm1, [r0 + 1 * r2]    ; q1 cb
    movhps   xmm1, [r1 + 1 * r2]    ; q1 cr
    movq     xmm3, [r0 + 2 * r3]    ; p1 cb
    movhps   xmm3, [r1 + 2 * r3]    ; p1 cr

    movdqa   xmm5, xmm1
    SSE2_AbsDiffUB xmm5, xmm0, xmm7 ; |q1 - q0|
    movdqa   xmm6, xmm3
    SSE2_AbsDiffUB xmm6, xmm2, xmm7 ; |p1 - p0|
    pmaxub   xmm5, xmm6             ; max(|q1 - q0|, |p1 - p0|)

    pxor     xmm6, xmm6
    movd     xmm7, arg5d
    pshufb   xmm7, xmm6             ; iBeta

    SSE2_CmpgeUB xmm5, xmm7         ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
    por      xmm4, xmm5             ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0

    WELS_DB1 xmm7
    movdqa   xmm5, xmm2
    SSE2_AvgbFloor1 xmm2, xmm1, xmm7, xmm6    ; (p0 + q1) >> 1
    pavgb    xmm2, xmm3             ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
    movdqa   xmm6, xmm4
    SSE2_Blend xmm5, xmm2, xmm4     ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0

    SSE2_AvgbFloor1 xmm3, xmm0, xmm7, xmm4    ; (q0 + p1) >> 1
    pavgb    xmm3, xmm1             ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
    SSE2_Blend xmm0, xmm3, xmm6     ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0

    movlps   [r0 + 1 * r3], xmm5    ; store p0 cb
    movhps   [r1 + 1 * r3], xmm5    ; store p0 cr
    movlps   [r0 + 0 * r2], xmm0    ; store q0 cb
    movhps   [r1 + 0 * r2], xmm0    ; store q0 cr

    POP_XMM
    LOAD_4_PARA_POP
    ret


;*******************************************************************************
;   void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4H_ssse3
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0             ; iAlpha
    lea      r3, [3 * r2 - 1]       ; 3 * iStride - 1

    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
    SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0

    POP_XMM
    LOAD_6_PARA_POP
    ret


%ifdef WIN64

WELS_EXTERN DeblockChromaEq4H_ssse3
mov rax,rsp
|
|
|
|
mov [rax+20h],rbx
|
|
|
|
push rdi
|
|
|
|
PUSH_XMM 16
|
|
|
|
sub rsp,140h
|
|
|
|
mov rdi,rdx
|
|
|
|
lea eax,[r8*4]
|
|
|
|
movsxd r10,eax
|
|
|
|
mov eax,[rcx-2]
|
|
|
|
mov [rsp+10h],eax
|
|
|
|
lea rbx,[r10+rdx-2]
|
|
|
|
lea r11,[r10+rcx-2]
|
|
|
|
movdqa xmm5,[rsp+10h]
|
|
|
|
movsxd r10,r8d
|
|
|
|
mov eax,[r10+rcx-2]
|
|
|
|
lea rdx,[r10+r10*2]
|
|
|
|
mov [rsp+20h],eax
|
|
|
|
mov eax,[rcx+r10*2-2]
|
|
|
|
mov [rsp+30h],eax
|
|
|
|
mov eax,[rdx+rcx-2]
|
|
|
|
movdqa xmm2,[rsp+20h]
|
|
|
|
mov [rsp+40h],eax
|
|
|
|
mov eax, [rdi-2]
|
|
|
|
movdqa xmm4,[rsp+30h]
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax,[r10+rdi-2]
|
|
|
|
movdqa xmm3,[rsp+40h]
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
mov eax,[rdi+r10*2-2]
|
|
|
|
punpckldq xmm5,[rsp+50h]
|
|
|
|
mov [rsp+70h],eax
|
|
|
|
mov eax, [rdx+rdi-2]
|
|
|
|
punpckldq xmm2, [rsp+60h]
|
|
|
|
mov [rsp+80h],eax
|
|
|
|
mov eax,[r11]
|
|
|
|
punpckldq xmm4, [rsp+70h]
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax,[rbx]
|
|
|
|
punpckldq xmm3,[rsp+80h]
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
mov eax,[r10+r11]
|
|
|
|
movdqa xmm0, [rsp+50h]
|
|
|
|
punpckldq xmm0, [rsp+60h]
|
|
|
|
punpcklqdq xmm5,xmm0
|
|
|
|
movdqa [rsp+50h],xmm0
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax,[r10+rbx]
|
|
|
|
movdqa xmm0,[rsp+50h]
|
|
|
|
movdqa xmm1,xmm5
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
mov eax,[r11+r10*2]
|
|
|
|
punpckldq xmm0, [rsp+60h]
|
|
|
|
punpcklqdq xmm2,xmm0
|
|
|
|
punpcklbw xmm1,xmm2
|
|
|
|
punpckhbw xmm5,xmm2
|
|
|
|
movdqa [rsp+50h],xmm0
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax,[rbx+r10*2]
|
|
|
|
movdqa xmm0,[rsp+50h]
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
mov eax, [rdx+r11]
|
|
|
|
movdqa xmm15,xmm1
|
|
|
|
punpckldq xmm0,[rsp+60h]
|
|
|
|
punpcklqdq xmm4,xmm0
|
|
|
|
movdqa [rsp+50h],xmm0
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax, [rdx+rbx]
|
|
|
|
movdqa xmm0,[rsp+50h]
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
punpckldq xmm0, [rsp+60h]
|
|
|
|
punpcklqdq xmm3,xmm0
|
|
|
|
movdqa xmm0,xmm4
|
|
|
|
punpcklbw xmm0,xmm3
|
|
|
|
punpckhbw xmm4,xmm3
|
|
|
|
punpcklwd xmm15,xmm0
|
|
|
|
punpckhwd xmm1,xmm0
|
|
|
|
movdqa xmm0,xmm5
|
|
|
|
movdqa xmm12,xmm15
|
|
|
|
punpcklwd xmm0,xmm4
|
|
|
|
punpckhwd xmm5,xmm4
|
|
|
|
punpckldq xmm12,xmm0
|
|
|
|
punpckhdq xmm15,xmm0
|
|
|
|
movdqa xmm0,xmm1
|
|
|
|
movdqa xmm11,xmm12
|
|
|
|
punpckldq xmm0,xmm5
|
|
|
|
punpckhdq xmm1,xmm5
|
|
|
|
punpcklqdq xmm11,xmm0
|
|
|
|
punpckhqdq xmm12,xmm0
|
|
|
|
movsx eax,r9w
|
|
|
|
movdqa xmm14,xmm15
|
|
|
|
punpcklqdq xmm14,xmm1
|
|
|
|
punpckhqdq xmm15,xmm1
|
|
|
|
pxor xmm1,xmm1
|
|
|
|
movd xmm0,eax
|
|
|
|
movdqa xmm4,xmm12
|
|
|
|
movdqa xmm8,xmm11
|
|
|
|
movsx eax,word [rsp+170h + 160] ; iBeta
|
|
|
|
punpcklwd xmm0,xmm0
|
|
|
|
punpcklbw xmm4,xmm1
|
|
|
|
punpckhbw xmm12,xmm1
|
|
|
|
movdqa xmm9,xmm14
|
|
|
|
movdqa xmm7,xmm15
|
|
|
|
movdqa xmm10,xmm15
|
|
|
|
pshufd xmm13,xmm0,0
|
|
|
|
punpcklbw xmm9,xmm1
|
|
|
|
punpckhbw xmm14,xmm1
|
|
|
|
movdqa xmm6,xmm13
|
|
|
|
movd xmm0,eax
|
|
|
|
movdqa [rsp],xmm11
|
|
|
|
mov eax,2
|
|
|
|
cwde
|
|
|
|
punpckhbw xmm11,xmm1
|
|
|
|
punpckhbw xmm10,xmm1
|
|
|
|
punpcklbw xmm7,xmm1
|
|
|
|
punpcklwd xmm0,xmm0
|
|
|
|
punpcklbw xmm8,xmm1
|
|
|
|
pshufd xmm3,xmm0,0
|
|
|
|
movdqa xmm1,xmm8
|
|
|
|
movdqa xmm0,xmm4
|
|
|
|
psubw xmm0,xmm9
|
|
|
|
psubw xmm1,xmm4
|
|
|
|
movdqa xmm2,xmm3
|
|
|
|
pabsw xmm0,xmm0
|
|
|
|
pcmpgtw xmm6,xmm0
|
|
|
|
pabsw xmm0,xmm1
|
|
|
|
movdqa xmm1,xmm3
|
|
|
|
pcmpgtw xmm2,xmm0
|
|
|
|
pand xmm6,xmm2
|
|
|
|
movdqa xmm0,xmm7
|
|
|
|
movdqa xmm2,xmm3
|
|
|
|
psubw xmm0,xmm9
|
|
|
|
pabsw xmm0,xmm0
|
|
|
|
pcmpgtw xmm1,xmm0
|
|
|
|
pand xmm6,xmm1
|
|
|
|
movdqa xmm0,xmm12
|
|
|
|
movdqa xmm1,xmm11
|
|
|
|
psubw xmm0,xmm14
|
|
|
|
psubw xmm1,xmm12
|
|
|
|
movdqa xmm5,xmm6
|
|
|
|
pabsw xmm0,xmm0
|
|
|
|
pcmpgtw xmm13,xmm0
|
|
|
|
pabsw xmm0,xmm1
|
|
|
|
movdqa xmm1,xmm8
|
|
|
|
pcmpgtw xmm2,xmm0
|
|
|
|
paddw xmm1,xmm8
|
|
|
|
movdqa xmm0,xmm10
|
|
|
|
pand xmm13,xmm2
|
|
|
|
psubw xmm0,xmm14
|
|
|
|
paddw xmm1,xmm4
|
|
|
|
movdqa xmm2,xmm11
|
|
|
|
pabsw xmm0,xmm0
|
|
|
|
paddw xmm2,xmm11
|
|
|
|
paddw xmm1,xmm7
|
|
|
|
pcmpgtw xmm3,xmm0
|
|
|
|
paddw xmm2,xmm12
|
|
|
|
movd xmm0,eax
|
|
|
|
pand xmm13,xmm3
|
|
|
|
paddw xmm2,xmm10
|
|
|
|
punpcklwd xmm0,xmm0
|
|
|
|
pshufd xmm3,xmm0,0
|
|
|
|
movdqa xmm0,xmm6
|
|
|
|
paddw xmm1,xmm3
|
|
|
|
pandn xmm0,xmm4
|
|
|
|
paddw xmm2,xmm3
|
|
|
|
psraw xmm1,2
|
|
|
|
pand xmm5,xmm1
|
|
|
|
por xmm5,xmm0
|
|
|
|
paddw xmm7,xmm7
|
|
|
|
paddw xmm10,xmm10
|
|
|
|
psraw xmm2,2
|
|
|
|
movdqa xmm1,xmm13
|
|
|
|
movdqa xmm0,xmm13
|
|
|
|
pandn xmm0,xmm12
|
|
|
|
pand xmm1,xmm2
|
|
|
|
paddw xmm7,xmm9
|
|
|
|
por xmm1,xmm0
|
|
|
|
paddw xmm10,xmm14
|
|
|
|
paddw xmm7,xmm8
|
|
|
|
movdqa xmm0,xmm13
|
|
|
|
packuswb xmm5,xmm1
|
|
|
|
paddw xmm7,xmm3
|
|
|
|
paddw xmm10,xmm11
|
|
|
|
movdqa xmm1,xmm6
|
|
|
|
paddw xmm10,xmm3
|
|
|
|
pandn xmm6,xmm9
|
|
|
|
psraw xmm7,2
|
|
|
|
pand xmm1,xmm7
|
|
|
|
psraw xmm10,2
|
|
|
|
pandn xmm13,xmm14
|
|
|
|
pand xmm0,xmm10
|
|
|
|
por xmm1,xmm6
|
|
|
|
movdqa xmm6,[rsp]
|
|
|
|
movdqa xmm4,xmm6
|
|
|
|
por xmm0,xmm13
|
|
|
|
punpcklbw xmm4,xmm5
|
|
|
|
punpckhbw xmm6,xmm5
|
|
|
|
movdqa xmm3,xmm4
|
|
|
|
packuswb xmm1,xmm0
|
|
|
|
movdqa xmm0,xmm1
|
|
|
|
punpckhbw xmm1,xmm15
|
|
|
|
punpcklbw xmm0,xmm15
|
|
|
|
punpcklwd xmm3,xmm0
|
|
|
|
punpckhwd xmm4,xmm0
|
|
|
|
movdqa xmm0,xmm6
|
|
|
|
movdqa xmm2,xmm3
|
|
|
|
punpcklwd xmm0,xmm1
|
|
|
|
punpckhwd xmm6,xmm1
|
|
|
|
movdqa xmm1,xmm4
|
|
|
|
punpckldq xmm2,xmm0
|
|
|
|
punpckhdq xmm3,xmm0
|
|
|
|
punpckldq xmm1,xmm6
|
|
|
|
movdqa xmm0,xmm2
|
|
|
|
punpcklqdq xmm0,xmm1
|
|
|
|
punpckhdq xmm4,xmm6
|
|
|
|
punpckhqdq xmm2,xmm1
|
|
|
|
movdqa [rsp+10h],xmm0
|
|
|
|
movdqa [rsp+60h],xmm2
|
|
|
|
movdqa xmm0,xmm3
|
|
|
|
mov eax,[rsp+10h]
|
|
|
|
mov [rcx-2],eax
|
|
|
|
mov eax,[rsp+60h]
|
|
|
|
punpcklqdq xmm0,xmm4
|
|
|
|
punpckhqdq xmm3,xmm4
|
|
|
|
mov [r10+rcx-2],eax
|
|
|
|
movdqa [rsp+20h],xmm0
|
|
|
|
mov eax, [rsp+20h]
|
|
|
|
movdqa [rsp+70h],xmm3
|
|
|
|
mov [rcx+r10*2-2],eax
|
|
|
|
mov eax,[rsp+70h]
|
|
|
|
mov [rdx+rcx-2],eax
|
|
|
|
mov eax,[rsp+18h]
|
|
|
|
mov [r11],eax
|
|
|
|
mov eax,[rsp+68h]
|
|
|
|
mov [r10+r11],eax
|
|
|
|
mov eax,[rsp+28h]
|
|
|
|
mov [r11+r10*2],eax
|
|
|
|
mov eax,[rsp+78h]
|
|
|
|
mov [rdx+r11],eax
|
|
|
|
mov eax,[rsp+14h]
|
|
|
|
mov [rdi-2],eax
|
|
|
|
mov eax,[rsp+64h]
|
|
|
|
mov [r10+rdi-2],eax
|
|
|
|
mov eax,[rsp+24h]
|
|
|
|
mov [rdi+r10*2-2],eax
|
|
|
|
mov eax, [rsp+74h]
|
|
|
|
mov [rdx+rdi-2],eax
|
|
|
|
mov eax, [rsp+1Ch]
|
|
|
|
mov [rbx],eax
|
|
|
|
mov eax, [rsp+6Ch]
|
|
|
|
mov [r10+rbx],eax
|
|
|
|
mov eax,[rsp+2Ch]
|
|
|
|
mov [rbx+r10*2],eax
|
|
|
|
mov eax,[rsp+7Ch]
|
|
|
|
mov [rdx+rbx],eax
|
|
|
|
lea rsp,[rsp+140h]
|
|
|
|
POP_XMM
|
|
|
|
mov rbx, [rsp+28h]
|
|
|
|
pop rdi
|
|
|
|
ret


%elifdef UNIX64

WELS_EXTERN DeblockChromaEq4H_ssse3
|
2014-05-31 14:13:34 +03:00
|
|
|
mov rax,rsp
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
push r12
|
|
|
|
|
|
|
|
mov rbp, r8
|
|
|
|
mov r8, rdx
|
|
|
|
mov r9, rcx
|
|
|
|
mov rcx, rdi
|
|
|
|
mov rdx, rsi
|
|
|
|
mov rdi, rdx
|
|
|
|
|
|
|
|
sub rsp,140h
|
|
|
|
lea eax,[r8*4]
|
|
|
|
movsxd r10,eax
|
|
|
|
mov eax,[rcx-2]
|
|
|
|
mov [rsp+10h],eax
|
|
|
|
lea rbx,[r10+rdx-2]
|
|
|
|
lea r11,[r10+rcx-2]
|
|
|
|
|
|
|
|
movdqa xmm5,[rsp+10h]
|
|
|
|
movsxd r10,r8d
|
|
|
|
mov eax,[r10+rcx-2]
|
|
|
|
lea rdx,[r10+r10*2]
|
|
|
|
mov [rsp+20h],eax
|
|
|
|
mov eax,[rcx+r10*2-2]
|
|
|
|
mov [rsp+30h],eax
|
|
|
|
mov eax,[rdx+rcx-2]
|
|
|
|
movdqa xmm2,[rsp+20h]
|
|
|
|
mov [rsp+40h],eax
|
|
|
|
mov eax, [rdi-2]
|
|
|
|
movdqa xmm4,[rsp+30h]
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax,[r10+rdi-2]
|
|
|
|
movdqa xmm3,[rsp+40h]
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
mov eax,[rdi+r10*2-2]
|
|
|
|
punpckldq xmm5,[rsp+50h]
|
|
|
|
mov [rsp+70h],eax
|
|
|
|
mov eax, [rdx+rdi-2]
|
|
|
|
punpckldq xmm2, [rsp+60h]
|
|
|
|
mov [rsp+80h],eax
|
|
|
|
mov eax,[r11]
|
|
|
|
punpckldq xmm4, [rsp+70h]
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax,[rbx]
|
|
|
|
punpckldq xmm3,[rsp+80h]
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
mov eax,[r10+r11]
|
|
|
|
movdqa xmm0, [rsp+50h]
|
|
|
|
punpckldq xmm0, [rsp+60h]
|
|
|
|
punpcklqdq xmm5,xmm0
|
|
|
|
movdqa [rsp+50h],xmm0
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax,[r10+rbx]
|
|
|
|
movdqa xmm0,[rsp+50h]
|
|
|
|
movdqa xmm1,xmm5
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
mov eax,[r11+r10*2]
|
|
|
|
punpckldq xmm0, [rsp+60h]
|
|
|
|
punpcklqdq xmm2,xmm0
|
|
|
|
punpcklbw xmm1,xmm2
|
|
|
|
punpckhbw xmm5,xmm2
|
|
|
|
movdqa [rsp+50h],xmm0
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax,[rbx+r10*2]
|
|
|
|
movdqa xmm0,[rsp+50h]
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
mov eax, [rdx+r11]
|
|
|
|
movdqa xmm15,xmm1
|
|
|
|
punpckldq xmm0,[rsp+60h]
|
|
|
|
punpcklqdq xmm4,xmm0
|
|
|
|
movdqa [rsp+50h],xmm0
|
|
|
|
mov [rsp+50h],eax
|
|
|
|
mov eax, [rdx+rbx]
|
|
|
|
movdqa xmm0,[rsp+50h]
|
|
|
|
mov [rsp+60h],eax
|
|
|
|
punpckldq xmm0, [rsp+60h]
|
|
|
|
punpcklqdq xmm3,xmm0
|
|
|
|
movdqa xmm0,xmm4
|
|
|
|
punpcklbw xmm0,xmm3
|
|
|
|
punpckhbw xmm4,xmm3
|
|
|
|
punpcklwd xmm15,xmm0
|
|
|
|
punpckhwd xmm1,xmm0
|
|
|
|
movdqa xmm0,xmm5
|
|
|
|
movdqa xmm12,xmm15
|
|
|
|
punpcklwd xmm0,xmm4
|
|
|
|
punpckhwd xmm5,xmm4
|
|
|
|
punpckldq xmm12,xmm0
|
|
|
|
punpckhdq xmm15,xmm0
|
|
|
|
movdqa xmm0,xmm1
|
|
|
|
movdqa xmm11,xmm12
|
|
|
|
punpckldq xmm0,xmm5
|
|
|
|
punpckhdq xmm1,xmm5
|
|
|
|
punpcklqdq xmm11,xmm0
|
|
|
|
punpckhqdq xmm12,xmm0
|
|
|
|
movsx eax,r9w
|
|
|
|
movdqa xmm14,xmm15
|
|
|
|
punpcklqdq xmm14,xmm1
|
|
|
|
punpckhqdq xmm15,xmm1
|
|
|
|
pxor xmm1,xmm1
|
|
|
|
movd xmm0,eax
|
|
|
|
movdqa xmm4,xmm12
|
|
|
|
movdqa xmm8,xmm11
|
|
|
|
mov eax, ebp ; iBeta
|
|
|
|
punpcklwd xmm0,xmm0
|
|
|
|
punpcklbw xmm4,xmm1
|
|
|
|
punpckhbw xmm12,xmm1
|
|
|
|
movdqa xmm9,xmm14
|
|
|
|
movdqa xmm7,xmm15
|
|
|
|
movdqa xmm10,xmm15
|
|
|
|
pshufd xmm13,xmm0,0
|
|
|
|
punpcklbw xmm9,xmm1
|
|
|
|
punpckhbw xmm14,xmm1
|
|
|
|
movdqa xmm6,xmm13
|
|
|
|
movd xmm0,eax
|
|
|
|
movdqa [rsp],xmm11
|
|
|
|
mov eax,2
|
|
|
|
cwde
|
|
|
|
punpckhbw xmm11,xmm1
|
|
|
|
punpckhbw xmm10,xmm1
|
|
|
|
punpcklbw xmm7,xmm1
|
|
|
|
punpcklwd xmm0,xmm0
|
|
|
|
punpcklbw xmm8,xmm1
|
|
|
|
pshufd xmm3,xmm0,0
|
|
|
|
movdqa xmm1,xmm8
|
|
|
|
movdqa xmm0,xmm4
|
|
|
|
psubw xmm0,xmm9
|
|
|
|
psubw xmm1,xmm4
|
|
|
|
movdqa xmm2,xmm3
|
|
|
|
pabsw xmm0,xmm0
|
|
|
|
pcmpgtw xmm6,xmm0
|
|
|
|
pabsw xmm0,xmm1
|
|
|
|
movdqa xmm1,xmm3
|
|
|
|
pcmpgtw xmm2,xmm0
|
|
|
|
pand xmm6,xmm2
|
|
|
|
movdqa xmm0,xmm7
|
|
|
|
movdqa xmm2,xmm3
|
|
|
|
psubw xmm0,xmm9
|
|
|
|
pabsw xmm0,xmm0
|
|
|
|
pcmpgtw xmm1,xmm0
|
|
|
|
pand xmm6,xmm1
|
|
|
|
movdqa xmm0,xmm12
|
|
|
|
movdqa xmm1,xmm11
|
|
|
|
psubw xmm0,xmm14
|
|
|
|
psubw xmm1,xmm12
|
|
|
|
movdqa xmm5,xmm6
|
|
|
|
pabsw xmm0,xmm0
|
|
|
|
pcmpgtw xmm13,xmm0
|
|
|
|
pabsw xmm0,xmm1
|
|
|
|
movdqa xmm1,xmm8
|
|
|
|
pcmpgtw xmm2,xmm0
|
|
|
|
paddw xmm1,xmm8
|
|
|
|
movdqa xmm0,xmm10
|
|
|
|
pand xmm13,xmm2
|
|
|
|
psubw xmm0,xmm14
|
|
|
|
paddw xmm1,xmm4
|
|
|
|
movdqa xmm2,xmm11
|
|
|
|
pabsw xmm0,xmm0
|
|
|
|
paddw xmm2,xmm11
|
|
|
|
paddw xmm1,xmm7
|
|
|
|
pcmpgtw xmm3,xmm0
|
|
|
|
paddw xmm2,xmm12
|
|
|
|
movd xmm0,eax
|
|
|
|
pand xmm13,xmm3
|
|
|
|
paddw xmm2,xmm10
|
|
|
|
punpcklwd xmm0,xmm0
|
|
|
|
pshufd xmm3,xmm0,0
|
|
|
|
movdqa xmm0,xmm6
|
|
|
|
paddw xmm1,xmm3
|
|
|
|
pandn xmm0,xmm4
|
|
|
|
paddw xmm2,xmm3
|
|
|
|
psraw xmm1,2
|
|
|
|
pand xmm5,xmm1
|
|
|
|
por xmm5,xmm0
|
|
|
|
paddw xmm7,xmm7
|
|
|
|
paddw xmm10,xmm10
|
|
|
|
psraw xmm2,2
|
|
|
|
movdqa xmm1,xmm13
|
|
|
|
movdqa xmm0,xmm13
|
|
|
|
pandn xmm0,xmm12
|
|
|
|
pand xmm1,xmm2
|
|
|
|
paddw xmm7,xmm9
|
|
|
|
por xmm1,xmm0
|
|
|
|
paddw xmm10,xmm14
|
|
|
|
paddw xmm7,xmm8
|
|
|
|
movdqa xmm0,xmm13
|
|
|
|
packuswb xmm5,xmm1
|
|
|
|
paddw xmm7,xmm3
|
|
|
|
paddw xmm10,xmm11
|
|
|
|
movdqa xmm1,xmm6
|
|
|
|
paddw xmm10,xmm3
|
|
|
|
pandn xmm6,xmm9
|
|
|
|
psraw xmm7,2
|
|
|
|
pand xmm1,xmm7
|
|
|
|
psraw xmm10,2
|
|
|
|
pandn xmm13,xmm14
|
|
|
|
pand xmm0,xmm10
|
|
|
|
por xmm1,xmm6
|
|
|
|
movdqa xmm6,[rsp]
|
|
|
|
movdqa xmm4,xmm6
|
|
|
|
por xmm0,xmm13
|
|
|
|
punpcklbw xmm4,xmm5
|
|
|
|
punpckhbw xmm6,xmm5
|
|
|
|
movdqa xmm3,xmm4
|
|
|
|
packuswb xmm1,xmm0
|
|
|
|
movdqa xmm0,xmm1
|
|
|
|
punpckhbw xmm1,xmm15
|
|
|
|
punpcklbw xmm0,xmm15
|
|
|
|
punpcklwd xmm3,xmm0
|
|
|
|
punpckhwd xmm4,xmm0
|
|
|
|
movdqa xmm0,xmm6
|
|
|
|
movdqa xmm2,xmm3
|
|
|
|
punpcklwd xmm0,xmm1
|
|
|
|
punpckhwd xmm6,xmm1
|
|
|
|
movdqa xmm1,xmm4
|
|
|
|
punpckldq xmm2,xmm0
|
|
|
|
punpckhdq xmm3,xmm0
|
|
|
|
punpckldq xmm1,xmm6
|
|
|
|
movdqa xmm0,xmm2
|
|
|
|
punpcklqdq xmm0,xmm1
|
|
|
|
punpckhdq xmm4,xmm6
|
|
|
|
punpckhqdq xmm2,xmm1
|
|
|
|
movdqa [rsp+10h],xmm0
|
|
|
|
movdqa [rsp+60h],xmm2
|
|
|
|
movdqa xmm0,xmm3
|
|
|
|
mov eax,[rsp+10h]
|
|
|
|
mov [rcx-2],eax
|
|
|
|
mov eax,[rsp+60h]
|
|
|
|
punpcklqdq xmm0,xmm4
|
|
|
|
punpckhqdq xmm3,xmm4
|
|
|
|
mov [r10+rcx-2],eax
|
|
|
|
movdqa [rsp+20h],xmm0
|
|
|
|
mov eax, [rsp+20h]
|
|
|
|
movdqa [rsp+70h],xmm3
|
|
|
|
mov [rcx+r10*2-2],eax
|
|
|
|
mov eax,[rsp+70h]
|
|
|
|
mov [rdx+rcx-2],eax
|
|
|
|
mov eax,[rsp+18h]
|
|
|
|
mov [r11],eax
|
|
|
|
mov eax,[rsp+68h]
|
|
|
|
mov [r10+r11],eax
|
|
|
|
mov eax,[rsp+28h]
|
|
|
|
mov [r11+r10*2],eax
|
|
|
|
mov eax,[rsp+78h]
|
|
|
|
mov [rdx+r11],eax
|
|
|
|
mov eax,[rsp+14h]
|
|
|
|
mov [rdi-2],eax
|
|
|
|
mov eax,[rsp+64h]
|
|
|
|
mov [r10+rdi-2],eax
|
|
|
|
mov eax,[rsp+24h]
|
|
|
|
mov [rdi+r10*2-2],eax
|
|
|
|
mov eax, [rsp+74h]
|
|
|
|
mov [rdx+rdi-2],eax
|
|
|
|
mov eax, [rsp+1Ch]
|
|
|
|
mov [rbx],eax
|
|
|
|
mov eax, [rsp+6Ch]
|
|
|
|
mov [r10+rbx],eax
|
|
|
|
mov eax,[rsp+2Ch]
|
|
|
|
mov [rbx+r10*2],eax
|
|
|
|
mov eax,[rsp+7Ch]
|
|
|
|
mov [rdx+rbx],eax
|
|
|
|
lea r11,[rsp+140h]
|
|
|
|
mov rbx, [r11+28h]
|
|
|
|
mov rsp,r11
|
|
|
|
pop r12
|
|
|
|
pop rbp
|
|
|
|
pop rbx
|
|
|
|
ret


%elifdef X86_32

;***************************************************************************
;   void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta)
;***************************************************************************

WELS_EXTERN DeblockChromaEq4H_ssse3
|
2014-05-31 14:13:34 +03:00
|
|
|
push ebp
|
|
|
|
mov ebp,esp
|
|
|
|
and esp,0FFFFFFF0h
|
|
|
|
sub esp,0C8h
|
|
|
|
mov ecx,dword [ebp+8]
|
|
|
|
mov edx,dword [ebp+0Ch]
|
|
|
|
mov eax,dword [ebp+10h]
|
|
|
|
sub ecx,2
|
|
|
|
sub edx,2
|
|
|
|
push esi
|
|
|
|
lea esi,[eax+eax*2]
|
|
|
|
mov dword [esp+18h],ecx
|
|
|
|
mov dword [esp+4],edx
|
|
|
|
lea ecx,[ecx+eax*4]
|
|
|
|
lea edx,[edx+eax*4]
|
|
|
|
lea eax,[esp+7Ch]
|
|
|
|
push edi
|
|
|
|
mov dword [esp+14h],esi
|
|
|
|
mov dword [esp+18h],ecx
|
|
|
|
mov dword [esp+0Ch],edx
|
|
|
|
mov dword [esp+10h],eax
|
|
|
|
mov esi,dword [esp+1Ch]
|
|
|
|
mov ecx,dword [ebp+10h]
|
|
|
|
mov edx,dword [esp+14h]
|
|
|
|
movd xmm0,dword [esi]
|
|
|
|
movd xmm1,dword [esi+ecx]
|
|
|
|
movd xmm2,dword [esi+ecx*2]
|
|
|
|
movd xmm3,dword [esi+edx]
|
|
|
|
mov esi,dword [esp+8]
|
|
|
|
movd xmm4,dword [esi]
|
|
|
|
movd xmm5,dword [esi+ecx]
|
|
|
|
movd xmm6,dword [esi+ecx*2]
|
|
|
|
movd xmm7,dword [esi+edx]
|
|
|
|
punpckldq xmm0,xmm4
|
|
|
|
punpckldq xmm1,xmm5
|
|
|
|
punpckldq xmm2,xmm6
|
|
|
|
punpckldq xmm3,xmm7
|
|
|
|
mov esi,dword [esp+18h]
|
|
|
|
mov edi,dword [esp+0Ch]
|
|
|
|
movd xmm4,dword [esi]
|
|
|
|
movd xmm5,dword [edi]
|
|
|
|
punpckldq xmm4,xmm5
|
|
|
|
punpcklqdq xmm0,xmm4
|
|
|
|
movd xmm4,dword [esi+ecx]
|
|
|
|
movd xmm5,dword [edi+ecx]
|
|
|
|
punpckldq xmm4,xmm5
|
|
|
|
punpcklqdq xmm1,xmm4
|
|
|
|
movd xmm4,dword [esi+ecx*2]
|
|
|
|
movd xmm5,dword [edi+ecx*2]
|
|
|
|
punpckldq xmm4,xmm5
|
|
|
|
punpcklqdq xmm2,xmm4
|
|
|
|
movd xmm4,dword [esi+edx]
|
|
|
|
movd xmm5,dword [edi+edx]
|
|
|
|
punpckldq xmm4,xmm5
|
|
|
|
punpcklqdq xmm3,xmm4
|
|
|
|
movdqa xmm6,xmm0
|
|
|
|
punpcklbw xmm0,xmm1
|
|
|
|
punpckhbw xmm6,xmm1
|
|
|
|
movdqa xmm7,xmm2
|
|
|
|
punpcklbw xmm2,xmm3
|
|
|
|
punpckhbw xmm7,xmm3
|
|
|
|
movdqa xmm4,xmm0
|
|
|
|
movdqa xmm5,xmm6
|
|
|
|
punpcklwd xmm0,xmm2
|
|
|
|
punpckhwd xmm4,xmm2
|
|
|
|
punpcklwd xmm6,xmm7
|
|
|
|
punpckhwd xmm5,xmm7
|
|
|
|
movdqa xmm1,xmm0
|
|
|
|
movdqa xmm2,xmm4
|
|
|
|
punpckldq xmm0,xmm6
|
|
|
|
punpckhdq xmm1,xmm6
|
|
|
|
punpckldq xmm4,xmm5
|
|
|
|
punpckhdq xmm2,xmm5
|
|
|
|
movdqa xmm5,xmm0
|
|
|
|
movdqa xmm6,xmm1
|
|
|
|
punpcklqdq xmm0,xmm4
|
|
|
|
punpckhqdq xmm5,xmm4
|
|
|
|
punpcklqdq xmm1,xmm2
|
|
|
|
punpckhqdq xmm6,xmm2
|
|
|
|
mov edi,dword [esp+10h]
|
|
|
|
movdqa [edi],xmm0
|
|
|
|
movdqa [edi+10h],xmm5
|
|
|
|
movdqa [edi+20h],xmm1
|
|
|
|
movdqa [edi+30h],xmm6
|
|
|
|
movsx ecx,word [ebp+14h]
|
|
|
|
movsx edx,word [ebp+18h]
|
|
|
|
movdqa xmm6,[esp+80h]
|
|
|
|
movdqa xmm4,[esp+90h]
|
|
|
|
movdqa xmm5,[esp+0A0h]
|
|
|
|
movdqa xmm7,[esp+0B0h]
|
|
|
|
pxor xmm0,xmm0
|
|
|
|
movd xmm1,ecx
|
|
|
|
movdqa xmm2,xmm1
|
|
|
|
punpcklwd xmm2,xmm1
|
|
|
|
pshufd xmm1,xmm2,0
|
|
|
|
movd xmm2,edx
|
|
|
|
movdqa xmm3,xmm2
|
|
|
|
punpcklwd xmm3,xmm2
|
|
|
|
pshufd xmm2,xmm3,0
|
|
|
|
movdqa xmm3,xmm6
|
|
|
|
punpckhbw xmm6,xmm0
|
|
|
|
movdqa [esp+60h],xmm6
|
|
|
|
movdqa xmm6,[esp+90h]
|
|
|
|
punpckhbw xmm6,xmm0
|
|
|
|
movdqa [esp+30h],xmm6
|
|
|
|
movdqa xmm6,[esp+0A0h]
|
|
|
|
punpckhbw xmm6,xmm0
|
|
|
|
movdqa [esp+40h],xmm6
|
|
|
|
movdqa xmm6,[esp+0B0h]
|
|
|
|
punpckhbw xmm6,xmm0
|
|
|
|
movdqa [esp+70h],xmm6
|
|
|
|
punpcklbw xmm7,xmm0
|
|
|
|
punpcklbw xmm4,xmm0
|
|
|
|
punpcklbw xmm5,xmm0
|
|
|
|
punpcklbw xmm3,xmm0
|
|
|
|
movdqa [esp+50h],xmm7
|
|
|
|
movdqa xmm6,xmm4
|
|
|
|
psubw xmm6,xmm5
|
|
|
|
pabsw xmm6,xmm6
|
|
|
|
movdqa xmm0,xmm1
|
|
|
|
pcmpgtw xmm0,xmm6
|
|
|
|
movdqa xmm6,xmm3
|
|
|
|
psubw xmm6,xmm4
|
|
|
|
pabsw xmm6,xmm6
|
|
|
|
movdqa xmm7,xmm2
|
|
|
|
pcmpgtw xmm7,xmm6
|
|
|
|
movdqa xmm6,[esp+50h]
|
|
|
|
psubw xmm6,xmm5
|
|
|
|
pabsw xmm6,xmm6
|
|
|
|
pand xmm0,xmm7
|
|
|
|
movdqa xmm7,xmm2
|
|
|
|
pcmpgtw xmm7,xmm6
|
|
|
|
movdqa xmm6,[esp+30h]
|
|
|
|
psubw xmm6,[esp+40h]
|
|
|
|
pabsw xmm6,xmm6
|
|
|
|
pcmpgtw xmm1,xmm6
|
|
|
|
movdqa xmm6,[esp+60h]
|
|
|
|
psubw xmm6,[esp+30h]
|
|
|
|
pabsw xmm6,xmm6
|
|
|
|
pand xmm0,xmm7
|
|
|
|
movdqa xmm7,xmm2
|
|
|
|
pcmpgtw xmm7,xmm6
|
|
|
|
movdqa xmm6,[esp+70h]
|
|
|
|
psubw xmm6,[esp+40h]
|
|
|
|
pabsw xmm6,xmm6
|
|
|
|
pand xmm1,xmm7
|
|
|
|
pcmpgtw xmm2,xmm6
|
|
|
|
pand xmm1,xmm2
|
|
|
|
mov eax,2
|
|
|
|
movsx ecx,ax
|
|
|
|
movd xmm2,ecx
|
|
|
|
movdqa xmm6,xmm2
|
|
|
|
punpcklwd xmm6,xmm2
|
|
|
|
pshufd xmm2,xmm6,0
|
|
|
|
movdqa [esp+20h],xmm2
|
|
|
|
movdqa xmm2,xmm3
|
|
|
|
paddw xmm2,xmm3
|
|
|
|
paddw xmm2,xmm4
|
|
|
|
paddw xmm2,[esp+50h]
|
|
|
|
paddw xmm2,[esp+20h]
|
|
|
|
psraw xmm2,2
|
|
|
|
movdqa xmm6,xmm0
|
|
|
|
pand xmm6,xmm2
|
|
|
|
movdqa xmm2,xmm0
|
|
|
|
pandn xmm2,xmm4
|
|
|
|
por xmm6,xmm2
|
|
|
|
movdqa xmm2,[esp+60h]
|
|
|
|
movdqa xmm7,xmm2
|
|
|
|
paddw xmm7,xmm2
|
|
|
|
paddw xmm7,[esp+30h]
|
|
|
|
paddw xmm7,[esp+70h]
|
|
|
|
paddw xmm7,[esp+20h]
|
|
|
|
movdqa xmm4,xmm1
|
|
|
|
movdqa xmm2,xmm1
|
|
|
|
pandn xmm2,[esp+30h]
|
|
|
|
psraw xmm7,2
|
|
|
|
pand xmm4,xmm7
|
|
|
|
por xmm4,xmm2
|
|
|
|
movdqa xmm2,[esp+50h]
|
|
|
|
packuswb xmm6,xmm4
|
|
|
|
movdqa [esp+90h],xmm6
|
|
|
|
movdqa xmm6,xmm2
|
|
|
|
paddw xmm6,xmm2
|
|
|
|
movdqa xmm2,[esp+20h]
|
|
|
|
paddw xmm6,xmm5
|
|
|
|
paddw xmm6,xmm3
|
|
|
|
movdqa xmm4,xmm0
|
|
|
|
pandn xmm0,xmm5
|
|
|
|
paddw xmm6,xmm2
|
|
|
|
psraw xmm6,2
|
|
|
|
pand xmm4,xmm6
|
|
|
|
por xmm4,xmm0
|
|
|
|
movdqa xmm0,[esp+70h]
|
|
|
|
movdqa xmm5,xmm0
|
|
|
|
paddw xmm5,xmm0
|
|
|
|
movdqa xmm0,[esp+40h]
|
|
|
|
paddw xmm5,xmm0
|
|
|
|
paddw xmm5,[esp+60h]
|
|
|
|
movdqa xmm3,xmm1
|
|
|
|
paddw xmm5,xmm2
|
|
|
|
psraw xmm5,2
|
|
|
|
pand xmm3,xmm5
|
|
|
|
pandn xmm1,xmm0
|
|
|
|
por xmm3,xmm1
|
|
|
|
packuswb xmm4,xmm3
|
|
|
|
movdqa [esp+0A0h],xmm4
|
|
|
|
mov esi,dword [esp+10h]
|
|
|
|
movdqa xmm0,[esi]
|
|
|
|
movdqa xmm1,[esi+10h]
|
|
|
|
movdqa xmm2,[esi+20h]
|
|
|
|
movdqa xmm3,[esi+30h]
|
|
|
|
movdqa xmm6,xmm0
|
|
|
|
punpcklbw xmm0,xmm1
|
|
|
|
punpckhbw xmm6,xmm1
|
|
|
|
movdqa xmm7,xmm2
|
|
|
|
punpcklbw xmm2,xmm3
|
|
|
|
punpckhbw xmm7,xmm3
|
|
|
|
movdqa xmm4,xmm0
|
|
|
|
movdqa xmm5,xmm6
|
|
|
|
punpcklwd xmm0,xmm2
|
|
|
|
punpckhwd xmm4,xmm2
|
|
|
|
punpcklwd xmm6,xmm7
|
|
|
|
punpckhwd xmm5,xmm7
|
|
|
|
movdqa xmm1,xmm0
|
|
|
|
movdqa xmm2,xmm4
|
|
|
|
punpckldq xmm0,xmm6
|
|
|
|
punpckhdq xmm1,xmm6
|
|
|
|
punpckldq xmm4,xmm5
|
|
|
|
punpckhdq xmm2,xmm5
|
|
|
|
movdqa xmm5,xmm0
|
|
|
|
movdqa xmm6,xmm1
|
|
|
|
punpcklqdq xmm0,xmm4
|
|
|
|
punpckhqdq xmm5,xmm4
|
|
|
|
punpcklqdq xmm1,xmm2
|
|
|
|
punpckhqdq xmm6,xmm2
|
|
|
|
mov esi,dword [esp+1Ch]
|
|
|
|
mov ecx,dword [ebp+10h]
|
|
|
|
mov edx,dword [esp+14h]
|
|
|
|
mov edi,dword [esp+8]
|
|
|
|
movd dword [esi],xmm0
|
|
|
|
movd dword [esi+ecx],xmm5
|
|
|
|
movd dword [esi+ecx*2],xmm1
|
|
|
|
movd dword [esi+edx],xmm6
|
|
|
|
psrldq xmm0,4
|
|
|
|
psrldq xmm5,4
|
|
|
|
psrldq xmm1,4
|
|
|
|
psrldq xmm6,4
|
|
|
|
mov esi,dword [esp+18h]
|
|
|
|
movd dword [edi],xmm0
|
|
|
|
movd dword [edi+ecx],xmm5
|
|
|
|
movd dword [edi+ecx*2],xmm1
|
|
|
|
movd dword [edi+edx],xmm6
|
|
|
|
psrldq xmm0,4
|
|
|
|
psrldq xmm5,4
|
|
|
|
psrldq xmm1,4
|
|
|
|
psrldq xmm6,4
|
|
|
|
movd dword [esi],xmm0
|
|
|
|
movd dword [esi+ecx],xmm5
|
|
|
|
movd dword [esi+ecx*2],xmm1
|
|
|
|
movd dword [esi+edx],xmm6
|
|
|
|
psrldq xmm0,4
|
|
|
|
psrldq xmm5,4
|
|
|
|
psrldq xmm1,4
|
|
|
|
psrldq xmm6,4
|
|
|
|
mov edi,dword [esp+0Ch]
|
|
|
|
movd dword [edi],xmm0
|
|
|
|
movd dword [edi+ecx],xmm5
|
|
|
|
movd dword [edi+ecx*2],xmm1
|
|
|
|
movd dword [edi+edx],xmm6
|
|
|
|
pop edi
|
|
|
|
pop esi
|
|
|
|
mov esp,ebp
|
|
|
|
pop ebp
|
|
|
|
ret

%endif


;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;********************************************************************************

WELS_EXTERN DeblockLumaTransposeH2V_sse2
|
2014-01-05 14:16:22 +02:00
|
|
|
push r3
|
|
|
|
push r4
|
2014-01-03 14:49:45 +08:00
|
|
|
push r5
|
|
|
|
|
2014-01-05 14:16:22 +02:00
|
|
|
%assign push_num 3
|
|
|
|
LOAD_3_PARA
PUSH_XMM 8
|
2014-01-03 14:49:45 +08:00
|
|
|
|
2014-03-14 10:13:18 +02:00
|
|
|
SIGN_EXTENSION r1, r1d
|
2014-01-03 14:49:45 +08:00
|
|
|
|
2014-01-05 14:16:22 +02:00
|
|
|
mov r5, r7
|
2014-01-03 14:49:45 +08:00
|
|
|
mov r3, r7
|
|
|
|
and r3, 0Fh
|
|
|
|
sub r7, r3
|
|
|
|
sub r7, 10h
|
|
|
|
|
|
|
|
lea r3, [r0 + r1 * 8]
|
|
|
|
lea r4, [r1 * 3]
|
|
|
|
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movq xmm7, [r3]
|
|
|
|
punpcklqdq xmm0, xmm7
|
|
|
|
movq xmm1, [r0 + r1]
|
|
|
|
movq xmm7, [r3 + r1]
|
|
|
|
punpcklqdq xmm1, xmm7
|
|
|
|
movq xmm2, [r0 + r1*2]
|
|
|
|
movq xmm7, [r3 + r1*2]
|
|
|
|
punpcklqdq xmm2, xmm7
|
|
|
|
movq xmm3, [r0 + r4]
|
|
|
|
movq xmm7, [r3 + r4]
|
|
|
|
punpcklqdq xmm3, xmm7
|
|
|
|
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
lea r3, [r3 + r1 * 4]
|
|
|
|
movq xmm4, [r0]
|
|
|
|
movq xmm7, [r3]
|
|
|
|
punpcklqdq xmm4, xmm7
|
|
|
|
movq xmm5, [r0 + r1]
|
|
|
|
movq xmm7, [r3 + r1]
|
|
|
|
punpcklqdq xmm5, xmm7
|
|
|
|
movq xmm6, [r0 + r1*2]
|
|
|
|
movq xmm7, [r3 + r1*2]
|
|
|
|
punpcklqdq xmm6, xmm7
|
|
|
|
|
|
|
|
movdqa [r7], xmm0
|
|
|
|
movq xmm7, [r0 + r4]
|
|
|
|
movq xmm0, [r3 + r4]
|
|
|
|
punpcklqdq xmm7, xmm0
|
|
|
|
movdqa xmm0, [r7]
|
|
|
|
|
|
|
|
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
|
|
|
|
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
|
2014-01-05 14:16:22 +02:00
|
|
|
|
2014-01-03 14:49:45 +08:00
|
|
|
movdqa [r2], xmm4
|
|
|
|
movdqa [r2 + 10h], xmm2
|
|
|
|
movdqa [r2 + 20h], xmm3
|
|
|
|
movdqa [r2 + 30h], xmm7
|
|
|
|
movdqa [r2 + 40h], xmm5
|
|
|
|
movdqa [r2 + 50h], xmm1
|
|
|
|
movdqa [r2 + 60h], xmm6
|
|
|
|
movdqa [r2 + 70h], xmm0
|
|
|
|
|
|
|
|
mov r7, r5
POP_XMM
|
2014-01-03 14:49:45 +08:00
|
|
|
pop r5
|
|
|
|
pop r4
|
|
|
|
pop r3
|
|
|
|
ret


;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;*******************************************************************************************

WELS_EXTERN DeblockLumaTransposeV2H_sse2
|
2014-01-03 14:49:45 +08:00
|
|
|
push r3
|
2014-01-05 14:16:22 +02:00
|
|
|
push r4
|
2014-01-03 14:49:45 +08:00
|
|
|
|
|
|
|
%assign push_num 2
|
|
|
|
LOAD_3_PARA
PUSH_XMM 8
|
2014-01-03 14:49:45 +08:00
|
|
|
|
2014-03-14 10:13:18 +02:00
|
|
|
SIGN_EXTENSION r1, r1d
|
2014-01-03 14:49:45 +08:00
|
|
|
|
|
|
|
mov r4, r7
|
2014-01-05 14:16:22 +02:00
|
|
|
mov r3, r7
|
2014-01-03 14:49:45 +08:00
|
|
|
and r3, 0Fh
|
2014-01-05 14:16:22 +02:00
|
|
|
sub r7, r3
|
2014-01-03 14:49:45 +08:00
|
|
|
sub r7, 10h
|
|
|
|
|
|
|
|
movdqa xmm0, [r2]
|
|
|
|
movdqa xmm1, [r2 + 10h]
|
|
|
|
movdqa xmm2, [r2 + 20h]
|
|
|
|
movdqa xmm3, [r2 + 30h]
|
|
|
|
movdqa xmm4, [r2 + 40h]
|
|
|
|
movdqa xmm5, [r2 + 50h]
|
|
|
|
movdqa xmm6, [r2 + 60h]
|
|
|
|
movdqa xmm7, [r2 + 70h]
|
|
|
|
|
|
|
|
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
|
|
|
|
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
|
|
|
|
|
|
|
|
lea r2, [r1 * 3]
|
|
|
|
|
|
|
|
movq [r0], xmm4
|
|
|
|
movq [r0 + r1], xmm2
|
|
|
|
movq [r0 + r1*2], xmm3
|
|
|
|
movq [r0 + r2], xmm7
|
|
|
|
|
|
|
|
lea r0, [r0 + r1*4]
|
|
|
|
movq [r0], xmm5
|
|
|
|
movq [r0 + r1], xmm1
|
|
|
|
movq [r0 + r1*2], xmm6
|
|
|
|
movq [r0 + r2], xmm0
|
|
|
|
|
|
|
|
psrldq xmm4, 8
|
|
|
|
psrldq xmm2, 8
|
|
|
|
psrldq xmm3, 8
|
|
|
|
psrldq xmm7, 8
|
|
|
|
psrldq xmm5, 8
|
|
|
|
psrldq xmm1, 8
|
|
|
|
psrldq xmm6, 8
|
|
|
|
psrldq xmm0, 8
|
|
|
|
|
|
|
|
lea r0, [r0 + r1*4]
|
|
|
|
movq [r0], xmm4
|
|
|
|
movq [r0 + r1], xmm2
|
|
|
|
movq [r0 + r1*2], xmm3
|
|
|
|
movq [r0 + r2], xmm7
|
|
|
|
|
|
|
|
lea r0, [r0 + r1*4]
|
|
|
|
movq [r0], xmm5
|
|
|
|
movq [r0 + r1], xmm1
|
|
|
|
movq [r0 + r1*2], xmm6
|
|
|
|
movq [r0 + r2], xmm0
|
|
|
|
|
|
|
|
|
|
|
|
mov r7, r4
POP_XMM
|
2014-01-03 14:49:45 +08:00
|
|
|
pop r4
|
|
|
|
pop r3
|
|
|
|
ret
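
; Comment-only note on the function below: it clamps each of the 24 per-block
; non-zero-coefficient counts in the array passed via the single pointer
; argument to at most 1, i.e. per byte
;     p[i] = min(p[i], 1)
; done as 16 bytes via movdqu plus 8 bytes via movq. The name pNonZeroCount
; used here for that argument is an assumption; the routine itself only sees r0.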

WELS_EXTERN WelsNonZeroCount_sse2
    %assign push_num 0
    LOAD_1_PARA
    movdqu   xmm0, [r0]
    movq     xmm1, [r0+16]
    WELS_DB1 xmm2
    pminub   xmm0, xmm2
    pminub   xmm1, xmm2
    movdqu   [r0], xmm0
    movq     [r0+16], xmm1
    ret