;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* deblock.asm
;*
;* Abstract
;*     edge loop
;*
;* History
;*     08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

SECTION .rodata align=16

ALIGN 16
FOUR_16B_SSE2: dw 4, 4, 4, 4, 4, 4, 4, 4

ALIGN 16
WELS_DB1_16:
times 16 db 1
WELS_DB127_16:
times 16 db 127
WELS_DB96_16:
times 16 db 96
WELS_SHUFB0000111122223333:
times 4 db 0
times 4 db 1
times 4 db 2
times 4 db 3


SECTION .text

; Unsigned byte absolute difference.
; a=%1 b=%2 clobber=%3
; Subtract once in each direction with saturation and return the maximum.
%macro SSE2_AbsDiffUB 3
movdqa %3, %2
psubusb %3, %1
psubusb %1, %2
por %1, %3
%endmacro
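
; Scalar reference model of SSE2_AbsDiffUB (an illustrative sketch, not part
; of the build):
;
;   static inline uint8_t AbsDiffUB (uint8_t a, uint8_t b) {
;     uint8_t d0 = b > a ? b - a : 0; // movdqa + psubusb: sat (b - a)
;     uint8_t d1 = a > b ? a - b : 0; // psubusb:          sat (a - b)
;     return d1 | d0;                 // por: one side is 0, so this is |a - b|
;   }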

; Unsigned byte compare less than.
; lhs=%1 rhs^0x7f=%2 0x7f=%3
; There is no unsigned byte lt/gt compare instruction, so xor by 0x7f and use
; a signed compare. Other options exist; this one allows modifying the lhs
; without a mov and uses a bitwise op, which can execute on most ports on
; common architectures.
%macro SSE2_CmpltUB 3
pxor %1, %3
pcmpgtb %1, %2
%endmacro
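
; Scalar model of SSE2_CmpltUB (illustrative sketch; rhs arrives pre-xored
; with 0x7f, as in the callers below):
;
;   static inline uint8_t CmpltUB (uint8_t lhs, uint8_t rhs_x7f) {
;     int8_t a = (int8_t) (lhs ^ 0x7f);           // pxor %1, %3
;     return a > (int8_t) rhs_x7f ? 0xff : 0x00;  // pcmpgtb %1, %2
;   }
;
; x ^ 0x7f equals ~x ^ 0x80: complementing reverses the unsigned order and
; flipping the sign bit turns an unsigned compare into a signed one, so the
; signed greater-than above yields the unsigned less-than of the raw values.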

; Unsigned byte compare greater than or equal.
%macro SSE2_CmpgeUB 2
pminub %1, %2
pcmpeqb %1, %2
%endmacro
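
; Scalar model of SSE2_CmpgeUB (illustrative sketch): min(a, b) == b holds
; exactly when a >= b.
;
;   static inline uint8_t CmpgeUB (uint8_t a, uint8_t b) {
;     uint8_t m = a < b ? a : b;    // pminub %1, %2
;     return m == b ? 0xff : 0x00;  // pcmpeqb %1, %2
;   }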

; Clip unsigned bytes to ref +/- diff.
; data=%1 ref=%2 maxdiff_from_ref=%3 clobber=%4
%macro SSE2_ClipUB 4
movdqa %4, %2
psubusb %4, %3
paddusb %3, %2
pmaxub %1, %4
pminub %1, %3
%endmacro
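
; Scalar model of SSE2_ClipUB (illustrative sketch), clamping data to
; [ref - maxdiff, ref + maxdiff] with the bounds saturated to [0, 255]:
;
;   static inline uint8_t ClipUB (uint8_t x, uint8_t ref, uint8_t maxdiff) {
;     uint8_t lo = ref > maxdiff ? ref - maxdiff : 0;          // psubusb %4, %3
;     uint8_t hi = ref + maxdiff > 255 ? 255 : ref + maxdiff;  // paddusb %3, %2
;     x = x > lo ? x : lo;                                     // pmaxub %1, %4
;     return x < hi ? x : hi;                                  // pminub %1, %3
;   }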

; (a + b + 1 - c) >> 1
; a=%1 b=%2 c=%3 [out:a^b&c]=%4
%macro SSE2_AvgbFloor1 4
movdqa %4, %1
pxor %4, %2
pavgb %1, %2
pand %4, %3
psubb %1, %4
%endmacro
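
; Scalar model of SSE2_AvgbFloor1 for the common case where %3 is the 0x01
; byte mask (illustrative sketch): pavgb rounds up, and subtracting the
; masked carry bit turns it into a floored average.
;
;   static inline uint8_t AvgbFloor1 (uint8_t a, uint8_t b) {
;     uint8_t avg   = (uint8_t) ((a + b + 1) >> 1); // pavgb %1, %2
;     uint8_t carry = (a ^ b) & 1;                  // movdqa + pxor, pand
;     return avg - carry;                           // psubb %1, %4
;   }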

; (a + b + carry) >> 1
; a=%1 b=%2 carry-1=%3
%macro SSE2_AvgbFloor2 3
pxor %1, %3
pxor %2, %3
pavgb %1, %2
pxor %1, %3
%endmacro
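
; Scalar model of SSE2_AvgbFloor2 (illustrative sketch): with %3 == 0x00 this
; is a plain rounding pavgb, (a + b + 1) >> 1; with %3 == 0xff the complements
; are averaged and complemented back, which floors instead: (a + b) >> 1.
;
;   static inline uint8_t AvgbFloor2 (uint8_t a, uint8_t b, uint8_t m) {
;     uint8_t avg = (uint8_t) (((a ^ m) + (b ^ m) + 1) >> 1); // pxor, pxor, pavgb
;     return avg ^ m;                                         // pxor %1, %3
;   }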

; a = (a & m) | (b & ~m)
; a=%1 b=%2 m=%3
%macro SSE2_Blend 3
pand %1, %3
pandn %3, %2
por %1, %3
%endmacro
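
; Scalar model of SSE2_Blend (illustrative sketch); note that the mask
; operand %3 is clobbered by the pandn:
;
;   static inline uint8_t Blend (uint8_t a, uint8_t b, uint8_t m) {
;     return (a & m) | (b & ~m);  // pand, pandn, por
;   }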

; Compute
; p0 = clip(p0 + clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
; q0 = clip(q0 - clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
; 16-wide parallel in packed byte representation in xmm registers.
;
; p1=%1 p0=%2 q0=%3 q1=%4 iTc=%5 FFh=%6 xmmclobber=%7,%8
%macro SSE2_DeblockP0Q0_Lt4 8
; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
; Bias so that unsigned saturation can be used.
; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
; q0 - p0 is split into a non-negative and a non-positive part. The latter is
; subtracted from the biased value.
movdqa %7, %2
psubusb %7, %3 ; clip(p0 - q0, 0, 255)
; ((p1 - q1) >> 2) + 0xc0
pxor %4, %6 ; q1 ^ 0xff aka -q1 - 1 & 0xff
pavgb %1, %4 ; (((p1 - q1 + 0x100) >> 1)
pavgb %1, %6 ; + 0x100) >> 1
psubusb %1, %7 ; -= clip(p0 - q0, 0, 255) with saturation.
movdqa %8, %3
psubusb %8, %2 ; (clip(q0 - p0, 0, 255)
pavgb %8, %1 ; + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1

; Unbias and split into a non-negative and a non-positive part.
; Clip each part to iTc via pminub.
; Add/subtract each part to/from p0/q0 and clip.
movdqa %6, [WELS_DB96_16]
psubusb %6, %8
psubusb %8, [WELS_DB96_16]
pminub %6, %5
pminub %8, %5
psubusb %2, %6
paddusb %2, %8 ; p0
paddusb %3, %6
psubusb %3, %8 ; q0
%endmacro
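
; Scalar model of the update above (illustrative sketch):
;
;   static int clip3 (int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }
;
;   int d  = (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1;
;   int dc = clip3 (d, -iTc, iTc);
;   p0 = clip3 (p0 + dc, 0, 255);
;   q0 = clip3 (q0 - dc, 0, 255);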


;*******************************************************************************
; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                            int32_t iBeta, int8_t * pTC)
;*******************************************************************************

WELS_EXTERN DeblockLumaLt4V_ssse3
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
movd xmm1, arg3d
movd xmm2, arg4d
pxor xmm3, xmm3
pxor xmm1, [WELS_DB127_16]
pxor xmm2, [WELS_DB127_16]
pshufb xmm1, xmm3 ; iAlpha ^ 0x7f
pshufb xmm2, xmm3 ; iBeta ^ 0x7f
mov r2, r1 ; iStride
neg r1 ; -iStride
lea r3, [r0 + r1] ; pPix - iStride

; Compute masks to enable/disable deblocking.
MOVDQ xmm6, [r3 + 0 * r1] ; p0
MOVDQ xmm7, [r3 + 1 * r1] ; p1
MOVDQ xmm0, [r0 + 0 * r2] ; q0
movdqa xmm4, xmm6
SSE2_AbsDiffUB xmm6, xmm0, xmm3 ; |p0 - q0|
SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
MOVDQ xmm1, [r0 + 1 * r2] ; q1
SSE2_AbsDiffUB xmm7, xmm4, xmm3 ; |p1 - p0|
SSE2_AbsDiffUB xmm0, xmm1, xmm3 ; |q1 - q0|
pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|)
SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
pand xmm6, xmm7 ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
MOVDQ xmm7, [r3 + 2 * r1] ; p2
movdqa xmm0, xmm7
SSE2_AbsDiffUB xmm7, xmm4, xmm3 ; |p2 - p0|
SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP2P0 = |p2 - p0| < iBeta
MOVDQ xmm5, [r0 + 2 * r2] ; q2
MOVDQ xmm3, [r0 + 0 * r2] ; q0
movdqa xmm1, xmm5
SSE2_AbsDiffUB xmm5, xmm3, xmm4 ; |q2 - q0|
SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16] ; bDeltaQ2Q0 = |q2 - q0| < iBeta

pavgb xmm3, [r3 + 0 * r1] ; (p0 + q0 + 1) >> 1
pcmpeqw xmm2, xmm2 ; FFh
pxor xmm3, xmm2 ; complement, so the pavgbs below floor instead of round
; (p2 + ((p0 + q0 + 1) >> 1)) >> 1
pxor xmm0, xmm2
pavgb xmm0, xmm3
pxor xmm0, xmm2
; (q2 + ((p0 + q0 + 1) >> 1)) >> 1
pxor xmm1, xmm2
pavgb xmm1, xmm3
pxor xmm1, xmm2

movd xmm3, [r4]
pshufb xmm3, [WELS_SHUFB0000111122223333] ; iTc
movdqa xmm4, xmm3 ; iTc0 = iTc
pcmpgtb xmm3, xmm2 ; iTc > -1 ? 0xff : 0x00
pand xmm6, xmm3 ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
movdqa xmm3, xmm4
psubb xmm3, xmm7 ; iTc -= bDeltaP2P0 ? -1 : 0
psubb xmm3, xmm5 ; iTc -= bDeltaQ2Q0 ? -1 : 0
pand xmm3, xmm6 ; iTc &= bDeltaP0Q0P1P0Q1Q0 ? 0xff : 0
pand xmm7, xmm6 ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
pand xmm5, xmm6 ; bDeltaQ2Q0 &= bDeltaP0Q0P1P0Q1Q0
pand xmm7, xmm4 ; iTc0 & (bDeltaP2P0 ? 0xff : 0)
pand xmm5, xmm4 ; iTc0 & (bDeltaQ2Q0 ? 0xff : 0)

MOVDQ xmm4, [r3 + 1 * r1]
SSE2_ClipUB xmm0, xmm4, xmm7, xmm6 ; clip p1.
MOVDQ xmm6, [r0 + 1 * r2]
MOVDQ [r3 + 1 * r1], xmm0 ; store p1.
SSE2_ClipUB xmm1, xmm6, xmm5, xmm7 ; clip q1.
MOVDQ [r0 + 1 * r2], xmm1 ; store q1.

MOVDQ xmm1, [r3 + 0 * r1] ; p0
MOVDQ xmm0, [r0 + 0 * r2] ; q0
SSE2_DeblockP0Q0_Lt4 xmm4, xmm1, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7
MOVDQ [r3 + 0 * r1], xmm1 ; store p0.
MOVDQ [r0 + 0 * r2], xmm0 ; store q0.

POP_XMM
LOAD_5_PARA_POP
ret


; Deblock 3x16 luma pixels for the eq4 case.
;
; Compose 8-bit averages from pavgbs. I.e. (p1 + p0 + p2 + q0 + 2) >> 2 can be
; written as (((p1 + p0) >> 1) + ((p2 + q0 + (p1 ^ p0 & 1)) >> 1) + 1) >> 1,
; which maps to 3 pavgbs.
;
; pPix=%1 iStride=%2 [in:q0,out:p0]=%3 [in:q1,out:p1]=%4 bDeltaP0Q0P1P0Q1Q0=%5 bDeltaP2P0=%6 clobber=%7,%8,%9,%10 preserve_p0p1=%11 db1=%12
%macro SSE2_DeblockLumaEq4_3x16P 12
movdqa %7, %3
movdqa %8, %6
MOVDQ %10, [%1 + 1 * %2] ; p1
SSE2_Blend %7, %10, %8 ; t0 = bDeltaP2P0 ? q0 : p1
movdqa %8, %6
MOVDQ %9, [%1 + 2 * %2] ; p2
SSE2_Blend %9, %4, %8 ; t1 = bDeltaP2P0 ? p2 : q1
SSE2_AvgbFloor1 %4, %9, %12, %8 ; t1 = (t1 + q1) >> 1
SSE2_AvgbFloor1 %10, [%1], %12, %8 ; (p0 + p1) >> 1, p0 ^ p1
pxor %8, %12
SSE2_AvgbFloor1 %7, %4, %8, %9 ; (t0 + t1 + (p0 ^ p1 & 1)) >> 1
MOVDQ %9, [%1 + 2 * %2] ; p2
SSE2_AvgbFloor1 %3, %9, %8, %4 ; (p2 + q0 + (p0 ^ p1 & 1)) >> 1
pavgb %7, %10 ; p0' = (p0 + p1 + t0 + t1 + 2) >> 2
movdqa %8, %10
pxor %8, %3 ; (p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1
pand %8, %12 ; & 1
pavgb %10, %3 ; p1' = (p0 + p1 + p2 + q0 + 2) >> 2
pand %6, %5 ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
%if %11
MOVDQ %3, [%1 + 0 * %2] ; p0
movdqa %4, %5
SSE2_Blend %7, %3, %4 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%else
SSE2_Blend %7, [%1 + 0 * %2], %5 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%endif
MOVDQ [%1 + 0 * %2], %7 ; store p0
add %1, %2
movdqa %7, %10
psubb %10, %8 ; (p0 + p1 + p2 + q0) >> 2
psubb %8, %12
MOVDQ %4, [%1 + (3 - 1) * %2] ; p3
SSE2_AvgbFloor2 %4, %9, %8 ; (p2 + p3 + ((p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 & 1)) >> 1
pavgb %10, %4 ; p2' = (((p0 + p1 + p2 + q0) >> 1) + p2 + p3 + 2) >> 2
movdqa %8, %6
SSE2_Blend %10, [%1 + (2 - 1) * %2], %8 ; p2out = bDeltaP2P0 ? p2' : p2
MOVDQ [%1 + (2 - 1) * %2], %10 ; store p2
%if %11
MOVDQ %4, [%1 + (1 - 1) * %2] ; p1
SSE2_Blend %7, %4, %6 ; p1out = bDeltaP2P0 ? p1' : p1
%else
SSE2_Blend %7, [%1 + (1 - 1) * %2], %6 ; p1out = bDeltaP2P0 ? p1' : p1
%endif
MOVDQ [%1 + (1 - 1) * %2], %7 ; store p1
%endmacro
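
; For reference (an equivalence sketch, not asserted by the code): where
; bDeltaP2P0 holds, the composition above yields the usual strong-filter values
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
; and where it does not hold, p1/p2 are left unchanged and p0' falls back to
; (2*p1 + p0 + q1 + 2) >> 2.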


;*******************************************************************************
; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                            int32_t iBeta)
;*******************************************************************************

WELS_EXTERN DeblockLumaEq4V_ssse3
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 10
SIGN_EXTENSION r1, r1d
movd xmm1, arg3d
movd xmm2, arg4d
shr r2, 2
add r2, 1
movd xmm3, r2d
pxor xmm4, xmm4
pxor xmm1, [WELS_DB127_16]
pxor xmm2, [WELS_DB127_16]
pshufb xmm1, xmm4 ; iAlpha ^ 0x7f
pshufb xmm2, xmm4 ; iBeta ^ 0x7f
pshufb xmm3, xmm4 ; (iAlpha >> 2) + 1
mov r2, r1 ; iStride
neg r1 ; -iStride
lea r3, [r0 + r1] ; pPix - iStride

; Compute masks to enable/disable filtering.
MOVDQ xmm7, [r3 + 1 * r1] ; p1
MOVDQ xmm6, [r3 + 0 * r1] ; p0
MOVDQ xmm0, [r0 + 0 * r2] ; q0
movdqa xmm4, xmm6
SSE2_AbsDiffUB xmm6, xmm0, xmm5 ; |p0 - q0|
SSE2_CmpgeUB xmm3, xmm6 ; |p0 - q0| < (iAlpha >> 2) + 2
SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
MOVDQ xmm1, [r0 + 1 * r2] ; q1
SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p1 - p0|
SSE2_AbsDiffUB xmm0, xmm1, xmm5 ; |q1 - q0|
pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|)
SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
pand xmm6, xmm7 ; & bDeltaP0Q0

MOVDQ xmm7, [r3 + 2 * r1] ; p2
SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p2 - p0|
SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP2P0 = |p2 - p0| < iBeta
pand xmm7, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2

MOVDQ xmm0, [r0 + 0 * r2] ; q0
MOVDQ xmm5, [r0 + 2 * r2] ; q2
SSE2_AbsDiffUB xmm5, xmm0, xmm4 ; |q2 - q0|
SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
pand xmm5, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2

%ifdef X86_32
; Push xmm5 to free up one register. Align the stack so that a failed
; store-forwarding penalty (up to ~50 cycles for 128-bit on IVB) cannot occur.
mov r2, esp
sub esp, 16
and esp, -16
movdqa [esp], xmm5
SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
movdqa xmm5, [esp]
mov esp, r2
neg r1
SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
%else
movdqa xmm9, [WELS_DB1_16]
SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
SSE2_DeblockLumaEq4_3x16P r0, r2, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, xmm9
%endif

POP_XMM
LOAD_4_PARA_POP
ret


; [out:p1,p0,q0,q1]=%1,%2,%3,%4 pPixCb=%5 pPixCr=%6 iStride=%7 3*iStride-1=%8 xmmclobber=%9,%10,%11
%macro SSE2_LoadCbCr_4x16H 11
movd %1, [%5 + 0 * %7 - 2] ; [p1,p0,q0,q1] cb line 0
movd %2, [%5 + 2 * %7 - 2] ; [p1,p0,q0,q1] cb line 2
punpcklbw %1, %2 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 0,2
movd %2, [%5 + 4 * %7 - 2] ; [p1,p0,q0,q1] cb line 4
movd %9, [%5 + 2 * %8] ; [p1,p0,q0,q1] cb line 6
punpcklbw %2, %9 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 4,6
punpcklwd %1, %2 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 0,2,4,6
movd %2, [%6 + 0 * %7 - 2] ; [p1,p0,q0,q1] cr line 0
movd %9, [%6 + 2 * %7 - 2] ; [p1,p0,q0,q1] cr line 2
punpcklbw %2, %9 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 0,2
movd %9, [%6 + 4 * %7 - 2] ; [p1,p0,q0,q1] cr line 4
movd %10, [%6 + 2 * %8] ; [p1,p0,q0,q1] cr line 6
punpcklbw %9, %10 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 4,6
punpcklwd %2, %9 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 0,2,4,6
add %5, %7 ; pPixCb += iStride
add %6, %7 ; pPixCr += iStride
movd %9, [%5 + 0 * %7 - 2] ; [p1,p0,q0,q1] cb line 1
movd %10, [%5 + 2 * %7 - 2] ; [p1,p0,q0,q1] cb line 3
punpcklbw %9, %10 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 1,3
movd %10, [%5 + 4 * %7 - 2] ; [p1,p0,q0,q1] cb line 5
movd %3, [%5 + 2 * %8] ; [p1,p0,q0,q1] cb line 7
punpcklbw %10, %3 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 5,7
punpcklwd %9, %10 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 1,3,5,7
movd %10, [%6 + 0 * %7 - 2] ; [p1,p0,q0,q1] cr line 1
movd %3, [%6 + 2 * %7 - 2] ; [p1,p0,q0,q1] cr line 3
punpcklbw %10, %3 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 1,3
movd %3, [%6 + 4 * %7 - 2] ; [p1,p0,q0,q1] cr line 5
movd %4, [%6 + 2 * %8] ; [p1,p0,q0,q1] cr line 7
punpcklbw %3, %4 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 5,7
punpcklwd %10, %3 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 1,3,5,7
movdqa %3, %1
punpckldq %1, %2 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6
punpckhdq %3, %2 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6
movdqa %11, %9
punpckldq %9, %10 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 1,3,5,7
punpckhdq %11, %10 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 1,3,5,7
movdqa %2, %1
punpcklqdq %1, %9 ; [p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1] cb/cr line 0,2,4,6,1,3,5,7
punpckhqdq %2, %9 ; [p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6,1,3,5,7
movdqa %4, %3
punpcklqdq %3, %11 ; [q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0] cb/cr line 0,2,4,6,1,3,5,7
punpckhqdq %4, %11 ; [q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6,1,3,5,7
%endmacro


; pPixCb+iStride=%1 pPixCr+iStride=%2 iStride=%3 3*iStride-1=%4 p0=%5 q0=%6 rclobber=%7 dwclobber={%8,%9} xmmclobber=%10
%macro SSE2_StoreCbCr_4x16H 10
movdqa %10, %5
punpcklbw %10, %6 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6
punpckhbw %5, %6 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7
mov %7, r7 ; preserve stack pointer
and r7, -16 ; align stack pointer
sub r7, 32 ; allocate stack space
movdqa [r7 ], %10 ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6 on the stack
movdqa [r7 + 16], %5 ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7 on the stack
mov %8, [r7 + 16] ; [p0,q0,p0,q0] cb line 1,3
mov [%1 + 0 * %3 - 1], %9 ; store [p0,q0] cb line 1
shr %8, 16 ; [p0,q0] cb line 3
mov [%1 + 2 * %3 - 1], %9 ; store [p0,q0] cb line 3
mov %8, [r7 + 20] ; [p0,q0,p0,q0] cb line 5,7
mov [%1 + 4 * %3 - 1], %9 ; store [p0,q0] cb line 5
shr %8, 16 ; [p0,q0] cb line 7
mov [%1 + 2 * %4 + 1], %9 ; store [p0,q0] cb line 7
mov %8, [r7 + 24] ; [p0,q0,p0,q0] cr line 1,3
mov [%2 + 0 * %3 - 1], %9 ; store [p0,q0] cr line 1
shr %8, 16 ; [p0,q0] cr line 3
mov [%2 + 2 * %3 - 1], %9 ; store [p0,q0] cr line 3
mov %8, [r7 + 28] ; [p0,q0,p0,q0] cr line 5,7
mov [%2 + 4 * %3 - 1], %9 ; store [p0,q0] cr line 5
shr %8, 16 ; [p0,q0] cr line 7
mov [%2 + 2 * %4 + 1], %9 ; store [p0,q0] cr line 7
sub %1, %3 ; pPixCb -= iStride
sub %2, %3 ; pPixCr -= iStride
mov %8, [r7 ] ; [p0,q0,p0,q0] cb line 0,2
mov [%1 + 0 * %3 - 1], %9 ; store [p0,q0] cb line 0
shr %8, 16 ; [p0,q0] cb line 2
mov [%1 + 2 * %3 - 1], %9 ; store [p0,q0] cb line 2
mov %8, [r7 + 4] ; [p0,q0,p0,q0] cb line 4,6
mov [%1 + 4 * %3 - 1], %9 ; store [p0,q0] cb line 4
shr %8, 16 ; [p0,q0] cb line 6
mov [%1 + 2 * %4 + 1], %9 ; store [p0,q0] cb line 6
mov %8, [r7 + 8] ; [p0,q0,p0,q0] cr line 0,2
mov [%2 + 0 * %3 - 1], %9 ; store [p0,q0] cr line 0
shr %8, 16 ; [p0,q0] cr line 2
mov [%2 + 2 * %3 - 1], %9 ; store [p0,q0] cr line 2
mov %8, [r7 + 12] ; [p0,q0,p0,q0] cr line 4,6
mov [%2 + 4 * %3 - 1], %9 ; store [p0,q0] cr line 4
shr %8, 16 ; [p0,q0] cr line 6
mov [%2 + 2 * %4 + 1], %9 ; store [p0,q0] cr line 6
mov r7, %7 ; restore stack pointer
%endmacro


; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 pTC=%7 xmmclobber=%8,%9,%10 interleaveTC=%11
%macro SSSE3_DeblockChromaLt4 11
movdqa %8, %3
SSE2_AbsDiffUB %8, %2, %9 ; |p0 - q0|
SSE2_CmpgeUB %8, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
movdqa %9, %4
SSE2_AbsDiffUB %9, %3, %5 ; |q1 - q0|
movdqa %10, %1
SSE2_AbsDiffUB %10, %2, %5 ; |p1 - p0|
pmaxub %9, %10 ; max(|q1 - q0|, |p1 - p0|)
pxor %10, %10
movd %5, %6
pshufb %5, %10 ; iBeta
SSE2_CmpgeUB %9, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
por %8, %9 ; | !bDeltaP0Q0
movd %5, [%7]
%if %11
punpckldq %5, %5
punpcklbw %5, %5 ; iTc
%else
pshufd %5, %5, 0 ; iTc
%endif
pcmpeqw %10, %10 ; FFh
movdqa %9, %5
pcmpgtb %9, %10 ; iTc > -1 ? FFh : 00h
pandn %8, %5 ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
pand %8, %9 ; &= (iTc > -1 ? FFh : 00h)
SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
%endmacro
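
; This gates the same p0/q0 update as the luma lt4 path: iTc is zeroed in
; every lane where a delta test fails or pTC is negative, and a zero iTc
; clips the correction to 0, so masked lanes pass through unchanged.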


; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9
%macro SSSE3_DeblockChromaEq4 9
movdqa %7, %3
SSE2_AbsDiffUB %7, %2, %8 ; |p0 - q0|
SSE2_CmpgeUB %7, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
movdqa %8, %4
SSE2_AbsDiffUB %8, %3, %5 ; |q1 - q0|
movdqa %9, %1
SSE2_AbsDiffUB %9, %2, %5 ; |p1 - p0|
pmaxub %8, %9 ; max(|q1 - q0|, |p1 - p0|)
pxor %9, %9
movd %5, %6
pshufb %5, %9 ; iBeta
SSE2_CmpgeUB %8, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
por %7, %8 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
WELS_DB1 %5
movdqa %8, %2
SSE2_AvgbFloor1 %8, %4, %5, %9 ; (p0 + q1) >> 1
pavgb %8, %1 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
movdqa %9, %7
SSE2_Blend %2, %8, %7 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
SSE2_AvgbFloor1 %1, %3, %5, %7 ; (q0 + p1) >> 1
pavgb %1, %4 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
SSE2_Blend %3, %1, %9 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
%endmacro
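
; Scalar model of the filtered values above (illustrative sketch; the pavgb
; composition reproduces these exactly):
;
;   p0' = (2*p1 + p0 + q1 + 2) >> 2;  // pavgb (p1, (p0 + q1) >> 1)
;   q0' = (2*q1 + q0 + p1 + 2) >> 2;  // pavgb (q1, (q0 + p1) >> 1)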


;*******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                              int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4V_ssse3
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
movd xmm7, arg4d
pxor xmm0, xmm0
pshufb xmm7, xmm0 ; iAlpha
mov r3, r2
neg r3 ; -iStride

movq xmm0, [r0 + 0 * r2] ; q0 cb
movhps xmm0, [r1 + 0 * r2] ; q0 cr
movq xmm2, [r0 + 1 * r3] ; p0 cb
movhps xmm2, [r1 + 1 * r3] ; p0 cr
movq xmm1, [r0 + 1 * r2] ; q1 cb
movhps xmm1, [r1 + 1 * r2] ; q1 cr
movq xmm3, [r0 + 2 * r3] ; p1 cb
movhps xmm3, [r1 + 2 * r3] ; p1 cr

%ifidni arg6, r5
SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, arg6, xmm4, xmm5, xmm6, 1
%else
mov r2, arg6
SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, r2, xmm4, xmm5, xmm6, 1
%endif

movlps [r0 + 1 * r3], xmm2 ; store p0 cb
movhps [r1 + 1 * r3], xmm2 ; store p0 cr
movlps [r0 ], xmm0 ; store q0 cb
movhps [r1 ], xmm0 ; store q0 cr

POP_XMM
LOAD_4_PARA_POP
ret


;*******************************************************************************
; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                              int32_t iAlpha, int32_t iBeta)
;*******************************************************************************

WELS_EXTERN DeblockChromaEq4V_ssse3
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
movd xmm7, arg4d
pxor xmm0, xmm0
pshufb xmm7, xmm0 ; iAlpha
mov r3, r2
neg r3 ; -iStride

movq xmm0, [r0 + 0 * r2] ; q0 cb
movhps xmm0, [r1 + 0 * r2] ; q0 cr
movq xmm2, [r0 + 1 * r3] ; p0 cb
movhps xmm2, [r1 + 1 * r3] ; p0 cr
movq xmm1, [r0 + 1 * r2] ; q1 cb
movhps xmm1, [r1 + 1 * r2] ; q1 cr
movq xmm3, [r0 + 2 * r3] ; p1 cb
movhps xmm3, [r1 + 2 * r3] ; p1 cr

SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6

movlps [r0 + 1 * r3], xmm2 ; store p0 cb
movhps [r1 + 1 * r3], xmm2 ; store p0 cr
movlps [r0 + 0 * r2], xmm0 ; store q0 cb
movhps [r1 + 0 * r2], xmm0 ; store q0 cr

POP_XMM
LOAD_4_PARA_POP
ret


;*******************************************************************************
; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                              int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4H_ssse3
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
movd xmm7, arg4d
pxor xmm0, xmm0
pshufb xmm7, xmm0 ; iAlpha
lea r3, [3 * r2 - 1] ; 3 * iStride - 1

SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0

POP_XMM
LOAD_6_PARA_POP
ret


;***************************************************************************
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                              int32_t iAlpha, int32_t iBeta)
;***************************************************************************

WELS_EXTERN DeblockChromaEq4H_ssse3
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
movd xmm7, arg4d
pxor xmm0, xmm0
pshufb xmm7, xmm0 ; iAlpha
lea r3, [3 * r2 - 1] ; 3 * iStride - 1

SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6
%ifdef X86_32
push r4
push r5
SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
pop r5
pop r4
%else
SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
%endif

POP_XMM
LOAD_4_PARA_POP
ret


;********************************************************************************
;
; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;********************************************************************************

WELS_EXTERN DeblockLumaTransposeH2V_sse2
push r3
push r4
push r5

%assign push_num 3
LOAD_3_PARA
PUSH_XMM 8

SIGN_EXTENSION r1, r1d

mov r5, r7
mov r3, r7
and r3, 0Fh
sub r7, r3
sub r7, 10h

lea r3, [r0 + r1 * 8]
lea r4, [r1 * 3]

movq xmm0, [r0]
movq xmm7, [r3]
punpcklqdq xmm0, xmm7
movq xmm1, [r0 + r1]
movq xmm7, [r3 + r1]
punpcklqdq xmm1, xmm7
movq xmm2, [r0 + r1*2]
movq xmm7, [r3 + r1*2]
punpcklqdq xmm2, xmm7
movq xmm3, [r0 + r4]
movq xmm7, [r3 + r4]
punpcklqdq xmm3, xmm7

lea r0, [r0 + r1 * 4]
lea r3, [r3 + r1 * 4]
movq xmm4, [r0]
movq xmm7, [r3]
punpcklqdq xmm4, xmm7
movq xmm5, [r0 + r1]
movq xmm7, [r3 + r1]
punpcklqdq xmm5, xmm7
movq xmm6, [r0 + r1*2]
movq xmm7, [r3 + r1*2]
punpcklqdq xmm6, xmm7

movdqa [r7], xmm0
movq xmm7, [r0 + r4]
movq xmm0, [r3 + r4]
punpcklqdq xmm7, xmm0
movdqa xmm0, [r7]

SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
;pOut: m5, m3, m4, m8, m6, m2, m7, m1

movdqa [r2], xmm4
movdqa [r2 + 10h], xmm2
movdqa [r2 + 20h], xmm3
movdqa [r2 + 30h], xmm7
movdqa [r2 + 40h], xmm5
movdqa [r2 + 50h], xmm1
movdqa [r2 + 60h], xmm6
movdqa [r2 + 70h], xmm0

mov r7, r5
POP_XMM
pop r5
pop r4
pop r3
ret


;*******************************************************************************************
;
; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;*******************************************************************************************

WELS_EXTERN DeblockLumaTransposeV2H_sse2
push r3
push r4

%assign push_num 2
LOAD_3_PARA
PUSH_XMM 8

SIGN_EXTENSION r1, r1d

mov r4, r7
mov r3, r7
and r3, 0Fh
sub r7, r3
sub r7, 10h

movdqa xmm0, [r2]
movdqa xmm1, [r2 + 10h]
movdqa xmm2, [r2 + 20h]
movdqa xmm3, [r2 + 30h]
movdqa xmm4, [r2 + 40h]
movdqa xmm5, [r2 + 50h]
movdqa xmm6, [r2 + 60h]
movdqa xmm7, [r2 + 70h]

SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
;pOut: m5, m3, m4, m8, m6, m2, m7, m1

lea r2, [r1 * 3]

movq [r0], xmm4
movq [r0 + r1], xmm2
movq [r0 + r1*2], xmm3
movq [r0 + r2], xmm7

lea r0, [r0 + r1*4]
movq [r0], xmm5
movq [r0 + r1], xmm1
movq [r0 + r1*2], xmm6
movq [r0 + r2], xmm0

psrldq xmm4, 8
psrldq xmm2, 8
psrldq xmm3, 8
psrldq xmm7, 8
psrldq xmm5, 8
psrldq xmm1, 8
psrldq xmm6, 8
psrldq xmm0, 8

lea r0, [r0 + r1*4]
movq [r0], xmm4
movq [r0 + r1], xmm2
movq [r0 + r1*2], xmm3
movq [r0 + r2], xmm7

lea r0, [r0 + r1*4]
movq [r0], xmm5
movq [r0 + r1], xmm1
movq [r0 + r1*2], xmm6
movq [r0 + r2], xmm0

mov r7, r4
POP_XMM
pop r4
pop r3
ret
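
;*******************************************************************************
; WelsNonZeroCount_sse2
; Clamp each of the 24 count bytes at the address in the first argument to a
; maximum of 1, turning nonzero counts into 0/1 flags.
;*******************************************************************************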
WELS_EXTERN WelsNonZeroCount_sse2
%assign push_num 0
LOAD_1_PARA
movdqu xmm0, [r0]
movq xmm1, [r0+16]
WELS_DB1 xmm2
pminub xmm0, xmm2
pminub xmm1, xmm2
movdqu [r0], xmm0
movq [r0+16], xmm1
ret
|