2185 lines
50 KiB
NASM
2185 lines
50 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* satd_sad.asm
|
|
;*
|
|
;* Abstract
|
|
;* WelsSampleSatd4x4_sse2
|
|
;* WelsSampleSatd8x8_sse2
|
|
;* WelsSampleSatd16x8_sse2
|
|
;* WelsSampleSatd8x16_sse2
|
|
;* WelsSampleSatd16x16_sse2
|
|
;*
|
|
;* WelsSampleSad16x8_sse2
|
|
;* WelsSampleSad16x16_sse2
|
|
;*
|
|
;* History
|
|
;* 8/5/2009 Created
|
|
;* 24/9/2009 modified
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
|
|
%include "asm_inc.asm"
|
|
|
|
;***********************************************************************
|
|
; Data
|
|
;***********************************************************************
|
|
SECTION .rodata align=16
|
|
|
|
align 16
|
|
HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
|
|
align 16
|
|
HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
|
|
align 16
|
|
PDW1: dw 1,1,1,1,1,1,1,1
|
|
align 16
|
|
PDQ2: dw 2,0,0,0,2,0,0,0
|
|
align 16
|
|
HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
|
|
|
|
;***********************************************************************
|
|
; Code
|
|
;***********************************************************************
|
|
SECTION .text
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_wxh_sse2 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
%macro MMX_DW_1_2REG 2
|
|
pxor %1, %1
|
|
pcmpeqw %2, %2
|
|
psubw %1, %2
|
|
%endmacro
|
|
|
|
%macro SSE2_SumWHorizon1 2
|
|
movdqa %2, %1
|
|
psrldq %2, 8
|
|
paddusw %1, %2
|
|
movdqa %2, %1
|
|
psrldq %2, 4
|
|
paddusw %1, %2
|
|
movdqa %2, %1
|
|
psrldq %2, 2
|
|
paddusw %1, %2
|
|
%endmacro
|
|
|
|
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
|
|
SSE2_SumSub %1, %2, %5
|
|
SSE2_SumSub %3, %4, %5
|
|
SSE2_SumSub %2, %4, %5
|
|
SSE2_SumSub %1, %3, %5
|
|
%endmacro
|
|
|
|
%macro SSE2_SumAbs4 7
|
|
WELS_AbsW %1, %3
|
|
WELS_AbsW %2, %3
|
|
WELS_AbsW %4, %6
|
|
WELS_AbsW %5, %6
|
|
paddusw %1, %2
|
|
paddusw %4, %5
|
|
paddusw %7, %1
|
|
paddusw %7, %4
|
|
%endmacro
|
|
|
|
%macro SSE2_SumWHorizon 3
|
|
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
|
|
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
|
|
punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
|
|
movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
|
|
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
|
|
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
|
|
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
|
|
%endmacro
|
|
|
|
%macro SSE2_GetSatd8x8 0
|
|
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
|
|
SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
|
|
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
|
|
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
|
|
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
|
|
SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
|
|
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
|
|
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
|
|
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
|
|
%endmacro
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd4x4_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movd xmm0, [r0]
|
|
movd xmm1, [r0+r1]
|
|
lea r0 , [r0+2*r1]
|
|
movd xmm2, [r0]
|
|
movd xmm3, [r0+r1]
|
|
punpckldq xmm0, xmm2
|
|
punpckldq xmm1, xmm3
|
|
|
|
movd xmm4, [r2]
|
|
movd xmm5, [r2+r3]
|
|
lea r2 , [r2+2*r3]
|
|
movd xmm6, [r2]
|
|
movd xmm7, [r2+r3]
|
|
punpckldq xmm4, xmm6
|
|
punpckldq xmm5, xmm7
|
|
|
|
pxor xmm6, xmm6
|
|
punpcklbw xmm0, xmm6
|
|
punpcklbw xmm1, xmm6
|
|
punpcklbw xmm4, xmm6
|
|
punpcklbw xmm5, xmm6
|
|
|
|
psubw xmm0, xmm4
|
|
psubw xmm1, xmm5
|
|
|
|
movdqa xmm2, xmm0
|
|
paddw xmm0, xmm1
|
|
psubw xmm2, xmm1
|
|
SSE2_XSawp qdq, xmm0, xmm2, xmm3
|
|
|
|
movdqa xmm4, xmm0
|
|
paddw xmm0, xmm3
|
|
psubw xmm4, xmm3
|
|
|
|
movdqa xmm2, xmm0
|
|
punpcklwd xmm0, xmm4
|
|
punpckhwd xmm4, xmm2
|
|
|
|
SSE2_XSawp dq, xmm0, xmm4, xmm3
|
|
SSE2_XSawp qdq, xmm0, xmm3, xmm5
|
|
|
|
movdqa xmm7, xmm0
|
|
paddw xmm0, xmm5
|
|
psubw xmm7, xmm5
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm7, xmm1
|
|
|
|
movdqa xmm2, xmm0
|
|
paddw xmm0, xmm1
|
|
psubw xmm2, xmm1
|
|
|
|
WELS_AbsW xmm0, xmm3
|
|
paddusw xmm6, xmm0
|
|
WELS_AbsW xmm2, xmm4
|
|
paddusw xmm6, xmm2
|
|
SSE2_SumWHorizon1 xmm6, xmm4
|
|
movd retrd, xmm6
|
|
and retrd, 0xffff
|
|
shr retrd, 1
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd8x8_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm6, xmm6
|
|
pxor xmm7, xmm7
|
|
SSE2_GetSatd8x8
|
|
psrlw xmm6, 1
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
movd retrd, xmm6
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd8x16_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm6, xmm6
|
|
pxor xmm7, xmm7
|
|
|
|
SSE2_GetSatd8x8
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSatd8x8
|
|
|
|
psrlw xmm6, 1
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
movd retrd, xmm6
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd16x8_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
push r0
|
|
push r2
|
|
pxor xmm6, xmm6
|
|
pxor xmm7, xmm7
|
|
|
|
SSE2_GetSatd8x8
|
|
|
|
pop r2
|
|
pop r0
|
|
add r0, 8
|
|
add r2, 8
|
|
SSE2_GetSatd8x8
|
|
|
|
psrlw xmm6, 1
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
movd retrd, xmm6
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd16x16_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
push r0
|
|
push r2
|
|
pxor xmm6, xmm6
|
|
pxor xmm7, xmm7
|
|
|
|
SSE2_GetSatd8x8
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSatd8x8
|
|
|
|
pop r2
|
|
pop r0
|
|
add r0, 8
|
|
add r2, 8
|
|
|
|
SSE2_GetSatd8x8
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSatd8x8
|
|
|
|
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
|
|
psrlw xmm6, 1
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
movd retrd, xmm6
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_wxh_sse2 END
|
|
;
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_intra_sse2 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
|
|
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
|
|
pmaddubsw %1, xmm5
|
|
movdqa %2, %1
|
|
pmaddwd %1, xmm7
|
|
pmaddwd %2, xmm6
|
|
movdqa %3, %1
|
|
punpckldq %1, %2
|
|
punpckhdq %2, %3
|
|
movdqa %3, %1
|
|
punpcklqdq %1, %2
|
|
punpckhqdq %3, %2
|
|
paddd xmm4, %1 ;for dc
|
|
paddd xmm4, %3 ;for dc
|
|
packssdw %1, %3
|
|
psllw %1, 2
|
|
%endmacro
|
|
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
|
|
pmaddubsw %1, xmm5
|
|
movdqa %2, %1
|
|
pmaddwd %1, xmm7
|
|
pmaddwd %2, xmm6
|
|
movdqa %3, %1
|
|
punpckldq %1, %2
|
|
punpckhdq %2, %3
|
|
movdqa %3, %1
|
|
punpcklqdq %1, %2
|
|
punpckhqdq %3, %2
|
|
; paddd xmm4, %1 ;for dc
|
|
; paddd xmm4, %3 ;for dc
|
|
movdqa %4, %1
|
|
punpcklqdq %4, %3
|
|
packssdw %1, %3
|
|
psllw %1, 2
|
|
%endmacro
|
|
|
|
%macro SSE41_GetX38x4SatdDec 0
|
|
pxor xmm7, xmm7
|
|
movq xmm0, [eax]
|
|
movq xmm1, [eax+ebx]
|
|
lea eax, [eax+2*ebx]
|
|
movq xmm2, [eax]
|
|
movq xmm3, [eax+ebx]
|
|
lea eax, [eax+2*ebx]
|
|
punpcklbw xmm0, xmm7
|
|
punpcklbw xmm1, xmm7
|
|
punpcklbw xmm2, xmm7
|
|
punpcklbw xmm3, xmm7
|
|
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
|
|
;doesn't need another transpose
|
|
%endmacro
|
|
%macro SSE41_GetX38x4SatdV 2
|
|
pxor xmm0, xmm0
|
|
pinsrw xmm0, word[esi+%2], 0
|
|
pinsrw xmm0, word[esi+%2+8], 4
|
|
psubsw xmm0, xmm7
|
|
pabsw xmm0, xmm0
|
|
paddw xmm4, xmm0
|
|
pxor xmm0, xmm0
|
|
pinsrw xmm0, word[esi+%2+2], 0
|
|
pinsrw xmm0, word[esi+%2+10], 4
|
|
psubsw xmm0, xmm1
|
|
pabsw xmm0, xmm0
|
|
paddw xmm4, xmm0
|
|
pxor xmm0, xmm0
|
|
pinsrw xmm0, word[esi+%2+4], 0
|
|
pinsrw xmm0, word[esi+%2+12], 4
|
|
psubsw xmm0, xmm3
|
|
pabsw xmm0, xmm0
|
|
paddw xmm4, xmm0
|
|
pxor xmm0, xmm0
|
|
pinsrw xmm0, word[esi+%2+6], 0
|
|
pinsrw xmm0, word[esi+%2+14], 4
|
|
psubsw xmm0, xmm2
|
|
pabsw xmm0, xmm0
|
|
paddw xmm4, xmm0
|
|
%endmacro
|
|
%macro SSE41_GetX38x4SatdH 3
|
|
movq xmm0, [esi+%3+8*%1]
|
|
punpcklqdq xmm0, xmm0
|
|
psubsw xmm0, xmm7
|
|
pabsw xmm0, xmm0
|
|
paddw xmm5, xmm0
|
|
pabsw xmm1, xmm1
|
|
pabsw xmm2, xmm2
|
|
pabsw xmm3, xmm3
|
|
paddw xmm2, xmm1;for DC
|
|
paddw xmm2, xmm3;for DC
|
|
paddw xmm5, xmm2
|
|
%endmacro
|
|
%macro SSE41_I16X16GetX38x4SatdDC 0
|
|
pxor xmm0, xmm0
|
|
movq2dq xmm0, mm4
|
|
punpcklqdq xmm0, xmm0
|
|
psubsw xmm0, xmm7
|
|
pabsw xmm0, xmm0
|
|
paddw xmm6, xmm0
|
|
paddw xmm6, xmm2
|
|
%endmacro
|
|
%macro SSE41_ChromaGetX38x4SatdDC 1
|
|
shl %1, 4
|
|
movdqa xmm0, [esi+32+%1]
|
|
psubsw xmm0, xmm7
|
|
pabsw xmm0, xmm0
|
|
paddw xmm6, xmm0
|
|
paddw xmm6, xmm2
|
|
%endmacro
|
|
%macro SSE41_I16x16GetX38x4Satd 2
|
|
SSE41_GetX38x4SatdDec
|
|
SSE41_GetX38x4SatdV %1, %2
|
|
SSE41_GetX38x4SatdH %1, %2, 32
|
|
SSE41_I16X16GetX38x4SatdDC
|
|
%endmacro
|
|
%macro SSE41_ChromaGetX38x4Satd 2
|
|
SSE41_GetX38x4SatdDec
|
|
SSE41_GetX38x4SatdV %1, %2
|
|
SSE41_GetX38x4SatdH %1, %2, 16
|
|
SSE41_ChromaGetX38x4SatdDC %1
|
|
%endmacro
|
|
%macro SSE41_HSum8W 3
|
|
pmaddwd %1, %2
|
|
movhlps %3, %1
|
|
paddd %1, %3
|
|
pshuflw %3, %1,0Eh
|
|
paddd %1, %3
|
|
%endmacro
|
|
|
|
|
|
%ifdef X86_32
|
|
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
|
|
push ebx
|
|
push esi
|
|
push edi
|
|
mov ecx, [esp+16]
|
|
mov edx, [esp+20]
|
|
mov eax, [esp+24]
|
|
mov ebx, [esp+28]
|
|
mov esi, [esp+40] ;temp_satd
|
|
pxor xmm4, xmm4
|
|
movdqa xmm5, [HSumSubDB1]
|
|
movdqa xmm6, [HSumSubDW1]
|
|
movdqa xmm7, [PDW1]
|
|
sub ecx, edx
|
|
movdqu xmm0, [ecx]
|
|
movhlps xmm1, xmm0
|
|
punpcklqdq xmm0, xmm0
|
|
punpcklqdq xmm1, xmm1
|
|
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
|
|
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
|
|
movdqa [esi], xmm0 ;V
|
|
movdqa [esi+16], xmm1
|
|
add ecx, edx
|
|
pinsrb xmm0, byte[ecx-1], 0
|
|
pinsrb xmm0, byte[ecx+edx-1], 1
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 2
|
|
pinsrb xmm0, byte[ecx+edx-1], 3
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 4
|
|
pinsrb xmm0, byte[ecx+edx-1], 5
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 6
|
|
pinsrb xmm0, byte[ecx+edx-1], 7
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 8
|
|
pinsrb xmm0, byte[ecx+edx-1], 9
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 10
|
|
pinsrb xmm0, byte[ecx+edx-1], 11
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 12
|
|
pinsrb xmm0, byte[ecx+edx-1], 13
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 14
|
|
pinsrb xmm0, byte[ecx+edx-1], 15
|
|
movhlps xmm1, xmm0
|
|
punpcklqdq xmm0, xmm0
|
|
punpcklqdq xmm1, xmm1
|
|
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
|
|
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
|
|
movdqa [esi+32], xmm0 ;H
|
|
movdqa [esi+48], xmm1
|
|
movd ecx, xmm4 ;dc
|
|
add ecx, 16 ;(sum+16)
|
|
shr ecx, 5 ;((sum+16)>>5)
|
|
shl ecx, 4 ;
|
|
movd mm4, ecx ; mm4 copy DC
|
|
pxor xmm4, xmm4 ;V
|
|
pxor xmm5, xmm5 ;H
|
|
pxor xmm6, xmm6 ;DC
|
|
mov ecx, 0
|
|
mov edi, 0
|
|
.loop16x16_get_satd:
|
|
.loopStart1:
|
|
SSE41_I16x16GetX38x4Satd ecx, edi
|
|
inc ecx
|
|
cmp ecx, 4
|
|
jl .loopStart1
|
|
cmp edi, 16
|
|
je .loop16x16_get_satd_end
|
|
mov eax, [esp+24]
|
|
add eax, 8
|
|
mov ecx, 0
|
|
add edi, 16
|
|
jmp .loop16x16_get_satd
|
|
.loop16x16_get_satd_end:
|
|
MMX_DW_1_2REG xmm0, xmm1
|
|
psrlw xmm4, 1 ;/2
|
|
psrlw xmm5, 1 ;/2
|
|
psrlw xmm6, 1 ;/2
|
|
SSE41_HSum8W xmm4, xmm0, xmm1
|
|
SSE41_HSum8W xmm5, xmm0, xmm1
|
|
SSE41_HSum8W xmm6, xmm0, xmm1
|
|
|
|
; comparing order: DC H V
|
|
movd ebx, xmm6 ;DC
|
|
movd edi, xmm5 ;H
|
|
movd ecx, xmm4 ;V
|
|
mov edx, [esp+36]
|
|
shl edx, 1
|
|
add edi, edx
|
|
add ebx, edx
|
|
mov edx, [esp+32]
|
|
cmp ebx, edi
|
|
jge near not_dc_16x16
|
|
cmp ebx, ecx
|
|
jge near not_dc_h_16x16
|
|
|
|
; for DC mode
|
|
mov dword[edx], 2;I16_PRED_DC
|
|
mov eax, ebx
|
|
jmp near return_satd_intra_16x16_x3
|
|
not_dc_16x16:
|
|
; for H mode
|
|
cmp edi, ecx
|
|
jge near not_dc_h_16x16
|
|
mov dword[edx], 1;I16_PRED_H
|
|
mov eax, edi
|
|
jmp near return_satd_intra_16x16_x3
|
|
not_dc_h_16x16:
|
|
; for V mode
|
|
mov dword[edx], 0;I16_PRED_V
|
|
mov eax, ecx
|
|
return_satd_intra_16x16_x3:
|
|
WELSEMMS
|
|
pop edi
|
|
pop esi
|
|
pop ebx
|
|
ret
|
|
|
|
%macro SSE41_ChromaGetX38x8Satd 0
|
|
movdqa xmm5, [HSumSubDB1]
|
|
movdqa xmm6, [HSumSubDW1]
|
|
movdqa xmm7, [PDW1]
|
|
sub ecx, edx
|
|
movq xmm0, [ecx]
|
|
punpcklqdq xmm0, xmm0
|
|
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
|
|
movdqa [esi], xmm0 ;V
|
|
add ecx, edx
|
|
pinsrb xmm0, byte[ecx-1], 0
|
|
pinsrb xmm0, byte[ecx+edx-1], 1
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 2
|
|
pinsrb xmm0, byte[ecx+edx-1], 3
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 4
|
|
pinsrb xmm0, byte[ecx+edx-1], 5
|
|
lea ecx, [ecx+2*edx]
|
|
pinsrb xmm0, byte[ecx-1], 6
|
|
pinsrb xmm0, byte[ecx+edx-1], 7
|
|
punpcklqdq xmm0, xmm0
|
|
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
|
|
movdqa [esi+16], xmm0 ;H
|
|
;(sum+2)>>2
|
|
movdqa xmm6, [PDQ2]
|
|
movdqa xmm5, xmm4
|
|
punpckhqdq xmm5, xmm1
|
|
paddd xmm5, xmm6
|
|
psrld xmm5, 2
|
|
;(sum1+sum2+4)>>3
|
|
paddd xmm6, xmm6
|
|
paddd xmm4, xmm1
|
|
paddd xmm4, xmm6
|
|
psrld xmm4, 3
|
|
;satd *16
|
|
pslld xmm5, 4
|
|
pslld xmm4, 4
|
|
;temp satd
|
|
movdqa xmm6, xmm4
|
|
punpcklqdq xmm4, xmm5
|
|
psllq xmm4, 32
|
|
psrlq xmm4, 32
|
|
movdqa [esi+32], xmm4
|
|
punpckhqdq xmm5, xmm6
|
|
psllq xmm5, 32
|
|
psrlq xmm5, 32
|
|
movdqa [esi+48], xmm5
|
|
|
|
pxor xmm4, xmm4 ;V
|
|
pxor xmm5, xmm5 ;H
|
|
pxor xmm6, xmm6 ;DC
|
|
mov ecx, 0
|
|
loop_chroma_satdx3_cb_cr:
|
|
SSE41_ChromaGetX38x4Satd ecx, 0
|
|
inc ecx
|
|
cmp ecx, 2
|
|
jl loop_chroma_satdx3_cb_cr
|
|
%endmacro
|
|
|
|
%macro SSEReg2MMX 3
|
|
movdq2q %2, %1
|
|
movhlps %1, %1
|
|
movdq2q %3, %1
|
|
%endmacro
|
|
%macro MMXReg2SSE 4
|
|
movq2dq %1, %3
|
|
movq2dq %2, %4
|
|
punpcklqdq %1, %2
|
|
%endmacro
|
|
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
|
|
|
|
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
|
|
push ebx
|
|
push esi
|
|
push edi
|
|
mov ecx, [esp+16]
|
|
mov edx, [esp+20]
|
|
mov eax, [esp+24]
|
|
mov ebx, [esp+28]
|
|
mov esi, [esp+40] ;temp_satd
|
|
xor edi, edi
|
|
loop_chroma_satdx3:
|
|
SSE41_ChromaGetX38x8Satd
|
|
cmp edi, 1
|
|
je loop_chroma_satdx3end
|
|
inc edi
|
|
SSEReg2MMX xmm4, mm0,mm1
|
|
SSEReg2MMX xmm5, mm2,mm3
|
|
SSEReg2MMX xmm6, mm5,mm6
|
|
mov ecx, [esp+44]
|
|
mov eax, [esp+48]
|
|
jmp loop_chroma_satdx3
|
|
loop_chroma_satdx3end:
|
|
MMXReg2SSE xmm0, xmm3, mm0, mm1
|
|
MMXReg2SSE xmm1, xmm3, mm2, mm3
|
|
MMXReg2SSE xmm2, xmm3, mm5, mm6
|
|
|
|
paddw xmm4, xmm0
|
|
paddw xmm5, xmm1
|
|
paddw xmm6, xmm2
|
|
|
|
MMX_DW_1_2REG xmm0, xmm1
|
|
psrlw xmm4, 1 ;/2
|
|
psrlw xmm5, 1 ;/2
|
|
psrlw xmm6, 1 ;/2
|
|
SSE41_HSum8W xmm4, xmm0, xmm1
|
|
SSE41_HSum8W xmm5, xmm0, xmm1
|
|
SSE41_HSum8W xmm6, xmm0, xmm1
|
|
; comparing order: DC H V
|
|
movd ebx, xmm6 ;DC
|
|
movd edi, xmm5 ;H
|
|
movd ecx, xmm4 ;V
|
|
mov edx, [esp+36]
|
|
shl edx, 1
|
|
add edi, edx
|
|
add ecx, edx
|
|
mov edx, [esp+32]
|
|
cmp ebx, edi
|
|
jge near not_dc_8x8
|
|
cmp ebx, ecx
|
|
jge near not_dc_h_8x8
|
|
|
|
; for DC mode
|
|
mov dword[edx], 0;I8_PRED_DC
|
|
mov eax, ebx
|
|
jmp near return_satd_intra_8x8_x3
|
|
not_dc_8x8:
|
|
; for H mode
|
|
cmp edi, ecx
|
|
jge near not_dc_h_8x8
|
|
mov dword[edx], 1;I8_PRED_H
|
|
mov eax, edi
|
|
jmp near return_satd_intra_8x8_x3
|
|
not_dc_h_8x8:
|
|
; for V mode
|
|
mov dword[edx], 2;I8_PRED_V
|
|
mov eax, ecx
|
|
return_satd_intra_8x8_x3:
|
|
WELSEMMS
|
|
pop edi
|
|
pop esi
|
|
pop ebx
|
|
ret
|
|
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_intra_sse2 END
|
|
;
|
|
;***********************************************************************
|
|
%macro SSSE3_Get16BSadHVDC 2
|
|
movd xmm6,%1
|
|
pshufb xmm6,xmm1
|
|
movdqa %1, xmm6
|
|
movdqa xmm0,%2
|
|
psadbw xmm0,xmm7
|
|
paddw xmm4,xmm0
|
|
movdqa xmm0,%2
|
|
psadbw xmm0,xmm5
|
|
paddw xmm2,xmm0
|
|
psadbw xmm6,%2
|
|
paddw xmm3,xmm6
|
|
%endmacro
|
|
%macro WelsAddDCValue 4
|
|
movzx %2, byte %1
|
|
mov %3, %2
|
|
add %4, %2
|
|
%endmacro
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_intra_ssse3 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
|
|
push ebx
|
|
push esi
|
|
push edi
|
|
mov ecx, [esp+16]
|
|
mov edx, [esp+20]
|
|
mov edi, [esp+40] ;temp_sad
|
|
sub ecx, edx
|
|
movdqa xmm5,[ecx]
|
|
pxor xmm0,xmm0
|
|
psadbw xmm0,xmm5
|
|
movhlps xmm1,xmm0
|
|
paddw xmm0,xmm1
|
|
movd eax,xmm0
|
|
|
|
add ecx,edx
|
|
lea ebx, [edx+2*edx]
|
|
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
|
|
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
|
|
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
|
|
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
|
|
lea ecx, [ecx+4*edx]
|
|
add edi, 64
|
|
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
|
|
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
|
|
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
|
|
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
|
|
lea ecx, [ecx+4*edx]
|
|
add edi, 64
|
|
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
|
|
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
|
|
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
|
|
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
|
|
lea ecx, [ecx+4*edx]
|
|
add edi, 64
|
|
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
|
|
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
|
|
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
|
|
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
|
|
sub edi, 192
|
|
add eax,10h
|
|
shr eax,5
|
|
movd xmm7,eax
|
|
pxor xmm1,xmm1
|
|
pshufb xmm7,xmm1
|
|
pxor xmm4,xmm4
|
|
pxor xmm3,xmm3
|
|
pxor xmm2,xmm2
|
|
;sad begin
|
|
mov eax, [esp+24]
|
|
mov ebx, [esp+28]
|
|
lea esi, [ebx+2*ebx]
|
|
SSSE3_Get16BSadHVDC [edi], [eax]
|
|
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
|
|
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
|
|
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
|
|
add edi, 64
|
|
lea eax, [eax+4*ebx]
|
|
SSSE3_Get16BSadHVDC [edi], [eax]
|
|
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
|
|
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
|
|
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
|
|
add edi, 64
|
|
lea eax, [eax+4*ebx]
|
|
SSSE3_Get16BSadHVDC [edi], [eax]
|
|
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
|
|
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
|
|
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
|
|
add edi, 64
|
|
lea eax, [eax+4*ebx]
|
|
SSSE3_Get16BSadHVDC [edi], [eax]
|
|
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
|
|
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
|
|
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
|
|
|
|
pslldq xmm3,4
|
|
por xmm3,xmm2
|
|
movhlps xmm1,xmm3
|
|
paddw xmm3,xmm1
|
|
movhlps xmm0,xmm4
|
|
paddw xmm4,xmm0
|
|
; comparing order: DC H V
|
|
movd ebx, xmm4 ;DC
|
|
movd ecx, xmm3 ;V
|
|
psrldq xmm3, 4
|
|
movd esi, xmm3 ;H
|
|
mov eax, [esp+36] ;lamda
|
|
shl eax, 1
|
|
add esi, eax
|
|
add ebx, eax
|
|
mov edx, [esp+32]
|
|
cmp ebx, esi
|
|
jge near not_dc_16x16_sad
|
|
cmp ebx, ecx
|
|
jge near not_dc_h_16x16_sad
|
|
; for DC mode
|
|
mov dword[edx], 2;I16_PRED_DC
|
|
mov eax, ebx
|
|
sub edi, 192
|
|
%assign x 0
|
|
%rep 16
|
|
movdqa [edi+16*x], xmm7
|
|
%assign x x+1
|
|
%endrep
|
|
jmp near return_sad_intra_16x16_x3
|
|
not_dc_16x16_sad:
|
|
; for H mode
|
|
cmp esi, ecx
|
|
jge near not_dc_h_16x16_sad
|
|
mov dword[edx], 1;I16_PRED_H
|
|
mov eax, esi
|
|
jmp near return_sad_intra_16x16_x3
|
|
not_dc_h_16x16_sad:
|
|
; for V mode
|
|
mov dword[edx], 0;I16_PRED_V
|
|
mov eax, ecx
|
|
sub edi, 192
|
|
%assign x 0
|
|
%rep 16
|
|
movdqa [edi+16*x], xmm5
|
|
%assign x x+1
|
|
%endrep
|
|
return_sad_intra_16x16_x3:
|
|
pop edi
|
|
pop esi
|
|
pop ebx
|
|
ret
|
|
%endif
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_intra_ssse3 END
|
|
;
|
|
;***********************************************************************
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_wxh_sse41 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
|
|
;SSE4.1
|
|
%macro SSE41_GetSatd8x4 0
|
|
movq xmm0, [r0]
|
|
punpcklqdq xmm0, xmm0
|
|
pmaddubsw xmm0, xmm7
|
|
movq xmm1, [r0+r1]
|
|
punpcklqdq xmm1, xmm1
|
|
pmaddubsw xmm1, xmm7
|
|
movq xmm2, [r2]
|
|
punpcklqdq xmm2, xmm2
|
|
pmaddubsw xmm2, xmm7
|
|
movq xmm3, [r2+r3]
|
|
punpcklqdq xmm3, xmm3
|
|
pmaddubsw xmm3, xmm7
|
|
psubsw xmm0, xmm2
|
|
psubsw xmm1, xmm3
|
|
movq xmm2, [r0+2*r1]
|
|
punpcklqdq xmm2, xmm2
|
|
pmaddubsw xmm2, xmm7
|
|
movq xmm3, [r0+r4]
|
|
punpcklqdq xmm3, xmm3
|
|
pmaddubsw xmm3, xmm7
|
|
movq xmm4, [r2+2*r3]
|
|
punpcklqdq xmm4, xmm4
|
|
pmaddubsw xmm4, xmm7
|
|
movq xmm5, [r2+r5]
|
|
punpcklqdq xmm5, xmm5
|
|
pmaddubsw xmm5, xmm7
|
|
psubsw xmm2, xmm4
|
|
psubsw xmm3, xmm5
|
|
SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
|
|
pabsw xmm0, xmm0
|
|
pabsw xmm2, xmm2
|
|
pabsw xmm1, xmm1
|
|
pabsw xmm3, xmm3
|
|
movdqa xmm4, xmm3
|
|
pblendw xmm3, xmm1, 0xAA
|
|
pslld xmm1, 16
|
|
psrld xmm4, 16
|
|
por xmm1, xmm4
|
|
pmaxuw xmm1, xmm3
|
|
paddw xmm6, xmm1
|
|
movdqa xmm4, xmm0
|
|
pblendw xmm0, xmm2, 0xAA
|
|
pslld xmm2, 16
|
|
psrld xmm4, 16
|
|
por xmm2, xmm4
|
|
pmaxuw xmm0, xmm2
|
|
paddw xmm6, xmm0
|
|
%endmacro
|
|
|
|
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
|
|
MMX_DW_1_2REG %3, %4
|
|
pmaddwd %2, %3
|
|
movhlps %4, %2
|
|
paddd %2, %4
|
|
pshuflw %4, %2,0Eh
|
|
paddd %2, %4
|
|
movd %1, %2
|
|
%endmacro
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd4x4_sse41
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movdqa xmm4,[HSwapSumSubDB1]
|
|
movd xmm2,[r2]
|
|
movd xmm5,[r2+r3]
|
|
shufps xmm2,xmm5,0
|
|
movd xmm3,[r2+r3*2]
|
|
lea r2, [r3*2+r2]
|
|
movd xmm5,[r2+r3]
|
|
shufps xmm3,xmm5,0
|
|
movd xmm0,[r0]
|
|
movd xmm5,[r0+r1]
|
|
shufps xmm0,xmm5,0
|
|
movd xmm1,[r0+r1*2]
|
|
lea r0, [r1*2+r0]
|
|
movd xmm5,[r0+r1]
|
|
shufps xmm1,xmm5,0
|
|
pmaddubsw xmm0,xmm4
|
|
pmaddubsw xmm1,xmm4
|
|
pmaddubsw xmm2,xmm4
|
|
pmaddubsw xmm3,xmm4
|
|
psubw xmm0,xmm2
|
|
psubw xmm1,xmm3
|
|
movdqa xmm2,xmm0
|
|
paddw xmm0,xmm1
|
|
psubw xmm1,xmm2
|
|
movdqa xmm2,xmm0
|
|
punpcklqdq xmm0,xmm1
|
|
punpckhqdq xmm2,xmm1
|
|
movdqa xmm1,xmm0
|
|
paddw xmm0,xmm2
|
|
psubw xmm2,xmm1
|
|
movdqa xmm1,xmm0
|
|
pblendw xmm0,xmm2,0AAh
|
|
pslld xmm2,16
|
|
psrld xmm1,16
|
|
por xmm2,xmm1
|
|
pabsw xmm0,xmm0
|
|
pabsw xmm2,xmm2
|
|
pmaxsw xmm0,xmm2
|
|
SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd8x8_sse41
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
%endif
|
|
%assign push_num 2
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movdqa xmm7, [HSumSubDB1]
|
|
lea r4, [r1+r1*2]
|
|
lea r5, [r3+r3*2]
|
|
pxor xmm6, xmm6
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE41_GetSatd8x4
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd8x16_sse41
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
push r6
|
|
%endif
|
|
%assign push_num 3
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movdqa xmm7, [HSumSubDB1]
|
|
lea r4, [r1+r1*2]
|
|
lea r5, [r3+r3*2]
|
|
pxor xmm6, xmm6
|
|
mov r6, 0
|
|
loop_get_satd_8x16:
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
inc r6
|
|
cmp r6, 4
|
|
jl loop_get_satd_8x16
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r6
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd16x8_sse41
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
%endif
|
|
%assign push_num 2
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
push r0
|
|
push r2
|
|
|
|
movdqa xmm7, [HSumSubDB1]
|
|
lea r4, [r1+r1*2]
|
|
lea r5, [r3+r3*2]
|
|
pxor xmm6, xmm6
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE41_GetSatd8x4
|
|
|
|
pop r2
|
|
pop r0
|
|
add r0, 8
|
|
add r2, 8
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE41_GetSatd8x4
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
|
|
WELS_EXTERN WelsSampleSatd16x16_sse41
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
push r6
|
|
%endif
|
|
%assign push_num 3
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
|
|
push r0
|
|
push r2
|
|
|
|
movdqa xmm7, [HSumSubDB1]
|
|
lea r4, [r1+r1*2]
|
|
lea r5, [r3+r3*2]
|
|
pxor xmm6, xmm6
|
|
mov r6, 0
|
|
loop_get_satd_16x16_left:
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
inc r6
|
|
cmp r6, 4
|
|
jl loop_get_satd_16x16_left
|
|
|
|
pop r2
|
|
pop r0
|
|
add r0, 8
|
|
add r2, 8
|
|
mov r6, 0
|
|
loop_get_satd_16x16_right:
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
inc r6
|
|
cmp r6, 4
|
|
jl loop_get_satd_16x16_right
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r6
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_wxh_sse41 END
|
|
;
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_wxh_sse2 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
|
|
%macro SSE2_GetSad2x16 0
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqu xmm1, [r2]
|
|
MOVDQ xmm2, [r0];[eax] must aligned 16
|
|
psadbw xmm1, xmm2
|
|
paddw xmm0, xmm1
|
|
movdqu xmm1, [r2+r3]
|
|
MOVDQ xmm2, [r0+r1]
|
|
psadbw xmm1, xmm2
|
|
paddw xmm0, xmm1
|
|
%endmacro
|
|
|
|
|
|
%macro SSE2_GetSad4x16 0
|
|
movdqu xmm0, [r2]
|
|
MOVDQ xmm2, [r0]
|
|
psadbw xmm0, xmm2
|
|
paddw xmm7, xmm0
|
|
movdqu xmm1, [r2+r3]
|
|
MOVDQ xmm2, [r0+r1]
|
|
psadbw xmm1, xmm2
|
|
paddw xmm7, xmm1
|
|
movdqu xmm1, [r2+2*r3]
|
|
MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
|
|
psadbw xmm1, xmm2
|
|
paddw xmm7, xmm1
|
|
movdqu xmm1, [r2+r5]
|
|
MOVDQ xmm2, [r0+r4]
|
|
psadbw xmm1, xmm2
|
|
paddw xmm7, xmm1
|
|
%endmacro
|
|
|
|
|
|
%macro SSE2_GetSad8x4 0
|
|
movq xmm0, [r0]
|
|
movq xmm1, [r0+r1]
|
|
lea r0, [r0+2*r1]
|
|
movhps xmm0, [r0]
|
|
movhps xmm1, [r0+r1]
|
|
|
|
movq xmm2, [r2]
|
|
movq xmm3, [r2+r3]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm2, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm2
|
|
psadbw xmm1, xmm3
|
|
paddw xmm6, xmm0
|
|
paddw xmm6, xmm1
|
|
%endmacro
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
|
|
;First parameter can align to 16 bytes,
|
|
;In wels, the third parameter can't align to 16 bytes.
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSad16x16_sse2
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
%endif
|
|
|
|
%assign push_num 2
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
lea r4, [3*r1]
|
|
lea r5, [3*r3]
|
|
|
|
pxor xmm7, xmm7
|
|
SSE2_GetSad4x16
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE2_GetSad4x16
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE2_GetSad4x16
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE2_GetSad4x16
|
|
movhlps xmm0, xmm7
|
|
paddw xmm0, xmm7
|
|
movd retrd, xmm0
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
|
|
;First parameter can align to 16 bytes,
|
|
;In wels, the third parameter can't align to 16 bytes.
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSad16x8_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movdqu xmm0, [r2]
|
|
MOVDQ xmm2, [r0]
|
|
psadbw xmm0, xmm2
|
|
movdqu xmm1, [r2+r3]
|
|
MOVDQ xmm2, [r0+r1]
|
|
psadbw xmm1, xmm2
|
|
paddw xmm0, xmm1
|
|
|
|
SSE2_GetSad2x16
|
|
SSE2_GetSad2x16
|
|
SSE2_GetSad2x16
|
|
|
|
movhlps xmm1, xmm0
|
|
paddw xmm0, xmm1
|
|
movd retrd, xmm0
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
|
|
|
|
WELS_EXTERN WelsSampleSad8x16_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 7
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm6, xmm6
|
|
|
|
SSE2_GetSad8x4
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSad8x4
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSad8x4
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSad8x4
|
|
|
|
movhlps xmm0, xmm6
|
|
paddw xmm0, xmm6
|
|
movd retrd, xmm0
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
|
|
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
|
|
and %1, 0x1f|(%3>>1)
|
|
cmp %1, (32-%2)|(%3>>1)
|
|
%endmacro
|
|
|
|
WELS_EXTERN WelsSampleSad8x8_sse21
|
|
%assign push_num 0
|
|
mov r2, arg3
|
|
push r2
|
|
CACHE_SPLIT_CHECK r2, 8, 64
|
|
jle near .pixel_sad_8x8_nsplit
|
|
pop r2
|
|
%ifdef X86_32
|
|
push r3
|
|
push r4
|
|
push r5
|
|
%endif
|
|
%assign push_num 3
|
|
PUSH_XMM 8
|
|
mov r0, arg1
|
|
mov r1, arg2
|
|
SIGN_EXTENSION r1, r1d
|
|
pxor xmm7, xmm7
|
|
|
|
;ecx r2, edx r4, edi r5
|
|
|
|
mov r5, r2
|
|
and r5, 0x07
|
|
sub r2, r5
|
|
mov r4, 8
|
|
sub r4, r5
|
|
|
|
shl r5, 3
|
|
shl r4, 3
|
|
movd xmm5, r5d
|
|
movd xmm6, r4d
|
|
mov r5, 8
|
|
add r5, r2
|
|
mov r3, arg4
|
|
SIGN_EXTENSION r3, r3d
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
|
|
movq xmm1, [r2]
|
|
movq xmm2, [r5]
|
|
movhps xmm1, [r2+r3]
|
|
movhps xmm2, [r5+r3]
|
|
psrlq xmm1, xmm5
|
|
psllq xmm2, xmm6
|
|
por xmm1, xmm2
|
|
|
|
psadbw xmm0, xmm1
|
|
paddw xmm7, xmm0
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
lea r5, [r5+2*r3]
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
|
|
movq xmm1, [r2]
|
|
movq xmm2, [r5]
|
|
movhps xmm1, [r2+r3]
|
|
movhps xmm2, [r5+r3]
|
|
psrlq xmm1, xmm5
|
|
psllq xmm2, xmm6
|
|
por xmm1, xmm2
|
|
|
|
psadbw xmm0, xmm1
|
|
paddw xmm7, xmm0
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
lea r5, [r5+2*r3]
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
|
|
movq xmm1, [r2]
|
|
movq xmm2, [r5]
|
|
movhps xmm1, [r2+r3]
|
|
movhps xmm2, [r5+r3]
|
|
psrlq xmm1, xmm5
|
|
psllq xmm2, xmm6
|
|
por xmm1, xmm2
|
|
|
|
psadbw xmm0, xmm1
|
|
paddw xmm7, xmm0
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
lea r5, [r5+2*r3]
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
|
|
movq xmm1, [r2]
|
|
movq xmm2, [r5]
|
|
movhps xmm1, [r2+r3]
|
|
movhps xmm2, [r5+r3]
|
|
psrlq xmm1, xmm5
|
|
psllq xmm2, xmm6
|
|
por xmm1, xmm2
|
|
|
|
psadbw xmm0, xmm1
|
|
paddw xmm7, xmm0
|
|
|
|
movhlps xmm0, xmm7
|
|
paddw xmm0, xmm7
|
|
movd retrd, xmm0
|
|
POP_XMM
|
|
%ifdef X86_32
|
|
pop r5
|
|
pop r4
|
|
pop r3
|
|
%endif
|
|
jmp .return
|
|
|
|
.pixel_sad_8x8_nsplit:
|
|
|
|
pop r2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 7
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm6, xmm6
|
|
SSE2_GetSad8x4
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSad8x4
|
|
movhlps xmm0, xmm6
|
|
paddw xmm0, xmm6
|
|
movd retrd, xmm0
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
.return:
|
|
ret
|
|
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_wxh_sse2 END
|
|
;
|
|
;***********************************************************************
|
|
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_4_wxh_sse2 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
|
|
|
|
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
|
|
psadbw %1, %4
|
|
paddw xmm5, %1
|
|
psadbw %4, %3
|
|
paddw xmm4, %4
|
|
movdqu %4, [%5-1]
|
|
psadbw %4, %2
|
|
paddw xmm6, %4
|
|
movdqu %4, [%5+1]
|
|
psadbw %4, %2
|
|
paddw xmm7, %4
|
|
%endmacro
|
|
WELS_EXTERN WelsSampleSadFour16x16_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
movdqa xmm0, [r0]
|
|
sub r2, r3
|
|
movdqu xmm3, [r2]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
psadbw xmm3, xmm1
|
|
paddw xmm4, xmm3
|
|
|
|
movdqu xmm2, [r2+r3-1]
|
|
psadbw xmm2, xmm0
|
|
paddw xmm6, xmm2
|
|
|
|
movdqu xmm3, [r2+r3+1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm2, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
|
|
movdqa xmm0, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm1, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
|
|
movdqa xmm2, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm0, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm2, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
|
|
movdqa xmm0, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm1, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
|
|
movdqa xmm2, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm0, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm2, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
|
|
movdqa xmm0, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
|
|
lea r2, [r2+2*r3]
|
|
movdqu xmm3, [r2]
|
|
psadbw xmm2, xmm3
|
|
paddw xmm5, xmm2
|
|
|
|
movdqu xmm2, [r2-1]
|
|
psadbw xmm2, xmm0
|
|
paddw xmm6, xmm2
|
|
|
|
movdqu xmm3, [r2+1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movdqu xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
movhlps xmm0, xmm5
|
|
paddw xmm5, xmm0
|
|
movhlps xmm0, xmm6
|
|
paddw xmm6, xmm0
|
|
movhlps xmm0, xmm7
|
|
paddw xmm7, xmm0
|
|
punpckldq xmm4, xmm5
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6
|
|
movdqa [r4],xmm4
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
|
|
WELS_EXTERN WelsSampleSadFour16x8_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
movdqa xmm0, [r0]
|
|
sub r2, r3
|
|
movdqu xmm3, [r2]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
psadbw xmm3, xmm1
|
|
paddw xmm4, xmm3
|
|
|
|
movdqu xmm2, [r2+r3-1]
|
|
psadbw xmm2, xmm0
|
|
paddw xmm6, xmm2
|
|
|
|
movdqu xmm3, [r2+r3+1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm2, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
|
|
movdqa xmm0, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm1, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
|
|
movdqa xmm2, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm0, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
|
|
lea r2, [r2+2*r3]
|
|
movdqu xmm3, [r2]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movdqu xmm0, [r2-1]
|
|
psadbw xmm0, xmm1
|
|
paddw xmm6, xmm0
|
|
|
|
movdqu xmm3, [r2+1]
|
|
psadbw xmm3, xmm1
|
|
paddw xmm7, xmm3
|
|
|
|
movdqu xmm3, [r2+r3]
|
|
psadbw xmm1, xmm3
|
|
paddw xmm5, xmm1
|
|
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
movhlps xmm0, xmm5
|
|
paddw xmm5, xmm0
|
|
movhlps xmm0, xmm6
|
|
paddw xmm6, xmm0
|
|
movhlps xmm0, xmm7
|
|
paddw xmm7, xmm0
|
|
punpckldq xmm4, xmm5
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6
|
|
movdqa [r4],xmm4
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
WELS_EXTERN WelsSampleSadFour8x16_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
sub r2, r3
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
movhlps xmm0, xmm5
|
|
paddw xmm5, xmm0
|
|
movhlps xmm0, xmm6
|
|
paddw xmm6, xmm0
|
|
movhlps xmm0, xmm7
|
|
paddw xmm7, xmm0
|
|
punpckldq xmm4, xmm5
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6
|
|
movdqa [r4],xmm4
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
|
|
WELS_EXTERN WelsSampleSadFour8x8_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
sub r2, r3
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
movhlps xmm0, xmm5
|
|
paddw xmm5, xmm0
|
|
movhlps xmm0, xmm6
|
|
paddw xmm6, xmm0
|
|
movhlps xmm0, xmm7
|
|
paddw xmm7, xmm0
|
|
punpckldq xmm4, xmm5
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6
|
|
movdqa [r4],xmm4
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
WELS_EXTERN WelsSampleSadFour4x4_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movd xmm0, [r0]
|
|
movd xmm1, [r0+r1]
|
|
lea r0, [r0+2*r1]
|
|
movd xmm2, [r0]
|
|
movd xmm3, [r0+r1]
|
|
punpckldq xmm0, xmm1
|
|
punpckldq xmm2, xmm3
|
|
punpcklqdq xmm0, xmm2
|
|
sub r2, r3
|
|
movd xmm1, [r2]
|
|
movd xmm2, [r2+r3]
|
|
punpckldq xmm1, xmm2
|
|
movd xmm2, [r2+r3-1]
|
|
movd xmm3, [r2+r3+1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movd xmm4, [r2]
|
|
movd xmm5, [r2-1]
|
|
punpckldq xmm2, xmm5
|
|
movd xmm5, [r2+1]
|
|
punpckldq xmm3, xmm5
|
|
|
|
movd xmm5, [r2+r3]
|
|
punpckldq xmm4, xmm5
|
|
|
|
punpcklqdq xmm1, xmm4 ;-L
|
|
|
|
movd xmm5, [r2+r3-1]
|
|
movd xmm6, [r2+r3+1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
movd xmm7, [r2-1]
|
|
punpckldq xmm5, xmm7
|
|
punpcklqdq xmm2, xmm5 ;-1
|
|
movd xmm7, [r2+1]
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm3, xmm6 ;+1
|
|
movd xmm6, [r2]
|
|
movd xmm7, [r2+r3]
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6 ;+L
|
|
psadbw xmm1, xmm0
|
|
psadbw xmm2, xmm0
|
|
psadbw xmm3, xmm0
|
|
psadbw xmm4, xmm0
|
|
|
|
movhlps xmm0, xmm1
|
|
paddw xmm1, xmm0
|
|
movhlps xmm0, xmm2
|
|
paddw xmm2, xmm0
|
|
movhlps xmm0, xmm3
|
|
paddw xmm3, xmm0
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
punpckldq xmm1, xmm4
|
|
punpckldq xmm2, xmm3
|
|
punpcklqdq xmm1, xmm2
|
|
movdqa [r4],xmm1
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_4_wxh_sse2 END
|
|
;
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSad4x4_mmx
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movd mm0, [r0]
|
|
movd mm1, [r0+r1]
|
|
punpckldq mm0, mm1
|
|
|
|
movd mm3, [r2]
|
|
movd mm4, [r2+r3]
|
|
punpckldq mm3, mm4
|
|
psadbw mm0, mm3
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
|
|
movd mm1, [r0]
|
|
movd mm2, [r0+r1]
|
|
punpckldq mm1, mm2
|
|
|
|
movd mm3, [r2]
|
|
movd mm4, [r2+r3]
|
|
punpckldq mm3, mm4
|
|
psadbw mm1, mm3
|
|
paddw mm0, mm1
|
|
|
|
movd retrd, mm0
|
|
|
|
WELSEMMS
|
|
LOAD_4_PARA_POP
|
|
ret
|