2477 lines
55 KiB
NASM
2477 lines
55 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* satd_sad.asm
|
|
;*
|
|
;* Abstract
|
|
;* WelsSampleSatd4x4_sse2
|
|
;* WelsSampleSatd8x8_sse2
|
|
;* WelsSampleSatd16x8_sse2
|
|
;* WelsSampleSatd8x16_sse2
|
|
;* WelsSampleSatd16x16_sse2
|
|
;*
|
|
;* WelsSampleSad16x8_sse2
|
|
;* WelsSampleSad16x16_sse2
|
|
;*
|
|
;* History
|
|
;* 8/5/2009 Created
|
|
;* 24/9/2009 modified
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
|
|
%include "asm_inc.asm"
|
|
|
|
;***********************************************************************
|
|
; Data
|
|
;***********************************************************************
|
|
SECTION .rodata align=16
|
|
|
|
align 16
|
|
HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
|
|
align 16
|
|
HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
|
|
align 16
|
|
PDW1: dw 1,1,1,1,1,1,1,1
|
|
align 16
|
|
PDQ2: dw 2,0,0,0,2,0,0,0
|
|
align 16
|
|
HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
|
|
|
|
;***********************************************************************
|
|
; Code
|
|
;***********************************************************************
|
|
SECTION .text
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_wxh_sse2 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
%macro MMX_DW_1_2REG 2
|
|
pxor %1, %1
|
|
pcmpeqw %2, %2
|
|
psubw %1, %2
|
|
%endmacro
|
|
|
|
%macro SSE2_SumWHorizon1 2
|
|
movdqa %2, %1
|
|
psrldq %2, 8
|
|
paddusw %1, %2
|
|
movdqa %2, %1
|
|
psrldq %2, 4
|
|
paddusw %1, %2
|
|
movdqa %2, %1
|
|
psrldq %2, 2
|
|
paddusw %1, %2
|
|
%endmacro
|
|
|
|
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
|
|
SSE2_SumSub %1, %2, %5
|
|
SSE2_SumSub %3, %4, %5
|
|
SSE2_SumSub %2, %4, %5
|
|
SSE2_SumSub %1, %3, %5
|
|
%endmacro
|
|
|
|
%macro SSE2_SumAbs4 7
|
|
WELS_AbsW %1, %3
|
|
WELS_AbsW %2, %3
|
|
WELS_AbsW %4, %6
|
|
WELS_AbsW %5, %6
|
|
paddusw %1, %2
|
|
paddusw %4, %5
|
|
paddusw %7, %1
|
|
paddusw %7, %4
|
|
%endmacro
|
|
|
|
%macro SSE2_SumWHorizon 3
|
|
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
|
|
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
|
|
punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
|
|
movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
|
|
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
|
|
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
|
|
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
|
|
%endmacro
|
|
|
|
%macro SSE2_GetSatd8x8 0
|
|
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
|
|
SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
|
|
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
|
|
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
|
|
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
|
|
SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
|
|
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
|
|
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
|
|
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
|
|
%endmacro
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd4x4_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movd xmm0, [r0]
|
|
movd xmm1, [r0+r1]
|
|
lea r0 , [r0+2*r1]
|
|
movd xmm2, [r0]
|
|
movd xmm3, [r0+r1]
|
|
punpckldq xmm0, xmm2
|
|
punpckldq xmm1, xmm3
|
|
|
|
movd xmm4, [r2]
|
|
movd xmm5, [r2+r3]
|
|
lea r2 , [r2+2*r3]
|
|
movd xmm6, [r2]
|
|
movd xmm7, [r2+r3]
|
|
punpckldq xmm4, xmm6
|
|
punpckldq xmm5, xmm7
|
|
|
|
pxor xmm6, xmm6
|
|
punpcklbw xmm0, xmm6
|
|
punpcklbw xmm1, xmm6
|
|
punpcklbw xmm4, xmm6
|
|
punpcklbw xmm5, xmm6
|
|
|
|
psubw xmm0, xmm4
|
|
psubw xmm1, xmm5
|
|
|
|
movdqa xmm2, xmm0
|
|
paddw xmm0, xmm1
|
|
psubw xmm2, xmm1
|
|
SSE2_XSawp qdq, xmm0, xmm2, xmm3
|
|
|
|
movdqa xmm4, xmm0
|
|
paddw xmm0, xmm3
|
|
psubw xmm4, xmm3
|
|
|
|
movdqa xmm2, xmm0
|
|
punpcklwd xmm0, xmm4
|
|
punpckhwd xmm4, xmm2
|
|
|
|
SSE2_XSawp dq, xmm0, xmm4, xmm3
|
|
SSE2_XSawp qdq, xmm0, xmm3, xmm5
|
|
|
|
movdqa xmm7, xmm0
|
|
paddw xmm0, xmm5
|
|
psubw xmm7, xmm5
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm7, xmm1
|
|
|
|
movdqa xmm2, xmm0
|
|
paddw xmm0, xmm1
|
|
psubw xmm2, xmm1
|
|
|
|
WELS_AbsW xmm0, xmm3
|
|
paddusw xmm6, xmm0
|
|
WELS_AbsW xmm2, xmm4
|
|
paddusw xmm6, xmm2
|
|
SSE2_SumWHorizon1 xmm6, xmm4
|
|
movd retrd, xmm6
|
|
and retrd, 0xffff
|
|
shr retrd, 1
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd8x8_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm6, xmm6
|
|
pxor xmm7, xmm7
|
|
SSE2_GetSatd8x8
|
|
psrlw xmm6, 1
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
movd retrd, xmm6
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd8x16_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm6, xmm6
|
|
pxor xmm7, xmm7
|
|
|
|
SSE2_GetSatd8x8
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSatd8x8
|
|
|
|
psrlw xmm6, 1
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
movd retrd, xmm6
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd16x8_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
push r0
|
|
push r2
|
|
pxor xmm6, xmm6
|
|
pxor xmm7, xmm7
|
|
|
|
SSE2_GetSatd8x8
|
|
|
|
pop r2
|
|
pop r0
|
|
add r0, 8
|
|
add r2, 8
|
|
SSE2_GetSatd8x8
|
|
|
|
psrlw xmm6, 1
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
movd retrd, xmm6
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd16x16_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
push r0
|
|
push r2
|
|
pxor xmm6, xmm6
|
|
pxor xmm7, xmm7
|
|
|
|
SSE2_GetSatd8x8
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSatd8x8
|
|
|
|
pop r2
|
|
pop r0
|
|
add r0, 8
|
|
add r2, 8
|
|
|
|
SSE2_GetSatd8x8
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSatd8x8
|
|
|
|
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
|
|
psrlw xmm6, 1
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
movd retrd, xmm6
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_wxh_sse2 END
|
|
;
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_intra_sse2 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
|
|
|
|
%macro SSE_DB_1_2REG 2
|
|
pxor %1, %1
|
|
pcmpeqw %2, %2
|
|
psubb %1, %2
|
|
%endmacro
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
|
|
; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatdThree4x4_sse2
|
|
|
|
%ifdef X86_32
|
|
push r3
|
|
push r4
|
|
push r5
|
|
push r6
|
|
%assign push_num 4
|
|
%else
|
|
%assign push_num 0
|
|
%endif
|
|
PUSH_XMM 8
|
|
|
|
mov r2, arg3
|
|
mov r3, arg4
|
|
SIGN_EXTENSION r3, r3d
|
|
|
|
; load source 4x4 samples and Hadamard transform
|
|
movd xmm0, [r2]
|
|
movd xmm1, [r2+r3]
|
|
lea r2 , [r2+2*r3]
|
|
movd xmm2, [r2]
|
|
movd xmm3, [r2+r3]
|
|
punpckldq xmm0, xmm2
|
|
punpckldq xmm1, xmm3
|
|
|
|
pxor xmm6, xmm6
|
|
punpcklbw xmm0, xmm6
|
|
punpcklbw xmm1, xmm6
|
|
|
|
movdqa xmm2, xmm0
|
|
paddw xmm0, xmm1
|
|
psubw xmm2, xmm1
|
|
SSE2_XSawp qdq, xmm0, xmm2, xmm3
|
|
|
|
movdqa xmm4, xmm0
|
|
paddw xmm0, xmm3
|
|
psubw xmm4, xmm3
|
|
|
|
movdqa xmm2, xmm0
|
|
punpcklwd xmm0, xmm4
|
|
punpckhwd xmm4, xmm2
|
|
|
|
SSE2_XSawp dq, xmm0, xmm4, xmm3
|
|
SSE2_XSawp qdq, xmm0, xmm3, xmm5
|
|
|
|
movdqa xmm7, xmm0
|
|
paddw xmm0, xmm5
|
|
psubw xmm7, xmm5
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm7, xmm1
|
|
|
|
; Hadamard transform results are saved in xmm0 and xmm2
|
|
movdqa xmm2, xmm0
|
|
paddw xmm0, xmm1
|
|
psubw xmm2, xmm1
|
|
|
|
;load top boundary samples: [a b c d]
|
|
mov r0, arg1
|
|
mov r1, arg2
|
|
SIGN_EXTENSION r1, r1d
|
|
sub r0, r1
|
|
%ifdef UNIX64
|
|
push r4
|
|
push r5
|
|
%endif
|
|
|
|
movzx r2d, byte [r0]
|
|
movzx r3d, byte [r0+1]
|
|
movzx r4d, byte [r0+2]
|
|
movzx r5d, byte [r0+3]
|
|
|
|
; get the transform results of top boundary samples: [a b c d]
|
|
add r3d, r2d ; r3d = a + b
|
|
add r5d, r4d ; r5d = c + d
|
|
add r2d, r2d ; r2d = a + a
|
|
add r4d, r4d ; r4d = c + c
|
|
sub r2d, r3d ; r2d = a + a - a - b = a - b
|
|
sub r4d, r5d ; r4d = c + c - c - d = c - d
|
|
add r5d, r3d ; r5d = (a + b) + (c + d)
|
|
add r3d, r3d
|
|
sub r3d, r5d ; r3d = (a + b) - (c + d)
|
|
add r4d, r2d ; r4d = (a - b) + (c - d)
|
|
add r2d, r2d
|
|
sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
|
|
|
|
movdqa xmm6, xmm0
|
|
movdqa xmm7, xmm2
|
|
movd xmm5, r5d ; store the edi for DC mode
|
|
pxor xmm3, xmm3
|
|
pxor xmm4, xmm4
|
|
pinsrw xmm3, r5d, 0
|
|
pinsrw xmm3, r4d, 4
|
|
psllw xmm3, 2
|
|
pinsrw xmm4, r3d, 0
|
|
pinsrw xmm4, r2d, 4
|
|
psllw xmm4, 2
|
|
|
|
; get the satd of H
|
|
psubw xmm0, xmm3
|
|
psubw xmm2, xmm4
|
|
|
|
WELS_AbsW xmm0, xmm1
|
|
WELS_AbsW xmm2, xmm1
|
|
paddusw xmm0, xmm2
|
|
SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
|
|
|
|
;load left boundary samples: [a b c d]'
|
|
add r0, r1
|
|
|
|
movzx r2d, byte [r0-1]
|
|
movzx r3d, byte [r0+r1-1]
|
|
lea r0 , [r0+2*r1]
|
|
movzx r4d, byte [r0-1]
|
|
movzx r5d, byte [r0+r1-1]
|
|
|
|
; get the transform results of left boundary samples: [a b c d]'
|
|
add r3d, r2d ; r3d = a + b
|
|
add r5d, r4d ; r5d = c + d
|
|
add r2d, r2d ; r2d = a + a
|
|
add r4d, r4d ; r4d = c + c
|
|
sub r2d, r3d ; r2d = a + a - a - b = a - b
|
|
sub r4d, r5d ; r4d = c + c - c - d = c - d
|
|
add r5d, r3d ; r5d = (a + b) + (c + d)
|
|
add r3d, r3d
|
|
sub r3d, r5d ; r3d = (a + b) - (c + d)
|
|
add r4d, r2d ; r4d = (a - b) + (c - d)
|
|
add r2d, r2d
|
|
sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
|
|
|
|
; store the transform results in xmm3
|
|
movd xmm3, r5d
|
|
pinsrw xmm3, r3d, 1
|
|
pinsrw xmm3, r2d, 2
|
|
pinsrw xmm3, r4d, 3
|
|
psllw xmm3, 2
|
|
|
|
; get the satd of V
|
|
movdqa xmm2, xmm6
|
|
movdqa xmm4, xmm7
|
|
psubw xmm2, xmm3
|
|
WELS_AbsW xmm2, xmm1
|
|
WELS_AbsW xmm4, xmm1
|
|
paddusw xmm2, xmm4
|
|
SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
|
|
|
|
; DC result is stored in xmm1
|
|
add r5d, 4
|
|
movd xmm1, r5d
|
|
paddw xmm1, xmm5
|
|
psrlw xmm1, 3
|
|
movdqa xmm5, xmm1
|
|
psllw xmm1, 4
|
|
|
|
; get the satd of DC
|
|
psubw xmm6, xmm1
|
|
WELS_AbsW xmm6, xmm1
|
|
WELS_AbsW xmm7, xmm1
|
|
paddusw xmm6, xmm7
|
|
SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
|
|
%ifdef UNIX64
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
; comparing order: DC H V
|
|
|
|
mov r4, arg5
|
|
movd r2d, xmm6
|
|
movd r3d, xmm2
|
|
movd r6d, xmm0
|
|
|
|
and r2d, 0xffff
|
|
shr r2d, 1
|
|
and r3d, 0xffff
|
|
shr r3d, 1
|
|
and r6d, 0xffff
|
|
shr r6d, 1
|
|
add r2d, dword arg7
|
|
add r3d, dword arg8
|
|
add r6d, dword arg9
|
|
cmp r2w, r3w
|
|
jg near not_dc
|
|
cmp r2w, r6w
|
|
jg near not_dc_h
|
|
|
|
; for DC mode
|
|
movd r3d, xmm5
|
|
imul r3d, 0x01010101
|
|
movd xmm5, r3d
|
|
pshufd xmm5, xmm5, 0
|
|
movdqa [r4], xmm5
|
|
mov r5, arg6
|
|
mov dword [r5], 0x02
|
|
mov retrd, r2d
|
|
POP_XMM
|
|
%ifdef X86_32
|
|
pop r6
|
|
pop r5
|
|
pop r4
|
|
pop r3
|
|
%endif
|
|
ret
|
|
|
|
not_dc:
|
|
cmp r3w, r6w
|
|
jg near not_dc_h
|
|
|
|
; for H mode
|
|
SSE_DB_1_2REG xmm6, xmm7
|
|
sub r0, r1
|
|
sub r0, r1
|
|
movzx r6d, byte [r0-1]
|
|
movd xmm0, r6d
|
|
pmuludq xmm0, xmm6
|
|
|
|
movzx r6d, byte [r0+r1-1]
|
|
movd xmm1, r6d
|
|
pmuludq xmm1, xmm6
|
|
punpckldq xmm0, xmm1
|
|
|
|
lea r0, [r0+r1*2]
|
|
movzx r6d, byte [r0-1]
|
|
movd xmm2, r6d
|
|
pmuludq xmm2, xmm6
|
|
|
|
movzx r6d, byte [r0+r1-1]
|
|
movd xmm3, r6d
|
|
pmuludq xmm3, xmm6
|
|
punpckldq xmm2, xmm3
|
|
punpcklqdq xmm0, xmm2
|
|
|
|
movdqa [r4],xmm0
|
|
|
|
mov retrd, r3d
|
|
mov r5, arg6
|
|
mov dword [r5], 0x01
|
|
POP_XMM
|
|
%ifdef X86_32
|
|
pop r6
|
|
pop r5
|
|
pop r4
|
|
pop r3
|
|
%endif
|
|
ret
|
|
not_dc_h:
|
|
sub r0, r1
|
|
sub r0, r1
|
|
sub r0, r1
|
|
movd xmm0, [r0]
|
|
pshufd xmm0, xmm0, 0
|
|
movdqa [r4],xmm0
|
|
mov retrd, r6d
|
|
mov r5, arg6
|
|
mov dword [r5], 0x00
|
|
POP_XMM
|
|
%ifdef X86_32
|
|
pop r6
|
|
pop r5
|
|
pop r4
|
|
pop r3
|
|
%endif
|
|
ret
|
|
|
|
|
|
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
|
|
pmaddubsw %1, xmm5
|
|
movdqa %2, %1
|
|
pmaddwd %1, xmm7
|
|
pmaddwd %2, xmm6
|
|
movdqa %3, %1
|
|
punpckldq %1, %2
|
|
punpckhdq %2, %3
|
|
movdqa %3, %1
|
|
punpcklqdq %1, %2
|
|
punpckhqdq %3, %2
|
|
paddd xmm4, %1 ;for dc
|
|
paddd xmm4, %3 ;for dc
|
|
packssdw %1, %3
|
|
psllw %1, 2
|
|
%endmacro
|
|
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
|
|
pmaddubsw %1, xmm5
|
|
movdqa %2, %1
|
|
pmaddwd %1, xmm7
|
|
pmaddwd %2, xmm6
|
|
movdqa %3, %1
|
|
punpckldq %1, %2
|
|
punpckhdq %2, %3
|
|
movdqa %3, %1
|
|
punpcklqdq %1, %2
|
|
punpckhqdq %3, %2
|
|
; paddd xmm4, %1 ;for dc
|
|
; paddd xmm4, %3 ;for dc
|
|
movdqa %4, %1
|
|
punpcklqdq %4, %3
|
|
packssdw %1, %3
|
|
psllw %1, 2
|
|
%endmacro
|
|
|
|
%macro SSE41_GetX38x4SatdDec 0
|
|
pxor xmm7, xmm7
|
|
movq xmm0, [r2]
|
|
movq xmm1, [r2+r3]
|
|
lea r2, [r2+2*r3]
|
|
movq xmm2, [r2]
|
|
movq xmm3, [r2+r3]
|
|
lea r2, [r2+2*r3]
|
|
punpcklbw xmm0, xmm7
|
|
punpcklbw xmm1, xmm7
|
|
punpcklbw xmm2, xmm7
|
|
punpcklbw xmm3, xmm7
|
|
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
|
|
;doesn't need another transpose
|
|
%endmacro
|
|
|
|
%macro SSE41_GetX38x4SatdV 2
|
|
pxor xmm0, xmm0
|
|
pinsrw xmm0, word[r6+%2], 0
|
|
pinsrw xmm0, word[r6+%2+8], 4
|
|
psubsw xmm0, xmm7
|
|
pabsw xmm0, xmm0
|
|
paddw xmm4, xmm0
|
|
pxor xmm0, xmm0
|
|
pinsrw xmm0, word[r6+%2+2], 0
|
|
pinsrw xmm0, word[r6+%2+10], 4
|
|
psubsw xmm0, xmm1
|
|
pabsw xmm0, xmm0
|
|
paddw xmm4, xmm0
|
|
pxor xmm0, xmm0
|
|
pinsrw xmm0, word[r6+%2+4], 0
|
|
pinsrw xmm0, word[r6+%2+12], 4
|
|
psubsw xmm0, xmm3
|
|
pabsw xmm0, xmm0
|
|
paddw xmm4, xmm0
|
|
pxor xmm0, xmm0
|
|
pinsrw xmm0, word[r6+%2+6], 0
|
|
pinsrw xmm0, word[r6+%2+14], 4
|
|
psubsw xmm0, xmm2
|
|
pabsw xmm0, xmm0
|
|
paddw xmm4, xmm0
|
|
%endmacro
|
|
%macro SSE41_GetX38x4SatdH 3
|
|
movq xmm0, [r6+%3+8*%1]
|
|
punpcklqdq xmm0, xmm0
|
|
psubsw xmm0, xmm7
|
|
pabsw xmm0, xmm0
|
|
paddw xmm5, xmm0
|
|
pabsw xmm1, xmm1
|
|
pabsw xmm2, xmm2
|
|
pabsw xmm3, xmm3
|
|
paddw xmm2, xmm1;for DC
|
|
paddw xmm2, xmm3;for DC
|
|
paddw xmm5, xmm2
|
|
%endmacro
|
|
%macro SSE41_I16X16GetX38x4SatdDC 0
|
|
pxor xmm0, xmm0
|
|
movq2dq xmm0, mm4
|
|
punpcklqdq xmm0, xmm0
|
|
psubsw xmm0, xmm7
|
|
pabsw xmm0, xmm0
|
|
paddw xmm6, xmm0
|
|
paddw xmm6, xmm2
|
|
%endmacro
|
|
%macro SSE41_ChromaGetX38x4SatdDC 1
|
|
shl %1, 4
|
|
movdqa xmm0, [r6+32+%1]
|
|
psubsw xmm0, xmm7
|
|
pabsw xmm0, xmm0
|
|
paddw xmm6, xmm0
|
|
paddw xmm6, xmm2
|
|
%endmacro
|
|
%macro SSE41_I16x16GetX38x4Satd 2
|
|
SSE41_GetX38x4SatdDec
|
|
SSE41_GetX38x4SatdV %1, %2
|
|
SSE41_GetX38x4SatdH %1, %2, 32
|
|
SSE41_I16X16GetX38x4SatdDC
|
|
%endmacro
|
|
%macro SSE41_ChromaGetX38x4Satd 2
|
|
SSE41_GetX38x4SatdDec
|
|
SSE41_GetX38x4SatdV %1, %2
|
|
SSE41_GetX38x4SatdH %1, %2, 16
|
|
SSE41_ChromaGetX38x4SatdDC %1
|
|
%endmacro
|
|
%macro SSE41_HSum8W 3
|
|
pmaddwd %1, %2
|
|
movhlps %3, %1
|
|
paddd %1, %3
|
|
pshuflw %3, %1,0Eh
|
|
paddd %1, %3
|
|
%endmacro
|
|
|
|
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
|
|
%assign push_num 0
|
|
LOAD_7_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
SIGN_EXTENSION r5, r5d
|
|
|
|
%ifndef X86_32
|
|
push r12
|
|
mov r12, r2
|
|
%endif
|
|
|
|
pxor xmm4, xmm4
|
|
movdqa xmm5, [HSumSubDB1]
|
|
movdqa xmm6, [HSumSubDW1]
|
|
movdqa xmm7, [PDW1]
|
|
sub r0, r1
|
|
movdqu xmm0, [r0]
|
|
movhlps xmm1, xmm0
|
|
punpcklqdq xmm0, xmm0
|
|
punpcklqdq xmm1, xmm1
|
|
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
|
|
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
|
|
movdqa [r6], xmm0 ;V
|
|
movdqa [r6+16], xmm1
|
|
add r0, r1
|
|
pinsrb xmm0, byte[r0-1], 0
|
|
pinsrb xmm0, byte[r0+r1-1], 1
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 2
|
|
pinsrb xmm0, byte[r0+r1-1], 3
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 4
|
|
pinsrb xmm0, byte[r0+r1-1], 5
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 6
|
|
pinsrb xmm0, byte[r0+r1-1], 7
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 8
|
|
pinsrb xmm0, byte[r0+r1-1], 9
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 10
|
|
pinsrb xmm0, byte[r0+r1-1], 11
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 12
|
|
pinsrb xmm0, byte[r0+r1-1], 13
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 14
|
|
pinsrb xmm0, byte[r0+r1-1], 15
|
|
movhlps xmm1, xmm0
|
|
punpcklqdq xmm0, xmm0
|
|
punpcklqdq xmm1, xmm1
|
|
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
|
|
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
|
|
movdqa [r6+32], xmm0 ;H
|
|
movdqa [r6+48], xmm1
|
|
movd r0d, xmm4 ;dc
|
|
add r0d, 16 ;(sum+16)
|
|
shr r0d, 5 ;((sum+16)>>5)
|
|
shl r0d, 4 ;
|
|
movd mm4, r0d ; mm4 copy DC
|
|
pxor xmm4, xmm4 ;V
|
|
pxor xmm5, xmm5 ;H
|
|
pxor xmm6, xmm6 ;DC
|
|
%ifdef UNIX64
|
|
push r4
|
|
%endif
|
|
mov r0, 0
|
|
mov r4, 0
|
|
|
|
.loop16x16_get_satd:
|
|
.loopStart1:
|
|
SSE41_I16x16GetX38x4Satd r0, r4
|
|
inc r0
|
|
cmp r0, 4
|
|
jl .loopStart1
|
|
cmp r4, 16
|
|
je .loop16x16_get_satd_end
|
|
%ifdef X86_32
|
|
mov r2, arg3
|
|
%else
|
|
mov r2, r12
|
|
%endif
|
|
add r2, 8
|
|
mov r0, 0
|
|
add r4, 16
|
|
jmp .loop16x16_get_satd
|
|
.loop16x16_get_satd_end:
|
|
MMX_DW_1_2REG xmm0, xmm1
|
|
psrlw xmm4, 1 ;/2
|
|
psrlw xmm5, 1 ;/2
|
|
psrlw xmm6, 1 ;/2
|
|
SSE41_HSum8W xmm4, xmm0, xmm1
|
|
SSE41_HSum8W xmm5, xmm0, xmm1
|
|
SSE41_HSum8W xmm6, xmm0, xmm1
|
|
|
|
%ifdef UNIX64
|
|
pop r4
|
|
%endif
|
|
; comparing order: DC H V
|
|
movd r3d, xmm6 ;DC
|
|
movd r1d, xmm5 ;H
|
|
movd r0d, xmm4 ;V
|
|
%ifndef X86_32
|
|
pop r12
|
|
%endif
|
|
shl r5d, 1
|
|
add r1d, r5d
|
|
add r3d, r5d
|
|
mov r4, arg5
|
|
cmp r3d, r1d
|
|
jge near not_dc_16x16
|
|
cmp r3d, r0d
|
|
jge near not_dc_h_16x16
|
|
|
|
; for DC mode
|
|
mov dword[r4], 2;I16_PRED_DC
|
|
mov retrd, r3d
|
|
jmp near return_satd_intra_16x16_x3
|
|
not_dc_16x16:
|
|
; for H mode
|
|
cmp r1d, r0d
|
|
jge near not_dc_h_16x16
|
|
mov dword[r4], 1;I16_PRED_H
|
|
mov retrd, r1d
|
|
jmp near return_satd_intra_16x16_x3
|
|
not_dc_h_16x16:
|
|
; for V mode
|
|
mov dword[r4], 0;I16_PRED_V
|
|
mov retrd, r0d
|
|
return_satd_intra_16x16_x3:
|
|
WELSEMMS
|
|
POP_XMM
|
|
LOAD_7_PARA_POP
|
|
ret
|
|
|
|
%macro SSE41_ChromaGetX38x8Satd 0
|
|
movdqa xmm5, [HSumSubDB1]
|
|
movdqa xmm6, [HSumSubDW1]
|
|
movdqa xmm7, [PDW1]
|
|
sub r0, r1
|
|
movq xmm0, [r0]
|
|
punpcklqdq xmm0, xmm0
|
|
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
|
|
movdqa [r6], xmm0 ;V
|
|
add r0, r1
|
|
pinsrb xmm0, byte[r0-1], 0
|
|
pinsrb xmm0, byte[r0+r1-1], 1
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 2
|
|
pinsrb xmm0, byte[r0+r1-1], 3
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 4
|
|
pinsrb xmm0, byte[r0+r1-1], 5
|
|
lea r0, [r0+2*r1]
|
|
pinsrb xmm0, byte[r0-1], 6
|
|
pinsrb xmm0, byte[r0+r1-1], 7
|
|
punpcklqdq xmm0, xmm0
|
|
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
|
|
movdqa [r6+16], xmm0 ;H
|
|
;(sum+2)>>2
|
|
movdqa xmm6, [PDQ2]
|
|
movdqa xmm5, xmm4
|
|
punpckhqdq xmm5, xmm1
|
|
paddd xmm5, xmm6
|
|
psrld xmm5, 2
|
|
;(sum1+sum2+4)>>3
|
|
paddd xmm6, xmm6
|
|
paddd xmm4, xmm1
|
|
paddd xmm4, xmm6
|
|
psrld xmm4, 3
|
|
;satd *16
|
|
pslld xmm5, 4
|
|
pslld xmm4, 4
|
|
;temp satd
|
|
movdqa xmm6, xmm4
|
|
punpcklqdq xmm4, xmm5
|
|
psllq xmm4, 32
|
|
psrlq xmm4, 32
|
|
movdqa [r6+32], xmm4
|
|
punpckhqdq xmm5, xmm6
|
|
psllq xmm5, 32
|
|
psrlq xmm5, 32
|
|
movdqa [r6+48], xmm5
|
|
|
|
pxor xmm4, xmm4 ;V
|
|
pxor xmm5, xmm5 ;H
|
|
pxor xmm6, xmm6 ;DC
|
|
mov r0, 0
|
|
SSE41_ChromaGetX38x4Satd r0, 0
|
|
inc r0
|
|
SSE41_ChromaGetX38x4Satd r0, 0
|
|
%endmacro
|
|
|
|
%macro SSEReg2MMX 3
|
|
movdq2q %2, %1
|
|
movhlps %1, %1
|
|
movdq2q %3, %1
|
|
%endmacro
|
|
%macro MMXReg2SSE 4
|
|
movq2dq %1, %3
|
|
movq2dq %2, %4
|
|
punpcklqdq %1, %2
|
|
%endmacro
|
|
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
|
|
|
|
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
|
|
%assign push_num 0
|
|
LOAD_7_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
SIGN_EXTENSION r5, r5d
|
|
loop_chroma_satdx3:
|
|
SSE41_ChromaGetX38x8Satd
|
|
SSEReg2MMX xmm4, mm0,mm1
|
|
SSEReg2MMX xmm5, mm2,mm3
|
|
SSEReg2MMX xmm6, mm5,mm6
|
|
mov r0, arg8
|
|
mov r2, arg9
|
|
|
|
SSE41_ChromaGetX38x8Satd
|
|
|
|
MMXReg2SSE xmm0, xmm3, mm0, mm1
|
|
MMXReg2SSE xmm1, xmm3, mm2, mm3
|
|
MMXReg2SSE xmm2, xmm3, mm5, mm6
|
|
|
|
paddw xmm4, xmm0
|
|
paddw xmm5, xmm1
|
|
paddw xmm6, xmm2
|
|
|
|
MMX_DW_1_2REG xmm0, xmm1
|
|
psrlw xmm4, 1 ;/2
|
|
psrlw xmm5, 1 ;/2
|
|
psrlw xmm6, 1 ;/2
|
|
SSE41_HSum8W xmm4, xmm0, xmm1
|
|
SSE41_HSum8W xmm5, xmm0, xmm1
|
|
SSE41_HSum8W xmm6, xmm0, xmm1
|
|
; comparing order: DC H V
|
|
movd r3d, xmm6 ;DC
|
|
movd r1d, xmm5 ;H
|
|
movd r0d, xmm4 ;V
|
|
|
|
|
|
shl r5d, 1
|
|
add r1d, r5d
|
|
add r0d, r5d
|
|
cmp r3d, r1d
|
|
jge near not_dc_8x8
|
|
cmp r3d, r0d
|
|
jge near not_dc_h_8x8
|
|
|
|
; for DC mode
|
|
mov dword[r4], 0;I8_PRED_DC
|
|
mov retrd, r3d
|
|
jmp near return_satd_intra_8x8_x3
|
|
not_dc_8x8:
|
|
; for H mode
|
|
cmp r1d, r0d
|
|
jge near not_dc_h_8x8
|
|
mov dword[r4], 1;I8_PRED_H
|
|
mov retrd, r1d
|
|
jmp near return_satd_intra_8x8_x3
|
|
not_dc_h_8x8:
|
|
; for V mode
|
|
mov dword[r4], 2;I8_PRED_V
|
|
mov retrd, r0d
|
|
return_satd_intra_8x8_x3:
|
|
WELSEMMS
|
|
POP_XMM
|
|
LOAD_7_PARA_POP
|
|
ret
|
|
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_intra_sse2 END
|
|
;
|
|
;***********************************************************************
|
|
%macro SSSE3_Get16BSadHVDC 2
|
|
movd xmm6,%1
|
|
pshufb xmm6,xmm1
|
|
movdqa %1, xmm6
|
|
movdqa xmm0,%2
|
|
psadbw xmm0,xmm7
|
|
paddw xmm4,xmm0
|
|
movdqa xmm0,%2
|
|
psadbw xmm0,xmm5
|
|
paddw xmm2,xmm0
|
|
psadbw xmm6,%2
|
|
paddw xmm3,xmm6
|
|
%endmacro
|
|
%macro WelsAddDCValue 4
|
|
movzx %2, byte %1
|
|
mov %3, %2
|
|
add %4, %2
|
|
%endmacro
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_intra_ssse3 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
|
|
%assign push_num 0
|
|
LOAD_7_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
SIGN_EXTENSION r5, r5d
|
|
|
|
push r5
|
|
push r4
|
|
push r3
|
|
|
|
sub r0, r1
|
|
movdqa xmm5,[r0]
|
|
pxor xmm0,xmm0
|
|
psadbw xmm0,xmm5
|
|
movhlps xmm1,xmm0
|
|
paddw xmm0,xmm1
|
|
movd r5d, xmm0
|
|
|
|
add r0,r1
|
|
lea r3,[r1+2*r1] ;ebx r3
|
|
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d
|
|
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
|
|
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
|
|
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
|
|
lea r0, [r0+4*r1]
|
|
add r6, 64
|
|
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
|
|
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
|
|
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
|
|
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
|
|
lea r0, [r0+4*r1]
|
|
add r6, 64
|
|
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
|
|
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
|
|
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
|
|
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
|
|
lea r0, [r0+4*r1]
|
|
add r6, 64
|
|
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
|
|
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
|
|
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
|
|
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
|
|
sub r6, 192
|
|
add r5d,10h
|
|
shr r5d,5
|
|
movd xmm7,r5d
|
|
pxor xmm1,xmm1
|
|
pshufb xmm7,xmm1
|
|
pxor xmm4,xmm4
|
|
pxor xmm3,xmm3
|
|
pxor xmm2,xmm2
|
|
;sad begin
|
|
pop r3
|
|
lea r4, [r3+2*r3] ;esi r4
|
|
SSSE3_Get16BSadHVDC [r6], [r2]
|
|
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
|
|
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
|
|
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
|
|
add r6, 64
|
|
lea r2, [r2+4*r3]
|
|
SSSE3_Get16BSadHVDC [r6], [r2]
|
|
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
|
|
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
|
|
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
|
|
add r6, 64
|
|
lea r2, [r2+4*r3]
|
|
SSSE3_Get16BSadHVDC [r6], [r2]
|
|
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
|
|
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
|
|
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
|
|
add r6, 64
|
|
lea r2, [r2+4*r3]
|
|
SSSE3_Get16BSadHVDC [r6], [r2]
|
|
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
|
|
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
|
|
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
|
|
|
|
pop r4
|
|
pop r5
|
|
pslldq xmm3,4
|
|
por xmm3,xmm2
|
|
movhlps xmm1,xmm3
|
|
paddw xmm3,xmm1
|
|
movhlps xmm0,xmm4
|
|
paddw xmm4,xmm0
|
|
; comparing order: DC H V
|
|
movd r1d, xmm4 ;DC ;ebx r1d
|
|
movd r0d, xmm3 ;V ;ecx r0d
|
|
psrldq xmm3, 4
|
|
movd r2d, xmm3 ;H ;esi r2d
|
|
|
|
;mov eax, [esp+36] ;lamda ;eax r5
|
|
shl r5d, 1
|
|
add r2d, r5d
|
|
add r1d, r5d
|
|
;mov edx, [esp+32] ;edx r4
|
|
cmp r1d, r2d
|
|
jge near not_dc_16x16_sad
|
|
cmp r1d, r0d
|
|
jge near not_dc_h_16x16_sad
|
|
; for DC mode
|
|
mov dword[r4], 2;I16_PRED_DC
|
|
mov retrd, r1d
|
|
sub r6, 192
|
|
%assign x 0
|
|
%rep 16
|
|
movdqa [r6+16*x], xmm7
|
|
%assign x x+1
|
|
%endrep
|
|
jmp near return_sad_intra_16x16_x3
|
|
not_dc_16x16_sad:
|
|
; for H mode
|
|
cmp r2d, r0d
|
|
jge near not_dc_h_16x16_sad
|
|
mov dword[r4], 1;I16_PRED_H
|
|
mov retrd, r2d
|
|
jmp near return_sad_intra_16x16_x3
|
|
not_dc_h_16x16_sad:
|
|
; for V mode
|
|
mov dword[r4], 0;I16_PRED_V
|
|
mov retrd, r0d
|
|
sub r6, 192
|
|
%assign x 0
|
|
%rep 16
|
|
movdqa [r6+16*x], xmm5
|
|
%assign x x+1
|
|
%endrep
|
|
return_sad_intra_16x16_x3:
|
|
POP_XMM
|
|
LOAD_7_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_intra_ssse3 END
|
|
;
|
|
;***********************************************************************
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_wxh_sse41 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
|
|
;SSE4.1
|
|
%macro SSE41_GetSatd8x4 0
|
|
movq xmm0, [r0]
|
|
punpcklqdq xmm0, xmm0
|
|
pmaddubsw xmm0, xmm7
|
|
movq xmm1, [r0+r1]
|
|
punpcklqdq xmm1, xmm1
|
|
pmaddubsw xmm1, xmm7
|
|
movq xmm2, [r2]
|
|
punpcklqdq xmm2, xmm2
|
|
pmaddubsw xmm2, xmm7
|
|
movq xmm3, [r2+r3]
|
|
punpcklqdq xmm3, xmm3
|
|
pmaddubsw xmm3, xmm7
|
|
psubsw xmm0, xmm2
|
|
psubsw xmm1, xmm3
|
|
movq xmm2, [r0+2*r1]
|
|
punpcklqdq xmm2, xmm2
|
|
pmaddubsw xmm2, xmm7
|
|
movq xmm3, [r0+r4]
|
|
punpcklqdq xmm3, xmm3
|
|
pmaddubsw xmm3, xmm7
|
|
movq xmm4, [r2+2*r3]
|
|
punpcklqdq xmm4, xmm4
|
|
pmaddubsw xmm4, xmm7
|
|
movq xmm5, [r2+r5]
|
|
punpcklqdq xmm5, xmm5
|
|
pmaddubsw xmm5, xmm7
|
|
psubsw xmm2, xmm4
|
|
psubsw xmm3, xmm5
|
|
SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
|
|
pabsw xmm0, xmm0
|
|
pabsw xmm2, xmm2
|
|
pabsw xmm1, xmm1
|
|
pabsw xmm3, xmm3
|
|
movdqa xmm4, xmm3
|
|
pblendw xmm3, xmm1, 0xAA
|
|
pslld xmm1, 16
|
|
psrld xmm4, 16
|
|
por xmm1, xmm4
|
|
pmaxuw xmm1, xmm3
|
|
paddw xmm6, xmm1
|
|
movdqa xmm4, xmm0
|
|
pblendw xmm0, xmm2, 0xAA
|
|
pslld xmm2, 16
|
|
psrld xmm4, 16
|
|
por xmm2, xmm4
|
|
pmaxuw xmm0, xmm2
|
|
paddw xmm6, xmm0
|
|
%endmacro
|
|
|
|
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
|
|
MMX_DW_1_2REG %3, %4
|
|
pmaddwd %2, %3
|
|
movhlps %4, %2
|
|
paddd %2, %4
|
|
pshuflw %4, %2,0Eh
|
|
paddd %2, %4
|
|
movd %1, %2
|
|
%endmacro
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd4x4_sse41
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movdqa xmm4,[HSwapSumSubDB1]
|
|
movd xmm2,[r2]
|
|
movd xmm5,[r2+r3]
|
|
shufps xmm2,xmm5,0
|
|
movd xmm3,[r2+r3*2]
|
|
lea r2, [r3*2+r2]
|
|
movd xmm5,[r2+r3]
|
|
shufps xmm3,xmm5,0
|
|
movd xmm0,[r0]
|
|
movd xmm5,[r0+r1]
|
|
shufps xmm0,xmm5,0
|
|
movd xmm1,[r0+r1*2]
|
|
lea r0, [r1*2+r0]
|
|
movd xmm5,[r0+r1]
|
|
shufps xmm1,xmm5,0
|
|
pmaddubsw xmm0,xmm4
|
|
pmaddubsw xmm1,xmm4
|
|
pmaddubsw xmm2,xmm4
|
|
pmaddubsw xmm3,xmm4
|
|
psubw xmm0,xmm2
|
|
psubw xmm1,xmm3
|
|
movdqa xmm2,xmm0
|
|
paddw xmm0,xmm1
|
|
psubw xmm1,xmm2
|
|
movdqa xmm2,xmm0
|
|
punpcklqdq xmm0,xmm1
|
|
punpckhqdq xmm2,xmm1
|
|
movdqa xmm1,xmm0
|
|
paddw xmm0,xmm2
|
|
psubw xmm2,xmm1
|
|
movdqa xmm1,xmm0
|
|
pblendw xmm0,xmm2,0AAh
|
|
pslld xmm2,16
|
|
psrld xmm1,16
|
|
por xmm2,xmm1
|
|
pabsw xmm0,xmm0
|
|
pabsw xmm2,xmm2
|
|
pmaxsw xmm0,xmm2
|
|
SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd8x8_sse41
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
%endif
|
|
%assign push_num 2
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movdqa xmm7, [HSumSubDB1]
|
|
lea r4, [r1+r1*2]
|
|
lea r5, [r3+r3*2]
|
|
pxor xmm6, xmm6
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE41_GetSatd8x4
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd8x16_sse41
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
push r6
|
|
%endif
|
|
%assign push_num 3
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movdqa xmm7, [HSumSubDB1]
|
|
lea r4, [r1+r1*2]
|
|
lea r5, [r3+r3*2]
|
|
pxor xmm6, xmm6
|
|
mov r6, 0
|
|
loop_get_satd_8x16:
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
inc r6
|
|
cmp r6, 4
|
|
jl loop_get_satd_8x16
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r6
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSatd16x8_sse41
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
%endif
|
|
%assign push_num 2
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
push r0
|
|
push r2
|
|
|
|
movdqa xmm7, [HSumSubDB1]
|
|
lea r4, [r1+r1*2]
|
|
lea r5, [r3+r3*2]
|
|
pxor xmm6, xmm6
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE41_GetSatd8x4
|
|
|
|
pop r2
|
|
pop r0
|
|
add r0, 8
|
|
add r2, 8
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE41_GetSatd8x4
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
|
|
;
|
|
;***********************************************************************
|
|
|
|
WELS_EXTERN WelsSampleSatd16x16_sse41
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
push r6
|
|
%endif
|
|
%assign push_num 3
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
|
|
push r0
|
|
push r2
|
|
|
|
movdqa xmm7, [HSumSubDB1]
|
|
lea r4, [r1+r1*2]
|
|
lea r5, [r3+r3*2]
|
|
pxor xmm6, xmm6
|
|
mov r6, 0
|
|
loop_get_satd_16x16_left:
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
inc r6
|
|
cmp r6, 4
|
|
jl loop_get_satd_16x16_left
|
|
|
|
pop r2
|
|
pop r0
|
|
add r0, 8
|
|
add r2, 8
|
|
mov r6, 0
|
|
loop_get_satd_16x16_right:
|
|
SSE41_GetSatd8x4
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
inc r6
|
|
cmp r6, 4
|
|
jl loop_get_satd_16x16_right
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r6
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_satd_wxh_sse41 END
|
|
;
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_wxh_sse2 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
|
|
%macro SSE2_GetSad2x16 0
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqu xmm1, [r2]
|
|
MOVDQ xmm2, [r0];[eax] must aligned 16
|
|
psadbw xmm1, xmm2
|
|
paddw xmm0, xmm1
|
|
movdqu xmm1, [r2+r3]
|
|
MOVDQ xmm2, [r0+r1]
|
|
psadbw xmm1, xmm2
|
|
paddw xmm0, xmm1
|
|
%endmacro
|
|
|
|
|
|
%macro SSE2_GetSad4x16 0
|
|
movdqu xmm0, [r2]
|
|
MOVDQ xmm2, [r0]
|
|
psadbw xmm0, xmm2
|
|
paddw xmm7, xmm0
|
|
movdqu xmm1, [r2+r3]
|
|
MOVDQ xmm2, [r0+r1]
|
|
psadbw xmm1, xmm2
|
|
paddw xmm7, xmm1
|
|
movdqu xmm1, [r2+2*r3]
|
|
MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
|
|
psadbw xmm1, xmm2
|
|
paddw xmm7, xmm1
|
|
movdqu xmm1, [r2+r5]
|
|
MOVDQ xmm2, [r0+r4]
|
|
psadbw xmm1, xmm2
|
|
paddw xmm7, xmm1
|
|
%endmacro
|
|
|
|
|
|
%macro SSE2_GetSad8x4 0
|
|
movq xmm0, [r0]
|
|
movq xmm1, [r0+r1]
|
|
lea r0, [r0+2*r1]
|
|
movhps xmm0, [r0]
|
|
movhps xmm1, [r0+r1]
|
|
|
|
movq xmm2, [r2]
|
|
movq xmm3, [r2+r3]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm2, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm2
|
|
psadbw xmm1, xmm3
|
|
paddw xmm6, xmm0
|
|
paddw xmm6, xmm1
|
|
%endmacro
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
|
|
;First parameter can align to 16 bytes,
|
|
;In wels, the third parameter can't align to 16 bytes.
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSad16x16_sse2
|
|
%ifdef X86_32
|
|
push r4
|
|
push r5
|
|
%endif
|
|
|
|
%assign push_num 2
|
|
LOAD_4_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
lea r4, [3*r1]
|
|
lea r5, [3*r3]
|
|
|
|
pxor xmm7, xmm7
|
|
SSE2_GetSad4x16
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE2_GetSad4x16
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE2_GetSad4x16
|
|
lea r0, [r0+4*r1]
|
|
lea r2, [r2+4*r3]
|
|
SSE2_GetSad4x16
|
|
movhlps xmm0, xmm7
|
|
paddw xmm0, xmm7
|
|
movd retrd, xmm0
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
%ifdef X86_32
|
|
pop r5
|
|
pop r4
|
|
%endif
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
|
|
;First parameter can align to 16 bytes,
|
|
;In wels, the third parameter can't align to 16 bytes.
|
|
;
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSad16x8_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movdqu xmm0, [r2]
|
|
MOVDQ xmm2, [r0]
|
|
psadbw xmm0, xmm2
|
|
movdqu xmm1, [r2+r3]
|
|
MOVDQ xmm2, [r0+r1]
|
|
psadbw xmm1, xmm2
|
|
paddw xmm0, xmm1
|
|
|
|
SSE2_GetSad2x16
|
|
SSE2_GetSad2x16
|
|
SSE2_GetSad2x16
|
|
|
|
movhlps xmm1, xmm0
|
|
paddw xmm0, xmm1
|
|
movd retrd, xmm0
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
|
|
|
|
WELS_EXTERN WelsSampleSad8x16_sse2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 7
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm6, xmm6
|
|
|
|
SSE2_GetSad8x4
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSad8x4
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSad8x4
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSad8x4
|
|
|
|
movhlps xmm0, xmm6
|
|
paddw xmm0, xmm6
|
|
movd retrd, xmm0
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
ret
|
|
|
|
|
|
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
|
|
and %1, 0x1f|(%3>>1)
|
|
cmp %1, (32-%2)|(%3>>1)
|
|
%endmacro
|
|
|
|
WELS_EXTERN WelsSampleSad8x8_sse21
|
|
%assign push_num 0
|
|
mov r2, arg3
|
|
push r2
|
|
CACHE_SPLIT_CHECK r2, 8, 64
|
|
jle near .pixel_sad_8x8_nsplit
|
|
pop r2
|
|
%ifdef X86_32
|
|
push r3
|
|
push r4
|
|
push r5
|
|
%endif
|
|
%assign push_num 3
|
|
PUSH_XMM 8
|
|
mov r0, arg1
|
|
mov r1, arg2
|
|
SIGN_EXTENSION r1, r1d
|
|
pxor xmm7, xmm7
|
|
|
|
;ecx r2, edx r4, edi r5
|
|
|
|
mov r5, r2
|
|
and r5, 0x07
|
|
sub r2, r5
|
|
mov r4, 8
|
|
sub r4, r5
|
|
|
|
shl r5, 3
|
|
shl r4, 3
|
|
movd xmm5, r5d
|
|
movd xmm6, r4d
|
|
mov r5, 8
|
|
add r5, r2
|
|
mov r3, arg4
|
|
SIGN_EXTENSION r3, r3d
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
|
|
movq xmm1, [r2]
|
|
movq xmm2, [r5]
|
|
movhps xmm1, [r2+r3]
|
|
movhps xmm2, [r5+r3]
|
|
psrlq xmm1, xmm5
|
|
psllq xmm2, xmm6
|
|
por xmm1, xmm2
|
|
|
|
psadbw xmm0, xmm1
|
|
paddw xmm7, xmm0
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
lea r5, [r5+2*r3]
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
|
|
movq xmm1, [r2]
|
|
movq xmm2, [r5]
|
|
movhps xmm1, [r2+r3]
|
|
movhps xmm2, [r5+r3]
|
|
psrlq xmm1, xmm5
|
|
psllq xmm2, xmm6
|
|
por xmm1, xmm2
|
|
|
|
psadbw xmm0, xmm1
|
|
paddw xmm7, xmm0
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
lea r5, [r5+2*r3]
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
|
|
movq xmm1, [r2]
|
|
movq xmm2, [r5]
|
|
movhps xmm1, [r2+r3]
|
|
movhps xmm2, [r5+r3]
|
|
psrlq xmm1, xmm5
|
|
psllq xmm2, xmm6
|
|
por xmm1, xmm2
|
|
|
|
psadbw xmm0, xmm1
|
|
paddw xmm7, xmm0
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
lea r5, [r5+2*r3]
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
|
|
movq xmm1, [r2]
|
|
movq xmm2, [r5]
|
|
movhps xmm1, [r2+r3]
|
|
movhps xmm2, [r5+r3]
|
|
psrlq xmm1, xmm5
|
|
psllq xmm2, xmm6
|
|
por xmm1, xmm2
|
|
|
|
psadbw xmm0, xmm1
|
|
paddw xmm7, xmm0
|
|
|
|
movhlps xmm0, xmm7
|
|
paddw xmm0, xmm7
|
|
movd retrd, xmm0
|
|
POP_XMM
|
|
%ifdef X86_32
|
|
pop r5
|
|
pop r4
|
|
pop r3
|
|
%endif
|
|
jmp .return
|
|
|
|
.pixel_sad_8x8_nsplit:
|
|
|
|
pop r2
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
PUSH_XMM 7
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm6, xmm6
|
|
SSE2_GetSad8x4
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
SSE2_GetSad8x4
|
|
movhlps xmm0, xmm6
|
|
paddw xmm0, xmm6
|
|
movd retrd, xmm0
|
|
POP_XMM
|
|
LOAD_4_PARA_POP
|
|
.return:
|
|
ret
|
|
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_wxh_sse2 END
|
|
;
|
|
;***********************************************************************
|
|
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_4_wxh_sse2 BEGIN
|
|
;
|
|
;***********************************************************************
|
|
|
|
|
|
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
|
|
psadbw %1, %4
|
|
paddw xmm5, %1
|
|
psadbw %4, %3
|
|
paddw xmm4, %4
|
|
movdqu %4, [%5-1]
|
|
psadbw %4, %2
|
|
paddw xmm6, %4
|
|
movdqu %4, [%5+1]
|
|
psadbw %4, %2
|
|
paddw xmm7, %4
|
|
%endmacro
|
|
WELS_EXTERN WelsSampleSadFour16x16_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
movdqa xmm0, [r0]
|
|
sub r2, r3
|
|
movdqu xmm3, [r2]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
psadbw xmm3, xmm1
|
|
paddw xmm4, xmm3
|
|
|
|
movdqu xmm2, [r2+r3-1]
|
|
psadbw xmm2, xmm0
|
|
paddw xmm6, xmm2
|
|
|
|
movdqu xmm3, [r2+r3+1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm2, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
|
|
movdqa xmm0, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm1, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
|
|
movdqa xmm2, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm0, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm2, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
|
|
movdqa xmm0, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm1, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
|
|
movdqa xmm2, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm0, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm2, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
|
|
movdqa xmm0, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
|
|
lea r2, [r2+2*r3]
|
|
movdqu xmm3, [r2]
|
|
psadbw xmm2, xmm3
|
|
paddw xmm5, xmm2
|
|
|
|
movdqu xmm2, [r2-1]
|
|
psadbw xmm2, xmm0
|
|
paddw xmm6, xmm2
|
|
|
|
movdqu xmm3, [r2+1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movdqu xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
movhlps xmm0, xmm5
|
|
paddw xmm5, xmm0
|
|
movhlps xmm0, xmm6
|
|
paddw xmm6, xmm0
|
|
movhlps xmm0, xmm7
|
|
paddw xmm7, xmm0
|
|
punpckldq xmm4, xmm5
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6
|
|
movdqa [r4],xmm4
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
|
|
WELS_EXTERN WelsSampleSadFour16x8_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
movdqa xmm0, [r0]
|
|
sub r2, r3
|
|
movdqu xmm3, [r2]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
psadbw xmm3, xmm1
|
|
paddw xmm4, xmm3
|
|
|
|
movdqu xmm2, [r2+r3-1]
|
|
psadbw xmm2, xmm0
|
|
paddw xmm6, xmm2
|
|
|
|
movdqu xmm3, [r2+r3+1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm2, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
|
|
movdqa xmm0, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm1, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
|
|
movdqa xmm2, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movdqa xmm0, [r0]
|
|
movdqu xmm3, [r2]
|
|
SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
|
|
movdqa xmm1, [r0+r1]
|
|
movdqu xmm3, [r2+r3]
|
|
SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
|
|
lea r2, [r2+2*r3]
|
|
movdqu xmm3, [r2]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movdqu xmm0, [r2-1]
|
|
psadbw xmm0, xmm1
|
|
paddw xmm6, xmm0
|
|
|
|
movdqu xmm3, [r2+1]
|
|
psadbw xmm3, xmm1
|
|
paddw xmm7, xmm3
|
|
|
|
movdqu xmm3, [r2+r3]
|
|
psadbw xmm1, xmm3
|
|
paddw xmm5, xmm1
|
|
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
movhlps xmm0, xmm5
|
|
paddw xmm5, xmm0
|
|
movhlps xmm0, xmm6
|
|
paddw xmm6, xmm0
|
|
movhlps xmm0, xmm7
|
|
paddw xmm7, xmm0
|
|
punpckldq xmm4, xmm5
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6
|
|
movdqa [r4],xmm4
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
WELS_EXTERN WelsSampleSadFour8x16_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
sub r2, r3
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
movhlps xmm0, xmm5
|
|
paddw xmm5, xmm0
|
|
movhlps xmm0, xmm6
|
|
paddw xmm6, xmm0
|
|
movhlps xmm0, xmm7
|
|
paddw xmm7, xmm0
|
|
punpckldq xmm4, xmm5
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6
|
|
movdqa [r4],xmm4
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
|
|
WELS_EXTERN WelsSampleSadFour8x8_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
sub r2, r3
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movq xmm0, [r0]
|
|
movhps xmm0, [r0+r1]
|
|
psadbw xmm3, xmm0
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
movhps xmm1, [r2-1]
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
paddw xmm6, xmm1
|
|
psadbw xmm3, xmm0
|
|
paddw xmm7, xmm3
|
|
|
|
movq xmm3, [r2]
|
|
movhps xmm3, [r2+r3]
|
|
psadbw xmm0, xmm3
|
|
paddw xmm5, xmm0
|
|
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
movhlps xmm0, xmm5
|
|
paddw xmm5, xmm0
|
|
movhlps xmm0, xmm6
|
|
paddw xmm6, xmm0
|
|
movhlps xmm0, xmm7
|
|
paddw xmm7, xmm0
|
|
punpckldq xmm4, xmm5
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6
|
|
movdqa [r4],xmm4
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
WELS_EXTERN WelsSampleSadFour4x4_sse2
|
|
%assign push_num 0
|
|
LOAD_5_PARA
|
|
PUSH_XMM 8
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movd xmm0, [r0]
|
|
movd xmm1, [r0+r1]
|
|
lea r0, [r0+2*r1]
|
|
movd xmm2, [r0]
|
|
movd xmm3, [r0+r1]
|
|
punpckldq xmm0, xmm1
|
|
punpckldq xmm2, xmm3
|
|
punpcklqdq xmm0, xmm2
|
|
sub r2, r3
|
|
movd xmm1, [r2]
|
|
movd xmm2, [r2+r3]
|
|
punpckldq xmm1, xmm2
|
|
movd xmm2, [r2+r3-1]
|
|
movd xmm3, [r2+r3+1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movd xmm4, [r2]
|
|
movd xmm5, [r2-1]
|
|
punpckldq xmm2, xmm5
|
|
movd xmm5, [r2+1]
|
|
punpckldq xmm3, xmm5
|
|
|
|
movd xmm5, [r2+r3]
|
|
punpckldq xmm4, xmm5
|
|
|
|
punpcklqdq xmm1, xmm4 ;-L
|
|
|
|
movd xmm5, [r2+r3-1]
|
|
movd xmm6, [r2+r3+1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
movd xmm7, [r2-1]
|
|
punpckldq xmm5, xmm7
|
|
punpcklqdq xmm2, xmm5 ;-1
|
|
movd xmm7, [r2+1]
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm3, xmm6 ;+1
|
|
movd xmm6, [r2]
|
|
movd xmm7, [r2+r3]
|
|
punpckldq xmm6, xmm7
|
|
punpcklqdq xmm4, xmm6 ;+L
|
|
psadbw xmm1, xmm0
|
|
psadbw xmm2, xmm0
|
|
psadbw xmm3, xmm0
|
|
psadbw xmm4, xmm0
|
|
|
|
movhlps xmm0, xmm1
|
|
paddw xmm1, xmm0
|
|
movhlps xmm0, xmm2
|
|
paddw xmm2, xmm0
|
|
movhlps xmm0, xmm3
|
|
paddw xmm3, xmm0
|
|
movhlps xmm0, xmm4
|
|
paddw xmm4, xmm0
|
|
punpckldq xmm1, xmm4
|
|
punpckldq xmm2, xmm3
|
|
punpcklqdq xmm1, xmm2
|
|
movdqa [r4],xmm1
|
|
POP_XMM
|
|
LOAD_5_PARA_POP
|
|
ret
|
|
|
|
;***********************************************************************
|
|
;
|
|
;Pixel_sad_4_wxh_sse2 END
|
|
;
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
|
|
;***********************************************************************
|
|
WELS_EXTERN WelsSampleSad4x4_mmx
|
|
%assign push_num 0
|
|
LOAD_4_PARA
|
|
SIGN_EXTENSION r1, r1d
|
|
SIGN_EXTENSION r3, r3d
|
|
movd mm0, [r0]
|
|
movd mm1, [r0+r1]
|
|
punpckldq mm0, mm1
|
|
|
|
movd mm3, [r2]
|
|
movd mm4, [r2+r3]
|
|
punpckldq mm3, mm4
|
|
psadbw mm0, mm3
|
|
|
|
lea r0, [r0+2*r1]
|
|
lea r2, [r2+2*r3]
|
|
|
|
movd mm1, [r0]
|
|
movd mm2, [r0+r1]
|
|
punpckldq mm1, mm2
|
|
|
|
movd mm3, [r2]
|
|
movd mm4, [r2+r3]
|
|
punpckldq mm3, mm4
|
|
psadbw mm1, mm3
|
|
paddw mm0, mm1
|
|
|
|
movd retrd, mm0
|
|
|
|
WELSEMMS
|
|
LOAD_4_PARA_POP
|
|
ret
|