openh264/codec/encoder/core/asm/satd_sad.asm
2013-12-09 04:51:09 -08:00

2189 lines
52 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* satd_sad.asm
;*
;* Abstract
;* WelsSampleSatd4x4_sse2
;* WelsSampleSatd8x8_sse2
;* WelsSampleSatd16x8_sse2
;* WelsSampleSatd8x16_sse2
;* WelsSampleSatd16x16_sse2
;*
;* WelsSampleSad16x8_sse2
;* WelsSampleSad16x16_sse2
;*
;* History
;* 8/5/2009 Created
;* 24/9/2009 modified
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
BITS 32
;***********************************************************************
; Data
;***********************************************************************
SECTION .rodata align=16
align 16
; Byte multipliers for pmaddubsw: the low 8 bytes are (+1,+1) pairs, producing sums of
; adjacent source bytes; the high 8 bytes are (+1,-1) pairs, producing differences.
HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
align 16
; Word multipliers (+1,-1) for pmaddwd: each result dword is the difference of a word pair.
HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
align 16
; Eight words of +1; with pmaddwd each result dword is the sum of a word pair.
PDW1: dw 1,1,1,1,1,1,1,1
align 16
; Read as dwords this is 2,0,2,0. It is added with paddd as the rounding term
; before the ">>2" shift in SSE41_ChromaGetX38x8Satd (and doubled there for ">>3").
PDQ2: dw 2,0,0,0,2,0,0,0
align 16
; Byte multipliers for pmaddubsw: the 8-byte pattern (+1,+1,+1,+1,+1,-1,+1,-1) emitted twice.
HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
;***********************************************************************
;
;Pixel_satd_wxh_sse2 BEGIN
;
;***********************************************************************
%macro MMX_DW_1_2REG 2
    ; Out: %1 = +1 in every 16-bit lane.
    ;      %2 = all bits set (used as the -1 source, left clobbered).
    pcmpeqw %2, %2              ; every word of %2 becomes 0xFFFF (-1)
    pxor    %1, %1              ; %1 = 0
    psubw   %1, %2              ; 0 - (-1) = 1 per word
%endmacro
%macro SSE2_SumWHorizon1 2
    ; Horizontal add of the eight unsigned words in %1 using saturating adds.
    ; Out: word 0 of %1 holds the total; the other lanes hold partial sums.
    ; %2 is scratch.
    movdqa  %2, %1
    psrldq  %2, 8               ; words 4..7 move down to lanes 0..3
    paddusw %1, %2
    movdqa  %2, %1
    psrldq  %2, 4               ; words 2..3 move down to lanes 0..1
    paddusw %1, %2
    movdqa  %2, %1
    psrldq  %2, 2               ; word 1 moves down to lane 0
    paddusw %1, %2
%endmacro
%macro SSE2_HDMTwo4x4 5
    ; Four SSE2_SumSub steps over %1..%4 with %5 passed as the scratch operand:
    ; first on the pairs (%1,%2) and (%3,%4), then on (%2,%4) and (%1,%3).
    ; SSE2_SumSub is defined outside this file.
    ; Original author's note on register flow:
    ;   in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
    ; NOTE(review): the output ordering depends on SSE2_SumSub -- confirm against asm_inc.asm.
    SSE2_SumSub %1, %2, %5
    SSE2_SumSub %3, %4, %5
    SSE2_SumSub %2, %4, %5
    SSE2_SumSub %1, %3, %5
%endmacro
%macro SSE2_SumAbs4 7
    ; Applies WELS_AbsW (defined outside this file; presumably per-word absolute
    ; value) to %1, %2, %4 and %5, then adds all four into the accumulator %7
    ; with unsigned saturation.
    ; %3 and %6 are handed to WELS_AbsW as its second (scratch) operand.
    ; %1 and %4 are overwritten with the pairwise sums.
    WELS_AbsW %1, %3
    WELS_AbsW %2, %3
    WELS_AbsW %4, %6
    WELS_AbsW %5, %6
    paddusw   %1, %2            ; %1 = |%1| + |%2|
    paddusw   %4, %5            ; %4 = |%4| + |%5|
    paddusw   %7, %1
    paddusw   %7, %4
%endmacro
%macro SSE2_SumWHorizon 3
    ; Horizontal add of the eight words d0..d7 in %1; the 32-bit total ends up
    ; in dword 0 of %1. %2 is scratch. %3 supplies the high halves when the
    ; words are widened to dwords (callers in this file pass a zeroed register).
    movhlps   %2, %1            ; %2 low qword = d7 d6 d5 d4
    paddw     %1, %2            ; low words of %1 = d37 d26 d15 d04
    punpcklwd %1, %3            ; widen those four words to dwords
    movhlps   %2, %1            ; %2 low qword = d37 d26
    paddd     %1, %2            ; low dwords of %1 = d1357 d0246
    pshuflw   %2, %1, 0x4e      ; swap the two low dwords
    paddd     %1, %2            ; dword 0 of %1 = d01234567
%endmacro
%macro SSE2_GetSatd8x8 0
    ; Processes eight rows starting at [eax] (stride ebx) and [ecx] (stride edx)
    ; as two groups of four rows, adding each group's result into xmm6.
    ; Callers in this file zero xmm6 and xmm7 beforehand; xmm7 is passed to
    ; SSE2_LoadDiff8P as its third operand.
    ; On exit eax and ecx have been advanced by six rows.
    ; Clobbers xmm0-xmm5. SSE2_LoadDiff8P and SSE2_TransTwo4x4W are defined outside this file.

    ; ---- rows 0..3 ----
    SSE2_LoadDiff8P xmm0, xmm4, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P xmm1, xmm5, xmm7, [eax+ebx], [ecx+edx]
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]
    SSE2_LoadDiff8P xmm2, xmm4, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P xmm3, xmm5, xmm7, [eax+ebx], [ecx+edx]
    SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
    SSE2_TransTwo4x4W xmm3, xmm1, xmm0, xmm2, xmm4
    SSE2_HDMTwo4x4 xmm3, xmm1, xmm2, xmm4, xmm5
    SSE2_SumAbs4 xmm4, xmm1, xmm0, xmm2, xmm3, xmm5, xmm6

    ; ---- rows 4..7 ----
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]
    SSE2_LoadDiff8P xmm0, xmm4, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P xmm1, xmm5, xmm7, [eax+ebx], [ecx+edx]
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]
    SSE2_LoadDiff8P xmm2, xmm4, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P xmm3, xmm5, xmm7, [eax+ebx], [ecx+edx]
    SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
    SSE2_TransTwo4x4W xmm3, xmm1, xmm0, xmm2, xmm4
    SSE2_HDMTwo4x4 xmm3, xmm1, xmm2, xmm4, xmm5
    SSE2_SumAbs4 xmm4, xmm1, xmm0, xmm2, xmm3, xmm5, xmm6
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd4x4_sse2 -- 4x4 SATD between two pixel blocks (SSE2, 32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Returns the result in eax. ebx is saved and restored; xmm0-xmm7 are clobbered.
; SSE2_XSawp and WELS_AbsW are macros defined outside this file.
WELS_EXTERN WelsSampleSatd4x4_sse2
align 16
WelsSampleSatd4x4_sse2:
push ebx
; one register pushed, so the first argument is at [esp+8]
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
mov edx, [esp+20]
; first block: rows 0 and 2 are packed into xmm0, rows 1 and 3 into xmm1 (4 bytes per row)
movd xmm0, [eax]
movd xmm1, [eax+ebx]
lea eax , [eax+2*ebx]
movd xmm2, [eax]
movd xmm3, [eax+ebx]
punpckldq xmm0, xmm2
punpckldq xmm1, xmm3
; second block: rows 0 and 2 into xmm4, rows 1 and 3 into xmm5
movd xmm4, [ecx]
movd xmm5, [ecx+edx]
lea ecx , [ecx+2*edx]
movd xmm6, [ecx]
movd xmm7, [ecx+edx]
punpckldq xmm4, xmm6
punpckldq xmm5, xmm7
; xmm6 = 0: used to zero-extend bytes to words here, and as the accumulator further down
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
punpcklbw xmm1, xmm6
punpcklbw xmm4, xmm6
punpcklbw xmm5, xmm6
; per-pixel differences as signed words
psubw xmm0, xmm4
psubw xmm1, xmm5
; sum/difference butterflies interleaved with lane swaps
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
SSE2_XSawp qdq, xmm0, xmm2, xmm3
movdqa xmm4, xmm0
paddw xmm0, xmm3
psubw xmm4, xmm3
movdqa xmm2, xmm0
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
SSE2_XSawp dq, xmm0, xmm4, xmm3
SSE2_XSawp qdq, xmm0, xmm3, xmm5
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
SSE2_XSawp qdq, xmm0, xmm7, xmm1
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
; accumulate the results of WELS_AbsW (presumably per-word absolute value) into xmm6
WELS_AbsW xmm0, xmm3
paddusw xmm6, xmm0
WELS_AbsW xmm2, xmm4
paddusw xmm6, xmm2
; horizontal sum into word 0, keep only the low 16 bits, then halve
SSE2_SumWHorizon1 xmm6, xmm4
movd eax, xmm6
and eax, 0xffff
shr eax, 1
pop ebx
ret
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd8x8_sse2 -- SATD of an 8x8 block (SSE2, 32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx; clobbers ecx, edx and xmm0-xmm7.
WELS_EXTERN WelsSampleSatd8x8_sse2
align 16
WelsSampleSatd8x8_sse2:
    push    ebx                     ; after this push the arguments start at [esp+8]
    mov     edx, [esp+20]           ; stride used with ecx
    mov     ecx, [esp+16]           ; second block pointer
    mov     ebx, [esp+12]           ; stride used with eax
    mov     eax, [esp+8]            ; first block pointer
    pxor    xmm7, xmm7              ; zero operand for the macros below
    pxor    xmm6, xmm6              ; running per-lane total
    SSE2_GetSatd8x8
    psrlw   xmm6, 1                 ; halve each lane before the horizontal add
    SSE2_SumWHorizon xmm6, xmm4, xmm7
    movd    eax, xmm6
    pop     ebx
    ret
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd8x16_sse2 -- SATD of an 8-wide, 16-high block (SSE2, 32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx; clobbers ecx, edx and xmm0-xmm7.
WELS_EXTERN WelsSampleSatd8x16_sse2
align 16
WelsSampleSatd8x16_sse2:
    push    ebx                     ; after this push the arguments start at [esp+8]
    mov     edx, [esp+20]
    mov     ecx, [esp+16]
    mov     ebx, [esp+12]
    mov     eax, [esp+8]
    pxor    xmm7, xmm7
    pxor    xmm6, xmm6              ; running per-lane total

    SSE2_GetSatd8x8                 ; rows 0..7 (leaves eax/ecx six rows further on)
    lea     eax, [eax+2*ebx]        ; step the remaining two rows to reach row 8
    lea     ecx, [ecx+2*edx]
    SSE2_GetSatd8x8                 ; rows 8..15

    psrlw   xmm6, 1
    SSE2_SumWHorizon xmm6, xmm4, xmm7
    movd    eax, xmm6
    pop     ebx
    ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd16x8_sse2 -- SATD of a 16-wide, 8-high block (SSE2, 32-bit cdecl),
; computed as two 8x8 halves side by side.
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx; clobbers ecx, edx and xmm0-xmm7.
WELS_EXTERN WelsSampleSatd16x8_sse2
align 16
WelsSampleSatd16x8_sse2:
    push    ebx                     ; after this push the arguments start at [esp+8]
    mov     edx, [esp+20]
    mov     ecx, [esp+16]
    mov     ebx, [esp+12]
    mov     eax, [esp+8]
    pxor    xmm7, xmm7
    pxor    xmm6, xmm6              ; running per-lane total

    SSE2_GetSatd8x8                 ; left 8 columns

    mov     eax, [esp+8]            ; reload both base pointers ...
    mov     ecx, [esp+16]
    add     eax, 8                  ; ... and move to the right 8 columns
    add     ecx, 8
    SSE2_GetSatd8x8

    psrlw   xmm6, 1
    SSE2_SumWHorizon xmm6, xmm4, xmm7
    movd    eax, xmm6
    pop     ebx
    ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd16x16_sse2 -- SATD of a 16x16 block (SSE2, 32-bit cdecl),
; computed as four 8x8 quadrants: left column top/bottom, then right column top/bottom.
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx; clobbers ecx, edx and xmm0-xmm7.
WELS_EXTERN WelsSampleSatd16x16_sse2
align 16
WelsSampleSatd16x16_sse2:
    push    ebx                     ; after this push the arguments start at [esp+8]
    mov     edx, [esp+20]
    mov     ecx, [esp+16]
    mov     ebx, [esp+12]
    mov     eax, [esp+8]
    pxor    xmm7, xmm7
    pxor    xmm6, xmm6              ; running per-lane total

    ; left 8 columns
    SSE2_GetSatd8x8                 ; rows 0..7
    lea     eax, [eax+2*ebx]        ; macro advanced six rows; two more reach row 8
    lea     ecx, [ecx+2*edx]
    SSE2_GetSatd8x8                 ; rows 8..15

    ; right 8 columns
    mov     eax, [esp+8]
    mov     ecx, [esp+16]
    add     eax, 8
    add     ecx, 8
    SSE2_GetSatd8x8                 ; rows 0..7
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]
    SSE2_GetSatd8x8                 ; rows 8..15

    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    psrlw   xmm6, 1
    SSE2_SumWHorizon xmm6, xmm4, xmm7
    movd    eax, xmm6
    pop     ebx
    ret
;***********************************************************************
;
;Pixel_satd_wxh_sse2 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************
%macro SSE41_I16x16Get8WSumSub 3
    ; Expected register setup (per the original note):
    ;   xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1
    ; In:  %1 = source bytes.   Out: %1 (eight words, scaled by 4).
    ; %2 and %3 are scratch. xmm4 accumulates dword sums that the caller uses for DC.
    pmaddubsw  %1, xmm5         ; byte pairs -> words (sums / differences per xmm5)
    movdqa     %2, %1
    pmaddwd    %1, xmm7         ; word pairs multiplied by xmm7 and added
    pmaddwd    %2, xmm6         ; word pairs multiplied by xmm6 and added
    movdqa     %3, %1
    punpckldq  %1, %2
    punpckhdq  %2, %3
    movdqa     %3, %1
    punpcklqdq %1, %2
    punpckhqdq %3, %2
    paddd      xmm4, %1         ; for dc
    paddd      xmm4, %3         ; for dc
    packssdw   %1, %3           ; narrow dwords back to words with signed saturation
    psllw      %1, 2            ; multiply by 4
%endmacro
%macro SSE41_ChromaGet8WSumSub 4
    ; Expected register setup (per the original note):
    ;   xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1
    ; In:  %1 = source bytes.   Out: %1 (eight words, scaled by 4).
    ; %2 and %3 are scratch. %4 receives the low qwords of the two dword vectors
    ; taken just before they are packed (the caller derives the DC terms from it).
    pmaddubsw  %1, xmm5         ; byte pairs -> words (sums / differences per xmm5)
    movdqa     %2, %1
    pmaddwd    %1, xmm7         ; word pairs multiplied by xmm7 and added
    pmaddwd    %2, xmm6         ; word pairs multiplied by xmm6 and added
    movdqa     %3, %1
    punpckldq  %1, %2
    punpckhdq  %2, %3
    movdqa     %3, %1
    punpcklqdq %1, %2
    punpckhqdq %3, %2
;   paddd      xmm4, %1         ; for dc  (disabled in this variant)
;   paddd      xmm4, %3         ; for dc  (disabled in this variant)
    movdqa     %4, %1
    punpcklqdq %4, %3           ; %4 = low qword of %1 : low qword of %3
    packssdw   %1, %3           ; narrow dwords back to words with signed saturation
    psllw      %1, 2            ; multiply by 4
%endmacro
%macro SSE41_GetX38x4SatdDec 0
    ; Loads four rows of eight bytes from [eax] (stride ebx), widens them to
    ; words and runs butterfly / transpose / butterfly over them.
    ; eax is advanced by four rows.
    ; Results are left in xmm7, xmm1, xmm3, xmm2 (per the original note below).
    ; Clobbers xmm0. SSE2_TransTwo4x4W is defined outside this file.
    pxor      xmm7, xmm7            ; zero for the byte -> word unpack
    movq      xmm0, [eax]
    movq      xmm1, [eax+ebx]
    lea       eax, [eax+2*ebx]
    movq      xmm2, [eax]
    movq      xmm3, [eax+ebx]
    lea       eax, [eax+2*ebx]
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm7
    SSE2_TransTwo4x4W xmm3, xmm1, xmm0, xmm2, xmm7
    SSE2_HDMTwo4x4 xmm3, xmm1, xmm2, xmm7, xmm0    ; pOut xmm7,xmm1,xmm3,xmm2
    ; doesn't need another transpose
%endmacro
%macro SSE41_GetX38x4SatdV 2
    ; Adds into xmm4 the per-word absolute differences between stored words
    ; from the buffer at [esi+%2] and the block in xmm7, xmm1, xmm3, xmm2.
    ; For each of the four registers, two stored words (8 bytes apart) are
    ; placed in lanes 0 and 4 of an otherwise zero vector before subtracting.
    ; %1 is not referenced. Clobbers xmm0.

    ; stored words at +0 / +8 against xmm7
    pxor    xmm0, xmm0
    pinsrw  xmm0, word[esi+%2], 0
    pinsrw  xmm0, word[esi+%2+8], 4
    psubsw  xmm0, xmm7
    pabsw   xmm0, xmm0
    paddw   xmm4, xmm0

    ; stored words at +2 / +10 against xmm1
    pxor    xmm0, xmm0
    pinsrw  xmm0, word[esi+%2+2], 0
    pinsrw  xmm0, word[esi+%2+10], 4
    psubsw  xmm0, xmm1
    pabsw   xmm0, xmm0
    paddw   xmm4, xmm0

    ; stored words at +4 / +12 against xmm3
    pxor    xmm0, xmm0
    pinsrw  xmm0, word[esi+%2+4], 0
    pinsrw  xmm0, word[esi+%2+12], 4
    psubsw  xmm0, xmm3
    pabsw   xmm0, xmm0
    paddw   xmm4, xmm0

    ; stored words at +6 / +14 against xmm2
    pxor    xmm0, xmm0
    pinsrw  xmm0, word[esi+%2+6], 0
    pinsrw  xmm0, word[esi+%2+14], 4
    psubsw  xmm0, xmm2
    pabsw   xmm0, xmm0
    paddw   xmm4, xmm0
%endmacro
%macro SSE41_GetX38x4SatdH 3
    ; Adds into xmm5: |stored - xmm7| for the four words at [esi+%3+8*%1]
    ; (duplicated into both qwords), plus |xmm1| + |xmm2| + |xmm3|.
    ; Side effects: xmm1, xmm2 and xmm3 are replaced by absolute values, and
    ; xmm2 is left holding |xmm1|+|xmm2|+|xmm3|, which the DC macros add to xmm6.
    ; %2 is not referenced. Clobbers xmm0.
    movq       xmm0, [esi+%3+8*%1]
    punpcklqdq xmm0, xmm0
    psubsw     xmm0, xmm7
    pabsw      xmm0, xmm0
    paddw      xmm5, xmm0

    pabsw      xmm1, xmm1
    pabsw      xmm2, xmm2
    pabsw      xmm3, xmm3
    paddw      xmm2, xmm1           ; for DC
    paddw      xmm2, xmm3           ; for DC
    paddw      xmm5, xmm2
%endmacro
%macro SSE41_I16X16GetX38x4SatdDC 0
    ; Adds into xmm6: |mm4 - xmm7| per word (mm4 copied into both qwords of
    ; xmm0), plus the contents of xmm2.
    ; Must follow SSE41_GetX38x4SatdH, which leaves the needed sum in xmm2.
    ; Clobbers xmm0.
    pxor       xmm0, xmm0
    movq2dq    xmm0, mm4            ; DC value prepared by the caller
    punpcklqdq xmm0, xmm0
    psubsw     xmm0, xmm7
    pabsw      xmm0, xmm0
    paddw      xmm6, xmm0
    paddw      xmm6, xmm2
%endmacro
%macro SSE41_ChromaGetX38x4SatdDC 1
    ; Adds into xmm6: |[esi+32+16*%1] - xmm7| per word, plus the contents of xmm2.
    ; WARNING: %1 is shifted left by 4 IN PLACE, so the caller's register is
    ; modified (0 stays 0, 1 becomes 16).
    ; The load is movdqa, so esi+32+%1 must be 16-byte aligned. Clobbers xmm0.
    shl     %1, 4
    movdqa  xmm0, [esi+32+%1]
    psubsw  xmm0, xmm7
    pabsw   xmm0, xmm0
    paddw   xmm6, xmm0
    paddw   xmm6, xmm2
%endmacro
%macro SSE41_I16x16GetX38x4Satd 2
    ; One 8x4 step for the 16x16 intra search: transform four rows at [eax],
    ; then add this step's costs into xmm4 (V), xmm5 (H) and xmm6 (DC).
    ; %1 = row-group index, %2 = byte offset into the buffer at esi.
    ; The H data is read starting at esi+32.
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV %1, %2
    SSE41_GetX38x4SatdH %1, %2, 32
    SSE41_I16X16GetX38x4SatdDC
%endmacro
%macro SSE41_ChromaGetX38x4Satd 2
    ; One 8x4 step for the chroma intra search: transform four rows at [eax],
    ; then add this step's costs into xmm4 (V), xmm5 (H) and xmm6 (DC).
    ; %1 = row-group index register, %2 = byte offset into the buffer at esi.
    ; The H data is read starting at esi+16.
    ; NOTE: the final sub-macro shifts %1 left by 4 in place.
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV %1, %2
    SSE41_GetX38x4SatdH %1, %2, 16
    SSE41_ChromaGetX38x4SatdDC %1
%endmacro
%macro SSE41_HSum8W 3
    ; Horizontal add of the eight words in %1; the 32-bit result is in dword 0.
    ; %2 = word multipliers for pmaddwd (callers pass eight words of 1).
    ; %3 = scratch.
    pmaddwd %1, %2              ; word pairs -> four dwords
    movhlps %3, %1
    paddd   %1, %3              ; fold the high qword onto the low qword
    pshuflw %3, %1, 0Eh         ; bring dword 1 down to dword 0
    paddd   %1, %3
%endmacro
; WelsIntra16x16Combined3Satd_sse41 (32-bit cdecl)
; Computes three SATD costs for a 16x16 block (V, H and DC intra prediction),
; stores the chosen mode and returns its cost in eax.
; Stack arguments as read below (after three pushes the first one is at [esp+16]):
;   [esp+16] -> ecx, [esp+20] -> edx : pointer/stride used to read the row above and the column to the left
;   [esp+24] -> eax, [esp+28] -> ebx : pointer/stride of the block that gets transformed
;   [esp+32] : pointer receiving the mode (2 = I16_PRED_DC, 1 = I16_PRED_H, 0 = I16_PRED_V)
;   [esp+36] : value that is doubled and added to the H and DC costs (NOTE(review): presumably lambda -- confirm)
;   [esp+40] -> esi : scratch buffer "temp_satd"; 64 bytes are written with movdqa, so it must be 16-byte aligned
; Uses mm4, so the MMX state is cleared with WELSEMMS before returning.
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
WelsIntra16x16Combined3Satd_sse41:
push ebx
push esi
push edi
mov ecx, [esp+16]
mov edx, [esp+20]
mov eax, [esp+24]
mov ebx, [esp+28]
mov esi, [esp+40] ;temp_satd
; xmm4 collects the sums used for DC inside SSE41_I16x16Get8WSumSub
pxor xmm4, xmm4
movdqa xmm5, [HSumSubDB1]
movdqa xmm6, [HSumSubDW1]
movdqa xmm7, [PDW1]
; step back one row: ecx now addresses the 16 bytes directly above the block
sub ecx, edx
movdqu xmm0, [ecx]
; split into two 8-byte halves, each duplicated into both qwords
movhlps xmm1, xmm0
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
movdqa [esi], xmm0 ;V
movdqa [esi+16], xmm1
; back to row 0; gather the 16 bytes at column -1 into xmm0, two rows per step
add ecx, edx
pinsrb xmm0, byte[ecx-1], 0
pinsrb xmm0, byte[ecx+edx-1], 1
lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 2
pinsrb xmm0, byte[ecx+edx-1], 3
lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 4
pinsrb xmm0, byte[ecx+edx-1], 5
lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 6
pinsrb xmm0, byte[ecx+edx-1], 7
lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 8
pinsrb xmm0, byte[ecx+edx-1], 9
lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 10
pinsrb xmm0, byte[ecx+edx-1], 11
lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 12
pinsrb xmm0, byte[ecx+edx-1], 13
lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 14
pinsrb xmm0, byte[ecx+edx-1], 15
; same split and transform as for the top row
movhlps xmm1, xmm0
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
movdqa [esi+32], xmm0 ;H
movdqa [esi+48], xmm1
; DC term: ((sum + 16) >> 5) << 4, kept in mm4 for SSE41_I16X16GetX38x4SatdDC
movd ecx, xmm4 ;dc
add ecx, 16 ;(sum+16)
shr ecx, 5 ;((sum+16)>>5)
shl ecx, 4 ;
movd mm4, ecx ; mm4 copy DC
; from here on xmm4/xmm5/xmm6 are the V/H/DC cost accumulators
pxor xmm4, xmm4 ;V
pxor xmm5, xmm5 ;H
pxor xmm6, xmm6 ;DC
; ecx = 8x4 step counter (0..3); edi = buffer offset, 0 for the left 8 columns and 16 for the right
mov ecx, 0
mov edi, 0
.loop16x16_get_satd:
.loopStart1:
SSE41_I16x16GetX38x4Satd ecx, edi
inc ecx
cmp ecx, 4
jl .loopStart1
; 16 rows done for this half; finished once the right half (edi == 16) has been processed
cmp edi, 16
je .loop16x16_get_satd_end
; restart at the top of the block, eight columns to the right
mov eax, [esp+24]
add eax, 8
mov ecx, 0
add edi, 16
jmp .loop16x16_get_satd
.loop16x16_get_satd_end:
; xmm0 = eight words of 1, the multiplier for the horizontal sums
MMX_DW_1_2REG xmm0, xmm1
psrlw xmm4, 1 ;/2
psrlw xmm5, 1 ;/2
psrlw xmm6, 1 ;/2
SSE41_HSum8W xmm4, xmm0, xmm1
SSE41_HSum8W xmm5, xmm0, xmm1
SSE41_HSum8W xmm6, xmm0, xmm1
; comparing order: DC H V
movd ebx, xmm6 ;DC
movd edi, xmm5 ;H
movd ecx, xmm4 ;V
; add 2 * [esp+36] to the H and DC costs; the V cost is left as is
mov edx, [esp+36]
shl edx, 1
add edi, edx
add ebx, edx
; edx = address where the chosen mode is stored
mov edx, [esp+32]
; DC is chosen only when strictly smaller than both H and V
cmp ebx, edi
jge near not_dc_16x16
cmp ebx, ecx
jge near not_dc_h_16x16
; for DC mode
mov dword[edx], 2;I16_PRED_DC
mov eax, ebx
jmp near return_satd_intra_16x16_x3
not_dc_16x16:
; for H mode
; H is chosen only when strictly smaller than V
cmp edi, ecx
jge near not_dc_h_16x16
mov dword[edx], 1;I16_PRED_H
mov eax, edi
jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
; for V mode
mov dword[edx], 0;I16_PRED_V
mov eax, ecx
return_satd_intra_16x16_x3:
; mm4 was used above, so the MMX state is cleared before returning
WELSEMMS
pop edi
pop esi
pop ebx
ret
%macro SSE41_ChromaGetX38x8Satd 0
    ; Computes V / H / DC SATD costs for one 8x8 chroma plane.
    ; In:  ecx/edx = pointer/stride used to read the row above and the column to the left
    ;      eax/ebx = pointer/stride of the block that gets transformed
    ;      esi     = 16-byte aligned scratch buffer; 64 bytes are written
    ; Out: xmm4 = V costs, xmm5 = H costs, xmm6 = DC costs (per-word partial sums)
    ; Clobbers eax, ecx and xmm0-xmm7.
    ; The loop label is macro-local (%%), so the macro can be expanded more than
    ; once and does not disturb local-label scoping in the expanding function.
    movdqa     xmm5, [HSumSubDB1]
    movdqa     xmm6, [HSumSubDW1]
    movdqa     xmm7, [PDW1]

    ; row above the block -> transformed V reference at [esi]
    sub        ecx, edx
    movq       xmm0, [ecx]
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa     [esi], xmm0                  ;V

    ; column to the left of the block (8 bytes) -> transformed H reference at [esi+16]
    add        ecx, edx
    pinsrb     xmm0, byte[ecx-1], 0
    pinsrb     xmm0, byte[ecx+edx-1], 1
    lea        ecx, [ecx+2*edx]
    pinsrb     xmm0, byte[ecx-1], 2
    pinsrb     xmm0, byte[ecx+edx-1], 3
    lea        ecx, [ecx+2*edx]
    pinsrb     xmm0, byte[ecx-1], 4
    pinsrb     xmm0, byte[ecx+edx-1], 5
    lea        ecx, [ecx+2*edx]
    pinsrb     xmm0, byte[ecx-1], 6
    pinsrb     xmm0, byte[ecx+edx-1], 7
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    movdqa     [esi+16], xmm0               ;H

    ; DC terms, built from the partial sums returned in xmm4 (top) and xmm1 (left)
    ;(sum+2)>>2
    movdqa     xmm6, [PDQ2]
    movdqa     xmm5, xmm4
    punpckhqdq xmm5, xmm1
    paddd      xmm5, xmm6
    psrld      xmm5, 2
    ;(sum1+sum2+4)>>3
    paddd      xmm6, xmm6                   ; rounding term 2 -> 4
    paddd      xmm4, xmm1
    paddd      xmm4, xmm6
    psrld      xmm4, 3
    ;satd *16
    pslld      xmm5, 4
    pslld      xmm4, 4
    ;temp satd
    movdqa     xmm6, xmm4
    punpcklqdq xmm4, xmm5
    psllq      xmm4, 32                     ; shift pair clears the upper dword of each qword
    psrlq      xmm4, 32
    movdqa     [esi+32], xmm4
    punpckhqdq xmm5, xmm6
    psllq      xmm5, 32
    psrlq      xmm5, 32
    movdqa     [esi+48], xmm5

    ; cost accumulators
    pxor       xmm4, xmm4                   ;V
    pxor       xmm5, xmm5                   ;H
    pxor       xmm6, xmm6                   ;DC

    ; Two 8x4 steps. SSE41_ChromaGetX38x4Satd shifts ecx left by 4 in place, so
    ; ecx goes 0 -> 1 after the first pass and 16 -> 17 after the second,
    ; which ends the loop.
    mov        ecx, 0
%%next_8x4:
    SSE41_ChromaGetX38x4Satd ecx, 0
    inc        ecx
    cmp        ecx, 2
    jl         %%next_8x4
%endmacro
%macro SSEReg2MMX 3
    ; Spills XMM register %1 into two MMX registers:
    ; %2 = low qword, %3 = high qword.
    ; The low qword of %1 is overwritten with its high qword.
    movdq2q %2, %1
    movhlps %1, %1
    movdq2q %3, %1
%endmacro
%macro MMXReg2SSE 4
    ; Rebuilds XMM register %1 from two MMX registers:
    ; low qword = %3, high qword = %4. %2 is an XMM scratch register.
    movq2dq    %1, %3
    movq2dq    %2, %4
    punpcklqdq %1, %2
%endmacro
;the Cb/Cr loop below reduces the code size of WelsIntraChroma8x8Combined3Satd_sse41
; WelsIntraChroma8x8Combined3Satd_sse41 (32-bit cdecl)
; Runs SSE41_ChromaGetX38x8Satd twice (two planes), adds the V/H/DC costs of both
; passes, stores the chosen mode and returns its cost in eax.
; Stack arguments as read below (after three pushes the first one is at [esp+16]):
;   [esp+16] -> ecx, [esp+20] -> edx : first-pass pointer/stride for the row above / column to the left
;   [esp+24] -> eax, [esp+28] -> ebx : first-pass pointer/stride of the block that gets transformed
;   [esp+32] : pointer receiving the mode (0 = I8_PRED_DC, 1 = I8_PRED_H, 2 = I8_PRED_V)
;   [esp+36] : value that is doubled and added to the H and V costs (NOTE(review): presumably lambda -- confirm)
;   [esp+40] -> esi : scratch buffer "temp_satd", written with movdqa
;   [esp+44], [esp+48] : second-pass replacements for ecx and eax; the strides are reused
; Uses mm0-mm6 to carry the first-pass results; the MMX state is cleared with WELSEMMS before returning.
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
WelsIntraChroma8x8Combined3Satd_sse41:
push ebx
push esi
push edi
mov ecx, [esp+16]
mov edx, [esp+20]
mov eax, [esp+24]
mov ebx, [esp+28]
mov esi, [esp+40] ;temp_satd
; edi = pass counter (0 = first plane, 1 = second plane)
xor edi, edi
loop_chroma_satdx3:
SSE41_ChromaGetX38x8Satd
cmp edi, 1
je loop_chroma_satdx3end
inc edi
; park the first-pass accumulators in MMX registers: the macro reuses xmm4-xmm6
SSEReg2MMX xmm4, mm0,mm1
SSEReg2MMX xmm5, mm2,mm3
SSEReg2MMX xmm6, mm5,mm6
; switch to the second plane's pointers
mov ecx, [esp+44]
mov eax, [esp+48]
jmp loop_chroma_satdx3
loop_chroma_satdx3end:
; restore the first-pass accumulators and add them to the second-pass ones
MMXReg2SSE xmm0, xmm3, mm0, mm1
MMXReg2SSE xmm1, xmm3, mm2, mm3
MMXReg2SSE xmm2, xmm3, mm5, mm6
paddw xmm4, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
; xmm0 = eight words of 1, the multiplier for the horizontal sums
MMX_DW_1_2REG xmm0, xmm1
psrlw xmm4, 1 ;/2
psrlw xmm5, 1 ;/2
psrlw xmm6, 1 ;/2
SSE41_HSum8W xmm4, xmm0, xmm1
SSE41_HSum8W xmm5, xmm0, xmm1
SSE41_HSum8W xmm6, xmm0, xmm1
; comparing order: DC H V
movd ebx, xmm6 ;DC
movd edi, xmm5 ;H
movd ecx, xmm4 ;V
; add 2 * [esp+36] to the H and V costs; the DC cost is left as is
mov edx, [esp+36]
shl edx, 1
add edi, edx
add ecx, edx
; edx = address where the chosen mode is stored
mov edx, [esp+32]
; DC is chosen only when strictly smaller than both H and V
cmp ebx, edi
jge near not_dc_8x8
cmp ebx, ecx
jge near not_dc_h_8x8
; for DC mode
mov dword[edx], 0;I8_PRED_DC
mov eax, ebx
jmp near return_satd_intra_8x8_x3
not_dc_8x8:
; for H mode
; H is chosen only when strictly smaller than V
cmp edi, ecx
jge near not_dc_h_8x8
mov dword[edx], 1;I8_PRED_H
mov eax, edi
jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
; for V mode
mov dword[edx], 2;I8_PRED_V
mov eax, ecx
return_satd_intra_8x8_x3:
WELSEMMS
pop edi
pop esi
pop ebx
ret
;***********************************************************************
;
;Pixel_satd_intra_sse2 END
;
;***********************************************************************
%macro SSSE3_Get16BSadHVDC 2
    ; One 16-byte row of the three-way intra SAD.
    ; %1 = 16-byte aligned scratch slot whose low byte holds this row's left
    ;      neighbour; it is overwritten with that byte broadcast to 16 bytes.
    ; %2 = 16-byte aligned source row.
    ; Expects: xmm1 = 0 (pshufb mask), xmm5 = row above, xmm7 = DC value broadcast.
    ; Accumulates: xmm4 += SAD vs xmm7 (DC), xmm2 += SAD vs xmm5 (V),
    ;              xmm3 += SAD vs the broadcast left byte (H).
    ; Clobbers xmm0 and xmm6.
    movd    xmm6, %1
    pshufb  xmm6, xmm1          ; replicate byte 0 into all 16 lanes
    movdqa  %1, xmm6            ; keep the H prediction row in the scratch buffer

    movdqa  xmm0, %2
    psadbw  xmm0, xmm7
    paddw   xmm4, xmm0          ; DC

    movdqa  xmm0, %2
    psadbw  xmm0, xmm5
    paddw   xmm2, xmm0          ; V

    psadbw  xmm6, %2
    paddw   xmm3, xmm6          ; H
%endmacro
%macro WelsAddDCValue 4
    ; %1 = memory operand of the byte to read
    ; %2 = scratch 32-bit register (receives the zero-extended byte)
    ; %3 = destination that gets a copy of the value
    ; %4 = running sum the value is added to
    movzx   %2, byte %1
    mov     %3, %2
    add     %4, %2
%endmacro
;***********************************************************************
;
;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************
; WelsIntra16x16Combined3Sad_ssse3 (32-bit cdecl)
; Computes three SAD costs for a 16x16 block (V, H and DC intra prediction),
; stores the chosen mode, leaves the chosen prediction in the scratch buffer and
; returns its cost in eax.
; Stack arguments as read below (after three pushes the first one is at [esp+16]):
;   [esp+16] -> ecx, [esp+20] -> edx : pointer/stride used to read the row above and the column to the left
;   [esp+24] -> eax, [esp+28] -> ebx : pointer/stride of the block being compared
;   [esp+32] : pointer receiving the mode (2 = I16_PRED_DC, 1 = I16_PRED_H, 0 = I16_PRED_V)
;   [esp+36] : value that is doubled and added to the H and DC costs ("lamda" in the original comment)
;   [esp+40] -> edi : scratch buffer "temp_sad", 256 bytes (16 rows of 16 bytes)
; movdqa is used on [ecx - stride], on every row read through eax and on the scratch
; buffer, so all of these must be 16-byte aligned.
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
WelsIntra16x16Combined3Sad_ssse3:
push ebx
push esi
push edi
mov ecx, [esp+16]
mov edx, [esp+20]
mov edi, [esp+40] ;temp_sad
; row above the block -> xmm5 (this is the V prediction row)
sub ecx, edx
movdqa xmm5,[ecx]
; psadbw against zero adds up the bytes of each half; fold both halves into eax
pxor xmm0,xmm0
psadbw xmm0,xmm5
movhlps xmm1,xmm0
paddw xmm0,xmm1
movd eax,xmm0
; back to row 0; ebx = 3 * stride
add ecx,edx
lea ebx, [edx+2*edx]
; for each of the 16 rows: read the byte at column -1, store it as a dword at the
; start of that row's 16-byte slot in the scratch buffer, and add it to eax
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
lea ecx, [ecx+4*edx]
add edi, 64
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
lea ecx, [ecx+4*edx]
add edi, 64
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
lea ecx, [ecx+4*edx]
add edi, 64
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
; rewind edi to the start of the scratch buffer
sub edi, 192
; DC value = (sum of the 32 neighbours + 16) >> 5, broadcast to all 16 bytes of xmm7
add eax,10h
shr eax,5
movd xmm7,eax
pxor xmm1,xmm1
pshufb xmm7,xmm1
; cost accumulators: xmm4 = DC, xmm3 = H, xmm2 = V (see SSSE3_Get16BSadHVDC)
pxor xmm4,xmm4
pxor xmm3,xmm3
pxor xmm2,xmm2
;sad begin
mov eax, [esp+24]
mov ebx, [esp+28]
; esi = 3 * stride
lea esi, [ebx+2*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
; merge H (shifted up by one dword) with V, then fold the high qword onto the low one:
; dword 0 of xmm3 = V total, dword 1 = H total
pslldq xmm3,4
por xmm3,xmm2
movhlps xmm1,xmm3
paddw xmm3,xmm1
; fold the two halves of the DC total
movhlps xmm0,xmm4
paddw xmm4,xmm0
; comparing order: DC H V
movd ebx, xmm4 ;DC
movd ecx, xmm3 ;V
psrldq xmm3, 4
movd esi, xmm3 ;H
; add 2 * [esp+36] to the H and DC costs; the V cost is left as is
mov eax, [esp+36] ;lamda
shl eax, 1
add esi, eax
add ebx, eax
; edx = address where the chosen mode is stored
mov edx, [esp+32]
; DC is chosen only when strictly smaller than both H and V
cmp ebx, esi
jge near not_dc_16x16_sad
cmp ebx, ecx
jge near not_dc_h_16x16_sad
; for DC mode
mov dword[edx], 2;I16_PRED_DC
mov eax, ebx
; edi was advanced by 192 during the SAD pass; rewind and fill all 16 rows with the DC value
sub edi, 192
%assign x 0
%rep 16
movdqa [edi+16*x], xmm7
%assign x x+1
%endrep
jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
; for H mode
; H is chosen only when strictly smaller than V; the scratch buffer already holds
; the H prediction rows written by SSSE3_Get16BSadHVDC, so nothing is stored here
cmp esi, ecx
jge near not_dc_h_16x16_sad
mov dword[edx], 1;I16_PRED_H
mov eax, esi
jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
; for V mode
mov dword[edx], 0;I16_PRED_V
mov eax, ecx
; rewind and fill all 16 rows with the row above the block
sub edi, 192
%assign x 0
%rep 16
movdqa [edi+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
pop edi
pop esi
pop ebx
ret
;***********************************************************************
;
;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************
;SSE4.1
; SSE41_GetSatd8x4: processes four rows of eight pixels from [eax] (stride ebx) and
; [ecx] (stride edx) and adds the result into the word accumulator xmm6.
; Expected on entry, as set up by the callers in this file:
;   xmm7 = HSumSubDB1, esi = 3*ebx, edi = 3*edx.
; eax and ecx are not advanced. Clobbers xmm0-xmm5.
%macro SSE41_GetSatd8x4 0
; rows 0 and 1: duplicate each 8-byte row into both qwords, then pmaddubsw with xmm7
; turns byte pairs into words (sums in the low half, differences in the high half)
movq xmm0, [eax]
punpcklqdq xmm0, xmm0
pmaddubsw xmm0, xmm7
movq xmm1, [eax+ebx]
punpcklqdq xmm1, xmm1
pmaddubsw xmm1, xmm7
movq xmm2, [ecx]
punpcklqdq xmm2, xmm2
pmaddubsw xmm2, xmm7
movq xmm3, [ecx+edx]
punpcklqdq xmm3, xmm3
pmaddubsw xmm3, xmm7
; difference between the two blocks for rows 0 and 1
psubsw xmm0, xmm2
psubsw xmm1, xmm3
; rows 2 and 3, handled the same way
movq xmm2, [eax+2*ebx]
punpcklqdq xmm2, xmm2
pmaddubsw xmm2, xmm7
movq xmm3, [eax+esi]
punpcklqdq xmm3, xmm3
pmaddubsw xmm3, xmm7
movq xmm4, [ecx+2*edx]
punpcklqdq xmm4, xmm4
pmaddubsw xmm4, xmm7
movq xmm5, [ecx+edi]
punpcklqdq xmm5, xmm5
pmaddubsw xmm5, xmm7
psubsw xmm2, xmm4
psubsw xmm3, xmm5
; butterflies across the four rows
SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
pabsw xmm0, xmm0
pabsw xmm2, xmm2
pabsw xmm1, xmm1
pabsw xmm3, xmm3
; pair xmm1 with xmm3: build two vectors that exchange the odd/even words of the pair,
; keep the per-word unsigned maximum and accumulate it
; NOTE(review): taking the max presumably stands in for the last butterfly stage
; (|a+b| + |a-b| = 2*max(|a|,|b|)) -- confirm.
movdqa xmm4, xmm3
pblendw xmm3, xmm1, 0xAA
pslld xmm1, 16
psrld xmm4, 16
por xmm1, xmm4
pmaxuw xmm1, xmm3
paddw xmm6, xmm1
; same treatment for the pair xmm0 / xmm2
movdqa xmm4, xmm0
pblendw xmm0, xmm2, 0xAA
pslld xmm2, 16
psrld xmm4, 16
por xmm2, xmm4
pmaxuw xmm0, xmm2
paddw xmm6, xmm0
%endmacro
%macro SSSE3_SumWHorizon 4
    ; Horizontal add of the eight words in %2, delivered as a 32-bit value in %1.
    ; %1 = destination general-purpose register (e.g. eax)
    ; %2 = source XMM register (destroyed)
    ; %3, %4 = XMM scratch registers
    MMX_DW_1_2REG %3, %4        ; %3 = eight words of 1
    pmaddwd %2, %3              ; word pairs -> four dwords
    movhlps %4, %2
    paddd   %2, %4              ; fold the high qword onto the low qword
    pshuflw %4, %2, 0Eh         ; bring dword 1 down to dword 0
    paddd   %2, %4
    movd    %1, %2
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd4x4_sse41 -- 4x4 SATD between two pixel blocks (32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Returns the result in eax. ebx is saved and restored; xmm0-xmm5 and xmm7 are clobbered.
WELS_EXTERN WelsSampleSatd4x4_sse41
WelsSampleSatd4x4_sse41:
push ebx
; one register pushed, so the first argument is at [esp+8]
mov eax,[esp+8]
mov ebx,[esp+12]
mov ecx,[esp+16]
mov edx,[esp+20]
movdqa xmm4,[HSwapSumSubDB1]
; second block: "shufps x, y, 0" leaves x = {row, row, next row, next row};
; rows 0/1 go to xmm2 and rows 2/3 to xmm3
movd xmm2,[ecx]
movd xmm5,[ecx+edx]
shufps xmm2,xmm5,0
movd xmm3,[ecx+edx*2]
lea ecx, [edx*2+ecx]
movd xmm5,[ecx+edx]
shufps xmm3,xmm5,0
; first block: rows 0/1 go to xmm0 and rows 2/3 to xmm1
movd xmm0,[eax]
movd xmm5,[eax+ebx]
shufps xmm0,xmm5,0
movd xmm1,[eax+ebx*2]
lea eax, [ebx*2+eax]
movd xmm5,[eax+ebx]
shufps xmm1,xmm5,0
; byte pairs -> words using the +1/-1 pattern in xmm4
pmaddubsw xmm0,xmm4
pmaddubsw xmm1,xmm4
pmaddubsw xmm2,xmm4
pmaddubsw xmm3,xmm4
; difference between the two blocks
psubw xmm0,xmm2
psubw xmm1,xmm3
; sum/difference butterfly between the two row pairs
movdqa xmm2,xmm0
paddw xmm0,xmm1
psubw xmm1,xmm2
; regroup qwords, then a second butterfly
movdqa xmm2,xmm0
punpcklqdq xmm0,xmm1
punpckhqdq xmm2,xmm1
movdqa xmm1,xmm0
paddw xmm0,xmm2
psubw xmm2,xmm1
; build two vectors that exchange the odd/even words of xmm0 and xmm2, take absolute
; values and keep the per-word maximum
; NOTE(review): the max presumably stands in for the last butterfly stage
; (|a+b| + |a-b| = 2*max(|a|,|b|)) -- confirm.
movdqa xmm1,xmm0
pblendw xmm0,xmm2,0AAh
pslld xmm2,16
psrld xmm1,16
por xmm2,xmm1
pabsw xmm0,xmm0
pabsw xmm2,xmm2
pmaxsw xmm0,xmm2
; horizontal sum of the eight words into eax
SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
pop ebx
ret
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd8x8_sse41 -- SATD of an 8x8 block (32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx, esi, edi; clobbers ecx, edx and xmm0-xmm7.
WELS_EXTERN WelsSampleSatd8x8_sse41
align 16
WelsSampleSatd8x8_sse41:
    push    ebx
    push    esi
    push    edi                     ; three pushes: arguments start at [esp+16]
    mov     edx, [esp+28]           ; stride used with ecx
    mov     ecx, [esp+24]           ; second block pointer
    mov     ebx, [esp+20]           ; stride used with eax
    mov     eax, [esp+16]           ; first block pointer
    movdqa  xmm7, [HSumSubDB1]      ; pmaddubsw multipliers expected by SSE41_GetSatd8x4
    lea     esi, [ebx+ebx*2]        ; 3 * stride, for the fourth row
    lea     edi, [edx+edx*2]
    pxor    xmm6, xmm6              ; running per-lane total

    SSE41_GetSatd8x4                ; rows 0..3
    lea     eax, [eax+4*ebx]
    lea     ecx, [ecx+4*edx]
    SSE41_GetSatd8x4                ; rows 4..7

    SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
    pop     edi
    pop     esi
    pop     ebx
    ret
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd8x16_sse41 -- SATD of an 8-wide, 16-high block (32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx, esi, edi, ebp; clobbers ecx, edx and xmm0-xmm7.
WELS_EXTERN WelsSampleSatd8x16_sse41
align 16
WelsSampleSatd8x16_sse41:
    push    ebx
    push    esi
    push    edi
    push    ebp
%define pushsize 16                 ; bytes pushed above; arguments start at [esp+pushsize+4]
    mov     eax, [esp+pushsize+4]   ; first block pointer
    mov     ebx, [esp+pushsize+8]   ; stride used with eax
    mov     ecx, [esp+pushsize+12]  ; second block pointer
    mov     edx, [esp+pushsize+16]  ; stride used with ecx
    movdqa  xmm7, [HSumSubDB1]      ; pmaddubsw multipliers expected by SSE41_GetSatd8x4
    lea     esi, [ebx+ebx*2]        ; 3 * stride, for the fourth row
    lea     edi, [edx+edx*2]
    pxor    xmm6, xmm6              ; running per-lane total
    mov     ebp, 0                  ; ebp counts the 8x4 strips processed (4 in total)
loop_get_satd_8x16:
    SSE41_GetSatd8x4
    lea     eax, [eax+4*ebx]
    lea     ecx, [ecx+4*edx]
    inc     ebp
    cmp     ebp, 4
    jl      loop_get_satd_8x16
    SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
%undef pushsize                     ; keep the helper define from leaking into later code
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx
    ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd16x8_sse41 -- SATD of a 16-wide, 8-high block (32-bit cdecl),
; computed as a left and a right 8x8 half, each as two 8x4 strips.
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx, esi, edi; clobbers ecx, edx and xmm0-xmm7.
WELS_EXTERN WelsSampleSatd16x8_sse41
align 16
WelsSampleSatd16x8_sse41:
    push    ebx
    push    esi
    push    edi                     ; three pushes: arguments start at [esp+16]
    mov     edx, [esp+28]
    mov     ecx, [esp+24]
    mov     ebx, [esp+20]
    mov     eax, [esp+16]
    movdqa  xmm7, [HSumSubDB1]      ; pmaddubsw multipliers expected by SSE41_GetSatd8x4
    lea     esi, [ebx+ebx*2]        ; 3 * stride, for the fourth row
    lea     edi, [edx+edx*2]
    pxor    xmm6, xmm6              ; running per-lane total

    ; left 8 columns
    SSE41_GetSatd8x4                ; rows 0..3
    lea     eax, [eax+4*ebx]
    lea     ecx, [ecx+4*edx]
    SSE41_GetSatd8x4                ; rows 4..7

    ; right 8 columns: reload the base pointers and step 8 bytes across
    mov     eax, [esp+16]
    mov     ecx, [esp+24]
    add     eax, 8
    add     ecx, 8
    SSE41_GetSatd8x4                ; rows 0..3
    lea     eax, [eax+4*ebx]
    lea     ecx, [ecx+4*edx]
    SSE41_GetSatd8x4                ; rows 4..7

    SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
    pop     edi
    pop     esi
    pop     ebx
    ret
;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd16x16_sse41 -- SATD of a 16x16 block (32-bit cdecl),
; computed as a left and a right 8x16 half, each as four 8x4 strips.
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx, esi, edi, ebp; clobbers ecx, edx and xmm0-xmm7.
WELS_EXTERN WelsSampleSatd16x16_sse41
align 16
WelsSampleSatd16x16_sse41:
    push    ebx
    push    esi
    push    edi
    push    ebp
%define pushsize 16                 ; bytes pushed above; arguments start at [esp+pushsize+4]
    mov     edx, [esp+pushsize+16]
    mov     ecx, [esp+pushsize+12]
    mov     ebx, [esp+pushsize+8]
    mov     eax, [esp+pushsize+4]
    movdqa  xmm7, [HSumSubDB1]      ; pmaddubsw multipliers expected by SSE41_GetSatd8x4
    lea     esi, [ebx+ebx*2]        ; 3 * stride, for the fourth row
    lea     edi, [edx+edx*2]
    pxor    xmm6, xmm6              ; running per-lane total

    ; ---- left 8 columns: four strips of four rows ----
    xor     ebp, ebp                ; strip counter
.left_half:
    SSE41_GetSatd8x4
    lea     eax, [eax+4*ebx]
    lea     ecx, [ecx+4*edx]
    inc     ebp
    cmp     ebp, 4
    jl      .left_half

    ; ---- right 8 columns: reload the base pointers and step 8 bytes across ----
    mov     eax, [esp+pushsize+4]
    mov     ecx, [esp+pushsize+12]
    add     eax, 8
    add     ecx, 8
    xor     ebp, ebp
.right_half:
    SSE41_GetSatd8x4
    lea     eax, [eax+4*ebx]
    lea     ecx, [ecx+4*edx]
    inc     ebp
    cmp     ebp, 4
    jl      .right_half

    SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
%undef pushsize
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx
    ret
;***********************************************************************
;
;Pixel_satd_wxh_sse41 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_sad_wxh_sse2 BEGIN
;
;***********************************************************************
%macro SSE2_GetSad2x16 0
    ; Advances eax (stride ebx) and ecx (stride edx) by two rows FIRST, then adds
    ; the SAD of the next two 16-byte rows into xmm0.
    ; Rows at ecx are read unaligned; rows at eax go through the MOVDQ macro
    ; (defined outside this file), and per the original note [eax] must be
    ; 16-byte aligned. Clobbers xmm1 and xmm2.
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]

    movdqu  xmm1, [ecx]
    MOVDQ   xmm2, [eax]         ; [eax] must be 16-byte aligned
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1

    movdqu  xmm1, [ecx+edx]
    MOVDQ   xmm2, [eax+ebx]
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1
%endmacro
%macro SSE2_GetSad4x16 0
    ; Adds the SAD of four 16-byte rows into xmm7.
    ; Rows are addressed from eax (stride ebx, 3*stride in esi) and from ecx
    ; (stride edx, 3*stride in edi); neither pointer is advanced.
    ; Rows at ecx are read unaligned; rows at eax go through the MOVDQ macro
    ; (defined outside this file), and per the original note [eax] must be
    ; 16-byte aligned. Clobbers xmm0, xmm1, xmm2.

    ; row 0
    movdqu  xmm0, [ecx]
    MOVDQ   xmm2, [eax]
    psadbw  xmm0, xmm2
    paddw   xmm7, xmm0

    ; row 1
    movdqu  xmm1, [ecx+edx]
    MOVDQ   xmm2, [eax+ebx]
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1

    ; row 2
    movdqu  xmm1, [ecx+2*edx]
    MOVDQ   xmm2, [eax+2*ebx]   ; [eax] must be 16-byte aligned
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1

    ; row 3
    movdqu  xmm1, [ecx+edi]
    MOVDQ   xmm2, [eax+esi]
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1
%endmacro
%macro SSE2_GetSad8x4 0
    ; Adds the SAD of four 8-byte rows into xmm6.
    ; Two rows share one register: rows 0/2 in the low/high qwords of
    ; xmm0 / xmm2, rows 1/3 in xmm1 / xmm3.
    ; eax (stride ebx) and ecx (stride edx) are each advanced by two rows.
    ; Clobbers xmm0-xmm3.

    ; four rows from eax
    movq    xmm0, [eax]
    movq    xmm1, [eax+ebx]
    lea     eax, [eax+2*ebx]
    movhps  xmm0, [eax]
    movhps  xmm1, [eax+ebx]

    ; four rows from ecx
    movq    xmm2, [ecx]
    movq    xmm3, [ecx+edx]
    lea     ecx, [ecx+2*edx]
    movhps  xmm2, [ecx]
    movhps  xmm3, [ecx+edx]

    psadbw  xmm0, xmm2
    psadbw  xmm1, xmm3
    paddw   xmm6, xmm0
    paddw   xmm6, xmm1
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;The first pointer may be 16-byte aligned;
;in Wels the third pointer cannot be assumed to be 16-byte aligned.
;
;***********************************************************************
; WelsSampleSad16x16_sse2 -- sum of absolute differences over a 16x16 block (32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx, edi, esi; clobbers ecx, edx, xmm0-xmm2 and xmm7.
WELS_EXTERN WelsSampleSad16x16_sse2
align 16
WelsSampleSad16x16_sse2:
    push    ebx
    push    edi
    push    esi
%define _STACK_SIZE 12              ; bytes pushed above
    mov     eax, [esp+_STACK_SIZE+4 ]
    mov     ebx, [esp+_STACK_SIZE+8 ]
    lea     esi, [3*ebx]            ; 3 * stride, for the fourth row of each group
    mov     ecx, [esp+_STACK_SIZE+12]
    mov     edx, [esp+_STACK_SIZE+16]
    lea     edi, [3*edx]
    pxor    xmm7, xmm7              ; running total (one partial sum per qword)

    SSE2_GetSad4x16                 ; rows 0..3
%rep 3                              ; rows 4..7, 8..11, 12..15
    lea     eax, [eax+4*ebx]
    lea     ecx, [ecx+4*edx]
    SSE2_GetSad4x16
%endrep

    movhlps xmm0, xmm7              ; add the high-qword partial sum to the low one
    paddw   xmm0, xmm7
    movd    eax, xmm0
%undef _STACK_SIZE
    pop     esi
    pop     edi
    pop     ebx
    ret
;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;The first pointer may be 16-byte aligned;
;in Wels the third pointer cannot be assumed to be 16-byte aligned.
;
;***********************************************************************
; WelsSampleSad16x8_sse2 -- sum of absolute differences over a 16-wide, 8-high block (32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx; clobbers ecx, edx and xmm0-xmm2.
WELS_EXTERN WelsSampleSad16x8_sse2
align 16
WelsSampleSad16x8_sse2:
    push    ebx                     ; after this push the arguments start at [esp+8]
    mov     edx, [esp+20]
    mov     ecx, [esp+16]
    mov     ebx, [esp+12]
    mov     eax, [esp+8]

    ; rows 0 and 1 initialise the accumulator xmm0
    movdqu  xmm0, [ecx]
    MOVDQ   xmm2, [eax]
    psadbw  xmm0, xmm2
    movdqu  xmm1, [ecx+edx]
    MOVDQ   xmm2, [eax+ebx]
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1

    ; rows 2..7; each expansion advances both pointers by two rows first
%rep 3
    SSE2_GetSad2x16
%endrep

    movhlps xmm1, xmm0              ; add the high-qword partial sum to the low one
    paddw   xmm0, xmm1
    movd    eax, xmm0
    pop     ebx
    ret
; WelsSampleSad8x16_sse2 -- sum of absolute differences over an 8-wide, 16-high block (32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. Preserves ebx; clobbers ecx, edx, xmm0-xmm3 and xmm6.
WELS_EXTERN WelsSampleSad8x16_sse2
WelsSampleSad8x16_sse2:
    push    ebx                     ; after this push the arguments start at [esp+8]
    mov     edx, [esp+20]
    mov     ecx, [esp+16]
    mov     ebx, [esp+12]
    mov     eax, [esp+8]
    pxor    xmm6, xmm6              ; running total (one partial sum per qword)

    SSE2_GetSad8x4                  ; rows 0..3 (advances both pointers by two rows)
%rep 3                              ; rows 4..7, 8..11, 12..15
    lea     eax, [eax+2*ebx]        ; step the other two rows of the group just done
    lea     ecx, [ecx+2*edx]
    SSE2_GetSad8x4
%endrep

    movhlps xmm0, xmm6              ; add the high-qword partial sum to the low one
    paddw   xmm0, xmm6
    movd    eax, xmm0
    pop     ebx
    ret
%macro CACHE_SPLIT_CHECK 3
    ; %1 = register holding an address (destroyed: reduced to its offset bits)
    ; %2 = access width in bytes
    ; %3 = cache line size in bytes
    ; Masks %1 and compares it with the threshold built from %2 and %3; the
    ; caller branches on the resulting flags. With %3 = 64 this is
    ; (addr & 0x3f) versus (64 - %2).
    and     %1, 0x1f|(%3>>1)
    cmp     %1, (32-%2)|(%3>>1)
%endmacro
; WelsSampleSad8x8_sse21 -- sum of absolute differences over an 8x8 block (32-bit cdecl).
; Stack arguments in order: block pointer, its stride, second block pointer, its stride.
; Result in eax. ebx and edi are saved and restored on the paths that use them.
; If the 8-byte rows at the second pointer would cross the boundary tested by
; CACHE_SPLIT_CHECK (64-byte line), each row is rebuilt from two 8-byte aligned loads;
; otherwise the rows are loaded directly through SSE2_GetSad8x4.
; NOTE: the [esp+N] offsets differ between the paths because the number of pushes differs.
WELS_EXTERN WelsSampleSad8x8_sse21
WelsSampleSad8x8_sse21:
; nothing pushed yet: [esp+12] is the third argument (second block pointer)
mov ecx, [esp+12]
mov edx, ecx
CACHE_SPLIT_CHECK edx, 8, 64
jle near .pixel_sad_8x8_nsplit
; ---- split path ----
push ebx
push edi
; two pushes: arguments now start at [esp+12]
mov eax, [esp+12]
mov ebx, [esp+16]
pxor xmm7, xmm7
; edi = byte misalignment of ecx within 8; ecx is rounded down to a multiple of 8
mov edi, ecx
and edi, 0x07
sub ecx, edi
; edx = 8 - misalignment; both are converted to bit counts for the qword shifts
mov edx, 8
sub edx, edi
shl edi, 3
shl edx, 3
movd xmm5, edi
movd xmm6, edx
; edi = address of the following aligned 8-byte group
mov edi, 8
add edi, ecx
; edx = stride of the second block
mov edx, [esp+24]
; rows 0-1: load two rows from each block; the second block's rows are rebuilt by
; shifting the two aligned qwords and OR-ing them together
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
movhps xmm2, [edi+edx]
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
psadbw xmm0, xmm1
paddw xmm7, xmm0
; rows 2-3
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
movhps xmm2, [edi+edx]
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
psadbw xmm0, xmm1
paddw xmm7, xmm0
; rows 4-5
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
movhps xmm2, [edi+edx]
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
psadbw xmm0, xmm1
paddw xmm7, xmm0
; rows 6-7
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
movhps xmm2, [edi+edx]
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
psadbw xmm0, xmm1
paddw xmm7, xmm0
; add the two qword partial sums and return the result
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd eax, xmm0
; edi is restored here; ebx is restored at .return
pop edi
jmp .return
.pixel_sad_8x8_nsplit:
; ---- no-split path ----
push ebx
; one push: arguments now start at [esp+8]; ecx still holds the third argument from above
mov eax, [esp+8]
mov ebx, [esp+12]
mov edx, [esp+20]
pxor xmm6, xmm6
; rows 0..3 (the macro advances both pointers by two rows)
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
; rows 4..7
SSE2_GetSad8x4
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd eax, xmm0
.return:
pop ebx
ret
;***********************************************************************
;
;Pixel_sad_wxh_sse2 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************
%macro SSE2_Get4LW16Sad 5
    ; Operands (per the original note): %1 = s-1l, %2 = s, %3 = s+1l, %4 = d, %5 = address.
    ; %4 is expected to hold the 16 bytes at [%5] on entry.
    ; Accumulates four 16-byte SADs:
    ;   xmm5 += SAD(%1, %4)
    ;   xmm4 += SAD(%4, %3)
    ;   xmm6 += SAD([%5-1], %2)
    ;   xmm7 += SAD([%5+1], %2)
    ; %1 and %4 are destroyed.
    psadbw  %1, %4
    paddw   xmm5, %1

    psadbw  %4, %3
    paddw   xmm4, %4

    movdqu  %4, [%5-1]          ; one byte to the left
    psadbw  %4, %2
    paddw   xmm6, %4

    movdqu  %4, [%5+1]          ; one byte to the right
    psadbw  %4, %2
    paddw   xmm7, %4
%endmacro
;***********************************************************************
; WelsSampleSadFour16x16_sse2
;   Reads five dword stack arguments (cdecl layout, one register pushed):
;     arg1  sample block pointer; every row is fetched with movdqa, so the
;           pointer and stride must keep rows 16-byte aligned
;     arg2  sample stride
;     arg3  reference block pointer (pRefMb)
;     arg4  reference stride
;     arg5  output pointer; written with a single movdqa (16-byte aligned)
;   Computes the 16x16 SAD of the sample block against the reference block
;   displaced by -stride, +stride, -1 byte and +1 byte, and stores them as
;   four dwords in that order.
;   Uses eax, ecx, edx and xmm0-xmm7; ebx is saved and restored.
;***********************************************************************

; One two-row step of the 16x16 walk: advance eax/ecx by two rows, then feed
; the reference rows at [ecx] and [ecx+edx] to SSE2_Get4LW16Sad.
;   %1  sample row two above the new [eax]; reloaded with row [eax+ebx]
;   %2  sample row one above the new [eax]
;   %3  receives sample row [eax]
; xmm3 is scratch for the reference row.
%macro SSE2_SadFour16x16_TwoRows 3
    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]
    movdqa  %3, [eax]
    movdqu  xmm3, [ecx]
    SSE2_Get4LW16Sad %1, %2, %3, xmm3, ecx
    movdqa  %1, [eax+ebx]
    movdqu  xmm3, [ecx+edx]
    SSE2_Get4LW16Sad %2, %3, %1, xmm3, ecx+edx
%endmacro

WELS_EXTERN WelsSampleSadFour16x16_sse2
WelsSampleSadFour16x16_sse2:
    push    ebx

    mov     eax, [esp+8]                ; arg1: sample pointer
    mov     ebx, [esp+12]               ; arg2: sample stride
    mov     ecx, [esp+16]               ; arg3: reference pointer
    mov     edx, [esp+20]               ; arg4: reference stride

    pxor    xmm4, xmm4                  ; accumulator: pRefMb - stride
    pxor    xmm5, xmm5                  ; accumulator: pRefMb + stride
    pxor    xmm6, xmm6                  ; accumulator: pRefMb - 1
    pxor    xmm7, xmm7                  ; accumulator: pRefMb + 1

    sub     ecx, edx                    ; ecx -> reference row -1

    ; Lead-in: sample rows 0 and 1 against reference rows -1 and 0.
    movdqa  xmm0, [eax]                 ; sample row 0
    movdqu  xmm3, [ecx]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3
    movdqa  xmm1, [eax+ebx]             ; sample row 1
    movdqu  xmm3, [ecx+edx]
    psadbw  xmm3, xmm1
    paddw   xmm4, xmm3

    ; Sample row 0 against reference row 0 shifted left / right.
    movdqu  xmm2, [ecx+edx-1]
    psadbw  xmm2, xmm0
    paddw   xmm6, xmm2
    movdqu  xmm3, [ecx+edx+1]
    psadbw  xmm3, xmm0
    paddw   xmm7, xmm3

    ; Reference rows 1..14; the three sample registers rotate every step.
    SSE2_SadFour16x16_TwoRows xmm0, xmm1, xmm2      ; reference rows 1, 2
    SSE2_SadFour16x16_TwoRows xmm2, xmm0, xmm1      ; reference rows 3, 4
    SSE2_SadFour16x16_TwoRows xmm1, xmm2, xmm0      ; reference rows 5, 6
    SSE2_SadFour16x16_TwoRows xmm0, xmm1, xmm2      ; reference rows 7, 8
    SSE2_SadFour16x16_TwoRows xmm2, xmm0, xmm1      ; reference rows 9, 10
    SSE2_SadFour16x16_TwoRows xmm1, xmm2, xmm0      ; reference rows 11, 12
    SSE2_SadFour16x16_TwoRows xmm0, xmm1, xmm2      ; reference rows 13, 14

    ; Lead-out: xmm2 = sample row 14, xmm0 = sample row 15.
    lea     ecx, [ecx+2*edx]            ; ecx -> reference row 15
    movdqu  xmm3, [ecx]
    psadbw  xmm2, xmm3                  ; sample row 14 vs reference row 15
    paddw   xmm5, xmm2
    movdqu  xmm2, [ecx-1]
    psadbw  xmm2, xmm0                  ; sample row 15, shifted left
    paddw   xmm6, xmm2
    movdqu  xmm3, [ecx+1]
    psadbw  xmm3, xmm0                  ; sample row 15, shifted right
    paddw   xmm7, xmm3
    movdqu  xmm3, [ecx+edx]
    psadbw  xmm0, xmm3                  ; sample row 15 vs reference row 16
    paddw   xmm5, xmm0

    mov     ecx, [esp+24]               ; arg5: output pointer

    ; Add the upper-half partial sum of every accumulator onto its lower half.
    movhlps xmm0, xmm4
    paddw   xmm4, xmm0
    movhlps xmm0, xmm5
    paddw   xmm5, xmm0
    movhlps xmm0, xmm6
    paddw   xmm6, xmm0
    movhlps xmm0, xmm7
    paddw   xmm7, xmm0

    ; Pack the four totals as consecutive dwords and store them.
    punpckldq  xmm4, xmm5
    punpckldq  xmm6, xmm7
    punpcklqdq xmm4, xmm6
    movdqa  [ecx], xmm4

    pop     ebx
    ret
;***********************************************************************
; WelsSampleSadFour16x8_sse2
;   Reads five dword stack arguments (cdecl layout, two registers pushed):
;     arg1  sample block pointer; every row is fetched with movdqa, so the
;           pointer and stride must keep rows 16-byte aligned
;     arg2  sample stride
;     arg3  reference block pointer (pRefMb)
;     arg4  reference stride
;     arg5  output pointer; written with a single movdqa (16-byte aligned)
;   Computes the 16x8 SAD of the sample block against the reference block
;   displaced by -stride, +stride, -1 byte and +1 byte, and stores them as
;   four dwords in that order.
;   Uses eax, edx and xmm0-xmm7; ebx and edi are saved and restored.
;***********************************************************************

; One two-row step of the 16x8 walk: advance eax/edi by two rows, then feed
; the reference rows at [edi] and [edi+edx] to SSE2_Get4LW16Sad.
;   %1  sample row two above the new [eax]; reloaded with row [eax+ebx]
;   %2  sample row one above the new [eax]
;   %3  receives sample row [eax]
; xmm3 is scratch for the reference row.
%macro SSE2_SadFour16x8_TwoRows 3
    lea     eax, [eax+2*ebx]
    lea     edi, [edi+2*edx]
    movdqa  %3, [eax]
    movdqu  xmm3, [edi]
    SSE2_Get4LW16Sad %1, %2, %3, xmm3, edi
    movdqa  %1, [eax+ebx]
    movdqu  xmm3, [edi+edx]
    SSE2_Get4LW16Sad %2, %3, %1, xmm3, edi+edx
%endmacro

WELS_EXTERN WelsSampleSadFour16x8_sse2
WelsSampleSadFour16x8_sse2:
    push    ebx
    push    edi

    mov     eax, [esp+12]               ; arg1: sample pointer
    mov     ebx, [esp+16]               ; arg2: sample stride
    mov     edi, [esp+20]               ; arg3: reference pointer
    mov     edx, [esp+24]               ; arg4: reference stride

    pxor    xmm4, xmm4                  ; accumulator: pRefMb - stride
    pxor    xmm5, xmm5                  ; accumulator: pRefMb + stride
    pxor    xmm6, xmm6                  ; accumulator: pRefMb - 1
    pxor    xmm7, xmm7                  ; accumulator: pRefMb + 1

    sub     edi, edx                    ; edi -> reference row -1

    ; Lead-in: sample rows 0 and 1 against reference rows -1 and 0.
    movdqa  xmm0, [eax]                 ; sample row 0
    movdqu  xmm3, [edi]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3
    movdqa  xmm1, [eax+ebx]             ; sample row 1
    movdqu  xmm3, [edi+edx]
    psadbw  xmm3, xmm1
    paddw   xmm4, xmm3

    ; Sample row 0 against reference row 0 shifted left / right.
    movdqu  xmm2, [edi+edx-1]
    psadbw  xmm2, xmm0
    paddw   xmm6, xmm2
    movdqu  xmm3, [edi+edx+1]
    psadbw  xmm3, xmm0
    paddw   xmm7, xmm3

    ; Reference rows 1..6; the three sample registers rotate every step.
    SSE2_SadFour16x8_TwoRows xmm0, xmm1, xmm2       ; reference rows 1, 2
    SSE2_SadFour16x8_TwoRows xmm2, xmm0, xmm1       ; reference rows 3, 4
    SSE2_SadFour16x8_TwoRows xmm1, xmm2, xmm0       ; reference rows 5, 6

    ; Lead-out: xmm0 = sample row 6, xmm1 = sample row 7.
    lea     edi, [edi+2*edx]            ; edi -> reference row 7
    movdqu  xmm3, [edi]
    psadbw  xmm0, xmm3                  ; sample row 6 vs reference row 7
    paddw   xmm5, xmm0
    movdqu  xmm0, [edi-1]
    psadbw  xmm0, xmm1                  ; sample row 7, shifted left
    paddw   xmm6, xmm0
    movdqu  xmm3, [edi+1]
    psadbw  xmm3, xmm1                  ; sample row 7, shifted right
    paddw   xmm7, xmm3
    movdqu  xmm3, [edi+edx]
    psadbw  xmm1, xmm3                  ; sample row 7 vs reference row 8
    paddw   xmm5, xmm1

    mov     edi, [esp+28]               ; arg5: output pointer

    ; Add the upper-half partial sum of every accumulator onto its lower half.
    movhlps xmm0, xmm4
    paddw   xmm4, xmm0
    movhlps xmm0, xmm5
    paddw   xmm5, xmm0
    movhlps xmm0, xmm6
    paddw   xmm6, xmm0
    movhlps xmm0, xmm7
    paddw   xmm7, xmm0

    ; Pack the four totals as consecutive dwords and store them.
    punpckldq  xmm4, xmm5
    punpckldq  xmm6, xmm7
    punpcklqdq xmm4, xmm6
    movdqa  [edi], xmm4

    pop     edi
    pop     ebx
    ret
;***********************************************************************
; WelsSampleSadFour8x16_sse2
;   Reads five dword stack arguments (cdecl layout, two registers pushed):
;     arg1  sample block pointer
;     arg2  sample stride
;     arg3  reference block pointer (pRefMb)
;     arg4  reference stride
;     arg5  output pointer; written with a single movdqa (16-byte aligned)
;   Computes the 8x16 SAD of the sample block against the reference block
;   displaced by -stride, +stride, -1 byte and +1 byte, and stores them as
;   four dwords in that order.
;   Two 8-byte rows share one xmm register (movq = even row in the low
;   half, movhps = odd row in the high half), so each psadbw covers a
;   row pair.
;   Uses eax, edx, xmm0, xmm1 and xmm3-xmm7; ebx and edi are saved and
;   restored.
;***********************************************************************
WELS_EXTERN WelsSampleSadFour8x16_sse2
WelsSampleSadFour8x16_sse2:
    push    ebx
    push    edi

    mov     eax, [esp+12]               ; arg1: sample pointer
    mov     ebx, [esp+16]               ; arg2: sample stride
    mov     edi, [esp+20]               ; arg3: reference pointer
    mov     edx, [esp+24]               ; arg4: reference stride

    pxor    xmm4, xmm4                  ; accumulator: pRefMb - stride
    pxor    xmm5, xmm5                  ; accumulator: pRefMb + stride
    pxor    xmm6, xmm6                  ; accumulator: pRefMb - 1
    pxor    xmm7, xmm7                  ; accumulator: pRefMb + 1

    ; Prime xmm3 with the reference rows one line above the first row pair.
    sub     edi, edx                    ; edi -> reference row -1
    movq    xmm3, [edi]
    movhps  xmm3, [edi+edx]

    ; Eight row pairs. On entry to each pass xmm3 holds the two reference
    ; rows that sit one line above the sample pair about to be loaded.
%rep 8
    movq    xmm0, [eax]                 ; sample row pair
    movhps  xmm0, [eax+ebx]
    psadbw  xmm3, xmm0                  ; against reference one row up
    paddw   xmm4, xmm3

    movq    xmm1, [edi+edx-1]           ; first row of the pair, shifted left
    movq    xmm3, [edi+edx+1]           ; first row of the pair, shifted right
    lea     eax, [eax+2*ebx]
    lea     edi, [edi+2*edx]
    movhps  xmm1, [edi-1]               ; second row of the pair, shifted left
    movhps  xmm3, [edi+1]               ; second row of the pair, shifted right
    psadbw  xmm1, xmm0
    paddw   xmm6, xmm1
    psadbw  xmm3, xmm0
    paddw   xmm7, xmm3

    movq    xmm3, [edi]                 ; reference one row down; kept in
    movhps  xmm3, [edi+edx]             ; xmm3 for the next pass
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0
%endrep

    mov     edi, [esp+28]               ; arg5: output pointer

    ; Add the upper-half partial sum of every accumulator onto its lower half.
    movhlps xmm0, xmm4
    paddw   xmm4, xmm0
    movhlps xmm0, xmm5
    paddw   xmm5, xmm0
    movhlps xmm0, xmm6
    paddw   xmm6, xmm0
    movhlps xmm0, xmm7
    paddw   xmm7, xmm0

    ; Pack the four totals as consecutive dwords and store them.
    punpckldq  xmm4, xmm5
    punpckldq  xmm6, xmm7
    punpcklqdq xmm4, xmm6
    movdqa  [edi], xmm4

    pop     edi
    pop     ebx
    ret
;***********************************************************************
; WelsSampleSadFour8x8_sse2
;   Reads five dword stack arguments (cdecl layout, two registers pushed):
;     arg1  sample block pointer
;     arg2  sample stride
;     arg3  reference block pointer (pRefMb)
;     arg4  reference stride
;     arg5  output pointer; written with a single movdqa (16-byte aligned)
;   Computes the 8x8 SAD of the sample block against the reference block
;   displaced by -stride, +stride, -1 byte and +1 byte, and stores them as
;   four dwords in that order.
;   Two 8-byte rows share one xmm register (movq = even row in the low
;   half, movhps = odd row in the high half), so each psadbw covers a
;   row pair.
;   Uses eax, edx, xmm0, xmm1 and xmm3-xmm7; ebx and edi are saved and
;   restored.
;***********************************************************************
WELS_EXTERN WelsSampleSadFour8x8_sse2
WelsSampleSadFour8x8_sse2:
    push    ebx
    push    edi

    mov     eax, [esp+12]               ; arg1: sample pointer
    mov     ebx, [esp+16]               ; arg2: sample stride
    mov     edi, [esp+20]               ; arg3: reference pointer
    mov     edx, [esp+24]               ; arg4: reference stride

    pxor    xmm4, xmm4                  ; accumulator: pRefMb - stride
    pxor    xmm5, xmm5                  ; accumulator: pRefMb + stride
    pxor    xmm6, xmm6                  ; accumulator: pRefMb - 1
    pxor    xmm7, xmm7                  ; accumulator: pRefMb + 1

    ; Prime xmm3 with the reference rows one line above the first row pair.
    sub     edi, edx                    ; edi -> reference row -1
    movq    xmm3, [edi]
    movhps  xmm3, [edi+edx]

    ; Four row pairs. On entry to each pass xmm3 holds the two reference
    ; rows that sit one line above the sample pair about to be loaded.
%rep 4
    movq    xmm0, [eax]                 ; sample row pair
    movhps  xmm0, [eax+ebx]
    psadbw  xmm3, xmm0                  ; against reference one row up
    paddw   xmm4, xmm3

    movq    xmm1, [edi+edx-1]           ; first row of the pair, shifted left
    movq    xmm3, [edi+edx+1]           ; first row of the pair, shifted right
    lea     eax, [eax+2*ebx]
    lea     edi, [edi+2*edx]
    movhps  xmm1, [edi-1]               ; second row of the pair, shifted left
    movhps  xmm3, [edi+1]               ; second row of the pair, shifted right
    psadbw  xmm1, xmm0
    paddw   xmm6, xmm1
    psadbw  xmm3, xmm0
    paddw   xmm7, xmm3

    movq    xmm3, [edi]                 ; reference one row down; kept in
    movhps  xmm3, [edi+edx]             ; xmm3 for the next pass
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0
%endrep

    mov     edi, [esp+28]               ; arg5: output pointer

    ; Add the upper-half partial sum of every accumulator onto its lower half.
    movhlps xmm0, xmm4
    paddw   xmm4, xmm0
    movhlps xmm0, xmm5
    paddw   xmm5, xmm0
    movhlps xmm0, xmm6
    paddw   xmm6, xmm0
    movhlps xmm0, xmm7
    paddw   xmm7, xmm0

    ; Pack the four totals as consecutive dwords and store them.
    punpckldq  xmm4, xmm5
    punpckldq  xmm6, xmm7
    punpcklqdq xmm4, xmm6
    movdqa  [edi], xmm4

    pop     edi
    pop     ebx
    ret
;***********************************************************************
; WelsSampleSadFour4x4_sse2
; Reads five dword stack arguments (offsets below include the two pushes):
; sample pointer, sample stride, reference pointer, reference stride and
; output pointer.
; The four 4-byte sample rows are packed into xmm0. For each of the four
; displaced reference positions (-stride, -1 byte, +1 byte, +stride) the
; four matching 4-byte reference rows are packed into one register
; (xmm1, xmm2, xmm3, xmm4), so a single psadbw per position gives its SAD.
; Stores four dwords with one movdqa (output must be 16-byte aligned):
; [0] -stride, [1] +stride, [2] -1 byte, [3] +1 byte.
;***********************************************************************
WELS_EXTERN WelsSampleSadFour4x4_sse2
WelsSampleSadFour4x4_sse2:
push ebx
push edi
mov eax, [esp+12] ; arg1: sample pointer
mov ebx, [esp+16] ; arg2: sample stride
mov edi, [esp+20] ; arg3: reference pointer
mov edx, [esp+24] ; arg4: reference stride
; Gather sample rows 0..3 into xmm0, one dword per row.
movd xmm0, [eax]
movd xmm1, [eax+ebx]
lea eax, [eax+2*ebx]
movd xmm2, [eax]
movd xmm3, [eax+ebx]
punpckldq xmm0, xmm1
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
sub edi, edx ; edi -> reference row -1
movd xmm1, [edi] ; reference row -1
movd xmm2, [edi+edx] ; reference row 0
punpckldq xmm1, xmm2 ; xmm1 = reference rows -1, 0
movd xmm2, [edi+edx-1] ; reference row 0 shifted left
movd xmm3, [edi+edx+1] ; reference row 0 shifted right
lea edi, [edi+2*edx] ; edi -> reference row 1
movd xmm4, [edi] ; reference row 1
movd xmm5, [edi-1]
punpckldq xmm2, xmm5 ; xmm2 = rows 0, 1 shifted left
movd xmm5, [edi+1]
punpckldq xmm3, xmm5 ; xmm3 = rows 0, 1 shifted right
movd xmm5, [edi+edx] ; reference row 2
punpckldq xmm4, xmm5 ; xmm4 = reference rows 1, 2
; xmm1 becomes reference rows -1, 0, 1, 2 (block displaced by -stride).
punpcklqdq xmm1, xmm4 ;-L
movd xmm5, [edi+edx-1] ; reference row 2 shifted left
movd xmm6, [edi+edx+1] ; reference row 2 shifted right
lea edi, [edi+2*edx] ; edi -> reference row 3
movd xmm7, [edi-1]
punpckldq xmm5, xmm7 ; xmm5 = rows 2, 3 shifted left
; xmm2 becomes reference rows 0..3 shifted one byte left.
punpcklqdq xmm2, xmm5 ;-1
movd xmm7, [edi+1]
punpckldq xmm6, xmm7 ; xmm6 = rows 2, 3 shifted right
; xmm3 becomes reference rows 0..3 shifted one byte right.
punpcklqdq xmm3, xmm6 ;+1
movd xmm6, [edi] ; reference row 3
movd xmm7, [edi+edx] ; reference row 4
punpckldq xmm6, xmm7 ; xmm6 = reference rows 3, 4
; xmm4 becomes reference rows 1, 2, 3, 4 (block displaced by +stride).
punpcklqdq xmm4, xmm6 ;+L
; One SAD per displaced position; each result has a partial sum per 64-bit half.
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
; Add the upper-half partial sum of every result onto its lower half.
movhlps xmm0, xmm1
paddw xmm1, xmm0
movhlps xmm0, xmm2
paddw xmm2, xmm0
movhlps xmm0, xmm3
paddw xmm3, xmm0
movhlps xmm0, xmm4
paddw xmm4, xmm0
mov edi, [esp+28] ; arg5: output pointer
punpckldq xmm1, xmm4 ; dwords: -stride, +stride
punpckldq xmm2, xmm3 ; dwords: -1 byte, +1 byte
punpcklqdq xmm1, xmm2 ; -stride, +stride, -1, +1
movdqa [edi],xmm1 ; aligned 16-byte store of the four results
pop edi
pop ebx
ret
;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 END
;
;***********************************************************************
WELS_EXTERN WelsSampleSad4x4_mmx
align 16
;***********************************************************************
;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
;
;   Arguments, in stack order: first block pointer, first block stride,
;   second block pointer, second block stride.
;   Returns in eax the sum of absolute differences over four rows of four
;   bytes. Two rows are packed into each MMX register, so two psadbw
;   instructions cover the whole block.
;   Uses ecx, edx and mm0-mm4; ebx is saved and restored. WELSEMMS is
;   invoked before returning.
;***********************************************************************
WelsSampleSad4x4_mmx:
%define     pushsize        4
%define     pix1address     esp+pushsize+4
%define     pix1stride      esp+pushsize+8
%define     pix2address     esp+pushsize+12
%define     pix2stride      esp+pushsize+16
    push    ebx

    mov     eax, [pix1address]
    mov     ebx, [pix1stride ]
    mov     ecx, [pix2address]
    mov     edx, [pix2stride ]

    ; Rows 0 and 1 of both blocks.
    movd    mm0, [eax]
    movd    mm1, [eax+ebx]
    movd    mm3, [ecx]
    movd    mm4, [ecx+edx]
    punpckldq mm0, mm1
    punpckldq mm3, mm4
    psadbw  mm0, mm3

    lea     eax, [eax+2*ebx]
    lea     ecx, [ecx+2*edx]

    ; Rows 2 and 3 of both blocks.
    movd    mm1, [eax]
    movd    mm2, [eax+ebx]
    movd    mm3, [ecx]
    movd    mm4, [ecx+edx]
    punpckldq mm1, mm2
    punpckldq mm3, mm4
    psadbw  mm1, mm3

    paddw   mm0, mm1                    ; total of both row pairs
    movd    eax, mm0                    ; return value

    WELSEMMS
    pop     ebx
%undef      pushsize
%undef      pix1address
%undef      pix1stride
%undef      pix2address
%undef      pix2stride
    ret