; openh264/codec/encoder/core/x86/sample_sc.asm
; 2014-08-15 09:22:37 +08:00
;
; 1823 lines
; 46 KiB
; NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
; Read-only constant vectors, four 16-bit words each (8 bytes), loaded with
; movq by FillQpelLocationByFeatureValue_sse2.
SECTION .rodata align=16
ALIGN 16
; Added to the x vector after each group of 4 columns ("x_qpel inc").
; 0x10 == 4 columns * 4, i.e. presumably quarter-pel units -- TODO confirm.
mv_x_inc_x4 dw 0x10, 0x10, 0x10, 0x10
; Added to the y vector after each row ("y_qpel inc").
mv_y_inc_x4 dw 0x04, 0x04, 0x04, 0x04
; Initial x vector for 4 consecutive columns (0, 4, 8, 12).
mx_x_offset_x4 dw 0x00, 0x04, 0x08, 0x0C
SECTION .text
%ifdef X86_32
;**********************************************************************************************************************
;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;*********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
; x86-32 build; all six arguments are read from the stack.
;
; For each of the kiWidth x kiHeight positions, computes the byte sum of the
; 8x8 block whose top-left corner is at that position, stores it as a uint16
; in pFeatureOfBlock (row-major, kiWidth entries per row) and increments
; pTimesOfFeatureValue[sum].
;
; The first output row is summed in full. Every later row is derived from the
; row above it:  sum(x, y+1) = sum(x, y) + line(y+8)[x..x+7] - line(y)[x..x+7].
;
; NOTE: both loops are do-while shaped, so kiWidth must be > 0 and kiHeight
; must be > 1 (the row loop is entered unconditionally after the first row).
;
; Register roles:
;   esi = source cursor            edi = output cursor
;   edx = pTimesOfFeatureValue     ebx = stride
;   ebp = ref + 4*stride (first row) / kiWidth (later rows)
;   ecx = 3*stride (first row) / sum from the row above (later rows)
;   xmm0 = 0 (psadbw against zero yields a plain byte sum)
%define     pushsize        16                              ; four saved registers
%define     localsize       4                               ; one dword local
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp + 0
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp, localsize

    pxor    xmm0, xmm0
    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     edx, [times_of_sum]
    mov     ebx, [linesize]
    mov     eax, [width]
    lea     ecx, [ebx+ebx*2]                ; 3*linesize
    mov     [tmp_width], eax
    lea     ebp, [esi+ebx*4]                ; lines 4..7 of the block

FIRST_ROW:
    ; lines 0..3: pack two 8-byte lines per register, then sum the bytes
    movq    xmm1, [esi]
    movq    xmm2, [esi+ebx]
    movq    xmm3, [esi+ebx*2]
    movq    xmm4, [esi+ecx]
    shufps  xmm1, xmm2, 01000100b           ; xmm1 = line0 | line1
    shufps  xmm3, xmm4, 01000100b           ; xmm3 = line2 | line3
    psadbw  xmm1, xmm0
    psadbw  xmm3, xmm0
    paddd   xmm1, xmm3

    ; lines 4..7
    movq    xmm2, [ebp]
    movq    xmm3, [ebp+ebx]
    movq    xmm4, [ebp+ebx*2]
    movq    xmm5, [ebp+ecx]
    shufps  xmm2, xmm3, 01000100b
    shufps  xmm4, xmm5, 01000100b
    psadbw  xmm2, xmm0
    psadbw  xmm4, xmm0
    paddd   xmm2, xmm4

    ; fold the two 64-bit halves into dword 0
    paddd   xmm1, xmm2
    pshufd  xmm2, xmm1, 00001110b
    paddd   xmm1, xmm2
    movd    eax, xmm1

    mov     [edi], ax                       ; store the sum
    inc     dword [edx+eax*4]               ; histogram[sum]++

    inc     esi
    inc     ebp
    add     edi, 2
    dec     dword [tmp_width]
    jg      FIRST_ROW

    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     ebp, [width]
    dec     dword [height]                  ; first row already done
HEIGHT_LOOP:
    mov     [tmp_width], ebp
WIDTH_LOOP:
    movq    xmm1, [esi+ebx*8]               ; incoming line y+8
    movq    xmm2, [esi]                     ; outgoing line y
    psadbw  xmm1, xmm0
    psadbw  xmm2, xmm0
    psubd   xmm1, xmm2                      ; signed delta
    movd    eax, xmm1
    ; Zero-extend the sum from the row above. The previous 'mov cx, [edi]'
    ; left the upper half of ecx holding stale bits of 3*linesize, which
    ; corrupted the histogram index whenever those bits were non-zero.
    movzx   ecx, word [edi]
    add     eax, ecx
    mov     [edi+ebp*2], ax                 ; same column, next output row
    inc     dword [edx+eax*4]

    inc     esi
    add     edi, 2
    dec     dword [tmp_width]
    jg      WIDTH_LOOP

    add     esi, ebx                        ; next source line:
    sub     esi, ebp                        ;   esi += linesize - width
    dec     dword [height]
    jg      HEIGHT_LOOP

    add     esp, localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
%macro COUNT_SUM 3
    ; %1 = xmm register whose dword 0 holds a sum
    ; %2 = 32-bit scratch register
    ; %3 = 1 to shift the next dword of %1 into position afterwards, else 0
    ; Increments the dword counter at [edx + sum*4]; edx must hold the table base.
    movd    %2, %1
    inc     dword [edx+%2*4]
%if %3 == 1
    psrldq  %1, 4
%endif
%endmacro
;-----------------------------------------------------------------------------
; requires: width % 8 == 0 && height > 1
;-----------------------------------------------------------------------------
;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------
; read extra (16 - (width % 8) ) mod 16 bytes of every line
; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
WELS_EXTERN SumOf8x8BlockOfFrame_sse4
; x86-32 variant: all six arguments are read from the stack.
; Processes 8 horizontally adjacent block positions per iteration. For each
; position the byte sum of the 8x8 block is stored as a uint16 in sum_ref and
; the dword counter times_of_sum[sum] is incremented (via COUNT_SUM).
; The first output row is summed over all 8 lines; each later row is computed
; as (row above) + line(y+8) - line(y).
; Both loops are do-while shaped and step by 8, matching the documented
; requirement width % 8 == 0 && height > 1.
; Register roles: esi = source cursor, edi = output cursor, edx = counter table,
; ebx = linesize, ecx = 3*linesize (first row) / width counter (later rows),
; ebp = ref + 4*linesize (first row) / width (later rows), xmm0 = 0.
; Stack layout after the prologue: 4 saved registers (pushsize) + one dword local.
%define pushsize 16
%define localsize 4
%define ref esp + pushsize + localsize + 4
%define sum_ref esp + pushsize + localsize + 20
%define times_of_sum esp + pushsize + localsize + 24
%define width esp + pushsize + localsize + 8
%define height esp + pushsize + localsize + 12
%define linesize esp + pushsize + localsize + 16
%define tmp_width esp + 0
push ebx
push ebp
push esi
push edi
sub esp, localsize
pxor xmm0, xmm0
mov esi, [ref]
mov edi, [sum_ref]
mov edx, [times_of_sum]
mov ebx, [linesize]
mov eax, [width]
lea ecx, [ebx+ebx*2] ; 3*linesize
mov [tmp_width], eax
lea ebp, [esi+ebx*4]
FIRST_ROW_SSE4:
; Lines 0..3. Each line is loaded as 16 bytes. With xmm0 == 0, mpsadbw with
; imm 000b puts the sum of bytes i..i+3 into word i, and imm 100b the sum of
; bytes i+4..i+7, so their paddw is the 8-byte sliding sum for 8 positions.
movdqu xmm1, [esi]
movdqu xmm3, [esi+ebx]
movdqu xmm5, [esi+ebx*2]
movdqu xmm7, [esi+ecx]
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 000b
mpsadbw xmm2, xmm0, 100b
paddw xmm1, xmm2 ; 8 sums of line1
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 000b
mpsadbw xmm4, xmm0, 100b
paddw xmm3, xmm4 ; 8 sums of line2
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 000b
mpsadbw xmm2, xmm0, 100b
paddw xmm5, xmm2 ; 8 sums of line3
movdqa xmm4, xmm7
mpsadbw xmm7, xmm0, 000b
mpsadbw xmm4, xmm0, 100b
paddw xmm7, xmm4 ; 8 sums of line4
paddw xmm1, xmm3
paddw xmm5, xmm7
paddw xmm1, xmm5 ; sum the upper 4 lines first
; Lines 4..7, addressed through ebp = esi + 4*linesize.
movdqu xmm2, [ebp]
movdqu xmm3, [ebp+ebx]
movdqu xmm4, [ebp+ebx*2]
movdqu xmm5, [ebp+ecx]
movdqa xmm6, xmm2
mpsadbw xmm2, xmm0, 000b
mpsadbw xmm6, xmm0, 100b
paddw xmm2, xmm6
movdqa xmm7, xmm3
mpsadbw xmm3, xmm0, 000b
mpsadbw xmm7, xmm0, 100b
paddw xmm3, xmm7
movdqa xmm6, xmm4
mpsadbw xmm4, xmm0, 000b
mpsadbw xmm6, xmm0, 100b
paddw xmm4, xmm6
movdqa xmm7, xmm5
mpsadbw xmm5, xmm0, 000b
mpsadbw xmm7, xmm0, 100b
paddw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
paddw xmm1, xmm2
paddw xmm1, xmm4 ; sum of lines 1- 8
; Store the 8 sums, then widen them to dwords and count each one.
movdqu [edi], xmm1
movdqa xmm2, xmm1
punpcklwd xmm1, xmm0
punpckhwd xmm2, xmm0
COUNT_SUM xmm1, eax, 1
COUNT_SUM xmm1, eax, 1
COUNT_SUM xmm1, eax, 1
COUNT_SUM xmm1, eax, 0
COUNT_SUM xmm2, eax, 1
COUNT_SUM xmm2, eax, 1
COUNT_SUM xmm2, eax, 1
COUNT_SUM xmm2, eax, 0
lea esi, [esi+8]
lea ebp, [ebp+8]
lea edi, [edi+16] ; element size is 2
sub dword [tmp_width], 8
jg near FIRST_ROW_SSE4
; Remaining rows: restart at the top-left; ebp now holds width.
mov esi, [ref]
mov edi, [sum_ref]
mov ebp, [width]
dec dword [height]
HEIGHT_LOOP_SSE4:
mov ecx, ebp
WIDTH_LOOP_SSE4:
; xmm1 = incoming line y+8, xmm2 = outgoing line y, xmm7 = 8 sums of the row above.
movdqu xmm1, [esi+ebx*8]
movdqu xmm2, [esi]
movdqu xmm7, [edi]
movdqa xmm3, xmm1
mpsadbw xmm1, xmm0, 000b
mpsadbw xmm3, xmm0, 100b
paddw xmm1, xmm3
movdqa xmm4, xmm2
mpsadbw xmm2, xmm0, 000b
mpsadbw xmm4, xmm0, 100b
paddw xmm2, xmm4
paddw xmm7, xmm1
psubw xmm7, xmm2
; edi + width*2 bytes is the same column in the next output row.
movdqu [edi+ebp*2], xmm7
movdqa xmm6, xmm7
punpcklwd xmm7, xmm0
punpckhwd xmm6, xmm0
COUNT_SUM xmm7, eax, 1
COUNT_SUM xmm7, eax, 1
COUNT_SUM xmm7, eax, 1
COUNT_SUM xmm7, eax, 0
COUNT_SUM xmm6, eax, 1
COUNT_SUM xmm6, eax, 1
COUNT_SUM xmm6, eax, 1
COUNT_SUM xmm6, eax, 0
lea esi, [esi+8]
lea edi, [edi+16]
sub ecx, 8
jg near WIDTH_LOOP_SSE4
; Advance the source cursor to the next line: esi += linesize - width.
lea esi, [esi+ebx]
sub esi, ebp
dec dword [height]
jg near HEIGHT_LOOP_SSE4
add esp, localsize
pop edi
pop esi
pop ebp
pop ebx
%undef pushsize
%undef localsize
%undef ref
%undef sum_ref
%undef times_of_sum
%undef width
%undef height
%undef linesize
%undef tmp_width
ret
;****************************************************************************************************************************************************
;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;****************************************************************************************************************************************************
WELS_EXTERN SumOf16x16BlockOfFrame_sse2
; x86-32 variant: all six arguments are read from the stack.
; For every block position, stores the byte sum of the 16x16 block as a uint16
; in sum_ref and increments the dword counter times_of_sum[sum]. One position
; is handled per iteration.
; The first output row sums all 16 lines; later rows are computed as
; (row above) + line(y+16) - line(y).
; Both loops are do-while shaped: width must be > 0 and, because the row loop
; is entered unconditionally after the first row, height must be > 1.
; Register roles: esi = source cursor, edi = output cursor, edx = counter table,
; ebx = linesize, ecx = 3*linesize (first row) / 16*linesize (later rows),
; ebp = 4-line group pointer (first row) / width (later rows), xmm0 = 0.
%define pushsize 16
%define localsize 4
%define ref esp + pushsize + localsize + 4
%define sum_ref esp + pushsize + localsize + 20
%define times_of_sum esp + pushsize + localsize + 24
%define width esp + pushsize + localsize + 8
%define height esp + pushsize + localsize + 12
%define linesize esp + pushsize + localsize + 16
%define tmp_width esp
push ebx
push ebp
push esi
push edi
sub esp, localsize
pxor xmm0, xmm0
mov esi, [ref]
mov edi, [sum_ref]
mov edx, [times_of_sum]
mov ebx, [linesize]
mov eax, [width]
lea ecx, [ebx+ebx*2]
mov [tmp_width], eax
FIRST_ROW_X16H:
; Lines 0..3: psadbw against zero leaves the sum of bytes 0..7 in word 0 and
; the sum of bytes 8..15 in word 4 of each register.
movdqu xmm1, [esi]
movdqu xmm2, [esi+ebx]
movdqu xmm3, [esi+ebx*2]
movdqu xmm4, [esi+ecx]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
paddw xmm1, xmm2
paddw xmm3, xmm4
paddw xmm1, xmm3
; Lines 4..7
lea ebp, [esi+ebx*4]
movdqu xmm2, [ebp]
movdqu xmm3, [ebp+ebx]
movdqu xmm4, [ebp+ebx*2]
movdqu xmm5, [ebp+ecx]
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
psadbw xmm5, xmm0
paddw xmm2, xmm3
paddw xmm4, xmm5
paddw xmm2, xmm4
paddw xmm1, xmm2
; Lines 8..11
lea ebp, [ebp+ebx*4]
movdqu xmm2, [ebp]
movdqu xmm3, [ebp+ebx]
movdqu xmm4, [ebp+ebx*2]
movdqu xmm5, [ebp+ecx]
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
psadbw xmm5, xmm0
paddw xmm2, xmm3
paddw xmm4, xmm5
paddw xmm2, xmm4
paddw xmm1, xmm2
; Lines 12..15
lea ebp, [ebp+ebx*4]
movdqu xmm2, [ebp]
movdqu xmm3, [ebp+ebx]
movdqu xmm4, [ebp+ebx*2]
movdqu xmm5, [ebp+ecx]
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
psadbw xmm5, xmm0
paddw xmm2, xmm3
paddw xmm4, xmm5
paddw xmm2, xmm4
paddw xmm1, xmm2
; Fold the upper-half sum (word 4) onto the lower-half sum (word 0):
; punpckhwd against zero moves word 4 into word 0 of xmm2.
movdqa xmm2, xmm1
punpckhwd xmm2, xmm0
paddw xmm1, xmm2
movd eax, xmm1
mov [edi], ax
inc dword [edx+eax*4]
inc esi
lea edi, [edi+2]
dec dword [tmp_width]
jg near FIRST_ROW_X16H
; Remaining rows: restart at the top-left; ebp now holds width.
mov esi, [ref]
mov edi, [sum_ref]
mov ebp, [width]
dec dword [height]
mov ecx, ebx
sal ecx, 4 ; succeeded 16th line
HEIGHT_LOOP_X16:
mov [tmp_width], ebp
WIDTH_LOOP_X16:
; xmm1 = incoming line y+16, xmm2 = outgoing line y
movdqu xmm1, [esi+ecx]
movdqu xmm2, [esi]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
; Word-wise difference per 8-byte half (wraps modulo 2^16 when negative).
psubw xmm1, xmm2
movdqa xmm2, xmm1
punpckhwd xmm2, xmm0
paddw xmm1, xmm2
movd eax, xmm1
; 16-bit add of the sum from the row above; the upper half of eax stays zero.
add ax, word [edi]
; edi + width*2 bytes is the same column in the next output row.
mov [edi+ebp*2], ax
inc dword [edx+eax*4]
inc esi
add edi, 2
dec dword [tmp_width]
jg near WIDTH_LOOP_X16
; Advance the source cursor to the next line: esi += linesize - width.
add esi, ebx
sub esi, ebp
dec dword [height]
jg near HEIGHT_LOOP_X16
add esp, localsize
pop edi
pop esi
pop ebp
pop ebx
%undef pushsize
%undef localsize
%undef ref
%undef sum_ref
%undef times_of_sum
%undef width
%undef height
%undef linesize
%undef tmp_width
ret
; requires: width % 16 == 0 && height > 1
;-----------------------------------------------------------------------------------------------------------------------------
;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------------------------------------------------------
; try 8 mv via offset
%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1
    ; Computes, for 8 consecutive x offsets, the sum of the 16 bytes starting
    ; at [ref + x]; the 8 word results are left in dst0 (%2).
    ; Requires xmm0 == 0. Clobbers %3, %4 and %5.
    movdqu  %2, [%1]                ; bytes 0..15
    movdqa  %4, %2
    movdqu  %3, [%1+8]              ; bytes 8..23
    movdqa  %5, %3
    mpsadbw %2, xmm0, 000b          ; word i = sum of bytes i   .. i+3
    mpsadbw %4, xmm0, 101b          ; word i = sum of bytes i+4 .. i+7
    paddw   %2, %4
    mpsadbw %3, xmm0, 010b          ; word i = sum of bytes i+8 .. i+11
    mpsadbw %5, xmm0, 111b          ; word i = sum of bytes i+12.. i+15
    paddw   %3, %5
    paddw   %2, %3                  ; 16-byte sliding sums
%endmacro ; end of SUM_LINE_X16_SSE41
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
; x86-32 variant: all six arguments are read from the stack.
; Processes 8 horizontally adjacent block positions per iteration using
; SUM_LINE_X16_SSE41 (8 sliding 16-byte sums of one line). For each position
; the 16x16 byte sum is stored as a uint16 in sum_ref and the dword counter
; times_of_sum[sum] is incremented (via COUNT_SUM).
; The first output row sums all 16 lines; later rows are computed as
; (row above) + line(y+16) - line(y).
; sum_ref is accessed with movdqa, so it must be 16-byte aligned and width*2
; must be a multiple of 16 (covered by the documented width % 16 == 0).
; Both loops are do-while shaped; height must be > 1.
; Register roles: esi = source cursor, edi = output cursor, edx = counter table,
; ebx = linesize, ecx = 3*linesize (first row) / 16*linesize (later rows),
; ebp = 4-line group pointer (first row) / width (later rows), xmm0 = 0.
%define pushsize 16
%define localsize 4
%define ref esp + pushsize + localsize + 4
%define sum_ref esp + pushsize + localsize + 20
%define times_of_sum esp + pushsize + localsize + 24
%define width esp + pushsize + localsize + 8
%define height esp + pushsize + localsize + 12
%define linesize esp + pushsize + localsize + 16
%define tmp_width esp
push ebx
push ebp
push esi
push edi
sub esp, localsize
pxor xmm0, xmm0
mov esi, [ref]
mov edi, [sum_ref]
mov edx, [times_of_sum]
mov ebx, [linesize]
mov eax, [width]
lea ecx, [ebx+ebx*2]
mov [tmp_width], eax
FIRST_ROW_X16_SSE4:
; Lines 0..3: each call leaves its result in the first xmm argument and uses
; the following three as scratch, so the results end up in xmm1..xmm4.
SUM_LINE_X16_SSE41 esi, xmm1, xmm2, xmm3, xmm4
SUM_LINE_X16_SSE41 esi+ebx, xmm2, xmm3, xmm4, xmm5
SUM_LINE_X16_SSE41 esi+ebx*2, xmm3, xmm4, xmm5, xmm6
SUM_LINE_X16_SSE41 esi+ecx, xmm4, xmm5, xmm6, xmm7
paddw xmm1, xmm2
paddw xmm3, xmm4
paddw xmm1, xmm3
; Lines 4..7, accumulated one at a time into xmm1.
lea ebp, [esi+ebx*4]
SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
; Lines 8..11
lea ebp, [ebp+ebx*4]
SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
; Lines 12..15
lea ebp, [ebp+ebx*4]
SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
; Aligned store of the 8 sums, then widen to dwords and count each one.
movdqa [edi], xmm1
movdqa xmm2, xmm1
punpcklwd xmm1, xmm0
punpckhwd xmm2, xmm0
COUNT_SUM xmm1, eax, 1
COUNT_SUM xmm1, eax, 1
COUNT_SUM xmm1, eax, 1
COUNT_SUM xmm1, eax, 0
COUNT_SUM xmm2, eax, 1
COUNT_SUM xmm2, eax, 1
COUNT_SUM xmm2, eax, 1
COUNT_SUM xmm2, eax, 0
lea esi, [esi+8]
lea edi, [edi+16] ; element size is 2
sub dword [tmp_width], 8
jg near FIRST_ROW_X16_SSE4
; Remaining rows: restart at the top-left; ebp now holds width.
mov esi, [ref]
mov edi, [sum_ref]
mov ebp, [width]
dec dword [height]
mov ecx, ebx
sal ecx, 4 ; succeeded 16th line
HEIGHT_LOOP_X16_SSE4:
mov [tmp_width], ebp
WIDTH_LOOP_X16_SSE4:
; xmm7 = 8 sums of the row above, xmm1 = incoming line y+16, xmm2 = outgoing line y.
movdqa xmm7, [edi]
SUM_LINE_X16_SSE41 esi+ecx, xmm1, xmm2, xmm3, xmm4
SUM_LINE_X16_SSE41 esi, xmm2, xmm3, xmm4, xmm5
paddw xmm7, xmm1
psubw xmm7, xmm2
; edi + width*2 bytes is the same column in the next output row.
movdqa [edi+ebp*2], xmm7
movdqa xmm6, xmm7
punpcklwd xmm7, xmm0
punpckhwd xmm6, xmm0
COUNT_SUM xmm7, eax, 1
COUNT_SUM xmm7, eax, 1
COUNT_SUM xmm7, eax, 1
COUNT_SUM xmm7, eax, 0
COUNT_SUM xmm6, eax, 1
COUNT_SUM xmm6, eax, 1
COUNT_SUM xmm6, eax, 1
COUNT_SUM xmm6, eax, 0
lea esi, [esi+8]
lea edi, [edi+16]
sub dword [tmp_width], 8
jg near WIDTH_LOOP_X16_SSE4
; Advance the source cursor to the next line: esi += linesize - width.
add esi, ebx
sub esi, ebp
dec dword [height]
jg near HEIGHT_LOOP_X16_SSE4
add esp, localsize
pop edi
pop esi
pop ebp
pop ebx
%undef pushsize
%undef localsize
%undef ref
%undef sum_ref
%undef times_of_sum
%undef width
%undef height
%undef linesize
%undef tmp_width
ret
;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
; x86-32 variant: all four arguments are read from the stack.
; Walks the width x height array of uint16 sums. For each sum s it writes the
; packed position (x in the low word, y in the high word) through the write
; cursor stored in pos_list[s], then advances that cursor by 4 bytes.
; x starts at 0,4,8,12 and grows by 16 per group of 4 columns; y grows by 4
; per row (values taken from the .rodata vectors).
; Four sums are handled per inner iteration and the loop exits only when the
; counter reaches exactly zero, so width must be a positive multiple of 4;
; height must be > 0.
; Register roles: esi = sum cursor, edi = pos_list, ebp = width,
; ecx = width counter, ebx = &pos_list[s], eax = write cursor, edx = s / scratch.
push esi
push edi
push ebx
push ebp
%define _ps 16 ; push size
%define _ls 4 ; local size
%define sum_ref esp+_ps+_ls+4
%define pos_list esp+_ps+_ls+16
%define width esp+_ps+_ls+8
%define height esp+_ps+_ls+12
%define i_height esp
sub esp, _ls
mov esi, [sum_ref]
mov edi, [pos_list]
mov ebp, [width]
mov ebx, [height]
mov [i_height], ebx
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
pxor xmm4, xmm4
pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
movdqa xmm2, xmm5 ; x_qpel vector
mov ecx, ebp
HASH_WIDTH_LOOP_SSE2:
movq xmm0, [esi] ; load x8 sum
; xmm0 = the 4 sums zero-extended to dwords
punpcklwd xmm0, xmm4
; xmm1 = 4 dwords, each x | (y << 16)
movdqa xmm1, xmm2
punpcklwd xmm1, xmm3
; First three of the four entries; the fourth follows without the shifts.
%rep 3
movd edx, xmm0
lea ebx, [edi+edx*4]
mov eax, [ebx]
movd [eax], xmm1
mov edx, [eax+4] ; explicitly load eax+4 due cache miss from vtune observation
lea eax, [eax+4]
mov [ebx], eax
psrldq xmm1, 4
psrldq xmm0, 4
%endrep
movd edx, xmm0
lea ebx, [edi+edx*4]
mov eax, [ebx]
movd [eax], xmm1
mov edx, [eax+4] ; explicitly load eax+4 due cache miss from vtune observation
lea eax, [eax+4]
mov [ebx], eax
; Next group of 4 columns.
paddw xmm2, xmm7
lea esi, [esi+8]
sub ecx, 4
jnz near HASH_WIDTH_LOOP_SSE2
; Next row.
paddw xmm3, xmm6
dec dword [i_height]
jnz near HASH_HEIGHT_LOOP_SSE2
add esp, _ls
%undef _ps
%undef _ls
%undef sum_ref
%undef pos_list
%undef width
%undef height
%undef i_height
pop ebp
pop ebx
pop edi
pop esi
ret
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
; x86-32 build; all five arguments are read from the stack.
;
; Carves pBuf into consecutive regions, one per list entry. For each index i
; in [0, kiListSize):
;     pLocationOfFeature[i]       = cursor
;     pFeatureValuePointerList[i] = cursor
;     cursor                     += pTimesOfFeatureValue[i] * 4   (bytes)
; with cursor starting at pBuf.
;
; Entries are processed four at a time; when all four counts are zero the
; same cursor is broadcast with aligned 16-byte stores. The final
; kiListSize % 4 entries are handled one by one.
;
; Alignment: pTimesOfFeatureValue is read with movdqa and both pointer arrays
; may be written with movdqa, so all three must be 16-byte aligned.
;
; A kiListSize <= 0 now returns immediately and a kiListSize of 1..3 skips the
; x4 loop. Previously that loop was entered with a zero count and the
; dec/jnz pair wrapped around, overrunning all three arrays.
;
; Register roles:
;   esi = pTimesOfFeatureValue   ebx = cursor into pBuf
;   edi = pLocationOfFeature     ebp = pFeatureValuePointerList
;   ecx = byte offset into the arrays   edx = loop counter   xmm7 = 0
    push    ebx
    push    esi
    push    edi
    push    ebp
%define _ps 16  ; push size
    mov     edi, [esp+_ps+16]               ; pLocationOfFeature
    mov     ebp, [esp+_ps+20]               ; pFeatureValuePointerList
    mov     esi, [esp+_ps+4]                ; pTimesOfFeatureValue
    mov     ebx, [esp+_ps+8]                ; pBuf
    mov     edx, [esp+_ps+12]               ; kiListSize
    xor     ecx, ecx
    pxor    xmm7, xmm7
    test    edx, edx
    jle     near hash_assign_no_rem_sse2    ; empty or negative list
    sar     edx, 2                          ; number of full groups of 4
    jz      near hash_assign_tail_sse2      ; fewer than 4 entries

hash_assign_loop_x4_sse2:
    movdqa  xmm0, [esi+ecx]
    pslld   xmm0, 2                         ; counts -> byte sizes
    movdqa  xmm1, xmm0
    pcmpeqd xmm1, xmm7
    movmskps eax, xmm1
    cmp     eax, 0x0f                       ; all four counts zero?
    je      near hash_assign_with_copy_sse2
%assign x 0
%rep 4
    mov     [edi+ecx+x], ebx
    mov     [ebp+ecx+x], ebx
    movd    eax, xmm0
    add     ebx, eax                        ; cursor += size of this entry
    psrldq  xmm0, 4
%assign x x+4
%endrep
    jmp     near assign_next_sse2

hash_assign_with_copy_sse2:
    movd    xmm1, ebx
    pshufd  xmm2, xmm1, 0                   ; broadcast the unchanged cursor
    movdqa  [edi+ecx], xmm2
    movdqa  [ebp+ecx], xmm2

assign_next_sse2:
    add     ecx, 16
    dec     edx
    jnz     near hash_assign_loop_x4_sse2

hash_assign_tail_sse2:
    mov     edx, [esp+_ps+12]               ; kiListSize
    and     edx, 3
    jz      near hash_assign_no_rem_sse2

hash_assign_loop_x4_rem_sse2:
    mov     [edi+ecx], ebx
    mov     [ebp+ecx], ebx
    mov     eax, [esi+ecx]
    sal     eax, 2
    add     ebx, eax
    add     ecx, 4
    dec     edx
    jnz     near hash_assign_loop_x4_rem_sse2

hash_assign_no_rem_sse2:
%undef _ps
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx
    ret
%else
;**********************************************************************************************************************
;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;*********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
; x86-64 build; arguments arrive in r0..r5 via LOAD_6_PARA.
;   r0 = pRefPicture   r1 = kiWidth          r2 = kiHeight
;   r3 = kiRefStride   r4 = pFeatureOfBlock  r5 = pTimesOfFeatureValue
;
; For each of the kiWidth x kiHeight positions, computes the byte sum of the
; 8x8 block whose top-left corner is at that position, stores it as a uint16
; in pFeatureOfBlock (row-major, kiWidth entries per row) and increments
; pTimesOfFeatureValue[sum].
;
; The first output row is summed in full. Every later row is derived from the
; row above it:  sum(x, y+1) = sum(x, y) + line(y+8)[x..x+7] - line(y)[x..x+7].
;
; NOTE: both loops are do-while shaped, so kiWidth must be > 0 and kiHeight
; must be > 1 (the row loop is entered unconditionally after the first row).
;
; Scratch: r6 = 3*stride (first row) / sum from the row above (later rows),
;          r12 = column counter, r13 = ref + 4*stride, then the row counter,
;          r2 = current sum (kiHeight is kept on the stack meanwhile).
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 6
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    push    r12
    push    r13
    push    r0                              ; the first-row pass advances r0 and r4
    push    r2                              ; and reuses r2 as scratch
    push    r4

    pxor    xmm0, xmm0
    lea     r6, [r3+r3*2]                   ; 3*stride
    mov     r12, r1                         ; column counter
    lea     r13, [r0+r3*4]                  ; lines 4..7 of the block

FIRST_ROW:
    ; lines 0..3: pack two 8-byte lines per register, then sum the bytes
    movq    xmm1, [r0]
    movq    xmm2, [r0+r3]
    movq    xmm3, [r0+r3*2]
    movq    xmm4, [r0+r6]
    shufps  xmm1, xmm2, 01000100b           ; xmm1 = line0 | line1
    shufps  xmm3, xmm4, 01000100b           ; xmm3 = line2 | line3
    psadbw  xmm1, xmm0
    psadbw  xmm3, xmm0
    paddd   xmm1, xmm3

    ; lines 4..7
    movq    xmm2, [r13]
    movq    xmm3, [r13+r3]
    movq    xmm4, [r13+r3*2]
    movq    xmm5, [r13+r6]
    shufps  xmm2, xmm3, 01000100b
    shufps  xmm4, xmm5, 01000100b
    psadbw  xmm2, xmm0
    psadbw  xmm4, xmm0
    paddd   xmm2, xmm4

    ; fold the two 64-bit halves into dword 0
    paddd   xmm1, xmm2
    pshufd  xmm2, xmm1, 00001110b
    paddd   xmm1, xmm2
    movd    r2d, xmm1                       ; also clears the upper half of r2

    mov     [r4], r2w                       ; store the sum
    inc     dword [r5+r2*4]                 ; histogram[sum]++

    inc     r0
    inc     r13
    add     r4, 2
    dec     r12
    jg      FIRST_ROW

    pop     r4
    pop     r2
    pop     r0
    mov     r13, r2
    dec     r13                             ; rows left after the first one
HEIGHT_LOOP:
    mov     r12, r1
WIDTH_LOOP:
    movq    xmm1, [r0+r3*8]                 ; incoming line y+8
    movq    xmm2, [r0]                      ; outgoing line y
    psadbw  xmm1, xmm0
    psadbw  xmm2, xmm0
    psubd   xmm1, xmm2                      ; signed delta
    movd    r2d, xmm1
    ; Zero-extend the sum from the row above. The previous 'mov r6w, [r4]'
    ; left the upper bits of r6 holding stale bits of 3*stride, which
    ; corrupted the histogram index whenever bits 16..31 were non-zero.
    movzx   r6d, word [r4]
    add     r2d, r6d
    mov     [r4+r1*2], r2w                  ; same column, next output row
    inc     dword [r5+r2*4]

    inc     r0
    add     r4, 2
    dec     r12
    jg      WIDTH_LOOP

    add     r0, r3                          ; next source line:
    sub     r0, r1                          ;   r0 += stride - width
    dec     r13
    jg      HEIGHT_LOOP

    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
%macro COUNT_SUM 4
    ; %1 = xmm register whose dword 0 holds a sum
    ; %2 = 32-bit scratch register, %3 = the 64-bit name of that same register
    ; %4 = 1 to shift the next dword of %1 into position afterwards, else 0
    ; Increments the dword counter at [r5 + sum*4]; r5 must hold the table base.
    movd    %2, %1
    inc     dword [r5+%3*4]
%if %4 == 1
    psrldq  %1, 4
%endif
%endmacro
;-----------------------------------------------------------------------------
; requires: width % 8 == 0 && height > 1
;-----------------------------------------------------------------------------
;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------
; read extra (16 - (width % 8) ) mod 16 bytes of every line
; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
WELS_EXTERN SumOf8x8BlockOfFrame_sse4
; x86-64 variant: arguments arrive in r0..r5 via LOAD_6_PARA
; (r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride,
;  r4 = pFeatureOfBlock, r5 = pTimesOfFeatureValue).
; Processes 8 horizontally adjacent block positions per iteration. For each
; position the byte sum of the 8x8 block is stored as a uint16 through r4 and
; the dword counter [r5 + sum*4] is incremented (via COUNT_SUM).
; The first output row is summed over all 8 lines; each later row is computed
; as (row above) + line(y+8) - line(y).
; Both loops are do-while shaped and step by 8, matching the documented
; requirement width % 8 == 0 && height > 1.
; Scratch: r6 = 3*stride, r12 = width counter, r13 = r0 + 4*stride in the
; first row and the row counter afterwards, r2 = current sum.
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
push r12
push r13
; r0 and r4 are advanced and r2 is reused as scratch during the first-row
; pass, so their original values are kept on the stack.
push r0
push r2
push r4
pxor xmm0, xmm0
lea r6, [r3+r3*2]
mov r12, r1 ;r12:tmp_width
lea r13, [r0+r3*4] ;r13 takes the role ebp has in the x86-32 version
FIRST_ROW_SSE4:
; Lines 0..3. Each line is loaded as 16 bytes. With xmm0 == 0, mpsadbw with
; imm 000b puts the sum of bytes i..i+3 into word i, and imm 100b the sum of
; bytes i+4..i+7, so their paddw is the 8-byte sliding sum for 8 positions.
movdqu xmm1, [r0]
movdqu xmm3, [r0+r3]
movdqu xmm5, [r0+r3*2]
movdqu xmm7, [r0+r6]
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 000b
mpsadbw xmm2, xmm0, 100b
paddw xmm1, xmm2 ; 8 sums of line1
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 000b
mpsadbw xmm4, xmm0, 100b
paddw xmm3, xmm4 ; 8 sums of line2
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 000b
mpsadbw xmm2, xmm0, 100b
paddw xmm5, xmm2 ; 8 sums of line3
movdqa xmm4, xmm7
mpsadbw xmm7, xmm0, 000b
mpsadbw xmm4, xmm0, 100b
paddw xmm7, xmm4 ; 8 sums of line4
paddw xmm1, xmm3
paddw xmm5, xmm7
paddw xmm1, xmm5 ; sum the upper 4 lines first
; Lines 4..7, addressed through r13 = r0 + 4*stride.
movdqu xmm2, [r13]
movdqu xmm3, [r13+r3]
movdqu xmm4, [r13+r3*2]
movdqu xmm5, [r13+r6]
movdqa xmm6, xmm2
mpsadbw xmm2, xmm0, 000b
mpsadbw xmm6, xmm0, 100b
paddw xmm2, xmm6
movdqa xmm7, xmm3
mpsadbw xmm3, xmm0, 000b
mpsadbw xmm7, xmm0, 100b
paddw xmm3, xmm7
movdqa xmm6, xmm4
mpsadbw xmm4, xmm0, 000b
mpsadbw xmm6, xmm0, 100b
paddw xmm4, xmm6
movdqa xmm7, xmm5
mpsadbw xmm5, xmm0, 000b
mpsadbw xmm7, xmm0, 100b
paddw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
paddw xmm1, xmm2
paddw xmm1, xmm4 ; sum of lines 1- 8
; Store the 8 sums, then widen them to dwords and count each one.
movdqu [r4], xmm1
movdqa xmm2, xmm1
punpcklwd xmm1, xmm0
punpckhwd xmm2, xmm0
COUNT_SUM xmm1, r2d, r2, 1
COUNT_SUM xmm1, r2d, r2, 1
COUNT_SUM xmm1, r2d, r2, 1
COUNT_SUM xmm1, r2d, r2, 0
COUNT_SUM xmm2, r2d, r2 ,1
COUNT_SUM xmm2, r2d, r2 ,1
COUNT_SUM xmm2, r2d, r2 ,1
COUNT_SUM xmm2, r2d, r2 ,0
lea r0, [r0+8]
lea r13, [r13+8]
lea r4, [r4+16] ; element size is 2
sub r12, 8
jg near FIRST_ROW_SSE4
; Restore the original output pointer, height and source pointer.
pop r4
pop r2
pop r0
; r13 = height - 1 rows still to produce.
mov r13, r2
dec r13
HEIGHT_LOOP_SSE4:
mov r12, r1
WIDTH_LOOP_SSE4:
; xmm1 = incoming line y+8, xmm2 = outgoing line y, xmm7 = 8 sums of the row above.
movdqu xmm1, [r0+r3*8]
movdqu xmm2, [r0]
movdqu xmm7, [r4]
movdqa xmm3, xmm1
mpsadbw xmm1, xmm0, 000b
mpsadbw xmm3, xmm0, 100b
paddw xmm1, xmm3
movdqa xmm4, xmm2
mpsadbw xmm2, xmm0, 000b
mpsadbw xmm4, xmm0, 100b
paddw xmm2, xmm4
paddw xmm7, xmm1
psubw xmm7, xmm2
; r4 + width*2 bytes is the same column in the next output row.
movdqu [r4+r1*2], xmm7
movdqa xmm6, xmm7
punpcklwd xmm7, xmm0
punpckhwd xmm6, xmm0
COUNT_SUM xmm7, r2d, r2, 1
COUNT_SUM xmm7, r2d, r2, 1
COUNT_SUM xmm7, r2d, r2, 1
COUNT_SUM xmm7, r2d, r2, 0
COUNT_SUM xmm6, r2d, r2, 1
COUNT_SUM xmm6, r2d, r2, 1
COUNT_SUM xmm6, r2d, r2, 1
COUNT_SUM xmm6, r2d, r2, 0
lea r0, [r0+8]
lea r4, [r4+16]
sub r12, 8
jg near WIDTH_LOOP_SSE4
; Advance the source cursor to the next line: r0 += stride - width.
lea r0, [r0+r3]
sub r0, r1
dec r13
jg near HEIGHT_LOOP_SSE4
pop r13
pop r12
POP_XMM
LOAD_6_PARA_POP
ret
;****************************************************************************************************************************************************
;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;****************************************************************************************************************************************************
WELS_EXTERN SumOf16x16BlockOfFrame_sse2
; x86-64 variant: arguments arrive in r0..r5 via LOAD_6_PARA
; (r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride,
;  r4 = pFeatureOfBlock, r5 = pTimesOfFeatureValue).
; For every block position, stores the byte sum of the 16x16 block as a uint16
; through r4 and increments the dword counter [r5 + sum*4]. One position is
; handled per iteration.
; The first output row sums all 16 lines; later rows are computed as
; (row above) + line(y+16) - line(y).
; Both loops are do-while shaped: width must be > 0 and, because the row loop
; is entered unconditionally after the first row, height must be > 1.
; Scratch: r6 = 3*stride (first row) / 16*stride (later rows), r12 = width
; counter, r13 = 4-line group pointer (first row) / row counter (later rows),
; r2 = current sum.
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 6
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
push r12
push r13
; r0 and r4 are advanced and r2 is reused as scratch during the first-row
; pass, so their original values are kept on the stack.
push r0
push r2
push r4
pxor xmm0, xmm0
lea r6, [r3+r3*2]
mov r12, r1 ;r12:tmp_width
FIRST_ROW_X16H:
; Lines 0..3: psadbw against zero leaves the sum of bytes 0..7 in word 0 and
; the sum of bytes 8..15 in word 4 of each register.
movdqu xmm1, [r0]
movdqu xmm2, [r0+r3]
movdqu xmm3, [r0+r3*2]
movdqu xmm4, [r0+r6]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
paddw xmm1, xmm2
paddw xmm3, xmm4
paddw xmm1, xmm3
; Lines 4..7
lea r13, [r0+r3*4] ;ebp:r13
movdqu xmm2, [r13]
movdqu xmm3, [r13+r3]
movdqu xmm4, [r13+r3*2]
movdqu xmm5, [r13+r6]
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
psadbw xmm5, xmm0
paddw xmm2, xmm3
paddw xmm4, xmm5
paddw xmm2, xmm4
paddw xmm1, xmm2
; Lines 8..11
lea r13, [r13+r3*4]
movdqu xmm2, [r13]
movdqu xmm3, [r13+r3]
movdqu xmm4, [r13+r3*2]
movdqu xmm5, [r13+r6]
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
psadbw xmm5, xmm0
paddw xmm2, xmm3
paddw xmm4, xmm5
paddw xmm2, xmm4
paddw xmm1, xmm2
; Lines 12..15
lea r13, [r13+r3*4]
movdqu xmm2, [r13]
movdqu xmm3, [r13+r3]
movdqu xmm4, [r13+r3*2]
movdqu xmm5, [r13+r6]
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
psadbw xmm5, xmm0
paddw xmm2, xmm3
paddw xmm4, xmm5
paddw xmm2, xmm4
paddw xmm1, xmm2
; Fold the upper-half sum (word 4) onto the lower-half sum (word 0):
; punpckhwd against zero moves word 4 into word 0 of xmm2.
movdqa xmm2, xmm1
punpckhwd xmm2, xmm0
paddw xmm1, xmm2
movd r2d, xmm1
mov [r4], r2w
inc dword [r5+r2*4]
inc r0
lea r4, [r4+2]
dec r12
jg near FIRST_ROW_X16H
; Restore the original output pointer, height and source pointer.
pop r4
pop r2
pop r0
; r13 = height - 1 rows still to produce.
mov r13, r2
dec r13
mov r6, r3
sal r6, 4 ; succeeded 16th line
HEIGHT_LOOP_X16:
mov r12, r1
WIDTH_LOOP_X16:
; xmm1 = incoming line y+16, xmm2 = outgoing line y
movdqu xmm1, [r0+r6]
movdqu xmm2, [r0]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
; Word-wise difference per 8-byte half (wraps modulo 2^16 when negative).
psubw xmm1, xmm2
movdqa xmm2, xmm1
punpckhwd xmm2, xmm0
paddw xmm1, xmm2
movd r2d, xmm1
; 16-bit add of the sum from the row above; the upper bits of r2 stay zero.
add r2w, word [r4]
; r4 + width*2 bytes is the same column in the next output row.
mov [r4+r1*2], r2w
inc dword [r5+r2*4]
inc r0
add r4, 2
dec r12
jg near WIDTH_LOOP_X16
; Advance the source cursor to the next line: r0 += stride - width.
add r0, r3
sub r0, r1
dec r13
jg near HEIGHT_LOOP_X16
pop r13
pop r12
POP_XMM
LOAD_6_PARA_POP
ret
; requires: width % 16 == 0 && height > 1
;-----------------------------------------------------------------------------------------------------------------------------
;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------------------------------------------------------
; try 8 mv via offset
%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1
    ; Computes, for 8 consecutive x offsets, the sum of the 16 bytes starting
    ; at [ref + x]; the 8 word results are left in dst0 (%2).
    ; Requires xmm0 == 0. Clobbers %3, %4 and %5.
    movdqu  %2, [%1]                ; bytes 0..15
    movdqa  %4, %2
    movdqu  %3, [%1+8]              ; bytes 8..23
    movdqa  %5, %3
    mpsadbw %2, xmm0, 000b          ; word i = sum of bytes i   .. i+3
    mpsadbw %4, xmm0, 101b          ; word i = sum of bytes i+4 .. i+7
    paddw   %2, %4
    mpsadbw %3, xmm0, 010b          ; word i = sum of bytes i+8 .. i+11
    mpsadbw %5, xmm0, 111b          ; word i = sum of bytes i+12.. i+15
    paddw   %3, %5
    paddw   %2, %3                  ; 16-byte sliding sums
%endmacro ; end of SUM_LINE_X16_SSE41
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
; x86-64 variant: arguments arrive in r0..r5 via LOAD_6_PARA
; (r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride,
;  r4 = pFeatureOfBlock, r5 = pTimesOfFeatureValue).
; Processes 8 horizontally adjacent block positions per iteration using
; SUM_LINE_X16_SSE41 (8 sliding 16-byte sums of one line). For each position
; the 16x16 byte sum is stored as a uint16 through r4 and the dword counter
; [r5 + sum*4] is incremented (via COUNT_SUM).
; The first output row sums all 16 lines; later rows are computed as
; (row above) + line(y+16) - line(y).
; The output is accessed with movdqa, so r4 must be 16-byte aligned and
; width*2 must be a multiple of 16 (covered by the documented width % 16 == 0).
; Both loops are do-while shaped; height must be > 1.
; Scratch: r6 = 3*stride (first row) / 16*stride (later rows), r12 = width
; counter, r13 = 4-line group pointer (first row) / row counter (later rows),
; r2 = current sum.
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
push r12
push r13
; r0 and r4 are advanced and r2 is reused as scratch during the first-row
; pass, so their original values are kept on the stack.
push r0
push r2
push r4
pxor xmm0, xmm0
lea r6, [r3+r3*2]
mov r12, r1 ;r12:tmp_width
FIRST_ROW_X16_SSE4:
; Lines 0..3: each call leaves its result in the first xmm argument and uses
; the following three as scratch, so the results end up in xmm1..xmm4.
SUM_LINE_X16_SSE41 r0, xmm1, xmm2, xmm3, xmm4
SUM_LINE_X16_SSE41 r0+r3, xmm2, xmm3, xmm4, xmm5
SUM_LINE_X16_SSE41 r0+r3*2,xmm3, xmm4, xmm5, xmm6
SUM_LINE_X16_SSE41 r0+r6, xmm4, xmm5, xmm6, xmm7
paddw xmm1, xmm2
paddw xmm3, xmm4
paddw xmm1, xmm3
; Lines 4..7, accumulated one at a time into xmm1.
lea r13, [r0+r3*4]
SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
; Lines 8..11
lea r13, [r13+r3*4]
SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
; Lines 12..15
lea r13, [r13+r3*4]
SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
paddw xmm1, xmm2
; Aligned store of the 8 sums, then widen to dwords and count each one.
movdqa [r4], xmm1
movdqa xmm2, xmm1
punpcklwd xmm1, xmm0
punpckhwd xmm2, xmm0
COUNT_SUM xmm1, r2d, r2, 1
COUNT_SUM xmm1, r2d, r2, 1
COUNT_SUM xmm1, r2d, r2, 1
COUNT_SUM xmm1, r2d, r2, 0
COUNT_SUM xmm2, r2d, r2, 1
COUNT_SUM xmm2, r2d, r2, 1
COUNT_SUM xmm2, r2d, r2, 1
COUNT_SUM xmm2, r2d, r2, 0
lea r0, [r0+8]
lea r4, [r4+16] ; element size is 2
sub r12, 8
jg near FIRST_ROW_X16_SSE4
; Restore the original output pointer, height and source pointer.
pop r4
pop r2
pop r0
; r13 = height - 1 rows still to produce.
mov r13, r2
dec r13
mov r6, r3
sal r6, 4 ; succeeded 16th line
HEIGHT_LOOP_X16_SSE4:
mov r12, r1
WIDTH_LOOP_X16_SSE4:
; xmm7 = 8 sums of the row above, xmm1 = incoming line y+16, xmm2 = outgoing line y.
movdqa xmm7, [r4]
SUM_LINE_X16_SSE41 r0+r6, xmm1, xmm2, xmm3, xmm4
SUM_LINE_X16_SSE41 r0, xmm2, xmm3, xmm4, xmm5
paddw xmm7, xmm1
psubw xmm7, xmm2
; r4 + width*2 bytes is the same column in the next output row.
movdqa [r4+r1*2], xmm7
movdqa xmm6, xmm7
punpcklwd xmm7, xmm0
punpckhwd xmm6, xmm0
COUNT_SUM xmm7, r2d, r2, 1
COUNT_SUM xmm7, r2d, r2, 1
COUNT_SUM xmm7, r2d, r2, 1
COUNT_SUM xmm7, r2d, r2, 0
COUNT_SUM xmm6, r2d, r2, 1
COUNT_SUM xmm6, r2d, r2, 1
COUNT_SUM xmm6, r2d, r2, 1
COUNT_SUM xmm6, r2d, r2, 0
lea r0, [r0+8]
lea r4, [r4+16]
sub r12, 8
jg near WIDTH_LOOP_X16_SSE4
; Advance the source cursor to the next line: r0 += stride - width.
add r0, r3
sub r0, r1
dec r13
jg near HEIGHT_LOOP_X16_SSE4
pop r13
pop r12
POP_XMM
LOAD_6_PARA_POP
ret
;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
; x86-64 variant: arguments arrive in r0..r3 via LOAD_4_PARA
; (r0 = pFeatureOfBlock, r1 = kiWidth, r2 = kiHeight,
;  r3 = pFeatureValuePointerList).
; Walks the kiWidth x kiHeight array of uint16 sums. For each sum s it writes
; the packed position (x in the low word, y in the high word) through the
; write cursor stored in the 8-byte slot r3[s], then advances that cursor by
; 4 bytes.
; x starts at 0,4,8,12 and grows by 16 per group of 4 columns; y grows by 4
; per row (values taken from the .rodata vectors).
; Four sums are handled per inner iteration and the loop exits only when the
; counter reaches exactly zero, so kiWidth must be a positive multiple of 4;
; kiHeight must be > 0.
; Scratch: r12 = row counter, r4 = column counter, r2 = current sum,
; r5 = &list[s], r6 = write cursor, r13 = target of the warm-up load.
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
push r12
push r13
mov r12, r2
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
pxor xmm4, xmm4
pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
movdqa xmm2, xmm5 ; x_qpel vector
mov r4, r1
HASH_WIDTH_LOOP_SSE2:
movq xmm0, [r0] ; load x8 sum
; xmm0 = the 4 sums zero-extended to dwords
punpcklwd xmm0, xmm4
; xmm1 = 4 dwords, each x | (y << 16)
movdqa xmm1, xmm2
punpcklwd xmm1, xmm3
; First three of the four entries; the fourth follows without the shifts.
%rep 3
movd r2d, xmm0 ;edx:r2
lea r5, [r3+r2*8] ;ebx:r5
mov r6, [r5] ;eax:r6
movd [r6], xmm1
; NOTE(review): this is an 8-byte load, whereas the x86-32 version reads 4 bytes.
mov r13, [r6+4] ; explicitly load eax+4 due cache miss from vtune observation
lea r6, [r6+4]
mov [r5], r6
psrldq xmm1, 4
psrldq xmm0, 4
%endrep
movd r2d, xmm0
lea r5, [r3+r2*8] ;ebx:r5
mov r6, [r5] ;eax:r6
movd [r6], xmm1
mov r13, [r6+4] ; explicitly load eax+4 due cache miss from vtune observation
lea r6, [r6+4]
mov [r5], r6
; Next group of 4 columns.
paddw xmm2, xmm7
lea r0, [r0+8]
sub r4, 4
jnz near HASH_WIDTH_LOOP_SSE2
; Next row.
paddw xmm3, xmm6
dec r12
jnz near HASH_HEIGHT_LOOP_SSE2
pop r13
pop r12
POP_XMM
ret
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
; Legacy names for the same parameters: pTimesOfSum = pTimesOfFeatureValue, list_sz = kiListSize,
; pPositionOfSum = pLocationOfFeature, sum_idx_list = pFeatureValuePointerList (pBuf is unchanged).
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
; x86-64 build; arguments arrive in r0..r4 via LOAD_5_PARA.
;   r0 = pTimesOfFeatureValue   r1 = pBuf   r2 = kiListSize
;   r3 = pLocationOfFeature     r4 = pFeatureValuePointerList
;
; Carves pBuf into consecutive regions, one per list entry. For each index i
; in [0, kiListSize):
;     pLocationOfFeature[i]       = cursor
;     pFeatureValuePointerList[i] = cursor
;     cursor                     += pTimesOfFeatureValue[i] * 4   (bytes)
; with cursor (r1) starting at pBuf.
;
; Entries are processed four at a time; when all four counts are zero the
; same cursor is broadcast with aligned 16-byte stores. The final
; kiListSize % 4 entries are handled one by one.
;
; r5 is the byte offset into the 4-byte count array; the pointer arrays have
; 8-byte entries and are therefore indexed with r5*2.
;
; Alignment: pTimesOfFeatureValue is read with movdqa and both pointer arrays
; may be written with movdqa, so all three must be 16-byte aligned.
;
; A kiListSize <= 0 now returns immediately and a kiListSize of 1..3 skips the
; x4 loop. Previously that loop was entered with a zero count and the
; dec/jnz pair wrapped around, overrunning all three arrays.
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION r2, r2d
    push    r12
    mov     r12, r2                         ; keep kiListSize for the tail
    xor     r5, r5                          ; byte offset into the count array
    pxor    xmm3, xmm3
    test    r2, r2
    jle     near hash_assign_no_rem_sse2    ; empty or negative list
    sar     r2, 2                           ; number of full groups of 4
    jz      near hash_assign_tail_sse2      ; fewer than 4 entries

hash_assign_loop_x4_sse2:
    movdqa  xmm0, [r0+r5]
    pslld   xmm0, 2                         ; counts -> byte sizes
    movdqa  xmm1, xmm0
    pcmpeqd xmm1, xmm3
    movmskps r6, xmm1
    cmp     r6, 0x0f                        ; all four counts zero?
    jz      near hash_assign_with_copy_sse2
%assign x 0
%rep 4
    mov     [r3+r5*2+x], r1
    mov     [r4+r5*2+x], r1
    movd    r6d, xmm0                       ; zero-extends into r6
    add     r1, r6                          ; cursor += size of this entry
    psrldq  xmm0, 4
%assign x x+8
%endrep
    jmp     near assign_next_sse2

hash_assign_with_copy_sse2:
    movq    xmm1, r1
    pshufd  xmm2, xmm1, 01000100b           ; broadcast the 64-bit cursor
    movdqa  [r3+r5*2], xmm2
    movdqa  [r4+r5*2], xmm2
    movdqa  [r3+r5*2+16], xmm2
    movdqa  [r4+r5*2+16], xmm2

assign_next_sse2:
    add     r5, 16
    dec     r2
    jnz     near hash_assign_loop_x4_sse2

hash_assign_tail_sse2:
    and     r12, 3
    jz      near hash_assign_no_rem_sse2

hash_assign_loop_x4_rem_sse2:
    mov     [r3+r5*2], r1
    mov     [r4+r5*2], r1
    mov     r6d, [r0+r5]
    sal     r6, 2
    add     r1, r6
    add     r5, 4
    dec     r12
    jnz     near hash_assign_loop_x4_rem_sse2

hash_assign_no_rem_sse2:
    pop     r12
    ret
%endif
;**********************************************************************************************************************************
; int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
;
; Returns the sum of the 64 bytes of the 8x8 block at ref0 whose rows are linesize bytes apart.
; Two rows are packed into one register (row n in the low qword, row n+1 in the high qword) and
; summed per qword with psadbw against zero; the two qword totals are added at the end.
;**********************************************************************************************************************************
WELS_EXTERN SumOf8x8SingleBlock_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    pxor xmm0, xmm0                     ; zero operand for psadbw
    ; rows 0 and 1 seed the accumulator
    movq xmm1, [r0]
    movhps xmm1, [r0+r1]
    psadbw xmm1, xmm0                   ; word 0 = sum(row 0), word 4 = sum(row 1)
    ; rows 2..7, one pair per repetition
%rep 3
    lea r0, [r0+2*r1]
    movq xmm2, [r0]
    movhps xmm2, [r0+r1]
    psadbw xmm2, xmm0
    paddw xmm1, xmm2
%endrep
    ; fold the high-qword total onto the low one
    movdqa xmm2, xmm1
    psrldq xmm2, 8
    paddw xmm1, xmm2
    movd retrd, xmm1
    ret
;**********************************************************************************************************************************
; int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
;
; Returns the sum of the 256 bytes of the 16x16 block at ref0 whose rows are linesize bytes apart.
; Every row is read with movdqa, so each row address must be 16-byte aligned.
; psadbw against zero produces one partial sum per 8-byte half of a row; the halves are kept in
; separate lanes while accumulating and are added together at the end.
;**********************************************************************************************************************************
WELS_EXTERN SumOf16x16SingleBlock_sse2
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 6                          ; paired with POP_XMM below (macros from asm_inc.asm)
    SIGN_EXTENSION r1, r1d
    pxor xmm0, xmm0                     ; zero operand for psadbw
    ; rows 0 and 1 seed the accumulator
    movdqa xmm1, [r0]
    movdqa xmm2, [r0+r1]
    psadbw xmm1, xmm0
    psadbw xmm2, xmm0
    paddw xmm1, xmm2
    ; rows 2..15, one pair per repetition
%rep 7
    lea r0, [r0+2*r1]
    movdqa xmm2, [r0]
    movdqa xmm3, [r0+r1]
    psadbw xmm2, xmm0
    psadbw xmm3, xmm0
    paddw xmm2, xmm3
    paddw xmm1, xmm2
%endrep
    ; word 0 holds the left-half total, word 4 the right-half total: add them
    movdqa xmm2, xmm1
    psrldq xmm2, 8
    paddw xmm1, xmm2
    movd retrd, xmm1
    POP_XMM
    ret
;**********************************************************************************************************************************
;
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; Computes, for the eight horizontal candidates ref+0 .. ref+7, the 16x16 SAD against src, adds
; base_cost[i] to candidate i and picks the smallest total.
;
; \note:
;   src rows and base_cost are read with movdqa and therefore must be 16-byte aligned;
;   ref is read with movdqu (bytes 0..23 of every row) and may be unaligned
; \return value:
;   the minimal cost; the index (0..7) of the first candidate that reaches it is stored
;   to *index_min_cost
;**********************************************************************************************************************************

; One 16-pixel row for all eight candidates, pointers left untouched.
; xmm7 accumulates the eight 16-bit SADs; xmm0-xmm4 are clobbered.
; mpsadbw imm8: bits 1:0 pick the 4-byte group of src, bit 2 picks ref start offset 0 or 4.
%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
    movdqa xmm0, [%1]               ; src[0..15]
    movdqu xmm1, [%2]               ; ref[0..15]
    movdqu xmm2, [%2+8h]            ; ref[8..23]
    movdqa xmm3, xmm1
    movdqa xmm4, xmm2
    mpsadbw xmm1, xmm0, 0           ; 000 B: src[0..3]   vs ref[0..]
    paddw xmm7, xmm1
    mpsadbw xmm3, xmm0, 5           ; 101 B: src[4..7]   vs ref[4..]
    paddw xmm7, xmm3
    mpsadbw xmm2, xmm0, 2           ; 010 B: src[8..11]  vs ref[8..]
    paddw xmm7, xmm2
    mpsadbw xmm4, xmm0, 7           ; 111 B: src[12..15] vs ref[12..]
    paddw xmm7, xmm4
%endmacro ; end of SAD_16x16_LINE_SSE41E

; Same as SAD_16x16_LINE_SSE41E, then steps src and ref to the next row.
%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
    SAD_16x16_LINE_SSE41E %1, %2, %3, %4
    add %1, %3
    add %2, %4
%endmacro ; end of SAD_16x16_LINE_SSE41

WELS_EXTERN SampleSad16x16Hor8_sse41
    %assign push_num 0
    LOAD_6_PARA                     ; r0 src, r1 stride_src, r2 ref, r3 stride_ref, r4 base_cost, r5 index_min_cost
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor xmm7, xmm7                 ; eight 16-bit SAD accumulators

    ; rows 0..14 advance the pointers, row 15 does not need to
%rep 15
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
%endrep
    SAD_16x16_LINE_SSE41E r0, r2, r1, r3

    ; widen SADs and base costs to 32 bit and add them:
    ; xmm4 = totals of candidates 0..3, xmm5 = totals of candidates 4..7
    pxor xmm0, xmm0
    movdqa xmm5, [r4]
    movdqa xmm6, xmm7
    movdqa xmm4, xmm5
    punpcklwd xmm6, xmm0
    punpcklwd xmm4, xmm0
    punpckhwd xmm7, xmm0
    punpckhwd xmm5, xmm0
    paddd xmm4, xmm6
    paddd xmm5, xmm7

    ; horizontal unsigned minimum, broadcast to every lane of xmm2
    movdqa xmm3, xmm4
    pminud xmm3, xmm5
    pshufd xmm2, xmm3, 01001110B    ; swap the 64-bit halves
    pminud xmm2, xmm3
    pshufd xmm3, xmm2, 10110001B    ; swap neighbouring dwords
    pminud xmm2, xmm3
    movd retrd, xmm2

    ; locate the first lane equal to the minimum, low four candidates first
    pcmpeqd xmm4, xmm2
    movmskps r2d, xmm4
    bsf r1d, r2d                    ; ZF=1 when no bit is set, i.e. no match in 0..3
    jnz near .write_index
    pcmpeqd xmm5, xmm2
    movmskps r2d, xmm5
    bsf r1d, r2d
    add r1d, 4
.write_index:
    mov [r5], r1d
    POP_XMM
    LOAD_6_PARA_POP
    ret
;**********************************************************************************************************************************
;
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; Computes, for the eight horizontal candidates ref+0 .. ref+7, the 8x8 SAD against src on top of
; base_cost[i] (all in 16-bit lanes) and picks the smallest total.
;
; \note:
;   src and ref are read with movdqu, so neither needs 16-byte alignment;
;   base_cost is read with movdqa and must be 16-byte aligned
; \return value:
;   the minimal cost; the index (0..7) of the candidate that reaches it is stored
;   to *index_min_cost
;
;**********************************************************************************************************************************

; One 8-pixel row for all eight candidates, pointers left untouched.
; xmm7 accumulates the eight 16-bit costs; xmm0-xmm2 are clobbered.
; Sixteen bytes are loaded from each pointer, but only src[0..7] and ref[0..14] enter the SADs.
%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
    movdqu xmm0, [%1]
    movdqu xmm1, [%2]
    movdqa xmm2, xmm1
    mpsadbw xmm1, xmm0, 0           ; 000 B: src[0..3] vs ref[0..]
    paddw xmm7, xmm1
    mpsadbw xmm2, xmm0, 5           ; 101 B: src[4..7] vs ref[4..]
    paddw xmm7, xmm2
%endmacro ; end of SAD_8x8_LINE_SSE41E

; Same as SAD_8x8_LINE_SSE41E, then steps src and ref to the next row.
%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
    SAD_8x8_LINE_SSE41E %1, %2, %3, %4
    add %1, %3
    add %2, %4
%endmacro ; end of SAD_8x8_LINE_SSE41

WELS_EXTERN SampleSad8x8Hor8_sse41
    %assign push_num 0
    LOAD_6_PARA                     ; r0 src, r1 stride_src, r2 ref, r3 stride_ref, r4 base_cost, r5 index_min_cost
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    movdqa xmm7, [r4]               ; start the accumulators from the base cost list

    ; rows 0..6 advance the pointers, row 7 does not need to
%rep 7
    SAD_8x8_LINE_SSE41 r0, r2, r1, r3
%endrep
    SAD_8x8_LINE_SSE41E r0, r2, r1, r3

    phminposuw xmm0, xmm7           ; word 0 <- minimum, word 1 <- its index, remaining words <- 0
    pextrw retrd, xmm0, 0           ; return value: minimal cost, zero-extended
    pextrw r1d, xmm0, 1             ; index of the minimum, zero-extended
    mov [r5], r1d
    POP_XMM
    LOAD_6_PARA_POP
    ret