a00e2e7229
This makes them consistent with the rest of the assembly source files. Prior to f2314151e8, all the assembly files had consistent indentation; that commit left this file formatted differently.
1823 lines
50 KiB
NASM
1823 lines
50 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*************************************************************************/
|
|
%include "asm_inc.asm"
|
|
|
|
;***********************************************************************
|
|
; Local Data (Read Only)
|
|
;***********************************************************************
|
|
SECTION .rodata align=16

; Word constants (four lanes each) loaded by FillQpelLocationByFeatureValue_sse2.
ALIGN 16
mv_x_inc_x4     times 4 dw 0x10             ; added to the x vector after every group of 4 entries
mv_y_inc_x4     times 4 dw 0x04             ; added to the y vector after every row
mx_x_offset_x4  dw 0x00, 0x04, 0x08, 0x0C   ; initial x values of lanes 0..3
|
|
|
|
SECTION .text
|
|
%ifdef X86_32
|
|
;**********************************************************************************************************************
;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; For each of kiWidth x kiHeight positions, writes the sum of the 8x8 byte
; block starting at that position to pFeatureOfBlock (rows are kiWidth words
; apart) and increments pTimesOfFeatureValue[sum].
;   pass 1: the first output row is computed by summing 8 lines directly.
;   pass 2: every further row = row above + line entering the window
;           - line leaving the window.
; Both loops test their counter at the bottom, so kiWidth >= 1 and
; kiHeight > 1 are expected. The stack copy of kiHeight is used as the row
; counter and is modified.
;
; Registers: esi = source, edi = output, edx = histogram base, ebx = stride,
;            ecx = 3*stride (pass 1) / previous sum (pass 2),
;            ebp = source + 4*stride (pass 1) / kiWidth (pass 2), xmm0 = 0.
;**********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
%define     pushsize        16
%define     localsize       4
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp + 0
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp, localsize

    pxor    xmm0, xmm0
    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     edx, [times_of_sum]
    mov     ebx, [linesize]
    mov     eax, [width]
    lea     ecx, [ebx+ebx*2]            ; 3*linesize

    mov     [tmp_width], eax
    lea     ebp, [esi+ebx*4]            ; lines 4..7 of the block
FIRST_ROW:
    ; lines 0..3: pack two 8-byte lines per register, then sum the bytes
    movq    xmm1, [esi]
    movq    xmm2, [esi+ebx]
    movq    xmm3, [esi+ebx*2]
    movq    xmm4, [esi+ecx]

    shufps  xmm1, xmm2, 01000100b       ; low qword = line 0, high qword = line 1
    shufps  xmm3, xmm4, 01000100b       ; low qword = line 2, high qword = line 3
    psadbw  xmm1, xmm0
    psadbw  xmm3, xmm0
    paddd   xmm1, xmm3

    ; lines 4..7
    movq    xmm2, [ebp]
    movq    xmm3, [ebp+ebx]
    movq    xmm4, [ebp+ebx*2]
    movq    xmm5, [ebp+ecx]

    shufps  xmm2, xmm3, 01000100b
    shufps  xmm4, xmm5, 01000100b
    psadbw  xmm2, xmm0
    psadbw  xmm4, xmm0
    paddd   xmm2, xmm4

    paddd   xmm1, xmm2
    pshufd  xmm2, xmm1, 00001110b       ; bring the high-qword partial sum down
    paddd   xmm1, xmm2
    movd    eax, xmm1                   ; eax = 8x8 block sum
    mov     [edi], ax
    inc     dword [edx+eax*4]

    inc     esi
    inc     ebp
    add     edi, 2

    dec     dword [tmp_width]
    jg      FIRST_ROW

    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     ebp, [width]
    dec     dword [height]
HEIGHT_LOOP:
    mov     [tmp_width], ebp
WIDTH_LOOP:
    movq    xmm1, [esi+ebx*8]           ; line entering the window
    movq    xmm2, [esi]                 ; line leaving the window
    psadbw  xmm1, xmm0
    psadbw  xmm2, xmm0
    psubd   xmm1, xmm2
    movd    eax, xmm1                   ; eax = signed difference of the two line sums
    ; Zero-extend the previous sum. A 16-bit load into cx would keep the
    ; upper half of ecx (left over from 3*linesize) and corrupt the
    ; histogram index for large strides.
    movzx   ecx, word [edi]
    add     eax, ecx

    mov     [edi+ebp*2], ax             ; same column, next output row
    inc     dword [edx+eax*4]

    inc     esi
    add     edi, 2

    dec     dword [tmp_width]
    jg      WIDTH_LOOP

    add     esi, ebx                    ; advance to the start of the next source line
    sub     esi, ebp

    dec     dword [height]
    jg      HEIGHT_LOOP

    add     esp, localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; COUNT_SUM xmm_reg, tmp_reg, shift
;   %1  XMM register; its low dword is used as the table index
;   %2  32-bit scratch register (clobbered)
;   %3  1 = shift %1 right by 4 bytes afterwards so the next lane becomes the
;       low dword; any other value leaves %1 untouched
; Increments the dword at [edx + index*4]; edx must hold the table base.
; The parameters are used directly rather than through %define aliases, so
; expanding the macro does not leave global single-line macros behind.
;-----------------------------------------------------------------------------
%macro COUNT_SUM 3
    movd    %2, %1
    inc     dword [edx+%2*4]
%if %3 == 1
    psrldq  %1, 4
%endif
%endmacro
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; requires: width % 8 == 0 && height > 1
;-----------------------------------------------------------------------------
;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------
; read extra (16 - (width % 8) ) mod 16 bytes of every line
; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
;
; Produces 8 horizontally adjacent 8x8 block sums per iteration. The first
; output row is summed from 8 lines; each further row is the row above plus
; the entering line minus the leaving line. Every sum is stored as a word in
; pFeatureOfBlock and counted in pTimesOfFeatureValue.

; SUM_LINE_X8_SSE41 line, tmp
;   in : %1 = 16 loaded source bytes, xmm0 = 0
;   out: %1 = 8 words, word i = sum of bytes i..i+7; %2 is clobbered
%macro SUM_LINE_X8_SSE41 2
    movdqa  %2, %1
    mpsadbw %1, xmm0, 000b              ; word i = bytes i..i+3
    mpsadbw %2, xmm0, 100b              ; word i = bytes i+4..i+7
    paddw   %1, %2
%endmacro

WELS_EXTERN SumOf8x8BlockOfFrame_sse4
%define     pushsize        16
%define     localsize       4
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp + 0
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp, localsize

    pxor    xmm0, xmm0
    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     edx, [times_of_sum]
    mov     ebx, [linesize]
    mov     eax, [width]
    lea     ecx, [ebx+ebx*2]            ; 3*linesize

    mov     [tmp_width], eax
    lea     ebp, [esi+ebx*4]            ; lines 4..7
FIRST_ROW_SSE4:
    movdqu  xmm1, [esi]
    movdqu  xmm3, [esi+ebx]
    movdqu  xmm5, [esi+ebx*2]
    movdqu  xmm7, [esi+ecx]

    SUM_LINE_X8_SSE41 xmm1, xmm2        ; 8 sums of line 1
    SUM_LINE_X8_SSE41 xmm3, xmm4        ; 8 sums of line 2
    SUM_LINE_X8_SSE41 xmm5, xmm2        ; 8 sums of line 3
    SUM_LINE_X8_SSE41 xmm7, xmm4        ; 8 sums of line 4

    paddw   xmm1, xmm3
    paddw   xmm5, xmm7
    paddw   xmm1, xmm5                  ; upper 4 lines

    movdqu  xmm2, [ebp]
    movdqu  xmm3, [ebp+ebx]
    movdqu  xmm4, [ebp+ebx*2]
    movdqu  xmm5, [ebp+ecx]

    SUM_LINE_X8_SSE41 xmm2, xmm6
    SUM_LINE_X8_SSE41 xmm3, xmm7
    SUM_LINE_X8_SSE41 xmm4, xmm6
    SUM_LINE_X8_SSE41 xmm5, xmm7

    paddw   xmm2, xmm3
    paddw   xmm4, xmm5
    paddw   xmm1, xmm2
    paddw   xmm1, xmm4                  ; lines 1-8

    movdqu  [edi], xmm1

    ; widen the 8 words to dwords and count each one
    movdqa  xmm2, xmm1
    punpcklwd xmm1, xmm0
    punpckhwd xmm2, xmm0

%rep 3
    COUNT_SUM xmm1, eax, 1
%endrep
    COUNT_SUM xmm1, eax, 0
%rep 3
    COUNT_SUM xmm2, eax, 1
%endrep
    COUNT_SUM xmm2, eax, 0

    lea     esi, [esi+8]
    lea     ebp, [ebp+8]
    lea     edi, [edi+16]               ; 8 outputs of 2 bytes

    sub     dword [tmp_width], 8
    jg      near FIRST_ROW_SSE4

    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     ebp, [width]
    dec     dword [height]
HEIGHT_LOOP_SSE4:
    mov     ecx, ebp                    ; ecx = columns left in this row
WIDTH_LOOP_SSE4:
    movdqu  xmm1, [esi+ebx*8]           ; line entering the window
    movdqu  xmm2, [esi]                 ; line leaving the window
    movdqu  xmm7, [edi]                 ; sums of the row above

    SUM_LINE_X8_SSE41 xmm1, xmm3
    SUM_LINE_X8_SSE41 xmm2, xmm4

    paddw   xmm7, xmm1
    psubw   xmm7, xmm2
    movdqu  [edi+ebp*2], xmm7           ; same columns, next output row

    movdqa  xmm6, xmm7
    punpcklwd xmm7, xmm0
    punpckhwd xmm6, xmm0

%rep 3
    COUNT_SUM xmm7, eax, 1
%endrep
    COUNT_SUM xmm7, eax, 0
%rep 3
    COUNT_SUM xmm6, eax, 1
%endrep
    COUNT_SUM xmm6, eax, 0

    lea     esi, [esi+8]
    lea     edi, [edi+16]

    sub     ecx, 8
    jg      near WIDTH_LOOP_SSE4

    lea     esi, [esi+ebx]              ; start of the next source line
    sub     esi, ebp

    dec     dword [height]
    jg      near HEIGHT_LOOP_SSE4

    add     esp, localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
|
|
|
|
|
|
;****************************************************************************************************************************************************
;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; One 16x16 block sum per iteration. The first output row sums 16 lines
; directly; each further row is the row above plus the entering line minus
; the leaving line. Each sum is stored as a word in pFeatureOfBlock and
; counted in pTimesOfFeatureValue. Loop counters are tested at the bottom, so
; every loop body runs at least once; the stack copy of kiHeight is modified.
;****************************************************************************************************************************************************

; SUM_4LINES_X16_SSE2 base, stride, stride3, x0, x1, x2, x3
;   Sums 4 consecutive 16-byte lines starting at [base]. On exit x0 holds the
;   two psadbw partial sums (low and high qword); x1..x3 are clobbered.
;   xmm0 must be zero.
%macro SUM_4LINES_X16_SSE2 7
    movdqu  %4, [%1]
    movdqu  %5, [%1+%2]
    movdqu  %6, [%1+%2*2]
    movdqu  %7, [%1+%3]

    psadbw  %4, xmm0
    psadbw  %5, xmm0
    psadbw  %6, xmm0
    psadbw  %7, xmm0
    paddw   %4, %5
    paddw   %6, %7
    paddw   %4, %6
%endmacro

WELS_EXTERN SumOf16x16BlockOfFrame_sse2
%define     pushsize        16
%define     localsize       4
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp, localsize

    pxor    xmm0, xmm0
    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     edx, [times_of_sum]
    mov     ebx, [linesize]
    mov     eax, [width]

    lea     ecx, [ebx+ebx*2]            ; 3*linesize
    mov     [tmp_width], eax
FIRST_ROW_X16H:
    SUM_4LINES_X16_SSE2 esi, ebx, ecx, xmm1, xmm2, xmm3, xmm4   ; lines 0..3

    lea     ebp, [esi+ebx*4]
    SUM_4LINES_X16_SSE2 ebp, ebx, ecx, xmm2, xmm3, xmm4, xmm5   ; lines 4..7
    paddw   xmm1, xmm2

    lea     ebp, [ebp+ebx*4]
    SUM_4LINES_X16_SSE2 ebp, ebx, ecx, xmm2, xmm3, xmm4, xmm5   ; lines 8..11
    paddw   xmm1, xmm2

    lea     ebp, [ebp+ebx*4]
    SUM_4LINES_X16_SSE2 ebp, ebx, ecx, xmm2, xmm3, xmm4, xmm5   ; lines 12..15
    paddw   xmm1, xmm2

    ; fold the high-qword partial sum onto the low one
    movdqa  xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1, xmm2
    movd    eax, xmm1
    mov     [edi], ax
    inc     dword [edx+eax*4]

    inc     esi
    lea     edi, [edi+2]

    dec     dword [tmp_width]
    jg      near FIRST_ROW_X16H

    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     ebp, [width]
    dec     dword [height]

    mov     ecx, ebx
    sal     ecx, 4                      ; succeeded 16th line
HEIGHT_LOOP_X16:
    mov     [tmp_width], ebp
WIDTH_LOOP_X16:
    movdqu  xmm1, [esi+ecx]             ; line entering the window
    movdqu  xmm2, [esi]                 ; line leaving the window
    psadbw  xmm1, xmm0
    psadbw  xmm2, xmm0
    psubw   xmm1, xmm2
    movdqa  xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1, xmm2
    movd    eax, xmm1                   ; 16-bit difference, upper half of eax is zero
    add     ax, word [edi]              ; + sum of the row above
    mov     [edi+ebp*2], ax
    inc     dword [edx+eax*4]

    inc     esi
    add     edi, 2

    dec     dword [tmp_width]
    jg      near WIDTH_LOOP_X16

    add     esi, ebx                    ; start of the next source line
    sub     esi, ebp

    dec     dword [height]
    jg      near HEIGHT_LOOP_X16

    add     esp, localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
|
|
|
|
; requires: width % 16 == 0 && height > 1
|
|
;-----------------------------------------------------------------------------------------------------------------------------
|
|
;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
|
|
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
|
|
;-----------------------------------------------------------------------------------------------------------------------------
|
|
; try 8 mv via offset
;
; SUM_LINE_X16_SSE41 ref, dst0, dst1, tmp0, tmp1
;   Loads 24 bytes from [ref] and leaves 8 words in dst0, where word i is the
;   sum of bytes i..i+15. dst1, tmp0 and tmp1 are clobbered. xmm0 must be zero.
%macro SUM_LINE_X16_SSE41  5    ; ref, dst0, dst1, tmp0, tmp1
    movdqu  %2, [%1]
    movdqu  %3, [%1+8]
    movdqa  %4, %2
    movdqa  %5, %3

    mpsadbw %2, xmm0, 0     ; 000 B: bytes i..i+3
    mpsadbw %4, xmm0, 5     ; 101 B: bytes i+4..i+7
    mpsadbw %3, xmm0, 2     ; 010 B: bytes i+8..i+11
    mpsadbw %5, xmm0, 7     ; 111 B: bytes i+12..i+15
    paddw   %2, %4
    paddw   %3, %5
    paddw   %2, %3          ; accumulate cost
%endmacro   ; end of SUM_LINE_X16_SSE41
|
|
|
|
; Produces 8 horizontally adjacent 16x16 block sums per iteration using
; SUM_LINE_X16_SSE41. The first output row sums 16 lines; each further row is
; the row above plus the entering line minus the leaving line.
; pFeatureOfBlock is accessed with movdqa, so it and every output row
; (kiWidth*2 bytes apart) must be 16-byte aligned.
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
%define     pushsize        16
%define     localsize       4
%define     ref             esp + pushsize + localsize + 4
%define     sum_ref         esp + pushsize + localsize + 20
%define     times_of_sum    esp + pushsize + localsize + 24
%define     width           esp + pushsize + localsize + 8
%define     height          esp + pushsize + localsize + 12
%define     linesize        esp + pushsize + localsize + 16
%define     tmp_width       esp
    push    ebx
    push    ebp
    push    esi
    push    edi
    sub     esp, localsize

    pxor    xmm0, xmm0
    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     edx, [times_of_sum]
    mov     ebx, [linesize]
    mov     eax, [width]

    lea     ecx, [ebx+ebx*2]            ; 3*linesize
    mov     [tmp_width], eax
FIRST_ROW_X16_SSE4:
    ; lines 0..3, each into its own register
    SUM_LINE_X16_SSE41  esi,        xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41  esi+ebx,    xmm2, xmm3, xmm4, xmm5
    SUM_LINE_X16_SSE41  esi+ebx*2,  xmm3, xmm4, xmm5, xmm6
    SUM_LINE_X16_SSE41  esi+ecx,    xmm4, xmm5, xmm6, xmm7
    paddw   xmm1, xmm2
    paddw   xmm3, xmm4
    paddw   xmm1, xmm3

    ; lines 4..7, accumulated into xmm1 one at a time
    lea     ebp, [esi+ebx*4]
    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2

    ; lines 8..11, then 12..15
%rep 2
    lea     ebp, [ebp+ebx*4]
    SUM_LINE_X16_SSE41  ebp,        xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ebx*2,  xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41  ebp+ecx,    xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
%endrep

    movdqa  [edi], xmm1
    movdqa  xmm2, xmm1
    punpcklwd xmm1, xmm0
    punpckhwd xmm2, xmm0

%rep 3
    COUNT_SUM xmm1, eax, 1
%endrep
    COUNT_SUM xmm1, eax, 0
%rep 3
    COUNT_SUM xmm2, eax, 1
%endrep
    COUNT_SUM xmm2, eax, 0

    lea     esi, [esi+8]
    lea     edi, [edi+16]               ; 8 outputs of 2 bytes

    sub     dword [tmp_width], 8
    jg      near FIRST_ROW_X16_SSE4

    mov     esi, [ref]
    mov     edi, [sum_ref]
    mov     ebp, [width]
    dec     dword [height]

    mov     ecx, ebx
    sal     ecx, 4                      ; succeeded 16th line

HEIGHT_LOOP_X16_SSE4:
    mov     [tmp_width], ebp
WIDTH_LOOP_X16_SSE4:
    movdqa  xmm7, [edi]                 ; sums of the row above
    SUM_LINE_X16_SSE41  esi+ecx, xmm1, xmm2, xmm3, xmm4     ; line entering the window
    SUM_LINE_X16_SSE41  esi,     xmm2, xmm3, xmm4, xmm5     ; line leaving the window

    paddw   xmm7, xmm1
    psubw   xmm7, xmm2
    movdqa  [edi+ebp*2], xmm7           ; same columns, next output row

    movdqa  xmm6, xmm7
    punpcklwd xmm7, xmm0
    punpckhwd xmm6, xmm0

%rep 3
    COUNT_SUM xmm7, eax, 1
%endrep
    COUNT_SUM xmm7, eax, 0
%rep 3
    COUNT_SUM xmm6, eax, 1
%endrep
    COUNT_SUM xmm6, eax, 0

    lea     esi, [esi+8]
    lea     edi, [edi+16]

    sub     dword [tmp_width], 8
    jg      near WIDTH_LOOP_X16_SSE4

    add     esi, ebx                    ; start of the next source line
    sub     esi, ebp

    dec     dword [height]
    jg      near HEIGHT_LOOP_X16_SSE4

    add     esp, localsize
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
%undef      pushsize
%undef      localsize
%undef      ref
%undef      sum_ref
%undef      times_of_sum
%undef      width
%undef      height
%undef      linesize
%undef      tmp_width
    ret
|
|
|
|
|
|
;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;
; Walks pFeatureOfBlock row by row, 4 entries at a time. For each entry with
; feature value f it stores the packed (x, y) word pair at the address held
; in pFeatureValuePointerList[f] and advances that pointer by 4 bytes.
; x starts at 0 each row and grows by 4 per entry; y grows by 4 per row.
; Both loops exit only when their counter reaches exactly zero, so kiWidth
; must be a positive multiple of 4 and kiHeight must be positive.
;
; Registers: esi = feature cursor, edi = pointer-list base, ebp = kiWidth,
;            ecx = entries left in the row, xmm2 = x vector, xmm3 = y vector,
;            xmm4 = 0, xmm5 = initial x vector, xmm6/xmm7 = y/x increments.
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
    push    esi
    push    edi
    push    ebx
    push    ebp

%define _ps         16              ; push size
%define _ls         4               ; local size
%define sum_ref     esp+_ps+_ls+4
%define pos_list    esp+_ps+_ls+16
%define width       esp+_ps+_ls+8
%define height      esp+_ps+_ls+12
%define i_height    esp
    sub     esp, _ls

    mov     esi, [sum_ref]
    mov     edi, [pos_list]
    mov     ebp, [width]
    mov     ebx, [height]
    mov     [i_height], ebx

    movq    xmm7, [mv_x_inc_x4]     ; x_qpel inc
    movq    xmm6, [mv_y_inc_x4]     ; y_qpel inc
    movq    xmm5, [mx_x_offset_x4]  ; x_qpel vector
    pxor    xmm4, xmm4
    pxor    xmm3, xmm3              ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
    movdqa  xmm2, xmm5              ; restart x at the row's initial vector
    mov     ecx, ebp
HASH_WIDTH_LOOP_SSE2:
    movq    xmm0, [esi]             ; load 4 feature values
    punpcklwd xmm0, xmm4            ; widen to 4 dwords
    movdqa  xmm1, xmm2
    punpcklwd xmm1, xmm3            ; 4 dwords, each x in the low word and y in the high word

    ; One pass per lane. The last lane does not shift the vectors again.
%assign qpel_lane 0
%rep 4
    movd    edx, xmm0               ; edx = feature value of this lane
    lea     ebx, [edi+edx*4]        ; ebx = &list[feature]
    mov     eax, [ebx]              ; eax = current write position
    movd    [eax], xmm1             ; store (x, y)
    mov     edx, [eax+4]            ; explicitly load eax+4 due to cache miss seen in VTune
    lea     eax, [eax+4]
    mov     [ebx], eax              ; advance the write position
%if qpel_lane < 3
    psrldq  xmm1, 4
    psrldq  xmm0, 4
%endif
%assign qpel_lane qpel_lane+1
%endrep

    paddw   xmm2, xmm7
    lea     esi, [esi+8]
    sub     ecx, 4
    jnz     near HASH_WIDTH_LOOP_SSE2
    paddw   xmm3, xmm6
    dec     dword [i_height]
    jnz     near HASH_HEIGHT_LOOP_SSE2

    add     esp, _ls
%undef _ps
%undef _ls
%undef sum_ref
%undef pos_list
%undef width
%undef height
%undef i_height
    pop     ebp
    pop     ebx
    pop     edi
    pop     esi
    ret
|
|
|
|
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
;                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
;
; For i in [0, kiListSize): stores the running buffer pointer into both
; pLocationOfFeature[i] and pFeatureValuePointerList[i], then advances the
; pointer by pTimesOfFeatureValue[i] * 4 bytes.
; Entries are handled four at a time, then a scalar loop handles the last
; kiListSize % 4. The four-wide part uses movdqa on all three tables, so they
; must be 16-byte aligned whenever kiListSize >= 4.
; A non-positive kiListSize returns immediately, and a size below 4 skips the
; four-wide loop. That loop tests its counter at the bottom and would
; otherwise run with a zero count.
;
; Registers: esi = counts, edi/ebp = the two pointer tables, ebx = running
;            buffer pointer, ecx = byte offset into the tables, edx = loop counter.
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
    push    ebx
    push    esi
    push    edi
    push    ebp
%define _ps 16  ; push size
    mov     edi, [esp+_ps+16]           ; pLocationOfFeature
    mov     ebp, [esp+_ps+20]           ; pFeatureValuePointerList
    mov     esi, [esp+_ps+4]            ; pTimesOfFeatureValue
    mov     ebx, [esp+_ps+8]            ; pBuf
    mov     edx, [esp+_ps+12]           ; kiListSize
    xor     ecx, ecx
    pxor    xmm7, xmm7
    test    edx, edx
    jle     near hash_assign_no_rem_sse2    ; nothing to do
    sar     edx, 2                      ; number of full groups of 4
    jz      near hash_assign_rem_sse2   ; fewer than 4 entries: remainder only
hash_assign_loop_x4_sse2:
    movdqa  xmm0, [esi+ecx]
    pslld   xmm0, 2                     ; counts -> byte sizes

    movdqa  xmm1, xmm0
    pcmpeqd xmm1, xmm7
    movmskps eax, xmm1
    cmp     eax, 0x0f
    je      near hash_assign_with_copy_sse2 ; all four counts are zero

%assign x 0
%rep 4
    mov     [edi+ecx+x], ebx
    mov     [ebp+ecx+x], ebx
    movd    eax, xmm0
    add     ebx, eax
    psrldq  xmm0, 4
%assign x x+4
%endrep
    jmp     near assign_next_sse2

hash_assign_with_copy_sse2:
    ; the pointer does not move, so broadcast it to all four entries
    movd    xmm1, ebx
    pshufd  xmm2, xmm1, 0
    movdqa  [edi+ecx], xmm2
    movdqa  [ebp+ecx], xmm2

assign_next_sse2:
    add     ecx, 16
    dec     edx
    jnz     near hash_assign_loop_x4_sse2

hash_assign_rem_sse2:
    mov     edx, [esp+_ps+12]           ; kiListSize
    and     edx, 3
    jz      near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
    mov     [edi+ecx], ebx
    mov     [ebp+ecx], ebx
    mov     eax, [esi+ecx]
    sal     eax, 2
    add     ebx, eax
    add     ecx, 4
    dec     edx
    jnz     near hash_assign_loop_x4_rem_sse2

hash_assign_no_rem_sse2:
%undef _ps
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx
    ret
|
|
%else
|
|
|
|
;**********************************************************************************************************************
;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; For each of kiWidth x kiHeight positions, writes the sum of the 8x8 byte
; block starting at that position to pFeatureOfBlock (rows are kiWidth words
; apart) and increments pTimesOfFeatureValue[sum].
;   pass 1: the first output row is computed by summing 8 lines directly.
;   pass 2: every further row = row above + line entering the window
;           - line leaving the window.
; Both loops test their counter at the bottom, so kiWidth >= 1 and
; kiHeight > 1 are expected.
;
; Registers: r0 = source, r1 = width, r2 = height (scratch in pass 1),
;            r3 = stride, r4 = output, r5 = histogram base,
;            r6 = 3*stride (pass 1) / previous sum (pass 2),
;            r12 = column counter, r13 = source + 4*stride (pass 1) / row counter (pass 2).
;**********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 6
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    push    r12
    push    r13
    push    r0                          ; r0, r2 and r4 are modified by pass 1
    push    r2
    push    r4

    pxor    xmm0, xmm0
    lea     r6, [r3+r3*2]               ; 3*stride

    mov     r12, r1                     ; r12: tmp_width
    lea     r13, [r0+r3*4]              ; lines 4..7 of the block
FIRST_ROW:
    ; lines 0..3: pack two 8-byte lines per register, then sum the bytes
    movq    xmm1, [r0]
    movq    xmm2, [r0+r3]
    movq    xmm3, [r0+r3*2]
    movq    xmm4, [r0+r6]

    shufps  xmm1, xmm2, 01000100b
    shufps  xmm3, xmm4, 01000100b
    psadbw  xmm1, xmm0
    psadbw  xmm3, xmm0
    paddd   xmm1, xmm3

    ; lines 4..7
    movq    xmm2, [r13]
    movq    xmm3, [r13+r3]
    movq    xmm4, [r13+r3*2]
    movq    xmm5, [r13+r6]

    shufps  xmm2, xmm3, 01000100b
    shufps  xmm4, xmm5, 01000100b
    psadbw  xmm2, xmm0
    psadbw  xmm4, xmm0
    paddd   xmm2, xmm4

    paddd   xmm1, xmm2
    pshufd  xmm2, xmm1, 00001110b       ; bring the high-qword partial sum down
    paddd   xmm1, xmm2
    movd    r2d, xmm1                   ; r2 = 8x8 block sum
    mov     [r4], r2w
    inc     dword [r5+r2*4]

    inc     r0
    inc     r13
    add     r4, 2

    dec     r12
    jg      FIRST_ROW

    pop     r4
    pop     r2
    pop     r0
    mov     r13, r2
    dec     r13                         ; rows still to produce
HEIGHT_LOOP:
    mov     r12, r1
WIDTH_LOOP:
    movq    xmm1, [r0+r3*8]             ; line entering the window
    movq    xmm2, [r0]                  ; line leaving the window
    psadbw  xmm1, xmm0
    psadbw  xmm2, xmm0
    psubd   xmm1, xmm2
    movd    r2d, xmm1                   ; signed difference of the two line sums
    ; Zero-extend the previous sum. A 16-bit load into r6w would keep the
    ; upper bits of r6 (left over from 3*stride) and corrupt the histogram
    ; index for large strides.
    movzx   r6d, word [r4]
    add     r2d, r6d
    mov     [r4+r1*2], r2w              ; same column, next output row
    inc     dword [r5+r2*4]

    inc     r0
    add     r4, 2

    dec     r12
    jg      WIDTH_LOOP

    add     r0, r3                      ; start of the next source line
    sub     r0, r1

    dec     r13
    jg      HEIGHT_LOOP

    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; COUNT_SUM xmm_reg, tmp_dreg, tmp_qreg, shift
;   %1  XMM register; its low dword is used as the table index
;   %2  32-bit view of the scratch register (written by movd)
;   %3  64-bit view of the same scratch register (used for addressing)
;   %4  1 = shift %1 right by 4 bytes afterwards so the next lane becomes the
;       low dword; any other value leaves %1 untouched
; Increments the dword at [r5 + index*4]; r5 must hold the table base.
; The parameters are used directly rather than through %define aliases, so
; expanding the macro does not leave global single-line macros behind.
;-----------------------------------------------------------------------------
%macro COUNT_SUM 4
    movd    %2, %1
    inc     dword [r5+%3*4]
%if %4 == 1
    psrldq  %1, 4
%endif
%endmacro
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; requires: width % 8 == 0 && height > 1
;-----------------------------------------------------------------------------
;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------
; read extra (16 - (width % 8) ) mod 16 bytes of every line
; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
;
; Produces 8 horizontally adjacent 8x8 block sums per iteration. The first
; output row is summed from 8 lines; each further row is the row above plus
; the entering line minus the leaving line. Every sum is stored as a word in
; pFeatureOfBlock and counted in pTimesOfFeatureValue.
; Registers: r0 = source, r1 = width, r2 = height (scratch while counting),
;            r3 = stride, r4 = output, r5 = histogram base, r6 = 3*stride,
;            r12 = column counter, r13 = source + 4*stride / row counter.

; SUM_LINE_X8_SSE41 line, tmp
;   in : %1 = 16 loaded source bytes, xmm0 = 0
;   out: %1 = 8 words, word i = sum of bytes i..i+7; %2 is clobbered
%macro SUM_LINE_X8_SSE41 2
    movdqa  %2, %1
    mpsadbw %1, xmm0, 000b              ; word i = bytes i..i+3
    mpsadbw %2, xmm0, 100b              ; word i = bytes i+4..i+7
    paddw   %1, %2
%endmacro

WELS_EXTERN SumOf8x8BlockOfFrame_sse4
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    push    r12
    push    r13
    push    r0                          ; r0, r2 and r4 are modified by the first pass
    push    r2
    push    r4

    pxor    xmm0, xmm0
    lea     r6, [r3+r3*2]               ; 3*stride

    mov     r12, r1                     ; r12: tmp_width
    lea     r13, [r0+r3*4]              ; lines 4..7
FIRST_ROW_SSE4:
    movdqu  xmm1, [r0]
    movdqu  xmm3, [r0+r3]
    movdqu  xmm5, [r0+r3*2]
    movdqu  xmm7, [r0+r6]

    SUM_LINE_X8_SSE41 xmm1, xmm2        ; 8 sums of line 1
    SUM_LINE_X8_SSE41 xmm3, xmm4        ; 8 sums of line 2
    SUM_LINE_X8_SSE41 xmm5, xmm2        ; 8 sums of line 3
    SUM_LINE_X8_SSE41 xmm7, xmm4        ; 8 sums of line 4

    paddw   xmm1, xmm3
    paddw   xmm5, xmm7
    paddw   xmm1, xmm5                  ; upper 4 lines

    movdqu  xmm2, [r13]
    movdqu  xmm3, [r13+r3]
    movdqu  xmm4, [r13+r3*2]
    movdqu  xmm5, [r13+r6]

    SUM_LINE_X8_SSE41 xmm2, xmm6
    SUM_LINE_X8_SSE41 xmm3, xmm7
    SUM_LINE_X8_SSE41 xmm4, xmm6
    SUM_LINE_X8_SSE41 xmm5, xmm7

    paddw   xmm2, xmm3
    paddw   xmm4, xmm5
    paddw   xmm1, xmm2
    paddw   xmm1, xmm4                  ; lines 1-8

    movdqu  [r4], xmm1

    ; widen the 8 words to dwords and count each one
    movdqa  xmm2, xmm1
    punpcklwd xmm1, xmm0
    punpckhwd xmm2, xmm0

%rep 3
    COUNT_SUM xmm1, r2d, r2, 1
%endrep
    COUNT_SUM xmm1, r2d, r2, 0
%rep 3
    COUNT_SUM xmm2, r2d, r2, 1
%endrep
    COUNT_SUM xmm2, r2d, r2, 0

    lea     r0, [r0+8]
    lea     r13, [r13+8]
    lea     r4, [r4+16]                 ; 8 outputs of 2 bytes

    sub     r12, 8
    jg      near FIRST_ROW_SSE4

    pop     r4
    pop     r2
    pop     r0
    mov     r13, r2
    dec     r13                         ; rows still to produce
HEIGHT_LOOP_SSE4:
    mov     r12, r1
WIDTH_LOOP_SSE4:
    movdqu  xmm1, [r0+r3*8]             ; line entering the window
    movdqu  xmm2, [r0]                  ; line leaving the window
    movdqu  xmm7, [r4]                  ; sums of the row above

    SUM_LINE_X8_SSE41 xmm1, xmm3
    SUM_LINE_X8_SSE41 xmm2, xmm4

    paddw   xmm7, xmm1
    psubw   xmm7, xmm2
    movdqu  [r4+r1*2], xmm7             ; same columns, next output row

    movdqa  xmm6, xmm7
    punpcklwd xmm7, xmm0
    punpckhwd xmm6, xmm0

%rep 3
    COUNT_SUM xmm7, r2d, r2, 1
%endrep
    COUNT_SUM xmm7, r2d, r2, 0
%rep 3
    COUNT_SUM xmm6, r2d, r2, 1
%endrep
    COUNT_SUM xmm6, r2d, r2, 0

    lea     r0, [r0+8]
    lea     r4, [r4+16]

    sub     r12, 8
    jg      near WIDTH_LOOP_SSE4

    lea     r0, [r0+r3]                 ; start of the next source line
    sub     r0, r1

    dec     r13
    jg      near HEIGHT_LOOP_SSE4

    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
|
|
|
|
|
|
;****************************************************************************************************************************************************
;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
;                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;
; One 16x16 block sum per iteration. The first output row sums 16 lines
; directly; each further row is the row above plus the entering line minus
; the leaving line. Each sum is stored as a word in pFeatureOfBlock and
; counted in pTimesOfFeatureValue. Loop counters are tested at the bottom, so
; every loop body runs at least once.
; Registers: r0 = source, r1 = width, r2 = height (scratch for the sum),
;            r3 = stride, r4 = output, r5 = histogram base,
;            r6 = 3*stride (pass 1) / 16*stride (pass 2),
;            r12 = column counter, r13 = line pointer (pass 1) / row counter (pass 2).
;****************************************************************************************************************************************************

; SUM_4LINES_X16_SSE2 base, stride, stride3, x0, x1, x2, x3
;   Sums 4 consecutive 16-byte lines starting at [base]. On exit x0 holds the
;   two psadbw partial sums (low and high qword); x1..x3 are clobbered.
;   xmm0 must be zero.
%macro SUM_4LINES_X16_SSE2 7
    movdqu  %4, [%1]
    movdqu  %5, [%1+%2]
    movdqu  %6, [%1+%2*2]
    movdqu  %7, [%1+%3]

    psadbw  %4, xmm0
    psadbw  %5, xmm0
    psadbw  %6, xmm0
    psadbw  %7, xmm0
    paddw   %4, %5
    paddw   %6, %7
    paddw   %4, %6
%endmacro

WELS_EXTERN SumOf16x16BlockOfFrame_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 6
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    push    r12
    push    r13
    push    r0                          ; r0, r2 and r4 are modified by pass 1
    push    r2
    push    r4

    pxor    xmm0, xmm0
    lea     r6, [r3+r3*2]               ; 3*stride

    mov     r12, r1                     ; r12: tmp_width
FIRST_ROW_X16H:
    SUM_4LINES_X16_SSE2 r0, r3, r6, xmm1, xmm2, xmm3, xmm4      ; lines 0..3

    lea     r13, [r0+r3*4]
    SUM_4LINES_X16_SSE2 r13, r3, r6, xmm2, xmm3, xmm4, xmm5     ; lines 4..7
    paddw   xmm1, xmm2

    lea     r13, [r13+r3*4]
    SUM_4LINES_X16_SSE2 r13, r3, r6, xmm2, xmm3, xmm4, xmm5     ; lines 8..11
    paddw   xmm1, xmm2

    lea     r13, [r13+r3*4]
    SUM_4LINES_X16_SSE2 r13, r3, r6, xmm2, xmm3, xmm4, xmm5     ; lines 12..15
    paddw   xmm1, xmm2

    ; fold the high-qword partial sum onto the low one
    movdqa  xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1, xmm2
    movd    r2d, xmm1
    mov     [r4], r2w
    inc     dword [r5+r2*4]

    inc     r0
    lea     r4, [r4+2]

    dec     r12
    jg      near FIRST_ROW_X16H

    pop     r4
    pop     r2
    pop     r0
    mov     r13, r2
    dec     r13                         ; rows still to produce
    mov     r6, r3
    sal     r6, 4                       ; succeeded 16th line
HEIGHT_LOOP_X16:
    mov     r12, r1
WIDTH_LOOP_X16:
    movdqu  xmm1, [r0+r6]               ; line entering the window
    movdqu  xmm2, [r0]                  ; line leaving the window
    psadbw  xmm1, xmm0
    psadbw  xmm2, xmm0
    psubw   xmm1, xmm2
    movdqa  xmm2, xmm1
    punpckhwd xmm2, xmm0
    paddw   xmm1, xmm2
    movd    r2d, xmm1                   ; 16-bit difference, upper bits of r2 are zero
    add     r2w, word [r4]              ; + sum of the row above
    mov     [r4+r1*2], r2w
    inc     dword [r5+r2*4]

    inc     r0
    add     r4, 2

    dec     r12
    jg      near WIDTH_LOOP_X16

    add     r0, r3                      ; start of the next source line
    sub     r0, r1

    dec     r13
    jg      near HEIGHT_LOOP_X16

    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
|
|
|
|
; requires: width % 16 == 0 && height > 1
|
|
;-----------------------------------------------------------------------------------------------------------------------------
|
|
;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
|
|
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
|
|
;-----------------------------------------------------------------------------------------------------------------------------
|
|
; try 8 mv via offset
;
; SUM_LINE_X16_SSE41 ref, dst0, dst1, tmp0, tmp1
;   Loads 24 bytes from [ref] and leaves 8 words in dst0, where word i is the
;   sum of bytes i..i+15. dst1, tmp0 and tmp1 are clobbered. xmm0 must be zero.
%macro SUM_LINE_X16_SSE41  5    ; ref, dst0, dst1, tmp0, tmp1
    movdqu  %2, [%1]
    movdqu  %3, [%1+8]
    movdqa  %4, %2
    movdqa  %5, %3

    mpsadbw %2, xmm0, 0     ; 000 B: bytes i..i+3
    mpsadbw %4, xmm0, 5     ; 101 B: bytes i+4..i+7
    mpsadbw %3, xmm0, 2     ; 010 B: bytes i+8..i+11
    mpsadbw %5, xmm0, 7     ; 111 B: bytes i+12..i+15
    paddw   %2, %4
    paddw   %3, %5
    paddw   %2, %3          ; accumulate cost
%endmacro   ; end of SUM_LINE_X16_SSE41
|
|
|
|
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
    ; Register roles (arguments in prototype order):
    ;   r0  = pRefPicture            r1 = kiWidth         r2 = kiHeight
    ;   r3  = kiRefStride            r4 = pFeatureOfBlock r5 = pTimesOfFeatureValue
    ;   r6  = 3*stride (first row) / 16*stride (remaining rows)
    ;   r12 = columns left in the current row
    ;   r13 = line pointer (first row) / rows left (remaining rows)
    ;   xmm0 = zero
    ; Eight horizontally adjacent block sums are produced per iteration and
    ; stored with movdqa, so the feature buffer rows must be 16-byte aligned.
    ; COUNT_SUM is defined elsewhere in this file; it is handed one widened
    ; sum vector, r2d/r2 as scratch and a flag -- presumably it bumps the
    ; counter for each sum in the table at r5 (TODO confirm against the macro).
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    push    r12
    push    r13
    push    r0                          ; r0 and r4 are advanced by the first-row
    push    r2                          ; pass and r2 is lent to COUNT_SUM, so the
    push    r4                          ; incoming values are kept for the second pass

    pxor    xmm0, xmm0
    lea     r6, [r3+r3*2]               ; r6 = 3 * stride
    mov     r12, r1                     ; r12: tmp_width

FIRST_ROW_X16_SSE4:
    ; Lines 0..3: one line sum each in xmm1..xmm4, then folded into xmm1.
    SUM_LINE_X16_SSE41 r0,      xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41 r0+r3,   xmm2, xmm3, xmm4, xmm5
    SUM_LINE_X16_SSE41 r0+r3*2, xmm3, xmm4, xmm5, xmm6
    SUM_LINE_X16_SSE41 r0+r6,   xmm4, xmm5, xmm6, xmm7
    paddw   xmm1, xmm2
    paddw   xmm3, xmm4
    paddw   xmm1, xmm3

    ; Lines 4..15: three more groups of four lines accumulated into xmm1.
    mov     r13, r0
%rep 3
    lea     r13, [r13+r3*4]
    SUM_LINE_X16_SSE41 r13,      xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r3,   xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
    SUM_LINE_X16_SSE41 r13+r6,   xmm2, xmm3, xmm4, xmm5
    paddw   xmm1, xmm2
%endrep

    movdqa  [r4], xmm1                  ; eight 16-bit block sums
    movdqa  xmm2, xmm1
    punpcklwd xmm1, xmm0                ; sums 0..3 widened to dwords
    punpckhwd xmm2, xmm0                ; sums 4..7 widened to dwords

%rep 3
    COUNT_SUM xmm1, r2d, r2, 1
%endrep
    COUNT_SUM xmm1, r2d, r2, 0
%rep 3
    COUNT_SUM xmm2, r2d, r2, 1
%endrep
    COUNT_SUM xmm2, r2d, r2, 0

    add     r0, 8                       ; next eight block positions
    add     r4, 16                      ; element size is 2

    sub     r12, 8
    jg      near FIRST_ROW_X16_SSE4

    pop     r4
    pop     r2
    pop     r0
    lea     r13, [r2-1]                 ; rows still to produce
    mov     r6, r3
    shl     r6, 4                       ; succeeded 16th line

HEIGHT_LOOP_X16_SSE4:
    mov     r12, r1

WIDTH_LOOP_X16_SSE4:
    ; new sum = sum of the row above + line entering at the bottom
    ;           - line leaving at the top
    SUM_LINE_X16_SSE41 r0+r6, xmm1, xmm2, xmm3, xmm4
    SUM_LINE_X16_SSE41 r0,    xmm2, xmm3, xmm4, xmm5
    movdqa  xmm7, [r4]                  ; sums of the row above
    paddw   xmm7, xmm1
    psubw   xmm7, xmm2
    movdqa  [r4+r1*2], xmm7             ; same columns, one feature row down

    movdqa  xmm6, xmm7
    punpcklwd xmm7, xmm0
    punpckhwd xmm6, xmm0

%rep 3
    COUNT_SUM xmm7, r2d, r2, 1
%endrep
    COUNT_SUM xmm7, r2d, r2, 0
%rep 3
    COUNT_SUM xmm6, r2d, r2, 1
%endrep
    COUNT_SUM xmm6, r2d, r2, 0

    add     r0, 8
    add     r4, 16

    sub     r12, 8
    jg      near WIDTH_LOOP_X16_SSE4

    add     r0, r3                      ; r0 was advanced by the width:
    sub     r0, r1                      ; step to the start of the next line

    dec     r13
    jg      near HEIGHT_LOOP_X16_SSE4

    pop     r13
    pop     r12
    POP_XMM
    LOAD_6_PARA_POP
    ret
|
|
|
|
;-----------------------------------------------------------------------------------------------------------------------------
|
|
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
|
|
;-----------------------------------------------------------------------------------------------------------------------------
|
|
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
    ; Register roles (arguments in prototype order):
    ;   r0 = pFeatureOfBlock, r1 = kiWidth, r2 = kiHeight,
    ;   r3 = pFeatureValuePointerList
    ;   r12 = rows left, r4 = columns left, r2/r5/r6/r13 = scratch
    ; For every feature value read from pFeatureOfBlock, a packed (x, y) word
    ; pair is stored through pFeatureValuePointerList[value] and that pointer
    ; is advanced by four bytes.
    ; The width loop ends on "sub 4 / jnz", so kiWidth has to be a positive
    ; multiple of 4; likewise kiHeight has to be at least 1.
    ; NOTE(review): there is no LOAD_4_PARA_POP before ret, unlike the
    ; LOAD_6_PARA / LOAD_6_PARA_POP pairing used by other routines in this
    ; file -- confirm that this is intentional.
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    push    r12
    push    r13
    mov     r12, r2                     ; r2 is reused as scratch below

    pxor    xmm3, xmm3                  ; y_qpel vector
    pxor    xmm4, xmm4                  ; zero, used to widen words to dwords
    movq    xmm5, [mx_x_offset_x4]      ; x_qpel vector
    movq    xmm6, [mv_y_inc_x4]         ; y_qpel inc
    movq    xmm7, [mv_x_inc_x4]         ; x_qpel inc

HASH_HEIGHT_LOOP_SSE2:
    movdqa  xmm2, xmm5                  ; restart x_qpel at the row start
    mov     r4, r1

HASH_WIDTH_LOOP_SSE2:
    movq    xmm0, [r0]                  ; four 16-bit feature values
    punpcklwd xmm0, xmm4                ; ... zero-extended to dwords
    movdqa  xmm1, xmm2
    punpcklwd xmm1, xmm3                ; four dwords: x in the low word, y in the high word

%assign fill_qpel_lane 0
%rep 4
    movd    r2d, xmm0                   ; r2 = feature value of this lane
    lea     r5, [r3+r2*8]               ; r5 = &pFeatureValuePointerList[value]
    mov     r6, [r5]                    ; r6 = current write position for that value
    movd    [r6], xmm1                  ; store the (x, y) pair
    mov     r13, [r6+4]                 ; explicitly load [r6+4] due to cache miss from vtune observation
    add     r6, 4
    mov     [r5], r6                    ; write the advanced position back
  %if fill_qpel_lane < 3
    psrldq  xmm1, 4                     ; next lane
    psrldq  xmm0, 4
  %endif
  %assign fill_qpel_lane fill_qpel_lane+1
%endrep

    paddw   xmm2, xmm7                  ; x_qpel for the next four columns
    add     r0, 8
    sub     r4, 4
    jnz     near HASH_WIDTH_LOOP_SSE2

    paddw   xmm3, xmm6                  ; y_qpel for the next row
    dec     r12
    jnz     near HASH_HEIGHT_LOOP_SSE2

    pop     r13
    pop     r12
    POP_XMM
    ret
|
|
|
|
;---------------------------------------------------------------------------------------------------------------------------------------------------
|
|
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
|
|
; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
|
|
;uint16_t** pPositionOfSum, uint16_t** sum_idx_list, uint32_t* pTimesOfSum, uint16_t* pBuf, const int32_t list_sz )
|
|
;---------------------------------------------------------------------------------------------------------------------------------------------------
|
|
WELS_EXTERN InitializeHashforFeature_sse2
    ; Register roles (arguments in prototype order):
    ;   r0 = pTimesOfFeatureValue, r1 = pBuf, r2 = kiListSize,
    ;   r3 = pLocationOfFeature,   r4 = pFeatureValuePointerList
    ;   r5  = byte offset into pTimesOfFeatureValue (r5*2 indexes the two
    ;         pointer arrays, whose entries are 8 bytes)
    ;   r12 = kiListSize, later kiListSize & 3
    ;   r6, r13 = scratch
    ; For each entry i both pointer arrays receive the running buffer pointer,
    ; which is then advanced by 4 * pTimesOfFeatureValue[i] bytes.
    ; The four-at-a-time path uses movdqa, so the three arrays must be
    ; 16-byte aligned.
    ; NOTE(review): there is no LOAD_5_PARA_POP before ret, unlike the
    ; LOAD_6_PARA / LOAD_6_PARA_POP pairing used by other routines in this
    ; file -- confirm that this is intentional.
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION r2, r2d
    push    r12
    push    r13

    ; Nothing to initialise for an empty (or negative) list.
    test    r2, r2
    jle     near hash_assign_no_rem_sse2

    mov     r12, r2
    sar     r2, 2                       ; r2 = number of complete groups of four
    xor     r5, r5
    xor     r6, r6
    pxor    xmm3, xmm3

    ; The group loop below is a do-while: entering it with a zero count would
    ; decrement through zero and run far past the end of the arrays.
    test    r2, r2
    jz      near hash_assign_tail_sse2

hash_assign_loop_x4_sse2:
    movdqa  xmm0, [r0+r5]               ; four counts
    pslld   xmm0, 2                     ; ... turned into byte sizes (count * 4)

    movdqa  xmm1, xmm0
    pcmpeqd xmm1, xmm3
    movmskps r6, xmm1
    cmp     r6, 0x0f                    ; all four counts zero?
    jz      near hash_assign_with_copy_sse2

%assign x 0
%rep 4
    lea     r13, [r3+r5*2+x]
    mov     [r13], r1
    lea     r13, [r4+r5*2+x]
    mov     [r13], r1
    movd    r6d, xmm0
    add     r1, r6                      ; advance the buffer pointer by this entry's size
    psrldq  xmm0, 4
%assign x x+8
%endrep
    jmp     near assign_next_sse2

hash_assign_with_copy_sse2:
    ; The pointer does not move within this group: store it four times.
    movq    xmm1, r1
    pshufd  xmm2, xmm1, 01000100b       ; both qwords = r1
    movdqa  [r3+r5*2], xmm2
    movdqa  [r4+r5*2], xmm2
    movdqa  [r3+r5*2+16], xmm2
    movdqa  [r4+r5*2+16], xmm2

assign_next_sse2:
    add     r5, 16
    dec     r2
    jnz     near hash_assign_loop_x4_sse2

hash_assign_tail_sse2:
    and     r12, 3                      ; entries left over after the groups of four
    jz      near hash_assign_no_rem_sse2

hash_assign_loop_x4_rem_sse2:
    lea     r13, [r3+r5*2]
    mov     [r13], r1
    lea     r13, [r4+r5*2]
    mov     [r13], r1
    mov     r6d, [r0+r5]
    sal     r6, 2
    add     r1, r6
    add     r5, 4
    dec     r12
    jnz     near hash_assign_loop_x4_rem_sse2

hash_assign_no_rem_sse2:
    pop     r13
    pop     r12
    ret
|
|
|
|
%endif
|
|
|
|
;**********************************************************************************************************************************
|
|
; int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
|
|
;**********************************************************************************************************************************
|
|
WELS_EXTERN SumOf8x8SingleBlock_sse2
    ; r0 = ref0, r1 = linesize.  Returns the sum of the 64 bytes of the 8x8
    ; block at ref0 (rows are linesize bytes apart).
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    pxor    xmm0, xmm0                  ; psadbw against zero gives plain byte sums

    ; rows 0-1 start the accumulator: one row per 64-bit half
    movq    xmm1, [r0]
    movhps  xmm1, [r0+r1]
    psadbw  xmm1, xmm0

    ; rows 2-3, 4-5 and 6-7 are added on top
%rep 3
    lea     r0, [r0+2*r1]
    movq    xmm2, [r0]
    movhps  xmm2, [r0+r1]
    psadbw  xmm2, xmm0
    paddw   xmm1, xmm2
%endrep

    ; fold the upper half's sum into the lower one
    movdqa  xmm2, xmm1
    psrldq  xmm2, 8
    paddw   xmm1, xmm2

    movd    retrd, xmm1
    ret
|
|
|
|
;**********************************************************************************************************************************
|
|
; int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
|
|
;**********************************************************************************************************************************
|
|
WELS_EXTERN SumOf16x16SingleBlock_sse2
    ; r0 = ref0, r1 = linesize.  Returns the sum of the 256 bytes of the
    ; 16x16 block at ref0.  Rows are read with movdqa, so every row address
    ; (ref0 and ref0 + k*linesize) must be 16-byte aligned.
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 6
    SIGN_EXTENSION r1, r1d

    pxor    xmm0, xmm0                  ; psadbw against zero gives plain byte sums
    pxor    xmm1, xmm1                  ; running total (one partial sum per 64-bit half)

    ; four groups of four rows
%rep 4
    movdqa  xmm2, [r0]
    movdqa  xmm3, [r0+r1]
    lea     r0, [r0+2*r1]
    movdqa  xmm4, [r0]
    movdqa  xmm5, [r0+r1]
    lea     r0, [r0+2*r1]
    psadbw  xmm2, xmm0
    psadbw  xmm3, xmm0
    psadbw  xmm4, xmm0
    psadbw  xmm5, xmm0
    paddw   xmm2, xmm3
    paddw   xmm4, xmm5
    paddw   xmm2, xmm4
    paddw   xmm1, xmm2
%endrep

    ; fold the upper half's sum into the lower one
    movdqa  xmm2, xmm1
    psrldq  xmm2, 8
    paddw   xmm1, xmm2

    movd    retrd, xmm1
    POP_XMM
    ret
|
|
|
|
;**********************************************************************************************************************************
|
|
;
|
|
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
|
|
;
|
|
; \note:
|
|
; src must be 16-byte aligned; ref may be unaligned
|
|
; \return value:
|
|
; returns the minimal SAD cost; the index of that cost is written to *index_min_cost
|
|
;**********************************************************************************************************************************
|
|
; One 16-byte line of src is compared against ref at the eight horizontal
; offsets 0..7 (ref bytes 0..22 are read); the eight 16-bit SADs are added to
; the running costs in xmm7.  src is loaded with movdqa and therefore must be
; 16-byte aligned.  Clobbers xmm0-xmm4.
;
; SAD_16x16_LINE_SSE41E leaves the src/ref pointers untouched (used for the
; last line); SAD_16x16_LINE_SSE41 additionally steps both to the next line.
%macro SAD_16x16_LINE_SSE41E 4          ; src, ref, stride_src, stride_ref
    movdqa  xmm0, [%1]
    movdqu  xmm1, [%2]
    movdqa  xmm3, xmm1
    movdqu  xmm2, [%2+8h]
    movdqa  xmm4, xmm2

    ; imm8[1:0] picks the 4-byte group of src, imm8[2] the starting byte
    ; (0 or 4) of the window taken from the ref register.
    mpsadbw xmm1, xmm0, 0               ; 000 B: src 0..3   vs ref i    ..
    mpsadbw xmm3, xmm0, 5               ; 101 B: src 4..7   vs ref i+4  ..
    mpsadbw xmm2, xmm0, 2               ; 010 B: src 8..11  vs ref i+8  ..
    mpsadbw xmm4, xmm0, 7               ; 111 B: src 12..15 vs ref i+12 ..

    paddw   xmm7, xmm1                  ; accumulate cost
    paddw   xmm7, xmm3
    paddw   xmm7, xmm2
    paddw   xmm7, xmm4
%endmacro                               ; end of SAD_16x16_LINE_SSE41E

%macro SAD_16x16_LINE_SSE41 4           ; src, ref, stride_src, stride_ref
    SAD_16x16_LINE_SSE41E %1, %2, %3, %4
    add     %1, %3
    add     %2, %4
%endmacro                               ; end of SAD_16x16_LINE_SSE41
|
|
|
|
WELS_EXTERN SampleSad16x16Hor8_sse41
    ; Register roles (arguments in prototype order):
    ;   r0 = src, r1 = stride_src, r2 = ref, r3 = stride_ref,
    ;   r4 = base_cost (read with movdqa: 16-byte aligned), r5 = index_min_cost
    ;   r1 and r2 are recycled as scratch once the 16 lines are done.
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm7, xmm7                  ; eight 16-bit SAD accumulators

    ; 15 lines that step src/ref, then the last line without stepping
%rep 15
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
%endrep
    SAD_16x16_LINE_SSE41E r0, r2, r1, r3

    ; Widen SADs and base costs to 32 bits and add them:
    ;   xmm4 = totals for offsets 0..3, xmm5 = totals for offsets 4..7
    pxor    xmm0, xmm0
    movdqa  xmm5, [r4]
    movdqa  xmm6, xmm7
    movdqa  xmm4, xmm5
    punpcklwd xmm6, xmm0
    punpckhwd xmm7, xmm0
    punpcklwd xmm4, xmm0
    punpckhwd xmm5, xmm0
    paddd   xmm4, xmm6
    paddd   xmm5, xmm7

    ; Horizontal minimum, broadcast to every lane of xmm2
    movdqa  xmm3, xmm4
    pminud  xmm3, xmm5
    pshufd  xmm2, xmm3, 01001110B       ; swap the 64-bit halves
    pminud  xmm2, xmm3
    pshufd  xmm3, xmm2, 10110001B       ; swap neighbouring dwords
    pminud  xmm2, xmm3
    movd    retrd, xmm2                 ; return value: minimal total cost

    ; Index of the first lane equal to the minimum: offsets 0..3 first ...
    pcmpeqd xmm4, xmm2
    movmskps r2d, xmm4
    bsf     r1d, r2d                    ; ZF is clear when a lane matched
    jnz     near WRITE_INDEX

    ; ... otherwise it is one of the offsets 4..7
    pcmpeqd xmm5, xmm2
    movmskps r2d, xmm5
    bsf     r1d, r2d
    add     r1d, 4

WRITE_INDEX:
    mov     [r5], r1d
    POP_XMM
    LOAD_6_PARA_POP
    ret
|
|
|
|
;**********************************************************************************************************************************
|
|
;
|
|
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
|
|
;
|
|
; \note:
|
|
; neither src nor ref needs to be 16-byte aligned (inter 8x8 blocks)
|
|
; \return value:
|
|
; returns the minimal SAD cost; the index of that cost is written to *index_min_cost
|
|
;
|
|
;**********************************************************************************************************************************
|
|
; The first 8 bytes of the src line are compared against ref at the eight
; horizontal offsets 0..7 (ref bytes 0..14 take part; 16 bytes are loaded from
; each pointer); the eight 16-bit SADs are added to the running costs in xmm7.
; Both loads are unaligned.  Clobbers xmm0-xmm2.
;
; SAD_8x8_LINE_SSE41E leaves the src/ref pointers untouched (used for the
; last line); SAD_8x8_LINE_SSE41 additionally steps both to the next line.
%macro SAD_8x8_LINE_SSE41E 4            ; src, ref, stride_src, stride_ref
    movdqu  xmm0, [%1]
    movdqu  xmm1, [%2]
    movdqa  xmm2, xmm1

    mpsadbw xmm1, xmm0, 0               ; 000 B: src 0..3 vs ref i   ..
    mpsadbw xmm2, xmm0, 5               ; 101 B: src 4..7 vs ref i+4 ..

    paddw   xmm7, xmm1                  ; accumulate cost
    paddw   xmm7, xmm2
%endmacro                               ; end of SAD_8x8_LINE_SSE41E

%macro SAD_8x8_LINE_SSE41 4             ; src, ref, stride_src, stride_ref
    SAD_8x8_LINE_SSE41E %1, %2, %3, %4
    add     %1, %3
    add     %2, %4
%endmacro                               ; end of SAD_8x8_LINE_SSE41
|
|
|
|
WELS_EXTERN SampleSad8x8Hor8_sse41
    ; Register roles (arguments in prototype order):
    ;   r0 = src, r1 = stride_src, r2 = ref, r3 = stride_ref,
    ;   r4 = base_cost (read with movdqa: 16-byte aligned), r5 = index_min_cost
    ;   r1 is recycled as scratch once the 8 lines are done.
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    movdqa  xmm7, [r4]                  ; start the accumulators from the base cost list

    ; 7 lines that step src/ref, then the last line without stepping
%rep 7
    SAD_8x8_LINE_SSE41 r0, r2, r1, r3
%endrep
    SAD_8x8_LINE_SSE41E r0, r2, r1, r3

    phminposuw xmm0, xmm7               ; horizontal search for the minimal cost and its index
    movd    retrd, xmm0                 ; bits 15:0 = minimum, bits 31:16 = index
    mov     r1d, retrd
    sar     r1d, 16                     ; r1d = index
    and     retrd, 0xFFFF               ; return value = minimal cost
    mov     [r5], r1d

    POP_XMM
    LOAD_6_PARA_POP
    ret
|