5c60e8f868
Add asm level functions Add asm code for ME Modify format Add unit test for asm code. Modify function name and format. Remove unuse comment Modify targets file Add Macro protect for SSE41 funtion test Modify according to review request.
226 lines
6.9 KiB
NASM
226 lines
6.9 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*************************************************************************/
|
|
%include "asm_inc.asm"
|
|
|
|
SECTION .text
|
|
|
|
;**********************************************************************************************************************************
|
|
;
|
|
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
|
|
;
|
|
; \note:
|
|
; src must be 16-byte aligned; alignment of ref is optional
|
|
; \return value:
|
|
; returns the minimal SAD cost; the corresponding index is written to index_min_cost
|
|
;**********************************************************************************************************************************
|
|
; try 8 mv via offset
|
|
; xmm7 store sad costs
|
|
%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
    ; One 16-pixel row: for k = 0..7, adds SAD(src[0..15], ref[k..k+15]) to
    ; 16-bit lane k of xmm7, then steps both pointers to the next row.
    ; Overwrites xmm0-xmm4; %1 is read with an aligned load.
    movdqu      xmm1, [%2]              ; ref[0..15]
    movdqu      xmm2, [%2 + 8]          ; ref[8..23]
    movdqa      xmm0, [%1]              ; src[0..15]
    movdqa      xmm3, xmm1              ; second copy of ref[0..15]
    movdqa      xmm4, xmm2              ; second copy of ref[8..23]

    ; mpsadbw imm8: bit 2 picks the window start in the first operand
    ; (byte 0 or 4), bits 1:0 pick the 4-byte group of the second operand.
    mpsadbw     xmm1, xmm0, 000b        ; src[ 0.. 3] vs ref[k   ..]
    mpsadbw     xmm3, xmm0, 101b        ; src[ 4.. 7] vs ref[k+ 4..]
    mpsadbw     xmm2, xmm0, 010b        ; src[ 8..11] vs ref[k+ 8..]
    mpsadbw     xmm4, xmm0, 111b        ; src[12..15] vs ref[k+12..]

    ; fold the four partial SADs into the running cost
    paddw       xmm7, xmm1
    paddw       xmm7, xmm3
    paddw       xmm7, xmm2
    paddw       xmm7, xmm4

    add         %1, %3                  ; src += stride_src
    add         %2, %4                  ; ref += stride_ref
%endmacro ; end of SAD_16x16_LINE_SSE41
|
|
%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
    ; Final-row variant: for k = 0..7, adds SAD(src[0..15], ref[k..k+15]) to
    ; 16-bit lane k of xmm7 and leaves %1/%2 untouched (%3/%4 are unused).
    ; Overwrites xmm0-xmm4; %1 is read with an aligned load.
    movdqu      xmm1, [%2]              ; ref[0..15]
    movdqu      xmm2, [%2 + 8]          ; ref[8..23]
    movdqa      xmm0, [%1]              ; src[0..15]
    movdqa      xmm3, xmm1              ; second copy of ref[0..15]
    movdqa      xmm4, xmm2              ; second copy of ref[8..23]

    ; mpsadbw imm8: bit 2 picks the window start in the first operand
    ; (byte 0 or 4), bits 1:0 pick the 4-byte group of the second operand.
    mpsadbw     xmm1, xmm0, 000b        ; src[ 0.. 3] vs ref[k   ..]
    mpsadbw     xmm3, xmm0, 101b        ; src[ 4.. 7] vs ref[k+ 4..]
    mpsadbw     xmm2, xmm0, 010b        ; src[ 8..11] vs ref[k+ 8..]
    mpsadbw     xmm4, xmm0, 111b        ; src[12..15] vs ref[k+12..]

    ; fold the four partial SADs into the running cost
    paddw       xmm7, xmm1
    paddw       xmm7, xmm3
    paddw       xmm7, xmm2
    paddw       xmm7, xmm4
%endmacro ; end of SAD_16x16_LINE_SSE41E
|
|
|
|
;-----------------------------------------------------------------------
; uint32_t SampleSad16x16Hor8_sse41 (uint8_t *src, int32_t stride_src,
;                                    uint8_t *ref, int32_t stride_ref,
;                                    uint16_t base_cost[8],
;                                    int32_t *index_min_cost)
;
; Computes the 16x16 SAD of src against ref at the eight horizontal
; offsets ref+0 .. ref+7, adds base_cost[k] to candidate k, returns the
; smallest total and stores its index (lowest index on a tie) in
; *index_min_cost.
;
; Registers (argument order as loaded by LOAD_6_PARA):
;   r0 = src         read with movdqa, so every row must be 16-byte aligned
;   r1 = stride_src  r2 = ref   r3 = stride_ref
;   r4 = base_cost   read with movdqu, no alignment requirement
;   r5 = index_min_cost
; Out:  retrd = minimal cost
; Uses: xmm0-xmm7; r1d and r2d are reused as scratch once the rows are done
;-----------------------------------------------------------------------
WELS_EXTERN SampleSad16x16Hor8_sse41
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm7, xmm7                  ; eight 16-bit SAD accumulators = 0

    ; Rows 0..14 advance src/ref; the last row uses the non-advancing variant.
%rep 15
    SAD_16x16_LINE_SSE41  r0, r2, r1, r3
%endrep
    SAD_16x16_LINE_SSE41E r0, r2, r1, r3

    ; Widen the eight 16-bit SADs to 32 bit: xmm6 = SAD[0..3], xmm7 = SAD[4..7].
    pxor        xmm0, xmm0
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0
    punpckhwd   xmm7, xmm0

    ; Widen base_cost the same way: xmm4 = cost[0..3], xmm5 = cost[4..7].
    ; Unaligned load: the caller's uint16_t array need not be 16-byte aligned.
    movdqu      xmm5, [r4]
    movdqa      xmm4, xmm5
    punpcklwd   xmm4, xmm0
    punpckhwd   xmm5, xmm0

    paddd       xmm4, xmm6                  ; total[0..3] = cost + SAD
    paddd       xmm5, xmm7                  ; total[4..7]

    ; Horizontal unsigned minimum, broadcast into every lane of xmm2.
    movdqa      xmm3, xmm4
    pminud      xmm3, xmm5
    pshufd      xmm2, xmm3, 01001110B       ; swap 64-bit halves
    pminud      xmm2, xmm3
    pshufd      xmm3, xmm2, 10110001B       ; swap dwords within each half
    pminud      xmm2, xmm3
    movd        retrd, xmm2                 ; return value = minimal total

    ; Locate the first candidate equal to the minimum: low four, then high four.
    pcmpeqd     xmm4, xmm2
    movmskps    r2d, xmm4
    bsf         r1d, r2d                    ; ZF = 1 when no match in total[0..3]
    jnz         .write_index

    pcmpeqd     xmm5, xmm2
    movmskps    r2d, xmm5
    bsf         r1d, r2d
    add         r1d, 4                      ; index within total[4..7]

.write_index:
    mov         [r5], r1d
    POP_XMM
    LOAD_6_PARA_POP
    ret
|
|
|
|
;**********************************************************************************************************************************
|
|
;
|
|
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
|
|
;
|
|
; \note:
|
|
; src and ref are not required to be 16-byte aligned (inter 8x8 blocks)
|
|
; \return value:
|
|
; returns the minimal SAD cost; the corresponding index is written to index_min_cost
|
|
;
|
|
;**********************************************************************************************************************************
|
|
; try 8 mv via offset
|
|
; xmm7 store sad costs
|
|
%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
    ; One 8-pixel row: for k = 0..7, adds SAD(src[0..7], ref[k..k+7]) to
    ; 16-bit lane k of xmm7, then steps both pointers to the next row.
    ; Overwrites xmm0-xmm2. Neither pointer needs any alignment.
    movq        xmm0, [%1]              ; only src[0..7] is consumed below, so
                                        ; load 8 bytes instead of over-reading 16
    movdqu      xmm1, [%2]              ; ref[0..15]
    movdqa      xmm2, xmm1              ; second copy of ref[0..15]

    ; mpsadbw imm8: bit 2 picks the window start in the first operand
    ; (byte 0 or 4), bits 1:0 pick the 4-byte group of the second operand.
    mpsadbw     xmm1, xmm0, 000b        ; src[0..3] vs ref[k  ..]
    paddw       xmm7, xmm1              ; accumulate cost

    mpsadbw     xmm2, xmm0, 101b        ; src[4..7] vs ref[k+4..]
    paddw       xmm7, xmm2              ; accumulate cost

    add         %1, %3                  ; src += stride_src
    add         %2, %4                  ; ref += stride_ref
%endmacro ; end of SAD_8x8_LINE_SSE41
|
|
%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
    ; Final-row variant: for k = 0..7, adds SAD(src[0..7], ref[k..k+7]) to
    ; 16-bit lane k of xmm7 and leaves %1/%2 untouched (%3/%4 are unused).
    ; Overwrites xmm0-xmm2. Neither pointer needs any alignment.
    movq        xmm0, [%1]              ; only src[0..7] is consumed below, so
                                        ; load 8 bytes instead of over-reading 16
    movdqu      xmm1, [%2]              ; ref[0..15]
    movdqa      xmm2, xmm1              ; second copy of ref[0..15]

    ; mpsadbw imm8: bit 2 picks the window start in the first operand
    ; (byte 0 or 4), bits 1:0 pick the 4-byte group of the second operand.
    mpsadbw     xmm1, xmm0, 000b        ; src[0..3] vs ref[k  ..]
    paddw       xmm7, xmm1              ; accumulate cost

    mpsadbw     xmm2, xmm0, 101b        ; src[4..7] vs ref[k+4..]
    paddw       xmm7, xmm2              ; accumulate cost
%endmacro ; end of SAD_8x8_LINE_SSE41E
|
|
|
|
;-----------------------------------------------------------------------
; uint32_t SampleSad8x8Hor8_sse41 (uint8_t *src, int32_t stride_src,
;                                  uint8_t *ref, int32_t stride_ref,
;                                  uint16_t base_cost[8],
;                                  int32_t *index_min_cost)
;
; Computes the 8x8 SAD of src against ref at the eight horizontal offsets
; ref+0 .. ref+7 on top of base_cost[k], returns the smallest total and
; stores its index in *index_min_cost.
;
; Registers (argument order as loaded by LOAD_6_PARA):
;   r0 = src   r1 = stride_src   r2 = ref   r3 = stride_ref
;   r4 = base_cost (read with movdqu, no alignment requirement)
;   r5 = index_min_cost
; Out:  retrd = minimal cost (16-bit value, zero-extended)
; Uses: xmm0-xmm2, xmm7; r1d is reused as scratch once the rows are done
;
; NOTE(review): totals are accumulated with wrapping 16-bit adds (paddw),
; so base_cost[k] + SAD must stay below 65536 - confirm against callers.
;-----------------------------------------------------------------------
WELS_EXTERN SampleSad8x8Hor8_sse41
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    movdqu      xmm7, [r4]                  ; seed the accumulators with the base
                                            ; cost list (unaligned load: the caller's
                                            ; array need not be 16-byte aligned)

    ; Rows 0..6 advance src/ref; the last row uses the non-advancing variant.
%rep 7
    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
%endrep
    SAD_8x8_LINE_SSE41E r0, r2, r1, r3

    phminposuw  xmm0, xmm7                  ; horizontal search: bits 15:0 = minimum,
                                            ; bits 18:16 = its index, rest zero
    movd        retrd, xmm0
    mov         r1d, retrd
    and         retrd, 0xFFFF               ; return value = minimal cost
    sar         r1d, 16                     ; r1d = index of the minimum
    mov         [r5], r1d

    POP_XMM
    LOAD_6_PARA_POP
    ret
|
|
|