openh264/codec/encoder/core/x86/sample_sc.asm
Licai Guo 5c60e8f868 Add ASM related functions for ME cross search
Add asm level functions

Add asm code for ME

Modify format

Add unit test for asm code.

Modify function name and format.

Remove unused comment

Modify targets file

Add macro protection for SSE41 function test

Modify according to review request.
2014-04-08 11:24:45 +08:00

226 lines
6.9 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*************************************************************************/
%include "asm_inc.asm"
SECTION .text
;**********************************************************************************************************************************
;
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
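; \note:
; src must be 16-byte aligned (it is read with movdqa); ref may be unaligned.
; base_cost is also read with movdqa and therefore must be 16-byte aligned.
; \return value:
; returns the minimal SAD cost; the index (0..7) of that cost is stored to *index_min_cost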
;**********************************************************************************************************************************
; Tries 8 motion vectors per call: horizontal ref offsets 0..7.
; xmm7 accumulates the 8 SAD costs, one per 16-bit lane.
%macro SAD_16x16_LINE_SSE41 4           ; %1 = src, %2 = ref, %3 = stride_src, %4 = stride_ref
    ; Processes one 16-pixel row and then steps both pointers to the next row.
    ; For every horizontal ref offset k = 0..7, the SAD between src[0..15] and
    ; ref[k..k+15] is added to 16-bit lane k of xmm7.
    ; Clobbers xmm0-xmm4 and the flags; modifies %1 and %2.
    movdqa      xmm0, [%1]              ; src row, aligned load
    movdqu      xmm1, [%2]              ; ref bytes 0..15
    movdqu      xmm2, [%2 + 8]          ; ref bytes 8..23
    movdqa      xmm3, xmm1              ; second copy: mpsadbw overwrites its destination
    movdqa      xmm4, xmm2

    ; mpsadbw imm8: bits 1:0 select the 4-byte src group, bit 2 selects ref offset 0 or 4.
    mpsadbw     xmm1, xmm0, 000B        ; src[0..3]   vs ref[k+0  ..]
    mpsadbw     xmm3, xmm0, 101B        ; src[4..7]   vs ref[k+4  ..]
    mpsadbw     xmm2, xmm0, 010B        ; src[8..11]  vs ref[k+8  ..]
    mpsadbw     xmm4, xmm0, 111B        ; src[12..15] vs ref[k+12 ..]

    ; Fold the four partial SADs into the running per-offset cost.
    paddw       xmm7, xmm1
    paddw       xmm7, xmm3
    paddw       xmm7, xmm2
    paddw       xmm7, xmm4

    add         %1, %3                  ; src += stride_src
    add         %2, %4                  ; ref += stride_ref
%endmacro                               ; end of SAD_16x16_LINE_SSE41
%macro SAD_16x16_LINE_SSE41E 4          ; %1 = src, %2 = ref, %3 = stride_src, %4 = stride_ref
    ; "End" variant of the 16-pixel row step: accumulates the row into xmm7 but
    ; leaves %1 and %2 untouched (%3 and %4 are accepted but not used).
    ; For every horizontal ref offset k = 0..7, the SAD between src[0..15] and
    ; ref[k..k+15] is added to 16-bit lane k of xmm7.
    ; Clobbers xmm0-xmm4.
    movdqa      xmm0, [%1]              ; src row, aligned load
    movdqu      xmm1, [%2]              ; ref bytes 0..15
    movdqu      xmm2, [%2 + 8]          ; ref bytes 8..23
    movdqa      xmm3, xmm1              ; second copy: mpsadbw overwrites its destination
    movdqa      xmm4, xmm2

    ; mpsadbw imm8: bits 1:0 select the 4-byte src group, bit 2 selects ref offset 0 or 4.
    mpsadbw     xmm1, xmm0, 000B        ; src[0..3]   vs ref[k+0  ..]
    mpsadbw     xmm3, xmm0, 101B        ; src[4..7]   vs ref[k+4  ..]
    mpsadbw     xmm2, xmm0, 010B        ; src[8..11]  vs ref[k+8  ..]
    mpsadbw     xmm4, xmm0, 111B        ; src[12..15] vs ref[k+12 ..]

    ; Fold the four partial SADs into the running per-offset cost.
    paddw       xmm7, xmm1
    paddw       xmm7, xmm3
    paddw       xmm7, xmm2
    paddw       xmm7, xmm4
%endmacro                               ; end of SAD_16x16_LINE_SSE41E
WELS_EXTERN SampleSad16x16Hor8_sse41
    ; Computes the 16x16 SAD for 8 horizontal ref offsets, adds base_cost[k] to
    ; each, returns the smallest total and stores its index to *index_min_cost.
    ;
    ; Register roles as used below (r0..r5 / retrd are aliases provided by the
    ; project's parameter macros, which are defined outside this file):
    ;   r0 = src            r1 = stride_src
    ;   r2 = ref            r3 = stride_ref
    ;   r4 = base_cost      r5 = index_min_cost
    ;   xmm7 = eight 16-bit SAD accumulators, one per ref offset
    ; base_cost is read with movdqa, so it must be 16-byte aligned.
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d              ; presumably widens the 32-bit strides to
    SIGN_EXTENSION r3, r3d              ; pointer width on 64-bit targets -- TODO confirm

    pxor        xmm7, xmm7              ; clear the SAD accumulators

    ; Rows 0..14 advance src/ref; the last row does not need to.
%rep 15
    SAD_16x16_LINE_SSE41  r0, r2, r1, r3
%endrep
    SAD_16x16_LINE_SSE41E r0, r2, r1, r3

    ; Widen SADs and base costs from 16 to 32 bits and add them:
    ; xmm4 = totals for offsets 0..3, xmm5 = totals for offsets 4..7.
    ; NOTE(review): [r4] is read before retrd is written below; keep that order
    ; in case retrd aliases an argument register on some target.
    pxor        xmm0, xmm0
    movdqa      xmm5, [r4]              ; base_cost[0..7]
    movdqa      xmm6, xmm7
    movdqa      xmm4, xmm5
    punpcklwd   xmm6, xmm0              ; SAD 0..3  as dwords
    punpckhwd   xmm7, xmm0              ; SAD 4..7  as dwords
    punpcklwd   xmm4, xmm0              ; base 0..3 as dwords
    punpckhwd   xmm5, xmm0              ; base 4..7 as dwords
    paddd       xmm4, xmm6
    paddd       xmm5, xmm7

    ; Horizontal unsigned minimum, broadcast to all four dword lanes of xmm2.
    movdqa      xmm3, xmm4
    pminud      xmm3, xmm5
    pshufd      xmm2, xmm3, 01001110B   ; swap the 64-bit halves
    pminud      xmm2, xmm3
    pshufd      xmm3, xmm2, 10110001B   ; swap adjacent dwords
    pminud      xmm2, xmm3
    movd        retrd, xmm2             ; return value = minimal total cost

    ; Locate the first lane equal to the minimum: offsets 0..3 first, then 4..7.
    pcmpeqd     xmm4, xmm2
    movmskps    r2d, xmm4
    bsf         r1d, r2d                ; ZF = 1 when no lane in 0..3 matched
    jnz         near .store_index

    pcmpeqd     xmm5, xmm2
    movmskps    r2d, xmm5
    bsf         r1d, r2d
    add         r1d, 4                  ; lanes of xmm5 are offsets 4..7

.store_index:
    mov         [r5], r1d               ; *index_min_cost = index of the minimum
    POP_XMM
    LOAD_6_PARA_POP
    ret
;**********************************************************************************************************************************
;
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; \note:
; src and ref need not be 16-byte aligned (inter 8x8 blocks); base_cost is read with movdqa and must be 16-byte aligned
; \return value:
; returns the minimal cost; the index (0..7) of that cost is stored to *index_min_cost
;
;**********************************************************************************************************************************
; Tries 8 motion vectors per call: horizontal ref offsets 0..7.
; xmm7 holds the 8 costs (base cost plus accumulated SAD), one per 16-bit lane.
%macro SAD_8x8_LINE_SSE41 4             ; %1 = src, %2 = ref, %3 = stride_src, %4 = stride_ref
    ; Processes one 8-pixel row and then steps both pointers to the next row.
    ; For every horizontal ref offset k = 0..7, the SAD between src[0..7] and
    ; ref[k..k+7] is added to 16-bit lane k of xmm7.
    ; Clobbers xmm0-xmm2 and the flags; modifies %1 and %2.
    ;
    ; The two mpsadbw forms below (imm8 = 000b and 101b) only read src bytes
    ; 0..7, so exactly 8 bytes are loaded from src. A 16-byte load would read
    ; 8 bytes past the end of the 8-pixel row for no benefit.
    movq        xmm0, [%1]              ; src[0..7]; upper half of xmm0 is zeroed and unused
    movdqu      xmm1, [%2]              ; ref bytes 0..15 (offsets 0..7 need bytes 0..14)
    movdqa      xmm2, xmm1              ; second copy: mpsadbw overwrites its destination

    ; mpsadbw imm8: bits 1:0 select the 4-byte src group, bit 2 selects ref offset 0 or 4.
    mpsadbw     xmm1, xmm0, 000B        ; src[0..3] vs ref[k+0 ..]
    paddw       xmm7, xmm1              ; accumulate cost
    mpsadbw     xmm2, xmm0, 101B        ; src[4..7] vs ref[k+4 ..]
    paddw       xmm7, xmm2              ; accumulate cost

    add         %1, %3                  ; src += stride_src
    add         %2, %4                  ; ref += stride_ref
%endmacro                               ; end of SAD_8x8_LINE_SSE41
%macro SAD_8x8_LINE_SSE41E 4            ; %1 = src, %2 = ref, %3 = stride_src, %4 = stride_ref
    ; "End" variant of the 8-pixel row step: accumulates the row into xmm7 but
    ; leaves %1 and %2 untouched (%3 and %4 are accepted but not used).
    ; For every horizontal ref offset k = 0..7, the SAD between src[0..7] and
    ; ref[k..k+7] is added to 16-bit lane k of xmm7.
    ; Clobbers xmm0-xmm2.
    ;
    ; The two mpsadbw forms below (imm8 = 000b and 101b) only read src bytes
    ; 0..7, so exactly 8 bytes are loaded from src. A 16-byte load would read
    ; 8 bytes past the end of the 8-pixel row for no benefit.
    movq        xmm0, [%1]              ; src[0..7]; upper half of xmm0 is zeroed and unused
    movdqu      xmm1, [%2]              ; ref bytes 0..15 (offsets 0..7 need bytes 0..14)
    movdqa      xmm2, xmm1              ; second copy: mpsadbw overwrites its destination

    ; mpsadbw imm8: bits 1:0 select the 4-byte src group, bit 2 selects ref offset 0 or 4.
    mpsadbw     xmm1, xmm0, 000B        ; src[0..3] vs ref[k+0 ..]
    paddw       xmm7, xmm1              ; accumulate cost
    mpsadbw     xmm2, xmm0, 101B        ; src[4..7] vs ref[k+4 ..]
    paddw       xmm7, xmm2              ; accumulate cost
%endmacro                               ; end of SAD_8x8_LINE_SSE41E
WELS_EXTERN SampleSad8x8Hor8_sse41
    ; Computes the 8x8 SAD for 8 horizontal ref offsets on top of base_cost[k],
    ; returns the smallest total and stores its index to *index_min_cost.
    ;
    ; Register roles as used below (r0..r5 / retrd are aliases provided by the
    ; project's parameter macros, which are defined outside this file):
    ;   r0 = src            r1 = stride_src
    ;   r2 = ref            r3 = stride_ref
    ;   r4 = base_cost      r5 = index_min_cost
    ;   xmm7 = eight 16-bit running costs, one per ref offset
    ; base_cost is read with movdqa, so it must be 16-byte aligned.
    ; Costs are accumulated with paddw, i.e. modulo 2^16 in each lane.
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d              ; presumably widens the 32-bit strides to
    SIGN_EXTENSION r3, r3d              ; pointer width on 64-bit targets -- TODO confirm

    movdqa      xmm7, [r4]              ; start from the base cost list

    ; Rows 0..6 advance src/ref; the last row does not need to.
%rep 7
    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
%endrep
    SAD_8x8_LINE_SSE41E r0, r2, r1, r3

    ; phminposuw result: bits 15:0 = minimum, bits 18:16 = its lane index, rest zero.
    phminposuw  xmm0, xmm7
    movd        r1d, xmm0
    mov         retrd, r1d
    and         retrd, 0xFFFF           ; return value = minimal cost
    sar         r1d, 16                 ; r1d = index of the minimal cost
    mov         [r5], r1d               ; *index_min_cost = index
    POP_XMM
    LOAD_6_PARA_POP
    ret