openh264/codec/common/x86/mb_copy.asm

582 lines
15 KiB
NASM
Raw Normal View History

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* mb_copy.asm
;*
;* Abstract
;* mb_copy and mb_copy1
;*
;* History
;* 15/09/2009 Created
;* 12/28/2009 Modified with larger throughput
;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
;*********************************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5]
lea r2, [r2+4*r3]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
lea r0, [r0+4*r1]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
;***********************************************************************
; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
lea r2, [r2+4*r3]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
lea r0, [r0+4*r1]
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
; , 12/29/2011
;***********************************************************************
; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
;***********************************************************************
; void WelsCopy8x16_mmx(uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx
%assign push_num 0
LOAD_4_PARA
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
lea r2, [r2+2*r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
lea r0, [r0+2*r1]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
WELSEMMS
LOAD_4_PARA_POP
ret
;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
push r4
%assign push_num 1
LOAD_4_PARA
lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
WELSEMMS
LOAD_4_PARA_POP
pop r4
ret
; (dunhuang@cisco), 12/21/2011
;***********************************************************************
; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
;***********************************************************************
WELS_EXTERN UpdateMbMv_sse2
%assign push_num 0
LOAD_2_PARA
movd xmm0, r1d ; _mv
pshufd xmm1, xmm0, $00
movdqa [r0 ], xmm1
movdqa [r0+0x10], xmm1
movdqa [r0+0x20], xmm1
movdqa [r0+0x30], xmm1
ret
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
;*******************************************************************************
; void PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq4_mmx
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movd mm0, [r4]
pavgb mm0, [r2]
movd [r0], mm0
dec r6
lea r0, [r0+r1]
lea r2, [r2+r3]
lea r4, [r4+r5]
jne .height_loop
WELSEMMS
LOAD_7_PARA_POP
ret
;*******************************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq8_mmx
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movq mm0, [r2]
pavgb mm0, [r4]
movq [r0], mm0
movq mm0, [r2+r3]
pavgb mm0, [r4+r5]
movq [r0+r1], mm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
sub r6, 2
jnz .height_loop
WELSEMMS
LOAD_7_PARA_POP
ret
;*******************************************************************************
; void PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq16_sse2
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movdqu xmm0, [r2]
movdqu xmm1, [r4]
pavgb xmm0, xmm1
;pavgb xmm0, [r4]
movdqu [r0], xmm0
movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
movdqu xmm0, [r2+2*r3]
movdqu xmm1, [r4+2*r5]
pavgb xmm0, xmm1
movdqu [r0+2*r1], xmm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
sub r6, 4
jne .height_loop
WELSEMMS
LOAD_7_PARA_POP
ret
;*******************************************************************************
; void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq4_mmx
push r5
%assign push_num 1
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
mov r5d, [r0]
mov [r2], r5d
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
pop r5
ret
;*******************************************************************************
; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq8_mmx
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
movq mm0, [r0]
movq [r2], mm0
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
;read unaligned memory
%macro SSE_READ_UNA 2
movq %1, [%2]
movhps %1, [%2+8]
%endmacro
;write unaligned memory
%macro SSE_WRITE_UNA 2
movq [%1], %2
movhps [%1+8], %2
%endmacro
WELS_EXTERN McCopyWidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
SSE_READ_UNA xmm0, r0
SSE_READ_UNA xmm1, r0+r1
SSE_WRITE_UNA r2, xmm0
SSE_WRITE_UNA r2+r3, xmm1
sub r4, 2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
jnz .height_loop
LOAD_5_PARA_POP
ret