openh264/codec/common/x86/mb_copy.asm
Martin Storsjö 57f6bcc4b0 Convert all tabs to spaces in assembly sources, unify indentation
Previously the assembly sources had mixed indentation consisting
of both spaces and tabs, making it quite hard to read unless
the right tab size was used in the editor.

Tabs have been interpreted as 4 spaces in most cases, matching
the surrounding code.
2014-06-01 01:35:43 +03:00

582 lines
15 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* mb_copy.asm
;*
;* Abstract
;* mb_copy and mb_copy1
;*
;* History
;* 15/09/2009 Created
;* 12/28/2009 Modified with larger throughput
;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
;*********************************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5]
lea r2, [r2+4*r3]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
lea r0, [r0+4*r1]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+2*r3]
movdqa xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+2*r3]
movdqa xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
;***********************************************************************
; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
lea r2, [r2+4*r3]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
lea r0, [r0+4*r1]
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
; , 12/29/2011
;***********************************************************************
; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
movdqu xmm2, [r2+2*r3]
movdqu xmm3, [r2+r5]
lea r2, [r2+4*r3]
movdqu xmm4, [r2]
movdqu xmm5, [r2+r3]
movdqu xmm6, [r2+2*r3]
movdqu xmm7, [r2+r5]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+2*r1], xmm2
movdqa [r0+r4], xmm3
lea r0, [r0+4*r1]
movdqa [r0], xmm4
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
;***********************************************************************
; void WelsCopy8x16_mmx(uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx
%assign push_num 0
LOAD_4_PARA
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
lea r2, [r2+2*r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
lea r0, [r0+2*r1]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
WELSEMMS
LOAD_4_PARA_POP
ret
;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
; int32_t iStrideD,
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
push r4
%assign push_num 1
LOAD_4_PARA
lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
; to prefetch next loop
prefetchnta [r2+2*r3]
prefetchnta [r2+r4]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
movq [r0], mm2
movq [r0+r1], mm3
lea r0, [r0+2*r1]
movq [r0], mm4
movq [r0+r1], mm5
lea r0, [r0+2*r1]
movq [r0], mm6
movq [r0+r1], mm7
WELSEMMS
LOAD_4_PARA_POP
pop r4
ret
; (dunhuang@cisco), 12/21/2011
;***********************************************************************
; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
;***********************************************************************
WELS_EXTERN UpdateMbMv_sse2
%assign push_num 0
LOAD_2_PARA
movd xmm0, r1d ; _mv
pshufd xmm1, xmm0, $00
movdqa [r0 ], xmm1
movdqa [r0+0x10], xmm1
movdqa [r0+0x20], xmm1
movdqa [r0+0x30], xmm1
ret
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
;*******************************************************************************
; void PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq4_mmx
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movd mm0, [r4]
pavgb mm0, [r2]
movd [r0], mm0
dec r6
lea r0, [r0+r1]
lea r2, [r2+r3]
lea r4, [r4+r5]
jne .height_loop
WELSEMMS
LOAD_7_PARA_POP
ret
;*******************************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq8_mmx
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movq mm0, [r2]
pavgb mm0, [r4]
movq [r0], mm0
movq mm0, [r2+r3]
pavgb mm0, [r4+r5]
movq [r0+r1], mm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
sub r6, 2
jnz .height_loop
WELSEMMS
LOAD_7_PARA_POP
ret
;*******************************************************************************
; void PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq16_sse2
%assign push_num 0
LOAD_7_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
movdqu xmm0, [r2]
movdqu xmm1, [r4]
pavgb xmm0, xmm1
;pavgb xmm0, [r4]
movdqu [r0], xmm0
movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
movdqu xmm0, [r2+2*r3]
movdqu xmm1, [r4+2*r5]
pavgb xmm0, xmm1
movdqu [r0+2*r1], xmm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
movdqu xmm0, [r2+r3]
movdqu xmm1, [r4+r5]
pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
sub r6, 4
jne .height_loop
WELSEMMS
LOAD_7_PARA_POP
ret
;*******************************************************************************
; void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq4_mmx
push r5
%assign push_num 1
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
mov r5d, [r0]
mov [r2], r5d
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
pop r5
ret
;*******************************************************************************
; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq8_mmx
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
movq mm0, [r0]
movq [r2], mm0
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
;read unaligned memory
%macro SSE_READ_UNA 2
movq %1, [%2]
movhps %1, [%2+8]
%endmacro
;write unaligned memory
%macro SSE_WRITE_UNA 2
movq [%1], %2
movhps [%1+8], %2
%endmacro
WELS_EXTERN McCopyWidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
SSE_READ_UNA xmm0, r0
SSE_READ_UNA xmm1, r0+r1
SSE_WRITE_UNA r2, xmm0
SSE_WRITE_UNA r2+r3, xmm1
sub r4, 2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
jnz .height_loop
LOAD_5_PARA_POP
ret