;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  mb_copy.asm
;*
;*  Abstract
;*      mb_copy and mb_copy1
;*
;*  History
;*      15/09/2009 Created
;*      12/28/2009 Modified with larger throughput
;*      12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;*                 WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
;*********************************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
;   void WelsCopy16x16_sse2(    uint8_t* Dst,
;                               int32_t  iStrideD,
;                               uint8_t* Src,
;                               int32_t  iStrideS )
;
;   Copies a block of 16 rows x 16 bytes from Src to Dst.
;
;   Registers (after LOAD_4_PARA):
;       r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS
;       r4 = 3 * iStrideD, r5 = 3 * iStrideS (scratch, saved/restored here)
;       xmm0-xmm7 hold eight rows at a time
;
;   Alignment: every row is moved with movdqa, so each source and
;   destination row address must be 16-byte aligned.
;
;   The strides are 32-bit values used in 64-bit address arithmetic, so
;   they are sign-extended first (SIGN_EXTENSION comes from asm_inc.asm
;   and is used the same way by the PixelAvg*/McCopy* routines below).
;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2
    push    r4
    push    r5
    %assign push_num 2
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d              ; iStrideD: widen before use in addresses
    SIGN_EXTENSION r3, r3d              ; iStrideS: widen before use in addresses
    PUSH_XMM 8                          ; xmm0-xmm7 are used (macro from asm_inc.asm)

    lea     r4, [r1+2*r1]               ; r4 = 3 * iStrideD
    lea     r5, [r3+2*r3]               ; r5 = 3 * iStrideS

    ; ---- rows 0..7: load ----
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2+r3]
    movdqa  xmm2, [r2+2*r3]
    movdqa  xmm3, [r2+r5]
    lea     r2, [r2+4*r3]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2+r3]
    movdqa  xmm6, [r2+2*r3]
    movdqa  xmm7, [r2+r5]
    lea     r2, [r2+4*r3]               ; Src now points at row 8

    ; ---- rows 0..7: store ----
    movdqa  [r0], xmm0
    movdqa  [r0+r1], xmm1
    movdqa  [r0+2*r1], xmm2
    movdqa  [r0+r4], xmm3
    lea     r0, [r0+4*r1]
    movdqa  [r0], xmm4
    movdqa  [r0+r1], xmm5
    movdqa  [r0+2*r1], xmm6
    movdqa  [r0+r4], xmm7
    lea     r0, [r0+4*r1]               ; Dst now points at row 8

    ; ---- rows 8..15: load ----
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2+r3]
    movdqa  xmm2, [r2+2*r3]
    movdqa  xmm3, [r2+r5]
    lea     r2, [r2+4*r3]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2+r3]
    movdqa  xmm6, [r2+2*r3]
    movdqa  xmm7, [r2+r5]

    ; ---- rows 8..15: store ----
    movdqa  [r0], xmm0
    movdqa  [r0+r1], xmm1
    movdqa  [r0+2*r1], xmm2
    movdqa  [r0+r4], xmm3
    lea     r0, [r0+4*r1]
    movdqa  [r0], xmm4
    movdqa  [r0+r1], xmm5
    movdqa  [r0+2*r1], xmm6
    movdqa  [r0+r4], xmm7

    POP_XMM
    LOAD_4_PARA_POP
    pop     r5
    pop     r4
    ret
;***********************************************************************
;   void WelsCopy16x16NotAligned_sse2(  uint8_t* Dst,
;                                       int32_t  iStrideD,
;                                       uint8_t* Src,
;                                       int32_t  iStrideS )
;
;   Copies a block of 16 rows x 16 bytes from Src to Dst.
;   Source rows are read with movdqu (any alignment); destination rows
;   are written with movdqa, so every Dst row must be 16-byte aligned.
;
;   Registers (after LOAD_4_PARA):
;       r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS
;       r4 = 3 * iStrideD, r5 = 3 * iStrideS (scratch, saved/restored here)
;
;   The 32-bit strides are sign-extended before being used in address
;   arithmetic, matching the PixelAvg*/McCopy* routines in this file.
;***********************************************************************
; dst can be aligned to 16 bytes, but pSrc alignment is not guaranteed, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2
    push    r4
    push    r5
    %assign push_num 2
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d              ; iStrideD: widen before use in addresses
    SIGN_EXTENSION r3, r3d              ; iStrideS: widen before use in addresses
    PUSH_XMM 8                          ; xmm0-xmm7 are used (macro from asm_inc.asm)

    lea     r4, [r1+2*r1]               ; r4 = 3 * iStrideD
    lea     r5, [r3+2*r3]               ; r5 = 3 * iStrideS

    ; ---- rows 0..7: unaligned load ----
    movdqu  xmm0, [r2]
    movdqu  xmm1, [r2+r3]
    movdqu  xmm2, [r2+2*r3]
    movdqu  xmm3, [r2+r5]
    lea     r2, [r2+4*r3]
    movdqu  xmm4, [r2]
    movdqu  xmm5, [r2+r3]
    movdqu  xmm6, [r2+2*r3]
    movdqu  xmm7, [r2+r5]
    lea     r2, [r2+4*r3]               ; Src now points at row 8

    ; ---- rows 0..7: aligned store ----
    movdqa  [r0], xmm0
    movdqa  [r0+r1], xmm1
    movdqa  [r0+2*r1], xmm2
    movdqa  [r0+r4], xmm3
    lea     r0, [r0+4*r1]
    movdqa  [r0], xmm4
    movdqa  [r0+r1], xmm5
    movdqa  [r0+2*r1], xmm6
    movdqa  [r0+r4], xmm7
    lea     r0, [r0+4*r1]               ; Dst now points at row 8

    ; ---- rows 8..15: unaligned load ----
    movdqu  xmm0, [r2]
    movdqu  xmm1, [r2+r3]
    movdqu  xmm2, [r2+2*r3]
    movdqu  xmm3, [r2+r5]
    lea     r2, [r2+4*r3]
    movdqu  xmm4, [r2]
    movdqu  xmm5, [r2+r3]
    movdqu  xmm6, [r2+2*r3]
    movdqu  xmm7, [r2+r5]

    ; ---- rows 8..15: aligned store ----
    movdqa  [r0], xmm0
    movdqa  [r0+r1], xmm1
    movdqa  [r0+2*r1], xmm2
    movdqa  [r0+r4], xmm3
    lea     r0, [r0+4*r1]
    movdqa  [r0], xmm4
    movdqa  [r0+r1], xmm5
    movdqa  [r0+2*r1], xmm6
    movdqa  [r0+r4], xmm7

    POP_XMM
    LOAD_4_PARA_POP
    pop     r5
    pop     r4
    ret

; 12/29/2011
;***********************************************************************
;   void WelsCopy16x8NotAligned_sse2(   uint8_t* Dst,
;                                       int32_t  iStrideD,
;                                       uint8_t* Src,
;                                       int32_t  iStrideS )
;
;   Copies a block of 8 rows x 16 bytes from Src to Dst.
;   Source rows are read with movdqu (any alignment); destination rows
;   are written with movdqa, so every Dst row must be 16-byte aligned.
;
;   Registers (after LOAD_4_PARA):
;       r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS
;       r4 = 3 * iStrideD, r5 = 3 * iStrideS (scratch, saved/restored here)
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
    push    r4
    push    r5
    %assign push_num 2
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d              ; iStrideD: widen before use in addresses
    SIGN_EXTENSION r3, r3d              ; iStrideS: widen before use in addresses
    PUSH_XMM 8                          ; xmm0-xmm7 are used (macro from asm_inc.asm)

    lea     r4, [r1+2*r1]               ; r4 = 3 * iStrideD
    lea     r5, [r3+2*r3]               ; r5 = 3 * iStrideS

    ; ---- rows 0..7: unaligned load ----
    movdqu  xmm0, [r2]
    movdqu  xmm1, [r2+r3]
    movdqu  xmm2, [r2+2*r3]
    movdqu  xmm3, [r2+r5]
    lea     r2, [r2+4*r3]
    movdqu  xmm4, [r2]
    movdqu  xmm5, [r2+r3]
    movdqu  xmm6, [r2+2*r3]
    movdqu  xmm7, [r2+r5]

    ; ---- rows 0..7: aligned store ----
    movdqa  [r0], xmm0
    movdqa  [r0+r1], xmm1
    movdqa  [r0+2*r1], xmm2
    movdqa  [r0+r4], xmm3
    lea     r0, [r0+4*r1]
    movdqa  [r0], xmm4
    movdqa  [r0+r1], xmm5
    movdqa  [r0+2*r1], xmm6
    movdqa  [r0+r4], xmm7

    POP_XMM
    LOAD_4_PARA_POP
    pop     r5
    pop     r4
    ret


;***********************************************************************
;   void WelsCopy8x16_mmx(  uint8_t* Dst,
;                           int32_t  iStrideD,
;                           uint8_t* Src,
;                           int32_t  iStrideS )
;
;   Copies a block of 16 rows x 8 bytes from Src to Dst, eight rows at a
;   time through mm0-mm7.
;
;   Registers (after LOAD_4_PARA):
;       r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS
;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d              ; iStrideD: widen before use in addresses
    SIGN_EXTENSION r3, r3d              ; iStrideS: widen before use in addresses

    ; ---- rows 0..7: load ----
    movq    mm0, [r2]
    movq    mm1, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm2, [r2]
    movq    mm3, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm4, [r2]
    movq    mm5, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm6, [r2]
    movq    mm7, [r2+r3]
    lea     r2, [r2+2*r3]               ; Src now points at row 8

    ; ---- rows 0..7: store ----
    movq    [r0], mm0
    movq    [r0+r1], mm1
    lea     r0, [r0+2*r1]
    movq    [r0], mm2
    movq    [r0+r1], mm3
    lea     r0, [r0+2*r1]
    movq    [r0], mm4
    movq    [r0+r1], mm5
    lea     r0, [r0+2*r1]
    movq    [r0], mm6
    movq    [r0+r1], mm7
    lea     r0, [r0+2*r1]               ; Dst now points at row 8

    ; ---- rows 8..15: load ----
    movq    mm0, [r2]
    movq    mm1, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm2, [r2]
    movq    mm3, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm4, [r2]
    movq    mm5, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm6, [r2]
    movq    mm7, [r2+r3]

    ; ---- rows 8..15: store ----
    movq    [r0], mm0
    movq    [r0+r1], mm1
    lea     r0, [r0+2*r1]
    movq    [r0], mm2
    movq    [r0+r1], mm3
    lea     r0, [r0+2*r1]
    movq    [r0], mm4
    movq    [r0+r1], mm5
    lea     r0, [r0+2*r1]
    movq    [r0], mm6
    movq    [r0+r1], mm7

    WELSEMMS                            ; macro from asm_inc.asm; presumably clears MMX state - confirm
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;   void WelsCopy8x8_mmx(   uint8_t* Dst,
;                           int32_t  iStrideD,
;                           uint8_t* Src,
;                           int32_t  iStrideS )
;
;   Copies a block of 8 rows x 8 bytes from Src to Dst. All eight rows
;   are loaded into mm0-mm7 before any store; upcoming source rows are
;   hinted with prefetchnta while loading.
;
;   Registers (after LOAD_4_PARA):
;       r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS
;       r4 = 3 * iStrideS (scratch, saved/restored here)
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
    push    r4
    %assign push_num 1
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d              ; iStrideD: widen before use in addresses
    SIGN_EXTENSION r3, r3d              ; iStrideS: widen before use in addresses
    lea     r4, [r3+2*r3]               ; r4 = 3 * iStrideS

    ; prefetch the two rows that the next load pair will read
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq    mm0, [r2]
    movq    mm1, [r2+r3]
    lea     r2, [r2+2*r3]

    ; prefetch the two rows that the next load pair will read
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq    mm2, [r2]
    movq    mm3, [r2+r3]
    lea     r2, [r2+2*r3]

    ; prefetch the two rows that the next load pair will read
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq    mm4, [r2]
    movq    mm5, [r2+r3]
    lea     r2, [r2+2*r3]

    movq    mm6, [r2]
    movq    mm7, [r2+r3]

    ; ---- store rows 0..7 ----
    movq    [r0], mm0
    movq    [r0+r1], mm1
    lea     r0, [r0+2*r1]
    movq    [r0], mm2
    movq    [r0+r1], mm3
    lea     r0, [r0+2*r1]
    movq    [r0], mm4
    movq    [r0+r1], mm5
    lea     r0, [r0+2*r1]
    movq    [r0], mm6
    movq    [r0+r1], mm7

    WELSEMMS                            ; macro from asm_inc.asm; presumably clears MMX state - confirm
    LOAD_4_PARA_POP
    pop     r4
    ret

; (dunhuang@cisco), 12/21/2011
;***********************************************************************
;   void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
;
;   Replicates the 32-bit value passed in r1d (sMv) into all four dwords
;   of an XMM register and writes it to 64 consecutive bytes starting at
;   pMvBuffer (i.e. 16 copies of the 32-bit value).
;   The stores use movdqa, so pMvBuffer must be 16-byte aligned.
;
;   Registers (after LOAD_2_PARA): r0 = pMvBuffer, r1d = sMv
;***********************************************************************
WELS_EXTERN UpdateMbMv_sse2

    %assign push_num 0
    LOAD_2_PARA

    movd    xmm0, r1d                   ; _mv
    pshufd  xmm1, xmm0, $00             ; broadcast dword 0 to all four lanes
    movdqa  [r0     ], xmm1
    movdqa  [r0+0x10], xmm1
    movdqa  [r0+0x20], xmm1
    movdqa  [r0+0x30], xmm1
    ret

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text


;*******************************************************************************
; void PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
;                           uint8_t *pSrcA, int iSrcAStride,
;                           uint8_t *pSrcB, int iSrcBStride,
;                           int iHeight );
;
; For each of iHeight rows, writes the per-byte rounded average (pavgb) of
; 4 bytes from pSrcA and 4 bytes from pSrcB to pDst.
;
; Registers (after LOAD_7_PARA):
;   r0 = pDst,  r1 = iDstStride
;   r2 = pSrcA, r3 = iSrcAStride
;   r4 = pSrcB, r5 = iSrcBStride
;   r6 = iHeight (row counter; the loop exits only when it reaches zero,
;        so iHeight must be > 0)
;
; Both source rows are loaded with 4-byte movd. A memory operand on pavgb
; would read 8 bytes, i.e. 4 bytes past the end of the pSrcA row.
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq4_mmx

    %assign push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

ALIGN 4
.height_loop:
    movd    mm0, [r4]                   ; 4 bytes of pSrcB
    movd    mm1, [r2]                   ; 4 bytes of pSrcA (no over-read)
    pavgb   mm0, mm1
    movd    [r0], mm0

    dec     r6                          ; sets ZF for the jne below
    lea     r0, [r0+r1]                 ; lea leaves the flags untouched
    lea     r2, [r2+r3]
    lea     r4, [r4+r5]
    jne     .height_loop

    WELSEMMS                            ; macro from asm_inc.asm; presumably clears MMX state - confirm
    LOAD_7_PARA_POP
    ret


;*******************************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
;                           uint8_t *pSrcA, int iSrcAStride,
;                           uint8_t *pSrcB, int iSrcBStride,
;                           int iHeight );
;*******************************************************************************
; PixelAvgWidthEq8_mmx
;   Writes the per-byte rounded average (pavgb) of 8-byte rows from pSrcA and
;   pSrcB to pDst, two rows per iteration.
;
;   r0 = pDst,  r1 = iDstStride
;   r2 = pSrcA, r3 = iSrcAStride
;   r4 = pSrcB, r5 = iSrcBStride
;   r6 = iHeight; decremented by 2 per pass and the loop exits only on zero,
;        so iHeight must be a positive multiple of 2.
WELS_EXTERN PixelAvgWidthEq8_mmx
    %assign push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

ALIGN 4
.row_pair:
    ; row n
    movq        mm0, [r2]
    pavgb       mm0, [r4]
    movq        [r0], mm0

    ; row n + 1
    movq        mm0, [r2+r3]
    pavgb       mm0, [r4+r5]
    movq        [r0+r1], mm0

    ; step all three pointers down two rows
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    lea         r4, [r4+2*r5]

    sub         r6, 2
    jne         .row_pair

    WELSEMMS
    LOAD_7_PARA_POP
    ret


;*******************************************************************************
; void PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
;                              uint8_t *pSrcA, int iSrcAStride,
;                              uint8_t *pSrcB, int iSrcBStride,
;                              int iHeight );
;
;   Writes the per-byte rounded average (pavgb) of 16-byte rows from pSrcA and
;   pSrcB to pDst, four rows per iteration. All loads and stores use movdqu,
;   so no pointer alignment is required.
;
;   r0 = pDst,  r1 = iDstStride
;   r2 = pSrcA, r3 = iSrcAStride
;   r4 = pSrcB, r5 = iSrcBStride
;   r6 = iHeight; decremented by 4 per pass and the loop exits only on zero,
;        so iHeight must be a positive multiple of 4.
;   Only xmm0 and xmm1 are used.
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq16_sse2
    %assign push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

ALIGN 4
.four_rows:
    ; row n (pSrcB goes through a register because the row may be unaligned)
    movdqu      xmm0, [r2]
    movdqu      xmm1, [r4]
    pavgb       xmm0, xmm1
    movdqu      [r0], xmm0

    ; row n + 1
    movdqu      xmm0, [r2+r3]
    movdqu      xmm1, [r4+r5]
    pavgb       xmm0, xmm1
    movdqu      [r0+r1], xmm0

    ; row n + 2
    movdqu      xmm0, [r2+2*r3]
    movdqu      xmm1, [r4+2*r5]
    pavgb       xmm0, xmm1
    movdqu      [r0+2*r1], xmm0

    ; move the base pointers to row n + 2
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    lea         r4, [r4+2*r5]

    ; row n + 3 (base + one stride)
    movdqu      xmm0, [r2+r3]
    movdqu      xmm1, [r4+r5]
    pavgb       xmm0, xmm1
    movdqu      [r0+r1], xmm0

    ; move the base pointers to row n + 4
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    lea         r4, [r4+2*r5]

    sub         r6, 4
    jnz         .four_rows

    WELSEMMS
    LOAD_7_PARA_POP
    ret

;*******************************************************************************
;  void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
;                           uint8_t *pDst, int iDstStride, int iHeight )
;
;   Copies iHeight rows of 4 bytes from pSrc to pDst, one row per iteration,
;   through the general-purpose scratch register r5d.
;
;   r0 = pSrc, r1 = iSrcStride
;   r2 = pDst, r3 = iDstStride
;   r4 = iHeight; the loop exits only on zero, so iHeight must be > 0.
;   r5 = scratch (saved/restored here)
;*******************************************************************************
WELS_EXTERN McCopyWidthEq4_mmx
    push        r5
    %assign push_num 1
    LOAD_5_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

ALIGN 4
.copy_row:
    mov         r5d, [r0]               ; fetch one 4-byte row
    mov         [r2], r5d

    add         r0, r1
    add         r2, r3
    dec         r4
    jne         .copy_row

    WELSEMMS
    LOAD_5_PARA_POP
    pop         r5
    ret
;*******************************************************************************
;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
;                            uint8_t *pDst, int iDstStride, int iHeight )
;
;   Copies iHeight rows of 8 bytes from pSrc to pDst, one row per iteration,
;   through mm0.
;
;   r0 = pSrc, r1 = iSrcStride
;   r2 = pDst, r3 = iDstStride
;   r4 = iHeight; the loop exits only on zero, so iHeight must be > 0.
;*******************************************************************************
WELS_EXTERN McCopyWidthEq8_mmx
    %assign push_num 0
    LOAD_5_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

ALIGN 4
.copy_row:
    movq        mm0, [r0]               ; fetch one 8-byte row
    movq        [r2], mm0

    add         r0, r1
    add         r2, r3
    dec         r4
    jne         .copy_row

    WELSEMMS
    LOAD_5_PARA_POP
    ret


;*******************************************************************************
;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride,
;                              uint8_t *pDst, int iDstStride, int iHeight )
;
;   Copies iHeight rows of 16 bytes from pSrc to pDst, two rows per iteration.
;   Each 16-byte row is moved as two 8-byte halves (movq + movhps), so neither
;   pointer has to be 16-byte aligned.
;
;   r0 = pSrc, r1 = iSrcStride
;   r2 = pDst, r3 = iDstStride
;   r4 = iHeight; decremented by 2 per pass and the loop exits only on zero,
;        so iHeight must be a positive multiple of 2.
;*******************************************************************************

; SSE_READ_UNA  xmm, addr
;   Load 16 bytes from [addr] into xmm without an alignment requirement:
;   low 8 bytes via movq, high 8 bytes via movhps.
%macro SSE_READ_UNA 2
    movq        %1, [%2]
    movhps      %1, [%2+8]
%endmacro

; SSE_WRITE_UNA addr, xmm
;   Store the 16 bytes of xmm to [addr] without an alignment requirement:
;   low 8 bytes via movq, high 8 bytes via movhps.
%macro SSE_WRITE_UNA 2
    movq        [%1], %2
    movhps      [%1+8], %2
%endmacro

WELS_EXTERN McCopyWidthEq16_sse2
    %assign push_num 0
    LOAD_5_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

ALIGN 4
.row_pair:
    SSE_READ_UNA    xmm0, r0            ; row n
    SSE_READ_UNA    xmm1, r0+r1         ; row n + 1
    SSE_WRITE_UNA   r2, xmm0
    SSE_WRITE_UNA   r2+r3, xmm1

    sub         r4, 2                   ; sets ZF for the jne below
    lea         r0, [r0+r1*2]           ; lea leaves the flags untouched
    lea         r2, [r2+r3*2]
    jne         .row_pair

    LOAD_5_PARA_POP
    ret