From ddcfc09c495c81dd6cf4824d1f3f345481c75d61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 5 Jan 2014 14:11:41 +0200 Subject: [PATCH] Convert some assembly files to unix newlines This makes them consistent with the rest of them. --- codec/common/mb_copy.asm | 1402 ++++---- codec/common/mc_chroma.asm | 690 ++-- codec/common/mc_luma.asm | 2584 +++++++-------- codec/encoder/core/asm/satd_sad.asm | 4688 +++++++++++++-------------- 4 files changed, 4682 insertions(+), 4682 deletions(-) diff --git a/codec/common/mb_copy.asm b/codec/common/mb_copy.asm index 290faf88..a59234fd 100644 --- a/codec/common/mb_copy.asm +++ b/codec/common/mb_copy.asm @@ -1,701 +1,701 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. 
-;* -;* -;* mb_copy.asm -;* -;* Abstract -;* mb_copy and mb_copy1 -;* -;* History -;* 15/09/2009 Created -;* 12/28/2009 Modified with larger throughput -;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, -;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc; -;* -;* -;*********************************************************************************************/ -%include "asm_inc.asm" - -;*********************************************************************** -; Macros and other preprocessor constants -;*********************************************************************** - -;*********************************************************************** -; Code -;*********************************************************************** - -SECTION .text - -WELS_EXTERN WelsCopy16x16_sse2 -WELS_EXTERN WelsCopy16x16NotAligned_sse2 -WELS_EXTERN WelsCopy8x8_mmx -WELS_EXTERN WelsCopy16x8NotAligned_sse2 ; -WELS_EXTERN WelsCopy8x16_mmx ; -WELS_EXTERN UpdateMbMv_sse2 ; - -;*********************************************************************** -; void WelsCopy16x16_sse2( uint8_t* Dst, -; int32_t iStrideD, -; uint8_t* Src, -; int32_t iStrideS ) -;*********************************************************************** -ALIGN 16 -WelsCopy16x16_sse2: - - push r4 - push r5 - %assign push_num 2 - LOAD_4_PARA - - lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 - lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 - - movdqa xmm0, [r2] - movdqa xmm1, [r2+r3] - movdqa xmm2, [r2+2*r3] - movdqa xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqa xmm4, [r2] - movdqa xmm5, [r2+r3] - movdqa xmm6, [r2+2*r3] - movdqa xmm7, [r2+r5] - lea r2, [r2+4*r3] - - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - lea r0, [r0+4*r1] - - movdqa xmm0, [r2] - movdqa xmm1, [r2+r3] - movdqa xmm2, [r2+2*r3] - movdqa xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqa xmm4, [r2] - movdqa xmm5, [r2+r3] - movdqa xmm6, [r2+2*r3] - movdqa xmm7, [r2+r5] - - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - LOAD_4_PARA_POP - pop r5 - pop r4 - ret - -;*********************************************************************** -; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst, -; int32_t iStrideD, -; uint8_t* Src, -; int32_t iStrideS ) -;*********************************************************************** -ALIGN 16 -; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011 -WelsCopy16x16NotAligned_sse2: - ;push esi - ;push edi - ;push ebx - - ;mov edi, [esp+16] ; Dst - ;mov eax, [esp+20] ; iStrideD - ;mov esi, [esp+24] ; Src - ;mov ecx, [esp+28] ; iStrideS - - push r4 - push r5 - %assign push_num 2 - LOAD_4_PARA - - lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 - lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 - - movdqu xmm0, [r2] - movdqu xmm1, [r2+r3] - movdqu xmm2, [r2+2*r3] - movdqu xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqu xmm4, [r2] - movdqu xmm5, [r2+r3] - movdqu xmm6, [r2+2*r3] - movdqu xmm7, [r2+r5] - lea r2, [r2+4*r3] - - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - lea r0, [r0+4*r1] - - movdqu xmm0, [r2] - movdqu xmm1, [r2+r3] - movdqu xmm2, [r2+2*r3] - movdqu xmm3, [r2+r5] - lea 
r2, [r2+4*r3] - movdqu xmm4, [r2] - movdqu xmm5, [r2+r3] - movdqu xmm6, [r2+2*r3] - movdqu xmm7, [r2+r5] - - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - LOAD_4_PARA_POP - pop r5 - pop r4 - ret - -; , 12/29/2011 -;*********************************************************************** -; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst, -; int32_t iStrideD, -; uint8_t* Src, -; int32_t iStrideS ) -;*********************************************************************** -ALIGN 16 -WelsCopy16x8NotAligned_sse2: - ;push esi - ;push edi - ;push ebx - - ;mov edi, [esp+16] ; Dst - ;mov eax, [esp+20] ; iStrideD - ;mov esi, [esp+24] ; Src - ;mov ecx, [esp+28] ; iStrideS - - push r4 - push r5 - %assign push_num 2 - LOAD_4_PARA - - lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 - lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 - - movdqu xmm0, [r2] - movdqu xmm1, [r2+r3] - movdqu xmm2, [r2+2*r3] - movdqu xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqu xmm4, [r2] - movdqu xmm5, [r2+r3] - movdqu xmm6, [r2+2*r3] - movdqu xmm7, [r2+r5] - - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - LOAD_4_PARA_POP - pop r5 - pop r4 - ret - - -;*********************************************************************** -; void WelsCopy8x16_mmx(uint8_t* Dst, -; int32_t iStrideD, -; uint8_t* Src, -; int32_t iStrideS ) -;*********************************************************************** -ALIGN 16 -WelsCopy8x16_mmx: - ;push ebx - - ;mov eax, [esp + 8 ] ;Dst - ;mov ecx, [esp + 12] ;iStrideD - ;mov ebx, [esp + 16] ;Src - ;mov edx, [esp + 20] ;iStrideS - - %assign push_num 0 - LOAD_4_PARA - - movq mm0, [r2] - movq mm1, [r2+r3] - lea r2, [r2+2*r3] - movq mm2, [r2] - movq mm3, [r2+r3] - lea r2, [r2+2*r3] - movq mm4, [r2] - movq mm5, [r2+r3] - lea r2, [r2+2*r3] - movq mm6, [r2] - movq mm7, [r2+r3] - lea r2, [r2+2*r3] - - movq [r0], mm0 - movq [r0+r1], mm1 - lea r0, [r0+2*r1] - movq [r0], mm2 - movq [r0+r1], mm3 - lea r0, [r0+2*r1] - movq [r0], mm4 - movq [r0+r1], mm5 - lea r0, [r0+2*r1] - movq [r0], mm6 - movq [r0+r1], mm7 - lea r0, [r0+2*r1] - - movq mm0, [r2] - movq mm1, [r2+r3] - lea r2, [r2+2*r3] - movq mm2, [r2] - movq mm3, [r2+r3] - lea r2, [r2+2*r3] - movq mm4, [r2] - movq mm5, [r2+r3] - lea r2, [r2+2*r3] - movq mm6, [r2] - movq mm7, [r2+r3] - - movq [r0], mm0 - movq [r0+r1], mm1 - lea r0, [r0+2*r1] - movq [r0], mm2 - movq [r0+r1], mm3 - lea r0, [r0+2*r1] - movq [r0], mm4 - movq [r0+r1], mm5 - lea r0, [r0+2*r1] - movq [r0], mm6 - movq [r0+r1], mm7 - - WELSEMMS - LOAD_4_PARA_POP - ret - -;*********************************************************************** -; void WelsCopy8x8_mmx( uint8_t* Dst, -; int32_t iStrideD, -; uint8_t* Src, -; int32_t iStrideS ) -;*********************************************************************** -ALIGN 16 -WelsCopy8x8_mmx: - ;push ebx - ;push esi - ;mov eax, [esp + 12] ;Dst - ;mov ecx, [esp + 16] ;iStrideD - ;mov esi, [esp + 20] ;Src - ;mov ebx, [esp + 24] ;iStrideS - - push r4 - %assign push_num 1 - LOAD_4_PARA - lea r4, [r3+2*r3] ;edx, [ebx+2*ebx] - - ; to prefetch next loop - prefetchnta [r2+2*r3] - prefetchnta [r2+r4] - movq mm0, [r2] - movq mm1, [r2+r3] - lea r2, [r2+2*r3] - ; to prefetch next loop - prefetchnta [r2+2*r3] - prefetchnta [r2+r4] - movq mm2, [r2] - movq mm3, [r2+r3] - lea r2, 
[r2+2*r3] - ; to prefetch next loop - prefetchnta [r2+2*r3] - prefetchnta [r2+r4] - movq mm4, [r2] - movq mm5, [r2+r3] - lea r2, [r2+2*r3] - movq mm6, [r2] - movq mm7, [r2+r3] - - movq [r0], mm0 - movq [r0+r1], mm1 - lea r0, [r0+2*r1] - movq [r0], mm2 - movq [r0+r1], mm3 - lea r0, [r0+2*r1] - movq [r0], mm4 - movq [r0+r1], mm5 - lea r0, [r0+2*r1] - movq [r0], mm6 - movq [r0+r1], mm7 - - WELSEMMS - ;pop esi - ;pop ebx - LOAD_4_PARA_POP - pop r4 - ret - -; (dunhuang@cisco), 12/21/2011 -;*********************************************************************** -; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv ) -;*********************************************************************** -ALIGN 16 -UpdateMbMv_sse2: - - %assign push_num 0 - LOAD_2_PARA - - ;mov eax, [esp+4] ; mv_buffer - ;movd xmm0, [esp+8] ; _mv - movd xmm0, r1d ; _mv - pshufd xmm1, xmm0, $0 - movdqa [r0 ], xmm1 - movdqa [r0+0x10], xmm1 - movdqa [r0+0x20], xmm1 - movdqa [r0+0x30], xmm1 - ret - -;******************************************************************************* -; Macros and other preprocessor constants -;******************************************************************************* - -;******************************************************************************* -; Local Data (Read Only) -;******************************************************************************* - -;SECTION .rodata data align=16 - -;******************************************************************************* -; Various memory constants (trigonometric values or rounding values) -;******************************************************************************* - -ALIGN 16 - -;******************************************************************************* -; Code -;******************************************************************************* - -SECTION .text - -WELS_EXTERN PixelAvgWidthEq4_mmx -WELS_EXTERN PixelAvgWidthEq8_mmx -WELS_EXTERN PixelAvgWidthEq16_sse2 - -WELS_EXTERN McCopyWidthEq4_mmx -WELS_EXTERN McCopyWidthEq8_mmx -WELS_EXTERN McCopyWidthEq16_sse2 - - -ALIGN 16 -;******************************************************************************* -; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride, -; uint8_t *pSrcA, int iSrcAStride, -; uint8_t *pSrcB, int iSrcBStride, -; int iHeight ); -;******************************************************************************* -PixelAvgWidthEq4_mmx: - - %assign push_num 0 - LOAD_7_PARA - -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r5, r5d - movsx r6, r6d -%endif - -ALIGN 4 -.height_loop: - movd mm0, [r4] - pavgb mm0, [r2] - movd [r0], mm0 - - dec r6 - lea r0, [r0+r1] - lea r2, [r2+r3] - lea r4, [r4+r5] - jne .height_loop - - WELSEMMS - LOAD_7_PARA_POP - ret - - -ALIGN 16 -;******************************************************************************* -; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride, -; uint8_t *pSrcA, int iSrcAStride, -; uint8_t *pSrcB, int iSrcBStride, -; int iHeight ); -;******************************************************************************* -PixelAvgWidthEq8_mmx: - - ;push esi - ;push edi - ;push ebp - ;push ebx - - ;mov edi, [esp+20] ; pDst - ;mov eax, [esp+24] ; iDstStride - ;mov esi, [esp+28] ; pSrcA - ;mov ecx, [esp+32] ; iSrcAStride - ;mov ebp, [esp+36] ; pSrcB - ;mov edx, [esp+40] ; iSrcBStride - ;mov ebx, [esp+44] ; iHeight - - %assign push_num 0 - LOAD_7_PARA - -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r5, r5d - movsx r6, r6d -%endif - -ALIGN 4 -.height_loop: - movq mm0, [r2] - pavgb mm0, 
[r4] - movq [r0], mm0 - movq mm0, [r2+r3] - pavgb mm0, [r4+r5] - movq [r0+r1], mm0 - - lea r2, [r2+2*r3] - lea r4, [r4+2*r5] - lea r0, [r0+2*r1] - - sub r6, 2 - jnz .height_loop - - WELSEMMS - LOAD_7_PARA_POP - ret - - - -ALIGN 16 -;******************************************************************************* -; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride, -; uint8_t *pSrcA, int iSrcAStride, -; uint8_t *pSrcB, int iSrcBStride, -; int iHeight ); -;******************************************************************************* -PixelAvgWidthEq16_sse2: - - %assign push_num 0 - LOAD_7_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r5, r5d - movsx r6, r6d -%endif -ALIGN 4 -.height_loop: - movdqu xmm0, [r2] - movdqu xmm1, [r4] - pavgb xmm0, xmm1 - ;pavgb xmm0, [r4] - movdqu [r0], xmm0 - - movdqu xmm0, [r2+r3] - movdqu xmm1, [r4+r5] - pavgb xmm0, xmm1 - movdqu [r0+r1], xmm0 - - movdqu xmm0, [r2+2*r3] - movdqu xmm1, [r4+2*r5] - pavgb xmm0, xmm1 - movdqu [r0+2*r1], xmm0 - - lea r2, [r2+2*r3] - lea r4, [r4+2*r5] - lea r0, [r0+2*r1] - - movdqu xmm0, [r2+r3] - movdqu xmm1, [r4+r5] - pavgb xmm0, xmm1 - movdqu [r0+r1], xmm0 - - lea r2, [r2+2*r3] - lea r4, [r4+2*r5] - lea r0, [r0+2*r1] - - sub r6, 4 - jne .height_loop - - WELSEMMS - LOAD_7_PARA_POP - ret - -ALIGN 16 -;******************************************************************************* -; void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride, -; uint8_t *pDst, int iDstStride, int iHeight ) -;******************************************************************************* -McCopyWidthEq4_mmx: - ;push esi - ;push edi - ;push ebx - - - ;mov esi, [esp+16] - ;mov eax, [esp+20] - ;mov edi, [esp+24] - ;mov ecx, [esp+28] - ;mov edx, [esp+32] - - push r5 - %assign push_num 1 - LOAD_5_PARA - -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d -%endif - -ALIGN 4 -.height_loop: - mov r5d, [r0] - mov [r2], r5d - - add r0, r1 - add r2, r3 - dec r4 - jnz .height_loop - WELSEMMS - LOAD_5_PARA_POP - pop r5 - ret - -ALIGN 16 -;******************************************************************************* -; void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride, -; uint8_t *pDst, int iDstStride, int iHeight ) -;******************************************************************************* -McCopyWidthEq8_mmx: - ;push esi - ;push edi - ;mov esi, [esp+12] - ;mov eax, [esp+16] - ;mov edi, [esp+20] - ;mov ecx, [esp+24] - ;mov edx, [esp+28] - - %assign push_num 0 - LOAD_5_PARA - -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d -%endif - -ALIGN 4 -.height_loop: - movq mm0, [r0] - movq [r2], mm0 - add r0, r1 - add r2, r3 - dec r4 - jnz .height_loop - - WELSEMMS - LOAD_5_PARA_POP - ret - - -ALIGN 16 -;******************************************************************************* -; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight ) -;******************************************************************************* -;read unaligned memory -%macro SSE_READ_UNA 2 - movq %1, [%2] - movhps %1, [%2+8] -%endmacro - -;write unaligned memory -%macro SSE_WRITE_UNA 2 - movq [%1], %2 - movhps [%1+8], %2 -%endmacro -McCopyWidthEq16_sse2: - ;push esi - ;push edi - - ;mov esi, [esp+12] ; pSrc - ;mov eax, [esp+16] ; iSrcStride - ;mov edi, [esp+20] ; pDst - ;mov edx, [esp+24] ; iDstStride - ;mov ecx, [esp+28] ; iHeight - - %assign push_num 0 - LOAD_5_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d -%endif -ALIGN 4 -.height_loop: - SSE_READ_UNA 
xmm0, r0 - SSE_READ_UNA xmm1, r0+r1 - SSE_WRITE_UNA r2, xmm0 - SSE_WRITE_UNA r2+r3, xmm1 - - sub r4, 2 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - jnz .height_loop - - LOAD_5_PARA_POP - ret +;*! +;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* mb_copy.asm +;* +;* Abstract +;* mb_copy and mb_copy1 +;* +;* History +;* 15/09/2009 Created +;* 12/28/2009 Modified with larger throughput +;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, +;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc; +;* +;* +;*********************************************************************************************/ +%include "asm_inc.asm" + +;*********************************************************************** +; Macros and other preprocessor constants +;*********************************************************************** + +;*********************************************************************** +; Code +;*********************************************************************** + +SECTION .text + +WELS_EXTERN WelsCopy16x16_sse2 +WELS_EXTERN WelsCopy16x16NotAligned_sse2 +WELS_EXTERN WelsCopy8x8_mmx +WELS_EXTERN WelsCopy16x8NotAligned_sse2 ; +WELS_EXTERN WelsCopy8x16_mmx ; +WELS_EXTERN UpdateMbMv_sse2 ; + +;*********************************************************************** +; void WelsCopy16x16_sse2( uint8_t* Dst, +; int32_t iStrideD, +; uint8_t* Src, +; int32_t iStrideS ) +;*********************************************************************** +ALIGN 16 +WelsCopy16x16_sse2: + + push r4 + push r5 + %assign push_num 2 + LOAD_4_PARA + + lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 + lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 + + movdqa xmm0, [r2] + movdqa xmm1, [r2+r3] + movdqa xmm2, [r2+2*r3] + movdqa xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqa xmm4, [r2] + movdqa xmm5, [r2+r3] + movdqa xmm6, [r2+2*r3] + movdqa xmm7, [r2+r5] + lea r2, [r2+4*r3] + + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + lea r0, [r0+4*r1] + + movdqa xmm0, [r2] + movdqa xmm1, [r2+r3] + movdqa xmm2, [r2+2*r3] 
+ movdqa xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqa xmm4, [r2] + movdqa xmm5, [r2+r3] + movdqa xmm6, [r2+2*r3] + movdqa xmm7, [r2+r5] + + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + LOAD_4_PARA_POP + pop r5 + pop r4 + ret + +;*********************************************************************** +; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst, +; int32_t iStrideD, +; uint8_t* Src, +; int32_t iStrideS ) +;*********************************************************************** +ALIGN 16 +; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011 +WelsCopy16x16NotAligned_sse2: + ;push esi + ;push edi + ;push ebx + + ;mov edi, [esp+16] ; Dst + ;mov eax, [esp+20] ; iStrideD + ;mov esi, [esp+24] ; Src + ;mov ecx, [esp+28] ; iStrideS + + push r4 + push r5 + %assign push_num 2 + LOAD_4_PARA + + lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 + lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 + + movdqu xmm0, [r2] + movdqu xmm1, [r2+r3] + movdqu xmm2, [r2+2*r3] + movdqu xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqu xmm4, [r2] + movdqu xmm5, [r2+r3] + movdqu xmm6, [r2+2*r3] + movdqu xmm7, [r2+r5] + lea r2, [r2+4*r3] + + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + lea r0, [r0+4*r1] + + movdqu xmm0, [r2] + movdqu xmm1, [r2+r3] + movdqu xmm2, [r2+2*r3] + movdqu xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqu xmm4, [r2] + movdqu xmm5, [r2+r3] + movdqu xmm6, [r2+2*r3] + movdqu xmm7, [r2+r5] + + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + LOAD_4_PARA_POP + pop r5 + pop r4 + ret + +; , 12/29/2011 +;*********************************************************************** +; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst, +; int32_t iStrideD, +; uint8_t* Src, +; int32_t iStrideS ) +;*********************************************************************** +ALIGN 16 +WelsCopy16x8NotAligned_sse2: + ;push esi + ;push edi + ;push ebx + + ;mov edi, [esp+16] ; Dst + ;mov eax, [esp+20] ; iStrideD + ;mov esi, [esp+24] ; Src + ;mov ecx, [esp+28] ; iStrideS + + push r4 + push r5 + %assign push_num 2 + LOAD_4_PARA + + lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 + lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 + + movdqu xmm0, [r2] + movdqu xmm1, [r2+r3] + movdqu xmm2, [r2+2*r3] + movdqu xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqu xmm4, [r2] + movdqu xmm5, [r2+r3] + movdqu xmm6, [r2+2*r3] + movdqu xmm7, [r2+r5] + + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + LOAD_4_PARA_POP + pop r5 + pop r4 + ret + + +;*********************************************************************** +; void WelsCopy8x16_mmx(uint8_t* Dst, +; int32_t iStrideD, +; uint8_t* Src, +; int32_t iStrideS ) +;*********************************************************************** +ALIGN 16 +WelsCopy8x16_mmx: + ;push ebx + + ;mov eax, [esp + 8 ] ;Dst + ;mov ecx, [esp + 12] ;iStrideD + ;mov ebx, [esp + 16] ;Src + ;mov edx, [esp + 20] ;iStrideS + + %assign push_num 0 + LOAD_4_PARA + + movq mm0, [r2] + movq mm1, [r2+r3] + lea r2, [r2+2*r3] + movq mm2, [r2] + 
movq mm3, [r2+r3] + lea r2, [r2+2*r3] + movq mm4, [r2] + movq mm5, [r2+r3] + lea r2, [r2+2*r3] + movq mm6, [r2] + movq mm7, [r2+r3] + lea r2, [r2+2*r3] + + movq [r0], mm0 + movq [r0+r1], mm1 + lea r0, [r0+2*r1] + movq [r0], mm2 + movq [r0+r1], mm3 + lea r0, [r0+2*r1] + movq [r0], mm4 + movq [r0+r1], mm5 + lea r0, [r0+2*r1] + movq [r0], mm6 + movq [r0+r1], mm7 + lea r0, [r0+2*r1] + + movq mm0, [r2] + movq mm1, [r2+r3] + lea r2, [r2+2*r3] + movq mm2, [r2] + movq mm3, [r2+r3] + lea r2, [r2+2*r3] + movq mm4, [r2] + movq mm5, [r2+r3] + lea r2, [r2+2*r3] + movq mm6, [r2] + movq mm7, [r2+r3] + + movq [r0], mm0 + movq [r0+r1], mm1 + lea r0, [r0+2*r1] + movq [r0], mm2 + movq [r0+r1], mm3 + lea r0, [r0+2*r1] + movq [r0], mm4 + movq [r0+r1], mm5 + lea r0, [r0+2*r1] + movq [r0], mm6 + movq [r0+r1], mm7 + + WELSEMMS + LOAD_4_PARA_POP + ret + +;*********************************************************************** +; void WelsCopy8x8_mmx( uint8_t* Dst, +; int32_t iStrideD, +; uint8_t* Src, +; int32_t iStrideS ) +;*********************************************************************** +ALIGN 16 +WelsCopy8x8_mmx: + ;push ebx + ;push esi + ;mov eax, [esp + 12] ;Dst + ;mov ecx, [esp + 16] ;iStrideD + ;mov esi, [esp + 20] ;Src + ;mov ebx, [esp + 24] ;iStrideS + + push r4 + %assign push_num 1 + LOAD_4_PARA + lea r4, [r3+2*r3] ;edx, [ebx+2*ebx] + + ; to prefetch next loop + prefetchnta [r2+2*r3] + prefetchnta [r2+r4] + movq mm0, [r2] + movq mm1, [r2+r3] + lea r2, [r2+2*r3] + ; to prefetch next loop + prefetchnta [r2+2*r3] + prefetchnta [r2+r4] + movq mm2, [r2] + movq mm3, [r2+r3] + lea r2, [r2+2*r3] + ; to prefetch next loop + prefetchnta [r2+2*r3] + prefetchnta [r2+r4] + movq mm4, [r2] + movq mm5, [r2+r3] + lea r2, [r2+2*r3] + movq mm6, [r2] + movq mm7, [r2+r3] + + movq [r0], mm0 + movq [r0+r1], mm1 + lea r0, [r0+2*r1] + movq [r0], mm2 + movq [r0+r1], mm3 + lea r0, [r0+2*r1] + movq [r0], mm4 + movq [r0+r1], mm5 + lea r0, [r0+2*r1] + movq [r0], mm6 + movq [r0+r1], mm7 + + WELSEMMS + ;pop esi + ;pop ebx + LOAD_4_PARA_POP + pop r4 + ret + +; (dunhuang@cisco), 12/21/2011 +;*********************************************************************** +; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv ) +;*********************************************************************** +ALIGN 16 +UpdateMbMv_sse2: + + %assign push_num 0 + LOAD_2_PARA + + ;mov eax, [esp+4] ; mv_buffer + ;movd xmm0, [esp+8] ; _mv + movd xmm0, r1d ; _mv + pshufd xmm1, xmm0, $0 + movdqa [r0 ], xmm1 + movdqa [r0+0x10], xmm1 + movdqa [r0+0x20], xmm1 + movdqa [r0+0x30], xmm1 + ret + +;******************************************************************************* +; Macros and other preprocessor constants +;******************************************************************************* + +;******************************************************************************* +; Local Data (Read Only) +;******************************************************************************* + +;SECTION .rodata data align=16 + +;******************************************************************************* +; Various memory constants (trigonometric values or rounding values) +;******************************************************************************* + +ALIGN 16 + +;******************************************************************************* +; Code +;******************************************************************************* + +SECTION .text + +WELS_EXTERN PixelAvgWidthEq4_mmx +WELS_EXTERN PixelAvgWidthEq8_mmx +WELS_EXTERN 
PixelAvgWidthEq16_sse2 + +WELS_EXTERN McCopyWidthEq4_mmx +WELS_EXTERN McCopyWidthEq8_mmx +WELS_EXTERN McCopyWidthEq16_sse2 + + +ALIGN 16 +;******************************************************************************* +; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride, +; uint8_t *pSrcA, int iSrcAStride, +; uint8_t *pSrcB, int iSrcBStride, +; int iHeight ); +;******************************************************************************* +PixelAvgWidthEq4_mmx: + + %assign push_num 0 + LOAD_7_PARA + +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r5, r5d + movsx r6, r6d +%endif + +ALIGN 4 +.height_loop: + movd mm0, [r4] + pavgb mm0, [r2] + movd [r0], mm0 + + dec r6 + lea r0, [r0+r1] + lea r2, [r2+r3] + lea r4, [r4+r5] + jne .height_loop + + WELSEMMS + LOAD_7_PARA_POP + ret + + +ALIGN 16 +;******************************************************************************* +; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride, +; uint8_t *pSrcA, int iSrcAStride, +; uint8_t *pSrcB, int iSrcBStride, +; int iHeight ); +;******************************************************************************* +PixelAvgWidthEq8_mmx: + + ;push esi + ;push edi + ;push ebp + ;push ebx + + ;mov edi, [esp+20] ; pDst + ;mov eax, [esp+24] ; iDstStride + ;mov esi, [esp+28] ; pSrcA + ;mov ecx, [esp+32] ; iSrcAStride + ;mov ebp, [esp+36] ; pSrcB + ;mov edx, [esp+40] ; iSrcBStride + ;mov ebx, [esp+44] ; iHeight + + %assign push_num 0 + LOAD_7_PARA + +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r5, r5d + movsx r6, r6d +%endif + +ALIGN 4 +.height_loop: + movq mm0, [r2] + pavgb mm0, [r4] + movq [r0], mm0 + movq mm0, [r2+r3] + pavgb mm0, [r4+r5] + movq [r0+r1], mm0 + + lea r2, [r2+2*r3] + lea r4, [r4+2*r5] + lea r0, [r0+2*r1] + + sub r6, 2 + jnz .height_loop + + WELSEMMS + LOAD_7_PARA_POP + ret + + + +ALIGN 16 +;******************************************************************************* +; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride, +; uint8_t *pSrcA, int iSrcAStride, +; uint8_t *pSrcB, int iSrcBStride, +; int iHeight ); +;******************************************************************************* +PixelAvgWidthEq16_sse2: + + %assign push_num 0 + LOAD_7_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r5, r5d + movsx r6, r6d +%endif +ALIGN 4 +.height_loop: + movdqu xmm0, [r2] + movdqu xmm1, [r4] + pavgb xmm0, xmm1 + ;pavgb xmm0, [r4] + movdqu [r0], xmm0 + + movdqu xmm0, [r2+r3] + movdqu xmm1, [r4+r5] + pavgb xmm0, xmm1 + movdqu [r0+r1], xmm0 + + movdqu xmm0, [r2+2*r3] + movdqu xmm1, [r4+2*r5] + pavgb xmm0, xmm1 + movdqu [r0+2*r1], xmm0 + + lea r2, [r2+2*r3] + lea r4, [r4+2*r5] + lea r0, [r0+2*r1] + + movdqu xmm0, [r2+r3] + movdqu xmm1, [r4+r5] + pavgb xmm0, xmm1 + movdqu [r0+r1], xmm0 + + lea r2, [r2+2*r3] + lea r4, [r4+2*r5] + lea r0, [r0+2*r1] + + sub r6, 4 + jne .height_loop + + WELSEMMS + LOAD_7_PARA_POP + ret + +ALIGN 16 +;******************************************************************************* +; void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride, +; uint8_t *pDst, int iDstStride, int iHeight ) +;******************************************************************************* +McCopyWidthEq4_mmx: + ;push esi + ;push edi + ;push ebx + + + ;mov esi, [esp+16] + ;mov eax, [esp+20] + ;mov edi, [esp+24] + ;mov ecx, [esp+28] + ;mov edx, [esp+32] + + push r5 + %assign push_num 1 + LOAD_5_PARA + +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d +%endif + +ALIGN 4 +.height_loop: + mov r5d, [r0] + mov [r2], r5d + + add r0, r1 + 
add r2, r3 + dec r4 + jnz .height_loop + WELSEMMS + LOAD_5_PARA_POP + pop r5 + ret + +ALIGN 16 +;******************************************************************************* +; void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride, +; uint8_t *pDst, int iDstStride, int iHeight ) +;******************************************************************************* +McCopyWidthEq8_mmx: + ;push esi + ;push edi + ;mov esi, [esp+12] + ;mov eax, [esp+16] + ;mov edi, [esp+20] + ;mov ecx, [esp+24] + ;mov edx, [esp+28] + + %assign push_num 0 + LOAD_5_PARA + +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d +%endif + +ALIGN 4 +.height_loop: + movq mm0, [r0] + movq [r2], mm0 + add r0, r1 + add r2, r3 + dec r4 + jnz .height_loop + + WELSEMMS + LOAD_5_PARA_POP + ret + + +ALIGN 16 +;******************************************************************************* +; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight ) +;******************************************************************************* +;read unaligned memory +%macro SSE_READ_UNA 2 + movq %1, [%2] + movhps %1, [%2+8] +%endmacro + +;write unaligned memory +%macro SSE_WRITE_UNA 2 + movq [%1], %2 + movhps [%1+8], %2 +%endmacro +McCopyWidthEq16_sse2: + ;push esi + ;push edi + + ;mov esi, [esp+12] ; pSrc + ;mov eax, [esp+16] ; iSrcStride + ;mov edi, [esp+20] ; pDst + ;mov edx, [esp+24] ; iDstStride + ;mov ecx, [esp+28] ; iHeight + + %assign push_num 0 + LOAD_5_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d +%endif +ALIGN 4 +.height_loop: + SSE_READ_UNA xmm0, r0 + SSE_READ_UNA xmm1, r0+r1 + SSE_WRITE_UNA r2, xmm0 + SSE_WRITE_UNA r2+r3, xmm1 + + sub r4, 2 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + jnz .height_loop + + LOAD_5_PARA_POP + ret diff --git a/codec/common/mc_chroma.asm b/codec/common/mc_chroma.asm index d0ff91e2..a783152b 100644 --- a/codec/common/mc_chroma.asm +++ b/codec/common/mc_chroma.asm @@ -1,345 +1,345 @@ -;*! -;* \copy -;* Copyright (c) 2004-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. 
-;* -;* -;* mc_chroma.asm -;* -;* Abstract -;* mmx motion compensation for chroma -;* -;* History -;* 10/13/2004 Created -;* -;* -;*************************************************************************/ -%include "asm_inc.asm" - -;*********************************************************************** -; Local Data (Read Only) -;*********************************************************************** - -SECTION .rodata align=16 - -;*********************************************************************** -; Various memory constants (trigonometric values or rounding values) -;*********************************************************************** - -ALIGN 16 -h264_d0x20_sse2: - dw 32,32,32,32,32,32,32,32 -ALIGN 16 -h264_d0x20_mmx: - dw 32,32,32,32 - - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -ALIGN 16 -;******************************************************************************* -; void McChromaWidthEq4_mmx( uint8_t *src, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iHeigh ); -;******************************************************************************* -WELS_EXTERN McChromaWidthEq4_mmx -McChromaWidthEq4_mmx: - ;push esi - ;push edi - ;push ebx - - %assign push_num 0 - LOAD_6_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r5, r5d -%endif - - ;mov eax, [esp +12 + 20] - - movd mm3, [r4]; [eax] - WELS_Zero mm7 - punpcklbw mm3, mm3 - movq mm4, mm3 - punpcklwd mm3, mm3 - punpckhwd mm4, mm4 - - movq mm5, mm3 - punpcklbw mm3, mm7 - punpckhbw mm5, mm7 - - movq mm6, mm4 - punpcklbw mm4, mm7 - punpckhbw mm6, mm7 - - ;mov esi, [esp +12+ 4] - ;mov eax, [esp + 12 + 8] - ;mov edi, [esp + 12 + 12] - ;mov edx, [esp + 12 + 16] - ;mov ecx, [esp + 12 + 24] - - lea r4, [r0 + r1] ;lea ebx, [esi + eax] - movd mm0, [r0] - movd mm1, [r0+1] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 -.xloop: - - pmullw mm0, mm3 - pmullw mm1, mm5 - paddw mm0, mm1 - - movd mm1, [r4] - punpcklbw mm1, mm7 - movq mm2, mm1 - pmullw mm1, mm4 - paddw mm0, mm1 - - movd mm1, [r4+1] - punpcklbw mm1, mm7 - movq mm7, mm1 - pmullw mm1,mm6 - paddw mm0, mm1 - movq mm1,mm7 - - paddw mm0, [h264_d0x20_mmx] - psrlw mm0, 6 - - WELS_Zero mm7 - packuswb mm0, mm7 - movd [r2], mm0 - - movq mm0, mm2 - - lea r2, [r2 + r3] - lea r4, [r4 + r1] - - dec r5 - jnz near .xloop - WELSEMMS - LOAD_6_PARA_POP - ;pop ebx - ;pop edi - ;pop esi - ret - - -ALIGN 16 -;******************************************************************************* -; void McChromaWidthEq8_sse2( uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iheigh ); -;******************************************************************************* -WELS_EXTERN McChromaWidthEq8_sse2 -McChromaWidthEq8_sse2: - ;push esi - ;push edi - ;push ebx - - %assign push_num 0 - LOAD_6_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r5, r5d -%endif - - ;mov eax, [esp +12 + 20] - movd xmm3, [r4] - WELS_Zero xmm7 - punpcklbw xmm3, xmm3 - punpcklwd xmm3, xmm3 - - movdqa xmm4, xmm3 - punpckldq xmm3, xmm3 - punpckhdq xmm4, xmm4 - movdqa xmm5, xmm3 - movdqa xmm6, xmm4 - - punpcklbw xmm3, xmm7 - punpckhbw xmm5, xmm7 - punpcklbw xmm4, xmm7 - punpckhbw xmm6, xmm7 - - ;mov esi, [esp +12+ 4] - ;mov eax, [esp + 12 + 8] - ;mov edi, [esp + 12 + 12] - ;mov edx, [esp + 12 + 16] - ;mov ecx, [esp + 12 + 24] - - lea r4, [r0 + r1] ;lea ebx, [esi + eax] - 
movq xmm0, [r0] - movq xmm1, [r0+1] - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 -.xloop: - - pmullw xmm0, xmm3 - pmullw xmm1, xmm5 - paddw xmm0, xmm1 - - movq xmm1, [r4] - punpcklbw xmm1, xmm7 - movdqa xmm2, xmm1 - pmullw xmm1, xmm4 - paddw xmm0, xmm1 - - movq xmm1, [r4+1] - punpcklbw xmm1, xmm7 - movdqa xmm7, xmm1 - pmullw xmm1, xmm6 - paddw xmm0, xmm1 - movdqa xmm1,xmm7 - - paddw xmm0, [h264_d0x20_sse2] - psrlw xmm0, 6 - - WELS_Zero xmm7 - packuswb xmm0, xmm7 - movq [r2], xmm0 - - movdqa xmm0, xmm2 - - lea r2, [r2 + r3] - lea r4, [r4 + r1] - - dec r5 - jnz near .xloop - - LOAD_6_PARA_POP - - ;pop ebx - ;pop edi - ;pop esi - ret - - - - -ALIGN 16 -;*********************************************************************** -; void McChromaWidthEq8_ssse3( uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iHeigh); -;*********************************************************************** -WELS_EXTERN McChromaWidthEq8_ssse3 -McChromaWidthEq8_ssse3: - ;push ebx - ;push esi - ;push edi - %assign push_num 0 - LOAD_6_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r5, r5d -%endif - - ;mov eax, [esp + 12 + 20] - - pxor xmm7, xmm7 - movd xmm5, [r4] - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - movdqa xmm6, xmm5 - punpcklqdq xmm5, xmm5 - punpckhqdq xmm6, xmm6 - - ;mov eax, [esp + 12 + 4] - ;mov edx, [esp + 12 + 8] - ;mov esi, [esp + 12 + 12] - ;mov edi, [esp + 12 + 16] - ;mov ecx, [esp + 12 + 24] - - sub r2, r3 ;sub esi, edi - sub r2, r3 - movdqa xmm7, [h264_d0x20_sse2] - - movdqu xmm0, [r0] - movdqa xmm1, xmm0 - psrldq xmm1, 1 - punpcklbw xmm0, xmm1 - -.hloop_chroma: - lea r2, [r2+2*r3] - - movdqu xmm2, [r0+r1] - movdqa xmm3, xmm2 - psrldq xmm3, 1 - punpcklbw xmm2, xmm3 - movdqa xmm4, xmm2 - - pmaddubsw xmm0, xmm5 - pmaddubsw xmm2, xmm6 - paddw xmm0, xmm2 - paddw xmm0, xmm7 - psrlw xmm0, 6 - packuswb xmm0, xmm0 - movq [r2],xmm0 - - lea r0, [r0+2*r1] - movdqu xmm2, [r0] - movdqa xmm3, xmm2 - psrldq xmm3, 1 - punpcklbw xmm2, xmm3 - movdqa xmm0, xmm2 - - pmaddubsw xmm4, xmm5 - pmaddubsw xmm2, xmm6 - paddw xmm4, xmm2 - paddw xmm4, xmm7 - psrlw xmm4, 6 - packuswb xmm4, xmm4 - movq [r2+r3],xmm4 - - sub r5, 2 - jnz .hloop_chroma - - LOAD_6_PARA_POP - - ;pop edi - ;pop esi - ;pop ebx - - ret - - +;*! +;* \copy +;* Copyright (c) 2004-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* mc_chroma.asm +;* +;* Abstract +;* mmx motion compensation for chroma +;* +;* History +;* 10/13/2004 Created +;* +;* +;*************************************************************************/ +%include "asm_inc.asm" + +;*********************************************************************** +; Local Data (Read Only) +;*********************************************************************** + +SECTION .rodata align=16 + +;*********************************************************************** +; Various memory constants (trigonometric values or rounding values) +;*********************************************************************** + +ALIGN 16 +h264_d0x20_sse2: + dw 32,32,32,32,32,32,32,32 +ALIGN 16 +h264_d0x20_mmx: + dw 32,32,32,32 + + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +ALIGN 16 +;******************************************************************************* +; void McChromaWidthEq4_mmx( uint8_t *src, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iHeigh ); +;******************************************************************************* +WELS_EXTERN McChromaWidthEq4_mmx +McChromaWidthEq4_mmx: + ;push esi + ;push edi + ;push ebx + + %assign push_num 0 + LOAD_6_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r5, r5d +%endif + + ;mov eax, [esp +12 + 20] + + movd mm3, [r4]; [eax] + WELS_Zero mm7 + punpcklbw mm3, mm3 + movq mm4, mm3 + punpcklwd mm3, mm3 + punpckhwd mm4, mm4 + + movq mm5, mm3 + punpcklbw mm3, mm7 + punpckhbw mm5, mm7 + + movq mm6, mm4 + punpcklbw mm4, mm7 + punpckhbw mm6, mm7 + + ;mov esi, [esp +12+ 4] + ;mov eax, [esp + 12 + 8] + ;mov edi, [esp + 12 + 12] + ;mov edx, [esp + 12 + 16] + ;mov ecx, [esp + 12 + 24] + + lea r4, [r0 + r1] ;lea ebx, [esi + eax] + movd mm0, [r0] + movd mm1, [r0+1] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 +.xloop: + + pmullw mm0, mm3 + pmullw mm1, mm5 + paddw mm0, mm1 + + movd mm1, [r4] + punpcklbw mm1, mm7 + movq mm2, mm1 + pmullw mm1, mm4 + paddw mm0, mm1 + + movd mm1, [r4+1] + punpcklbw mm1, mm7 + movq mm7, mm1 + pmullw mm1,mm6 + paddw mm0, mm1 + movq mm1,mm7 + + paddw mm0, [h264_d0x20_mmx] + psrlw mm0, 6 + + WELS_Zero mm7 + packuswb mm0, mm7 + movd [r2], mm0 + + movq mm0, mm2 + + lea r2, [r2 + r3] + lea r4, [r4 + r1] + + dec r5 + jnz near .xloop + WELSEMMS + LOAD_6_PARA_POP + ;pop ebx + ;pop edi + ;pop esi + ret + + +ALIGN 16 +;******************************************************************************* +; void McChromaWidthEq8_sse2( uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iheigh ); +;******************************************************************************* +WELS_EXTERN McChromaWidthEq8_sse2 +McChromaWidthEq8_sse2: + ;push esi + ;push edi + ;push ebx + + %assign push_num 0 + LOAD_6_PARA +%ifndef X86_32 + movsx 
r1, r1d + movsx r3, r3d + movsx r5, r5d +%endif + + ;mov eax, [esp +12 + 20] + movd xmm3, [r4] + WELS_Zero xmm7 + punpcklbw xmm3, xmm3 + punpcklwd xmm3, xmm3 + + movdqa xmm4, xmm3 + punpckldq xmm3, xmm3 + punpckhdq xmm4, xmm4 + movdqa xmm5, xmm3 + movdqa xmm6, xmm4 + + punpcklbw xmm3, xmm7 + punpckhbw xmm5, xmm7 + punpcklbw xmm4, xmm7 + punpckhbw xmm6, xmm7 + + ;mov esi, [esp +12+ 4] + ;mov eax, [esp + 12 + 8] + ;mov edi, [esp + 12 + 12] + ;mov edx, [esp + 12 + 16] + ;mov ecx, [esp + 12 + 24] + + lea r4, [r0 + r1] ;lea ebx, [esi + eax] + movq xmm0, [r0] + movq xmm1, [r0+1] + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 +.xloop: + + pmullw xmm0, xmm3 + pmullw xmm1, xmm5 + paddw xmm0, xmm1 + + movq xmm1, [r4] + punpcklbw xmm1, xmm7 + movdqa xmm2, xmm1 + pmullw xmm1, xmm4 + paddw xmm0, xmm1 + + movq xmm1, [r4+1] + punpcklbw xmm1, xmm7 + movdqa xmm7, xmm1 + pmullw xmm1, xmm6 + paddw xmm0, xmm1 + movdqa xmm1,xmm7 + + paddw xmm0, [h264_d0x20_sse2] + psrlw xmm0, 6 + + WELS_Zero xmm7 + packuswb xmm0, xmm7 + movq [r2], xmm0 + + movdqa xmm0, xmm2 + + lea r2, [r2 + r3] + lea r4, [r4 + r1] + + dec r5 + jnz near .xloop + + LOAD_6_PARA_POP + + ;pop ebx + ;pop edi + ;pop esi + ret + + + + +ALIGN 16 +;*********************************************************************** +; void McChromaWidthEq8_ssse3( uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iHeigh); +;*********************************************************************** +WELS_EXTERN McChromaWidthEq8_ssse3 +McChromaWidthEq8_ssse3: + ;push ebx + ;push esi + ;push edi + %assign push_num 0 + LOAD_6_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r5, r5d +%endif + + ;mov eax, [esp + 12 + 20] + + pxor xmm7, xmm7 + movd xmm5, [r4] + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + movdqa xmm6, xmm5 + punpcklqdq xmm5, xmm5 + punpckhqdq xmm6, xmm6 + + ;mov eax, [esp + 12 + 4] + ;mov edx, [esp + 12 + 8] + ;mov esi, [esp + 12 + 12] + ;mov edi, [esp + 12 + 16] + ;mov ecx, [esp + 12 + 24] + + sub r2, r3 ;sub esi, edi + sub r2, r3 + movdqa xmm7, [h264_d0x20_sse2] + + movdqu xmm0, [r0] + movdqa xmm1, xmm0 + psrldq xmm1, 1 + punpcklbw xmm0, xmm1 + +.hloop_chroma: + lea r2, [r2+2*r3] + + movdqu xmm2, [r0+r1] + movdqa xmm3, xmm2 + psrldq xmm3, 1 + punpcklbw xmm2, xmm3 + movdqa xmm4, xmm2 + + pmaddubsw xmm0, xmm5 + pmaddubsw xmm2, xmm6 + paddw xmm0, xmm2 + paddw xmm0, xmm7 + psrlw xmm0, 6 + packuswb xmm0, xmm0 + movq [r2],xmm0 + + lea r0, [r0+2*r1] + movdqu xmm2, [r0] + movdqa xmm3, xmm2 + psrldq xmm3, 1 + punpcklbw xmm2, xmm3 + movdqa xmm0, xmm2 + + pmaddubsw xmm4, xmm5 + pmaddubsw xmm2, xmm6 + paddw xmm4, xmm2 + paddw xmm4, xmm7 + psrlw xmm4, 6 + packuswb xmm4, xmm4 + movq [r2+r3],xmm4 + + sub r5, 2 + jnz .hloop_chroma + + LOAD_6_PARA_POP + + ;pop edi + ;pop esi + ;pop ebx + + ret + + diff --git a/codec/common/mc_luma.asm b/codec/common/mc_luma.asm index 0e4b3eff..be89752e 100644 --- a/codec/common/mc_luma.asm +++ b/codec/common/mc_luma.asm @@ -1,1293 +1,1293 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. 
-;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* -;* mc_luma.asm -;* -;* Abstract -;* sse2 motion compensation -;* -;* History -;* 17/08/2009 Created -;* -;* -;*************************************************************************/ -%include "asm_inc.asm" - -;******************************************************************************* -; Local Data (Read Only) -;******************************************************************************* -%ifdef FORMAT_COFF -SECTION .rodata pData -%else -SECTION .rodata align=16 -%endif - -;******************************************************************************* -; Various memory constants (trigonometric values or rounding values) -;******************************************************************************* - -ALIGN 16 -h264_w0x10: - dw 16, 16, 16, 16 -ALIGN 16 -h264_w0x10_1: - dw 16, 16, 16, 16, 16, 16, 16, 16 -ALIGN 16 -h264_mc_hc_32: - dw 32, 32, 32, 32, 32, 32, 32, 32 - - -;******************************************************************************* -; Code -;******************************************************************************* - -SECTION .text - -WELS_EXTERN McHorVer20WidthEq4_mmx - - -ALIGN 16 -;******************************************************************************* -; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc, -; int iSrcStride, -; uint8_t *pDst, -; int iDstStride, -; int iHeight) -;******************************************************************************* -McHorVer20WidthEq4_mmx: - ;push esi - ;push edi - - ;mov esi, [esp+12] - ;mov eax, [esp+16] - ;mov edi, [esp+20] - ;mov ecx, [esp+24] - ;mov edx, [esp+28] - - %assign push_num 0 - LOAD_5_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d -%endif - - sub r0, 2 - WELS_Zero mm7 - movq mm6, [h264_w0x10] -.height_loop: - movd mm0, [r0] - punpcklbw mm0, mm7 - movd mm1, [r0+5] - punpcklbw mm1, mm7 - movd mm2, [r0+1] - punpcklbw mm2, mm7 - movd mm3, [r0+4] - punpcklbw mm3, mm7 - movd mm4, [r0+2] - punpcklbw mm4, mm7 - movd mm5, [r0+3] - punpcklbw mm5, mm7 - - paddw mm2, mm3 - paddw mm4, mm5 - psllw mm4, 2 - psubw mm4, mm2 - paddw mm0, mm1 - paddw mm0, mm4 - psllw mm4, 2 - paddw mm0, mm4 - paddw mm0, mm6 - psraw mm0, 5 - packuswb mm0, mm7 - movd [r2], mm0 - - add r0, r1 - add r2, r3 - dec r4 - jnz .height_loop - - WELSEMMS - LOAD_5_PARA_POP - ret - -;******************************************************************************* -; Macros and other preprocessor constants -;******************************************************************************* - - -%macro 
SSE_LOAD_8P 3 - movq %1, %3 - punpcklbw %1, %2 -%endmacro - -%macro FILTER_HV_W8 9 - paddw %1, %6 - movdqa %8, %3 - movdqa %7, %2 - paddw %1, [h264_w0x10_1] - paddw %8, %4 - paddw %7, %5 - psllw %8, 2 - psubw %8, %7 - paddw %1, %8 - psllw %8, 2 - paddw %1, %8 - psraw %1, 5 - WELS_Zero %8 - packuswb %1, %8 - movq %9, %1 -%endmacro - -;******************************************************************************* -; Code -;******************************************************************************* - -SECTION .text -WELS_EXTERN McHorVer22Width8HorFirst_sse2 -WELS_EXTERN McHorVer02WidthEq8_sse2 -WELS_EXTERN McHorVer20WidthEq8_sse2 -WELS_EXTERN McHorVer20WidthEq16_sse2 - -ALIGN 16 -;*********************************************************************** -; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc, -; int16_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride -; int32_t iHeight -; ) -;*********************************************************************** -McHorVer22Width8HorFirst_sse2: - ;push esi - ;push edi - ;push ebx - ;mov esi, [esp+16] ;pSrc - ;mov eax, [esp+20] ;iSrcStride - ;mov edi, [esp+24] ;pDst - ;mov edx, [esp+28] ;iDstStride - ;mov ebx, [esp+32] ;iHeight - - %assign push_num 0 - LOAD_5_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d -%endif - pxor xmm7, xmm7 - - sub r0, r1 ;;;;;;;;need more 5 lines. - sub r0, r1 - -.yloop_width_8: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - movdqa [r2], xmm0 - - add r0, r1 - add r2, r3 - dec r4 - jnz .yloop_width_8 - LOAD_5_PARA_POP - ret - -ALIGN 16 -;******************************************************************************* -; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc, -; int iSrcStride, -; uint8_t *pDst, -; int iDstStride, -; int iHeight, -; ); -;******************************************************************************* -McHorVer20WidthEq8_sse2: - ;push esi - ;push edi - - ;mov esi, [esp + 12] ;pSrc - ;mov eax, [esp + 16] ;iSrcStride - ;mov edi, [esp + 20] ;pDst - ;mov ecx, [esp + 28] ;iHeight - ;mov edx, [esp + 24] ;iDstStride - - %assign push_num 0 - LOAD_5_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d -%endif - lea r0, [r0-2] ;pSrc -= 2; - - pxor xmm7, xmm7 - movdqa xmm6, [h264_w0x10_1] -.y_loop: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - paddw xmm0, xmm6 - psraw xmm0, 5 - - packuswb xmm0, xmm7 - movq [r2], xmm0 - - lea r2, [r2+r3] - lea r0, [r0+r1] - dec r4 - jnz near .y_loop - - LOAD_5_PARA_POP - ret - -ALIGN 16 -;******************************************************************************* -; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc, -; int iSrcStride, -; uint8_t *pDst, -; int iDstStride, -; int iHeight, -; ); -;******************************************************************************* -McHorVer20WidthEq16_sse2: - ;push esi - ;push edi - ;mov 
esi, [esp + 12] ;pSrc - ;mov eax, [esp + 16] ;iSrcStride - ;mov edi, [esp + 20] ;pDst - ;mov ecx, [esp + 28] ;iHeight - ;mov edx, [esp + 24] ;iDstStride - - %assign push_num 0 - LOAD_5_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d -%endif - lea r0, [r0-2] ;pSrc -= 2; - - pxor xmm7, xmm7 - movdqa xmm6, [h264_w0x10_1] -.y_loop: - - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - paddw xmm0, xmm6 - psraw xmm0, 5 - packuswb xmm0, xmm7 - movq [r2], xmm0 - - movq xmm0, [r0+8] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5+8] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1+8] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4+8] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2+8] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3+8] - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - paddw xmm0, xmm6 - psraw xmm0, 5 - packuswb xmm0, xmm7 - movq [r2+8], xmm0 - - lea r2, [r2+r3] - lea r0, [r0+r1] - dec r4 - jnz near .y_loop - - LOAD_5_PARA_POP - ret - - -;******************************************************************************* -; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc, -; int iSrcStride, -; uint8_t *pDst, -; int iDstStride, -; int iHeight ) -;******************************************************************************* -ALIGN 16 -McHorVer02WidthEq8_sse2: - ;push esi - ;push edi - ;mov esi, [esp + 12] ;pSrc - ;mov edx, [esp + 16] ;iSrcStride - ;mov edi, [esp + 20] ;pDst - ;mov eax, [esp + 24] ;iDstStride - ;mov ecx, [esp + 28] ;iHeight - - %assign push_num 0 - LOAD_5_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d -%endif - sub r0, r1 - sub r0, r1 - - WELS_Zero xmm7 - - SSE_LOAD_8P xmm0, xmm7, [r0] - SSE_LOAD_8P xmm1, xmm7, [r0+r1] - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm2, xmm7, [r0] - SSE_LOAD_8P xmm3, xmm7, [r0+r1] - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm4, xmm7, [r0] - SSE_LOAD_8P xmm5, xmm7, [r0+r1] - -.start: - FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r4 - jz near .xx_exit - - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm6, xmm7, [r0] - FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3] - dec r4 - jz near .xx_exit - - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm7, xmm0, [r0+r1] - FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] - dec r4 - jz near .xx_exit - - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm0, xmm1, [r0] - FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3] - dec r4 - jz near .xx_exit - - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm1, xmm2, [r0+r1] - FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2] - dec r4 - jz near .xx_exit - - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm2, xmm3, [r0] - FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3] - dec r4 - jz near .xx_exit - - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm3, xmm4, [r0+r1] - FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2] - dec r4 - jz near .xx_exit - - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm4, xmm5, [r0] - FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3] - dec r4 - jz near .xx_exit - - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm5, xmm6, [r0+r1] - jmp 
near .start - -.xx_exit: - LOAD_5_PARA_POP - ret - -;*********************************************************************** -; Code -;*********************************************************************** - -SECTION .text - -WELS_EXTERN McHorVer20Width9Or17_sse2 -WELS_EXTERN McHorVer02Height9Or17_sse2 -WELS_EXTERN McHorVer22Width8VerLastAlign_sse2 -WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2 -WELS_EXTERN McHorVer22HorFirst_sse2 - - -;*********************************************************************** -; void McHorVer02Height9Or17_sse2( uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; int32_t iWidth, -; int32_t iHeight ) -;*********************************************************************** -ALIGN 16 -McHorVer02Height9Or17_sse2: - ;push esi - ;push edi - ;push ebx - - ;mov esi, [esp + 16] - ;mov edx, [esp + 20] - ;mov edi, [esp + 24] - ;mov eax, [esp + 28] - ;mov ecx, [esp + 36] - ;mov ebx, [esp + 32] - - %assign push_num 0 - LOAD_6_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d - movsx r5, r5d -%endif - -%ifndef X86_32 - push r12 - push r13 - push r14 - mov r12, r0 - mov r13, r2 - mov r14, r5 -%endif - - shr r4, 3 - sub r0, r1 - sub r0, r1 - -.xloop: - WELS_Zero xmm7 - SSE_LOAD_8P xmm0, xmm7, [r0] - SSE_LOAD_8P xmm1, xmm7, [r0+r1] - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm2, xmm7, [r0] - SSE_LOAD_8P xmm3, xmm7, [r0+r1] - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm4, xmm7, [r0] - SSE_LOAD_8P xmm5, xmm7, [r0+r1] - - FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm6, xmm7, [r0] - movdqa xmm0,xmm1 - movdqa xmm1,xmm2 - movdqa xmm2,xmm3 - movdqa xmm3,xmm4 - movdqa xmm4,xmm5 - movdqa xmm5,xmm6 - add r2, r3 - sub r0, r1 - -.start: - FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm6, xmm7, [r0] - FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm7, xmm0, [r0+r1] - FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm0, xmm1, [r0] - FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm1, xmm2, [r0+r1] - FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm2, xmm3, [r0] - FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm3, xmm4, [r0+r1] - FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm4, xmm5, [r0] - FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm5, xmm6, [r0+r1] - jmp near .start - -.x_loop_dec: - dec r4 - jz near .xx_exit - ;mov esi, [esp + 16] - ;mov edi, [esp + 24] - ;mov ecx, [esp + 36] -%ifdef X86_32 - mov r0, arg1 - mov r2, arg3 - mov r5, arg6 -%else - mov r0, r12 - mov r2, r13 - mov r5, r14 -%endif - sub r0, r1 - sub r0, r1 - add r0, 8 - add r2, 8 - jmp near .xloop - -.xx_exit: -%ifndef X86_32 - pop r14 - pop r13 - pop r12 -%endif - LOAD_6_PARA_POP - ret - - -ALIGN 16 -;*********************************************************************** -; void McHorVer20Width9Or17_sse2( uint8_t *pSrc, 
-; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; int32_t iWidth, -; int32_t iHeight -; ); -;*********************************************************************** -McHorVer20Width9Or17_sse2: - ;push esi - ;push edi - ;push ebx - ;mov esi, [esp+16] - ;mov eax, [esp+20] - ;mov edi, [esp+24] - ;mov edx, [esp+28] - ;mov ecx, [esp+32] - ;mov ebx, [esp+36] - - %assign push_num 0 - LOAD_6_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d - movsx r5, r5d -%endif - sub r0, 2 - pxor xmm7, xmm7 - - cmp r4, 9 - jne near .width_17 - -.yloop_width_9: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 - - movdqa xmm7, xmm2 - paddw xmm7, xmm3 - movdqa xmm6, xmm4 - paddw xmm6, xmm5 - psllw xmm6, 2 - psubw xmm6, xmm7 - paddw xmm0, xmm1 - paddw xmm0, xmm6 - psllw xmm6, 2 - paddw xmm0, xmm6 - paddw xmm0, [h264_w0x10_1] - psraw xmm0, 5 - packuswb xmm0, xmm0 - movd [r2], xmm0 - - pxor xmm7, xmm7 - movq xmm0, [r0+6] - punpcklbw xmm0, xmm7 - - paddw xmm4, xmm1 - paddw xmm5, xmm3 - psllw xmm5, 2 - psubw xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm2, xmm5 - psllw xmm5, 2 - paddw xmm2, xmm5 - paddw xmm2, [h264_w0x10_1] - psraw xmm2, 5 - packuswb xmm2, xmm2 - movq [r2+1], xmm2 - - add r0, r1 - add r2, r3 - dec r5 - jnz .yloop_width_9 - LOAD_6_PARA_POP - ret - - -.width_17: -.yloop_width_17: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - paddw xmm0, [h264_w0x10_1] - psraw xmm0, 5 - packuswb xmm0, xmm0 - movq [r2], xmm0 - - movq xmm0, [r0+8] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5+8] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1+8] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4+8] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2+8] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3+8] - punpcklbw xmm5, xmm7 - - movdqa xmm7, xmm2 - paddw xmm7, xmm3 - movdqa xmm6, xmm4 - paddw xmm6, xmm5 - psllw xmm6, 2 - psubw xmm6, xmm7 - paddw xmm0, xmm1 - paddw xmm0, xmm6 - psllw xmm6, 2 - paddw xmm0, xmm6 - paddw xmm0, [h264_w0x10_1] - psraw xmm0, 5 - packuswb xmm0, xmm0 - movd [r2+8], xmm0 - - - pxor xmm7, xmm7 - movq xmm0, [r0+6+8] - punpcklbw xmm0, xmm7 - - paddw xmm4, xmm1 - paddw xmm5, xmm3 - psllw xmm5, 2 - psubw xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm2, xmm5 - psllw xmm5, 2 - paddw xmm2, xmm5 - paddw xmm2, [h264_w0x10_1] - psraw xmm2, 5 - packuswb xmm2, xmm2 - movq [r2+9], xmm2 - add r0, r1 - add r2, r3 - dec r5 - jnz .yloop_width_17 - LOAD_6_PARA_POP - ret - - - -ALIGN 16 -;*********************************************************************** -;void McHorVer22HorFirst_sse2 -; (uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t * pTap, -; int32_t iTapStride, -; int32_t iWidth,int32_t iHeight); -;*********************************************************************** -McHorVer22HorFirst_sse2: - ;push esi - ;push edi - ;push ebx - ;mov esi, [esp+16] - ;mov eax, [esp+20] - ;mov edi, [esp+24] - ;mov edx, [esp+28] - ;mov ecx, [esp+32] - ;mov ebx, [esp+36] - - %assign push_num 0 - LOAD_6_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d - movsx 
r5, r5d -%endif - pxor xmm7, xmm7 - sub r0, r1 ;;;;;;;;need more 5 lines. - sub r0, r1 - - cmp r4, 9 - jne near .width_17 - -.yloop_width_9: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 - - movdqa xmm7, xmm2 - paddw xmm7, xmm3 - movdqa xmm6, xmm4 - paddw xmm6, xmm5 - psllw xmm6, 2 - psubw xmm6, xmm7 - paddw xmm0, xmm1 - paddw xmm0, xmm6 - psllw xmm6, 2 - paddw xmm0, xmm6 - movd [r2], xmm0 - - pxor xmm7, xmm7 - movq xmm0, [r0+6] - punpcklbw xmm0, xmm7 - - paddw xmm4, xmm1 - paddw xmm5, xmm3 - psllw xmm5, 2 - psubw xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm2, xmm5 - psllw xmm5, 2 - paddw xmm2, xmm5 - movq [r2+2], xmm2 - movhps [r2+2+8], xmm2 - - add r0, r1 - add r2, r3 - dec r5 - jnz .yloop_width_9 - LOAD_6_PARA_POP - ret - - -.width_17: -.yloop_width_17: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - movdqa [r2], xmm0 - - movq xmm0, [r0+8] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5+8] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1+8] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4+8] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2+8] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3+8] - punpcklbw xmm5, xmm7 - - movdqa xmm7, xmm2 - paddw xmm7, xmm3 - movdqa xmm6, xmm4 - paddw xmm6, xmm5 - psllw xmm6, 2 - psubw xmm6, xmm7 - paddw xmm0, xmm1 - paddw xmm0, xmm6 - psllw xmm6, 2 - paddw xmm0, xmm6 - movd [r2+16], xmm0 - - - pxor xmm7, xmm7 - movq xmm0, [r0+6+8] - punpcklbw xmm0, xmm7 - - paddw xmm4, xmm1 - paddw xmm5, xmm3 - psllw xmm5, 2 - psubw xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm2, xmm5 - psllw xmm5, 2 - paddw xmm2, xmm5 - movq [r2+18], xmm2 - movhps [r2+18+8], xmm2 - - add r0, r1 - add r2, r3 - dec r5 - jnz .yloop_width_17 - LOAD_6_PARA_POP - ret - - -%macro FILTER_VER 9 - paddw %1, %6 - movdqa %7, %2 - movdqa %8, %3 - - - paddw %7, %5 - paddw %8, %4 - - psubw %1, %7 - psraw %1, 2 - paddw %1, %8 - psubw %1, %7 - psraw %1, 2 - paddw %8, %1 - paddw %8, [h264_mc_hc_32] - psraw %8, 6 - packuswb %8, %8 - movq %9, %8 -%endmacro -;*********************************************************************** -;void McHorVer22Width8VerLastAlign_sse2( -; uint8_t *pTap, -; int32_t iTapStride, -; uint8_t * pDst, -; int32_t iDstStride, -; int32_t iWidth, -; int32_t iHeight); -;*********************************************************************** - - McHorVer22Width8VerLastAlign_sse2: - ;push esi - ;push edi - ;push ebx - ;push ebp - - ;mov esi, [esp+20] - ;mov eax, [esp+24] - ;mov edi, [esp+28] - ;mov edx, [esp+32] - ;mov ebx, [esp+36] - ;mov ecx, [esp+40] - - %assign push_num 0 - LOAD_6_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d - movsx r5, r5d -%endif -%ifndef X86_32 - push r12 - push r13 - push r14 - mov r12, r0 - mov r13, r2 - mov r14, r5 -%endif - - shr r4, 3 - -.width_loop: - movdqa xmm0, [r0] - movdqa xmm1, [r0+r1] - lea r0, [r0+2*r1] - movdqa xmm2, [r0] - movdqa xmm3, [r0+r1] - lea r0, [r0+2*r1] - movdqa xmm4, [r0] - movdqa xmm5, [r0+r1] - - FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - lea r0, [r0+2*r1] - 
movdqa xmm6, [r0] - - movdqa xmm0, xmm1 - movdqa xmm1, xmm2 - movdqa xmm2, xmm3 - movdqa xmm3, xmm4 - movdqa xmm4, xmm5 - movdqa xmm5, xmm6 - - add r2, r3 - sub r0, r1 - -.start: - FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - movdqa xmm6, [r0] - FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - movdqa xmm7, [r0+r1] - FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - movdqa xmm0, [r0] - FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - movdqa xmm1, [r0+r1] - FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - movdqa xmm2, [r0] - FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - movdqa xmm3, [r0+r1] - FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - movdqa xmm4, [r0] - FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - movdqa xmm5, [r0+r1] - jmp near .start - -.x_loop_dec: - dec r4 - jz near .exit - ;mov esi, [esp+20] - ;mov edi, [esp+28] - ;mov ecx, [esp+40] -%ifdef X86_32 - mov r0, arg1 - mov r2, arg3 - mov r5, arg6 -%else - mov r0, r12 - mov r2, r13 - mov r5, r14 -%endif - add r0, 16 - add r2, 8 - jmp .width_loop - -.exit: -%ifndef X86_32 - pop r14 - pop r13 - pop r12 -%endif - LOAD_6_PARA_POP - ret - -;*********************************************************************** -;void McHorVer22Width8VerLastUnAlign_sse2( -; uint8_t *pTap, -; int32_t iTapStride, -; uint8_t * pDst, -; int32_t iDstStride, -; int32_t iWidth, -; int32_t iHeight); -;*********************************************************************** - - McHorVer22Width8VerLastUnAlign_sse2: - ;push esi - ;push edi - ;push ebx - ;push ebp - - ;mov esi, [esp+20] - ;mov eax, [esp+24] - ;mov edi, [esp+28] - ;mov edx, [esp+32] - ;mov ebx, [esp+36] - ;mov ecx, [esp+40] - - %assign push_num 0 - LOAD_6_PARA -%ifndef X86_32 - movsx r1, r1d - movsx r3, r3d - movsx r4, r4d - movsx r5, r5d -%endif -%ifndef X86_32 - push r12 - push r13 - push r14 - mov r12, r0 - mov r13, r2 - mov r14, r5 -%endif - shr r4, 3 - -.width_loop: - movdqu xmm0, [r0] - movdqu xmm1, [r0+r1] - lea r0, [r0+2*r1] - movdqu xmm2, [r0] - movdqu xmm3, [r0+r1] - lea r0, [r0+2*r1] - movdqu xmm4, [r0] - movdqu xmm5, [r0+r1] - - FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - lea r0, [r0+2*r1] - movdqu xmm6, [r0] - - movdqa xmm0, xmm1 - movdqa xmm1, xmm2 - movdqa xmm2, xmm3 - movdqa xmm3, xmm4 - movdqa xmm4, xmm5 - movdqa xmm5, xmm6 - - add r2, r3 - sub r0, r1 - -.start: - FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - movdqu xmm6, [r0] - FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - movdqu xmm7, [r0+r1] - FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - movdqu xmm0, [r0] - FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - movdqu xmm1, [r0+r1] - FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] - dec r5 - jz near 
.x_loop_dec - - lea r0, [r0+2*r1] - movdqu xmm2, [r0] - FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - movdqu xmm3, [r0+r1] - FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] - dec r5 - jz near .x_loop_dec - - lea r0, [r0+2*r1] - movdqu xmm4, [r0] - FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] - dec r5 - jz near .x_loop_dec - - lea r2, [r2+2*r3] - movdqu xmm5, [r0+r1] - jmp near .start - -.x_loop_dec: - dec r4 - jz near .exit - ;mov esi, [esp+20] - ;mov edi, [esp+28] - ;mov ecx, [esp+40] -%ifdef X86_32 - mov r0, arg1 - mov r2, arg3 - mov r5, arg6 -%else - mov r0, r12 - mov r2, r13 - mov r5, r14 -%endif - add r0, 16 - add r2, 8 - jmp .width_loop - -.exit: -%ifndef X86_32 - pop r14 - pop r13 - pop r12 -%endif - LOAD_6_PARA_POP +;*! +;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. 
+;* +;* +;* mc_luma.asm +;* +;* Abstract +;* sse2 motion compensation +;* +;* History +;* 17/08/2009 Created +;* +;* +;*************************************************************************/ +%include "asm_inc.asm" + +;******************************************************************************* +; Local Data (Read Only) +;******************************************************************************* +%ifdef FORMAT_COFF +SECTION .rodata pData +%else +SECTION .rodata align=16 +%endif + +;******************************************************************************* +; Various memory constants (trigonometric values or rounding values) +;******************************************************************************* + +ALIGN 16 +h264_w0x10: + dw 16, 16, 16, 16 +ALIGN 16 +h264_w0x10_1: + dw 16, 16, 16, 16, 16, 16, 16, 16 +ALIGN 16 +h264_mc_hc_32: + dw 32, 32, 32, 32, 32, 32, 32, 32 + + +;******************************************************************************* +; Code +;******************************************************************************* + +SECTION .text + +WELS_EXTERN McHorVer20WidthEq4_mmx + + +ALIGN 16 +;******************************************************************************* +; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc, +; int iSrcStride, +; uint8_t *pDst, +; int iDstStride, +; int iHeight) +;******************************************************************************* +McHorVer20WidthEq4_mmx: + ;push esi + ;push edi + + ;mov esi, [esp+12] + ;mov eax, [esp+16] + ;mov edi, [esp+20] + ;mov ecx, [esp+24] + ;mov edx, [esp+28] + + %assign push_num 0 + LOAD_5_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d +%endif + + sub r0, 2 + WELS_Zero mm7 + movq mm6, [h264_w0x10] +.height_loop: + movd mm0, [r0] + punpcklbw mm0, mm7 + movd mm1, [r0+5] + punpcklbw mm1, mm7 + movd mm2, [r0+1] + punpcklbw mm2, mm7 + movd mm3, [r0+4] + punpcklbw mm3, mm7 + movd mm4, [r0+2] + punpcklbw mm4, mm7 + movd mm5, [r0+3] + punpcklbw mm5, mm7 + + paddw mm2, mm3 + paddw mm4, mm5 + psllw mm4, 2 + psubw mm4, mm2 + paddw mm0, mm1 + paddw mm0, mm4 + psllw mm4, 2 + paddw mm0, mm4 + paddw mm0, mm6 + psraw mm0, 5 + packuswb mm0, mm7 + movd [r2], mm0 + + add r0, r1 + add r2, r3 + dec r4 + jnz .height_loop + + WELSEMMS + LOAD_5_PARA_POP + ret + +;******************************************************************************* +; Macros and other preprocessor constants +;******************************************************************************* + + +%macro SSE_LOAD_8P 3 + movq %1, %3 + punpcklbw %1, %2 +%endmacro + +%macro FILTER_HV_W8 9 + paddw %1, %6 + movdqa %8, %3 + movdqa %7, %2 + paddw %1, [h264_w0x10_1] + paddw %8, %4 + paddw %7, %5 + psllw %8, 2 + psubw %8, %7 + paddw %1, %8 + psllw %8, 2 + paddw %1, %8 + psraw %1, 5 + WELS_Zero %8 + packuswb %1, %8 + movq %9, %1 +%endmacro + +;******************************************************************************* +; Code +;******************************************************************************* + +SECTION .text +WELS_EXTERN McHorVer22Width8HorFirst_sse2 +WELS_EXTERN McHorVer02WidthEq8_sse2 +WELS_EXTERN McHorVer20WidthEq8_sse2 +WELS_EXTERN McHorVer20WidthEq16_sse2 + +ALIGN 16 +;*********************************************************************** +; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc, +; int16_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride +; int32_t iHeight +; ) +;*********************************************************************** +McHorVer22Width8HorFirst_sse2: + ;push esi + ;push edi 
+ ;push ebx + ;mov esi, [esp+16] ;pSrc + ;mov eax, [esp+20] ;iSrcStride + ;mov edi, [esp+24] ;pDst + ;mov edx, [esp+28] ;iDstStride + ;mov ebx, [esp+32] ;iHeight + + %assign push_num 0 + LOAD_5_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d +%endif + pxor xmm7, xmm7 + + sub r0, r1 ;;;;;;;;need more 5 lines. + sub r0, r1 + +.yloop_width_8: + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + movdqa [r2], xmm0 + + add r0, r1 + add r2, r3 + dec r4 + jnz .yloop_width_8 + LOAD_5_PARA_POP + ret + +ALIGN 16 +;******************************************************************************* +; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc, +; int iSrcStride, +; uint8_t *pDst, +; int iDstStride, +; int iHeight, +; ); +;******************************************************************************* +McHorVer20WidthEq8_sse2: + ;push esi + ;push edi + + ;mov esi, [esp + 12] ;pSrc + ;mov eax, [esp + 16] ;iSrcStride + ;mov edi, [esp + 20] ;pDst + ;mov ecx, [esp + 28] ;iHeight + ;mov edx, [esp + 24] ;iDstStride + + %assign push_num 0 + LOAD_5_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d +%endif + lea r0, [r0-2] ;pSrc -= 2; + + pxor xmm7, xmm7 + movdqa xmm6, [h264_w0x10_1] +.y_loop: + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + paddw xmm0, xmm6 + psraw xmm0, 5 + + packuswb xmm0, xmm7 + movq [r2], xmm0 + + lea r2, [r2+r3] + lea r0, [r0+r1] + dec r4 + jnz near .y_loop + + LOAD_5_PARA_POP + ret + +ALIGN 16 +;******************************************************************************* +; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc, +; int iSrcStride, +; uint8_t *pDst, +; int iDstStride, +; int iHeight, +; ); +;******************************************************************************* +McHorVer20WidthEq16_sse2: + ;push esi + ;push edi + ;mov esi, [esp + 12] ;pSrc + ;mov eax, [esp + 16] ;iSrcStride + ;mov edi, [esp + 20] ;pDst + ;mov ecx, [esp + 28] ;iHeight + ;mov edx, [esp + 24] ;iDstStride + + %assign push_num 0 + LOAD_5_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d +%endif + lea r0, [r0-2] ;pSrc -= 2; + + pxor xmm7, xmm7 + movdqa xmm6, [h264_w0x10_1] +.y_loop: + + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + paddw xmm0, xmm6 + psraw xmm0, 5 + packuswb xmm0, xmm7 + movq [r2], xmm0 + + movq xmm0, [r0+8] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5+8] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1+8] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4+8] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2+8] + 
punpcklbw xmm4, xmm7 + movq xmm5, [r0+3+8] + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + paddw xmm0, xmm6 + psraw xmm0, 5 + packuswb xmm0, xmm7 + movq [r2+8], xmm0 + + lea r2, [r2+r3] + lea r0, [r0+r1] + dec r4 + jnz near .y_loop + + LOAD_5_PARA_POP + ret + + +;******************************************************************************* +; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc, +; int iSrcStride, +; uint8_t *pDst, +; int iDstStride, +; int iHeight ) +;******************************************************************************* +ALIGN 16 +McHorVer02WidthEq8_sse2: + ;push esi + ;push edi + ;mov esi, [esp + 12] ;pSrc + ;mov edx, [esp + 16] ;iSrcStride + ;mov edi, [esp + 20] ;pDst + ;mov eax, [esp + 24] ;iDstStride + ;mov ecx, [esp + 28] ;iHeight + + %assign push_num 0 + LOAD_5_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d +%endif + sub r0, r1 + sub r0, r1 + + WELS_Zero xmm7 + + SSE_LOAD_8P xmm0, xmm7, [r0] + SSE_LOAD_8P xmm1, xmm7, [r0+r1] + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm2, xmm7, [r0] + SSE_LOAD_8P xmm3, xmm7, [r0+r1] + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm4, xmm7, [r0] + SSE_LOAD_8P xmm5, xmm7, [r0+r1] + +.start: + FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r4 + jz near .xx_exit + + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm6, xmm7, [r0] + FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3] + dec r4 + jz near .xx_exit + + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm7, xmm0, [r0+r1] + FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] + dec r4 + jz near .xx_exit + + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm0, xmm1, [r0] + FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3] + dec r4 + jz near .xx_exit + + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm1, xmm2, [r0+r1] + FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2] + dec r4 + jz near .xx_exit + + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm2, xmm3, [r0] + FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3] + dec r4 + jz near .xx_exit + + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm3, xmm4, [r0+r1] + FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2] + dec r4 + jz near .xx_exit + + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm4, xmm5, [r0] + FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3] + dec r4 + jz near .xx_exit + + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm5, xmm6, [r0+r1] + jmp near .start + +.xx_exit: + LOAD_5_PARA_POP + ret + +;*********************************************************************** +; Code +;*********************************************************************** + +SECTION .text + +WELS_EXTERN McHorVer20Width9Or17_sse2 +WELS_EXTERN McHorVer02Height9Or17_sse2 +WELS_EXTERN McHorVer22Width8VerLastAlign_sse2 +WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2 +WELS_EXTERN McHorVer22HorFirst_sse2 + + +;*********************************************************************** +; void McHorVer02Height9Or17_sse2( uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; int32_t iWidth, +; int32_t iHeight ) +;*********************************************************************** +ALIGN 16 +McHorVer02Height9Or17_sse2: + ;push esi + ;push edi + ;push ebx + + ;mov esi, [esp + 16] + ;mov edx, [esp + 20] + ;mov edi, [esp + 24] + ;mov eax, [esp + 28] + ;mov ecx, [esp + 36] + ;mov ebx, [esp + 32] + + %assign push_num 0 + LOAD_6_PARA +%ifndef X86_32 + movsx r1, r1d + movsx 
r3, r3d + movsx r4, r4d + movsx r5, r5d +%endif + +%ifndef X86_32 + push r12 + push r13 + push r14 + mov r12, r0 + mov r13, r2 + mov r14, r5 +%endif + + shr r4, 3 + sub r0, r1 + sub r0, r1 + +.xloop: + WELS_Zero xmm7 + SSE_LOAD_8P xmm0, xmm7, [r0] + SSE_LOAD_8P xmm1, xmm7, [r0+r1] + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm2, xmm7, [r0] + SSE_LOAD_8P xmm3, xmm7, [r0+r1] + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm4, xmm7, [r0] + SSE_LOAD_8P xmm5, xmm7, [r0+r1] + + FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm6, xmm7, [r0] + movdqa xmm0,xmm1 + movdqa xmm1,xmm2 + movdqa xmm2,xmm3 + movdqa xmm3,xmm4 + movdqa xmm4,xmm5 + movdqa xmm5,xmm6 + add r2, r3 + sub r0, r1 + +.start: + FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm6, xmm7, [r0] + FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm7, xmm0, [r0+r1] + FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm0, xmm1, [r0] + FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm1, xmm2, [r0+r1] + FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm2, xmm3, [r0] + FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm3, xmm4, [r0+r1] + FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm4, xmm5, [r0] + FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm5, xmm6, [r0+r1] + jmp near .start + +.x_loop_dec: + dec r4 + jz near .xx_exit + ;mov esi, [esp + 16] + ;mov edi, [esp + 24] + ;mov ecx, [esp + 36] +%ifdef X86_32 + mov r0, arg1 + mov r2, arg3 + mov r5, arg6 +%else + mov r0, r12 + mov r2, r13 + mov r5, r14 +%endif + sub r0, r1 + sub r0, r1 + add r0, 8 + add r2, 8 + jmp near .xloop + +.xx_exit: +%ifndef X86_32 + pop r14 + pop r13 + pop r12 +%endif + LOAD_6_PARA_POP + ret + + +ALIGN 16 +;*********************************************************************** +; void McHorVer20Width9Or17_sse2( uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; int32_t iWidth, +; int32_t iHeight +; ); +;*********************************************************************** +McHorVer20Width9Or17_sse2: + ;push esi + ;push edi + ;push ebx + ;mov esi, [esp+16] + ;mov eax, [esp+20] + ;mov edi, [esp+24] + ;mov edx, [esp+28] + ;mov ecx, [esp+32] + ;mov ebx, [esp+36] + + %assign push_num 0 + LOAD_6_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d + movsx r5, r5d +%endif + sub r0, 2 + pxor xmm7, xmm7 + + cmp r4, 9 + jne near .width_17 + +.yloop_width_9: + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 + + movdqa xmm7, xmm2 + paddw xmm7, xmm3 + movdqa xmm6, xmm4 + paddw xmm6, xmm5 + psllw xmm6, 2 + psubw xmm6, xmm7 + paddw xmm0, xmm1 + paddw xmm0, xmm6 + psllw xmm6, 2 + paddw xmm0, xmm6 + paddw 
xmm0, [h264_w0x10_1] + psraw xmm0, 5 + packuswb xmm0, xmm0 + movd [r2], xmm0 + + pxor xmm7, xmm7 + movq xmm0, [r0+6] + punpcklbw xmm0, xmm7 + + paddw xmm4, xmm1 + paddw xmm5, xmm3 + psllw xmm5, 2 + psubw xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm2, xmm5 + psllw xmm5, 2 + paddw xmm2, xmm5 + paddw xmm2, [h264_w0x10_1] + psraw xmm2, 5 + packuswb xmm2, xmm2 + movq [r2+1], xmm2 + + add r0, r1 + add r2, r3 + dec r5 + jnz .yloop_width_9 + LOAD_6_PARA_POP + ret + + +.width_17: +.yloop_width_17: + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + paddw xmm0, [h264_w0x10_1] + psraw xmm0, 5 + packuswb xmm0, xmm0 + movq [r2], xmm0 + + movq xmm0, [r0+8] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5+8] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1+8] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4+8] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2+8] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3+8] + punpcklbw xmm5, xmm7 + + movdqa xmm7, xmm2 + paddw xmm7, xmm3 + movdqa xmm6, xmm4 + paddw xmm6, xmm5 + psllw xmm6, 2 + psubw xmm6, xmm7 + paddw xmm0, xmm1 + paddw xmm0, xmm6 + psllw xmm6, 2 + paddw xmm0, xmm6 + paddw xmm0, [h264_w0x10_1] + psraw xmm0, 5 + packuswb xmm0, xmm0 + movd [r2+8], xmm0 + + + pxor xmm7, xmm7 + movq xmm0, [r0+6+8] + punpcklbw xmm0, xmm7 + + paddw xmm4, xmm1 + paddw xmm5, xmm3 + psllw xmm5, 2 + psubw xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm2, xmm5 + psllw xmm5, 2 + paddw xmm2, xmm5 + paddw xmm2, [h264_w0x10_1] + psraw xmm2, 5 + packuswb xmm2, xmm2 + movq [r2+9], xmm2 + add r0, r1 + add r2, r3 + dec r5 + jnz .yloop_width_17 + LOAD_6_PARA_POP + ret + + + +ALIGN 16 +;*********************************************************************** +;void McHorVer22HorFirst_sse2 +; (uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t * pTap, +; int32_t iTapStride, +; int32_t iWidth,int32_t iHeight); +;*********************************************************************** +McHorVer22HorFirst_sse2: + ;push esi + ;push edi + ;push ebx + ;mov esi, [esp+16] + ;mov eax, [esp+20] + ;mov edi, [esp+24] + ;mov edx, [esp+28] + ;mov ecx, [esp+32] + ;mov ebx, [esp+36] + + %assign push_num 0 + LOAD_6_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d + movsx r5, r5d +%endif + pxor xmm7, xmm7 + sub r0, r1 ;;;;;;;;need more 5 lines. 
+ sub r0, r1 + + cmp r4, 9 + jne near .width_17 + +.yloop_width_9: + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 + + movdqa xmm7, xmm2 + paddw xmm7, xmm3 + movdqa xmm6, xmm4 + paddw xmm6, xmm5 + psllw xmm6, 2 + psubw xmm6, xmm7 + paddw xmm0, xmm1 + paddw xmm0, xmm6 + psllw xmm6, 2 + paddw xmm0, xmm6 + movd [r2], xmm0 + + pxor xmm7, xmm7 + movq xmm0, [r0+6] + punpcklbw xmm0, xmm7 + + paddw xmm4, xmm1 + paddw xmm5, xmm3 + psllw xmm5, 2 + psubw xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm2, xmm5 + psllw xmm5, 2 + paddw xmm2, xmm5 + movq [r2+2], xmm2 + movhps [r2+2+8], xmm2 + + add r0, r1 + add r2, r3 + dec r5 + jnz .yloop_width_9 + LOAD_6_PARA_POP + ret + + +.width_17: +.yloop_width_17: + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + movdqa [r2], xmm0 + + movq xmm0, [r0+8] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5+8] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1+8] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4+8] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2+8] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3+8] + punpcklbw xmm5, xmm7 + + movdqa xmm7, xmm2 + paddw xmm7, xmm3 + movdqa xmm6, xmm4 + paddw xmm6, xmm5 + psllw xmm6, 2 + psubw xmm6, xmm7 + paddw xmm0, xmm1 + paddw xmm0, xmm6 + psllw xmm6, 2 + paddw xmm0, xmm6 + movd [r2+16], xmm0 + + + pxor xmm7, xmm7 + movq xmm0, [r0+6+8] + punpcklbw xmm0, xmm7 + + paddw xmm4, xmm1 + paddw xmm5, xmm3 + psllw xmm5, 2 + psubw xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm2, xmm5 + psllw xmm5, 2 + paddw xmm2, xmm5 + movq [r2+18], xmm2 + movhps [r2+18+8], xmm2 + + add r0, r1 + add r2, r3 + dec r5 + jnz .yloop_width_17 + LOAD_6_PARA_POP + ret + + +%macro FILTER_VER 9 + paddw %1, %6 + movdqa %7, %2 + movdqa %8, %3 + + + paddw %7, %5 + paddw %8, %4 + + psubw %1, %7 + psraw %1, 2 + paddw %1, %8 + psubw %1, %7 + psraw %1, 2 + paddw %8, %1 + paddw %8, [h264_mc_hc_32] + psraw %8, 6 + packuswb %8, %8 + movq %9, %8 +%endmacro +;*********************************************************************** +;void McHorVer22Width8VerLastAlign_sse2( +; uint8_t *pTap, +; int32_t iTapStride, +; uint8_t * pDst, +; int32_t iDstStride, +; int32_t iWidth, +; int32_t iHeight); +;*********************************************************************** + + McHorVer22Width8VerLastAlign_sse2: + ;push esi + ;push edi + ;push ebx + ;push ebp + + ;mov esi, [esp+20] + ;mov eax, [esp+24] + ;mov edi, [esp+28] + ;mov edx, [esp+32] + ;mov ebx, [esp+36] + ;mov ecx, [esp+40] + + %assign push_num 0 + LOAD_6_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d + movsx r5, r5d +%endif +%ifndef X86_32 + push r12 + push r13 + push r14 + mov r12, r0 + mov r13, r2 + mov r14, r5 +%endif + + shr r4, 3 + +.width_loop: + movdqa xmm0, [r0] + movdqa xmm1, [r0+r1] + lea r0, [r0+2*r1] + movdqa xmm2, [r0] + movdqa xmm3, [r0+r1] + lea r0, [r0+2*r1] + movdqa xmm4, [r0] + movdqa xmm5, [r0+r1] + + FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + lea r0, [r0+2*r1] + movdqa xmm6, [r0] + + movdqa xmm0, xmm1 + movdqa xmm1, xmm2 + movdqa xmm2, 
xmm3 + movdqa xmm3, xmm4 + movdqa xmm4, xmm5 + movdqa xmm5, xmm6 + + add r2, r3 + sub r0, r1 + +.start: + FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + movdqa xmm6, [r0] + FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + movdqa xmm7, [r0+r1] + FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + movdqa xmm0, [r0] + FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + movdqa xmm1, [r0+r1] + FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + movdqa xmm2, [r0] + FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + movdqa xmm3, [r0+r1] + FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + movdqa xmm4, [r0] + FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + movdqa xmm5, [r0+r1] + jmp near .start + +.x_loop_dec: + dec r4 + jz near .exit + ;mov esi, [esp+20] + ;mov edi, [esp+28] + ;mov ecx, [esp+40] +%ifdef X86_32 + mov r0, arg1 + mov r2, arg3 + mov r5, arg6 +%else + mov r0, r12 + mov r2, r13 + mov r5, r14 +%endif + add r0, 16 + add r2, 8 + jmp .width_loop + +.exit: +%ifndef X86_32 + pop r14 + pop r13 + pop r12 +%endif + LOAD_6_PARA_POP + ret + +;*********************************************************************** +;void McHorVer22Width8VerLastUnAlign_sse2( +; uint8_t *pTap, +; int32_t iTapStride, +; uint8_t * pDst, +; int32_t iDstStride, +; int32_t iWidth, +; int32_t iHeight); +;*********************************************************************** + + McHorVer22Width8VerLastUnAlign_sse2: + ;push esi + ;push edi + ;push ebx + ;push ebp + + ;mov esi, [esp+20] + ;mov eax, [esp+24] + ;mov edi, [esp+28] + ;mov edx, [esp+32] + ;mov ebx, [esp+36] + ;mov ecx, [esp+40] + + %assign push_num 0 + LOAD_6_PARA +%ifndef X86_32 + movsx r1, r1d + movsx r3, r3d + movsx r4, r4d + movsx r5, r5d +%endif +%ifndef X86_32 + push r12 + push r13 + push r14 + mov r12, r0 + mov r13, r2 + mov r14, r5 +%endif + shr r4, 3 + +.width_loop: + movdqu xmm0, [r0] + movdqu xmm1, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm2, [r0] + movdqu xmm3, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm4, [r0] + movdqu xmm5, [r0+r1] + + FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + lea r0, [r0+2*r1] + movdqu xmm6, [r0] + + movdqa xmm0, xmm1 + movdqa xmm1, xmm2 + movdqa xmm2, xmm3 + movdqa xmm3, xmm4 + movdqa xmm4, xmm5 + movdqa xmm5, xmm6 + + add r2, r3 + sub r0, r1 + +.start: + FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + movdqu xmm6, [r0] + FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + movdqu xmm7, [r0+r1] + FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + movdqu xmm0, [r0] + FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + movdqu xmm1, [r0+r1] + FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + movdqu xmm2, [r0] + FILTER_VER xmm5, 
xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + movdqu xmm3, [r0+r1] + FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] + dec r5 + jz near .x_loop_dec + + lea r0, [r0+2*r1] + movdqu xmm4, [r0] + FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] + dec r5 + jz near .x_loop_dec + + lea r2, [r2+2*r3] + movdqu xmm5, [r0+r1] + jmp near .start + +.x_loop_dec: + dec r4 + jz near .exit + ;mov esi, [esp+20] + ;mov edi, [esp+28] + ;mov ecx, [esp+40] +%ifdef X86_32 + mov r0, arg1 + mov r2, arg3 + mov r5, arg6 +%else + mov r0, r12 + mov r2, r13 + mov r5, r14 +%endif + add r0, 16 + add r2, 8 + jmp .width_loop + +.exit: +%ifndef X86_32 + pop r14 + pop r13 + pop r12 +%endif + LOAD_6_PARA_POP ret \ No newline at end of file diff --git a/codec/encoder/core/asm/satd_sad.asm b/codec/encoder/core/asm/satd_sad.asm index ceffdbc6..03a3912d 100644 --- a/codec/encoder/core/asm/satd_sad.asm +++ b/codec/encoder/core/asm/satd_sad.asm @@ -1,2344 +1,2344 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. 
-;* -;* -;* satd_sad.asm -;* -;* Abstract -;* WelsSampleSatd4x4_sse2 -;* WelsSampleSatd8x8_sse2 -;* WelsSampleSatd16x8_sse2 -;* WelsSampleSatd8x16_sse2 -;* WelsSampleSatd16x16_sse2 -;* -;* WelsSampleSad16x8_sse2 -;* WelsSampleSad16x16_sse2 -;* -;* History -;* 8/5/2009 Created -;* 24/9/2009 modified -;* -;* -;*************************************************************************/ - -%include "asm_inc.asm" - -;*********************************************************************** -; Data -;*********************************************************************** -SECTION .rodata align=16 - -align 16 -HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1 -align 16 -HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1 -align 16 -PDW1: dw 1,1,1,1,1,1,1,1 -align 16 -PDQ2: dw 2,0,0,0,2,0,0,0 -align 16 -HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 - -;*********************************************************************** -; Code -;*********************************************************************** -SECTION .text - -;*********************************************************************** -; -;Pixel_satd_wxh_sse2 BEGIN -; -;*********************************************************************** -%macro MMX_DW_1_2REG 2 - pxor %1, %1 - pcmpeqw %2, %2 - psubw %1, %2 -%endmacro - -%macro SSE2_SumWHorizon1 2 - movdqa %2, %1 - psrldq %2, 8 - paddusw %1, %2 - movdqa %2, %1 - psrldq %2, 4 - paddusw %1, %2 - movdqa %2, %1 - psrldq %2, 2 - paddusw %1, %2 -%endmacro - -%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3 - SSE2_SumSub %1, %2, %5 - SSE2_SumSub %3, %4, %5 - SSE2_SumSub %2, %4, %5 - SSE2_SumSub %1, %3, %5 -%endmacro - -%macro SSE2_SumAbs4 7 - WELS_AbsW %1, %3 - WELS_AbsW %2, %3 - WELS_AbsW %4, %6 - WELS_AbsW %5, %6 - paddusw %1, %2 - paddusw %4, %5 - paddusw %7, %1 - paddusw %7, %4 -%endmacro - -%macro SSE2_SumWHorizon 3 - movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 - paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 - punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 - movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 - paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 - pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 - paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 -%endmacro - -%macro SSE2_GetSatd8x8 0 - SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] - SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2] - SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3] - - SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4 - SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4 - SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5 - SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] - SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2] - SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3] - - SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4 - SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4 - SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5 - SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 -%endmacro - -;*********************************************************************** -; -;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t ); -; -;*********************************************************************** -WELS_EXTERN WelsSampleSatd4x4_sse2 -align 16 -WelsSampleSatd4x4_sse2: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov ecx, [esp+16] - ;mov edx, [esp+20] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - 
SIGN_EXTENTION r3, r3d - movd xmm0, [r0] - movd xmm1, [r0+r1] - lea r0 , [r0+2*r1] - movd xmm2, [r0] - movd xmm3, [r0+r1] - punpckldq xmm0, xmm2 - punpckldq xmm1, xmm3 - - movd xmm4, [r2] - movd xmm5, [r2+r3] - lea r2 , [r2+2*r3] - movd xmm6, [r2] - movd xmm7, [r2+r3] - punpckldq xmm4, xmm6 - punpckldq xmm5, xmm7 - - pxor xmm6, xmm6 - punpcklbw xmm0, xmm6 - punpcklbw xmm1, xmm6 - punpcklbw xmm4, xmm6 - punpcklbw xmm5, xmm6 - - psubw xmm0, xmm4 - psubw xmm1, xmm5 - - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - SSE2_XSawp qdq, xmm0, xmm2, xmm3 - - movdqa xmm4, xmm0 - paddw xmm0, xmm3 - psubw xmm4, xmm3 - - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm4 - punpckhwd xmm4, xmm2 - - SSE2_XSawp dq, xmm0, xmm4, xmm3 - SSE2_XSawp qdq, xmm0, xmm3, xmm5 - - movdqa xmm7, xmm0 - paddw xmm0, xmm5 - psubw xmm7, xmm5 - - SSE2_XSawp qdq, xmm0, xmm7, xmm1 - - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - - WELS_AbsW xmm0, xmm3 - paddusw xmm6, xmm0 - WELS_AbsW xmm2, xmm4 - paddusw xmm6, xmm2 - SSE2_SumWHorizon1 xmm6, xmm4 - movd retrd, xmm6 - and retrd, 0xffff - shr retrd, 1 - LOAD_4_PARA_POP - ret - - ;*********************************************************************** - ; - ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ); - ; - ;*********************************************************************** - WELS_EXTERN WelsSampleSatd8x8_sse2 -align 16 - WelsSampleSatd8x8_sse2: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov ecx, [esp+16] - ;mov edx, [esp+20] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm6, xmm6 - pxor xmm7, xmm7 - SSE2_GetSatd8x8 - psrlw xmm6, 1 - SSE2_SumWHorizon xmm6,xmm4,xmm7 - movd retrd, xmm6 - LOAD_4_PARA_POP - ret - - ;*********************************************************************** - ; - ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ); - ; - ;*********************************************************************** - WELS_EXTERN WelsSampleSatd8x16_sse2 -align 16 - WelsSampleSatd8x16_sse2: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov ecx, [esp+16] - ;mov edx, [esp+20] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm6, xmm6 - pxor xmm7, xmm7 - - SSE2_GetSatd8x8 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSatd8x8 - - psrlw xmm6, 1 - SSE2_SumWHorizon xmm6,xmm4,xmm7 - movd retrd, xmm6 - LOAD_4_PARA_POP - ret - -;*********************************************************************** -; -;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ); -; -;*********************************************************************** -WELS_EXTERN WelsSampleSatd16x8_sse2 -align 16 -WelsSampleSatd16x8_sse2: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov ecx, [esp+16] - ;mov edx, [esp+20] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - push r0 - push r2 - pxor xmm6, xmm6 - pxor xmm7, xmm7 - - SSE2_GetSatd8x8 - - pop r2 - pop r0 - ;mov eax, [esp+8] - ;mov ecx, [esp+16] - add r0, 8 - add r2, 8 - SSE2_GetSatd8x8 - - psrlw xmm6, 1 - SSE2_SumWHorizon xmm6,xmm4,xmm7 - movd retrd, xmm6 - LOAD_4_PARA_POP - ret - -;*********************************************************************** -; -;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ); -; -;*********************************************************************** -WELS_EXTERN WelsSampleSatd16x16_sse2 -align 16 -WelsSampleSatd16x16_sse2: - ;push ebx - ;mov 
eax, [esp+8] - ;mov ebx, [esp+12] - ;mov ecx, [esp+16] - ;mov edx, [esp+20] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - push r0 - push r2 - pxor xmm6, xmm6 - pxor xmm7, xmm7 - - SSE2_GetSatd8x8 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSatd8x8 - - pop r2 - pop r0 - ;mov eax, [esp+8] - ;mov ecx, [esp+16] - add r0, 8 - add r2, 8 - - SSE2_GetSatd8x8 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSatd8x8 - - ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first. - psrlw xmm6, 1 - SSE2_SumWHorizon xmm6,xmm4,xmm7 - movd retrd, xmm6 - LOAD_4_PARA_POP - ret - -;*********************************************************************** -; -;Pixel_satd_wxh_sse2 END -; -;*********************************************************************** - -;*********************************************************************** -; -;Pixel_satd_intra_sse2 BEGIN -; -;*********************************************************************** - -%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 - pmaddubsw %1, xmm5 - movdqa %2, %1 - pmaddwd %1, xmm7 - pmaddwd %2, xmm6 - movdqa %3, %1 - punpckldq %1, %2 - punpckhdq %2, %3 - movdqa %3, %1 - punpcklqdq %1, %2 - punpckhqdq %3, %2 - paddd xmm4, %1 ;for dc - paddd xmm4, %3 ;for dc - packssdw %1, %3 - psllw %1, 2 -%endmacro -%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2 - pmaddubsw %1, xmm5 - movdqa %2, %1 - pmaddwd %1, xmm7 - pmaddwd %2, xmm6 - movdqa %3, %1 - punpckldq %1, %2 - punpckhdq %2, %3 - movdqa %3, %1 - punpcklqdq %1, %2 - punpckhqdq %3, %2 -; paddd xmm4, %1 ;for dc -; paddd xmm4, %3 ;for dc - movdqa %4, %1 - punpcklqdq %4, %3 - packssdw %1, %3 - psllw %1, 2 -%endmacro - -%macro SSE41_GetX38x4SatdDec 0 - pxor xmm7, xmm7 - movq xmm0, [eax] - movq xmm1, [eax+ebx] - lea eax, [eax+2*ebx] - movq xmm2, [eax] - movq xmm3, [eax+ebx] - lea eax, [eax+2*ebx] - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 - SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7 - SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7 - SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2 - ;doesn't need another transpose -%endmacro -%macro SSE41_GetX38x4SatdV 2 - pxor xmm0, xmm0 - pinsrw xmm0, word[esi+%2], 0 - pinsrw xmm0, word[esi+%2+8], 4 - psubsw xmm0, xmm7 - pabsw xmm0, xmm0 - paddw xmm4, xmm0 - pxor xmm0, xmm0 - pinsrw xmm0, word[esi+%2+2], 0 - pinsrw xmm0, word[esi+%2+10], 4 - psubsw xmm0, xmm1 - pabsw xmm0, xmm0 - paddw xmm4, xmm0 - pxor xmm0, xmm0 - pinsrw xmm0, word[esi+%2+4], 0 - pinsrw xmm0, word[esi+%2+12], 4 - psubsw xmm0, xmm3 - pabsw xmm0, xmm0 - paddw xmm4, xmm0 - pxor xmm0, xmm0 - pinsrw xmm0, word[esi+%2+6], 0 - pinsrw xmm0, word[esi+%2+14], 4 - psubsw xmm0, xmm2 - pabsw xmm0, xmm0 - paddw xmm4, xmm0 -%endmacro -%macro SSE41_GetX38x4SatdH 3 - movq xmm0, [esi+%3+8*%1] - punpcklqdq xmm0, xmm0 - psubsw xmm0, xmm7 - pabsw xmm0, xmm0 - paddw xmm5, xmm0 - pabsw xmm1, xmm1 - pabsw xmm2, xmm2 - pabsw xmm3, xmm3 - paddw xmm2, xmm1;for DC - paddw xmm2, xmm3;for DC - paddw xmm5, xmm2 -%endmacro -%macro SSE41_I16X16GetX38x4SatdDC 0 - pxor xmm0, xmm0 - movq2dq xmm0, mm4 - punpcklqdq xmm0, xmm0 - psubsw xmm0, xmm7 - pabsw xmm0, xmm0 - paddw xmm6, xmm0 - paddw xmm6, xmm2 -%endmacro -%macro SSE41_ChromaGetX38x4SatdDC 1 - shl %1, 4 - movdqa xmm0, [esi+32+%1] - psubsw xmm0, xmm7 - pabsw xmm0, xmm0 - paddw xmm6, xmm0 - paddw xmm6, xmm2 -%endmacro -%macro 
SSE41_I16x16GetX38x4Satd 2 - SSE41_GetX38x4SatdDec - SSE41_GetX38x4SatdV %1, %2 - SSE41_GetX38x4SatdH %1, %2, 32 - SSE41_I16X16GetX38x4SatdDC -%endmacro -%macro SSE41_ChromaGetX38x4Satd 2 - SSE41_GetX38x4SatdDec - SSE41_GetX38x4SatdV %1, %2 - SSE41_GetX38x4SatdH %1, %2, 16 - SSE41_ChromaGetX38x4SatdDC %1 -%endmacro -%macro SSE41_HSum8W 3 - pmaddwd %1, %2 - movhlps %3, %1 - paddd %1, %3 - pshuflw %3, %1,0Eh - paddd %1, %3 -%endmacro - - -%ifdef X86_32 -WELS_EXTERN WelsIntra16x16Combined3Satd_sse41 -WelsIntra16x16Combined3Satd_sse41: - push ebx - push esi - push edi - mov ecx, [esp+16] - mov edx, [esp+20] - mov eax, [esp+24] - mov ebx, [esp+28] - mov esi, [esp+40] ;temp_satd - pxor xmm4, xmm4 - movdqa xmm5, [HSumSubDB1] - movdqa xmm6, [HSumSubDW1] - movdqa xmm7, [PDW1] - sub ecx, edx - movdqu xmm0, [ecx] - movhlps xmm1, xmm0 - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 - SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 - movdqa [esi], xmm0 ;V - movdqa [esi+16], xmm1 - add ecx, edx - pinsrb xmm0, byte[ecx-1], 0 - pinsrb xmm0, byte[ecx+edx-1], 1 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 2 - pinsrb xmm0, byte[ecx+edx-1], 3 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 4 - pinsrb xmm0, byte[ecx+edx-1], 5 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 6 - pinsrb xmm0, byte[ecx+edx-1], 7 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 8 - pinsrb xmm0, byte[ecx+edx-1], 9 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 10 - pinsrb xmm0, byte[ecx+edx-1], 11 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 12 - pinsrb xmm0, byte[ecx+edx-1], 13 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 14 - pinsrb xmm0, byte[ecx+edx-1], 15 - movhlps xmm1, xmm0 - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 - SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 - movdqa [esi+32], xmm0 ;H - movdqa [esi+48], xmm1 - movd ecx, xmm4 ;dc - add ecx, 16 ;(sum+16) - shr ecx, 5 ;((sum+16)>>5) - shl ecx, 4 ; - movd mm4, ecx ; mm4 copy DC - pxor xmm4, xmm4 ;V - pxor xmm5, xmm5 ;H - pxor xmm6, xmm6 ;DC - mov ecx, 0 - mov edi, 0 -.loop16x16_get_satd: -.loopStart1: - SSE41_I16x16GetX38x4Satd ecx, edi - inc ecx - cmp ecx, 4 - jl .loopStart1 - cmp edi, 16 - je .loop16x16_get_satd_end - mov eax, [esp+24] - add eax, 8 - mov ecx, 0 - add edi, 16 - jmp .loop16x16_get_satd - .loop16x16_get_satd_end: - MMX_DW_1_2REG xmm0, xmm1 - psrlw xmm4, 1 ;/2 - psrlw xmm5, 1 ;/2 - psrlw xmm6, 1 ;/2 - SSE41_HSum8W xmm4, xmm0, xmm1 - SSE41_HSum8W xmm5, xmm0, xmm1 - SSE41_HSum8W xmm6, xmm0, xmm1 - - ; comparing order: DC H V - movd ebx, xmm6 ;DC - movd edi, xmm5 ;H - movd ecx, xmm4 ;V - mov edx, [esp+36] - shl edx, 1 - add edi, edx - add ebx, edx - mov edx, [esp+32] - cmp ebx, edi - jge near not_dc_16x16 - cmp ebx, ecx - jge near not_dc_h_16x16 - - ; for DC mode - mov dword[edx], 2;I16_PRED_DC - mov eax, ebx - jmp near return_satd_intra_16x16_x3 -not_dc_16x16: - ; for H mode - cmp edi, ecx - jge near not_dc_h_16x16 - mov dword[edx], 1;I16_PRED_H - mov eax, edi - jmp near return_satd_intra_16x16_x3 -not_dc_h_16x16: - ; for V mode - mov dword[edx], 0;I16_PRED_V - mov eax, ecx -return_satd_intra_16x16_x3: - WELSEMMS - pop edi - pop esi - pop ebx -ret - -%macro SSE41_ChromaGetX38x8Satd 0 - movdqa xmm5, [HSumSubDB1] - movdqa xmm6, [HSumSubDW1] - movdqa xmm7, [PDW1] - sub ecx, edx - movq xmm0, [ecx] - punpcklqdq xmm0, xmm0 - SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4 - movdqa [esi], xmm0 ;V - add ecx, edx - pinsrb xmm0, byte[ecx-1], 0 - pinsrb xmm0, 
byte[ecx+edx-1], 1 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 2 - pinsrb xmm0, byte[ecx+edx-1], 3 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 4 - pinsrb xmm0, byte[ecx+edx-1], 5 - lea ecx, [ecx+2*edx] - pinsrb xmm0, byte[ecx-1], 6 - pinsrb xmm0, byte[ecx+edx-1], 7 - punpcklqdq xmm0, xmm0 - SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1 - movdqa [esi+16], xmm0 ;H -;(sum+2)>>2 - movdqa xmm6, [PDQ2] - movdqa xmm5, xmm4 - punpckhqdq xmm5, xmm1 - paddd xmm5, xmm6 - psrld xmm5, 2 -;(sum1+sum2+4)>>3 - paddd xmm6, xmm6 - paddd xmm4, xmm1 - paddd xmm4, xmm6 - psrld xmm4, 3 -;satd *16 - pslld xmm5, 4 - pslld xmm4, 4 -;temp satd - movdqa xmm6, xmm4 - punpcklqdq xmm4, xmm5 - psllq xmm4, 32 - psrlq xmm4, 32 - movdqa [esi+32], xmm4 - punpckhqdq xmm5, xmm6 - psllq xmm5, 32 - psrlq xmm5, 32 - movdqa [esi+48], xmm5 - - pxor xmm4, xmm4 ;V - pxor xmm5, xmm5 ;H - pxor xmm6, xmm6 ;DC - mov ecx, 0 -loop_chroma_satdx3_cb_cr: - SSE41_ChromaGetX38x4Satd ecx, 0 - inc ecx - cmp ecx, 2 - jl loop_chroma_satdx3_cb_cr -%endmacro - -%macro SSEReg2MMX 3 - movdq2q %2, %1 - movhlps %1, %1 - movdq2q %3, %1 -%endmacro -%macro MMXReg2SSE 4 - movq2dq %1, %3 - movq2dq %2, %4 - punpcklqdq %1, %2 -%endmacro -;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41 - -WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41 -WelsIntraChroma8x8Combined3Satd_sse41: - push ebx - push esi - push edi - mov ecx, [esp+16] - mov edx, [esp+20] - mov eax, [esp+24] - mov ebx, [esp+28] - mov esi, [esp+40] ;temp_satd - xor edi, edi -loop_chroma_satdx3: - SSE41_ChromaGetX38x8Satd - cmp edi, 1 - je loop_chroma_satdx3end - inc edi - SSEReg2MMX xmm4, mm0,mm1 - SSEReg2MMX xmm5, mm2,mm3 - SSEReg2MMX xmm6, mm5,mm6 - mov ecx, [esp+44] - mov eax, [esp+48] - jmp loop_chroma_satdx3 -loop_chroma_satdx3end: - MMXReg2SSE xmm0, xmm3, mm0, mm1 - MMXReg2SSE xmm1, xmm3, mm2, mm3 - MMXReg2SSE xmm2, xmm3, mm5, mm6 - - paddw xmm4, xmm0 - paddw xmm5, xmm1 - paddw xmm6, xmm2 - - MMX_DW_1_2REG xmm0, xmm1 - psrlw xmm4, 1 ;/2 - psrlw xmm5, 1 ;/2 - psrlw xmm6, 1 ;/2 - SSE41_HSum8W xmm4, xmm0, xmm1 - SSE41_HSum8W xmm5, xmm0, xmm1 - SSE41_HSum8W xmm6, xmm0, xmm1 - ; comparing order: DC H V - movd ebx, xmm6 ;DC - movd edi, xmm5 ;H - movd ecx, xmm4 ;V - mov edx, [esp+36] - shl edx, 1 - add edi, edx - add ecx, edx - mov edx, [esp+32] - cmp ebx, edi - jge near not_dc_8x8 - cmp ebx, ecx - jge near not_dc_h_8x8 - - ; for DC mode - mov dword[edx], 0;I8_PRED_DC - mov eax, ebx - jmp near return_satd_intra_8x8_x3 -not_dc_8x8: - ; for H mode - cmp edi, ecx - jge near not_dc_h_8x8 - mov dword[edx], 1;I8_PRED_H - mov eax, edi - jmp near return_satd_intra_8x8_x3 -not_dc_h_8x8: - ; for V mode - mov dword[edx], 2;I8_PRED_V - mov eax, ecx -return_satd_intra_8x8_x3: - WELSEMMS - pop edi - pop esi - pop ebx -ret - - -;*********************************************************************** -; -;Pixel_satd_intra_sse2 END -; -;*********************************************************************** -%macro SSSE3_Get16BSadHVDC 2 - movd xmm6,%1 - pshufb xmm6,xmm1 - movdqa %1, xmm6 - movdqa xmm0,%2 - psadbw xmm0,xmm7 - paddw xmm4,xmm0 - movdqa xmm0,%2 - psadbw xmm0,xmm5 - paddw xmm2,xmm0 - psadbw xmm6,%2 - paddw xmm3,xmm6 -%endmacro -%macro WelsAddDCValue 4 - movzx %2, byte %1 - mov %3, %2 - add %4, %2 -%endmacro - -;*********************************************************************** -; -;Pixel_sad_intra_ssse3 BEGIN -; -;*********************************************************************** -WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3 -WelsIntra16x16Combined3Sad_ssse3: - 
push ebx - push esi - push edi - mov ecx, [esp+16] - mov edx, [esp+20] - mov edi, [esp+40] ;temp_sad - sub ecx, edx - movdqa xmm5,[ecx] - pxor xmm0,xmm0 - psadbw xmm0,xmm5 - movhlps xmm1,xmm0 - paddw xmm0,xmm1 - movd eax,xmm0 - - add ecx,edx - lea ebx, [edx+2*edx] - WelsAddDCValue [ecx-1 ], esi, [edi ], eax - WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax - WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax - WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax - lea ecx, [ecx+4*edx] - add edi, 64 - WelsAddDCValue [ecx-1 ], esi, [edi ], eax - WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax - WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax - WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax - lea ecx, [ecx+4*edx] - add edi, 64 - WelsAddDCValue [ecx-1 ], esi, [edi ], eax - WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax - WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax - WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax - lea ecx, [ecx+4*edx] - add edi, 64 - WelsAddDCValue [ecx-1 ], esi, [edi ], eax - WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax - WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax - WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax - sub edi, 192 - add eax,10h - shr eax,5 - movd xmm7,eax - pxor xmm1,xmm1 - pshufb xmm7,xmm1 - pxor xmm4,xmm4 - pxor xmm3,xmm3 - pxor xmm2,xmm2 -;sad begin - mov eax, [esp+24] - mov ebx, [esp+28] - lea esi, [ebx+2*ebx] - SSSE3_Get16BSadHVDC [edi], [eax] - SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] - SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] - SSSE3_Get16BSadHVDC [edi+48], [eax+esi] - add edi, 64 - lea eax, [eax+4*ebx] - SSSE3_Get16BSadHVDC [edi], [eax] - SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] - SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] - SSSE3_Get16BSadHVDC [edi+48], [eax+esi] - add edi, 64 - lea eax, [eax+4*ebx] - SSSE3_Get16BSadHVDC [edi], [eax] - SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] - SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] - SSSE3_Get16BSadHVDC [edi+48], [eax+esi] - add edi, 64 - lea eax, [eax+4*ebx] - SSSE3_Get16BSadHVDC [edi], [eax] - SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] - SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] - SSSE3_Get16BSadHVDC [edi+48], [eax+esi] - - pslldq xmm3,4 - por xmm3,xmm2 - movhlps xmm1,xmm3 - paddw xmm3,xmm1 - movhlps xmm0,xmm4 - paddw xmm4,xmm0 -; comparing order: DC H V - movd ebx, xmm4 ;DC - movd ecx, xmm3 ;V - psrldq xmm3, 4 - movd esi, xmm3 ;H - mov eax, [esp+36] ;lamda - shl eax, 1 - add esi, eax - add ebx, eax - mov edx, [esp+32] - cmp ebx, esi - jge near not_dc_16x16_sad - cmp ebx, ecx - jge near not_dc_h_16x16_sad - ; for DC mode - mov dword[edx], 2;I16_PRED_DC - mov eax, ebx - sub edi, 192 -%assign x 0 -%rep 16 - movdqa [edi+16*x], xmm7 -%assign x x+1 -%endrep - jmp near return_sad_intra_16x16_x3 -not_dc_16x16_sad: - ; for H mode - cmp esi, ecx - jge near not_dc_h_16x16_sad - mov dword[edx], 1;I16_PRED_H - mov eax, esi - jmp near return_sad_intra_16x16_x3 -not_dc_h_16x16_sad: - ; for V mode - mov dword[edx], 0;I16_PRED_V - mov eax, ecx - sub edi, 192 -%assign x 0 -%rep 16 - movdqa [edi+16*x], xmm5 -%assign x x+1 -%endrep -return_sad_intra_16x16_x3: - pop edi - pop esi - pop ebx - ret -%endif -;*********************************************************************** -; -;Pixel_sad_intra_ssse3 END -; -;*********************************************************************** -;*********************************************************************** -; -;Pixel_satd_wxh_sse41 BEGIN -; -;*********************************************************************** - -;SSE4.1 -%macro SSE41_GetSatd8x4 0 - movq xmm0, [r0] - punpcklqdq 
xmm0, xmm0 - pmaddubsw xmm0, xmm7 - movq xmm1, [r0+r1] - punpcklqdq xmm1, xmm1 - pmaddubsw xmm1, xmm7 - movq xmm2, [r2] - punpcklqdq xmm2, xmm2 - pmaddubsw xmm2, xmm7 - movq xmm3, [r2+r3] - punpcklqdq xmm3, xmm3 - pmaddubsw xmm3, xmm7 - psubsw xmm0, xmm2 - psubsw xmm1, xmm3 - movq xmm2, [r0+2*r1] - punpcklqdq xmm2, xmm2 - pmaddubsw xmm2, xmm7 - movq xmm3, [r0+r4] - punpcklqdq xmm3, xmm3 - pmaddubsw xmm3, xmm7 - movq xmm4, [r2+2*r3] - punpcklqdq xmm4, xmm4 - pmaddubsw xmm4, xmm7 - movq xmm5, [r2+r5] - punpcklqdq xmm5, xmm5 - pmaddubsw xmm5, xmm7 - psubsw xmm2, xmm4 - psubsw xmm3, xmm5 - SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4 - pabsw xmm0, xmm0 - pabsw xmm2, xmm2 - pabsw xmm1, xmm1 - pabsw xmm3, xmm3 - movdqa xmm4, xmm3 - pblendw xmm3, xmm1, 0xAA - pslld xmm1, 16 - psrld xmm4, 16 - por xmm1, xmm4 - pmaxuw xmm1, xmm3 - paddw xmm6, xmm1 - movdqa xmm4, xmm0 - pblendw xmm0, xmm2, 0xAA - pslld xmm2, 16 - psrld xmm4, 16 - por xmm2, xmm4 - pmaxuw xmm0, xmm2 - paddw xmm6, xmm0 -%endmacro - -%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE - MMX_DW_1_2REG %3, %4 - pmaddwd %2, %3 - movhlps %4, %2 - paddd %2, %4 - pshuflw %4, %2,0Eh - paddd %2, %4 - movd %1, %2 -%endmacro -;*********************************************************************** -; -;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t ); -; -;*********************************************************************** -WELS_EXTERN WelsSampleSatd4x4_sse41 -WelsSampleSatd4x4_sse41: - ;push ebx - ;mov eax,[esp+8] - ;mov ebx,[esp+12] - ;mov ecx,[esp+16] - ;mov edx,[esp+20] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - movdqa xmm4,[HSwapSumSubDB1] - movd xmm2,[r2] - movd xmm5,[r2+r3] - shufps xmm2,xmm5,0 - movd xmm3,[r2+r3*2] - lea r2, [r3*2+r2] - movd xmm5,[r2+r3] - shufps xmm3,xmm5,0 - movd xmm0,[r0] - movd xmm5,[r0+r1] - shufps xmm0,xmm5,0 - movd xmm1,[r0+r1*2] - lea r0, [r1*2+r0] - movd xmm5,[r0+r1] - shufps xmm1,xmm5,0 - pmaddubsw xmm0,xmm4 - pmaddubsw xmm1,xmm4 - pmaddubsw xmm2,xmm4 - pmaddubsw xmm3,xmm4 - psubw xmm0,xmm2 - psubw xmm1,xmm3 - movdqa xmm2,xmm0 - paddw xmm0,xmm1 - psubw xmm1,xmm2 - movdqa xmm2,xmm0 - punpcklqdq xmm0,xmm1 - punpckhqdq xmm2,xmm1 - movdqa xmm1,xmm0 - paddw xmm0,xmm2 - psubw xmm2,xmm1 - movdqa xmm1,xmm0 - pblendw xmm0,xmm2,0AAh - pslld xmm2,16 - psrld xmm1,16 - por xmm2,xmm1 - pabsw xmm0,xmm0 - pabsw xmm2,xmm2 - pmaxsw xmm0,xmm2 - SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7 - LOAD_4_PARA_POP - ret - -;*********************************************************************** -; -;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); -; -;*********************************************************************** -WELS_EXTERN WelsSampleSatd8x8_sse41 -align 16 -WelsSampleSatd8x8_sse41: - ;push ebx - ;push esi - ;push edi - ;mov eax, [esp+16] - ;mov ebx, [esp+20] - ;mov ecx, [esp+24] - ;mov edx, [esp+28] -%ifdef X86_32 - push r4 - push r5 -%endif - %assign push_num 2 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - movdqa xmm7, [HSumSubDB1] - lea r4, [r1+r1*2] - lea r5, [r3+r3*2] - pxor xmm6, xmm6 - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE41_GetSatd8x4 - SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 - LOAD_4_PARA_POP -%ifdef X86_32 - pop r5 - pop r4 -%endif - ret - -;*********************************************************************** -; -;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); -; -;*********************************************************************** 
-WELS_EXTERN WelsSampleSatd8x16_sse41 -align 16 -WelsSampleSatd8x16_sse41: - ;push ebx - ;push esi - ;push edi - ;push ebp - ;%define pushsize 16 - ;mov eax, [esp+pushsize+4] - ;mov ebx, [esp+pushsize+8] - ;mov ecx, [esp+pushsize+12] - ;mov edx, [esp+pushsize+16] -%ifdef X86_32 - push r4 - push r5 - push r6 -%endif - %assign push_num 3 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - movdqa xmm7, [HSumSubDB1] - lea r4, [r1+r1*2] - lea r5, [r3+r3*2] - pxor xmm6, xmm6 - mov r6, 0 -loop_get_satd_8x16: - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - inc r6 - cmp r6, 4 - jl loop_get_satd_8x16 - SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 - LOAD_4_PARA_POP -%ifdef X86_32 - pop r6 - pop r5 - pop r4 -%endif - ret - -;*********************************************************************** -; -;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); -; -;*********************************************************************** -WELS_EXTERN WelsSampleSatd16x8_sse41 -align 16 -WelsSampleSatd16x8_sse41: - ;push ebx - ;push esi - ;push edi - ;mov eax, [esp+16] - ;mov ebx, [esp+20] - ;mov ecx, [esp+24] - ;mov edx, [esp+28] -%ifdef X86_32 - push r4 - push r5 -%endif - %assign push_num 2 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - push r0 - push r2 - - movdqa xmm7, [HSumSubDB1] - lea r4, [r1+r1*2] - lea r5, [r3+r3*2] - pxor xmm6, xmm6 - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE41_GetSatd8x4 - - pop r2 - pop r0 - ;mov eax, [esp+16] - ;mov ecx, [esp+24] - add r0, 8 - add r2, 8 - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE41_GetSatd8x4 - SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 - LOAD_4_PARA_POP -%ifdef X86_32 - pop r5 - pop r4 -%endif - ret - -;*********************************************************************** -; -;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); -; -;*********************************************************************** - -WELS_EXTERN WelsSampleSatd16x16_sse41 -align 16 -WelsSampleSatd16x16_sse41: - ;push ebx - ;push esi - ;push edi - ;push ebp - ;%define pushsize 16 - ;mov eax, [esp+pushsize+4] - ;mov ebx, [esp+pushsize+8] - ;mov ecx, [esp+pushsize+12] - ;mov edx, [esp+pushsize+16] -%ifdef X86_32 - push r4 - push r5 - push r6 -%endif - %assign push_num 3 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - - push r0 - push r2 - - movdqa xmm7, [HSumSubDB1] - lea r4, [r1+r1*2] - lea r5, [r3+r3*2] - pxor xmm6, xmm6 - mov r6, 0 -loop_get_satd_16x16_left: - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - inc r6 - cmp r6, 4 - jl loop_get_satd_16x16_left - - pop r2 - pop r0 - ;mov eax, [esp+pushsize+4] - ;mov ecx, [esp+pushsize+12] - add r0, 8 - add r2, 8 - mov r6, 0 -loop_get_satd_16x16_right: - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - inc r6 - cmp r6, 4 - jl loop_get_satd_16x16_right - SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 - ;%undef pushsize - LOAD_4_PARA_POP -%ifdef X86_32 - pop r6 - pop r5 - pop r4 -%endif - ret - -;*********************************************************************** -; -;Pixel_satd_wxh_sse41 END -; -;*********************************************************************** - -;*********************************************************************** -; -;Pixel_sad_wxh_sse2 BEGIN -; -;*********************************************************************** - -%macro SSE2_GetSad2x16 0 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqu xmm1, [r2] - MOVDQ xmm2, [r0];[eax] must aligned 16 - 
psadbw xmm1, xmm2 - paddw xmm0, xmm1 - movdqu xmm1, [r2+r3] - MOVDQ xmm2, [r0+r1] - psadbw xmm1, xmm2 - paddw xmm0, xmm1 -%endmacro - - -%macro SSE2_GetSad4x16 0 - movdqu xmm0, [r2] - MOVDQ xmm2, [r0] - psadbw xmm0, xmm2 - paddw xmm7, xmm0 - movdqu xmm1, [r2+r3] - MOVDQ xmm2, [r0+r1] - psadbw xmm1, xmm2 - paddw xmm7, xmm1 - movdqu xmm1, [r2+2*r3] - MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16 - psadbw xmm1, xmm2 - paddw xmm7, xmm1 - movdqu xmm1, [r2+r5] - MOVDQ xmm2, [r0+r4] - psadbw xmm1, xmm2 - paddw xmm7, xmm1 -%endmacro - - -%macro SSE2_GetSad8x4 0 - movq xmm0, [r0] - movq xmm1, [r0+r1] - lea r0, [r0+2*r1] - movhps xmm0, [r0] - movhps xmm1, [r0+r1] - - movq xmm2, [r2] - movq xmm3, [r2+r3] - lea r2, [r2+2*r3] - movhps xmm2, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm2 - psadbw xmm1, xmm3 - paddw xmm6, xmm0 - paddw xmm6, xmm1 -%endmacro - -;*********************************************************************** -; -;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ) -;First parameter can align to 16 bytes, -;In wels, the third parameter can't align to 16 bytes. -; -;*********************************************************************** -WELS_EXTERN WelsSampleSad16x16_sse2 -align 16 -WelsSampleSad16x16_sse2: - ;push ebx - ;push edi - ;push esi - ;%define _STACK_SIZE 12 - ;mov eax, [esp+_STACK_SIZE+4 ] - ;mov ebx, [esp+_STACK_SIZE+8 ] - ;mov ecx, [esp+_STACK_SIZE+12] - ;mov edx, [esp+_STACK_SIZE+16] -%ifdef X86_32 - push r4 - push r5 -%endif - - %assign push_num 2 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - lea r4, [3*r1] - lea r5, [3*r3] - - pxor xmm7, xmm7 - SSE2_GetSad4x16 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE2_GetSad4x16 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE2_GetSad4x16 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE2_GetSad4x16 - movhlps xmm0, xmm7 - paddw xmm0, xmm7 - movd retrd, xmm0 - LOAD_4_PARA_POP -%ifdef X86_32 - pop r5 - pop r4 -%endif - ret - -;*********************************************************************** -; -;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ) -;First parameter can align to 16 bytes, -;In wels, the third parameter can't align to 16 bytes. 
-; -;*********************************************************************** -WELS_EXTERN WelsSampleSad16x8_sse2 -align 16 -WelsSampleSad16x8_sse2: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov ecx, [esp+16] - ;mov edx, [esp+20] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - movdqu xmm0, [r2] - MOVDQ xmm2, [r0] - psadbw xmm0, xmm2 - movdqu xmm1, [r2+r3] - MOVDQ xmm2, [r0+r1] - psadbw xmm1, xmm2 - paddw xmm0, xmm1 - - SSE2_GetSad2x16 - SSE2_GetSad2x16 - SSE2_GetSad2x16 - - movhlps xmm1, xmm0 - paddw xmm0, xmm1 - movd retrd, xmm0 - LOAD_4_PARA_POP - ret - - - -WELS_EXTERN WelsSampleSad8x16_sse2 -WelsSampleSad8x16_sse2: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov ecx, [esp+16] - ;mov edx, [esp+20] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm6, xmm6 - - SSE2_GetSad8x4 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSad8x4 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSad8x4 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSad8x4 - - movhlps xmm0, xmm6 - paddw xmm0, xmm6 - movd retrd, xmm0 - LOAD_4_PARA_POP - ret - - -%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline -and %1, 0x1f|(%3>>1) -cmp %1, (32-%2)|(%3>>1) -%endmacro - -WELS_EXTERN WelsSampleSad8x8_sse21 -WelsSampleSad8x8_sse21: - ;mov ecx, [esp+12] - ;mov edx, ecx - ;CACHE_SPLIT_CHECK edx, 8, 64 - ;jle near .pixel_sad_8x8_nsplit - ;push ebx - ;push edi - ;mov eax, [esp+12] - ;mov ebx, [esp+16] - - %assign push_num 0 - mov r2, arg3 - push r2 - CACHE_SPLIT_CHECK r2, 8, 64 - jle near .pixel_sad_8x8_nsplit - pop r2 -%ifdef X86_32 - push r3 - push r4 - push r5 -%endif - %assign push_num 3 - mov r0, arg1 - mov r1, arg2 - SIGN_EXTENTION r1, r1d - pxor xmm7, xmm7 - - ;ecx r2, edx r4, edi r5 - - mov r5, r2 - and r5, 0x07 - sub r2, r5 - mov r4, 8 - sub r4, r5 - - shl r5, 3 - shl r4, 3 - movd xmm5, r5d - movd xmm6, r4d - mov r5, 8 - add r5, r2 - mov r3, arg4 - SIGN_EXTENTION r3, r3d - movq xmm0, [r0] - movhps xmm0, [r0+r1] - - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm7, xmm0 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm7, xmm0 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm7, xmm0 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm7, xmm0 - - movhlps xmm0, xmm7 - paddw xmm0, xmm7 - movd retrd, xmm0 -%ifdef X86_32 - pop r5 - pop r4 - pop r3 -%endif - jmp .return - -.pixel_sad_8x8_nsplit: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov edx, [esp+20] - - pop r2 - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm6, xmm6 - SSE2_GetSad8x4 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSad8x4 - movhlps xmm0, xmm6 - paddw xmm0, xmm6 - movd retrd, xmm0 - 
LOAD_4_PARA_POP -.return: - ret - - -;*********************************************************************** -; -;Pixel_sad_wxh_sse2 END -; -;*********************************************************************** - - -;*********************************************************************** -; -;Pixel_sad_4_wxh_sse2 BEGIN -; -;*********************************************************************** - - -%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address - psadbw %1, %4 - paddw xmm5, %1 - psadbw %4, %3 - paddw xmm4, %4 - movdqu %4, [%5-1] - psadbw %4, %2 - paddw xmm6, %4 - movdqu %4, [%5+1] - psadbw %4, %2 - paddw xmm7, %4 -%endmacro -WELS_EXTERN WelsSampleSadFour16x16_sse2 -WelsSampleSadFour16x16_sse2: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov ecx, [esp+16] - ;mov edx, [esp+20] - - %assign push_num 0 - LOAD_5_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref - pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref - pxor xmm6, xmm6 ;sad pRefMb-1 - pxor xmm7, xmm7 ;sad pRefMb+1 - movdqa xmm0, [r0] - sub r2, r3 - movdqu xmm3, [r2] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - psadbw xmm3, xmm1 - paddw xmm4, xmm3 - - movdqu xmm2, [r2+r3-1] - psadbw xmm2, xmm0 - paddw xmm6, xmm2 - - movdqu xmm3, [r2+r3+1] - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm2, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 - movdqa xmm0, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm1, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 - movdqa xmm2, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm0, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm2, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 - movdqa xmm0, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm1, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 - movdqa xmm2, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm0, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm2, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 - movdqa xmm0, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 - lea r2, [r2+2*r3] - movdqu xmm3, [r2] - psadbw xmm2, xmm3 - paddw xmm5, xmm2 - - movdqu xmm2, [r2-1] - psadbw xmm2, xmm0 - paddw xmm6, xmm2 - - movdqu xmm3, [r2+1] - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movdqu xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - ;mov ecx, [esp+24] - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - movhlps xmm0, xmm5 - paddw xmm5, xmm0 - movhlps xmm0, xmm6 - paddw xmm6, xmm0 - movhlps xmm0, xmm7 - paddw xmm7, xmm0 - punpckldq xmm4, xmm5 - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 - movdqa [r4],xmm4 - LOAD_5_PARA_POP - ret - - -WELS_EXTERN WelsSampleSadFour16x8_sse2 -WelsSampleSadFour16x8_sse2: - ;push 
ebx - ;push edi - ;mov eax, [esp+12] - ;mov ebx, [esp+16] - ;mov edi, [esp+20] - ;mov edx, [esp+24] - - %assign push_num 0 - LOAD_5_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref - pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref - pxor xmm6, xmm6 ;sad pRefMb-1 - pxor xmm7, xmm7 ;sad pRefMb+1 - movdqa xmm0, [r0] - sub r2, r3 - movdqu xmm3, [r2] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - psadbw xmm3, xmm1 - paddw xmm4, xmm3 - - movdqu xmm2, [r2+r3-1] - psadbw xmm2, xmm0 - paddw xmm6, xmm2 - - movdqu xmm3, [r2+r3+1] - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm2, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 - movdqa xmm0, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm1, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 - movdqa xmm2, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm0, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 - lea r2, [r2+2*r3] - movdqu xmm3, [r2] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movdqu xmm0, [r2-1] - psadbw xmm0, xmm1 - paddw xmm6, xmm0 - - movdqu xmm3, [r2+1] - psadbw xmm3, xmm1 - paddw xmm7, xmm3 - - movdqu xmm3, [r2+r3] - psadbw xmm1, xmm3 - paddw xmm5, xmm1 - - ;mov edi, [esp+28] - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - movhlps xmm0, xmm5 - paddw xmm5, xmm0 - movhlps xmm0, xmm6 - paddw xmm6, xmm0 - movhlps xmm0, xmm7 - paddw xmm7, xmm0 - punpckldq xmm4, xmm5 - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 - movdqa [r4],xmm4 - LOAD_5_PARA_POP - ret - -WELS_EXTERN WelsSampleSadFour8x16_sse2 -WelsSampleSadFour8x16_sse2: - ;push ebx - ;push edi - ;mov eax, [esp+12] - ;mov ebx, [esp+16] - ;mov edi, [esp+20] - ;mov edx, [esp+24] - - %assign push_num 0 - LOAD_5_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref - pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref - pxor xmm6, xmm6 ;sad pRefMb-1 - pxor xmm7, xmm7 ;sad pRefMb+1 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - sub r2, r3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, 
xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - ;mov edi, [esp+28] - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - movhlps xmm0, xmm5 - paddw xmm5, xmm0 - movhlps xmm0, xmm6 - paddw xmm6, xmm0 - movhlps xmm0, xmm7 - paddw xmm7, xmm0 - punpckldq xmm4, xmm5 - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 - movdqa [r4],xmm4 - LOAD_5_PARA_POP - ret - - -WELS_EXTERN WelsSampleSadFour8x8_sse2 -WelsSampleSadFour8x8_sse2: - ;push ebx - ;push edi - ;mov eax, [esp+12] - ;mov ebx, [esp+16] - ;mov edi, [esp+20] - ;mov edx, [esp+24] - - %assign push_num 0 - LOAD_5_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref - pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref - pxor xmm6, xmm6 ;sad pRefMb-1 - pxor xmm7, xmm7 ;sad pRefMb+1 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - sub r2, r3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - 
psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 - - - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 - - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 - - ;mov edi, [esp+28] - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - movhlps xmm0, xmm5 - paddw xmm5, xmm0 - movhlps xmm0, xmm6 - paddw xmm6, xmm0 - movhlps xmm0, xmm7 - paddw xmm7, xmm0 - punpckldq xmm4, xmm5 - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 - movdqa [r4],xmm4 - LOAD_5_PARA_POP - ret - -WELS_EXTERN WelsSampleSadFour4x4_sse2 -WelsSampleSadFour4x4_sse2: - ;push ebx - ;push edi - ;mov eax, [esp+12] - ;mov ebx, [esp+16] - ;mov edi, [esp+20] - ;mov edx, [esp+24] - - %assign push_num 0 - LOAD_5_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - movd xmm0, [r0] - movd xmm1, [r0+r1] - lea r0, [r0+2*r1] - movd xmm2, [r0] - movd xmm3, [r0+r1] - punpckldq xmm0, xmm1 - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - sub r2, r3 - movd xmm1, [r2] - movd xmm2, [r2+r3] - punpckldq xmm1, xmm2 - movd xmm2, [r2+r3-1] - movd xmm3, [r2+r3+1] - - lea r2, [r2+2*r3] - - movd xmm4, [r2] - movd xmm5, [r2-1] - punpckldq xmm2, xmm5 - movd xmm5, [r2+1] - punpckldq xmm3, xmm5 - - movd xmm5, [r2+r3] - punpckldq xmm4, xmm5 - - punpcklqdq xmm1, xmm4 ;-L - - movd xmm5, [r2+r3-1] - movd xmm6, [r2+r3+1] - - lea r2, [r2+2*r3] - movd xmm7, [r2-1] - punpckldq xmm5, xmm7 - punpcklqdq xmm2, xmm5 ;-1 - movd xmm7, [r2+1] - punpckldq xmm6, xmm7 - punpcklqdq xmm3, xmm6 ;+1 - movd xmm6, [r2] - movd xmm7, [r2+r3] - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 ;+L - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - psadbw xmm4, xmm0 - - movhlps xmm0, xmm1 - paddw xmm1, xmm0 - movhlps xmm0, xmm2 - paddw xmm2, xmm0 - movhlps xmm0, xmm3 - paddw xmm3, xmm0 - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - ;mov edi, [esp+28] - punpckldq xmm1, xmm4 - punpckldq xmm2, xmm3 - punpcklqdq xmm1, xmm2 - movdqa [r4],xmm1 - LOAD_5_PARA_POP - ret - -;*********************************************************************** -; -;Pixel_sad_4_wxh_sse2 END -; -;*********************************************************************** - -WELS_EXTERN WelsSampleSad4x4_mmx - -align 16 -;*********************************************************************** -; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t ) -;*********************************************************************** -WelsSampleSad4x4_mmx: - ;push ebx - ;%define pushsize 4 - ;%define pix1address esp+pushsize+4 - ;%define pix1stride esp+pushsize+8 - ;%define pix2address esp+pushsize+12 - ;%define pix2stride esp+pushsize+16 - ;mov eax, [pix1address] - ;mov ebx, [pix1stride ] - ;mov ecx, [pix2address] - ;mov edx, [pix2stride ] - - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - movd mm0, [r0] - movd mm1, [r0+r1] - punpckldq mm0, mm1 - - movd mm3, [r2] - movd mm4, [r2+r3] - punpckldq mm3, mm4 - psadbw mm0, mm3 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - - movd mm1, [r0] - movd mm2, [r0+r1] - punpckldq mm1, mm2 - - movd mm3, [r2] - movd mm4, [r2+r3] - punpckldq mm3, mm4 - psadbw mm1, mm3 - paddw mm0, mm1 - - movd retrd, mm0 - - WELSEMMS - LOAD_4_PARA_POP - ret +;*! 
+;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* satd_sad.asm +;* +;* Abstract +;* WelsSampleSatd4x4_sse2 +;* WelsSampleSatd8x8_sse2 +;* WelsSampleSatd16x8_sse2 +;* WelsSampleSatd8x16_sse2 +;* WelsSampleSatd16x16_sse2 +;* +;* WelsSampleSad16x8_sse2 +;* WelsSampleSad16x16_sse2 +;* +;* History +;* 8/5/2009 Created +;* 24/9/2009 modified +;* +;* +;*************************************************************************/ + +%include "asm_inc.asm" + +;*********************************************************************** +; Data +;*********************************************************************** +SECTION .rodata align=16 + +align 16 +HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1 +align 16 +HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1 +align 16 +PDW1: dw 1,1,1,1,1,1,1,1 +align 16 +PDQ2: dw 2,0,0,0,2,0,0,0 +align 16 +HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 + +;*********************************************************************** +; Code +;*********************************************************************** +SECTION .text + +;*********************************************************************** +; +;Pixel_satd_wxh_sse2 BEGIN +; +;*********************************************************************** +%macro MMX_DW_1_2REG 2 + pxor %1, %1 + pcmpeqw %2, %2 + psubw %1, %2 +%endmacro + +%macro SSE2_SumWHorizon1 2 + movdqa %2, %1 + psrldq %2, 8 + paddusw %1, %2 + movdqa %2, %1 + psrldq %2, 4 + paddusw %1, %2 + movdqa %2, %1 + psrldq %2, 2 + paddusw %1, %2 +%endmacro + +%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3 + SSE2_SumSub %1, %2, %5 + SSE2_SumSub %3, %4, %5 + SSE2_SumSub %2, %4, %5 + SSE2_SumSub %1, %3, %5 +%endmacro + +%macro SSE2_SumAbs4 7 + WELS_AbsW %1, %3 + WELS_AbsW %2, %3 + WELS_AbsW %4, %6 + WELS_AbsW %5, %6 + paddusw %1, %2 + paddusw %4, %5 + paddusw %7, %1 + paddusw %7, %4 +%endmacro + +%macro SSE2_SumWHorizon 3 + movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 + paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 + punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 + movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 + paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 + pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 + 
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 +%endmacro + +%macro SSE2_GetSatd8x8 0 + SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] + SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2] + SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3] + + SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4 + SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4 + SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5 + SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] + SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2] + SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3] + + SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4 + SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4 + SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5 + SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 +%endmacro + +;*********************************************************************** +; +;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t ); +; +;*********************************************************************** +WELS_EXTERN WelsSampleSatd4x4_sse2 +align 16 +WelsSampleSatd4x4_sse2: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov ecx, [esp+16] + ;mov edx, [esp+20] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + movd xmm0, [r0] + movd xmm1, [r0+r1] + lea r0 , [r0+2*r1] + movd xmm2, [r0] + movd xmm3, [r0+r1] + punpckldq xmm0, xmm2 + punpckldq xmm1, xmm3 + + movd xmm4, [r2] + movd xmm5, [r2+r3] + lea r2 , [r2+2*r3] + movd xmm6, [r2] + movd xmm7, [r2+r3] + punpckldq xmm4, xmm6 + punpckldq xmm5, xmm7 + + pxor xmm6, xmm6 + punpcklbw xmm0, xmm6 + punpcklbw xmm1, xmm6 + punpcklbw xmm4, xmm6 + punpcklbw xmm5, xmm6 + + psubw xmm0, xmm4 + psubw xmm1, xmm5 + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + SSE2_XSawp qdq, xmm0, xmm2, xmm3 + + movdqa xmm4, xmm0 + paddw xmm0, xmm3 + psubw xmm4, xmm3 + + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm4 + punpckhwd xmm4, xmm2 + + SSE2_XSawp dq, xmm0, xmm4, xmm3 + SSE2_XSawp qdq, xmm0, xmm3, xmm5 + + movdqa xmm7, xmm0 + paddw xmm0, xmm5 + psubw xmm7, xmm5 + + SSE2_XSawp qdq, xmm0, xmm7, xmm1 + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + + WELS_AbsW xmm0, xmm3 + paddusw xmm6, xmm0 + WELS_AbsW xmm2, xmm4 + paddusw xmm6, xmm2 + SSE2_SumWHorizon1 xmm6, xmm4 + movd retrd, xmm6 + and retrd, 0xffff + shr retrd, 1 + LOAD_4_PARA_POP + ret + + ;*********************************************************************** + ; + ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ); + ; + ;*********************************************************************** + WELS_EXTERN WelsSampleSatd8x8_sse2 +align 16 + WelsSampleSatd8x8_sse2: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov ecx, [esp+16] + ;mov edx, [esp+20] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + pxor xmm6, xmm6 + pxor xmm7, xmm7 + SSE2_GetSatd8x8 + psrlw xmm6, 1 + SSE2_SumWHorizon xmm6,xmm4,xmm7 + movd retrd, xmm6 + LOAD_4_PARA_POP + ret + + ;*********************************************************************** + ; + ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ); + ; + ;*********************************************************************** + WELS_EXTERN WelsSampleSatd8x16_sse2 +align 16 + WelsSampleSatd8x16_sse2: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov ecx, [esp+16] 
+ ;mov edx, [esp+20] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + pxor xmm6, xmm6 + pxor xmm7, xmm7 + + SSE2_GetSatd8x8 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSatd8x8 + + psrlw xmm6, 1 + SSE2_SumWHorizon xmm6,xmm4,xmm7 + movd retrd, xmm6 + LOAD_4_PARA_POP + ret + +;*********************************************************************** +; +;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ); +; +;*********************************************************************** +WELS_EXTERN WelsSampleSatd16x8_sse2 +align 16 +WelsSampleSatd16x8_sse2: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov ecx, [esp+16] + ;mov edx, [esp+20] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + push r0 + push r2 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + + SSE2_GetSatd8x8 + + pop r2 + pop r0 + ;mov eax, [esp+8] + ;mov ecx, [esp+16] + add r0, 8 + add r2, 8 + SSE2_GetSatd8x8 + + psrlw xmm6, 1 + SSE2_SumWHorizon xmm6,xmm4,xmm7 + movd retrd, xmm6 + LOAD_4_PARA_POP + ret + +;*********************************************************************** +; +;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ); +; +;*********************************************************************** +WELS_EXTERN WelsSampleSatd16x16_sse2 +align 16 +WelsSampleSatd16x16_sse2: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov ecx, [esp+16] + ;mov edx, [esp+20] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + push r0 + push r2 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + + SSE2_GetSatd8x8 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSatd8x8 + + pop r2 + pop r0 + ;mov eax, [esp+8] + ;mov ecx, [esp+16] + add r0, 8 + add r2, 8 + + SSE2_GetSatd8x8 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSatd8x8 + + ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first. 
+ psrlw xmm6, 1 + SSE2_SumWHorizon xmm6,xmm4,xmm7 + movd retrd, xmm6 + LOAD_4_PARA_POP + ret + +;*********************************************************************** +; +;Pixel_satd_wxh_sse2 END +; +;*********************************************************************** + +;*********************************************************************** +; +;Pixel_satd_intra_sse2 BEGIN +; +;*********************************************************************** + +%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 + pmaddubsw %1, xmm5 + movdqa %2, %1 + pmaddwd %1, xmm7 + pmaddwd %2, xmm6 + movdqa %3, %1 + punpckldq %1, %2 + punpckhdq %2, %3 + movdqa %3, %1 + punpcklqdq %1, %2 + punpckhqdq %3, %2 + paddd xmm4, %1 ;for dc + paddd xmm4, %3 ;for dc + packssdw %1, %3 + psllw %1, 2 +%endmacro +%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2 + pmaddubsw %1, xmm5 + movdqa %2, %1 + pmaddwd %1, xmm7 + pmaddwd %2, xmm6 + movdqa %3, %1 + punpckldq %1, %2 + punpckhdq %2, %3 + movdqa %3, %1 + punpcklqdq %1, %2 + punpckhqdq %3, %2 +; paddd xmm4, %1 ;for dc +; paddd xmm4, %3 ;for dc + movdqa %4, %1 + punpcklqdq %4, %3 + packssdw %1, %3 + psllw %1, 2 +%endmacro + +%macro SSE41_GetX38x4SatdDec 0 + pxor xmm7, xmm7 + movq xmm0, [eax] + movq xmm1, [eax+ebx] + lea eax, [eax+2*ebx] + movq xmm2, [eax] + movq xmm3, [eax+ebx] + lea eax, [eax+2*ebx] + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7 + SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7 + SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2 + ;doesn't need another transpose +%endmacro +%macro SSE41_GetX38x4SatdV 2 + pxor xmm0, xmm0 + pinsrw xmm0, word[esi+%2], 0 + pinsrw xmm0, word[esi+%2+8], 4 + psubsw xmm0, xmm7 + pabsw xmm0, xmm0 + paddw xmm4, xmm0 + pxor xmm0, xmm0 + pinsrw xmm0, word[esi+%2+2], 0 + pinsrw xmm0, word[esi+%2+10], 4 + psubsw xmm0, xmm1 + pabsw xmm0, xmm0 + paddw xmm4, xmm0 + pxor xmm0, xmm0 + pinsrw xmm0, word[esi+%2+4], 0 + pinsrw xmm0, word[esi+%2+12], 4 + psubsw xmm0, xmm3 + pabsw xmm0, xmm0 + paddw xmm4, xmm0 + pxor xmm0, xmm0 + pinsrw xmm0, word[esi+%2+6], 0 + pinsrw xmm0, word[esi+%2+14], 4 + psubsw xmm0, xmm2 + pabsw xmm0, xmm0 + paddw xmm4, xmm0 +%endmacro +%macro SSE41_GetX38x4SatdH 3 + movq xmm0, [esi+%3+8*%1] + punpcklqdq xmm0, xmm0 + psubsw xmm0, xmm7 + pabsw xmm0, xmm0 + paddw xmm5, xmm0 + pabsw xmm1, xmm1 + pabsw xmm2, xmm2 + pabsw xmm3, xmm3 + paddw xmm2, xmm1;for DC + paddw xmm2, xmm3;for DC + paddw xmm5, xmm2 +%endmacro +%macro SSE41_I16X16GetX38x4SatdDC 0 + pxor xmm0, xmm0 + movq2dq xmm0, mm4 + punpcklqdq xmm0, xmm0 + psubsw xmm0, xmm7 + pabsw xmm0, xmm0 + paddw xmm6, xmm0 + paddw xmm6, xmm2 +%endmacro +%macro SSE41_ChromaGetX38x4SatdDC 1 + shl %1, 4 + movdqa xmm0, [esi+32+%1] + psubsw xmm0, xmm7 + pabsw xmm0, xmm0 + paddw xmm6, xmm0 + paddw xmm6, xmm2 +%endmacro +%macro SSE41_I16x16GetX38x4Satd 2 + SSE41_GetX38x4SatdDec + SSE41_GetX38x4SatdV %1, %2 + SSE41_GetX38x4SatdH %1, %2, 32 + SSE41_I16X16GetX38x4SatdDC +%endmacro +%macro SSE41_ChromaGetX38x4Satd 2 + SSE41_GetX38x4SatdDec + SSE41_GetX38x4SatdV %1, %2 + SSE41_GetX38x4SatdH %1, %2, 16 + SSE41_ChromaGetX38x4SatdDC %1 +%endmacro +%macro SSE41_HSum8W 3 + pmaddwd %1, %2 + movhlps %3, %1 + paddd %1, %3 + pshuflw %3, %1,0Eh + paddd %1, %3 +%endmacro + + +%ifdef X86_32 +WELS_EXTERN WelsIntra16x16Combined3Satd_sse41 +WelsIntra16x16Combined3Satd_sse41: + push ebx + push esi + 
push edi + mov ecx, [esp+16] + mov edx, [esp+20] + mov eax, [esp+24] + mov ebx, [esp+28] + mov esi, [esp+40] ;temp_satd + pxor xmm4, xmm4 + movdqa xmm5, [HSumSubDB1] + movdqa xmm6, [HSumSubDW1] + movdqa xmm7, [PDW1] + sub ecx, edx + movdqu xmm0, [ecx] + movhlps xmm1, xmm0 + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 + SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 + movdqa [esi], xmm0 ;V + movdqa [esi+16], xmm1 + add ecx, edx + pinsrb xmm0, byte[ecx-1], 0 + pinsrb xmm0, byte[ecx+edx-1], 1 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 2 + pinsrb xmm0, byte[ecx+edx-1], 3 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 4 + pinsrb xmm0, byte[ecx+edx-1], 5 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 6 + pinsrb xmm0, byte[ecx+edx-1], 7 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 8 + pinsrb xmm0, byte[ecx+edx-1], 9 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 10 + pinsrb xmm0, byte[ecx+edx-1], 11 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 12 + pinsrb xmm0, byte[ecx+edx-1], 13 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 14 + pinsrb xmm0, byte[ecx+edx-1], 15 + movhlps xmm1, xmm0 + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 + SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 + movdqa [esi+32], xmm0 ;H + movdqa [esi+48], xmm1 + movd ecx, xmm4 ;dc + add ecx, 16 ;(sum+16) + shr ecx, 5 ;((sum+16)>>5) + shl ecx, 4 ; + movd mm4, ecx ; mm4 copy DC + pxor xmm4, xmm4 ;V + pxor xmm5, xmm5 ;H + pxor xmm6, xmm6 ;DC + mov ecx, 0 + mov edi, 0 +.loop16x16_get_satd: +.loopStart1: + SSE41_I16x16GetX38x4Satd ecx, edi + inc ecx + cmp ecx, 4 + jl .loopStart1 + cmp edi, 16 + je .loop16x16_get_satd_end + mov eax, [esp+24] + add eax, 8 + mov ecx, 0 + add edi, 16 + jmp .loop16x16_get_satd + .loop16x16_get_satd_end: + MMX_DW_1_2REG xmm0, xmm1 + psrlw xmm4, 1 ;/2 + psrlw xmm5, 1 ;/2 + psrlw xmm6, 1 ;/2 + SSE41_HSum8W xmm4, xmm0, xmm1 + SSE41_HSum8W xmm5, xmm0, xmm1 + SSE41_HSum8W xmm6, xmm0, xmm1 + + ; comparing order: DC H V + movd ebx, xmm6 ;DC + movd edi, xmm5 ;H + movd ecx, xmm4 ;V + mov edx, [esp+36] + shl edx, 1 + add edi, edx + add ebx, edx + mov edx, [esp+32] + cmp ebx, edi + jge near not_dc_16x16 + cmp ebx, ecx + jge near not_dc_h_16x16 + + ; for DC mode + mov dword[edx], 2;I16_PRED_DC + mov eax, ebx + jmp near return_satd_intra_16x16_x3 +not_dc_16x16: + ; for H mode + cmp edi, ecx + jge near not_dc_h_16x16 + mov dword[edx], 1;I16_PRED_H + mov eax, edi + jmp near return_satd_intra_16x16_x3 +not_dc_h_16x16: + ; for V mode + mov dword[edx], 0;I16_PRED_V + mov eax, ecx +return_satd_intra_16x16_x3: + WELSEMMS + pop edi + pop esi + pop ebx +ret + +%macro SSE41_ChromaGetX38x8Satd 0 + movdqa xmm5, [HSumSubDB1] + movdqa xmm6, [HSumSubDW1] + movdqa xmm7, [PDW1] + sub ecx, edx + movq xmm0, [ecx] + punpcklqdq xmm0, xmm0 + SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4 + movdqa [esi], xmm0 ;V + add ecx, edx + pinsrb xmm0, byte[ecx-1], 0 + pinsrb xmm0, byte[ecx+edx-1], 1 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 2 + pinsrb xmm0, byte[ecx+edx-1], 3 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 4 + pinsrb xmm0, byte[ecx+edx-1], 5 + lea ecx, [ecx+2*edx] + pinsrb xmm0, byte[ecx-1], 6 + pinsrb xmm0, byte[ecx+edx-1], 7 + punpcklqdq xmm0, xmm0 + SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1 + movdqa [esi+16], xmm0 ;H +;(sum+2)>>2 + movdqa xmm6, [PDQ2] + movdqa xmm5, xmm4 + punpckhqdq xmm5, xmm1 + paddd xmm5, xmm6 + psrld xmm5, 2 +;(sum1+sum2+4)>>3 + paddd xmm6, xmm6 + paddd xmm4, xmm1 + paddd xmm4, xmm6 
+ psrld xmm4, 3 +;satd *16 + pslld xmm5, 4 + pslld xmm4, 4 +;temp satd + movdqa xmm6, xmm4 + punpcklqdq xmm4, xmm5 + psllq xmm4, 32 + psrlq xmm4, 32 + movdqa [esi+32], xmm4 + punpckhqdq xmm5, xmm6 + psllq xmm5, 32 + psrlq xmm5, 32 + movdqa [esi+48], xmm5 + + pxor xmm4, xmm4 ;V + pxor xmm5, xmm5 ;H + pxor xmm6, xmm6 ;DC + mov ecx, 0 +loop_chroma_satdx3_cb_cr: + SSE41_ChromaGetX38x4Satd ecx, 0 + inc ecx + cmp ecx, 2 + jl loop_chroma_satdx3_cb_cr +%endmacro + +%macro SSEReg2MMX 3 + movdq2q %2, %1 + movhlps %1, %1 + movdq2q %3, %1 +%endmacro +%macro MMXReg2SSE 4 + movq2dq %1, %3 + movq2dq %2, %4 + punpcklqdq %1, %2 +%endmacro +;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41 + +WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41 +WelsIntraChroma8x8Combined3Satd_sse41: + push ebx + push esi + push edi + mov ecx, [esp+16] + mov edx, [esp+20] + mov eax, [esp+24] + mov ebx, [esp+28] + mov esi, [esp+40] ;temp_satd + xor edi, edi +loop_chroma_satdx3: + SSE41_ChromaGetX38x8Satd + cmp edi, 1 + je loop_chroma_satdx3end + inc edi + SSEReg2MMX xmm4, mm0,mm1 + SSEReg2MMX xmm5, mm2,mm3 + SSEReg2MMX xmm6, mm5,mm6 + mov ecx, [esp+44] + mov eax, [esp+48] + jmp loop_chroma_satdx3 +loop_chroma_satdx3end: + MMXReg2SSE xmm0, xmm3, mm0, mm1 + MMXReg2SSE xmm1, xmm3, mm2, mm3 + MMXReg2SSE xmm2, xmm3, mm5, mm6 + + paddw xmm4, xmm0 + paddw xmm5, xmm1 + paddw xmm6, xmm2 + + MMX_DW_1_2REG xmm0, xmm1 + psrlw xmm4, 1 ;/2 + psrlw xmm5, 1 ;/2 + psrlw xmm6, 1 ;/2 + SSE41_HSum8W xmm4, xmm0, xmm1 + SSE41_HSum8W xmm5, xmm0, xmm1 + SSE41_HSum8W xmm6, xmm0, xmm1 + ; comparing order: DC H V + movd ebx, xmm6 ;DC + movd edi, xmm5 ;H + movd ecx, xmm4 ;V + mov edx, [esp+36] + shl edx, 1 + add edi, edx + add ecx, edx + mov edx, [esp+32] + cmp ebx, edi + jge near not_dc_8x8 + cmp ebx, ecx + jge near not_dc_h_8x8 + + ; for DC mode + mov dword[edx], 0;I8_PRED_DC + mov eax, ebx + jmp near return_satd_intra_8x8_x3 +not_dc_8x8: + ; for H mode + cmp edi, ecx + jge near not_dc_h_8x8 + mov dword[edx], 1;I8_PRED_H + mov eax, edi + jmp near return_satd_intra_8x8_x3 +not_dc_h_8x8: + ; for V mode + mov dword[edx], 2;I8_PRED_V + mov eax, ecx +return_satd_intra_8x8_x3: + WELSEMMS + pop edi + pop esi + pop ebx +ret + + +;*********************************************************************** +; +;Pixel_satd_intra_sse2 END +; +;*********************************************************************** +%macro SSSE3_Get16BSadHVDC 2 + movd xmm6,%1 + pshufb xmm6,xmm1 + movdqa %1, xmm6 + movdqa xmm0,%2 + psadbw xmm0,xmm7 + paddw xmm4,xmm0 + movdqa xmm0,%2 + psadbw xmm0,xmm5 + paddw xmm2,xmm0 + psadbw xmm6,%2 + paddw xmm3,xmm6 +%endmacro +%macro WelsAddDCValue 4 + movzx %2, byte %1 + mov %3, %2 + add %4, %2 +%endmacro + +;*********************************************************************** +; +;Pixel_sad_intra_ssse3 BEGIN +; +;*********************************************************************** +WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3 +WelsIntra16x16Combined3Sad_ssse3: + push ebx + push esi + push edi + mov ecx, [esp+16] + mov edx, [esp+20] + mov edi, [esp+40] ;temp_sad + sub ecx, edx + movdqa xmm5,[ecx] + pxor xmm0,xmm0 + psadbw xmm0,xmm5 + movhlps xmm1,xmm0 + paddw xmm0,xmm1 + movd eax,xmm0 + + add ecx,edx + lea ebx, [edx+2*edx] + WelsAddDCValue [ecx-1 ], esi, [edi ], eax + WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax + WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax + WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax + lea ecx, [ecx+4*edx] + add edi, 64 + WelsAddDCValue [ecx-1 ], esi, [edi ], eax + WelsAddDCValue [ecx-1+edx ], 
esi, [edi+16], eax + WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax + WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax + lea ecx, [ecx+4*edx] + add edi, 64 + WelsAddDCValue [ecx-1 ], esi, [edi ], eax + WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax + WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax + WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax + lea ecx, [ecx+4*edx] + add edi, 64 + WelsAddDCValue [ecx-1 ], esi, [edi ], eax + WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax + WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax + WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax + sub edi, 192 + add eax,10h + shr eax,5 + movd xmm7,eax + pxor xmm1,xmm1 + pshufb xmm7,xmm1 + pxor xmm4,xmm4 + pxor xmm3,xmm3 + pxor xmm2,xmm2 +;sad begin + mov eax, [esp+24] + mov ebx, [esp+28] + lea esi, [ebx+2*ebx] + SSSE3_Get16BSadHVDC [edi], [eax] + SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] + SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] + SSSE3_Get16BSadHVDC [edi+48], [eax+esi] + add edi, 64 + lea eax, [eax+4*ebx] + SSSE3_Get16BSadHVDC [edi], [eax] + SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] + SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] + SSSE3_Get16BSadHVDC [edi+48], [eax+esi] + add edi, 64 + lea eax, [eax+4*ebx] + SSSE3_Get16BSadHVDC [edi], [eax] + SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] + SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] + SSSE3_Get16BSadHVDC [edi+48], [eax+esi] + add edi, 64 + lea eax, [eax+4*ebx] + SSSE3_Get16BSadHVDC [edi], [eax] + SSSE3_Get16BSadHVDC [edi+16], [eax+ebx] + SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx] + SSSE3_Get16BSadHVDC [edi+48], [eax+esi] + + pslldq xmm3,4 + por xmm3,xmm2 + movhlps xmm1,xmm3 + paddw xmm3,xmm1 + movhlps xmm0,xmm4 + paddw xmm4,xmm0 +; comparing order: DC H V + movd ebx, xmm4 ;DC + movd ecx, xmm3 ;V + psrldq xmm3, 4 + movd esi, xmm3 ;H + mov eax, [esp+36] ;lamda + shl eax, 1 + add esi, eax + add ebx, eax + mov edx, [esp+32] + cmp ebx, esi + jge near not_dc_16x16_sad + cmp ebx, ecx + jge near not_dc_h_16x16_sad + ; for DC mode + mov dword[edx], 2;I16_PRED_DC + mov eax, ebx + sub edi, 192 +%assign x 0 +%rep 16 + movdqa [edi+16*x], xmm7 +%assign x x+1 +%endrep + jmp near return_sad_intra_16x16_x3 +not_dc_16x16_sad: + ; for H mode + cmp esi, ecx + jge near not_dc_h_16x16_sad + mov dword[edx], 1;I16_PRED_H + mov eax, esi + jmp near return_sad_intra_16x16_x3 +not_dc_h_16x16_sad: + ; for V mode + mov dword[edx], 0;I16_PRED_V + mov eax, ecx + sub edi, 192 +%assign x 0 +%rep 16 + movdqa [edi+16*x], xmm5 +%assign x x+1 +%endrep +return_sad_intra_16x16_x3: + pop edi + pop esi + pop ebx + ret +%endif +;*********************************************************************** +; +;Pixel_sad_intra_ssse3 END +; +;*********************************************************************** +;*********************************************************************** +; +;Pixel_satd_wxh_sse41 BEGIN +; +;*********************************************************************** + +;SSE4.1 +%macro SSE41_GetSatd8x4 0 + movq xmm0, [r0] + punpcklqdq xmm0, xmm0 + pmaddubsw xmm0, xmm7 + movq xmm1, [r0+r1] + punpcklqdq xmm1, xmm1 + pmaddubsw xmm1, xmm7 + movq xmm2, [r2] + punpcklqdq xmm2, xmm2 + pmaddubsw xmm2, xmm7 + movq xmm3, [r2+r3] + punpcklqdq xmm3, xmm3 + pmaddubsw xmm3, xmm7 + psubsw xmm0, xmm2 + psubsw xmm1, xmm3 + movq xmm2, [r0+2*r1] + punpcklqdq xmm2, xmm2 + pmaddubsw xmm2, xmm7 + movq xmm3, [r0+r4] + punpcklqdq xmm3, xmm3 + pmaddubsw xmm3, xmm7 + movq xmm4, [r2+2*r3] + punpcklqdq xmm4, xmm4 + pmaddubsw xmm4, xmm7 + movq xmm5, [r2+r5] + punpcklqdq xmm5, xmm5 + pmaddubsw xmm5, xmm7 + psubsw xmm2, xmm4 
+ psubsw xmm3, xmm5 + SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4 + pabsw xmm0, xmm0 + pabsw xmm2, xmm2 + pabsw xmm1, xmm1 + pabsw xmm3, xmm3 + movdqa xmm4, xmm3 + pblendw xmm3, xmm1, 0xAA + pslld xmm1, 16 + psrld xmm4, 16 + por xmm1, xmm4 + pmaxuw xmm1, xmm3 + paddw xmm6, xmm1 + movdqa xmm4, xmm0 + pblendw xmm0, xmm2, 0xAA + pslld xmm2, 16 + psrld xmm4, 16 + por xmm2, xmm4 + pmaxuw xmm0, xmm2 + paddw xmm6, xmm0 +%endmacro + +%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE + MMX_DW_1_2REG %3, %4 + pmaddwd %2, %3 + movhlps %4, %2 + paddd %2, %4 + pshuflw %4, %2,0Eh + paddd %2, %4 + movd %1, %2 +%endmacro +;*********************************************************************** +; +;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t ); +; +;*********************************************************************** +WELS_EXTERN WelsSampleSatd4x4_sse41 +WelsSampleSatd4x4_sse41: + ;push ebx + ;mov eax,[esp+8] + ;mov ebx,[esp+12] + ;mov ecx,[esp+16] + ;mov edx,[esp+20] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + movdqa xmm4,[HSwapSumSubDB1] + movd xmm2,[r2] + movd xmm5,[r2+r3] + shufps xmm2,xmm5,0 + movd xmm3,[r2+r3*2] + lea r2, [r3*2+r2] + movd xmm5,[r2+r3] + shufps xmm3,xmm5,0 + movd xmm0,[r0] + movd xmm5,[r0+r1] + shufps xmm0,xmm5,0 + movd xmm1,[r0+r1*2] + lea r0, [r1*2+r0] + movd xmm5,[r0+r1] + shufps xmm1,xmm5,0 + pmaddubsw xmm0,xmm4 + pmaddubsw xmm1,xmm4 + pmaddubsw xmm2,xmm4 + pmaddubsw xmm3,xmm4 + psubw xmm0,xmm2 + psubw xmm1,xmm3 + movdqa xmm2,xmm0 + paddw xmm0,xmm1 + psubw xmm1,xmm2 + movdqa xmm2,xmm0 + punpcklqdq xmm0,xmm1 + punpckhqdq xmm2,xmm1 + movdqa xmm1,xmm0 + paddw xmm0,xmm2 + psubw xmm2,xmm1 + movdqa xmm1,xmm0 + pblendw xmm0,xmm2,0AAh + pslld xmm2,16 + psrld xmm1,16 + por xmm2,xmm1 + pabsw xmm0,xmm0 + pabsw xmm2,xmm2 + pmaxsw xmm0,xmm2 + SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7 + LOAD_4_PARA_POP + ret + +;*********************************************************************** +; +;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); +; +;*********************************************************************** +WELS_EXTERN WelsSampleSatd8x8_sse41 +align 16 +WelsSampleSatd8x8_sse41: + ;push ebx + ;push esi + ;push edi + ;mov eax, [esp+16] + ;mov ebx, [esp+20] + ;mov ecx, [esp+24] + ;mov edx, [esp+28] +%ifdef X86_32 + push r4 + push r5 +%endif + %assign push_num 2 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + movdqa xmm7, [HSumSubDB1] + lea r4, [r1+r1*2] + lea r5, [r3+r3*2] + pxor xmm6, xmm6 + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE41_GetSatd8x4 + SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 + LOAD_4_PARA_POP +%ifdef X86_32 + pop r5 + pop r4 +%endif + ret + +;*********************************************************************** +; +;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); +; +;*********************************************************************** +WELS_EXTERN WelsSampleSatd8x16_sse41 +align 16 +WelsSampleSatd8x16_sse41: + ;push ebx + ;push esi + ;push edi + ;push ebp + ;%define pushsize 16 + ;mov eax, [esp+pushsize+4] + ;mov ebx, [esp+pushsize+8] + ;mov ecx, [esp+pushsize+12] + ;mov edx, [esp+pushsize+16] +%ifdef X86_32 + push r4 + push r5 + push r6 +%endif + %assign push_num 3 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + movdqa xmm7, [HSumSubDB1] + lea r4, [r1+r1*2] + lea r5, [r3+r3*2] + pxor xmm6, xmm6 + mov r6, 0 +loop_get_satd_8x16: + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, 
[r2+4*r3] + inc r6 + cmp r6, 4 + jl loop_get_satd_8x16 + SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 + LOAD_4_PARA_POP +%ifdef X86_32 + pop r6 + pop r5 + pop r4 +%endif + ret + +;*********************************************************************** +; +;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); +; +;*********************************************************************** +WELS_EXTERN WelsSampleSatd16x8_sse41 +align 16 +WelsSampleSatd16x8_sse41: + ;push ebx + ;push esi + ;push edi + ;mov eax, [esp+16] + ;mov ebx, [esp+20] + ;mov ecx, [esp+24] + ;mov edx, [esp+28] +%ifdef X86_32 + push r4 + push r5 +%endif + %assign push_num 2 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + push r0 + push r2 + + movdqa xmm7, [HSumSubDB1] + lea r4, [r1+r1*2] + lea r5, [r3+r3*2] + pxor xmm6, xmm6 + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE41_GetSatd8x4 + + pop r2 + pop r0 + ;mov eax, [esp+16] + ;mov ecx, [esp+24] + add r0, 8 + add r2, 8 + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE41_GetSatd8x4 + SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 + LOAD_4_PARA_POP +%ifdef X86_32 + pop r5 + pop r4 +%endif + ret + +;*********************************************************************** +; +;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); +; +;*********************************************************************** + +WELS_EXTERN WelsSampleSatd16x16_sse41 +align 16 +WelsSampleSatd16x16_sse41: + ;push ebx + ;push esi + ;push edi + ;push ebp + ;%define pushsize 16 + ;mov eax, [esp+pushsize+4] + ;mov ebx, [esp+pushsize+8] + ;mov ecx, [esp+pushsize+12] + ;mov edx, [esp+pushsize+16] +%ifdef X86_32 + push r4 + push r5 + push r6 +%endif + %assign push_num 3 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + + push r0 + push r2 + + movdqa xmm7, [HSumSubDB1] + lea r4, [r1+r1*2] + lea r5, [r3+r3*2] + pxor xmm6, xmm6 + mov r6, 0 +loop_get_satd_16x16_left: + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + inc r6 + cmp r6, 4 + jl loop_get_satd_16x16_left + + pop r2 + pop r0 + ;mov eax, [esp+pushsize+4] + ;mov ecx, [esp+pushsize+12] + add r0, 8 + add r2, 8 + mov r6, 0 +loop_get_satd_16x16_right: + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + inc r6 + cmp r6, 4 + jl loop_get_satd_16x16_right + SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 + ;%undef pushsize + LOAD_4_PARA_POP +%ifdef X86_32 + pop r6 + pop r5 + pop r4 +%endif + ret + +;*********************************************************************** +; +;Pixel_satd_wxh_sse41 END +; +;*********************************************************************** + +;*********************************************************************** +; +;Pixel_sad_wxh_sse2 BEGIN +; +;*********************************************************************** + +%macro SSE2_GetSad2x16 0 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqu xmm1, [r2] + MOVDQ xmm2, [r0];[eax] must aligned 16 + psadbw xmm1, xmm2 + paddw xmm0, xmm1 + movdqu xmm1, [r2+r3] + MOVDQ xmm2, [r0+r1] + psadbw xmm1, xmm2 + paddw xmm0, xmm1 +%endmacro + + +%macro SSE2_GetSad4x16 0 + movdqu xmm0, [r2] + MOVDQ xmm2, [r0] + psadbw xmm0, xmm2 + paddw xmm7, xmm0 + movdqu xmm1, [r2+r3] + MOVDQ xmm2, [r0+r1] + psadbw xmm1, xmm2 + paddw xmm7, xmm1 + movdqu xmm1, [r2+2*r3] + MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16 + psadbw xmm1, xmm2 + paddw xmm7, xmm1 + movdqu xmm1, [r2+r5] + MOVDQ xmm2, [r0+r4] + psadbw xmm1, xmm2 + paddw xmm7, xmm1 +%endmacro + + +%macro SSE2_GetSad8x4 0 + movq xmm0, [r0] 
+ movq xmm1, [r0+r1] + lea r0, [r0+2*r1] + movhps xmm0, [r0] + movhps xmm1, [r0+r1] + + movq xmm2, [r2] + movq xmm3, [r2+r3] + lea r2, [r2+2*r3] + movhps xmm2, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm2 + psadbw xmm1, xmm3 + paddw xmm6, xmm0 + paddw xmm6, xmm1 +%endmacro + +;*********************************************************************** +; +;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ) +;First parameter can align to 16 bytes, +;In wels, the third parameter can't align to 16 bytes. +; +;*********************************************************************** +WELS_EXTERN WelsSampleSad16x16_sse2 +align 16 +WelsSampleSad16x16_sse2: + ;push ebx + ;push edi + ;push esi + ;%define _STACK_SIZE 12 + ;mov eax, [esp+_STACK_SIZE+4 ] + ;mov ebx, [esp+_STACK_SIZE+8 ] + ;mov ecx, [esp+_STACK_SIZE+12] + ;mov edx, [esp+_STACK_SIZE+16] +%ifdef X86_32 + push r4 + push r5 +%endif + + %assign push_num 2 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + lea r4, [3*r1] + lea r5, [3*r3] + + pxor xmm7, xmm7 + SSE2_GetSad4x16 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE2_GetSad4x16 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE2_GetSad4x16 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE2_GetSad4x16 + movhlps xmm0, xmm7 + paddw xmm0, xmm7 + movd retrd, xmm0 + LOAD_4_PARA_POP +%ifdef X86_32 + pop r5 + pop r4 +%endif + ret + +;*********************************************************************** +; +;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, ) +;First parameter can align to 16 bytes, +;In wels, the third parameter can't align to 16 bytes. +; +;*********************************************************************** +WELS_EXTERN WelsSampleSad16x8_sse2 +align 16 +WelsSampleSad16x8_sse2: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov ecx, [esp+16] + ;mov edx, [esp+20] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + movdqu xmm0, [r2] + MOVDQ xmm2, [r0] + psadbw xmm0, xmm2 + movdqu xmm1, [r2+r3] + MOVDQ xmm2, [r0+r1] + psadbw xmm1, xmm2 + paddw xmm0, xmm1 + + SSE2_GetSad2x16 + SSE2_GetSad2x16 + SSE2_GetSad2x16 + + movhlps xmm1, xmm0 + paddw xmm0, xmm1 + movd retrd, xmm0 + LOAD_4_PARA_POP + ret + + + +WELS_EXTERN WelsSampleSad8x16_sse2 +WelsSampleSad8x16_sse2: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov ecx, [esp+16] + ;mov edx, [esp+20] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + pxor xmm6, xmm6 + + SSE2_GetSad8x4 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSad8x4 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSad8x4 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSad8x4 + + movhlps xmm0, xmm6 + paddw xmm0, xmm6 + movd retrd, xmm0 + LOAD_4_PARA_POP + ret + + +%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline +and %1, 0x1f|(%3>>1) +cmp %1, (32-%2)|(%3>>1) +%endmacro + +WELS_EXTERN WelsSampleSad8x8_sse21 +WelsSampleSad8x8_sse21: + ;mov ecx, [esp+12] + ;mov edx, ecx + ;CACHE_SPLIT_CHECK edx, 8, 64 + ;jle near .pixel_sad_8x8_nsplit + ;push ebx + ;push edi + ;mov eax, [esp+12] + ;mov ebx, [esp+16] + + %assign push_num 0 + mov r2, arg3 + push r2 + CACHE_SPLIT_CHECK r2, 8, 64 + jle near .pixel_sad_8x8_nsplit + pop r2 +%ifdef X86_32 + push r3 + push r4 + push r5 +%endif + %assign push_num 3 + mov r0, arg1 + mov r1, arg2 + SIGN_EXTENTION r1, r1d + pxor xmm7, xmm7 + + ;ecx r2, edx r4, edi r5 + + mov r5, r2 + and r5, 0x07 + sub r2, r5 + mov r4, 8 + sub r4, r5 + + shl r5, 3 + shl r4, 3 + movd xmm5, r5d + movd xmm6, r4d 
+ mov r5, 8 + add r5, r2 + mov r3, arg4 + SIGN_EXTENTION r3, r3d + movq xmm0, [r0] + movhps xmm0, [r0+r1] + + movq xmm1, [r2] + movq xmm2, [r5] + movhps xmm1, [r2+r3] + movhps xmm2, [r5+r3] + psrlq xmm1, xmm5 + psllq xmm2, xmm6 + por xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm7, xmm0 + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r5, [r5+2*r3] + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + + movq xmm1, [r2] + movq xmm2, [r5] + movhps xmm1, [r2+r3] + movhps xmm2, [r5+r3] + psrlq xmm1, xmm5 + psllq xmm2, xmm6 + por xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm7, xmm0 + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r5, [r5+2*r3] + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + + movq xmm1, [r2] + movq xmm2, [r5] + movhps xmm1, [r2+r3] + movhps xmm2, [r5+r3] + psrlq xmm1, xmm5 + psllq xmm2, xmm6 + por xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm7, xmm0 + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r5, [r5+2*r3] + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + + movq xmm1, [r2] + movq xmm2, [r5] + movhps xmm1, [r2+r3] + movhps xmm2, [r5+r3] + psrlq xmm1, xmm5 + psllq xmm2, xmm6 + por xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm7, xmm0 + + movhlps xmm0, xmm7 + paddw xmm0, xmm7 + movd retrd, xmm0 +%ifdef X86_32 + pop r5 + pop r4 + pop r3 +%endif + jmp .return + +.pixel_sad_8x8_nsplit: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov edx, [esp+20] + + pop r2 + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + pxor xmm6, xmm6 + SSE2_GetSad8x4 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSad8x4 + movhlps xmm0, xmm6 + paddw xmm0, xmm6 + movd retrd, xmm0 + LOAD_4_PARA_POP +.return: + ret + + +;*********************************************************************** +; +;Pixel_sad_wxh_sse2 END +; +;*********************************************************************** + + +;*********************************************************************** +; +;Pixel_sad_4_wxh_sse2 BEGIN +; +;*********************************************************************** + + +%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address + psadbw %1, %4 + paddw xmm5, %1 + psadbw %4, %3 + paddw xmm4, %4 + movdqu %4, [%5-1] + psadbw %4, %2 + paddw xmm6, %4 + movdqu %4, [%5+1] + psadbw %4, %2 + paddw xmm7, %4 +%endmacro +WELS_EXTERN WelsSampleSadFour16x16_sse2 +WelsSampleSadFour16x16_sse2: + ;push ebx + ;mov eax, [esp+8] + ;mov ebx, [esp+12] + ;mov ecx, [esp+16] + ;mov edx, [esp+20] + + %assign push_num 0 + LOAD_5_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref + pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref + pxor xmm6, xmm6 ;sad pRefMb-1 + pxor xmm7, xmm7 ;sad pRefMb+1 + movdqa xmm0, [r0] + sub r2, r3 + movdqu xmm3, [r2] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + psadbw xmm3, xmm1 + paddw xmm4, xmm3 + + movdqu xmm2, [r2+r3-1] + psadbw xmm2, xmm0 + paddw xmm6, xmm2 + + movdqu xmm3, [r2+r3+1] + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm2, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 + movdqa xmm0, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm1, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 + movdqa xmm2, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm0, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 + movdqa 
xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm2, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 + movdqa xmm0, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm1, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 + movdqa xmm2, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm0, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm2, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 + movdqa xmm0, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 + lea r2, [r2+2*r3] + movdqu xmm3, [r2] + psadbw xmm2, xmm3 + paddw xmm5, xmm2 + + movdqu xmm2, [r2-1] + psadbw xmm2, xmm0 + paddw xmm6, xmm2 + + movdqu xmm3, [r2+1] + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movdqu xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + ;mov ecx, [esp+24] + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + movhlps xmm0, xmm5 + paddw xmm5, xmm0 + movhlps xmm0, xmm6 + paddw xmm6, xmm0 + movhlps xmm0, xmm7 + paddw xmm7, xmm0 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 + movdqa [r4],xmm4 + LOAD_5_PARA_POP + ret + + +WELS_EXTERN WelsSampleSadFour16x8_sse2 +WelsSampleSadFour16x8_sse2: + ;push ebx + ;push edi + ;mov eax, [esp+12] + ;mov ebx, [esp+16] + ;mov edi, [esp+20] + ;mov edx, [esp+24] + + %assign push_num 0 + LOAD_5_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref + pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref + pxor xmm6, xmm6 ;sad pRefMb-1 + pxor xmm7, xmm7 ;sad pRefMb+1 + movdqa xmm0, [r0] + sub r2, r3 + movdqu xmm3, [r2] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + psadbw xmm3, xmm1 + paddw xmm4, xmm3 + + movdqu xmm2, [r2+r3-1] + psadbw xmm2, xmm0 + paddw xmm6, xmm2 + + movdqu xmm3, [r2+r3+1] + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm2, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 + movdqa xmm0, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm1, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 + movdqa xmm2, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm0, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 + lea r2, [r2+2*r3] + movdqu xmm3, [r2] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movdqu xmm0, [r2-1] + psadbw xmm0, xmm1 + paddw xmm6, xmm0 + + movdqu xmm3, [r2+1] + psadbw xmm3, xmm1 + paddw xmm7, xmm3 + + movdqu xmm3, [r2+r3] + psadbw xmm1, xmm3 + paddw xmm5, xmm1 + + ;mov edi, [esp+28] + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + movhlps xmm0, xmm5 + paddw xmm5, xmm0 + movhlps xmm0, xmm6 + paddw xmm6, xmm0 + movhlps xmm0, xmm7 + paddw xmm7, xmm0 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 + movdqa [r4],xmm4 + LOAD_5_PARA_POP + ret + +WELS_EXTERN WelsSampleSadFour8x16_sse2 
+WelsSampleSadFour8x16_sse2: + ;push ebx + ;push edi + ;mov eax, [esp+12] + ;mov ebx, [esp+16] + ;mov edi, [esp+20] + ;mov edx, [esp+24] + + %assign push_num 0 + LOAD_5_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref + pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref + pxor xmm6, xmm6 ;sad pRefMb-1 + pxor xmm7, xmm7 ;sad pRefMb+1 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + sub r2, r3 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + ;mov edi, [esp+28] + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + movhlps xmm0, xmm5 + paddw xmm5, xmm0 + 
movhlps xmm0, xmm6 + paddw xmm6, xmm0 + movhlps xmm0, xmm7 + paddw xmm7, xmm0 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 + movdqa [r4],xmm4 + LOAD_5_PARA_POP + ret + + +WELS_EXTERN WelsSampleSadFour8x8_sse2 +WelsSampleSadFour8x8_sse2: + ;push ebx + ;push edi + ;mov eax, [esp+12] + ;mov ebx, [esp+16] + ;mov edi, [esp+20] + ;mov edx, [esp+24] + + %assign push_num 0 + LOAD_5_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref + pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref + pxor xmm6, xmm6 ;sad pRefMb-1 + pxor xmm7, xmm7 ;sad pRefMb+1 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + sub r2, r3 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 + + + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 + + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 + + ;mov edi, [esp+28] + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + movhlps xmm0, xmm5 + paddw xmm5, xmm0 + movhlps xmm0, xmm6 + paddw xmm6, xmm0 + movhlps xmm0, xmm7 + paddw xmm7, xmm0 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 + movdqa [r4],xmm4 + LOAD_5_PARA_POP + ret + +WELS_EXTERN WelsSampleSadFour4x4_sse2 +WelsSampleSadFour4x4_sse2: + ;push ebx + ;push edi + ;mov eax, [esp+12] + ;mov ebx, [esp+16] + ;mov edi, [esp+20] + ;mov edx, [esp+24] + + %assign push_num 0 + LOAD_5_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + movd xmm0, [r0] + movd xmm1, [r0+r1] + lea r0, [r0+2*r1] + movd xmm2, [r0] + movd xmm3, [r0+r1] + punpckldq xmm0, xmm1 + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + sub r2, r3 + movd xmm1, [r2] + movd xmm2, [r2+r3] + punpckldq xmm1, xmm2 + movd xmm2, [r2+r3-1] + movd xmm3, [r2+r3+1] + + lea r2, [r2+2*r3] + + movd xmm4, [r2] + movd xmm5, [r2-1] + punpckldq xmm2, xmm5 + movd xmm5, [r2+1] + punpckldq xmm3, xmm5 + + movd xmm5, [r2+r3] + punpckldq xmm4, xmm5 + + punpcklqdq xmm1, xmm4 ;-L + + movd xmm5, [r2+r3-1] + movd xmm6, [r2+r3+1] + + lea r2, [r2+2*r3] + movd xmm7, [r2-1] + punpckldq xmm5, xmm7 + punpcklqdq xmm2, xmm5 ;-1 + movd xmm7, [r2+1] + punpckldq xmm6, xmm7 + punpcklqdq xmm3, xmm6 ;+1 + movd xmm6, [r2] + movd xmm7, [r2+r3] + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 ;+L + psadbw xmm1, xmm0 + psadbw 
xmm2, xmm0 + psadbw xmm3, xmm0 + psadbw xmm4, xmm0 + + movhlps xmm0, xmm1 + paddw xmm1, xmm0 + movhlps xmm0, xmm2 + paddw xmm2, xmm0 + movhlps xmm0, xmm3 + paddw xmm3, xmm0 + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + ;mov edi, [esp+28] + punpckldq xmm1, xmm4 + punpckldq xmm2, xmm3 + punpcklqdq xmm1, xmm2 + movdqa [r4],xmm1 + LOAD_5_PARA_POP + ret + +;*********************************************************************** +; +;Pixel_sad_4_wxh_sse2 END +; +;*********************************************************************** + +WELS_EXTERN WelsSampleSad4x4_mmx + +align 16 +;*********************************************************************** +; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t ) +;*********************************************************************** +WelsSampleSad4x4_mmx: + ;push ebx + ;%define pushsize 4 + ;%define pix1address esp+pushsize+4 + ;%define pix1stride esp+pushsize+8 + ;%define pix2address esp+pushsize+12 + ;%define pix2stride esp+pushsize+16 + ;mov eax, [pix1address] + ;mov ebx, [pix1stride ] + ;mov ecx, [pix2address] + ;mov edx, [pix2stride ] + + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENTION r1, r1d + SIGN_EXTENTION r3, r3d + movd mm0, [r0] + movd mm1, [r0+r1] + punpckldq mm0, mm1 + + movd mm3, [r2] + movd mm4, [r2+r3] + punpckldq mm3, mm4 + psadbw mm0, mm3 + + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + + movd mm1, [r0] + movd mm2, [r0+r1] + punpckldq mm1, mm2 + + movd mm3, [r2] + movd mm4, [r2+r3] + punpckldq mm3, mm4 + psadbw mm1, mm3 + paddw mm0, mm1 + + movd retrd, mm0 + + WELSEMMS + LOAD_4_PARA_POP + ret
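
For reference only (not part of this patch), the routine above, WelsSampleSad4x4_mmx, computes a 4x4 sum of absolute differences by packing two rows per MMX register and letting psadbw perform the per-byte absolute differences and the horizontal add. A plain-C scalar sketch of the same operation, assuming the __cdecl prototype given in the comment block, might look like this; the function name SampleSad4x4_c is illustrative, not an identifier from the codec:

    #include <stdint.h>

    /* Scalar reference for a 4x4 SAD: accumulate |pPix1 - pPix2| over a
     * 4x4 block, advancing each pointer by its own stride per row. */
    static int32_t SampleSad4x4_c (uint8_t *pPix1, int32_t iStride1,
                                   uint8_t *pPix2, int32_t iStride2) {
        int32_t iSad = 0;
        for (int32_t y = 0; y < 4; ++y) {
            for (int32_t x = 0; x < 4; ++x) {
                int32_t iDiff = pPix1[x] - pPix2[x];
                iSad += (iDiff >= 0) ? iDiff : -iDiff;
            }
            pPix1 += iStride1;
            pPix2 += iStride2;
        }
        return iSad;
    }

The SIMD version replaces the inner loops with a single psadbw per register pair, which is why the assembly only needs two punpckldq/psadbw/paddw groups before reading the result with movd.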