openh264/codec/common/mc_chroma.asm

346 lines
7.2 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2004-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* mc_chroma.asm
;*
;* Abstract
;* mmx motion compensation for chroma
;*
;* History
;* 10/13/2004 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
SECTION .rodata align=16
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************
ALIGN 16
h264_d0x20_sse2:
dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
dw 32,32,32,32
;=============================================================================
; Code
;=============================================================================
SECTION .text
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq4_mmx( uint8_t *src,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; uint8_t *pABCD,
; int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
McChromaWidthEq4_mmx:
;push esi
;push edi
;push ebx
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
movsx r5, r5d
%endif
;mov eax, [esp +12 + 20]
movd mm3, [r4]; [eax]
WELS_Zero mm7
punpcklbw mm3, mm3
movq mm4, mm3
punpcklwd mm3, mm3
punpckhwd mm4, mm4
movq mm5, mm3
punpcklbw mm3, mm7
punpckhbw mm5, mm7
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
;mov esi, [esp +12+ 4]
;mov eax, [esp + 12 + 8]
;mov edi, [esp + 12 + 12]
;mov edx, [esp + 12 + 16]
;mov ecx, [esp + 12 + 24]
lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movd mm0, [r0]
movd mm1, [r0+1]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
.xloop:
pmullw mm0, mm3
pmullw mm1, mm5
paddw mm0, mm1
movd mm1, [r4]
punpcklbw mm1, mm7
movq mm2, mm1
pmullw mm1, mm4
paddw mm0, mm1
movd mm1, [r4+1]
punpcklbw mm1, mm7
movq mm7, mm1
pmullw mm1,mm6
paddw mm0, mm1
movq mm1,mm7
paddw mm0, [h264_d0x20_mmx]
psrlw mm0, 6
WELS_Zero mm7
packuswb mm0, mm7
movd [r2], mm0
movq mm0, mm2
lea r2, [r2 + r3]
lea r4, [r4 + r1]
dec r5
jnz near .xloop
WELSEMMS
LOAD_6_PARA_POP
;pop ebx
;pop edi
;pop esi
ret
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq8_sse2( uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; uint8_t *pABCD,
; int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
McChromaWidthEq8_sse2:
;push esi
;push edi
;push ebx
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
movsx r5, r5d
%endif
;mov eax, [esp +12 + 20]
movd xmm3, [r4]
WELS_Zero xmm7
punpcklbw xmm3, xmm3
punpcklwd xmm3, xmm3
movdqa xmm4, xmm3
punpckldq xmm3, xmm3
punpckhdq xmm4, xmm4
movdqa xmm5, xmm3
movdqa xmm6, xmm4
punpcklbw xmm3, xmm7
punpckhbw xmm5, xmm7
punpcklbw xmm4, xmm7
punpckhbw xmm6, xmm7
;mov esi, [esp +12+ 4]
;mov eax, [esp + 12 + 8]
;mov edi, [esp + 12 + 12]
;mov edx, [esp + 12 + 16]
;mov ecx, [esp + 12 + 24]
lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movq xmm0, [r0]
movq xmm1, [r0+1]
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
.xloop:
pmullw xmm0, xmm3
pmullw xmm1, xmm5
paddw xmm0, xmm1
movq xmm1, [r4]
punpcklbw xmm1, xmm7
movdqa xmm2, xmm1
pmullw xmm1, xmm4
paddw xmm0, xmm1
movq xmm1, [r4+1]
punpcklbw xmm1, xmm7
movdqa xmm7, xmm1
pmullw xmm1, xmm6
paddw xmm0, xmm1
movdqa xmm1,xmm7
paddw xmm0, [h264_d0x20_sse2]
psrlw xmm0, 6
WELS_Zero xmm7
packuswb xmm0, xmm7
movq [r2], xmm0
movdqa xmm0, xmm2
lea r2, [r2 + r3]
lea r4, [r4 + r1]
dec r5
jnz near .xloop
LOAD_6_PARA_POP
;pop ebx
;pop edi
;pop esi
ret
ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; uint8_t *pABCD,
; int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
McChromaWidthEq8_ssse3:
;push ebx
;push esi
;push edi
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
movsx r5, r5d
%endif
;mov eax, [esp + 12 + 20]
pxor xmm7, xmm7
movd xmm5, [r4]
punpcklwd xmm5, xmm5
punpckldq xmm5, xmm5
movdqa xmm6, xmm5
punpcklqdq xmm5, xmm5
punpckhqdq xmm6, xmm6
;mov eax, [esp + 12 + 4]
;mov edx, [esp + 12 + 8]
;mov esi, [esp + 12 + 12]
;mov edi, [esp + 12 + 16]
;mov ecx, [esp + 12 + 24]
sub r2, r3 ;sub esi, edi
sub r2, r3
movdqa xmm7, [h264_d0x20_sse2]
movdqu xmm0, [r0]
movdqa xmm1, xmm0
psrldq xmm1, 1
punpcklbw xmm0, xmm1
.hloop_chroma:
lea r2, [r2+2*r3]
movdqu xmm2, [r0+r1]
movdqa xmm3, xmm2
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm4, xmm2
pmaddubsw xmm0, xmm5
pmaddubsw xmm2, xmm6
paddw xmm0, xmm2
paddw xmm0, xmm7
psrlw xmm0, 6
packuswb xmm0, xmm0
movq [r2],xmm0
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
movdqa xmm3, xmm2
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm0, xmm2
pmaddubsw xmm4, xmm5
pmaddubsw xmm2, xmm6
paddw xmm4, xmm2
paddw xmm4, xmm7
psrlw xmm4, 6
packuswb xmm4, xmm4
movq [r2+r3],xmm4
sub r5, 2
jnz .hloop_chroma
LOAD_6_PARA_POP
;pop edi
;pop esi
;pop ebx
ret