openh264/codec/common/x86/mc_luma.asm

1165 lines
23 KiB
NASM
Raw Normal View History

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* mc_luma.asm
;*
;* Abstract
;* sse2 motion compensation
;*
;* History
;* 17/08/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
SECTION .rodata align=16
;*******************************************************************************
; Various memory constants (trigonometric values or rounding values)
;*******************************************************************************
ALIGN 16
h264_w0x10:
dw 16, 16, 16, 16
ALIGN 16
h264_w0x10_1:
dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
h264_mc_hc_32:
dw 32, 32, 32, 32, 32, 32, 32, 32
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight)
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq4_mmx
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r0, 2
WELS_Zero mm7
movq mm6, [h264_w0x10]
.height_loop:
movd mm0, [r0]
punpcklbw mm0, mm7
movd mm1, [r0+5]
punpcklbw mm1, mm7
movd mm2, [r0+1]
punpcklbw mm2, mm7
movd mm3, [r0+4]
punpcklbw mm3, mm7
movd mm4, [r0+2]
punpcklbw mm4, mm7
movd mm5, [r0+3]
punpcklbw mm5, mm7
paddw mm2, mm3
paddw mm4, mm5
psllw mm4, 2
psubw mm4, mm2
paddw mm0, mm1
paddw mm0, mm4
psllw mm4, 2
paddw mm0, mm4
paddw mm0, mm6
psraw mm0, 5
packuswb mm0, mm7
movd [r2], mm0
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
ret
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
%macro SSE_LOAD_8P 3
movq %1, %3
punpcklbw %1, %2
%endmacro
%macro FILTER_HV_W8 9
paddw %1, %6
movdqa %8, %3
movdqa %7, %2
paddw %1, [h264_w0x10_1]
paddw %8, %4
paddw %7, %5
psllw %8, 2
psubw %8, %7
paddw %1, %8
psllw %8, 2
paddw %1, %8
psraw %1, 5
WELS_Zero %8
packuswb %1, %8
movq %9, %1
%endmacro
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
; int16_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride
; int32_t iHeight
; )
;***********************************************************************
WELS_EXTERN McHorVer22Width8HorFirst_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
sub r0, r1 ;;;;;;;;need more 5 lines.
sub r0, r1
.yloop_width_8:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [r2], xmm0
add r0, r1
add r2, r3
dec r4
jnz .yloop_width_8
POP_XMM
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight,
; );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
lea r0, [r0-2] ;pSrc -= 2;
pxor xmm7, xmm7
movdqa xmm6, [h264_w0x10_1]
.y_loop:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2], xmm0
lea r2, [r2+r3]
lea r0, [r0+r1]
dec r4
jnz near .y_loop
POP_XMM
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight,
; );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
lea r0, [r0-2] ;pSrc -= 2;
pxor xmm7, xmm7
movdqa xmm6, [h264_w0x10_1]
.y_loop:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2+8], xmm0
lea r2, [r2+r3]
lea r0, [r0+r1]
dec r4
jnz near .y_loop
POP_XMM
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight )
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r0, r1
sub r0, r1
WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [r0]
SSE_LOAD_8P xmm1, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm7, [r0]
SSE_LOAD_8P xmm3, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm7, [r0]
SSE_LOAD_8P xmm5, xmm7, [r0+r1]
.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm7, xmm0, [r0+r1]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm0, xmm1, [r0]
FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm1, xmm2, [r0+r1]
FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm3, [r0]
FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm3, xmm4, [r0+r1]
FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm5, [r0]
FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm5, xmm6, [r0+r1]
jmp near .start
.xx_exit:
POP_XMM
LOAD_5_PARA_POP
ret
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
;***********************************************************************
; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
sub r0, r1
sub r0, r1
.xloop:
WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [r0]
SSE_LOAD_8P xmm1, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm7, [r0]
SSE_LOAD_8P xmm3, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm7, [r0]
SSE_LOAD_8P xmm5, xmm7, [r0+r1]
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
movdqa xmm0,xmm1
movdqa xmm1,xmm2
movdqa xmm2,xmm3
movdqa xmm3,xmm4
movdqa xmm4,xmm5
movdqa xmm5,xmm6
add r2, r3
sub r0, r1
.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm7, xmm0, [r0+r1]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm0, xmm1, [r0]
FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm1, xmm2, [r0+r1]
FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm3, [r0]
FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm3, xmm4, [r0+r1]
FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm5, [r0]
FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm5, xmm6, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .xx_exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
sub r0, r1
sub r0, r1
add r0, 8
add r2, 8
jmp near .xloop
.xx_exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret
;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight
; );
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
sub r0, 2
pxor xmm7, xmm7
cmp r4, 9
jne near .width_17
.yloop_width_9:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
paddw xmm0, [h264_w0x10_1]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
paddw xmm2, [h264_w0x10_1]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+1], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
ret
.width_17:
.yloop_width_17:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, [h264_w0x10_1]
psraw xmm0, 5
packuswb xmm0, xmm0
movq [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
paddw xmm0, [h264_w0x10_1]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2+8], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6+8]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
paddw xmm2, [h264_w0x10_1]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+9], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
ret
;***********************************************************************
;void McHorVer22HorFirst_sse2
; (const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t * pTap,
; int32_t iTapStride,
; int32_t iWidth,int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
pxor xmm7, xmm7
sub r0, r1 ;;;;;;;;need more 5 lines.
sub r0, r1
cmp r4, 9
jne near .width_17
.yloop_width_9:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
movd [r2], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
movq [r2+2], xmm2
movhps [r2+2+8], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
ret
.width_17:
.yloop_width_17:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
movd [r2+16], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6+8]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
movq [r2+18], xmm2
movhps [r2+18+8], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
ret
%macro FILTER_VER 9
paddw %1, %6
movdqa %7, %2
movdqa %8, %3
paddw %7, %5
paddw %8, %4
psubw %1, %7
psraw %1, 2
paddw %1, %8
psubw %1, %7
psraw %1, 2
paddw %8, %1
paddw %8, [h264_mc_hc_32]
psraw %8, 6
packuswb %8, %8
movq %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
; const uint8_t *pTap,
; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
.width_loop:
movdqa xmm0, [r0]
movdqa xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqa xmm2, [r0]
movdqa xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqa xmm4, [r0]
movdqa xmm5, [r0+r1]
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
movdqa xmm6, [r0]
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
add r2, r3
sub r0, r1
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm6, [r0]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm7, [r0+r1]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm0, [r0]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm1, [r0+r1]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm2, [r0]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm3, [r0+r1]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm4, [r0]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm5, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
add r0, 16
add r2, 8
jmp .width_loop
.exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret
;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
; const uint8_t *pTap,
; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
.width_loop:
movdqu xmm0, [r0]
movdqu xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
movdqu xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm4, [r0]
movdqu xmm5, [r0+r1]
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
movdqu xmm6, [r0]
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
add r2, r3
sub r0, r1
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm6, [r0]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm7, [r0+r1]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm0, [r0]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm1, [r0+r1]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm3, [r0+r1]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm4, [r0]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm5, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
add r0, 16
add r2, 8
jmp .width_loop
.exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret