openh264/codec/common/mc_luma.asm
commit 3cf52554f7 by Martin Storsjö: Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.

This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.

This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that those
registers need to be preserved on all platforms, not only on one
particular platform.

This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-17 13:44:33 +02:00
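
For reference, a minimal sketch of what such a macro pair can look like in NASM (illustrative only, assuming the build defines WIN64 for win64 targets; the actual implementation lives in asm_inc.asm and also has to cooperate with the push_num/LOAD_*_PARA stack bookkeeping, which is omitted here):

    %macro PUSH_XMM 1              ; %1 = number of the highest xmm register used + 1
    %ifdef WIN64
    %if %1 > 6                     ; xmm0-xmm5 are volatile; xmm6-xmm15 are callee-saved
        sub rsp, 16*(%1-6)
        %assign xidx 6
        %rep %1-6
            ; movdqu, since rsp is not guaranteed to be 16-byte aligned here
            movdqu [rsp+16*(xidx-6)], xmm %+ xidx
            %assign xidx xidx+1
        %endrep
    %endif
    %endif
    %endmacro

    %macro POP_XMM 1               ; parameter must match the preceding PUSH_XMM
    %ifdef WIN64
    %if %1 > 6
        %assign xidx 6
        %rep %1-6
            movdqu xmm %+ xidx, [rsp+16*(xidx-6)]
            %assign xidx xidx+1
        %endrep
        add rsp, 16*(%1-6)
    %endif
    %endif
    %endmacro

On other targets both macros expand to nothing, so the PUSH_XMM 8 / POP_XMM call sites seen throughout the file cost nothing outside win64.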

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* mc_luma.asm
;*
;* Abstract
;* sse2 motion compensation
;*
;* History
;* 17/08/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
%ifdef FORMAT_COFF
SECTION .rodata pData
%else
SECTION .rodata align=16
%endif
;*******************************************************************************
; Rounding constants used by the 6-tap interpolation filters
;*******************************************************************************
ALIGN 16
h264_w0x10:
dw 16, 16, 16, 16
ALIGN 16
h264_w0x10_1:
dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
h264_mc_hc_32:
dw 32, 32, 32, 32, 32, 32, 32, 32
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
WELS_EXTERN McHorVer20WidthEq4_mmx
ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight)
;*******************************************************************************
McHorVer20WidthEq4_mmx:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r0, 2
WELS_Zero mm7
movq mm6, [h264_w0x10]
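; 6-tap H.264 half-pel filter, computed without multiplies:
;   out = clip255( (a - 5b + 20c + 20d - 5e + f + 16) >> 5 )
; with a..f = src[-2..3]. Below, t = 4*(c+d) - (b+e) is formed once,
; then out = (a+f) + t + 4*t = (a+f) - 5*(b+e) + 20*(c+d), plus the
; rounding constant and the shift.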
.height_loop:
movd mm0, [r0]
punpcklbw mm0, mm7
movd mm1, [r0+5]
punpcklbw mm1, mm7
movd mm2, [r0+1]
punpcklbw mm2, mm7
movd mm3, [r0+4]
punpcklbw mm3, mm7
movd mm4, [r0+2]
punpcklbw mm4, mm7
movd mm5, [r0+3]
punpcklbw mm5, mm7
paddw mm2, mm3
paddw mm4, mm5
psllw mm4, 2
psubw mm4, mm2
paddw mm0, mm1
paddw mm0, mm4
psllw mm4, 2
paddw mm0, mm4
paddw mm0, mm6
psraw mm0, 5
packuswb mm0, mm7
movd [r2], mm0
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
ret
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
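; SSE_LOAD_8P %1, %2, %3: load 8 bytes from %3 and zero-extend them to
; eight 16-bit words in %1 (%2 must hold zero).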
%macro SSE_LOAD_8P 3
movq %1, %3
punpcklbw %1, %2
%endmacro
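; FILTER_HV_W8 %1-%6: six rows of 8 words (a..f), %7/%8: scratch,
; %9: output address. Stores clip255(((a+f) - 5*(b+e) + 20*(c+d) + 16) >> 5)
; as 8 bytes, clobbering %1, %7 and %8 (%8 ends up zero).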
%macro FILTER_HV_W8 9
paddw %1, %6
movdqa %8, %3
movdqa %7, %2
paddw %1, [h264_w0x10_1]
paddw %8, %4
paddw %7, %5
psllw %8, 2
psubw %8, %7
paddw %1, %8
psllw %8, 2
paddw %1, %8
psraw %1, 5
WELS_Zero %8
packuswb %1, %8
movq %9, %1
%endmacro
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
WELS_EXTERN McHorVer22Width8HorFirst_sse2
WELS_EXTERN McHorVer02WidthEq8_sse2
WELS_EXTERN McHorVer20WidthEq8_sse2
WELS_EXTERN McHorVer20WidthEq16_sse2
ALIGN 16
;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const uint8_t *pSrc,
; int32_t iSrcStride,
; int16_t *pTap,
; int32_t iTapStride,
; int32_t iHeight )
;***********************************************************************
McHorVer22Width8HorFirst_sse2:
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
sub r0, r1 ; the following vertical pass needs 5 extra source rows (2 above, 3 below); back up 2 rows here
sub r0, r1
.yloop_width_8:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [r2], xmm0
add r0, r1
add r2, r3
dec r4
jnz .yloop_width_8
POP_XMM
LOAD_5_PARA_POP
ret
ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight );
;*******************************************************************************
McHorVer20WidthEq8_sse2:
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
lea r0, [r0-2] ;pSrc -= 2;
pxor xmm7, xmm7
movdqa xmm6, [h264_w0x10_1]
.y_loop:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2], xmm0
lea r2, [r2+r3]
lea r0, [r0+r1]
dec r4
jnz near .y_loop
POP_XMM
LOAD_5_PARA_POP
ret
ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight );
;*******************************************************************************
McHorVer20WidthEq16_sse2:
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
lea r0, [r0-2] ;pSrc -= 2;
pxor xmm7, xmm7
movdqa xmm6, [h264_w0x10_1]
.y_loop:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2+8], xmm0
lea r2, [r2+r3]
lea r0, [r0+r1]
dec r4
jnz near .y_loop
POP_XMM
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight )
;*******************************************************************************
ALIGN 16
McHorVer02WidthEq8_sse2:
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r0, r1
sub r0, r1
WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [r0]
SSE_LOAD_8P xmm1, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm7, [r0]
SSE_LOAD_8P xmm3, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm7, [r0]
SSE_LOAD_8P xmm5, xmm7, [r0+r1]
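; Software-pipelined vertical filter: the loop body below is unrolled
; 8 times so the 6-row sliding window rotates through xmm0-xmm7 without
; any register-to-register copies; each step emits one output row and
; loads the next source row into the register that just left the window.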
.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm7, xmm0, [r0+r1]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm0, xmm1, [r0]
FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm1, xmm2, [r0+r1]
FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm3, [r0]
FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm3, xmm4, [r0+r1]
FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm5, [r0]
FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm5, xmm6, [r0+r1]
jmp near .start
.xx_exit:
POP_XMM
LOAD_5_PARA_POP
ret
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
WELS_EXTERN McHorVer20Width9Or17_sse2
WELS_EXTERN McHorVer02Height9Or17_sse2
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
WELS_EXTERN McHorVer22HorFirst_sse2
;***********************************************************************
; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
ALIGN 16
McHorVer02Height9Or17_sse2:
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
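; On x86-64, keep copies of pSrc/pDst/iHeight in callee-saved registers
; so the outer 8-column loop can restart from them; the 32-bit build
; reloads them from the stack arguments instead (see .x_loop_dec).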
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
sub r0, r1
sub r0, r1
.xloop:
WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [r0]
SSE_LOAD_8P xmm1, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm7, [r0]
SSE_LOAD_8P xmm3, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm7, [r0]
SSE_LOAD_8P xmm5, xmm7, [r0+r1]
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
movdqa xmm0,xmm1
movdqa xmm1,xmm2
movdqa xmm2,xmm3
movdqa xmm3,xmm4
movdqa xmm4,xmm5
movdqa xmm5,xmm6
add r2, r3
sub r0, r1
.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm7, xmm0, [r0+r1]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm0, xmm1, [r0]
FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm1, xmm2, [r0+r1]
FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm3, [r0]
FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm3, xmm4, [r0+r1]
FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm5, [r0]
FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm5, xmm6, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .xx_exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
sub r0, r1
sub r0, r1
add r0, 8
add r2, 8
jmp near .xloop
.xx_exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret
ALIGN 16
;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight
; );
;***********************************************************************
McHorVer20Width9Or17_sse2:
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
sub r0, 2
pxor xmm7, xmm7
cmp r4, 9
jne near .width_17
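; Width 9: one filter pass yields pixels 0-3 (movd), a second pass on
; inputs shifted right by one yields pixels 1-8, stored overlapping at
; [r2+1]; the overlapped pixels are simply rewritten with equal values.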
.yloop_width_9:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
paddw xmm0, [h264_w0x10_1]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
paddw xmm2, [h264_w0x10_1]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+1], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
ret
.width_17:
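; Width 17: pixels 0-7 (movq at [r2]), then pixels 8-11 (movd at [r2+8])
; and pixels 9-16 (movq at [r2+9]), again rewriting the overlap with
; identical values.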
.yloop_width_17:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, [h264_w0x10_1]
psraw xmm0, 5
packuswb xmm0, xmm0
movq [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
paddw xmm0, [h264_w0x10_1]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2+8], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6+8]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
paddw xmm2, [h264_w0x10_1]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+9], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
ret
ALIGN 16
;***********************************************************************
;void McHorVer22HorFirst_sse2
; (const uint8_t *pSrc,
; int32_t iSrcStride,
; int16_t *pTap,
; int32_t iTapStride,
; int32_t iWidth, int32_t iHeight);
;***********************************************************************
McHorVer22HorFirst_sse2:
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
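; Horizontal pass of the center (hv) case: the raw 16-bit 6-tap sums
; are stored unrounded and unshifted, so the vertical pass can finish
; the two-dimensional filter at full precision.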
pxor xmm7, xmm7
sub r0, r1 ; the following vertical pass needs 5 extra source rows (2 above, 3 below); back up 2 rows here
sub r0, r1
cmp r4, 9
jne near .width_17
.yloop_width_9:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
movd [r2], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
movq [r2+2], xmm2
movhps [r2+2+8], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
ret
.width_17:
.yloop_width_17:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
movd [r2+16], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6+8]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
movq [r2+18], xmm2
movhps [r2+18+8], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
ret
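; FILTER_VER %1-%6: six rows of 16-bit horizontal tap sums (a..f) from
; the HorFirst pass, %7/%8: scratch, %9: output address. Evaluates
;   clip255( (a - 5b + 20c + 20d - 5e + f + 512) >> 10 )
; in stages: x = ((a+f) - (b+e)) >> 2, y = (x + (c+d) - (b+e)) >> 2,
; out = ((c+d) + y + 32) >> 6, which keeps every intermediate within
; 16-bit range. Clobbers %1, %7 and %8.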
%macro FILTER_VER 9
paddw %1, %6
movdqa %7, %2
movdqa %8, %3
paddw %7, %5
paddw %8, %4
psubw %1, %7
psraw %1, 2
paddw %1, %8
psubw %1, %7
psraw %1, 2
paddw %8, %1
paddw %8, [h264_mc_hc_32]
psraw %8, 6
packuswb %8, %8
movq %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
; const int16_t *pTap,
; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
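; Identical to McHorVer22Width8VerLastUnAlign_sse2 below, except that
; the 16-bit tap rows are loaded with aligned movdqa instead of movdqu.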
McHorVer22Width8VerLastAlign_sse2:
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
.width_loop:
movdqa xmm0, [r0]
movdqa xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqa xmm2, [r0]
movdqa xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqa xmm4, [r0]
movdqa xmm5, [r0+r1]
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
movdqa xmm6, [r0]
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
add r2, r3
sub r0, r1
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm6, [r0]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm7, [r0+r1]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm0, [r0]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm1, [r0+r1]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm2, [r0]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm3, [r0+r1]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm4, [r0]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm5, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
add r0, 16
add r2, 8
jmp .width_loop
.exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret
;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
; const int16_t *pTap,
; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
McHorVer22Width8VerLastUnAlign_sse2:
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
.width_loop:
movdqu xmm0, [r0]
movdqu xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
movdqu xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm4, [r0]
movdqu xmm5, [r0+r1]
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
movdqu xmm6, [r0]
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
add r2, r3
sub r0, r1
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm6, [r0]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm7, [r0+r1]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm0, [r0]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm1, [r0+r1]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm3, [r0+r1]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm4, [r0]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm5, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
add r0, 16
add r2, 8
jmp .width_loop
.exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret