; Syntax: NASM
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* mc_luma.asm
;*
;* Abstract
;* sse2 motion compensation
;*
;* History
;* 17/08/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
SECTION .rodata align=16

;*******************************************************************************
; Rounding constants for the luma interpolation filters in this file.
; Each table is a vector of identical 16-bit words so it can be added to a
; whole register of word lanes with a single paddw.
;*******************************************************************************

ALIGN 16
h264_w0x10:                     ; 4 x word 16 (8 bytes): loaded with a 64-bit movq into an MMX register
    dw 16, 16, 16, 16
ALIGN 16
h264_w0x10_1:                   ; 8 x word 16 (16 bytes): rounding term added before the ">> 5" in the SSE2 paths
    dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
h264_mc_hc_32:                  ; 8 x word 32 (16 bytes): rounding term added before the ">> 6" in FILTER_VER
    dw 32, 32, 32, 32, 32, 32, 32, 32


;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text


;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
;                              int iSrcStride,
;                              uint8_t *pDst,
;                              int iDstStride,
;                              int iHeight)
;
; Horizontal 6-tap filter over a 4-pixel-wide block (MMX). For each row and
; each of the 4 output pixels x:
;   pDst[x] = sat_u8((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)
; with a..f = pSrc[x-2] .. pSrc[x+3].
;
; Register use in the body:
;   r0 = source pointer (moved back by 2 bytes), r1 = source stride,
;   r2 = destination pointer, r3 = destination stride, r4 = rows remaining,
;   mm7 = 0 (zero operand for unpack/pack), mm6 = 4 x word 16 (rounding).
; NOTE(review): LOAD_5_PARA / SIGN_EXTENSION / WELSEMMS come from asm_inc.asm;
; presumably they map the five C arguments onto r0..r4, widen the int
; arguments, and leave MMX state - confirm there.
; The row counter is only tested after a row has been written, so iHeight
; must be at least 1.
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq4_mmx
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

    sub         r0, 2               ; first tap sits two pixels left of the output pixel
    WELS_Zero   mm7
    movq        mm6, [h264_w0x10]   ; rounding term, 4 x 16
.height_loop:
    ; load the six taps, each widened from 4 bytes to 4 words
    movd        mm0, [r0]           ; a
    punpcklbw   mm0, mm7
    movd        mm1, [r0+5]         ; f
    punpcklbw   mm1, mm7
    movd        mm2, [r0+1]         ; b
    punpcklbw   mm2, mm7
    movd        mm3, [r0+4]         ; e
    punpcklbw   mm3, mm7
    movd        mm4, [r0+2]         ; c
    punpcklbw   mm4, mm7
    movd        mm5, [r0+3]         ; d
    punpcklbw   mm5, mm7

    paddw       mm2, mm3            ; b + e
    paddw       mm4, mm5            ; c + d
    psllw       mm4, 2              ; 4*(c+d)
    psubw       mm4, mm2            ; 4*(c+d) - (b+e)
    paddw       mm0, mm1            ; a + f
    paddw       mm0, mm4
    psllw       mm4, 2              ; 16*(c+d) - 4*(b+e)
    paddw       mm0, mm4            ; a + f - 5*(b+e) + 20*(c+d)
    paddw       mm0, mm6            ; + 16
    psraw       mm0, 5
    packuswb    mm0, mm7            ; saturate words to unsigned bytes
    movd        [r2], mm0           ; write 4 pixels

    add         r0, r1
    add         r2, r3
    dec         r4
    jnz         .height_loop

    WELSEMMS
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************


;-------------------------------------------------------------------------------
; SSE_LOAD_8P dst, zero, mem
;   %1 = destination register
;   %2 = register that must currently hold all zeros
;   %3 = memory operand; 8 bytes are read from it
; Loads 8 bytes and interleaves them with the bytes of %2, which yields eight
; zero-extended 16-bit words in %1 when %2 is zero.
;-------------------------------------------------------------------------------
%macro SSE_LOAD_8P 3
    movq        %1, %3
    punpcklbw   %1, %2
%endmacro

;-------------------------------------------------------------------------------
; FILTER_HV_W8 a, b, c, d, e, f, tmp1, tmp2, dst
;   %1..%6 = six registers of eight 16-bit words each (the six filter taps)
;   %7, %8 = scratch registers
;   %9     = memory operand receiving 8 result bytes
; Computes, per word lane,
;   sat_u8((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)
; and stores the eight resulting bytes to %9.
; Register effects: %1 is overwritten with the packed result, %7 is clobbered,
; and %8 is left holding all zeros (callers in this file rely on that and pass
; it as the zero operand of the following SSE_LOAD_8P). %2..%6 are preserved.
;-------------------------------------------------------------------------------
%macro FILTER_HV_W8 9
    paddw       %1, %6                  ; a + f
    movdqa      %8, %3
    movdqa      %7, %2
    paddw       %1, [h264_w0x10_1]      ; + 16 (rounding)
    paddw       %8, %4                  ; c + d
    paddw       %7, %5                  ; b + e
    psllw       %8, 2                   ; 4*(c+d)
    psubw       %8, %7                  ; 4*(c+d) - (b+e)
    paddw       %1, %8
    psllw       %8, 2                   ; 16*(c+d) - 4*(b+e)
    paddw       %1, %8                  ; a + f - 5*(b+e) + 20*(c+d) + 16
    psraw       %1, 5
    WELS_Zero   %8
    packuswb    %1, %8                  ; saturate to unsigned bytes
    movq        %9, %1
%endmacro

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const uint8_t *pSrc,
;                                    int32_t iSrcStride,
;                                    uint8_t *pDst,
;                                    int32_t iDstStride,
;                                    int32_t iHeight
;                                    )
;
; Horizontal pass of the two-stage (horizontal then vertical) filter for an
; 8-pixel-wide block. For every row, eight UNROUNDED 16-bit sums
;   a - 5*b + 20*c + 20*d - 5*e + f
; are written to the destination, with a..f = the six bytes at [r0+x] ..
; [r0+x+5]. No shift, rounding or clipping is applied here.
;
; Notes grounded in the code below:
;   * Source data is read as bytes and the destination receives eight 16-bit
;     words (16 bytes) per row; iDstStride is added as a byte offset.
;   * The store uses movdqa, so pDst and iDstStride must keep every row
;     16-byte aligned.
;   * The source pointer is moved up by two rows before the loop, and iHeight
;     rows are produced starting from there (original note: "need more 5
;     lines").
;   * Unlike the McHorVer20* routines, no "pSrc -= 2" is applied here.
; NOTE(review): the earlier prototype comment declared pSrc as int16_t* and
; iSrcStride as int16_t, which does not match the byte loads / 32-bit sign
; extension below; presumably the caller passes pSrc - 2 and a height that
; already includes the extra rows - confirm against the C caller.
; iHeight must be at least 1 (counter tested after the first row).
;***********************************************************************
WELS_EXTERN McHorVer22Width8HorFirst_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    pxor        xmm7, xmm7          ; zero for byte -> word unpacking

    sub         r0, r1              ; start two rows above the given pointer
    sub         r0, r1

.yloop_width_8:
    movq        xmm0, [r0]          ; a
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5]        ; f
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1]        ; b
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4]        ; e
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2]        ; c
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3]        ; d
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm3          ; b + e
    paddw       xmm4, xmm5          ; c + d
    psllw       xmm4, 2
    psubw       xmm4, xmm2          ; 4*(c+d) - (b+e)
    paddw       xmm0, xmm1          ; a + f
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4          ; a + f - 5*(b+e) + 20*(c+d)
    movdqa      [r2], xmm0          ; eight 16-bit sums, aligned store

    add         r0, r1
    add         r2, r3
    dec         r4
    jnz         .yloop_width_8
    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
;                               int iSrcStride,
;                               uint8_t *pDst,
;                               int iDstStride,
;                               int iHeight
;                             );
;
; Horizontal 6-tap filter over an 8-pixel-wide block. For each row and each
; of the 8 output pixels x:
;   pDst[x] = sat_u8((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)
; with a..f = pSrc[x-2] .. pSrc[x+3].
;
; Register use in the body:
;   r0 = source pointer - 2, r1 = source stride, r2 = destination pointer,
;   r3 = destination stride, r4 = rows remaining,
;   xmm7 = 0, xmm6 = 8 x word 16 (rounding).
; All loads and stores are 8-byte movq, so no alignment is required of pSrc
; or pDst. iHeight must be at least 1.
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    lea         r0, [r0-2]          ; pSrc -= 2

    pxor        xmm7, xmm7
    movdqa      xmm6, [h264_w0x10_1]
.y_loop:
    movq        xmm0, [r0]          ; a
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5]        ; f
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1]        ; b
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4]        ; e
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2]        ; c
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3]        ; d
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm3          ; b + e
    paddw       xmm4, xmm5          ; c + d
    psllw       xmm4, 2
    psubw       xmm4, xmm2          ; 4*(c+d) - (b+e)
    paddw       xmm0, xmm1          ; a + f
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4          ; a + f - 5*(b+e) + 20*(c+d)
    paddw       xmm0, xmm6          ; + 16
    psraw       xmm0, 5

    packuswb    xmm0, xmm7          ; saturate to unsigned bytes
    movq        [r2], xmm0          ; write 8 pixels

    lea         r2, [r2+r3]
    lea         r0, [r0+r1]
    dec         r4
    jnz near    .y_loop

    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
;                                int iSrcStride,
;                                uint8_t *pDst,
;                                int iDstStride,
;                                int iHeight
;                              );
;
; Horizontal 6-tap filter over a 16-pixel-wide block, processed as two
; independent 8-pixel halves per row (columns 0..7, then columns 8..15).
; For each output pixel x:
;   pDst[x] = sat_u8((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)
; with a..f = pSrc[x-2] .. pSrc[x+3].
;
; Register use in the body:
;   r0 = source pointer - 2, r1 = source stride, r2 = destination pointer,
;   r3 = destination stride, r4 = rows remaining,
;   xmm7 = 0, xmm6 = 8 x word 16 (rounding).
; All memory accesses are 8-byte movq (no alignment requirement).
; iHeight must be at least 1.
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    lea         r0, [r0-2]          ; pSrc -= 2

    pxor        xmm7, xmm7
    movdqa      xmm6, [h264_w0x10_1]
.y_loop:

    ; ---- left half: output columns 0..7 ----
    movq        xmm0, [r0]          ; a
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5]        ; f
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1]        ; b
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4]        ; e
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2]        ; c
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3]        ; d
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm3          ; b + e
    paddw       xmm4, xmm5          ; c + d
    psllw       xmm4, 2
    psubw       xmm4, xmm2          ; 4*(c+d) - (b+e)
    paddw       xmm0, xmm1          ; a + f
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4          ; a + f - 5*(b+e) + 20*(c+d)
    paddw       xmm0, xmm6          ; + 16
    psraw       xmm0, 5
    packuswb    xmm0, xmm7
    movq        [r2], xmm0

    ; ---- right half: output columns 8..15 ----
    movq        xmm0, [r0+8]        ; a
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5+8]      ; f
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1+8]      ; b
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4+8]      ; e
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2+8]      ; c
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3+8]      ; d
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm3
    paddw       xmm4, xmm5
    psllw       xmm4, 2
    psubw       xmm4, xmm2
    paddw       xmm0, xmm1
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4
    paddw       xmm0, xmm6
    psraw       xmm0, 5
    packuswb    xmm0, xmm7
    movq        [r2+8], xmm0

    lea         r2, [r2+r3]
    lea         r0, [r0+r1]
    dec         r4
    jnz near    .y_loop

    POP_XMM
    LOAD_5_PARA_POP
    ret


;*******************************************************************************
; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
;                               int iSrcStride,
;                               uint8_t *pDst,
;                               int iDstStride,
;                               int iHeight )
;
; Vertical 6-tap filter over an 8-pixel-wide block. Output row y is computed
; by FILTER_HV_W8 from source rows y-2 .. y+3:
;   sat_u8((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)
;
; Structure: six consecutive source rows are kept widened to words in six of
; the eight xmm registers. The loop is unrolled eight times; on every step
; one new row is loaded into the register that held the oldest row and the
; register roles rotate by one, so no row is ever reloaded. After eight steps
; the roles are back where they started and the code jumps to .start.
; FILTER_HV_W8 leaves its 8th operand zeroed; that register is used as the
; zero operand of the next SSE_LOAD_8P.
;
; Register use: r0 walks the source two rows at a time (rows are read at
; [r0] and [r0+r1]); r2 walks the destination the same way ([r2], [r2+r3]);
; r4 = rows remaining. iHeight must be at least 1.
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    sub         r0, r1              ; first tap row is two rows above pSrc
    sub         r0, r1

    WELS_Zero   xmm7

    ; prime the pipeline with source rows -2 .. +3
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

.start:
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r4
    jz near     .xx_exit

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r4
    jz near     .xx_exit

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r4
    jz near     .xx_exit

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r4
    jz near     .xx_exit

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r4
    jz near     .xx_exit

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r4
    jz near     .xx_exit

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r4
    jz near     .xx_exit

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r4
    jz near     .xx_exit

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near    .start              ; register roles are back to the initial assignment

.xx_exit:
    POP_XMM
    LOAD_5_PARA_POP
    ret

;***********************************************************************
; Code
;***********************************************************************

SECTION .text


;***********************************************************************
; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
;                                  int32_t iSrcStride,
;                                  uint8_t *pDst,
;                                  int32_t iDstStride,
;                                  int32_t iWidth,
;                                  int32_t iHeight )
;
; Vertical 6-tap filter (see FILTER_HV_W8) applied to iWidth/8 vertical
; strips of 8 pixels, iHeight rows each.
;
; Per strip: the first output row is produced outside the unrolled loop, the
; six row registers are shifted down by one with plain moves, and the
; remaining rows are produced by the same 8-step rotating-register loop used
; in McHorVer02WidthEq8_sse2.
;
; Register use: r0 = source cursor, r1 = source stride, r2 = destination
; cursor, r3 = destination stride, r4 = strips remaining (iWidth >> 3),
; r5 = rows remaining in the current strip. On 64-bit builds r12/r13/r14
; keep the original pSrc / pDst / iHeight; on X86_32 they are re-read through
; arg1 / arg3 / arg6.
;
; Constraints that follow from the code:
;   * iWidth must be >= 8 (r4 is decremented before it is tested).
;   * iHeight must be >= 2: the counter is not tested after the first row.
;   * When moving to the next strip, the ORIGINAL pSrc/pDst are reloaded and
;     advanced by 8, so every strip after the first addresses columns 8..15.
;     The routine is therefore only correct for one or two strips
;     (iWidth of 8 or 16).
; NOTE(review): arg1/arg3/arg6 are defined in asm_inc.asm; presumably they
; address the stacked C arguments relative to push_num - confirm there.
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push        r12
    push        r13
    push        r14
    mov         r12, r0             ; keep original pSrc
    mov         r13, r2             ; keep original pDst
    mov         r14, r5             ; keep original iHeight
%endif

    shr         r4, 3               ; number of 8-pixel strips
    sub         r0, r1              ; first tap row is two rows above pSrc
    sub         r0, r1

.xloop:
    WELS_Zero   xmm7
    ; prime with source rows -2 .. +3 of this strip
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

    ; first output row, then slide the six-row window down by one row
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6
    add         r2, r3
    sub         r0, r1              ; r0 again points at the row held in xmm4

.start:
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near    .start

.x_loop_dec:
    dec         r4
    jz near     .xx_exit
    ; next strip: restore the original pointers / height, then step 8 columns right
%ifdef X86_32
    mov         r0, arg1
    mov         r2, arg3
    mov         r5, arg6
%else
    mov         r0, r12
    mov         r2, r13
    mov         r5, r14
%endif
    sub         r0, r1
    sub         r0, r1
    add         r0, 8
    add         r2, 8
    jmp near    .xloop

.xx_exit:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    ret


;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
;                                 int32_t iSrcStride,
;                                 uint8_t *pDst,
;                                 int32_t iDstStride,
;                                 int32_t iWidth,
;                                 int32_t iHeight
;                               );
;
; Horizontal 6-tap filter producing 9 or 17 output pixels per row:
;   pDst[x] = sat_u8((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)
; with a..f = pSrc[x-2] .. pSrc[x+3].
;
; iWidth == 9 selects the 9-pixel path; ANY other value takes the 17-pixel
; path (only "cmp r4, 9" is tested).
;
; The odd widths are covered with overlapping stores. Two filter results are
; derived from one set of six widened loads: the first uses xmm6/xmm7 as
; scratch so xmm1..xmm5 stay intact, then one extra load at offset +6 lets
; the same registers be reused, shifted by one pixel.
;   width 9 : 4 bytes at [r2]   (pixels 0..3)  + 8 bytes at [r2+1] (pixels 1..8)
;   width 17: 8 bytes at [r2]   (pixels 0..7)
;             4 bytes at [r2+8] (pixels 8..11) + 8 bytes at [r2+9] (pixels 9..16)
;
; Register use: r0 = source pointer - 2, r1 = source stride, r2 = destination
; pointer, r3 = destination stride, r4 = iWidth, r5 = rows remaining,
; xmm7 = 0 at the top of every loop iteration.
; iHeight must be at least 1.
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    sub         r0, 2               ; first tap sits two pixels left of the output pixel
    pxor        xmm7, xmm7

    cmp         r4, 9
    jne near    .width_17

.yloop_width_9:
    movq        xmm0, [r0]          ; a
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5]        ; f
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1]        ; b
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4]        ; e
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2]        ; c
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3]        ; d
    punpcklbw   xmm5, xmm7

    ; pixels 0..7, computed in xmm6/xmm7 so xmm1..xmm5 survive
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3          ; b + e
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5          ; c + d
    psllw       xmm6, 2
    psubw       xmm6, xmm7          ; 4*(c+d) - (b+e)
    paddw       xmm0, xmm1          ; a + f
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    paddw       xmm0, [h264_w0x10_1]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movd        [r2], xmm0          ; only pixels 0..3 are stored

    ; pixels 1..8: taps are the loads at +1,+2,+3,+4,+5 plus a new one at +6
    pxor        xmm7, xmm7
    movq        xmm0, [r0+6]
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1          ; [+2] + [+5]
    paddw       xmm5, xmm3          ; [+3] + [+4]
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0          ; [+1] + [+6]
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    paddw       xmm2, [h264_w0x10_1]
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movq        [r2+1], xmm2        ; pixels 1..8

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .yloop_width_9
    POP_XMM
    LOAD_6_PARA_POP
    ret


.width_17:
.yloop_width_17:
    ; pixels 0..7
    movq        xmm0, [r0]
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5]
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1]
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4]
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2]
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3]
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm3
    paddw       xmm4, xmm5
    psllw       xmm4, 2
    psubw       xmm4, xmm2
    paddw       xmm0, xmm1
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4
    paddw       xmm0, [h264_w0x10_1]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movq        [r2], xmm0

    ; pixels 8..15 (only 8..11 stored), keeping xmm1..xmm5 for the shifted pass
    movq        xmm0, [r0+8]
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5+8]
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1+8]
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4+8]
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2+8]
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3+8]
    punpcklbw   xmm5, xmm7

    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5
    psllw       xmm6, 2
    psubw       xmm6, xmm7
    paddw       xmm0, xmm1
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    paddw       xmm0, [h264_w0x10_1]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movd        [r2+8], xmm0


    ; pixels 9..16
    pxor        xmm7, xmm7
    movq        xmm0, [r0+6+8]
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1
    paddw       xmm5, xmm3
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    paddw       xmm2, [h264_w0x10_1]
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movq        [r2+9], xmm2
    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .yloop_width_17
    POP_XMM
    LOAD_6_PARA_POP
    ret



;***********************************************************************
;void McHorVer22HorFirst_sse2
;                           (const uint8_t *pSrc,
;                            int32_t iSrcStride,
;                            uint8_t * pTap,
;                            int32_t iTapStride,
;                            int32_t iWidth,int32_t iHeight);
;
; Horizontal pass of the two-stage (horizontal then vertical) filter for
; 9- or 17-pixel-wide blocks. For every row it writes UNROUNDED 16-bit sums
;   a - 5*b + 20*c + 20*d - 5*e + f
; with a..f = the six bytes at [r0+x] .. [r0+x+5]; no shift, rounding or
; clipping happens here.
;
; Notes grounded in the code below:
;   * Although pTap is declared uint8_t* above, each output element written
;     is a 16-bit word; iTapStride is added as a byte offset.
;   * iWidth == 9 selects the 9-tap path; any other value takes the 17 path.
;   * The source pointer is moved up two rows before the loop (original
;     note: "need more 5 lines"); no "pSrc -= 2" is applied.
;   * Stores per row:
;       width 9 : words 0..1 at [r2] (movd), words 1..8 at [r2+2]
;       width 17: words 0..7 at [r2] (movdqa - every row must be 16-byte
;                 aligned), words 8..9 at [r2+16] (movd),
;                 words 9..16 at [r2+18]
; NOTE(review): presumably the caller passes pSrc - 2 and a height that
; includes the extra rows needed by the vertical pass - confirm in the
; C caller.
; iHeight must be at least 1.
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    pxor        xmm7, xmm7
    sub         r0, r1              ; start two rows above the given pointer
    sub         r0, r1

    cmp         r4, 9
    jne near    .width_17

.yloop_width_9:
    movq        xmm0, [r0]          ; a
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5]        ; f
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1]        ; b
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4]        ; e
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2]        ; c
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3]        ; d
    punpcklbw   xmm5, xmm7

    ; sums 0..7 in xmm0, using xmm6/xmm7 as scratch so xmm1..xmm5 survive
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3          ; b + e
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5          ; c + d
    psllw       xmm6, 2
    psubw       xmm6, xmm7
    paddw       xmm0, xmm1          ; a + f
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    movd        [r2], xmm0          ; only sums 0..1 are stored

    ; sums 1..8: reuse the loads at +1..+5 and add one at +6
    pxor        xmm7, xmm7
    movq        xmm0, [r0+6]
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1
    paddw       xmm5, xmm3
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    movq        [r2+2], xmm2        ; low four words
    movhps      [r2+2+8], xmm2      ; high four words

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .yloop_width_9
    POP_XMM
    LOAD_6_PARA_POP
    ret


.width_17:
.yloop_width_17:
    ; sums 0..7
    movq        xmm0, [r0]
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5]
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1]
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4]
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2]
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3]
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm3
    paddw       xmm4, xmm5
    psllw       xmm4, 2
    psubw       xmm4, xmm2
    paddw       xmm0, xmm1
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4
    movdqa      [r2], xmm0          ; aligned 16-byte store

    ; sums 8..15 (only 8..9 stored), keeping xmm1..xmm5 for the shifted pass
    movq        xmm0, [r0+8]
    punpcklbw   xmm0, xmm7
    movq        xmm1, [r0+5+8]
    punpcklbw   xmm1, xmm7
    movq        xmm2, [r0+1+8]
    punpcklbw   xmm2, xmm7
    movq        xmm3, [r0+4+8]
    punpcklbw   xmm3, xmm7
    movq        xmm4, [r0+2+8]
    punpcklbw   xmm4, xmm7
    movq        xmm5, [r0+3+8]
    punpcklbw   xmm5, xmm7

    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5
    psllw       xmm6, 2
    psubw       xmm6, xmm7
    paddw       xmm0, xmm1
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    movd        [r2+16], xmm0


    ; sums 9..16
    pxor        xmm7, xmm7
    movq        xmm0, [r0+6+8]
    punpcklbw   xmm0, xmm7

    paddw       xmm4, xmm1
    paddw       xmm5, xmm3
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    movq        [r2+18], xmm2
    movhps      [r2+18+8], xmm2

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .yloop_width_17
    POP_XMM
    LOAD_6_PARA_POP
    ret


;-------------------------------------------------------------------------------
; FILTER_VER a, b, c, d, e, f, tmp1, tmp2, dst
;   %1..%6 = six registers of eight signed 16-bit words (six rows of
;            intermediate sums)
;   %7, %8 = scratch registers
;   %9     = memory operand receiving 8 result bytes
; With x = a+f, y = b+e, z = c+d the macro evaluates, per word lane,
;   t   = (((x - y) >> 2) + z - y) >> 2
;   out = sat_u8((z + t + 32) >> 6)
; i.e. the 6-tap combination (x - 5*y + 20*z) scaled down in stages with
; arithmetic shifts (each stage truncates), so all intermediates remain
; 16-bit words.
; Register effects: %1, %7 and %8 are clobbered (%8 ends up holding the
; packed result, not zero); %2..%6 are preserved.
;-------------------------------------------------------------------------------
%macro FILTER_VER 9
    paddw       %1, %6                  ; x = a + f
    movdqa      %7, %2
    movdqa      %8, %3


    paddw       %7, %5                  ; y = b + e
    paddw       %8, %4                  ; z = c + d

    psubw       %1, %7                  ; x - y
    psraw       %1, 2
    paddw       %1, %8
    psubw       %1, %7                  ; ((x - y) >> 2) + z - y
    psraw       %1, 2
    paddw       %8, %1                  ; z + t
    paddw       %8, [h264_mc_hc_32]     ; + 32 (rounding)
    psraw       %8, 6
    packuswb    %8, %8                  ; saturate to unsigned bytes
    movq        %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
;                                       const uint8_t *pTap,
;                                       int32_t iTapStride,
;                                       uint8_t * pDst,
;                                       int32_t iDstStride,
;                                       int32_t iWidth,
;                                       int32_t iHeight);
;
; Vertical (final) pass of the two-stage filter. Reads rows of 16-bit
; intermediate sums from pTap, combines six consecutive rows with FILTER_VER
; and writes 8 pixels per row to pDst, for iWidth/8 strips of iHeight rows.
;
; This variant loads with movdqa: pTap and iTapStride must keep every row of
; every strip 16-byte aligned (strips are 16 bytes apart).
;
; Structure per strip: the first output row is produced outside the unrolled
; loop, the six row registers are shifted down by one, and the remaining
; rows come from an 8-step loop in which the register roles rotate so each
; tap row is loaded exactly once.
;
; Register use: r0 = tap cursor, r1 = tap stride (bytes), r2 = destination
; cursor, r3 = destination stride, r4 = strips remaining (iWidth >> 3),
; r5 = rows remaining. On 64-bit builds r12/r13/r14 keep the original
; pTap / pDst / iHeight; on X86_32 they are re-read via arg1 / arg3 / arg6.
;
; Constraints that follow from the code:
;   * iWidth must be >= 8 and iHeight must be >= 2 (counters are decremented
;     before being tested / not tested after the first row).
;   * The next-strip step reloads the ORIGINAL pointers and adds a fixed
;     offset (16 bytes of taps, 8 bytes of pixels), so the routine is only
;     correct for one or two strips (iWidth of 8 or 16).
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
%ifndef X86_32
    push        r12
    push        r13
    push        r14
    mov         r12, r0             ; keep original pTap
    mov         r13, r2             ; keep original pDst
    mov         r14, r5             ; keep original iHeight
%endif

    shr         r4, 3               ; number of 8-pixel strips

.width_loop:
    ; prime with the first six tap rows of this strip
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+r1]
    lea         r0, [r0+2*r1]
    movdqa      xmm2, [r0]
    movdqa      xmm3, [r0+r1]
    lea         r0, [r0+2*r1]
    movdqa      xmm4, [r0]
    movdqa      xmm5, [r0+r1]

    ; first output row, then slide the six-row window down by one row
    FILTER_VER  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    lea         r0, [r0+2*r1]
    movdqa      xmm6, [r0]

    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6

    add         r2, r3
    sub         r0, r1              ; r0 again points at the row held in xmm4

.start:
    FILTER_VER  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    movdqa      xmm6, [r0]
    FILTER_VER  xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    movdqa      xmm7, [r0+r1]
    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    movdqa      xmm0, [r0]
    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    movdqa      xmm1, [r0+r1]
    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    movdqa      xmm2, [r0]
    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    movdqa      xmm3, [r0+r1]
    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    movdqa      xmm4, [r0]
    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    movdqa      xmm5, [r0+r1]
    jmp near    .start

.x_loop_dec:
    dec         r4
    jz near     .exit
    ; next strip: restore the original pointers / height, then step right
%ifdef X86_32
    mov         r0, arg1
    mov         r2, arg3
    mov         r5, arg6
%else
    mov         r0, r12
    mov         r2, r13
    mov         r5, r14
%endif
    add         r0, 16              ; 8 taps x 2 bytes
    add         r2, 8               ; 8 pixels
    jmp         .width_loop

.exit:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    ret

;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
;                                         const uint8_t *pTap,
;                                         int32_t iTapStride,
;                                         uint8_t * pDst,
;                                         int32_t iDstStride,
;                                         int32_t iWidth,
;                                         int32_t iHeight);
;
; Vertical (final) pass of the two-stage filter for tap buffers with no
; alignment guarantee. Reads rows of 16-bit intermediate sums from pTap with
; movdqu, combines six consecutive rows with FILTER_VER and writes 8 pixels
; per row to pDst, for iWidth/8 strips of iHeight rows.
;
; Structure per strip: the first output row is produced outside the unrolled
; loop, the six row registers are shifted down by one, and the remaining
; rows come from an 8-step loop in which the register roles rotate so each
; tap row is loaded exactly once.
;
; Register use: r0 = tap cursor, r1 = tap stride (bytes), r2 = destination
; cursor, r3 = destination stride, r4 = strips remaining (iWidth >> 3),
; r5 = rows remaining. On 64-bit builds r12/r13/r14 keep the original
; pTap / pDst / iHeight; on X86_32 they are re-read via arg1 / arg3 / arg6.
;
; Constraints that follow from the code:
;   * iWidth must be >= 8 and iHeight must be >= 2 (counters are decremented
;     before being tested / not tested after the first row).
;   * The next-strip step reloads the ORIGINAL pointers and adds a fixed
;     offset (16 bytes of taps, 8 bytes of pixels), so the routine is only
;     correct for one or two strips (iWidth of 8 or 16).
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
%ifndef X86_32
    push        r12
    push        r13
    push        r14
    mov         r12, r0             ; keep original pTap
    mov         r13, r2             ; keep original pDst
    mov         r14, r5             ; keep original iHeight
%endif
    shr         r4, 3               ; number of 8-pixel strips

.width_loop:
    ; prime with the first six tap rows of this strip
    movdqu      xmm0, [r0]
    movdqu      xmm1, [r0+r1]
    lea         r0, [r0+2*r1]
    movdqu      xmm2, [r0]
    movdqu      xmm3, [r0+r1]
    lea         r0, [r0+2*r1]
    movdqu      xmm4, [r0]
    movdqu      xmm5, [r0+r1]

    ; first output row, then slide the six-row window down by one row
    FILTER_VER  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    lea         r0, [r0+2*r1]
    movdqu      xmm6, [r0]

    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6

    add         r2, r3
    sub         r0, r1              ; r0 again points at the row held in xmm4

.start:
    FILTER_VER  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    movdqu      xmm6, [r0]
    FILTER_VER  xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    movdqu      xmm7, [r0+r1]
    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    movdqu      xmm0, [r0]
    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    movdqu      xmm1, [r0+r1]
    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    movdqu      xmm2, [r0]
    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    movdqu      xmm3, [r0+r1]
    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz near     .x_loop_dec

    lea         r0, [r0+2*r1]
    movdqu      xmm4, [r0]
    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz near     .x_loop_dec

    lea         r2, [r2+2*r3]
    movdqu      xmm5, [r0+r1]
    jmp near    .start

.x_loop_dec:
    dec         r4
    jz near     .exit
    ; next strip: restore the original pointers / height, then step right
%ifdef X86_32
    mov         r0, arg1
    mov         r2, arg3
    mov         r5, arg6
%else
    mov         r0, r12
    mov         r2, r13
    mov         r5, r14
%endif
    add         r0, 16              ; 8 taps x 2 bytes
    add         r2, 8               ; 8 pixels
    jmp         .width_loop

.exit:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    ret