;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mc_luma.asm
;*
;*  Abstract
;*      sse2 motion compensation
;*
;*  History
;*      17/08/2009 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
SECTION .rodata align=16

;*******************************************************************************
; Various memory constants (rounding values for the 6-tap filter)
;*******************************************************************************

ALIGN 16
h264_w0x10:
    dw 16, 16, 16, 16
ALIGN 16
h264_w0x10_1:
    dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
h264_mc_hc_32:
    dw 32, 32, 32, 32, 32, 32, 32, 32

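; h264_w0x10 / h264_w0x10_1 hold the rounding term 16 used by the single-stage
; half-pel filter ((t + 16) >> 5), for the 4-lane (MMX) and 8-lane (SSE2) code
; paths respectively.  h264_mc_hc_32 holds 32, the rounding term used by
; FILTER_VER below; since the intermediate sums there have already been scaled
; down by two >>2 shifts, the final "+32, >>6" corresponds to the usual
; (t + 512) >> 10 of the two-stage center filter.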

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight)
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq4_mmx
    %assign  push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    sub r0, 2
    WELS_Zero mm7
    movq mm6, [h264_w0x10]
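    ; Each row applies the H.264 horizontal 6-tap filter [1, -5, 20, 20, -5, 1]
    ; to pixels a..f at offsets 0..5.  With t = 4*(c+d) - (b+e), the loop below
    ; forms (a+f) + t + 4*t = (a+f) - 5*(b+e) + 20*(c+d), rounds with +16,
    ; shifts right by 5 and clips to bytes via packuswb.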
.height_loop:
    movd mm0, [r0]
    punpcklbw mm0, mm7
    movd mm1, [r0+5]
    punpcklbw mm1, mm7
    movd mm2, [r0+1]
    punpcklbw mm2, mm7
    movd mm3, [r0+4]
    punpcklbw mm3, mm7
    movd mm4, [r0+2]
    punpcklbw mm4, mm7
    movd mm5, [r0+3]
    punpcklbw mm5, mm7

    paddw mm2, mm3
    paddw mm4, mm5
    psllw mm4, 2
    psubw mm4, mm2
    paddw mm0, mm1
    paddw mm0, mm4
    psllw mm4, 2
    paddw mm0, mm4
    paddw mm0, mm6
    psraw mm0, 5
    packuswb mm0, mm7
    movd [r2], mm0

    add r0, r1
    add r2, r3
    dec r4
    jnz .height_loop

    WELSEMMS
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

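; SSE_LOAD_8P dst, zero, mem: load 8 bytes from mem and zero-extend them to
; eight 16-bit words in dst (the second operand must hold all zeros).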
%macro SSE_LOAD_8P 3
    movq %1, %3
    punpcklbw %1, %2
%endmacro

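; FILTER_HV_W8 r0..r5, t0, t1, dst: vertical 6-tap over six rows of unpacked
; words (%1..%6); computes %1+%6 - 5*(%2+%5) + 20*(%3+%4) + 16, shifts right
; by 5, clips and stores 8 bytes to %9.  %7 and %8 are scratch; %1 is destroyed.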
%macro FILTER_HV_W8 9
    paddw %1, %6
    movdqa %8, %3
    movdqa %7, %2
    paddw %1, [h264_w0x10_1]
    paddw %8, %4
    paddw %7, %5
    psllw %8, 2
    psubw %8, %7
    paddw %1, %8
    psllw %8, 2
    paddw %1, %8
    psraw %1, 5
    WELS_Zero %8
    packuswb %1, %8
    movq %9, %1
%endmacro

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

;***********************************************************************
; void McHorVer22Width8HorFirst_sse2( const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       int16_t *pDst,        (16-bit tap buffer)
;                       int32_t iDstStride,
;                       int32_t iHeight
;                      )
;***********************************************************************
WELS_EXTERN McHorVer22Width8HorFirst_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    pxor xmm7, xmm7

    sub r0, r1          ;step back 2 rows: the vertical pass needs 5 extra filtered rows (2 above, 3 below)
    sub r0, r1

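    ; Horizontal-first half of the 22 (center) interpolation: each iteration
    ; computes eight raw 6-tap sums a+f - 5*(b+e) + 20*(c+d) and stores them as
    ; unrounded, unclipped 16-bit taps with movdqa; rounding and clipping happen
    ; in the later vertical pass.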
.yloop_width_8:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    paddw xmm2, xmm3
    paddw xmm4, xmm5
    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    movdqa [r2], xmm0

    add r0, r1
    add r2, r3
    dec r4
    jnz .yloop_width_8
    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight
;                      );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    lea r0, [r0-2]      ;pSrc -= 2;

    pxor xmm7, xmm7
    movdqa xmm6, [h264_w0x10_1]
.y_loop:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    paddw xmm2, xmm3
    paddw xmm4, xmm5
    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    paddw xmm0, xmm6
    psraw xmm0, 5

    packuswb xmm0, xmm7
    movq [r2], xmm0

    lea r2, [r2+r3]
    lea r0, [r0+r1]
    dec r4
    jnz near .y_loop

    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight
;                      );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    lea r0, [r0-2]      ;pSrc -= 2;

    pxor xmm7, xmm7
    movdqa xmm6, [h264_w0x10_1]
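    ; Same horizontal filter as McHorVer20WidthEq8_sse2, but each row is handled
    ; as two independent 8-pixel halves: the first from [r0], the second from
    ; [r0+8]; the second half simply re-reads the few source bytes around the
    ; half boundary that the 6-tap filter needs.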
.y_loop:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    paddw xmm2, xmm3
    paddw xmm4, xmm5
    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    paddw xmm0, xmm6
    psraw xmm0, 5
    packuswb xmm0, xmm7
    movq [r2], xmm0

    movq xmm0, [r0+8]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5+8]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1+8]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4+8]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2+8]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3+8]
    punpcklbw xmm5, xmm7

    paddw xmm2, xmm3
    paddw xmm4, xmm5
    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    paddw xmm0, xmm6
    psraw xmm0, 5
    packuswb xmm0, xmm7
    movq [r2+8], xmm0

    lea r2, [r2+r3]
    lea r0, [r0+r1]
    dec r4
    jnz near .y_loop

    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight )
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    sub r0, r1
    sub r0, r1

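    ; Vertical half-pel filter, 8 pixels wide.  The first six source rows are
    ; primed into xmm0..xmm5; the unrolled loop below rotates through the eight
    ; xmm registers, loading one new row per output row and calling FILTER_HV_W8
    ; with the registers in the corresponding order, so no data is copied between
    ; iterations.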
    WELS_Zero xmm7

    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

.start:
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r4
    jz near .xx_exit

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r4
    jz near .xx_exit

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r4
    jz near .xx_exit

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r4
    jz near .xx_exit

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r4
    jz near .xx_exit

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r4
    jz near .xx_exit

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r4
    jz near .xx_exit

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r4
    jz near .xx_exit

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near .start

.xx_exit:
    POP_XMM
    LOAD_5_PARA_POP
    ret

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight )
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d

%ifndef X86_32
    push r12
    push r13
    push r14
    mov r12, r0
    mov r13, r2
    mov r14, r5
%endif

    shr r4, 3
    sub r0, r1
    sub r0, r1

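    ; The block is processed in vertical strips of 8 pixels: r4 holds the number
    ; of strips (iWidth >> 3), and after each strip the saved source/dest
    ; pointers (r12/r13 on x86-64, arg1/arg3 reloads on x86-32) are restored and
    ; advanced by 8 before jumping back to .xloop.  Within a strip the register
    ; rotation is the same as in McHorVer02WidthEq8_sse2.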
.xloop:
    WELS_Zero xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6
    add r2, r3
    sub r0, r1

.start:
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near .start

.x_loop_dec:
    dec r4
    jz near .xx_exit
%ifdef X86_32
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    sub r0, r1
    sub r0, r1
    add r0, 8
    add r2, 8
    jmp near .xloop

.xx_exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    ret

;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight
;                      );
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    sub r0, 2
    pxor xmm7, xmm7

    cmp r4, 9
    jne near .width_17

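    ; Width 9: the nine outputs per row are written in two overlapping stores -
    ; a movd for pixels 0..3, then pixels 1..8 recomputed from the registers
    ; already loaded (xmm7 is reused as an accumulator and re-zeroed afterwards)
    ; and written with an unaligned movq at [r2+1].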
.yloop_width_9:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    movdqa xmm7, xmm2
    paddw xmm7, xmm3
    movdqa xmm6, xmm4
    paddw xmm6, xmm5
    psllw xmm6, 2
    psubw xmm6, xmm7
    paddw xmm0, xmm1
    paddw xmm0, xmm6
    psllw xmm6, 2
    paddw xmm0, xmm6
    paddw xmm0, [h264_w0x10_1]
    psraw xmm0, 5
    packuswb xmm0, xmm0
    movd [r2], xmm0

    pxor xmm7, xmm7
    movq xmm0, [r0+6]
    punpcklbw xmm0, xmm7

    paddw xmm4, xmm1
    paddw xmm5, xmm3
    psllw xmm5, 2
    psubw xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm2, xmm5
    psllw xmm5, 2
    paddw xmm2, xmm5
    paddw xmm2, [h264_w0x10_1]
    psraw xmm2, 5
    packuswb xmm2, xmm2
    movq [r2+1], xmm2

    add r0, r1
    add r2, r3
    dec r5
    jnz .yloop_width_9
    POP_XMM
    LOAD_6_PARA_POP
    ret

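    ; Width 17: pixels 0..7 and 8..11 come from the first and second 8-pixel
    ; blocks (movq [r2], movd [r2+8]); outputs 9..16 are then recomputed from
    ; the second block's registers and written with an overlapping movq at
    ; [r2+9].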
.width_17:
.yloop_width_17:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    paddw xmm2, xmm3
    paddw xmm4, xmm5
    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    paddw xmm0, [h264_w0x10_1]
    psraw xmm0, 5
    packuswb xmm0, xmm0
    movq [r2], xmm0

    movq xmm0, [r0+8]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5+8]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1+8]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4+8]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2+8]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3+8]
    punpcklbw xmm5, xmm7

    movdqa xmm7, xmm2
    paddw xmm7, xmm3
    movdqa xmm6, xmm4
    paddw xmm6, xmm5
    psllw xmm6, 2
    psubw xmm6, xmm7
    paddw xmm0, xmm1
    paddw xmm0, xmm6
    psllw xmm6, 2
    paddw xmm0, xmm6
    paddw xmm0, [h264_w0x10_1]
    psraw xmm0, 5
    packuswb xmm0, xmm0
    movd [r2+8], xmm0

    pxor xmm7, xmm7
    movq xmm0, [r0+6+8]
    punpcklbw xmm0, xmm7

    paddw xmm4, xmm1
    paddw xmm5, xmm3
    psllw xmm5, 2
    psubw xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm2, xmm5
    psllw xmm5, 2
    paddw xmm2, xmm5
    paddw xmm2, [h264_w0x10_1]
    psraw xmm2, 5
    packuswb xmm2, xmm2
    movq [r2+9], xmm2

    add r0, r1
    add r2, r3
    dec r5
    jnz .yloop_width_17
    POP_XMM
    LOAD_6_PARA_POP
    ret

;***********************************************************************
;void McHorVer22HorFirst_sse2
;                      (const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t * pTap,
;                       int32_t iTapStride,
;                       int32_t iWidth, int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    pxor xmm7, xmm7
    sub r0, r1          ;step back 2 rows: the vertical pass needs 5 extra filtered rows (2 above, 3 below)
    sub r0, r1

    cmp r4, 9
    jne near .width_17

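    ; Width 9, horizontal-first half of the 22 filter: each iteration stores raw
    ; 16-bit taps to pTap - the first two taps via movd [r2], then taps 1..8
    ; recomputed from the already-loaded registers and stored with overlapping
    ; movq/movhps at byte offset 2 (element offset 1).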
.yloop_width_9:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    movdqa xmm7, xmm2
    paddw xmm7, xmm3
    movdqa xmm6, xmm4
    paddw xmm6, xmm5
    psllw xmm6, 2
    psubw xmm6, xmm7
    paddw xmm0, xmm1
    paddw xmm0, xmm6
    psllw xmm6, 2
    paddw xmm0, xmm6
    movd [r2], xmm0

    pxor xmm7, xmm7
    movq xmm0, [r0+6]
    punpcklbw xmm0, xmm7

    paddw xmm4, xmm1
    paddw xmm5, xmm3
    psllw xmm5, 2
    psubw xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm2, xmm5
    psllw xmm5, 2
    paddw xmm2, xmm5
    movq [r2+2], xmm2
    movhps [r2+2+8], xmm2

    add r0, r1
    add r2, r3
    dec r5
    jnz .yloop_width_9
    POP_XMM
    LOAD_6_PARA_POP
    ret

.width_17:
.yloop_width_17:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    paddw xmm2, xmm3
    paddw xmm4, xmm5
    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    movdqa [r2], xmm0

    movq xmm0, [r0+8]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5+8]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1+8]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4+8]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2+8]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3+8]
    punpcklbw xmm5, xmm7

    movdqa xmm7, xmm2
    paddw xmm7, xmm3
    movdqa xmm6, xmm4
    paddw xmm6, xmm5
    psllw xmm6, 2
    psubw xmm6, xmm7
    paddw xmm0, xmm1
    paddw xmm0, xmm6
    psllw xmm6, 2
    paddw xmm0, xmm6
    movd [r2+16], xmm0

    pxor xmm7, xmm7
    movq xmm0, [r0+6+8]
    punpcklbw xmm0, xmm7

    paddw xmm4, xmm1
    paddw xmm5, xmm3
    psllw xmm5, 2
    psubw xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm2, xmm5
    psllw xmm5, 2
    paddw xmm2, xmm5
    movq [r2+18], xmm2
    movhps [r2+18+8], xmm2

    add r0, r1
    add r2, r3
    dec r5
    jnz .yloop_width_17
    POP_XMM
    LOAD_6_PARA_POP
    ret

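; FILTER_VER r0..r5, t0, t1, dst: vertical 6-tap over six rows of 16-bit taps
; produced by the horizontal-first pass.  It evaluates
;   ((a+f) - 5*(b+e) + 20*(c+d) + 512) >> 10
; using two intermediate arithmetic right shifts by 2 so the sums stay inside
; 16-bit range (the final "+32, >>6" supplies the remaining rounding and scale),
; then clips and stores 8 bytes to %9.  %7 and %8 are scratch; %1 is destroyed.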
%macro FILTER_VER 9
    paddw  %1, %6
    movdqa %7, %2
    movdqa %8, %3

    paddw %7, %5
    paddw %8, %4

    psubw  %1, %7
    psraw  %1, 2
    paddw  %1, %8
    psubw  %1, %7
    psraw  %1, 2
    paddw  %8, %1
    paddw  %8, [h264_mc_hc_32]
    psraw  %8, 6
    packuswb %8, %8
    movq %9, %8
%endmacro

;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
;                       const uint8_t *pTap,
;                       int32_t iTapStride,
;                       uint8_t * pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight);
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    push r12
    push r13
    push r14
    mov r12, r0
    mov r13, r2
    mov r14, r5
%endif

    shr r4, 3

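    ; Vertical-last half of the 22 filter: the tap buffer is walked in strips of
    ; 8 taps (16 bytes), with the same six-row priming and register rotation as
    ; the other vertical loops.  This variant assumes 16-byte aligned tap rows
    ; and uses movdqa; McHorVer22Width8VerLastUnAlign_sse2 below is the movdqu
    ; counterpart.  After each strip, pTap advances by 16 bytes and pDst by 8.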
.width_loop:
    movdqa xmm0, [r0]
    movdqa xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqa xmm2, [r0]
    movdqa xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqa xmm4, [r0]
    movdqa xmm5, [r0+r1]

    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movdqa xmm6, [r0]

    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    sub r0, r1

.start:
    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm6, [r0]
    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm7, [r0+r1]
    FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm0, [r0]
    FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm1, [r0+r1]
    FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm2, [r0]
    FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm3, [r0+r1]
    FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm4, [r0]
    FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm5, [r0+r1]
    jmp near .start

.x_loop_dec:
    dec r4
    jz near .exit
%ifdef X86_32
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    add r0, 16
    add r2, 8
    jmp .width_loop

.exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    ret

;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
;                       const uint8_t *pTap,
;                       int32_t iTapStride,
;                       uint8_t * pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight);
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    push r12
    push r13
    push r14
    mov r12, r0
    mov r13, r2
    mov r14, r5
%endif
    shr r4, 3

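    ; Identical to McHorVer22Width8VerLastAlign_sse2 except that the tap rows
    ; are read with unaligned movdqu loads.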
.width_loop:
    movdqu xmm0, [r0]
    movdqu xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    movdqu xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    movdqu xmm5, [r0+r1]

    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]

    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    sub r0, r1

.start:
    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]
    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm7, [r0+r1]
    FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm0, [r0]
    FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm1, [r0+r1]
    FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm3, [r0+r1]
    FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm5, [r0+r1]
    jmp near .start

.x_loop_dec:
    dec r4
    jz near .exit
%ifdef X86_32
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    add r0, 16
    add r2, 8
    jmp .width_loop

.exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    ret