;*!
;* \copy
;*     Copyright (c)  2004-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mc_chroma.asm
;*
;*  Abstract
;*      mmx motion compensation for chroma
;*
;*  History
;*      10/13/2004 Created
;*
;*
;*************************************************************************/
|
|
|
|
%include "asm_inc.asm"
|
|
|
|
|
|
|
|
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=16

;***********************************************************************
; Rounding constants: the value 32 (0x20) replicated in every 16-bit
; lane. The routines in this file add it to a 16-bit sum immediately
; before a logical right shift by 6.
;***********************************************************************

ALIGN 16
h264_d0x20_sse2:                        ; 8 x word 32, one 128-bit XMM operand
    dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:                         ; 4 x word 32, one 64-bit MMX operand
    dw 32,32,32,32


;=============================================================================
; Code
;=============================================================================

SECTION .text
|
|
|
|
|
|
|
|
;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
;                            int32_t iSrcStride,
;                            uint8_t *pDst,
;                            int32_t iDstStride,
;                            const uint8_t *pABCD,
;                            int32_t iHeight );
;
; Chroma interpolation for a block 4 pixels wide, using MMX registers.
; With s0 / s1 two vertically adjacent source rows and A,B,C,D the four bytes
; stored at pABCD, every output byte is
;
;     dst[x] = (A*s0[x] + B*s0[x+1] + C*s1[x] + D*s1[x+1] + 32) >> 6,  x = 0..3
;
; Register use after LOAD_6_PARA (macro from asm_inc.asm; the mapping below is
; what the body relies on):
;     r0 = src      r1 = iSrcStride   r2 = pDst
;     r3 = iDstStride   r4 = pABCD (reused as "next source row" pointer)
;     r5 = iHeight (row counter)
;     mm3..mm6 = A,B,C,D broadcast to four 16-bit lanes
;     mm7      = zero (temporarily reused as scratch inside the loop)
;
; Notes:
;   * iHeight must be non-zero: the loop body runs before the dec/jnz test.
;   * 5 bytes are read from each of iHeight+1 source rows.
;   * All arithmetic is done in 16-bit lanes (pmullw/paddw).
;     NOTE(review): this presumably relies on A+B+C+D == 64 so the sum cannot
;     overflow 16 bits - confirm against the caller that builds pABCD.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
    %assign  push_num 0
    LOAD_6_PARA
    SIGN_EXTENSION  r1, r1d             ; strides and height arrive as int32_t
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- expand the four coefficient bytes into four word vectors ----------
    movd        mm3, [r4]               ; mm3 = bytes A B C D
    WELS_Zero   mm7                     ; mm7 = 0, used for byte -> word unpack
    punpcklbw   mm3, mm3                ; A A B B C C D D
    movq        mm4, mm3
    punpcklwd   mm3, mm3                ; A A A A B B B B
    punpckhwd   mm4, mm4                ; C C C C D D D D

    movq        mm5, mm3
    punpcklbw   mm3, mm7                ; mm3 = A in 4 words
    punpckhbw   mm5, mm7                ; mm5 = B in 4 words

    movq        mm6, mm4
    punpcklbw   mm4, mm7                ; mm4 = C in 4 words
    punpckhbw   mm6, mm7                ; mm6 = D in 4 words

    ; ---- prime the pipeline with the first source row ----------------------
    lea         r4, [r0 + r1]           ; r4 -> second source row
    movd        mm0, [r0]
    movd        mm1, [r0+1]
    punpcklbw   mm0, mm7                ; mm0 = s0[0..3] as words
    punpcklbw   mm1, mm7                ; mm1 = s0[1..4] as words

.xloop:
    ; top row contribution: A*s0[x] + B*s0[x+1]
    pmullw      mm0, mm3
    pmullw      mm1, mm5
    paddw       mm0, mm1

    ; bottom row, unshifted: + C*s1[x]; keep an unscaled copy in mm2
    movd        mm1, [r4]
    punpcklbw   mm1, mm7
    movq        mm2, mm1
    pmullw      mm1, mm4
    paddw       mm0, mm1

    ; bottom row, shifted by one: + D*s1[x+1]; mm7 is borrowed as scratch so
    ; the unscaled s1[x+1] is back in mm1 for the next iteration
    movd        mm1, [r4+1]
    punpcklbw   mm1, mm7
    movq        mm7, mm1
    pmullw      mm1, mm6
    paddw       mm0, mm1
    movq        mm1, mm7

    ; round, scale down and narrow to bytes
    paddw       mm0, [h264_d0x20_mmx]
    psrlw       mm0, 6

    WELS_Zero   mm7                     ; restore mm7 = 0 after scratch use
    packuswb    mm0, mm7
    movd        [r2], mm0               ; store 4 output pixels

    movq        mm0, mm2                ; bottom row becomes next top row

    lea         r2, [r2 + r3]           ; next destination row
    lea         r4, [r4 + r1]           ; next source row

    dec         r5
    jnz         near .xloop
    WELSEMMS                            ; leave MMX state (macro from asm_inc.asm)
    LOAD_6_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
;                             int32_t iSrcStride,
;                             uint8_t *pDst,
;                             int32_t iDstStride,
;                             const uint8_t *pABCD,
;                             int32_t iHeight );
;
; Chroma interpolation for a block 8 pixels wide, using SSE2.
; With s0 / s1 two vertically adjacent source rows and A,B,C,D the four bytes
; stored at pABCD, every output byte is
;
;     dst[x] = (A*s0[x] + B*s0[x+1] + C*s1[x] + D*s1[x+1] + 32) >> 6,  x = 0..7
;
; Register use after LOAD_6_PARA (macro from asm_inc.asm; the mapping below is
; what the body relies on):
;     r0 = pSrc     r1 = iSrcStride   r2 = pDst
;     r3 = iDstStride   r4 = pABCD (reused as "next source row" pointer)
;     r5 = iHeight (row counter)
;     xmm3..xmm6 = A,B,C,D broadcast to eight 16-bit lanes
;     xmm7       = zero (temporarily reused as scratch inside the loop)
;
; Notes:
;   * iHeight must be non-zero: the loop body runs before the dec/jnz test.
;   * 9 bytes are read from each of iHeight+1 source rows (movq at +0 and +1).
;   * All arithmetic is done in 16-bit lanes (pmullw/paddw).
;     NOTE(review): this presumably relies on A+B+C+D == 64 so the sum cannot
;     overflow 16 bits - confirm against the caller that builds pABCD.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
    %assign  push_num 0
    LOAD_6_PARA
    ; xmm0-xmm7 are used below. On Win64 xmm6-xmm15 must be preserved for the
    ; caller, so PUSH_XMM/POP_XMM save and restore them on that target; the
    ; argument is the highest xmm register number used plus one.
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d             ; strides and height arrive as int32_t
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- expand the four coefficient bytes into four word vectors ----------
    movd        xmm3, [r4]              ; xmm3 = bytes A B C D
    WELS_Zero   xmm7                    ; xmm7 = 0, used for byte -> word unpack
    punpcklbw   xmm3, xmm3              ; A A B B C C D D
    punpcklwd   xmm3, xmm3              ; A x4, B x4, C x4, D x4

    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm3              ; A x8, B x8 (bytes)
    punpckhdq   xmm4, xmm4              ; C x8, D x8 (bytes)
    movdqa      xmm5, xmm3
    movdqa      xmm6, xmm4

    punpcklbw   xmm3, xmm7              ; xmm3 = A in 8 words
    punpckhbw   xmm5, xmm7              ; xmm5 = B in 8 words
    punpcklbw   xmm4, xmm7              ; xmm4 = C in 8 words
    punpckhbw   xmm6, xmm7              ; xmm6 = D in 8 words

    ; ---- prime the pipeline with the first source row ----------------------
    lea         r4, [r0 + r1]           ; r4 -> second source row
    movq        xmm0, [r0]
    movq        xmm1, [r0+1]
    punpcklbw   xmm0, xmm7              ; xmm0 = s0[0..7] as words
    punpcklbw   xmm1, xmm7              ; xmm1 = s0[1..8] as words

.xloop:
    ; top row contribution: A*s0[x] + B*s0[x+1]
    pmullw      xmm0, xmm3
    pmullw      xmm1, xmm5
    paddw       xmm0, xmm1

    ; bottom row, unshifted: + C*s1[x]; keep an unscaled copy in xmm2
    movq        xmm1, [r4]
    punpcklbw   xmm1, xmm7
    movdqa      xmm2, xmm1
    pmullw      xmm1, xmm4
    paddw       xmm0, xmm1

    ; bottom row, shifted by one: + D*s1[x+1]; xmm7 is borrowed as scratch so
    ; the unscaled s1[x+1] is back in xmm1 for the next iteration
    movq        xmm1, [r4+1]
    punpcklbw   xmm1, xmm7
    movdqa      xmm7, xmm1
    pmullw      xmm1, xmm6
    paddw       xmm0, xmm1
    movdqa      xmm1, xmm7

    ; round, scale down and narrow to bytes
    paddw       xmm0, [h264_d0x20_sse2]
    psrlw       xmm0, 6

    WELS_Zero   xmm7                    ; restore xmm7 = 0 after scratch use
    packuswb    xmm0, xmm7
    movq        [r2], xmm0              ; store 8 output pixels

    movdqa      xmm0, xmm2              ; bottom row becomes next top row

    lea         r2, [r2 + r3]           ; next destination row
    lea         r4, [r4 + r1]           ; next source row

    dec         r5
    jnz         near .xloop

    POP_XMM
    LOAD_6_PARA_POP

    ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
;                              int32_t iSrcStride,
;                              uint8_t *pDst,
;                              int32_t iDstStride,
;                              const uint8_t *pABCD,
;                              int32_t iHeight );
;
; Chroma interpolation for a block 8 pixels wide, using SSSE3 pmaddubsw.
; With s0 / s1 two vertically adjacent source rows and A,B,C,D the four
; bytes stored at pABCD, every output byte is
;
;   dst[x] = (A*s0[x] + B*s0[x+1] + C*s1[x] + D*s1[x+1] + 32) >> 6, x = 0..7
;
; Source bytes are interleaved as (s[x], s[x+1]) pairs so that a single
; pmaddubsw against the (A,B) or (C,D) pair vector yields one row's
; contribution for all eight pixels. Two output rows are produced per
; loop iteration.
;
; Register use after LOAD_6_PARA (macro from asm_inc.asm; the mapping
; below is what the body relies on):
;     r0 = pSrc (advanced two rows per iteration)   r1 = iSrcStride
;     r2 = pDst (pre-biased, see below)             r3 = iDstStride
;     r4 = pABCD                                    r5 = iHeight
;     xmm5 = (A,B) byte pair x8      xmm6 = (C,D) byte pair x8
;     xmm7 = rounding constant, 32 in each word
;     xmm0 = interleaved top row carried between iterations
;
; Notes:
;   * iHeight must be a non-zero multiple of 2: the counter is decreased
;     by 2 per iteration and the loop only exits when it reaches zero.
;   * Each source row is fetched with a 16-byte movdqu although only its
;     first 9 bytes contribute to the result; iHeight+1 rows are read.
;   * pmaddubsw treats the pixel bytes as unsigned and the coefficient
;     bytes as signed, with signed saturation of each pair sum.
;     NOTE(review): presumably every coefficient is <= 64 and
;     A+B+C+D == 64, so neither matters - confirm against the caller.
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
    %assign  push_num 0
    LOAD_6_PARA
    ; xmm0-xmm7 are used below. On Win64 xmm6-xmm15 must be preserved for the
    ; caller, so PUSH_XMM/POP_XMM save and restore them on that target; the
    ; argument is the highest xmm register number used plus one.
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d             ; strides and height arrive as int32_t
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- build the coefficient pair vectors --------------------------------
    ; (the original zeroed xmm7 here, but xmm7 is overwritten with the
    ;  rounding constant before any read, so that instruction was dropped)
    movd        xmm5, [r4]              ; bytes A B C D
    punpcklwd   xmm5, xmm5              ; AB AB CD CD
    punpckldq   xmm5, xmm5              ; AB AB AB AB CD CD CD CD
    movdqa      xmm6, xmm5
    punpcklqdq  xmm5, xmm5              ; xmm5 = (A,B) x8
    punpckhqdq  xmm6, xmm6              ; xmm6 = (C,D) x8

    ; The loop advances r2 by two rows *before* storing, so start two
    ; destination rows early.
    sub         r2, r3
    sub         r2, r3
    movdqa      xmm7, [h264_d0x20_sse2] ; rounding term, 32 per word

    ; ---- prime the pipeline: interleave row 0 as (s[x], s[x+1]) pairs ------
    movdqu      xmm0, [r0]
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                 ; row shifted left by one pixel
    punpcklbw   xmm0, xmm1              ; s[0],s[1],s[1],s[2],...,s[7],s[8]

.hloop_chroma:
    lea         r2, [r2+2*r3]           ; destination for this pair of rows

    ; ---- first output row: rows (n, n+1) -----------------------------------
    movdqu      xmm2, [r0+r1]
    movdqa      xmm3, xmm2
    psrldq      xmm3, 1
    punpcklbw   xmm2, xmm3              ; interleaved row n+1
    movdqa      xmm4, xmm2              ; keep it: top row of the second output

    pmaddubsw   xmm0, xmm5              ; A*s0[x] + B*s0[x+1]
    pmaddubsw   xmm2, xmm6              ; C*s1[x] + D*s1[x+1]
    paddw       xmm0, xmm2
    paddw       xmm0, xmm7              ; + 32
    psrlw       xmm0, 6
    packuswb    xmm0, xmm0
    movq        [r2], xmm0              ; store 8 pixels

    ; ---- second output row: rows (n+1, n+2) --------------------------------
    lea         r0, [r0+2*r1]           ; r0 -> row n+2
    movdqu      xmm2, [r0]
    movdqa      xmm3, xmm2
    psrldq      xmm3, 1
    punpcklbw   xmm2, xmm3              ; interleaved row n+2
    movdqa      xmm0, xmm2              ; becomes the top row of next iteration

    pmaddubsw   xmm4, xmm5              ; A*s0[x] + B*s0[x+1]
    pmaddubsw   xmm2, xmm6              ; C*s1[x] + D*s1[x+1]
    paddw       xmm4, xmm2
    paddw       xmm4, xmm7              ; + 32
    psrlw       xmm4, 6
    packuswb    xmm4, xmm4
    movq        [r2+r3], xmm4           ; store 8 pixels, one row below

    sub         r5, 2                   ; two rows done
    jnz         .hloop_chroma

    POP_XMM
    LOAD_6_PARA_POP

    ret
|
|
|
|
|
|
|
|
|