2014-01-05 13:11:41 +01:00
|
|
|
;*!
|
|
|
|
;* \copy
|
|
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
|
|
;* All rights reserved.
|
|
|
|
;*
|
|
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
|
|
;* modification, are permitted provided that the following conditions
|
|
|
|
;* are met:
|
|
|
|
;*
|
|
|
|
;* * Redistributions of source code must retain the above copyright
|
|
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
|
|
;*
|
|
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
|
|
;* the documentation and/or other materials provided with the
|
|
|
|
;* distribution.
|
|
|
|
;*
|
|
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
;*
|
|
|
|
;*
|
|
|
|
;* satd_sad.asm
|
|
|
|
;*
|
|
|
|
;* Abstract
|
|
|
|
;* WelsSampleSatd4x4_sse2
|
|
|
|
;* WelsSampleSatd8x8_sse2
|
|
|
|
;* WelsSampleSatd16x8_sse2
|
|
|
|
;* WelsSampleSatd8x16_sse2
|
|
|
|
;* WelsSampleSatd16x16_sse2
|
|
|
|
;*
|
|
|
|
;* WelsSampleSad16x8_sse2
|
|
|
|
;* WelsSampleSad16x16_sse2
|
|
|
|
;*
|
|
|
|
;* History
|
|
|
|
;* 8/5/2009 Created
|
|
|
|
;* 24/9/2009 modified
|
|
|
|
;*
|
|
|
|
;*
|
|
|
|
;*************************************************************************/
|
|
|
|
|
|
|
|
%include "asm_inc.asm"
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
; Data
|
|
|
|
;***********************************************************************
|
|
|
|
SECTION .rodata align=16
|
|
|
|
|
|
|
|
align 16
|
|
|
|
HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
|
|
|
|
align 16
|
|
|
|
HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
|
|
|
|
align 16
|
|
|
|
PDW1: dw 1,1,1,1,1,1,1,1
|
|
|
|
align 16
|
|
|
|
PDQ2: dw 2,0,0,0,2,0,0,0
|
|
|
|
align 16
|
|
|
|
HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
; Code
|
|
|
|
;***********************************************************************
|
|
|
|
SECTION .text
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_satd_wxh_sse2 BEGIN
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; MMX_DW_1_2REG dst, tmp
; Fills every 16-bit lane of %1 with the value 1.
; %2 is clobbered and left with all bits set.
%macro MMX_DW_1_2REG 2
|
|
|
|
pxor %1, %1 ; %1 = 0
|
|
|
|
pcmpeqw %2, %2 ; %2 = all ones (each word == -1)
|
|
|
|
psubw %1, %2 ; each word: 0 - (-1) = 1
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
; SSE2_SumWHorizon1 acc, tmp
; Folds the eight 16-bit lanes of %1 into its lowest word by shifting a copy
; right by 8, 4 and then 2 bytes and adding each time.
; The adds are unsigned saturating (paddusw), so the total clamps at 0xFFFF.
; Only the low word of %1 holds the full sum afterwards; %2 is scratch.
%macro SSE2_SumWHorizon1 2
|
|
|
|
movdqa %2, %1
|
|
|
|
psrldq %2, 8 ; bring the upper four words down
|
|
|
|
paddusw %1, %2
|
|
|
|
movdqa %2, %1
|
|
|
|
psrldq %2, 4 ; bring words 2..3 down
|
|
|
|
paddusw %1, %2
|
|
|
|
movdqa %2, %1
|
|
|
|
psrldq %2, 2 ; bring word 1 down
|
|
|
|
paddusw %1, %2
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
; SSE2_HDMTwo4x4 a, b, c, d, tmp
; Runs four SSE2_SumSub steps over the register pairs (%1,%2), (%3,%4),
; (%2,%4) and (%1,%3), passing %5 as the third operand each time.
; SSE2_SumSub is not defined in this part of the file; presumably it is a
; sum/difference butterfly that uses its third operand as scratch -- confirm.
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
|
|
|
|
SSE2_SumSub %1, %2, %5
|
|
|
|
SSE2_SumSub %3, %4, %5
|
|
|
|
SSE2_SumSub %2, %4, %5
|
|
|
|
SSE2_SumSub %1, %3, %5
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
; Applies WELS_AbsW to %1, %2 (with %3) and to %4, %5 (with %6), then adds the
; four results into the accumulator %7 using unsigned saturating word adds.
; %1 and %4 are overwritten with the pairwise partial sums.
; WELS_AbsW is not defined in this part of the file; presumably it takes the
; per-word absolute value using its second operand as scratch -- confirm.
%macro SSE2_SumAbs4 7
|
|
|
|
WELS_AbsW %1, %3
|
|
|
|
WELS_AbsW %2, %3
|
|
|
|
WELS_AbsW %4, %6
|
|
|
|
WELS_AbsW %5, %6
|
|
|
|
paddusw %1, %2
|
|
|
|
paddusw %4, %5
|
|
|
|
paddusw %7, %1
|
|
|
|
paddusw %7, %4
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
; SSE2_SumWHorizon acc, tmp, zero
; Horizontal sum of the eight 16-bit lanes of %1; the total ends up in the
; low dword of %1 (the remaining lanes hold partial sums).
; %3 is interleaved in as the upper half of each dword, so it is expected to
; hold zero; the callers in this file pass a register they cleared with pxor.
; %2 is scratch.
%macro SSE2_SumWHorizon 3
|
|
|
|
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
|
|
|
|
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
|
|
|
|
punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
|
|
|
|
movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
|
|
|
|
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
|
|
|
|
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
|
|
|
|
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
; SSE2_GetSatd8x8 (no parameters)
; Adds the transformed absolute differences of eight rows into xmm6.
; In:  r0 / r1 = first pointer and its stride, r2 / r3 = second pointer and
;      its stride, xmm6 = running accumulator.
; The eight rows are handled as two groups of four: load the row differences,
; run SSE2_HDMTwo4x4, transpose, run SSE2_HDMTwo4x4 again, then accumulate
; with SSE2_SumAbs4.
; On exit r0 and r2 have advanced by six rows (three lea steps of two rows),
; so a caller continuing with the next eight rows must add two more rows.
; xmm0-xmm5 are scratch. xmm7 is handed to SSE2_LoadDiff8P, which is not
; defined in this part of the file; presumably it is the zero register used
; to widen bytes to words -- confirm.
%macro SSE2_GetSatd8x8 0
|
|
|
|
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
|
|
|
|
SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
|
|
|
|
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
|
|
|
|
|
|
; rows 0..3: butterfly, transpose, butterfly, accumulate
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
|
|
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
|
|
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
|
|
|
|
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
|
|
|
|
|
|
|
|
; rows 4..7
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
|
|
|
|
SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
|
|
|
|
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
|
|
|
|
|
|
|
|
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
|
|
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
|
|
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
|
|
|
|
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; WelsSampleSatd4x4_sse2
; Returns the SATD of two 4x4 byte blocks: the transformed differences are
; summed as absolute values, the sum is masked to 16 bits and halved.
; In:  r0 / r1 = first block pointer and stride, r2 / r3 = second block
;      pointer and stride (set up by LOAD_4_PARA, defined elsewhere;
;      presumably it maps the four C arguments onto r0..r3 -- confirm).
; Out: retrd = (sum & 0xffff) >> 1
; Uses xmm0-xmm7, hence PUSH_XMM 8 / POP_XMM around the body.
WELS_EXTERN WelsSampleSatd4x4_sse2
|
|
|
|
%assign push_num 0
|
|
|
|
LOAD_4_PARA
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
PUSH_XMM 8
|
2014-03-14 09:13:18 +01:00
|
|
|
SIGN_EXTENSION r1, r1d
|
|
|
|
SIGN_EXTENSION r3, r3d
|
2014-01-05 13:11:41 +01:00
|
|
|
; first block: rows 0,2 packed into xmm0 and rows 1,3 into xmm1
movd xmm0, [r0]
|
|
|
|
movd xmm1, [r0+r1]
|
|
|
|
lea r0 , [r0+2*r1]
|
|
|
|
movd xmm2, [r0]
|
|
|
|
movd xmm3, [r0+r1]
|
|
|
|
punpckldq xmm0, xmm2
|
|
|
|
punpckldq xmm1, xmm3
|
|
|
|
|
|
|
|
; second block: rows 0,2 packed into xmm4 and rows 1,3 into xmm5
movd xmm4, [r2]
|
|
|
|
movd xmm5, [r2+r3]
|
|
|
|
lea r2 , [r2+2*r3]
|
|
|
|
movd xmm6, [r2]
|
|
|
|
movd xmm7, [r2+r3]
|
|
|
|
punpckldq xmm4, xmm6
|
|
|
|
punpckldq xmm5, xmm7
|
|
|
|
|
|
|
|
; xmm6 = 0: used to widen bytes to words, and later as the sum accumulator
pxor xmm6, xmm6
|
|
|
|
punpcklbw xmm0, xmm6
|
|
|
|
punpcklbw xmm1, xmm6
|
|
|
|
punpcklbw xmm4, xmm6
|
|
|
|
punpcklbw xmm5, xmm6
|
|
|
|
|
|
|
|
; per-pixel differences as signed words
psubw xmm0, xmm4
|
|
|
|
psubw xmm1, xmm5
|
|
|
|
|
|
|
|
; sum/difference stages interleaved with SSE2_XSawp / unpack shuffles
movdqa xmm2, xmm0
|
|
|
|
paddw xmm0, xmm1
|
|
|
|
psubw xmm2, xmm1
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm2, xmm3
|
|
|
|
|
|
|
|
movdqa xmm4, xmm0
|
|
|
|
paddw xmm0, xmm3
|
|
|
|
psubw xmm4, xmm3
|
|
|
|
|
|
|
|
movdqa xmm2, xmm0
|
|
|
|
punpcklwd xmm0, xmm4
|
|
|
|
punpckhwd xmm4, xmm2
|
|
|
|
|
|
|
|
SSE2_XSawp dq, xmm0, xmm4, xmm3
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm3, xmm5
|
|
|
|
|
|
|
|
movdqa xmm7, xmm0
|
|
|
|
paddw xmm0, xmm5
|
|
|
|
psubw xmm7, xmm5
|
|
|
|
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm7, xmm1
|
|
|
|
|
|
|
|
movdqa xmm2, xmm0
|
|
|
|
paddw xmm0, xmm1
|
|
|
|
psubw xmm2, xmm1
|
|
|
|
|
|
|
|
; accumulate absolute values into xmm6 (still zero at this point)
WELS_AbsW xmm0, xmm3
|
|
|
|
paddusw xmm6, xmm0
|
|
|
|
WELS_AbsW xmm2, xmm4
|
|
|
|
paddusw xmm6, xmm2
|
|
|
|
SSE2_SumWHorizon1 xmm6, xmm4
|
|
|
|
movd retrd, xmm6
|
|
|
|
and retrd, 0xffff ; only the low word holds the full sum
|
|
|
|
shr retrd, 1
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
POP_XMM
|
2014-01-05 13:11:41 +01:00
|
|
|
LOAD_4_PARA_POP
|
|
|
|
ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
2014-03-16 12:23:24 +01:00
|
|
|
; WelsSampleSatd8x8_sse2
; SATD of two 8x8 byte blocks.
; In:  r0 / r1 = first block pointer and stride, r2 / r3 = second block
;      pointer and stride (set up by LOAD_4_PARA, defined elsewhere).
; Out: retrd = horizontal sum of the per-lane accumulators, each halved.
; xmm6 is the accumulator and xmm7 the zero register, both cleared here
; before SSE2_GetSatd8x8 is expanded.
WELS_EXTERN WelsSampleSatd8x8_sse2
|
2014-01-05 13:11:41 +01:00
|
|
|
%assign push_num 0
|
|
|
|
LOAD_4_PARA
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
PUSH_XMM 8
|
2014-03-14 09:13:18 +01:00
|
|
|
SIGN_EXTENSION r1, r1d
|
|
|
|
SIGN_EXTENSION r3, r3d
|
2014-01-05 13:11:41 +01:00
|
|
|
pxor xmm6, xmm6 ; accumulator
|
|
|
|
pxor xmm7, xmm7 ; zero register
|
|
|
|
SSE2_GetSatd8x8
|
|
|
|
psrlw xmm6, 1 ; halve each lane before the horizontal sum
|
|
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
|
|
movd retrd, xmm6
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
POP_XMM
|
2014-01-05 13:11:41 +01:00
|
|
|
LOAD_4_PARA_POP
|
|
|
|
ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
2014-03-16 12:23:24 +01:00
|
|
|
; WelsSampleSatd8x16_sse2
; SATD of two blocks 8 pixels wide and 16 rows tall, computed as two stacked
; 8x8 passes accumulated in xmm6.
; In:  r0 / r1 = first block pointer and stride, r2 / r3 = second block
;      pointer and stride (set up by LOAD_4_PARA, defined elsewhere).
; Out: retrd = horizontal sum of the per-lane accumulators, each halved.
WELS_EXTERN WelsSampleSatd8x16_sse2
|
2014-01-05 13:11:41 +01:00
|
|
|
%assign push_num 0
|
|
|
|
LOAD_4_PARA
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
PUSH_XMM 8
|
2014-03-14 09:13:18 +01:00
|
|
|
SIGN_EXTENSION r1, r1d
|
|
|
|
SIGN_EXTENSION r3, r3d
|
2014-01-05 13:11:41 +01:00
|
|
|
pxor xmm6, xmm6 ; accumulator
|
|
|
|
pxor xmm7, xmm7 ; zero register
|
|
|
|
|
|
|
|
SSE2_GetSatd8x8
|
|
|
|
; the macro leaves r0/r2 six rows down; step two more to reach row 8
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
SSE2_GetSatd8x8
|
|
|
|
|
|
|
|
psrlw xmm6, 1
|
|
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
|
|
movd retrd, xmm6
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
POP_XMM
|
2014-01-05 13:11:41 +01:00
|
|
|
LOAD_4_PARA_POP
|
|
|
|
ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; WelsSampleSatd16x8_sse2
; SATD of two blocks 16 pixels wide and 8 rows tall, computed as two
; side-by-side 8x8 passes accumulated in xmm6.
; In:  r0 / r1 = first block pointer and stride, r2 / r3 = second block
;      pointer and stride (set up by LOAD_4_PARA, defined elsewhere).
; Out: retrd = horizontal sum of the per-lane accumulators, each halved.
; r0 and r2 are saved on the stack because SSE2_GetSatd8x8 advances them.
WELS_EXTERN WelsSampleSatd16x8_sse2
|
|
|
|
%assign push_num 0
|
|
|
|
LOAD_4_PARA
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
PUSH_XMM 8
|
2014-03-14 09:13:18 +01:00
|
|
|
SIGN_EXTENSION r1, r1d
|
|
|
|
SIGN_EXTENSION r3, r3d
|
2014-01-05 13:11:41 +01:00
|
|
|
push r0 ; keep the row-0 pointers for the right half
|
2014-01-05 13:16:22 +01:00
|
|
|
push r2
|
2014-01-05 13:11:41 +01:00
|
|
|
pxor xmm6, xmm6 ; accumulator
|
|
|
|
pxor xmm7, xmm7 ; zero register
|
|
|
|
|
|
|
|
SSE2_GetSatd8x8
|
2014-01-05 13:16:22 +01:00
|
|
|
|
|
2014-01-05 13:11:41 +01:00
|
|
|
pop r2
|
|
|
|
pop r0
|
|
|
|
add r0, 8 ; right-hand 8 columns
|
|
|
|
add r2, 8
|
|
|
|
SSE2_GetSatd8x8
|
|
|
|
|
|
|
|
psrlw xmm6, 1
|
|
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
|
|
movd retrd, xmm6
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
POP_XMM
|
2014-01-05 13:11:41 +01:00
|
|
|
LOAD_4_PARA_POP
|
|
|
|
ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; WelsSampleSatd16x16_sse2
; SATD of two 16x16 byte blocks, computed as four 8x8 passes (left column of
; two, then right column of two) accumulated in xmm6.
; In:  r0 / r1 = first block pointer and stride, r2 / r3 = second block
;      pointer and stride (set up by LOAD_4_PARA, defined elsewhere).
; Out: retrd = horizontal sum of the per-lane accumulators, each halved.
; r0 and r2 are saved on the stack because SSE2_GetSatd8x8 advances them.
WELS_EXTERN WelsSampleSatd16x16_sse2
|
|
|
|
%assign push_num 0
|
|
|
|
LOAD_4_PARA
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
PUSH_XMM 8
|
2014-03-14 09:13:18 +01:00
|
|
|
SIGN_EXTENSION r1, r1d
|
|
|
|
SIGN_EXTENSION r3, r3d
|
2014-01-05 13:11:41 +01:00
|
|
|
push r0 ; keep the row-0 pointers for the right half
|
2014-01-05 13:16:22 +01:00
|
|
|
push r2
|
2014-01-05 13:11:41 +01:00
|
|
|
pxor xmm6, xmm6 ; accumulator
|
|
|
|
pxor xmm7, xmm7 ; zero register
|
|
|
|
|
|
|
|
; left half: rows 0..7, then rows 8..15
SSE2_GetSatd8x8
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
SSE2_GetSatd8x8
|
|
|
|
|
|
|
|
pop r2
|
|
|
|
pop r0
|
|
|
|
add r0, 8 ; right-hand 8 columns
|
|
|
|
add r2, 8
|
|
|
|
|
|
|
|
; right half: rows 0..7, then rows 8..15
SSE2_GetSatd8x8
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
SSE2_GetSatd8x8
|
|
|
|
|
|
|
|
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
|
|
|
|
psrlw xmm6, 1
|
|
|
|
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
|
|
|
movd retrd, xmm6
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
POP_XMM
|
2014-01-05 13:11:41 +01:00
|
|
|
LOAD_4_PARA_POP
|
|
|
|
ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_satd_wxh_sse2 END
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_satd_intra_sse2 BEGIN
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
2014-04-30 09:54:49 +02:00
|
|
|
|
|
|
|
; SSE_DB_1_2REG dst, tmp
; Fills every byte of %1 with the value 1.
; %2 is clobbered and left with all bits set.
%macro SSE_DB_1_2REG 2
|
|
|
|
pxor %1, %1 ; %1 = 0
|
|
|
|
pcmpeqw %2, %2 ; %2 = all ones (each byte == -1)
|
|
|
|
psubb %1, %2 ; each byte: 0 - (-1) = 1
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
|
|
|
|
; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; WelsSampleSatdThree4x4_sse2
; Evaluates three 4x4 intra predictions against the source block and keeps
; the cheapest one.
;   arg1 / arg2 : pDec and its stride; the row above and the column to the
;                 left of pDec are read as boundary samples
;   arg3 / arg4 : source (pEnc) block pointer and stride
;   arg5        : 16-byte output buffer for the chosen prediction; written
;                 with movdqa, so it must be 16-byte aligned
;   arg6        : int32_t* receiving the chosen mode (2 = DC, 1 = H, 0 = V)
;   arg7..arg9  : extra costs added to the DC, H and V SATD respectively
; Returns the winning cost in retrd.
; The costs are compared as signed 16-bit values in the order DC, H, V; a
; mode is only replaced when a later one is strictly cheaper.
WELS_EXTERN WelsSampleSatdThree4x4_sse2
|
|
|
|
|
|
|
|
%ifdef X86_32
|
|
|
|
push r3
|
|
|
|
push r4
|
|
|
|
push r5
|
|
|
|
push r6
|
|
|
|
%assign push_num 4
|
|
|
|
%else
|
|
|
|
%assign push_num 0
|
|
|
|
%endif
|
|
|
|
PUSH_XMM 8
|
|
|
|
|
|
|
|
mov r2, arg3
|
|
|
|
mov r3, arg4
|
|
|
|
SIGN_EXTENSION r3, r3d
|
|
|
|
|
|
|
|
; load source 4x4 samples and Hadamard transform
|
|
|
|
movd xmm0, [r2]
|
|
|
|
movd xmm1, [r2+r3]
|
|
|
|
lea r2 , [r2+2*r3]
|
|
|
|
movd xmm2, [r2]
|
|
|
|
movd xmm3, [r2+r3]
|
|
|
|
punpckldq xmm0, xmm2
|
|
|
|
punpckldq xmm1, xmm3
|
|
|
|
|
|
|
|
pxor xmm6, xmm6
|
|
|
|
punpcklbw xmm0, xmm6
|
|
|
|
punpcklbw xmm1, xmm6
|
|
|
|
|
|
|
|
movdqa xmm2, xmm0
|
|
|
|
paddw xmm0, xmm1
|
|
|
|
psubw xmm2, xmm1
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm2, xmm3
|
|
|
|
|
|
|
|
movdqa xmm4, xmm0
|
|
|
|
paddw xmm0, xmm3
|
|
|
|
psubw xmm4, xmm3
|
|
|
|
|
|
|
|
movdqa xmm2, xmm0
|
|
|
|
punpcklwd xmm0, xmm4
|
|
|
|
punpckhwd xmm4, xmm2
|
|
|
|
|
|
|
|
SSE2_XSawp dq, xmm0, xmm4, xmm3
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm3, xmm5
|
|
|
|
|
|
|
|
movdqa xmm7, xmm0
|
|
|
|
paddw xmm0, xmm5
|
|
|
|
psubw xmm7, xmm5
|
|
|
|
|
|
|
|
SSE2_XSawp qdq, xmm0, xmm7, xmm1
|
|
|
|
|
|
|
|
; Hadamard transform results are saved in xmm0 and xmm2
|
|
|
|
movdqa xmm2, xmm0
|
|
|
|
paddw xmm0, xmm1
|
|
|
|
psubw xmm2, xmm1
|
|
|
|
|
|
|
|
;load top boundary samples: [a b c d]
|
|
|
|
mov r0, arg1
|
|
|
|
mov r1, arg2
|
|
|
|
SIGN_EXTENSION r1, r1d
|
|
|
|
sub r0, r1 ; r0 = row above pDec
|
|
|
|
%ifdef UNIX64
|
|
|
|
push r4 ; r4/r5 are used as scratch below; restored before arg5/arg6 are read
|
|
|
|
push r5
|
|
|
|
%endif
|
|
|
|
|
|
|
|
movzx r2d, byte [r0]
|
|
|
|
movzx r3d, byte [r0+1]
|
|
|
|
movzx r4d, byte [r0+2]
|
|
|
|
movzx r5d, byte [r0+3]
|
|
|
|
|
|
|
|
; get the transform results of top boundary samples: [a b c d]
|
|
|
|
add r3d, r2d ; r3d = a + b
|
|
|
|
add r5d, r4d ; r5d = c + d
|
|
|
|
add r2d, r2d ; r2d = a + a
|
|
|
|
add r4d, r4d ; r4d = c + c
|
|
|
|
sub r2d, r3d ; r2d = a + a - a - b = a - b
|
|
|
|
sub r4d, r5d ; r4d = c + c - c - d = c - d
|
|
|
|
add r5d, r3d ; r5d = (a + b) + (c + d)
|
|
|
|
add r3d, r3d
|
|
|
|
sub r3d, r5d ; r3d = (a + b) - (c + d)
|
|
|
|
add r4d, r2d ; r4d = (a - b) + (c - d)
|
|
|
|
add r2d, r2d
|
|
|
|
sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
|
|
|
|
|
|
|
|
movdqa xmm6, xmm0 ; keep copies of the source transform for the H and DC passes
|
|
|
|
movdqa xmm7, xmm2
|
|
|
|
movd xmm5, r5d ; keep the top-row sum (a+b+c+d) for DC mode
|
|
|
|
pxor xmm3, xmm3
|
|
|
|
pxor xmm4, xmm4
|
|
|
|
pinsrw xmm3, r5d, 0
|
|
|
|
pinsrw xmm3, r4d, 4
|
|
|
|
psllw xmm3, 2
|
|
|
|
pinsrw xmm4, r3d, 0
|
|
|
|
pinsrw xmm4, r2d, 4
|
|
|
|
psllw xmm4, 2
|
|
|
|
|
|
|
|
; get the satd of V (prediction from the top boundary)
|
|
|
|
psubw xmm0, xmm3
|
|
|
|
psubw xmm2, xmm4
|
|
|
|
|
|
|
|
WELS_AbsW xmm0, xmm1
|
|
|
|
WELS_AbsW xmm2, xmm1
|
|
|
|
paddusw xmm0, xmm2
|
|
|
|
SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
|
|
|
|
|
|
|
|
;load left boundary samples: [a b c d]'
|
|
|
|
add r0, r1 ; back to pDec row 0
|
|
|
|
|
|
|
|
movzx r2d, byte [r0-1]
|
|
|
|
movzx r3d, byte [r0+r1-1]
|
|
|
|
lea r0 , [r0+2*r1]
|
|
|
|
movzx r4d, byte [r0-1]
|
|
|
|
movzx r5d, byte [r0+r1-1]
|
|
|
|
|
|
|
|
; get the transform results of left boundary samples: [a b c d]'
|
|
|
|
add r3d, r2d ; r3d = a + b
|
|
|
|
add r5d, r4d ; r5d = c + d
|
|
|
|
add r2d, r2d ; r2d = a + a
|
|
|
|
add r4d, r4d ; r4d = c + c
|
|
|
|
sub r2d, r3d ; r2d = a + a - a - b = a - b
|
|
|
|
sub r4d, r5d ; r4d = c + c - c - d = c - d
|
|
|
|
add r5d, r3d ; r5d = (a + b) + (c + d)
|
|
|
|
add r3d, r3d
|
|
|
|
sub r3d, r5d ; r3d = (a + b) - (c + d)
|
|
|
|
add r4d, r2d ; r4d = (a - b) + (c - d)
|
|
|
|
add r2d, r2d
|
|
|
|
sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
|
|
|
|
|
|
|
|
; store the transform results in xmm3
|
|
|
|
movd xmm3, r5d
|
|
|
|
pinsrw xmm3, r3d, 1
|
|
|
|
pinsrw xmm3, r2d, 2
|
|
|
|
pinsrw xmm3, r4d, 3
|
|
|
|
psllw xmm3, 2
|
|
|
|
|
|
|
|
; get the satd of H (prediction from the left boundary)
|
|
|
|
movdqa xmm2, xmm6
|
|
|
|
movdqa xmm4, xmm7
|
|
|
|
psubw xmm2, xmm3
|
|
|
|
WELS_AbsW xmm2, xmm1
|
|
|
|
WELS_AbsW xmm4, xmm1
|
|
|
|
paddusw xmm2, xmm4
|
|
|
|
SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
|
|
|
|
|
|
|
|
; DC result is stored in xmm1
|
|
|
|
add r5d, 4 ; left-column sum + rounding term
|
|
|
|
movd xmm1, r5d
|
|
|
|
paddw xmm1, xmm5 ; + top-row sum
|
|
|
|
psrlw xmm1, 3 ; dc = (top + left + 4) >> 3
|
|
|
|
movdqa xmm5, xmm1 ; keep the dc pixel value for the output block
|
|
|
|
psllw xmm1, 4 ; dc * 16, subtracted from the transform below
|
|
|
|
|
|
|
|
; get the satd of DC
|
|
|
|
psubw xmm6, xmm1
|
|
|
|
WELS_AbsW xmm6, xmm1
|
|
|
|
WELS_AbsW xmm7, xmm1
|
|
|
|
paddusw xmm6, xmm7
|
|
|
|
SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
|
|
|
|
%ifdef UNIX64
|
|
|
|
pop r5
|
|
|
|
pop r4
|
|
|
|
%endif
|
|
|
|
; comparing order: DC H V
|
|
|
|
|
|
|
|
|
|
mov r4, arg5
|
|
|
|
movd r2d, xmm6 ; r2d = DC cost
|
|
|
|
movd r3d, xmm2 ; r3d = H cost
|
|
|
|
movd r6d, xmm0 ; r6d = V cost
|
|
|
|
|
|
|
|
and r2d, 0xffff
|
|
|
|
shr r2d, 1
|
|
|
|
and r3d, 0xffff
|
|
|
|
shr r3d, 1
|
|
|
|
and r6d, 0xffff
|
|
|
|
shr r6d, 1
|
|
|
|
add r2d, dword arg7
|
|
|
|
add r3d, dword arg8
|
|
|
|
add r6d, dword arg9
|
|
|
|
cmp r2w, r3w
|
|
|
|
jg near not_dc
|
|
|
|
cmp r2w, r6w
|
|
|
|
jg near not_dc_h
|
|
|
|
|
|
|
|
; for DC mode
|
|
|
|
movd r3d, xmm5
|
|
|
|
imul r3d, 0x01010101 ; replicate the dc byte into all four bytes
|
|
|
|
movd xmm5, r3d
|
|
|
|
pshufd xmm5, xmm5, 0
|
|
|
|
movdqa [r4], xmm5
|
|
|
|
mov r5, arg6
|
|
|
|
mov dword [r5], 0x02
|
|
|
|
mov retrd, r2d
|
|
|
|
POP_XMM
|
|
|
|
%ifdef X86_32
|
|
|
|
pop r6
|
|
|
|
pop r5
|
|
|
|
pop r4
|
|
|
|
pop r3
|
|
|
|
%endif
|
|
|
|
ret
|
|
|
|
|
|
|
|
not_dc:
|
|
|
|
cmp r3w, r6w
|
|
|
|
jg near not_dc_h
|
|
|
|
|
|
|
|
; for H mode
|
|
|
|
SSE_DB_1_2REG xmm6, xmm7 ; xmm6 = bytes of 1, used to replicate a byte across a dword
|
|
|
|
sub r0, r1
|
|
|
|
sub r0, r1 ; r0 was two rows down; back to pDec row 0
|
|
|
|
movzx r6d, byte [r0-1]
|
|
|
|
movd xmm0, r6d
|
|
|
|
pmuludq xmm0, xmm6
|
|
|
|
|
|
|
|
movzx r6d, byte [r0+r1-1]
|
|
|
|
movd xmm1, r6d
|
|
|
|
pmuludq xmm1, xmm6
|
|
|
|
punpckldq xmm0, xmm1
|
|
|
|
|
|
|
|
lea r0, [r0+r1*2]
|
|
|
|
movzx r6d, byte [r0-1]
|
|
|
|
movd xmm2, r6d
|
|
|
|
pmuludq xmm2, xmm6
|
|
|
|
|
|
|
|
movzx r6d, byte [r0+r1-1]
|
|
|
|
movd xmm3, r6d
|
|
|
|
pmuludq xmm3, xmm6
|
|
|
|
punpckldq xmm2, xmm3
|
|
|
|
punpcklqdq xmm0, xmm2
|
|
|
|
|
|
|
|
movdqa [r4],xmm0
|
|
|
|
|
|
|
|
mov retrd, r3d
|
|
|
|
mov r5, arg6
|
|
|
|
mov dword [r5], 0x01
|
|
|
|
POP_XMM
|
|
|
|
%ifdef X86_32
|
|
|
|
pop r6
|
|
|
|
pop r5
|
|
|
|
pop r4
|
|
|
|
pop r3
|
|
|
|
%endif
|
|
|
|
ret
|
|
|
|
; V mode: replicate the four bytes of the row above pDec into all four rows
not_dc_h:
|
|
|
|
sub r0, r1
|
|
|
|
sub r0, r1
|
|
|
|
sub r0, r1 ; r0 was two rows down; now the row above pDec
|
|
|
|
movd xmm0, [r0]
|
|
|
|
pshufd xmm0, xmm0, 0
|
|
|
|
movdqa [r4],xmm0
|
|
|
|
mov retrd, r6d
|
|
|
|
mov r5, arg6
|
|
|
|
mov dword [r5], 0x00
|
|
|
|
POP_XMM
|
|
|
|
%ifdef X86_32
|
|
|
|
pop r6
|
|
|
|
pop r5
|
|
|
|
pop r4
|
|
|
|
pop r3
|
|
|
|
%endif
|
|
|
|
ret
|
|
|
|
|
|
|
|
|
2014-01-05 13:11:41 +01:00
|
|
|
; SSE41_I16x16Get8WSumSub data, tmp1, tmp2
; Expects xmm5 = HSumSubDB1, xmm6 = HSumSubDW1 and xmm7 = PDW1 to be loaded by
; the caller. Multiplies the bytes in %1 by the +/-1 patterns (pmaddubsw, then
; pmaddwd), regroups the dword results, adds them into xmm4 (the DC
; accumulator), and leaves them packed to words and shifted left by 2 in %1.
; %2 and %3 are clobbered.
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
|
|
|
|
pmaddubsw %1, xmm5
|
|
|
|
movdqa %2, %1
|
|
|
|
pmaddwd %1, xmm7
|
|
|
|
pmaddwd %2, xmm6
|
|
|
|
movdqa %3, %1
|
|
|
|
punpckldq %1, %2
|
|
|
|
punpckhdq %2, %3
|
|
|
|
movdqa %3, %1
|
|
|
|
punpcklqdq %1, %2
|
|
|
|
punpckhqdq %3, %2
|
|
|
|
paddd xmm4, %1 ;for dc
|
|
|
|
paddd xmm4, %3 ;for dc
|
|
|
|
packssdw %1, %3
|
|
|
|
psllw %1, 2
|
|
|
|
%endmacro
|
|
|
|
; SSE41_ChromaGet8WSumSub data, tmp1, tmp2, dcOut
; Same multiply/regroup sequence as SSE41_I16x16Get8WSumSub, but instead of
; accumulating into xmm4 it copies the low quadwords of the two dword result
; registers into %4. %1 ends up packed to words and shifted left by 2.
; Expects xmm5 = HSumSubDB1, xmm6 = HSumSubDW1 and xmm7 = PDW1 to be loaded by
; the caller. %2 and %3 are clobbered.
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
|
|
|
|
pmaddubsw %1, xmm5
|
|
|
|
movdqa %2, %1
|
|
|
|
pmaddwd %1, xmm7
|
|
|
|
pmaddwd %2, xmm6
|
|
|
|
movdqa %3, %1
|
|
|
|
punpckldq %1, %2
|
|
|
|
punpckhdq %2, %3
|
|
|
|
movdqa %3, %1
|
|
|
|
punpcklqdq %1, %2
|
|
|
|
punpckhqdq %3, %2
|
|
|
|
; paddd xmm4, %1 ;for dc
|
|
|
|
; paddd xmm4, %3 ;for dc
|
|
|
|
movdqa %4, %1
|
|
|
|
punpcklqdq %4, %3 ; %4 = low qword of %1 : low qword of %3
|
|
|
|
packssdw %1, %3
|
|
|
|
psllw %1, 2
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
; SSE41_GetX38x4SatdDec (no parameters)
; Loads four 8-byte rows from r2 (stride r3), widens them to words and runs
; SSE2_HDMTwo4x4, a transpose, and SSE2_HDMTwo4x4 again.
; r2 is advanced by four rows. The results are left in xmm7, xmm1, xmm3 and
; xmm2 (see the pOut note below); xmm0 is scratch.
%macro SSE41_GetX38x4SatdDec 0
|
|
|
|
pxor xmm7, xmm7 ; zero register for the byte->word unpack
|
2014-04-30 09:54:49 +02:00
|
|
|
movq xmm0, [r2]
|
|
|
|
movq xmm1, [r2+r3]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movq xmm2, [r2]
|
|
|
|
movq xmm3, [r2+r3]
|
|
|
|
lea r2, [r2+2*r3]
|
2014-01-05 13:11:41 +01:00
|
|
|
punpcklbw xmm0, xmm7
|
|
|
|
punpcklbw xmm1, xmm7
|
|
|
|
punpcklbw xmm2, xmm7
|
|
|
|
punpcklbw xmm3, xmm7
|
|
|
|
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
|
|
|
|
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
|
|
|
|
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
|
|
|
|
;doesn't need another transpose
|
|
|
|
%endmacro
|
2014-04-30 09:54:49 +02:00
|
|
|
|
2014-01-05 13:11:41 +01:00
|
|
|
; SSE41_GetX38x4SatdV unused, offset
; Adds into xmm4 the absolute differences between words taken from the buffer
; at r6+%2 and the transformed rows in xmm7, xmm1, xmm3 and xmm2.
; For each row k = 0..3 the words at [r6+%2+2k] and [r6+%2+2k+8] are placed in
; word lanes 0 and 4 of an otherwise zero register before the subtraction.
; %1 is not referenced in the body. xmm0 is scratch.
%macro SSE41_GetX38x4SatdV 2
|
|
|
|
pxor xmm0, xmm0
|
2014-04-30 09:54:49 +02:00
|
|
|
pinsrw xmm0, word[r6+%2], 0
|
|
|
|
pinsrw xmm0, word[r6+%2+8], 4
|
2014-01-05 13:11:41 +01:00
|
|
|
psubsw xmm0, xmm7
|
|
|
|
pabsw xmm0, xmm0
|
|
|
|
paddw xmm4, xmm0
|
|
|
|
pxor xmm0, xmm0
|
2014-04-30 09:54:49 +02:00
|
|
|
pinsrw xmm0, word[r6+%2+2], 0
|
|
|
|
pinsrw xmm0, word[r6+%2+10], 4
|
2014-01-05 13:11:41 +01:00
|
|
|
psubsw xmm0, xmm1
|
|
|
|
pabsw xmm0, xmm0
|
|
|
|
paddw xmm4, xmm0
|
|
|
|
pxor xmm0, xmm0
|
2014-04-30 09:54:49 +02:00
|
|
|
pinsrw xmm0, word[r6+%2+4], 0
|
|
|
|
pinsrw xmm0, word[r6+%2+12], 4
|
2014-01-05 13:11:41 +01:00
|
|
|
psubsw xmm0, xmm3
|
|
|
|
pabsw xmm0, xmm0
|
|
|
|
paddw xmm4, xmm0
|
|
|
|
pxor xmm0, xmm0
|
2014-04-30 09:54:49 +02:00
|
|
|
pinsrw xmm0, word[r6+%2+6], 0
|
|
|
|
pinsrw xmm0, word[r6+%2+14], 4
|
2014-01-05 13:11:41 +01:00
|
|
|
psubsw xmm0, xmm2
|
|
|
|
pabsw xmm0, xmm0
|
|
|
|
paddw xmm4, xmm0
|
|
|
|
%endmacro
|
|
|
|
; SSE41_GetX38x4SatdH index, unused, base
; Loads 8 bytes from [r6+%3+8*%1], duplicates them into both halves of xmm0,
; and adds |xmm0 - xmm7| into xmm5.
; It then replaces xmm1, xmm2 and xmm3 with their absolute values, sums them
; into xmm2 and adds that to xmm5 as well. xmm2 is left holding
; |xmm1|+|xmm2|+|xmm3|, which the DC macros in this file add to xmm6.
; %2 is not referenced in the body.
%macro SSE41_GetX38x4SatdH 3
|
2014-04-30 09:54:49 +02:00
|
|
|
movq xmm0, [r6+%3+8*%1]
|
2014-01-05 13:11:41 +01:00
|
|
|
punpcklqdq xmm0, xmm0
|
|
|
|
psubsw xmm0, xmm7
|
|
|
|
pabsw xmm0, xmm0
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
pabsw xmm1, xmm1
|
|
|
|
pabsw xmm2, xmm2
|
|
|
|
pabsw xmm3, xmm3
|
|
|
|
paddw xmm2, xmm1;for DC
|
|
|
|
paddw xmm2, xmm3;for DC
|
|
|
|
paddw xmm5, xmm2
|
|
|
|
%endmacro
|
|
|
|
; SSE41_I16X16GetX38x4SatdDC (no parameters)
; Copies the 64-bit value in mm4 into both halves of xmm0 and adds
; |xmm0 - xmm7| plus xmm2 into xmm6.
; xmm2 is expected to hold the sum left behind by SSE41_GetX38x4SatdH, which
; SSE41_I16x16GetX38x4Satd expands immediately before this macro.
%macro SSE41_I16X16GetX38x4SatdDC 0
|
|
|
|
pxor xmm0, xmm0
|
|
|
|
movq2dq xmm0, mm4
|
|
|
|
punpcklqdq xmm0, xmm0
|
|
|
|
psubsw xmm0, xmm7
|
|
|
|
pabsw xmm0, xmm0
|
|
|
|
paddw xmm6, xmm0
|
|
|
|
paddw xmm6, xmm2
|
|
|
|
%endmacro
|
|
|
|
; Accumulate the "DC" cost of one 8x4 chroma block into xmm6.
;   %1 = 8x4 block index (register). NOTE: %1 is destroyed - it is shifted
;        left by 4 to turn the index into a 16-byte offset.
; Inputs : [r6+32+16*index] = eight DC terms written by the caller
;          xmm7 = first transformed row (still signed)
;          xmm2 = |xmm1| + |xmm2| + |xmm3| as left by SSE41_GetX38x4SatdH
; Output : xmm6 += |dc - xmm7| + xmm2
; Clobbers xmm0 and the flags (shl).
%macro SSE41_ChromaGetX38x4SatdDC 1
    paddw       xmm6, xmm2              ; part shared with the H cost

    shl         %1, 4                   ; index -> byte offset (16 per block)
    movdqa      xmm0, [r6+32+%1]
    psubsw      xmm0, xmm7
    pabsw       xmm0, xmm0
    paddw       xmm6, xmm0
%endmacro
|
|
|
|
; Process one 8x4 block of a 16x16 macroblock and add its V / H / DC costs
; to xmm4 / xmm5 / xmm6.
;   %1 = 8x4 block index (register), forwarded to the H helper
;   %2 = byte offset into the V coefficients at r6
; The order of the four steps is significant: V needs the signed rows,
; H replaces them by absolute values, DC consumes the sum H leaves in xmm2.
%macro SSE41_I16x16GetX38x4Satd 2
    SSE41_GetX38x4SatdDec                   ; load + transform the 8x4 block
    SSE41_GetX38x4SatdV         %1, %2      ; xmm4 += V cost
    SSE41_GetX38x4SatdH         %1, %2, 32  ; xmm5 += H cost (H coeffs at r6+32)
    SSE41_I16X16GetX38x4SatdDC              ; xmm6 += DC cost (DC term in mm4)
%endmacro
|
|
|
|
; Process one 8x4 block of an 8x8 chroma block and add its V / H / DC costs
; to xmm4 / xmm5 / xmm6.
;   %1 = 8x4 block index (register); destroyed by the DC helper (shl by 4)
;   %2 = byte offset into the V coefficients at r6
; The order of the four steps is significant: V needs the signed rows,
; H replaces them by absolute values, DC consumes the sum H leaves in xmm2.
%macro SSE41_ChromaGetX38x4Satd 2
    SSE41_GetX38x4SatdDec                   ; load + transform the 8x4 block
    SSE41_GetX38x4SatdV         %1, %2      ; xmm4 += V cost
    SSE41_GetX38x4SatdH         %1, %2, 16  ; xmm5 += H cost (H coeffs at r6+16)
    SSE41_ChromaGetX38x4SatdDC  %1          ; xmm6 += DC cost (DC terms at r6+32)
%endmacro
|
|
|
|
; Horizontal reduction of eight 16-bit lanes to one 32-bit value.
;   %1 = in : eight words          out: low dword = sum of all lanes of
;                                       pmaddwd(%1, %2); other lanes undefined
;   %2 = word multipliers (callers in this file pass the register prepared
;        by MMX_DW_1_2REG)
;   %3 = scratch xmm register, clobbered
%macro SSE41_HSum8W 3
    pmaddwd     %1, %2                  ; 8 words -> 4 dwords (pairwise sums)
    movhlps     %3, %1                  ; bring dwords 2,3 down
    paddd       %1, %3                  ; 4 dwords -> 2 dwords
    pshuflw     %3, %1, 0x0E            ; bring dword 1 down to dword 0
    paddd       %1, %3                  ; 2 dwords -> 1 dword
%endmacro
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; WelsIntra16x16Combined3Satd_sse41
; Evaluates the V, H and DC intra 16x16 predictions in one pass and
; returns the cheapest cost; the winning mode index is stored through r4.
;
; Register roles after LOAD_7_PARA (as used below):
;   r0 = pointer whose row above (r0-r1) and column to the left (r0-1)
;        supply the prediction samples
;   r1 = stride for r0
;   r2 = block read by SSE41_GetX38x4SatdDec, r3 = its stride
;   r4 = int32 *, receives 2 (DC), 1 (H) or 0 (V)
;   r5 = lambda; 2*lambda is added to the H and DC costs
;   r6 = 16-byte aligned scratch: [r6..+31] V terms, [r6+32..+63] H terms
; Uses mm4 for the DC term, hence WELSEMMS before returning.
;-----------------------------------------------------------------------
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8                          ; Win64: xmm6/xmm7 must be preserved
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    ; r2 is advanced inside the loop; keep the original in r12 so the
    ; right-hand 8 columns can restart from it (x86-32 re-reads arg3).
    push        r12
    mov         r12, r2
%endif

    pxor        xmm4, xmm4
    movdqa      xmm5, [HSumSubDB1]
    movdqa      xmm6, [HSumSubDW1]
    movdqa      xmm7, [PDW1]

    ; ---- top neighbours: the 16 bytes of the row above r0 ----
    sub         r0, r1
    movdqu      xmm0, [r0]
    movhlps     xmm1, xmm0
    punpcklqdq  xmm0, xmm0              ; left  8 samples, duplicated
    punpcklqdq  xmm1, xmm1              ; right 8 samples, duplicated
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa      [r6], xmm0              ; V
    movdqa      [r6+16], xmm1

    ; ---- left neighbours: byte at column -1 of each of the 16 rows ----
    add         r0, r1
%assign satd_pair_idx 0
%rep 8
    pinsrb      xmm0, byte [r0-1], 2*satd_pair_idx
    pinsrb      xmm0, byte [r0+r1-1], 2*satd_pair_idx+1
  %if satd_pair_idx < 7
    lea         r0, [r0+2*r1]
  %endif
%assign satd_pair_idx satd_pair_idx+1
%endrep
    movhlps     xmm1, xmm0
    punpcklqdq  xmm0, xmm0              ; upper 8 samples, duplicated
    punpcklqdq  xmm1, xmm1              ; lower 8 samples, duplicated
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa      [r6+32], xmm0           ; H
    movdqa      [r6+48], xmm1

    ; ---- DC term: ((sum + 16) >> 5) << 4, parked in mm4 ----
    ; NOTE(review): xmm4 is presumably the neighbour sum accumulated by
    ; SSE41_I16x16Get8WSumSub (defined elsewhere) - confirm there.
    movd        r0d, xmm4
    add         r0d, 16
    shr         r0d, 5
    shl         r0d, 4
    movd        mm4, r0d

    ; ---- cost accumulators ----
    pxor        xmm4, xmm4              ; V
    pxor        xmm5, xmm5              ; H
    pxor        xmm6, xmm6              ; DC

%ifdef UNIX64
    push        r4                      ; r4 is the loop's V offset below
%endif
    mov         r0, 0                   ; 8x4 block index within a column half
    mov         r4, 0                   ; V offset: 0 = left half, 16 = right

.loop16x16_get_satd:
.loopStart1:
    SSE41_I16x16GetX38x4Satd r0, r4
    inc         r0
    cmp         r0, 4
    jl          .loopStart1

    cmp         r4, 16
    je          .loop16x16_get_satd_end

    ; restart at the top of the right-hand 8 columns
%ifdef X86_32
    mov         r2, arg3
%else
    mov         r2, r12
%endif
    add         r2, 8
    mov         r0, 0
    add         r4, 16
    jmp         .loop16x16_get_satd

.loop16x16_get_satd_end:
    MMX_DW_1_2REG xmm0, xmm1
    psrlw       xmm4, 1                 ; /2
    psrlw       xmm5, 1                 ; /2
    psrlw       xmm6, 1                 ; /2
    SSE41_HSum8W xmm4, xmm0, xmm1
    SSE41_HSum8W xmm5, xmm0, xmm1
    SSE41_HSum8W xmm6, xmm0, xmm1

%ifdef UNIX64
    pop         r4
%endif

    ; comparing order: DC H V
    movd        r3d, xmm6               ; DC
    movd        r1d, xmm5               ; H
    movd        r0d, xmm4               ; V
%ifndef X86_32
    pop         r12
%endif
    shl         r5d, 1
    add         r1d, r5d                ; H  += 2*lambda
    add         r3d, r5d                ; DC += 2*lambda
    mov         r4, arg5                ; reload the mode pointer
    cmp         r3d, r1d
    jge near    not_dc_16x16
    cmp         r3d, r0d
    jge near    not_dc_h_16x16

    ; DC is strictly the cheapest
    mov         dword [r4], 2           ; I16_PRED_DC
    mov         retrd, r3d
    jmp near    return_satd_intra_16x16_x3

not_dc_16x16:
    ; H wins only if it is strictly cheaper than V
    cmp         r1d, r0d
    jge near    not_dc_h_16x16
    mov         dword [r4], 1           ; I16_PRED_H
    mov         retrd, r1d
    jmp near    return_satd_intra_16x16_x3

not_dc_h_16x16:
    ; V is the fallback (also taken on ties)
    mov         dword [r4], 0           ; I16_PRED_V
    mov         retrd, r0d

return_satd_intra_16x16_x3:
    WELSEMMS
    POP_XMM
    LOAD_7_PARA_POP
    ret
|
|
|
|
|
|
|
|
; V / H / DC costs of one 8x8 chroma plane.
; Inputs : r0 = pointer whose row above (r0-r1) and left column (r0-1)
;               supply the prediction samples, r1 = its stride
;          r2 / r3 = block and stride read by SSE41_GetX38x4SatdDec
;          r6 = 16-byte aligned scratch, laid out by this macro as
;               [r6] V terms, [r6+16] H terms, [r6+32], [r6+48] DC terms
; Outputs: xmm4 = V, xmm5 = H, xmm6 = DC (eight word partial sums each)
; Clobbers r0, r2 (advanced), xmm0-xmm7.
%macro SSE41_ChromaGetX38x8Satd 0
    movdqa      xmm5, [HSumSubDB1]
    movdqa      xmm6, [HSumSubDW1]
    movdqa      xmm7, [PDW1]

    ; ---- top neighbours: 8 bytes of the row above r0 ----
    sub         r0, r1
    movq        xmm0, [r0]
    punpcklqdq  xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa      [r6], xmm0              ; V

    ; ---- left neighbours: byte at column -1 of each of the 8 rows ----
    add         r0, r1
%assign satd_pair_idx 0
%rep 4
    pinsrb      xmm0, byte [r0-1], 2*satd_pair_idx
    pinsrb      xmm0, byte [r0+r1-1], 2*satd_pair_idx+1
  %if satd_pair_idx < 3
    lea         r0, [r0+2*r1]
  %endif
%assign satd_pair_idx satd_pair_idx+1
%endrep
    punpcklqdq  xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    movdqa      [r6+16], xmm0           ; H

    ; ---- DC terms ----
    ; NOTE(review): xmm4 / xmm1 are taken to be the neighbour sums returned
    ; by SSE41_ChromaGet8WSumSub (defined elsewhere) - confirm there.
    ; (sum+2)>>2
    movdqa      xmm6, [PDQ2]
    movdqa      xmm5, xmm4
    punpckhqdq  xmm5, xmm1
    paddd       xmm5, xmm6
    psrld       xmm5, 2
    ; (sum1+sum2+4)>>3
    paddd       xmm6, xmm6
    paddd       xmm4, xmm1
    paddd       xmm4, xmm6
    psrld       xmm4, 3
    ; satd *16
    pslld       xmm5, 4
    pslld       xmm4, 4
    ; arrange the terms per 8x4 block and keep only the low dword of each
    ; qword (shift left then right by 32 clears the upper half)
    movdqa      xmm6, xmm4
    punpcklqdq  xmm4, xmm5
    psllq       xmm4, 32
    psrlq       xmm4, 32
    movdqa      [r6+32], xmm4
    punpckhqdq  xmm5, xmm6
    psllq       xmm5, 32
    psrlq       xmm5, 32
    movdqa      [r6+48], xmm5

    ; ---- cost accumulators ----
    pxor        xmm4, xmm4              ; V
    pxor        xmm5, xmm5              ; H
    pxor        xmm6, xmm6              ; DC

    ; Two 8x4 blocks. The DC helper shifts r0 left by 4 in place; that is
    ; harmless for index 0, so a plain inc yields index 1 for the second.
    mov         r0, 0
    SSE41_ChromaGetX38x4Satd r0, 0
    inc         r0
    SSE41_ChromaGetX38x4Satd r0, 0
%endmacro
|
|
|
|
|
|
|
|
; Spill one xmm register into two MMX registers.
;   %1 = xmm source; DESTROYED (its high qword is copied over its low qword)
;   %2 = mm register receiving the low  qword of %1
;   %3 = mm register receiving the high qword of %1
%macro SSEReg2MMX 3
    movdq2q     %2, %1                  ; low half
    movhlps     %1, %1                  ; high half -> low half
    movdq2q     %3, %1                  ; (former) high half
%endmacro
|
|
|
|
; Rebuild one xmm register from two MMX registers (inverse of SSEReg2MMX).
;   %1 = xmm destination: low qword = %3, high qword = %4
;   %2 = xmm scratch, clobbered
;   %3 = mm register with the low  qword
;   %4 = mm register with the high qword
%macro MMXReg2SSE 4
    movq2dq     %1, %3
    movq2dq     %2, %4
    punpcklqdq  %1, %2                  ; %1 = { %3, %4 }
%endmacro
|
|
|
|
;to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; WelsIntraChroma8x8Combined3Satd_sse41
; Evaluates the DC, H and V intra 8x8 chroma predictions over two planes
; and returns the cheapest combined cost; the winning mode index is stored
; through r4.
;
; Register roles after LOAD_7_PARA (as used below):
;   r0 / r1 = first plane's prediction source and stride
;   r2 / r3 = first plane's block read by the transform helper, and stride
;   r4      = int32 *, receives 0 (DC), 1 (H) or 2 (V)
;   r5      = lambda; 2*lambda is added to the H and V costs
;   r6      = 16-byte aligned scratch used by SSE41_ChromaGetX38x8Satd
;   arg8 / arg9 = second plane's r0 / r2 (same strides r1 / r3)
; The first plane's sums are parked in mm0-mm3, mm5, mm6 while the second
; plane is processed, hence WELSEMMS before returning.
;-----------------------------------------------------------------------
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8                          ; Win64: xmm6/xmm7 must be preserved
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

loop_chroma_satdx3:
    ; ---- first plane ----
    SSE41_ChromaGetX38x8Satd
    SSEReg2MMX  xmm4, mm0, mm1          ; V
    SSEReg2MMX  xmm5, mm2, mm3          ; H
    SSEReg2MMX  xmm6, mm5, mm6          ; DC

    ; ---- second plane ----
    mov         r0, arg8
    mov         r2, arg9
    SSE41_ChromaGetX38x8Satd

    ; ---- add the first plane's sums back in ----
    MMXReg2SSE  xmm0, xmm3, mm0, mm1
    MMXReg2SSE  xmm1, xmm3, mm2, mm3
    MMXReg2SSE  xmm2, xmm3, mm5, mm6
    paddw       xmm4, xmm0
    paddw       xmm5, xmm1
    paddw       xmm6, xmm2

    MMX_DW_1_2REG xmm0, xmm1
    psrlw       xmm4, 1                 ; /2
    psrlw       xmm5, 1                 ; /2
    psrlw       xmm6, 1                 ; /2
    SSE41_HSum8W xmm4, xmm0, xmm1
    SSE41_HSum8W xmm5, xmm0, xmm1
    SSE41_HSum8W xmm6, xmm0, xmm1

    ; comparing order: DC H V
    movd        r3d, xmm6               ; DC
    movd        r1d, xmm5               ; H
    movd        r0d, xmm4               ; V

    shl         r5d, 1
    add         r1d, r5d                ; H += 2*lambda
    add         r0d, r5d                ; V += 2*lambda
    cmp         r3d, r1d
    jge near    not_dc_8x8
    cmp         r3d, r0d
    jge near    not_dc_h_8x8

    ; DC is strictly the cheapest
    mov         dword [r4], 0           ; I8_PRED_DC
    mov         retrd, r3d
    jmp near    return_satd_intra_8x8_x3

not_dc_8x8:
    ; H wins only if it is strictly cheaper than V
    cmp         r1d, r0d
    jge near    not_dc_h_8x8
    mov         dword [r4], 1           ; I8_PRED_H
    mov         retrd, r1d
    jmp near    return_satd_intra_8x8_x3

not_dc_h_8x8:
    ; V is the fallback (also taken on ties)
    mov         dword [r4], 2           ; I8_PRED_V
    mov         retrd, r0d

return_satd_intra_8x8_x3:
    WELSEMMS
    POP_XMM
    LOAD_7_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_satd_intra_sse2 END
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; SAD of one 16-byte row against the H, V and DC predictions.
;   %1 = 16-byte aligned memory operand; on entry its low byte is the
;        row's left-neighbour sample, on exit it holds the full 16-byte
;        H prediction row (that byte broadcast)
;   %2 = 16-byte aligned memory operand with the 16 source samples
; Inputs : xmm1 = 0 (pshufb control that broadcasts byte 0)
;          xmm5 = V prediction row, xmm7 = DC prediction row
; Outputs: xmm4 += SAD(row, DC)   xmm2 += SAD(row, V)   xmm3 += SAD(row, H)
; Clobbers xmm0, xmm6.
%macro SSSE3_Get16BSadHVDC 2
    ; H prediction: broadcast the left neighbour and store the row back
    movd        xmm6, %1
    pshufb      xmm6, xmm1
    movdqa      %1, xmm6

    ; DC
    movdqa      xmm0, %2
    psadbw      xmm0, xmm7
    paddw       xmm4, xmm0

    ; V
    movdqa      xmm0, %2
    psadbw      xmm0, xmm5
    paddw       xmm2, xmm0

    ; H
    psadbw      xmm6, %2
    paddw       xmm3, xmm6
%endmacro
|
|
|
|
; Fetch one neighbour byte, remember it, and add it to a running sum.
;   %1 = memory operand of the byte to read
;   %2 = 32-bit scratch register; receives the zero-extended byte
;   %3 = dword memory operand; receives a copy of the byte (as a dword)
;   %4 = 32-bit accumulator register; %4 += byte
%macro WelsAddDCValue 4
    movzx       %2, byte %1
    mov         %3, %2
    add         %4, %2
%endmacro
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_sad_intra_ssse3 BEGIN
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
;-----------------------------------------------------------------------
; WelsIntra16x16Combined3Sad_ssse3
; Evaluates the V, H and DC intra 16x16 predictions by SAD, returns the
; cheapest cost, stores the winning mode index through r4 and leaves the
; winning 16x16 prediction in the buffer at r6.
;
; Register roles after LOAD_7_PARA (as used below):
;   r0 / r1 = prediction source and stride; the row above (r0-r1) is read
;             with movdqa, so it must be 16-byte aligned
;   r2 / r3 = source block and stride (rows read with movdqa)
;   r4      = int32 *, receives 2 (DC), 1 (H) or 0 (V)
;   r5      = lambda; 2*lambda is added to the H and DC costs
;   r6      = 16-byte aligned 256-byte buffer (16 rows of 16 bytes)
; r3, r4, r5 are reused as temporaries and therefore saved on the stack.
;-----------------------------------------------------------------------
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8                          ; Win64: xmm6/xmm7 must be preserved
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

    push        r5
    push        r4
    push        r3

    ; ---- top row: V prediction (xmm5) and its contribution to the DC sum
    sub         r0, r1
    movdqa      xmm5, [r0]
    pxor        xmm0, xmm0
    psadbw      xmm0, xmm5              ; SAD against zero = byte sums
    movhlps     xmm1, xmm0
    paddw       xmm0, xmm1
    movd        r5d, xmm0               ; r5d = sum of the 16 top samples

    ; ---- left column: stash each sample at the start of its row in the
    ;      r6 buffer and add it to the DC sum
    add         r0, r1
    lea         r3, [r1+2*r1]           ; r3 = 3 * stride
%assign sad_quad_idx 0
%rep 4
    WelsAddDCValue [r0-1], r4d, [r6], r5d
    WelsAddDCValue [r0-1+r1], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3], r4d, [r6+48], r5d
  %if sad_quad_idx < 3
    lea         r0, [r0+4*r1]
    add         r6, 64
  %endif
%assign sad_quad_idx sad_quad_idx+1
%endrep
    sub         r6, 192                 ; back to row 0 of the buffer

    ; ---- DC prediction row: (sum + 16) >> 5 broadcast to 16 bytes ----
    add         r5d, 10h
    shr         r5d, 5
    movd        xmm7, r5d
    pxor        xmm1, xmm1              ; zero pshufb control = broadcast byte 0
    pshufb      xmm7, xmm1

    pxor        xmm4, xmm4              ; DC SAD
    pxor        xmm3, xmm3              ; H  SAD
    pxor        xmm2, xmm2              ; V  SAD

    ; ---- SAD of all 16 rows against the three predictions ----
    pop         r3                      ; source stride again
    lea         r4, [r3+2*r3]           ; r4 = 3 * source stride
%assign sad_quad_idx 0
%rep 4
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
  %if sad_quad_idx < 3
    add         r6, 64
    lea         r2, [r2+4*r3]
  %endif
%assign sad_quad_idx sad_quad_idx+1
%endrep

    pop         r4
    pop         r5

    ; ---- fold the two psadbw halves; pack H above V in xmm3 ----
    pslldq      xmm3, 4
    por         xmm3, xmm2
    movhlps     xmm1, xmm3
    paddw       xmm3, xmm1
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0

    ; comparing order: DC H V
    movd        r1d, xmm4               ; DC
    movd        r0d, xmm3               ; V
    psrldq      xmm3, 4
    movd        r2d, xmm3               ; H

    shl         r5d, 1
    add         r2d, r5d                ; H  += 2*lambda
    add         r1d, r5d                ; DC += 2*lambda
    cmp         r1d, r2d
    jge near    not_dc_16x16_sad
    cmp         r1d, r0d
    jge near    not_dc_h_16x16_sad

    ; DC is strictly the cheapest: fill the buffer with the DC row
    mov         dword [r4], 2           ; I16_PRED_DC
    mov         retrd, r1d
    sub         r6, 192
%assign x 0
%rep 16
    movdqa      [r6+16*x], xmm7
%assign x x+1
%endrep
    jmp near    return_sad_intra_16x16_x3

not_dc_16x16_sad:
    ; H wins only if strictly cheaper than V; the buffer already holds the
    ; H rows written by SSSE3_Get16BSadHVDC
    cmp         r2d, r0d
    jge near    not_dc_h_16x16_sad
    mov         dword [r4], 1           ; I16_PRED_H
    mov         retrd, r2d
    jmp near    return_sad_intra_16x16_x3

not_dc_h_16x16_sad:
    ; V is the fallback (also taken on ties): fill the buffer with the top row
    mov         dword [r4], 0           ; I16_PRED_V
    mov         retrd, r0d
    sub         r6, 192
%assign x 0
%rep 16
    movdqa      [r6+16*x], xmm5
%assign x x+1
%endrep

return_sad_intra_16x16_x3:
    POP_XMM
    LOAD_7_PARA_POP
    ret
|
2014-04-30 09:54:49 +02:00
|
|
|
|
2014-01-05 13:11:41 +01:00
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_sad_intra_ssse3 END
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_satd_wxh_sse41 BEGIN
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
;SSE4.1
|
|
|
|
; Load 8 bytes from %2, duplicate them into both halves of %1 and apply
; pmaddubsw with the coefficient bytes held in xmm7.
%macro SSE41_GetSatd8x4LoadRow 2
    movq        %1, %2
    punpcklqdq  %1, %1
    pmaddubsw   %1, xmm7
%endmacro

; Accumulate the SATD partial sums of one 8x4 block into xmm6.
; Inputs : r0 / r1 = first  block and stride, r4 = 3*r1
;          r2 / r3 = second block and stride, r5 = 3*r3
;          xmm7    = pmaddubsw coefficients (callers load HSumSubDB1)
; Output : xmm6 += eight word partial sums
; Clobbers xmm0-xmm5. r0 / r2 are not advanced.
%macro SSE41_GetSatd8x4 0
    ; rows 0 and 1: transformed difference in xmm0 / xmm1
    SSE41_GetSatd8x4LoadRow xmm0, [r0]
    SSE41_GetSatd8x4LoadRow xmm1, [r0+r1]
    SSE41_GetSatd8x4LoadRow xmm2, [r2]
    SSE41_GetSatd8x4LoadRow xmm3, [r2+r3]
    psubsw      xmm0, xmm2
    psubsw      xmm1, xmm3

    ; rows 2 and 3: transformed difference in xmm2 / xmm3
    SSE41_GetSatd8x4LoadRow xmm2, [r0+2*r1]
    SSE41_GetSatd8x4LoadRow xmm3, [r0+r4]
    SSE41_GetSatd8x4LoadRow xmm4, [r2+2*r3]
    SSE41_GetSatd8x4LoadRow xmm5, [r2+r5]
    psubsw      xmm2, xmm4
    psubsw      xmm3, xmm5

    SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4

    pabsw       xmm0, xmm0
    pabsw       xmm2, xmm2
    pabsw       xmm1, xmm1
    pabsw       xmm3, xmm3

    ; Pair up the word lanes of xmm1 / xmm3 (even lanes of one against odd
    ; lanes of the other) and keep the larger of each pair.
    movdqa      xmm4, xmm3
    pblendw     xmm3, xmm1, 0xAA
    pslld       xmm1, 16
    psrld       xmm4, 16
    por         xmm1, xmm4
    pmaxuw      xmm1, xmm3
    paddw       xmm6, xmm1

    ; Same for xmm0 / xmm2.
    movdqa      xmm4, xmm0
    pblendw     xmm0, xmm2, 0xAA
    pslld       xmm2, 16
    psrld       xmm4, 16
    por         xmm2, xmm4
    pmaxuw      xmm0, xmm2
    paddw       xmm6, xmm0
%endmacro
|
|
|
|
|
|
|
|
; Sum the eight word lanes of an xmm register into a 32-bit GPR.
;   %1 = 32-bit destination register (callers pass retrd)
;   %2 = xmm source; clobbered
;   %3 = xmm scratch; receives the multipliers from MMX_DW_1_2REG
;   %4 = xmm scratch; clobbered
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
    MMX_DW_1_2REG %3, %4
    ; SSE41_HSum8W (defined earlier in this file) is exactly the
    ; pmaddwd / movhlps / paddd / pshuflw 0Eh / paddd reduction.
    SSE41_HSum8W %2, %3, %4
    movd        %1, %2
%endmacro
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t )
;   r0 / r1 = first  4x4 block and stride
;   r2 / r3 = second 4x4 block and stride
; Returns the SATD of the two blocks in retrd.
WELS_EXTERN WelsSampleSatd4x4_sse41
    %assign push_num 0
    LOAD_4_PARA
    ; On Win64 the ABI requires xmm6-xmm15 to be preserved; PUSH_XMM saves
    ; the ones in use there (argument = highest xmm register used + 1).
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    movdqa      xmm4, [HSwapSumSubDB1]

    ; second block: rows 0,1 -> xmm2, rows 2,3 -> xmm3
    ; (shufps ..., 0 replicates each loaded row dword into a qword half)
    movd        xmm2, [r2]
    movd        xmm5, [r2+r3]
    shufps      xmm2, xmm5, 0
    movd        xmm3, [r2+r3*2]
    lea         r2, [r2+2*r3]
    movd        xmm5, [r2+r3]
    shufps      xmm3, xmm5, 0

    ; first block: rows 0,1 -> xmm0, rows 2,3 -> xmm1
    movd        xmm0, [r0]
    movd        xmm5, [r0+r1]
    shufps      xmm0, xmm5, 0
    movd        xmm1, [r0+r1*2]
    lea         r0, [r0+2*r1]
    movd        xmm5, [r0+r1]
    shufps      xmm1, xmm5, 0

    ; horizontal stage via pmaddubsw, then the difference of the two blocks
    pmaddubsw   xmm0, xmm4
    pmaddubsw   xmm1, xmm4
    pmaddubsw   xmm2, xmm4
    pmaddubsw   xmm3, xmm4
    psubw       xmm0, xmm2
    psubw       xmm1, xmm3

    ; butterfly 1: (xmm0, xmm1) -> (sum, difference)
    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm1, xmm2

    ; regroup qwords, butterfly 2
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm1
    punpckhqdq  xmm2, xmm1
    movdqa      xmm1, xmm0
    paddw       xmm0, xmm2
    psubw       xmm2, xmm1

    ; pair up word lanes, take absolute values and keep the larger of
    ; each pair
    movdqa      xmm1, xmm0
    pblendw     xmm0, xmm2, 0xAA
    pslld       xmm2, 16
    psrld       xmm1, 16
    por         xmm2, xmm1
    pabsw       xmm0, xmm0
    pabsw       xmm2, xmm2
    pmaxsw      xmm0, xmm2

    SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
    ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t )
;   r0 / r1 = first  8x8 block and stride
;   r2 / r3 = second 8x8 block and stride
; Returns the SATD of the two blocks in retrd.
WELS_EXTERN WelsSampleSatd8x8_sse41
%ifdef X86_32
    ; r4 / r5 hold 3*stride below and are saved here on x86-32
    push        r4
    push        r5
%endif
    %assign push_num 2
    LOAD_4_PARA
    ; On Win64 the ABI requires xmm6-xmm15 to be preserved; PUSH_XMM saves
    ; the ones in use there (argument = highest xmm register used + 1).
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    movdqa      xmm7, [HSumSubDB1]
    lea         r4, [r1+r1*2]           ; 3 * first stride
    lea         r5, [r3+r3*2]           ; 3 * second stride
    pxor        xmm6, xmm6              ; SATD accumulator

    SSE41_GetSatd8x4                    ; rows 0-3
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    SSE41_GetSatd8x4                    ; rows 4-7

    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r5
    pop         r4
%endif
    ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t )
;   r0 / r1 = first  8x16 block and stride
;   r2 / r3 = second 8x16 block and stride
; Returns the SATD of the two blocks in retrd.
WELS_EXTERN WelsSampleSatd8x16_sse41
%ifdef X86_32
    ; r4 / r5 hold 3*stride, r6 is the loop counter; saved on x86-32
    push        r4
    push        r5
    push        r6
%endif
    %assign push_num 3
    LOAD_4_PARA
    ; On Win64 the ABI requires xmm6-xmm15 to be preserved; PUSH_XMM saves
    ; the ones in use there (argument = highest xmm register used + 1).
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    movdqa      xmm7, [HSumSubDB1]
    lea         r4, [r1+r1*2]           ; 3 * first stride
    lea         r5, [r3+r3*2]           ; 3 * second stride
    pxor        xmm6, xmm6              ; SATD accumulator
    mov         r6, 0                   ; 8x4 strips done so far

loop_get_satd_8x16:
    SSE41_GetSatd8x4
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    inc         r6
    cmp         r6, 4                   ; four strips = 16 rows
    jl          loop_get_satd_8x16

    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
%endif
    ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
; int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t )
;   r0 / r1 = first  16x8 block and stride
;   r2 / r3 = second 16x8 block and stride
; Returns the SATD of the two blocks in retrd.
; The block is processed as two 8x8 halves (left, then right).
WELS_EXTERN WelsSampleSatd16x8_sse41
%ifdef X86_32
    ; r4 / r5 hold 3*stride below and are saved here on x86-32
    push        r4
    push        r5
%endif
    %assign push_num 2
    LOAD_4_PARA
    ; On Win64 the ABI requires xmm6-xmm15 to be preserved; PUSH_XMM saves
    ; the ones in use there (argument = highest xmm register used + 1).
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; remember the block origins for the right-hand half
    push        r0
    push        r2

    movdqa      xmm7, [HSumSubDB1]
    lea         r4, [r1+r1*2]           ; 3 * first stride
    lea         r5, [r3+r3*2]           ; 3 * second stride
    pxor        xmm6, xmm6              ; SATD accumulator

    ; ---- left 8 columns ----
    SSE41_GetSatd8x4                    ; rows 0-3
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    SSE41_GetSatd8x4                    ; rows 4-7

    ; ---- right 8 columns ----
    pop         r2
    pop         r0
    add         r0, 8
    add         r2, 8
    SSE41_GetSatd8x4                    ; rows 0-3
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    SSE41_GetSatd8x4                    ; rows 4-7

    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r5
    pop         r4
%endif
    ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
WELS_EXTERN WelsSampleSatd16x16_sse41
|
2014-01-05 13:16:22 +01:00
|
|
|
%ifdef X86_32
|
2014-01-05 13:11:41 +01:00
|
|
|
push r4
|
|
|
|
push r5
|
|
|
|
push r6
|
2014-01-05 13:16:22 +01:00
|
|
|
%endif
|
2014-01-05 13:11:41 +01:00
|
|
|
%assign push_num 3
|
|
|
|
LOAD_4_PARA
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
PUSH_XMM 8
|
2014-03-14 09:13:18 +01:00
|
|
|
SIGN_EXTENSION r1, r1d
|
|
|
|
SIGN_EXTENSION r3, r3d
|
2014-01-05 13:16:22 +01:00
|
|
|
|
2014-01-05 13:11:41 +01:00
|
|
|
push r0
|
|
|
|
push r2
|
2014-01-05 13:16:22 +01:00
|
|
|
|
2014-01-05 13:11:41 +01:00
|
|
|
movdqa xmm7, [HSumSubDB1]
|
|
|
|
lea r4, [r1+r1*2]
|
|
|
|
lea r5, [r3+r3*2]
|
|
|
|
pxor xmm6, xmm6
|
|
|
|
mov r6, 0
|
|
|
|
loop_get_satd_16x16_left:
|
|
|
|
SSE41_GetSatd8x4
|
|
|
|
lea r0, [r0+4*r1]
|
|
|
|
lea r2, [r2+4*r3]
|
|
|
|
inc r6
|
|
|
|
cmp r6, 4
|
|
|
|
jl loop_get_satd_16x16_left
|
|
|
|
|
|
|
|
pop r2
|
2014-01-05 13:16:22 +01:00
|
|
|
pop r0
|
2014-01-05 13:11:41 +01:00
|
|
|
add r0, 8
|
|
|
|
add r2, 8
|
|
|
|
mov r6, 0
|
|
|
|
loop_get_satd_16x16_right:
|
|
|
|
SSE41_GetSatd8x4
|
|
|
|
lea r0, [r0+4*r1]
|
|
|
|
lea r2, [r2+4*r3]
|
|
|
|
inc r6
|
|
|
|
cmp r6, 4
|
|
|
|
jl loop_get_satd_16x16_right
|
|
|
|
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
POP_XMM
|
2014-01-05 13:11:41 +01:00
|
|
|
LOAD_4_PARA_POP
|
|
|
|
%ifdef X86_32
|
|
|
|
pop r6
|
|
|
|
pop r5
|
|
|
|
pop r4
|
|
|
|
%endif
|
|
|
|
ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_satd_wxh_sse41 END
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_sad_wxh_sse2 BEGIN
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; SSE2_GetSad2x16
;   Steps r0 and r2 down two rows FIRST (r0 += 2*r1, r2 += 2*r3), then
;   adds the psadbw results of the two 16-byte rows found there into
;   xmm0.
;
;   In/Out : r0, r2 (advanced by two rows), xmm0 (running sum)
;   In     : r1, r3 (row steps)
;   Clobb  : xmm1, xmm2
;
;   The r0 rows are read through MOVDQ (defined elsewhere); the original
;   author notes that this operand must be 16-byte aligned. The r2 rows
;   are always read with movdqu.
;-----------------------------------------------------------------------
%macro SSE2_GetSad2x16 0
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]

    ; first row of the pair
    movdqu      xmm1, [r2]
    MOVDQ       xmm2, [r0]              ; r0 operand must be 16-byte aligned
    psadbw      xmm1, xmm2
    paddw       xmm0, xmm1

    ; second row of the pair
    movdqu      xmm1, [r2+r3]
    MOVDQ       xmm2, [r0+r1]
    psadbw      xmm1, xmm2
    paddw       xmm0, xmm1
%endmacro
|
|
|
|
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; SSE2_GetSad4x16
;   Adds the psadbw results of four consecutive 16-byte rows into xmm7.
;   Pointers are NOT advanced; the caller steps r0/r2 between uses.
;
;   In     : r0, r2 (row pointers), r1, r3 (row steps),
;            r4, r5 (caller-prepared offsets of the fourth row; the
;            caller in this file sets them to 3*r1 and 3*r3)
;   In/Out : xmm7 (running sum)
;   Clobb  : xmm0, xmm1, xmm2
;
;   r0 rows go through MOVDQ (defined elsewhere; the original author
;   notes the operand must be 16-byte aligned), r2 rows through movdqu.
;-----------------------------------------------------------------------
%macro SSE2_GetSad4x16 0
    ; row 0
    movdqu      xmm0, [r2]
    MOVDQ       xmm2, [r0]
    psadbw      xmm0, xmm2
    paddw       xmm7, xmm0

    ; row 1
    movdqu      xmm1, [r2+r3]
    MOVDQ       xmm2, [r0+r1]
    psadbw      xmm1, xmm2
    paddw       xmm7, xmm1

    ; row 2
    movdqu      xmm1, [r2+2*r3]
    MOVDQ       xmm2, [r0+2*r1]         ; r0 operand must be 16-byte aligned
    psadbw      xmm1, xmm2
    paddw       xmm7, xmm1

    ; row 3 (offsets supplied in r4 / r5)
    movdqu      xmm1, [r2+r5]
    MOVDQ       xmm2, [r0+r4]
    psadbw      xmm1, xmm2
    paddw       xmm7, xmm1
%endmacro
|
|
|
|
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; SSE2_GetSad8x4
;   Gathers four 8-byte rows from each of r0 and r2 (two rows per xmm
;   register via movq + movhps) and adds both psadbw results into xmm6.
;
;   In/Out : r0, r2 -- each is left advanced by TWO rows on exit, so a
;            caller has to add two more rows to reach the next group
;            of four; xmm6 (running sum)
;   In     : r1, r3 (row steps)
;   Clobb  : xmm0, xmm1, xmm2, xmm3
;-----------------------------------------------------------------------
%macro SSE2_GetSad8x4 0
    ; rows 0 and 1 -> low quadwords
    movq        xmm0, [r0]
    movq        xmm2, [r2]
    movq        xmm1, [r0+r1]
    movq        xmm3, [r2+r3]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]

    ; rows 2 and 3 -> high quadwords
    movhps      xmm0, [r0]
    movhps      xmm2, [r2]
    movhps      xmm1, [r0+r1]
    movhps      xmm3, [r2+r3]

    psadbw      xmm0, xmm2              ; rows 0 and 2
    psadbw      xmm1, xmm3              ; rows 1 and 3
    paddw       xmm6, xmm0
    paddw       xmm6, xmm1
%endmacro
|
|
|
|
|
|
|
|
;***********************************************************************
;
; int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;
; Alignment (from the original author's note): the first pointer can be
; 16-byte aligned and is read through MOVDQ; the third pointer cannot be
; assumed aligned and is read with movdqu.
;
; Sixteen rows are covered by four SSE2_GetSad4x16 invocations, with
; both pointers stepped four rows between them. xmm7 carries the running
; sum; r4/r5 hold 3*r1 / 3*r3 for the macro's fourth-row addressing.
; The 32-bit result is returned in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
%ifdef X86_32
    push        r4
    push        r5
%endif

    %assign     push_num 2
    LOAD_4_PARA
    PUSH_XMM    8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    lea         r4, [r1+r1*2]           ; r4 = 3 * r1
    lea         r5, [r3+r3*2]           ; r5 = 3 * r3

    pxor        xmm7, xmm7

    ; rows 0..11: three groups of four, advancing after each group
%rep 3
    SSE2_GetSad4x16
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
%endrep
    ; rows 12..15: last group, no advance needed afterwards
    SSE2_GetSad4x16

    ; psadbw leaves one partial sum per 64-bit half: fold high into low.
    movhlps     xmm0, xmm7
    paddw       xmm0, xmm7
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r5
    pop         r4
%endif
    ret
|
|
|
|
|
|
|
|
;***********************************************************************
;
; int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;
; Alignment (from the original author's note): the first pointer can be
; 16-byte aligned and is read through MOVDQ; the third pointer cannot be
; assumed aligned and is read with movdqu.
;
; Rows 0 and 1 are handled inline; rows 2..7 by three SSE2_GetSad2x16
; invocations, each of which advances r0/r2 by two rows before reading.
; Only xmm0..xmm2 are touched, so no PUSH_XMM/POP_XMM pair is used.
; The 32-bit result is returned in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x8_sse2
    %assign     push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; row 0 initialises the running sum in xmm0
    movdqu      xmm0, [r2]
    MOVDQ       xmm2, [r0]
    psadbw      xmm0, xmm2

    ; row 1
    movdqu      xmm1, [r2+r3]
    MOVDQ       xmm2, [r0+r1]
    psadbw      xmm1, xmm2
    paddw       xmm0, xmm1

    ; rows 2..7
%rep 3
    SSE2_GetSad2x16
%endrep

    ; fold the high-half partial sum into the low half
    movhlps     xmm1, xmm0
    paddw       xmm0, xmm1
    movd        retrd, xmm0
    LOAD_4_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; WelsSampleSad8x16_sse2
;   Takes the four LOAD_4_PARA arguments in r0..r3, with r1 and r3
;   sign-extended and used as row steps (presumably the same
;   pointer/stride/pointer/stride layout as the 16-wide variants --
;   confirm against the C prototype).
;
;   Sixteen 8-byte rows are covered by four SSE2_GetSad8x4 invocations.
;   The macro itself leaves r0/r2 two rows further on, so two more rows
;   are added between invocations. xmm6 carries the running sum, which
;   is why PUSH_XMM is given 7 (xmm0..xmm6 in use). The 32-bit result is
;   returned in retrd.
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSad8x16_sse2
    %assign     push_num 0
    LOAD_4_PARA
    PUSH_XMM    7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm6, xmm6

    ; rows 0..11
%rep 3
    SSE2_GetSad8x4
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
%endrep
    ; rows 12..15
    SSE2_GetSad8x4

    ; fold the high-half partial sum into the low half
    movhlps     xmm0, xmm6
    paddw       xmm0, xmm6
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; CACHE_SPLIT_CHECK address, width, cacheline
;   %1 : register holding the address -- DESTROYED (masked in place)
;   %2 : access width in bytes (assemble-time constant)
;   %3 : cache line size in bytes (assemble-time constant)
;
;   Masks %1 with 0x1f|(cacheline/2) and compares the result against
;   (32-width)|(cacheline/2). Only the flags are the output; the caller
;   branches on them. Example: width 8, cacheline 64 gives mask 0x3f and
;   limit 56, so "jle" is taken when the 8 bytes fit inside one 64-byte
;   line.
;-----------------------------------------------------------------------
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
    and         %1, (%3>>1)|0x1f
    cmp         %1, (%3>>1)|(32-%2)
%endmacro
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; Sad8x8_SplitRowPair  (private helper of WelsSampleSad8x8_sse21)
;   Handles two rows on the cache-split path.
;     xmm0 = row [r0] (low half) and row [r0+r1] (high half)
;     xmm1 = the matching two rows of the other block, rebuilt from the
;            8-byte-aligned quadwords at r2 and r5 (= r2 + 8):
;            (lower qword >> xmm5) | (upper qword << xmm6)
;     xmm7 += psadbw(xmm0, xmm1)
;   Pointers are not advanced. Clobbers xmm0, xmm1, xmm2.
;-----------------------------------------------------------------------
%macro Sad8x8_SplitRowPair 0
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]

    movq        xmm1, [r2]
    movq        xmm2, [r5]
    movhps      xmm1, [r2+r3]
    movhps      xmm2, [r5+r3]
    psrlq       xmm1, xmm5
    psllq       xmm2, xmm6
    por         xmm1, xmm2

    psadbw      xmm0, xmm1
    paddw       xmm7, xmm0
%endmacro

;-----------------------------------------------------------------------
; WelsSampleSad8x8_sse21
;   Arguments arg1..arg4 end up in r0..r3; r1 and r3 are sign-extended
;   and used as row steps. The 32-bit result is returned in retrd.
;
;   The third argument (r2) is first tested with
;   CACHE_SPLIT_CHECK r2, 8, 64:
;     * (r2 & 0x3f) <= 56 : an 8-byte row at r2 stays inside one 64-byte
;       line -> plain path using SSE2_GetSad8x4.
;     * otherwise         : every r2 row is assembled from two
;       8-byte-aligned quadword loads (see Sad8x8_SplitRowPair).
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSad8x8_sse21
    %assign     push_num 0
    mov         r2, arg3
    push        r2                      ; the check below masks r2 in place
    CACHE_SPLIT_CHECK r2, 8, 64
    jle near    .pixel_sad_8x8_nsplit

    ; ------------------------------------------------------------------
    ; split path
    ; ------------------------------------------------------------------
    pop         r2                      ; original pointer back
%ifdef X86_32
    push        r3
    push        r4
    push        r5
%endif
    %assign     push_num 3
    PUSH_XMM    8
    mov         r0, arg1
    mov         r1, arg2
    SIGN_EXTENSION r1, r1d
    pxor        xmm7, xmm7

    ; original author's register note: ecx r2, edx r4, edi r5
    mov         r5, r2
    and         r5, 0x07                ; r5 = byte offset inside the qword
    sub         r2, r5                  ; r2 rounded down to a multiple of 8
    mov         r4, 8
    sub         r4, r5                  ; r4 = 8 - offset

    shl         r5, 3                   ; byte counts -> bit counts
    shl         r4, 3
    movd        xmm5, r5d               ; right-shift count for the lower qword
    movd        xmm6, r4d               ; left-shift count for the upper qword
    mov         r5, 8
    add         r5, r2                  ; r5 = r2 + 8, the following qword
    mov         r3, arg4
    SIGN_EXTENSION r3, r3d

    ; rows 0..5, advancing all three pointers two rows per step
%rep 3
    Sad8x8_SplitRowPair
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    lea         r5, [r5+2*r3]
%endrep
    ; rows 6..7
    Sad8x8_SplitRowPair

    ; fold the high-half partial sum into the low half
    movhlps     xmm0, xmm7
    paddw       xmm0, xmm7
    movd        retrd, xmm0
    POP_XMM
%ifdef X86_32
    pop         r5
    pop         r4
    pop         r3
%endif
    ret

    ; ------------------------------------------------------------------
    ; no-split path
    ; ------------------------------------------------------------------
.pixel_sad_8x8_nsplit:
    pop         r2                      ; balance the push made before the check
    %assign     push_num 0
    LOAD_4_PARA
    PUSH_XMM    7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm6, xmm6

    SSE2_GetSad8x4                      ; rows 0..3 (leaves r0/r2 two rows on)
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSad8x4                      ; rows 4..7

    movhlps     xmm0, xmm6
    paddw       xmm0, xmm6
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_sad_wxh_sse2 END
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_sad_4_wxh_sse2 BEGIN
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; SSE2_Get4LW16Sad  s-1l, s, s+1l, d, address
;   %1 : xmm holding the 16-byte row one line above row s   (destroyed)
;   %2 : xmm holding row s                                  (preserved)
;   %3 : xmm holding the row one line below row s           (preserved)
;   %4 : xmm holding the 16 bytes at [%5]                   (destroyed)
;   %5 : address expression of the row loaded in %4
;
;   Accumulates four psadbw results:
;     xmm5 += sad(%1, [%5])
;     xmm4 += sad(%3, [%5])
;     xmm6 += sad(%2, [%5-1])
;     xmm7 += sad(%2, [%5+1])
;-----------------------------------------------------------------------
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
    psadbw      %1, %4
    paddw       xmm5, %1

    psadbw      %4, %3
    paddw       xmm4, %4

    movdqu      %4, [%5-1]              ; one byte before the row
    psadbw      %4, %2
    paddw       xmm6, %4

    movdqu      %4, [%5+1]              ; one byte after the row
    psadbw      %4, %2
    paddw       xmm7, %4
%endmacro
|
|
|
|
;-----------------------------------------------------------------------
; SadFour16x16_RowPair  a, b, c   (private helper of
;                                  WelsSampleSadFour16x16_sse2)
;   On entry %1 and %2 hold the two most recently loaded r0 rows.
;   Advances r0 and r2 by two rows, loads the next two r0 rows into %3
;   and %1 (in that order), and feeds two SSE2_Get4LW16Sad steps.
;   On exit the two most recent r0 rows are in %3 and %1, so the caller
;   rotates the register triple on every use. Clobbers xmm3.
;-----------------------------------------------------------------------
%macro SadFour16x16_RowPair 3
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      %3, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad %1, %2, %3, xmm3, r2
    movdqa      %1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad %2, %3, %1, xmm3, r2+r3
%endmacro

;-----------------------------------------------------------------------
; WelsSampleSadFour16x16_sse2
;   Five arguments via LOAD_5_PARA: r0/r1 and r2/r3 are pointer/step
;   pairs (steps sign-extended); r4 is the destination for the results.
;   Per the original accumulator comments, r2 is the reference block
;   (pRefMb) and r3 its stride.
;
;   Four sums are built over 16 rows of 16 bytes, r0 rows against:
;     xmm4 : reference shifted by -stride (one row up)
;     xmm5 : reference shifted by +stride (one row down)
;     xmm6 : reference shifted by -1 byte
;     xmm7 : reference shifted by +1 byte
;   and stored as four dwords {xmm4, xmm5, xmm6, xmm7} at [r4].
;
;   Alignment: r0 rows are read with movdqa and [r4] is written with
;   movdqa, so both must be 16-byte aligned.
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSadFour16x16_sse2
    %assign     push_num 0
    LOAD_5_PARA
    PUSH_XMM    8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm4, xmm4              ;sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5              ;sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6              ;sad pRefMb-1
    pxor        xmm7, xmm7              ;sad pRefMb+1

    ; ---- prologue: r0 rows 0 and 1 ----
    movdqa      xmm0, [r0]              ; r0 row 0
    sub         r2, r3                  ; r2 -> reference row -1
    movdqu      xmm3, [r2]
    psadbw      xmm3, xmm0              ; row 0 vs reference row -1
    paddw       xmm4, xmm3

    movdqa      xmm1, [r0+r1]           ; r0 row 1
    movdqu      xmm3, [r2+r3]
    psadbw      xmm3, xmm1              ; row 1 vs reference row 0
    paddw       xmm4, xmm3

    movdqu      xmm2, [r2+r3-1]
    psadbw      xmm2, xmm0              ; row 0 vs reference row 0, -1 byte
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+r3+1]
    psadbw      xmm3, xmm0              ; row 0 vs reference row 0, +1 byte
    paddw       xmm7, xmm3

    ; ---- steady state: seven row pairs (r0 rows 2..15) ----
    SadFour16x16_RowPair xmm0, xmm1, xmm2
    SadFour16x16_RowPair xmm2, xmm0, xmm1
    SadFour16x16_RowPair xmm1, xmm2, xmm0
    SadFour16x16_RowPair xmm0, xmm1, xmm2
    SadFour16x16_RowPair xmm2, xmm0, xmm1
    SadFour16x16_RowPair xmm1, xmm2, xmm0
    SadFour16x16_RowPair xmm0, xmm1, xmm2
    ; here: xmm2 = r0 row 14, xmm0 = r0 row 15

    ; ---- epilogue: terms that need reference rows 15 and 16 ----
    lea         r2, [r2+2*r3]           ; r2 -> reference row 15
    movdqu      xmm3, [r2]
    psadbw      xmm2, xmm3              ; row 14 vs reference row 15
    paddw       xmm5, xmm2

    movdqu      xmm2, [r2-1]
    psadbw      xmm2, xmm0              ; row 15 vs reference row 15, -1 byte
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+1]
    psadbw      xmm3, xmm0              ; row 15 vs reference row 15, +1 byte
    paddw       xmm7, xmm3

    movdqu      xmm3, [r2+r3]
    psadbw      xmm0, xmm3              ; row 15 vs reference row 16
    paddw       xmm5, xmm0

    ; ---- fold each accumulator's high half into its low half ----
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0

    ; ---- pack the four low dwords and store ----
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; SadFour16x8_RowPair  a, b, c   (private helper of
;                                 WelsSampleSadFour16x8_sse2)
;   On entry %1 and %2 hold the two most recently loaded r0 rows.
;   Advances r0 and r2 by two rows, loads the next two r0 rows into %3
;   and %1 (in that order), and feeds two SSE2_Get4LW16Sad steps.
;   On exit the two most recent r0 rows are in %3 and %1, so the caller
;   rotates the register triple on every use. Clobbers xmm3.
;-----------------------------------------------------------------------
%macro SadFour16x8_RowPair 3
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      %3, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad %1, %2, %3, xmm3, r2
    movdqa      %1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad %2, %3, %1, xmm3, r2+r3
%endmacro

;-----------------------------------------------------------------------
; WelsSampleSadFour16x8_sse2
;   Five arguments via LOAD_5_PARA: r0/r1 and r2/r3 are pointer/step
;   pairs (steps sign-extended); r4 is the destination for the results.
;   Per the original accumulator comments, r2 is the reference block
;   (pRefMb) and r3 its stride.
;
;   Four sums are built over 8 rows of 16 bytes, r0 rows against:
;     xmm4 : reference shifted by -stride (one row up)
;     xmm5 : reference shifted by +stride (one row down)
;     xmm6 : reference shifted by -1 byte
;     xmm7 : reference shifted by +1 byte
;   and stored as four dwords {xmm4, xmm5, xmm6, xmm7} at [r4].
;
;   Alignment: r0 rows are read with movdqa and [r4] is written with
;   movdqa, so both must be 16-byte aligned.
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSadFour16x8_sse2
    %assign     push_num 0
    LOAD_5_PARA
    PUSH_XMM    8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm4, xmm4              ;sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5              ;sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6              ;sad pRefMb-1
    pxor        xmm7, xmm7              ;sad pRefMb+1

    ; ---- prologue: r0 rows 0 and 1 ----
    movdqa      xmm0, [r0]              ; r0 row 0
    sub         r2, r3                  ; r2 -> reference row -1
    movdqu      xmm3, [r2]
    psadbw      xmm3, xmm0              ; row 0 vs reference row -1
    paddw       xmm4, xmm3

    movdqa      xmm1, [r0+r1]           ; r0 row 1
    movdqu      xmm3, [r2+r3]
    psadbw      xmm3, xmm1              ; row 1 vs reference row 0
    paddw       xmm4, xmm3

    movdqu      xmm2, [r2+r3-1]
    psadbw      xmm2, xmm0              ; row 0 vs reference row 0, -1 byte
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+r3+1]
    psadbw      xmm3, xmm0              ; row 0 vs reference row 0, +1 byte
    paddw       xmm7, xmm3

    ; ---- steady state: three row pairs (r0 rows 2..7) ----
    SadFour16x8_RowPair xmm0, xmm1, xmm2
    SadFour16x8_RowPair xmm2, xmm0, xmm1
    SadFour16x8_RowPair xmm1, xmm2, xmm0
    ; here: xmm0 = r0 row 6, xmm1 = r0 row 7

    ; ---- epilogue: terms that need reference rows 7 and 8 ----
    lea         r2, [r2+2*r3]           ; r2 -> reference row 7
    movdqu      xmm3, [r2]
    psadbw      xmm0, xmm3              ; row 6 vs reference row 7
    paddw       xmm5, xmm0

    movdqu      xmm0, [r2-1]
    psadbw      xmm0, xmm1              ; row 7 vs reference row 7, -1 byte
    paddw       xmm6, xmm0

    movdqu      xmm3, [r2+1]
    psadbw      xmm3, xmm1              ; row 7 vs reference row 7, +1 byte
    paddw       xmm7, xmm3

    movdqu      xmm3, [r2+r3]
    psadbw      xmm1, xmm3              ; row 7 vs reference row 8
    paddw       xmm5, xmm1

    ; ---- fold each accumulator's high half into its low half ----
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0

    ; ---- pack the four low dwords and store ----
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret
|
|
|
|
|
|
|
|
WELS_EXTERN WelsSampleSadFour8x16_sse2
|
|
|
|
%assign push_num 0
|
|
|
|
LOAD_5_PARA
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
PUSH_XMM 8
|
2014-03-14 09:13:18 +01:00
|
|
|
SIGN_EXTENSION r1, r1d
|
|
|
|
SIGN_EXTENSION r3, r3d
|
2014-01-05 13:11:41 +01:00
|
|
|
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
|
|
|
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
|
|
|
|
pxor xmm6, xmm6 ;sad pRefMb-1
|
|
|
|
pxor xmm7, xmm7 ;sad pRefMb+1
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movhps xmm0, [r0+r1]
|
|
|
|
sub r2, r3
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movhps xmm1, [r2-1]
|
|
|
|
movhps xmm3, [r2+1]
|
|
|
|
psadbw xmm1, xmm0
|
|
|
|
paddw xmm6, xmm1
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm7, xmm3
|
|
|
|
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm0, xmm3
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movhps xmm0, [r0+r1]
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movhps xmm1, [r2-1]
|
|
|
|
movhps xmm3, [r2+1]
|
|
|
|
|
|
|
|
psadbw xmm1, xmm0
|
|
|
|
paddw xmm6, xmm1
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm7, xmm3
|
|
|
|
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm0, xmm3
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movhps xmm0, [r0+r1]
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movhps xmm1, [r2-1]
|
|
|
|
movhps xmm3, [r2+1]
|
|
|
|
|
|
|
|
psadbw xmm1, xmm0
|
|
|
|
paddw xmm6, xmm1
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm7, xmm3
|
|
|
|
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm0, xmm3
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movhps xmm0, [r0+r1]
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movhps xmm1, [r2-1]
|
|
|
|
movhps xmm3, [r2+1]
|
|
|
|
|
|
|
|
psadbw xmm1, xmm0
|
|
|
|
paddw xmm6, xmm1
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm7, xmm3
|
|
|
|
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm0, xmm3
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movhps xmm0, [r0+r1]
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movhps xmm1, [r2-1]
|
|
|
|
movhps xmm3, [r2+1]
|
|
|
|
|
|
|
|
psadbw xmm1, xmm0
|
|
|
|
paddw xmm6, xmm1
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm7, xmm3
|
|
|
|
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm0, xmm3
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movhps xmm0, [r0+r1]
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movhps xmm1, [r2-1]
|
|
|
|
movhps xmm3, [r2+1]
|
|
|
|
|
|
|
|
psadbw xmm1, xmm0
|
|
|
|
paddw xmm6, xmm1
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm7, xmm3
|
|
|
|
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm0, xmm3
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movhps xmm0, [r0+r1]
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movhps xmm1, [r2-1]
|
|
|
|
movhps xmm3, [r2+1]
|
|
|
|
|
|
|
|
psadbw xmm1, xmm0
|
|
|
|
paddw xmm6, xmm1
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm7, xmm3
|
|
|
|
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm0, xmm3
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
|
|
|
|
movq xmm0, [r0]
|
|
|
|
movhps xmm0, [r0+r1]
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm4, xmm3
|
|
|
|
|
|
|
|
movq xmm1, [r2+r3-1]
|
|
|
|
movq xmm3, [r2+r3+1]
|
|
|
|
|
|
|
|
lea r0, [r0+2*r1]
|
|
|
|
lea r2, [r2+2*r3]
|
|
|
|
movhps xmm1, [r2-1]
|
|
|
|
movhps xmm3, [r2+1]
|
|
|
|
|
|
|
|
psadbw xmm1, xmm0
|
|
|
|
paddw xmm6, xmm1
|
|
|
|
psadbw xmm3, xmm0
|
|
|
|
paddw xmm7, xmm3
|
|
|
|
|
|
|
|
movq xmm3, [r2]
|
|
|
|
movhps xmm3, [r2+r3]
|
|
|
|
psadbw xmm0, xmm3
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
|
|
|
|
movhlps xmm0, xmm4
|
|
|
|
paddw xmm4, xmm0
|
|
|
|
movhlps xmm0, xmm5
|
|
|
|
paddw xmm5, xmm0
|
|
|
|
movhlps xmm0, xmm6
|
|
|
|
paddw xmm6, xmm0
|
|
|
|
movhlps xmm0, xmm7
|
|
|
|
paddw xmm7, xmm0
|
|
|
|
punpckldq xmm4, xmm5
|
|
|
|
punpckldq xmm6, xmm7
|
|
|
|
punpcklqdq xmm4, xmm6
|
|
|
|
movdqa [r4],xmm4
|
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64
According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.
This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.
This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.
This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee saved
xmm registers in an unexpected spot.
2014-03-14 09:29:53 +01:00
|
|
|
POP_XMM
|
2014-01-05 13:11:41 +01:00
|
|
|
LOAD_5_PARA_POP
|
|
|
|
ret
|
|
|
|
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; WelsSampleSadFour8x8_sse2
;
; Computes, in a single pass over an 8-byte-wide, 8-row source block,
; the SAD against four neighbouring positions of the reference block.
;
; Register roles after LOAD_5_PARA (presumably the five C arguments in
; order -- confirm against the macro definition):
;   r0 = source rows,        r1 = source stride (sign-extended below)
;   r2 = pRefMb,             r3 = i_stride_ref  (sign-extended below)
;   r4 = output, four dwords stored with movdqa (needs 16-byte alignment)
;
; Output layout at [r4]:
;   +0  SAD vs pRefMb - i_stride_ref
;   +4  SAD vs pRefMb + i_stride_ref
;   +8  SAD vs pRefMb - 1
;   +12 SAD vs pRefMb + 1
;
; Clobbers: xmm0, xmm1, xmm3-xmm7, flags; r0 and r2 are advanced.
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSadFour8x8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8                      ; xmm6/xmm7 are callee-saved on Win64; saved here, restored by POP_XMM
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; Running totals, one register per neighbour position.
    pxor        xmm4, xmm4          ; sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5          ; sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6          ; sad pRefMb-1
    pxor        xmm7, xmm7          ; sad pRefMb+1

    ; Prime xmm3 with the reference row above the block and the first
    ; row of the block itself.
    sub         r2, r3
    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]

    ; Each pass consumes two source rows. On entry xmm3 already holds the
    ; two reference rows sitting one line above them; on exit it holds the
    ; two reference rows one line below, which become the "above" rows of
    ; the next pass.
%rep 4
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]       ; xmm0 = two source rows
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3          ; accumulate "above" SAD

    movq        xmm1, [r2+r3-1]     ; first row, one pixel to the left
    movq        xmm3, [r2+r3+1]     ; first row, one pixel to the right

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]        ; second row, left
    movhps      xmm3, [r2+1]        ; second row, right

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1          ; accumulate "left" SAD
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3          ; accumulate "right" SAD

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]       ; two reference rows one line below
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0          ; accumulate "below" SAD
%endrep

    ; psadbw leaves one partial sum per 64-bit half; fold the upper half
    ; into the lower for each accumulator (xmm0 is scratch here).
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0

    ; Pack the four totals as dwords: above, below, left, right.
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4], xmm4

    POP_XMM
    LOAD_5_PARA_POP
    ret
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------
; WelsSampleSadFour4x4_sse2
;
; Computes the SAD of a 4-byte-wide, 4-row source block against four
; neighbouring positions of the reference block (one line up, one line
; down, one pixel left, one pixel right).
;
; Register roles after LOAD_5_PARA (presumably the five C arguments in
; order -- confirm against the macro definition):
;   r0 = source rows,     r1 = source stride    (sign-extended below)
;   r2 = reference rows,  r3 = reference stride (sign-extended below)
;   r4 = output, four dwords stored with movdqa (needs 16-byte alignment)
;
; Output layout at [r4]:
;   +0  SAD vs reference - stride   (-L)
;   +4  SAD vs reference + stride   (+L)
;   +8  SAD vs reference - 1        (-1)
;   +12 SAD vs reference + 1        (+1)
;
; Clobbers: xmm0-xmm5, flags; r0 and r2 are advanced.
; xmm6/xmm7 are used as scratch and are saved/restored via
; PUSH_XMM/POP_XMM, because the Win64 ABI treats xmm6-xmm15 as
; callee-saved. The save/restore placement mirrors
; WelsSampleSadFour8x8_sse2 in this file.
;-----------------------------------------------------------------------
WELS_EXTERN WelsSampleSadFour4x4_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8                      ; highest xmm register used is xmm7 -> argument is 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; xmm0 = the four 4-byte source rows packed into one register.
    movd        xmm0, [r0]
    movd        xmm1, [r0+r1]
    lea         r0, [r0+2*r1]
    movd        xmm2, [r0]
    movd        xmm3, [r0+r1]
    punpckldq   xmm0, xmm1
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm0, xmm2

    ; Gather the reference rows. Row numbers below are relative to the
    ; incoming r2 (row 0 = first reference row).
    sub         r2, r3              ; r2 -> row -1
    movd        xmm1, [r2]
    movd        xmm2, [r2+r3]
    punpckldq   xmm1, xmm2          ; xmm1 = rows -1, 0
    movd        xmm2, [r2+r3-1]     ; row 0, shifted left
    movd        xmm3, [r2+r3+1]     ; row 0, shifted right

    lea         r2, [r2+2*r3]       ; r2 -> row 1

    movd        xmm4, [r2]
    movd        xmm5, [r2-1]
    punpckldq   xmm2, xmm5          ; xmm2 = rows 0, 1 shifted left
    movd        xmm5, [r2+1]
    punpckldq   xmm3, xmm5          ; xmm3 = rows 0, 1 shifted right

    movd        xmm5, [r2+r3]
    punpckldq   xmm4, xmm5          ; xmm4 = rows 1, 2

    punpcklqdq  xmm1, xmm4          ;-L : rows -1..2

    movd        xmm5, [r2+r3-1]     ; row 2, shifted left
    movd        xmm6, [r2+r3+1]     ; row 2, shifted right

    lea         r2, [r2+2*r3]       ; r2 -> row 3
    movd        xmm7, [r2-1]
    punpckldq   xmm5, xmm7
    punpcklqdq  xmm2, xmm5          ;-1 : rows 0..3 shifted left
    movd        xmm7, [r2+1]
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm3, xmm6          ;+1 : rows 0..3 shifted right
    movd        xmm6, [r2]
    movd        xmm7, [r2+r3]
    punpckldq   xmm6, xmm7          ; rows 3, 4
    punpcklqdq  xmm4, xmm6          ;+L : rows 1..4

    ; One psadbw per candidate; each yields two partial sums (one per
    ; 64-bit half).
    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0
    psadbw      xmm4, xmm0

    ; Fold the upper partial sum into the lower one (xmm0 is scratch now;
    ; only the low dword of each result is used afterwards).
    movhlps     xmm0, xmm1
    paddw       xmm1, xmm0
    movhlps     xmm0, xmm2
    paddw       xmm2, xmm0
    movhlps     xmm0, xmm3
    paddw       xmm3, xmm0
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0

    ; Pack the four totals as dwords: -L, +L, -1, +1.
    punpckldq   xmm1, xmm4
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm1, xmm2
    movdqa      [r4], xmm1

    POP_XMM
    LOAD_5_PARA_POP
    ret
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
;
|
|
|
|
;Pixel_sad_4_wxh_sse2 END
|
|
|
|
;
|
|
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
;***********************************************************************
|
2014-03-09 11:28:36 +01:00
|
|
|
; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
|
2014-01-05 13:11:41 +01:00
|
|
|
;***********************************************************************
|
2014-03-16 12:23:24 +01:00
|
|
|
WELS_EXTERN WelsSampleSad4x4_mmx
    ; Register roles after LOAD_4_PARA (presumably the four C arguments in
    ; order -- confirm against the macro definition):
    ;   r0 / r1 = first block pointer / stride
    ;   r2 / r3 = second block pointer / stride
    ; The 4x4 SAD is delivered through retrd.
    ; Clobbers: mm0-mm4, flags; r0 and r2 are advanced by two rows.
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; ---- rows 0 and 1: pack two 4-byte rows per register, then SAD ----
    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    movd        mm0, [r0]
    movd        mm1, [r0+r1]
    punpckldq   mm3, mm4
    punpckldq   mm0, mm1
    psadbw      mm0, mm3            ; mm0 = SAD of rows 0-1

    lea         r2, [r2+2*r3]
    lea         r0, [r0+2*r1]

    ; ---- rows 2 and 3 ----
    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    movd        mm1, [r0]
    movd        mm2, [r0+r1]
    punpckldq   mm3, mm4
    punpckldq   mm1, mm2
    psadbw      mm1, mm3            ; mm1 = SAD of rows 2-3
    paddw       mm0, mm1            ; mm0 = SAD of all four rows

    movd        retrd, mm0          ; low 32 bits of mm0 become the result

    WELSEMMS                        ; project macro; presumably leaves MMX state (emms) -- confirm
    LOAD_4_PARA_POP
    ret
|