;*!
;* \copy
;*     Copyright (c)  2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  vaa.asm
;*
;*  Abstract
;*      sse2 for pVaa routines
;*
;*  History
;*      04/14/2010 Created
;*      06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
;*      06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
;*      08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"

;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

; benchmarking shows this outperforms the phaddw (SSSE3) approach
%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
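    ; horizontally adds the eight words of %1; the total ends up in the low
    ; word of %1 (%2 is clobbered as scratch)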
    ; @sum_8x2 begin
    pshufd  %2, %1, 04Eh   ; 01001110 B
    paddw   %1, %2
    pshuflw %2, %1, 04Eh   ; 01001110 B
    paddw   %1, %2
    pshuflw %2, %1, 0B1h   ; 10110001 B
    paddw   %1, %2
    ; end of @sum_8x2
%endmacro ; END of SUM_WORD_8x2_SSE2

%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
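    ; averages each of the four 4x4 sub-blocks of the 16x4 region at [r0]
    ; (stride r1, with r2 = 2*stride and r3 = 3*stride); the four averages are
    ; left as words in the low 64 bits of %1, and xmm7 is expected to be zero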
    movdqa    %1, [r0]      ; line 0
    movdqa    %2, [r0+r1]   ; line 1
    movdqa    %3, %1
    punpcklbw %1, xmm7
    punpckhbw %3, xmm7
    movdqa    %4, %2
    punpcklbw %4, xmm7
    punpckhbw %2, xmm7
    paddw     %1, %4
    paddw     %2, %3
    movdqa    %3, [r0+r2]   ; line 2
    movdqa    %4, [r0+r3]   ; line 3
    movdqa    %5, %3
    punpcklbw %3, xmm7
    punpckhbw %5, xmm7
    movdqa    %6, %4
    punpcklbw %6, xmm7
    punpckhbw %4, xmm7
    paddw     %3, %6
    paddw     %4, %5
    paddw     %1, %3        ; block 0, 1
    paddw     %2, %4        ; block 2, 3
    pshufd    %3, %1, 0B1h
    pshufd    %4, %2, 0B1h
    paddw     %1, %3
    paddw     %2, %4
    movdqa    %3, %1
    movdqa    %4, %2
    pshuflw   %5, %1, 0B1h
    pshufhw   %6, %3, 0B1h
    paddw     %1, %5
    paddw     %3, %6
    pshuflw   %5, %2, 0B1h
    pshufhw   %6, %4, 0B1h
    paddw     %2, %5
    paddw     %4, %6
    punpcklwd %1, %2
    punpckhwd %3, %4
    punpcklwd %1, %3
    psraw     %1, $04
%endmacro

%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
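    ; same as VAA_AVG_BLOCK_SSE2, but the final reduction is folded into two
    ; phaddw instructions; xmm7 is expected to be zero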
    movdqa    %1, [r0]      ; line 0
    movdqa    %2, [r0+r1]   ; line 1
    movdqa    %3, %1
    punpcklbw %1, xmm7
    punpckhbw %3, xmm7
    movdqa    %4, %2
    punpcklbw %4, xmm7
    punpckhbw %2, xmm7
    paddw     %1, %4
    paddw     %2, %3
    movdqa    %3, [r0+r2]   ; line 2
    movdqa    %4, [r0+r3]   ; line 3
    movdqa    %5, %3
    punpcklbw %3, xmm7
    punpckhbw %5, xmm7
    movdqa    %6, %4
    punpcklbw %6, xmm7
    punpckhbw %4, xmm7
    paddw     %3, %6
    paddw     %4, %5
    paddw     %1, %3        ; block 0, 1
    paddw     %2, %4        ; block 2, 3
    phaddw    %1, %2        ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
    phaddw    %1, xmm7      ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
    psraw     %1, $04
%endmacro

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

; , 6/7/2010

;***********************************************************************
;   int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_sse2
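    ; averages every 4x4 sub-block of the 16x16 macroblock at pDataY, then
    ; returns sum(avg^2) - sum(avg)^2/16 over the sixteen averages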

    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d

%ifdef X86_32
    push r3
    push r4
    push r5
    push r6
    %assign push_num push_num+4
%endif

    mov r5, r7
    and r5, 0fh
    sub r7, r5
    sub r7, 32
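    ; r7 is used as the stack pointer here: align it down to 16 bytes and
    ; reserve 32 bytes of scratch for the sixteen word-sized block averages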

    mov r2, r1
    sal r2, $01             ; r2 = 2*iLineSize
    mov r3, r2
    add r3, r1              ; r3 = 3*iLineSize

    mov r4, r2
    sal r4, $01             ; r4 = 4*iLineSize

    pxor xmm7, xmm7

    ; loops
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+8], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+16], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+24], xmm0
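    ; the sixteen 4x4 block averages now sit as words in the scratch buffer at [r7]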

    movdqa xmm0, [r7]       ; block 0~7
    movdqa xmm1, [r7+16]    ; block 8~15
    movdqa xmm2, xmm0
    paddw  xmm0, xmm1
    SUM_WORD_8x2_SSE2 xmm0, xmm3

    pmullw    xmm1, xmm1
    pmullw    xmm2, xmm2
    movdqa    xmm3, xmm1
    movdqa    xmm4, xmm2
    punpcklwd xmm1, xmm7
    punpckhwd xmm3, xmm7
    punpcklwd xmm2, xmm7
    punpckhwd xmm4, xmm7
    paddd     xmm1, xmm2
    paddd     xmm3, xmm4
    paddd     xmm1, xmm3
    pshufd    xmm2, xmm1, 01Bh
    paddd     xmm1, xmm2
    pshufd    xmm2, xmm1, 0B1h
    paddd     xmm1, xmm2
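    ; low word of xmm0 = sum of the sixteen averages;
    ; each dword of xmm1 = sum of the squared averages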

    movd r2d, xmm0
    and  r2, 0ffffh         ; keep only the effective low word
    mov  r3, r2
    imul r2, r3
    sar  r2, $04
    movd retrd, xmm1
    sub  retrd, r2d
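    ; retrd = sum(avg^2) - sum(avg)^2/16, i.e. 16x the variance of the averages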

    add r7, 32
    add r7, r5

%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    POP_XMM

    ret

;***********************************************************************
;   int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
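    ; same computation as AnalysisVaaInfoIntra_sse2, but the 4x4 averaging uses
    ; the phaddw-based VAA_AVG_BLOCK_SSSE3 macro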

    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d

%ifdef X86_32
    push r3
    push r4
    push r5
    push r6
    %assign push_num push_num+4
%endif

    mov r5, r7
    and r5, 0fh
    sub r7, r5
    sub r7, 32

    mov r2, r1
    sal r2, $01             ; r2 = 2*iLineSize
    mov r3, r2
    add r3, r1              ; r3 = 3*iLineSize

    mov r4, r2
    sal r4, $01             ; r4 = 4*iLineSize

    pxor xmm7, xmm7

    ; loops
    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movq [r7+8], xmm1

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+16], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movq [r7+24], xmm1

    movdqa xmm0, [r7]
    movdqa xmm1, [r7+16]
    movdqa xmm2, xmm0
    paddw  xmm0, xmm1
    SUM_WORD_8x2_SSE2 xmm0, xmm3   ; better performance than the phaddw version

    pmullw    xmm1, xmm1
    pmullw    xmm2, xmm2
    movdqa    xmm3, xmm1
    movdqa    xmm4, xmm2
    punpcklwd xmm1, xmm7
    punpckhwd xmm3, xmm7
    punpcklwd xmm2, xmm7
    punpckhwd xmm4, xmm7
    paddd     xmm1, xmm2
    paddd     xmm3, xmm4
    paddd     xmm1, xmm3
    pshufd    xmm2, xmm1, 01Bh
    paddd     xmm1, xmm2
    pshufd    xmm2, xmm1, 0B1h
    paddd     xmm1, xmm2

    movd r2d, xmm0
    and  r2, 0ffffh         ; keep only the effective low word
    mov  r3, r2
    imul r2, r3
    sar  r2, $04
    movd retrd, xmm1
    sub  retrd, r2d

    add r7, 32
    add r7, r5
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    POP_XMM

    ret

;***********************************************************************
;   uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
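    ; averages the four 8x8 SADs and computes a coarse variance of the
    ; (SAD >> 6) values; below INTER_VARIANCE_SAD_THRESHOLD the routine
    ; returns 15, otherwise a 4-bit mask of the blocks whose SAD exceeds
    ; the average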
    %assign push_num 0
    LOAD_1_PARA
    movdqa xmm0, [r0]
    pshufd xmm1, xmm0, 01Bh
    paddd  xmm1, xmm0
    pshufd xmm2, xmm1, 0B1h
    paddd  xmm1, xmm2
    psrad  xmm1, 02h        ; iAverageSad
    movdqa xmm2, xmm1
    psrad  xmm2, 06h
    movdqa xmm3, xmm0       ; iSadBlock
    psrad  xmm3, 06h
    psubd  xmm3, xmm2
    pmulld xmm3, xmm3       ; [comment]: pmulld from SSE4.1 instruction sets
    pshufd xmm4, xmm3, 01Bh
    paddd  xmm4, xmm3
    pshufd xmm3, xmm4, 0B1h
    paddd  xmm3, xmm4
    movd   r0d, xmm3
    cmp    r0d, 20          ; INTER_VARIANCE_SAD_THRESHOLD

    jb near .threshold_exit
    pshufd   xmm0, xmm0, 01Bh
    pcmpgtd  xmm0, xmm1     ; iSadBlock > iAverageSad
    movmskps retrd, xmm0
    ret
.threshold_exit:
    mov retrd, 15
    ret

;***********************************************************************
;   uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse2
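    ; same decision logic as MdInterAnalysisVaaInfo_sse41, with the SSE4.1
    ; pmulld squaring emulated via pmuludq and shuffles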
    %assign push_num 0
    LOAD_1_PARA
    movdqa xmm0, [r0]
    pshufd xmm1, xmm0, 01Bh
    paddd  xmm1, xmm0
    pshufd xmm2, xmm1, 0B1h
    paddd  xmm1, xmm2
    psrad  xmm1, 02h        ; iAverageSad
    movdqa xmm2, xmm1
    psrad  xmm2, 06h
    movdqa xmm3, xmm0       ; iSadBlock
    psrad  xmm3, 06h
    psubd  xmm3, xmm2

    ; to replace pmulld functionality as below
    movdqa     xmm2, xmm3
    pmuludq    xmm2, xmm3
    pshufd     xmm4, xmm3, 0B1h
    pmuludq    xmm4, xmm4
    movdqa     xmm5, xmm2
    punpckldq  xmm5, xmm4
    punpckhdq  xmm2, xmm4
    punpcklqdq xmm5, xmm2
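    ; xmm5 now holds the four squared differences (what pmulld would produce)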

    pshufd xmm4, xmm5, 01Bh
    paddd  xmm4, xmm5
    pshufd xmm5, xmm4, 0B1h
    paddd  xmm5, xmm4

    movd r0d, xmm5
    cmp  r0d, 20            ; INTER_VARIANCE_SAD_THRESHOLD
    jb near .threshold_exit
    pshufd   xmm0, xmm0, 01Bh
    pcmpgtd  xmm0, xmm1     ; iSadBlock > iAverageSad
    movmskps retrd, xmm0
    ret
.threshold_exit:
    mov retrd, 15
    ret