360 lines
10 KiB
NASM
360 lines
10 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* score.asm
|
|
;*
|
|
;* Abstract
|
|
;* scan/score/count of sse2
|
|
;*
|
|
;* History
|
|
;* 8/21/2009 Created
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
|
|
%include "asm_inc.asm"
|
|
|
|
;***********************************************************************
|
|
; Macros
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
; Local Data (Read Only)
|
|
;***********************************************************************
|
|
SECTION .rodata align=16

; Disabled constant kept from the original source:
;align 16
;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2

; Eight 16-bit words, each equal to 1.
align 16
sse2_1:
    times 8 dw 1

; Sixteen bytes, each equal to 1.
align 16
sse2_b1:
    times 16 db 1

; 16-entry byte table.  WelsCalculateSingleCtr4x4_sse2 indexes it with
; (r1 - r2 - 1), the distance between the two bit positions found by its
; scan loops.  It follows sse2_b1 directly, with no alignment directive
; in between.
i_ds_table:
    db  3, 2, 2, 1, 1, 1, 0, 0
    db  0, 0, 0, 0, 0, 0, 0, 0

; Word ramps: -7..0, 1..8 and 8..1.
align 16
sse2_plane_inc_minus:
    dw -7, -6, -5, -4, -3, -2, -1,  0

align 16
sse2_plane_inc:
    dw  1,  2,  3,  4,  5,  6,  7,  8

align 16
sse2_plane_dec:
    dw  8,  7,  6,  5,  4,  3,  2,  1

; Byte-shuffle control vectors.  pb_scanacdc_maska / pb_scanacdc_maskb are
; the pshufb operands used by WelsScan4x4DcAc_ssse3; the pb_scandc_* pair
; is not referenced by the code visible in this part of the file.
align 16
pb_scanacdc_maska:
    db  0,  1,  2,  3,  8,  9, 14, 15
    db 10, 11,  4,  5,  6,  7, 12, 13

align 16
pb_scanacdc_maskb:
    db  2,  3,  8,  9, 10, 11,  4,  5
    db  0,  1,  6,  7, 12, 13, 14, 15

align 16
pb_scandc_maska:
    db  2,  3,  8,  9, 14, 15, 10, 11
    db  4,  5,  6,  7, 12, 13,  0,  1

align 16
pb_scandc_maskb:
    db  8,  9, 10, 11,  4,  5,  0,  1
    db  6,  7, 12, 13, 14, 15, 128, 128
|
|
|
|
; 256-entry byte table: entry n holds the number of set bits in n.
; WelsGetNoneZeroCount_sse2 looks up each half of its 16-bit non-zero mask
; here and adds the two results.  One row per high nibble.
align 16
nozero_count_table:
    db 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4      ; 0x00-0x0f
    db 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5      ; 0x10-0x1f
    db 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5      ; 0x20-0x2f
    db 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6      ; 0x30-0x3f
    db 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5      ; 0x40-0x4f
    db 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6      ; 0x50-0x5f
    db 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6      ; 0x60-0x6f
    db 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7      ; 0x70-0x7f
    db 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5      ; 0x80-0x8f
    db 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6      ; 0x90-0x9f
    db 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6      ; 0xa0-0xaf
    db 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7      ; 0xb0-0xbf
    db 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6      ; 0xc0-0xcf
    db 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7      ; 0xd0-0xdf
    db 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7      ; 0xe0-0xef
    db 4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8      ; 0xf0-0xff
|
|
|
|
; 256-entry byte table, indexed in WelsCalculateSingleCtr4x4_sse2 by bits
; 8..15 of the non-zero coefficient mask.  One row per high nibble of the
; index.
; NOTE(review): spot checks (entries 3, 5, 9, 17, 129, 255) are consistent
; with "sum of i_ds_table[gap] over each pair of neighbouring set bits" --
; TODO confirm for the whole table.
align 16
high_mask_table:
    db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2, 2, 5, 3, 5, 6, 9      ; 0x00-0x0f
    db  0, 1, 2, 5, 2, 4, 5, 8, 3, 5, 5, 8, 6, 8, 9,12      ; 0x10-0x1f
    db  0, 1, 1, 4, 2, 4, 5, 8, 2, 4, 4, 7, 5, 7, 8,11      ; 0x20-0x2f
    db  3, 4, 5, 8, 5, 7, 8,11, 6, 8, 8,11, 9,11,12,15      ; 0x30-0x3f
    db  0, 1, 1, 4, 1, 3, 4, 7, 2, 4, 4, 7, 5, 7, 8,11      ; 0x40-0x4f
    db  2, 3, 4, 7, 4, 6, 7,10, 5, 7, 7,10, 8,10,11,14      ; 0x50-0x5f
    db  3, 4, 4, 7, 5, 7, 8,11, 5, 7, 7,10, 8,10,11,14      ; 0x60-0x6f
    db  6, 7, 8,11, 8,10,11,14, 9,11,11,14,12,14,15,18      ; 0x70-0x7f
    db  0, 0, 1, 4, 1, 3, 4, 7, 1, 3, 3, 6, 4, 6, 7,10      ; 0x80-0x8f
    db  2, 3, 4, 7, 4, 6, 7,10, 5, 7, 7,10, 8,10,11,14      ; 0x90-0x9f
    db  2, 3, 3, 6, 4, 6, 7,10, 4, 6, 6, 9, 7, 9,10,13      ; 0xa0-0xaf
    db  5, 6, 7,10, 7, 9,10,13, 8,10,10,13,11,13,14,17      ; 0xb0-0xbf
    db  3, 4, 4, 7, 4, 6, 7,10, 5, 7, 7,10, 8,10,11,14      ; 0xc0-0xcf
    db  5, 6, 7,10, 7, 9,10,13, 8,10,10,13,11,13,14,17      ; 0xd0-0xdf
    db  6, 7, 7,10, 8,10,11,14, 8,10,10,13,11,13,14,17      ; 0xe0-0xef
    db  9,10,11,14,11,13,14,17,12,14,14,17,15,17,18,21      ; 0xf0-0xff
|
|
|
|
; 256-entry byte table, indexed in WelsCalculateSingleCtr4x4_sse2 by bits
; 0..7 of the non-zero coefficient mask.  One row per high nibble of the
; index.
; NOTE(review): spot checks (entries 1, 2, 3, 8, 65, 255) are consistent
; with "sum of i_ds_table[gap] over neighbouring set bits, plus
; i_ds_table[index of the lowest set bit]" -- TODO confirm for the whole
; table.
align 16
low_mask_table:
    db  0, 3, 2, 6, 2, 5, 5, 9, 1, 5, 4, 8, 5, 8, 8,12      ; 0x00-0x0f
    db  1, 4, 4, 8, 4, 7, 7,11, 4, 8, 7,11, 8,11,11,15      ; 0x10-0x1f
    db  1, 4, 3, 7, 4, 7, 7,11, 3, 7, 6,10, 7,10,10,14      ; 0x20-0x2f
    db  4, 7, 7,11, 7,10,10,14, 7,11,10,14,11,14,14,18      ; 0x30-0x3f
    db  0, 4, 3, 7, 3, 6, 6,10, 3, 7, 6,10, 7,10,10,14      ; 0x40-0x4f
    db  3, 6, 6,10, 6, 9, 9,13, 6,10, 9,13,10,13,13,17      ; 0x50-0x5f
    db  4, 7, 6,10, 7,10,10,14, 6,10, 9,13,10,13,13,17      ; 0x60-0x6f
    db  7,10,10,14,10,13,13,17,10,14,13,17,14,17,17,21      ; 0x70-0x7f
    db  0, 3, 3, 7, 3, 6, 6,10, 2, 6, 5, 9, 6, 9, 9,13      ; 0x80-0x8f
    db  3, 6, 6,10, 6, 9, 9,13, 6,10, 9,13,10,13,13,17      ; 0x90-0x9f
    db  3, 6, 5, 9, 6, 9, 9,13, 5, 9, 8,12, 9,12,12,16      ; 0xa0-0xaf
    db  6, 9, 9,13, 9,12,12,16, 9,13,12,16,13,16,16,20      ; 0xb0-0xbf
    db  3, 7, 6,10, 6, 9, 9,13, 6,10, 9,13,10,13,13,17      ; 0xc0-0xcf
    db  6, 9, 9,13, 9,12,12,16, 9,13,12,16,13,16,16,20      ; 0xd0-0xdf
    db  7,10, 9,13,10,13,13,17, 9,13,12,16,13,16,16,20      ; 0xe0-0xef
    db 10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24      ; 0xf0-0xff
|
|
|
|
|
|
SECTION .text

;***********************************************************************
; void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;
; Copies the sixteen 16-bit values at pDct to level[] in the order
;     0 1 4 8 5 2 3 6   9 c d a 7 b e f      (source indices, hex)
; using SSE2 word inserts/extracts and shuffles.
;
; Both buffers are accessed with movdqa, so each must be 16-byte aligned.
; r0 is used as the destination and r1 as the source after LOAD_2_PARA
; (macro from asm_inc.asm -- presumably it loads the two arguments).
; Scratch: r1, r2, r3, xmm0-xmm3.  r3 is saved/restored on X86_32 only.
;
; Lane comments list words from the highest lane down to lane 0.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_sse2
WelsScan4x4DcAc_sse2:
%ifdef X86_32
    push        r3
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_2_PARA

    movdqa      xmm0, [r1]              ; xmm0 = 7 6 5 4 3 2 1 0
    movdqa      xmm1, [r1+16]           ; xmm1 = f e d c b a 9 8

    ; Pull out the words that have to cross between the two registers.
    ; r1 is free for reuse: both source loads are already done.
    pextrw      r2d, xmm0, 7            ; r2 = [7]
    pextrw      r1d, xmm0, 5            ; r1 = [5]
    pextrw      r3d, xmm1, 2            ; r3 = [a]

    pinsrw      xmm0, r1d, 7            ; xmm0 = 5 6 5 4 3 2 1 0
    pinsrw      xmm1, r2d, 2            ; xmm1 = f e d c b 7 9 8
    pextrw      r2d, xmm1, 0            ; r2 = [8]
    pinsrw      xmm1, r3d, 0            ; xmm1 = f e d c b 7 9 a
    pinsrw      xmm0, r2d, 5            ; xmm0 = 5 6 8 4 3 2 1 0

    ; First output half: swap the middle dwords, then rotate the high words.
    pshufd      xmm2, xmm0, 0xd8        ; xmm2 = 5 6 3 2 8 4 1 0
    pshufhw     xmm0, xmm2, 0x93        ; xmm0 = 6 3 2 5 8 4 1 0

    ; Second output half: swap the middle dwords, then rotate the low words.
    pshufd      xmm3, xmm1, 0xd8        ; xmm3 = f e b 7 d c 9 a
    pshuflw     xmm1, xmm3, 0x39        ; xmm1 = f e b 7 a d c 9

    movdqa      [r0], xmm0
    movdqa      [r0+16], xmm1
%ifdef X86_32
    pop         r3
%endif
    ret
|
|
|
|
;***********************************************************************
; void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;
; SSSE3 variant of the 4x4 scan: swaps words [7] and [8] between the two
; source registers, then lets one pshufb per register finish the
; permutation.  Resulting order of source indices (hex):
;     0 1 4 8 5 2 3 6   9 c d a 7 b e f
;
; Both buffers are accessed with movdqa, so each must be 16-byte aligned.
; r0 is used as the destination and r1 as the source after LOAD_2_PARA
; (macro from asm_inc.asm -- presumably it loads the two arguments).
; Scratch: r1, r2, xmm0, xmm1.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_ssse3
WelsScan4x4DcAc_ssse3:
    %assign push_num 0
    LOAD_2_PARA

    movdqa      xmm0, [r1]                      ; words [0]..[7]
    movdqa      xmm1, [r1+16]                   ; words [8]..[f]

    ; Exchange [7] and [8] so every output word already sits in the
    ; register it will be stored from.
    pextrw      r2d, xmm0, 7                    ; r2 = [7]
    pextrw      r1d, xmm1, 0                    ; r1 = [8] (source pointer no longer needed)
    pinsrw      xmm0, r1d, 7                    ; xmm0 lane 7 = [8]
    pinsrw      xmm1, r2d, 0                    ; xmm1 lane 0 = [7]

    pshufb      xmm0, [pb_scanacdc_maska]       ; -> 0 1 4 8 5 2 3 6 (lane 0 first)
    pshufb      xmm1, [pb_scanacdc_maskb]       ; -> 9 c d a 7 b e f (lane 0 first)

    movdqa      [r0], xmm0
    movdqa      [r0+16], xmm1
    ret
|
|
;***********************************************************************
; void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;
; Scan of a 4x4 block that drops source word [0]: the fifteen remaining
; words are written in the order
;     1 4 8 5 2 3 6 9   c d a 7 b e f        (source indices, hex)
; and the final output word (zig_value[15]) is written as zero.
;
; Both buffers are accessed with movdqa, so each must be 16-byte aligned.
; r0 is used as the destination and r1 as the source after LOAD_2_PARA
; (macro from asm_inc.asm -- presumably it loads the two arguments).
; Scratch: r1, r2, xmm0-xmm3.
;
; Lane comments list words from the highest lane down to lane 0;
; '-' marks a lane that has been zero-filled by a byte shift.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4Ac_sse2
WelsScan4x4Ac_sse2:
    %assign push_num 0
    LOAD_2_PARA

    movdqa      xmm0, [r1]              ; xmm0 = 7 6 5 4 3 2 1 0
    movdqa      xmm1, [r1+16]           ; xmm1 = f e d c b a 9 8

    ; Regroup by qword, then interleave dwords.
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm1              ; xmm0 = b a 9 8 3 2 1 0
    punpckhqdq  xmm2, xmm1              ; xmm2 = f e d c 7 6 5 4

    movdqa      xmm3, xmm0
    punpckldq   xmm0, xmm2              ; xmm0 = 7 6 3 2 5 4 1 0
    punpckhdq   xmm3, xmm2              ; xmm3 = f e b a d c 9 8

    ; Move the individual words that are still in the wrong register/lane.
    pextrw      r1d, xmm0, 3            ; r1 = [5]
    pextrw      r2d, xmm0, 7            ; r2 = [7]
    pinsrw      xmm0, r1d, 7            ; xmm0 = 5 6 3 2 5 4 1 0
    pextrw      r1d, xmm3, 4            ; r1 = [a]
    pinsrw      xmm3, r2d, 4            ; xmm3 = f e b 7 d c 9 8
    pextrw      r2d, xmm3, 0            ; r2 = [8]
    pinsrw      xmm3, r1d, 0            ; xmm3 = f e b 7 d c 9 a
    pinsrw      xmm0, r2d, 3            ; xmm0 = 5 6 3 2 8 4 1 0

    pshufhw     xmm1, xmm0, 0x93        ; xmm1 = 6 3 2 5 8 4 1 0
    pshuflw     xmm2, xmm3, 0x39        ; xmm2 = f e b 7 a d c 9

    ; Shift the whole 16-word sequence down by one word to discard [0]:
    ; the lowest word of the second half moves into the top lane of the
    ; first half, and the top lane of the second half becomes zero.
    psrldq      xmm1, 2                 ; xmm1 = - 6 3 2 5 8 4 1
    movdqa      xmm3, xmm2
    pslldq      xmm3, 14                ; xmm3 = 9 - - - - - - -
    por         xmm1, xmm3              ; xmm1 = 9 6 3 2 5 8 4 1
    psrldq      xmm2, 2                 ; xmm2 = - f e b 7 a d c

    movdqa      [r0], xmm1
    movdqa      [r0+16], xmm2
    ret
|
|
|
|
|
|
;***********************************************************************
; int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;
; Builds a 16-bit mask with bit i set when pDct[i] != 0, then returns
;     i_ds_table[gap] + low_mask_table[mask & 0xff] + high_mask_table[mask >> 8]
; where
;     gap = hi - lo - 1
;     hi  = index of the lowest set bit among bits 8..15 (16 if none)
;     lo  = index of the highest set bit among bits 7..1  (0 otherwise)
;
; pDct is read with movdqa, so it must be 16-byte aligned.
; Scratch: r0-r3, xmm0, xmm1, xmm3.  r3 is saved/restored on X86_32 only.
; On non-X86_32 builds the sum is copied from r0 to retrd; the X86_32 path
; returns with the sum left in r0 (presumably the return register there).
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
WelsCalculateSingleCtr4x4_sse2:
%ifdef X86_32
    push        r3
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_1_PARA
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+16]

    ; Signed saturation maps every non-zero word to a non-zero byte, so
    ; the byte compare below still distinguishes zero from non-zero.
    packsswb    xmm0, xmm1
    ; original note -- register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
    ; NOTE(review): this looks like a mapping from the pre-macro register
    ; names rather than the asm_inc.asm definitions -- confirm.
    pxor        xmm3, xmm3
    pcmpeqb     xmm0, xmm3              ; 0xff in every byte whose coefficient is 0
    ; No zeroing of r3 is needed first: pmovmskb writes the full register
    ; (upper bits are zero-filled).
    pmovmskb    r3d, xmm0               ; bit i = 1  <=>  pDct[i] == 0
    xor         r3, 0xffff              ; bit i = 1  <=>  pDct[i] != 0

    xor         r0, r0                  ; r0 = running score
    mov         r2, 7                   ; r2 walks bits 7 -> 1
    mov         r1, 8                   ; r1 walks bits 8 -> 15
.loop_low8_find1:
    bt          r3, r2
    jc          .loop_high8_find1
    dec         r2
    ; Bit 0 is never tested: reaching r2 == 0 gives the same index whether
    ; bit 0 is set or the whole low byte is clear.
    jnz         .loop_low8_find1
.loop_high8_find1:
    bt          r3, r1
    jc          .find1end
    inc         r1
    cmp         r1, 16
    jb          .loop_high8_find1
.find1end:
    sub         r1, r2
    sub         r1, 1                   ; r1 = gap between the two positions (0..15)
    lea         r2, [i_ds_table]
    add         r0b, [r2+r1]

    mov         r1, r3
    and         r3, 0xff                ; r3 = low byte of the mask
    shr         r1, 8
    and         r1, 0xff                ; r1 = high byte of the mask
    lea         r2, [low_mask_table]
    add         r0b, [r2+r3]
    lea         r2, [high_mask_table]
    add         r0b, [r2+r1]
%ifdef X86_32
    pop         r3
%else
    mov         retrd, r0d
%endif
    ret
|
|
|
|
|
|
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;
; Returns how many of the sixteen 16-bit values at level are non-zero.
; A 16-bit "non-zero" mask is built with pcmpeqw/packsswb/pmovmskb and the
; set bits of each mask byte are counted through nozero_count_table.
;
; level is read with movdqa, so it must be 16-byte aligned.
; Scratch: r0, r1, r2, xmm0-xmm2.  Result is left in retrq.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsGetNoneZeroCount_sse2
WelsGetNoneZeroCount_sse2:
    %assign push_num 0
    LOAD_1_PARA
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+16]
    pxor        xmm2, xmm2
    pcmpeqw     xmm0, xmm2              ; 0xffff in every word that is 0
    pcmpeqw     xmm1, xmm2
    packsswb    xmm1, xmm0              ; 0xffff/0x0000 words -> 0xff/0x00 bytes
    ; No zeroing of r1 is needed first: pmovmskb writes the full register
    ; (upper bits are zero-filled).
    pmovmskb    r1d, xmm1               ; bit set  <=>  value == 0
    xor         r1d, 0xffff             ; bit set  <=>  value != 0
    mov         r2, r1
    and         r1, 0xff                ; r1 = low mask byte
    shr         r2, 8                   ; r2 = high mask byte (bits above 15 are already 0)
    lea         r0, [nozero_count_table]
    movzx       r2, byte [r0+r2]
    movzx       r1, byte [r0+r1]
    mov         retrq, r2
    add         retrq, r1
    ret
|
|
|