; openh264/codec/encoder/core/asm/score.asm
; 2014-01-03 14:49:45 +08:00
;
; 360 lines
; 10 KiB
; NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* score.asm
;*
;* Abstract
;* scan/score/count of sse2
;*
;* History
;* 8/21/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Macros
;***********************************************************************
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
SECTION .rodata align=16
;align 16
;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
; sse2_1: eight 16-bit words, each 1.
; Not referenced by the routines visible in this file.
align 16
sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
; sse2_b1: sixteen bytes, each 1.
; Not referenced by the routines visible in this file.
align 16
sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
; i_ds_table: 16-entry byte table read by WelsCalculateSingleCtr4x4_sse2.
; That routine indexes it with (hi - lo - 1), where hi is the lowest set bit
; found in mask bits 8..15 (16 if none) and lo is the highest set bit found in
; mask bits 7..1 (0 if none), so the index is always in 0..15.
i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; sse2_plane_inc_minus / sse2_plane_inc / sse2_plane_dec: eight-word ramps
; (-7..0, 1..8 and 8..1). Not referenced by the routines visible in this file.
align 16
sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8
align 16
sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1
; pb_scanacdc_maska / pb_scanacdc_maskb: PSHUFB byte-index controls used by
; WelsScan4x4DcAc_ssse3. Each pair of bytes selects one 16-bit word:
;   maska picks source words 0,1,4,7,5,2,3,6
;   maskb picks source words 1,4,5,2,0,3,6,7
align 16
pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13
align 16
pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15
; pb_scandc_maska / pb_scandc_maskb: further byte-index controls in the same
; format. Not referenced by the routines visible in this file.
; The two trailing 128 entries of maskb have bit 7 set, which PSHUFB turns
; into zero bytes in the destination.
align 16
pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1
align 16
pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128
; nozero_count_table: 256 bytes; entry i holds the number of 1 bits in i
; (for example [0]=0, [3]=2, [128]=1, [255]=8).
; WelsGetNoneZeroCount_sse2 looks up each byte of its 16-bit nonzero mask
; here and adds the two results.
align 16
nozero_count_table:
db 0,1,1,2,1,2,2,3,1,2
db 2,3,2,3,3,4,1,2,2,3
db 2,3,3,4,2,3,3,4,3,4
db 4,5,1,2,2,3,2,3,3,4
db 2,3,3,4,3,4,4,5,2,3
db 3,4,3,4,4,5,3,4,4,5
db 4,5,5,6,1,2,2,3,2,3
db 3,4,2,3,3,4,3,4,4,5
db 2,3,3,4,3,4,4,5,3,4
db 4,5,4,5,5,6,2,3,3,4
db 3,4,4,5,3,4,4,5,4,5
db 5,6,3,4,4,5,4,5,5,6
db 4,5,5,6,5,6,6,7,1,2
db 2,3,2,3,3,4,2,3,3,4
db 3,4,4,5,2,3,3,4,3,4
db 4,5,3,4,4,5,4,5,5,6
db 2,3,3,4,3,4,4,5,3,4
db 4,5,4,5,5,6,3,4,4,5
db 4,5,5,6,4,5,5,6,5,6
db 6,7,2,3,3,4,3,4,4,5
db 3,4,4,5,4,5,5,6,3,4
db 4,5,4,5,5,6,4,5,5,6
db 5,6,6,7,3,4,4,5,4,5
db 5,6,4,5,5,6,5,6,6,7
db 4,5,5,6,5,6,6,7,5,6
db 6,7,6,7,7,8
; high_mask_table: 256-entry byte table.
; WelsCalculateSingleCtr4x4_sse2 indexes it with bits 8..15 of its nonzero
; mask (the bits for coefficients 8..15) and adds the entry to its result.
; NOTE(review): how the individual entries were derived is not visible in
; this file - confirm against the C reference before editing any value.
align 16
high_mask_table:
db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
db 5, 8, 5, 7, 8,11, 6, 8, 8,11
db 9,11,12,15, 0, 1, 1, 4, 1, 3
db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
db 7,10, 8,10,11,14, 3, 4, 4, 7
db 5, 7, 8,11, 5, 7, 7,10, 8,10
db 11,14, 6, 7, 8,11, 8,10,11,14
db 9,11,11,14,12,14,15,18, 0, 0
db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
db 7,10, 5, 7, 7,10, 8,10,11,14
db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
db 6, 9, 7, 9,10,13, 5, 6, 7,10
db 7, 9,10,13, 8,10,10,13,11,13
db 14,17, 3, 4, 4, 7, 4, 6, 7,10
db 5, 7, 7,10, 8,10,11,14, 5, 6
db 7,10, 7, 9,10,13, 8,10,10,13
db 11,13,14,17, 6, 7, 7,10, 8,10
db 11,14, 8,10,10,13,11,13,14,17
db 9,10,11,14,11,13,14,17,12,14
db 14,17,15,17,18,21
; low_mask_table: 256-entry byte table.
; WelsCalculateSingleCtr4x4_sse2 indexes it with bits 0..7 of its nonzero
; mask (the bits for coefficients 0..7) and adds the entry to its result.
; NOTE(review): how the individual entries were derived is not visible in
; this file - confirm against the C reference before editing any value.
align 16
low_mask_table:
db 0, 3, 2, 6, 2, 5, 5, 9, 1, 5
db 4, 8, 5, 8, 8,12, 1, 4, 4, 8
db 4, 7, 7,11, 4, 8, 7,11, 8,11
db 11,15, 1, 4, 3, 7, 4, 7, 7,11
db 3, 7, 6,10, 7,10,10,14, 4, 7
db 7,11, 7,10,10,14, 7,11,10,14
db 11,14,14,18, 0, 4, 3, 7, 3, 6
db 6,10, 3, 7, 6,10, 7,10,10,14
db 3, 6, 6,10, 6, 9, 9,13, 6,10
db 9,13,10,13,13,17, 4, 7, 6,10
db 7,10,10,14, 6,10, 9,13,10,13
db 13,17, 7,10,10,14,10,13,13,17
db 10,14,13,17,14,17,17,21, 0, 3
db 3, 7, 3, 6, 6,10, 2, 6, 5, 9
db 6, 9, 9,13, 3, 6, 6,10, 6, 9
db 9,13, 6,10, 9,13,10,13,13,17
db 3, 6, 5, 9, 6, 9, 9,13, 5, 9
db 8,12, 9,12,12,16, 6, 9, 9,13
db 9,12,12,16, 9,13,12,16,13,16
db 16,20, 3, 7, 6,10, 6, 9, 9,13
db 6,10, 9,13,10,13,13,17, 6, 9
db 9,13, 9,12,12,16, 9,13,12,16
db 13,16,16,20, 7,10, 9,13,10,13
db 13,17, 9,13,12,16,13,16,16,20
db 10,13,13,17,13,16,16,20,13,17
db 16,20,17,20,20,24
SECTION .text
;***********************************************************************
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;
; Copies the 16 words at pDct into level[] in the order
;   0 1 4 8 5 2 3 6 9 12 13 10 7 11 14 15
; using SSE2 word extract/insert followed by dword and word shuffles.
; Both buffers are accessed with movdqa, so each must be 16-byte aligned.
; Lane diagrams in the comments list source word indices (hex), written
; from word 7 on the left down to word 0 on the right.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_sse2
WelsScan4x4DcAc_sse2:
%ifdef X86_32
; r3 is used as scratch below and is saved only in the X86_32 build
; (presumably it maps to a callee-saved register there - see asm_inc.asm).
push r3
%assign push_num 1
%else
%assign push_num 0
%endif
; push_num is presumably what LOAD_2_PARA uses to locate stack arguments
; after the push above - confirm in asm_inc.asm.
LOAD_2_PARA
; From here r0 is used as level (destination) and r1 as pDct (source).
movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
movdqa xmm1, [r1+16] ; f e d c b a 9 8
; Both halves are loaded, so r1 can be reused as a scratch register.
pextrw r2d, xmm0, 7 ; r2d = word 7
pextrw r3d, xmm1, 2 ; r3d = word a
pextrw r1d, xmm0, 5 ; r1d = word 5
pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
pextrw r2d, xmm1, 0 ; r2d = word 8
pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
; 0xd8 swaps the two middle dwords of each register.
pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
; 0x93 rotates the four high words up by one position,
; 0x39 rotates the four low words down by one position.
pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
; Store level[0..7] and level[8..15].
movdqa [r0],xmm0
movdqa [r0+16], xmm1
%ifdef X86_32
pop r3
%endif
ret
;***********************************************************************
; void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;
; SSSE3 variant of the 4x4 scan: writes the 16 words of pDct to level[] in
; the order 0 1 4 8 5 2 3 6 9 12 13 10 7 11 14 15.
; Words 7 and 8 are exchanged between the two registers first, after which
; each register only needs an in-register byte shuffle (PSHUFB).
; Both buffers are accessed with movdqa and must be 16-byte aligned.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_ssse3
WelsScan4x4DcAc_ssse3:
%assign push_num 0
        LOAD_2_PARA                             ; r0 used as level, r1 as pDct

        ; Fetch both halves of the block.
        movdqa      xmm1, [r1+16]               ; words 8..15
        movdqa      xmm0, [r1]                  ; words 0..7

        ; Exchange word 7 and word 8 across the two registers.
        ; r1 is free to reuse as scratch now that both loads are done.
        pextrw      r1d,  xmm1, 0               ; r1d = word 8
        pextrw      r2d,  xmm0, 7               ; r2d = word 7
        pinsrw      xmm1, r2d,  0               ; xmm1 = 7 9 a b c d e f (low to high)
        pinsrw      xmm0, r1d,  7               ; xmm0 = 0 1 2 3 4 5 6 8 (low to high)

        ; Permute each half into its final order.
        pshufb      xmm0, [pb_scanacdc_maska]   ; 0 1 4 8 5 2 3 6
        pshufb      xmm1, [pb_scanacdc_maskb]   ; 9 c d a 7 b e f

        movdqa      [r0],    xmm0               ; level[0..7]
        movdqa      [r0+16], xmm1               ; level[8..15]
        ret
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;
; Writes words 1..15 of pDct to zig_value[0..14] in the order
;   1 4 8 5 2 3 6 9 12 13 10 7 11 14 15
; and stores 0 in zig_value[15]. Word 0 of pDct is not copied.
; Both buffers are accessed with movdqa and must be 16-byte aligned.
; Lane lists in the comments give source word indices from word 0 (left)
; to word 7 (right); "z" marks a zeroed word.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4Ac_sse2
WelsScan4x4Ac_sse2:
%assign push_num 0
LOAD_2_PARA
; From here r0 is used as zig_value (destination) and r1 as pDct (source).
movdqa xmm0, [r1] ; 0 1 2 3 4 5 6 7
movdqa xmm1, [r1+16] ; 8 9 10 11 12 13 14 15
movdqa xmm2, xmm0
punpcklqdq xmm0, xmm1 ; 0 1 2 3 8 9 10 11
punpckhqdq xmm2, xmm1 ; 4 5 6 7 12 13 14 15
movdqa xmm3, xmm0
punpckldq xmm0, xmm2 ; 0 1 4 5 2 3 6 7
punpckhdq xmm3, xmm2 ; 8 9 12 13 10 11 14 15
; r1 is reused as scratch from here on; both source halves are in registers.
pextrw r1d , xmm0, 3 ; r1d = word 5
pextrw r2d , xmm0, 7 ; r2d = word 7
pinsrw xmm0, r1d, 7 ; 0 1 4 5 2 3 6 5
pextrw r1d, xmm3, 4 ; r1d = word 10
pinsrw xmm3, r2d, 4 ; 8 9 12 13 7 11 14 15
pextrw r2d, xmm3, 0 ; r2d = word 8
pinsrw xmm3, r1d, 0 ; 10 9 12 13 7 11 14 15
pinsrw xmm0, r2d, 3 ; 0 1 4 8 2 3 6 5
; 0x93 rotates the four high words up by one, 0x39 rotates the four low
; words down by one; this yields the full 16-entry scan order in xmm1:xmm2.
pshufhw xmm1, xmm0, 0x93 ; 0 1 4 8 5 2 3 6
pshuflw xmm2, xmm3, 0x39 ; 9 12 13 10 7 11 14 15
; Drop word 0 by shifting the 16-word sequence down by one word:
; the first word of the upper half becomes the last word of the lower half.
movdqa xmm3, xmm2
psrldq xmm1, 2 ; 1 4 8 5 2 3 6 z
pslldq xmm3, 14 ; z z z z z z z 9
por xmm1, xmm3 ; 1 4 8 5 2 3 6 9
psrldq xmm2, 2 ; 12 13 10 7 11 14 15 z
; Store zig_value[0..7] and zig_value[8..15].
movdqa [r0],xmm1
movdqa [r0+16], xmm2
ret
;***********************************************************************
; int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;
; Builds a 16-bit mask with one bit per coefficient of the 4x4 block
; (bit i set when pDct[i] is nonzero) and returns the sum of three byte
; table lookups:
;     i_ds_table[hi - lo - 1]
;     low_mask_table[mask bits 0..7]
;     high_mask_table[mask bits 8..15]
; where lo is the highest set bit found in bits 7..1 (0 if none is set)
; and hi is the lowest set bit found in bits 8..15 (16 if none is set).
; pDct is read with movdqa and must be 16-byte aligned.
;
; Register roles after the loads:
;     r0 = running sum (accumulated in its low byte)
;     r1 = hi, later the upper mask byte
;     r2 = lo, later the table base address
;     r3 = nonzero mask
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
WelsCalculateSingleCtr4x4_sse2:
%ifdef X86_32
        push        r3                          ; r3 is preserved in the 32-bit build
%assign push_num 1
%else
%assign push_num 0
%endif
        LOAD_1_PARA                             ; r0 used as pDct

        ; Pack the 16 words to bytes with signed saturation, which keeps
        ; every nonzero word nonzero, then flag the zero bytes.
        pxor        xmm3, xmm3
        movdqa      xmm0, [r0]
        movdqa      xmm1, [r0+16]
        packsswb    xmm0, xmm1
        pcmpeqb     xmm0, xmm3                  ; 0xff where the coefficient is zero

        xor         r3,   r3
        pmovmskb    r3d,  xmm0                  ; bit i = 1 when pDct[i] == 0
        xor         r3,   0xffff                ; invert: bit i = 1 when pDct[i] != 0

        xor         r0,   r0                    ; sum = 0 (pointer no longer needed)
        mov         r1,   8                     ; hi search starts at bit 8
        mov         r2,   7                     ; lo search starts at bit 7

        ; Walk down from bit 7 until a set bit is found or the index hits 0.
.scan_down:
        bt          r3,   r2
        jc          .scan_up
        dec         r2
        jnz         .scan_down

        ; Walk up from bit 8 until a set bit is found or the index hits 16.
.scan_up:
        bt          r3,   r1
        jc          .scan_done
        inc         r1
        cmp         r1,   16
        jb          .scan_up

.scan_done:
        ; First term: table entry for the distance between the two hits.
        sub         r1,   r2
        sub         r1,   1                     ; r1 = hi - lo - 1, in 0..15
        lea         r2,   [i_ds_table]
        add         r0b,  [r2+r1]

        ; Split the mask into its two bytes.
        mov         r1,   r3
        shr         r1,   8
        and         r1,   0xff                  ; r1 = mask bits 8..15
        and         r3,   0xff                  ; r3 = mask bits 0..7

        ; Remaining two terms: one lookup per mask byte.
        lea         r2,   [high_mask_table]
        add         r0b,  [r2+r1]
        lea         r2,   [low_mask_table]
        add         r0b,  [r2+r3]

%ifdef X86_32
        pop         r3                          ; result is left in r0 in this build
%else
        mov         retrd, r0d                  ; copy the sum to the return register
%endif
        ret
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;
; Returns how many of the 16 words at level are nonzero.
; The words are compared against zero, the results are packed into a
; 16-bit mask, and the set bits of each mask byte are counted through
; nozero_count_table.
; level is read with movdqa and must be 16-byte aligned.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsGetNoneZeroCount_sse2
WelsGetNoneZeroCount_sse2:
%assign push_num 0
        LOAD_1_PARA                             ; r0 used as level

        ; Flag the zero coefficients: each word becomes 0xffff or 0.
        pxor        xmm2, xmm2
        movdqa      xmm1, [r0+16]               ; words 8..15
        movdqa      xmm0, [r0]                  ; words 0..7
        pcmpeqw     xmm1, xmm2
        pcmpeqw     xmm0, xmm2

        ; Narrow the 16 word flags to 16 byte flags
        ; (words 8..15 land in the low 8 bytes, words 0..7 in the high 8).
        packsswb    xmm1, xmm0

        ; The input pointer is no longer needed; reuse r0 as the table base.
        lea         r0,   [nozero_count_table]

        xor         r1,   r1
        pmovmskb    r1d,  xmm1                  ; bit = 1 where the word is zero
        xor         r1d,  0xffff                ; bit = 1 where the word is nonzero

        ; Split the mask into two table indices. The upper half needs no
        ; masking because everything above bit 15 is already zero.
        mov         r2,   r1
        shr         r2,   8                     ; r2 = mask bits 8..15
        and         r1,   0xff                  ; r1 = mask bits 0..7

        ; Count the set bits of each byte and add the two counts.
        movzx       r1,   byte [r0+r1]
        movzx       r2,   byte [r0+r2]
        mov         retrq, r1
        add         retrq, r2
        ret