openh264/codec/encoder/core/x86/score.asm
Martin Storsjö 57f6bcc4b0 Convert all tabs to spaces in assembly sources, unify indentation
Previously the assembly sources had mixed indentation consisting
of both spaces and tabs, making it quite hard to read unless
the right tab size was used in the editor.

Tabs have been interpreted as 4 spaces in most cases, matching
the surrounding code.
2014-06-01 01:35:43 +03:00

340 lines
10 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* score.asm
;*
;* Abstract
;* scan / score / non-zero-count routines for 4x4 coefficient blocks (SSE2 and SSSE3)
;*
;* History
;* 8/21/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Macros
;***********************************************************************
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
; Read-only constants and shuffle masks. Every item is placed after an
; explicit "align 16" except i_ds_table, which directly follows the 16 bytes
; of sse2_b1.
SECTION .rodata align=16
;align 16
;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
; Eight words of 1 and sixteen bytes of 1.
; NOTE(review): neither is referenced by the routines in this part of the file.
align 16
sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
align 16
sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
; 16-entry byte table read by WelsCalculateSingleCtr4x4_sse2. The index is
; (position of the bit found by the upward search from bit 8) minus
; (position found by the downward search from bit 7) minus 1, i.e. 0..15.
i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; Word ramps -7..0, 1..8 and 8..1.
; NOTE(review): not referenced by the routines in this part of the file.
align 16
sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8
align 16
sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1
; pshufb byte-index masks used by WelsScan4x4DcAc_ssse3: maska reorders the
; first eight words, maskb the last eight. Each pair of byte indices selects
; one 16-bit word.
align 16
pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13
align 16
pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15
; NOTE(review): the two masks below are not referenced by the routines in
; this part of the file; presumably they are pshufb masks as well, in which
; case the 128 entries (bit 7 set) would zero the corresponding bytes.
align 16
pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1
align 16
pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128
; nozero_count_table: 256 byte entries; entry i holds the number of set bits
; in the 8-bit value i (0 for index 0 ... 8 for index 255).
; WelsGetNoneZeroCount_sse2 looks up each half of its 16-bit non-zero mask
; here and adds the two results.
align 16
nozero_count_table:
db 0,1,1,2,1,2,2,3,1,2
db 2,3,2,3,3,4,1,2,2,3
db 2,3,3,4,2,3,3,4,3,4
db 4,5,1,2,2,3,2,3,3,4
db 2,3,3,4,3,4,4,5,2,3
db 3,4,3,4,4,5,3,4,4,5
db 4,5,5,6,1,2,2,3,2,3
db 3,4,2,3,3,4,3,4,4,5
db 2,3,3,4,3,4,4,5,3,4
db 4,5,4,5,5,6,2,3,3,4
db 3,4,4,5,3,4,4,5,4,5
db 5,6,3,4,4,5,4,5,5,6
db 4,5,5,6,5,6,6,7,1,2
db 2,3,2,3,3,4,2,3,3,4
db 3,4,4,5,2,3,3,4,3,4
db 4,5,3,4,4,5,4,5,5,6
db 2,3,3,4,3,4,4,5,3,4
db 4,5,4,5,5,6,3,4,4,5
db 4,5,5,6,4,5,5,6,5,6
db 6,7,2,3,3,4,3,4,4,5
db 3,4,4,5,4,5,5,6,3,4
db 4,5,4,5,5,6,4,5,5,6
db 5,6,6,7,3,4,4,5,4,5
db 5,6,4,5,5,6,5,6,6,7
db 4,5,5,6,5,6,6,7,5,6
db 6,7,6,7,7,8
; high_mask_table: 256 byte entries (25 rows of 10 plus a final row of 6).
; WelsCalculateSingleCtr4x4_sse2 indexes it with bits 8..15 of its 16-bit
; "coefficient is non-zero" mask and adds the entry to the returned score.
align 16
high_mask_table:
db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
db 5, 8, 5, 7, 8,11, 6, 8, 8,11
db 9,11,12,15, 0, 1, 1, 4, 1, 3
db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
db 7,10, 8,10,11,14, 3, 4, 4, 7
db 5, 7, 8,11, 5, 7, 7,10, 8,10
db 11,14, 6, 7, 8,11, 8,10,11,14
db 9,11,11,14,12,14,15,18, 0, 0
db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
db 7,10, 5, 7, 7,10, 8,10,11,14
db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
db 6, 9, 7, 9,10,13, 5, 6, 7,10
db 7, 9,10,13, 8,10,10,13,11,13
db 14,17, 3, 4, 4, 7, 4, 6, 7,10
db 5, 7, 7,10, 8,10,11,14, 5, 6
db 7,10, 7, 9,10,13, 8,10,10,13
db 11,13,14,17, 6, 7, 7,10, 8,10
db 11,14, 8,10,10,13,11,13,14,17
db 9,10,11,14,11,13,14,17,12,14
db 14,17,15,17,18,21
; low_mask_table: 256 byte entries (25 rows of 10 plus a final row of 6).
; WelsCalculateSingleCtr4x4_sse2 indexes it with bits 0..7 of its 16-bit
; "coefficient is non-zero" mask and adds the entry to the returned score.
align 16
low_mask_table:
db 0, 3, 2, 6, 2, 5, 5, 9, 1, 5
db 4, 8, 5, 8, 8,12, 1, 4, 4, 8
db 4, 7, 7,11, 4, 8, 7,11, 8,11
db 11,15, 1, 4, 3, 7, 4, 7, 7,11
db 3, 7, 6,10, 7,10,10,14, 4, 7
db 7,11, 7,10,10,14, 7,11,10,14
db 11,14,14,18, 0, 4, 3, 7, 3, 6
db 6,10, 3, 7, 6,10, 7,10,10,14
db 3, 6, 6,10, 6, 9, 9,13, 6,10
db 9,13,10,13,13,17, 4, 7, 6,10
db 7,10,10,14, 6,10, 9,13,10,13
db 13,17, 7,10,10,14,10,13,13,17
db 10,14,13,17,14,17,17,21, 0, 3
db 3, 7, 3, 6, 6,10, 2, 6, 5, 9
db 6, 9, 9,13, 3, 6, 6,10, 6, 9
db 9,13, 6,10, 9,13,10,13,13,17
db 3, 6, 5, 9, 6, 9, 9,13, 5, 9
db 8,12, 9,12,12,16, 6, 9, 9,13
db 9,12,12,16, 9,13,12,16,13,16
db 16,20, 3, 7, 6,10, 6, 9, 9,13
db 6,10, 9,13,10,13,13,17, 6, 9
db 9,13, 9,12,12,16, 9,13,12,16
db 13,16,16,20, 7,10, 9,13,10,13
db 13,17, 9,13,12,16,13,16,16,20
db 10,13,13,17,13,16,16,20,13,17
db 16,20,17,20,20,24
SECTION .text
;***********************************************************************
; void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;
; Copies the sixteen 16-bit words at pDct to level[] in the order
;     0 1 4 8 5 2 3 6 9 12 13 10 7 11 14 15
; Both buffers are accessed with movdqa and therefore must be 16-byte
; aligned.  The body reads through r1 and writes through r0, so
; LOAD_2_PARA is expected to leave level in r0 and pDct in r1.
; r1, r2 and r3 serve as scratch; r3 is saved/restored on X86_32.
; Lane comments list the words from the highest lane down to the lowest.
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_sse2
%ifdef X86_32
    push        r3
%assign push_num 1
%else
%assign push_num 0
%endif
    LOAD_2_PARA
    movdqa      xmm0, [r1]              ; 7 6 5 4 3 2 1 0
    movdqa      xmm1, [r1+16]           ; f e d c b a 9 8

    ; Pull out the words that have to move between lanes that the
    ; shuffles below cannot reach.  r1 is free once both loads are done.
    pextrw      r2d, xmm0, 7            ; r2d = word 7
    pextrw      r1d, xmm0, 5            ; r1d = word 5
    pextrw      r3d, xmm1, 2            ; r3d = word a

    pinsrw      xmm0, r1d, 7            ; 5 6 5 4 3 2 1 0
    pinsrw      xmm1, r2d, 2            ; f e d c b 7 9 8
    pextrw      r2d, xmm1, 0            ; r2d = word 8
    pinsrw      xmm1, r3d, 0            ; f e d c b 7 9 a
    pinsrw      xmm0, r2d, 5            ; 5 6 8 4 3 2 1 0

    ; First eight outputs.
    pshufd      xmm2, xmm0, 0xd8        ; 5 6 3 2 8 4 1 0
    pshufhw     xmm0, xmm2, 0x93        ; 6 3 2 5 8 4 1 0
    movdqa      [r0], xmm0

    ; Last eight outputs.
    pshufd      xmm3, xmm1, 0xd8        ; f e b 7 d c 9 a
    pshuflw     xmm1, xmm3, 0x39        ; f e b 7 a d c 9
    movdqa      [r0+16], xmm1
%ifdef X86_32
    pop         r3
%endif
    ret
;***********************************************************************
; void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;
; Copies the sixteen 16-bit words at pDct to level[] in the order
;     0 1 4 8 5 2 3 6 9 12 13 10 7 11 14 15
; Words 7 and 8 are first swapped between the two registers; after that
; each register is permuted on its own with one pshufb.
; Both buffers are accessed with movdqa and must be 16-byte aligned.
; The body reads through r1 and writes through r0, so LOAD_2_PARA is
; expected to leave level in r0 and pDct in r1.  r1 and r2 are scratch.
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_ssse3
%assign push_num 0
    LOAD_2_PARA
    movdqa      xmm1, [r1+16]                   ; words 8..f
    movdqa      xmm0, [r1]                      ; words 0..7

    ; Exchange word 7 (top of xmm0) with word 8 (bottom of xmm1).
    pextrw      r1d, xmm0, 7                    ; r1d = word 7
    pextrw      r2d, xmm1, 0                    ; r2d = word 8
    pinsrw      xmm1, r1d, 0                    ; xmm1 lane 0 = word 7
    pinsrw      xmm0, r2d, 7                    ; xmm0 lane 7 = word 8

    pshufb      xmm0, [pb_scanacdc_maska]       ; -> 0 1 4 8 5 2 3 6
    pshufb      xmm1, [pb_scanacdc_maskb]       ; -> 9 c d a 7 b e f
    movdqa      [r0+16], xmm1
    movdqa      [r0], xmm0
    ret
;***********************************************************************
; void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;
; Writes words 1 4 8 5 2 3 6 9 12 13 10 7 11 14 15 of pDct, in that
; order, to zig_value[0..14] and stores 0 in zig_value[15]; word 0 of
; pDct is not copied.
; Both buffers are accessed with movdqa and must be 16-byte aligned.
; The body reads through r1 and writes through r0, so LOAD_2_PARA is
; expected to leave zig_value in r0 and pDct in r1.  r1 and r2 are scratch.
; Lane comments list the words from the lowest lane up to the highest;
; '-' marks a zeroed lane.
;***********************************************************************
WELS_EXTERN WelsScan4x4Ac_sse2
%assign push_num 0
    LOAD_2_PARA
    movdqa      xmm0, [r1]              ; 0 1 2 3 4 5 6 7
    movdqa      xmm1, [r1+16]           ; 8 9 a b c d e f

    ; Interleave quadwords, then doublewords, of the two inputs.
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm1              ; 0 1 2 3 8 9 a b
    punpckhqdq  xmm2, xmm1              ; 4 5 6 7 c d e f
    movdqa      xmm3, xmm0
    punpckldq   xmm0, xmm2              ; 0 1 4 5 2 3 6 7
    punpckhdq   xmm3, xmm2              ; 8 9 c d a b e f

    ; Fix up the four words that are still in the wrong register/lane.
    pextrw      r1d, xmm0, 3            ; r1d = word 5
    pextrw      r2d, xmm0, 7            ; r2d = word 7
    pinsrw      xmm0, r1d, 7            ; 0 1 4 5 2 3 6 5
    pextrw      r1d, xmm3, 4            ; r1d = word a
    pinsrw      xmm3, r2d, 4            ; 8 9 c d 7 b e f
    pextrw      r2d, xmm3, 0            ; r2d = word 8
    pinsrw      xmm0, r2d, 3            ; 0 1 4 8 2 3 6 5
    pinsrw      xmm3, r1d, 0            ; a 9 c d 7 b e f

    pshufhw     xmm1, xmm0, 0x93        ; 0 1 4 8 5 2 3 6
    pshuflw     xmm2, xmm3, 0x39        ; 9 c d a 7 b e f

    ; Drop word 0: shift the 16-word sequence down by one word, carrying
    ; word 9 from the second register into the top of the first.
    psrldq      xmm1, 2                 ; 1 4 8 5 2 3 6 -
    movdqa      xmm3, xmm2
    pslldq      xmm3, 14                ; - - - - - - - 9
    psrldq      xmm2, 2                 ; c d a 7 b e f -
    por         xmm1, xmm3              ; 1 4 8 5 2 3 6 9

    movdqa      [r0], xmm1
    movdqa      [r0+16], xmm2
    ret
;***********************************************************************
; int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;
; Builds a 16-bit mask with bit i set when the i-th 16-bit coefficient at
; pDct is non-zero, then returns the sum of three table look-ups:
;   i_ds_table     [ hi - lo - 1 ]
;   low_mask_table [ mask bits 0..7  ]
;   high_mask_table[ mask bits 8..15 ]
; where lo is the result of the downward search from bit 7 and hi the
; result of the upward search from bit 8 (see the loops below).
; pDct is read with movdqa and must be 16-byte aligned.
; r0 holds pDct after LOAD_1_PARA and is then reused as the accumulator;
; r1, r2, r3 are scratch, with r3 saved/restored on X86_32.
;***********************************************************************
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
%ifdef X86_32
    push        r3
%assign push_num 1
%else
%assign push_num 0
%endif
    LOAD_1_PARA
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+16]
    packsswb    xmm0, xmm1              ; 16 words -> 16 signed-saturated bytes

    pxor        xmm3, xmm3
    xor         r3, r3
    pcmpeqb     xmm0, xmm3              ; 0xff in every byte that is zero
    pmovmskb    r3d, xmm0               ; bit i = 1 when byte i is zero
    xor         r3, 0xffff              ; invert: bit i = 1 when byte i is non-zero

    xor         r0, r0                  ; r0 = running score
    mov         r1, 8                   ; upward search starts at bit 8
    mov         r2, 7                   ; downward search starts at bit 7

    ; Walk r2 from 7 towards 1 until a set mask bit is found.  Bit 0 is
    ; never tested: the loop simply leaves r2 = 0 when bits 7..1 are clear.
.scan_low_down:
    bt          r3, r2
    jc          .scan_high_up
    dec         r2
    jnz         .scan_low_down

    ; Walk r1 from 8 to 15 until a set mask bit is found; r1 ends at 16
    ; when bits 8..15 are all clear.
.scan_high_up:
    bt          r3, r1
    jc          .scan_done
    inc         r1
    cmp         r1, 16
    jb          .scan_high_up

.scan_done:
    sub         r1, r2
    sub         r1, 1                   ; r1 = hi - lo - 1, in 0..15
    lea         r2, [i_ds_table]
    add         r0b, [r2+r1]

    ; Split the mask into its two bytes and add the per-byte table values.
    mov         r1, r3
    shr         r1, 8
    and         r1, 0xff                ; r1 = mask bits 8..15
    and         r3, 0xff                ; r3 = mask bits 0..7
    lea         r2, [low_mask_table]
    add         r0b, [r2+r3]
    lea         r2, [high_mask_table]
    add         r0b, [r2+r1]
%ifdef X86_32
    ; NOTE(review): no copy to retrd here, so r0 is presumably already the
    ; return register on X86_32 - confirm against asm_inc.asm.
    pop         r3
%else
    mov         retrd, r0d
%endif
    ret
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;
; Returns how many of the sixteen 16-bit words at level are non-zero.
; A 16-bit "word is non-zero" mask is built and each of its two bytes is
; looked up in nozero_count_table; the two table values are added.
; level is read with movdqa and must be 16-byte aligned.
; r0 holds level after LOAD_1_PARA and is then reused as the table base;
; r1 and r2 are scratch.
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2
%assign push_num 0
    LOAD_1_PARA
    pxor        xmm2, xmm2
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+16]
    pcmpeqw     xmm0, xmm2              ; 0xffff in every word that is zero
    pcmpeqw     xmm1, xmm2
    packsswb    xmm1, xmm0              ; one 0x00/0xff byte per word

    xor         r1, r1
    pmovmskb    r1d, xmm1               ; bit = 1 when the word is zero
    xor         r1d, 0xffff             ; invert: bit = 1 when the word is non-zero

    lea         r0, [nozero_count_table] ; level pointer is no longer needed
    mov         r2, r1
    shr         r2, 8                   ; r2 = upper mask byte (bits above 15 are 0)
    and         r1, 0xff                ; r1 = lower mask byte
    movzx       r2, byte [r0+r2]
    movzx       r1, byte [r0+r1]

    ; The return register is written only after both look-ups, because
    ; r0 is still needed as the table base until then.
    mov         retrq, r2
    add         retrq, r1
    ret