;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  score.asm
;*
;*  Abstract
;*      scan/score/count of sse2
;*
;*  History
;*      8/21/2009 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Macros
;***********************************************************************

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
SECTION .rodata align=16

;align 16
;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2

; Eight 16-bit ones / sixteen 8-bit ones.
align 16
sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
align 16
sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

; 16-entry byte table read by WelsCalculateSingleCtr4x4_sse2; the index is
; (first set bit found in mask bits 8..15) - (last set bit found in mask
; bits 7..1) - 1.
i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

align 16
sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8
align 16
sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1

; pshufb byte-selection masks. The *acdc* pair is consumed by
; WelsScan4x4DcAc_ssse3 below; the *dc* pair is not referenced in this part
; of the file.
align 16
pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13
align 16
pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15
align 16
pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1
align 16
pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128

; 256-entry byte table: entry i holds the number of set bits in i.
; Indexed with an 8-bit "coefficient is non-zero" mask by
; WelsGetNoneZeroCount_sse2.
align 16
nozero_count_table:
db 0,1,1,2,1,2,2,3,1,2
db 2,3,2,3,3,4,1,2,2,3
db 2,3,3,4,2,3,3,4,3,4
db 4,5,1,2,2,3,2,3,3,4
db 2,3,3,4,3,4,4,5,2,3
db 3,4,3,4,4,5,3,4,4,5
db 4,5,5,6,1,2,2,3,2,3
db 3,4,2,3,3,4,3,4,4,5
db 2,3,3,4,3,4,4,5,3,4
db 4,5,4,5,5,6,2,3,3,4
db 3,4,4,5,3,4,4,5,4,5
db 5,6,3,4,4,5,4,5,5,6
db 4,5,5,6,5,6,6,7,1,2
db 2,3,2,3,3,4,2,3,3,4
db 3,4,4,5,2,3,3,4,3,4
db 4,5,3,4,4,5,4,5,5,6
db 2,3,3,4,3,4,4,5,3,4
db 4,5,4,5,5,6,3,4,4,5
db 4,5,5,6,4,5,5,6,5,6
db 6,7,2,3,3,4,3,4,4,5
db 3,4,4,5,4,5,5,6,3,4
db 4,5,4,5,5,6,4,5,5,6
db 5,6,6,7,3,4,4,5,4,5
db 5,6,4,5,5,6,5,6,6,7
db 4,5,5,6,5,6,6,7,5,6
db 6,7,6,7,7,8

; 256-entry byte table indexed by bits 8..15 of the non-zero mask in
; WelsCalculateSingleCtr4x4_sse2.
align 16
high_mask_table:
db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
db 5, 8, 5, 7, 8,11, 6, 8, 8,11
db 9,11,12,15, 0, 1, 1, 4, 1, 3
db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
db 7,10, 8,10,11,14, 3, 4, 4, 7
db 5, 7, 8,11, 5, 7, 7,10, 8,10
db 11,14, 6, 7, 8,11, 8,10,11,14
db 9,11,11,14,12,14,15,18, 0, 0
db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
db 7,10, 5, 7, 7,10, 8,10,11,14
db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
db 6, 9, 7, 9,10,13, 5, 6, 7,10
db 7, 9,10,13, 8,10,10,13,11,13
db 14,17, 3, 4, 4, 7, 4, 6, 7,10
db 5, 7, 7,10, 8,10,11,14, 5, 6
db 7,10, 7, 9,10,13, 8,10,10,13
db 11,13,14,17, 6, 7, 7,10, 8,10
db 11,14, 8,10,10,13,11,13,14,17
db 9,10,11,14,11,13,14,17,12,14
db 14,17,15,17,18,21

; 256-entry byte table indexed by bits 0..7 of the non-zero mask in
; WelsCalculateSingleCtr4x4_sse2.
align 16
low_mask_table:
db 0, 3, 2, 6, 2, 5, 5, 9, 1, 5
db 4, 8, 5, 8, 8,12, 1, 4, 4, 8
db 4, 7, 7,11, 4, 8, 7,11, 8,11
db 11,15, 1, 4, 3, 7, 4, 7, 7,11
db 3, 7, 6,10, 7,10,10,14, 4, 7
db 7,11, 7,10,10,14, 7,11,10,14
db 11,14,14,18, 0, 4, 3, 7, 3, 6
db 6,10, 3, 7, 6,10, 7,10,10,14
db 3, 6, 6,10, 6, 9, 9,13, 6,10
db 9,13,10,13,13,17, 4, 7, 6,10
db 7,10,10,14, 6,10, 9,13,10,13
db 13,17, 7,10,10,14,10,13,13,17
db 10,14,13,17,14,17,17,21, 0, 3
db 3, 7, 3, 6, 6,10, 2, 6, 5, 9
db 6, 9, 9,13, 3, 6, 6,10, 6, 9
db 9,13, 6,10, 9,13,10,13,13,17
db 3, 6, 5, 9, 6, 9, 9,13, 5, 9
db 8,12, 9,12,12,16, 6, 9, 9,13
db 9,12,12,16, 9,13,12,16,13,16
db 16,20, 3, 7, 6,10, 6, 9, 9,13
db 6,10, 9,13,10,13,13,17, 6, 9
db 9,13, 9,12,12,16, 9,13,12,16
db 13,16,16,20, 7,10, 9,13,10,13
db 13,17, 9,13,12,16,13,16,16,20
db 10,13,13,17,13,16,16,20,13,17
db 16,20,17,20,20,24


SECTION .text

;***********************************************************************
; void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;
; Reads sixteen 16-bit words from pDct and stores them to level[] in the
; order 0 1 4 8 5 2 3 6 9 c d a 7 b e f (source word indices, hex).
; Both pointers are accessed with movdqa, so they must be 16-byte aligned.
;
; After LOAD_2_PARA: r0 = level, r1 = pDct. r2 and r3 are scratch; r3 is
; saved/restored on X86_32 builds only. push_num records how many words
; were pushed before LOAD_2_PARA (presumably so the macro can locate the
; stack arguments -- the macro lives in asm_inc.asm).
; Lane diagrams in the comments list words from high (7) to low (0).
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_sse2
WelsScan4x4DcAc_sse2:
    %ifdef X86_32
    push        r3
    %assign push_num 1
    %else
    %assign push_num 0
    %endif
    LOAD_2_PARA

    ; Load all sixteen words up front; r1 is recycled as scratch below.
    movdqa      xmm0, [r1]              ; 7 6 5 4 3 2 1 0
    movdqa      xmm1, [r1+16]           ; f e d c b a 9 8

    ; Exchange words across the two halves through general registers.
    pextrw      r2d, xmm0, 7            ; r2 = word 7
    pextrw      r3d, xmm1, 2            ; r3 = word a
    pextrw      r1d, xmm0, 5            ; r1 = word 5
    pinsrw      xmm1, r2d, 2            ; f e d c b 7 9 8
    pinsrw      xmm0, r1d, 7            ; 5 6 5 4 3 2 1 0
    pextrw      r2d, xmm1, 0            ; r2 = word 8
    pinsrw      xmm0, r2d, 5            ; 5 6 8 4 3 2 1 0
    pinsrw      xmm1, r3d, 0            ; f e d c b 7 9 a

    ; Swap the two middle dwords, then rotate one 4-word half of each.
    pshufd      xmm2, xmm0, 0xd8        ; 5 6 3 2 8 4 1 0
    pshufd      xmm3, xmm1, 0xd8        ; f e b 7 d c 9 a
    pshufhw     xmm0, xmm2, 0x93        ; 6 3 2 5 8 4 1 0
    pshuflw     xmm1, xmm3, 0x39        ; f e b 7 a d c 9

    movdqa      [r0], xmm0
    movdqa      [r0+16], xmm1
    %ifdef X86_32
    pop         r3
    %endif
    ret

;***********************************************************************
; void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;
; SSSE3 variant of the scan above: words 7 and 8 are exchanged between
; the two halves, then each half is permuted with a single pshufb using
; the pb_scanacdc_mask{a,b} tables.
; After LOAD_2_PARA: r0 = level, r1 = pDct; both accessed with movdqa.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_ssse3
WelsScan4x4DcAc_ssse3:
    %assign push_num 0
    LOAD_2_PARA

    movdqa      xmm0, [r1]
    movdqa      xmm1, [r1+16]

    pextrw      r2d, xmm0, 7            ; r2 = word 7
    pextrw      r1d, xmm1, 0            ; r1 = word 8
    pinsrw      xmm0, r1d, 7            ; low  half, slot 7 <- word 8
    pinsrw      xmm1, r2d, 0            ; high half, slot 0 <- word 7

    pshufb      xmm0, [pb_scanacdc_maska]
    pshufb      xmm1, [pb_scanacdc_maskb]

    movdqa      [r0], xmm0
    movdqa      [r0+16], xmm1
    ret

;***********************************************************************
; void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;
; Reorders the sixteen words at pDct and stores the result shifted down
; by one word: the first word of the reordered sequence is dropped
; (presumably the DC term, going by the function name) and the last
; output word is zero.
; After LOAD_2_PARA: r0 = zig_value, r1 = pDct; both accessed with movdqa.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4Ac_sse2
WelsScan4x4Ac_sse2:
    %assign push_num 0
    LOAD_2_PARA

    movdqa      xmm0, [r1]
    movdqa      xmm1, [r1+16]

    ; Interleave the input at qword, then dword granularity.
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm1
    punpckhqdq  xmm2, xmm1
    movdqa      xmm3, xmm0
    punpckldq   xmm0, xmm2
    punpckhdq   xmm3, xmm2

    ; Swap individual words between xmm0 and xmm3 through r1/r2
    ; (r1 is free now that both loads are done).
    pextrw      r1d, xmm0, 3
    pextrw      r2d, xmm0, 7
    pinsrw      xmm0, r1d, 7
    pextrw      r1d, xmm3, 4
    pinsrw      xmm3, r2d, 4
    pextrw      r2d, xmm3, 0
    pinsrw      xmm3, r1d, 0
    pinsrw      xmm0, r2d, 3

    ; Rotate the upper four words of one half and the lower four of the other.
    pshufhw     xmm1, xmm0, 0x93
    pshuflw     xmm2, xmm3, 0x39

    ; Shift the 32-byte result xmm2:xmm1 right by one word: the lowest
    ; word of xmm2 becomes the top word of xmm1, and zero enters at the top.
    movdqa      xmm3, xmm2
    psrldq      xmm2, 2
    psrldq      xmm1, 2
    pslldq      xmm3, 14
    por         xmm1, xmm3

    movdqa      [r0], xmm1
    movdqa      [r0+16], xmm2
    ret
;***********************************************************************
; int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;
; Builds a 16-bit mask with one bit per non-zero 16-bit word at pDct
; (bit i <-> word i) and returns the sum of three byte-table lookups:
;   i_ds_table     [ hi - lo - 1 ]
;   low_mask_table [ mask bits 0..7  ]
;   high_mask_table[ mask bits 8..15 ]
; where lo is the highest set bit among bits 7..1 (0 if none is set) and
; hi is the lowest set bit among bits 8..15 (16 if none is set).
;
; pDct is read with movdqa and must be 16-byte aligned.
; After LOAD_1_PARA: r0 = pDct. r1, r2, r3 are scratch; r3 is
; saved/restored on X86_32 builds only.
; On non-X86_32 builds the result is copied from r0d to retrd; on X86_32
; it is left in r0 (presumably r0 is the return register there -- see
; asm_inc.asm).
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
WelsCalculateSingleCtr4x4_sse2:
    %ifdef X86_32
    push        r3
    %assign push_num 1
    %else
    %assign push_num 0
    %endif
    LOAD_1_PARA

    ; Narrow the sixteen words to bytes. Signed saturation keeps every
    ; non-zero word non-zero, so a byte compare against zero is enough.
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+16]
    packsswb    xmm0, xmm1
    pxor        xmm3, xmm3
    pcmpeqb     xmm0, xmm3              ; 0xff where the coefficient is zero
    xor         r3, r3
    pmovmskb    r3d, xmm0
    xor         r3, 0xffff              ; r3 = mask of NON-zero coefficients

    xor         r0, r0                  ; r0b accumulates the result
    mov         r2, 7                   ; r2 walks bits 7 -> 1
    mov         r1, 8                   ; r1 walks bits 8 -> 15

    ; Find the highest set bit in the low byte. Bit 0 is never tested:
    ; the loop ends with r2 == 0 either way.
.scan_low_half:
    bt          r3, r2
    jc          .scan_high_half
    dec         r2
    jnz         .scan_low_half

    ; Find the lowest set bit in the high byte; r1 == 16 if there is none.
.scan_high_half:
    bt          r3, r1
    jc          .scan_done
    inc         r1
    cmp         r1, 16
    jb          .scan_high_half

.scan_done:
    ; r1 - r2 - 1 is in 0..15, matching the 16 entries of i_ds_table.
    sub         r1, r2
    sub         r1, 1
    lea         r2, [i_ds_table]
    add         r0b, [r2+r1]

    ; Split the mask into its two bytes and add the per-byte table values.
    mov         r1, r3
    shr         r1, 8
    and         r1, 0xff                ; r1 = mask bits 8..15
    and         r3, 0xff                ; r3 = mask bits 0..7
    lea         r2, [high_mask_table]
    add         r0b, [r2+r1]
    lea         r2, [low_mask_table]
    add         r0b, [r2+r3]

    %ifdef X86_32
    pop         r3
    %else
    mov         retrd, r0d
    %endif
    ret

;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;
; Returns how many of the sixteen 16-bit words at level are non-zero.
; level is read with movdqa and must be 16-byte aligned.
; After LOAD_1_PARA: r0 = level. r1 and r2 are scratch. The result is
; returned in retrq.
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsGetNoneZeroCount_sse2
WelsGetNoneZeroCount_sse2:
    %assign push_num 0
    LOAD_1_PARA

    pxor        xmm2, xmm2
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+16]
    pcmpeqw     xmm0, xmm2              ; 0xffff where the word is zero
    pcmpeqw     xmm1, xmm2
    packsswb    xmm1, xmm0              ; one byte per word (0x00 / 0xff)

    xor         r1, r1
    pmovmskb    r1d, xmm1
    xor         r1d, 0xffff             ; one bit per NON-zero word

    ; Split the 16-bit mask into two table indices. Bits 16 and above are
    ; already clear, so the shifted half needs no extra masking.
    mov         r2, r1
    shr         r2, 8
    and         r1, 0xff

    ; Bit-count each byte via the lookup table. Both loads are done before
    ; retrq is written.
    lea         r0, [nozero_count_table]
    movzx       r1, byte [r0+r1]
    movzx       r2, byte [r0+r2]
    mov         retrq, r2
    add         retrq, r1
    ret