d5a45ec513
Marking the stack as non-executable from within the assembly sources avoids having to add extra linker flags to specify it. This is similar to how it is already handled for the ARM assembly.
611 lines
14 KiB
NASM
611 lines
14 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* sse2inc.asm
|
|
;*
|
|
;* Abstract
|
|
;* macro and constant
|
|
;*
|
|
;* History
|
|
;* 8/5/2009 Created
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
;***********************************************************************
; Options, for DEBUG
;***********************************************************************

; MOVDQ: mnemonic used for 128-bit moves by code that references it.
; The active branch selects the aligned form (movdqa); change the
; condition to 0 to select the unaligned form (movdqu) instead.
%if 1
    %define MOVDQ       movdqa
%else
    %define MOVDQ       movdqu
%endif

; WELSEMMS: expands to emms in the active branch; change the condition
; to 0 to make it expand to nothing.
%if 1
    %define WELSEMMS    emms
%else
    %define WELSEMMS
%endif

;***********************************************************************
; Macros
;***********************************************************************

; Make memory operands RIP-relative by default in 64-bit mode.
DEFAULT REL
|
|
|
|
%ifdef WIN64 ; Windows x64 ;************************************

BITS 64

; Argument aliases for the Microsoft x64 convention: the first four
; integer arguments are in rcx, rdx, r8 and r9; the rest are read from the
; stack.  push_num is the number of 8-byte slots pushed since function
; entry; the constant 40 skips the return address (8 bytes) plus the
; 32 bytes of shadow space, so arg5 is the first stack argument.
%define arg1        rcx
%define arg2        rdx
%define arg3        r8
%define arg4        r9
%define arg5        [rsp + push_num*8 + 40]
%define arg6        [rsp + push_num*8 + 48]
%define arg7        [rsp + push_num*8 + 56]
%define arg8        [rsp + push_num*8 + 64]
%define arg9        [rsp + push_num*8 + 72]
%define arg10       [rsp + push_num*8 + 80]
%define arg11       [rsp + push_num*8 + 88]
%define arg12       [rsp + push_num*8 + 96]

; Generic full-width register names.  r0-r3 coincide with arg1-arg4;
; r7 is the stack pointer.
%define r0          rcx
%define r1          rdx
%define r2          r8
%define r3          r9
%define r4          rax
%define r5          r10
%define r6          r11
%define r7          rsp

; 32-bit views of r0-r6.
%define r0d         ecx
%define r1d         edx
%define r2d         r8d
%define r3d         r9d
%define r4d         eax
%define r5d         r10d
%define r6d         r11d

; 16-bit views (only the ones this file provides).
%define r0w         cx
%define r1w         dx
%define r2w         r8w
%define r3w         r9w
%define r6w         r11w

; 8-bit views.  NASM's native names for the low bytes of r8/r9 are
; r8b/r9b; the Intel-style r8l/r9l spellings are only recognised after
; "%use altreg", which this file does not request.
%define r0b         cl
%define r1b         dl
%define r2b         r8b
%define r3b         r9b

; Flag save/restore mnemonics and return-value registers.
%define PUSHRFLAGS  pushfq
%define POPRFLAGS   popfq
%define retrq       rax
%define retrd       eax
|
|
|
|
%elifdef UNIX64 ; Unix x64 ;************************************

BITS 64

; When producing elf64 objects, emit an empty .note.GNU-stack section.
%ifidn __OUTPUT_FORMAT__,elf64
    SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%endif

; Argument aliases for the System V AMD64 convention: the first six
; integer arguments are in rdi, rsi, rdx, rcx, r8 and r9; the rest are
; read from the stack.  push_num is the number of 8-byte slots pushed
; since function entry; the constant 8 skips the return address, so arg7
; is the first stack argument.
%define arg1        rdi
%define arg2        rsi
%define arg3        rdx
%define arg4        rcx
%define arg5        r8
%define arg6        r9
%define arg7        [rsp + push_num*8 + 8]
%define arg8        [rsp + push_num*8 + 16]
%define arg9        [rsp + push_num*8 + 24]
%define arg10       [rsp + push_num*8 + 32]
%define arg11       [rsp + push_num*8 + 40]
%define arg12       [rsp + push_num*8 + 48]

; Generic full-width register names.  r0-r5 coincide with arg1-arg6;
; r7 is the stack pointer.
%define r0          rdi
%define r1          rsi
%define r2          rdx
%define r3          rcx
%define r4          r8
%define r5          r9
%define r6          r10
%define r7          rsp

; 32-bit views of r0-r6.
%define r0d         edi
%define r1d         esi
%define r2d         edx
%define r3d         ecx
%define r4d         r8d
%define r5d         r9d
%define r6d         r10d

; 16-bit views (only the ones this file provides).
%define r0w         di
%define r1w         si
%define r2w         dx
%define r3w         cx
%define r6w         r10w

; 8-bit views of r0-r3.
%define r0b         dil
%define r1b         sil
%define r2b         dl
%define r3b         cl

; Flag save/restore mnemonics and return-value registers.
%define PUSHRFLAGS  pushfq
%define POPRFLAGS   popfq
%define retrq       rax
%define retrd       eax
|
|
|
|
%elifdef X86_32 ; X86_32 ;************************************

BITS 32

; When producing 32-bit ELF objects, emit an empty .note.GNU-stack
; section.  The format name is checked under both spellings: "elf" and
; "elf32" (the name seen when assembling with -f elf32), so the note is
; not silently dropped depending on how the assembler was invoked.
%ifidn __OUTPUT_FORMAT__,elf
    SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%elifidn __OUTPUT_FORMAT__,elf32
    SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%endif

; All arguments are read from the stack.  push_num is the number of
; 4-byte slots pushed since function entry; the constant term skips the
; return address, so arg1 sits at +4.
%define arg1        [esp + push_num*4 + 4]
%define arg2        [esp + push_num*4 + 8]
%define arg3        [esp + push_num*4 + 12]
%define arg4        [esp + push_num*4 + 16]
%define arg5        [esp + push_num*4 + 20]
%define arg6        [esp + push_num*4 + 24]
%define arg7        [esp + push_num*4 + 28]
%define arg8        [esp + push_num*4 + 32]
%define arg9        [esp + push_num*4 + 36]
%define arg10       [esp + push_num*4 + 40]
%define arg11       [esp + push_num*4 + 44]
%define arg12       [esp + push_num*4 + 48]

; Generic register names; r7 is the stack pointer.
%define r0          eax
%define r1          ecx
%define r2          edx
%define r3          ebx
%define r4          esi
%define r5          edi
%define r6          ebp
%define r7          esp

; The "d" names are the same 32-bit registers as r0-r6 on this target.
%define r0d         eax
%define r1d         ecx
%define r2d         edx
%define r3d         ebx
%define r4d         esi
%define r5d         edi
%define r6d         ebp

; 16-bit views (only the ones this file provides).
%define r0w         ax
%define r1w         cx
%define r2w         dx
%define r3w         bx
%define r6w         bp

; 8-bit views of r0-r3.
%define r0b         al
%define r1b         cl
%define r2b         dl
%define r3b         bl

; Flag save/restore mnemonics and return-value registers.
%define PUSHRFLAGS  pushfd
%define POPRFLAGS   popfd
%define retrq       eax     ; 32-bit mode does not support 64-bit registers
%define retrd       eax

%endif
|
|
|
|
; LOAD_PARA dst, src
;   Copies src into dst with a single mov.
%macro LOAD_PARA 2
    mov     %1, %2
%endmacro
|
|
|
|
; LOAD_1_PARA
;   X86_32: loads the first stack argument into r0.
;   Other targets: expands to nothing (r0 is the same register as arg1).
;   Requires push_num to be defined on X86_32.
%macro LOAD_1_PARA 0
%ifdef X86_32
    mov     r0, arg1
%endif
%endmacro
|
|
|
|
; LOAD_2_PARA
;   X86_32: loads the first two stack arguments into r0 and r1.
;   Other targets: expands to nothing (r0/r1 already alias arg1/arg2).
;   Requires push_num to be defined on X86_32.
%macro LOAD_2_PARA 0
%ifdef X86_32
    mov     r0, arg1
    mov     r1, arg2
%endif
%endmacro
|
|
|
|
; LOAD_3_PARA
;   X86_32: loads the first three stack arguments into r0, r1 and r2.
;   Other targets: expands to nothing (r0-r2 already alias arg1-arg3).
;   Requires push_num to be defined on X86_32.
%macro LOAD_3_PARA 0
%ifdef X86_32
    mov     r0, arg1
    mov     r1, arg2
    mov     r2, arg3
%endif
%endmacro
|
|
|
|
; LOAD_4_PARA
;   X86_32: pushes r3 (ebx), bumps push_num by one so the argument
;   offsets stay correct, then loads arg1-arg4 into r0-r3.
;   Pair with LOAD_4_PARA_POP to restore r3.
;   Other targets: expands to nothing (r0-r3 already alias arg1-arg4).
%macro LOAD_4_PARA 0
%ifdef X86_32
    push    r3
    %assign push_num push_num+1
    mov     r0, arg1
    mov     r1, arg2
    mov     r2, arg3
    mov     r3, arg4
%endif
%endmacro
|
|
|
|
; LOAD_5_PARA
;   X86_32: pushes r3 and r4, bumps push_num by two, then loads
;   arg1-arg5 into r0-r4.  Pair with LOAD_5_PARA_POP.
;   WIN64:  loads the fifth argument from the stack into r4.
;   UNIX64: expands to nothing (r0-r4 already alias arg1-arg5).
%macro LOAD_5_PARA 0
%ifdef X86_32
    push    r3
    push    r4
    %assign push_num push_num+2
    mov     r0, arg1
    mov     r1, arg2
    mov     r2, arg3
    mov     r3, arg4
    mov     r4, arg5
%elifdef WIN64
    mov     r4, arg5
%endif
%endmacro
|
|
|
|
; LOAD_6_PARA
;   X86_32: pushes r3, r4 and r5, bumps push_num by three, then loads
;   arg1-arg6 into r0-r5.  Pair with LOAD_6_PARA_POP.
;   WIN64:  loads the fifth and sixth arguments from the stack into
;           r4 and r5.
;   UNIX64: expands to nothing (r0-r5 already alias arg1-arg6).
%macro LOAD_6_PARA 0
%ifdef X86_32
    push    r3
    push    r4
    push    r5
    %assign push_num push_num+3
    mov     r0, arg1
    mov     r1, arg2
    mov     r2, arg3
    mov     r3, arg4
    mov     r4, arg5
    mov     r5, arg6
%elifdef WIN64
    mov     r4, arg5
    mov     r5, arg6
%endif
%endmacro
|
|
|
|
; LOAD_7_PARA
;   X86_32: pushes r3, r4, r5 and r6, bumps push_num by four, then loads
;   arg1-arg7 into r0-r6.  Pair with LOAD_7_PARA_POP.
;   WIN64:  loads arguments five to seven from the stack into r4-r6.
;   UNIX64: loads the seventh argument from the stack into r6
;           (r0-r5 already alias arg1-arg6).
%macro LOAD_7_PARA 0
%ifdef X86_32
    push    r3
    push    r4
    push    r5
    push    r6
    %assign push_num push_num+4
    mov     r0, arg1
    mov     r1, arg2
    mov     r2, arg3
    mov     r3, arg4
    mov     r4, arg5
    mov     r5, arg6
    mov     r6, arg7
%elifdef WIN64
    mov     r4, arg5
    mov     r5, arg6
    mov     r6, arg7
%elifdef UNIX64
    mov     r6, arg7
%endif
%endmacro
|
|
|
|
|
|
|
|
; LOAD_4_PARA_POP
;   X86_32: pops r3, undoing the push done by LOAD_4_PARA.
;   Other targets: expands to nothing.
;   push_num is not adjusted here.
%macro LOAD_4_PARA_POP 0
%ifdef X86_32
    pop     r3
%endif
%endmacro
|
|
|
|
; LOAD_5_PARA_POP
;   X86_32: pops r4 then r3 (reverse of the push order in LOAD_5_PARA).
;   Other targets: expands to nothing.
;   push_num is not adjusted here.
%macro LOAD_5_PARA_POP 0
%ifdef X86_32
    pop     r4
    pop     r3
%endif
%endmacro
|
|
|
|
; LOAD_6_PARA_POP
;   X86_32: pops r5, r4, r3 (reverse of the push order in LOAD_6_PARA).
;   Other targets: expands to nothing.
;   push_num is not adjusted here.
%macro LOAD_6_PARA_POP 0
%ifdef X86_32
    pop     r5
    pop     r4
    pop     r3
%endif
%endmacro
|
|
|
|
; LOAD_7_PARA_POP
;   X86_32: pops r6, r5, r4, r3 (reverse of the push order in
;   LOAD_7_PARA).
;   Other targets: expands to nothing.
;   push_num is not adjusted here.
%macro LOAD_7_PARA_POP 0
%ifdef X86_32
    pop     r6
    pop     r5
    pop     r4
    pop     r3
%endif
%endmacro
|
|
|
|
; PUSH_XMM n
;   n = number of xmm registers the function uses (xmm0 .. xmm(n-1)).
;   WIN64: records n in xmm_num_regs and, when n > 6, reserves
;   16*(n-6) bytes of stack and stores xmm6 .. xmm(n-1) there (up to
;   xmm15) with unaligned stores.  If push_num is defined it is advanced
;   by two 8-byte slots per saved register so the stack argument aliases
;   stay correct.
;   Other targets: expands to nothing.
;   Pair with POP_XMM.
%macro PUSH_XMM 1
%ifdef WIN64
    %assign xmm_num_regs %1

    %if xmm_num_regs > 6
        %ifdef push_num
            %assign push_num push_num+2*(%1-6)
        %endif
        sub     rsp, 16*(%1 - 6)
        movdqu  [rsp], xmm6
    %endif

    %if xmm_num_regs > 7
        movdqu  [rsp+16], xmm7
    %endif

    %if xmm_num_regs > 8
        movdqu  [rsp+32], xmm8
    %endif

    %if xmm_num_regs > 9
        movdqu  [rsp+48], xmm9
    %endif

    %if xmm_num_regs > 10
        movdqu  [rsp+64], xmm10
    %endif

    %if xmm_num_regs > 11
        movdqu  [rsp+80], xmm11
    %endif

    %if xmm_num_regs > 12
        movdqu  [rsp+96], xmm12
    %endif

    %if xmm_num_regs > 13
        movdqu  [rsp+112], xmm13
    %endif

    %if xmm_num_regs > 14
        movdqu  [rsp+128], xmm14
    %endif

    %if xmm_num_regs > 15
        movdqu  [rsp+144], xmm15
    %endif
%endif
%endmacro
|
|
|
|
; POP_XMM
;   WIN64: reloads the registers stored by PUSH_XMM, from xmm15 down to
;   xmm6 according to xmm_num_regs, then releases the
;   16*(xmm_num_regs-6) bytes of stack.  rsp must be where PUSH_XMM left
;   it.  push_num is not adjusted here.
;   Other targets: expands to nothing.
%macro POP_XMM 0
%ifdef WIN64
    %if xmm_num_regs > 15
        movdqu  xmm15, [rsp+144]
    %endif

    %if xmm_num_regs > 14
        movdqu  xmm14, [rsp+128]
    %endif

    %if xmm_num_regs > 13
        movdqu  xmm13, [rsp+112]
    %endif

    %if xmm_num_regs > 12
        movdqu  xmm12, [rsp+96]
    %endif

    %if xmm_num_regs > 11
        movdqu  xmm11, [rsp+80]
    %endif

    %if xmm_num_regs > 10
        movdqu  xmm10, [rsp+64]
    %endif

    %if xmm_num_regs > 9
        movdqu  xmm9, [rsp+48]
    %endif

    %if xmm_num_regs > 8
        movdqu  xmm8, [rsp+32]
    %endif

    %if xmm_num_regs > 7
        movdqu  xmm7, [rsp+16]
    %endif

    %if xmm_num_regs > 6
        movdqu  xmm6, [rsp]
        add     rsp, 16*(xmm_num_regs - 6)
    %endif
%endif
%endmacro
|
|
|
|
; SIGN_EXTENSION dst, src
;   When X86_32 is not defined: sign-extends src into dst with movsxd.
;   X86_32: expands to nothing.
%macro SIGN_EXTENSION 2
%ifndef X86_32
    movsxd  %1, %2
%endif
%endmacro
|
|
|
|
; SIGN_EXTENSIONW dst, src
;   When X86_32 is not defined: sign-extends src into dst with movsx.
;   X86_32: expands to nothing.
%macro SIGN_EXTENSIONW 2
%ifndef X86_32
    movsx   %1, %2
%endif
%endmacro
|
|
|
|
; WELS_EXTERN name
;   Aligns to 16 bytes, exports the symbol and places its label here.
;   With PREFIX defined the exported symbol is _name, and name is
;   redefined to _name so the label and later references use the
;   underscored spelling.
%macro WELS_EXTERN 1
    ALIGN 16
%ifndef PREFIX
    global %1
%else
    global _%1
    %define %1 _%1
%endif
%1:
%endmacro
|
|
|
|
; WELS_AbsW val, tmp
;   val = per-word max(val, 0 - val), i.e. the absolute value of each
;   signed 16-bit lane.  tmp is clobbered (left holding 0 - val).
%macro WELS_AbsW 2
    pxor    %2, %2          ; tmp = 0
    psubw   %2, %1          ; tmp = -val
    pmaxsw  %1, %2          ; val = max(val, -val)
%endmacro
|
|
|
|
; MMX_XSwap suffix, a, b, hi
;   suffix selects the unpack width (it is pasted onto punpckh/punpckl,
;   e.g. wd or dq).
;   On exit: hi = punpckh<suffix>(a, b), a = punpckl<suffix>(a, b).
;   b is only read.
%macro MMX_XSwap 4
    movq        %4, %2      ; keep a copy of a for the high half
    punpckh%1   %4, %3
    punpckl%1   %2, %3
%endmacro
|
|
|
|
; MMX_Trans4x4W r1, r2, r3, r4, tmp
;   Built from two rounds of MMX_XSwap (word, then dword interleave).
;   The four results are left in %1, %4, %5, %3 (in that order); %2 is
;   clobbered.
;   Original note: pOut mm1, mm4, mm5, mm3
%macro MMX_Trans4x4W 5
    MMX_XSwap   wd, %1, %2, %5
    MMX_XSwap   wd, %3, %4, %2
    MMX_XSwap   dq, %1, %3, %4
    MMX_XSwap   dq, %5, %2, %3
%endmacro
|
|
|
|
; SSE2_XSawp suffix, a, b, hi      (building block for the TRANSPOSE macros)
;   suffix selects the unpack width (it is pasted onto punpckl/punpckh,
;   e.g. bw, wd, dq or qdq).
;   On exit: a = punpckl<suffix>(a, b), hi = punpckh<suffix>(a, b).
;   b is only read.
%macro SSE2_XSawp 4
    movdqa      %4, %2      ; keep a copy of a for the high half
    punpckl%1   %2, %3
    punpckh%1   %4, %3
%endmacro
|
|
|
|
; SSE2_Trans4x4D r1, r2, r3, r4, tmp
;   Built from two rounds of SSE2_XSawp (dword, then qword interleave).
;   The four results are left in %1, %4, %5, %3 (in that order); %2 is
;   clobbered.
;   in: xmm1, xmm2, xmm3, xmm4   pOut: xmm1, xmm4, xmm5, xmm3
%macro SSE2_Trans4x4D 5
    SSE2_XSawp  dq,  %1, %2, %5
    SSE2_XSawp  dq,  %3, %4, %2
    SSE2_XSawp  qdq, %1, %3, %4
    SSE2_XSawp  qdq, %5, %2, %3
%endmacro
|
|
|
|
; SSE2_TransTwo4x4W r1, r2, r3, r4, tmp
;   Three rounds of SSE2_XSawp (word, dword, then qword interleave).
;   The four results are left in %1, %2, %4, %5 (in that order); %3 is
;   clobbered.
;   in: xmm0, xmm1, xmm2, xmm3   pOut: xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
    SSE2_XSawp  wd,  %1, %2, %5
    SSE2_XSawp  wd,  %3, %4, %2
    SSE2_XSawp  dq,  %1, %3, %4
    SSE2_XSawp  dq,  %5, %2, %3
    SSE2_XSawp  qdq, %1, %5, %2
    SSE2_XSawp  qdq, %4, %3, %5
%endmacro
|
|
|
|
; SSE2_TransTwo8x8B m1, m2, m3, m4, m5, m6, m7, m8, scratch
;   Four rounds of SSE2_XSawp (byte, word, dword, qword interleave).
;   %9 is scratch storage, written and read with movdqa, used to park
;   one value per round.  The instruction order is significant: every
;   step reuses registers freed by the previous one.
;   in:   m1, m2, m3, m4, m5, m6, m7, m8
;   pOut: m5, m3, m4, m8, m6, m2, m7, m1
%macro SSE2_TransTwo8x8B 9
    ; round 1: byte interleave
    movdqa      %9, %8
    SSE2_XSawp  bw,  %1, %2, %8
    SSE2_XSawp  bw,  %3, %4, %2
    SSE2_XSawp  bw,  %5, %6, %4
    movdqa      %6, %9
    movdqa      %9, %4
    SSE2_XSawp  bw,  %7, %6, %4

    ; round 2: word interleave
    SSE2_XSawp  wd,  %1, %3, %6
    SSE2_XSawp  wd,  %8, %2, %3
    SSE2_XSawp  wd,  %5, %7, %2
    movdqa      %7, %9
    movdqa      %9, %3
    SSE2_XSawp  wd,  %7, %4, %3

    ; round 3: dword interleave
    SSE2_XSawp  dq,  %1, %5, %4
    SSE2_XSawp  dq,  %6, %2, %5
    SSE2_XSawp  dq,  %8, %7, %2
    movdqa      %7, %9
    movdqa      %9, %5
    SSE2_XSawp  dq,  %7, %3, %5

    ; round 4: qword interleave
    SSE2_XSawp  qdq, %1, %8, %3
    SSE2_XSawp  qdq, %4, %2, %8
    SSE2_XSawp  qdq, %6, %7, %2
    movdqa      %7, %9
    movdqa      %9, %1
    SSE2_XSawp  qdq, %7, %5, %1
    movdqa      %5, %9
%endmacro
|
|
|
|
; SSE2_LoadDiff8P dst, tmp, zero, src1, src2
;   Loads 8 bytes from src1 and from src2, widens each to words by
;   interleaving with zero, and leaves the word-wise difference
;   (src1 - src2) in dst.  tmp is clobbered; zero must hold all zeros.
;   Typical operands: xmm0, xmm6, xmm7, [eax], [ecx]
;   (xmm7 = 0, eax = pix1, ecx = pix2, xmm0 receives the result)
%macro SSE2_LoadDiff8P 5
    movq        %1, %4
    punpcklbw   %1, %3      ; dst = src1 bytes widened to words
    movq        %2, %5
    punpcklbw   %2, %3      ; tmp = src2 bytes widened to words
    psubw       %1, %2
%endmacro
|
|
|
|
; SSE2_SumSub m1, m2, tmp
;   Word-wise butterfly: m2 = m1 + m2, m1 = m1 - m2 (using the original
;   m2).  tmp is clobbered (left holding the original m2).
%macro SSE2_SumSub 3
    movdqa  %3, %2          ; tmp = original m2
    paddw   %2, %1          ; m2  = m1 + m2
    psubw   %1, %3          ; m1  = m1 - original m2
%endmacro
|
|
|
|
|
|
; butterfly_1to16_sse dst, tmp, reg
;   dst/tmp: xmm registers; reg: one-letter generic register name
;   (a/b/c/d) whose low byte holds the source byte b0.
;   Broadcasts b0 to all 16 bytes of dst.  The high byte of the 16-bit
;   register (e.g. ah) is overwritten and tmp is clobbered.
%macro butterfly_1to16_sse 3
    mov     %3h, %3l        ; 16-bit register now holds b0 b0
    movd    %1, e%3x        ; i.e. %1 = eax (low word = b0 b0)
    pshuflw %2, %1, 00h     ; ..., b0 b0 b0 b0 b0 b0 b0 b0
    pshufd  %1, %2, 00h     ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro
|
|
|
|
; SSE2_Copy8Times dst, src
;   Copies the low word of src into all eight word lanes of dst.
;   src is whatever movd accepts as a source.
%macro SSE2_Copy8Times 2
    movd        %1, %2
    punpcklwd   %1, %1      ; low dword = w0 w0
    pshufd      %1, %1, 0   ; replicate that dword four times
%endmacro
|
|
|
|
; SSE2_Copy16Times dst, src
;   Copies a byte value into all sixteen byte lanes of dst.
;   The low word of src is replicated to eight words and then packed to
;   bytes with unsigned saturation, so the result equals the input byte
;   only when that low word is in the range 0..255.
%macro SSE2_Copy16Times 2
    movd        %1, %2
    pshuflw     %1, %1, 0   ; low four words = w0
    punpcklqdq  %1, %1      ; all eight words = w0
    packuswb    %1, %1      ; saturate words to bytes, sixteen copies
%endmacro
|
|
|
|
|
|
|
|
;***********************************************************************
; Preprocessor constants
;***********************************************************************

; WELS_DW32 reg
;   Sets every word lane of reg to 32 without touching memory:
;   dw 32,32,32,32,32,32,32,32 for xmm
;   dw 32,32,32,32 for mm
%macro WELS_DW32 1
    pcmpeqw %1, %1          ; all bits set
    psrlw   %1, 15          ; each word = 1
    psllw   %1, 5           ; each word = 32
%endmacro
|
|
|
|
; WELS_DW1 reg
;   Sets every word lane of reg to 1:
;   dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
;   dw 1, 1, 1, 1 for mm
%macro WELS_DW1 1
    pcmpeqw %1, %1          ; all bits set
    psrlw   %1, 15          ; each word = 1
%endmacro
|
|
|
|
; WELS_Zero reg
;   Clears reg (xmm or mm) to all zeros.
%macro WELS_Zero 1
    pxor    %1, %1
%endmacro
|
|
|
|
; WELS_DD1 reg
;   Sets every dword lane of reg to 1:
;   dd 1, 1, 1, 1 for xmm
;   dd 1, 1 for mm
%macro WELS_DD1 1
    pcmpeqw %1, %1          ; all bits set
    psrld   %1, 31          ; each dword = 1
%endmacro
|
|
|
|
; WELS_DB1 reg
;   Sets every byte lane of reg to 1:
;   db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
    pcmpeqw  %1, %1         ; all bits set
    psrlw    %1, 15         ; each word = 1
    packuswb %1, %1         ; pack words to bytes, each byte = 1
%endmacro
|
|
|
|
|
|
|
|
|
|
|
|
|