3243a78959
This fixes warnings when building for x86_32 using yasm, which says the "DEFAULT REL" is ignored for non-64-bit targets.
613 lines
14 KiB
NASM
613 lines
14 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* sse2inc.asm
|
|
;*
|
|
;* Abstract
|
|
;* macro and constant
|
|
;*
|
|
;* History
|
|
;* 8/5/2009 Created
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
;***********************************************************************
|
|
; Options, for DEBUG
|
|
;***********************************************************************
|
|
|
|
; MOVDQ: 128-bit move mnemonic used by code that includes this file.
; Flip the condition to 0 to select the unaligned form (movdqu).
%if 1
    %define MOVDQ       movdqa
%else
    %define MOVDQ       movdqu
%endif

; WELSEMMS: expands to `emms`; flip the condition to 0 to make it expand
; to nothing.
%if 1
    %define WELSEMMS    emms
%else
    %define WELSEMMS
%endif
|
|
|
|
|
|
;***********************************************************************
|
|
; Macros
|
|
;***********************************************************************
|
|
|
|
;-----------------------------------------------------------------------
; Per-target aliases.
;
; The first of WIN64 / UNIX64 / X86_32 that is defined selects the
; branch; if none is defined, nothing in this section is defined.
;
;   argN        N-th function argument (register or stack operand)
;   r0..r7      pointer-width general registers (r7 is the stack pointer)
;   rNd/rNw/rNb 32/16/8-bit views of the same registers (where defined)
;   PUSHRFLAGS / POPRFLAGS  flag save/restore mnemonics
;   retrq / retrd           return-value register aliases
;
; Stack-based argN operands reference the preprocessor symbol push_num,
; which must already be %assign'ed where argN is used. The LOAD_*_PARA
; macros in this file add one to it per register they push.
;-----------------------------------------------------------------------

%ifdef WIN64 ; Windows x64 ;************************************

DEFAULT REL

BITS 64

%define arg1            rcx
%define arg2            rdx
%define arg3            r8
%define arg4            r9
%define arg5            [rsp + push_num*8 + 40]
%define arg6            [rsp + push_num*8 + 48]
%define arg7            [rsp + push_num*8 + 56]
%define arg8            [rsp + push_num*8 + 64]
%define arg9            [rsp + push_num*8 + 72]
%define arg10           [rsp + push_num*8 + 80]
%define arg11           [rsp + push_num*8 + 88]
%define arg12           [rsp + push_num*8 + 96]

%define r0              rcx
%define r1              rdx
%define r2              r8
%define r3              r9
%define r4              rax
%define r5              r10
%define r6              r11
%define r7              rsp

%define r0d             ecx
%define r1d             edx
%define r2d             r8d
%define r3d             r9d
%define r4d             eax
%define r5d             r10d
%define r6d             r11d

%define r0w             cx
%define r1w             dx
%define r2w             r8w
%define r3w             r9w
%define r6w             r11w

; NOTE(review): r8l / r9l are the Intel-manual spellings of the low-byte
; registers; confirm the assembler in use accepts them (NASM documents
; r8b / r9b).
%define r0b             cl
%define r1b             dl
%define r2b             r8l
%define r3b             r9l

%define PUSHRFLAGS      pushfq
%define POPRFLAGS       popfq
%define retrq           rax
%define retrd           eax

%elifdef UNIX64 ; Unix x64 ;************************************

DEFAULT REL

BITS 64

%ifidn __OUTPUT_FORMAT__,elf64
SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%endif

%define arg1            rdi
%define arg2            rsi
%define arg3            rdx
%define arg4            rcx
%define arg5            r8
%define arg6            r9
%define arg7            [rsp + push_num*8 + 8]
%define arg8            [rsp + push_num*8 + 16]
%define arg9            [rsp + push_num*8 + 24]
%define arg10           [rsp + push_num*8 + 32]
%define arg11           [rsp + push_num*8 + 40]
%define arg12           [rsp + push_num*8 + 48]

%define r0              rdi
%define r1              rsi
%define r2              rdx
%define r3              rcx
%define r4              r8
%define r5              r9
%define r6              r10
%define r7              rsp

%define r0d             edi
%define r1d             esi
%define r2d             edx
%define r3d             ecx
%define r4d             r8d
%define r5d             r9d
%define r6d             r10d

%define r0w             di
%define r1w             si
%define r2w             dx
%define r3w             cx
%define r6w             r10w

%define r0b             dil
%define r1b             sil
%define r2b             dl
%define r3b             cl

%define PUSHRFLAGS      pushfq
%define POPRFLAGS       popfq
%define retrq           rax
%define retrd           eax

%elifdef X86_32 ; X86_32 ;************************************

; No DEFAULT REL here: it only applies to 64-bit targets.
BITS 32

; The 32-bit ELF output format can be reported either as "elf" or as
; "elf32" depending on the assembler and on how -f was spelled, so test
; both; otherwise the non-executable-stack note is silently omitted.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%elifidn __OUTPUT_FORMAT__,elf32
SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%endif

%define arg1            [esp + push_num*4 + 4]
%define arg2            [esp + push_num*4 + 8]
%define arg3            [esp + push_num*4 + 12]
%define arg4            [esp + push_num*4 + 16]
%define arg5            [esp + push_num*4 + 20]
%define arg6            [esp + push_num*4 + 24]
%define arg7            [esp + push_num*4 + 28]
%define arg8            [esp + push_num*4 + 32]
%define arg9            [esp + push_num*4 + 36]
%define arg10           [esp + push_num*4 + 40]
%define arg11           [esp + push_num*4 + 44]
%define arg12           [esp + push_num*4 + 48]

%define r0              eax
%define r1              ecx
%define r2              edx
%define r3              ebx
%define r4              esi
%define r5              edi
%define r6              ebp
%define r7              esp

%define r0d             eax
%define r1d             ecx
%define r2d             edx
%define r3d             ebx
%define r4d             esi
%define r5d             edi
%define r6d             ebp

%define r0w             ax
%define r1w             cx
%define r2w             dx
%define r3w             bx
%define r6w             bp

%define r0b             al
%define r1b             cl
%define r2b             dl
%define r3b             bl

%define PUSHRFLAGS      pushfd
%define POPRFLAGS       popfd
%define retrq           eax ; 32-bit mode does not support 64-bit registers
%define retrd           eax

%endif
|
|
|
|
; LOAD_PARA dst, src
; Emits a single `mov dst, src`.
%macro LOAD_PARA 2
    mov     %1, %2
%endmacro
|
|
|
|
; LOAD_1_PARA
; X86_32: loads the first stack argument into r0.
; Other targets: expands to nothing.
%macro LOAD_1_PARA 0
    %ifdef X86_32
        mov     r0, arg1
    %endif
%endmacro
|
|
|
|
; LOAD_2_PARA
; X86_32: loads stack arguments 1-2 into r0-r1.
; Other targets: expands to nothing.
%macro LOAD_2_PARA 0
    %ifdef X86_32
        mov     r0, arg1
        mov     r1, arg2
    %endif
%endmacro
|
|
|
|
; LOAD_3_PARA
; X86_32: loads stack arguments 1-3 into r0-r2.
; Other targets: expands to nothing.
%macro LOAD_3_PARA 0
    %ifdef X86_32
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
    %endif
%endmacro
|
|
|
|
; LOAD_4_PARA
; X86_32: saves r3 on the stack, bumps push_num by one to account for
;         it, then loads stack arguments 1-4 into r0-r3.
; Other targets: expands to nothing.
; Pair with LOAD_4_PARA_POP to restore r3.
%macro LOAD_4_PARA 0
    %ifdef X86_32
        push    r3
        %assign push_num push_num+1     ; one extra slot now sits below the args
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
        mov     r3, arg4
    %endif
%endmacro
|
|
|
|
; LOAD_5_PARA
; X86_32: saves r3-r4, bumps push_num by two, then loads stack
;         arguments 1-5 into r0-r4.
; WIN64:  loads the fifth (stack) argument into r4.
; Other targets: expands to nothing.
; Pair with LOAD_5_PARA_POP to restore the saved registers.
%macro LOAD_5_PARA 0
    %ifdef X86_32
        push    r3
        push    r4
        %assign push_num push_num+2
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
        mov     r3, arg4
        mov     r4, arg5
    %elifdef WIN64
        mov     r4, arg5
    %endif
%endmacro
|
|
|
|
; LOAD_6_PARA
; X86_32: saves r3-r5, bumps push_num by three, then loads stack
;         arguments 1-6 into r0-r5.
; WIN64:  loads the fifth and sixth (stack) arguments into r4-r5.
; Other targets: expands to nothing.
; Pair with LOAD_6_PARA_POP to restore the saved registers.
%macro LOAD_6_PARA 0
    %ifdef X86_32
        push    r3
        push    r4
        push    r5
        %assign push_num push_num+3
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
        mov     r3, arg4
        mov     r4, arg5
        mov     r5, arg6
    %elifdef WIN64
        mov     r4, arg5
        mov     r5, arg6
    %endif
%endmacro
|
|
|
|
; LOAD_7_PARA
; X86_32: saves r3-r6, bumps push_num by four, then loads stack
;         arguments 1-7 into r0-r6.
; WIN64:  loads stack arguments 5-7 into r4-r6.
; UNIX64: loads the seventh (stack) argument into r6.
; Pair with LOAD_7_PARA_POP to restore the saved registers.
%macro LOAD_7_PARA 0
    %ifdef X86_32
        push    r3
        push    r4
        push    r5
        push    r6
        %assign push_num push_num+4
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
        mov     r3, arg4
        mov     r4, arg5
        mov     r5, arg6
        mov     r6, arg7
    %elifdef WIN64
        mov     r4, arg5
        mov     r5, arg6
        mov     r6, arg7
    %elifdef UNIX64
        mov     r6, arg7
    %endif
%endmacro
|
|
|
|
|
|
|
|
; LOAD_4_PARA_POP
; X86_32: restores r3 pushed by LOAD_4_PARA. push_num is left unchanged.
; Other targets: expands to nothing.
%macro LOAD_4_PARA_POP 0
    %ifdef X86_32
        pop     r3
    %endif
%endmacro
|
|
|
|
; LOAD_5_PARA_POP
; X86_32: restores r4 then r3 (reverse of the push order in LOAD_5_PARA).
;         push_num is left unchanged.
; Other targets: expands to nothing.
%macro LOAD_5_PARA_POP 0
    %ifdef X86_32
        pop     r4
        pop     r3
    %endif
%endmacro
|
|
|
|
; LOAD_6_PARA_POP
; X86_32: restores r5, r4, r3 (reverse of the push order in LOAD_6_PARA).
;         push_num is left unchanged.
; Other targets: expands to nothing.
%macro LOAD_6_PARA_POP 0
    %ifdef X86_32
        pop     r5
        pop     r4
        pop     r3
    %endif
%endmacro
|
|
|
|
; LOAD_7_PARA_POP
; X86_32: restores r6, r5, r4, r3 (reverse of the push order in
;         LOAD_7_PARA). push_num is left unchanged.
; Other targets: expands to nothing.
%macro LOAD_7_PARA_POP 0
    %ifdef X86_32
        pop     r6
        pop     r5
        pop     r4
        pop     r3
    %endif
%endmacro
|
|
|
|
; PUSH_XMM n
; WIN64 only (expands to nothing elsewhere).
; Records n in xmm_num_regs (read later by POP_XMM). When n > 6 it
; reserves 16*(n-6) bytes of stack and stores xmm6 .. xmm(n-1) there
; with unaligned stores, at most up to xmm15. If push_num is defined,
; it is advanced by two 8-byte slots per reserved register so the argN
; stack operands keep pointing at the right place.
%macro PUSH_XMM 1
    %ifdef WIN64
        %assign xmm_num_regs %1
        %if xmm_num_regs > 6
            %ifdef push_num
                %assign push_num push_num+2*(xmm_num_regs-6)
            %endif
            sub     rsp, 16*(xmm_num_regs - 6)
            movdqu  [rsp], xmm6
        %endif
        %if xmm_num_regs > 7
            movdqu  [rsp+16], xmm7
        %endif
        %if xmm_num_regs > 8
            movdqu  [rsp+32], xmm8
        %endif
        %if xmm_num_regs > 9
            movdqu  [rsp+48], xmm9
        %endif
        %if xmm_num_regs > 10
            movdqu  [rsp+64], xmm10
        %endif
        %if xmm_num_regs > 11
            movdqu  [rsp+80], xmm11
        %endif
        %if xmm_num_regs > 12
            movdqu  [rsp+96], xmm12
        %endif
        %if xmm_num_regs > 13
            movdqu  [rsp+112], xmm13
        %endif
        %if xmm_num_regs > 14
            movdqu  [rsp+128], xmm14
        %endif
        %if xmm_num_regs > 15
            movdqu  [rsp+144], xmm15
        %endif
    %endif
%endmacro
|
|
|
|
; POP_XMM
; WIN64 only (expands to nothing elsewhere).
; Reloads the registers stored by the preceding PUSH_XMM, highest first,
; and releases the stack area. It reads xmm_num_regs, so PUSH_XMM must
; have been expanded earlier. rsp must be back at the value PUSH_XMM
; left it at. push_num is not adjusted back.
%macro POP_XMM 0
    %ifdef WIN64
        %if xmm_num_regs > 15
            movdqu  xmm15, [rsp+144]
        %endif
        %if xmm_num_regs > 14
            movdqu  xmm14, [rsp+128]
        %endif
        %if xmm_num_regs > 13
            movdqu  xmm13, [rsp+112]
        %endif
        %if xmm_num_regs > 12
            movdqu  xmm12, [rsp+96]
        %endif
        %if xmm_num_regs > 11
            movdqu  xmm11, [rsp+80]
        %endif
        %if xmm_num_regs > 10
            movdqu  xmm10, [rsp+64]
        %endif
        %if xmm_num_regs > 9
            movdqu  xmm9, [rsp+48]
        %endif
        %if xmm_num_regs > 8
            movdqu  xmm8, [rsp+32]
        %endif
        %if xmm_num_regs > 7
            movdqu  xmm7, [rsp+16]
        %endif
        %if xmm_num_regs > 6
            movdqu  xmm6, [rsp]
            add     rsp, 16*(xmm_num_regs - 6)
        %endif
    %endif
%endmacro
|
|
|
|
; SIGN_EXTENSION dst, src
; Non-X86_32 targets: `movsxd dst, src` (sign-extends src into dst).
; X86_32: expands to nothing.
%macro SIGN_EXTENSION 2
    %ifndef X86_32
        movsxd  %1, %2
    %endif
%endmacro
|
|
|
|
; SIGN_EXTENSIONW dst, src
; Non-X86_32 targets: `movsx dst, src` (sign-extends src into dst).
; X86_32: expands to nothing.
%macro SIGN_EXTENSIONW 2
    %ifndef X86_32
        movsx   %1, %2
    %endif
%endmacro
|
|
|
|
; WELS_EXTERN name
; Starts an exported, 16-byte-aligned label.
; Without PREFIX the symbol is exported as `name`. With PREFIX it is
; exported as `_name`, and `name` is redefined to `_name` so that later
; uses of `name` (including the label emitted here) resolve to the
; underscored symbol.
%macro WELS_EXTERN 1
    ALIGN 16
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
    %1:
%endmacro
|
|
|
|
; WELS_AbsW val, tmp
; Per-word absolute value: val = max(val, 0 - val), signed.
; tmp is clobbered (left holding the negated input).
%macro WELS_AbsW 2
    pxor    %2, %2              ; tmp = 0
    psubw   %2, %1              ; tmp = -val
    pmaxsw  %1, %2              ; val = max(val, -val)
%endmacro
|
|
|
|
; MMX_XSwap sfx, a, b, tmp
; Interleaves a with b using the punpckl/punpckh variant named by sfx
; (e.g. wd, dq):
;   tmp = punpckh<sfx>(a, b)
;   a   = punpckl<sfx>(a, b)
%macro MMX_XSwap 4
    movq        %4, %2          ; keep a copy of a for the high half
    punpckh%1   %4, %3
    punpckl%1   %2, %3
%endmacro
|
|
|
|
; MMX_Trans4x4W r1, r2, r3, r4, tmp
; Word/dword interleave network built from MMX_XSwap.
; Results end up, in order, in: %1, %4, %5, %3
; (with mm1..mm5 passed as %1..%5: mm1, mm4, mm5, mm3). %2 is clobbered.
%macro MMX_Trans4x4W 5
    MMX_XSwap   wd, %1, %2, %5
    MMX_XSwap   wd, %3, %4, %2
    MMX_XSwap   dq, %1, %3, %4
    MMX_XSwap   dq, %5, %2, %3
%endmacro
|
|
|
|
; SSE2_XSawp sfx, a, b, tmp      (building block for the transposes)
; Interleaves a with b using the punpckl/punpckh variant named by sfx
; (e.g. bw, wd, dq, qdq):
;   a   = punpckl<sfx>(a, b)
;   tmp = punpckh<sfx>(a, b)
%macro SSE2_XSawp 4
    movdqa      %4, %2          ; keep a copy of a for the high half
    punpckl%1   %2, %3
    punpckh%1   %4, %3
%endmacro
|
|
|
|
; SSE2_Trans4x4D r1, r2, r3, r4, tmp
; Dword/qword interleave network built from SSE2_XSawp.
; in:  xmm1, xmm2, xmm3, xmm4 (tmp: xmm5)
; out: xmm1, xmm4, xmm5, xmm3   i.e. %1, %4, %5, %3; %2 is clobbered.
%macro SSE2_Trans4x4D 5
    SSE2_XSawp  dq,  %1, %2, %5
    SSE2_XSawp  dq,  %3, %4, %2
    SSE2_XSawp  qdq, %1, %3, %4
    SSE2_XSawp  qdq, %5, %2, %3
%endmacro
|
|
|
|
; SSE2_TransTwo4x4W r0, r1, r2, r3, tmp
; Word -> dword -> qword interleave network built from SSE2_XSawp.
; in:  xmm0, xmm1, xmm2, xmm3 (tmp: xmm4)
; out: xmm0, xmm1, xmm3, xmm4   i.e. %1, %2, %4, %5; %3 is clobbered.
%macro SSE2_TransTwo4x4W 5
    SSE2_XSawp  wd,  %1, %2, %5
    SSE2_XSawp  wd,  %3, %4, %2
    SSE2_XSawp  dq,  %1, %3, %4
    SSE2_XSawp  dq,  %5, %2, %3
    SSE2_XSawp  qdq, %1, %5, %2
    SSE2_XSawp  qdq, %4, %3, %5
%endmacro
|
|
|
|
; SSE2_TransTwo8x8B m1, m2, m3, m4, m5, m6, m7, m8, spill
; Four interleave passes (byte, word, dword, qword) over eight inputs,
; built from SSE2_XSawp. %9 is an extra operand used to park one value
; while all eight of %1..%8 are busy.
; in:   m1, m2, m3, m4, m5, m6, m7, m8
; pOut: m5, m3, m4, m8, m6, m2, m7, m1
%macro SSE2_TransTwo8x8B 9
    movdqa      %9, %8                  ; free %8 for use as a temp

    ; pass 1: byte interleave
    SSE2_XSawp  bw,  %1, %2, %8
    SSE2_XSawp  bw,  %3, %4, %2
    SSE2_XSawp  bw,  %5, %6, %4
    movdqa      %6, %9                  ; bring back the parked value
    movdqa      %9, %4                  ; park the high half just produced
    SSE2_XSawp  bw,  %7, %6, %4

    ; pass 2: word interleave
    SSE2_XSawp  wd,  %1, %3, %6
    SSE2_XSawp  wd,  %8, %2, %3
    SSE2_XSawp  wd,  %5, %7, %2
    movdqa      %7, %9
    movdqa      %9, %3
    SSE2_XSawp  wd,  %7, %4, %3

    ; pass 3: dword interleave
    SSE2_XSawp  dq,  %1, %5, %4
    SSE2_XSawp  dq,  %6, %2, %5
    SSE2_XSawp  dq,  %8, %7, %2
    movdqa      %7, %9
    movdqa      %9, %5
    SSE2_XSawp  dq,  %7, %3, %5

    ; pass 4: qword interleave
    SSE2_XSawp  qdq, %1, %8, %3
    SSE2_XSawp  qdq, %4, %2, %8
    SSE2_XSawp  qdq, %6, %7, %2
    movdqa      %7, %9
    movdqa      %9, %1
    SSE2_XSawp  qdq, %7, %5, %1
    movdqa      %5, %9                  ; final parked value ends up in %5
%endmacro
|
|
|
|
; SSE2_LoadDiff8P dst, tmp, zero, src1, src2
; Typical operands: xmm0, xmm6, xmm7, [eax], [ecx]
; Loads 8 bytes from src1 and from src2, widens each byte to a word by
; interleaving with `zero` (the caller must have cleared it, e.g.
; xmm7 = 0), and leaves the word-wise difference src1 - src2 in dst.
; tmp is clobbered.
%macro SSE2_LoadDiff8P 5
    movq        %1, %4
    punpcklbw   %1, %3          ; bytes of src1 -> words
    movq        %2, %5
    punpcklbw   %2, %3          ; bytes of src2 -> words
    psubw       %1, %2          ; dst = src1 - src2
%endmacro
|
|
|
|
; SSE2_SumSub m1, m2, tmp
; Word-wise butterfly:
;   m2 = m1 + m2
;   m1 = m1 - m2 (using the value m2 had on entry)
; tmp is clobbered (left holding the original m2).
%macro SSE2_SumSub 3
    movdqa  %3, %2              ; tmp = original m2
    paddw   %2, %1
    psubw   %1, %3
%endmacro
|
|
|
|
|
|
; butterfly_1to16_sse dst, tmp, reg
;   dst : xmm register receiving the result
;   tmp : xmm scratch register
;   reg : one-letter generic register name (a/b/c/d) whose low byte
;         (al/bl/cl/dl) holds the source byte b0
; Replicates b0 into all 16 bytes of dst. The matching high-byte
; register (ah/bh/ch/dh) is overwritten.
%macro butterfly_1to16_sse 3
    mov     %3h, %3l            ; low word of the register = b0 b0
    movd    %1, e%3x            ; i.e. %1 = eax (= b0) when reg is a
    pshuflw %2, %1, 00h         ; ..., b0 b0 b0 b0 b0 b0 b0 b0
    pshufd  %1, %2, 00h         ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro
|
|
|
|
; SSE2_Copy8Times dst, src
; Replicates the low word of src into all 8 words of the xmm register dst.
%macro SSE2_Copy8Times 2
    movd        %1, %2
    punpcklwd   %1, %1          ; low dword = w0 w0
    pshufd      %1, %1, 0       ; replicate that dword four times
%endmacro
|
|
|
|
; SSE2_Copy16Times dst, src
; Replicates a byte value into all 16 bytes of the xmm register dst.
; The low word of src is replicated to 8 words and then packed to bytes
; with unsigned saturation, so that word must hold the byte
; zero-extended (0..255) for an exact copy.
%macro SSE2_Copy16Times 2
    movd        %1, %2
    pshuflw     %1, %1, 0       ; low 4 words = w0
    punpcklqdq  %1, %1          ; all 8 words = w0
    packuswb    %1, %1          ; words -> bytes (saturating)
%endmacro
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
;preprocessor constants
|
|
;***********************************************************************
|
|
; WELS_DW32 reg
; Fills reg with words of 32 without touching memory:
;   dw 32,32,32,32,32,32,32,32 for xmm
;   dw 32,32,32,32             for mm
%macro WELS_DW32 1
    WELS_DW1    %1              ; every word = 1 (pcmpeqw + psrlw 15)
    psllw       %1,5            ; 1 << 5 = 32
%endmacro
|
|
|
|
; WELS_DW1 reg
; Fills reg with words of 1 without touching memory:
;   dw 1,1,1,1,1,1,1,1 for xmm
;   dw 1,1,1,1         for mm
%macro WELS_DW1 1
    pcmpeqw     %1,%1           ; all bits set
    psrlw       %1,15           ; keep only the top bit of each word -> 1
%endmacro
|
|
|
|
; WELS_Zero reg
; Clears reg (works for both xmm and mm registers).
%macro WELS_Zero 1
    pxor        %1, %1
%endmacro
|
|
|
|
; WELS_DD1 reg
; Fills reg with dwords of 1 without touching memory:
;   dd 1,1,1,1 for xmm
;   dd 1,1     for mm
%macro WELS_DD1 1
    pcmpeqw     %1,%1           ; all bits set
    psrld       %1,31           ; keep only the top bit of each dword -> 1
%endmacro
|
|
|
|
; WELS_DB1 reg
; Fills reg with bytes of 1 without touching memory:
;   db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
    WELS_DW1    %1              ; every word = 1 (pcmpeqw + psrlw 15)
    packuswb    %1,%1           ; words of 1 -> bytes of 1
%endmacro
|
|
|
|
|
|
|
|
|
|
|
|
|