Merge remote branch 'origin/master' into experimental

Change-Id: I9e9ece0424b2f4b6861e9c7c0986f6eccc9159d6
This commit is contained in:
John Koleszar
2011-04-20 00:05:11 -04:00
9 changed files with 120 additions and 78 deletions

View File

@@ -32,9 +32,6 @@ sym(idct_dequant_0_2x_sse2):
mov rdx, arg(1) ; dequant mov rdx, arg(1) ; dequant
mov rax, arg(0) ; qcoeff mov rax, arg(0) ; qcoeff
; Zero out xmm7, for use unpacking
pxor xmm7, xmm7
movd xmm4, [rax] movd xmm4, [rax]
movd xmm5, [rdx] movd xmm5, [rdx]
@@ -43,9 +40,12 @@ sym(idct_dequant_0_2x_sse2):
pmullw xmm4, xmm5 pmullw xmm4, xmm5
; Zero out xmm5, for use unpacking
pxor xmm5, xmm5
; clear coeffs ; clear coeffs
movd [rax], xmm7 movd [rax], xmm5
movd [rax+32], xmm7 movd [rax+32], xmm5
;pshufb ;pshufb
pshuflw xmm4, xmm4, 00000000b pshuflw xmm4, xmm4, 00000000b
pshufhw xmm4, xmm4, 00000000b pshufhw xmm4, xmm4, 00000000b
@@ -62,10 +62,10 @@ sym(idct_dequant_0_2x_sse2):
lea rcx, [3*rcx] lea rcx, [3*rcx]
movq xmm3, [rax+rcx] movq xmm3, [rax+rcx]
punpcklbw xmm0, xmm7 punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm7 punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm7 punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm7 punpcklbw xmm3, xmm5
mov rax, arg(3) ; dst mov rax, arg(3) ; dst
movsxd rdx, dword ptr arg(4) ; dst_stride movsxd rdx, dword ptr arg(4) ; dst_stride
@@ -77,10 +77,10 @@ sym(idct_dequant_0_2x_sse2):
paddw xmm3, xmm4 paddw xmm3, xmm4
; pack up before storing ; pack up before storing
packuswb xmm0, xmm7 packuswb xmm0, xmm5
packuswb xmm1, xmm7 packuswb xmm1, xmm5
packuswb xmm2, xmm7 packuswb xmm2, xmm5
packuswb xmm3, xmm7 packuswb xmm3, xmm5
; store blocks back out ; store blocks back out
movq [rax], xmm0 movq [rax], xmm0
@@ -102,6 +102,7 @@ sym(idct_dequant_full_2x_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -347,6 +348,7 @@ sym(idct_dequant_full_2x_sse2):
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -377,8 +379,8 @@ sym(idct_dequant_dc_0_2x_sse2):
mov rdi, arg(3) ; dst mov rdi, arg(3) ; dst
mov rdx, arg(5) ; dc mov rdx, arg(5) ; dc
; Zero out xmm7, for use unpacking ; Zero out xmm5, for use unpacking
pxor xmm7, xmm7 pxor xmm5, xmm5
; load up 2 dc words here == 2*16 = doubleword ; load up 2 dc words here == 2*16 = doubleword
movd xmm4, [rdx] movd xmm4, [rdx]
@@ -398,10 +400,10 @@ sym(idct_dequant_dc_0_2x_sse2):
psraw xmm4, 3 psraw xmm4, 3
; Predict buffer needs to be expanded from bytes to words ; Predict buffer needs to be expanded from bytes to words
punpcklbw xmm0, xmm7 punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm7 punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm7 punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm7 punpcklbw xmm3, xmm5
; Add to predict buffer ; Add to predict buffer
paddw xmm0, xmm4 paddw xmm0, xmm4
@@ -410,10 +412,10 @@ sym(idct_dequant_dc_0_2x_sse2):
paddw xmm3, xmm4 paddw xmm3, xmm4
; pack up before storing ; pack up before storing
packuswb xmm0, xmm7 packuswb xmm0, xmm5
packuswb xmm1, xmm7 packuswb xmm1, xmm5
packuswb xmm2, xmm7 packuswb xmm2, xmm5
packuswb xmm3, xmm7 packuswb xmm3, xmm5
; Load destination stride before writing out, ; Load destination stride before writing out,
; doesn't need to persist ; doesn't need to persist
@@ -441,6 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -692,6 +695,7 @@ sym(idct_dequant_dc_full_2x_sse2):
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret

View File

@@ -39,6 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 6 SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -107,6 +108,7 @@ filter_block1d8_h6_rowloop_ssse3:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -162,6 +164,7 @@ filter_block1d8_h4_rowloop_ssse3:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -286,6 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 6 SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -393,6 +397,7 @@ filter_block1d4_h4_rowloop_ssse3:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -413,6 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 6 SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -508,6 +514,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -580,6 +587,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -598,6 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 6 SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -670,6 +679,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -718,6 +728,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -808,6 +819,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret

View File

@@ -33,6 +33,7 @@
%define input rcx %define input rcx
%define output rdx %define output rdx
%define pitch r8 %define pitch r8
SAVE_XMM
%else %else
%define input rdi %define input rdi
%define output rsi %define output rsi
@@ -53,6 +54,7 @@
pop rbp pop rbp
%else %else
%ifidn __OUTPUT_FORMAT__,x64 %ifidn __OUTPUT_FORMAT__,x64
RESTORE_XMM
%endif %endif
%endif %endif
ret ret

View File

@@ -22,33 +22,33 @@ sym(vp8_block_error_xmm):
; end prologue ; end prologue
mov rsi, arg(0) ;coeff_ptr mov rsi, arg(0) ;coeff_ptr
mov rdi, arg(1) ;dcoef_ptr mov rdi, arg(1) ;dcoef_ptr
movdqa xmm3, [rsi]
movdqa xmm4, [rdi] movdqa xmm0, [rsi]
movdqa xmm5, [rsi+16] movdqa xmm1, [rdi]
movdqa xmm6, [rdi+16] movdqa xmm2, [rsi+16]
psubw xmm3, xmm4 movdqa xmm3, [rdi+16]
psubw xmm5, xmm6 psubw xmm0, xmm1
pmaddwd xmm3, xmm3 psubw xmm2, xmm3
pmaddwd xmm5, xmm5
paddd xmm3, xmm5 pmaddwd xmm0, xmm0
pmaddwd xmm2, xmm2
pxor xmm7, xmm7 paddd xmm0, xmm2
movdqa xmm0, xmm3
punpckldq xmm0, xmm7 pxor xmm5, xmm5
punpckhdq xmm3, xmm7 movdqa xmm1, xmm0
paddd xmm0, xmm3 punpckldq xmm0, xmm5
movdqa xmm3, xmm0 punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm0, 8 psrldq xmm0, 8
paddd xmm0, xmm3 paddd xmm0, xmm1
movq rax, xmm0 movq rax, xmm0
@@ -208,53 +208,54 @@ sym(vp8_mbblock_error_xmm_impl):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 3 SHADOW_ARGS_TO_STACK 3
SAVE_XMM ; 6
push rsi push rsi
push rdi push rdi
; end prolog ; end prolog
mov rsi, arg(0) ;coeff_ptr mov rsi, arg(0) ;coeff_ptr
pxor xmm7, xmm7 pxor xmm6, xmm6
mov rdi, arg(1) ;dcoef_ptr mov rdi, arg(1) ;dcoef_ptr
pxor xmm2, xmm2 pxor xmm4, xmm4
movd xmm1, dword ptr arg(2) ;dc movd xmm5, dword ptr arg(2) ;dc
por xmm1, xmm2 por xmm5, xmm4
pcmpeqw xmm1, xmm7 pcmpeqw xmm5, xmm6
mov rcx, 16 mov rcx, 16
mberror_loop: mberror_loop:
movdqa xmm3, [rsi] movdqa xmm0, [rsi]
movdqa xmm4, [rdi] movdqa xmm1, [rdi]
movdqa xmm5, [rsi+16] movdqa xmm2, [rsi+16]
movdqa xmm6, [rdi+16] movdqa xmm3, [rdi+16]
psubw xmm5, xmm6 psubw xmm2, xmm3
pmaddwd xmm5, xmm5 pmaddwd xmm2, xmm2
psubw xmm3, xmm4 psubw xmm0, xmm1
pand xmm3, xmm1 pand xmm0, xmm5
pmaddwd xmm3, xmm3 pmaddwd xmm0, xmm0
add rsi, 32 add rsi, 32
add rdi, 32 add rdi, 32
sub rcx, 1 sub rcx, 1
paddd xmm2, xmm5 paddd xmm4, xmm2
paddd xmm2, xmm3 paddd xmm4, xmm0
jnz mberror_loop jnz mberror_loop
movdqa xmm0, xmm2 movdqa xmm0, xmm4
punpckldq xmm0, xmm7 punpckldq xmm0, xmm6
punpckhdq xmm2, xmm7 punpckhdq xmm4, xmm6
paddd xmm0, xmm2 paddd xmm0, xmm4
movdqa xmm1, xmm0 movdqa xmm1, xmm0
psrldq xmm0, 8 psrldq xmm0, 8
@@ -265,6 +266,7 @@ mberror_loop:
pop rdi pop rdi
pop rsi pop rsi
; begin epilog ; begin epilog
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -342,7 +344,7 @@ sym(vp8_mbuverror_xmm_impl):
mov rdi, arg(1) ;d_ptr mov rdi, arg(1) ;d_ptr
mov rcx, 16 mov rcx, 16
pxor xmm7, xmm7 pxor xmm3, xmm3
mbuverror_loop: mbuverror_loop:
@@ -352,7 +354,7 @@ mbuverror_loop:
psubw xmm1, xmm2 psubw xmm1, xmm2
pmaddwd xmm1, xmm1 pmaddwd xmm1, xmm1
paddd xmm7, xmm1 paddd xmm3, xmm1
add rsi, 16 add rsi, 16
add rdi, 16 add rdi, 16
@@ -361,7 +363,7 @@ mbuverror_loop:
jnz mbuverror_loop jnz mbuverror_loop
pxor xmm0, xmm0 pxor xmm0, xmm0
movdqa xmm1, xmm7 movdqa xmm1, xmm3
movdqa xmm2, xmm1 movdqa xmm2, xmm1
punpckldq xmm1, xmm0 punpckldq xmm1, xmm0

View File

@@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 4 SHADOW_ARGS_TO_STACK 4
SAVE_XMM ; 6
push rsi push rsi
push rdi push rdi
; end prolog ; end prolog
@@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt):
lea rcx, [rsi+rax*8] lea rcx, [rsi+rax*8]
lea rcx, [rcx+rax*8] lea rcx, [rcx+rax*8]
pxor xmm7, xmm7 pxor xmm6, xmm6
x16x16sad_wmt_loop: x16x16sad_wmt_loop:
@@ -52,32 +53,33 @@ x16x16sad_wmt_loop:
punpcklbw xmm1, xmm3 punpcklbw xmm1, xmm3
psadbw xmm0, xmm1 psadbw xmm0, xmm1
movq xmm6, QWORD PTR [rsi+rax+8] movq xmm2, QWORD PTR [rsi+rax+8]
movq xmm3, QWORD PTR [rdi+rdx+8] movq xmm3, QWORD PTR [rdi+rdx+8]
lea rsi, [rsi+rax*2] lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2] lea rdi, [rdi+rdx*2]
punpcklbw xmm4, xmm6 punpcklbw xmm4, xmm2
punpcklbw xmm5, xmm3 punpcklbw xmm5, xmm3
psadbw xmm4, xmm5 psadbw xmm4, xmm5
paddw xmm7, xmm0 paddw xmm6, xmm0
paddw xmm7, xmm4 paddw xmm6, xmm4
cmp rsi, rcx cmp rsi, rcx
jne x16x16sad_wmt_loop jne x16x16sad_wmt_loop
movq xmm0, xmm7 movq xmm0, xmm6
psrldq xmm7, 8 psrldq xmm6, 8
paddw xmm0, xmm7 paddw xmm0, xmm6
movq rax, xmm0 movq rax, xmm0
; begin epilog ; begin epilog
pop rdi pop rdi
pop rsi pop rsi
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret

View File

@@ -39,8 +39,9 @@
%define ref_stride r9 %define ref_stride r9
%define end_ptr r10 %define end_ptr r10
%define ret_var r11 %define ret_var r11
%define result_ptr [rsp+8+4*8] %define result_ptr [rsp+40+4*8]
%define max_err [rsp+8+4*8] %define max_err [rsp+40+4*8]
SAVE_XMM
%else %else
%define src_ptr rdi %define src_ptr rdi
%define src_stride rsi %define src_stride rsi
@@ -72,6 +73,7 @@
pop rbp pop rbp
%else %else
%ifidn __OUTPUT_FORMAT__,x64 %ifidn __OUTPUT_FORMAT__,x64
RESTORE_XMM
%endif %endif
%endif %endif
ret ret
@@ -113,7 +115,8 @@
%define r2_ptr r11 %define r2_ptr r11
%define r3_ptr r8 %define r3_ptr r8
%define ref_stride r9 %define ref_stride r9
%define result_ptr [rsp+16+4*8] %define result_ptr [rsp+48+4*8]
SAVE_XMM
push rsi push rsi
LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
@@ -151,6 +154,7 @@
%else %else
%ifidn __OUTPUT_FORMAT__,x64 %ifidn __OUTPUT_FORMAT__,x64
pop rsi pop rsi
RESTORE_XMM
%endif %endif
%endif %endif
ret ret

View File

@@ -157,6 +157,7 @@ sym(vp8_sad16x16x3_ssse3):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 5 SHADOW_ARGS_TO_STACK 5
SAVE_XMM
push rsi push rsi
push rdi push rdi
push rcx push rcx
@@ -253,6 +254,7 @@ vp8_sad16x16x3_ssse3_store_off:
pop rcx pop rcx
pop rdi pop rdi
pop rsi pop rsi
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -268,6 +270,7 @@ sym(vp8_sad16x8x3_ssse3):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 5 SHADOW_ARGS_TO_STACK 5
SAVE_XMM
push rsi push rsi
push rdi push rdi
push rcx push rcx
@@ -361,6 +364,7 @@ vp8_sad16x8x3_ssse3_store_off:
pop rcx pop rcx
pop rdi pop rdi
pop rsi pop rsi
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret

View File

@@ -85,6 +85,7 @@ sym(vp8_get16x16var_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 6 SHADOW_ARGS_TO_STACK 6
SAVE_XMM
push rbx push rbx
push rsi push rsi
push rdi push rdi
@@ -206,6 +207,7 @@ var16loop:
pop rdi pop rdi
pop rsi pop rsi
pop rbx pop rbx
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -223,6 +225,7 @@ sym(vp8_get16x16pred_error_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 4 SHADOW_ARGS_TO_STACK 4
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -321,6 +324,7 @@ var16peloop:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -341,6 +345,7 @@ sym(vp8_get8x8var_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 6 SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -506,6 +511,7 @@ sym(vp8_get8x8var_sse2):
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -805,6 +811,7 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -906,6 +913,7 @@ vp8_half_horiz_vert_variance8x_h_1:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -1041,6 +1049,7 @@ sym(vp8_half_vert_variance8x_h_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -1127,6 +1136,7 @@ vp8_half_vert_variance8x_h_1:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret
@@ -1254,6 +1264,7 @@ sym(vp8_half_horiz_variance8x_h_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx GET_GOT rbx
push rsi push rsi
push rdi push rdi
@@ -1338,6 +1349,7 @@ vp8_half_horiz_variance8x_h_1:
pop rdi pop rdi
pop rsi pop rsi
RESTORE_GOT RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS UNSHADOW_ARGS
pop rbp pop rbp
ret ret

View File

@@ -260,12 +260,12 @@
%ifidn __OUTPUT_FORMAT__,x64 %ifidn __OUTPUT_FORMAT__,x64
; SAVE_XMM (no arguments)
;   Reserves 32 bytes of stack (rsp -= 32) and stores xmm6 at [rsp] and
;   xmm7 at [rsp+16]. Only defined when __OUTPUT_FORMAT__ is x64 (see the
;   enclosing %ifidn).
;   xmm6/xmm7 are callee-saved under the Microsoft x64 calling convention,
;   which is presumably why they are preserved here -- confirm.
;   Side effect: rsp moves down by 32, so any rsp-relative operand used after
;   this macro must be 32 bytes larger than before it.
;   This hunk changes the stores from movdqa to movdqu. movdqa faults on an
;   address that is not 16-byte aligned; movdqu has no alignment requirement.
;   NOTE(review): presumably rsp is not guaranteed to be 16-byte aligned at
;   every expansion site -- confirm against the callers' prologues.
%macro SAVE_XMM 0 %macro SAVE_XMM 0
sub rsp, 32 sub rsp, 32
movdqa XMMWORD PTR [rsp], xmm6 movdqu XMMWORD PTR [rsp], xmm6
movdqa XMMWORD PTR [rsp+16], xmm7 movdqu XMMWORD PTR [rsp+16], xmm7
%endmacro %endmacro
; RESTORE_XMM (no arguments)
;   Reloads xmm6 from [rsp] and xmm7 from [rsp+16], then releases the
;   32 bytes of stack (rsp += 32).
;   rsp must hold the same value it had right after the 32-byte save area
;   was reserved; anything pushed since then has to be popped first,
;   otherwise the wrong stack bytes are loaded into xmm6/xmm7.
;   This hunk changes the loads from movdqa to movdqu, which does not
;   require a 16-byte aligned address.
%macro RESTORE_XMM 0 %macro RESTORE_XMM 0
movdqa xmm6, XMMWORD PTR [rsp] movdqu xmm6, XMMWORD PTR [rsp]
movdqa xmm7, XMMWORD PTR [rsp+16] movdqu xmm7, XMMWORD PTR [rsp+16]
add rsp, 32 add rsp, 32
%endmacro %endmacro
%else %else