Merge pull request #519 from mstorsjo/push-xmm-registers

Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64

Reviewed by zhiliang
This commit is contained in:
volvet 2014-03-18 15:04:54 +08:00
commit fb1958ad13
24 changed files with 213 additions and 117 deletions

View File

@ -335,6 +335,82 @@ BITS 32
%endif
%endmacro
;***********************************************************************
; PUSH_XMM n
;   n = number of xmm registers the enclosing function uses (xmm0..xmm(n-1)).
;
;   WIN64 only (expands to nothing otherwise). When n > 6, reserves
;   16*(n-6) bytes of stack and spills xmm6..xmm(n-1) there with unaligned
;   stores; registers above xmm15 are never touched.
;   Records n in the assemble-time symbol xmm_num_regs so that a later
;   POP_XMM can undo exactly the same spill.
;   If push_num is defined it is advanced by two 8-byte slots per saved
;   register, so stack-argument offsets derived from push_num stay correct
;   after rsp has moved.
;***********************************************************************
%macro PUSH_XMM 1
%ifdef WIN64
    %assign xmm_num_regs %1
    %if xmm_num_regs > 6
        %ifdef push_num
            %assign push_num push_num + 2*(xmm_num_regs - 6)
        %endif
        sub     rsp, 16*(xmm_num_regs - 6)
        movdqu  [rsp], xmm6
    %endif
    ; xmm7..xmm15: slot k (counted from xmm6) lives at [rsp + 16*k]
    %assign %%reg 7
    %rep 9
        %if xmm_num_regs > %%reg
            movdqu  [rsp + 16*(%%reg - 6)], xmm %+ %%reg
        %endif
        %assign %%reg %%reg + 1
    %endrep
%endif
%endmacro
;***********************************************************************
; POP_XMM
;   Counterpart of PUSH_XMM. WIN64 only (expands to nothing otherwise).
;
;   Reloads xmm15 down to xmm6 - as many as the assemble-time symbol
;   xmm_num_regs says were spilled - from [rsp + 16*slot] and then releases
;   the 16*(xmm_num_regs-6) bytes of stack.
;   rsp must hold the same value it had right after the matching PUSH_XMM.
;   push_num is NOT adjusted back by this macro.
;***********************************************************************
%macro POP_XMM 0
%ifdef WIN64
    ; xmm15..xmm7, highest register first
    %assign %%reg 15
    %rep 9
        %if xmm_num_regs > %%reg
            movdqu  xmm %+ %%reg, [rsp + 16*(%%reg - 6)]
        %endif
        %assign %%reg %%reg - 1
    %endrep
    %if xmm_num_regs > 6
        movdqu  xmm6, [rsp]
        add     rsp, 16*(xmm_num_regs - 6)
    %endif
%endif
%endmacro
%macro SIGN_EXTENSION 2
%ifndef X86_32
movsxd %1, %2

View File

@ -207,9 +207,6 @@ void WelsCPURestore (const uint32_t kuiCPU) {
}
}
/*
 * No-op stand-in with the WelsXmmRegProtectFunc signature: accepts a buffer
 * pointer and deliberately does nothing with it.
 */
void WelsXmmRegEmptyOp (void* pSrc) {
  (void) pSrc;  /* intentionally unused */
}
#elif defined(HAVE_NEON) //For supporting both android platform and iOS platform
#if defined(ANDROID_NDK)
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)

View File

@ -67,60 +67,14 @@ void WelsEmms();
*/
void WelsCPURestore (const uint32_t kuiCPU);
#ifdef WIN64
void WelsXmmRegStore(void * src);
void WelsXmmRegLoad(void * src);
#endif
#else
#define WelsEmms()
#endif
void WelsXmmRegEmptyOp(void * pSrc);
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
#if defined(__cplusplus)
}
#endif//__cplusplus
typedef void (*WelsXmmRegProtectFunc)(void * pSrc);
/*
 * XMMREG_PROTECT_* - helpers for saving/restoring xmm registers around a call.
 *
 *   XMMREG_PROTECT_DECLARE(name)  declares <name>load / <name>store function
 *                                 pointers and a 160-byte <name>Buffer.
 *   XMMREG_PROTECT_INIT(name)     points them at WelsXmmRegLoad/WelsXmmRegStore
 *                                 when WelsCPUFeatureDetect reports
 *                                 WELS_CPU_SSE2, otherwise at WelsXmmRegEmptyOp.
 *   XMMREG_PROTECT_UNINIT(name)   nothing to release; expands to nothing.
 *   XMMREG_PROTECT_STORE(name)    calls <name>store(<name>Buffer).
 *   XMMREG_PROTECT_LOAD(name)     calls <name>load(<name>Buffer).
 *
 * Only active when both WIN64 and X86_ASM are defined; otherwise every macro
 * expands to nothing.
 *
 * The STORE/LOAD/DECLARE expansions already end in ';', so a call site that
 * appends its own ';' produces an extra empty statement/declaration.
 */
#if defined(WIN64) && defined(X86_ASM)
#define XMMREG_PROTECT_DECLARE(name) \
WelsXmmRegProtectFunc name##load;\
WelsXmmRegProtectFunc name##store;\
uint8_t name##Buffer[160];
#define XMMREG_PROTECT_INIT(name) \
{ \
uint32_t uiCpuFlag = WelsCPUFeatureDetect(NULL);\
if( uiCpuFlag & WELS_CPU_SSE2 ){\
name##load = WelsXmmRegLoad;\
name##store = WelsXmmRegStore; \
} else { \
name##load = WelsXmmRegEmptyOp; \
name##store = WelsXmmRegEmptyOp; \
} \
}
/* Intentionally empty. It must not end in a line continuation: a trailing
 * backslash here would splice the following #define into this macro's body. */
#define XMMREG_PROTECT_UNINIT(name)
#define XMMREG_PROTECT_STORE(name) \
name##store(name##Buffer);
#define XMMREG_PROTECT_LOAD(name) \
name##load(name##Buffer);
#else
#define XMMREG_PROTECT_DECLARE(name)
#define XMMREG_PROTECT_INIT(name)
#define XMMREG_PROTECT_UNINIT(name)
#define XMMREG_PROTECT_STORE(name)
#define XMMREG_PROTECT_LOAD(name)
#endif
#endif//WELS_CPU_DETECTION_H__

View File

@ -210,44 +210,3 @@ WELS_EXTERN WelsEmms
emms ; empty mmx technology states
ret
%ifdef WIN64
WELS_EXTERN WelsXmmRegStore
ALIGN 16
;******************************************************************************************
;   void WelsXmmRegStore(void *src)
;
;   In:   rcx = src, a buffer of at least 160 bytes (10 registers * 16 bytes).
;         Despite the parameter name, the buffer is written, not read.
;   Out:  xmm6..xmm15 copied to src[0..159], xmm(6+k) at byte offset 16*k.
;   Unaligned stores are used, so src needs no particular alignment.
;******************************************************************************************
WelsXmmRegStore:
    movdqu  [rcx],        xmm6
    movdqu  [rcx + 16*1], xmm7
    movdqu  [rcx + 16*2], xmm8
    movdqu  [rcx + 16*3], xmm9
    movdqu  [rcx + 16*4], xmm10
    movdqu  [rcx + 16*5], xmm11
    movdqu  [rcx + 16*6], xmm12
    movdqu  [rcx + 16*7], xmm13
    movdqu  [rcx + 16*8], xmm14
    movdqu  [rcx + 16*9], xmm15
    ret
WELS_EXTERN WelsXmmRegLoad
ALIGN 16
;******************************************************************************************
;   void WelsXmmRegLoad(void *src)
;
;   In:   rcx = src, a buffer of at least 160 bytes laid out as xmm(6+k) at
;         byte offset 16*k (the layout WelsXmmRegStore writes).
;   Out:  xmm6..xmm15 reloaded from src.
;   Unaligned loads are used, so src needs no particular alignment.
;******************************************************************************************
WelsXmmRegLoad:
    movdqu  xmm6,  [rcx]
    movdqu  xmm7,  [rcx + 16*1]
    movdqu  xmm8,  [rcx + 16*2]
    movdqu  xmm9,  [rcx + 16*3]
    movdqu  xmm10, [rcx + 16*4]
    movdqu  xmm11, [rcx + 16*5]
    movdqu  xmm12, [rcx + 16*6]
    movdqu  xmm13, [rcx + 16*7]
    movdqu  xmm14, [rcx + 16*8]
    movdqu  xmm15, [rcx + 16*9]
    ret
%endif

View File

@ -63,6 +63,7 @@ SECTION .text
WELS_EXTERN DeblockLumaLt4V_ssse3
push rbp
mov r11,[rsp + 16 + 20h] ; pTC
PUSH_XMM 16
sub rsp,1B0h
lea rbp,[rsp+20h]
movd xmm4,r8d
@ -311,6 +312,7 @@ WELS_EXTERN DeblockLumaLt4V_ssse3
movdqa [r12+rcx],xmm0
mov r12,qword [rbp+180h]
lea rsp,[rbp+190h]
POP_XMM
pop rbp
ret
@ -779,6 +781,7 @@ WELS_EXTERN DeblockChromaLt4V_ssse3
mov rax,rsp
push rbx
push rdi
PUSH_XMM 16
sub rsp,0C8h
mov r10,qword [rax + 30h] ; pTC
pxor xmm1,xmm1
@ -833,7 +836,7 @@ WELS_EXTERN DeblockChromaLt4V_ssse3
punpckhbw xmm2,xmm1
punpcklbw xmm14,xmm1
movd xmm0,eax
movsx eax,word [rsp + 0C8h + 38h] ; iBeta
movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
punpckhbw xmm13,xmm1
punpckhbw xmm15,xmm1
movdqa xmm3,xmm9
@ -929,6 +932,7 @@ WELS_EXTERN DeblockChromaLt4V_ssse3
movq [rdi],xmm14
movaps xmm14,[rsp+30h]
mov rsp,r11
POP_XMM
pop rdi
pop rbx
ret
@ -937,6 +941,7 @@ WELS_EXTERN DeblockChromaLt4V_ssse3
WELS_EXTERN DeblockChromaEq4V_ssse3
mov rax,rsp
push rbx
PUSH_XMM 15
sub rsp,90h
pxor xmm1,xmm1
mov r11,rcx
@ -973,7 +978,7 @@ WELS_EXTERN DeblockChromaEq4V_ssse3
punpcklbw xmm9,xmm1
punpckhbw xmm10,xmm1
movd xmm0,eax
movsx eax,word [rsp + 90h + 8h + 28h] ; iBeta
movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
punpckhbw xmm13,xmm1
movdqa xmm7,xmm12
punpcklwd xmm0,xmm0
@ -1079,6 +1084,7 @@ WELS_EXTERN DeblockChromaEq4V_ssse3
movaps xmm12,[r11-70h]
movaps xmm13,[r11-80h]
mov rsp,r11
POP_XMM
pop rbx
ret
@ -1090,6 +1096,7 @@ WELS_EXTERN DeblockChromaEq4H_ssse3
mov rax,rsp
mov [rax+20h],rbx
push rdi
PUSH_XMM 16
sub rsp,140h
mov rdi,rdx
lea eax,[r8*4]
@ -1182,7 +1189,7 @@ WELS_EXTERN DeblockChromaEq4H_ssse3
movd xmm0,eax
movdqa xmm4,xmm12
movdqa xmm8,xmm11
movsx eax,word [rsp+170h] ; iBeta
movsx eax,word [rsp+170h + 160] ; iBeta
punpcklwd xmm0,xmm0
punpcklbw xmm4,xmm1
punpckhbw xmm12,xmm1
@ -1340,9 +1347,9 @@ WELS_EXTERN DeblockChromaEq4H_ssse3
mov [rbx+r10*2],eax
mov eax,[rsp+7Ch]
mov [rdx+rbx],eax
lea r11,[rsp+140h]
mov rbx, [r11+28h]
mov rsp,r11
lea rsp,[rsp+140h]
POP_XMM
mov rbx, [rsp+28h]
pop rdi
ret
@ -1355,6 +1362,7 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
push rsi
push rdi
push r12
PUSH_XMM 16
sub rsp,170h
movsxd rsi,r8d
@ -1438,7 +1446,7 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
punpckhdq xmm7,xmm0
movdqa xmm0,xmm1
punpckldq xmm0,xmm5
mov rax, [rsp+1C8h] ; pTC
mov rax, [rsp+1C8h+160] ; pTC
punpckhdq xmm1,xmm5
movdqa xmm9,xmm6
punpckhqdq xmm6,xmm0
@ -1476,7 +1484,7 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
punpckhbw xmm9,xmm1
punpckhbw xmm8,xmm1
punpcklwd xmm0,xmm0
movsx eax,word [rsp+1C0h] ; iBeta
movsx eax,word [rsp+1C0h+160] ; iBeta
mov word [rsp+4],r8w
mov word [rsp+2],r9w
pshufd xmm12,xmm0,0
@ -1620,6 +1628,7 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
mov [r10+rbp],eax
lea r11,[rsp+170h]
mov rsp,r11
POP_XMM
pop r12
pop rdi
pop rsi
@ -5132,6 +5141,7 @@ WELS_EXTERN DeblockLumaTransposeH2V_sse2
%assign push_num 3
LOAD_3_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@ -5188,6 +5198,7 @@ WELS_EXTERN DeblockLumaTransposeH2V_sse2
movdqa [r2 + 70h], xmm0
mov r7, r5
POP_XMM
pop r5
pop r4
pop r3
@ -5206,6 +5217,7 @@ WELS_EXTERN DeblockLumaTransposeV2H_sse2
%assign push_num 2
LOAD_3_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@ -5263,6 +5275,7 @@ WELS_EXTERN DeblockLumaTransposeV2H_sse2
mov r7, r4
POP_XMM
pop r4
pop r3
ret

View File

@ -360,6 +360,7 @@ WELS_EXTERN ExpandPictureLuma_sse2
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
@ -461,6 +462,7 @@ WELS_EXTERN ExpandPictureLuma_sse2
; for left & right border expanding
exp_cross_sse2 32,a
POP_XMM
LOAD_4_PARA_POP
pop r6
@ -486,6 +488,7 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r2,r2d
@ -586,6 +589,7 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
; for left & right border expanding
exp_cross_sse2 16,a
POP_XMM
LOAD_4_PARA_POP
pop r6
@ -610,6 +614,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r2,r2d
@ -710,6 +715,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
; for left & right border expanding
exp_cross_sse2 16,u
POP_XMM
LOAD_4_PARA_POP
pop r6

View File

@ -67,6 +67,7 @@ WELS_EXTERN WelsCopy16x16_sse2
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@ -112,6 +113,7 @@ WELS_EXTERN WelsCopy16x16_sse2
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
@ -129,6 +131,7 @@ WELS_EXTERN WelsCopy16x16NotAligned_sse2
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@ -174,6 +177,7 @@ WELS_EXTERN WelsCopy16x16NotAligned_sse2
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
@ -191,6 +195,7 @@ WELS_EXTERN WelsCopy16x8NotAligned_sse2
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@ -214,6 +219,7 @@ WELS_EXTERN WelsCopy16x8NotAligned_sse2
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4

View File

@ -149,6 +149,7 @@ WELS_EXTERN McChromaWidthEq4_mmx
WELS_EXTERN McChromaWidthEq8_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
@ -208,6 +209,7 @@ WELS_EXTERN McChromaWidthEq8_sse2
dec r5
jnz near .xloop
POP_XMM
LOAD_6_PARA_POP
ret
@ -226,6 +228,7 @@ WELS_EXTERN McChromaWidthEq8_sse2
WELS_EXTERN McChromaWidthEq8_ssse3
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
@ -282,6 +285,7 @@ WELS_EXTERN McChromaWidthEq8_ssse3
sub r5, 2
jnz .hloop_chroma
POP_XMM
LOAD_6_PARA_POP
ret

View File

@ -171,6 +171,7 @@ SECTION .text
WELS_EXTERN McHorVer22Width8HorFirst_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -207,6 +208,7 @@ WELS_EXTERN McHorVer22Width8HorFirst_sse2
add r2, r3
dec r4
jnz .yloop_width_8
POP_XMM
LOAD_5_PARA_POP
ret
@ -221,6 +223,7 @@ WELS_EXTERN McHorVer22Width8HorFirst_sse2
WELS_EXTERN McHorVer20WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -261,6 +264,7 @@ WELS_EXTERN McHorVer20WidthEq8_sse2
dec r4
jnz near .y_loop
POP_XMM
LOAD_5_PARA_POP
ret
@ -275,6 +279,7 @@ WELS_EXTERN McHorVer20WidthEq8_sse2
WELS_EXTERN McHorVer20WidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -341,6 +346,7 @@ WELS_EXTERN McHorVer20WidthEq16_sse2
dec r4
jnz near .y_loop
POP_XMM
LOAD_5_PARA_POP
ret
@ -355,6 +361,7 @@ WELS_EXTERN McHorVer20WidthEq16_sse2
WELS_EXTERN McHorVer02WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -424,6 +431,7 @@ WELS_EXTERN McHorVer02WidthEq8_sse2
jmp near .start
.xx_exit:
POP_XMM
LOAD_5_PARA_POP
ret
@ -446,6 +454,7 @@ SECTION .text
WELS_EXTERN McHorVer02Height9Or17_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -563,6 +572,7 @@ WELS_EXTERN McHorVer02Height9Or17_sse2
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret
@ -579,6 +589,7 @@ WELS_EXTERN McHorVer02Height9Or17_sse2
WELS_EXTERN McHorVer20Width9Or17_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -639,6 +650,7 @@ WELS_EXTERN McHorVer20Width9Or17_sse2
add r2, r3
dec r5
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
ret
@ -720,6 +732,7 @@ WELS_EXTERN McHorVer20Width9Or17_sse2
add r2, r3
dec r5
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
ret
@ -736,6 +749,7 @@ WELS_EXTERN McHorVer20Width9Or17_sse2
WELS_EXTERN McHorVer22HorFirst_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -792,6 +806,7 @@ WELS_EXTERN McHorVer22HorFirst_sse2
add r2, r3
dec r5
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
ret
@ -866,6 +881,7 @@ WELS_EXTERN McHorVer22HorFirst_sse2
add r2, r3
dec r5
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
ret
@ -903,6 +919,7 @@ WELS_EXTERN McHorVer22HorFirst_sse2
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -1016,6 +1033,7 @@ WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret
@ -1032,6 +1050,7 @@ WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@ -1144,5 +1163,6 @@ WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
ret

View File

@ -158,6 +158,7 @@ SECTION .text
WELS_EXTERN WelsSampleSatd4x4_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movd xmm0, [r0]
@ -219,6 +220,7 @@ WELS_EXTERN WelsSampleSatd4x4_sse2
movd retrd, xmm6
and retrd, 0xffff
shr retrd, 1
POP_XMM
LOAD_4_PARA_POP
ret
@ -230,6 +232,7 @@ WELS_EXTERN WelsSampleSatd4x4_sse2
WELS_EXTERN WelsSampleSatd8x8_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@ -238,6 +241,7 @@ WELS_EXTERN WelsSampleSatd8x8_sse2
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
POP_XMM
LOAD_4_PARA_POP
ret
@ -249,6 +253,7 @@ WELS_EXTERN WelsSampleSatd8x8_sse2
WELS_EXTERN WelsSampleSatd8x16_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@ -262,6 +267,7 @@ WELS_EXTERN WelsSampleSatd8x16_sse2
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
POP_XMM
LOAD_4_PARA_POP
ret
@ -273,6 +279,7 @@ WELS_EXTERN WelsSampleSatd8x16_sse2
WELS_EXTERN WelsSampleSatd16x8_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@ -291,6 +298,7 @@ WELS_EXTERN WelsSampleSatd16x8_sse2
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
POP_XMM
LOAD_4_PARA_POP
ret
@ -302,6 +310,7 @@ WELS_EXTERN WelsSampleSatd16x8_sse2
WELS_EXTERN WelsSampleSatd16x16_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@ -328,6 +337,7 @@ WELS_EXTERN WelsSampleSatd16x16_sse2
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
POP_XMM
LOAD_4_PARA_POP
ret
@ -976,6 +986,7 @@ return_sad_intra_16x16_x3:
WELS_EXTERN WelsSampleSatd4x4_sse41
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm4,[HSwapSumSubDB1]
@ -1017,6 +1028,7 @@ WELS_EXTERN WelsSampleSatd4x4_sse41
pabsw xmm2,xmm2
pmaxsw xmm0,xmm2
SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
ret
@ -1032,6 +1044,7 @@ WELS_EXTERN WelsSampleSatd8x8_sse41
%endif
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [HSumSubDB1]
@ -1043,6 +1056,7 @@ WELS_EXTERN WelsSampleSatd8x8_sse41
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@ -1063,6 +1077,7 @@ WELS_EXTERN WelsSampleSatd8x16_sse41
%endif
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [HSumSubDB1]
@ -1078,6 +1093,7 @@ loop_get_satd_8x16:
cmp r6, 4
jl loop_get_satd_8x16
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r6
@ -1098,6 +1114,7 @@ WELS_EXTERN WelsSampleSatd16x8_sse41
%endif
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@ -1121,6 +1138,7 @@ WELS_EXTERN WelsSampleSatd16x8_sse41
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@ -1142,6 +1160,7 @@ WELS_EXTERN WelsSampleSatd16x16_sse41
%endif
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
@ -1174,6 +1193,7 @@ loop_get_satd_16x16_right:
cmp r6, 4
jl loop_get_satd_16x16_right
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r6
@ -1261,6 +1281,7 @@ WELS_EXTERN WelsSampleSad16x16_sse2
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
lea r4, [3*r1]
@ -1280,6 +1301,7 @@ WELS_EXTERN WelsSampleSad16x16_sse2
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd retrd, xmm0
POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@ -1322,6 +1344,7 @@ WELS_EXTERN WelsSampleSad16x8_sse2
WELS_EXTERN WelsSampleSad8x16_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@ -1340,6 +1363,7 @@ WELS_EXTERN WelsSampleSad8x16_sse2
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd retrd, xmm0
POP_XMM
LOAD_4_PARA_POP
ret
@ -1362,6 +1386,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
push r5
%endif
%assign push_num 3
PUSH_XMM 8
mov r0, arg1
mov r1, arg2
SIGN_EXTENSION r1, r1d
@ -1454,6 +1479,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd retrd, xmm0
POP_XMM
%ifdef X86_32
pop r5
pop r4
@ -1466,6 +1492,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
pop r2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@ -1476,6 +1503,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd retrd, xmm0
POP_XMM
LOAD_4_PARA_POP
.return:
ret
@ -1510,6 +1538,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
WELS_EXTERN WelsSampleSadFour16x16_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@ -1620,6 +1649,7 @@ WELS_EXTERN WelsSampleSadFour16x16_sse2
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
POP_XMM
LOAD_5_PARA_POP
ret
@ -1627,6 +1657,7 @@ WELS_EXTERN WelsSampleSadFour16x16_sse2
WELS_EXTERN WelsSampleSadFour16x8_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@ -1705,12 +1736,14 @@ WELS_EXTERN WelsSampleSadFour16x8_sse2
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
POP_XMM
LOAD_5_PARA_POP
ret
WELS_EXTERN WelsSampleSadFour8x16_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@ -1915,6 +1948,7 @@ WELS_EXTERN WelsSampleSadFour8x16_sse2
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
POP_XMM
LOAD_5_PARA_POP
ret
@ -1922,6 +1956,7 @@ WELS_EXTERN WelsSampleSadFour8x16_sse2
WELS_EXTERN WelsSampleSadFour8x8_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@ -2035,6 +2070,7 @@ WELS_EXTERN WelsSampleSadFour8x8_sse2
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
POP_XMM
LOAD_5_PARA_POP
ret

View File

@ -149,6 +149,7 @@ WELS_EXTERN AnalysisVaaInfoIntra_sse2
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1,r1d
%ifdef X86_32
@ -232,6 +233,7 @@ WELS_EXTERN AnalysisVaaInfoIntra_sse2
pop r4
pop r3
%endif
POP_XMM
ret
@ -242,6 +244,7 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1,r1d
%ifdef X86_32
@ -325,6 +328,7 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
pop r4
pop r3
%endif
POP_XMM
ret

View File

@ -55,6 +55,7 @@ SECTION .text
WELS_EXTERN WelsResBlockZero16x16_sse2
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
lea r1, [r1*2]
lea r2, [r1*3]
@ -116,6 +117,7 @@ WELS_EXTERN WelsResBlockZero16x16_sse2
movdqa [r0+r2], xmm7
movdqa [r0+r2+10h], xmm7
POP_XMM
ret
@ -125,6 +127,7 @@ WELS_EXTERN WelsResBlockZero16x16_sse2
WELS_EXTERN WelsResBlockZero8x8_sse2
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
lea r1, [r1*2]
lea r2, [r1*3]
@ -143,5 +146,6 @@ WELS_EXTERN WelsResBlockZero8x8_sse2
movdqa [r0+r2], xmm7
POP_XMM
ret

View File

@ -223,6 +223,7 @@ WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
push r4
%assign push_num 2
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r4, r0 ; save r0 in r4
sub r0, 1
@ -302,6 +303,7 @@ get_i16x16_luma_pred_plane_sse2_1:
cmp r2, 16
jnz get_i16x16_luma_pred_plane_sse2_1
POP_XMM
pop r4
pop r3
ret
@ -387,6 +389,7 @@ WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
push r4
%assign push_num 2
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r4, r0
sub r0, 1
@ -465,6 +468,7 @@ get_i_chroma_pred_plane_sse2_1:
cmp r2, 8
jnz get_i_chroma_pred_plane_sse2_1
POP_XMM
pop r4
pop r3
WELSEMMS
@ -1181,6 +1185,7 @@ WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r2, r0
sub r2, r1
@ -1243,6 +1248,7 @@ WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
movdqa [r0+2*r1], xmm0
movdqa [r0+r2], xmm1
POP_XMM
ret
;*******************************************************************************
@ -1355,6 +1361,7 @@ WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r2, r0
sub r2, r1
@ -1384,6 +1391,7 @@ WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
movq [r0+r1], xmm0
movq [r0+2*r1], xmm0
movq [r0+r2], xmm0
POP_XMM
ret
;*******************************************************************************

View File

@ -104,8 +104,6 @@ IWelsTrace* m_pTrace;
void InitDecoder (void);
void UninitDecoder (void);
XMMREG_PROTECT_DECLARE(CWelsH264Decoder);
#ifdef OUTPUT_BIT_STREAM
WelsFileHandle* m_pFBS;
WelsFileHandle* m_pFBSSize;

View File

@ -101,7 +101,6 @@ CWelsDecoder::CWelsDecoder (void)
m_pTrace = CreateWelsTrace (Wels_Trace_Type);
IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::CWelsDecoder() entry");
XMMREG_PROTECT_INIT(CWelsH264Decoder);
#ifdef OUTPUT_BIT_STREAM
SWelsTime sCurTime;
@ -167,7 +166,6 @@ CWelsDecoder::~CWelsDecoder() {
IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::~CWelsDecoder()");
UninitDecoder();
XMMREG_PROTECT_UNINIT(CWelsH264Decoder);
#ifdef OUTPUT_BIT_STREAM
if (m_pFBS) {
@ -361,10 +359,8 @@ DECODING_STATE CWelsDecoder::DecodeFrame2 (const unsigned char* kpSrc,
m_pDecContext->iFeedbackTidInAu = -1; //initialize
XMMREG_PROTECT_STORE(CWelsH264Decoder);
WelsDecodeBs (m_pDecContext, kpSrc, kiSrcLen, (unsigned char**)ppDst,
pDstInfo); //iErrorCode has been modified in this function
XMMREG_PROTECT_LOAD(CWelsH264Decoder);
if (m_pDecContext->iErrorCode) {
ENalUnitType eNalType =

View File

@ -290,6 +290,7 @@ WELS_EXTERN WelsIDctT4Rec_mmx
WELS_EXTERN WelsDctFourT4_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
@ -327,6 +328,7 @@ WELS_EXTERN WelsDctFourT4_sse2
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
POP_XMM
LOAD_5_PARA_POP
ret
@ -337,6 +339,7 @@ WELS_EXTERN WelsDctFourT4_sse2
WELS_EXTERN WelsIDctFourT4Rec_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
;Load 4x8
@ -376,6 +379,7 @@ WELS_EXTERN WelsIDctFourT4Rec_sse2
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
POP_XMM
LOAD_5_PARA_POP
; pop esi
; pop ebx
@ -394,6 +398,7 @@ WELS_EXTERN WelsIDctFourT4Rec_sse2
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7
@ -430,6 +435,7 @@ WELS_EXTERN WelsIDctRecI16x16Dc_sse2
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
POP_XMM
LOAD_5_PARA_POP
ret
@ -468,6 +474,7 @@ WELS_EXTERN WelsIDctRecI16x16Dc_sse2
WELS_EXTERN WelsHadamardT4Dc_sse2
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
@ -493,4 +500,5 @@ WELS_EXTERN WelsHadamardT4Dc_sse2
movdqa [r0+ 0], xmm3
movdqa [r0+16], xmm2
POP_XMM
ret

View File

@ -229,6 +229,7 @@ WELS_EXTERN WelsI16x16LumaPredPlane_sse2
push r4
%assign push_num 2
LOAD_3_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
sub r1, 1
sub r1, r2
@ -304,6 +305,7 @@ get_i16x16_luma_pred_plane_sse2_1:
inc r3
cmp r3, 16
jnz get_i16x16_luma_pred_plane_sse2_1
POP_XMM
pop r4
pop r3
ret
@ -384,6 +386,7 @@ WELS_EXTERN WelsIChromaPredPlane_sse2
push r4
%assign push_num 2
LOAD_3_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
sub r1, 1
sub r1, r2
@ -458,6 +461,7 @@ get_i_chroma_pred_plane_sse2_1:
inc r3
cmp r3, 8
jnz get_i_chroma_pred_plane_sse2_1
POP_XMM
pop r4
pop r3
WELSEMMS

View File

@ -136,6 +136,7 @@ WELS_EXTERN WelsQuantFour4x4_sse2
WELS_EXTERN WelsQuantFour4x4Max_sse2
%assign push_num 0
LOAD_4_PARA
PUSH_XMM 8
MOVDQ xmm2, [r1]
MOVDQ xmm3, [r2]
@ -161,6 +162,7 @@ WELS_EXTERN WelsQuantFour4x4Max_sse2
pmaxsw xmm0, xmm1
movq [r3], xmm0
POP_XMM
LOAD_4_PARA_POP
ret

View File

@ -132,8 +132,6 @@ class CWelsH264SVCEncoder : public ISVCEncoder {
void InitEncoder (void);
int32_t RawData2SrcPic (const uint8_t* pSrc);
void DumpSrcPicture (const uint8_t* pSrc);
XMMREG_PROTECT_DECLARE(CWelsH264SVCEncoder);
};
}
#endif // !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)

View File

@ -138,7 +138,6 @@ CWelsH264SVCEncoder::CWelsH264SVCEncoder()
#endif//OUTPUT_BIT_STREAM
InitEncoder();
XMMREG_PROTECT_INIT(CWelsH264SVCEncoder);
}
CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
@ -172,7 +171,6 @@ CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
#endif//OUTPUT_BIT_STREAM
Uninitialize();
XMMREG_PROTECT_UNINIT(CWelsH264SVCEncoder);
}
void CWelsH264SVCEncoder::InitEncoder (void) {
@ -551,9 +549,7 @@ int CWelsH264SVCEncoder::EncodeFrameInternal(const SSourcePicture* pSrcPic, SFr
int32_t iFrameTypeReturned = 0;
int32_t iFrameType = videoFrameTypeInvalid;
XMMREG_PROTECT_STORE(CWelsH264SVCEncoder);
const int32_t kiEncoderReturn = WelsEncoderEncodeExt (m_pEncContext, pBsInfo, pSrcPic);
XMMREG_PROTECT_LOAD(CWelsH264SVCEncoder);
if(kiEncoderReturn == ENC_RETURN_MEMALLOCERR) {
WelsUninitEncoderExt (&m_pEncContext);

View File

@ -49,11 +49,9 @@ CAdaptiveQuantization::CAdaptiveQuantization (int32_t iCpuFlag) {
m_pfVar = NULL;
WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
WelsInitVarFunc (m_pfVar, m_CPUFlag);
XMMREG_PROTECT_INIT(AdaptiveQuantization);
}
CAdaptiveQuantization::~CAdaptiveQuantization() {
XMMREG_PROTECT_UNINIT(AdaptiveQuantization);
}
EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
@ -102,7 +100,6 @@ EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPix
pRefFrameTmp = pRefFrameY;
pCurFrameTmp = pCurFrameY;
for (i = 0; i < iMbWidth; i++) {
XMMREG_PROTECT_STORE(AdaptiveQuantization);
iSumDiff = pVaaCalcResults->pSad8x8[iMbIndex][0];
iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
@ -111,7 +108,6 @@ EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPix
iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
XMMREG_PROTECT_LOAD(AdaptiveQuantization);
iSumDiff = iSumDiff >> 8;
pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
@ -134,9 +130,7 @@ EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPix
pRefFrameTmp = pRefFrameY;
pCurFrameTmp = pCurFrameY;
for (i = 0; i < iMbWidth; i++) {
XMMREG_PROTECT_STORE(AdaptiveQuantization);
m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
XMMREG_PROTECT_LOAD(AdaptiveQuantization);
dAverageMotionIndex += pMotionTexture->uiMotionIndex;
dAverageTextureIndex += pMotionTexture->uiTextureIndex;
pMotionTexture++;

View File

@ -84,7 +84,6 @@ class CAdaptiveQuantization : public IStrategy {
PVarFunc m_pfVar;
int32_t m_CPUFlag;
SAdaptiveQuantizationParam m_sAdaptiveQuantParam;
XMMREG_PROTECT_DECLARE(AdaptiveQuantization);
};
WELSVP_NAMESPACE_END

View File

@ -176,6 +176,7 @@ WELS_EXTERN BilateralLumaFilter8_sse2
push r3
%assign push_num 1
LOAD_2_PARA
PUSH_XMM 8
pxor xmm7, xmm7
@ -212,6 +213,7 @@ WELS_EXTERN BilateralLumaFilter8_sse2
movq [r3], xmm5
POP_XMM
pop r3
%assign push_num 0

View File

@ -452,6 +452,7 @@ WELS_EXTERN SampleVariance16x16_sse2
push r15
%assign push_num 4
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r3,r3d
@ -521,6 +522,7 @@ WELS_EXTERN SampleVariance16x16_sse2
sub r1, r0
mov [r4+2], r1w ; to store uiTextureIndex
POP_XMM
LOAD_5_PARA_POP
pop r15
pop r14
@ -552,6 +554,7 @@ WELS_EXTERN VAACalcSad_sse2
push r13
%assign push_num 2
LOAD_7_PARA
PUSH_XMM 8
SIGN_EXTENSION r2,r2d
SIGN_EXTENSION r3,r3d
SIGN_EXTENSION r4,r4d
@ -619,6 +622,7 @@ width_loop:
%undef psadframe
%undef psad8x8
%undef pushsize
POP_XMM
LOAD_7_PARA_POP
pop r13
pop r12
@ -785,6 +789,7 @@ WELS_EXTERN VAACalcSadVar_sse2
push r14
push r15
%assign push_num 4
PUSH_XMM 8
%ifdef WIN64
mov r4, arg5 ;iPicStride
@ -880,6 +885,7 @@ var_width_loop:
paddd xmm7, xmm5
movd [r15], xmm7
POP_XMM
pop r15
pop r14
pop r13
@ -1082,6 +1088,7 @@ WELS_EXTERN VAACalcSadSsd_sse2
push r14
push r15
%assign push_num 4
PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
@ -1192,6 +1199,7 @@ sqdiff_width_loop:
mov r13, psadframe
movd [r13], xmm8
POP_XMM
pop r15
pop r14
pop r13
@ -1648,6 +1656,7 @@ WELS_EXTERN VAACalcSadBgd_sse2
push r14
push r15
%assign push_num 4
PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
; mov r5,arg6
@ -1773,6 +1782,7 @@ bgd_width_loop:
mov r13, psadframe
movd [r13], xmm8
POP_XMM
pop r15
pop r14
pop r13
@ -1821,6 +1831,7 @@ WELS_EXTERN VAACalcSadSsdBgd_sse2
push r14
push r15
%assign push_num 4
PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
;mov r5,arg6
@ -1993,6 +2004,7 @@ sqdiff_bgd_width_loop:
mov r14, psadframe
movd [r14], xmm8
POP_XMM
pop r15
pop r14
pop r13