Merge pull request #519 from mstorsjo/push-xmm-registers
Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64 Reviewed by zhiliang
This commit is contained in:
commit
fb1958ad13
@ -335,6 +335,82 @@ BITS 32
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
;***********************************************************************
; PUSH_XMM n
;
; Emits code only when WIN64 is defined; otherwise expands to nothing.
;
;   * Records n in xmm_num_regs so that a later POP_XMM knows what to
;     restore.
;   * When n > 6: reserves 16*(n-6) bytes below rsp and stores
;     xmm6 .. xmm(n-1) (at most up to xmm15) into that area, xmm6 at
;     [rsp], xmm7 at [rsp+16], and so on.  Unaligned stores (movdqu)
;     are used, so rsp needs no particular alignment here.
;   * If push_num is defined it is increased by 2 per reserved 16-byte
;     slot (presumably it counts 8-byte stack slots -- confirm against
;     the LOAD_x_PARA macros).
;
; Any rsp-relative operand written after this macro must account for
; the extra 16*(n-6) bytes.
;***********************************************************************
%macro PUSH_XMM 1
%ifdef WIN64
    %assign xmm_num_regs %1
    %if xmm_num_regs > 6
        %ifdef push_num
            %assign push_num push_num+2*(%1-6)
        %endif
        sub rsp, 16*(%1 - 6)
    %endif
    ; Walk xmm6..xmm15 and store each register that lies below the
    ; requested count; slot k of the save area holds xmm(6+k).
    %assign %%xmm_idx 6
    %rep 10
        %if xmm_num_regs > %%xmm_idx
            movdqu [rsp + 16*(%%xmm_idx - 6)], xmm %+ %%xmm_idx
        %endif
        %assign %%xmm_idx %%xmm_idx+1
    %endrep
%endif
%endmacro
|
||||
|
||||
;***********************************************************************
; POP_XMM
;
; Counterpart of PUSH_XMM; emits code only when WIN64 is defined.
;
; Reads xmm_num_regs (set by the textually preceding PUSH_XMM) and
; reloads xmm(n-1) .. xmm6 from the save area at [rsp], highest
; register first, then releases the 16*(n-6) bytes.  rsp must hold the
; same value it had immediately after PUSH_XMM when this macro runs.
; push_num is not modified.
;***********************************************************************
%macro POP_XMM 0
%ifdef WIN64
    ; Descend from xmm15 to xmm6, reloading only the saved registers.
    %assign %%xmm_idx 15
    %rep 10
        %if xmm_num_regs > %%xmm_idx
            movdqu xmm %+ %%xmm_idx, [rsp + 16*(%%xmm_idx - 6)]
        %endif
        %assign %%xmm_idx %%xmm_idx-1
    %endrep
    %if xmm_num_regs > 6
        add rsp, 16*(xmm_num_regs - 6)
    %endif
%endif
%endmacro
|
||||
|
||||
%macro SIGN_EXTENSION 2
|
||||
%ifndef X86_32
|
||||
movsxd %1, %2
|
||||
|
@ -207,9 +207,6 @@ void WelsCPURestore (const uint32_t kuiCPU) {
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Deliberate no-op with the same signature as the XMM store/load
 * routines; the argument is ignored and nothing is read or written.
 */
void WelsXmmRegEmptyOp (void* pSrc) {
  (void) pSrc;
}
|
||||
|
||||
#elif defined(HAVE_NEON) //For supporting both android platform and iOS platform
|
||||
#if defined(ANDROID_NDK)
|
||||
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
|
||||
|
@ -67,60 +67,14 @@ void WelsEmms();
|
||||
*/
|
||||
void WelsCPURestore (const uint32_t kuiCPU);
|
||||
|
||||
#ifdef WIN64
|
||||
void WelsXmmRegStore(void * src);
|
||||
void WelsXmmRegLoad(void * src);
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define WelsEmms()
|
||||
#endif
|
||||
|
||||
void WelsXmmRegEmptyOp(void * pSrc);
|
||||
|
||||
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif//__cplusplus
|
||||
|
||||
typedef void (*WelsXmmRegProtectFunc)(void * pSrc);
|
||||
|
||||
|
||||
/*
 * XMMREG_PROTECT_* helper macros.
 *
 * When both WIN64 and X86_ASM are defined they declare, initialise and
 * invoke a pair of function pointers (<name>store / <name>load) together
 * with a 160-byte scratch buffer (<name>Buffer, i.e. ten 16-byte slots).
 * In every other configuration all five macros expand to nothing.
 */
#if defined(WIN64) && defined(X86_ASM)

/* Declares the two function pointers and the 160-byte buffer. */
#define XMMREG_PROTECT_DECLARE(name) \
  WelsXmmRegProtectFunc name##load; \
  WelsXmmRegProtectFunc name##store; \
  uint8_t name##Buffer[160];

/*
 * Points <name>load / <name>store at WelsXmmRegLoad / WelsXmmRegStore when
 * WelsCPUFeatureDetect reports WELS_CPU_SSE2, otherwise at the no-op
 * WelsXmmRegEmptyOp.
 */
#define XMMREG_PROTECT_INIT(name) \
  { \
    uint32_t uiDetectedCpuFlags = WelsCPUFeatureDetect(NULL); \
    name##load = (uiDetectedCpuFlags & WELS_CPU_SSE2) ? WelsXmmRegLoad : WelsXmmRegEmptyOp; \
    name##store = (uiDetectedCpuFlags & WELS_CPU_SSE2) ? WelsXmmRegStore : WelsXmmRegEmptyOp; \
  }

/* Nothing to release; expands to nothing. */
#define XMMREG_PROTECT_UNINIT(name)

/* Calls <name>store on <name>Buffer (the trailing ';' is part of the macro). */
#define XMMREG_PROTECT_STORE(name) \
  name##store(name##Buffer);

/* Calls <name>load on <name>Buffer (the trailing ';' is part of the macro). */
#define XMMREG_PROTECT_LOAD(name) \
  name##load(name##Buffer);

#else

/* Configuration without WIN64 + X86_ASM: every macro is empty. */
#define XMMREG_PROTECT_DECLARE(name)
#define XMMREG_PROTECT_INIT(name)
#define XMMREG_PROTECT_UNINIT(name)
#define XMMREG_PROTECT_STORE(name)
#define XMMREG_PROTECT_LOAD(name)

#endif
|
||||
|
||||
#endif//WELS_CPU_DETECTION_H__
|
||||
|
@ -210,44 +210,3 @@ WELS_EXTERN WelsEmms
|
||||
emms ; empty mmx technology states
|
||||
ret
|
||||
|
||||
|
||||
%ifdef WIN64
|
||||
|
||||
WELS_EXTERN WelsXmmRegStore
|
||||
ALIGN 16
|
||||
;******************************************************************************************
|
||||
; void WelsXmmRegStore(void *src)
;
; Assembled only when WIN64 is defined.
; In:    rcx = src, start of a buffer of at least 160 bytes (10 x 16)
; Out:   xmm6..xmm15 copied to [rcx] .. [rcx+159], xmm6 first
; Notes: movdqu is used, so the buffer needs no 16-byte alignment;
;        no registers are modified.
|
||||
;******************************************************************************************
|
||||
WelsXmmRegStore:
|
||||
movdqu [rcx], xmm6           ; slot 0
|
||||
movdqu [rcx+16], xmm7        ; slot 1
|
||||
movdqu [rcx+32], xmm8        ; slot 2
|
||||
movdqu [rcx+48], xmm9        ; slot 3
|
||||
movdqu [rcx+64], xmm10       ; slot 4
|
||||
movdqu [rcx+80], xmm11       ; slot 5
|
||||
movdqu [rcx+96], xmm12       ; slot 6
|
||||
movdqu [rcx+112], xmm13      ; slot 7
|
||||
movdqu [rcx+128], xmm14      ; slot 8
|
||||
movdqu [rcx+144], xmm15      ; slot 9 (last; buffer ends at +160)
|
||||
ret
|
||||
|
||||
WELS_EXTERN WelsXmmRegLoad
|
||||
ALIGN 16
|
||||
;******************************************************************************************
|
||||
; void WelsXmmRegLoad(void *src)
;
; Assembled only when WIN64 is defined.
; In:    rcx = src, start of a 160-byte buffer laid out as written by
;        WelsXmmRegStore (xmm6 at offset 0, xmm15 at offset 144)
; Out:   xmm6..xmm15 reloaded from the buffer
; Notes: movdqu is used, so the buffer needs no 16-byte alignment;
;        memory is only read.
|
||||
;******************************************************************************************
|
||||
WelsXmmRegLoad:
|
||||
movdqu xmm6, [rcx]           ; slot 0
|
||||
movdqu xmm7, [rcx+16]        ; slot 1
|
||||
movdqu xmm8, [rcx+32]        ; slot 2
|
||||
movdqu xmm9, [rcx+48]        ; slot 3
|
||||
movdqu xmm10, [rcx+64]       ; slot 4
|
||||
movdqu xmm11, [rcx+80]       ; slot 5
|
||||
movdqu xmm12, [rcx+96]       ; slot 6
|
||||
movdqu xmm13, [rcx+112]      ; slot 7
|
||||
movdqu xmm14, [rcx+128]      ; slot 8
|
||||
movdqu xmm15, [rcx+144]      ; slot 9
|
||||
ret
|
||||
%endif
|
||||
|
||||
|
||||
|
@ -63,6 +63,7 @@ SECTION .text
|
||||
WELS_EXTERN DeblockLumaLt4V_ssse3
|
||||
push rbp
|
||||
mov r11,[rsp + 16 + 20h] ; pTC
|
||||
PUSH_XMM 16
|
||||
sub rsp,1B0h
|
||||
lea rbp,[rsp+20h]
|
||||
movd xmm4,r8d
|
||||
@ -311,6 +312,7 @@ WELS_EXTERN DeblockLumaLt4V_ssse3
|
||||
movdqa [r12+rcx],xmm0
|
||||
mov r12,qword [rbp+180h]
|
||||
lea rsp,[rbp+190h]
|
||||
POP_XMM
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@ -779,6 +781,7 @@ WELS_EXTERN DeblockChromaLt4V_ssse3
|
||||
mov rax,rsp
|
||||
push rbx
|
||||
push rdi
|
||||
PUSH_XMM 16
|
||||
sub rsp,0C8h
|
||||
mov r10,qword [rax + 30h] ; pTC
|
||||
pxor xmm1,xmm1
|
||||
@ -833,7 +836,7 @@ WELS_EXTERN DeblockChromaLt4V_ssse3
|
||||
punpckhbw xmm2,xmm1
|
||||
punpcklbw xmm14,xmm1
|
||||
movd xmm0,eax
|
||||
movsx eax,word [rsp + 0C8h + 38h] ; iBeta
|
||||
movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
|
||||
punpckhbw xmm13,xmm1
|
||||
punpckhbw xmm15,xmm1
|
||||
movdqa xmm3,xmm9
|
||||
@ -929,6 +932,7 @@ WELS_EXTERN DeblockChromaLt4V_ssse3
|
||||
movq [rdi],xmm14
|
||||
movaps xmm14,[rsp+30h]
|
||||
mov rsp,r11
|
||||
POP_XMM
|
||||
pop rdi
|
||||
pop rbx
|
||||
ret
|
||||
@ -937,6 +941,7 @@ WELS_EXTERN DeblockChromaLt4V_ssse3
|
||||
WELS_EXTERN DeblockChromaEq4V_ssse3
|
||||
mov rax,rsp
|
||||
push rbx
|
||||
PUSH_XMM 15
|
||||
sub rsp,90h
|
||||
pxor xmm1,xmm1
|
||||
mov r11,rcx
|
||||
@ -973,7 +978,7 @@ WELS_EXTERN DeblockChromaEq4V_ssse3
|
||||
punpcklbw xmm9,xmm1
|
||||
punpckhbw xmm10,xmm1
|
||||
movd xmm0,eax
|
||||
movsx eax,word [rsp + 90h + 8h + 28h] ; iBeta
|
||||
movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
|
||||
punpckhbw xmm13,xmm1
|
||||
movdqa xmm7,xmm12
|
||||
punpcklwd xmm0,xmm0
|
||||
@ -1079,6 +1084,7 @@ WELS_EXTERN DeblockChromaEq4V_ssse3
|
||||
movaps xmm12,[r11-70h]
|
||||
movaps xmm13,[r11-80h]
|
||||
mov rsp,r11
|
||||
POP_XMM
|
||||
pop rbx
|
||||
ret
|
||||
|
||||
@ -1090,6 +1096,7 @@ WELS_EXTERN DeblockChromaEq4H_ssse3
|
||||
mov rax,rsp
|
||||
mov [rax+20h],rbx
|
||||
push rdi
|
||||
PUSH_XMM 16
|
||||
sub rsp,140h
|
||||
mov rdi,rdx
|
||||
lea eax,[r8*4]
|
||||
@ -1182,7 +1189,7 @@ WELS_EXTERN DeblockChromaEq4H_ssse3
|
||||
movd xmm0,eax
|
||||
movdqa xmm4,xmm12
|
||||
movdqa xmm8,xmm11
|
||||
movsx eax,word [rsp+170h] ; iBeta
|
||||
movsx eax,word [rsp+170h + 160] ; iBeta
|
||||
punpcklwd xmm0,xmm0
|
||||
punpcklbw xmm4,xmm1
|
||||
punpckhbw xmm12,xmm1
|
||||
@ -1340,9 +1347,9 @@ WELS_EXTERN DeblockChromaEq4H_ssse3
|
||||
mov [rbx+r10*2],eax
|
||||
mov eax,[rsp+7Ch]
|
||||
mov [rdx+rbx],eax
|
||||
lea r11,[rsp+140h]
|
||||
mov rbx, [r11+28h]
|
||||
mov rsp,r11
|
||||
lea rsp,[rsp+140h]
|
||||
POP_XMM
|
||||
mov rbx, [rsp+28h]
|
||||
pop rdi
|
||||
ret
|
||||
|
||||
@ -1355,6 +1362,7 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
PUSH_XMM 16
|
||||
sub rsp,170h
|
||||
|
||||
movsxd rsi,r8d
|
||||
@ -1438,7 +1446,7 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
|
||||
punpckhdq xmm7,xmm0
|
||||
movdqa xmm0,xmm1
|
||||
punpckldq xmm0,xmm5
|
||||
mov rax, [rsp+1C8h] ; pTC
|
||||
mov rax, [rsp+1C8h+160] ; pTC
|
||||
punpckhdq xmm1,xmm5
|
||||
movdqa xmm9,xmm6
|
||||
punpckhqdq xmm6,xmm0
|
||||
@ -1476,7 +1484,7 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
|
||||
punpckhbw xmm9,xmm1
|
||||
punpckhbw xmm8,xmm1
|
||||
punpcklwd xmm0,xmm0
|
||||
movsx eax,word [rsp+1C0h] ; iBeta
|
||||
movsx eax,word [rsp+1C0h+160] ; iBeta
|
||||
mov word [rsp+4],r8w
|
||||
mov word [rsp+2],r9w
|
||||
pshufd xmm12,xmm0,0
|
||||
@ -1620,6 +1628,7 @@ WELS_EXTERN DeblockChromaLt4H_ssse3
|
||||
mov [r10+rbp],eax
|
||||
lea r11,[rsp+170h]
|
||||
mov rsp,r11
|
||||
POP_XMM
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
@ -5132,6 +5141,7 @@ WELS_EXTERN DeblockLumaTransposeH2V_sse2
|
||||
|
||||
%assign push_num 3
|
||||
LOAD_3_PARA
|
||||
PUSH_XMM 8
|
||||
|
||||
SIGN_EXTENSION r1, r1d
|
||||
|
||||
@ -5188,6 +5198,7 @@ WELS_EXTERN DeblockLumaTransposeH2V_sse2
|
||||
movdqa [r2 + 70h], xmm0
|
||||
|
||||
mov r7, r5
|
||||
POP_XMM
|
||||
pop r5
|
||||
pop r4
|
||||
pop r3
|
||||
@ -5206,6 +5217,7 @@ WELS_EXTERN DeblockLumaTransposeV2H_sse2
|
||||
|
||||
%assign push_num 2
|
||||
LOAD_3_PARA
|
||||
PUSH_XMM 8
|
||||
|
||||
SIGN_EXTENSION r1, r1d
|
||||
|
||||
@ -5263,6 +5275,7 @@ WELS_EXTERN DeblockLumaTransposeV2H_sse2
|
||||
|
||||
|
||||
mov r7, r4
|
||||
POP_XMM
|
||||
pop r4
|
||||
pop r3
|
||||
ret
|
||||
|
@ -360,6 +360,7 @@ WELS_EXTERN ExpandPictureLuma_sse2
|
||||
|
||||
%assign push_num 3
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 7
|
||||
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r2, r2d
|
||||
@ -461,6 +462,7 @@ WELS_EXTERN ExpandPictureLuma_sse2
|
||||
; for left & right border expanding
|
||||
exp_cross_sse2 32,a
|
||||
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
|
||||
pop r6
|
||||
@ -486,6 +488,7 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
|
||||
|
||||
%assign push_num 3
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 7
|
||||
|
||||
SIGN_EXTENSION r1,r1d
|
||||
SIGN_EXTENSION r2,r2d
|
||||
@ -586,6 +589,7 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
|
||||
; for left & right border expanding
|
||||
exp_cross_sse2 16,a
|
||||
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
|
||||
pop r6
|
||||
@ -610,6 +614,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
|
||||
|
||||
%assign push_num 3
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 7
|
||||
|
||||
SIGN_EXTENSION r1,r1d
|
||||
SIGN_EXTENSION r2,r2d
|
||||
@ -710,6 +715,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
|
||||
; for left & right border expanding
|
||||
exp_cross_sse2 16,u
|
||||
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
|
||||
pop r6
|
||||
|
@ -67,6 +67,7 @@ WELS_EXTERN WelsCopy16x16_sse2
|
||||
push r5
|
||||
%assign push_num 2
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
|
||||
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
|
||||
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
|
||||
@ -112,6 +113,7 @@ WELS_EXTERN WelsCopy16x16_sse2
|
||||
movdqa [r0+r1], xmm5
|
||||
movdqa [r0+2*r1], xmm6
|
||||
movdqa [r0+r4], xmm7
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
pop r5
|
||||
pop r4
|
||||
@ -129,6 +131,7 @@ WELS_EXTERN WelsCopy16x16NotAligned_sse2
|
||||
push r5
|
||||
%assign push_num 2
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
|
||||
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
|
||||
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
|
||||
@ -174,6 +177,7 @@ WELS_EXTERN WelsCopy16x16NotAligned_sse2
|
||||
movdqa [r0+r1], xmm5
|
||||
movdqa [r0+2*r1], xmm6
|
||||
movdqa [r0+r4], xmm7
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
pop r5
|
||||
pop r4
|
||||
@ -191,6 +195,7 @@ WELS_EXTERN WelsCopy16x8NotAligned_sse2
|
||||
push r5
|
||||
%assign push_num 2
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
|
||||
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
|
||||
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
|
||||
@ -214,6 +219,7 @@ WELS_EXTERN WelsCopy16x8NotAligned_sse2
|
||||
movdqa [r0+r1], xmm5
|
||||
movdqa [r0+2*r1], xmm6
|
||||
movdqa [r0+r4], xmm7
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
pop r5
|
||||
pop r4
|
||||
|
@ -149,6 +149,7 @@ WELS_EXTERN McChromaWidthEq4_mmx
|
||||
WELS_EXTERN McChromaWidthEq8_sse2
|
||||
%assign push_num 0
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r5, r5d
|
||||
@ -208,6 +209,7 @@ WELS_EXTERN McChromaWidthEq8_sse2
|
||||
dec r5
|
||||
jnz near .xloop
|
||||
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
|
||||
ret
|
||||
@ -226,6 +228,7 @@ WELS_EXTERN McChromaWidthEq8_sse2
|
||||
WELS_EXTERN McChromaWidthEq8_ssse3
|
||||
%assign push_num 0
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r5, r5d
|
||||
@ -282,6 +285,7 @@ WELS_EXTERN McChromaWidthEq8_ssse3
|
||||
sub r5, 2
|
||||
jnz .hloop_chroma
|
||||
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
|
||||
ret
|
||||
|
@ -171,6 +171,7 @@ SECTION .text
|
||||
WELS_EXTERN McHorVer22Width8HorFirst_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -207,6 +208,7 @@ WELS_EXTERN McHorVer22Width8HorFirst_sse2
|
||||
add r2, r3
|
||||
dec r4
|
||||
jnz .yloop_width_8
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
@ -221,6 +223,7 @@ WELS_EXTERN McHorVer22Width8HorFirst_sse2
|
||||
WELS_EXTERN McHorVer20WidthEq8_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -261,6 +264,7 @@ WELS_EXTERN McHorVer20WidthEq8_sse2
|
||||
dec r4
|
||||
jnz near .y_loop
|
||||
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
@ -275,6 +279,7 @@ WELS_EXTERN McHorVer20WidthEq8_sse2
|
||||
WELS_EXTERN McHorVer20WidthEq16_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -341,6 +346,7 @@ WELS_EXTERN McHorVer20WidthEq16_sse2
|
||||
dec r4
|
||||
jnz near .y_loop
|
||||
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
@ -355,6 +361,7 @@ WELS_EXTERN McHorVer20WidthEq16_sse2
|
||||
WELS_EXTERN McHorVer02WidthEq8_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -424,6 +431,7 @@ WELS_EXTERN McHorVer02WidthEq8_sse2
|
||||
jmp near .start
|
||||
|
||||
.xx_exit:
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
@ -446,6 +454,7 @@ SECTION .text
|
||||
WELS_EXTERN McHorVer02Height9Or17_sse2
|
||||
%assign push_num 0
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -563,6 +572,7 @@ WELS_EXTERN McHorVer02Height9Or17_sse2
|
||||
pop r13
|
||||
pop r12
|
||||
%endif
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
ret
|
||||
|
||||
@ -579,6 +589,7 @@ WELS_EXTERN McHorVer02Height9Or17_sse2
|
||||
WELS_EXTERN McHorVer20Width9Or17_sse2
|
||||
%assign push_num 0
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -639,6 +650,7 @@ WELS_EXTERN McHorVer20Width9Or17_sse2
|
||||
add r2, r3
|
||||
dec r5
|
||||
jnz .yloop_width_9
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
ret
|
||||
|
||||
@ -720,6 +732,7 @@ WELS_EXTERN McHorVer20Width9Or17_sse2
|
||||
add r2, r3
|
||||
dec r5
|
||||
jnz .yloop_width_17
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
ret
|
||||
|
||||
@ -736,6 +749,7 @@ WELS_EXTERN McHorVer20Width9Or17_sse2
|
||||
WELS_EXTERN McHorVer22HorFirst_sse2
|
||||
%assign push_num 0
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -792,6 +806,7 @@ WELS_EXTERN McHorVer22HorFirst_sse2
|
||||
add r2, r3
|
||||
dec r5
|
||||
jnz .yloop_width_9
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
ret
|
||||
|
||||
@ -866,6 +881,7 @@ WELS_EXTERN McHorVer22HorFirst_sse2
|
||||
add r2, r3
|
||||
dec r5
|
||||
jnz .yloop_width_17
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
ret
|
||||
|
||||
@ -903,6 +919,7 @@ WELS_EXTERN McHorVer22HorFirst_sse2
|
||||
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
|
||||
%assign push_num 0
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -1016,6 +1033,7 @@ WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
|
||||
pop r13
|
||||
pop r12
|
||||
%endif
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
ret
|
||||
|
||||
@ -1032,6 +1050,7 @@ WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
|
||||
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
|
||||
%assign push_num 0
|
||||
LOAD_6_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
@ -1144,5 +1163,6 @@ WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
|
||||
pop r13
|
||||
pop r12
|
||||
%endif
|
||||
POP_XMM
|
||||
LOAD_6_PARA_POP
|
||||
ret
|
||||
|
@ -158,6 +158,7 @@ SECTION .text
|
||||
WELS_EXTERN WelsSampleSatd4x4_sse2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
movd xmm0, [r0]
|
||||
@ -219,6 +220,7 @@ WELS_EXTERN WelsSampleSatd4x4_sse2
|
||||
movd retrd, xmm6
|
||||
and retrd, 0xffff
|
||||
shr retrd, 1
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
@ -230,6 +232,7 @@ WELS_EXTERN WelsSampleSatd4x4_sse2
|
||||
WELS_EXTERN WelsSampleSatd8x8_sse2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm6, xmm6
|
||||
@ -238,6 +241,7 @@ WELS_EXTERN WelsSampleSatd8x8_sse2
|
||||
psrlw xmm6, 1
|
||||
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
||||
movd retrd, xmm6
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
@ -249,6 +253,7 @@ WELS_EXTERN WelsSampleSatd8x8_sse2
|
||||
WELS_EXTERN WelsSampleSatd8x16_sse2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm6, xmm6
|
||||
@ -262,6 +267,7 @@ WELS_EXTERN WelsSampleSatd8x16_sse2
|
||||
psrlw xmm6, 1
|
||||
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
||||
movd retrd, xmm6
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
@ -273,6 +279,7 @@ WELS_EXTERN WelsSampleSatd8x16_sse2
|
||||
WELS_EXTERN WelsSampleSatd16x8_sse2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
push r0
|
||||
@ -291,6 +298,7 @@ WELS_EXTERN WelsSampleSatd16x8_sse2
|
||||
psrlw xmm6, 1
|
||||
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
||||
movd retrd, xmm6
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
@ -302,6 +310,7 @@ WELS_EXTERN WelsSampleSatd16x8_sse2
|
||||
WELS_EXTERN WelsSampleSatd16x16_sse2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
push r0
|
||||
@ -328,6 +337,7 @@ WELS_EXTERN WelsSampleSatd16x16_sse2
|
||||
psrlw xmm6, 1
|
||||
SSE2_SumWHorizon xmm6,xmm4,xmm7
|
||||
movd retrd, xmm6
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
@ -976,6 +986,7 @@ return_sad_intra_16x16_x3:
|
||||
WELS_EXTERN WelsSampleSatd4x4_sse41
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
movdqa xmm4,[HSwapSumSubDB1]
|
||||
@ -1017,6 +1028,7 @@ WELS_EXTERN WelsSampleSatd4x4_sse41
|
||||
pabsw xmm2,xmm2
|
||||
pmaxsw xmm0,xmm2
|
||||
SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
@ -1032,6 +1044,7 @@ WELS_EXTERN WelsSampleSatd8x8_sse41
|
||||
%endif
|
||||
%assign push_num 2
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
movdqa xmm7, [HSumSubDB1]
|
||||
@ -1043,6 +1056,7 @@ WELS_EXTERN WelsSampleSatd8x8_sse41
|
||||
lea r2, [r2+4*r3]
|
||||
SSE41_GetSatd8x4
|
||||
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
%ifdef X86_32
|
||||
pop r5
|
||||
@ -1063,6 +1077,7 @@ WELS_EXTERN WelsSampleSatd8x16_sse41
|
||||
%endif
|
||||
%assign push_num 3
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
movdqa xmm7, [HSumSubDB1]
|
||||
@ -1078,6 +1093,7 @@ loop_get_satd_8x16:
|
||||
cmp r6, 4
|
||||
jl loop_get_satd_8x16
|
||||
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
%ifdef X86_32
|
||||
pop r6
|
||||
@ -1098,6 +1114,7 @@ WELS_EXTERN WelsSampleSatd16x8_sse41
|
||||
%endif
|
||||
%assign push_num 2
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
push r0
|
||||
@ -1121,6 +1138,7 @@ WELS_EXTERN WelsSampleSatd16x8_sse41
|
||||
lea r2, [r2+4*r3]
|
||||
SSE41_GetSatd8x4
|
||||
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
%ifdef X86_32
|
||||
pop r5
|
||||
@ -1142,6 +1160,7 @@ WELS_EXTERN WelsSampleSatd16x16_sse41
|
||||
%endif
|
||||
%assign push_num 3
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
|
||||
@ -1174,6 +1193,7 @@ loop_get_satd_16x16_right:
|
||||
cmp r6, 4
|
||||
jl loop_get_satd_16x16_right
|
||||
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
%ifdef X86_32
|
||||
pop r6
|
||||
@ -1261,6 +1281,7 @@ WELS_EXTERN WelsSampleSad16x16_sse2
|
||||
|
||||
%assign push_num 2
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
lea r4, [3*r1]
|
||||
@ -1280,6 +1301,7 @@ WELS_EXTERN WelsSampleSad16x16_sse2
|
||||
movhlps xmm0, xmm7
|
||||
paddw xmm0, xmm7
|
||||
movd retrd, xmm0
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
%ifdef X86_32
|
||||
pop r5
|
||||
@ -1322,6 +1344,7 @@ WELS_EXTERN WelsSampleSad16x8_sse2
|
||||
WELS_EXTERN WelsSampleSad8x16_sse2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 7
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm6, xmm6
|
||||
@ -1340,6 +1363,7 @@ WELS_EXTERN WelsSampleSad8x16_sse2
|
||||
movhlps xmm0, xmm6
|
||||
paddw xmm0, xmm6
|
||||
movd retrd, xmm0
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
@ -1362,6 +1386,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
|
||||
push r5
|
||||
%endif
|
||||
%assign push_num 3
|
||||
PUSH_XMM 8
|
||||
mov r0, arg1
|
||||
mov r1, arg2
|
||||
SIGN_EXTENSION r1, r1d
|
||||
@ -1454,6 +1479,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
|
||||
movhlps xmm0, xmm7
|
||||
paddw xmm0, xmm7
|
||||
movd retrd, xmm0
|
||||
POP_XMM
|
||||
%ifdef X86_32
|
||||
pop r5
|
||||
pop r4
|
||||
@ -1466,6 +1492,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
|
||||
pop r2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 7
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm6, xmm6
|
||||
@ -1476,6 +1503,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
|
||||
movhlps xmm0, xmm6
|
||||
paddw xmm0, xmm6
|
||||
movd retrd, xmm0
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
.return:
|
||||
ret
|
||||
@ -1510,6 +1538,7 @@ WELS_EXTERN WelsSampleSad8x8_sse21
|
||||
WELS_EXTERN WelsSampleSadFour16x16_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
||||
@ -1620,6 +1649,7 @@ WELS_EXTERN WelsSampleSadFour16x16_sse2
|
||||
punpckldq xmm6, xmm7
|
||||
punpcklqdq xmm4, xmm6
|
||||
movdqa [r4],xmm4
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
@ -1627,6 +1657,7 @@ WELS_EXTERN WelsSampleSadFour16x16_sse2
|
||||
WELS_EXTERN WelsSampleSadFour16x8_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
||||
@ -1705,12 +1736,14 @@ WELS_EXTERN WelsSampleSadFour16x8_sse2
|
||||
punpckldq xmm6, xmm7
|
||||
punpcklqdq xmm4, xmm6
|
||||
movdqa [r4],xmm4
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
WELS_EXTERN WelsSampleSadFour8x16_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
||||
@ -1915,6 +1948,7 @@ WELS_EXTERN WelsSampleSadFour8x16_sse2
|
||||
punpckldq xmm6, xmm7
|
||||
punpcklqdq xmm4, xmm6
|
||||
movdqa [r4],xmm4
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
@ -1922,6 +1956,7 @@ WELS_EXTERN WelsSampleSadFour8x16_sse2
|
||||
WELS_EXTERN WelsSampleSadFour8x8_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
|
||||
@ -2035,6 +2070,7 @@ WELS_EXTERN WelsSampleSadFour8x8_sse2
|
||||
punpckldq xmm6, xmm7
|
||||
punpcklqdq xmm4, xmm6
|
||||
movdqa [r4],xmm4
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
|
@ -149,6 +149,7 @@ WELS_EXTERN AnalysisVaaInfoIntra_sse2
|
||||
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1,r1d
|
||||
|
||||
%ifdef X86_32
|
||||
@ -232,6 +233,7 @@ WELS_EXTERN AnalysisVaaInfoIntra_sse2
|
||||
pop r4
|
||||
pop r3
|
||||
%endif
|
||||
POP_XMM
|
||||
|
||||
ret
|
||||
|
||||
@ -242,6 +244,7 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
|
||||
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1,r1d
|
||||
|
||||
%ifdef X86_32
|
||||
@ -325,6 +328,7 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
|
||||
pop r4
|
||||
pop r3
|
||||
%endif
|
||||
POP_XMM
|
||||
|
||||
ret
|
||||
|
||||
|
@ -55,6 +55,7 @@ SECTION .text
|
||||
WELS_EXTERN WelsResBlockZero16x16_sse2
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
lea r1, [r1*2]
|
||||
lea r2, [r1*3]
|
||||
@ -116,6 +117,7 @@ WELS_EXTERN WelsResBlockZero16x16_sse2
|
||||
movdqa [r0+r2], xmm7
|
||||
movdqa [r0+r2+10h], xmm7
|
||||
|
||||
POP_XMM
|
||||
ret
|
||||
|
||||
|
||||
@ -125,6 +127,7 @@ WELS_EXTERN WelsResBlockZero16x16_sse2
|
||||
WELS_EXTERN WelsResBlockZero8x8_sse2
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
lea r1, [r1*2]
|
||||
lea r2, [r1*3]
|
||||
@ -143,5 +146,6 @@ WELS_EXTERN WelsResBlockZero8x8_sse2
|
||||
movdqa [r0+r2], xmm7
|
||||
|
||||
|
||||
POP_XMM
|
||||
ret
|
||||
|
||||
|
@ -223,6 +223,7 @@ WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
|
||||
push r4
|
||||
%assign push_num 2
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
mov r4, r0 ; save r0 in r4
|
||||
sub r0, 1
|
||||
@ -302,6 +303,7 @@ get_i16x16_luma_pred_plane_sse2_1:
|
||||
cmp r2, 16
|
||||
jnz get_i16x16_luma_pred_plane_sse2_1
|
||||
|
||||
POP_XMM
|
||||
pop r4
|
||||
pop r3
|
||||
ret
|
||||
@ -387,6 +389,7 @@ WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
|
||||
push r4
|
||||
%assign push_num 2
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
mov r4, r0
|
||||
sub r0, 1
|
||||
@ -465,6 +468,7 @@ get_i_chroma_pred_plane_sse2_1:
|
||||
cmp r2, 8
|
||||
jnz get_i_chroma_pred_plane_sse2_1
|
||||
|
||||
POP_XMM
|
||||
pop r4
|
||||
pop r3
|
||||
WELSEMMS
|
||||
@ -1181,6 +1185,7 @@ WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
|
||||
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
mov r2, r0
|
||||
sub r2, r1
|
||||
@ -1243,6 +1248,7 @@ WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
|
||||
movdqa [r0+2*r1], xmm0
|
||||
movdqa [r0+r2], xmm1
|
||||
|
||||
POP_XMM
|
||||
ret
|
||||
|
||||
;*******************************************************************************
|
||||
@ -1355,6 +1361,7 @@ WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
|
||||
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
mov r2, r0
|
||||
sub r2, r1
|
||||
@ -1384,6 +1391,7 @@ WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
|
||||
movq [r0+r1], xmm0
|
||||
movq [r0+2*r1], xmm0
|
||||
movq [r0+r2], xmm0
|
||||
POP_XMM
|
||||
ret
|
||||
|
||||
;*******************************************************************************
|
||||
|
@ -104,8 +104,6 @@ IWelsTrace* m_pTrace;
|
||||
void InitDecoder (void);
|
||||
void UninitDecoder (void);
|
||||
|
||||
XMMREG_PROTECT_DECLARE(CWelsH264Decoder);
|
||||
|
||||
#ifdef OUTPUT_BIT_STREAM
|
||||
WelsFileHandle* m_pFBS;
|
||||
WelsFileHandle* m_pFBSSize;
|
||||
|
@ -101,7 +101,6 @@ CWelsDecoder::CWelsDecoder (void)
|
||||
m_pTrace = CreateWelsTrace (Wels_Trace_Type);
|
||||
|
||||
IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::CWelsDecoder() entry");
|
||||
XMMREG_PROTECT_INIT(CWelsH264Decoder);
|
||||
|
||||
#ifdef OUTPUT_BIT_STREAM
|
||||
SWelsTime sCurTime;
|
||||
@ -167,7 +166,6 @@ CWelsDecoder::~CWelsDecoder() {
|
||||
IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::~CWelsDecoder()");
|
||||
|
||||
UninitDecoder();
|
||||
XMMREG_PROTECT_UNINIT(CWelsH264Decoder);
|
||||
|
||||
#ifdef OUTPUT_BIT_STREAM
|
||||
if (m_pFBS) {
|
||||
@ -361,10 +359,8 @@ DECODING_STATE CWelsDecoder::DecodeFrame2 (const unsigned char* kpSrc,
|
||||
|
||||
m_pDecContext->iFeedbackTidInAu = -1; //initialize
|
||||
|
||||
XMMREG_PROTECT_STORE(CWelsH264Decoder);
|
||||
WelsDecodeBs (m_pDecContext, kpSrc, kiSrcLen, (unsigned char**)ppDst,
|
||||
pDstInfo); //iErrorCode has been modified in this function
|
||||
XMMREG_PROTECT_LOAD(CWelsH264Decoder);
|
||||
|
||||
if (m_pDecContext->iErrorCode) {
|
||||
ENalUnitType eNalType =
|
||||
|
@ -290,6 +290,7 @@ WELS_EXTERN WelsIDctT4Rec_mmx
|
||||
WELS_EXTERN WelsDctFourT4_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r2, r2d
|
||||
SIGN_EXTENSION r4, r4d
|
||||
pxor xmm7, xmm7
|
||||
@ -327,6 +328,7 @@ WELS_EXTERN WelsDctFourT4_sse2
|
||||
lea r0, [r0+64]
|
||||
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
|
||||
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
@ -337,6 +339,7 @@ WELS_EXTERN WelsDctFourT4_sse2
|
||||
WELS_EXTERN WelsIDctFourT4Rec_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
;Load 4x8
|
||||
@ -376,6 +379,7 @@ WELS_EXTERN WelsIDctFourT4Rec_sse2
|
||||
lea r2, [r2 + 2 * r3]
|
||||
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
|
||||
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
; pop esi
|
||||
; pop ebx
|
||||
@ -394,6 +398,7 @@ WELS_EXTERN WelsIDctFourT4Rec_sse2
|
||||
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
|
||||
%assign push_num 0
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
pxor xmm7, xmm7
|
||||
@ -430,6 +435,7 @@ WELS_EXTERN WelsIDctRecI16x16Dc_sse2
|
||||
lea r0, [r0 + 2 * r1]
|
||||
lea r2, [r2 + 2 * r3]
|
||||
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
ret
|
||||
|
||||
@ -468,6 +474,7 @@ WELS_EXTERN WelsIDctRecI16x16Dc_sse2
|
||||
WELS_EXTERN WelsHadamardT4Dc_sse2
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
|
||||
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
|
||||
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
|
||||
@ -493,4 +500,5 @@ WELS_EXTERN WelsHadamardT4Dc_sse2
|
||||
movdqa [r0+ 0], xmm3
|
||||
movdqa [r0+16], xmm2
|
||||
|
||||
POP_XMM
|
||||
ret
|
||||
|
@ -229,6 +229,7 @@ WELS_EXTERN WelsI16x16LumaPredPlane_sse2
|
||||
push r4
|
||||
%assign push_num 2
|
||||
LOAD_3_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r2, r2d
|
||||
sub r1, 1
|
||||
sub r1, r2
|
||||
@ -304,6 +305,7 @@ get_i16x16_luma_pred_plane_sse2_1:
|
||||
inc r3
|
||||
cmp r3, 16
|
||||
jnz get_i16x16_luma_pred_plane_sse2_1
|
||||
POP_XMM
|
||||
pop r4
|
||||
pop r3
|
||||
ret
|
||||
@ -384,6 +386,7 @@ WELS_EXTERN WelsIChromaPredPlane_sse2
|
||||
push r4
|
||||
%assign push_num 2
|
||||
LOAD_3_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r2, r2d
|
||||
sub r1, 1
|
||||
sub r1, r2
|
||||
@ -458,6 +461,7 @@ get_i_chroma_pred_plane_sse2_1:
|
||||
inc r3
|
||||
cmp r3, 8
|
||||
jnz get_i_chroma_pred_plane_sse2_1
|
||||
POP_XMM
|
||||
pop r4
|
||||
pop r3
|
||||
WELSEMMS
|
||||
|
@ -136,6 +136,7 @@ WELS_EXTERN WelsQuantFour4x4_sse2
|
||||
WELS_EXTERN WelsQuantFour4x4Max_sse2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
PUSH_XMM 8
|
||||
MOVDQ xmm2, [r1]
|
||||
MOVDQ xmm3, [r2]
|
||||
|
||||
@ -161,6 +162,7 @@ WELS_EXTERN WelsQuantFour4x4Max_sse2
|
||||
pmaxsw xmm0, xmm1
|
||||
|
||||
movq [r3], xmm0
|
||||
POP_XMM
|
||||
LOAD_4_PARA_POP
|
||||
ret
|
||||
|
||||
|
@ -132,8 +132,6 @@ class CWelsH264SVCEncoder : public ISVCEncoder {
|
||||
void InitEncoder (void);
|
||||
int32_t RawData2SrcPic (const uint8_t* pSrc);
|
||||
void DumpSrcPicture (const uint8_t* pSrc);
|
||||
|
||||
XMMREG_PROTECT_DECLARE(CWelsH264SVCEncoder);
|
||||
};
|
||||
}
|
||||
#endif // !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
|
||||
|
@ -138,7 +138,6 @@ CWelsH264SVCEncoder::CWelsH264SVCEncoder()
|
||||
#endif//OUTPUT_BIT_STREAM
|
||||
|
||||
InitEncoder();
|
||||
XMMREG_PROTECT_INIT(CWelsH264SVCEncoder);
|
||||
}
|
||||
|
||||
CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
|
||||
@ -172,7 +171,6 @@ CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
|
||||
#endif//OUTPUT_BIT_STREAM
|
||||
|
||||
Uninitialize();
|
||||
XMMREG_PROTECT_UNINIT(CWelsH264SVCEncoder);
|
||||
}
|
||||
|
||||
void CWelsH264SVCEncoder::InitEncoder (void) {
|
||||
@ -551,9 +549,7 @@ int CWelsH264SVCEncoder::EncodeFrameInternal(const SSourcePicture* pSrcPic, SFr
|
||||
|
||||
int32_t iFrameTypeReturned = 0;
|
||||
int32_t iFrameType = videoFrameTypeInvalid;
|
||||
XMMREG_PROTECT_STORE(CWelsH264SVCEncoder);
|
||||
const int32_t kiEncoderReturn = WelsEncoderEncodeExt (m_pEncContext, pBsInfo, pSrcPic);
|
||||
XMMREG_PROTECT_LOAD(CWelsH264SVCEncoder);
|
||||
|
||||
if(kiEncoderReturn == ENC_RETURN_MEMALLOCERR) {
|
||||
WelsUninitEncoderExt (&m_pEncContext);
|
||||
|
@ -49,11 +49,9 @@ CAdaptiveQuantization::CAdaptiveQuantization (int32_t iCpuFlag) {
|
||||
m_pfVar = NULL;
|
||||
WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
|
||||
WelsInitVarFunc (m_pfVar, m_CPUFlag);
|
||||
XMMREG_PROTECT_INIT(AdaptiveQuantization);
|
||||
}
|
||||
|
||||
CAdaptiveQuantization::~CAdaptiveQuantization() {
|
||||
XMMREG_PROTECT_UNINIT(AdaptiveQuantization);
|
||||
}
|
||||
|
||||
EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
|
||||
@ -102,7 +100,6 @@ EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPix
|
||||
pRefFrameTmp = pRefFrameY;
|
||||
pCurFrameTmp = pCurFrameY;
|
||||
for (i = 0; i < iMbWidth; i++) {
|
||||
XMMREG_PROTECT_STORE(AdaptiveQuantization);
|
||||
iSumDiff = pVaaCalcResults->pSad8x8[iMbIndex][0];
|
||||
iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
|
||||
iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
|
||||
@ -111,7 +108,6 @@ EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPix
|
||||
iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
|
||||
uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
|
||||
iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
|
||||
XMMREG_PROTECT_LOAD(AdaptiveQuantization);
|
||||
|
||||
iSumDiff = iSumDiff >> 8;
|
||||
pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
|
||||
@ -134,9 +130,7 @@ EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPix
|
||||
pRefFrameTmp = pRefFrameY;
|
||||
pCurFrameTmp = pCurFrameY;
|
||||
for (i = 0; i < iMbWidth; i++) {
|
||||
XMMREG_PROTECT_STORE(AdaptiveQuantization);
|
||||
m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
|
||||
XMMREG_PROTECT_LOAD(AdaptiveQuantization);
|
||||
dAverageMotionIndex += pMotionTexture->uiMotionIndex;
|
||||
dAverageTextureIndex += pMotionTexture->uiTextureIndex;
|
||||
pMotionTexture++;
|
||||
|
@ -84,7 +84,6 @@ class CAdaptiveQuantization : public IStrategy {
|
||||
PVarFunc m_pfVar;
|
||||
int32_t m_CPUFlag;
|
||||
SAdaptiveQuantizationParam m_sAdaptiveQuantParam;
|
||||
XMMREG_PROTECT_DECLARE(AdaptiveQuantization);
|
||||
};
|
||||
|
||||
WELSVP_NAMESPACE_END
|
||||
|
@ -176,6 +176,7 @@ WELS_EXTERN BilateralLumaFilter8_sse2
|
||||
push r3
|
||||
%assign push_num 1
|
||||
LOAD_2_PARA
|
||||
PUSH_XMM 8
|
||||
|
||||
pxor xmm7, xmm7
|
||||
|
||||
@ -212,6 +213,7 @@ WELS_EXTERN BilateralLumaFilter8_sse2
|
||||
movq [r3], xmm5
|
||||
|
||||
|
||||
POP_XMM
|
||||
pop r3
|
||||
%assign push_num 0
|
||||
|
||||
|
@ -452,6 +452,7 @@ WELS_EXTERN SampleVariance16x16_sse2
|
||||
push r15
|
||||
%assign push_num 4
|
||||
LOAD_5_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1,r1d
|
||||
SIGN_EXTENSION r3,r3d
|
||||
|
||||
@ -521,6 +522,7 @@ WELS_EXTERN SampleVariance16x16_sse2
|
||||
sub r1, r0
|
||||
mov [r4+2], r1w ; to store uiTextureIndex
|
||||
|
||||
POP_XMM
|
||||
LOAD_5_PARA_POP
|
||||
pop r15
|
||||
pop r14
|
||||
@ -552,6 +554,7 @@ WELS_EXTERN VAACalcSad_sse2
|
||||
push r13
|
||||
%assign push_num 2
|
||||
LOAD_7_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r2,r2d
|
||||
SIGN_EXTENSION r3,r3d
|
||||
SIGN_EXTENSION r4,r4d
|
||||
@ -619,6 +622,7 @@ width_loop:
|
||||
%undef psadframe
|
||||
%undef psad8x8
|
||||
%undef pushsize
|
||||
POP_XMM
|
||||
LOAD_7_PARA_POP
|
||||
pop r13
|
||||
pop r12
|
||||
@ -785,6 +789,7 @@ WELS_EXTERN VAACalcSadVar_sse2
|
||||
push r14
|
||||
push r15
|
||||
%assign push_num 4
|
||||
PUSH_XMM 8
|
||||
|
||||
%ifdef WIN64
|
||||
mov r4, arg5 ;iPicStride
|
||||
@ -880,6 +885,7 @@ var_width_loop:
|
||||
paddd xmm7, xmm5
|
||||
movd [r15], xmm7
|
||||
|
||||
POP_XMM
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
@ -1082,6 +1088,7 @@ WELS_EXTERN VAACalcSadSsd_sse2
|
||||
push r14
|
||||
push r15
|
||||
%assign push_num 4
|
||||
PUSH_XMM 10
|
||||
|
||||
%ifdef WIN64
|
||||
mov r4,arg5
|
||||
@ -1192,6 +1199,7 @@ sqdiff_width_loop:
|
||||
mov r13, psadframe
|
||||
movd [r13], xmm8
|
||||
|
||||
POP_XMM
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
@ -1648,6 +1656,7 @@ WELS_EXTERN VAACalcSadBgd_sse2
|
||||
push r14
|
||||
push r15
|
||||
%assign push_num 4
|
||||
PUSH_XMM 10
|
||||
%ifdef WIN64
|
||||
mov r4,arg5
|
||||
; mov r5,arg6
|
||||
@ -1773,6 +1782,7 @@ bgd_width_loop:
|
||||
mov r13, psadframe
|
||||
movd [r13], xmm8
|
||||
|
||||
POP_XMM
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
@ -1821,6 +1831,7 @@ WELS_EXTERN VAACalcSadSsdBgd_sse2
|
||||
push r14
|
||||
push r15
|
||||
%assign push_num 4
|
||||
PUSH_XMM 10
|
||||
%ifdef WIN64
|
||||
mov r4,arg5
|
||||
;mov r5,arg6
|
||||
@ -1993,6 +2004,7 @@ sqdiff_bgd_width_loop:
|
||||
mov r14, psadframe
|
||||
movd [r14], xmm8
|
||||
|
||||
POP_XMM
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
|
Loading…
x
Reference in New Issue
Block a user