Merge pull request #517 from mstorsjo/simplify-x86-asm-func-macro

Fold ALIGN 16 and the function label into WELS_EXTERN
This commit is contained in:
volvet 2014-03-18 09:29:17 +08:00
commit e75cd2298b
21 changed files with 76 additions and 412 deletions

View File

@ -342,12 +342,14 @@ BITS 32
%endmacro
; WELS_EXTERN <name>
;   Opens an exported function: pads to a 16-byte boundary, declares the
;   symbol global, and emits the entry label, so call sites need neither
;   their own ALIGN 16 nor a separate "<name>:" line.
%macro WELS_EXTERN 1
ALIGN 16
%ifndef PREFIX
; No symbol prefix requested: export the name unchanged.
global %1
%else
; PREFIX defined: export with a leading underscore and alias the plain
; name to it, so the label below and later references use the prefixed symbol.
global _%1
%define %1 _%1
%endif
%1:
%endmacro
%macro WELS_AbsW 2

View File

@ -55,12 +55,10 @@ SECTION .text
; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
; section CPUID - CPU Identification
WELS_EXTERN WelsCPUIdVerify
ALIGN 16
;******************************************************************************************
; int32_t WelsCPUIdVerify()
;******************************************************************************************
WelsCPUIdVerify:
WELS_EXTERN WelsCPUIdVerify
push r1
PUSHRFLAGS
PUSHRFLAGS
@ -73,14 +71,12 @@ WelsCPUIdVerify:
pop r1
ret
WELS_EXTERN WelsCPUId
ALIGN 16
;****************************************************************************************************
; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
;****************************************************************************************************
%ifdef WIN64
WelsCPUId:
WELS_EXTERN WelsCPUId
push rbx
push rdx
@ -98,7 +94,7 @@ WelsCPUId:
ret
%elifdef UNIX64
WelsCPUId:
WELS_EXTERN WelsCPUId
push rbx
push rcx
push rdx
@ -118,7 +114,7 @@ WelsCPUId:
%elifdef X86_32
WelsCPUId:
WELS_EXTERN WelsCPUId
push ebx
push edi
@ -143,13 +139,11 @@ WelsCPUId:
%endif
WELS_EXTERN WelsCPUSupportAVX
; must be called after CPUID with eax=1, passing in the eax and ecx flag values it returned
ALIGN 16
;****************************************************************************************************
; int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
;****************************************************************************************************
WelsCPUSupportAVX:
WELS_EXTERN WelsCPUSupportAVX
%ifdef WIN64
mov eax, ecx
mov ecx, edx
@ -178,13 +172,11 @@ avx_not_supported:
ret
WELS_EXTERN WelsCPUSupportFMA
; must be called after CPUID with eax=1, passing in the eax and ecx flag values it returned
ALIGN 16
;****************************************************************************************************
; int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
;****************************************************************************************************
WelsCPUSupportFMA:
WELS_EXTERN WelsCPUSupportFMA
%ifdef WIN64
mov eax, ecx
mov ecx, edx
@ -211,12 +203,10 @@ fma_not_supported:
mov eax, 0
ret
WELS_EXTERN WelsEmms
ALIGN 16
;******************************************************************************************
; void WelsEmms()
;******************************************************************************************
WelsEmms:
WELS_EXTERN WelsEmms
emms ; empty mmx technology states
ret

View File

@ -61,8 +61,6 @@ SECTION .text
WELS_EXTERN DeblockLumaLt4V_ssse3
DeblockLumaLt4V_ssse3:
push rbp
mov r11,[rsp + 16 + 20h] ; pTC
sub rsp,1B0h
@ -318,9 +316,6 @@ DeblockLumaLt4V_ssse3:
WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
DeblockLumaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@ -781,9 +776,6 @@ DeblockLumaEq4V_ssse3:
WELS_EXTERN DeblockChromaLt4V_ssse3
ALIGN 16
DeblockChromaLt4V_ssse3:
mov rax,rsp
push rbx
push rdi
@ -943,8 +935,6 @@ DeblockChromaLt4V_ssse3:
WELS_EXTERN DeblockChromaEq4V_ssse3
ALIGN 16
DeblockChromaEq4V_ssse3:
mov rax,rsp
push rbx
sub rsp,90h
@ -1097,8 +1087,6 @@ DeblockChromaEq4V_ssse3:
WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
DeblockChromaEq4H_ssse3:
mov rax,rsp
mov [rax+20h],rbx
push rdi
@ -1361,8 +1349,6 @@ DeblockChromaEq4H_ssse3:
WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
DeblockChromaLt4H_ssse3:
mov rax,rsp
push rbx
push rbp
@ -1647,8 +1633,6 @@ DeblockChromaLt4H_ssse3:
WELS_EXTERN DeblockLumaLt4V_ssse3
DeblockLumaLt4V_ssse3:
push rbp
mov r11,r8 ; pTC
sub rsp,1B0h
@ -1904,9 +1888,6 @@ DeblockLumaLt4V_ssse3:
WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
DeblockLumaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@ -2366,8 +2347,6 @@ DeblockLumaEq4V_ssse3:
ret
WELS_EXTERN DeblockChromaLt4V_ssse3
ALIGN 16
DeblockChromaLt4V_ssse3:
mov rax,rsp
push rbx
push rbp
@ -2534,8 +2513,6 @@ DeblockChromaLt4V_ssse3:
ret
WELS_EXTERN DeblockChromaEq4V_ssse3
DeblockChromaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@ -2685,9 +2662,6 @@ DeblockChromaEq4V_ssse3:
ret
WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
DeblockChromaEq4H_ssse3:
mov rax,rsp
push rbx
push rbp
@ -2960,8 +2934,6 @@ DeblockChromaEq4H_ssse3:
WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
DeblockChromaLt4H_ssse3:
mov rax,rsp
push rbx
push rbp
@ -3256,9 +3228,6 @@ DeblockChromaLt4H_ssse3:
; int32_t iAlpha, int32_t iBeta)
;********************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3
ALIGN 16
DeblockChromaEq4V_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@ -3426,8 +3395,6 @@ DeblockChromaEq4V_ssse3:
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4V_ssse3
DeblockChromaLt4V_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@ -3629,10 +3596,6 @@ DeblockChromaLt4V_ssse3:
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
DeblockChromaEq4H_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@ -3914,10 +3877,6 @@ DeblockChromaEq4H_ssse3:
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
DeblockChromaLt4H_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@ -4230,10 +4189,6 @@ DeblockChromaLt4H_ssse3:
WELS_EXTERN DeblockLumaLt4V_ssse3
ALIGN 16
DeblockLumaLt4V_ssse3:
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
@ -4620,12 +4575,9 @@ DeblockLumaLt4V_ssse3:
; int32_t iBeta)
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
DeblockLumaEq4V_ssse3:
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
@ -5174,10 +5126,6 @@ DeblockLumaEq4V_ssse3:
;********************************************************************************
WELS_EXTERN DeblockLumaTransposeH2V_sse2
ALIGN 16
DeblockLumaTransposeH2V_sse2:
push r3
push r4
push r5
@ -5253,10 +5201,6 @@ DeblockLumaTransposeH2V_sse2:
;*******************************************************************************************
WELS_EXTERN DeblockLumaTransposeV2H_sse2
ALIGN 16
DeblockLumaTransposeV2H_sse2:
push r3
push r4

View File

@ -56,9 +56,6 @@
SECTION .text
WELS_EXTERN ExpandPictureLuma_sse2
WELS_EXTERN ExpandPictureChromaAlign_sse2 ; for chroma alignment
WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
;;;;;;;expanding result;;;;;;;
@ -349,14 +346,13 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
%endif
%endmacro
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureLuma_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
ExpandPictureLuma_sse2:
WELS_EXTERN ExpandPictureLuma_sse2
push r4
push r5
@ -476,14 +472,13 @@ ExpandPictureLuma_sse2:
ret
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
ExpandPictureChromaAlign_sse2:
WELS_EXTERN ExpandPictureChromaAlign_sse2
push r4
push r5
@ -602,14 +597,13 @@ ExpandPictureChromaAlign_sse2:
ret
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
ExpandPictureChromaUnalign_sse2:
WELS_EXTERN ExpandPictureChromaUnalign_sse2
push r4
push r5
push r6

View File

@ -54,12 +54,6 @@
SECTION .text
WELS_EXTERN WelsCopy16x16_sse2
WELS_EXTERN WelsCopy16x16NotAligned_sse2
WELS_EXTERN WelsCopy8x8_mmx
WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
WELS_EXTERN WelsCopy8x16_mmx ;
WELS_EXTERN UpdateMbMv_sse2 ;
;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
@ -67,8 +61,7 @@ WELS_EXTERN UpdateMbMv_sse2 ;
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
WelsCopy16x16_sse2:
WELS_EXTERN WelsCopy16x16_sse2
push r4
push r5
@ -130,9 +123,8 @@ WelsCopy16x16_sse2:
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
; dst can be aligned to 16 bytes, but pSrc alignment is not guaranteed, 12/29/2011
WelsCopy16x16NotAligned_sse2:
WELS_EXTERN WelsCopy16x16NotAligned_sse2
push r4
push r5
%assign push_num 2
@ -194,8 +186,7 @@ WelsCopy16x16NotAligned_sse2:
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
WelsCopy16x8NotAligned_sse2:
WELS_EXTERN WelsCopy16x8NotAligned_sse2
push r4
push r5
%assign push_num 2
@ -235,8 +226,7 @@ WelsCopy16x8NotAligned_sse2:
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
WelsCopy8x16_mmx:
WELS_EXTERN WelsCopy8x16_mmx
%assign push_num 0
LOAD_4_PARA
@ -300,8 +290,7 @@ WelsCopy8x16_mmx:
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
WelsCopy8x8_mmx:
WELS_EXTERN WelsCopy8x8_mmx
push r4
%assign push_num 1
LOAD_4_PARA
@ -349,8 +338,7 @@ WelsCopy8x8_mmx:
;***********************************************************************
; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
;***********************************************************************
ALIGN 16
UpdateMbMv_sse2:
WELS_EXTERN UpdateMbMv_sse2
%assign push_num 0
LOAD_2_PARA
@ -373,23 +361,16 @@ UpdateMbMv_sse2:
SECTION .text
WELS_EXTERN PixelAvgWidthEq4_mmx
WELS_EXTERN PixelAvgWidthEq8_mmx
WELS_EXTERN PixelAvgWidthEq16_sse2
WELS_EXTERN McCopyWidthEq4_mmx
WELS_EXTERN McCopyWidthEq8_mmx
WELS_EXTERN McCopyWidthEq16_sse2
ALIGN 16
;*******************************************************************************
; void PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq4_mmx:
WELS_EXTERN PixelAvgWidthEq4_mmx
%assign push_num 0
LOAD_7_PARA
@ -416,14 +397,13 @@ ALIGN 4
ret
ALIGN 16
;*******************************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq8_mmx:
WELS_EXTERN PixelAvgWidthEq8_mmx
%assign push_num 0
LOAD_7_PARA
@ -454,14 +434,13 @@ ALIGN 4
ALIGN 16
;*******************************************************************************
; void PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq16_sse2:
WELS_EXTERN PixelAvgWidthEq16_sse2
%assign push_num 0
LOAD_7_PARA
@ -507,12 +486,11 @@ ALIGN 4
LOAD_7_PARA_POP
ret
ALIGN 16
;*******************************************************************************
; void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
McCopyWidthEq4_mmx:
WELS_EXTERN McCopyWidthEq4_mmx
push r5
%assign push_num 1
LOAD_5_PARA
@ -535,12 +513,11 @@ ALIGN 4
pop r5
ret
ALIGN 16
;*******************************************************************************
; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
McCopyWidthEq8_mmx:
WELS_EXTERN McCopyWidthEq8_mmx
%assign push_num 0
LOAD_5_PARA
@ -562,7 +539,6 @@ ALIGN 4
ret
ALIGN 16
;*******************************************************************************
; void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
@ -577,7 +553,7 @@ ALIGN 16
movq [%1], %2
movhps [%1+8], %2
%endmacro
McCopyWidthEq16_sse2:
WELS_EXTERN McCopyWidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d

View File

@ -65,7 +65,6 @@ h264_d0x20_mmx:
SECTION .text
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
; int32_t iSrcStride,
@ -75,7 +74,6 @@ ALIGN 16
; int32_t iHeight );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
McChromaWidthEq4_mmx:
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@ -140,7 +138,6 @@ McChromaWidthEq4_mmx:
ret
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
@ -150,7 +147,6 @@ ALIGN 16
; int32_t iHeight );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
McChromaWidthEq8_sse2:
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@ -219,7 +215,6 @@ McChromaWidthEq8_sse2:
ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
; int32_t iSrcStride,
@ -229,7 +224,6 @@ ALIGN 16
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
McChromaWidthEq8_ssse3:
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d

View File

@ -71,10 +71,8 @@ h264_mc_hc_32:
SECTION .text
WELS_EXTERN McHorVer20WidthEq4_mmx
ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
; int iSrcStride,
@ -82,7 +80,7 @@ ALIGN 16
; int iDstStride,
; int iHeight)
;*******************************************************************************
McHorVer20WidthEq4_mmx:
WELS_EXTERN McHorVer20WidthEq4_mmx
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -161,12 +159,7 @@ McHorVer20WidthEq4_mmx:
;*******************************************************************************
SECTION .text
WELS_EXTERN McHorVer22Width8HorFirst_sse2
WELS_EXTERN McHorVer02WidthEq8_sse2
WELS_EXTERN McHorVer20WidthEq8_sse2
WELS_EXTERN McHorVer20WidthEq16_sse2
ALIGN 16
;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
; int16_t iSrcStride,
@ -175,7 +168,7 @@ ALIGN 16
; int32_t iHeight
; )
;***********************************************************************
McHorVer22Width8HorFirst_sse2:
WELS_EXTERN McHorVer22Width8HorFirst_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -217,7 +210,6 @@ McHorVer22Width8HorFirst_sse2:
LOAD_5_PARA_POP
ret
ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
@ -226,7 +218,7 @@ ALIGN 16
; int iHeight,
; );
;*******************************************************************************
McHorVer20WidthEq8_sse2:
WELS_EXTERN McHorVer20WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -272,7 +264,6 @@ McHorVer20WidthEq8_sse2:
LOAD_5_PARA_POP
ret
ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
; int iSrcStride,
@ -281,7 +272,7 @@ ALIGN 16
; int iHeight,
; );
;*******************************************************************************
McHorVer20WidthEq16_sse2:
WELS_EXTERN McHorVer20WidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -361,8 +352,7 @@ McHorVer20WidthEq16_sse2:
; int iDstStride,
; int iHeight )
;*******************************************************************************
ALIGN 16
McHorVer02WidthEq8_sse2:
WELS_EXTERN McHorVer02WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -443,11 +433,6 @@ McHorVer02WidthEq8_sse2:
SECTION .text
WELS_EXTERN McHorVer20Width9Or17_sse2
WELS_EXTERN McHorVer02Height9Or17_sse2
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
WELS_EXTERN McHorVer22HorFirst_sse2
;***********************************************************************
@ -458,8 +443,7 @@ WELS_EXTERN McHorVer22HorFirst_sse2
; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
ALIGN 16
McHorVer02Height9Or17_sse2:
WELS_EXTERN McHorVer02Height9Or17_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@ -583,7 +567,6 @@ McHorVer02Height9Or17_sse2:
ret
ALIGN 16
;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
@ -593,7 +576,7 @@ ALIGN 16
; int32_t iHeight
; );
;***********************************************************************
McHorVer20Width9Or17_sse2:
WELS_EXTERN McHorVer20Width9Or17_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@ -742,7 +725,6 @@ McHorVer20Width9Or17_sse2:
ALIGN 16
;***********************************************************************
;void McHorVer22HorFirst_sse2
; (const uint8_t *pSrc,
@ -751,7 +733,7 @@ ALIGN 16
; int32_t iTapStride,
; int32_t iWidth,int32_t iHeight);
;***********************************************************************
McHorVer22HorFirst_sse2:
WELS_EXTERN McHorVer22HorFirst_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@ -918,7 +900,7 @@ McHorVer22HorFirst_sse2:
; int32_t iHeight);
;***********************************************************************
McHorVer22Width8VerLastAlign_sse2:
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@ -1047,7 +1029,7 @@ McHorVer22HorFirst_sse2:
; int32_t iHeight);
;***********************************************************************
McHorVer22Width8VerLastUnAlign_sse2:
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d

View File

@ -156,8 +156,6 @@ SECTION .text
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse2
align 16
WelsSampleSatd4x4_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@ -229,9 +227,7 @@ WelsSampleSatd4x4_sse2:
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse2
align 16
WelsSampleSatd8x8_sse2:
WELS_EXTERN WelsSampleSatd8x8_sse2
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@ -250,9 +246,7 @@ align 16
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse2
align 16
WelsSampleSatd8x16_sse2:
WELS_EXTERN WelsSampleSatd8x16_sse2
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@ -277,8 +271,6 @@ align 16
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse2
align 16
WelsSampleSatd16x8_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@ -308,8 +300,6 @@ WelsSampleSatd16x8_sse2:
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse2
align 16
WelsSampleSatd16x16_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@ -484,7 +474,6 @@ WelsSampleSatd16x16_sse2:
%ifdef X86_32
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
WelsIntra16x16Combined3Satd_sse41:
push ebx
push esi
push edi
@ -678,7 +667,6 @@ loop_chroma_satdx3_cb_cr:
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
WelsIntraChroma8x8Combined3Satd_sse41:
push ebx
push esi
push edi
@ -782,7 +770,6 @@ ret
;
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
WelsIntra16x16Combined3Sad_ssse3:
push ebx
push esi
push edi
@ -987,7 +974,6 @@ return_sad_intra_16x16_x3:
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
WelsSampleSatd4x4_sse41:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@ -1040,8 +1026,6 @@ WelsSampleSatd4x4_sse41:
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
align 16
WelsSampleSatd8x8_sse41:
%ifdef X86_32
push r4
push r5
@ -1072,8 +1056,6 @@ WelsSampleSatd8x8_sse41:
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
align 16
WelsSampleSatd8x16_sse41:
%ifdef X86_32
push r4
push r5
@ -1110,8 +1092,6 @@ loop_get_satd_8x16:
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
align 16
WelsSampleSatd16x8_sse41:
%ifdef X86_32
push r4
push r5
@ -1155,8 +1135,6 @@ WelsSampleSatd16x8_sse41:
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse41
align 16
WelsSampleSatd16x16_sse41:
%ifdef X86_32
push r4
push r5
@ -1276,8 +1254,6 @@ loop_get_satd_16x16_right:
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
align 16
WelsSampleSad16x16_sse2:
%ifdef X86_32
push r4
push r5
@ -1319,8 +1295,6 @@ WelsSampleSad16x16_sse2:
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x8_sse2
align 16
WelsSampleSad16x8_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@ -1346,7 +1320,6 @@ WelsSampleSad16x8_sse2:
WELS_EXTERN WelsSampleSad8x16_sse2
WelsSampleSad8x16_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@ -1377,7 +1350,6 @@ cmp %1, (32-%2)|(%3>>1)
%endmacro
WELS_EXTERN WelsSampleSad8x8_sse21
WelsSampleSad8x8_sse21:
%assign push_num 0
mov r2, arg3
push r2
@ -1536,7 +1508,6 @@ WelsSampleSad8x8_sse21:
paddw xmm7, %4
%endmacro
WELS_EXTERN WelsSampleSadFour16x16_sse2
WelsSampleSadFour16x16_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -1654,7 +1625,6 @@ WelsSampleSadFour16x16_sse2:
WELS_EXTERN WelsSampleSadFour16x8_sse2
WelsSampleSadFour16x8_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -1739,7 +1709,6 @@ WelsSampleSadFour16x8_sse2:
ret
WELS_EXTERN WelsSampleSadFour8x16_sse2
WelsSampleSadFour8x16_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -1951,7 +1920,6 @@ WelsSampleSadFour8x16_sse2:
WELS_EXTERN WelsSampleSadFour8x8_sse2
WelsSampleSadFour8x8_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -2071,7 +2039,6 @@ WelsSampleSadFour8x8_sse2:
ret
WELS_EXTERN WelsSampleSadFour4x4_sse2
WelsSampleSadFour4x4_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -2144,13 +2111,10 @@ WelsSampleSadFour4x4_sse2:
;
;***********************************************************************
WELS_EXTERN WelsSampleSad4x4_mmx
align 16
;***********************************************************************
; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
;***********************************************************************
WelsSampleSad4x4_mmx:
WELS_EXTERN WelsSampleSad4x4_mmx
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d

View File

@ -142,12 +142,10 @@ SECTION .text
; , 6/7/2010
WELS_EXTERN AnalysisVaaInfoIntra_sse2
;***********************************************************************
; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
ALIGN 16
AnalysisVaaInfoIntra_sse2:
WELS_EXTERN AnalysisVaaInfoIntra_sse2
%assign push_num 0
LOAD_2_PARA
@ -237,12 +235,10 @@ AnalysisVaaInfoIntra_sse2:
ret
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
;***********************************************************************
; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
ALIGN 16
AnalysisVaaInfoIntra_ssse3:
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
%assign push_num 0
LOAD_2_PARA
@ -332,12 +328,10 @@ AnalysisVaaInfoIntra_ssse3:
ret
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;***********************************************************************
ALIGN 16
MdInterAnalysisVaaInfo_sse41:
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
%assign push_num 0
LOAD_1_PARA
movdqa xmm0,[r0]
@ -368,12 +362,10 @@ MdInterAnalysisVaaInfo_sse41:
mov retrd, 15
ret
WELS_EXTERN MdInterAnalysisVaaInfo_sse2
;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;***********************************************************************
ALIGN 16
MdInterAnalysisVaaInfo_sse2:
WELS_EXTERN MdInterAnalysisVaaInfo_sse2
%assign push_num 0
LOAD_1_PARA
movdqa xmm0, [r0]

View File

@ -49,13 +49,10 @@
SECTION .text
WELS_EXTERN WelsResBlockZero16x16_sse2
ALIGN 16
;*******************************************************************************
; void WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
;*******************************************************************************
WelsResBlockZero16x16_sse2:
WELS_EXTERN WelsResBlockZero16x16_sse2
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -122,13 +119,10 @@ WelsResBlockZero16x16_sse2:
ret
WELS_EXTERN WelsResBlockZero8x8_sse2
ALIGN 16
;*******************************************************************************
; void WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
;*******************************************************************************
WelsResBlockZero8x8_sse2:
WELS_EXTERN WelsResBlockZero8x8_sse2
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d

View File

@ -83,14 +83,11 @@
SECTION .text
WELS_EXTERN IdctResAddPred_mmx
ALIGN 16
;*******************************************************************************
; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
;*******************************************************************************
IdctResAddPred_mmx:
WELS_EXTERN IdctResAddPred_mmx
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r1, r1d

View File

@ -177,18 +177,14 @@ sse2_wd_0x02: times 8 dw 0x02
;*******************************************************************************
SECTION .text
WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
ALIGN 16
;*******************************************************************************
; void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
;
; pPred must align to 16
;*******************************************************************************
WelsDecoderI4x4LumaPredH_sse2:
WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -222,7 +218,7 @@ WelsDecoderI4x4LumaPredH_sse2:
;*******************************************************************************
; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WelsDecoderI16x16LumaPredPlane_sse2:
WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
push r3
push r4
%assign push_num 2
@ -326,7 +322,6 @@ get_i16x16_luma_pred_plane_sse2_1:
%endmacro
WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
WelsDecoderI16x16LumaPredH_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -350,7 +345,6 @@ WelsDecoderI16x16LumaPredH_sse2:
; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
WelsDecoderI16x16LumaPredV_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -389,7 +383,6 @@ WelsDecoderI16x16LumaPredV_sse2:
; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
WelsDecoderIChromaPredPlane_sse2:
push r3
push r4
%assign push_num 2
@ -477,7 +470,6 @@ get_i_chroma_pred_plane_sse2_1:
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; 0 |1 |2 |3 |4 |
; 6 |7 |8 |9 |10|
@ -490,7 +482,7 @@ ALIGN 16
; void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
;
;*******************************************************************************
WelsDecoderI4x4LumaPredDDR_mmx:
WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -536,7 +528,6 @@ WelsDecoderI4x4LumaPredDDR_mmx:
ret
ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixel of 8 line from left
@ -560,7 +551,6 @@ ALIGN 16
%endmacro
WELS_EXTERN WelsDecoderIChromaPredH_mmx
WelsDecoderIChromaPredH_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -597,13 +587,11 @@ WelsDecoderIChromaPredH_mmx:
ret
ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixels from top 8 pixels
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredV_mmx
WelsDecoderIChromaPredV_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -627,7 +615,6 @@ WelsDecoderIChromaPredV_mmx:
ret
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
; l0|
@ -658,7 +645,6 @@ WelsDecoderIChromaPredV_mmx:
; void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
WelsDecoderI4x4LumaPredHD_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -714,7 +700,6 @@ WelsDecoderI4x4LumaPredHD_mmx:
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
; l0|
@ -742,7 +727,6 @@ ALIGN 16
; void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
WelsDecoderI4x4LumaPredHU_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -799,7 +783,6 @@ WelsDecoderI4x4LumaPredHU_mmx:
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
; l0|
@ -829,7 +812,6 @@ ALIGN 16
; void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
WelsDecoderI4x4LumaPredVR_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -889,7 +871,6 @@ WelsDecoderI4x4LumaPredVR_mmx:
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
@ -917,7 +898,6 @@ ALIGN 16
; void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
WelsDecoderI4x4LumaPredDDL_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -956,7 +936,6 @@ WelsDecoderI4x4LumaPredDDL_mmx:
ret
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
@ -987,7 +966,6 @@ ALIGN 16
; void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
WelsDecoderI4x4LumaPredVL_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -1023,13 +1001,11 @@ WelsDecoderI4x4LumaPredVL_mmx:
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
;
; void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDc_sse2
WelsDecoderIChromaPredDc_sse2:
push r3
push r4
%assign push_num 2
@ -1120,13 +1096,11 @@ WelsDecoderIChromaPredDc_sse2:
ALIGN 16
;*******************************************************************************
;
; void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
WelsDecoderI16x16LumaPredDc_sse2:
push r3
push r4
%assign push_num 2
@ -1201,12 +1175,10 @@ WelsDecoderI16x16LumaPredDc_sse2:
; for intra prediction as follows, 11/19/2010
;*******************************************************************************
ALIGN 16
;*******************************************************************************
; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
WelsDecoderI16x16LumaPredDcTop_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -1273,12 +1245,10 @@ WelsDecoderI16x16LumaPredDcTop_sse2:
ret
ALIGN 16
;*******************************************************************************
; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
WelsDecoderI16x16LumaPredDcNA_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -1308,12 +1278,10 @@ WelsDecoderI16x16LumaPredDcNA_sse2:
ret
ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
WelsDecoderIChromaPredDcLeft_mmx:
push r3
push r4
%assign push_num 2
@ -1381,12 +1349,10 @@ WelsDecoderIChromaPredDcLeft_mmx:
emms
ret
ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
WelsDecoderIChromaPredDcTop_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@ -1420,12 +1386,10 @@ WelsDecoderIChromaPredDcTop_sse2:
movq [r0+r2], xmm0
ret
ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
WelsDecoderIChromaPredDcNA_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d

View File

@ -323,7 +323,6 @@ SECTION .text
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
CavlcParamCal_sse2:
push ebx
push edi
push esi

View File

@ -130,12 +130,10 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
movd %5, %1
%endmacro
SECTION .text
ALIGN 16
;***********************************************************************
; void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
WELS_EXTERN WelsDctT4_mmx
WelsDctT4_mmx:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r2, r2d
@ -163,7 +161,6 @@ WelsDctT4_mmx:
; void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
;***********************************************************************
WELS_EXTERN WelsIDctT4Rec_mmx
WelsIDctT4Rec_mmx:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -291,8 +288,6 @@ WelsIDctT4Rec_mmx:
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
WELS_EXTERN WelsDctFourT4_sse2
ALIGN 16
WelsDctFourT4_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r2, r2d
@ -340,8 +335,6 @@ WelsDctFourT4_sse2:
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
ALIGN 16
WelsIDctFourT4Rec_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -399,8 +392,6 @@ WelsIDctFourT4Rec_sse2:
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
ALIGN 16
WelsIDctRecI16x16Dc_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@ -475,7 +466,6 @@ WelsIDctRecI16x16Dc_sse2:
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
WelsHadamardT4Dc_sse2:
%assign push_num 0
LOAD_2_PARA
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1

View File

@ -184,18 +184,13 @@ mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
;***********************************************************************
SECTION .text
WELS_EXTERN WelsI4x4LumaPredH_sse2
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
WELS_EXTERN WelsI4x4LumaPredDc_sse2
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
ALIGN 16
;***********************************************************************
; void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
; pred must be 16-byte aligned
;***********************************************************************
WelsI4x4LumaPredH_sse2:
WELS_EXTERN WelsI4x4LumaPredH_sse2
push r3
%assign push_num 1
LOAD_3_PARA
@ -229,7 +224,7 @@ WelsI4x4LumaPredH_sse2:
;***********************************************************************
; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WelsI16x16LumaPredPlane_sse2:
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
push r3
push r4
%assign push_num 2
@ -326,7 +321,6 @@ get_i16x16_luma_pred_plane_sse2_1:
%endmacro
WELS_EXTERN WelsI16x16LumaPredH_sse2
WelsI16x16LumaPredH_sse2:
push r3
%assign push_num 1
LOAD_3_PARA
@ -357,7 +351,6 @@ WelsI16x16LumaPredH_sse2:
; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredV_sse2
WelsI16x16LumaPredV_sse2:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -387,7 +380,6 @@ WelsI16x16LumaPredV_sse2:
; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
WelsIChromaPredPlane_sse2:
push r3
push r4
%assign push_num 2
@ -471,7 +463,6 @@ get_i_chroma_pred_plane_sse2_1:
WELSEMMS
ret
ALIGN 16
;***********************************************************************
; 0 |1 |2 |3 |4 |
; 6 |7 |8 |9 |10|
@ -484,7 +475,7 @@ ALIGN 16
; void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;***********************************************************************
WelsI4x4LumaPredDDR_mmx:
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -525,7 +516,6 @@ WelsI4x4LumaPredDDR_mmx:
WELSEMMS
ret
ALIGN 16
;***********************************************************************
; 0 |1 |2 |3 |4 |
; 5 |6 |7 |8 |9 |
@ -538,7 +528,7 @@ ALIGN 16
; void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;***********************************************************************
WelsI4x4LumaPredDc_sse2:
WELS_EXTERN WelsI4x4LumaPredDc_sse2
push r3
push r4
%assign push_num 2
@ -572,7 +562,6 @@ WelsI4x4LumaPredDc_sse2:
pop r3
ret
ALIGN 16
;***********************************************************************
; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy 8 pixel of 8 line from left
@ -598,7 +587,6 @@ ALIGN 16
%endmacro
WELS_EXTERN WelsIChromaPredH_mmx
WelsIChromaPredH_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -629,13 +617,11 @@ WelsIChromaPredH_mmx:
WELSEMMS
ret
ALIGN 16
;***********************************************************************
; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy pixels from top 4 pixels
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredV_sse2
WelsI4x4LumaPredV_sse2:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -645,13 +631,11 @@ WelsI4x4LumaPredV_sse2:
movdqa [r0], xmm0
ret
ALIGN 16
;***********************************************************************
; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy 8 pixels from top 8 pixels
;***********************************************************************
WELS_EXTERN WelsIChromaPredV_sse2
WelsIChromaPredV_sse2:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -665,7 +649,6 @@ WelsIChromaPredV_sse2:
movdqa [r0+48], xmm0
ret
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
; l0|
@ -696,7 +679,6 @@ WelsIChromaPredV_sse2:
; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
WelsI4x4LumaPredHD_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -747,7 +729,6 @@ WelsI4x4LumaPredHD_mmx:
WELSEMMS
ret
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
; l0|
@ -775,7 +756,6 @@ ALIGN 16
; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
WelsI4x4LumaPredHU_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -828,7 +808,6 @@ WelsI4x4LumaPredHU_mmx:
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
; l0|
@ -858,7 +837,6 @@ ALIGN 16
; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
WelsI4x4LumaPredVR_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -916,7 +894,6 @@ WelsI4x4LumaPredVR_mmx:
WELSEMMS
ret
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
@ -944,7 +921,6 @@ ALIGN 16
; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
WelsI4x4LumaPredDDL_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -981,7 +957,6 @@ WelsI4x4LumaPredDDL_mmx:
ret
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
@ -1012,7 +987,6 @@ ALIGN 16
; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
WelsI4x4LumaPredVL_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@ -1045,13 +1019,11 @@ WelsI4x4LumaPredVL_mmx:
WELSEMMS
ret
ALIGN 16
;***********************************************************************
;
; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
WelsIChromaPredDc_sse2:
push r3
push r4
%assign push_num 2
@ -1137,13 +1109,11 @@ WelsIChromaPredDc_sse2:
ALIGN 16
;***********************************************************************
;
; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
WelsI16x16LumaPredDc_sse2:
push r3
push r4
%assign push_num 2
@ -1206,8 +1176,6 @@ WelsI16x16LumaPredDc_sse2:
;***********************************************************************
%ifdef X86_32
WELS_EXTERN WelsSampleSatdThree4x4_sse2
align 16
WelsSampleSatdThree4x4_sse2:
push ebx
push esi
push edi

View File

@ -47,24 +47,20 @@
SECTION .text
ALIGN 16
;***********************************************************************
;void WelsPrefetchZero_mmx(int8_t const*_A);
;
; Issues a single prefetchnta (prefetch with non-temporal hint) on the
; address held in r0 and returns; nothing is returned to the caller.
; NOTE(review): r0 presumably holds _A after LOAD_1_PARA -- the macro is
; defined outside this chunk, confirm against its definition.
;***********************************************************************
WELS_EXTERN WelsPrefetchZero_mmx
WelsPrefetchZero_mmx:
; No registers are pushed in this routine, so push_num is set to 0.
; NOTE(review): push_num is presumably consumed by the LOAD_n_PARA macros
; to locate stack arguments -- confirm.
%assign push_num 0
LOAD_1_PARA
; Prefetch hint only: requests the data at [r0] with the non-temporal hint.
prefetchnta [r0]
ret
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroAligned64_sse2
WelsSetMemZeroAligned64_sse2:
%assign push_num 0
LOAD_2_PARA
@ -84,12 +80,10 @@ WelsSetMemZeroAligned64_sse2:
ret
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize64_mmx
WelsSetMemZeroSize64_mmx:
%assign push_num 0
LOAD_2_PARA
@ -114,12 +108,10 @@ WelsSetMemZeroSize64_mmx:
WELSEMMS
ret
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize8_mmx
WelsSetMemZeroSize8_mmx:
%assign push_num 0
LOAD_2_PARA

View File

@ -83,8 +83,6 @@ SECTION .text
; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
align 16
WelsQuant4x4_sse2:
%assign push_num 0
LOAD_3_PARA
movdqa xmm2, [r1]
@ -99,8 +97,6 @@ WelsQuant4x4_sse2:
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
align 16
WelsQuant4x4Dc_sse2:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r1, r1w
@ -118,8 +114,6 @@ WelsQuant4x4Dc_sse2:
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
align 16
WelsQuantFour4x4_sse2:
%assign push_num 0
LOAD_3_PARA
MOVDQ xmm2, [r1]
@ -140,8 +134,6 @@ WelsQuantFour4x4_sse2:
; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
align 16
WelsQuantFour4x4Max_sse2:
%assign push_num 0
LOAD_4_PARA
MOVDQ xmm2, [r1]
@ -195,8 +187,6 @@ SECTION .text
;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
align 16
WelsHadamardQuant2x2_mmx:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1w
@ -253,8 +243,6 @@ WelsHadamardQuant2x2_mmx:
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
align 16
WelsHadamardQuant2x2Skip_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r1, r1w
@ -303,13 +291,10 @@ WelsHadamardQuant2x2Skip_mmx:
%endmacro
ALIGN 16
;***********************************************************************
; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************
align 16
WELS_EXTERN WelsDequant4x4_sse2
WelsDequant4x4_sse2:
%assign push_num 0
LOAD_2_PARA
@ -323,10 +308,7 @@ WelsDequant4x4_sse2:
;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************
align 16
WELS_EXTERN WelsDequantFour4x4_sse2
WelsDequantFour4x4_sse2:
%assign push_num 0
LOAD_2_PARA
@ -346,8 +328,6 @@ WelsDequantFour4x4_sse2:
;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
align 16
WelsDequantIHadamard4x4_sse2:
%assign push_num 0
LOAD_2_PARA
%ifndef X86_32

View File

@ -166,9 +166,7 @@ SECTION .text
;***********************************************************************
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_sse2
WelsScan4x4DcAc_sse2:
%ifdef X86_32
push r3
%assign push_num 1
@ -200,9 +198,7 @@ WelsScan4x4DcAc_sse2:
;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_ssse3
WelsScan4x4DcAc_ssse3:
%assign push_num 0
LOAD_2_PARA
movdqa xmm0, [r1]
@ -220,9 +216,7 @@ WelsScan4x4DcAc_ssse3:
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsScan4x4Ac_sse2
WelsScan4x4Ac_sse2:
%assign push_num 0
LOAD_2_PARA
movdqa xmm0, [r1]
@ -259,9 +253,7 @@ WelsScan4x4Ac_sse2:
;***********************************************************************
;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
WelsCalculateSingleCtr4x4_sse2:
%ifdef X86_32
push r3
%assign push_num 1
@ -319,9 +311,7 @@ WelsCalculateSingleCtr4x4_sse2:
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
ALIGN 16
WELS_EXTERN WelsGetNoneZeroCount_sse2
WelsGetNoneZeroCount_sse2:
%assign push_num 0
LOAD_1_PARA
movdqa xmm0, [r0]

View File

@ -163,8 +163,6 @@ SECTION .text
paddw %3, %2
%endmacro
ALIGN 16
WELS_EXTERN BilateralLumaFilter8_sse2
;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
@ -173,7 +171,7 @@ WELS_EXTERN BilateralLumaFilter8_sse2
; 6 7 8
; 0: the center point
BilateralLumaFilter8_sse2:
WELS_EXTERN BilateralLumaFilter8_sse2
push r3
%assign push_num 1
@ -219,7 +217,6 @@ BilateralLumaFilter8_sse2:
ret
WELS_EXTERN WaverageChromaFilter8_sse2
;***********************************************************************
; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
@ -230,8 +227,7 @@ WELS_EXTERN WaverageChromaFilter8_sse2
;1 2 4 2 1
;1 1 2 1 1
ALIGN 16
WaverageChromaFilter8_sse2:
WELS_EXTERN WaverageChromaFilter8_sse2
push r3

View File

@ -66,22 +66,18 @@ shufb_mask_high:
db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
ALIGN 16
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
ALIGN 16
DyadicBilinearDownsamplerWidthx32_sse:
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
push ebx
push edx
push esi
@ -227,14 +223,12 @@ DyadicBilinearDownsamplerWidthx32_sse:
pop ebx
ret
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
ALIGN 16
DyadicBilinearDownsamplerWidthx16_sse:
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
push ebx
push edx
push esi
@ -331,14 +325,12 @@ DyadicBilinearDownsamplerWidthx16_sse:
pop ebx
ret
WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
ALIGN 16
DyadicBilinearDownsamplerWidthx8_sse:
WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
push ebx
push edx
push esi
@ -422,14 +414,12 @@ DyadicBilinearDownsamplerWidthx8_sse:
; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
ALIGN 16
DyadicBilinearDownsamplerWidthx32_ssse3:
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
push ebx
push edx
push esi
@ -533,14 +523,12 @@ DyadicBilinearDownsamplerWidthx32_ssse3:
pop ebx
ret
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
ALIGN 16
DyadicBilinearDownsamplerWidthx16_ssse3:
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
push ebx
push edx
push esi
@ -623,14 +611,12 @@ DyadicBilinearDownsamplerWidthx16_ssse3:
ret
; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
ALIGN 16
DyadicBilinearDownsamplerWidthx32_sse4:
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
push ebx
push edx
push esi
@ -733,14 +719,12 @@ DyadicBilinearDownsamplerWidthx32_sse4:
pop ebx
ret
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
ALIGN 16
DyadicBilinearDownsamplerWidthx16_sse4:
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
push ebx
push edx
push esi
@ -825,7 +809,6 @@ DyadicBilinearDownsamplerWidthx16_sse4:
WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
@ -833,8 +816,7 @@ WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
;{
;**************************************************************************************************************
ALIGN 16
GeneralBilinearAccurateDownsampler_sse2:
WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
push ebp
push esi
push edi
@ -1029,7 +1011,6 @@ LAST_ROW_END:
WELS_EXTERN GeneralBilinearFastDownsampler_sse2
;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
@ -1037,8 +1018,7 @@ WELS_EXTERN GeneralBilinearFastDownsampler_sse2
;{
;**************************************************************************************************************
ALIGN 16
GeneralBilinearFastDownsampler_sse2:
WELS_EXTERN GeneralBilinearFastDownsampler_sse2
push ebp
push esi
push edi

View File

@ -245,12 +245,10 @@ SECTION .text
%ifdef X86_32
WELS_EXTERN SampleVariance16x16_sse2
;***********************************************************************
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
ALIGN 16
SampleVariance16x16_sse2:
WELS_EXTERN SampleVariance16x16_sse2
push esi
push edi
push ebx
@ -347,15 +345,13 @@ SampleVariance16x16_sse2:
WELS_EXTERN VAACalcSad_sse2
;*************************************************************************************************************
;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
;*************************************************************************************************************
ALIGN 16
VAACalcSad_sse2:
WELS_EXTERN VAACalcSad_sse2
%define cur_data esp + pushsize + 4
%define ref_data esp + pushsize + 8
%define iPicWidth esp + pushsize + 12
@ -441,12 +437,10 @@ width_loop:
%else ;64-bit
WELS_EXTERN SampleVariance16x16_sse2
;***********************************************************************
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
ALIGN 16
SampleVariance16x16_sse2:
WELS_EXTERN SampleVariance16x16_sse2
%define SUM r10;[esp]
%define SUM_CUR r11;[esp+4]
%define SQR r13;[esp+8]
@ -539,15 +533,13 @@ SampleVariance16x16_sse2:
ret
WELS_EXTERN VAACalcSad_sse2
;*************************************************************************************************************
;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
;*************************************************************************************************************
ALIGN 16
VAACalcSad_sse2:
WELS_EXTERN VAACalcSad_sse2
%define cur_data r0
%define ref_data r1
%define iPicWidth r2
@ -637,15 +629,13 @@ width_loop:
%ifdef X86_32
WELS_EXTERN VAACalcSadVar_sse2
;*************************************************************************************************************
;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
;*************************************************************************************************************
ALIGN 16
VAACalcSadVar_sse2:
WELS_EXTERN VAACalcSadVar_sse2
%define localsize 8
%define cur_data esp + pushsize + localsize + 4
%define ref_data esp + pushsize + localsize + 8
@ -773,15 +763,13 @@ var_width_loop:
%else ;64-bit
WELS_EXTERN VAACalcSadVar_sse2
;*************************************************************************************************************
;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
;*************************************************************************************************************
ALIGN 16
VAACalcSadVar_sse2:
WELS_EXTERN VAACalcSadVar_sse2
%define cur_data arg1 ;r0
%define ref_data arg2 ;r1
%define iPicWidth arg3 ;r2
@ -916,15 +904,13 @@ var_width_loop:
%ifdef X86_32
WELS_EXTERN VAACalcSadSsd_sse2
;*************************************************************************************************************
;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
;*************************************************************************************************************
ALIGN 16
VAACalcSadSsd_sse2:
WELS_EXTERN VAACalcSadSsd_sse2
%define localsize 12
%define cur_data esp + pushsize + localsize + 4
%define ref_data esp + pushsize + localsize + 8
@ -1072,15 +1058,13 @@ sqdiff_width_loop:
%else
WELS_EXTERN VAACalcSadSsd_sse2
;*************************************************************************************************************
;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
;*************************************************************************************************************
ALIGN 16
VAACalcSadSsd_sse2:
WELS_EXTERN VAACalcSadSsd_sse2
%define localsize 12
%define cur_data arg1;r0
%define ref_data arg2;r1
@ -1236,15 +1220,13 @@ sqdiff_width_loop:
%endif
%ifdef X86_32
;*************************************************************************************************************
;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
; WELS_EXTERN emits ALIGN 16, the global declaration and the entry label itself,
; so the symbol must be introduced exactly once, right above the function body.
WELS_EXTERN VAACalcSadBgd_sse2
; X86_32 build: parameters are addressed relative to esp, offset by pushsize
; and localsize (pushsize is defined outside this excerpt).
%define localsize 12
%define cur_data esp + pushsize + localsize + 4
%define ref_data esp + pushsize + localsize + 8
@ -1415,7 +1397,6 @@ bgd_width_loop:
;*************************************************************************************************************
;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
@ -1423,8 +1404,7 @@ WELS_EXTERN VAACalcSadSsdBgd_sse2
;*************************************************************************************************************
; WELS_EXTERN emits ALIGN 16, the global declaration and the entry label itself,
; so the symbol must be introduced exactly once, right above the function body.
WELS_EXTERN VAACalcSadSsdBgd_sse2
; X86_32 build: parameters are addressed relative to esp, offset by pushsize
; and localsize (pushsize is defined outside this excerpt).
%define localsize 16
%define cur_data esp + pushsize + localsize + 4
%define ref_data esp + pushsize + localsize + 8
@ -1646,15 +1626,13 @@ sqdiff_bgd_width_loop:
ret
%else
;*************************************************************************************************************
;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
; WELS_EXTERN emits ALIGN 16, the global declaration and the entry label itself,
; so the symbol must be introduced exactly once, right above the function body.
WELS_EXTERN VAACalcSadBgd_sse2
; 64-bit build: the leading parameters are aliased onto the argN macros.
%define cur_data arg1;
%define ref_data arg2;
%define iPicWidth arg3;
@ -1817,7 +1795,6 @@ bgd_width_loop:
;*************************************************************************************************************
;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
@ -1825,8 +1802,7 @@ WELS_EXTERN VAACalcSadSsdBgd_sse2
;*************************************************************************************************************
; WELS_EXTERN emits ALIGN 16, the global declaration and the entry label itself,
; so the symbol must be introduced exactly once, right above the function body.
WELS_EXTERN VAACalcSadSsdBgd_sse2
; 64-bit build: the leading parameters are aliased onto the argN macros.
%define cur_data arg1;
%define ref_data arg2;
%define iPicWidth arg3;