Merge pull request #97 from mstorsjo/asm-source-cleanup

Make all asm sources consistently use unix newlines and remove trailing whitespace
Ethan Hugg 2014-01-13 21:21:17 -08:00
commit 9230b49728
14 changed files with 7904 additions and 7905 deletions
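
For context, the whole change is mechanical: every CRLF becomes LF and trailing blanks are stripped. Below is a minimal C sketch of an equivalent filter; it is illustrative only, since the PR does not say which tool actually performed the cleanup.

#include <stdio.h>
#include <string.h>

/* Filter stdin to stdout: CRLF -> LF, trailing spaces/tabs removed.
 * Hypothetical one-off utility; assumes lines shorter than 4096 bytes. */
int main(void) {
    char line[4096];
    while (fgets(line, sizeof line, stdin)) {
        size_t n = strcspn(line, "\r\n");          /* content length without EOL */
        while (n > 0 && (line[n-1] == ' ' || line[n-1] == '\t'))
            n--;                                   /* strip trailing blanks */
        fwrite(line, 1, n, stdout);
        fputc('\n', stdout);                       /* always emit a plain LF */
    }
    return 0;
}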


@ -154,7 +154,7 @@ BITS 64
%define PUSHRFLAGS pushfq
%define POPRFLAGS popfq
%define retrq rax
%define retrd eax
%elifdef X86_32 ; X86_32 ;************************************
@ -233,7 +233,7 @@ BITS 32
%macro LOAD_4_PARA 0
%ifdef X86_32
push r3
%assign push_num push_num+1
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@ -245,7 +245,7 @@ BITS 32
%ifdef X86_32
push r3
push r4
%assign push_num push_num+2
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@ -261,7 +261,7 @@ BITS 32
push r3
push r4
push r5
%assign push_num push_num+3
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@ -280,7 +280,7 @@ BITS 32
push r4
push r5
push r6
%assign push_num push_num+4
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@ -334,7 +334,7 @@ BITS 32
movsx %1, %2
%endif
%endmacro
%macro WELS_EXTERN 1
%ifdef PREFIX
global _%1


@ -81,17 +81,17 @@ ALIGN 16
%ifdef WIN64
WelsCPUId:
push rbx
push rdx
mov eax, ecx
mov rcx, [r9]
cpuid
mov [r9], ecx
mov [r8], ebx
mov rcx, [rsp + 2*8 + 40]
mov [rcx], edx
pop rdx
mov [rdx], eax
pop rbx
@ -103,8 +103,8 @@ WelsCPUId:
push rcx
push rdx
mov eax, edi
mov rcx, [rcx]
cpuid
mov [r8], edx
pop rdx
@ -156,9 +156,9 @@ WelsCPUSupportAVX:
%elifdef UNIX64
mov eax, edi
mov ecx, esi
%else
mov eax, [esp+4]
mov ecx, [esp+8]
%endif
; refer to detection of AVX addressed in INTEL AVX manual document
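
From the register traffic in the WelsCPUId fragments above, the routine takes a leaf index plus four out-pointers, with *pFeatureC doubling as the ECX sub-leaf on input. A hedged C-side sketch of calling it follows; the prototype is reconstructed from the asm, not quoted from a header.

#include <stdint.h>

/* Reconstructed prototype, not quoted from a header. */
void WelsCPUId(int32_t uiIndex, uint32_t *pFeatureA, uint32_t *pFeatureB,
               uint32_t *pFeatureC, uint32_t *pFeatureD);

int main(void) {
    uint32_t a, b, c = 0, d;           /* c = 0: ECX sub-leaf input */
    WelsCPUId(1, &a, &b, &c, &d);      /* leaf 1: standard feature flags */
    int sse2  = (d >> 26) & 1;         /* EDX bit 26 = SSE2 */
    int ssse3 = (c >> 9) & 1;          /* ECX bit 9  = SSSE3 */
    return (sse2 && ssse3) ? 0 : 1;
}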

File diff suppressed because it is too large.


@ -244,7 +244,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
;r6 [height]
;r0 [pSrc+0] r5[pSrc-32] r1[stride]
;r3 [pSrc+(w-1)] r4[pSrc+w]
%if %1 == 32 ; for luma
.left_right_loops:
@ -375,13 +375,13 @@ ExpandPictureLuma_sse2:
%assign push_num 3
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
SIGN_EXTENTION r2, r2d
SIGN_EXTENTION r3, r3d
;also prepare for cross border pData top-left:xmm3
movzx r6d,byte[r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@ -395,22 +395,22 @@ ExpandPictureLuma_sse2:
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
mov r6,r1 ;r6 = stride
sal r6,05h ;r6 = 32*stride
lea r4,[r3+r6] ;r4 = dst bottom
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
neg r1 ;r1 = -stride
push r0
push r1
push r2
@ -419,20 +419,20 @@ ExpandPictureLuma_sse2:
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
pop r0
lea r5,[r0-32] ;left border dst luma =32 chroma = -16
lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-right with xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
@ -444,7 +444,7 @@ ExpandPictureLuma_sse2:
push r1
push r2
push r6
exp_left_right_sse2 32,a
pop r6
@ -455,33 +455,33 @@ ExpandPictureLuma_sse2:
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
lea r3,[r0-32]
lea r3,[r3+r1] ;last line of top-left border
lea r4,[r0+r2] ;psrc +width
lea r4,[r4+r1] ;psrc +width -stride
neg r1 ;r1 = stride
add r6,32 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of bottom-right border
neg r1 ; r1 = -stride
; for left & right border expanding
exp_cross_sse2 32,a
LOAD_4_PARA_POP
pop r6
pop r5
pop r4
%assign push_num 0
@ -495,7 +495,7 @@ ALIGN 16
; const int32_t iHeight );
;***********************************************************************----------------
ExpandPictureChromaAlign_sse2:
push r4
push r5
push r6
@ -508,7 +508,7 @@ ExpandPictureChromaAlign_sse2:
SIGN_EXTENTION r3,r3d
;also prepare for cross border pData top-left:xmm3
movzx r6d,byte [r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@ -522,44 +522,44 @@ ExpandPictureChromaAlign_sse2:
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
mov r6,r1 ;r6 = stride
sal r6,04h ;r6 = 16*stride
lea r4,[r3+r6] ;r4 = dst bottom
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
neg r1 ;r1 = -stride
push r0
push r1
push r2
exp_top_bottom_sse2 16
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
pop r0
lea r5,[r0-16] ;left border dst luma =32 chroma = -16
lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-right with xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
@ -568,7 +568,7 @@ ExpandPictureChromaAlign_sse2:
push r0
push r1
push r2
push r6
exp_left_right_sse2 16,a
@ -581,33 +581,33 @@ ExpandPictureChromaAlign_sse2:
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
lea r3,[r0-16]
lea r3,[r3+r1] ;last line of top-left border
lea r4,[r0+r2] ;psrc +width
lea r4,[r4+r1] ;psrc +width -stride
neg r1 ;r1 = stride
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of bottom-right border
neg r1 ; r1 = -stride
; for left & right border expanding
exp_cross_sse2 16,a
LOAD_4_PARA_POP
pop r6
pop r5
pop r4
%assign push_num 0
@ -633,7 +633,7 @@ ExpandPictureChromaUnalign_sse2:
SIGN_EXTENTION r3,r3d
;also prepare for cross border pData top-left:xmm3
movzx r6d,byte [r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@ -647,44 +647,44 @@ ExpandPictureChromaUnalign_sse2:
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
mov r6,r1 ;r6 = stride
sal r6,04h ;r6 = 16*stride
lea r4,[r3+r6] ;r4 = dst bottom
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
neg r1 ;r1 = -stride
push r0
push r1
push r2
exp_top_bottom_sse2 16
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
pop r0
lea r5,[r0-16] ;left border dst luma =32 chroma = -16
lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-right with xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
@ -693,7 +693,7 @@ ExpandPictureChromaUnalign_sse2:
push r0
push r1
push r2
push r6
exp_left_right_sse2 16,u
@ -706,35 +706,34 @@ ExpandPictureChromaUnalign_sse2:
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
lea r3,[r0-16]
lea r3,[r3+r1] ;last line of top-left border
lea r4,[r0+r2] ;psrc +width
lea r4,[r4+r1] ;psrc +width -stride
neg r1 ;r1 = stride
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of bottom-right border
neg r1 ; r1 = -stride
; for left & right border expanding
exp_cross_sse2 16,u
LOAD_4_PARA_POP
pop r6
pop r5
pop r4
%assign push_num 0
ret
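
For readers decoding the register dance above, this is the operation in scalar form: replicate the plane's edge pixels into a border of 32 (luma) or 16 (chroma) pixels on every side, as the comments state. A C sketch under those assumptions, not a drop-in replacement for the SSE2 routines:

#include <stdint.h>

/* Scalar sketch of ExpandPicture*_sse2's effect; iPad = 32 luma / 16 chroma. */
static void ExpandPlaneRef(uint8_t *pPic, int32_t iStride,
                           int32_t iWidth, int32_t iHeight, int32_t iPad) {
    /* left/right borders: replicate each row's first and last pixel */
    for (int32_t y = 0; y < iHeight; y++) {
        uint8_t *row = pPic + y * iStride;
        for (int32_t x = 1; x <= iPad; x++) {
            row[-x] = row[0];
            row[iWidth - 1 + x] = row[iWidth - 1];
        }
    }
    /* top/bottom (corners included): copy the padded edge rows outward */
    uint8_t *top = pPic - iPad;
    uint8_t *bot = pPic + (iHeight - 1) * iStride - iPad;
    for (int32_t y = 1; y <= iPad; y++) {
        for (int32_t x = 0; x < iWidth + 2 * iPad; x++) {
            top[-y * iStride + x] = top[x];
            bot[ y * iStride + x] = bot[x];
        }
    }
}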

File diff suppressed because it is too large.


@ -1,345 +1,345 @@
;*!
;* \copy
;* Copyright (c) 2004-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* mc_chroma.asm
;*
;* Abstract
;* mmx motion compensation for chroma
;*
;* History
;* 10/13/2004 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
SECTION .rodata align=16
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************
ALIGN 16
h264_d0x20_sse2:
dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
dw 32,32,32,32
;=============================================================================
; Code
;=============================================================================
SECTION .text
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq4_mmx( uint8_t *src,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; uint8_t *pABCD,
; int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
McChromaWidthEq4_mmx:
;push esi
;push edi
;push ebx
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
movsx r5, r5d
%endif
;mov eax, [esp +12 + 20]
movd mm3, [r4]; [eax]
WELS_Zero mm7
punpcklbw mm3, mm3
movq mm4, mm3
punpcklwd mm3, mm3
punpckhwd mm4, mm4
movq mm5, mm3
punpcklbw mm3, mm7
punpckhbw mm5, mm7
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
;mov esi, [esp +12+ 4]
;mov eax, [esp + 12 + 8]
;mov edi, [esp + 12 + 12]
;mov edx, [esp + 12 + 16]
;mov ecx, [esp + 12 + 24]
lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movd mm0, [r0]
movd mm1, [r0+1]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
.xloop:
pmullw mm0, mm3
pmullw mm1, mm5
paddw mm0, mm1
movd mm1, [r4]
punpcklbw mm1, mm7
movq mm2, mm1
pmullw mm1, mm4
paddw mm0, mm1
movd mm1, [r4+1]
punpcklbw mm1, mm7
movq mm7, mm1
pmullw mm1,mm6
paddw mm0, mm1
movq mm1,mm7
paddw mm0, [h264_d0x20_mmx]
psrlw mm0, 6
WELS_Zero mm7
packuswb mm0, mm7
movd [r2], mm0
movq mm0, mm2
lea r2, [r2 + r3]
lea r4, [r4 + r1]
dec r5
jnz near .xloop
WELSEMMS
LOAD_6_PARA_POP
;pop ebx
;pop edi
;pop esi
ret
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq8_sse2( uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; uint8_t *pABCD,
; int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
McChromaWidthEq8_sse2:
;push esi
;push edi
;push ebx
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
movsx r5, r5d
%endif
;mov eax, [esp +12 + 20]
movd xmm3, [r4]
WELS_Zero xmm7
punpcklbw xmm3, xmm3
punpcklwd xmm3, xmm3
movdqa xmm4, xmm3
punpckldq xmm3, xmm3
punpckhdq xmm4, xmm4
movdqa xmm5, xmm3
movdqa xmm6, xmm4
punpcklbw xmm3, xmm7
punpckhbw xmm5, xmm7
punpcklbw xmm4, xmm7
punpckhbw xmm6, xmm7
;mov esi, [esp +12+ 4]
;mov eax, [esp + 12 + 8]
;mov edi, [esp + 12 + 12]
;mov edx, [esp + 12 + 16]
;mov ecx, [esp + 12 + 24]
lea r4, [r0 + r1] ;lea ebx, [esi + eax]
movq xmm0, [r0]
movq xmm1, [r0+1]
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
.xloop:
pmullw xmm0, xmm3
pmullw xmm1, xmm5
paddw xmm0, xmm1
movq xmm1, [r4]
punpcklbw xmm1, xmm7
movdqa xmm2, xmm1
pmullw xmm1, xmm4
paddw xmm0, xmm1
movq xmm1, [r4+1]
punpcklbw xmm1, xmm7
movdqa xmm7, xmm1
pmullw xmm1, xmm6
paddw xmm0, xmm1
movdqa xmm1,xmm7
paddw xmm0, [h264_d0x20_sse2]
psrlw xmm0, 6
WELS_Zero xmm7
packuswb xmm0, xmm7
movq [r2], xmm0
movdqa xmm0, xmm2
lea r2, [r2 + r3]
lea r4, [r4 + r1]
dec r5
jnz near .xloop
LOAD_6_PARA_POP
;pop ebx
;pop edi
;pop esi
ret
ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; uint8_t *pABCD,
; int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
McChromaWidthEq8_ssse3:
;push ebx
;push esi
;push edi
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
movsx r5, r5d
%endif
;mov eax, [esp + 12 + 20]
pxor xmm7, xmm7
movd xmm5, [r4]
punpcklwd xmm5, xmm5
punpckldq xmm5, xmm5
movdqa xmm6, xmm5
punpcklqdq xmm5, xmm5
punpckhqdq xmm6, xmm6
;mov eax, [esp + 12 + 4]
;mov edx, [esp + 12 + 8]
;mov esi, [esp + 12 + 12]
;mov edi, [esp + 12 + 16]
;mov ecx, [esp + 12 + 24]
sub r2, r3 ;sub esi, edi
sub r2, r3
movdqa xmm7, [h264_d0x20_sse2]
movdqu xmm0, [r0]
movdqa xmm1, xmm0
psrldq xmm1, 1
punpcklbw xmm0, xmm1
.hloop_chroma:
lea r2, [r2+2*r3]
movdqu xmm2, [r0+r1]
movdqa xmm3, xmm2
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm4, xmm2
pmaddubsw xmm0, xmm5
pmaddubsw xmm2, xmm6
paddw xmm0, xmm2
paddw xmm0, xmm7
psrlw xmm0, 6
packuswb xmm0, xmm0
movq [r2],xmm0
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
movdqa xmm3, xmm2
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm0, xmm2
pmaddubsw xmm4, xmm5
pmaddubsw xmm2, xmm6
paddw xmm4, xmm2
paddw xmm4, xmm7
psrlw xmm4, 6
packuswb xmm4, xmm4
movq [r2+r3],xmm4
sub r5, 2
jnz .hloop_chroma
LOAD_6_PARA_POP
;pop edi
;pop esi
;pop ebx
ret
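
The three routines above share the prototype given in their header comments. A hedged C-side sketch of a caller follows; the pABCD packing shown (bilinear weights derived from the 1/8-pel fractional MV, summing to 64 to match the +32 then >>6 rounding in the asm) is an assumption, as is the dispatch helper itself.

#include <stdint.h>

void McChromaWidthEq4_mmx(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                          int32_t iDstStride, uint8_t *pABCD, int32_t iHeight);
void McChromaWidthEq8_sse2(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                           int32_t iDstStride, uint8_t *pABCD, int32_t iHeight);
void McChromaWidthEq8_ssse3(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
                            int32_t iDstStride, uint8_t *pABCD, int32_t iHeight);

/* Hypothetical dispatcher: dx,dy are the fractional MV in 1/8 pel. */
static void McChroma(uint8_t *pSrc, int32_t iSrcStride,
                     uint8_t *pDst, int32_t iDstStride,
                     int32_t iWidth, int32_t iHeight,
                     int32_t dx, int32_t dy, int bSSSE3) {
    uint8_t abcd[4] = {            /* assumed packing; weights sum to 64 */
        (uint8_t)((8 - dx) * (8 - dy)),  /* A: weight of src[0]       */
        (uint8_t)(dx * (8 - dy)),        /* B: weight of src[1]       */
        (uint8_t)((8 - dx) * dy),        /* C: weight of next row     */
        (uint8_t)(dx * dy),              /* D: weight of next row + 1 */
    };
    if (iWidth == 8)
        (bSSSE3 ? McChromaWidthEq8_ssse3 : McChromaWidthEq8_sse2)
            (pSrc, iSrcStride, pDst, iDstStride, abcd, iHeight);
    else
        McChromaWidthEq4_mmx(pSrc, iSrcStride, pDst, iDstStride, abcd, iHeight);
}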

File diff suppressed because it is too large.


@ -160,7 +160,7 @@ ALIGN 16
AnalysisVaaInfoIntra_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENTION r1,r1d
%ifdef X86_32
@ -175,16 +175,16 @@ AnalysisVaaInfoIntra_sse2:
and r5,0fh
sub r7,r5
sub r7,32
mov r2,r1
sal r2,$1 ;r2 = 2*iLineSize
mov r3,r2
add r3,r1 ;r3 = 3*iLineSize
mov r4,r2
sal r4,$1 ;r4 = 4*iLineSize
pxor xmm7, xmm7
; loops
@ -225,8 +225,8 @@ AnalysisVaaInfoIntra_sse2:
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
movd r2d, xmm0
and r2, 0ffffh ; effective low word truncated
mov r3, r2
@ -234,7 +234,7 @@ AnalysisVaaInfoIntra_sse2:
sar r2, $4
movd retrd, xmm1
sub retrd, r2d
add r7,32
add r7,r5
@ -244,7 +244,7 @@ AnalysisVaaInfoIntra_sse2:
pop r4
pop r3
%endif
ret
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@ -255,7 +255,7 @@ ALIGN 16
AnalysisVaaInfoIntra_ssse3:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENTION r1,r1d
%ifdef X86_32
@ -265,41 +265,41 @@ AnalysisVaaInfoIntra_ssse3:
push r6
%assign push_num push_num+4
%endif
mov r5,r7
and r5,0fh
sub r7,r5
sub r7,32
mov r2,r1
sal r2,$1 ;r2 = 2*iLineSize
mov r3,r2
add r3,r1 ;r3 = 3*iLineSize
mov r4,r2
sal r4,$1 ;r4 = 4*iLineSize
pxor xmm7, xmm7
; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7],xmm0
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+8],xmm1
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16],xmm0
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+24],xmm1
movdqa xmm0,[r7]
movdqa xmm1,[r7+16]
movdqa xmm2, xmm0
@ -322,7 +322,7 @@ AnalysisVaaInfoIntra_ssse3:
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
movd r2d, xmm0
and r2, 0ffffh ; effective low word truncated
mov r3, r2
@ -339,7 +339,7 @@ AnalysisVaaInfoIntra_ssse3:
pop r4
pop r3
%endif
ret
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
@ -368,7 +368,7 @@ MdInterAnalysisVaaInfo_sse41:
paddd xmm3, xmm4
movd r0d, xmm3
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
@ -412,7 +412,7 @@ MdInterAnalysisVaaInfo_sse2:
paddd xmm4, xmm5
pshufd xmm5, xmm4, 0B1h
paddd xmm5, xmm4
movd r0d, xmm5
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit
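
Both the _sse41 and _sse2 endings above reduce a SAD-derived statistic across the four 8x8 blocks and take an early exit when it falls below 20. The hunks do not show whether the reduced value is a plain sum or a variance, so the C sketch below assumes a sum; all names are illustrative.

#include <stdint.h>

#define INTER_VARIANCE_SAD_THRESHOLD 20  /* constant named in the comments */

static int32_t MdInterAnalysisRef(const int32_t iSadBlock[4]) {
    int32_t iSum = iSadBlock[0] + iSadBlock[1] + iSadBlock[2] + iSadBlock[3];
    if (iSum < INTER_VARIANCE_SAD_THRESHOLD)
        return 0;  /* .threshold_exit: area too flat to analyze further */
    /* otherwise the asm goes on to compare each block's SAD against the
     * average (the pcmpgtd "iSadBlock > iAverageSad" step above) */
    return 1;
}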


@ -477,7 +477,7 @@ WelsDecoderIChromaPredPlane_sse2:
SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c
;mov esi, [esp + pushsize + 4]
mov r0, r4
add r3, 16
imul r2, -3
add r3, r2 ; s = a + 16 + (-3)*c
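
The a/b/c terms assembled here feed H.264 chroma plane prediction; "s = a + 16 + (-3)*c" is the seed of that expression for the first row. For orientation, the reference form in C (the standard-defined formula, scalar rather than the SIMD flow above):

#include <stdint.h>

static uint8_t Clip255(int32_t v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

/* predC[x,y] = Clip((a + b*(x-3) + c*(y-3) + 16) >> 5) over the 8x8 block */
static void IChromaPredPlaneRef(uint8_t *pDst, int32_t iStride,
                                int32_t a, int32_t b, int32_t c) {
    for (int32_t y = 0; y < 8; y++)
        for (int32_t x = 0; x < 8; x++)
            pDst[y * iStride + x] =
                Clip255((a + b * (x - 3) + c * (y - 3) + 16) >> 5);
}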


@ -186,7 +186,7 @@ WelsIDctT4Rec_mmx:
movsx r1, r1d
movsx r3, r3d
%endif
; mov eax, [pDct ]
movq mm0, [r4+ 0]
movq mm1, [r4+ 8]
movq mm2, [r4+16]


@ -32,7 +32,7 @@
;* memzero.asm
;*
;* Abstract
;*
;*
;* History
;* 9/16/2009 Created
@ -45,8 +45,8 @@
; Code
;***********************************************************************
SECTION .text
ALIGN 16
;***********************************************************************
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@ -57,7 +57,7 @@ WelsPrefetchZero_mmx:
LOAD_1_PARA
;mov eax,[esp+4]
prefetchnta [r0]
ret
ALIGN 16
@ -71,7 +71,7 @@ WelsSetMemZeroAligned64_sse2:
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
neg r1
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
movdqa [r0], xmm0
@ -79,11 +79,11 @@ WelsSetMemZeroAligned64_sse2:
movdqa [r0+32], xmm0
movdqa [r0+48], xmm0
add r0, 0x40
add r1, 0x40
jnz near .memzeroa64_sse2_loops
ret
ALIGN 16
;***********************************************************************
@ -96,7 +96,7 @@ WelsSetMemZeroSize64_mmx:
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
neg r1
pxor mm0, mm0
.memzero64_mmx_loops:
movq [r0], mm0
@ -106,16 +106,16 @@ WelsSetMemZeroSize64_mmx:
movq [r0+32], mm0
movq [r0+40], mm0
movq [r0+48], mm0
movq [r0+56], mm0
add r0, 0x40
add r1, 0x40
jnz near .memzero64_mmx_loops
WELSEMMS
ret
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
@ -125,17 +125,17 @@ WelsSetMemZeroSize8_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
neg r1
pxor mm0, mm0
.memzero8_mmx_loops:
movq [r0], mm0
add r0, 0x08
add r1, 0x08
jnz near .memzero8_mmx_loops
WELSEMMS
ret
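
A pattern shared by all of the loops above: the byte count is negated up front, then each iteration adds the block size, and the jnz falls through exactly at zero, so loop control costs one add plus one branch. The aligned 64-byte variant rendered with C intrinsics, as a sketch (assumes a 16-byte-aligned dst and a size that is a positive multiple of 64, as the asm does):

#include <stdint.h>
#include <emmintrin.h>

static void SetMemZeroAligned64Ref(void *pDst, int32_t iSize) {
    uint8_t *p = (uint8_t *)pDst;
    const __m128i zero = _mm_setzero_si128();
    for (int32_t i = -iSize; i != 0; i += 64, p += 64) {  /* neg r1 ... jnz */
        _mm_store_si128((__m128i *)(p +  0), zero);
        _mm_store_si128((__m128i *)(p + 16), zero);
        _mm_store_si128((__m128i *)(p + 32), zero);
        _mm_store_si128((__m128i *)(p + 48), zero);
    }
}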

File diff suppressed because it is too large.


@ -179,15 +179,15 @@ WELS_EXTERN BilateralLumaFilter8_sse2
;%define stride r1
BilateralLumaFilter8_sse2:
push r3
%assign push_num 1
LOAD_2_PARA
pxor xmm7, xmm7
mov r3, r0
movq xmm6, [r0]
punpcklbw xmm6, xmm7
movdqa xmm3, [sse2_32]
@ -218,10 +218,10 @@ BilateralLumaFilter8_sse2:
packuswb xmm5, xmm5
movq [r3], xmm5
pop r3
%assign push_num 0
ret
WELS_EXTERN WaverageChromaFilter8_sse2
@ -239,11 +239,11 @@ ALIGN 16
WaverageChromaFilter8_sse2:
push r3
%assign push_num 1
LOAD_2_PARA
mov r3, r1
add r3, r3
sub r0, r3 ; pixels - 2 * stride
@ -272,8 +272,8 @@ WaverageChromaFilter8_sse2:
packuswb xmm3, xmm3
movq [r0 + 2], xmm3
pop r3
%assign push_num 0
ret


@ -84,24 +84,24 @@ WelsSampleSad8x8_sse21:
;push edi
;mov eax, [esp+12]
;mov ebx, [esp+16]
%assign push_num 0
mov r2, arg3
push r2
CACHE_SPLIT_CHECK r2, 8, 64
jle near .pixel_sad_8x8_nsplit
pop r2
%ifdef X86_32
push r3
push r4
push r5
%endif
%assign push_num 3
mov r0, arg1
mov r1, arg2
SIGN_EXTENTION r1, r1d
pxor xmm7, xmm7
;ecx r2, edx r4, edi r5
mov r5, r2
@ -195,18 +195,18 @@ WelsSampleSad8x8_sse21:
pop r3
%endif
jmp .return
.pixel_sad_8x8_nsplit:
;push ebx
;mov eax, [esp+8]
;mov ebx, [esp+12]
;mov edx, [esp+20]
pop r2
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
SIGN_EXTENTION r3, r3d
pxor xmm6, xmm6
SSE2_GetSad8x4
lea r0, [r0+2*r1]
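
CACHE_SPLIT_CHECK r2, 8, 64 (its body is outside this diff) selects between the two paths above: the split path runs when an 8-byte row load would straddle a 64-byte cache line. A guess at the test in C; the predicate is an assumption, not the macro's quoted body.

#include <stdint.h>

/* Assumed semantics of CACHE_SPLIT_CHECK addr, 8, 64: true when an 8-byte
 * load at addr crosses a 64-byte line boundary. */
static int CrossesCacheLine(const uint8_t *addr, int iLoadBytes, int iLineBytes) {
    return ((uintptr_t)addr & (iLineBytes - 1)) > (uintptr_t)(iLineBytes - iLoadBytes);
}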