Merge pull request #97 from mstorsjo/asm-source-cleanup
Make all asm sources consistently use unix newlines and remove trailing whitespace
This commit is contained in:
commit
9230b49728
@ -154,7 +154,7 @@ BITS 64
|
||||
%define PUSHRFLAGS pushfq
|
||||
%define POPRFLAGS popfq
|
||||
%define retrq rax
|
||||
%define retrd eax
|
||||
%define retrd eax
|
||||
|
||||
%elifdef X86_32 ; X86_32 ;************************************
|
||||
|
||||
@ -233,7 +233,7 @@ BITS 32
|
||||
%macro LOAD_4_PARA 0
|
||||
%ifdef X86_32
|
||||
push r3
|
||||
%assign push_num push_num+1
|
||||
%assign push_num push_num+1
|
||||
mov r0, [esp + push_num*4 + 4]
|
||||
mov r1, [esp + push_num*4 + 8]
|
||||
mov r2, [esp + push_num*4 + 12]
|
||||
@ -245,7 +245,7 @@ BITS 32
|
||||
%ifdef X86_32
|
||||
push r3
|
||||
push r4
|
||||
%assign push_num push_num+2
|
||||
%assign push_num push_num+2
|
||||
mov r0, [esp + push_num*4 + 4]
|
||||
mov r1, [esp + push_num*4 + 8]
|
||||
mov r2, [esp + push_num*4 + 12]
|
||||
@ -261,7 +261,7 @@ BITS 32
|
||||
push r3
|
||||
push r4
|
||||
push r5
|
||||
%assign push_num push_num+3
|
||||
%assign push_num push_num+3
|
||||
mov r0, [esp + push_num*4 + 4]
|
||||
mov r1, [esp + push_num*4 + 8]
|
||||
mov r2, [esp + push_num*4 + 12]
|
||||
@ -280,7 +280,7 @@ BITS 32
|
||||
push r4
|
||||
push r5
|
||||
push r6
|
||||
%assign push_num push_num+4
|
||||
%assign push_num push_num+4
|
||||
mov r0, [esp + push_num*4 + 4]
|
||||
mov r1, [esp + push_num*4 + 8]
|
||||
mov r2, [esp + push_num*4 + 12]
|
||||
@ -334,7 +334,7 @@ BITS 32
|
||||
movsx %1, %2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
||||
%macro WELS_EXTERN 1
|
||||
%ifdef PREFIX
|
||||
global _%1
|
||||
|
@ -81,17 +81,17 @@ ALIGN 16
|
||||
%ifdef WIN64
|
||||
|
||||
WelsCPUId:
|
||||
push rbx
|
||||
push rdx
|
||||
|
||||
push rbx
|
||||
push rdx
|
||||
|
||||
mov eax, ecx
|
||||
mov rcx, [r9]
|
||||
cpuid
|
||||
cpuid
|
||||
mov [r9], ecx
|
||||
mov [r8], ebx
|
||||
mov rcx, [rsp + 2*8 + 40]
|
||||
mov rcx, [rsp + 2*8 + 40]
|
||||
mov [rcx], edx
|
||||
pop rdx
|
||||
pop rdx
|
||||
mov [rdx], eax
|
||||
|
||||
pop rbx
|
||||
@ -103,8 +103,8 @@ WelsCPUId:
|
||||
push rcx
|
||||
push rdx
|
||||
|
||||
mov eax, edi
|
||||
mov rcx, [rcx]
|
||||
mov eax, edi
|
||||
mov rcx, [rcx]
|
||||
cpuid
|
||||
mov [r8], edx
|
||||
pop rdx
|
||||
@ -156,9 +156,9 @@ WelsCPUSupportAVX:
|
||||
%elifdef UNIX64
|
||||
mov eax, edi
|
||||
mov ecx, esi
|
||||
%else
|
||||
%else
|
||||
mov eax, [esp+4]
|
||||
mov ecx, [esp+8]
|
||||
mov ecx, [esp+8]
|
||||
%endif
|
||||
|
||||
; refer to detection of AVX addressed in INTEL AVX manual document
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -244,7 +244,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
|
||||
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
|
||||
;r6 [height]
|
||||
;r0 [pSrc+0] r5[pSrc-32] r1[stride]
|
||||
;r3 [pSrc+(w-1)] r4[pSrc+w]
|
||||
;r3 [pSrc+(w-1)] r4[pSrc+w]
|
||||
|
||||
%if %1 == 32 ; for luma
|
||||
.left_right_loops:
|
||||
@ -375,13 +375,13 @@ ExpandPictureLuma_sse2:
|
||||
|
||||
%assign push_num 3
|
||||
LOAD_4_PARA
|
||||
|
||||
|
||||
SIGN_EXTENTION r1, r1d
|
||||
SIGN_EXTENTION r2, r2d
|
||||
SIGN_EXTENTION r3, r3d
|
||||
|
||||
;also prepare for cross border pData top-left:xmm3
|
||||
|
||||
|
||||
movzx r6d,byte[r0]
|
||||
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
|
||||
|
||||
@ -395,22 +395,22 @@ ExpandPictureLuma_sse2:
|
||||
dec r3 ;h-1
|
||||
imul r3,r1 ;(h-1)*stride
|
||||
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
|
||||
|
||||
|
||||
mov r6,r1 ;r6 = stride
|
||||
sal r6,05h ;r6 = 32*stride
|
||||
lea r4,[r3+r6] ;r4 = dst bottom
|
||||
|
||||
|
||||
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
|
||||
|
||||
|
||||
movzx r6d,byte [r3] ;bottom-left
|
||||
SSE2_Copy16Times xmm5,r6d
|
||||
|
||||
|
||||
lea r6,[r3+r2-1]
|
||||
movzx r6d,byte [r6]
|
||||
SSE2_Copy16Times xmm6,r6d ;bottom-right
|
||||
|
||||
|
||||
neg r1 ;r1 = -stride
|
||||
|
||||
|
||||
push r0
|
||||
push r1
|
||||
push r2
|
||||
@ -419,20 +419,20 @@ ExpandPictureLuma_sse2:
|
||||
|
||||
; for both left and right border
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
pop r2
|
||||
pop r1
|
||||
pop r0
|
||||
|
||||
lea r5,[r0-32] ;left border dst luma =32 chroma = -16
|
||||
|
||||
|
||||
lea r3,[r0+r2-1] ;right border src
|
||||
lea r4,[r3+1] ;right border dst
|
||||
|
||||
;prepare for cross border data: top-right with xmm4
|
||||
movzx r6d,byte [r3] ;top -rigth
|
||||
SSE2_Copy16Times xmm4,r6d
|
||||
|
||||
|
||||
neg r1 ;r1 = stride
|
||||
|
||||
|
||||
@ -444,7 +444,7 @@ ExpandPictureLuma_sse2:
|
||||
push r1
|
||||
push r2
|
||||
push r6
|
||||
|
||||
|
||||
exp_left_right_sse2 32,a
|
||||
|
||||
pop r6
|
||||
@ -455,33 +455,33 @@ ExpandPictureLuma_sse2:
|
||||
; for cross border [top-left, top-right, bottom-left, bottom-right]
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; xmm3,..,xmm6 cross pData initialization has been done above; perform padding as below, to be continued..
|
||||
|
||||
|
||||
neg r1 ;r1 = -stride
|
||||
lea r3,[r0-32]
|
||||
lea r3,[r3+r1] ;last line of top-left border
|
||||
|
||||
|
||||
lea r4,[r0+r2] ;psrc +width
|
||||
lea r4,[r4+r1] ;psrc +width -stride
|
||||
|
||||
|
||||
|
||||
|
||||
neg r1 ;r1 = stride
|
||||
add r6,32 ;height +32(16) ,luma = 32, chroma = 16
|
||||
imul r6,r1
|
||||
|
||||
|
||||
lea r5,[r3+r6] ;last line of bottom-left border
|
||||
lea r6,[r4+r6] ;last line of botoom-right border
|
||||
|
||||
|
||||
neg r1 ; r1 = -stride
|
||||
|
||||
; for left & right border expanding
|
||||
exp_cross_sse2 32,a
|
||||
|
||||
LOAD_4_PARA_POP
|
||||
|
||||
|
||||
pop r6
|
||||
pop r5
|
||||
pop r4
|
||||
|
||||
|
||||
%assign push_num 0
|
||||
|
||||
|
||||
@ -495,7 +495,7 @@ ALIGN 16
|
||||
; const int32_t iHeight );
|
||||
;***********************************************************************----------------
|
||||
ExpandPictureChromaAlign_sse2:
|
||||
|
||||
|
||||
push r4
|
||||
push r5
|
||||
push r6
|
||||
@ -508,7 +508,7 @@ ExpandPictureChromaAlign_sse2:
|
||||
SIGN_EXTENTION r3,r3d
|
||||
|
||||
;also prepare for cross border pData top-left:xmm3
|
||||
|
||||
|
||||
movzx r6d,byte [r0]
|
||||
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
|
||||
|
||||
@ -522,44 +522,44 @@ ExpandPictureChromaAlign_sse2:
|
||||
dec r3 ;h-1
|
||||
imul r3,r1 ;(h-1)*stride
|
||||
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
|
||||
|
||||
|
||||
mov r6,r1 ;r6 = stride
|
||||
sal r6,04h ;r6 = 16*stride
|
||||
lea r4,[r3+r6] ;r4 = dst bottom
|
||||
|
||||
lea r4,[r3+r6] ;r4 = dst bottom
|
||||
|
||||
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
|
||||
|
||||
|
||||
movzx r6d,byte [r3] ;bottom-left
|
||||
SSE2_Copy16Times xmm5,r6d
|
||||
|
||||
|
||||
lea r6,[r3+r2-1]
|
||||
movzx r6d,byte [r6]
|
||||
SSE2_Copy16Times xmm6,r6d ;bottom-right
|
||||
|
||||
|
||||
neg r1 ;r1 = -stride
|
||||
|
||||
|
||||
push r0
|
||||
push r1
|
||||
push r1
|
||||
push r2
|
||||
|
||||
exp_top_bottom_sse2 16
|
||||
|
||||
; for both left and right border
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
pop r2
|
||||
pop r1
|
||||
pop r0
|
||||
|
||||
lea r5,[r0-16] ;left border dst luma =32 chroma = -16
|
||||
|
||||
lea r3,[r0+r2-1] ;right border src
|
||||
|
||||
lea r3,[r0+r2-1] ;right border src
|
||||
lea r4,[r3+1] ;right border dst
|
||||
|
||||
;prepare for cross border data: top-right with xmm4
|
||||
movzx r6d,byte [r3] ;top -rigth
|
||||
SSE2_Copy16Times xmm4,r6d
|
||||
|
||||
|
||||
neg r1 ;r1 = stride
|
||||
|
||||
|
||||
@ -568,7 +568,7 @@ ExpandPictureChromaAlign_sse2:
|
||||
|
||||
|
||||
push r0
|
||||
push r1
|
||||
push r1
|
||||
push r2
|
||||
push r6
|
||||
exp_left_right_sse2 16,a
|
||||
@ -581,33 +581,33 @@ ExpandPictureChromaAlign_sse2:
|
||||
; for cross border [top-left, top-right, bottom-left, bottom-right]
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; xmm3,..,xmm6 cross pData initialization has been done above; perform padding as below, to be continued..
|
||||
|
||||
|
||||
neg r1 ;r1 = -stride
|
||||
lea r3,[r0-16]
|
||||
lea r3,[r3+r1] ;last line of top-left border
|
||||
|
||||
|
||||
lea r4,[r0+r2] ;psrc +width
|
||||
lea r4,[r4+r1] ;psrc +width -stride
|
||||
|
||||
|
||||
lea r4,[r4+r1] ;psrc +width -stride
|
||||
|
||||
|
||||
neg r1 ;r1 = stride
|
||||
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
|
||||
imul r6,r1
|
||||
|
||||
|
||||
lea r5,[r3+r6] ;last line of bottom-left border
|
||||
lea r6,[r4+r6] ;last line of botoom-right border
|
||||
|
||||
|
||||
neg r1 ; r1 = -stride
|
||||
|
||||
; for left & right border expanding
|
||||
exp_cross_sse2 16,a
|
||||
|
||||
LOAD_4_PARA_POP
|
||||
|
||||
|
||||
pop r6
|
||||
pop r5
|
||||
pop r4
|
||||
|
||||
|
||||
%assign push_num 0
|
||||
|
||||
|
||||
@ -633,7 +633,7 @@ ExpandPictureChromaUnalign_sse2:
|
||||
SIGN_EXTENTION r3,r3d
|
||||
|
||||
;also prepare for cross border pData top-left:xmm3
|
||||
|
||||
|
||||
movzx r6d,byte [r0]
|
||||
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
|
||||
|
||||
@ -647,44 +647,44 @@ ExpandPictureChromaUnalign_sse2:
|
||||
dec r3 ;h-1
|
||||
imul r3,r1 ;(h-1)*stride
|
||||
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
|
||||
|
||||
|
||||
mov r6,r1 ;r6 = stride
|
||||
sal r6,04h ;r6 = 16*stride
|
||||
lea r4,[r3+r6] ;r4 = dst bottom
|
||||
|
||||
lea r4,[r3+r6] ;r4 = dst bottom
|
||||
|
||||
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
|
||||
|
||||
|
||||
movzx r6d,byte [r3] ;bottom-left
|
||||
SSE2_Copy16Times xmm5,r6d
|
||||
|
||||
|
||||
lea r6,[r3+r2-1]
|
||||
movzx r6d,byte [r6]
|
||||
SSE2_Copy16Times xmm6,r6d ;bottom-right
|
||||
|
||||
|
||||
neg r1 ;r1 = -stride
|
||||
|
||||
|
||||
push r0
|
||||
push r1
|
||||
push r1
|
||||
push r2
|
||||
|
||||
exp_top_bottom_sse2 16
|
||||
|
||||
; for both left and right border
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
pop r2
|
||||
pop r1
|
||||
pop r0
|
||||
|
||||
lea r5,[r0-16] ;left border dst luma =32 chroma = -16
|
||||
|
||||
lea r3,[r0+r2-1] ;right border src
|
||||
|
||||
lea r3,[r0+r2-1] ;right border src
|
||||
lea r4,[r3+1] ;right border dst
|
||||
|
||||
;prepare for cross border data: top-right with xmm4
|
||||
movzx r6d,byte [r3] ;top -rigth
|
||||
SSE2_Copy16Times xmm4,r6d
|
||||
|
||||
|
||||
neg r1 ;r1 = stride
|
||||
|
||||
|
||||
@ -693,7 +693,7 @@ ExpandPictureChromaUnalign_sse2:
|
||||
|
||||
|
||||
push r0
|
||||
push r1
|
||||
push r1
|
||||
push r2
|
||||
push r6
|
||||
exp_left_right_sse2 16,u
|
||||
@ -706,35 +706,34 @@ ExpandPictureChromaUnalign_sse2:
|
||||
; for cross border [top-left, top-right, bottom-left, bottom-right]
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; xmm3,..,xmm6 cross pData initialization has been done above; perform padding as below, to be continued..
|
||||
|
||||
|
||||
neg r1 ;r1 = -stride
|
||||
lea r3,[r0-16]
|
||||
lea r3,[r3+r1] ;last line of top-left border
|
||||
|
||||
|
||||
lea r4,[r0+r2] ;psrc +width
|
||||
lea r4,[r4+r1] ;psrc +width -stride
|
||||
|
||||
|
||||
lea r4,[r4+r1] ;psrc +width -stride
|
||||
|
||||
|
||||
neg r1 ;r1 = stride
|
||||
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
|
||||
imul r6,r1
|
||||
|
||||
|
||||
lea r5,[r3+r6] ;last line of bottom-left border
|
||||
lea r6,[r4+r6] ;last line of botoom-right border
|
||||
|
||||
|
||||
neg r1 ; r1 = -stride
|
||||
|
||||
; for left & right border expanding
|
||||
exp_cross_sse2 16,u
|
||||
|
||||
LOAD_4_PARA_POP
|
||||
|
||||
|
||||
pop r6
|
||||
pop r5
|
||||
pop r4
|
||||
|
||||
|
||||
%assign push_num 0
|
||||
|
||||
|
||||
ret
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,345 +1,345 @@
|
||||
;*!
|
||||
;* \copy
|
||||
;* Copyright (c) 2004-2013, Cisco Systems
|
||||
;* All rights reserved.
|
||||
;*
|
||||
;* Redistribution and use in source and binary forms, with or without
|
||||
;* modification, are permitted provided that the following conditions
|
||||
;* are met:
|
||||
;*
|
||||
;* * Redistributions of source code must retain the above copyright
|
||||
;* notice, this list of conditions and the following disclaimer.
|
||||
;*
|
||||
;* * Redistributions in binary form must reproduce the above copyright
|
||||
;* notice, this list of conditions and the following disclaimer in
|
||||
;* the documentation and/or other materials provided with the
|
||||
;* distribution.
|
||||
;*
|
||||
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
;* POSSIBILITY OF SUCH DAMAGE.
|
||||
;*
|
||||
;*
|
||||
;* mc_chroma.asm
|
||||
;*
|
||||
;* Abstract
|
||||
;* mmx motion compensation for chroma
|
||||
;*
|
||||
;* History
|
||||
;* 10/13/2004 Created
|
||||
;*
|
||||
;*
|
||||
;*************************************************************************/
|
||||
%include "asm_inc.asm"
|
||||
|
||||
;***********************************************************************
|
||||
; Local Data (Read Only)
|
||||
;***********************************************************************
|
||||
|
||||
SECTION .rodata align=16
|
||||
|
||||
;***********************************************************************
|
||||
; Various memory constants (trigonometric values or rounding values)
|
||||
;***********************************************************************
|
||||
|
||||
ALIGN 16
; Rounding term 32 (0x20) replicated into eight 16-bit lanes; added with
; paddw before the ">> 6" in the SSE2/SSSE3 chroma routines of this file.
h264_d0x20_sse2:
    times 8 dw 32

ALIGN 16
; Same rounding term in four 16-bit lanes, used by the MMX routine.
h264_d0x20_mmx:
    times 4 dw 32
|
||||
|
||||
|
||||
;=============================================================================
|
||||
; Code
|
||||
;=============================================================================
|
||||
|
||||
SECTION .text
|
||||
|
||||
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq4_mmx( uint8_t *src,
;                            int32_t iSrcStride,
;                            uint8_t *pDst,
;                            int32_t iDstStride,
;                            uint8_t *pABCD,
;                            int32_t iHeigh );
;
; 4-pixel-wide chroma interpolation. For every output row:
;   pDst[x] = (A*s[x] + B*s[x+1] + C*n[x] + D*n[x+1] + 32) >> 6,   x = 0..3
; where s is the current source row, n the row below it, and A,B,C,D are the
; four bytes read from pABCD.
;
; Register use as seen in the body (r0..r5 are set up by LOAD_6_PARA,
; presumably from the six arguments in order -- confirm in asm_inc.asm):
;   r0 = first source row          r1 = source stride
;   r2 = destination row           r3 = destination stride
;   r4 = pABCD, later the pointer to the "next" source row
;   r5 = rows remaining (do/while loop: at least one row is always written)
;   mm3/mm5/mm4/mm6 = A/B/C/D as four words each, mm7 = zero / scratch
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
McChromaWidthEq4_mmx:
    %assign push_num 0
    LOAD_6_PARA
%ifndef X86_32
    ; strides and height are int32_t: widen them to pointer width
    movsx       r1, r1d
    movsx       r3, r3d
    movsx       r5, r5d
%endif

    ; ---- expand the four byte weights into four word vectors ----
    movd        mm3, [r4]               ; mm3 = D C B A (bytes)
    WELS_Zero   mm7
    punpcklbw   mm3, mm3                ; A A B B C C D D
    movq        mm4, mm3
    punpcklwd   mm3, mm3                ; A A A A B B B B
    punpckhwd   mm4, mm4                ; C C C C D D D D

    movq        mm5, mm3
    punpcklbw   mm3, mm7                ; mm3 = A as 4 words
    punpckhbw   mm5, mm7                ; mm5 = B as 4 words

    movq        mm6, mm4
    punpcklbw   mm4, mm7                ; mm4 = C as 4 words
    punpckhbw   mm6, mm7                ; mm6 = D as 4 words

    ; ---- prime the pipeline with the first source row ----
    lea         r4, [r0 + r1]           ; r4 -> row below the current one
    movd        mm0, [r0]
    movd        mm1, [r0+1]
    punpcklbw   mm0, mm7                ; mm0 = s[x]   as words
    punpcklbw   mm1, mm7                ; mm1 = s[x+1] as words

.row_loop:
    pmullw      mm0, mm3                ; A * s[x]
    pmullw      mm1, mm5                ; B * s[x+1]
    paddw       mm0, mm1

    movd        mm1, [r4]
    punpcklbw   mm1, mm7                ; n[x] as words
    movq        mm2, mm1                ; keep n[x]: it is s[x] of the next row
    pmullw      mm1, mm4                ; C * n[x]
    paddw       mm0, mm1

    movd        mm1, [r4+1]
    punpcklbw   mm1, mm7                ; n[x+1] as words
    movq        mm7, mm1                ; park n[x+1] in mm7 across the multiply
    pmullw      mm1, mm6                ; D * n[x+1]
    paddw       mm0, mm1
    movq        mm1, mm7                ; mm1 = s[x+1] of the next row

    paddw       mm0, [h264_d0x20_mmx]   ; + 32 (rounding)
    psrlw       mm0, 6                  ; >> 6

    WELS_Zero   mm7                     ; mm7 was used as scratch: zero it again
    packuswb    mm0, mm7
    movd        [r2], mm0               ; store 4 output pixels

    movq        mm0, mm2                ; mm0 = s[x] of the next row

    lea         r2, [r2 + r3]           ; next destination row
    lea         r4, [r4 + r1]           ; next source row

    dec         r5
    jnz near    .row_loop

    WELSEMMS
    LOAD_6_PARA_POP
    ret
|
||||
|
||||
|
||||
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq8_sse2( uint8_t *pSrc,
;                             int32_t iSrcStride,
;                             uint8_t *pDst,
;                             int32_t iDstStride,
;                             uint8_t *pABCD,
;                             int32_t iheigh );
;
; 8-pixel-wide chroma interpolation. For every output row:
;   pDst[x] = (A*s[x] + B*s[x+1] + C*n[x] + D*n[x+1] + 32) >> 6,   x = 0..7
; where s is the current source row, n the row below it, and A,B,C,D are the
; four bytes read from pABCD.
;
; Register use as seen in the body (r0..r5 are set up by LOAD_6_PARA,
; presumably from the six arguments in order -- confirm in asm_inc.asm):
;   r0 = first source row          r1 = source stride
;   r2 = destination row           r3 = destination stride
;   r4 = pABCD, later the pointer to the "next" source row
;   r5 = rows remaining (do/while loop: at least one row is always written)
;   xmm3/xmm5/xmm4/xmm6 = A/B/C/D as eight words each, xmm7 = zero / scratch
;
; Clobbers xmm0-xmm7. xmm6 and xmm7 are callee-saved in the Microsoft x64
; calling convention, so they are preserved on the stack when WIN64 is defined.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
McChromaWidthEq8_sse2:
    %assign push_num 0
    LOAD_6_PARA
%ifndef X86_32
    ; strides and height are int32_t: widen them to pointer width
    movsx       r1, r1d
    movsx       r3, r3d
    movsx       r5, r5d
%endif

%ifdef WIN64
    ; Preserve the non-volatile XMM registers this routine overwrites.
    ; Done after LOAD_6_PARA so any rsp-relative argument loads in that macro
    ; are not disturbed; movdqu is used so no stack alignment is assumed.
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp + 16], xmm7
%endif

    ; ---- expand the four byte weights into four word vectors ----
    movd        xmm3, [r4]              ; xmm3 = D C B A (bytes)
    WELS_Zero   xmm7
    punpcklbw   xmm3, xmm3              ; A A B B C C D D
    punpcklwd   xmm3, xmm3              ; A x4, B x4, C x4, D x4

    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm3              ; A x8, B x8
    punpckhdq   xmm4, xmm4              ; C x8, D x8
    movdqa      xmm5, xmm3
    movdqa      xmm6, xmm4

    punpcklbw   xmm3, xmm7              ; xmm3 = A as 8 words
    punpckhbw   xmm5, xmm7              ; xmm5 = B as 8 words
    punpcklbw   xmm4, xmm7              ; xmm4 = C as 8 words
    punpckhbw   xmm6, xmm7              ; xmm6 = D as 8 words

    ; ---- prime the pipeline with the first source row ----
    lea         r4, [r0 + r1]           ; r4 -> row below the current one
    movq        xmm0, [r0]
    movq        xmm1, [r0+1]
    punpcklbw   xmm0, xmm7              ; xmm0 = s[x]   as words
    punpcklbw   xmm1, xmm7              ; xmm1 = s[x+1] as words

.xloop:
    pmullw      xmm0, xmm3              ; A * s[x]
    pmullw      xmm1, xmm5              ; B * s[x+1]
    paddw       xmm0, xmm1

    movq        xmm1, [r4]
    punpcklbw   xmm1, xmm7              ; n[x] as words
    movdqa      xmm2, xmm1              ; keep n[x]: it is s[x] of the next row
    pmullw      xmm1, xmm4              ; C * n[x]
    paddw       xmm0, xmm1

    movq        xmm1, [r4+1]
    punpcklbw   xmm1, xmm7              ; n[x+1] as words
    movdqa      xmm7, xmm1              ; park n[x+1] in xmm7 across the multiply
    pmullw      xmm1, xmm6              ; D * n[x+1]
    paddw       xmm0, xmm1
    movdqa      xmm1, xmm7              ; xmm1 = s[x+1] of the next row

    paddw       xmm0, [h264_d0x20_sse2] ; + 32 (rounding)
    psrlw       xmm0, 6                 ; >> 6

    WELS_Zero   xmm7                    ; xmm7 was used as scratch: zero it again
    packuswb    xmm0, xmm7
    movq        [r2], xmm0              ; store 8 output pixels

    movdqa      xmm0, xmm2              ; xmm0 = s[x] of the next row

    lea         r2, [r2 + r3]           ; next destination row
    lea         r4, [r4 + r1]           ; next source row

    dec         r5
    jnz near    .xloop

%ifdef WIN64
    ; Restore the callee-saved XMM registers and release the save area.
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp + 16]
    add         rsp, 32
%endif

    LOAD_6_PARA_POP
    ret
|
||||
|
||||
|
||||
|
||||
|
||||
ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
;                              int32_t iSrcStride,
;                              uint8_t *pDst,
;                              int32_t iDstStride,
;                              uint8_t *pABCD,
;                              int32_t iHeigh);
;
; 8-pixel-wide chroma interpolation, two output rows per loop iteration:
;   pDst[x] = (A*s[x] + B*s[x+1] + C*n[x] + D*n[x+1] + 32) >> 6,   x = 0..7
; where s is the current source row, n the row below it, and A,B,C,D are the
; four bytes read from pABCD. Source pixels are interleaved as (s[x], s[x+1])
; byte pairs so that one pmaddubsw yields A*s[x] + B*s[x+1] per word.
;
; Register use as seen in the body (r0..r5 are set up by LOAD_6_PARA,
; presumably from the six arguments in order -- confirm in asm_inc.asm):
;   r0 = current source row        r1 = source stride
;   r2 = destination row           r3 = destination stride
;   r4 = pABCD
;   r5 = rows remaining; decremented by 2 per iteration, so the routine
;        always writes an even number of rows (at least two)
;   xmm5 = (A,B) byte pairs x8, xmm6 = (C,D) byte pairs x8, xmm7 = 32 x8 words
;
; Each source row is fetched with a 16-byte movdqu although only 9 bytes are
; consumed. Clobbers xmm0-xmm7. xmm6 and xmm7 are callee-saved in the Microsoft
; x64 calling convention, so they are preserved on the stack when WIN64 is
; defined.
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
McChromaWidthEq8_ssse3:
    %assign push_num 0
    LOAD_6_PARA
%ifndef X86_32
    ; strides and height are int32_t: widen them to pointer width
    movsx       r1, r1d
    movsx       r3, r3d
    movsx       r5, r5d
%endif

%ifdef WIN64
    ; Preserve the non-volatile XMM registers this routine overwrites.
    ; Done after LOAD_6_PARA so any rsp-relative argument loads in that macro
    ; are not disturbed; movdqu is used so no stack alignment is assumed.
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp + 16], xmm7
%endif

    ; ---- build the (A,B) and (C,D) byte-pair weight vectors ----
    ; (the former "pxor xmm7, xmm7" here was dead: xmm7 is loaded with the
    ;  rounding constant below before it is ever read)
    movd        xmm5, [r4]              ; xmm5 = D C B A (bytes)
    punpcklwd   xmm5, xmm5              ; AB AB CD CD
    punpckldq   xmm5, xmm5              ; AB x4, CD x4
    movdqa      xmm6, xmm5
    punpcklqdq  xmm5, xmm5              ; xmm5 = AB x8
    punpckhqdq  xmm6, xmm6              ; xmm6 = CD x8

    ; r2 is advanced by two rows at the top of every iteration, so start
    ; two rows above the real destination.
    sub         r2, r3
    sub         r2, r3
    movdqa      xmm7, [h264_d0x20_sse2] ; xmm7 = rounding term 32 in 8 words

    ; ---- prime the pipeline with the first source row ----
    movdqu      xmm0, [r0]
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                 ; s[x+1]
    punpcklbw   xmm0, xmm1              ; xmm0 = (s[x], s[x+1]) pairs

.hloop_chroma:
    lea         r2, [r2+2*r3]           ; r2 -> first of the two rows to write

    ; ---- first output row: rows r0 and r0+stride ----
    movdqu      xmm2, [r0+r1]
    movdqa      xmm3, xmm2
    psrldq      xmm3, 1
    punpcklbw   xmm2, xmm3              ; (n[x], n[x+1]) pairs
    movdqa      xmm4, xmm2              ; keep: top row of the second output

    pmaddubsw   xmm0, xmm5              ; A*s[x] + B*s[x+1]
    pmaddubsw   xmm2, xmm6              ; C*n[x] + D*n[x+1]
    paddw       xmm0, xmm2
    paddw       xmm0, xmm7              ; + 32
    psrlw       xmm0, 6
    packuswb    xmm0, xmm0
    movq        [r2], xmm0

    ; ---- second output row: rows r0+stride and r0+2*stride ----
    lea         r0, [r0+2*r1]
    movdqu      xmm2, [r0]
    movdqa      xmm3, xmm2
    psrldq      xmm3, 1
    punpcklbw   xmm2, xmm3
    movdqa      xmm0, xmm2              ; keep: top row of the next iteration

    pmaddubsw   xmm4, xmm5
    pmaddubsw   xmm2, xmm6
    paddw       xmm4, xmm2
    paddw       xmm4, xmm7              ; + 32
    psrlw       xmm4, 6
    packuswb    xmm4, xmm4
    movq        [r2+r3], xmm4

    ; Signed "greater than" instead of "not zero": identical for positive even
    ; heights, but an odd or zero height now terminates instead of stepping
    ; past zero and looping through the whole register range.
    sub         r5, 2
    jg          .hloop_chroma

%ifdef WIN64
    ; Restore the callee-saved XMM registers and release the save area.
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp + 16]
    add         rsp, 32
%endif

    LOAD_6_PARA_POP
    ret
|
||||
|
||||
|
||||
;*!
|
||||
;* \copy
|
||||
;* Copyright (c) 2004-2013, Cisco Systems
|
||||
;* All rights reserved.
|
||||
;*
|
||||
;* Redistribution and use in source and binary forms, with or without
|
||||
;* modification, are permitted provided that the following conditions
|
||||
;* are met:
|
||||
;*
|
||||
;* * Redistributions of source code must retain the above copyright
|
||||
;* notice, this list of conditions and the following disclaimer.
|
||||
;*
|
||||
;* * Redistributions in binary form must reproduce the above copyright
|
||||
;* notice, this list of conditions and the following disclaimer in
|
||||
;* the documentation and/or other materials provided with the
|
||||
;* distribution.
|
||||
;*
|
||||
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
;* POSSIBILITY OF SUCH DAMAGE.
|
||||
;*
|
||||
;*
|
||||
;* mc_chroma.asm
|
||||
;*
|
||||
;* Abstract
|
||||
;* mmx motion compensation for chroma
|
||||
;*
|
||||
;* History
|
||||
;* 10/13/2004 Created
|
||||
;*
|
||||
;*
|
||||
;*************************************************************************/
|
||||
%include "asm_inc.asm"
|
||||
|
||||
;***********************************************************************
|
||||
; Local Data (Read Only)
|
||||
;***********************************************************************
|
||||
|
||||
SECTION .rodata align=16
|
||||
|
||||
;***********************************************************************
|
||||
; Various memory constants (trigonometric values or rounding values)
|
||||
;***********************************************************************
|
||||
|
||||
ALIGN 16
; Rounding term 32 (0x20) replicated into eight 16-bit lanes; added with
; paddw before the ">> 6" in the SSE2/SSSE3 chroma routines of this file.
h264_d0x20_sse2:
    times 8 dw 32

ALIGN 16
; Same rounding term in four 16-bit lanes, used by the MMX routine.
h264_d0x20_mmx:
    times 4 dw 32
|
||||
|
||||
|
||||
;=============================================================================
|
||||
; Code
|
||||
;=============================================================================
|
||||
|
||||
SECTION .text
|
||||
|
||||
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq4_mmx( uint8_t *src,
;                            int32_t iSrcStride,
;                            uint8_t *pDst,
;                            int32_t iDstStride,
;                            uint8_t *pABCD,
;                            int32_t iHeigh );
;
; 4-pixel-wide chroma interpolation. For every output row:
;   pDst[x] = (A*s[x] + B*s[x+1] + C*n[x] + D*n[x+1] + 32) >> 6,   x = 0..3
; where s is the current source row, n the row below it, and A,B,C,D are the
; four bytes read from pABCD.
;
; Register use as seen in the body (r0..r5 are set up by LOAD_6_PARA,
; presumably from the six arguments in order -- confirm in asm_inc.asm):
;   r0 = first source row          r1 = source stride
;   r2 = destination row           r3 = destination stride
;   r4 = pABCD, later the pointer to the "next" source row
;   r5 = rows remaining (do/while loop: at least one row is always written)
;   mm3/mm5/mm4/mm6 = A/B/C/D as four words each, mm7 = zero / scratch
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
McChromaWidthEq4_mmx:
    %assign push_num 0
    LOAD_6_PARA
%ifndef X86_32
    ; strides and height are int32_t: widen them to pointer width
    movsx       r1, r1d
    movsx       r3, r3d
    movsx       r5, r5d
%endif

    ; ---- expand the four byte weights into four word vectors ----
    movd        mm3, [r4]               ; mm3 = D C B A (bytes)
    WELS_Zero   mm7
    punpcklbw   mm3, mm3                ; A A B B C C D D
    movq        mm4, mm3
    punpcklwd   mm3, mm3                ; A A A A B B B B
    punpckhwd   mm4, mm4                ; C C C C D D D D

    movq        mm5, mm3
    punpcklbw   mm3, mm7                ; mm3 = A as 4 words
    punpckhbw   mm5, mm7                ; mm5 = B as 4 words

    movq        mm6, mm4
    punpcklbw   mm4, mm7                ; mm4 = C as 4 words
    punpckhbw   mm6, mm7                ; mm6 = D as 4 words

    ; ---- prime the pipeline with the first source row ----
    lea         r4, [r0 + r1]           ; r4 -> row below the current one
    movd        mm0, [r0]
    movd        mm1, [r0+1]
    punpcklbw   mm0, mm7                ; mm0 = s[x]   as words
    punpcklbw   mm1, mm7                ; mm1 = s[x+1] as words

.row_loop:
    pmullw      mm0, mm3                ; A * s[x]
    pmullw      mm1, mm5                ; B * s[x+1]
    paddw       mm0, mm1

    movd        mm1, [r4]
    punpcklbw   mm1, mm7                ; n[x] as words
    movq        mm2, mm1                ; keep n[x]: it is s[x] of the next row
    pmullw      mm1, mm4                ; C * n[x]
    paddw       mm0, mm1

    movd        mm1, [r4+1]
    punpcklbw   mm1, mm7                ; n[x+1] as words
    movq        mm7, mm1                ; park n[x+1] in mm7 across the multiply
    pmullw      mm1, mm6                ; D * n[x+1]
    paddw       mm0, mm1
    movq        mm1, mm7                ; mm1 = s[x+1] of the next row

    paddw       mm0, [h264_d0x20_mmx]   ; + 32 (rounding)
    psrlw       mm0, 6                  ; >> 6

    WELS_Zero   mm7                     ; mm7 was used as scratch: zero it again
    packuswb    mm0, mm7
    movd        [r2], mm0               ; store 4 output pixels

    movq        mm0, mm2                ; mm0 = s[x] of the next row

    lea         r2, [r2 + r3]           ; next destination row
    lea         r4, [r4 + r1]           ; next source row

    dec         r5
    jnz near    .row_loop

    WELSEMMS
    LOAD_6_PARA_POP
    ret
|
||||
|
||||
|
||||
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq8_sse2( uint8_t *pSrc,
;                             int32_t iSrcStride,
;                             uint8_t *pDst,
;                             int32_t iDstStride,
;                             uint8_t *pABCD,
;                             int32_t iheigh );
;
; 8-pixel-wide chroma interpolation. For every output row:
;   pDst[x] = (A*s[x] + B*s[x+1] + C*n[x] + D*n[x+1] + 32) >> 6,   x = 0..7
; where s is the current source row, n the row below it, and A,B,C,D are the
; four bytes read from pABCD.
;
; Register use as seen in the body (r0..r5 are set up by LOAD_6_PARA,
; presumably from the six arguments in order -- confirm in asm_inc.asm):
;   r0 = first source row          r1 = source stride
;   r2 = destination row           r3 = destination stride
;   r4 = pABCD, later the pointer to the "next" source row
;   r5 = rows remaining (do/while loop: at least one row is always written)
;   xmm3/xmm5/xmm4/xmm6 = A/B/C/D as eight words each, xmm7 = zero / scratch
;
; Clobbers xmm0-xmm7. xmm6 and xmm7 are callee-saved in the Microsoft x64
; calling convention, so they are preserved on the stack when WIN64 is defined.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
McChromaWidthEq8_sse2:
    %assign push_num 0
    LOAD_6_PARA
%ifndef X86_32
    ; strides and height are int32_t: widen them to pointer width
    movsx       r1, r1d
    movsx       r3, r3d
    movsx       r5, r5d
%endif

%ifdef WIN64
    ; Preserve the non-volatile XMM registers this routine overwrites.
    ; Done after LOAD_6_PARA so any rsp-relative argument loads in that macro
    ; are not disturbed; movdqu is used so no stack alignment is assumed.
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp + 16], xmm7
%endif

    ; ---- expand the four byte weights into four word vectors ----
    movd        xmm3, [r4]              ; xmm3 = D C B A (bytes)
    WELS_Zero   xmm7
    punpcklbw   xmm3, xmm3              ; A A B B C C D D
    punpcklwd   xmm3, xmm3              ; A x4, B x4, C x4, D x4

    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm3              ; A x8, B x8
    punpckhdq   xmm4, xmm4              ; C x8, D x8
    movdqa      xmm5, xmm3
    movdqa      xmm6, xmm4

    punpcklbw   xmm3, xmm7              ; xmm3 = A as 8 words
    punpckhbw   xmm5, xmm7              ; xmm5 = B as 8 words
    punpcklbw   xmm4, xmm7              ; xmm4 = C as 8 words
    punpckhbw   xmm6, xmm7              ; xmm6 = D as 8 words

    ; ---- prime the pipeline with the first source row ----
    lea         r4, [r0 + r1]           ; r4 -> row below the current one
    movq        xmm0, [r0]
    movq        xmm1, [r0+1]
    punpcklbw   xmm0, xmm7              ; xmm0 = s[x]   as words
    punpcklbw   xmm1, xmm7              ; xmm1 = s[x+1] as words

.xloop:
    pmullw      xmm0, xmm3              ; A * s[x]
    pmullw      xmm1, xmm5              ; B * s[x+1]
    paddw       xmm0, xmm1

    movq        xmm1, [r4]
    punpcklbw   xmm1, xmm7              ; n[x] as words
    movdqa      xmm2, xmm1              ; keep n[x]: it is s[x] of the next row
    pmullw      xmm1, xmm4              ; C * n[x]
    paddw       xmm0, xmm1

    movq        xmm1, [r4+1]
    punpcklbw   xmm1, xmm7              ; n[x+1] as words
    movdqa      xmm7, xmm1              ; park n[x+1] in xmm7 across the multiply
    pmullw      xmm1, xmm6              ; D * n[x+1]
    paddw       xmm0, xmm1
    movdqa      xmm1, xmm7              ; xmm1 = s[x+1] of the next row

    paddw       xmm0, [h264_d0x20_sse2] ; + 32 (rounding)
    psrlw       xmm0, 6                 ; >> 6

    WELS_Zero   xmm7                    ; xmm7 was used as scratch: zero it again
    packuswb    xmm0, xmm7
    movq        [r2], xmm0              ; store 8 output pixels

    movdqa      xmm0, xmm2              ; xmm0 = s[x] of the next row

    lea         r2, [r2 + r3]           ; next destination row
    lea         r4, [r4 + r1]           ; next source row

    dec         r5
    jnz near    .xloop

%ifdef WIN64
    ; Restore the callee-saved XMM registers and release the save area.
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp + 16]
    add         rsp, 32
%endif

    LOAD_6_PARA_POP
    ret
|
||||
|
||||
|
||||
|
||||
|
||||
ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
;                              int32_t iSrcStride,
;                              uint8_t *pDst,
;                              int32_t iDstStride,
;                              uint8_t *pABCD,
;                              int32_t iHeigh);
;
; 8-pixel-wide chroma interpolation, two output rows per loop iteration:
;   pDst[x] = (A*s[x] + B*s[x+1] + C*n[x] + D*n[x+1] + 32) >> 6,   x = 0..7
; where s is the current source row, n the row below it, and A,B,C,D are the
; four bytes read from pABCD. Source pixels are interleaved as (s[x], s[x+1])
; byte pairs so that one pmaddubsw yields A*s[x] + B*s[x+1] per word.
;
; Register use as seen in the body (r0..r5 are set up by LOAD_6_PARA,
; presumably from the six arguments in order -- confirm in asm_inc.asm):
;   r0 = current source row        r1 = source stride
;   r2 = destination row           r3 = destination stride
;   r4 = pABCD
;   r5 = rows remaining; decremented by 2 per iteration, so the routine
;        always writes an even number of rows (at least two)
;   xmm5 = (A,B) byte pairs x8, xmm6 = (C,D) byte pairs x8, xmm7 = 32 x8 words
;
; Each source row is fetched with a 16-byte movdqu although only 9 bytes are
; consumed. Clobbers xmm0-xmm7. xmm6 and xmm7 are callee-saved in the Microsoft
; x64 calling convention, so they are preserved on the stack when WIN64 is
; defined.
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
McChromaWidthEq8_ssse3:
    %assign push_num 0
    LOAD_6_PARA
%ifndef X86_32
    ; strides and height are int32_t: widen them to pointer width
    movsx       r1, r1d
    movsx       r3, r3d
    movsx       r5, r5d
%endif

%ifdef WIN64
    ; Preserve the non-volatile XMM registers this routine overwrites.
    ; Done after LOAD_6_PARA so any rsp-relative argument loads in that macro
    ; are not disturbed; movdqu is used so no stack alignment is assumed.
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp + 16], xmm7
%endif

    ; ---- build the (A,B) and (C,D) byte-pair weight vectors ----
    ; (the former "pxor xmm7, xmm7" here was dead: xmm7 is loaded with the
    ;  rounding constant below before it is ever read)
    movd        xmm5, [r4]              ; xmm5 = D C B A (bytes)
    punpcklwd   xmm5, xmm5              ; AB AB CD CD
    punpckldq   xmm5, xmm5              ; AB x4, CD x4
    movdqa      xmm6, xmm5
    punpcklqdq  xmm5, xmm5              ; xmm5 = AB x8
    punpckhqdq  xmm6, xmm6              ; xmm6 = CD x8

    ; r2 is advanced by two rows at the top of every iteration, so start
    ; two rows above the real destination.
    sub         r2, r3
    sub         r2, r3
    movdqa      xmm7, [h264_d0x20_sse2] ; xmm7 = rounding term 32 in 8 words

    ; ---- prime the pipeline with the first source row ----
    movdqu      xmm0, [r0]
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                 ; s[x+1]
    punpcklbw   xmm0, xmm1              ; xmm0 = (s[x], s[x+1]) pairs

.hloop_chroma:
    lea         r2, [r2+2*r3]           ; r2 -> first of the two rows to write

    ; ---- first output row: rows r0 and r0+stride ----
    movdqu      xmm2, [r0+r1]
    movdqa      xmm3, xmm2
    psrldq      xmm3, 1
    punpcklbw   xmm2, xmm3              ; (n[x], n[x+1]) pairs
    movdqa      xmm4, xmm2              ; keep: top row of the second output

    pmaddubsw   xmm0, xmm5              ; A*s[x] + B*s[x+1]
    pmaddubsw   xmm2, xmm6              ; C*n[x] + D*n[x+1]
    paddw       xmm0, xmm2
    paddw       xmm0, xmm7              ; + 32
    psrlw       xmm0, 6
    packuswb    xmm0, xmm0
    movq        [r2], xmm0

    ; ---- second output row: rows r0+stride and r0+2*stride ----
    lea         r0, [r0+2*r1]
    movdqu      xmm2, [r0]
    movdqa      xmm3, xmm2
    psrldq      xmm3, 1
    punpcklbw   xmm2, xmm3
    movdqa      xmm0, xmm2              ; keep: top row of the next iteration

    pmaddubsw   xmm4, xmm5
    pmaddubsw   xmm2, xmm6
    paddw       xmm4, xmm2
    paddw       xmm4, xmm7              ; + 32
    psrlw       xmm4, 6
    packuswb    xmm4, xmm4
    movq        [r2+r3], xmm4

    ; Signed "greater than" instead of "not zero": identical for positive even
    ; heights, but an odd or zero height now terminates instead of stepping
    ; past zero and looping through the whole register range.
    sub         r5, 2
    jg          .hloop_chroma

%ifdef WIN64
    ; Restore the callee-saved XMM registers and release the save area.
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp + 16]
    add         rsp, 32
%endif

    LOAD_6_PARA_POP
    ret
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -160,7 +160,7 @@ ALIGN 16
|
||||
AnalysisVaaInfoIntra_sse2:
|
||||
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
LOAD_2_PARA
|
||||
SIGN_EXTENTION r1,r1d
|
||||
|
||||
%ifdef X86_32
|
||||
@ -175,16 +175,16 @@ AnalysisVaaInfoIntra_sse2:
|
||||
and r5,0fh
|
||||
sub r7,r5
|
||||
sub r7,32
|
||||
|
||||
|
||||
mov r2,r1
|
||||
|
||||
|
||||
mov r2,r1
|
||||
sal r2,$1 ;r2 = 2*iLineSize
|
||||
mov r3,r2
|
||||
add r3,r1 ;r3 = 3*iLineSize
|
||||
|
||||
|
||||
mov r4,r2
|
||||
sal r4,$1 ;r4 = 4*iLineSize
|
||||
|
||||
|
||||
pxor xmm7, xmm7
|
||||
|
||||
; loops
|
||||
@ -225,8 +225,8 @@ AnalysisVaaInfoIntra_sse2:
|
||||
pshufd xmm2, xmm1, 0B1h
|
||||
paddd xmm1, xmm2
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
movd r2d, xmm0
|
||||
and r2, 0ffffh ; effective low work truncated
|
||||
mov r3, r2
|
||||
@ -234,7 +234,7 @@ AnalysisVaaInfoIntra_sse2:
|
||||
sar r2, $4
|
||||
movd retrd, xmm1
|
||||
sub retrd, r2d
|
||||
|
||||
|
||||
add r7,32
|
||||
add r7,r5
|
||||
|
||||
@ -244,7 +244,7 @@ AnalysisVaaInfoIntra_sse2:
|
||||
pop r4
|
||||
pop r3
|
||||
%endif
|
||||
|
||||
|
||||
ret
|
||||
|
||||
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
|
||||
@ -255,7 +255,7 @@ ALIGN 16
|
||||
AnalysisVaaInfoIntra_ssse3:
|
||||
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
LOAD_2_PARA
|
||||
SIGN_EXTENTION r1,r1d
|
||||
|
||||
%ifdef X86_32
|
||||
@ -265,41 +265,41 @@ AnalysisVaaInfoIntra_ssse3:
|
||||
push r6
|
||||
%assign push_num push_num+4
|
||||
%endif
|
||||
|
||||
|
||||
mov r5,r7
|
||||
and r5,0fh
|
||||
sub r7,r5
|
||||
sub r7,32
|
||||
|
||||
|
||||
mov r2,r1
|
||||
|
||||
mov r2,r1
|
||||
sal r2,$1 ;r2 = 2*iLineSize
|
||||
mov r3,r2
|
||||
add r3,r1 ;r3 = 3*iLineSize
|
||||
|
||||
|
||||
mov r4,r2
|
||||
sal r4,$1 ;r4 = 4*iLineSize
|
||||
|
||||
|
||||
pxor xmm7, xmm7
|
||||
|
||||
; loops
|
||||
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
|
||||
movq [r7],xmm0
|
||||
|
||||
|
||||
lea r0,[r0+r4]
|
||||
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
|
||||
movq [r7+8],xmm1
|
||||
|
||||
|
||||
|
||||
|
||||
lea r0,[r0+r4]
|
||||
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
|
||||
movq [r7+16],xmm0
|
||||
|
||||
|
||||
lea r0,[r0+r4]
|
||||
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
|
||||
movq [r7+24],xmm1
|
||||
|
||||
|
||||
|
||||
|
||||
movdqa xmm0,[r7]
|
||||
movdqa xmm1,[r7+16]
|
||||
movdqa xmm2, xmm0
|
||||
@ -322,7 +322,7 @@ AnalysisVaaInfoIntra_ssse3:
|
||||
pshufd xmm2, xmm1, 0B1h
|
||||
paddd xmm1, xmm2
|
||||
|
||||
|
||||
|
||||
movd r2d, xmm0
|
||||
and r2, 0ffffh ; effective low work truncated
|
||||
mov r3, r2
|
||||
@ -339,7 +339,7 @@ AnalysisVaaInfoIntra_ssse3:
|
||||
pop r4
|
||||
pop r3
|
||||
%endif
|
||||
|
||||
|
||||
ret
|
||||
|
||||
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
|
||||
@ -368,7 +368,7 @@ MdInterAnalysisVaaInfo_sse41:
|
||||
paddd xmm3, xmm4
|
||||
movd r0d, xmm3
|
||||
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
|
||||
|
||||
|
||||
jb near .threshold_exit
|
||||
pshufd xmm0, xmm0, 01Bh
|
||||
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
|
||||
@ -412,7 +412,7 @@ MdInterAnalysisVaaInfo_sse2:
|
||||
paddd xmm4, xmm5
|
||||
pshufd xmm5, xmm4, 0B1h
|
||||
paddd xmm5, xmm4
|
||||
|
||||
|
||||
movd r0d, xmm5
|
||||
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
|
||||
jb near .threshold_exit
|
||||
|
@ -477,7 +477,7 @@ WelsDecoderIChromaPredPlane_sse2:
|
||||
SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c
|
||||
|
||||
;mov esi, [esp + pushsize + 4]
|
||||
mov r0, r4
|
||||
mov r0, r4
|
||||
add r3, 16
|
||||
imul r2, -3
|
||||
add r3, r2 ; s = a + 16 + (-3)*c
|
||||
|
@ -186,7 +186,7 @@ WelsIDctT4Rec_mmx:
|
||||
movsx r1, r1d
|
||||
movsx r3, r3d
|
||||
%endif
|
||||
; mov eax, [pDct ]
|
||||
; mov eax, [pDct ]
|
||||
movq mm0, [r4+ 0]
|
||||
movq mm1, [r4+ 8]
|
||||
movq mm2, [r4+16]
|
||||
|
@ -32,7 +32,7 @@
|
||||
;* memzero.asm
|
||||
;*
|
||||
;* Abstract
|
||||
;*
|
||||
;*
|
||||
;*
|
||||
;* History
|
||||
;* 9/16/2009 Created
|
||||
@ -45,8 +45,8 @@
|
||||
; Code
|
||||
;***********************************************************************
|
||||
|
||||
SECTION .text
|
||||
|
||||
SECTION .text
|
||||
|
||||
ALIGN 16
|
||||
;***********************************************************************
|
||||
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
|
||||
@ -57,7 +57,7 @@ WelsPrefetchZero_mmx:
|
||||
LOAD_1_PARA
|
||||
;mov eax,[esp+4]
|
||||
prefetchnta [r0]
|
||||
ret
|
||||
ret
|
||||
|
||||
|
||||
ALIGN 16
|
||||
@ -71,7 +71,7 @@ WelsSetMemZeroAligned64_sse2:
|
||||
LOAD_2_PARA
|
||||
SIGN_EXTENTION r1, r1d
|
||||
neg r1
|
||||
|
||||
|
||||
pxor xmm0, xmm0
|
||||
.memzeroa64_sse2_loops:
|
||||
movdqa [r0], xmm0
|
||||
@ -79,11 +79,11 @@ WelsSetMemZeroAligned64_sse2:
|
||||
movdqa [r0+32], xmm0
|
||||
movdqa [r0+48], xmm0
|
||||
add r0, 0x40
|
||||
|
||||
|
||||
add r1, 0x40
|
||||
jnz near .memzeroa64_sse2_loops
|
||||
|
||||
ret
|
||||
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
;***********************************************************************
|
||||
@ -96,7 +96,7 @@ WelsSetMemZeroSize64_mmx:
|
||||
LOAD_2_PARA
|
||||
SIGN_EXTENTION r1, r1d
|
||||
neg r1
|
||||
|
||||
|
||||
pxor mm0, mm0
|
||||
.memzero64_mmx_loops:
|
||||
movq [r0], mm0
|
||||
@ -106,16 +106,16 @@ WelsSetMemZeroSize64_mmx:
|
||||
movq [r0+32], mm0
|
||||
movq [r0+40], mm0
|
||||
movq [r0+48], mm0
|
||||
movq [r0+56], mm0
|
||||
movq [r0+56], mm0
|
||||
add r0, 0x40
|
||||
|
||||
|
||||
add r1, 0x40
|
||||
jnz near .memzero64_mmx_loops
|
||||
|
||||
WELSEMMS
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
|
||||
WELSEMMS
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
;***********************************************************************
|
||||
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
|
||||
;***********************************************************************
|
||||
@ -125,17 +125,17 @@ WelsSetMemZeroSize8_mmx:
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
SIGN_EXTENTION r1, r1d
|
||||
neg r1
|
||||
neg r1
|
||||
pxor mm0, mm0
|
||||
|
||||
|
||||
.memzero8_mmx_loops:
|
||||
movq [r0], mm0
|
||||
add r0, 0x08
|
||||
|
||||
|
||||
add r1, 0x08
|
||||
jnz near .memzero8_mmx_loops
|
||||
|
||||
WELSEMMS
|
||||
ret
|
||||
|
||||
|
||||
WELSEMMS
|
||||
ret
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -179,15 +179,15 @@ WELS_EXTERN BilateralLumaFilter8_sse2
|
||||
;%define stride r1
|
||||
|
||||
BilateralLumaFilter8_sse2:
|
||||
|
||||
push r3
|
||||
|
||||
push r3
|
||||
%assign push_num 1
|
||||
LOAD_2_PARA
|
||||
|
||||
pxor xmm7, xmm7
|
||||
|
||||
|
||||
mov r3, r0
|
||||
|
||||
|
||||
movq xmm6, [r0]
|
||||
punpcklbw xmm6, xmm7
|
||||
movdqa xmm3, [sse2_32]
|
||||
@ -218,10 +218,10 @@ BilateralLumaFilter8_sse2:
|
||||
packuswb xmm5, xmm5
|
||||
movq [r3], xmm5
|
||||
|
||||
|
||||
|
||||
pop r3
|
||||
%assign push_num 0
|
||||
|
||||
|
||||
ret
|
||||
|
||||
WELS_EXTERN WaverageChromaFilter8_sse2
|
||||
@ -239,11 +239,11 @@ ALIGN 16
|
||||
WaverageChromaFilter8_sse2:
|
||||
|
||||
push r3
|
||||
|
||||
|
||||
%assign push_num 1
|
||||
|
||||
|
||||
LOAD_2_PARA
|
||||
|
||||
|
||||
mov r3, r1
|
||||
add r3, r3
|
||||
sub r0, r3 ; pixels - 2 * stride
|
||||
@ -272,8 +272,8 @@ WaverageChromaFilter8_sse2:
|
||||
packuswb xmm3, xmm3
|
||||
movq [r0 + 2], xmm3
|
||||
|
||||
|
||||
|
||||
pop r3
|
||||
|
||||
|
||||
%assign push_num 0
|
||||
ret
|
||||
|
@ -84,24 +84,24 @@ WelsSampleSad8x8_sse21:
|
||||
;push edi
|
||||
;mov eax, [esp+12]
|
||||
;mov ebx, [esp+16]
|
||||
|
||||
|
||||
%assign push_num 0
|
||||
mov r2, arg3
|
||||
push r2
|
||||
CACHE_SPLIT_CHECK r2, 8, 64
|
||||
jle near .pixel_sad_8x8_nsplit
|
||||
pop r2
|
||||
%ifdef X86_32
|
||||
%ifdef X86_32
|
||||
push r3
|
||||
push r4
|
||||
push r5
|
||||
%endif
|
||||
%assign push_num 3
|
||||
mov r0, arg1
|
||||
mov r1, arg2
|
||||
mov r1, arg2
|
||||
SIGN_EXTENTION r1, r1d
|
||||
pxor xmm7, xmm7
|
||||
|
||||
|
||||
;ecx r2, edx r4, edi r5
|
||||
|
||||
mov r5, r2
|
||||
@ -195,18 +195,18 @@ WelsSampleSad8x8_sse21:
|
||||
pop r3
|
||||
%endif
|
||||
jmp .return
|
||||
|
||||
|
||||
.pixel_sad_8x8_nsplit:
|
||||
;push ebx
|
||||
;mov eax, [esp+8]
|
||||
;mov ebx, [esp+12]
|
||||
;mov edx, [esp+20]
|
||||
|
||||
|
||||
pop r2
|
||||
%assign push_num 0
|
||||
LOAD_4_PARA
|
||||
SIGN_EXTENTION r1, r1d
|
||||
SIGN_EXTENTION r3, r3d
|
||||
SIGN_EXTENTION r3, r3d
|
||||
pxor xmm6, xmm6
|
||||
SSE2_GetSad8x4
|
||||
lea r0, [r0+2*r1]
|
||||
|
Loading…
x
Reference in New Issue
Block a user