740 lines
20 KiB
NASM
740 lines
20 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* expand_picture.asm
|
|
;*
|
|
;* Abstract
|
|
;* mmxext/sse for expand_frame
|
|
;*
|
|
;* History
|
|
;* 09/25/2009 Created
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
|
|
%include "asm_inc.asm"
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
; Macros and other preprocessor constants
|
|
;***********************************************************************
|
|
|
|
;***********************************************************************
|
|
; Local Data (Read Only)
|
|
;***********************************************************************
|
|
|
|
;SECTION .rodata pData align=16
|
|
|
|
;***********************************************************************
|
|
; Various memory constants (trigonometric values or rounding values)
|
|
;***********************************************************************
|
|
;%define PADDING_SIZE_ASM 32 ; PADDING_LENGTH
|
|
|
|
;***********************************************************************
|
|
; Code
|
|
;***********************************************************************
|
|
|
|
|
|
|
|
SECTION .text
|
|
|
|
WELS_EXTERN ExpandPictureLuma_sse2
|
|
WELS_EXTERN ExpandPictureChromaAlign_sse2 ; for chroma alignment
|
|
WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
|
|
|
|
;;;;;;;expanding result;;;;;;;
|
|
|
|
;aaaa|attttttttttttttttb|bbbb
|
|
;aaaa|attttttttttttttttb|bbbb
|
|
;aaaa|attttttttttttttttb|bbbb
|
|
;aaaa|attttttttttttttttb|bbbb
|
|
;----------------------------
|
|
;aaaa|attttttttttttttttb|bbbb
|
|
;llll|l r|rrrr
|
|
;llll|l r|rrrr
|
|
;llll|l r|rrrr
|
|
;llll|l r|rrrr
|
|
;llll|l r|rrrr
|
|
;cccc|ceeeeeeeeeeeeeeeed|dddd
|
|
;----------------------------
|
|
;cccc|ceeeeeeeeeeeeeeeed|dddd
|
|
;cccc|ceeeeeeeeeeeeeeeed|dddd
|
|
;cccc|ceeeeeeeeeeeeeeeed|dddd
|
|
;cccc|ceeeeeeeeeeeeeeeed|dddd
|
|
|
|
%macro mov_line_8x4_mmx 3               ; %1 = dst pointer reg, %2 = stride reg, %3 = MMX source reg
    ; Write the 8 bytes held in %3 to four consecutive rows starting at [%1]
    ; (rows are %2 bytes apart; %2 may be negative). On exit %1 has advanced
    ; by 4 * %2, i.e. it addresses the row after the last one written.
    %rep 2
    movq    [%1], %3                    ; row n
    movq    [%1+%2], %3                 ; row n+1
    lea     %1, [%1+2*%2]               ; step two rows without touching flags
    %endrep
%endmacro
|
|
|
|
%macro mov_line_end8x4_mmx 3            ; %1 = dst pointer reg, %2 = stride reg, %3 = MMX source reg
    ; Same four 8-byte row stores as mov_line_8x4_mmx, but %1 only advances by
    ; 3 * %2, so on exit it addresses the LAST row written instead of the
    ; row after it.
    movq    [%1], %3                    ; row 0
    movq    [%1+%2], %3                 ; row 1
    movq    [%1+2*%2], %3               ; row 2
    lea     %1, [%1+2*%2]               ; %1 -> row 2
    lea     %1, [%1+%2]                 ; %1 -> row 3
    movq    [%1], %3                    ; row 3
%endmacro
|
|
|
|
%macro mov_line_16x4_sse2 4             ; %1 = dst pointer reg, %2 = stride reg, %3 = XMM source reg, %4 = u/a
    ; Write the 16 bytes held in %3 to four consecutive rows starting at [%1]
    ; (rows are %2 bytes apart; %2 may be negative). %4 selects the store
    ; instruction: 'a' -> movdqa (16-byte aligned dst), 'u' -> movdqu.
    ; On exit %1 has advanced by 4 * %2.
    %rep 2
    movdq%4 [%1], %3                    ; row n
    movdq%4 [%1+%2], %3                 ; row n+1
    lea     %1, [%1+2*%2]               ; step two rows without touching flags
    %endrep
%endmacro
|
|
|
|
%macro mov_line_end16x4_sse2 4          ; %1 = dst pointer reg, %2 = stride reg, %3 = XMM source reg, %4 = u/a
    ; Same four 16-byte row stores as mov_line_16x4_sse2, but %1 only advances
    ; by 3 * %2, so on exit it addresses the LAST row written.
    movdq%4 [%1], %3                    ; row 0
    movdq%4 [%1+%2], %3                 ; row 1
    movdq%4 [%1+2*%2], %3               ; row 2
    lea     %1, [%1+2*%2]               ; %1 -> row 2
    lea     %1, [%1+%2]                 ; %1 -> row 3
    movdq%4 [%1], %3                    ; row 3
%endmacro
|
|
|
|
%macro mov_line_32x4_sse2 3             ; %1 = dst pointer reg, %2 = stride reg, %3 = XMM source reg
    ; Fill four consecutive 32-byte rows starting at [%1] with the 16-byte
    ; pattern in %3 (two aligned stores per row; rows are %2 bytes apart and
    ; %2 may be negative). The destination must be 16-byte aligned because
    ; movdqa is used. On exit %1 has advanced by 4 * %2.
    %rep 2
    movdqa  [%1], %3                    ; row n,   bytes  0..15
    movdqa  [%1+16], %3                 ; row n,   bytes 16..31
    movdqa  [%1+%2], %3                 ; row n+1, bytes  0..15
    movdqa  [%1+%2+16], %3              ; row n+1, bytes 16..31
    lea     %1, [%1+2*%2]               ; step two rows without touching flags
    %endrep
%endmacro
|
|
|
|
%macro mov_line_end32x4_sse2 3          ; %1 = dst pointer reg, %2 = stride reg, %3 = XMM source reg
    ; Same four 32-byte row fills as mov_line_32x4_sse2 (aligned stores), but
    ; %1 only advances by 3 * %2, so on exit it addresses the LAST row written.
    movdqa  [%1], %3                    ; row 0, bytes  0..15
    movdqa  [%1+16], %3                 ; row 0, bytes 16..31
    movdqa  [%1+%2], %3                 ; row 1, bytes  0..15
    movdqa  [%1+%2+16], %3              ; row 1, bytes 16..31
    movdqa  [%1+2*%2], %3               ; row 2, bytes  0..15
    movdqa  [%1+2*%2+16], %3            ; row 2, bytes 16..31
    lea     %1, [%1+2*%2]               ; %1 -> row 2
    lea     %1, [%1+%2]                 ; %1 -> row 3
    movdqa  [%1], %3                    ; row 3, bytes  0..15
    movdqa  [%1+16], %3                 ; row 3, bytes 16..31
%endmacro
|
|
|
|
%macro exp_top_bottom_sse2 1            ; %1 = border height in rows: 32 (luma) or 16 (chroma)
    ; Replicate the first picture row into the %1 rows above the picture and
    ; the last picture row into the %1 rows below it, one 16-byte wide column
    ; per loop iteration. Expands to nothing for any other value of %1.
    ;
    ; On entry (as set up by the callers in this file):
    ;   r0 = first picture row            (top source)
    ;   r3 = last picture row             (bottom source)
    ;   r5 = r0 - stride                  (top border row nearest the picture)
    ;   r4 = r3 + %1 * stride             (bottom border row farthest from the picture)
    ;   r1 = -stride
    ;   r2 = picture width in bytes
    ;
    ; All 16-byte accesses use movdqa, so r0/r3/r4/r5 and the stride must be
    ; 16-byte aligned.
    ;
    ; The mov_line_end* macros leave the destination pointer on the last row
    ; written, so instead of rewinding the pointers the loop flips the sign of
    ; r1 after every column: even columns are filled walking upward, odd
    ; columns walking downward.
    ;
    ; Clobbers: r0, r2, r3, r4, r5, xmm0, xmm1, flags; r1 may come back with
    ; its sign flipped. The chroma variant additionally clobbers r6, mm0, mm1.
%if %1 == 32 || %1 == 16
  %if %1 == 16
    mov     r6, r2                      ; keep the full width for the tail test below
  %endif
    sar     r2, 04h                     ; r2 = number of complete 16-byte columns
    ; The loop below is a do-while: entering it with r2 == 0 would wrap the
    ; counter and keep storing far outside the picture buffer. Skip it when
    ; the plane is narrower than 16 bytes. (OF is undefined after a multi-bit
    ; shift, hence the explicit test before the signed branch.)
    test    r2, r2
    jle     near .top_bottom_tail

.top_bottom_loops:
    ; top: first picture row -> %1 border rows
    movdqa  xmm0, [r0]
    %rep (%1 / 4) - 1
    mov_line_16x4_sse2 r5, r1, xmm0, a
    %endrep
    mov_line_end16x4_sse2 r5, r1, xmm0, a

    ; bottom: last picture row -> %1 border rows
    movdqa  xmm1, [r3]
    %rep (%1 / 4) - 1
    mov_line_16x4_sse2 r4, r1, xmm1, a
    %endrep
    mov_line_end16x4_sse2 r4, r1, xmm1, a

    lea     r0, [r0+16]                 ; next top source column
    lea     r5, [r5+16]                 ; next top destination column
    lea     r3, [r3+16]                 ; next bottom source column
    lea     r4, [r4+16]                 ; next bottom destination column
    neg     r1                          ; next column is walked in the opposite direction

    dec     r2
    jnz     near .top_bottom_loops

.top_bottom_tail:
  %if %1 == 16
    ; Chroma only: if the width is not a multiple of 16, pad one more
    ; 8-byte wide column using MMX stores.
    and     r6, 0fh                     ; 'and' already sets ZF, no separate test needed
    jz      near .to_be_continued

    ; top
    movq    mm0, [r0]
    %rep (%1 / 4) - 1
    mov_line_8x4_mmx r5, r1, mm0
    %endrep
    mov_line_end8x4_mmx r5, r1, mm0

    ; bottom
    movq    mm1, [r3]
    %rep (%1 / 4) - 1
    mov_line_8x4_mmx r4, r1, mm1
    %endrep
    mov_line_end8x4_mmx r4, r1, mm1

    WELSEMMS                            ; project macro; presumably leaves MMX state (emms) - confirm in asm_inc.asm

.to_be_continued:
  %endif
%endif
%endmacro
|
|
|
|
%macro exp_left_right_sse2 2            ; %1 = border width: 32 (luma) or 16 (chroma), %2 = u/a
    ; For every picture row, replicate the leftmost pixel into the %1 bytes to
    ; the left of the row and the rightmost pixel into the %1 bytes to its
    ; right. Expands to nothing for any other value of %1.
    ;
    ; On entry (as set up by the callers in this file):
    ;   r0 = first pixel of the first row      (left source)
    ;   r5 = r0 - %1                           (left destination)
    ;   r3 = last pixel of the first row       (right source)
    ;   r4 = r3 + 1                            (right destination)
    ;   r1 = stride
    ;   r6 = number of rows (must be > 0: the loop is a do-while)
    ;
    ; Stores are movdqa except the chroma right border, which uses movdq%2
    ; because r4 is not necessarily 16-byte aligned for chroma planes. The
    ; luma variant ignores %2.
    ;
    ; Clobbers: r0, r2, r3, r4, r5, r6, xmm0, xmm1, flags.
%if %1 == 32 || %1 == 16
.left_right_loops:
    ; left border
    movzx   r2d, byte [r0]
    SSE2_Copy16Times xmm0, r2d          ; xmm0 = leftmost pixel in all 16 bytes
    movdqa  [r5], xmm0
  %if %1 == 32
    movdqa  [r5+16], xmm0
  %endif

    ; right border
    movzx   r2d, byte [r3]
    SSE2_Copy16Times xmm1, r2d          ; xmm1 = rightmost pixel in all 16 bytes
  %if %1 == 32
    movdqa  [r4], xmm1
    movdqa  [r4+16], xmm1
  %else
    movdq%2 [r4], xmm1
  %endif

    ; step all four pointers down one row
    lea     r0, [r0+r1]
    lea     r5, [r5+r1]
    lea     r3, [r3+r1]
    lea     r4, [r4+r1]

    dec     r6
    jnz     near .left_right_loops
%endif
%endmacro
|
|
|
|
%macro exp_cross_sse2 2                 ; %1 = border size: 32 (luma) or 16 (chroma), %2 = u/a
    ; Fill the four %1 x %1 corner blocks of the border with a constant each.
    ; Expands to nothing for any other value of %1.
    ;
    ; On entry (as set up by the callers in this file):
    ;   r3 = top-left block,     xmm3 = its fill pattern
    ;   r4 = top-right block,    xmm4 = its fill pattern
    ;   r5 = bottom-left block,  xmm5 = its fill pattern
    ;   r6 = bottom-right block, xmm6 = its fill pattern
    ;   r1 = -stride, so each pointer starts on the lowest row of its block
    ;        and the block is filled walking upward.
    ;
    ; Luma always uses aligned stores (%2 is ignored). Chroma uses aligned
    ; stores for the two left blocks and movdq%2 for the two right blocks.
    ;
    ; Clobbers: r3, r4, r5, r6.
%if %1 == 32                            ; luma: 32 bytes x 32 rows per corner
    ; top-left
    %rep 7
    mov_line_32x4_sse2 r3, r1, xmm3
    %endrep
    mov_line_end32x4_sse2 r3, r1, xmm3

    ; top-right
    %rep 7
    mov_line_32x4_sse2 r4, r1, xmm4
    %endrep
    mov_line_end32x4_sse2 r4, r1, xmm4

    ; bottom-left
    %rep 7
    mov_line_32x4_sse2 r5, r1, xmm5
    %endrep
    mov_line_end32x4_sse2 r5, r1, xmm5

    ; bottom-right
    %rep 7
    mov_line_32x4_sse2 r6, r1, xmm6
    %endrep
    mov_line_end32x4_sse2 r6, r1, xmm6
%elif %1 == 16                          ; chroma: 16 bytes x 16 rows per corner
    ; top-left
    %rep 3
    mov_line_16x4_sse2 r3, r1, xmm3, a
    %endrep
    mov_line_end16x4_sse2 r3, r1, xmm3, a

    ; top-right
    %rep 3
    mov_line_16x4_sse2 r4, r1, xmm4, %2
    %endrep
    mov_line_end16x4_sse2 r4, r1, xmm4, %2

    ; bottom-left
    %rep 3
    mov_line_16x4_sse2 r5, r1, xmm5, a
    %endrep
    mov_line_end16x4_sse2 r5, r1, xmm5, a

    ; bottom-right
    %rep 3
    mov_line_16x4_sse2 r6, r1, xmm6, %2
    %endrep
    mov_line_end16x4_sse2 r6, r1, xmm6, %2
%endif
%endmacro
|
|
|
|
ALIGN 16
;***********************************************************************
; void ExpandPictureLuma_sse2( uint8_t *pDst,
;                              const int32_t iStride,
;                              const int32_t iWidth,
;                              const int32_t iHeight );
;
; Pads a luma plane in place with a 32-pixel border on every side by
; replicating its edge pixels (see the diagram near the top of this file):
;   1. top / bottom borders   (exp_top_bottom_sse2 32)
;   2. left / right borders   (exp_left_right_sse2 32)
;   3. the four corner blocks (exp_cross_sse2 32)
;
; Registers after LOAD_4_PARA (project macro, expected to deliver the four
; arguments): r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight.
;
; Every 16-byte access on this path is movdqa, so pDst, iStride and
; pDst + iWidth must be 16-byte aligned. Only complete 16-byte columns get
; top/bottom padding.
;
; Uses xmm0, xmm1, xmm3, xmm4, xmm5, xmm6.
;***********************************************************************
ExpandPictureLuma_sse2:

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA

%ifdef WIN64
    ; xmm6 is callee-saved under the Microsoft x64 calling convention and is
    ; overwritten below (bottom-right corner pattern), so preserve it.
    ; NOTE(review): assumes the build defines WIN64 for Windows x64 targets;
    ; if it does not, this block assembles to nothing. Confirm in asm_inc.asm.
    sub     rsp, 16
    movdqu  [rsp], xmm6
%endif

    SIGN_EXTENTION r1, r1d
    SIGN_EXTENTION r2, r2d
    SIGN_EXTENTION r3, r3d

    ; ---- corner patterns and top/bottom pointers -------------------------

    movzx   r6d, byte [r0]
    SSE2_Copy16Times xmm3, r6d          ; xmm3 = top-left pixel x16

    mov     r5, r0
    sub     r5, r1                      ; r5 = pDst - stride (top border row nearest the picture)

    push    r3                          ; keep iHeight for the left/right pass

    dec     r3                          ; h - 1
    imul    r3, r1                      ; (h - 1) * stride
    lea     r3, [r0+r3]                 ; r3 = last picture row

    mov     r6, r1
    sal     r6, 05h                     ; r6 = 32 * stride
    lea     r4, [r3+r6]                 ; r4 = last row of the bottom border

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm5, r6d          ; xmm5 = bottom-left pixel x16

    movzx   r6d, byte [r3+r2-1]
    SSE2_Copy16Times xmm6, r6d          ; xmm6 = bottom-right pixel x16

    neg     r1                          ; r1 = -stride

    ; ---- top and bottom borders ------------------------------------------

    push    r0
    push    r1
    push    r2

    exp_top_bottom_sse2 32

    pop     r2
    pop     r1                          ; back to -stride (the macro may flip its sign)
    pop     r0

    ; ---- left and right borders ------------------------------------------

    lea     r5, [r0-32]                 ; left border destination
    lea     r3, [r0+r2-1]               ; right border source (last pixel of row 0)
    lea     r4, [r3+1]                  ; right border destination

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm4, r6d          ; xmm4 = top-right pixel x16

    neg     r1                          ; r1 = stride

    pop     r6                          ; r6 = iHeight

    push    r0
    push    r1
    push    r2
    push    r6

    exp_left_right_sse2 32, a

    pop     r6
    pop     r2
    pop     r1
    pop     r0

    ; ---- corner blocks ---------------------------------------------------
    ; xmm3..xmm6 were prepared above; each pointer starts on the lowest row
    ; of its block and exp_cross_sse2 walks upward with r1 = -stride.

    lea     r3, [r0-32]
    sub     r3, r1                      ; r3 = top-left block, row nearest the picture

    lea     r4, [r0+r2]
    sub     r4, r1                      ; r4 = top-right block, row nearest the picture

    add     r6, 32
    imul    r6, r1                      ; r6 = (h + 32) * stride

    lea     r5, [r3+r6]                 ; r5 = bottom-left block, last border row
    lea     r6, [r4+r6]                 ; r6 = bottom-right block, last border row

    neg     r1                          ; r1 = -stride

    exp_cross_sse2 32, a

%ifdef WIN64
    movdqu  xmm6, [rsp]                 ; restore callee-saved xmm6
    add     rsp, 16
%endif

    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0

    ret
|
|
|
|
ALIGN 16
;***********************************************************************
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
;                                     const int32_t iStride,
;                                     const int32_t iWidth,
;                                     const int32_t iHeight );
;
; Pads a chroma plane in place with a 16-pixel border on every side by
; replicating its edge pixels (see the diagram near the top of this file):
;   1. top / bottom borders   (exp_top_bottom_sse2 16)
;   2. left / right borders   (exp_left_right_sse2 16, aligned stores)
;   3. the four corner blocks (exp_cross_sse2 16, aligned stores)
;
; Registers after LOAD_4_PARA (project macro, expected to deliver the four
; arguments): r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight.
;
; This variant stores the right-hand border with movdqa, so in addition to
; pDst and iStride, pDst + iWidth must be 16-byte aligned.
;
; Uses xmm0, xmm1, xmm3, xmm4, xmm5, xmm6 (and mm0, mm1 for an 8-byte tail).
;***********************************************************************
ExpandPictureChromaAlign_sse2:

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA

%ifdef WIN64
    ; xmm6 is callee-saved under the Microsoft x64 calling convention and is
    ; overwritten below (bottom-right corner pattern), so preserve it.
    ; NOTE(review): assumes the build defines WIN64 for Windows x64 targets;
    ; if it does not, this block assembles to nothing. Confirm in asm_inc.asm.
    sub     rsp, 16
    movdqu  [rsp], xmm6
%endif

    SIGN_EXTENTION r1, r1d
    SIGN_EXTENTION r2, r2d
    SIGN_EXTENTION r3, r3d

    ; ---- corner patterns and top/bottom pointers -------------------------

    movzx   r6d, byte [r0]
    SSE2_Copy16Times xmm3, r6d          ; xmm3 = top-left pixel x16

    mov     r5, r0
    sub     r5, r1                      ; r5 = pDst - stride (top border row nearest the picture)

    push    r3                          ; keep iHeight for the left/right pass

    dec     r3                          ; h - 1
    imul    r3, r1                      ; (h - 1) * stride
    lea     r3, [r0+r3]                 ; r3 = last picture row

    mov     r6, r1
    sal     r6, 04h                     ; r6 = 16 * stride
    lea     r4, [r3+r6]                 ; r4 = last row of the bottom border

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm5, r6d          ; xmm5 = bottom-left pixel x16

    movzx   r6d, byte [r3+r2-1]
    SSE2_Copy16Times xmm6, r6d          ; xmm6 = bottom-right pixel x16

    neg     r1                          ; r1 = -stride

    ; ---- top and bottom borders ------------------------------------------

    push    r0
    push    r1
    push    r2

    exp_top_bottom_sse2 16

    pop     r2
    pop     r1                          ; back to -stride (the macro may flip its sign)
    pop     r0

    ; ---- left and right borders ------------------------------------------

    lea     r5, [r0-16]                 ; left border destination
    lea     r3, [r0+r2-1]               ; right border source (last pixel of row 0)
    lea     r4, [r3+1]                  ; right border destination

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm4, r6d          ; xmm4 = top-right pixel x16

    neg     r1                          ; r1 = stride

    pop     r6                          ; r6 = iHeight

    push    r0
    push    r1
    push    r2
    push    r6

    exp_left_right_sse2 16, a

    pop     r6
    pop     r2
    pop     r1
    pop     r0

    ; ---- corner blocks ---------------------------------------------------
    ; xmm3..xmm6 were prepared above; each pointer starts on the lowest row
    ; of its block and exp_cross_sse2 walks upward with r1 = -stride.

    lea     r3, [r0-16]
    sub     r3, r1                      ; r3 = top-left block, row nearest the picture

    lea     r4, [r0+r2]
    sub     r4, r1                      ; r4 = top-right block, row nearest the picture

    add     r6, 16
    imul    r6, r1                      ; r6 = (h + 16) * stride

    lea     r5, [r3+r6]                 ; r5 = bottom-left block, last border row
    lea     r6, [r4+r6]                 ; r6 = bottom-right block, last border row

    neg     r1                          ; r1 = -stride

    exp_cross_sse2 16, a

%ifdef WIN64
    movdqu  xmm6, [rsp]                 ; restore callee-saved xmm6
    add     rsp, 16
%endif

    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0

    ret
|
|
|
|
ALIGN 16
;***********************************************************************
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
;                                       const int32_t iStride,
;                                       const int32_t iWidth,
;                                       const int32_t iHeight );
;
; Pads a chroma plane in place with a 16-pixel border on every side by
; replicating its edge pixels (see the diagram near the top of this file):
;   1. top / bottom borders   (exp_top_bottom_sse2 16)
;   2. left / right borders   (exp_left_right_sse2 16, unaligned right side)
;   3. the four corner blocks (exp_cross_sse2 16, unaligned right side)
;
; Registers after LOAD_4_PARA (project macro, expected to deliver the four
; arguments): r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight.
;
; Identical to ExpandPictureChromaAlign_sse2 except that everything written
; at pDst + iWidth (right border, right-hand corners) uses movdqu, so that
; address need not be 16-byte aligned. pDst and iStride still must be.
;
; Uses xmm0, xmm1, xmm3, xmm4, xmm5, xmm6 (and mm0, mm1 for an 8-byte tail).
;***********************************************************************
ExpandPictureChromaUnalign_sse2:

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA

%ifdef WIN64
    ; xmm6 is callee-saved under the Microsoft x64 calling convention and is
    ; overwritten below (bottom-right corner pattern), so preserve it.
    ; NOTE(review): assumes the build defines WIN64 for Windows x64 targets;
    ; if it does not, this block assembles to nothing. Confirm in asm_inc.asm.
    sub     rsp, 16
    movdqu  [rsp], xmm6
%endif

    SIGN_EXTENTION r1, r1d
    SIGN_EXTENTION r2, r2d
    SIGN_EXTENTION r3, r3d

    ; ---- corner patterns and top/bottom pointers -------------------------

    movzx   r6d, byte [r0]
    SSE2_Copy16Times xmm3, r6d          ; xmm3 = top-left pixel x16

    mov     r5, r0
    sub     r5, r1                      ; r5 = pDst - stride (top border row nearest the picture)

    push    r3                          ; keep iHeight for the left/right pass

    dec     r3                          ; h - 1
    imul    r3, r1                      ; (h - 1) * stride
    lea     r3, [r0+r3]                 ; r3 = last picture row

    mov     r6, r1
    sal     r6, 04h                     ; r6 = 16 * stride
    lea     r4, [r3+r6]                 ; r4 = last row of the bottom border

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm5, r6d          ; xmm5 = bottom-left pixel x16

    movzx   r6d, byte [r3+r2-1]
    SSE2_Copy16Times xmm6, r6d          ; xmm6 = bottom-right pixel x16

    neg     r1                          ; r1 = -stride

    ; ---- top and bottom borders ------------------------------------------

    push    r0
    push    r1
    push    r2

    exp_top_bottom_sse2 16

    pop     r2
    pop     r1                          ; back to -stride (the macro may flip its sign)
    pop     r0

    ; ---- left and right borders ------------------------------------------

    lea     r5, [r0-16]                 ; left border destination
    lea     r3, [r0+r2-1]               ; right border source (last pixel of row 0)
    lea     r4, [r3+1]                  ; right border destination

    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm4, r6d          ; xmm4 = top-right pixel x16

    neg     r1                          ; r1 = stride

    pop     r6                          ; r6 = iHeight

    push    r0
    push    r1
    push    r2
    push    r6

    exp_left_right_sse2 16, u

    pop     r6
    pop     r2
    pop     r1
    pop     r0

    ; ---- corner blocks ---------------------------------------------------
    ; xmm3..xmm6 were prepared above; each pointer starts on the lowest row
    ; of its block and exp_cross_sse2 walks upward with r1 = -stride.

    lea     r3, [r0-16]
    sub     r3, r1                      ; r3 = top-left block, row nearest the picture

    lea     r4, [r0+r2]
    sub     r4, r1                      ; r4 = top-right block, row nearest the picture

    add     r6, 16
    imul    r6, r1                      ; r6 = (h + 16) * stride

    lea     r5, [r3+r6]                 ; r5 = bottom-left block, last border row
    lea     r6, [r4+r6]                 ; r6 = bottom-right block, last border row

    neg     r1                          ; r1 = -stride

    exp_cross_sse2 16, u

%ifdef WIN64
    movdqu  xmm6, [rsp]                 ; restore callee-saved xmm6
    add     rsp, 16
%endif

    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0

    ret
|