openh264/codec/common/expand_picture.asm

740 lines
20 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* expand_picture.asm
;*
;* Abstract
;* mmxext/sse for expand_frame
;*
;* History
;* 09/25/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
;SECTION .rodata pData align=16
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************
;%define PADDING_SIZE_ASM 32 ; PADDING_LENGTH
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
; Entry points implemented further down in this file. WELS_EXTERN comes from
; asm_inc.asm; presumably it emits the (optionally prefixed) global declaration
; for the symbol -- confirm against asm_inc.asm.
WELS_EXTERN ExpandPictureLuma_sse2 ; luma plane, 32-pixel border
WELS_EXTERN ExpandPictureChromaAlign_sse2 ; chroma plane, aligned (movdqa) stores for the right border
WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; chroma plane, unaligned (movdqu) stores for the right border
;;;;;;;expanding result;;;;;;;
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;----------------------------
;aaaa|attttttttttttttttb|bbbb
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;cccc|ceeeeeeeeeeeeeeeed|dddd
;----------------------------
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
    ; Replicates the 8-byte MMX register into four consecutive rows that are
    ; "stride" bytes apart, beginning at [dst]. dst leaves the macro advanced
    ; by four rows, i.e. one row beyond the last store. Only movq/lea are
    ; emitted, so the flags are not modified.
%rep 2
    movq [%1], %3               ; even row of the pair
    movq [%1+%2], %3            ; odd row of the pair
    lea %1, [%1+2*%2]           ; step over the pair
%endrep
%endmacro
%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
    ; Final 4-row group of an 8-byte wide column: same four stores as
    ; mov_line_8x4_mmx, but dst is left ON the last row written (three rows
    ; from the start) instead of one row beyond it.
    movq [%1], %3               ; row 0
    movq [%1+%2], %3            ; row 1
    movq [%1+2*%2], %3          ; row 2
    lea %1, [%1+2*%2]           ; dst -> row 2
    movq [%1+%2], %3            ; row 3
    lea %1, [%1+%2]             ; dst -> row 3 (last row written)
%endmacro
%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
    ; Replicates the 16-byte XMM register into four consecutive rows that are
    ; "stride" bytes apart, beginning at [dst]. The 4th argument selects the
    ; store flavour: "a" emits movdqa, "u" emits movdqu. dst leaves the macro
    ; advanced by four rows. Flags are not modified.
%rep 2
    movdq%4 [%1], %3            ; top(bottom) even row of the pair
    movdq%4 [%1+%2], %3         ; top(bottom) odd row of the pair
    lea %1, [%1+2*%2]           ; step over the pair
%endrep
%endmacro
%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
    ; Final 4-row group of a 16-byte wide column: same four stores as
    ; mov_line_16x4_sse2 (movdqa for "a", movdqu for "u"), but dst is left ON
    ; the last row written instead of one row beyond it.
    movdq%4 [%1], %3            ; top(bottom)_0
    movdq%4 [%1+%2], %3         ; top(bottom)_1
    movdq%4 [%1+2*%2], %3       ; top(bottom)_2
    lea %1, [%1+2*%2]           ; dst -> row 2
    movdq%4 [%1+%2], %3         ; top(bottom)_3
    lea %1, [%1+%2]             ; dst -> row 3 (last row written)
%endmacro
%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
    ; Fills a 32-byte wide, 4-row tall area starting at [dst] with the XMM
    ; register (two aligned 16-byte stores per row). dst leaves the macro
    ; advanced by four rows. All stores are movdqa. Flags are not modified.
%rep 2
    movdqa [%1], %3             ; even row, bytes 0..15
    movdqa [%1+16], %3          ; even row, bytes 16..31
    movdqa [%1+%2], %3          ; odd row, bytes 0..15
    movdqa [%1+%2+16], %3       ; odd row, bytes 16..31
    lea %1, [%1+2*%2]           ; step over the pair
%endrep
%endmacro
%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
    ; Final 4-row group of a 32-byte wide block: same eight aligned stores as
    ; mov_line_32x4_sse2, but dst is left ON the last row written instead of
    ; one row beyond it.
    movdqa [%1], %3             ; top(bottom)_0, bytes 0..15
    movdqa [%1+16], %3          ; top(bottom)_0, bytes 16..31
    movdqa [%1+%2], %3          ; top(bottom)_1, bytes 0..15
    movdqa [%1+%2+16], %3       ; top(bottom)_1, bytes 16..31
    movdqa [%1+2*%2], %3        ; top(bottom)_2, bytes 0..15
    movdqa [%1+2*%2+16], %3     ; top(bottom)_2, bytes 16..31
    lea %1, [%1+2*%2]           ; dst -> row 2
    movdqa [%1+%2], %3          ; top(bottom)_3, bytes 0..15
    movdqa [%1+%2+16], %3       ; top(bottom)_3, bytes 16..31
    lea %1, [%1+%2]             ; dst -> row 3 (last row written)
%endmacro
%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
    ; Replicates the first picture row upwards and the last picture row
    ; downwards, one 16-byte column at a time (plus one trailing 8-byte
    ; column for chroma), writing iPaddingSize rows per column on each side.
    ;
    ; Register contract as set up by the callers in this file:
    ;   r0 = first picture row (top source)
    ;   r3 = last picture row, pSrc + (h-1)*stride (bottom source)
    ;   r5 = first top destination row, pSrc - stride
    ;   r4 = first bottom destination row, the padding row farthest from the
    ;        picture: pSrc + (h+31)*stride for luma, (h+15)*stride for chroma
    ;   r1 = -stride on entry
    ;   r2 = width in bytes
    ; Clobbers r0, r2, r3, r4, r5, xmm0, xmm1 and the flags; the chroma variant
    ; also clobbers r6, mm0 and mm1. r1 is negated once per 16-byte column, so
    ; its sign on exit depends on the column count.
    ;
    ; Each column ends with the "end" store macro, which leaves the destination
    ; pointer on the last row written. Negating r1 then lets the next column
    ; walk the same rows in the opposite direction without recomputing the
    ; destination pointers.
    ;
    ; Precondition: (width >> 4) must be non-zero. The column loop is a
    ; dec/jnz loop, so a zero count would wrap around instead of exiting.
    ; Source rows are read with movdqa and therefore must be 16-byte aligned.
    ; Any parameter other than 32 or 16 emits no code.
%if %1 == 32 ; for luma
    sar r2, 04h                 ; number of 16-pixel columns
.top_bottom_loops:
    ; top: 8 groups of 4 rows = 32 rows
    movdqa xmm0, [r0]           ; first line of picture pData
%rep 7
    mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm0, a
    ; bottom: 8 groups of 4 rows = 32 rows
    movdqa xmm1, [r3]           ; last line of picture pData
%rep 7
    mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm1, a
    lea r0, [r0+16]             ; top pSrc
    lea r5, [r5+16]             ; top dst
    lea r3, [r3+16]             ; bottom pSrc
    lea r4, [r4+16]             ; bottom dst
    neg r1                      ; next column runs in the opposite row direction
    dec r2
    jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma
    mov r6, r2                  ; keep the width to test for a trailing 8-byte column
    sar r2, 04h                 ; number of 16-pixel columns
.top_bottom_loops:
    ; top: 4 groups of 4 rows = 16 rows
    movdqa xmm0, [r0]           ; first line of picture pData
%rep 3
    mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm0, a
    ; bottom: 4 groups of 4 rows = 16 rows
    movdqa xmm1, [r3]           ; last line of picture pData
%rep 3
    mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm1, a
    lea r0, [r0+16]             ; top pSrc
    lea r5, [r5+16]             ; top dst
    lea r3, [r3+16]             ; bottom pSrc
    lea r4, [r4+16]             ; bottom dst
    neg r1                      ; next column runs in the opposite row direction
    dec r2
    jnz near .top_bottom_loops
    ; for remaining 8 bytes: any non-zero (width & 15) is handled as exactly
    ; one 8-byte column. "and" already sets ZF from its result, so no separate
    ; "test" is needed before the branch.
    and r6, 0fh                 ; any 8 bytes left?
    jz near .to_be_continued    ; nothing left, skip the MMX tail
    ; top
    movq mm0, [r0]              ; remaining 8 bytes of the first line
%rep 3
    mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
%endrep
    mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    ; bottom
    movq mm1, [r3]              ; remaining 8 bytes of the last line
%rep 3
    mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
%endrep
    mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    WELSEMMS                    ; macro from asm_inc.asm, invoked only after MMX registers were used
.to_be_continued:
%endif
%endmacro
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
    ; Row by row, replicates the left-most pixel of the row into the left
    ; border and the right-most pixel into the right border.
    ;
    ; Register contract as set up by the callers in this file:
    ;   r6 = number of rows (height)
    ;   r1 = +stride
    ;   r0 = pSrc (left-most pixel)           r5 = pSrc - iPaddingSize (left dst)
    ;   r3 = pSrc + (w-1) (right-most pixel)  r4 = pSrc + w (right dst)
    ; Clobbers r0, r2, r3, r4, r5, r6, xmm0, xmm1 and the flags.
    ;
    ; Luma (32) writes two aligned 16-byte stores per side. Chroma (16) writes
    ; one 16-byte store per side; only the chroma right-border store honours
    ; the u/a argument, because pSrc + w might not be 16-byte aligned for
    ; chroma planes. Any parameter other than 32 or 16 emits no code.
%if %1 == 32 || %1 == 16
.left_right_loops:
    ; left
    movzx r2d, byte [r0]        ; pixel pData for left border
    SSE2_Copy16Times xmm0, r2d  ; broadcast helper from asm_inc.asm: xmm0 <- pixel in r2d
    movdqa [r5], xmm0
%if %1 == 32
    movdqa [r5+16], xmm0        ; luma border is 32 bytes wide
%endif
    ; right
    movzx r2d, byte [r3]        ; pixel pData for right border
    SSE2_Copy16Times xmm1, r2d  ; broadcast helper from asm_inc.asm: xmm1 <- pixel in r2d
%if %1 == 32
    movdqa [r4], xmm1
    movdqa [r4+16], xmm1        ; luma border is 32 bytes wide
%else
    movdq%2 [r4], xmm1          ; might not be aligned 16 bytes in case chroma planes
%endif
    ; advance all four pointers to the next row
    lea r0, [r0+r1]             ; left pSrc
    lea r5, [r5+r1]             ; left dst
    lea r3, [r3+r1]             ; right pSrc
    lea r4, [r4+r1]             ; right dst
    dec r6
    jnz near .left_right_loops
%endif
%endmacro
%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
    ; Fills the four corner blocks of the border (iPaddingSize x iPaddingSize
    ; bytes each) with one replicated pixel per corner.
    ;
    ; Fill values:  xmm3 = top-left, xmm4 = top-right,
    ;               xmm5 = bottom-left, xmm6 = bottom-right
    ; Destinations: r3 = TL, r4 = TR, r5 = BL, r6 = BR; r1 = -stride, so every
    ;               corner is written from its bottom row upwards.
    ; On exit r3..r6 each point at the last row written for their corner.
    ;
    ; Luma always uses aligned 32-byte wide stores. For chroma the left-hand
    ; corners (TL, BL) always use aligned stores, while the right-hand corners
    ; (TR, BR) use the store flavour selected by the u/a argument.
    ; Any parameter other than 32 or 16 emits no code.
%if %1 == 32 ; luma: 8 groups of 4 rows per corner
    ; TL
%rep 7
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
%endrep
    mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    ; TR
%rep 7
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
%endrep
    mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    ; BL
%rep 7
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
%endrep
    mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    ; BR
%rep 7
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
%endrep
    mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
%elif %1 == 16 ; chroma: 4 groups of 4 rows per corner
    ; TL
%rep 3
    mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
%endrep
    mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
    ; TR
%rep 3
    mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
    ; BL
%rep 3
    mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
    ; BR
%rep 3
    mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
%endrep
    mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureLuma_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;
; Pads a luma plane in place with a 32-pixel border on every side by
; replicating the edge pixels: top/bottom rows first, then the left/right
; columns, then the four corners.
;
; After LOAD_4_PARA: r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight.
; All picture and border accesses use aligned 16-byte loads/stores (movdqa),
; so pDst, iStride and iWidth are expected to be multiples of 16, and iWidth
; must be at least 16 (see exp_top_bottom_sse2).
; Corner fill values are gathered along the way:
; xmm3 = top-left, xmm4 = top-right, xmm5 = bottom-left, xmm6 = bottom-right.
; xmm6 is callee-saved in the Microsoft x64 calling convention, so it is
; saved and restored on the stack when WIN64 is defined.
;***********************************************************************----------------
ExpandPictureLuma_sse2:
    push r4
    push r5
    push r6
    %assign push_num 3
    LOAD_4_PARA
%ifdef WIN64
    ; xmm6 is non-volatile on Win64 and is overwritten below: preserve it.
    ; No stack-relative argument is read after this point, so push_num does
    ; not need to be adjusted.
    sub rsp, 16
    movdqu [rsp], xmm6
%endif
    SIGN_EXTENTION r1, r1d
    SIGN_EXTENTION r2, r2d
    SIGN_EXTENTION r3, r3d
    ;also prepare for cross border pData top-left:xmm3
    movzx r6d,byte[r0]
    SSE2_Copy16Times xmm3,r6d   ;xmm3: pSrc[0]
    neg r1
    lea r5,[r0+r1]              ;last line of top border r5= dst top pSrc[-stride]
    neg r1
    push r3                     ;keep height for the left/right pass
    dec r3                      ;h-1
    imul r3,r1                  ;(h-1)*stride
    lea r3,[r0+r3]              ;pSrc[(h-1)*stride] r3 = src bottom
    mov r6,r1                   ;r6 = stride
    sal r6,05h                  ;r6 = 32*stride
    lea r4,[r3+r6]              ;r4 = dst bottom = pSrc[(h+31)*stride]
    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
    movzx r6d,byte [r3]         ;bottom-left
    SSE2_Copy16Times xmm5,r6d
    lea r6,[r3+r2-1]
    movzx r6d,byte [r6]
    SSE2_Copy16Times xmm6,r6d   ;bottom-right
    neg r1                      ;r1 = -stride
    push r0                     ;exp_top_bottom_sse2 clobbers r0, r1 and r2
    push r1
    push r2
    exp_top_bottom_sse2 32
    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    pop r2
    pop r1
    pop r0
    lea r5,[r0-32]              ;left border dst: pSrc-32 for luma
    lea r3,[r0+r2-1]            ;right border src
    lea r4,[r3+1]               ;right border dst
    ;prepare for cross border data: top-right with xmm4
    movzx r6d,byte [r3]         ;top-right
    SSE2_Copy16Times xmm4,r6d
    neg r1                      ;r1 = stride
    pop r6                      ; r6 = height
    push r0                     ;exp_left_right_sse2 clobbers r0, r2 and r6
    push r1
    push r2
    push r6
    exp_left_right_sse2 32,a
    pop r6
    pop r2
    pop r1
    pop r0
    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; xmm3..xmm6 were initialised above; compute the four corner destinations
    neg r1                      ;r1 = -stride
    lea r3,[r0-32]
    lea r3,[r3+r1]              ;last line of top-left border
    lea r4,[r0+r2]              ;psrc +width
    lea r4,[r4+r1]              ;psrc +width -stride
    neg r1                      ;r1 = stride
    add r6,32                   ;height + 32 (luma border size)
    imul r6,r1                  ;(height+32)*stride
    lea r5,[r3+r6]              ;last line of bottom-left border
    lea r6,[r4+r6]              ;last line of bottom-right border
    neg r1                      ; r1 = -stride
    ; fill the four corner blocks
    exp_cross_sse2 32,a
%ifdef WIN64
    movdqu xmm6, [rsp]          ; restore callee-saved xmm6
    add rsp, 16
%endif
    LOAD_4_PARA_POP
    pop r6
    pop r5
    pop r4
    %assign push_num 0
    ret
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;
; Pads a chroma plane in place with a 16-pixel border on every side by
; replicating the edge pixels: top/bottom rows first, then the left/right
; columns, then the four corners.
;
; After LOAD_4_PARA: r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight.
; This variant writes the right border and the right-hand corners with
; aligned stores (movdqa), so pDst + iWidth must be 16-byte aligned.
; iWidth must be at least 16 (see exp_top_bottom_sse2).
; Corner fill values are gathered along the way:
; xmm3 = top-left, xmm4 = top-right, xmm5 = bottom-left, xmm6 = bottom-right.
; xmm6 is callee-saved in the Microsoft x64 calling convention, so it is
; saved and restored on the stack when WIN64 is defined.
;***********************************************************************----------------
ExpandPictureChromaAlign_sse2:
    push r4
    push r5
    push r6
    %assign push_num 3
    LOAD_4_PARA
%ifdef WIN64
    ; xmm6 is non-volatile on Win64 and is overwritten below: preserve it.
    ; No stack-relative argument is read after this point, so push_num does
    ; not need to be adjusted.
    sub rsp, 16
    movdqu [rsp], xmm6
%endif
    SIGN_EXTENTION r1,r1d
    SIGN_EXTENTION r2,r2d
    SIGN_EXTENTION r3,r3d
    ;also prepare for cross border pData top-left:xmm3
    movzx r6d,byte [r0]
    SSE2_Copy16Times xmm3,r6d   ;xmm3: pSrc[0]
    neg r1
    lea r5,[r0+r1]              ;last line of top border r5= dst top pSrc[-stride]
    neg r1
    push r3                     ;keep height for the left/right pass
    dec r3                      ;h-1
    imul r3,r1                  ;(h-1)*stride
    lea r3,[r0+r3]              ;pSrc[(h-1)*stride] r3 = src bottom
    mov r6,r1                   ;r6 = stride
    sal r6,04h                  ;r6 = 16*stride
    lea r4,[r3+r6]              ;r4 = dst bottom = pSrc[(h+15)*stride]
    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
    movzx r6d,byte [r3]         ;bottom-left
    SSE2_Copy16Times xmm5,r6d
    lea r6,[r3+r2-1]
    movzx r6d,byte [r6]
    SSE2_Copy16Times xmm6,r6d   ;bottom-right
    neg r1                      ;r1 = -stride
    push r0                     ;exp_top_bottom_sse2 clobbers r0, r1 and r2
    push r1
    push r2
    exp_top_bottom_sse2 16
    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    pop r2
    pop r1
    pop r0
    lea r5,[r0-16]              ;left border dst: pSrc-16 for chroma
    lea r3,[r0+r2-1]            ;right border src
    lea r4,[r3+1]               ;right border dst
    ;prepare for cross border data: top-right with xmm4
    movzx r6d,byte [r3]         ;top-right
    SSE2_Copy16Times xmm4,r6d
    neg r1                      ;r1 = stride
    pop r6                      ; r6 = height
    push r0                     ;exp_left_right_sse2 clobbers r0, r2 and r6
    push r1
    push r2
    push r6
    exp_left_right_sse2 16,a
    pop r6
    pop r2
    pop r1
    pop r0
    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; xmm3..xmm6 were initialised above; compute the four corner destinations
    neg r1                      ;r1 = -stride
    lea r3,[r0-16]
    lea r3,[r3+r1]              ;last line of top-left border
    lea r4,[r0+r2]              ;psrc +width
    lea r4,[r4+r1]              ;psrc +width -stride
    neg r1                      ;r1 = stride
    add r6,16                   ;height + 16 (chroma border size)
    imul r6,r1                  ;(height+16)*stride
    lea r5,[r3+r6]              ;last line of bottom-left border
    lea r6,[r4+r6]              ;last line of bottom-right border
    neg r1                      ; r1 = -stride
    ; fill the four corner blocks
    exp_cross_sse2 16,a
%ifdef WIN64
    movdqu xmm6, [rsp]          ; restore callee-saved xmm6
    add rsp, 16
%endif
    LOAD_4_PARA_POP
    pop r6
    pop r5
    pop r4
    %assign push_num 0
    ret
ALIGN 16
;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;
; Pads a chroma plane in place with a 16-pixel border on every side by
; replicating the edge pixels: top/bottom rows first, then the left/right
; columns, then the four corners.
;
; After LOAD_4_PARA: r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight.
; This variant writes the right border and the right-hand corners with
; unaligned stores (movdqu), so pDst + iWidth need not be 16-byte aligned.
; The picture rows themselves and the left-hand border are still accessed
; with movdqa. iWidth must be at least 16 (see exp_top_bottom_sse2).
; Corner fill values are gathered along the way:
; xmm3 = top-left, xmm4 = top-right, xmm5 = bottom-left, xmm6 = bottom-right.
; xmm6 is callee-saved in the Microsoft x64 calling convention, so it is
; saved and restored on the stack when WIN64 is defined.
;***********************************************************************----------------
ExpandPictureChromaUnalign_sse2:
    push r4
    push r5
    push r6
    %assign push_num 3
    LOAD_4_PARA
%ifdef WIN64
    ; xmm6 is non-volatile on Win64 and is overwritten below: preserve it.
    ; No stack-relative argument is read after this point, so push_num does
    ; not need to be adjusted.
    sub rsp, 16
    movdqu [rsp], xmm6
%endif
    SIGN_EXTENTION r1,r1d
    SIGN_EXTENTION r2,r2d
    SIGN_EXTENTION r3,r3d
    ;also prepare for cross border pData top-left:xmm3
    movzx r6d,byte [r0]
    SSE2_Copy16Times xmm3,r6d   ;xmm3: pSrc[0]
    neg r1
    lea r5,[r0+r1]              ;last line of top border r5= dst top pSrc[-stride]
    neg r1
    push r3                     ;keep height for the left/right pass
    dec r3                      ;h-1
    imul r3,r1                  ;(h-1)*stride
    lea r3,[r0+r3]              ;pSrc[(h-1)*stride] r3 = src bottom
    mov r6,r1                   ;r6 = stride
    sal r6,04h                  ;r6 = 16*stride
    lea r4,[r3+r6]              ;r4 = dst bottom = pSrc[(h+15)*stride]
    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
    movzx r6d,byte [r3]         ;bottom-left
    SSE2_Copy16Times xmm5,r6d
    lea r6,[r3+r2-1]
    movzx r6d,byte [r6]
    SSE2_Copy16Times xmm6,r6d   ;bottom-right
    neg r1                      ;r1 = -stride
    push r0                     ;exp_top_bottom_sse2 clobbers r0, r1 and r2
    push r1
    push r2
    exp_top_bottom_sse2 16
    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    pop r2
    pop r1
    pop r0
    lea r5,[r0-16]              ;left border dst: pSrc-16 for chroma
    lea r3,[r0+r2-1]            ;right border src
    lea r4,[r3+1]               ;right border dst
    ;prepare for cross border data: top-right with xmm4
    movzx r6d,byte [r3]         ;top-right
    SSE2_Copy16Times xmm4,r6d
    neg r1                      ;r1 = stride
    pop r6                      ; r6 = height
    push r0                     ;exp_left_right_sse2 clobbers r0, r2 and r6
    push r1
    push r2
    push r6
    exp_left_right_sse2 16,u
    pop r6
    pop r2
    pop r1
    pop r0
    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; xmm3..xmm6 were initialised above; compute the four corner destinations
    neg r1                      ;r1 = -stride
    lea r3,[r0-16]
    lea r3,[r3+r1]              ;last line of top-left border
    lea r4,[r0+r2]              ;psrc +width
    lea r4,[r4+r1]              ;psrc +width -stride
    neg r1                      ;r1 = stride
    add r6,16                   ;height + 16 (chroma border size)
    imul r6,r1                  ;(height+16)*stride
    lea r5,[r3+r6]              ;last line of bottom-left border
    lea r6,[r4+r6]              ;last line of bottom-right border
    neg r1                      ; r1 = -stride
    ; fill the four corner blocks
    exp_cross_sse2 16,u
%ifdef WIN64
    movdqu xmm6, [rsp]          ; restore callee-saved xmm6
    add rsp, 16
%endif
    LOAD_4_PARA_POP
    pop r6
    pop r5
    pop r4
    %assign push_num 0
    ret