; openh264/codec/common/x86/expand_picture.asm
; Martin Storsjö 57f6bcc4b0 Convert all tabs to spaces in assembly sources, unify indentation
; Previously the assembly sources had mixed indentation consisting
; of both spaces and tabs, making it quite hard to read unless
; the right tab size was used in the editor.
;
; Tabs have been interpreted as 4 spaces in most cases, matching
; the surrounding code.
; 2014-06-01 01:35:43 +03:00
;
; 729 lines
; 21 KiB
; NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* expand_picture.asm
;*
;* Abstract
;* mmxext/sse for expand_frame
;*
;* History
;* 09/25/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
;;;;;;;expanding result;;;;;;;
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;----------------------------
;aaaa|attttttttttttttttb|bbbb
;llll|l r|rrrr
;llll|l r|rrrr
;llll|l r|rrrr
;llll|l r|rrrr
;llll|l r|rrrr
;cccc|ceeeeeeeeeeeeeeeed|dddd
;----------------------------
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
; Stores the 8-byte register %3 to four consecutive rows starting at [%1],
; rows being %2 bytes apart.  On exit %1 has advanced by 4*%2, i.e. it points
; at the row following the last one written.  Flags are untouched (mov/lea only).
%rep 2
    movq    [%1], %3                ; row 2k
    movq    [%1+%2], %3             ; row 2k+1
    lea     %1, [%1+2*%2]           ; step over the pair just written
%endrep
%endmacro
%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
; Final 4-row group of a column: stores the 8-byte register %3 to four
; consecutive rows starting at [%1] (rows %2 bytes apart) and leaves %1
; pointing AT the last row written (advance of 3*%2, not 4*%2), so a caller
; can continue from that row in the opposite direction.
    movq    [%1], %3                ; row 0
    movq    [%1+%2], %3             ; row 1
    movq    [%1+2*%2], %3           ; row 2
    lea     %1, [%1+2*%2]
    movq    [%1+%2], %3             ; row 3
    lea     %1, [%1+%2]             ; %1 -> row 3
%endmacro
%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
; Stores the 16-byte register %3 to four consecutive rows starting at [%1],
; rows being %2 bytes apart.  %4 picks the store flavour: 'a' -> movdqa
; (every row address must be 16-byte aligned), 'u' -> movdqu.
; On exit %1 has advanced by 4*%2.  Flags are untouched.
%rep 2
    movdq%4 [%1], %3                ; row 2k
    movdq%4 [%1+%2], %3             ; row 2k+1
    lea     %1, [%1+2*%2]           ; step over the pair just written
%endrep
%endmacro
%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
; Final 4-row group of a column: stores the 16-byte register %3 to four
; consecutive rows starting at [%1] (rows %2 bytes apart; %4 = 'a' for
; movdqa, 'u' for movdqu) and leaves %1 pointing AT the last row written
; (advance of 3*%2), so the next column can start from that row.
    movdq%4 [%1], %3                ; row 0
    movdq%4 [%1+%2], %3             ; row 1
    movdq%4 [%1+2*%2], %3           ; row 2
    lea     %1, [%1+2*%2]
    movdq%4 [%1+%2], %3             ; row 3
    lea     %1, [%1+%2]             ; %1 -> row 3
%endmacro
%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
; Fills a 32-byte wide, 4-row block with the 16-byte register %3: two aligned
; stores per row, rows %2 bytes apart, starting at [%1].  movdqa is used
; unconditionally, so every row address must be 16-byte aligned.
; On exit %1 has advanced by 4*%2.  Flags are untouched.
%rep 2
    movdqa  [%1], %3                ; row 2k,   bytes  0..15
    movdqa  [%1+16], %3             ; row 2k,   bytes 16..31
    movdqa  [%1+%2], %3             ; row 2k+1, bytes  0..15
    movdqa  [%1+%2+16], %3          ; row 2k+1, bytes 16..31
    lea     %1, [%1+2*%2]           ; step over the pair just written
%endrep
%endmacro
%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
; Final 4-row group of a 32-byte wide block: same stores as
; mov_line_32x4_sse2 (two movdqa per row, 16-byte alignment required), but
; %1 is left pointing AT the last row written (advance of 3*%2).
    movdqa  [%1], %3                ; row 0, bytes  0..15
    movdqa  [%1+16], %3             ; row 0, bytes 16..31
    movdqa  [%1+%2], %3             ; row 1, bytes  0..15
    movdqa  [%1+%2+16], %3          ; row 1, bytes 16..31
    movdqa  [%1+2*%2], %3           ; row 2, bytes  0..15
    movdqa  [%1+2*%2+16], %3        ; row 2, bytes 16..31
    lea     %1, [%1+2*%2]
    movdqa  [%1+%2], %3             ; row 3, bytes  0..15
    movdqa  [%1+%2+16], %3          ; row 3, bytes 16..31
    lea     %1, [%1+%2]             ; %1 -> row 3
%endmacro
%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
; Replicates the first picture row into the %1 rows above the picture and the
; last picture row into the %1 rows below it, one 16-byte column at a time.
;
; In:
;   r0 = pSrc                        top source row
;   r5 = pSrc - stride               top dst, border row nearest the picture
;   r3 = pSrc + (h-1)*stride         bottom source row
;   r4 = pSrc + (h-1+%1)*stride      bottom dst, border row farthest from the picture
;   r1 = -stride                     row step used for the first column
;   r2 = width in bytes
;
; Both destinations are walked with the same step r1.  The last 4-row group of
; each column uses the *_end* store macro, which leaves the pointer on the last
; row written; r1 is then negated so the next column walks back the other way
; instead of re-deriving the start address.
;
; Clobbers r0, r2, r3, r4, r5, xmm0, xmm1 and flags; the chroma variant also
; clobbers r6 and, when a tail is present, mm0/mm1.  r1 comes back with its
; sign flipped once per 16-byte column processed.
;
; All 16-byte accesses use movdqa: pSrc and the stride must be multiples of 16.
;
; A width below 16 used to enter the do-while column loop with r2 == 0, wrap
; the counter on the first dec and keep writing far past the plane.  The loop
; is now skipped when there is no full 16-byte column.
%if %1 == 32 ; for luma
    sar     r2, 04h                     ; r2 = number of 16-byte columns
    test    r2, r2
    jz      near %%done                 ; no full column: nothing to expand
.top_bottom_loops:
    ; top: 32 rows = 7 four-row groups + closing group
    movdqa  xmm0, [r0]                  ; first line of picture pData
%rep 7
    mov_line_16x4_sse2 r5, r1, xmm0, a  ; dst, stride, xmm?, u/a
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm0, a
    ; bottom: same walk, fed from the last picture row
    movdqa  xmm1, [r3]                  ; last line of picture pData
%rep 7
    mov_line_16x4_sse2 r4, r1, xmm1, a  ; dst, stride, xmm?, u/a
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm1, a
    lea     r0, [r0+16]                 ; top pSrc
    lea     r5, [r5+16]                 ; top dst
    lea     r3, [r3+16]                 ; bottom pSrc
    lea     r4, [r4+16]                 ; bottom dst
    neg     r1                          ; next column is walked in the opposite direction
    dec     r2
    jnz     near .top_bottom_loops
%%done:
%elif %1 == 16 ; for chroma
    mov     r6, r2                      ; keep the full width for the tail test
    sar     r2, 04h                     ; r2 = number of 16-byte columns
    test    r2, r2
    jz      near %%remainder            ; width < 16: only the tail, if any
.top_bottom_loops:
    ; top: 16 rows = 3 four-row groups + closing group
    movdqa  xmm0, [r0]                  ; first line of picture pData
%rep 3
    mov_line_16x4_sse2 r5, r1, xmm0, a  ; dst, stride, xmm?, u/a
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm0, a
    ; bottom
    movdqa  xmm1, [r3]                  ; last line of picture pData
%rep 3
    mov_line_16x4_sse2 r4, r1, xmm1, a  ; dst, stride, xmm?, u/a
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm1, a
    lea     r0, [r0+16]                 ; top pSrc
    lea     r5, [r5+16]                 ; top dst
    lea     r3, [r3+16]                 ; bottom pSrc
    lea     r4, [r4+16]                 ; bottom dst
    neg     r1                          ; next column is walked in the opposite direction
    dec     r2
    jnz     near .top_bottom_loops
%%remainder:
    ; Tail: any non-zero (width & 15) is handled as one 8-byte column.
    and     r6, 0fh                     ; also sets ZF, no separate test needed
    jz      near .to_be_continued       ; nothing left, exit here
    ; top
    movq    mm0, [r0]                   ; remaining 8 bytes of the first row
%rep 3
    mov_line_8x4_mmx r5, r1, mm0        ; dst, stride, mm?
%endrep
    mov_line_end8x4_mmx r5, r1, mm0
    ; bottom
    movq    mm1, [r3]                   ; remaining 8 bytes of the last row
%rep 3
    mov_line_8x4_mmx r4, r1, mm1        ; dst, stride, mm?
%endrep
    mov_line_end8x4_mmx r4, r1, mm1
    WELSEMMS                            ; defined in asm_inc.asm; presumably leaves MMX state (emms) -- confirm there
.to_be_continued:
%endif
%endmacro
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; Pads every picture row on both sides: the row's first pixel is replicated
; into the %1 bytes left of it, the row's last pixel into the %1 bytes right
; of it.
;
; In:
;   r0 = pSrc            first pixel of the current row (left source)
;   r5 = pSrc - %1       left border dst
;   r3 = pSrc + (w-1)    last pixel of the current row (right source)
;   r4 = pSrc + w        right border dst
;   r1 = stride
;   r6 = height          row counter, must be >= 1 (do-while loop)
;
; Clobbers r0, r2, r3, r4, r5, r6, xmm0, xmm1 and flags.
; Left stores are always movdqa.  Right stores are movdqa for luma and
; movdq%2 for chroma, whose right border may not be 16-byte aligned.
; Expands to nothing for any other padding size.
%if (%1 == 32) || (%1 == 16)
.left_right_loops:
    ; left
    movzx   r2d, byte [r0]              ; pixel pData for left border
    SSE2_Copy16Times xmm0, r2d          ; broadcast it to all 16 bytes of xmm0
    movdqa  [r5], xmm0
%if %1 == 32
    movdqa  [r5+16], xmm0               ; luma border is 32 bytes wide
%endif
    ; right
    movzx   r2d, byte [r3]              ; pixel pData for right border
    SSE2_Copy16Times xmm1, r2d          ; broadcast it to all 16 bytes of xmm1
%if %1 == 32
    movdqa  [r4], xmm1
    movdqa  [r4+16], xmm1
%else
    movdq%2 [r4], xmm1                  ; might not be aligned 16 bytes in case chroma planes
%endif
    ; next row
    lea     r0, [r0+r1]                 ; left pSrc
    lea     r5, [r5+r1]                 ; left dst
    lea     r3, [r3+r1]                 ; right pSrc
    lea     r4, [r4+r1]                 ; right dst
    dec     r6
    jnz     near .left_right_loops
%endif
%endmacro
%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; Fills the four %1 x %1 corner blocks of the border, each with one replicated
; pixel value.
;
; In:
;   r3 / r4 / r5 / r6 = first row to write of the top-left / top-right /
;                       bottom-left / bottom-right corner
;   r1                = step between rows (the callers in this file pass -stride)
;   xmm3 / xmm4 / xmm5 / xmm6 = fill value for TL / TR / BL / BR
;
; Each corner is %1 rows: (%1/4 - 1) four-row groups plus a closing group that
; leaves the pointer on the last row written.  Luma uses movdqa throughout;
; chroma uses movdqa for the left corners and movdq%2 for the right ones.
; Clobbers r3, r4, r5, r6.
%if %1 == 32 ; luma
    ; TL
%rep 7
    mov_line_32x4_sse2 r3, r1, xmm3     ; dst, stride, xmm?
%endrep
    mov_line_end32x4_sse2 r3, r1, xmm3
    ; TR
%rep 7
    mov_line_32x4_sse2 r4, r1, xmm4     ; dst, stride, xmm?
%endrep
    mov_line_end32x4_sse2 r4, r1, xmm4
    ; BL
%rep 7
    mov_line_32x4_sse2 r5, r1, xmm5     ; dst, stride, xmm?
%endrep
    mov_line_end32x4_sse2 r5, r1, xmm5
    ; BR
%rep 7
    mov_line_32x4_sse2 r6, r1, xmm6     ; dst, stride, xmm?
%endrep
    mov_line_end32x4_sse2 r6, r1, xmm6
%elif %1 == 16 ; chroma
    ; TL
%rep 3
    mov_line_16x4_sse2 r3, r1, xmm3, a  ; dst, stride, xmm?, u/a
%endrep
    mov_line_end16x4_sse2 r3, r1, xmm3, a
    ; TR
%rep 3
    mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?, u/a
%endrep
    mov_line_end16x4_sse2 r4, r1, xmm4, %2
    ; BL
%rep 3
    mov_line_16x4_sse2 r5, r1, xmm5, a  ; dst, stride, xmm?, u/a
%endrep
    mov_line_end16x4_sse2 r5, r1, xmm5, a
    ; BR
%rep 3
    mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?, u/a
%endrep
    mov_line_end16x4_sse2 r6, r1, xmm6, %2
%endif
%endmacro
;***********************************************************************----------------
; void ExpandPictureLuma_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;
; Surrounds a luma plane with a 32-pixel border, written in place around pDst:
;   1. exp_top_bottom_sse2 32   - 32 rows above and below the picture
;   2. exp_left_right_sse2 32,a - 32 bytes left and right of every picture row
;   3. exp_cross_sse2 32,a      - the four 32x32 corner blocks
; The corner fill values are captured in xmm3 (top-left), xmm4 (top-right),
; xmm5 (bottom-left) and xmm6 (bottom-right) while the picture corners are
; being addressed anyway.
;
; Register naming (r0..r6) and LOAD_4_PARA / PUSH_XMM / SIGN_EXTENSION come
; from asm_inc.asm; the comments below assume r0 = pDst, r1 = iStride,
; r2 = iWidth, r3 = iHeight after LOAD_4_PARA -- confirm against asm_inc.asm.
; Every border store is movdqa, so pDst, iStride and iWidth are presumably
; required to be multiples of 16 -- verify against callers.
;***********************************************************************----------------
WELS_EXTERN ExpandPictureLuma_sse2
push r4
push r5
push r6
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
;corner value top-left -> xmm3
movzx r6d,byte[r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0] replicated
neg r1
lea r5,[r0+r1] ;r5 = pSrc - stride: top dst, border row nearest the picture
neg r1
push r3 ;save height; popped into r6 before the left/right pass
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
mov r6,r1 ;r6 = stride
sal r6,05h ;r6 = 32*stride
lea r4,[r3+r6] ;r4 = dst bottom = pSrc + (h+31)*stride, outermost border row
;corner values: bottom-left -> xmm5, bottom-right -> xmm6
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
lea r6,[r3+r2-1] ;last pixel of the last picture row
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
neg r1 ;r1 = -stride
;exp_top_bottom_sse2 advances r0, consumes r2 and flips r1 per column: save them
push r0
push r1
push r2
exp_top_bottom_sse2 32
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
pop r0
lea r5,[r0-32] ;left border dst = pSrc - 32 (luma pad is 32, chroma pad is 16)
lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;corner value top-right -> xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
pop r6 ; r6 = height
;exp_left_right_sse2 walks r0 down the plane and counts r6 down to 0: save them
push r0
push r1
push r2
push r6
exp_left_right_sse2 32,a
pop r6
pop r2
pop r1
pop r0
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; xmm3..xmm6 were initialised above; compute the four corner start rows and pad them
neg r1 ;r1 = -stride
lea r3,[r0-32]
lea r3,[r3+r1] ;top-left corner, row nearest the picture (pSrc - 32 - stride)
lea r4,[r0+r2] ;psrc +width
lea r4,[r4+r1] ;psrc +width -stride: top-right corner, row nearest the picture
neg r1 ;r1 = stride
add r6,32 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1 ;r6 = (height+32)*stride
lea r5,[r3+r6] ;bottom-left corner, outermost row
lea r6,[r4+r6] ;bottom-right corner, outermost row
neg r1 ; r1 = -stride: all four corners are filled walking upwards
; corner expanding
exp_cross_sse2 32,a
POP_XMM
LOAD_4_PARA_POP
pop r6
pop r5
pop r4
%assign push_num 0
ret
;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;
; Surrounds a chroma plane with a 16-pixel border, written in place around pDst:
;   1. exp_top_bottom_sse2 16   - 16 rows above and below the picture
;   2. exp_left_right_sse2 16,a - 16 bytes left and right of every picture row
;   3. exp_cross_sse2 16,a      - the four 16x16 corner blocks
; "Align" variant: the right border and right corners are stored with movdqa,
; so pDst + iWidth must be 16-byte aligned.
; The corner fill values are captured in xmm3 (top-left), xmm4 (top-right),
; xmm5 (bottom-left) and xmm6 (bottom-right).
;
; Register naming (r0..r6) and LOAD_4_PARA / PUSH_XMM / SIGN_EXTENSION come
; from asm_inc.asm; the comments below assume r0 = pDst, r1 = iStride,
; r2 = iWidth, r3 = iHeight after LOAD_4_PARA -- confirm against asm_inc.asm.
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaAlign_sse2
push r4
push r5
push r6
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r2,r2d
SIGN_EXTENSION r3,r3d
;corner value top-left -> xmm3
movzx r6d,byte [r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0] replicated
neg r1
lea r5,[r0+r1] ;r5 = pSrc - stride: top dst, border row nearest the picture
neg r1
push r3 ;save height; popped into r6 before the left/right pass
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
mov r6,r1 ;r6 = stride
sal r6,04h ;r6 = 16*stride
lea r4,[r3+r6] ;r4 = dst bottom = pSrc + (h+15)*stride, outermost border row
;corner values: bottom-left -> xmm5, bottom-right -> xmm6
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
lea r6,[r3+r2-1] ;last pixel of the last picture row
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
neg r1 ;r1 = -stride
;exp_top_bottom_sse2 advances r0, consumes r2 and flips r1 per column: save them
push r0
push r1
push r2
exp_top_bottom_sse2 16
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
pop r0
lea r5,[r0-16] ;left border dst = pSrc - 16 (chroma pad is 16, luma pad is 32)
lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;corner value top-right -> xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
pop r6 ; r6 = height
;exp_left_right_sse2 walks r0 down the plane and counts r6 down to 0: save them
push r0
push r1
push r2
push r6
exp_left_right_sse2 16,a
pop r6
pop r2
pop r1
pop r0
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; xmm3..xmm6 were initialised above; compute the four corner start rows and pad them
neg r1 ;r1 = -stride
lea r3,[r0-16]
lea r3,[r3+r1] ;top-left corner, row nearest the picture (pSrc - 16 - stride)
lea r4,[r0+r2] ;psrc +width
lea r4,[r4+r1] ;psrc +width -stride: top-right corner, row nearest the picture
neg r1 ;r1 = stride
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1 ;r6 = (height+16)*stride
lea r5,[r3+r6] ;bottom-left corner, outermost row
lea r6,[r4+r6] ;bottom-right corner, outermost row
neg r1 ; r1 = -stride: all four corners are filled walking upwards
; corner expanding
exp_cross_sse2 16,a
POP_XMM
LOAD_4_PARA_POP
pop r6
pop r5
pop r4
%assign push_num 0
ret
;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
; const int32_t iStride,
; const int32_t iWidth,
; const int32_t iHeight );
;
; Surrounds a chroma plane with a 16-pixel border, written in place around pDst:
;   1. exp_top_bottom_sse2 16   - 16 rows above and below the picture
;   2. exp_left_right_sse2 16,u - 16 bytes left and right of every picture row
;   3. exp_cross_sse2 16,u      - the four 16x16 corner blocks
; "Unalign" variant: the right border and right corners are stored with movdqu,
; so pDst + iWidth does not have to be 16-byte aligned.  Everything else is
; identical to ExpandPictureChromaAlign_sse2.
; The corner fill values are captured in xmm3 (top-left), xmm4 (top-right),
; xmm5 (bottom-left) and xmm6 (bottom-right).
;
; Register naming (r0..r6) and LOAD_4_PARA / PUSH_XMM / SIGN_EXTENSION come
; from asm_inc.asm; the comments below assume r0 = pDst, r1 = iStride,
; r2 = iWidth, r3 = iHeight after LOAD_4_PARA -- confirm against asm_inc.asm.
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaUnalign_sse2
push r4
push r5
push r6
%assign push_num 3
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r2,r2d
SIGN_EXTENSION r3,r3d
;corner value top-left -> xmm3
movzx r6d,byte [r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0] replicated
neg r1
lea r5,[r0+r1] ;r5 = pSrc - stride: top dst, border row nearest the picture
neg r1
push r3 ;save height; popped into r6 before the left/right pass
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
mov r6,r1 ;r6 = stride
sal r6,04h ;r6 = 16*stride
lea r4,[r3+r6] ;r4 = dst bottom = pSrc + (h+15)*stride, outermost border row
;corner values: bottom-left -> xmm5, bottom-right -> xmm6
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
lea r6,[r3+r2-1] ;last pixel of the last picture row
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
neg r1 ;r1 = -stride
;exp_top_bottom_sse2 advances r0, consumes r2 and flips r1 per column: save them
push r0
push r1
push r2
exp_top_bottom_sse2 16
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
pop r0
lea r5,[r0-16] ;left border dst = pSrc - 16 (chroma pad is 16, luma pad is 32)
lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;corner value top-right -> xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
pop r6 ; r6 = height
;exp_left_right_sse2 walks r0 down the plane and counts r6 down to 0: save them
push r0
push r1
push r2
push r6
exp_left_right_sse2 16,u
pop r6
pop r2
pop r1
pop r0
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; xmm3..xmm6 were initialised above; compute the four corner start rows and pad them
neg r1 ;r1 = -stride
lea r3,[r0-16]
lea r3,[r3+r1] ;top-left corner, row nearest the picture (pSrc - 16 - stride)
lea r4,[r0+r2] ;psrc +width
lea r4,[r4+r1] ;psrc +width -stride: top-right corner, row nearest the picture
neg r1 ;r1 = stride
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1 ;r6 = (height+16)*stride
lea r5,[r3+r6] ;bottom-left corner, outermost row
lea r6,[r4+r6] ;bottom-right corner, outermost row
neg r1 ; r1 = -stride: all four corners are filled walking upwards
; corner expanding
exp_cross_sse2 16,u
POP_XMM
LOAD_4_PARA_POP
pop r6
pop r5
pop r4
%assign push_num 0
ret