openh264/codec/decoder/core/asm/expand_picture.asm
2013-12-09 04:51:09 -08:00

656 lines
22 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* expand_picture.asm
;*
;* Abstract
;* mmxext/sse for expand_frame
;*
;* History
;* 09/25/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
BITS 32
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
;SECTION .rodata pData align=16
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************
;%define PADDING_SIZE_ASM 32 ; PADDING_LENGTH
;***********************************************************************
; Code
;***********************************************************************
; All routines in this file are emitted into the code section.
SECTION .text
; Earlier MMX entry points; kept commented out, no definitions for them exist in this file.
;WELS_EXTERN expand_picture_luma_mmx
;WELS_EXTERN expand_picture_chroma_mmx
; Entry points defined further down in this file.
; NOTE(review): WELS_EXTERN comes from asm_inc.asm; presumably it declares the symbol global - confirm there.
WELS_EXTERN ExpandPictureLuma_sse2
WELS_EXTERN ExpandPictureChromaAlign_sse2 ; for chroma alignment
WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
;;;;;;;expanding result;;;;;;;
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;----------------------------
;aaaa|attttttttttttttttb|bbbb
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;cccc|ceeeeeeeeeeeeeeeed|dddd
;----------------------------
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;-----------------------------------------------------------------------
; mov_line_8x4_mmx dst, stride, mm
;   Stores the 8 bytes held in %3 to four consecutive rows, starting at
;   [%1] and stepping by %2 bytes per row.
;   %1  general register with the destination address; on exit it has
;       been advanced by 4*%2 (one row past the last row written)
;   %2  general register with the row step (callers in this file pass
;       either +stride or -stride)
;   %3  MMX register with the data to replicate
;   Only lea is used for pointer arithmetic, so EFLAGS are not modified.
;-----------------------------------------------------------------------
%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
%rep 2                                  ; two row pairs = four rows
    movq    [%1], %3                    ; first row of the pair
    movq    [%1+%2], %3                 ; second row of the pair
    lea     %1, [%1+2*%2]               ; step over the pair
%endrep
%endmacro
;-----------------------------------------------------------------------
; mov_line_end8x4_mmx dst, stride, mm
;   Same four 8-byte row stores as mov_line_8x4_mmx, but %1 is advanced
;   by only 3*%2, so on exit it addresses the LAST row written rather
;   than the row after it.
;   %1  general register with the destination address (modified)
;   %2  general register with the row step
;   %3  MMX register with the data to replicate
;   EFLAGS are not modified.
;-----------------------------------------------------------------------
%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
    movq    [%1], %3                    ; row 0
    movq    [%1+%2], %3                 ; row 1
    movq    [%1+2*%2], %3               ; row 2
    lea     %1, [%1+2*%2]               ; %1 -> row 2
    movq    [%1+%2], %3                 ; row 3
    lea     %1, [%1+%2]                 ; %1 -> row 3 (stay on the last row)
%endmacro
;-----------------------------------------------------------------------
; mov_line_16x4_sse2 dst, stride, xmm, u/a
;   Stores the 16 bytes held in %3 to four consecutive rows, starting at
;   [%1] and stepping by %2 bytes per row.
;   %1  general register with the destination address; advanced by 4*%2
;   %2  general register with the row step
;   %3  XMM register with the data to replicate
;   %4  'a' selects movdqa (every row address must be 16-byte aligned),
;       'u' selects movdqu
;   EFLAGS are not modified.
;-----------------------------------------------------------------------
%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
%rep 2                                  ; two row pairs = four rows
    movdq%4 [%1], %3                    ; first row of the pair
    movdq%4 [%1+%2], %3                 ; second row of the pair
    lea     %1, [%1+2*%2]               ; step over the pair
%endrep
%endmacro
;-----------------------------------------------------------------------
; mov_line_end16x4_sse2 dst, stride, xmm, u/a
;   Same four 16-byte row stores as mov_line_16x4_sse2, but %1 is
;   advanced by only 3*%2, so on exit it addresses the LAST row written.
;   %1  general register with the destination address (modified)
;   %2  general register with the row step
;   %3  XMM register with the data to replicate
;   %4  'a' -> movdqa (aligned), 'u' -> movdqu (unaligned)
;   EFLAGS are not modified.
;-----------------------------------------------------------------------
%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3                    ; row 0
    movdq%4 [%1+%2], %3                 ; row 1
    movdq%4 [%1+2*%2], %3               ; row 2
    lea     %1, [%1+2*%2]               ; %1 -> row 2
    movdq%4 [%1+%2], %3                 ; row 3
    lea     %1, [%1+%2]                 ; %1 -> row 3 (stay on the last row)
%endmacro
;-----------------------------------------------------------------------
; mov_line_32x4_sse2 dst, stride, xmm
;   Fills four consecutive rows of 32 bytes each (two aligned 16-byte
;   stores per row) with the contents of %3, starting at [%1] and
;   stepping by %2 bytes per row.
;   %1  general register with the destination address; must be 16-byte
;       aligned on every row (movdqa); advanced by 4*%2 on exit
;   %2  general register with the row step
;   %3  XMM register with the data to replicate
;   EFLAGS are not modified.
;-----------------------------------------------------------------------
%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
%rep 2                                  ; two row pairs = four rows
    movdqa  [%1], %3                    ; first row, bytes 0..15
    movdqa  [%1+16], %3                 ; first row, bytes 16..31
    movdqa  [%1+%2], %3                 ; second row, bytes 0..15
    movdqa  [%1+%2+16], %3              ; second row, bytes 16..31
    lea     %1, [%1+2*%2]               ; step over the pair
%endrep
%endmacro
;-----------------------------------------------------------------------
; mov_line_end32x4_sse2 dst, stride, xmm
;   Same four 32-byte row fills as mov_line_32x4_sse2, but %1 is advanced
;   by only 3*%2, so on exit it addresses the LAST row written.
;   %1  general register with the destination address (16-byte aligned
;       on every row; modified)
;   %2  general register with the row step
;   %3  XMM register with the data to replicate
;   EFLAGS are not modified.
;-----------------------------------------------------------------------
%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
    movdqa  [%1], %3                    ; row 0, bytes 0..15
    movdqa  [%1+16], %3                 ; row 0, bytes 16..31
    movdqa  [%1+%2], %3                 ; row 1, bytes 0..15
    movdqa  [%1+%2+16], %3              ; row 1, bytes 16..31
    movdqa  [%1+2*%2], %3               ; row 2, bytes 0..15
    movdqa  [%1+2*%2+16], %3            ; row 2, bytes 16..31
    lea     %1, [%1+2*%2]               ; %1 -> row 2
    movdqa  [%1+%2], %3                 ; row 3, bytes 0..15
    movdqa  [%1+%2+16], %3              ; row 3, bytes 16..31
    lea     %1, [%1+%2]                 ; %1 -> row 3 (stay on the last row)
%endmacro
;-----------------------------------------------------------------------
; exp_top_bottom_sse2 iPaddingSize
;   Copies the first picture row into the %1 rows above the picture and
;   the last picture row into the %1 rows below it, one 16-byte column
;   at a time.
;
;   %1 = 32 (luma) or 16 (chroma); any other value expands to nothing.
;
;   Register contract on entry:
;     ebx  picture width in bytes
;     esi  address of the first picture row
;     edi  esi - stride (border row directly above the picture)
;     eax  address of the last picture row
;     ebp  eax + %1*stride (outermost row of the bottom border)
;     ecx  -stride
;   All row accesses use movdqa, so every address above must be 16-byte
;   aligned and the stride a multiple of 16.
;
;   Each column is filled with %1/4 four-row groups whose last group
;   leaves the pointer on the final row written; ecx is then negated so
;   the next column is filled in the opposite vertical direction.
;
;   Width handling:
;     width <= 0        nothing is written
;     luma              width/16 columns are written; any width%16
;                       remainder is not padded
;     chroma            width/16 columns, then ONE 8-byte MMX tail column
;                       if width%16 != 0 (this also covers widths 1..15,
;                       which previously fell into the 16-byte loop with
;                       a zero count and looped ~2^32 times)
;
;   Clobbers: eax, ebx, ecx, esi, edi, ebp, xmm0, xmm1, EFLAGS;
;             chroma additionally edx, mm0, mm1.
;   Defines local labels .top_bottom_loops plus .top_bottom_done (luma)
;   or .top_bottom_tail / .to_be_continued (chroma), so it can be
;   expanded at most once per non-local label.
;-----------------------------------------------------------------------
%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
%if %1 == 32 ; for luma
    sar     ebx, 04h                    ; ebx = number of 16-byte columns
    test    ebx, ebx
    jle     near .top_bottom_done       ; width < 16: the do-while loop below must not be entered
.top_bottom_loops:
    ; top: 32 rows from the first picture row
    movdqa  xmm0, [esi]
  %rep 7
    mov_line_16x4_sse2 edi, ecx, xmm0, a
  %endrep
    mov_line_end16x4_sse2 edi, ecx, xmm0, a
    ; bottom: 32 rows from the last picture row
    movdqa  xmm1, [eax]
  %rep 7
    mov_line_16x4_sse2 ebp, ecx, xmm1, a
  %endrep
    mov_line_end16x4_sse2 ebp, ecx, xmm1, a
    lea     esi, [esi+16]               ; next source column, top
    lea     edi, [edi+16]               ; next destination column, top
    lea     eax, [eax+16]               ; next source column, bottom
    lea     ebp, [ebp+16]               ; next destination column, bottom
    neg     ecx                         ; fill the next column in the opposite direction
    dec     ebx
    jnz     near .top_bottom_loops
.top_bottom_done:
%elif %1 == 16 ; for chroma
    mov     edx, ebx                    ; keep the full width for the tail check
    sar     ebx, 04h                    ; ebx = number of 16-byte columns
    test    edx, edx
    jle     near .to_be_continued       ; width <= 0: nothing to expand
    test    ebx, ebx
    jz      near .top_bottom_tail       ; width < 16: only the 8-byte tail applies
.top_bottom_loops:
    ; top: 16 rows from the first picture row
    movdqa  xmm0, [esi]
  %rep 3
    mov_line_16x4_sse2 edi, ecx, xmm0, a
  %endrep
    mov_line_end16x4_sse2 edi, ecx, xmm0, a
    ; bottom: 16 rows from the last picture row
    movdqa  xmm1, [eax]
  %rep 3
    mov_line_16x4_sse2 ebp, ecx, xmm1, a
  %endrep
    mov_line_end16x4_sse2 ebp, ecx, xmm1, a
    lea     esi, [esi+16]               ; next source column, top
    lea     edi, [edi+16]               ; next destination column, top
    lea     eax, [eax+16]               ; next source column, bottom
    lea     ebp, [ebp+16]               ; next destination column, bottom
    neg     ecx                         ; fill the next column in the opposite direction
    dec     ebx
    jnz     near .top_bottom_loops
.top_bottom_tail:
    ; remaining 8 bytes, if the width is not a multiple of 16
    and     edx, 0fh                    ; sets ZF when nothing is left
    jz      near .to_be_continued
    ; top
    movq    mm0, [esi]
  %rep 3
    mov_line_8x4_mmx edi, ecx, mm0
  %endrep
    mov_line_end8x4_mmx edi, ecx, mm0
    ; bottom
    movq    mm1, [eax]
  %rep 3
    mov_line_8x4_mmx ebp, ecx, mm1
  %endrep
    mov_line_end8x4_mmx ebp, ecx, mm1
    WELSEMMS                            ; from asm_inc.asm; presumably leaves MMX state (emms) - confirm there
.to_be_continued:
%endif
%endmacro
;-----------------------------------------------------------------------
; exp_left_right_sse2 iPaddingSize, u/a
;   For each picture row, fills the %1 bytes to the left of the row with
;   the row's first pixel and the %1 bytes to the right with its last
;   pixel.
;
;   %1 = 32 (luma) or 16 (chroma); any other value expands to nothing.
;   %2 = 'a' or 'u': store flavour (movdqa / movdqu) for the chroma
;        right-hand border, which may not be 16-byte aligned. It is not
;        used when %1 == 32; all other stores are always movdqa.
;
;   Register contract on entry:
;     ecx  number of rows (the loop body runs before the count is
;          tested, so it must be at least 1)
;     edx  stride
;     esi  address of the first pixel of the first row (left source)
;     edi  esi - %1 (left destination)
;     ebx  address of the last pixel of the first row (right source)
;     ebp  ebx + 1 (right destination)
;   Only al is loaded from memory; the upper bits of eax are left as
;   they are (assumes butterfly_1to16_sse ignores them - confirm in
;   asm_inc.asm).
;
;   Clobbers: al/eax, ecx, esi, edi, ebx, ebp, xmm0, xmm1, xmm2, EFLAGS.
;   Defines the local label .left_right_loops.
;-----------------------------------------------------------------------
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
%if (%1 == 32) || (%1 == 16)
.left_right_loops:
    ; left border
    mov     al, byte [esi]              ; first pixel of this row
    butterfly_1to16_sse xmm0, xmm1, a   ; dst, tmp, source register letter
    movdqa  [edi], xmm0
  %if %1 == 32
    movdqa  [edi+16], xmm0              ; luma border is 32 bytes wide
  %endif
    ; right border
    mov     al, byte [ebx]              ; last pixel of this row
    butterfly_1to16_sse xmm1, xmm2, a   ; dst, tmp, source register letter
  %if %1 == 32
    movdqa  [ebp], xmm1
    movdqa  [ebp+16], xmm1
  %else
    movdq%2 [ebp], xmm1                 ; might not be 16-byte aligned for chroma planes
  %endif
    ; advance all four pointers to the next row
    lea     esi, [esi+edx]
    lea     edi, [edi+edx]
    lea     ebx, [ebx+edx]
    lea     ebp, [ebp+edx]
    dec     ecx
    jnz     near .left_right_loops
%endif
%endmacro
;-----------------------------------------------------------------------
; exp_cross_sse2 iPaddingSize, u/a
;   Fills the four %1 x %1 corner areas of the border with a constant.
;
;   %1 = 32 (luma) or 16 (chroma); any other value expands to nothing.
;   %2 = 'a' or 'u': store flavour for the two right-hand corners in the
;        chroma case. The left-hand corners and all luma stores are
;        always aligned (movdqa).
;
;   Register contract on entry:
;     xmm3 / xmm4 / xmm5 / xmm6  fill values for the top-left, top-right,
;                                bottom-left and bottom-right corners
;     edi / ebp / eax / ebx      start address of the respective corner;
;                                rows are written from there in steps
;                                of ecx
;     ecx                        row step (the callers in this file pass
;                                -stride together with the address of
;                                the lowest row of each corner)
;
;   Clobbers: edi, ebp, eax, ebx (each ends on the last row it wrote).
;   EFLAGS are not modified.
;-----------------------------------------------------------------------
%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
%if %1 == 32 ; luma: 8 groups of 4 rows, 32 bytes per row
    ; top-left
  %rep 7
    mov_line_32x4_sse2 edi, ecx, xmm3
  %endrep
    mov_line_end32x4_sse2 edi, ecx, xmm3
    ; top-right
  %rep 7
    mov_line_32x4_sse2 ebp, ecx, xmm4
  %endrep
    mov_line_end32x4_sse2 ebp, ecx, xmm4
    ; bottom-left
  %rep 7
    mov_line_32x4_sse2 eax, ecx, xmm5
  %endrep
    mov_line_end32x4_sse2 eax, ecx, xmm5
    ; bottom-right
  %rep 7
    mov_line_32x4_sse2 ebx, ecx, xmm6
  %endrep
    mov_line_end32x4_sse2 ebx, ecx, xmm6
%elif %1 == 16 ; chroma: 4 groups of 4 rows, 16 bytes per row
    ; top-left
  %rep 3
    mov_line_16x4_sse2 edi, ecx, xmm3, a
  %endrep
    mov_line_end16x4_sse2 edi, ecx, xmm3, a
    ; top-right
  %rep 3
    mov_line_16x4_sse2 ebp, ecx, xmm4, %2
  %endrep
    mov_line_end16x4_sse2 ebp, ecx, xmm4, %2
    ; bottom-left
  %rep 3
    mov_line_16x4_sse2 eax, ecx, xmm5, a
  %endrep
    mov_line_end16x4_sse2 eax, ecx, xmm5, a
    ; bottom-right
  %rep 3
    mov_line_16x4_sse2 ebx, ecx, xmm6, %2
  %endrep
    mov_line_end16x4_sse2 ebx, ecx, xmm6, %2
%endif
%endmacro
ALIGN 16
;***********************************************************************
; void ExpandPictureLuma_sse2( uint8_t *pDst,
;                              const int32_t kiStride,
;                              const int32_t kiWidth,
;                              const int32_t kiHeight );
;
; Surrounds a kiWidth x kiHeight picture stored at pDst with a 32-pixel
; border: the top/bottom rows and the left/right columns are replicated
; outwards, and the four corner areas are filled with the corresponding
; corner pixel.
;
; 32-bit code, arguments on the stack. After the five pushes below:
;   [esp+24] pDst   [esp+28] kiStride   [esp+32] kiWidth   [esp+36] kiHeight
;
; Every store is aligned (movdqa), including the right-hand border at
; pDst + kiWidth, so pDst, kiStride and kiWidth all have to be multiples
; of 16.
; NOTE(review): the loops in the helper macros are entered before their
; counters are tested; assumes kiWidth >= 16 and kiHeight >= 1 - confirm
; against the callers.
;
; ebx, edx, esi, edi, ebp are saved and restored; eax, ecx, xmm0..xmm6
; and EFLAGS are clobbered.
;***********************************************************************
ExpandPictureLuma_sse2:
    push    ebx
    push    edx
    push    esi
    push    edi
    push    ebp

    ;---- stage 1: top and bottom borders -------------------------------
    mov     esi, [esp+24]               ; esi = pDst (first picture row)
    mov     edx, [esp+28]               ; edx = kiStride
    mov     ebx, [esp+32]               ; ebx = kiWidth
    mov     eax, [esp+36]               ; eax = kiHeight

    ; Corner fill values are captured first, while the picture corners are
    ; easy to address. butterfly_1to16_sse (asm_inc.asm) takes dst, tmp and
    ; a register letter; it is assumed to replicate the low byte of that
    ; register into all 16 bytes of dst.
    mov     cl, byte [esi]              ; top-left pixel
    butterfly_1to16_sse xmm3, xmm4, c   ; xmm3 = top-left fill

    dec     eax                         ; kiHeight - 1
    imul    eax, edx                    ; (kiHeight - 1) * kiStride
    lea     eax, [esi+eax]              ; eax = last picture row

    mov     cl, byte [eax]              ; bottom-left pixel
    butterfly_1to16_sse xmm5, xmm6, c   ; xmm5 = bottom-left fill
    mov     cl, byte [eax+ebx-1]        ; bottom-right pixel
    butterfly_1to16_sse xmm6, xmm4, c   ; xmm6 = bottom-right fill

    mov     ecx, edx
    neg     ecx                         ; ecx = -kiStride
    lea     edi, [esi+ecx]              ; border row directly above the picture
    shl     edx, 05h                    ; 32 * kiStride
    lea     ebp, [eax+edx]              ; outermost row of the bottom border
    exp_top_bottom_sse2 32              ; ebx still holds kiWidth here

    ;---- stage 2: left and right borders -------------------------------
    mov     esi, [esp+24]               ; left source: first pixel of row 0
    mov     ebx, [esp+32]               ; kiWidth
    lea     edi, [esi-32]               ; left destination
    lea     ebp, [esi+ebx]              ; right destination: pDst + kiWidth
    lea     ebx, [ebp-1]                ; right source: last pixel of row 0
    mov     edx, [esp+28]               ; kiStride
    mov     ecx, [esp+36]               ; row counter = kiHeight

    mov     al, byte [ebx]              ; top-right pixel
    butterfly_1to16_sse xmm4, xmm0, a   ; xmm4 = top-right fill
    exp_left_right_sse2 32, a

    ;---- stage 3: the four corners -------------------------------------
    ; xmm3..xmm6 were prepared above.
    mov     esi, [esp+24]               ; pDst
    mov     ecx, [esp+28]               ; kiStride
    mov     ebx, [esp+32]               ; kiWidth
    mov     edx, [esp+36]               ; kiHeight
    add     edx, 32
    imul    edx, ecx                    ; (kiHeight + 32) * kiStride
    neg     ecx                         ; ecx = -kiStride
    lea     edi, [esi+ecx-32]           ; lowest row of the top-left corner
    lea     ebp, [edi+ebx+32]           ; lowest row of the top-right corner (pDst + kiWidth - kiStride)
    lea     eax, [edi+edx]              ; lowest row of the bottom-left corner
    lea     ebx, [ebp+edx]              ; lowest row of the bottom-right corner
    exp_cross_sse2 32, a

;   sfence                              ; commit cache write back memory

    pop     ebp
    pop     edi
    pop     esi
    pop     edx
    pop     ebx
    ret
ALIGN 16
;***********************************************************************
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
;                                     const int32_t kiStride,
;                                     const int32_t kiWidth,
;                                     const int32_t kiHeight );
;
; Surrounds a kiWidth x kiHeight chroma plane stored at pDst with a
; 16-pixel border: the top/bottom rows and the left/right columns are
; replicated outwards, and the four corner areas are filled with the
; corresponding corner pixel.
;
; 32-bit code, arguments on the stack. After the five pushes below:
;   [esp+24] pDst   [esp+28] kiStride   [esp+32] kiWidth   [esp+36] kiHeight
;
; "Align" variant: the right-hand border and corners at pDst + kiWidth
; are written with movdqa as well, so pDst, kiStride and kiWidth all
; have to be multiples of 16.
; NOTE(review): the loops in the helper macros are entered before their
; counters are tested; assumes kiWidth >= 16 and kiHeight >= 1 - confirm
; against the callers.
;
; ebx, edx, esi, edi, ebp are saved and restored; eax, ecx, xmm0..xmm6
; and EFLAGS are clobbered.
;***********************************************************************
ExpandPictureChromaAlign_sse2:
    push    ebx
    push    edx
    push    esi
    push    edi
    push    ebp

    ;---- stage 1: top and bottom borders -------------------------------
    mov     esi, [esp+24]               ; esi = pDst (first picture row)
    mov     edx, [esp+28]               ; edx = kiStride
    mov     ebx, [esp+32]               ; ebx = kiWidth
    mov     eax, [esp+36]               ; eax = kiHeight

    ; Corner fill values are captured first, while the picture corners are
    ; easy to address. butterfly_1to16_sse (asm_inc.asm) takes dst, tmp and
    ; a register letter; it is assumed to replicate the low byte of that
    ; register into all 16 bytes of dst.
    mov     cl, byte [esi]              ; top-left pixel
    butterfly_1to16_sse xmm3, xmm4, c   ; xmm3 = top-left fill

    dec     eax                         ; kiHeight - 1
    imul    eax, edx                    ; (kiHeight - 1) * kiStride
    lea     eax, [esi+eax]              ; eax = last picture row

    mov     cl, byte [eax]              ; bottom-left pixel
    butterfly_1to16_sse xmm5, xmm6, c   ; xmm5 = bottom-left fill
    mov     cl, byte [eax+ebx-1]        ; bottom-right pixel
    butterfly_1to16_sse xmm6, xmm4, c   ; xmm6 = bottom-right fill

    mov     ecx, edx
    neg     ecx                         ; ecx = -kiStride
    lea     edi, [esi+ecx]              ; border row directly above the picture
    shl     edx, 04h                    ; 16 * kiStride
    lea     ebp, [eax+edx]              ; outermost row of the bottom border
    exp_top_bottom_sse2 16              ; ebx still holds kiWidth here

    ;---- stage 2: left and right borders -------------------------------
    mov     esi, [esp+24]               ; left source: first pixel of row 0
    mov     ebx, [esp+32]               ; kiWidth
    lea     edi, [esi-16]               ; left destination
    lea     ebp, [esi+ebx]              ; right destination: pDst + kiWidth
    lea     ebx, [ebp-1]                ; right source: last pixel of row 0
    mov     edx, [esp+28]               ; kiStride
    mov     ecx, [esp+36]               ; row counter = kiHeight

    mov     al, byte [ebx]              ; top-right pixel
    butterfly_1to16_sse xmm4, xmm0, a   ; xmm4 = top-right fill
    exp_left_right_sse2 16, a

    ;---- stage 3: the four corners -------------------------------------
    ; xmm3..xmm6 were prepared above.
    mov     esi, [esp+24]               ; pDst
    mov     ecx, [esp+28]               ; kiStride
    mov     ebx, [esp+32]               ; kiWidth
    mov     edx, [esp+36]               ; kiHeight
    add     edx, 16
    imul    edx, ecx                    ; (kiHeight + 16) * kiStride
    neg     ecx                         ; ecx = -kiStride
    lea     edi, [esi+ecx-16]           ; lowest row of the top-left corner
    lea     ebp, [edi+ebx+16]           ; lowest row of the top-right corner (pDst + kiWidth - kiStride)
    lea     eax, [edi+edx]              ; lowest row of the bottom-left corner
    lea     ebx, [ebp+edx]              ; lowest row of the bottom-right corner
    exp_cross_sse2 16, a

;   sfence                              ; commit cache write back memory

    pop     ebp
    pop     edi
    pop     esi
    pop     edx
    pop     ebx
    ret
ALIGN 16
;***********************************************************************
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
;                                       const int32_t kiStride,
;                                       const int32_t kiWidth,
;                                       const int32_t kiHeight );
;
; Surrounds a kiWidth x kiHeight chroma plane stored at pDst with a
; 16-pixel border: the top/bottom rows and the left/right columns are
; replicated outwards, and the four corner areas are filled with the
; corresponding corner pixel.
;
; 32-bit code, arguments on the stack. After the five pushes below:
;   [esp+24] pDst   [esp+28] kiStride   [esp+32] kiWidth   [esp+36] kiHeight
;
; "Unalign" variant: the right-hand border and corners at pDst + kiWidth
; are written with movdqu, so kiWidth does not have to be a multiple of
; 16. The top/bottom and left-hand stores are still movdqa, so pDst and
; kiStride have to be multiples of 16.
; NOTE(review): the loops in the helper macros are entered before their
; counters are tested; assumes kiWidth >= 16 and kiHeight >= 1 - confirm
; against the callers.
;
; ebx, edx, esi, edi, ebp are saved and restored; eax, ecx, xmm0..xmm6,
; mm0, mm1 and EFLAGS are clobbered.
;***********************************************************************
ExpandPictureChromaUnalign_sse2:
    push    ebx
    push    edx
    push    esi
    push    edi
    push    ebp

    ;---- stage 1: top and bottom borders -------------------------------
    mov     esi, [esp+24]               ; esi = pDst (first picture row)
    mov     edx, [esp+28]               ; edx = kiStride
    mov     ebx, [esp+32]               ; ebx = kiWidth
    mov     eax, [esp+36]               ; eax = kiHeight

    ; Corner fill values are captured first, while the picture corners are
    ; easy to address. butterfly_1to16_sse (asm_inc.asm) takes dst, tmp and
    ; a register letter; it is assumed to replicate the low byte of that
    ; register into all 16 bytes of dst.
    mov     cl, byte [esi]              ; top-left pixel
    butterfly_1to16_sse xmm3, xmm4, c   ; xmm3 = top-left fill

    dec     eax                         ; kiHeight - 1
    imul    eax, edx                    ; (kiHeight - 1) * kiStride
    lea     eax, [esi+eax]              ; eax = last picture row

    mov     cl, byte [eax]              ; bottom-left pixel
    butterfly_1to16_sse xmm5, xmm6, c   ; xmm5 = bottom-left fill
    mov     cl, byte [eax+ebx-1]        ; bottom-right pixel
    butterfly_1to16_sse xmm6, xmm4, c   ; xmm6 = bottom-right fill

    mov     ecx, edx
    neg     ecx                         ; ecx = -kiStride
    lea     edi, [esi+ecx]              ; border row directly above the picture
    shl     edx, 04h                    ; 16 * kiStride
    lea     ebp, [eax+edx]              ; outermost row of the bottom border
    exp_top_bottom_sse2 16              ; ebx still holds kiWidth here

    ;---- stage 2: left and right borders -------------------------------
    mov     esi, [esp+24]               ; left source: first pixel of row 0
    mov     ebx, [esp+32]               ; kiWidth
    lea     edi, [esi-16]               ; left destination
    lea     ebp, [esi+ebx]              ; right destination: pDst + kiWidth
    lea     ebx, [ebp-1]                ; right source: last pixel of row 0
    mov     edx, [esp+28]               ; kiStride
    mov     ecx, [esp+36]               ; row counter = kiHeight

    mov     al, byte [ebx]              ; top-right pixel
    butterfly_1to16_sse xmm4, xmm0, a   ; xmm4 = top-right fill
    exp_left_right_sse2 16, u

    ;---- stage 3: the four corners -------------------------------------
    ; xmm3..xmm6 were prepared above.
    mov     esi, [esp+24]               ; pDst
    mov     ecx, [esp+28]               ; kiStride
    mov     ebx, [esp+32]               ; kiWidth
    mov     edx, [esp+36]               ; kiHeight
    add     edx, 16
    imul    edx, ecx                    ; (kiHeight + 16) * kiStride
    neg     ecx                         ; ecx = -kiStride
    lea     edi, [esi+ecx-16]           ; lowest row of the top-left corner
    lea     ebp, [edi+ebx+16]           ; lowest row of the top-right corner (pDst + kiWidth - kiStride)
    lea     eax, [edi+edx]              ; lowest row of the bottom-left corner
    lea     ebx, [ebp+edx]              ; lowest row of the bottom-right corner
    exp_cross_sse2 16, u

;   sfence                              ; commit cache write back memory

    pop     ebp
    pop     edi
    pop     esi
    pop     edx
    pop     ebx
    ret