; openh264/codec/decoder/core/asm/intra_pred.asm
; 2013-12-09 04:51:09 -08:00
;
; 1499 lines
; 40 KiB
; NASM
;
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* intra_pred.asm
;*
;* Abstract
;* sse2 and mmx function for intra predict operations(decoder)
;*
;* History
;* 18/09/2009 Created
;* 19/11/2010 Added
;* WelsI16x16LumaPredDcTop_sse2, WelsI16x16LumaPredDcNA_sse2,
;* WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
;* and WelsIChromaPredDcNA_mmx
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
BITS 32 ; x86-32 only: every routine below reads cdecl args from the stack
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
%ifdef FORMAT_COFF
SECTION .rodata data ; COFF output does not accept the align qualifier here
%else
SECTION .rodata align=16
%endif
%if 1 ; set to 0 to omit the trailing emms (e.g. for profiling experiments)
%define WELSEMMS emms ; clear MMX state so x87 code can run after MMX routines
%else
%define WELSEMMS
%endif
align 16
sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0 ; (x-7) ramp, left half of a 16-wide plane row
align 16
sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8 ; +1..+8 ramp (right half / gradient weights)
align 16
sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1 ; mirrored 8..1 gradient weights
; for chroma plane mode
sse2_plane_inc_c dw 1, 2, 3, 4 ; 4-wide ramps: chroma blocks are 8x8
sse2_plane_dec_c dw 4, 3, 2, 1
align 16
sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4 ; per-column (x-3) multiplier for chroma plane
align 16
mmx_01bytes: times 16 db 1 ; 0x01 bytes: byte-broadcast multiplier / odd-bit mask
align 16
mmx_0x02: dw 0x02, 0x00, 0x00, 0x00 ; rounding constant for the chroma DC shifts
align 16
sse2_dc_0x80: times 16 db 0x80 ; DC fallback value 128 (no neighbours available)
align 16
sse2_wd_0x02: times 8 dw 0x02 ; word 2: rounding before >>2
;*******************************************************************************
; macros
;*******************************************************************************
; SSE2_PRED_H_4X4_TWO_LINE dst, tmp_dst2, tmp, addr_reg, stride
; Broadcasts the left-neighbour byte of two consecutive rows ([addr-1] and
; [addr+stride-1]) into 4 bytes each; suggested registers xmm0,xmm1,xmm2,eax,ecx.
; The lower 64 bits of %1 hold the result (row n pattern in dword 0,
; row n+1 pattern in dword 1).
%macro SSE2_PRED_H_4X4_TWO_LINE 5
movd %1, [%4-1] ; 4 bytes starting at the left neighbour of row n
movdqa %3, %1
punpcklbw %1, %3 ; duplicate each byte once
movdqa %3, %1
punpcklbw %1, %3 ; low dword = left-neighbour byte replicated x4
;add %4, %5
movd %2, [%4+%5-1] ; same for row n+1 (addr+stride)
movdqa %3, %2
punpcklbw %2, %3
movdqa %3, %2
punpcklbw %2, %3
punpckldq %1, %2 ; pack both row patterns into the low qword of %1
%endmacro
; LOAD_COLUMN dst, tmp, tmp2, tmp3, addr_reg, stride
; Gathers the first byte of 8 vertically adjacent rows (stride apart, starting
; at [addr_reg]) into the HIGH 8 bytes of %1; callers extract them with
; punpckhbw. Side effect: advances %5 by 8*stride.
%macro LOAD_COLUMN 6
movd %1, [%5]
movd %2, [%5+%6]
punpcklbw %1, %2 ; interleave rows 0,1
lea %5, [%5+2*%6]
movd %3, [%5]
movd %2, [%5+%6]
punpcklbw %3, %2 ; rows 2,3
punpcklwd %1, %3 ; rows 0..3
lea %5, [%5+2*%6]
movd %4, [%5]
movd %2, [%5+%6]
punpcklbw %4, %2 ; rows 4,5
lea %5, [%5+2*%6]
movd %3, [%5]
movd %2, [%5+%6]
lea %5, [%5+2*%6]
punpcklbw %3, %2 ; rows 6,7
punpcklwd %4, %3 ; rows 4..7
punpckhdq %1, %4 ; column bytes end up in the high qword of %1
%endmacro
; SUMW_HORIZON x1, tmp, interleave_src
; Horizontal add of the 8 words of %1; the low WORD of %1 holds the sum.
; %3 is interleaved into the high halves by punpcklwd, so unless %3 is zero
; only the low 16 bits of the result are meaningful (callers use movsx eax, ax).
%macro SUMW_HORIZON 3
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro
; COPY_16_TIMES addr_reg, dst_xmm
; Broadcasts the left-neighbour byte [addr_reg-1] into all 16 bytes of %2.
; Uses an aligned 16-byte load ending at that byte, so addr_reg must be
; 16-byte aligned (guaranteed for the pPred buffers these kernels target).
%macro COPY_16_TIMES 2
movdqa %2, [%1-16] ; aligned load whose top byte is [addr-1]
psrldq %2, 15 ; keep only that byte in lane 0
pmuludq %2, [mmx_01bytes] ; *0x01010101: replicate into the low dword
pshufd %2, %2, 0 ; replicate the dword across the register
%endmacro
; COPY_16_TIMESS addr_reg, dst_xmm, stride
; Same as COPY_16_TIMES but for the next row: broadcasts [addr_reg+stride-1].
%macro COPY_16_TIMESS 3
movdqa %2, [%1+%3-16]
psrldq %2, 15
pmuludq %2, [mmx_01bytes]
pshufd %2, %2, 0
%endmacro
; LOAD_COLUMN_C dst, tmp, tmp2, unused, addr_reg, stride
; Chroma variant of LOAD_COLUMN: gathers the first byte of 4 vertically
; adjacent rows into the HIGH 4 bytes of MMX register %1 (extracted later via
; punpckhbw). %4 is unused. Side effect: advances %5 by 4*stride.
%macro LOAD_COLUMN_C 6
movd %1, [%5]
movd %2, [%5+%6]
punpcklbw %1,%2 ; rows 0,1
lea %5, [%5+2*%6]
movd %3, [%5]
movd %2, [%5+%6]
punpcklbw %3, %2 ; rows 2,3
punpckhwd %1, %3 ; rows 0..3 into the high dword of %1
lea %5, [%5+2*%6]
%endmacro
; LOAD_2_LEFT_AND_ADD
; Steps eax down two rows and accumulates the two left-neighbour pixels
; ([eax-1] and [eax+ecx-1] after the step) into ebx. Clobbers edx.
%macro LOAD_2_LEFT_AND_ADD 0
lea eax, [eax+2*ecx]
movzx edx, byte [eax-0x01]
add ebx, edx
movzx edx, byte [eax+ecx-0x01]
add ebx, edx
%endmacro
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
WELS_EXTERN WelsI4x4LumaPredH_sse2
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
WELS_EXTERN WelsI4x4LumaPredDc_sse2
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
;
; 4x4 horizontal prediction: each row is filled with its left-neighbour pixel
; pPred[row*kiStride - 1]. Clobbers eax, ecx, edx, xmm0-xmm3.
; pPred must align to 16
;*******************************************************************************
WelsI4x4LumaPredH_sse2:
mov eax, [esp+4] ;pPred
mov ecx, [esp+8] ;kiStride
movzx edx, byte [eax-1] ; left neighbour of row 0
movd xmm0, edx
pmuludq xmm0, [mmx_01bytes] ; *0x01010101: broadcast byte into 4 bytes
movzx edx, byte [eax+ecx-1] ; row 1
movd xmm1, edx
pmuludq xmm1, [mmx_01bytes]
lea eax, [eax+ecx]
movzx edx, byte [eax+ecx-1] ; row 2
movd xmm2, edx
pmuludq xmm2, [mmx_01bytes]
movzx edx, byte [eax+2*ecx-1] ; row 3
movd xmm3, edx
pmuludq xmm3, [mmx_01bytes]
sub eax, ecx ; back to row 0
movd [eax], xmm0
movd [eax+ecx], xmm1
lea eax, [eax+2*ecx]
movd [eax], xmm2
movd [eax+ecx], xmm3
ret
;*******************************************************************************
; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;
; 16x16 plane prediction: fits a gradient from the top row and left column,
; then writes clip((a + b*(x-7) + c*(y-7) + 16) >> 5) per pixel with
; b = (5H+32)>>6 and c = (5V+32)>>6 (see the H.264 spec, Intra_16x16 Plane).
; esi = pixel pointer, ecx = kiStride; pPred must be 16-byte aligned (movdqa).
;*******************************************************************************
WelsI16x16LumaPredPlane_sse2:
%define pushsize 4
push esi
mov esi, [esp + pushsize + 4] ; pPred
mov ecx, [esp + pushsize + 8] ; kiStride
sub esi, 1
sub esi, ecx ; esi -> top-left corner neighbour (pPred - kiStride - 1)
;for H
pxor xmm7, xmm7
movq xmm0, [esi] ; top[-1..6]
movdqa xmm5, [sse2_plane_dec] ; weights 8..1
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
movq xmm1, [esi + 9] ; top[8..15]
movdqa xmm6, [sse2_plane_inc] ; weights 1..8
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0 ; (i+1)*(top[8+i] - top[6-i]) per lane
SUMW_HORIZON xmm1,xmm0,xmm2
movd eax, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
movsx eax, ax ; only the low word of the horizontal sum is valid
imul eax, 5
add eax, 32
sar eax, 6 ; b = (5 * H + 32) >> 6;
SSE2_Copy8Times xmm1, eax ; xmm1 = b,b,b,b,b,b,b,b
movzx edx, BYTE [esi+16] ; top[15]
sub esi, 3
LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, esi, ecx ; upper half of left column (advances esi)
add esi, 3
movzx eax, BYTE [esi+8*ecx] ; left[15*kiStride]
add edx, eax
shl edx, 4 ; a = (left[15*kiStride] + top[15]) << 4;
sub esi, 3
add esi, ecx
LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, esi, ecx ; lower half of left column
pxor xmm4, xmm4
punpckhbw xmm0, xmm4 ; column bytes live in the high qword
pmullw xmm0, xmm5
punpckhbw xmm7, xmm4
pmullw xmm7, xmm6
psubw xmm7, xmm0 ; (i+1)*(left[(8+i)*s] - left[(6-i)*s]) per lane
SUMW_HORIZON xmm7,xmm0,xmm2
movd eax, xmm7 ; V
movsx eax, ax
imul eax, 5
add eax, 32
sar eax, 6 ; c = (5 * V + 32) >> 6;
SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
mov esi, [esp + pushsize + 4] ; rewind esi to pPred for the store loop
add edx, 16
imul eax, -7
add edx, eax ; s = a + 16 + (-7)*c, i.e. row-0 base value
SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
xor eax, eax ; row counter
movdqa xmm5, [sse2_plane_inc_minus] ; (x-7) for columns 0..7
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5 ; b*(x-7), left 8 columns
paddw xmm2, xmm0
psraw xmm2, 5
movdqa xmm3, xmm1
pmullw xmm3, xmm6 ; b*(x-7), right 8 columns ((x-7) = 1..8)
paddw xmm3, xmm0
psraw xmm3, 5
packuswb xmm2, xmm3 ; saturate to [0,255] and pack 16 pixels
movdqa [esi], xmm2
paddw xmm0, xmm4 ; advance base by c for the next row
add esi, ecx
inc eax
cmp eax, 16
jnz get_i16x16_luma_pred_plane_sse2_1
pop esi
ret
;*******************************************************************************
; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
; Emits two more rows of the 16x16 horizontal prediction: advances eax two
; rows, then stores each row's broadcast left-neighbour byte (COPY_16_TIMES).
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
lea eax, [eax+ecx*2]
COPY_16_TIMES eax, xmm0
movdqa [eax], xmm0
COPY_16_TIMESS eax, xmm0, ecx
movdqa [eax+ecx], xmm0
%endmacro
WELS_EXTERN WelsI16x16LumaPredH_sse2
; 16x16 horizontal prediction: every row is filled with its left neighbour.
; pPred must be 16-byte aligned (aligned loads/stores throughout).
WelsI16x16LumaPredH_sse2:
mov eax, [esp+4] ; pPred
mov ecx, [esp+8] ; kiStride
COPY_16_TIMES eax, xmm0 ; row 0
movdqa [eax], xmm0
COPY_16_TIMESS eax, xmm0, ecx ; row 1
movdqa [eax+ecx], xmm0
SSE2_PRED_H_16X16_TWO_LINE_DEC ; rows 2..15, two per macro expansion
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
ret
;*******************************************************************************
; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;
; 16x16 vertical prediction: replicate the row above the block
; (pPred - kiStride) into all 16 rows. pPred must be 16-byte aligned.
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredV_sse2
WelsI16x16LumaPredV_sse2:
mov eax, [esp+4] ; pPred
mov ecx, [esp+8] ; kiStride
sub eax, ecx ; eax -> top neighbour row
movdqa xmm0, [eax] ; load the 16 top pixels once
%rep 8 ; unrolled: two destination rows per iteration
movdqa [eax+ecx], xmm0
lea eax, [eax+2*ecx]
movdqa [eax], xmm0
%endrep
ret
;*******************************************************************************
; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;
; 8x8 chroma plane prediction, same scheme as the 16x16 luma version but with
; b = (17H+16)>>5, c = (17V+16)>>5 and per-pixel clip((a+b*(x-3)+c*(y-3)+16)>>5).
; Gradient math runs in MMX, the store loop in SSE2. esi = pixel pointer.
;*******************************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
WelsIChromaPredPlane_sse2:
%define pushsize 4
push esi
mov esi, [esp + pushsize + 4] ;pPred
mov ecx, [esp + pushsize + 8] ;kiStride
sub esi, 1
sub esi, ecx ; esi -> top-left corner neighbour
pxor mm7, mm7
movq mm0, [esi] ; top[-1..2]
movq mm5, [sse2_plane_dec_c] ; weights 4..1
punpcklbw mm0, mm7
pmullw mm0, mm5
movq mm1, [esi + 5] ; top[4..7]
movq mm6, [sse2_plane_inc_c] ; weights 1..4
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0 ; (i+1)*(top[4+i] - top[2-i]) per lane
movq2dq xmm1, mm1
pxor xmm2, xmm2 ; zero interleave source for SUMW_HORIZON
SUMW_HORIZON xmm1,xmm0,xmm2
movd eax, xmm1 ; H
movsx eax, ax
imul eax, 17
add eax, 16
sar eax, 5 ; b = (17 * H + 16) >> 5;
SSE2_Copy8Times xmm1, eax ; xmm1 = b,b,b,b,b,b,b,b
movzx edx, BYTE [esi+8] ; top[7]
sub esi, 3
LOAD_COLUMN_C mm0, mm2, mm3, mm4, esi, ecx ; upper half of left column (advances esi)
add esi, 3
movzx eax, BYTE [esi+4*ecx] ; left[7*kiStride]
add edx, eax
shl edx, 4 ; a = (left[7*kiStride] + top[7]) << 4;
sub esi, 3
add esi, ecx
LOAD_COLUMN_C mm7, mm2, mm3, mm4, esi, ecx ; lower half of left column
pxor mm4, mm4
punpckhbw mm0, mm4 ; column bytes live in the high dword
pmullw mm0, mm5
punpckhbw mm7, mm4
pmullw mm7, mm6
psubw mm7, mm0 ; (i+1)*(left[(4+i)*s] - left[(2-i)*s]) per lane
movq2dq xmm7, mm7
pxor xmm2, xmm2
SUMW_HORIZON xmm7,xmm0,xmm2
movd eax, xmm7 ; V
movsx eax, ax
imul eax, 17
add eax, 16
sar eax, 5 ; c = (17 * V + 16) >> 5;
SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
mov esi, [esp + pushsize + 4] ; rewind esi to pPred for the store loop
add edx, 16
imul eax, -3
add edx, eax ; s = a + 16 + (-3)*c, i.e. row-0 base value
SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
xor eax, eax ; row counter
movdqa xmm5, [sse2_plane_mul_b_c] ; (x-3) for columns 0..7
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5 ; b*(x-3)
paddw xmm2, xmm0
psraw xmm2, 5
packuswb xmm2, xmm2 ; saturate and pack 8 pixels
movq [esi], xmm2
paddw xmm0, xmm4 ; advance base by c for the next row
add esi, ecx
inc eax
cmp eax, 8
jnz get_i_chroma_pred_plane_sse2_1
pop esi
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; Diagonal-Down-Right 4x4 intra prediction. Neighbour layout (decoded pixels):
; 0 |1 |2 |3 |4 |
; 6 |7 |8 |9 |10|
; 11|12|13|14|15|
; 16|17|18|19|20|
; 21|22|23|24|25|
; 7 is the start pixel of current 4x4 block
; pPred[7] = ([6]+[0]*2+[1]+2)/4
; The (a+2b+c+2)>>2 filter is built from two pavgb steps; the pxor/pand/psubusb
; sequence cancels pavgb's extra rounding so the combined result is exact.
;
; void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
;
;*******************************************************************************
WelsI4x4LumaPredDDR_mmx:
mov edx,[esp+4] ;pPred
mov eax,edx
mov ecx,[esp+8] ;kiStride
movq mm1,[eax+ecx-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
movq mm2,[eax-8] ;get value of 6 mm2[8] = 6
sub eax, ecx ;mov eax to above line of current block(postion of 1)
punpckhbw mm2,[eax-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
movd mm3,[eax] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
psllq mm3,18h ;mm3[5]=[1]
psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
lea eax,[eax+ecx*2-8h] ;set eax point to 12
movq mm4,[eax+ecx] ;get value of 16, mm4[8]=[16]
psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
psrlq mm4,38h ;mm4[1]=[16]
por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
movq mm4,[eax+ecx*2] ;mm4[8]=[21]
psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
psrlq mm4,38h ;mm4[1]=[21]
por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
pxor mm1,mm4 ;find odd value in the lowest bit of each byte
pand mm1,[mmx_01bytes] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
lea edx,[edx+ecx] ; store rows bottom-up, shifting a byte per row
movd [edx+2*ecx],mm2
sub edx,ecx
psrlq mm2,8
movd [edx+2*ecx],mm2
psrlq mm2,8
movd [edx+ecx],mm2
psrlq mm2,8
movd [edx],mm2
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; 4x4 DC prediction: every pixel = (sum of 4 top + 4 left neighbours + 4) >> 3.
; Neighbour layout:
; 0 |1 |2 |3 |4 |
; 5 |6 |7 |8 |9 |
; 10|11|12|13|14|
; 15|16|17|18|19|
; 20|21|22|23|24|
; 6 is the start pixel of current 4x4 block
; pPred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
; void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;
;*******************************************************************************
WelsI4x4LumaPredDc_sse2:
mov eax,[esp+4] ;pPred
mov ecx,[esp+8] ;kiStride
push ebx
movzx edx, byte [eax-1h] ; left neighbour of row 0
sub eax, ecx ; eax -> top neighbour row
movd xmm0, [eax] ; the 4 top pixels
pxor xmm1, xmm1
psadbw xmm0, xmm1 ; sum of the 4 top bytes
movd ebx, xmm0
add ebx, edx
movzx edx, byte [eax+ecx*2-1h] ; left of row 1
add ebx, edx
lea eax, [eax+ecx*2-1]
movzx edx, byte [eax+ecx] ; left of row 2
add ebx, edx
movzx edx, byte [eax+ecx*2] ; left of row 3
add ebx, edx
add ebx, 4
sar ebx, 3 ; dc = (sum + 4) >> 3
imul ebx, 0x01010101 ; broadcast dc into 4 bytes
mov edx, [esp+8] ;pPred (esp shifted by the push ebx above)
mov [edx], ebx
mov [edx+ecx], ebx
mov [edx+2*ecx], ebx
lea edx, [edx+2*ecx]
mov [edx+ecx], ebx
pop ebx
ret
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixel of 8 line from left
;*******************************************************************************
; MMX_PRED_H_8X8_ONE_LINE dst_mmx, unused, src_reg, dst_addr
; Broadcasts the left-neighbour byte [%3-1] into 8 bytes and stores them at
; [%4]. %2 is unused. (pmullw by word 0x0101 turns byte p into word p|p<<8;
; pshufw 0 then replicates that word across the register.)
%macro MMX_PRED_H_8X8_ONE_LINE 4
movq %1, [%3-8]
psrlq %1, 38h ; keep only byte [%3-1] in lane 0
pmullw %1, [mmx_01bytes] ; byte -> word with both halves = p
pshufw %1, %1, 0 ; replicate to all 8 bytes
movq [%4], %1
%endmacro
; As above, but reads the left neighbour one row further down: [%3+ecx-1].
%macro MMX_PRED_H_8X8_ONE_LINEE 4
movq %1, [%3+ecx-8]
psrlq %1, 38h
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
WELS_EXTERN WelsIChromaPredH_mmx
; 8x8 chroma horizontal prediction: each row is filled with its left neighbour.
; eax walks the source rows, edx the destination rows, two rows at a time.
WelsIChromaPredH_mmx:
mov edx, [esp+4] ;pPred
mov eax, edx
mov ecx, [esp+8] ;kiStride
movq mm0, [eax-8] ; row 0: broadcast [pPred-1] (see macro comments)
psrlq mm0, 38h
pmullw mm0, [mmx_01bytes]
pshufw mm0, mm0, 0
movq [edx], mm0
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx ; row 1
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx ; row 2
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx ; row 3
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx ; row 4
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx ; row 5
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx ; row 6
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx ; row 7
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
; 4x4 vertical prediction: copy the 4 pixels above the block into all 4 rows.
;*******************************************************************************
WELS_EXTERN get_i4x4_luma_pred_v_asm
get_i4x4_luma_pred_v_asm:
mov ecx, [esp+8] ; kiStride
mov eax, [esp+4] ; pPred
sub eax, ecx ; eax -> top neighbour row
mov edx, [eax] ; grab the 4 top pixels as one dword
%rep 4
add eax, ecx ; next destination row
mov [eax], edx
%endrep
ret
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
; 8x8 chroma vertical prediction: copy the 8 pixels above into all 8 rows.
;*******************************************************************************
WELS_EXTERN WelsIChromaPredV_mmx
WelsIChromaPredV_mmx:
mov ecx, [esp+8] ; kiStride
mov eax, [esp+4] ; pPred
sub eax, ecx ; eax -> top neighbour row
movq mm0, [eax] ; the 8 top pixels
%rep 8
add eax, ecx ; next destination row
movq [eax], mm0
%endrep
WELSEMMS ; leave MMX state clean for x87 users
ret
ALIGN 16
;*******************************************************************************
; Horizontal-Down 4x4 intra prediction.
; lt|t0|t1|t2|t3|
; l0|
; l1|
; l2|
; l3|
; t3 will never been used
; destination:
; |a |b |c |d |
; |e |f |a |b |
; |g |h |e |f |
; |i |j |g |h |
; a = (1 + lt + l0)>>1
; e = (1 + l0 + l1)>>1
; g = (1 + l1 + l2)>>1
; i = (1 + l2 + l3)>>1
; d = (2 + t0 + (t1<<1) + t2)>>2
; c = (2 + lt + (t0<<1) + t1)>>2
; b = (2 + l0 + (lt<<1) + t0)>>2
; f = (2 + l1 + (l0<<1) + lt)>>2
; h = (2 + l2 + (l1<<1) + l0)>>2
; j = (2 + l3 + (l2<<1) + l1)>>2
; [b a f e h g j i] + [d c b a] --> mov to memory
; The pxor/pand/psubusb around pavgb cancels double rounding so the 3-tap
; (a+2b+c+2)>>2 filter is exact.
;
; void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
WelsI4x4LumaPredHD_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
sub eax, ecx ; eax -> top neighbour row
movd mm0, [eax-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
movd mm1, [eax+2*ecx-4]
punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
movd mm2, [eax+2*ecx-4]
punpcklbw mm2, [eax+ecx-4] ; mm2[7] = l2, mm2[6] = l3
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
psrlq mm2, 20h
pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
movq mm1, mm0
psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
movq mm2, mm0
psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
movq mm3, mm2
movq mm4, mm1
pavgb mm1, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
movq mm4, mm0
pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
psrlq mm2, 20h
psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
movq mm4, mm3
psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
movd [edx], mm2 ; row 0 = [d c b a]
lea edx, [edx+ecx]
movd [edx+2*ecx], mm3 ; row 3, then shift a byte per earlier row
sub edx, ecx
psrlq mm3, 10h
movd [edx+2*ecx], mm3 ; row 2
psrlq mm3, 10h
movd [edx+ecx], mm3 ; row 1
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; Horizontal-Up 4x4 intra prediction (uses only the left column).
; lt|t0|t1|t2|t3|
; l0|
; l1|
; l2|
; l3|
; t3 will never been used
; destination:
; |a |b |c |d |
; |c |d |e |f |
; |e |f |g |g |
; |g |g |g |g |
; a = (1 + l0 + l1)>>1
; c = (1 + l1 + l2)>>1
; e = (1 + l2 + l3)>>1
; g = l3
; b = (2 + l0 + (l1<<1) + l2)>>2
; d = (2 + l1 + (l2<<1) + l3)>>2
; f = (2 + l2 + (l3<<1) + l3)>>2
; [g g f e d c b a] + [g g g g] --> mov to memory
;
; void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
WelsI4x4LumaPredHU_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
movd mm0, [eax-4] ; mm0[3] = l0
punpcklbw mm0, [eax+ecx-4] ; mm0[7] = l1, mm0[6] = l0
lea eax, [eax+2*ecx]
movd mm2, [eax-4] ; mm2[3] = l2
movd mm4, [eax+ecx-4] ; mm4[3] = l3
punpcklbw mm2, mm4
punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
psrlq mm4, 18h
psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
psrlq mm0, 8h
pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
movq mm1, mm0
psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
movq mm2, mm0
psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
movq mm5, mm2
pavgb mm2, mm0
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
pand mm5, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes (exact 3-tap rounding)
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
psrlq mm2, 8h
pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
psrlq mm4, 20h
lea edx, [edx+ecx]
movd [edx+2*ecx], mm4 ; row 3 = [g g g g]
sub edx, ecx
movd [edx], mm1 ; row 0, then shift a word per row
psrlq mm1, 10h
movd [edx+ecx], mm1 ; row 1
psrlq mm1, 10h
movd [edx+2*ecx], mm1 ; row 2
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; Vertical-Right 4x4 intra prediction.
; lt|t0|t1|t2|t3|
; l0|
; l1|
; l2|
; l3|
; l3 will never been used
; destination:
; |a |b |c |d |
; |e |f |g |h |
; |i |a |b |c |
; |j |e |f |g |
; a = (1 + lt + t0)>>1
; b = (1 + t0 + t1)>>1
; c = (1 + t1 + t2)>>1
; d = (1 + t2 + t3)>>1
; e = (2 + l0 + (lt<<1) + t0)>>2
; f = (2 + lt + (t0<<1) + t1)>>2
; g = (2 + t0 + (t1<<1) + t2)>>2
; h = (2 + t1 + (t2<<1) + t3)>>2
; i = (2 + lt + (l0<<1) + l1)>>2
; j = (2 + l0 + (l1<<1) + l2)>>2
;
; void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
WelsI4x4LumaPredVR_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
sub eax, ecx ; eax -> top neighbour row
movq mm0, [eax-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
movd mm1, [eax+2*ecx-4]
punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
movq mm2, [eax+ecx-8] ; mm2[7] = l2
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
psrlq mm2, 28h
pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
movq mm1, mm0
psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
movq mm2, mm0
psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
movq mm3, mm2
pavgb mm2, mm0
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes (exact 3-tap rounding)
movq mm3, mm0
psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
movq mm2, mm3
psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
movd [edx], mm1 ; row 0
psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
movd [edx+ecx], mm2 ; row 1
movq mm4, mm3
psllq mm4, 20h
psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
movq mm5, mm3
psllq mm5, 28h
psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
psllq mm1, 8h
pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
movd [edx+2*ecx], mm4 ; row 2
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
lea edx, [edx+2*ecx]
movd [edx+ecx], mm5 ; row 3
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; Diagonal-Down-Left 4x4 intra prediction (uses only the top row, t0..t7).
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
; l1|
; l2|
; l3|
; lt,t0,t1,t2,t3 will never been used
; destination:
; |a |b |c |d |
; |b |c |d |e |
; |c |d |e |f |
; |d |e |f |g |
; a = (2 + t0 + t2 + (t1<<1))>>2
; b = (2 + t1 + t3 + (t2<<1))>>2
; c = (2 + t2 + t4 + (t3<<1))>>2
; d = (2 + t3 + t5 + (t4<<1))>>2
; e = (2 + t4 + t6 + (t5<<1))>>2
; f = (2 + t5 + t7 + (t6<<1))>>2
; g = (2 + t6 + t7 + (t7<<1))>>2
; [g f e d c b a] --> mov to memory
;
; void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
WelsI4x4LumaPredDDL_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
sub eax, ecx ; eax -> top neighbour row
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
movq mm3, mm0
psrlq mm3, 38h
psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
psrlq mm2, 8h
pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1] (t7 duplicated past the edge)
movq mm3, mm1
pavgb mm1, mm2
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes (exact 3-tap rounding)
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
psrlq mm0, 8h ; row 0 = [d c b a]; each later row shifts one byte
movd [edx], mm0
psrlq mm0, 8h
movd [edx+ecx], mm0
psrlq mm0, 8h
movd [edx+2*ecx], mm0
psrlq mm0, 8h
lea edx, [edx+2*ecx]
movd [edx+ecx], mm0
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; Vertical-Left 4x4 intra prediction (uses only the top row, t0..t6).
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
; l1|
; l2|
; l3|
; lt,t0,t1,t2,t3 will never been used
; destination:
; |a |b |c |d |
; |e |f |g |h |
; |b |c |d |i |
; |f |g |h |j |
; a = (1 + t0 + t1)>>1
; b = (1 + t1 + t2)>>1
; c = (1 + t2 + t3)>>1
; d = (1 + t3 + t4)>>1
; i = (1 + t4 + t5)>>1
; e = (2 + t0 + (t1<<1) + t2)>>2
; f = (2 + t1 + (t2<<1) + t3)>>2
; g = (2 + t2 + (t3<<1) + t4)>>2
; h = (2 + t3 + (t4<<1) + t5)>>2
; j = (2 + t4 + (t5<<1) + t6)>>2
; [i d c b a] + [j h g f e] --> mov to memory
;
; void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
WelsI4x4LumaPredVL_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
sub eax, ecx ; eax -> top neighbour row
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
movq mm3, mm1
pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
movq mm4, mm2
pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes (exact 3-tap rounding)
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
movd [edx], mm3 ; row 0 = [d c b a]
psrlq mm3, 8h
movd [edx+2*ecx], mm3 ; row 2 = [i d c b]
movd [edx+ecx], mm2 ; row 1 = [h g f e]
psrlq mm2, 8h
lea edx, [edx+2*ecx]
movd [edx+ecx], mm2 ; row 3 = [j h g f]
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; 8x8 chroma DC prediction with both neighbours available. Per the H.264
; chroma DC rule the block is split into four 4x4 quadrants:
;   top-left:     (top[0..3] + left[0..3] + 4) >> 3
;   top-right:    (top[4..7] + 2) >> 2
;   bottom-left:  (left[4..7] + 2) >> 2
;   bottom-right: (top[4..7] + left[4..7] + 4) >> 3
; void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
WelsIChromaPredDc_sse2:
push ebx
mov eax, [esp+8] ; pPred
mov ecx, [esp+12] ; kiStride
sub eax, ecx ; eax -> top neighbour row
movq mm0, [eax] ; the 8 top pixels
movzx ebx, byte [eax+ecx-0x01] ; l1
lea eax, [eax+2*ecx]
movzx edx, byte [eax-0x01] ; l2
add ebx, edx
movzx edx, byte [eax+ecx-0x01] ; l3
add ebx, edx
lea eax, [eax+2*ecx]
movzx edx, byte [eax-0x01] ; l4
add ebx, edx
movd mm1, ebx ; mm1 = l1+l2+l3+l4
movzx ebx, byte [eax+ecx-0x01] ; l5
lea eax, [eax+2*ecx]
movzx edx, byte [eax-0x01] ; l6
add ebx, edx
movzx edx, byte [eax+ecx-0x01] ; l7
add ebx, edx
lea eax, [eax+2*ecx]
movzx edx, byte [eax-0x01] ; l8
add ebx, edx
movd mm2, ebx ; mm2 = l5+l6+l7+l8
movq mm3, mm0
psrlq mm0, 0x20 ; mm0 = top[4..7]
psllq mm3, 0x20
psrlq mm3, 0x20 ; mm3 = top[0..3]
pxor mm4, mm4
psadbw mm0, mm4 ; sum of top[4..7]
psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
paddq mm3, mm1 ; top-left quadrant: top[0..3]+left[0..3]
movq mm1, mm2
paddq mm1, mm0 ; bottom-right quadrant: top[4..7]+left[4..7]
movq mm4, [mmx_0x02] ; rounding constant 2
paddq mm0, mm4
psrlq mm0, 0x02 ; top-right DC = (sum+2)>>2
paddq mm2, mm4
psrlq mm2, 0x02 ; bottom-left DC = (sum+2)>>2
paddq mm3, mm4
paddq mm3, mm4
psrlq mm3, 0x03 ; top-left DC = (sum+4)>>3
paddq mm1, mm4
paddq mm1, mm4
psrlq mm1, 0x03 ; bottom-right DC = (sum+4)>>3
pmuludq mm0, [mmx_01bytes] ; broadcast each DC into 4 bytes
pmuludq mm3, [mmx_01bytes]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up: [TR TR TR TR TL TL TL TL]
pmuludq mm2, [mmx_01bytes]
pmuludq mm1, [mmx_01bytes]
psllq mm1, 0x20
pxor mm1, mm2 ; mm1 = m_down: [BR BR BR BR BL BL BL BL]
mov edx, [esp+8] ; pPred
movq [edx], mm0 ; rows 0..3 get m_up
movq [edx+ecx], mm0
movq [edx+2*ecx], mm0
lea edx, [edx+2*ecx]
movq [edx+ecx], mm0
movq [edx+2*ecx], mm1 ; rows 4..7 get m_down
lea edx, [edx+2*ecx]
movq [edx+ecx], mm1
movq [edx+2*ecx], mm1
lea edx, [edx+2*ecx]
movq [edx+ecx], mm1
pop ebx
WELSEMMS
ret
ALIGN 16
;*******************************************************************************
; 16x16 DC prediction with both neighbours available:
; every pixel = (sum of 16 top + 16 left neighbours + 16) >> 5.
; void_t WelsI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
WelsI16x16LumaPredDc_sse2:
push ebx
mov eax, [esp+8] ; pPred
mov ecx, [esp+12] ; kiStride
sub eax, ecx ; eax -> top neighbour row
movdqa xmm0, [eax] ; read one row
pxor xmm1, xmm1
psadbw xmm0, xmm1 ; two partial sums (low/high 8 bytes)
movdqa xmm1, xmm0
psrldq xmm1, 0x08
pslldq xmm0, 0x08
psrldq xmm0, 0x08
paddw xmm0, xmm1 ; xmm0 low word = sum of all 16 top pixels
movzx ebx, byte [eax+ecx-0x01] ; left of row 0
movzx edx, byte [eax+2*ecx-0x01] ; left of row 1
add ebx, edx
lea eax, [eax+ecx]
LOAD_2_LEFT_AND_ADD ; rows 2..15, two per macro
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
add ebx, 0x10 ; + rounding 16
movd xmm1, ebx
paddw xmm0, xmm1
psrld xmm0, 0x05 ; dc = (top + left + 16) >> 5
pmuludq xmm0, [mmx_01bytes] ; broadcast dc into 4 bytes
pshufd xmm0, xmm0, 0 ; ... and across the register
mov edx, [esp+8] ; pPred
movdqa [edx], xmm0
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx+ecx], xmm0
pop ebx
ret
;*******************************************************************************
; for intra prediction as follows, 11/19/2010
;*******************************************************************************
ALIGN 16
;*******************************************************************************
; 16x16 DC prediction with only the top neighbour available:
; every pixel = (sum of 16 top pixels + 8) >> 4.
; void_t WelsI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
WelsI16x16LumaPredDcTop_sse2:
push ebx
%define PUSH_SIZE 4
mov eax, [esp+PUSH_SIZE+4] ; pPred
mov ebx, [esp+PUSH_SIZE+8] ; kiStride
mov ecx, ebx
neg ecx
movdqa xmm0, [eax+ecx] ; pPred-kiStride, top line
pxor xmm7, xmm7
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7
punpckhbw xmm1, xmm7
paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
pshuflw xmm1, xmm0, 0b1h ; 10110001
paddw xmm0, xmm1 ; sum in word unit (x8)
movd edx, xmm0
and edx, 0ffffh ; keep only the 16-bit sum
add edx, 08h
sar edx, 04h ; dc = (sum + 8) >> 4
mov dh, dl ; duplicate dc into both bytes of dx ...
mov ecx, edx
shl ecx, 010h
or edx, ecx ; ... then into all 4 bytes of edx
movd xmm1, edx
pshufd xmm0, xmm1, 00h ; broadcast dc across all 16 bytes
movdqa xmm1, xmm0 ; second copy so stores alternate source registers
lea ecx, [2*ebx+ebx] ; 3*kiStride
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
%undef PUSH_SIZE
pop ebx
ret
ALIGN 16
;*******************************************************************************
; void_t WelsI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
; DC prediction with no neighbours available: fill the 16x16 block with 0x80.
; pPred must be 16-byte aligned.
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
WelsI16x16LumaPredDcNA_sse2:
mov eax, [esp+4] ; pPred
mov ecx, [esp+8] ; kiStride
movdqa xmm0, [sse2_dc_0x80] ; 16 bytes of 128
movdqa [eax], xmm0 ; row 0
%rep 15
add eax, ecx
movdqa [eax], xmm0 ; rows 1..15
%endrep
ret
ALIGN 16
;*******************************************************************************
; 8x8 chroma DC prediction with only the left neighbour available:
; rows 0..3 are filled with (left[0..3]+2)>>2, rows 4..7 with (left[4..7]+2)>>2.
; void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDcLeft_mmx
WelsIChromaPredDcLeft_mmx:
push ebx
push esi
%define PUSH_SIZE 8
mov esi, [esp+PUSH_SIZE+4] ; pPred
mov ecx, [esp+PUSH_SIZE+8] ; kiStride
mov eax, esi
; for left
dec eax ; eax -> left neighbour column
xor ebx, ebx
xor edx, edx
mov bl, [eax] ; left[0]
mov dl, [eax+ecx] ; left[1]
add ebx, edx
lea eax, [eax+2*ecx]
mov dl, [eax] ; left[2]
add ebx, edx
mov dl, [eax+ecx] ; left[3]
add ebx, edx
add ebx, 02h
sar ebx, 02h ; upper DC = (sum + 2) >> 2
mov bh, bl ; duplicate into both bytes of bx
movd mm1, ebx
pshufw mm0, mm1, 00h ; up64: DC broadcast to 8 bytes
movq mm1, mm0
xor ebx, ebx
lea eax, [eax+2*ecx]
mov bl, [eax] ; left[4]
mov dl, [eax+ecx] ; left[5]
add ebx, edx
lea eax, [eax+2*ecx]
mov dl, [eax] ; left[6]
add ebx, edx
mov dl, [eax+ecx] ; left[7]
add ebx, edx
add ebx, 02h
sar ebx, 02h ; lower DC = (sum + 2) >> 2
mov bh, bl
movd mm3, ebx
pshufw mm2, mm3, 00h ; down64
movq mm3, mm2
lea ebx, [2*ecx+ecx] ; 3*kiStride
movq [esi], mm0 ; rows 0..3 get the upper DC
movq [esi+ecx], mm1
movq [esi+2*ecx], mm0
movq [esi+ebx], mm1
lea esi, [esi+4*ecx]
movq [esi], mm2 ; rows 4..7 get the lower DC
movq [esi+ecx], mm3
movq [esi+2*ecx], mm2
movq [esi+ebx], mm3
pop esi
pop ebx
emms
ret
ALIGN 16
;*******************************************************************************
; 8x8 chroma DC prediction with only the top neighbour available:
; left 4 columns get (top[0..3]+2)>>2, right 4 columns get (top[4..7]+2)>>2.
; void_t WelsIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDcTop_sse2
WelsIChromaPredDcTop_sse2:
push ebx
%define PUSH_SIZE 4
mov eax, [esp+PUSH_SIZE+4] ; pPred
mov ecx, [esp+PUSH_SIZE+8] ; kiStride
mov ebx, ecx
neg ebx
movq xmm0, [eax+ebx] ; top: 8x1 pixels
pxor xmm7, xmm7
punpcklbw xmm0, xmm7 ; ext 8x2 words
pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
movdqa xmm1, xmm0
pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
punpckhqdq xmm1, xmm7
punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
movdqa xmm6, [sse2_wd_0x02] ; rounding constant 2
paddw xmm0, xmm6
psraw xmm0, 02h ; per-half DC = (sum + 2) >> 2
packuswb xmm0, xmm7 ; pack the 8 DC bytes
lea ebx, [2*ecx+ecx] ; 3*kiStride
movq [eax], xmm0 ; store the same 8 bytes into all 8 rows
movq [eax+ecx], xmm0
movq [eax+2*ecx], xmm0
movq [eax+ebx], xmm0
lea eax, [eax+4*ecx]
movq [eax], xmm0
movq [eax+ecx], xmm0
movq [eax+2*ecx], xmm0
movq [eax+ebx], xmm0
%undef PUSH_SIZE
pop ebx
ret
ALIGN 16
;*******************************************************************************
; void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
; Chroma DC prediction with no neighbours available: fill the 8x8 block
; with 0x80.
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDcNA_mmx
WelsIChromaPredDcNA_mmx:
mov eax, [esp+4] ; pPred
mov ecx, [esp+8] ; kiStride
movq mm0, [sse2_dc_0x80] ; 8 bytes of 128
movq [eax], mm0 ; row 0
%rep 7
add eax, ecx
movq [eax], mm0 ; rows 1..7
%endrep
emms
ret