; (extraction artifacts removed: page header repeated "1499 lines / 40 KiB / NASM")
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* intra_pred.asm
|
|
;*
|
|
;* Abstract
|
|
;* sse2 and mmx function for intra predict operations(decoder)
|
|
;*
|
|
;* History
|
|
;* 18/09/2009 Created
|
|
;* 19/11/2010 Added
|
|
;* WelsI16x16LumaPredDcTop_sse2, WelsI16x16LumaPredDcNA_sse2,
|
|
;* WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
|
|
;* and WelsIChromaPredDcNA_mmx
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
|
|
%include "asm_inc.asm"
|
|
BITS 32
|
|
;*******************************************************************************
|
|
; Local Data (Read Only)
|
|
;*******************************************************************************
|
|
|
|
%ifdef FORMAT_COFF
|
|
SECTION .rodata data
|
|
%else
|
|
SECTION .rodata align=16
|
|
%endif
|
|
%if 1
|
|
%define WELSEMMS emms
|
|
%else
|
|
%define WELSEMMS
|
|
%endif
|
|
|
|
align 16
|
|
sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
|
|
align 16
|
|
sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8
|
|
align 16
|
|
sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1
|
|
|
|
; for chroma plane mode
|
|
sse2_plane_inc_c dw 1, 2, 3, 4
|
|
sse2_plane_dec_c dw 4, 3, 2, 1
|
|
align 16
|
|
sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
|
|
|
|
align 16
|
|
mmx_01bytes: times 16 db 1
|
|
|
|
align 16
|
|
mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
|
|
|
|
align 16
|
|
sse2_dc_0x80: times 16 db 0x80
|
|
align 16
|
|
sse2_wd_0x02: times 8 dw 0x02
|
|
|
|
;*******************************************************************************
|
|
; macros
|
|
;*******************************************************************************
|
|
;xmm0, xmm1, xmm2, eax, ecx
|
|
;lower 64 bits of xmm0 save the result
|
|
;xmm0, xmm1, xmm2, eax, ecx
;lower 64 bits of xmm0 save the result
; Builds two horizontally-predicted 4x4 rows from the left-neighbour pixels.
; %1: result xmm  %2: second-row xmm  %3: scratch xmm  %4: row ptr  %5: stride
%macro SSE2_PRED_H_4X4_TWO_LINE 5
    movd        %1, [%4-1]         ; left pixel of first row in low byte
    movdqa      %3, %1
    punpcklbw   %1, %3             ; byte -> replicated word
    movdqa      %3, %1
    punpcklbw   %1, %3             ; word -> replicated dword (byte x4)

    ;add %4, %5
    movd        %2, [%4+%5-1]      ; left pixel of second row
    movdqa      %3, %2
    punpcklbw   %2, %3
    movdqa      %3, %2
    punpcklbw   %2, %3
    punpckldq   %1, %2             ; pack both 4-byte rows into low qword of %1
%endmacro
|
|
|
|
|
|
; Gathers 8 vertically-adjacent pixels (one per row) into the HIGH 8 bytes
; of %1 via successive byte/word/dword interleaves.
; %1..%4: xmm work regs (result in %1)  %5: pixel pointer (advanced 8 rows)  %6: stride
%macro LOAD_COLUMN 6
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2             ; rows 0,1 interleaved as bytes
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2             ; rows 2,3
    punpcklwd   %1, %3             ; rows 0..3
    lea         %5, [%5+2*%6]
    movd        %4, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %4, %2             ; rows 4,5
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    lea         %5, [%5+2*%6]
    punpcklbw   %3, %2             ; rows 6,7
    punpcklwd   %4, %3             ; rows 4..7
    punpckhdq   %1, %4             ; all 8 column pixels -> high qword of %1
%endmacro
|
|
|
|
; Horizontal sum of the 8 words in %1; the 32-bit total lands in the low
; dword of %1.  %2: scratch xmm  %3: must hold zero (used to widen to dwords).
%macro SUMW_HORIZON 3
    movhlps     %2, %1            ; x2 = xx xx xx xx d7 d6 d5 d4
    paddw       %1, %2            ; x1 = xx xx xx xx d37 d26 d15 d04
    punpcklwd   %1, %3            ; x1 = d37  d26  d15  d04
    movhlps     %2, %1            ; x2 = xxxx xxxx d37  d26
    paddd       %1, %2            ; x1 = xxxx xxxx d1357 d0246
    pshuflw     %2, %1, 0x4e      ; x2 = xxxx xxxx d0246 d1357
    paddd       %1, %2            ; x1 = xxxx xxxx xxxx  d01234567
%endmacro
|
|
|
|
; Broadcasts the left-neighbour byte at [%1 - 1] to all 16 bytes of %2.
; Loads the aligned 16 bytes ending at %1, isolates the last byte, then
; replicates it with the 0x01-bytes multiply and a dword shuffle.
%macro COPY_16_TIMES 2
    movdqa      %2, [%1-16]       ; requires %1 16-byte aligned (pPred aligned)
    psrldq      %2, 15            ; keep byte [%1-1] in lane 0
    pmuludq     %2, [mmx_01bytes] ; byte -> replicated dword
    pshufd      %2, %2, 0         ; dword -> whole register
%endmacro
|
|
|
|
; Same as COPY_16_TIMES but with an extra stride offset: broadcasts the byte
; at [%1 + %3 - 1] to all 16 bytes of %2.
%macro COPY_16_TIMESS 3
    movdqa      %2, [%1+%3-16]    ; aligned load ending at %1+%3
    psrldq      %2, 15            ; keep byte [%1+%3-1]
    pmuludq     %2, [mmx_01bytes]
    pshufd      %2, %2, 0
%endmacro
|
|
|
|
; Chroma variant of LOAD_COLUMN: gathers 4 vertically-adjacent pixels into
; the HIGH 4 bytes of mmx reg %1.
; %1..%3: mmx work regs (result in %1)  %5: pixel pointer (advanced 4 rows)  %6: stride
%macro LOAD_COLUMN_C 6
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1,%2             ; rows 0,1
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2            ; rows 2,3
    punpckhwd   %1, %3            ; 4 column pixels -> high dword of %1
    lea         %5, [%5+2*%6]
%endmacro
|
|
|
|
; Steps eax down two rows and accumulates the two left-edge pixels
; ([eax-1] of each row) into ebx.  Clobbers edx.
%macro LOAD_2_LEFT_AND_ADD 0
    lea         eax, [eax+2*ecx]
    movzx       edx, byte [eax-0x01]
    add         ebx, edx
    movzx       edx, byte [eax+ecx-0x01]
    add         ebx, edx
%endmacro
|
|
|
|
;*******************************************************************************
|
|
; Code
|
|
;*******************************************************************************
|
|
|
|
SECTION .text
|
|
WELS_EXTERN WelsI4x4LumaPredH_sse2
|
|
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
|
|
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
|
|
WELS_EXTERN WelsI4x4LumaPredDc_sse2
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
|
|
;
|
|
; pPred must align to 16
|
|
;*******************************************************************************
|
|
; void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
; 4x4 horizontal prediction: each of the 4 rows is filled with its
; left-neighbour pixel broadcast across the row.
; In:  [esp+4] pPred (must be 16-aligned), [esp+8] kiStride
; Clobbers: eax, ecx, edx, xmm0-xmm3
WelsI4x4LumaPredH_sse2:
    mov     eax, [esp+4]            ;pPred
    mov     ecx, [esp+8]            ;kiStride

    movzx   edx, byte [eax-1]       ; left pixel, row 0
    movd    xmm0, edx
    pmuludq xmm0, [mmx_01bytes]     ; broadcast byte across dword

    movzx   edx, byte [eax+ecx-1]   ; row 1
    movd    xmm1, edx
    pmuludq xmm1, [mmx_01bytes]

    lea     eax, [eax+ecx]
    movzx   edx, byte [eax+ecx-1]   ; row 2
    movd    xmm2, edx
    pmuludq xmm2, [mmx_01bytes]

    movzx   edx, byte [eax+2*ecx-1] ; row 3
    movd    xmm3, edx
    pmuludq xmm3, [mmx_01bytes]

    sub     eax, ecx                ; back to row 0
    movd    [eax], xmm0
    movd    [eax+ecx], xmm1
    lea     eax, [eax+2*ecx]
    movd    [eax], xmm2
    movd    [eax+ecx], xmm3

    ret
|
|
|
|
;*******************************************************************************
|
|
; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
|
|
;*******************************************************************************
|
|
; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride)
; 16x16 plane (mode 3) prediction: fits a plane through the top and left
; neighbour pixels (H/V gradients b,c and offset s), then writes
; clip((s + x*b + y*c) >> 5) row by row.
; In:  [esp+..+4] pPred, [esp+..+8] kiStride.  Preserves esi (pushed).
WelsI16x16LumaPredPlane_sse2:
%define pushsize 4
    push    esi
    mov     esi, [esp + pushsize + 4]   ; pPred
    mov     ecx, [esp + pushsize + 8]   ; kiStride
    sub     esi, 1
    sub     esi, ecx                    ; esi -> top-left neighbour pixel

    ;for H
    pxor    xmm7, xmm7
    movq    xmm0, [esi]                 ; top[-1..6]
    movdqa  xmm5, [sse2_plane_dec]      ; weights 8..1
    punpcklbw xmm0, xmm7
    pmullw  xmm0, xmm5
    movq    xmm1, [esi + 9]             ; top[8..15]
    movdqa  xmm6, [sse2_plane_inc]      ; weights 1..8
    punpcklbw xmm1, xmm7
    pmullw  xmm1, xmm6
    psubw   xmm1, xmm0                  ; per-lane (i+1)*(top[8+i]-top[6-i])

    SUMW_HORIZON xmm1,xmm0,xmm2
    movd    eax, xmm1                   ; H += (i + 1) * (top[8 + i] - top[6 - i]);
    movsx   eax, ax
    imul    eax, 5
    add     eax, 32
    sar     eax, 6                      ; b = (5 * H + 32) >> 6;
    SSE2_Copy8Times xmm1, eax           ; xmm1 = b,b,b,b,b,b,b,b

    movzx   edx, BYTE [esi+16]          ; top[15]
    sub     esi, 3
    LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, esi, ecx   ; upper 8 left pixels

    add     esi, 3
    movzx   eax, BYTE [esi+8*ecx]       ; left[15*kiStride]
    add     edx, eax
    shl     edx, 4                      ; a = (left[15*kiStride] + top[15]) << 4;

    sub     esi, 3
    add     esi, ecx
    LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, esi, ecx   ; lower 8 left pixels
    pxor    xmm4, xmm4
    punpckhbw xmm0, xmm4
    pmullw  xmm0, xmm5
    punpckhbw xmm7, xmm4
    pmullw  xmm7, xmm6
    psubw   xmm7, xmm0                  ; per-lane vertical differences

    SUMW_HORIZON xmm7,xmm0,xmm2
    movd    eax, xmm7                   ; V
    movsx   eax, ax

    imul    eax, 5
    add     eax, 32
    sar     eax, 6                      ; c = (5 * V + 32) >> 6;
    SSE2_Copy8Times xmm4, eax           ; xmm4 = c,c,c,c,c,c,c,c

    mov     esi, [esp + pushsize + 4]   ; back to pPred for output
    add     edx, 16
    imul    eax, -7
    add     edx, eax                    ; s = a + 16 + (-7)*c
    SSE2_Copy8Times xmm0, edx           ; xmm0 = s,s,s,s,s,s,s,s

    xor     eax, eax                    ; row counter
    movdqa  xmm5, [sse2_plane_inc_minus] ; x-7..x0 weights for left half

get_i16x16_luma_pred_plane_sse2_1:
    movdqa  xmm2, xmm1
    pmullw  xmm2, xmm5                  ; left 8 pixels: s + (x-7)*b
    paddw   xmm2, xmm0
    psraw   xmm2, 5
    movdqa  xmm3, xmm1
    pmullw  xmm3, xmm6                  ; right 8 pixels: s + (x+1)*b
    paddw   xmm3, xmm0
    psraw   xmm3, 5
    packuswb xmm2, xmm3                 ; saturate to bytes, full 16-pixel row
    movdqa  [esi], xmm2
    paddw   xmm0, xmm4                  ; s += c for next row
    add     esi, ecx
    inc     eax
    cmp     eax, 16
    jnz get_i16x16_luma_pred_plane_sse2_1

    pop     esi
    ret
|
|
|
|
|
|
|
|
;*******************************************************************************
|
|
; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
|
|
;*******************************************************************************
|
|
|
|
; Advances eax two rows and writes two 16-byte horizontal-prediction rows,
; each filled with its own left-neighbour byte.  Uses eax/ecx/xmm0.
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
    lea     eax, [eax+ecx*2]

    COPY_16_TIMES eax, xmm0       ; broadcast [eax-1]
    movdqa  [eax], xmm0
    COPY_16_TIMESS eax, xmm0, ecx ; broadcast [eax+ecx-1]
    movdqa  [eax+ecx], xmm0
%endmacro
|
|
|
|
WELS_EXTERN WelsI16x16LumaPredH_sse2
|
|
; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
; 16x16 horizontal prediction: every row is its left-neighbour pixel
; broadcast across 16 bytes.  pPred rows must be 16-aligned (movdqa stores).
; Clobbers: eax, ecx, xmm0
WelsI16x16LumaPredH_sse2:
    mov     eax, [esp+4]          ; pPred
    mov     ecx, [esp+8]          ; kiStride

    COPY_16_TIMES eax, xmm0       ; rows 0 and 1
    movdqa  [eax], xmm0
    COPY_16_TIMESS eax, xmm0, ecx
    movdqa  [eax+ecx], xmm0

    ; remaining 14 rows, two per expansion
    SSE2_PRED_H_16X16_TWO_LINE_DEC
    SSE2_PRED_H_16X16_TWO_LINE_DEC
    SSE2_PRED_H_16X16_TWO_LINE_DEC
    SSE2_PRED_H_16X16_TWO_LINE_DEC
    SSE2_PRED_H_16X16_TWO_LINE_DEC
    SSE2_PRED_H_16X16_TWO_LINE_DEC
    SSE2_PRED_H_16X16_TWO_LINE_DEC

    ret
|
|
|
|
;*******************************************************************************
|
|
; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI16x16LumaPredV_sse2
|
|
; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride)
; 16x16 vertical prediction: copies the 16 above-neighbour pixels into all
; 16 rows.  pPred must be 16-aligned (movdqa).
; Clobbers: ecx, edx, xmm0
WelsI16x16LumaPredV_sse2:
    mov     edx, [esp+4]          ; pPred
    mov     ecx, [esp+8]          ; kiStride

    sub     edx, ecx              ; edx -> row above the block
    movdqa  xmm0, [edx]           ; the 16 top neighbours

    movdqa  [edx+ecx], xmm0       ; rows 0..15, two per step
    lea     edx, [edx+2*ecx]
    movdqa  [edx], xmm0
    movdqa  [edx+ecx], xmm0
    lea     edx, [edx+2*ecx]
    movdqa  [edx], xmm0
    movdqa  [edx+ecx], xmm0
    lea     edx, [edx+2*ecx]
    movdqa  [edx], xmm0
    movdqa  [edx+ecx], xmm0
    lea     edx, [edx+2*ecx]
    movdqa  [edx], xmm0
    movdqa  [edx+ecx], xmm0
    lea     edx, [edx+2*ecx]
    movdqa  [edx], xmm0
    movdqa  [edx+ecx], xmm0
    lea     edx, [edx+2*ecx]
    movdqa  [edx], xmm0
    movdqa  [edx+ecx], xmm0
    lea     edx, [edx+2*ecx]
    movdqa  [edx], xmm0
    movdqa  [edx+ecx], xmm0
    lea     edx, [edx+2*ecx]
    movdqa  [edx], xmm0

    ret
|
|
|
|
;*******************************************************************************
|
|
; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsIChromaPredPlane_sse2
|
|
; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride)
; 8x8 chroma plane prediction: like the 16x16 luma plane mode but with
; 4-tap gradients and scale 17 (b = (17*H+16)>>5, c = (17*V+16)>>5).
; Gradient gathering uses MMX regs; row generation uses SSE2.  Preserves esi.
WelsIChromaPredPlane_sse2:
%define pushsize 4
    push    esi
    mov     esi, [esp + pushsize + 4]   ;pPred
    mov     ecx, [esp + pushsize + 8]   ;kiStride
    sub     esi, 1
    sub     esi, ecx                    ; esi -> top-left neighbour pixel

    pxor    mm7, mm7
    movq    mm0, [esi]                  ; top[-1..2]
    movq    mm5, [sse2_plane_dec_c]     ; weights 4..1
    punpcklbw mm0, mm7
    pmullw  mm0, mm5
    movq    mm1, [esi + 5]              ; top[4..7]
    movq    mm6, [sse2_plane_inc_c]     ; weights 1..4
    punpcklbw mm1, mm7
    pmullw  mm1, mm6
    psubw   mm1, mm0                    ; per-lane horizontal differences

    movq2dq xmm1, mm1
    pxor    xmm2, xmm2
    SUMW_HORIZON xmm1,xmm0,xmm2
    movd    eax, xmm1                   ; H
    movsx   eax, ax
    imul    eax, 17
    add     eax, 16
    sar     eax, 5                      ; b = (17 * H + 16) >> 5;
    SSE2_Copy8Times xmm1, eax           ; xmm1 = b,b,b,b,b,b,b,b

    movzx   edx, BYTE [esi+8]           ; top[7]
    sub     esi, 3
    LOAD_COLUMN_C mm0, mm2, mm3, mm4, esi, ecx   ; upper 4 left pixels

    add     esi, 3
    movzx   eax, BYTE [esi+4*ecx]       ; left[7*kiStride]
    add     edx, eax
    shl     edx, 4                      ; a = (left[7*kiStride] + top[7]) << 4;

    sub     esi, 3
    add     esi, ecx
    LOAD_COLUMN_C mm7, mm2, mm3, mm4, esi, ecx   ; lower 4 left pixels
    pxor    mm4, mm4
    punpckhbw mm0, mm4
    pmullw  mm0, mm5
    punpckhbw mm7, mm4
    pmullw  mm7, mm6
    psubw   mm7, mm0                    ; per-lane vertical differences

    movq2dq xmm7, mm7
    pxor    xmm2, xmm2
    SUMW_HORIZON xmm7,xmm0,xmm2
    movd    eax, xmm7                   ; V
    movsx   eax, ax

    imul    eax, 17
    add     eax, 16
    sar     eax, 5                      ; c = (17 * V + 16) >> 5;
    SSE2_Copy8Times xmm4, eax           ; xmm4 = c,c,c,c,c,c,c,c

    mov     esi, [esp + pushsize + 4]   ; back to pPred for output
    add     edx, 16
    imul    eax, -3
    add     edx, eax                    ; s = a + 16 + (-3)*c
    SSE2_Copy8Times xmm0, edx           ; xmm0 = s,s,s,s,s,s,s,s

    xor     eax, eax                    ; row counter
    movdqa  xmm5, [sse2_plane_mul_b_c]  ; column weights -3..4

get_i_chroma_pred_plane_sse2_1:
    movdqa  xmm2, xmm1
    pmullw  xmm2, xmm5                  ; s + x*b for the 8 columns
    paddw   xmm2, xmm0
    psraw   xmm2, 5
    packuswb xmm2, xmm2                 ; saturate to 8 bytes
    movq    [esi], xmm2
    paddw   xmm0, xmm4                  ; s += c for next row
    add     esi, ecx
    inc     eax
    cmp     eax, 8
    jnz get_i_chroma_pred_plane_sse2_1

    pop     esi
    WELSEMMS                            ; clear MMX state before returning
    ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; 0 |1 |2 |3 |4 |
|
|
; 6 |7 |8 |9 |10|
|
|
; 11|12|13|14|15|
|
|
; 16|17|18|19|20|
|
|
; 21|22|23|24|25|
|
|
; 7 is the start pixel of current 4x4 block
|
|
; pPred[7] = ([6]+[0]*2+[1]+2)/4
|
|
;
|
|
; void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
;
|
|
;*******************************************************************************
|
|
; void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
; 4x4 diagonal-down-right prediction.  Neighbour layout (see comment block
; above): 7 is the block origin; each output pixel is the 3-tap rounded
; average along the down-right diagonal, e.g. pPred[7] = ([6]+[0]*2+[1]+2)/4.
; Clobbers: eax, ecx, edx, mm1-mm4
WelsI4x4LumaPredDDR_mmx:
    mov     edx,[esp+4]         ;pPred
    mov     eax,edx
    mov     ecx,[esp+8]         ;kiStride

    movq    mm1,[eax+ecx-8]     ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
    movq    mm2,[eax-8]         ;get value of 6 mm2[8] = 6
    sub     eax, ecx            ;mov eax to above line of current block(postion of 1)
    punpckhbw mm2,[eax-8]       ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
    movd    mm3,[eax]           ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
    punpckhwd mm1,mm2           ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
    psllq   mm3,18h             ;mm3[5]=[1]
    psrlq   mm1,28h             ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
    por     mm3,mm1             ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
    movq    mm1,mm3             ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
    lea     eax,[eax+ecx*2-8h]  ;set eax point to 12
    movq    mm4,[eax+ecx]       ;get value of 16, mm4[8]=[16]
    psllq   mm3,8               ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
    psrlq   mm4,38h             ;mm4[1]=[16]
    por     mm3,mm4             ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
    movq    mm2,mm3             ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
    movq    mm4,[eax+ecx*2]     ;mm4[8]=[21]
    psllq   mm3,8               ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
    psrlq   mm4,38h             ;mm4[1]=[21]
    por     mm3,mm4             ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
    movq    mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
    pavgb   mm3,mm1             ;mm3=([11]+[21]+1)/2
    pxor    mm1,mm4             ;find odd value in the lowest bit of each byte
    pand    mm1,[mmx_01bytes]   ;set the odd bit
    psubusb mm3,mm1             ;decrease 1 from odd bytes
    pavgb   mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2

    ; shift the diagonal result out one row at a time, bottom row first
    lea     edx,[edx+ecx]
    movd    [edx+2*ecx],mm2     ; row 3
    sub     edx,ecx
    psrlq   mm2,8
    movd    [edx+2*ecx],mm2     ; row 2
    psrlq   mm2,8
    movd    [edx+ecx],mm2       ; row 1
    psrlq   mm2,8
    movd    [edx],mm2           ; row 0
    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; 0 |1 |2 |3 |4 |
|
|
; 5 |6 |7 |8 |9 |
|
|
; 10|11|12|13|14|
|
|
; 15|16|17|18|19|
|
|
; 20|21|22|23|24|
|
|
; 6 is the start pixel of current 4x4 block
|
|
; pPred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
|
|
;
|
|
; void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
|
|
;
|
|
;*******************************************************************************
|
|
; void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
; 4x4 DC prediction: DC = (4 top + 4 left neighbours + 4) >> 3, then the
; whole block is filled with that byte.
; Clobbers: eax, ecx, edx, xmm0, xmm1 (ebx saved/restored)
WelsI4x4LumaPredDc_sse2:
    mov     eax,[esp+4]             ;pPred
    mov     ecx,[esp+8]             ;kiStride
    push    ebx                     ; ebx accumulates the neighbour sum

    movzx   edx, byte [eax-1h]      ; left[0]

    sub     eax, ecx
    movd    xmm0, [eax]             ; 4 top neighbours
    pxor    xmm1, xmm1
    psadbw  xmm0, xmm1              ; sum of the 4 top bytes

    movd    ebx, xmm0
    add     ebx, edx

    movzx   edx, byte [eax+ecx*2-1h] ; left[1]
    add     ebx, edx

    lea     eax, [eax+ecx*2-1]
    movzx   edx, byte [eax+ecx]     ; left[2]
    add     ebx, edx

    movzx   edx, byte [eax+ecx*2]   ; left[3]
    add     ebx, edx
    add     ebx, 4                  ; rounding
    sar     ebx, 3                  ; DC value
    imul    ebx, 0x01010101         ; replicate DC byte into all 4 bytes

    mov     edx, [esp+8]            ;pPred  (arg offset shifted by the push)
    mov     [edx], ebx
    mov     [edx+ecx], ebx
    mov     [edx+2*ecx], ebx
    lea     edx, [edx+2*ecx]
    mov     [edx+ecx], ebx

    pop     ebx
    ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
; copy 8 pixel of 8 line from left
|
|
;*******************************************************************************
|
|
; Writes one 8-byte horizontal-prediction row: broadcasts the left-neighbour
; byte [%3 - 1] across 8 bytes and stores at [%4].
; %1: mmx result reg  %2: unused scratch param  %3: row base reg  %4: dest address
%macro MMX_PRED_H_8X8_ONE_LINE 4
    movq    %1, [%3-8]            ; qword ending at the left neighbour
    psrlq   %1, 38h               ; isolate byte [%3-1]

    pmullw  %1, [mmx_01bytes]     ; byte -> replicated word
    pshufw  %1, %1, 0             ; word -> whole qword
    movq    [%4], %1
%endmacro

; Same as MMX_PRED_H_8X8_ONE_LINE but sources the row at [%3 + ecx]
; (one stride below the base register).
%macro MMX_PRED_H_8X8_ONE_LINEE 4
    movq    %1, [%3+ecx-8]
    psrlq   %1, 38h

    pmullw  %1, [mmx_01bytes]
    pshufw  %1, %1, 0
    movq    [%4], %1
%endmacro
|
|
|
|
WELS_EXTERN WelsIChromaPredH_mmx
|
|
; void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
; 8x8 chroma horizontal prediction: each of the 8 rows is its left-neighbour
; pixel broadcast across 8 bytes.
; Clobbers: eax, ecx, edx, mm0
WelsIChromaPredH_mmx:
    mov     edx, [esp+4]        ;pPred
    mov     eax, edx            ; eax walks the source rows
    mov     ecx, [esp+8]        ;kiStride

    ; row 0 inline (same pattern as MMX_PRED_H_8X8_ONE_LINE)
    movq    mm0, [eax-8]
    psrlq   mm0, 38h
    pmullw  mm0, [mmx_01bytes]
    pshufw  mm0, mm0, 0
    movq    [edx], mm0

    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx       ; row 1

    lea     eax, [eax+ecx*2]
    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx      ; row 2

    lea     edx, [edx+2*ecx]
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx       ; row 3

    lea     eax, [eax+ecx*2]
    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx      ; row 4

    lea     edx, [edx+2*ecx]
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx       ; row 5

    lea     eax, [eax+ecx*2]
    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx      ; row 6

    lea     edx, [edx+2*ecx]
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx       ; row 7

    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
|
|
; copy pixels from top 4 pixels
|
|
;*******************************************************************************
|
|
WELS_EXTERN get_i4x4_luma_pred_v_asm
|
|
; void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
; 4x4 vertical prediction: copies the 4 above-neighbour pixels into all 4 rows.
; Clobbers: eax, ecx, edx
get_i4x4_luma_pred_v_asm:
    mov     eax, [esp+4]        ;pPred
    mov     ecx, [esp+8]        ;kiStride

    sub     eax, ecx            ; eax -> row above the block
    mov     edx, [eax]          ; the 4 top neighbours as one dword
    mov     [eax+ecx], edx      ; rows 0..3
    mov     [eax+2*ecx], edx
    lea     eax, [eax+2*ecx]
    mov     [eax+ecx], edx
    mov     [eax+2*ecx], edx

    ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
; copy 8 pixels from top 8 pixels
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsIChromaPredV_mmx
|
|
; void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
; 8x8 chroma vertical prediction: copies the 8 above-neighbour pixels into
; all 8 rows.
; Clobbers: eax, ecx, mm0
WelsIChromaPredV_mmx:
    mov     eax, [esp+4]        ;pPred
    mov     ecx, [esp+8]        ;kiStride

    sub     eax, ecx            ; eax -> row above the block
    movq    mm0, [eax]          ; the 8 top neighbours

    movq    [eax+ecx], mm0      ; rows 0..7, two per step
    movq    [eax+2*ecx], mm0
    lea     eax, [eax+2*ecx]
    movq    [eax+ecx], mm0
    movq    [eax+2*ecx], mm0
    lea     eax, [eax+2*ecx]
    movq    [eax+ecx], mm0
    movq    [eax+2*ecx], mm0
    lea     eax, [eax+2*ecx]
    movq    [eax+ecx], mm0
    movq    [eax+2*ecx], mm0

    WELSEMMS
    ret
|
|
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; lt|t0|t1|t2|t3|
|
|
; l0|
|
|
; l1|
|
|
; l2|
|
|
; l3|
|
|
; t3 will never been used
|
|
; destination:
|
|
; |a |b |c |d |
|
|
; |e |f |a |b |
|
|
; |g |h |e |f |
|
|
; |i |j |g |h |
|
|
|
|
; a = (1 + lt + l0)>>1
|
|
; e = (1 + l0 + l1)>>1
|
|
; g = (1 + l1 + l2)>>1
|
|
; i = (1 + l2 + l3)>>1
|
|
|
|
; d = (2 + t0 + (t1<<1) + t2)>>2
|
|
; c = (2 + lt + (t0<<1) + t1)>>2
|
|
; b = (2 + l0 + (lt<<1) + t0)>>2
|
|
|
|
; f = (2 + l1 + (l0<<1) + lt)>>2
|
|
; h = (2 + l2 + (l1<<1) + l0)>>2
|
|
; j = (2 + l3 + (l2<<1) + l1)>>2
|
|
; [b a f e h g j i] + [d c b a] --> mov to memory
|
|
;
|
|
; void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI4x4LumaPredHD_mmx
|
|
; void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
; 4x4 horizontal-down prediction (see the pixel map and a..j formulas in the
; comment block above).  Packs [lt t0 t1 t2 | l0 l1 l2 l3] into one mmx reg,
; forms the 2-tap and 3-tap rounded averages, then interleaves/shifts the
; result rows to memory.
; Clobbers: eax, ecx, edx, mm0-mm4
WelsI4x4LumaPredHD_mmx:
    mov     edx, [esp+4]        ; pPred
    mov     eax, edx
    mov     ecx, [esp+8]        ; kiStride
    sub     eax, ecx
    movd    mm0, [eax-1]        ; mm0 = [xx xx xx xx t2 t1 t0 lt]
    psllq   mm0, 20h            ; mm0 = [t2 t1 t0 lt xx xx xx xx]

    movd    mm1, [eax+2*ecx-4]
    punpcklbw mm1, [eax+ecx-4]  ; mm1[7] = l0, mm1[6] = l1
    lea     eax, [eax+2*ecx]
    movd    mm2, [eax+2*ecx-4]
    punpcklbw mm2, [eax+ecx-4]  ; mm2[7] = l2, mm2[6] = l3
    punpckhwd mm2, mm1          ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
    psrlq   mm2, 20h
    pxor    mm0, mm2            ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]

    movq    mm1, mm0
    psrlq   mm1, 10h            ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
    movq    mm2, mm0
    psrlq   mm2, 8h             ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
    movq    mm3, mm2
    movq    mm4, mm1
    pavgb   mm1, mm0            ; 2-tap averages (rounded up)

    pxor    mm4, mm0            ; find odd value in the lowest bit of each byte
    pand    mm4, [mmx_01bytes]  ; set the odd bit
    psubusb mm1, mm4            ; decrease 1 from odd bytes (undo double round)

    pavgb   mm2, mm1            ; mm2 = [xx xx d c b f h j]  (3-tap averages)

    movq    mm4, mm0
    pavgb   mm3, mm4            ; mm3 = [xx xx xx xx a e g i] (2-tap averages)
    punpcklbw mm3, mm2          ; mm3 = [b a f e h g j i]

    psrlq   mm2, 20h
    psllq   mm2, 30h            ; mm2 = [d c 0 0 0 0 0 0]
    movq    mm4, mm3
    psrlq   mm4, 10h            ; mm4 = [0 0 b a f e h j]
    pxor    mm2, mm4            ; mm2 = [d c b a xx xx xx xx]
    psrlq   mm2, 20h            ; mm2 = [xx xx xx xx d c b a]

    movd    [edx], mm2          ; row 0 = d c b a
    lea     edx, [edx+ecx]
    movd    [edx+2*ecx], mm3    ; row 3 = h g j i
    sub     edx, ecx
    psrlq   mm3, 10h
    movd    [edx+2*ecx], mm3    ; row 2
    psrlq   mm3, 10h
    movd    [edx+ecx], mm3      ; row 1
    WELSEMMS
    ret
|
|
|
|
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; lt|t0|t1|t2|t3|
|
|
; l0|
|
|
; l1|
|
|
; l2|
|
|
; l3|
|
|
; t3 will never been used
|
|
; destination:
|
|
; |a |b |c |d |
|
|
; |c |d |e |f |
|
|
; |e |f |g |g |
|
|
; |g |g |g |g |
|
|
|
|
; a = (1 + l0 + l1)>>1
|
|
; c = (1 + l1 + l2)>>1
|
|
; e = (1 + l2 + l3)>>1
|
|
; g = l3
|
|
|
|
; b = (2 + l0 + (l1<<1) + l2)>>2
|
|
; d = (2 + l1 + (l2<<1) + l3)>>2
|
|
; f = (2 + l2 + (l3<<1) + l3)>>2
|
|
|
|
; [g g f e d c b a] + [g g g g] --> mov to memory
|
|
;
|
|
; void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI4x4LumaPredHU_mmx
|
|
; void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
; 4x4 horizontal-up prediction (formulas a..g in the comment block above).
; Only the four left neighbours l0..l3 are used; rows below the diagonal are
; padded with g = l3.
; Clobbers: eax, ecx, edx, mm0-mm5
WelsI4x4LumaPredHU_mmx:
    mov     edx, [esp+4]        ; pPred
    mov     eax, edx
    mov     ecx, [esp+8]        ; kiStride

    movd    mm0, [eax-4]        ; mm0[3] = l0
    punpcklbw mm0, [eax+ecx-4]  ; mm0[7] = l1, mm0[6] = l0
    lea     eax, [eax+2*ecx]
    movd    mm2, [eax-4]        ; mm2[3] = l2
    movd    mm4, [eax+ecx-4]    ; mm4[3] = l3
    punpcklbw mm2, mm4
    punpckhwd mm0, mm2          ; mm0 = [l3 l2 l1 l0 xx xx xx xx]

    psrlq   mm4, 18h
    psllq   mm4, 38h            ; mm4 = [l3 xx xx xx xx xx xx xx]
    psrlq   mm0, 8h
    pxor    mm0, mm4            ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]

    movq    mm1, mm0
    psllq   mm1, 8h             ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
    movq    mm3, mm1            ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
    pavgb   mm1, mm0            ; mm1 = [g e c a xx xx xx xx]  (2-tap averages)

    movq    mm2, mm0
    psllq   mm2, 10h            ; mm2 = [l2 l1 l0 xx xx xx xx xx]
    movq    mm5, mm2
    pavgb   mm2, mm0

    pxor    mm5, mm0            ; find odd value in the lowest bit of each byte
    pand    mm5, [mmx_01bytes]  ; set the odd bit
    psubusb mm2, mm5            ; decrease 1 from odd bytes

    pavgb   mm2, mm3            ; mm2 = [f d b xx xx xx xx xx] (3-tap averages)

    psrlq   mm2, 8h
    pxor    mm2, mm4            ; mm2 = [g f d b xx xx xx xx]

    punpckhbw mm1, mm2          ; mm1 = [g g f e d c b a]
    punpckhbw mm4, mm4          ; mm4 = [g g xx xx xx xx xx xx]
    punpckhbw mm4, mm4          ; mm4 = [g g g g xx xx xx xx]

    psrlq   mm4, 20h
    lea     edx, [edx+ecx]
    movd    [edx+2*ecx], mm4    ; row 3 = g g g g

    sub     edx, ecx
    movd    [edx], mm1          ; row 0 = d c b a
    psrlq   mm1, 10h
    movd    [edx+ecx], mm1      ; row 1 = f e d c
    psrlq   mm1, 10h
    movd    [edx+2*ecx], mm1    ; row 2 = g g f e
    WELSEMMS
    ret
|
|
|
|
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; lt|t0|t1|t2|t3|
|
|
; l0|
|
|
; l1|
|
|
; l2|
|
|
; l3|
|
|
; l3 will never been used
|
|
; destination:
|
|
; |a |b |c |d |
|
|
; |e |f |g |h |
|
|
; |i |a |b |c |
|
|
; |j |e |f |g |
|
|
|
|
; a = (1 + lt + t0)>>1
|
|
; b = (1 + t0 + t1)>>1
|
|
; c = (1 + t1 + t2)>>1
|
|
; d = (1 + t2 + t3)>>1
|
|
|
|
; e = (2 + l0 + (lt<<1) + t0)>>2
|
|
; f = (2 + lt + (t0<<1) + t1)>>2
|
|
; g = (2 + t0 + (t1<<1) + t2)>>2
|
|
|
|
; h = (2 + t1 + (t2<<1) + t3)>>2
|
|
; i = (2 + lt + (l0<<1) + l1)>>2
|
|
; j = (2 + l0 + (l1<<1) + l2)>>2
|
|
;
|
|
; void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI4x4LumaPredVR_mmx
|
|
; void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
; 4x4 vertical-right prediction (formulas a..j in the comment block above).
; Packs [t3 t2 t1 t0 lt l0 l1 l2] into one mmx reg, computes the 2-tap row
; (a..d) and 3-tap row (e..h plus edge pixels i,j), then assembles each of
; the four output rows by shifting/merging.
; Clobbers: eax, ecx, edx, mm0-mm5
WelsI4x4LumaPredVR_mmx:
    mov     edx, [esp+4]        ; pPred
    mov     eax, edx
    mov     ecx, [esp+8]        ; kiStride
    sub     eax, ecx
    movq    mm0, [eax-1]        ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
    psllq   mm0, 18h            ; mm0 = [t3 t2 t1 t0 lt xx xx xx]

    movd    mm1, [eax+2*ecx-4]
    punpcklbw mm1, [eax+ecx-4]  ; mm1[7] = l0, mm1[6] = l1
    lea     eax, [eax+2*ecx]
    movq    mm2, [eax+ecx-8]    ; mm2[7] = l2
    punpckhwd mm2, mm1          ; mm2 = [l0 l1 l2 xx xx xx xx xx]
    psrlq   mm2, 28h
    pxor    mm0, mm2            ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]

    movq    mm1, mm0
    psllq   mm1, 8h             ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
    pavgb   mm1, mm0            ; mm1 = [d c b a xx xx xx xx]  (2-tap averages)

    movq    mm2, mm0
    psllq   mm2, 10h            ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
    movq    mm3, mm2
    pavgb   mm2, mm0

    pxor    mm3, mm0            ; find odd value in the lowest bit of each byte
    pand    mm3, [mmx_01bytes]  ; set the odd bit
    psubusb mm2, mm3            ; decrease 1 from odd bytes

    movq    mm3, mm0
    psllq   mm3, 8h             ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
    pavgb   mm3, mm2            ; mm3 = [h g f e i j xx xx]  (3-tap averages)
    movq    mm2, mm3

    psrlq   mm1, 20h            ; mm1 = [xx xx xx xx d c b a]
    movd    [edx], mm1          ; row 0 = d c b a

    psrlq   mm2, 20h            ; mm2 = [xx xx xx xx h g f e]
    movd    [edx+ecx], mm2      ; row 1 = h g f e

    movq    mm4, mm3
    psllq   mm4, 20h
    psrlq   mm4, 38h            ; mm4 = [xx xx xx xx xx xx xx i]

    movq    mm5, mm3
    psllq   mm5, 28h
    psrlq   mm5, 38h            ; mm5 = [xx xx xx xx xx xx xx j]

    psllq   mm1, 8h
    pxor    mm4, mm1            ; mm4 = [xx xx xx xx c b a i]
    movd    [edx+2*ecx], mm4    ; row 2 = c b a i

    psllq   mm2, 8h
    pxor    mm5, mm2            ; mm5 = [xx xx xx xx g f e j]
    lea     edx, [edx+2*ecx]
    movd    [edx+ecx], mm5      ; row 3 = g f e j
    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; lt|t0|t1|t2|t3|t4|t5|t6|t7
|
|
; l0|
|
|
; l1|
|
|
; l2|
|
|
; l3|
|
|
; lt,t0,t1,t2,t3 will never been used
|
|
; destination:
|
|
; |a |b |c |d |
|
|
; |b |c |d |e |
|
|
; |c |d |e |f |
|
|
; |d |e |f |g |
|
|
|
|
; a = (2 + t0 + t2 + (t1<<1))>>2
|
|
; b = (2 + t1 + t3 + (t2<<1))>>2
|
|
; c = (2 + t2 + t4 + (t3<<1))>>2
|
|
; d = (2 + t3 + t5 + (t4<<1))>>2
|
|
|
|
; e = (2 + t4 + t6 + (t5<<1))>>2
|
|
; f = (2 + t5 + t7 + (t6<<1))>>2
|
|
; g = (2 + t6 + t7 + (t7<<1))>>2
|
|
|
|
; [g f e d c b a] --> mov to memory
|
|
;
|
|
; void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
|
|
; void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
; 4x4 diagonal-down-left prediction (formulas a..g in the comment block
; above).  Uses only the 8 top neighbours t0..t7; t7 is duplicated at the
; right edge so the last tap exists.
; Clobbers: eax, ecx, edx, mm0-mm3
WelsI4x4LumaPredDDL_mmx:
    mov     edx, [esp+4]        ; pPred
    mov     eax, edx
    mov     ecx, [esp+8]        ; kiStride
    sub     eax, ecx
    movq    mm0, [eax]          ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq    mm1, mm0
    movq    mm2, mm0

    movq    mm3, mm0
    psrlq   mm3, 38h
    psllq   mm3, 38h            ; mm3 = [t7 xx xx xx xx xx xx xx]

    psllq   mm1, 8h             ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
    psrlq   mm2, 8h
    pxor    mm2, mm3            ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1] (t7 duplicated)

    movq    mm3, mm1
    pavgb   mm1, mm2            ; (outer taps + 1) / 2
    pxor    mm3, mm2            ; find odd value in the lowest bit of each byte
    pand    mm3, [mmx_01bytes]  ; set the odd bit
    psubusb mm1, mm3            ; decrease 1 from odd bytes

    pavgb   mm0, mm1            ; mm0 = [g f e d c b a xx]  (3-tap averages)

    ; each output row is the diagonal result shifted one byte further
    psrlq   mm0, 8h
    movd    [edx], mm0          ; row 0 = d c b a
    psrlq   mm0, 8h
    movd    [edx+ecx], mm0      ; row 1 = e d c b
    psrlq   mm0, 8h
    movd    [edx+2*ecx], mm0    ; row 2 = f e d c
    psrlq   mm0, 8h
    lea     edx, [edx+2*ecx]
    movd    [edx+ecx], mm0      ; row 3 = g f e d
    WELSEMMS
    ret
|
|
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; lt|t0|t1|t2|t3|t4|t5|t6|t7
|
|
; l0|
|
|
; l1|
|
|
; l2|
|
|
; l3|
|
|
; lt,t0,t1,t2,t3 will never been used
|
|
; destination:
|
|
; |a |b |c |d |
|
|
; |e |f |g |h |
|
|
; |b |c |d |i |
|
|
; |f |g |h |j |
|
|
|
|
; a = (1 + t0 + t1)>>1
|
|
; b = (1 + t1 + t2)>>1
|
|
; c = (1 + t2 + t3)>>1
|
|
; d = (1 + t3 + t4)>>1
|
|
; i = (1 + t4 + t5)>>1
|
|
|
|
; e = (2 + t0 + (t1<<1) + t2)>>2
|
|
; f = (2 + t1 + (t2<<1) + t3)>>2
|
|
; g = (2 + t2 + (t3<<1) + t4)>>2
|
|
; h = (2 + t3 + (t4<<1) + t5)>>2
|
|
; j = (2 + t4 + (t5<<1) + t6)>>2
|
|
|
|
; [i d c b a] + [j h g f e] --> mov to memory
|
|
;
|
|
; void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI4x4LumaPredVL_mmx
|
|
; void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
; 4x4 vertical-left prediction (formulas a..j in the comment block above).
; Uses top neighbours t0..t6: even rows take the 2-tap averages, odd rows
; the 3-tap averages, each shifted one byte per row pair.
; Clobbers: eax, ecx, edx, mm0-mm4
WelsI4x4LumaPredVL_mmx:
    mov     edx, [esp+4]        ; pPred
    mov     eax, edx
    mov     ecx, [esp+8]        ; kiStride

    sub     eax, ecx
    movq    mm0, [eax]          ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq    mm1, mm0
    movq    mm2, mm0

    psrlq   mm1, 8h             ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
    psrlq   mm2, 10h            ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]

    movq    mm3, mm1
    pavgb   mm3, mm0            ; mm3 = [xx xx xx i d c b a]  (2-tap averages)

    movq    mm4, mm2
    pavgb   mm2, mm0
    pxor    mm4, mm0            ; find odd value in the lowest bit of each byte
    pand    mm4, [mmx_01bytes]  ; set the odd bit
    psubusb mm2, mm4            ; decrease 1 from odd bytes

    pavgb   mm2, mm1            ; mm2 = [xx xx xx j h g f e]  (3-tap averages)

    movd    [edx], mm3          ; row 0 = d c b a
    psrlq   mm3, 8h
    movd    [edx+2*ecx], mm3    ; row 2 = i d c b

    movd    [edx+ecx], mm2      ; row 1 = h g f e
    psrlq   mm2, 8h
    lea     edx, [edx+2*ecx]
    movd    [edx+ecx], mm2      ; row 3 = j h g f
    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
;
|
|
; void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsIChromaPredDc_sse2
|
|
; void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
; 8x8 chroma DC prediction with four independent 4x4 quadrant DC values:
;   top-left  = (top[0..3]  + left[0..3] + 4) >> 3
;   top-right = (top[4..7]            + 2) >> 2
;   bot-left  = (left[4..7]           + 2) >> 2
;   bot-right = (top[4..7] + left[4..7] + 4) >> 3
; Uses MMX despite the _sse2 suffix.  Clobbers eax, ecx, edx, mm0-mm4
; (ebx saved/restored).
WelsIChromaPredDc_sse2:
    push    ebx                 ; ebx accumulates left-column sums
    mov     eax, [esp+8]        ; pPred
    mov     ecx, [esp+12]       ; kiStride

    sub     eax, ecx
    movq    mm0, [eax]          ; 8 top neighbours

    ; sum upper four left pixels -> mm1
    movzx   ebx, byte [eax+ecx-0x01]    ; l1
    lea     eax, [eax+2*ecx]
    movzx   edx, byte [eax-0x01]        ; l2
    add     ebx, edx
    movzx   edx, byte [eax+ecx-0x01]    ; l3
    add     ebx, edx
    lea     eax, [eax+2*ecx]
    movzx   edx, byte [eax-0x01]        ; l4
    add     ebx, edx
    movd    mm1, ebx            ; mm1 = l1+l2+l3+l4

    ; sum lower four left pixels -> mm2
    movzx   ebx, byte [eax+ecx-0x01]    ; l5
    lea     eax, [eax+2*ecx]
    movzx   edx, byte [eax-0x01]        ; l6
    add     ebx, edx
    movzx   edx, byte [eax+ecx-0x01]    ; l7
    add     ebx, edx
    lea     eax, [eax+2*ecx]
    movzx   edx, byte [eax-0x01]        ; l8
    add     ebx, edx
    movd    mm2, ebx            ; mm2 = l5+l6+l7+l8

    ; split the top row into left/right halves and sum each with psadbw
    movq    mm3, mm0
    psrlq   mm0, 0x20           ; top[4..7]
    psllq   mm3, 0x20
    psrlq   mm3, 0x20           ; top[0..3]
    pxor    mm4, mm4
    psadbw  mm0, mm4
    psadbw  mm3, mm4            ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2

    paddq   mm3, mm1            ; top-left: top+left sums
    movq    mm1, mm2
    paddq   mm1, mm0;           ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1

    movq    mm4, [mmx_0x02]     ; rounding constant 2

    paddq   mm0, mm4            ; top-right: (sum + 2) >> 2
    psrlq   mm0, 0x02

    paddq   mm2, mm4            ; bottom-left: (sum + 2) >> 2
    psrlq   mm2, 0x02

    paddq   mm3, mm4            ; top-left: (sum + 4) >> 3
    paddq   mm3, mm4
    psrlq   mm3, 0x03

    paddq   mm1, mm4            ; bottom-right: (sum + 4) >> 3
    paddq   mm1, mm4
    psrlq   mm1, 0x03

    ; broadcast each DC byte across its 4-byte half-row
    pmuludq mm0, [mmx_01bytes]
    pmuludq mm3, [mmx_01bytes]
    psllq   mm0, 0x20
    pxor    mm0, mm3            ; mm0 = m_up   (top 4 rows: TL | TR)

    pmuludq mm2, [mmx_01bytes]
    pmuludq mm1, [mmx_01bytes]
    psllq   mm1, 0x20
    pxor    mm1, mm2            ; mm1 = m_down (bottom 4 rows: BL | BR)

    mov     edx, [esp+8]        ; pPred

    movq    [edx], mm0          ; top 4 rows
    movq    [edx+ecx], mm0
    movq    [edx+2*ecx], mm0
    lea     edx, [edx+2*ecx]
    movq    [edx+ecx], mm0

    movq    [edx+2*ecx], mm1    ; bottom 4 rows
    lea     edx, [edx+2*ecx]
    movq    [edx+ecx], mm1
    movq    [edx+2*ecx], mm1
    lea     edx, [edx+2*ecx]
    movq    [edx+ecx], mm1

    pop     ebx
    WELSEMMS
    ret
|
|
|
|
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
;
|
|
; void_t WelsI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI16x16LumaPredDc_sse2
|
|
WelsI16x16LumaPredDc_sse2:
|
|
push ebx
|
|
mov eax, [esp+8] ; pPred
|
|
mov ecx, [esp+12] ; kiStride
|
|
|
|
sub eax, ecx
|
|
movdqa xmm0, [eax] ; read one row
|
|
pxor xmm1, xmm1
|
|
psadbw xmm0, xmm1
|
|
movdqa xmm1, xmm0
|
|
psrldq xmm1, 0x08
|
|
pslldq xmm0, 0x08
|
|
psrldq xmm0, 0x08
|
|
paddw xmm0, xmm1
|
|
|
|
movzx ebx, byte [eax+ecx-0x01]
|
|
movzx edx, byte [eax+2*ecx-0x01]
|
|
add ebx, edx
|
|
lea eax, [eax+ecx]
|
|
LOAD_2_LEFT_AND_ADD
|
|
LOAD_2_LEFT_AND_ADD
|
|
LOAD_2_LEFT_AND_ADD
|
|
LOAD_2_LEFT_AND_ADD
|
|
LOAD_2_LEFT_AND_ADD
|
|
LOAD_2_LEFT_AND_ADD
|
|
LOAD_2_LEFT_AND_ADD
|
|
add ebx, 0x10
|
|
movd xmm1, ebx
|
|
paddw xmm0, xmm1
|
|
psrld xmm0, 0x05
|
|
pmuludq xmm0, [mmx_01bytes]
|
|
pshufd xmm0, xmm0, 0
|
|
|
|
mov edx, [esp+8] ; pPred
|
|
|
|
movdqa [edx], xmm0
|
|
movdqa [edx+ecx], xmm0
|
|
movdqa [edx+2*ecx], xmm0
|
|
lea edx, [edx+2*ecx]
|
|
|
|
movdqa [edx+ecx], xmm0
|
|
movdqa [edx+2*ecx], xmm0
|
|
lea edx, [edx+2*ecx]
|
|
|
|
movdqa [edx+ecx], xmm0
|
|
movdqa [edx+2*ecx], xmm0
|
|
lea edx, [edx+2*ecx]
|
|
|
|
movdqa [edx+ecx], xmm0
|
|
movdqa [edx+2*ecx], xmm0
|
|
lea edx, [edx+2*ecx]
|
|
|
|
movdqa [edx+ecx], xmm0
|
|
movdqa [edx+2*ecx], xmm0
|
|
lea edx, [edx+2*ecx]
|
|
|
|
movdqa [edx+ecx], xmm0
|
|
movdqa [edx+2*ecx], xmm0
|
|
lea edx, [edx+2*ecx]
|
|
|
|
movdqa [edx+ecx], xmm0
|
|
movdqa [edx+2*ecx], xmm0
|
|
lea edx, [edx+2*ecx]
|
|
|
|
movdqa [edx+ecx], xmm0
|
|
|
|
pop ebx
|
|
|
|
ret
|
|
|
|
;*******************************************************************************
; The functions below were added 19/11/2010: DC-prediction special cases for
; when only the top neighbors, only the left neighbors, or no neighbors at all
; are available.
;*******************************************************************************
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; void_t WelsI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
|
|
WelsI16x16LumaPredDcTop_sse2:
|
|
push ebx
|
|
|
|
%define PUSH_SIZE 4
|
|
|
|
mov eax, [esp+PUSH_SIZE+4] ; pPred
|
|
mov ebx, [esp+PUSH_SIZE+8] ; kiStride
|
|
|
|
mov ecx, ebx
|
|
neg ecx
|
|
movdqa xmm0, [eax+ecx] ; pPred-kiStride, top line
|
|
pxor xmm7, xmm7
|
|
movdqa xmm1, xmm0
|
|
punpcklbw xmm0, xmm7
|
|
punpckhbw xmm1, xmm7
|
|
|
|
paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
|
|
pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
|
|
paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
|
|
pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
|
|
paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
|
|
pshuflw xmm1, xmm0, 0b1h ; 10110001
|
|
paddw xmm0, xmm1 ; sum in word unit (x8)
|
|
movd edx, xmm0
|
|
and edx, 0ffffh
|
|
|
|
add edx, 08h
|
|
sar edx, 04h
|
|
mov dh, dl
|
|
mov ecx, edx
|
|
shl ecx, 010h
|
|
or edx, ecx
|
|
movd xmm1, edx
|
|
pshufd xmm0, xmm1, 00h
|
|
movdqa xmm1, xmm0
|
|
|
|
lea ecx, [2*ebx+ebx] ; 3*kiStride
|
|
|
|
movdqa [eax], xmm0
|
|
movdqa [eax+ebx], xmm1
|
|
movdqa [eax+2*ebx], xmm0
|
|
movdqa [eax+ecx], xmm1
|
|
|
|
lea eax, [eax+4*ebx]
|
|
movdqa [eax], xmm0
|
|
movdqa [eax+ebx], xmm1
|
|
movdqa [eax+2*ebx], xmm0
|
|
movdqa [eax+ecx], xmm1
|
|
|
|
lea eax, [eax+4*ebx]
|
|
movdqa [eax], xmm0
|
|
movdqa [eax+ebx], xmm1
|
|
movdqa [eax+2*ebx], xmm0
|
|
movdqa [eax+ecx], xmm1
|
|
|
|
lea eax, [eax+4*ebx]
|
|
movdqa [eax], xmm0
|
|
movdqa [eax+ebx], xmm1
|
|
movdqa [eax+2*ebx], xmm0
|
|
movdqa [eax+ecx], xmm1
|
|
|
|
%undef PUSH_SIZE
|
|
pop ebx
|
|
ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; void_t WelsI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
|
|
WelsI16x16LumaPredDcNA_sse2:
|
|
push ebx
|
|
|
|
%define PUSH_SIZE 4
|
|
|
|
mov eax, [esp+PUSH_SIZE+4] ; pPred
|
|
mov ebx, [esp+PUSH_SIZE+8] ; kiStride
|
|
|
|
lea ecx, [2*ebx+ebx] ; 3*kiStride
|
|
|
|
movdqa xmm0, [sse2_dc_0x80]
|
|
movdqa xmm1, xmm0
|
|
movdqa [eax], xmm0
|
|
movdqa [eax+ebx], xmm1
|
|
movdqa [eax+2*ebx], xmm0
|
|
movdqa [eax+ecx], xmm1
|
|
lea eax, [eax+4*ebx]
|
|
movdqa [eax], xmm0
|
|
movdqa [eax+ebx], xmm1
|
|
movdqa [eax+2*ebx], xmm0
|
|
movdqa [eax+ecx], xmm1
|
|
lea eax, [eax+4*ebx]
|
|
movdqa [eax], xmm0
|
|
movdqa [eax+ebx], xmm1
|
|
movdqa [eax+2*ebx], xmm0
|
|
movdqa [eax+ecx], xmm1
|
|
lea eax, [eax+4*ebx]
|
|
movdqa [eax], xmm0
|
|
movdqa [eax+ebx], xmm1
|
|
movdqa [eax+2*ebx], xmm0
|
|
movdqa [eax+ecx], xmm1
|
|
|
|
%undef PUSH_SIZE
|
|
|
|
pop ebx
|
|
ret
|
|
|
|
ALIGN 16
;*******************************************************************************
; void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
;
; 8x8 chroma intra DC prediction when only the LEFT neighbors are available.
; Upper 4 rows are filled with (sum(left[0..3]) + 2) >> 2, lower 4 rows with
; (sum(left[4..7]) + 2) >> 2 (both 4x4 DC blocks in a half share the left-only
; value per the H.264 chroma DC rule).
; cdecl x86-32; clobbers eax, ecx, edx, mm0-mm3 (ebx/esi saved; emms at exit).
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDcLeft_mmx
WelsIChromaPredDcLeft_mmx:
    push ebx
    push esi
%define PUSH_SIZE 8
    mov esi, [esp+PUSH_SIZE+4]          ; esi = pPred
    mov ecx, [esp+PUSH_SIZE+8]          ; ecx = kiStride
    mov eax, esi
    ; Accumulate left[0..3] into ebx (only dl/bl are ever written, and the
    ; upper bits of ebx/edx stay zero throughout, so byte loads suffice).
    dec eax                             ; eax -> left pixel of row 0
    xor ebx, ebx
    xor edx, edx
    mov bl, [eax]                       ; left[0]
    mov dl, [eax+ecx]                   ; left[1]
    add ebx, edx
    lea eax, [eax+2*ecx]
    mov dl, [eax]                       ; left[2]
    add ebx, edx
    mov dl, [eax+ecx]                   ; left[3]
    add ebx, edx
    add ebx, 02h                        ; rounding
    sar ebx, 02h                        ; upper DC = (sum + 2) >> 2, fits in bl
    mov bh, bl                          ; replicate DC byte into bx
    movd mm1, ebx
    pshufw mm0, mm1, 00h                ; broadcast word -> 8 DC bytes (up64)
    movq mm1, mm0
    ; Accumulate left[4..7] the same way.
    xor ebx, ebx
    lea eax, [eax+2*ecx]
    mov bl, [eax]                       ; left[4]
    mov dl, [eax+ecx]                   ; left[5]
    add ebx, edx
    lea eax, [eax+2*ecx]
    mov dl, [eax]                       ; left[6]
    add ebx, edx
    mov dl, [eax+ecx]                   ; left[7]
    add ebx, edx
    add ebx, 02h
    sar ebx, 02h                        ; lower DC
    mov bh, bl
    movd mm3, ebx
    pshufw mm2, mm3, 00h                ; broadcast -> 8 DC bytes (down64)
    movq mm3, mm2
    lea ebx, [2*ecx+ecx]                ; ebx = 3*kiStride
    ; Rows 0..3 get the upper DC, rows 4..7 the lower DC.
    movq [esi], mm0
    movq [esi+ecx], mm1
    movq [esi+2*ecx], mm0
    movq [esi+ebx], mm1
    lea esi, [esi+4*ecx]
    movq [esi], mm2
    movq [esi+ecx], mm3
    movq [esi+2*ecx], mm2
    movq [esi+ebx], mm3
%undef PUSH_SIZE                        ; keep macro scoped to this function, like siblings
    pop esi
    pop ebx
    emms
    ret
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; void_t WelsIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsIChromaPredDcTop_sse2
|
|
WelsIChromaPredDcTop_sse2:
|
|
push ebx
|
|
%define PUSH_SIZE 4
|
|
mov eax, [esp+PUSH_SIZE+4] ; pPred
|
|
mov ecx, [esp+PUSH_SIZE+8] ; kiStride
|
|
mov ebx, ecx
|
|
neg ebx
|
|
movq xmm0, [eax+ebx] ; top: 8x1 pixels
|
|
pxor xmm7, xmm7
|
|
punpcklbw xmm0, xmm7 ; ext 8x2 words
|
|
pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
|
|
paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
|
|
movdqa xmm1, xmm0
|
|
pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
|
|
pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
|
|
paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
|
|
paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
|
|
punpckhqdq xmm1, xmm7
|
|
punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
|
|
movdqa xmm6, [sse2_wd_0x02]
|
|
paddw xmm0, xmm6
|
|
psraw xmm0, 02h
|
|
packuswb xmm0, xmm7
|
|
lea ebx, [2*ecx+ecx]
|
|
movq [eax], xmm0
|
|
movq [eax+ecx], xmm0
|
|
movq [eax+2*ecx], xmm0
|
|
movq [eax+ebx], xmm0
|
|
lea eax, [eax+4*ecx]
|
|
movq [eax], xmm0
|
|
movq [eax+ecx], xmm0
|
|
movq [eax+2*ecx], xmm0
|
|
movq [eax+ebx], xmm0
|
|
%undef PUSH_SIZE
|
|
pop ebx
|
|
ret
|
|
|
|
|
|
ALIGN 16
|
|
;*******************************************************************************
|
|
; void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
|
|
;*******************************************************************************
|
|
WELS_EXTERN WelsIChromaPredDcNA_mmx
|
|
WelsIChromaPredDcNA_mmx:
|
|
push ebx
|
|
%define PUSH_SIZE 4
|
|
mov eax, [esp+PUSH_SIZE+4] ; pPred
|
|
mov ebx, [esp+PUSH_SIZE+8] ; kiStride
|
|
lea ecx, [2*ebx+ebx]
|
|
movq mm0, [sse2_dc_0x80]
|
|
movq mm1, mm0
|
|
movq [eax], mm0
|
|
movq [eax+ebx], mm1
|
|
movq [eax+2*ebx], mm0
|
|
movq [eax+ecx], mm1
|
|
lea eax, [eax+4*ebx]
|
|
movq [eax], mm0
|
|
movq [eax+ebx], mm1
|
|
movq [eax+2*ebx], mm0
|
|
movq [eax+ecx], mm1
|
|
%undef PUSH_SIZE
|
|
pop ebx
|
|
emms
|
|
ret
|
|
|
|
|
|
|