3cf52554f7
According to the Win64 ABI, these registers need to be preserved, and compilers are allowed to rely on their content to stay available - not only for float usage but for any usage, anywhere, in the calling C++ code. This adds a macro which pushes the clobbered registers onto the stack if targeting win64 (and a matching one which restores them). The parameter to the macro is the number of xmm registers used (e.g. if using xmm0 - xmm7, the parameter is 8), or in other words, the number of the highest xmm register used plus one. This is similar to how the same issue is handled for the NEON registers q4-q7 with the vpush instruction, except that they needed to be preserved on all platforms, not only on one particular platform. This allows removing the XMMREG_PROTECT_* hacks, which can easily fail if the compiler chooses to use the callee saved xmm registers in an unexpected spot.
1457 lines
37 KiB
NASM
1457 lines
37 KiB
NASM
;*!
|
|
;* \copy
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
;* All rights reserved.
|
|
;*
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
;* modification, are permitted provided that the following conditions
|
|
;* are met:
|
|
;*
|
|
;* * Redistributions of source code must retain the above copyright
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
;*
|
|
;* * Redistributions in binary form must reproduce the above copyright
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
;* the documentation and/or other materials provided with the
|
|
;* distribution.
|
|
;*
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
;*
|
|
;*
|
|
;* intra_pred.asm
|
|
;*
|
|
;* Abstract
|
|
;* sse2 function for intra predict operations
|
|
;*
|
|
;* History
|
|
;* 18/09/2009 Created
|
|
;*
|
|
;*
|
|
;*************************************************************************/
|
|
%include "asm_inc.asm"
|
|
|
|
;***********************************************************************
|
|
; Local Data (Read Only)
|
|
;***********************************************************************
|
|
|
|
%ifdef FORMAT_COFF
SECTION .rodata pData
%else
SECTION .rodata align=16
%endif

; Word weight tables used by WelsI16x16LumaPredPlane_sse2.
; Each table is 16-byte aligned because it is loaded with movdqa.
align 16
sse2_plane_inc_minus:   dw  -7, -6, -5, -4, -3, -2, -1,  0
align 16
sse2_plane_inc:         dw   1,  2,  3,  4,  5,  6,  7,  8
align 16
sse2_plane_dec:         dw   8,  7,  6,  5,  4,  3,  2,  1

; for chroma plane mode (WelsIChromaPredPlane_sse2)
sse2_plane_inc_c:       dw   1,  2,  3,  4
sse2_plane_dec_c:       dw   4,  3,  2,  1
align 16
sse2_plane_mul_b_c:     dw  -3, -2, -1,  0,  1,  2,  3,  4

; Sixteen bytes of 0x01: used as a 0x01010101 multiplier to replicate a
; byte, and as a per-byte "lowest bit" mask.
align 16
mmx_01bytes:            times 16 db 1
;align 16
;sse_0x0004bytes:       times 8 dw 4
;ALIGN 16
;sse_f000               db 255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

; 64-bit constant 2: rounding term for the chroma DC averages.
align 16
mmx_0x02:               dw  0x02, 0x00, 0x00, 0x00
|
|
|
|
|
|
;***********************************************************************
|
|
; macros
|
|
;***********************************************************************
|
|
; SSE_DB_1_2REG dst, tmp
;   Fills every byte of dst (%1) with 1, computed as 0 - (-1).
;   tmp (%2) is left with all bits set.
%macro SSE_DB_1_2REG 2
    pxor        %1, %1              ; dst = 0
    pcmpeqw     %2, %2              ; tmp = 0xFF in every byte
    psubb       %1, %2              ; dst = 0 - (-1) = 1 per byte
%endmacro
|
|
|
|
; SSE2_PRED_H_4X4_TWO_LINE out, tmp_row, tmp, ptr, stride
;   (originally documented with: xmm0, xmm1, xmm2, eax, ecx)
;   Takes the byte at [ptr-1] and the byte at [ptr+stride-1], replicates
;   each one four times, and leaves the two 4-byte groups in the low
;   64 bits of out (%1). %2 and %3 are clobbered; ptr is not advanced.
%macro SSE2_PRED_H_4X4_TWO_LINE 5
    ; first line: byte 0 -> 2 copies -> 4 copies
    movd        %1, [%4-1]
    movdqa      %3, %1
    punpcklbw   %1, %3
    movdqa      %3, %1
    punpcklbw   %1, %3

    ;add        %4, %5
    ; second line, addressed as ptr+stride instead of advancing ptr
    movd        %2, [%4+%5-1]
    movdqa      %3, %2
    punpcklbw   %2, %3
    movdqa      %3, %2
    punpcklbw   %2, %3

    punpckldq   %1, %2              ; low qword = [line1 x4 | line0 x4]
%endmacro
|
|
|
|
; SUMW_HORIZON1 acc, tmp
;   Horizontal add of the eight words of acc (%1) by repeated halving,
;   using unsigned saturating adds. The total ends up in the low word of
;   acc; the other words hold partial sums. tmp (%2) is clobbered.
%macro SUMW_HORIZON1 2
    movdqa      %2, %1
    psrldq      %2, 8               ; bring words 4..7 down
    paddusw     %1, %2
    movdqa      %2, %1
    psrldq      %2, 4               ; bring words 2..3 down
    paddusw     %1, %2
    movdqa      %2, %1
    psrldq      %2, 2               ; bring word 1 down
    paddusw     %1, %2
%endmacro
|
|
|
|
; LOAD_COLUMN out, t0, t1, t2, ptr, stride
;   Reads 4 bytes from each of 8 consecutive rows starting at ptr and
;   transposes them. On exit the upper 8 bytes of out (%1) hold byte 3 of
;   rows 0..7 (the next-lower 8 bytes hold byte 2 of those rows).
;   ptr (%5) is advanced by 8*stride; %2, %3 and %4 are clobbered.
%macro LOAD_COLUMN 6
    ; rows 0..3 -> %1
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2              ; rows 0/1 byte-interleaved
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2              ; rows 2/3 byte-interleaved
    punpcklwd   %1, %3              ; dword k = byte k of rows 0..3
    lea         %5, [%5+2*%6]

    ; rows 4..7 -> %4
    movd        %4, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %4, %2              ; rows 4/5 byte-interleaved
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    lea         %5, [%5+2*%6]
    punpcklbw   %3, %2              ; rows 6/7 byte-interleaved
    punpcklwd   %4, %3              ; dword k = byte k of rows 4..7

    punpckhdq   %1, %4              ; keep bytes 2 and 3 of all 8 rows
%endmacro
|
|
|
|
; SUMW_HORIZON acc, tmp, hi
;   Horizontal add of the eight words d0..d7 of acc (%1). Words are first
;   paired with the words of hi (%3) to form dwords, then the dwords are
;   added; the low 16 bits of acc end up holding (d0+...+d7) mod 2^16.
;   tmp (%2) is clobbered. The "xx" lanes below are don't-care.
%macro SUMW_HORIZON 3
    movhlps     %2, %1              ; tmp = xx xx xx xx d7 d6 d5 d4
    paddw       %1, %2              ; acc = xx xx xx xx d37 d26 d15 d04
    punpcklwd   %1, %3              ; widen: acc = d37 d26 d15 d04 (dwords)
    movhlps     %2, %1              ; tmp = xxxx xxxx d37 d26
    paddd       %1, %2              ; acc = xxxx xxxx d1357 d0246
    pshuflw     %2, %1, 0x4e        ; tmp = xxxx xxxx d0246 d1357
    paddd       %1, %2              ; acc = xxxx xxxx xxxx d01234567
%endmacro
|
|
|
|
|
|
; COPY_16_TIMES ptr, out
;   Broadcasts the byte at [ptr-1] to all 16 bytes of out (%2).
;   Uses an aligned 16-byte load from [ptr-16], so ptr must be 16-byte
;   aligned.
%macro COPY_16_TIMES 2
    movdqa      %2, [%1-16]
    psrldq      %2, 15              ; isolate the last byte (= [ptr-1])
    pmuludq     %2, [mmx_01bytes]   ; * 0x01010101 -> 4 copies in dword 0
    pshufd      %2, %2, 0           ; dword 0 -> all four dwords
%endmacro
|
|
|
|
; COPY_16_TIMESS ptr, out, offset
;   Same as COPY_16_TIMES but addressed at ptr+offset: broadcasts the byte
;   at [ptr+offset-1] to all 16 bytes of out (%2). The load from
;   [ptr+offset-16] is an aligned one (movdqa).
%macro COPY_16_TIMESS 3
    movdqa      %2, [%1+%3-16]
    psrldq      %2, 15              ; isolate the last byte
    pmuludq     %2, [mmx_01bytes]   ; * 0x01010101 -> 4 copies in dword 0
    pshufd      %2, %2, 0           ; dword 0 -> all four dwords
%endmacro
|
|
|
|
; LOAD_COLUMN_C out, t0, t1, t2, ptr, stride
;   Four-row variant of LOAD_COLUMN (invoked with MMX registers in this
;   file). Reads 4 bytes from each of 4 consecutive rows starting at ptr;
;   on exit the upper 4 bytes of out (%1) hold byte 3 of rows 0..3.
;   ptr (%5) is advanced by 4*stride; %2 and %3 are clobbered.
;   %4 is accepted but not used.
%macro LOAD_COLUMN_C 6
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2              ; rows 0/1 byte-interleaved
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2              ; rows 2/3 byte-interleaved
    punpckhwd   %1, %3              ; keep bytes 2 and 3 of rows 0..3
    lea         %5, [%5+2*%6]
%endmacro
|
|
|
|
; LOAD_2_LEFT_AND_ADD
;   Steps r1 down two rows (r1 += 2*r2) and adds the bytes at [r1-1] and
;   [r1+r2-1] to the running sum in r3. r4 is used as scratch.
%macro LOAD_2_LEFT_AND_ADD 0
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]
    add         r3, r4
    movzx       r4, byte [r1+r2-0x01]
    add         r3, r4
%endmacro
|
|
|
|
;***********************************************************************
|
|
; Code
|
|
;***********************************************************************
|
|
|
|
SECTION .text
|
|
WELS_EXTERN WelsI4x4LumaPredH_sse2
|
|
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
|
|
WELS_EXTERN WelsI4x4LumaPredDc_sse2
|
|
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   4x4 horizontal prediction: each output row is the byte immediately
;   left of that row in the reference, repeated four times.
;   pred must be 16-byte aligned (stored with movdqa).
;   r0/r1/r2 are presumably pred/pRef/stride once LOAD_3_PARA (from
;   asm_inc.asm) has run.
;***********************************************************************
WelsI4x4LumaPredH_sse2:
    push        r3
    %assign     push_num 1
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; row 0: left byte * 0x01010101 -> 4 copies in the low dword
    movzx       r3, byte [r1-1]
    movd        xmm0, r3d
    pmuludq     xmm0, [mmx_01bytes]

    ; row 1
    movzx       r3, byte [r1+r2-1]
    movd        xmm1, r3d
    pmuludq     xmm1, [mmx_01bytes]

    unpcklps    xmm0, xmm1          ; low qword = row1 : row0

    ; row 2
    lea         r1, [r1+r2*2]
    movzx       r3, byte [r1-1]
    movd        xmm2, r3d
    pmuludq     xmm2, [mmx_01bytes]

    ; row 3
    movzx       r3, byte [r1+r2-1]
    movd        xmm3, r3d
    pmuludq     xmm3, [mmx_01bytes]

    unpcklps    xmm2, xmm3          ; low qword = row3 : row2
    unpcklpd    xmm0, xmm2          ; xmm0 = row3 : row2 : row1 : row0

    movdqa      [r0], xmm0
    pop         r3
    ret
|
|
|
|
;***********************************************************************
;   void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;
;   16x16 plane prediction. Computes the horizontal gradient b from the
;   row above, the vertical gradient c from the column to the left, and a
;   base value from the two far corners, then writes 16 rows of 16 bytes
;   to pred (aligned stores, 16 bytes apart).
;   Uses xmm0-xmm7, hence PUSH_XMM 8 / POP_XMM.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WelsI16x16LumaPredPlane_sse2:
    push        r3
    push        r4
    %assign     push_num 2
    LOAD_3_PARA
    PUSH_XMM    8
    SIGN_EXTENSION r2, r2d
    sub         r1, 1
    sub         r1, r2                      ; r1 -> top-left neighbour

    ; ---- H: weighted difference across the top row ----
    pxor        xmm7, xmm7
    movq        xmm0, [r1]
    movdqa      xmm5, [sse2_plane_dec]
    punpcklbw   xmm0, xmm7
    pmullw      xmm0, xmm5
    movq        xmm1, [r1 + 9]
    movdqa      xmm6, [sse2_plane_inc]
    punpcklbw   xmm1, xmm7
    pmullw      xmm1, xmm6
    psubw       xmm1, xmm0

    ; Only the low 16 bits of the sum are consumed (movsx from r3w), so
    ; the contents of xmm2 passed as the widening operand do not matter.
    SUMW_HORIZON xmm1, xmm0, xmm2
    movd        r3d, xmm1                   ; H += (i + 1) * (top[8 + i] - top[6 - i]);
    movsx       r3, r3w
    imul        r3, 5
    add         r3, 32
    sar         r3, 6                       ; b = (5 * H + 32) >> 6;
    SSE2_Copy8Times xmm1, r3d               ; xmm1 = b,b,b,b,b,b,b,b

    ; ---- V: weighted difference down the left column ----
    movzx       r4, BYTE [r1+16]            ; top[15]
    sub         r1, 3                       ; left column becomes byte 3 of each 4-byte load
    LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2

    add         r1, 3
    movzx       r3, BYTE [r1+8*r2]
    add         r4, r3
    shl         r4, 4                       ; a = (left[15*stride] + top[15]) << 4;

    sub         r1, 3
    add         r1, r2                      ; skip one row before the lower half
    LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
    pxor        xmm4, xmm4
    punpckhbw   xmm0, xmm4                  ; upper column bytes -> words
    pmullw      xmm0, xmm5
    punpckhbw   xmm7, xmm4                  ; lower column bytes -> words
    pmullw      xmm7, xmm6
    psubw       xmm7, xmm0

    SUMW_HORIZON xmm7, xmm0, xmm2
    movd        r3d, xmm7                   ; V
    movsx       r3, r3w
    imul        r3, 5
    add         r3, 32
    sar         r3, 6                       ; c = (5 * V + 32) >> 6;
    SSE2_Copy8Times xmm4, r3d               ; xmm4 = c,c,c,c,c,c,c,c

    ; ---- row-0 base value ----
    add         r4, 16
    imul        r3, -7
    add         r3, r4                      ; s = a + 16 + (-7)*c
    SSE2_Copy8Times xmm0, r3d               ; xmm0 = s,s,s,s,s,s,s,s

    xor         r3, r3                      ; row counter
    movdqa      xmm5, [sse2_plane_inc_minus]

    ; ---- emit 16 rows; xmm0 advances by c per row ----
get_i16x16_luma_pred_plane_sse2_1:
    movdqa      xmm2, xmm1
    pmullw      xmm2, xmm5                  ; b * (-7..0)  : columns 0..7
    paddw       xmm2, xmm0
    psraw       xmm2, 5
    movdqa      xmm3, xmm1
    pmullw      xmm3, xmm6                  ; b * (1..8)   : columns 8..15
    paddw       xmm3, xmm0
    psraw       xmm3, 5
    packuswb    xmm2, xmm3                  ; clamp to 0..255 and pack
    movdqa      [r0], xmm2
    paddw       xmm0, xmm4
    add         r0, 16
    inc         r3
    cmp         r3, 16
    jnz         get_i16x16_luma_pred_plane_sse2_1

    POP_XMM
    pop         r4
    pop         r3
    ret
|
|
|
|
;***********************************************************************
|
|
; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
|
|
;***********************************************************************
|
|
|
|
; SSE2_PRED_H_16X16_ONE_LINE
;   Advances the output pointer r0 by 16 and the source pointer r1 by one
;   row (r2), then stores 16 bytes built from the byte at [r1].
;   SSE2_Copy16Times (asm_inc.asm) presumably broadcasts that byte to all
;   16 lanes of xmm0. r3 is scratch.
%macro SSE2_PRED_H_16X16_ONE_LINE 0
    add         r0, 16
    add         r1, r2
    movzx       r3, byte [r1]
    SSE2_Copy16Times xmm0, r3d
    movdqa      [r0], xmm0
%endmacro
|
|
|
|
WELS_EXTERN WelsI16x16LumaPredH_sse2
; 16x16 horizontal prediction: row k of pred is filled from the byte just
; left of row k in the reference. Row 0 is handled inline; the other 15
; rows come from SSE2_PRED_H_16X16_ONE_LINE.
; r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
WelsI16x16LumaPredH_sse2:
    push        r3
    %assign     push_num 1
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    dec         r1                          ; r1 -> byte left of row 0
    movzx       r3, byte [r1]
    SSE2_Copy16Times xmm0, r3d
    movdqa      [r0], xmm0

%rep 15                                     ; rows 1..15
    SSE2_PRED_H_16X16_ONE_LINE
%endrep

    pop         r3
    ret
|
|
|
|
;***********************************************************************
;   void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;
;   16x16 vertical prediction: the 16 bytes of the row above pRef are
;   copied into all 16 rows of pred. Both the load and the stores are
;   aligned (movdqa).
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredV_sse2
WelsI16x16LumaPredV_sse2:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                      ; r1 -> row above the block
    movdqa      xmm0, [r1]

    movdqa      [r0],      xmm0
    movdqa      [r0+0x10], xmm0
    movdqa      [r0+0x20], xmm0
    movdqa      [r0+0x30], xmm0
    movdqa      [r0+0x40], xmm0
    movdqa      [r0+0x50], xmm0
    movdqa      [r0+0x60], xmm0
    movdqa      [r0+0x70], xmm0
    movdqa      [r0+0x80], xmm0
    movdqa      [r0+0x90], xmm0
    movdqa      [r0+0xa0], xmm0
    movdqa      [r0+0xb0], xmm0
    movdqa      [r0+0xc0], xmm0
    movdqa      [r0+0xd0], xmm0
    movdqa      [r0+0xe0], xmm0
    movdqa      [r0+0xf0], xmm0

    ret
|
|
|
|
;***********************************************************************
;   void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;
;   8x8 chroma plane prediction. Computes the horizontal gradient b from
;   the row above, the vertical gradient c from the column to the left,
;   and a base value from the two far corners, then writes 8 rows of
;   8 bytes to pred (8 bytes apart).
;   Touches xmm registers up to xmm7 (PUSH_XMM 8 / POP_XMM) and MMX
;   registers (WELSEMMS before returning).
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;
;   The two top-row loads use movd: only four bytes of each are consumed
;   by the following punpcklbw, so an 8-byte load would read four bytes
;   past top[7] for no benefit.
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
WelsIChromaPredPlane_sse2:
    push        r3
    push        r4
    %assign     push_num 2
    LOAD_3_PARA
    PUSH_XMM    8
    SIGN_EXTENSION r2, r2d
    sub         r1, 1
    sub         r1, r2                      ; r1 -> top-left neighbour

    ; ---- H: weighted difference across the top row ----
    pxor        mm7, mm7
    movd        mm0, [r1]                   ; 4 bytes starting at the top-left neighbour
    movq        mm5, [sse2_plane_dec_c]
    punpcklbw   mm0, mm7
    pmullw      mm0, mm5
    movd        mm1, [r1 + 5]               ; top[4..7]
    movq        mm6, [sse2_plane_inc_c]
    punpcklbw   mm1, mm7
    pmullw      mm1, mm6
    psubw       mm1, mm0

    movq2dq     xmm1, mm1
    pxor        xmm2, xmm2
    SUMW_HORIZON xmm1, xmm0, xmm2
    movd        r3d, xmm1
    movsx       r3, r3w
    imul        r3, 17
    add         r3, 16
    sar         r3, 5                       ; b = (17 * H + 16) >> 5;
    SSE2_Copy8Times xmm1, r3d               ; xmm1 = b,b,b,b,b,b,b,b

    ; ---- V: weighted difference down the left column ----
    movzx       r3, BYTE [r1+8]             ; top[7]
    sub         r1, 3                       ; left column becomes byte 3 of each 4-byte load
    LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2

    add         r1, 3
    movzx       r4, BYTE [r1+4*r2]
    add         r4, r3
    shl         r4, 4                       ; a = (left[7*stride] + top[7]) << 4;

    sub         r1, 3
    add         r1, r2                      ; skip one row before the lower half
    LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
    pxor        mm4, mm4
    punpckhbw   mm0, mm4                    ; upper column bytes -> words
    pmullw      mm0, mm5
    punpckhbw   mm7, mm4                    ; lower column bytes -> words
    pmullw      mm7, mm6
    psubw       mm7, mm0

    movq2dq     xmm7, mm7
    pxor        xmm2, xmm2
    SUMW_HORIZON xmm7, xmm0, xmm2
    movd        r3d, xmm7                   ; V
    movsx       r3, r3w
    imul        r3, 17
    add         r3, 16
    sar         r3, 5                       ; c = (17 * V + 16) >> 5;
    SSE2_Copy8Times xmm4, r3d               ; xmm4 = c,c,c,c,c,c,c,c

    ; ---- row-0 base value ----
    add         r4, 16
    imul        r3, -3
    add         r3, r4                      ; s = a + 16 + (-3)*c
    SSE2_Copy8Times xmm0, r3d               ; xmm0 = s,s,s,s,s,s,s,s

    xor         r3, r3                      ; row counter
    movdqa      xmm5, [sse2_plane_mul_b_c]

    ; ---- emit 8 rows; xmm0 advances by c per row ----
get_i_chroma_pred_plane_sse2_1:
    movdqa      xmm2, xmm1
    pmullw      xmm2, xmm5                  ; b * (-3..4)
    paddw       xmm2, xmm0
    psraw       xmm2, 5
    packuswb    xmm2, xmm2                  ; clamp to 0..255 and pack
    movq        [r0], xmm2
    paddw       xmm0, xmm4
    add         r0, 8
    inc         r3
    cmp         r3, 8
    jnz         get_i_chroma_pred_plane_sse2_1

    POP_XMM
    pop         r4
    pop         r3
    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   Pixel numbering used in the comments below:
;   0 |1 |2 |3 |4 |
;   6 |7 |8 |9 |10|
;   11|12|13|14|15|
;   16|17|18|19|20|
;   21|22|23|24|25|
;   7 is the first pixel of the current 4x4 block
;   pred[7] = ([6]+[0]*2+[1]+2)/4
;
;   void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Lane comments count bytes from 1 (lowest) to 8 (highest).
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WelsI4x4LumaPredDDR_mmx:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; Gather the neighbours into one register; the loads end at the
    ; wanted pixel (address - 8) so that it lands in the top byte.
    movq        mm1, [r1+r2-8]      ; mm1[8] = [11]
    movq        mm2, [r1-8]         ; mm2[8] = [6]
    sub         r1, r2              ; r1 -> row above the block (position of 1)
    punpckhbw   mm2, [r1-8]         ; mm2[8] = [0], mm2[7] = [6]
    movd        mm3, [r1]           ; mm3[1] = [1], mm3[2] = [2], mm3[3] = [3]
    punpckhwd   mm1, mm2            ; mm1[8] = [0], mm1[7] = [6], mm1[6] = [11]
    psllq       mm3, 0x18           ; mm3[4] = [1]
    psrlq       mm1, 0x28           ; mm1[3] = [0], mm1[2] = [6], mm1[1] = [11]
    por         mm3, mm1            ; mm3[6..1] = [3] [2] [1] [0] [6] [11]
    movq        mm1, mm3            ; mm1 = same arrangement
    lea         r1, [r1+r2*2-8]     ; r1 = (row of [11]) - 8
    movq        mm4, [r1+r2]        ; mm4[8] = [16]
    psllq       mm3, 8              ; mm3[7..2] = [3] [2] [1] [0] [6] [11], mm3[1] = 0
    psrlq       mm4, 0x38           ; mm4[1] = [16]
    por         mm3, mm4            ; mm3[7..1] = [3] [2] [1] [0] [6] [11] [16]
    movq        mm2, mm3            ; mm2 = same arrangement
    movq        mm4, [r1+r2*2]      ; mm4[8] = [21]
    psllq       mm3, 8              ; mm3[8..2] = [3] [2] [1] [0] [6] [11] [16], mm3[1] = 0
    psrlq       mm4, 0x38           ; mm4[1] = [21]
    por         mm3, mm4            ; mm3[8..1] = [3] [2] [1] [0] [6] [11] [16] [21]
    movq        mm4, mm3            ; mm4 = same arrangement

    ; (a + 2*b + c + 2) >> 2 per byte, built from two pavgb steps.
    pavgb       mm3, mm1            ; rounded-up average of the outer taps
    pxor        mm1, mm4            ; low bit set where that average rounded up
    pand        mm1, [mmx_01bytes]
    psubusb     mm3, mm1            ; undo the round-up
    pavgb       mm2, mm3            ; average with the centre tap

    ; Each row is the previous one shifted by one pixel.
    movd        [r0+12], mm2
    psrlq       mm2, 8
    movd        [r0+8], mm2
    psrlq       mm2, 8
    movd        [r0+4], mm2
    psrlq       mm2, 8
    movd        [r0], mm2
    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   Pixel numbering used in the comments below:
;   0 |1 |2 |3 |4 |
;   5 |6 |7 |8 |9 |
;   10|11|12|13|14|
;   15|16|17|18|19|
;   20|21|22|23|24|
;   6 is the first pixel of the current 4x4 block
;   pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
;   void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   4x4 DC prediction: all 16 bytes of pred receive the rounded mean of
;   the four pixels above and the four pixels to the left. pred is
;   written with an aligned 16-byte store.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WelsI4x4LumaPredDc_sse2:
    push        r3
    push        r4
    %assign     push_num 2
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    movzx       r4, byte [r1-1]             ; left pixel of row 0
    sub         r1, r2                      ; r1 -> row above the block
    movd        xmm0, [r1]                  ; four top pixels
    pxor        xmm1, xmm1
    psadbw      xmm0, xmm1                  ; sum of the four top bytes
    movd        r3d, xmm0                   ; overwrites r3 entirely; no pre-clear needed
    add         r3, r4
    movzx       r4, byte [r1+r2*2-1]        ; left pixel of row 1
    add         r3, r4

    lea         r1, [r1+r2*2-1]             ; r1 -> left pixel of row 1
    movzx       r4, byte [r1+r2]            ; left pixel of row 2
    add         r3, r4

    movzx       r4, byte [r1+r2*2]          ; left pixel of row 3
    add         r3, r4
    add         r3, 4
    sar         r3, 3                       ; (sum + 4) >> 3
    imul        r3, 0x01010101              ; replicate the DC byte four times

    movd        xmm0, r3d
    pshufd      xmm0, xmm0, 0               ; ... and across all 16 bytes
    movdqa      [r0], xmm0
    pop         r4
    pop         r3
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
|
|
; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
|
|
; copy 8 pixel of 8 line from left
|
|
;***********************************************************************
|
|
; MMX_PRED_H_8X8_ONE_LINE reg, unused, src, dst
;   Stores 8 copies of the byte at [src-1] to [dst], using reg (%1) as
;   the working MMX register. The second parameter is accepted but not
;   used.
%macro MMX_PRED_H_8X8_ONE_LINE 4
    movq        %1, [%3-8]              ; wanted byte is the top byte of the load
    psrlq       %1, 0x38                ; move it down to byte 0

    ;pmuludq    %1, [mmx_01bytes]       ;extend to 4 bytes
    pmullw      %1, [mmx_01bytes]       ; * 0x0101 -> 2 copies in word 0
    pshufw      %1, %1, 0               ; word 0 -> all four words
    movq        [%4], %1
%endmacro
|
|
|
|
; MMX_PRED_H_8X8_ONE_LINEE reg, unused, src, dst
;   Variant of MMX_PRED_H_8X8_ONE_LINE for the next row: stores 8 copies
;   of the byte at [src+r2-1] to [dst]. The row offset is the register r2,
;   not a macro parameter. The second parameter is accepted but not used.
%macro MMX_PRED_H_8X8_ONE_LINEE 4
    movq        %1, [%3+r2-8]           ; wanted byte is the top byte of the load
    psrlq       %1, 0x38                ; move it down to byte 0

    ;pmuludq    %1, [mmx_01bytes]       ;extend to 4 bytes
    pmullw      %1, [mmx_01bytes]       ; * 0x0101 -> 2 copies in word 0
    pshufw      %1, %1, 0               ; word 0 -> all four words
    movq        [%4], %1
%endmacro
|
|
|
|
WELS_EXTERN WelsIChromaPredH_mmx
; 8x8 horizontal prediction: row k of pred (8 bytes, rows 8 bytes apart)
; is filled with the byte immediately left of row k in the reference.
; Rows are produced in pairs; r1 steps down two rows between pairs.
; r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
WelsIChromaPredH_mmx:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    MMX_PRED_H_8X8_ONE_LINE     mm0, mm1, r1, r0        ; row 0
    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1, r0+8      ; row 1

    lea         r1, [r1+r2*2]
    MMX_PRED_H_8X8_ONE_LINE     mm0, mm1, r1, r0+16     ; row 2
    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1, r0+24     ; row 3

    lea         r1, [r1+r2*2]
    MMX_PRED_H_8X8_ONE_LINE     mm0, mm1, r1, r0+32     ; row 4
    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1, r0+40     ; row 5

    lea         r1, [r1+r2*2]
    MMX_PRED_H_8X8_ONE_LINE     mm0, mm1, r1, r0+48     ; row 6
    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1, r0+56     ; row 7

    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   4x4 vertical prediction: the four pixels above the block are copied
;   into each of the four rows of pred (one aligned 16-byte store).
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredV_sse2
WelsI4x4LumaPredV_sse2:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2              ; r1 -> row above the block
    movd        xmm0, [r1]          ; four top pixels
    pshufd      xmm0, xmm0, 0       ; repeat them for all four rows
    movdqa      [r0], xmm0
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   8x8 vertical prediction: the eight pixels above the block are copied
;   into each of the eight rows of pred. Two rows are written per aligned
;   16-byte store.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsIChromaPredV_sse2
WelsIChromaPredV_sse2:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2              ; r1 -> row above the block
    movq        xmm0, [r1]          ; eight top pixels
    movdqa      xmm1, xmm0
    punpcklqdq  xmm0, xmm1          ; duplicate into both halves
    movdqa      [r0],      xmm0     ; rows 0-1
    movdqa      [r0+0x10], xmm0     ; rows 2-3
    movdqa      [r0+0x20], xmm0     ; rows 4-5
    movdqa      [r0+0x30], xmm0     ; rows 6-7
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   t3 is never used
;   destination:
;   |a |b |c |d |
;   |e |f |a |b |
;   |g |h |e |f |
;   |i |j |g |h |
;
;   a = (1 + lt + l0)>>1
;   e = (1 + l0 + l1)>>1
;   g = (1 + l1 + l2)>>1
;   i = (1 + l2 + l3)>>1
;
;   d = (2 + t0 + (t1<<1) + t2)>>2
;   c = (2 + lt + (t0<<1) + t1)>>2
;   b = (2 + l0 + (lt<<1) + t0)>>2
;
;   f = (2 + l1 + (l0<<1) + lt)>>2
;   h = (2 + l2 + (l1<<1) + l0)>>2
;   j = (2 + l3 + (l2<<1) + l1)>>2
;   [b a f e h g j i] + [d c b a] --> mov to memory
;
;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures below list bytes from highest to lowest.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
WelsI4x4LumaPredHD_mmx:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block

    ; Gather the neighbours into mm0.
    movd        mm0, [r1-1]             ; mm0 = [xx xx xx xx t2 t1 t0 lt]
    psllq       mm0, 0x20               ; mm0 = [t2 t1 t0 lt xx xx xx xx]

    movd        mm1, [r1+2*r2-4]
    punpcklbw   mm1, [r1+r2-4]          ; mm1[7] = l0, mm1[6] = l1
    lea         r1, [r1+2*r2]
    movd        mm2, [r1+2*r2-4]
    punpcklbw   mm2, [r1+r2-4]          ; mm2[7] = l2, mm2[6] = l3
    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
    psrlq       mm2, 0x20
    pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]

    ; Shifted copies provide the neighbouring taps.
    movq        mm1, mm0
    psrlq       mm1, 0x10               ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
    movq        mm2, mm0
    psrlq       mm2, 0x08               ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
    movq        mm3, mm2
    movq        mm4, mm1
    pavgb       mm1, mm0                ; rounded-up average of the outer taps

    pxor        mm4, mm0                ; low bit set where that average rounded up
    pand        mm4, [mmx_01bytes]
    psubusb     mm1, mm4                ; undo the round-up

    pavgb       mm2, mm1                ; mm2 = [xx xx d c b f h j]

    movq        mm4, mm0
    pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a e g i]
    punpcklbw   mm3, mm2                ; mm3 = [b a f e h g j i]

    ; Build row 0 = [d c b a].
    psrlq       mm2, 0x20
    psllq       mm2, 0x30               ; mm2 = [d c 0 0 0 0 0 0]
    movq        mm4, mm3
    psrlq       mm4, 0x10               ; mm4 = [0 0 b a f e h g]
    pxor        mm2, mm4                ; mm2 = [d c b a xx xx xx xx]
    psrlq       mm2, 0x20               ; mm2 = [xx xx xx xx d c b a]

    movd        [r0], mm2               ; row 0
    movd        [r0+12], mm3            ; row 3
    psrlq       mm3, 0x10
    movd        [r0+8], mm3             ; row 2
    psrlq       mm3, 0x10
    movd        [r0+4], mm3             ; row 1
    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   only l0..l3 are read; lt and t0..t3 are never used
;   destination:
;   |a |b |c |d |
;   |c |d |e |f |
;   |e |f |g |g |
;   |g |g |g |g |
;
;   a = (1 + l0 + l1)>>1
;   c = (1 + l1 + l2)>>1
;   e = (1 + l2 + l3)>>1
;   g = l3
;
;   b = (2 + l0 + (l1<<1) + l2)>>2
;   d = (2 + l1 + (l2<<1) + l3)>>2
;   f = (2 + l2 + (l3<<1) + l3)>>2
;
;   [g g f e d c b a] + [g g g g] --> mov to memory
;
;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures below list bytes from highest to lowest.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
WelsI4x4LumaPredHU_mmx:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; Gather the four left pixels.
    movd        mm0, [r1-4]             ; mm0[3] = l0
    punpcklbw   mm0, [r1+r2-4]          ; mm0[7] = l1, mm0[6] = l0
    lea         r1, [r1+2*r2]
    movd        mm2, [r1-4]             ; mm2[3] = l2
    movd        mm4, [r1+r2-4]          ; mm4[3] = l3
    punpcklbw   mm2, mm4
    punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]

    psrlq       mm4, 0x18
    psllq       mm4, 0x38               ; mm4 = [l3 xx xx xx xx xx xx xx]
    psrlq       mm0, 0x08
    pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]

    ; Two-tap averages.
    movq        mm1, mm0
    psllq       mm1, 0x08               ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
    movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
    pavgb       mm1, mm0                ; mm1 = [g e c a xx xx xx xx]

    ; Three-tap averages: floor-average of the outer taps, then average
    ; with the centre tap.
    movq        mm2, mm0
    psllq       mm2, 0x10               ; mm2 = [l2 l1 l0 xx xx xx xx xx]
    movq        mm5, mm2
    pavgb       mm2, mm0

    pxor        mm5, mm0                ; low bit set where that average rounded up
    pand        mm5, [mmx_01bytes]
    psubusb     mm2, mm5                ; undo the round-up

    pavgb       mm2, mm3                ; mm2 = [f d b xx xx xx xx xx]

    psrlq       mm2, 0x08
    pxor        mm2, mm4                ; mm2 = [g f d b xx xx xx xx]

    punpckhbw   mm1, mm2                ; mm1 = [g g f e d c b a]
    punpckhbw   mm4, mm4                ; mm4 = [g g xx xx xx xx xx xx]
    punpckhbw   mm4, mm4                ; mm4 = [g g g g xx xx xx xx]

    psrlq       mm4, 0x20
    movd        [r0+12], mm4            ; row 3

    movd        [r0], mm1               ; row 0
    psrlq       mm1, 0x10
    movd        [r0+4], mm1             ; row 1
    psrlq       mm1, 0x10
    movd        [r0+8], mm1             ; row 2
    WELSEMMS
    ret
|
|
|
|
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   l3 is never used
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |i |a |b |c |
;   |j |e |f |g |
;
;   a = (1 + lt + t0)>>1
;   b = (1 + t0 + t1)>>1
;   c = (1 + t1 + t2)>>1
;   d = (1 + t2 + t3)>>1
;
;   e = (2 + l0 + (lt<<1) + t0)>>2
;   f = (2 + lt + (t0<<1) + t1)>>2
;   g = (2 + t0 + (t1<<1) + t2)>>2
;
;   h = (2 + t1 + (t2<<1) + t3)>>2
;   i = (2 + lt + (l0<<1) + l1)>>2
;   j = (2 + l0 + (l1<<1) + l2)>>2
;
;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures below list bytes from highest to lowest.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
WelsI4x4LumaPredVR_mmx:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block

    ; Gather the neighbours into mm0.
    movq        mm0, [r1-1]             ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
    psllq       mm0, 0x18               ; mm0 = [t3 t2 t1 t0 lt xx xx xx]

    movd        mm1, [r1+2*r2-4]
    punpcklbw   mm1, [r1+r2-4]          ; mm1[7] = l0, mm1[6] = l1
    lea         r1, [r1+2*r2]
    movq        mm2, [r1+r2-8]          ; mm2[7] = l2
    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
    psrlq       mm2, 0x28
    pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]

    ; Two-tap averages.
    movq        mm1, mm0
    psllq       mm1, 0x08               ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
    pavgb       mm1, mm0                ; mm1 = [d c b a xx xx xx xx]

    ; Three-tap averages: floor-average of the outer taps, then average
    ; with the centre tap.
    movq        mm2, mm0
    psllq       mm2, 0x10               ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
    movq        mm3, mm2
    pavgb       mm2, mm0

    pxor        mm3, mm0                ; low bit set where that average rounded up
    pand        mm3, [mmx_01bytes]
    psubusb     mm2, mm3                ; undo the round-up

    movq        mm3, mm0
    psllq       mm3, 0x08               ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
    pavgb       mm3, mm2                ; mm3 = [h g f e i j xx xx]
    movq        mm2, mm3

    psrlq       mm1, 0x20               ; mm1 = [xx xx xx xx d c b a]
    movd        [r0], mm1               ; row 0

    psrlq       mm2, 0x20               ; mm2 = [xx xx xx xx h g f e]
    movd        [r0+4], mm2             ; row 1

    ; Rows 2 and 3 are rows 0 and 1 shifted right by one pixel, with
    ; i and j inserted on the left.
    movq        mm4, mm3
    psllq       mm4, 0x20
    psrlq       mm4, 0x38               ; mm4 = [xx xx xx xx xx xx xx i]

    movq        mm5, mm3
    psllq       mm5, 0x28
    psrlq       mm5, 0x38               ; mm5 = [xx xx xx xx xx xx xx j]

    psllq       mm1, 0x08
    pxor        mm4, mm1                ; mm4 = [xx xx xx xx c b a i]
    movd        [r0+8], mm4             ; row 2

    psllq       mm2, 0x08
    pxor        mm5, mm2                ; mm5 = [xx xx xx xx g f e j]
    movd        [r0+12], mm5            ; row 3
    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   only t0..t7 are read; lt and l0..l3 are never used
;   destination:
;   |a |b |c |d |
;   |b |c |d |e |
;   |c |d |e |f |
;   |d |e |f |g |
;
;   a = (2 + t0 + t2 + (t1<<1))>>2
;   b = (2 + t1 + t3 + (t2<<1))>>2
;   c = (2 + t2 + t4 + (t3<<1))>>2
;   d = (2 + t3 + t5 + (t4<<1))>>2
;
;   e = (2 + t4 + t6 + (t5<<1))>>2
;   f = (2 + t5 + t7 + (t6<<1))>>2
;   g = (2 + t6 + t7 + (t7<<1))>>2
;
;   [g f e d c b a] --> mov to memory
;
;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures below list bytes from highest to lowest.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
WelsI4x4LumaPredDDL_mmx:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movq        mm0, [r1]               ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq        mm1, mm0
    movq        mm2, mm0

    movq        mm3, mm0
    psrlq       mm3, 0x38
    psllq       mm3, 0x38               ; mm3 = [t7 xx xx xx xx xx xx xx]

    psllq       mm1, 0x08               ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
    psrlq       mm2, 0x08
    pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]

    ; Floor-average of the outer taps, then average with the centre tap.
    movq        mm3, mm1
    pavgb       mm1, mm2
    pxor        mm3, mm2                ; low bit set where that average rounded up
    pand        mm3, [mmx_01bytes]
    psubusb     mm1, mm3                ; undo the round-up

    pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]

    ; Each row is the previous one shifted by one pixel.
    psrlq       mm0, 0x08
    movd        [r0], mm0               ; row 0
    psrlq       mm0, 0x08
    movd        [r0+4], mm0             ; row 1
    psrlq       mm0, 0x08
    movd        [r0+8], mm0             ; row 2
    psrlq       mm0, 0x08
    movd        [r0+12], mm0            ; row 3
    WELSEMMS
    ret
|
|
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   only t0..t7 are read; lt and l0..l3 are never used
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |b |c |d |i |
;   |f |g |h |j |
;
;   a = (1 + t0 + t1)>>1
;   b = (1 + t1 + t2)>>1
;   c = (1 + t2 + t3)>>1
;   d = (1 + t3 + t4)>>1
;   i = (1 + t4 + t5)>>1
;
;   e = (2 + t0 + (t1<<1) + t2)>>2
;   f = (2 + t1 + (t2<<1) + t3)>>2
;   g = (2 + t2 + (t3<<1) + t4)>>2
;   h = (2 + t3 + (t4<<1) + t5)>>2
;   j = (2 + t4 + (t5<<1) + t6)>>2
;
;   [i d c b a] + [j h g f e] --> mov to memory
;
;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Register pictures below list bytes from highest to lowest.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
WelsI4x4LumaPredVL_mmx:
    %assign     push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movq        mm0, [r1]               ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq        mm1, mm0
    movq        mm2, mm0

    psrlq       mm1, 0x08               ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
    psrlq       mm2, 0x10               ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]

    ; Two-tap averages.
    movq        mm3, mm1
    pavgb       mm3, mm0                ; mm3 = [xx xx xx i d c b a]

    ; Three-tap averages: floor-average of the outer taps, then average
    ; with the centre tap.
    movq        mm4, mm2
    pavgb       mm2, mm0
    pxor        mm4, mm0                ; low bit set where that average rounded up
    pand        mm4, [mmx_01bytes]
    psubusb     mm2, mm4                ; undo the round-up

    pavgb       mm2, mm1                ; mm2 = [xx xx xx j h g f e]

    movd        [r0], mm3               ; row 0
    psrlq       mm3, 0x08
    movd        [r0+8], mm3             ; row 2

    movd        [r0+4], mm2             ; row 1
    psrlq       mm2, 0x08
    movd        [r0+12], mm2            ; row 3
    WELSEMMS
    ret
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;
;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   8x8 chroma DC prediction, one DC value per 4x4 quadrant:
;     top-left     = (top[0..3] + left[0..3] + 4) >> 3
;     top-right    = (top[4..7] + 2) >> 2
;     bottom-left  = (left[4..7] + 2) >> 2
;     bottom-right = (top[4..7] + left[4..7] + 4) >> 3
;   pred is written as 8 rows of 8 bytes, 8 bytes apart.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
WelsIChromaPredDc_sse2:
    push        r3
    push        r4
    %assign     push_num 2
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                      ; r1 -> row above the block
    movq        mm0, [r1]                   ; eight top pixels

    ; left pixels of rows 0..3
    movzx       r3, byte [r1+r2-0x01]       ; l1
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]          ; l2
    add         r3, r4
    movzx       r4, byte [r1+r2-0x01]       ; l3
    add         r3, r4
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]          ; l4
    add         r3, r4
    movd        mm1, r3d                    ; mm1 = l1+l2+l3+l4

    ; left pixels of rows 4..7
    movzx       r3, byte [r1+r2-0x01]       ; l5
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]          ; l6
    add         r3, r4
    movzx       r4, byte [r1+r2-0x01]       ; l7
    add         r3, r4
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]          ; l8
    add         r3, r4
    movd        mm2, r3d                    ; mm2 = l5+l6+l7+l8

    ; split the top row into two 4-pixel sums
    movq        mm3, mm0
    psrlq       mm0, 0x20                   ; top[4..7]
    psllq       mm3, 0x20
    psrlq       mm3, 0x20                   ; top[0..3]
    pxor        mm4, mm4
    psadbw      mm0, mm4
    psadbw      mm3, mm4                    ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2

    paddq       mm3, mm1
    movq        mm1, mm2
    paddq       mm1, mm0                    ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1

    movq        mm4, [mmx_0x02]

    paddq       mm0, mm4
    psrlq       mm0, 0x02                   ; (sum2 + 2) >> 2

    paddq       mm2, mm4
    psrlq       mm2, 0x02                   ; (sum3 + 2) >> 2

    paddq       mm3, mm4
    paddq       mm3, mm4
    psrlq       mm3, 0x03                   ; (sum1 + 4) >> 3

    paddq       mm1, mm4
    paddq       mm1, mm4
    psrlq       mm1, 0x03                   ; (sum4 + 4) >> 3

    ; replicate each DC byte four times and pair left/right quadrants
    pmuludq     mm0, [mmx_01bytes]
    pmuludq     mm3, [mmx_01bytes]
    psllq       mm0, 0x20
    pxor        mm0, mm3                    ; mm0 = m_up

    pmuludq     mm2, [mmx_01bytes]
    pmuludq     mm1, [mmx_01bytes]
    psllq       mm1, 0x20
    pxor        mm1, mm2                    ; mm1 = m_down

    movq        [r0],      mm0
    movq        [r0+0x08], mm0
    movq        [r0+0x10], mm0
    movq        [r0+0x18], mm0

    movq        [r0+0x20], mm1
    movq        [r0+0x28], mm1
    movq        [r0+0x30], mm1
    movq        [r0+0x38], mm1

    pop         r4
    pop         r3
    WELSEMMS
    ret
|
|
|
|
|
|
|
|
ALIGN 16
|
|
;***********************************************************************
;
;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   16x16 DC prediction: every byte of pred becomes
;   (sum of the 16 top pixels + sum of the 16 left pixels + 16) >> 5.
;   The top row is read and pred is written with aligned 16-byte
;   accesses.
;   r0/r1/r2 are presumably pred/pRef/stride after LOAD_3_PARA.
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
WelsI16x16LumaPredDc_sse2:
    push        r3
    push        r4
    %assign     push_num 2
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                      ; r1 -> row above the block

    ; sum of the 16 top pixels -> low word of xmm0
    movdqa      xmm0, [r1]                  ; read one row
    pxor        xmm1, xmm1
    psadbw      xmm0, xmm1                  ; two partial sums, one per qword
    movdqa      xmm1, xmm0
    psrldq      xmm1, 0x08
    pslldq      xmm0, 0x08
    psrldq      xmm0, 0x08
    paddw       xmm0, xmm1

    ; sum of the 16 left pixels -> r3
    movzx       r3, byte [r1+r2-0x01]       ; row 0
    movzx       r4, byte [r1+2*r2-0x01]     ; row 1
    add         r3, r4
    lea         r1, [r1+r2]
%rep 7                                      ; rows 2..15, two per step
    LOAD_2_LEFT_AND_ADD
%endrep

    add         r3, 0x10                    ; rounding term
    movd        xmm1, r3d
    paddw       xmm0, xmm1
    psrld       xmm0, 0x05
    pmuludq     xmm0, [mmx_01bytes]         ; replicate the DC byte four times
    pshufd      xmm0, xmm0, 0               ; ... and across all 16 bytes

    movdqa      [r0],      xmm0
    movdqa      [r0+0x10], xmm0
    movdqa      [r0+0x20], xmm0
    movdqa      [r0+0x30], xmm0
    movdqa      [r0+0x40], xmm0
    movdqa      [r0+0x50], xmm0
    movdqa      [r0+0x60], xmm0
    movdqa      [r0+0x70], xmm0
    movdqa      [r0+0x80], xmm0
    movdqa      [r0+0x90], xmm0
    movdqa      [r0+0xa0], xmm0
    movdqa      [r0+0xb0], xmm0
    movdqa      [r0+0xc0], xmm0
    movdqa      [r0+0xd0], xmm0
    movdqa      [r0+0xe0], xmm0
    movdqa      [r0+0xf0], xmm0

    pop         r4
    pop         r3
    ret
|
|
|
|
;***********************************************************************
;
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
;                                     uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
;   Evaluates three 4x4 intra predictions (DC, H, V) of the block at pEnc,
;   built from the neighbours of pDec, and keeps the cheapest one.
;
;   Only assembled for X86_32. Arguments are read from the stack; after the
;   three register pushes below they are found at:
;       [esp+16] pDec           [esp+20] iLineSizeDec
;       [esp+24] pEnc           [esp+28] iLinesizeEnc
;       [esp+32] pRed           [esp+36] pBestMode
;       [esp+40] value added to the DC cost
;       [esp+44] value added to the H cost
;       [esp+48] value added to the V cost
;
;   cost(mode) = ((16-bit SATD sum) >> 1) + the matching extra value.
;   Selection: DC if cost(DC) <= cost(H) and cost(DC) <= cost(V);
;              otherwise H if cost(DC) > cost(H) and cost(H) <= cost(V);
;              otherwise V.
;
;   Outputs:
;       eax        = cost of the selected mode
;       *pBestMode = 2 (DC), 1 (H) or 0 (V)
;       pRed[0..15]= the selected 4x4 prediction (written with movdqa, so
;                    pRed must be 16-byte aligned)
;
;   ebx, esi and edi are saved and restored; ecx, edx and xmm0-xmm7 are
;   clobbered.
;
;   NOTE(review): the cost comparisons use the low 16 bits only and are
;   signed (cmp ax,di / jg); confirm the costs can never exceed 0x7fff.
;***********************************************************************
%ifdef X86_32
WELS_EXTERN WelsSampleSatdThree4x4_sse2
align 16
WelsSampleSatdThree4x4_sse2:
    push        ebx
    push        esi
    push        edi
    mov         eax, [esp+24]           ; pEnc
    mov         ebx, [esp+28]           ; iLinesizeEnc

    ; load source 4x4 samples and Hadamard transform
    movd        xmm0, [eax]
    movd        xmm1, [eax+ebx]
    lea         eax, [eax+2*ebx]
    movd        xmm2, [eax]
    movd        xmm3, [eax+ebx]
    punpckldq   xmm0, xmm2              ; xmm0 = rows 0 and 2
    punpckldq   xmm1, xmm3              ; xmm1 = rows 1 and 3

    pxor        xmm6, xmm6
    punpcklbw   xmm0, xmm6              ; widen bytes to words
    punpcklbw   xmm1, xmm6

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm2, xmm1
    SSE2_XSawp  qdq, xmm0, xmm2, xmm3

    movdqa      xmm4, xmm0
    paddw       xmm0, xmm3
    psubw       xmm4, xmm3

    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm4
    punpckhwd   xmm4, xmm2

    SSE2_XSawp  dq,  xmm0, xmm4, xmm3
    SSE2_XSawp  qdq, xmm0, xmm3, xmm5

    movdqa      xmm7, xmm0
    paddw       xmm0, xmm5
    psubw       xmm7, xmm5

    SSE2_XSawp  qdq, xmm0, xmm7, xmm1

    ; Hadamard transform results are saved in xmm0 and xmm2
    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm2, xmm1

    ; load top boundary samples: [a b c d]
    mov         eax, [esp+16]           ; pDec
    sub         eax, [esp+20]           ; minus iLineSizeDec -> row above
    movzx       ecx, byte [eax]
    movzx       edx, byte [eax+1]
    movzx       esi, byte [eax+2]
    movzx       edi, byte [eax+3]

    ; get the transform results of top boundary samples: [a b c d]
    add         edx, ecx                ; edx = a + b
    add         edi, esi                ; edi = c + d
    add         ecx, ecx                ; ecx = a + a
    add         esi, esi                ; esi = c + c
    sub         ecx, edx                ; ecx = a + a - a - b = a - b
    sub         esi, edi                ; esi = c + c - c - d = c - d
    add         edi, edx                ; edi = (a + b) + (c + d)
    add         edx, edx
    sub         edx, edi                ; edx = (a + b) - (c + d)
    add         esi, ecx                ; esi = (a - b) + (c - d)
    add         ecx, ecx
    sub         ecx, esi                ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]

    movdqa      xmm6, xmm0              ; keep the source transform for the H and DC passes
    movdqa      xmm7, xmm2
    movd        xmm5, edi               ; store the edi (sum of top samples) for DC mode
    pxor        xmm3, xmm3
    pxor        xmm4, xmm4
    pinsrw      xmm3, edi, 0
    pinsrw      xmm3, esi, 4
    psllw       xmm3, 2                 ; scale by 4
    pinsrw      xmm4, edx, 0
    pinsrw      xmm4, ecx, 4
    psllw       xmm4, 2                 ; scale by 4

    ; get the satd of V (prediction built from the top boundary)
    psubw       xmm0, xmm3
    psubw       xmm2, xmm4

    WELS_AbsW   xmm0, xmm1
    WELS_AbsW   xmm2, xmm1
    paddusw     xmm0, xmm2
    SUMW_HORIZON1 xmm0, xmm1            ; satd of V is stored in xmm0

    ; load left boundary samples: [a b c d]'
    mov         eax, [esp+16]           ; pDec
    mov         ebx, [esp+20]           ; iLineSizeDec
    movzx       ecx, byte [eax-1]
    movzx       edx, byte [eax+ebx-1]
    lea         eax, [eax+2*ebx]
    movzx       esi, byte [eax-1]
    movzx       edi, byte [eax+ebx-1]

    ; get the transform results of left boundary samples: [a b c d]'
    add         edx, ecx                ; edx = a + b
    add         edi, esi                ; edi = c + d
    add         ecx, ecx                ; ecx = a + a
    add         esi, esi                ; esi = c + c
    sub         ecx, edx                ; ecx = a + a - a - b = a - b
    sub         esi, edi                ; esi = c + c - c - d = c - d
    add         edi, edx                ; edi = (a + b) + (c + d)
    add         edx, edx
    sub         edx, edi                ; edx = (a + b) - (c + d)
    add         esi, ecx                ; esi = (a - b) + (c - d)
    add         ecx, ecx
    sub         ecx, esi                ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'

    ; store the transform results in xmm3
    movd        xmm3, edi
    pinsrw      xmm3, edx, 1
    pinsrw      xmm3, ecx, 2
    pinsrw      xmm3, esi, 3
    psllw       xmm3, 2                 ; scale by 4

    ; get the satd of H (prediction built from the left boundary)
    movdqa      xmm2, xmm6
    movdqa      xmm4, xmm7
    psubw       xmm2, xmm3
    WELS_AbsW   xmm2, xmm1
    WELS_AbsW   xmm4, xmm1
    paddusw     xmm2, xmm4
    SUMW_HORIZON1 xmm2, xmm1            ; satd of H is stored in xmm2

    ; DC result is stored in xmm1
    add         edi, 4                  ; left sum + rounding term
    movd        xmm1, edi
    paddw       xmm1, xmm5              ; + top sum
    psrlw       xmm1, 3                 ; dc = (top + left + 4) >> 3
    movdqa      xmm5, xmm1              ; keep dc for building the prediction
    psllw       xmm1, 4                 ; dc * 16

    ; get the satd of DC
    psubw       xmm6, xmm1
    WELS_AbsW   xmm6, xmm1
    WELS_AbsW   xmm7, xmm1
    paddusw     xmm6, xmm7
    SUMW_HORIZON1 xmm6, xmm1            ; satd of DC is stored in xmm6

    ; comparing order: DC H V
    mov         edx, [esp+32]           ; pRed
    movd        eax, xmm6               ; DC
    movd        edi, xmm2               ; H
    movd        esi, xmm0               ; V
    and         eax, 0xffff
    shr         eax, 1
    and         edi, 0xffff
    shr         edi, 1
    and         esi, 0xffff
    shr         esi, 1
    add         eax, [esp+40]           ; eax = cost(DC)
    add         edi, [esp+44]           ; edi = cost(H)
    add         esi, [esp+48]           ; esi = cost(V)
    cmp         ax, di
    jg near     not_dc
    cmp         ax, si
    jg near     not_dc_h

    ; for DC mode (eax already holds the DC cost to return)
    movd        ebx, xmm5
    imul        ebx, 0x01010101         ; replicate dc into 4 bytes
    movd        xmm5, ebx
    pshufd      xmm5, xmm5, 0           ; ... and into all 16 bytes
    movdqa      [edx], xmm5
    mov         ebx, [esp+36]           ; pBestMode
    mov         dword [ebx], 0x02
    pop         edi
    pop         esi
    pop         ebx
    ret

not_dc:
    cmp         di, si
    jg near     not_dc_h

    ; for H mode: each output row is its left neighbour repeated 4 times
    SSE_DB_1_2REG xmm6, xmm7            ; macro defined elsewhere; xmm6 is used as the byte-replicating multiplier
    mov         eax, [esp+16]           ; pDec
    mov         ebx, [esp+20]           ; iLineSizeDec
    movzx       ecx, byte [eax-1]
    movd        xmm0, ecx
    pmuludq     xmm0, xmm6              ; row 0

    movzx       ecx, byte [eax+ebx-1]
    movd        xmm1, ecx
    pmuludq     xmm1, xmm6              ; row 1
    punpckldq   xmm0, xmm1              ; xmm0 = rows 0,1
    lea         eax, [eax+ebx*2]
    movzx       ecx, byte [eax-1]
    movd        xmm2, ecx
    pmuludq     xmm2, xmm6              ; row 2

    movzx       ecx, byte [eax+ebx-1]
    movd        xmm3, ecx
    pmuludq     xmm3, xmm6              ; row 3
    punpckldq   xmm2, xmm3              ; xmm2 = rows 2,3
    punpcklqdq  xmm0, xmm2              ; xmm0 = rows 0..3
    movdqa      [edx], xmm0

    mov         eax, edi                ; return cost(H)
    mov         ebx, [esp+36]           ; pBestMode
    mov         dword [ebx], 0x01

    pop         edi
    pop         esi
    pop         ebx
    ret

not_dc_h:
    ; for V mode: every output row is a copy of the 4 bytes above the block
    mov         eax, [esp+16]           ; pDec
    sub         eax, [esp+20]           ; minus iLineSizeDec -> row above
    movd        xmm0, [eax]
    pshufd      xmm0, xmm0, 0
    movdqa      [edx], xmm0

    mov         eax, esi                ; return cost(V)
    mov         ebx, [esp+36]           ; pBestMode
    mov         dword [ebx], 0x00

    pop         edi
    pop         esi
    pop         ebx
    ret
%endif
