From 3761901ed4785ce0c85557362c857f506b4072ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 28 Jan 2014 13:42:22 +0200 Subject: [PATCH 1/2] Remove sad.asm from the processing lib, move satd_sad from the encoder to the common lib sad.asm as used in processing is an exact subset of the code in satd_sad.asm in the encoder. --- codec/build/win32/enc/WelsEncCore.vcproj | 2 +- .../{encoder/core/asm => common}/satd_sad.asm | 0 codec/common/targets.mk | 1 + codec/encoder/targets.mk | 1 - .../processing/build/win32/WelsVP_2008.vcproj | 2 +- codec/processing/src/asm/sad.asm | 220 ------------------ codec/processing/targets.mk | 1 - 7 files changed, 3 insertions(+), 224 deletions(-) rename codec/{encoder/core/asm => common}/satd_sad.asm (100%) delete mode 100644 codec/processing/src/asm/sad.asm diff --git a/codec/build/win32/enc/WelsEncCore.vcproj b/codec/build/win32/enc/WelsEncCore.vcproj index a6284f8b..403dcef2 100644 --- a/codec/build/win32/enc/WelsEncCore.vcproj +++ b/codec/build/win32/enc/WelsEncCore.vcproj @@ -2106,7 +2106,7 @@ >1) -cmp %1, (32-%2)|(%3>>1) -%endmacro - -%macro SSE2_GetSad8x4 0 - movq xmm0, [r0] - movq xmm1, [r0+r1] - lea r0, [r0+2*r1] - movhps xmm0, [r0] - movhps xmm1, [r0+r1] - - movq xmm2, [r2] - movq xmm3, [r2+r3] - lea r2, [r2+2*r3] - movhps xmm2, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm2 - psadbw xmm1, xmm3 - paddw xmm6, xmm0 - paddw xmm6, xmm1 -%endmacro - - -;*********************************************************************** -; Code -;*********************************************************************** -SECTION .text - -WELS_EXTERN WelsSampleSad8x8_sse21 -WelsSampleSad8x8_sse21: - ;mov ecx, [esp+12] - ;mov edx, ecx - ;CACHE_SPLIT_CHECK edx, 8, 64 - ;jle near .pixel_sad_8x8_nsplit - ;push ebx - ;push edi - ;mov eax, [esp+12] - ;mov ebx, [esp+16] - - %assign push_num 0 - mov r2, arg3 - push r2 - CACHE_SPLIT_CHECK r2, 8, 64 - jle near .pixel_sad_8x8_nsplit - pop r2 -%ifdef X86_32 - push r3 - push r4 - push r5 -%endif - %assign push_num 3 - mov r0, arg1 - mov r1, arg2 - SIGN_EXTENTION r1, r1d - pxor xmm7, xmm7 - - ;ecx r2, edx r4, edi r5 - - mov r5, r2 - and r5, 0x07 - sub r2, r5 - mov r4, 8 - sub r4, r5 - - shl r5, 3 - shl r4, 3 - movd xmm5, r5d - movd xmm6, r4d - mov r5, 8 - add r5, r2 - mov r3, arg4 - SIGN_EXTENTION r3, r3d - movq xmm0, [r0] - movhps xmm0, [r0+r1] - - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm7, xmm0 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm7, xmm0 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm7, xmm0 - - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] - - movq xmm0, [r0] - movhps xmm0, [r0+r1] - - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm7, xmm0 - - movhlps xmm0, xmm7 - paddw xmm0, xmm7 - movd retrd, xmm0 -%ifdef X86_32 - pop r5 - pop r4 - pop r3 -%endif - jmp .return - -.pixel_sad_8x8_nsplit: - ;push ebx - ;mov eax, [esp+8] - ;mov ebx, [esp+12] - ;mov edx, [esp+20] - - pop r2 - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENTION r1, r1d - SIGN_EXTENTION r3, r3d - pxor xmm6, xmm6 - SSE2_GetSad8x4 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSad8x4 - movhlps xmm0, xmm6 - paddw xmm0, xmm6 - movd retrd, xmm0 - LOAD_4_PARA_POP -.return: - ret \ No newline at end of file diff --git a/codec/processing/targets.mk b/codec/processing/targets.mk index 8163329d..9017dfc4 100644 --- a/codec/processing/targets.mk +++ b/codec/processing/targets.mk @@ -24,7 +24,6 @@ PROCESSING_ASM_SRCS=\ $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\ $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\ $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\ - $(PROCESSING_SRCDIR)/./src/asm/sad.asm\ $(PROCESSING_SRCDIR)/./src/asm/vaa.asm\ PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o) From 04dba61d22ab43c1f428302f2abbaecca723fe30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 28 Jan 2014 13:47:51 +0200 Subject: [PATCH 2/2] Remove an unused assembly source file Nothing within processing uses functions from this file. --- .../processing/build/win32/WelsVP_2008.vcproj | 40 - codec/processing/src/asm/intra_pred.asm | 1505 ----------------- codec/processing/targets.mk | 1 - 3 files changed, 1546 deletions(-) delete mode 100644 codec/processing/src/asm/intra_pred.asm diff --git a/codec/processing/build/win32/WelsVP_2008.vcproj b/codec/processing/build/win32/WelsVP_2008.vcproj index 236327da..5062a288 100644 --- a/codec/processing/build/win32/WelsVP_2008.vcproj +++ b/codec/processing/build/win32/WelsVP_2008.vcproj @@ -593,46 +593,6 @@ /> - - - - - - - - - - - - - - diff --git a/codec/processing/src/asm/intra_pred.asm b/codec/processing/src/asm/intra_pred.asm deleted file mode 100644 index 7438f707..00000000 --- a/codec/processing/src/asm/intra_pred.asm +++ /dev/null @@ -1,1505 +0,0 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* -;* intra_pred.asm -;* -;* Abstract -;* sse2 function for intra predict operations -;* -;* History -;* 18/09/2009 Created -;* -;* -;*************************************************************************/ -%include "asm_inc.asm" - -;*********************************************************************** -; Local Data (Read Only) -;*********************************************************************** - -%ifdef FORMAT_COFF -SECTION .rodata pData -%else -SECTION .rodata align=16 -%endif - -align 16 -sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0 -align 16 -sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8 -align 16 -sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1 - -; for chroma plane mode -sse2_plane_inc_c dw 1, 2, 3, 4 -sse2_plane_dec_c dw 4, 3, 2, 1 -align 16 -sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4 - -align 16 -mmx_01bytes: times 16 db 1 -;align 16 -;sse_0x0004bytes: times 8 dw 4 -;ALIGN 16 -;sse_f000 db 255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 - -align 16 -mmx_0x02: dw 0x02, 0x00, 0x00, 0x00 - - -;*********************************************************************** -; macros -;*********************************************************************** -;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 -;%1 will keep the last result -%macro SSE_DB_1_2REG 2 - pxor %1, %1 - pcmpeqw %2, %2 - psubb %1, %2 -%endmacro - -;xmm0, xmm1, xmm2, eax, ecx -;lower 64 bits of xmm0 save the result -%macro SSE2_PRED_H_4X4_TWO_LINE 5 - movd %1, [%4-1] - movdqa %3, %1 - punpcklbw %1, %3 - movdqa %3, %1 - punpcklbw %1, %3 - - ;add %4, %5 - movd %2, [%4+%5-1] - movdqa %3, %2 - punpcklbw %2, %3 - movdqa %3, %2 - punpcklbw %2, %3 - punpckldq %1, %2 -%endmacro - -%macro SUMW_HORIZON1 2 - movdqa %2, %1 - psrldq %2, 8 - paddusw %1, %2 - movdqa %2, %1 - psrldq %2, 4 - paddusw %1, %2 - movdqa %2, %1 - psrldq %2, 2 - paddusw %1, %2 -%endmacro - -%macro LOAD_COLUMN 6 - movd %1, [%5] - movd %2, [%5+%6] - punpcklbw %1, %2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - punpcklbw %3, %2 - punpcklwd %1, %3 - lea %5, [%5+2*%6] - movd %4, [%5] - movd %2, [%5+%6] - punpcklbw %4, %2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - lea %5, [%5+2*%6] - punpcklbw %3, %2 - punpcklwd %4, %3 - punpckhdq %1, %4 -%endmacro - -%macro SUMW_HORIZON 3 - movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 - paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 - punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 - movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 - paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 - pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 - paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 -%endmacro - - -%macro COPY_16_TIMES 2 - movdqa %2, [%1-16] - psrldq %2, 15 - pmuludq %2, [mmx_01bytes] - pshufd %2, %2, 0 -%endmacro - -%macro COPY_16_TIMESS 3 - movdqa %2, [%1+%3-16] - psrldq %2, 15 - pmuludq %2, [mmx_01bytes] - pshufd %2, %2, 0 -%endmacro - -%macro LOAD_COLUMN_C 6 - movd %1, [%5] - movd %2, [%5+%6] - punpcklbw %1,%2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - punpcklbw %3, %2 - punpckhwd %1, %3 - lea %5, [%5+2*%6] -%endmacro - -%macro LOAD_2_LEFT_AND_ADD 0 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] - add r3, r4 - movzx r4, byte [r1+r2-0x01] - add r3, r4 -%endmacro - -;*********************************************************************** -; Code -;*********************************************************************** - -SECTION .text -WELS_EXTERN WelsI4x4LumaPredH_sse2 -WELS_EXTERN WelsI4x4LumaPredDDR_mmx -WELS_EXTERN WelsI4x4LumaPredDc_sse2 -WELS_EXTERN WelsI16x16LumaPredPlane_sse2 - -ALIGN 16 -;*********************************************************************** -; void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) -; -; pred must align to 16 -;*********************************************************************** -WelsI4x4LumaPredH_sse2: - push r3 - %assign push_num 1 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - movzx r3, byte [r1-1] - movd xmm0, r3d - pmuludq xmm0, [mmx_01bytes] - - movzx r3, byte [r1+r2-1] - movd xmm1, r3d - pmuludq xmm1, [mmx_01bytes] - - unpcklps xmm0, xmm1 - - lea r1, [r1+r2*2] - movzx r3, byte [r1-1] - movd xmm2, r3d - pmuludq xmm2, [mmx_01bytes] - - movzx r3, byte [r1+r2-1] - movd xmm3, r3d - pmuludq xmm3, [mmx_01bytes] - - unpcklps xmm2, xmm3 - unpcklpd xmm0, xmm2 - - movdqa [r0], xmm0 - pop r3 - ret - -;*********************************************************************** -; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); -;*********************************************************************** -WelsI16x16LumaPredPlane_sse2: - ;%define pushsize 4 - ;push esi - ;mov esi, [esp + pushsize + 8] - ;mov ecx, [esp + pushsize + 12] - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, 1 - sub r1, r2 - - ;for H - pxor xmm7, xmm7 - movq xmm0, [r1] - movdqa xmm5, [sse2_plane_dec] - punpcklbw xmm0, xmm7 - pmullw xmm0, xmm5 - movq xmm1, [r1 + 9] - movdqa xmm6, [sse2_plane_inc] - punpcklbw xmm1, xmm7 - pmullw xmm1, xmm6 - psubw xmm1, xmm0 - - SUMW_HORIZON xmm1,xmm0,xmm2 - movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]); - movsx r3, r3w - imul r3, 5 - add r3, 32 - sar r3, 6 ; b = (5 * H + 32) >> 6; - SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b - - movzx r4, BYTE [r1+16] - sub r1, 3 - LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2 - - add r1, 3 - movzx r3, BYTE [r1+8*r2] - add r4, r3 - shl r4, 4 ; a = (left[15*stride] + top[15]) << 4; - - sub r1, 3 - add r1, r2 - LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2 - pxor xmm4, xmm4 - punpckhbw xmm0, xmm4 - pmullw xmm0, xmm5 - punpckhbw xmm7, xmm4 - pmullw xmm7, xmm6 - psubw xmm7, xmm0 - - SUMW_HORIZON xmm7,xmm0,xmm2 - movd r3d, xmm7 ; V - movsx r3, r3w - imul r3, 5 - add r3, 32 - sar r3, 6 ; c = (5 * V + 32) >> 6; - SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c - - ;mov esi, [esp + pushsize + 4] - add r4, 16 - imul r3, -7 - add r3, r4 ; s = a + 16 + (-7)*c - SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s - - xor r3, r3 - movdqa xmm5, [sse2_plane_inc_minus] - -get_i16x16_luma_pred_plane_sse2_1: - movdqa xmm2, xmm1 - pmullw xmm2, xmm5 - paddw xmm2, xmm0 - psraw xmm2, 5 - movdqa xmm3, xmm1 - pmullw xmm3, xmm6 - paddw xmm3, xmm0 - psraw xmm3, 5 - packuswb xmm2, xmm3 - movdqa [r0], xmm2 - paddw xmm0, xmm4 - add r0, 16 - inc r3 - cmp r3, 16 - jnz get_i16x16_luma_pred_plane_sse2_1 - pop r4 - pop r3 - ret - -;*********************************************************************** -; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); -;*********************************************************************** - -%macro SSE2_PRED_H_16X16_ONE_LINE 0 - add r0, 16 - add r1, r2 - movzx r3, byte [r1] - SSE2_Copy16Times xmm0, r3d - movdqa [r0], xmm0 -%endmacro - -WELS_EXTERN WelsI16x16LumaPredH_sse2 -WelsI16x16LumaPredH_sse2: - push r3 - %assign push_num 1 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - dec r1 - movzx r3, byte [r1] - SSE2_Copy16Times xmm0, r3d - movdqa [r0], xmm0 - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - pop r3 - ret - -;*********************************************************************** -; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); -;*********************************************************************** -WELS_EXTERN WelsI16x16LumaPredV_sse2 -WelsI16x16LumaPredV_sse2: - ;mov edx, [esp+4] ; pred - ;mov eax, [esp+8] ; pRef - ;mov ecx, [esp+12] ; stride - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movdqa xmm0, [r1] - - movdqa [r0], xmm0 - movdqa [r0+10h], xmm0 - movdqa [r0+20h], xmm0 - movdqa [r0+30h], xmm0 - movdqa [r0+40h], xmm0 - movdqa [r0+50h], xmm0 - movdqa [r0+60h], xmm0 - movdqa [r0+70h], xmm0 - movdqa [r0+80h], xmm0 - movdqa [r0+90h], xmm0 - movdqa [r0+160], xmm0 - movdqa [r0+176], xmm0 - movdqa [r0+192], xmm0 - movdqa [r0+208], xmm0 - movdqa [r0+224], xmm0 - movdqa [r0+240], xmm0 - - ret - -;*********************************************************************** -; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); -;*********************************************************************** -WELS_EXTERN WelsIChromaPredPlane_sse2 -WelsIChromaPredPlane_sse2: - ;%define pushsize 4 - ;push esi - ;mov esi, [esp + pushsize + 8] ;pRef - ;mov ecx, [esp + pushsize + 12] ;stride - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, 1 - sub r1, r2 - - pxor mm7, mm7 - movq mm0, [r1] - movq mm5, [sse2_plane_dec_c] - punpcklbw mm0, mm7 - pmullw mm0, mm5 - movq mm1, [r1 + 5] - movq mm6, [sse2_plane_inc_c] - punpcklbw mm1, mm7 - pmullw mm1, mm6 - psubw mm1, mm0 - - movq2dq xmm1, mm1 - pxor xmm2, xmm2 - SUMW_HORIZON xmm1,xmm0,xmm2 - movd r3d, xmm1 - movsx r3, r3w - imul r3, 17 - add r3, 16 - sar r3, 5 ; b = (17 * H + 16) >> 5; - SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b - - movzx r3, BYTE [r1+8] - sub r1, 3 - LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2 - - add r1, 3 - movzx r4, BYTE [r1+4*r2] - add r4, r3 - shl r4, 4 ; a = (left[7*stride] + top[7]) << 4; - - sub r1, 3 - add r1, r2 - LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2 - pxor mm4, mm4 - punpckhbw mm0, mm4 - pmullw mm0, mm5 - punpckhbw mm7, mm4 - pmullw mm7, mm6 - psubw mm7, mm0 - - movq2dq xmm7, mm7 - pxor xmm2, xmm2 - SUMW_HORIZON xmm7,xmm0,xmm2 - movd r3d, xmm7 ; V - movsx r3, r3w - imul r3, 17 - add r3, 16 - sar r3, 5 ; c = (17 * V + 16) >> 5; - SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c - - ;mov esi, [esp + pushsize + 4] - add r4, 16 - imul r3, -3 - add r3, r4 ; s = a + 16 + (-3)*c - SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s - - xor r3, r3 - movdqa xmm5, [sse2_plane_mul_b_c] - -get_i_chroma_pred_plane_sse2_1: - movdqa xmm2, xmm1 - pmullw xmm2, xmm5 - paddw xmm2, xmm0 - psraw xmm2, 5 - packuswb xmm2, xmm2 - movq [r0], xmm2 - paddw xmm0, xmm4 - add r0, 8 - inc r3 - cmp r3, 8 - jnz get_i_chroma_pred_plane_sse2_1 - pop r4 - pop r3 - WELSEMMS - ret - -ALIGN 16 -;*********************************************************************** -; 0 |1 |2 |3 |4 | -; 6 |7 |8 |9 |10| -; 11|12|13|14|15| -; 16|17|18|19|20| -; 21|22|23|24|25| -; 7 is the start pixel of current 4x4 block -; pred[7] = ([6]+[0]*2+[1]+2)/4 -; -; void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) -; -;*********************************************************************** -WelsI4x4LumaPredDDR_mmx: - ;mov edx,[esp+4] ;pred - ;mov eax,[esp+8] ;pRef - ;mov ecx,[esp+12] ;stride - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11 - movq mm2,[r1-8] ;get value of 6 mm2[8] = 6 - sub r1, r2 ;mov eax to above line of current block(postion of 1) - punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6] - movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3] - punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11] - psllq mm3,18h ;mm3[5]=[1] - psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] - por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11] - movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] - lea r1,[r1+r2*2-8h] ;set eax point to 12 - movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16] - psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0 - psrlq mm4,38h ;mm4[1]=[16] - por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16] - movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16] - movq mm4,[r1+r2*2] ;mm4[8]=[21] - psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0 - psrlq mm4,38h ;mm4[1]=[21] - por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21] - movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21] - pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2 - pxor mm1,mm4 ;find odd value in the lowest bit of each byte - pand mm1,[mmx_01bytes] ;set the odd bit - psubusb mm3,mm1 ;decrease 1 from odd bytes - pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2 - - movd [r0+12],mm2 - psrlq mm2,8 - movd [r0+8],mm2 - psrlq mm2,8 - movd [r0+4],mm2 - psrlq mm2,8 - movd [r0],mm2 - WELSEMMS - ret - -ALIGN 16 -;*********************************************************************** -; 0 |1 |2 |3 |4 | -; 5 |6 |7 |8 |9 | -; 10|11|12|13|14| -; 15|16|17|18|19| -; 20|21|22|23|24| -; 6 is the start pixel of current 4x4 block -; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8 -; -; void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride) -; -;*********************************************************************** -WelsI4x4LumaPredDc_sse2: - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - movzx r4, byte [r1-1h] - sub r1, r2 - movd xmm0, [r1] - pxor xmm1, xmm1 - psadbw xmm0, xmm1 - xor r3, r3 - movd r3d, xmm0 - add r3, r4 - movzx r4, byte [r1+r2*2-1h] - add r3, r4 - - lea r1, [r1+r2*2-1] - movzx r4, byte [r1+r2] - add r3, r4 - - movzx r4, byte [r1+r2*2] - add r3, r4 - add r3, 4 - sar r3, 3 - imul r3, 0x01010101 - - movd xmm0, r3d - pshufd xmm0, xmm0, 0 - movdqa [r0], xmm0 - pop r4 - pop r3 - ret - -ALIGN 16 -;*********************************************************************** -; void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride) -; copy 8 pixel of 8 line from left -;*********************************************************************** -%macro MMX_PRED_H_8X8_ONE_LINE 4 - movq %1, [%3-8] - psrlq %1, 38h - - ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes - pmullw %1, [mmx_01bytes] - pshufw %1, %1, 0 - movq [%4], %1 -%endmacro - -%macro MMX_PRED_H_8X8_ONE_LINEE 4 - movq %1, [%3+r2-8] - psrlq %1, 38h - - ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes - pmullw %1, [mmx_01bytes] - pshufw %1, %1, 0 - movq [%4], %1 -%endmacro - -WELS_EXTERN WelsIChromaPredH_mmx -WelsIChromaPredH_mmx: - ;mov edx, [esp+4] ;pred - ;mov eax, [esp+8] ;pRef - ;mov ecx, [esp+12] ;stride - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - movq mm0, [r1-8] - psrlq mm0, 38h - - ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes - pmullw mm0, [mmx_01bytes] - pshufw mm0, mm0, 0 - movq [r0], mm0 - - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8 - - lea r1,[r1+r2*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16 - - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24 - - lea r1,[r1+r2*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32 - - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40 - - lea r1,[r1+r2*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48 - - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56 - WELSEMMS - ret - -ALIGN 16 -;*********************************************************************** -; void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) -; copy pixels from top 4 pixels -;*********************************************************************** -WELS_EXTERN WelsI4x4LumaPredV_sse2 -WelsI4x4LumaPredV_sse2: - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movd xmm0, [r1] - pshufd xmm0, xmm0, 0 - movdqa [r0], xmm0 - ret - -ALIGN 16 -;*********************************************************************** -; void __cdecl WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) -; copy 8 pixels from top 8 pixels -;*********************************************************************** -WELS_EXTERN WelsIChromaPredV_sse2 -WelsIChromaPredV_sse2: - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movq xmm0, [r1] - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm1 - movdqa [r0], xmm0 - movdqa [r0+16], xmm0 - movdqa [r0+32], xmm0 - movdqa [r0+48], xmm0 - ret - - ALIGN 16 -;*********************************************************************** -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; t3 will never been used -; destination: -; |a |b |c |d | -; |e |f |a |b | -; |g |h |e |f | -; |i |j |g |h | - -; a = (1 + lt + l0)>>1 -; e = (1 + l0 + l1)>>1 -; g = (1 + l1 + l2)>>1 -; i = (1 + l2 + l3)>>1 - -; d = (2 + t0 + (t1<<1) + t2)>>2 -; c = (2 + lt + (t0<<1) + t1)>>2 -; b = (2 + l0 + (lt<<1) + t0)>>2 - -; f = (2 + l1 + (l0<<1) + lt)>>2 -; h = (2 + l2 + (l1<<1) + l0)>>2 -; j = (2 + l3 + (l2<<1) + l1)>>2 -; [b a f e h g j i] + [d c b a] --> mov to memory -; -; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) -;*********************************************************************** -WELS_EXTERN WelsI4x4LumaPredHD_mmx -WelsI4x4LumaPredHD_mmx: - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt] - psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx] - - movd mm1, [r1+2*r2-4] - punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1 - lea r1, [r1+2*r2] - movd mm2, [r1+2*r2-4] - punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3 - punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx] - psrlq mm2, 20h - pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3] - - movq mm1, mm0 - psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1] - movq mm2, mm0 - psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2] - movq mm3, mm2 - movq mm4, mm1 - pavgb mm1, mm0 - - pxor mm4, mm0 ; find odd value in the lowest bit of each byte - pand mm4, [mmx_01bytes] ; set the odd bit - psubusb mm1, mm4 ; decrease 1 from odd bytes - - pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j] - - movq mm4, mm0 - pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i] - punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i] - - psrlq mm2, 20h - psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0] - movq mm4, mm3 - psrlq mm4, 10h ; mm4 = [0 0 b a f e h j] - pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx] - psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a] - - movd [r0], mm2 - movd [r0+12], mm3 - psrlq mm3, 10h - movd [r0+8], mm3 - psrlq mm3, 10h - movd [r0+4], mm3 - WELSEMMS - ret - -ALIGN 16 -;*********************************************************************** -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; t3 will never been used -; destination: -; |a |b |c |d | -; |c |d |e |f | -; |e |f |g |g | -; |g |g |g |g | - -; a = (1 + l0 + l1)>>1 -; c = (1 + l1 + l2)>>1 -; e = (1 + l2 + l3)>>1 -; g = l3 - -; b = (2 + l0 + (l1<<1) + l2)>>2 -; d = (2 + l1 + (l2<<1) + l3)>>2 -; f = (2 + l2 + (l3<<1) + l3)>>2 - -; [g g f e d c b a] + [g g g g] --> mov to memory -; -; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) -;*********************************************************************** -WELS_EXTERN WelsI4x4LumaPredHU_mmx -WelsI4x4LumaPredHU_mmx: - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - movd mm0, [r1-4] ; mm0[3] = l0 - punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0 - lea r1, [r1+2*r2] - movd mm2, [r1-4] ; mm2[3] = l2 - movd mm4, [r1+r2-4] ; mm4[3] = l3 - punpcklbw mm2, mm4 - punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx] - - psrlq mm4, 18h - psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx] - psrlq mm0, 8h - pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx] - - movq mm1, mm0 - psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx] - movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx] - pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx] - - movq mm2, mm0 - psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx] - movq mm5, mm2 - pavgb mm2, mm0 - - pxor mm5, mm0 ; find odd value in the lowest bit of each byte - pand mm5, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm5 ; decrease 1 from odd bytes - - pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx] - - psrlq mm2, 8h - pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx] - - punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a] - punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx] - punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx] - - psrlq mm4, 20h - movd [r0+12], mm4 - - movd [r0], mm1 - psrlq mm1, 10h - movd [r0+4], mm1 - psrlq mm1, 10h - movd [r0+8], mm1 - WELSEMMS - ret - - - -ALIGN 16 -;*********************************************************************** -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; l3 will never been used -; destination: -; |a |b |c |d | -; |e |f |g |h | -; |i |a |b |c | -; |j |e |f |g | - -; a = (1 + lt + t0)>>1 -; b = (1 + t0 + t1)>>1 -; c = (1 + t1 + t2)>>1 -; d = (1 + t2 + t3)>>1 - -; e = (2 + l0 + (lt<<1) + t0)>>2 -; f = (2 + lt + (t0<<1) + t1)>>2 -; g = (2 + t0 + (t1<<1) + t2)>>2 - -; h = (2 + t1 + (t2<<1) + t3)>>2 -; i = (2 + lt + (l0<<1) + l1)>>2 -; j = (2 + l0 + (l1<<1) + l2)>>2 -; -; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) -;*********************************************************************** -WELS_EXTERN WelsI4x4LumaPredVR_mmx -WelsI4x4LumaPredVR_mmx: - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt] - psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx] - - movd mm1, [r1+2*r2-4] - punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1 - lea r1, [r1+2*r2] - movq mm2, [r1+r2-8] ; mm2[7] = l2 - punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx] - psrlq mm2, 28h - pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2] - - movq mm1, mm0 - psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx] - pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx] - - movq mm2, mm0 - psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx] - movq mm3, mm2 - pavgb mm2, mm0 - - pxor mm3, mm0 ; find odd value in the lowest bit of each byte - pand mm3, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm3 ; decrease 1 from odd bytes - - movq mm3, mm0 - psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx] - pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx] - movq mm2, mm3 - - psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a] - movd [r0], mm1 - - psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e] - movd [r0+4], mm2 - - movq mm4, mm3 - psllq mm4, 20h - psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i] - - movq mm5, mm3 - psllq mm5, 28h - psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j] - - psllq mm1, 8h - pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i] - movd [r0+8], mm4 - - psllq mm2, 8h - pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j] - movd [r0+12], mm5 - WELSEMMS - ret - -ALIGN 16 -;*********************************************************************** -; lt|t0|t1|t2|t3|t4|t5|t6|t7 -; l0| -; l1| -; l2| -; l3| -; lt,t0,t1,t2,t3 will never been used -; destination: -; |a |b |c |d | -; |b |c |d |e | -; |c |d |e |f | -; |d |e |f |g | - -; a = (2 + t0 + t2 + (t1<<1))>>2 -; b = (2 + t1 + t3 + (t2<<1))>>2 -; c = (2 + t2 + t4 + (t3<<1))>>2 -; d = (2 + t3 + t5 + (t4<<1))>>2 - -; e = (2 + t4 + t6 + (t5<<1))>>2 -; f = (2 + t5 + t7 + (t6<<1))>>2 -; g = (2 + t6 + t7 + (t7<<1))>>2 - -; [g f e d c b a] --> mov to memory -; -; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) -;*********************************************************************** -WELS_EXTERN WelsI4x4LumaPredDDL_mmx -WelsI4x4LumaPredDDL_mmx: - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] - movq mm1, mm0 - movq mm2, mm0 - - movq mm3, mm0 - psrlq mm3, 38h - psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx] - - psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx] - psrlq mm2, 8h - pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1] - - movq mm3, mm1 - pavgb mm1, mm2 - pxor mm3, mm2 ; find odd value in the lowest bit of each byte - pand mm3, [mmx_01bytes] ; set the odd bit - psubusb mm1, mm3 ; decrease 1 from odd bytes - - pavgb mm0, mm1 ; mm0 = [g f e d c b a xx] - - psrlq mm0, 8h - movd [r0], mm0 - psrlq mm0, 8h - movd [r0+4], mm0 - psrlq mm0, 8h - movd [r0+8], mm0 - psrlq mm0, 8h - movd [r0+12], mm0 - WELSEMMS - ret - - -ALIGN 16 -;*********************************************************************** -; lt|t0|t1|t2|t3|t4|t5|t6|t7 -; l0| -; l1| -; l2| -; l3| -; lt,t0,t1,t2,t3 will never been used -; destination: -; |a |b |c |d | -; |e |f |g |h | -; |b |c |d |i | -; |f |g |h |j | - -; a = (1 + t0 + t1)>>1 -; b = (1 + t1 + t2)>>1 -; c = (1 + t2 + t3)>>1 -; d = (1 + t3 + t4)>>1 -; i = (1 + t4 + t5)>>1 - -; e = (2 + t0 + (t1<<1) + t2)>>2 -; f = (2 + t1 + (t2<<1) + t3)>>2 -; g = (2 + t2 + (t3<<1) + t4)>>2 -; h = (2 + t3 + (t4<<1) + t5)>>2 -; j = (2 + t4 + (t5<<1) + t6)>>2 - -; [i d c b a] + [j h g f e] --> mov to memory -; -; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) -;*********************************************************************** -WELS_EXTERN WelsI4x4LumaPredVL_mmx -WelsI4x4LumaPredVL_mmx: - %assign push_num 0 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] - movq mm1, mm0 - movq mm2, mm0 - - psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1] - psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2] - - movq mm3, mm1 - pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a] - - movq mm4, mm2 - pavgb mm2, mm0 - pxor mm4, mm0 ; find odd value in the lowest bit of each byte - pand mm4, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm4 ; decrease 1 from odd bytes - - pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e] - - movd [r0], mm3 - psrlq mm3, 8h - movd [r0+8], mm3 - - movd [r0+4], mm2 - psrlq mm2, 8h - movd [r0+12], mm2 - WELSEMMS - ret - -ALIGN 16 -;*********************************************************************** -; -; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) -;*********************************************************************** -WELS_EXTERN WelsIChromaPredDc_sse2 -WelsIChromaPredDc_sse2: - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movq mm0, [r1] - - movzx r3, byte [r1+r2-0x01] ; l1 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] ; l2 - add r3, r4 - movzx r4, byte [r1+r2-0x01] ; l3 - add r3, r4 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] ; l4 - add r3, r4 - movd mm1, r3d ; mm1 = l1+l2+l3+l4 - - movzx r3, byte [r1+r2-0x01] ; l5 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] ; l6 - add r3, r4 - movzx r4, byte [r1+r2-0x01] ; l7 - add r3, r4 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] ; l8 - add r3, r4 - movd mm2, r3d ; mm2 = l5+l6+l7+l8 - - movq mm3, mm0 - psrlq mm0, 0x20 - psllq mm3, 0x20 - psrlq mm3, 0x20 - pxor mm4, mm4 - psadbw mm0, mm4 - psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2 - - paddq mm3, mm1 - movq mm1, mm2 - paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1 - - movq mm4, [mmx_0x02] - - paddq mm0, mm4 - psrlq mm0, 0x02 - - paddq mm2, mm4 - psrlq mm2, 0x02 - - paddq mm3, mm4 - paddq mm3, mm4 - psrlq mm3, 0x03 - - paddq mm1, mm4 - paddq mm1, mm4 - psrlq mm1, 0x03 - - pmuludq mm0, [mmx_01bytes] - pmuludq mm3, [mmx_01bytes] - psllq mm0, 0x20 - pxor mm0, mm3 ; mm0 = m_up - - pmuludq mm2, [mmx_01bytes] - pmuludq mm1, [mmx_01bytes] - psllq mm1, 0x20 - pxor mm1, mm2 ; mm2 = m_down - - movq [r0], mm0 - movq [r0+0x08], mm0 - movq [r0+0x10], mm0 - movq [r0+0x18], mm0 - - movq [r0+0x20], mm1 - movq [r0+0x28], mm1 - movq [r0+0x30], mm1 - movq [r0+0x38], mm1 - - pop r4 - pop r3 - WELSEMMS - ret - - - -ALIGN 16 -;*********************************************************************** -; -; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) -;*********************************************************************** -WELS_EXTERN WelsI16x16LumaPredDc_sse2 -WelsI16x16LumaPredDc_sse2: - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - %ifndef X86_32 - movsx r2, r2d - %endif - sub r1, r2 - movdqa xmm0, [r1] ; read one row - pxor xmm1, xmm1 - psadbw xmm0, xmm1 - movdqa xmm1, xmm0 - psrldq xmm1, 0x08 - pslldq xmm0, 0x08 - psrldq xmm0, 0x08 - paddw xmm0, xmm1 - - movzx r3, byte [r1+r2-0x01] - movzx r4, byte [r1+2*r2-0x01] - add r3, r4 - lea r1, [r1+r2] - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - add r3, 0x10 - movd xmm1, r3d - paddw xmm0, xmm1 - psrld xmm0, 0x05 - pmuludq xmm0, [mmx_01bytes] - pshufd xmm0, xmm0, 0 - - movdqa [r0], xmm0 - movdqa [r0+0x10], xmm0 - movdqa [r0+0x20], xmm0 - movdqa [r0+0x30], xmm0 - movdqa [r0+0x40], xmm0 - movdqa [r0+0x50], xmm0 - movdqa [r0+0x60], xmm0 - movdqa [r0+0x70], xmm0 - movdqa [r0+0x80], xmm0 - movdqa [r0+0x90], xmm0 - movdqa [r0+0xa0], xmm0 - movdqa [r0+0xb0], xmm0 - movdqa [r0+0xc0], xmm0 - movdqa [r0+0xd0], xmm0 - movdqa [r0+0xe0], xmm0 - movdqa [r0+0xf0], xmm0 - - pop r4 - pop r3 - ret - -;*********************************************************************** -; -;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc, -; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t); -; -;*********************************************************************** -%ifdef X86_ASM -WELS_EXTERN WelsSmpleSatdThree4x4_sse2 -align 16 -WelsSmpleSatdThree4x4_sse2: - push ebx - push esi - push edi - mov eax, [esp+24];p_enc - mov ebx, [esp+28];linesize_enc - - ; load source 4x4 samples and Hadamard transform - movd xmm0, [eax] - movd xmm1, [eax+ebx] - lea eax , [eax+2*ebx] - movd xmm2, [eax] - movd xmm3, [eax+ebx] - punpckldq xmm0, xmm2 - punpckldq xmm1, xmm3 - - pxor xmm6, xmm6 - punpcklbw xmm0, xmm6 - punpcklbw xmm1, xmm6 - - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - SSE2_XSawp qdq, xmm0, xmm2, xmm3 - - movdqa xmm4, xmm0 - paddw xmm0, xmm3 - psubw xmm4, xmm3 - - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm4 - punpckhwd xmm4, xmm2 - - SSE2_XSawp dq, xmm0, xmm4, xmm3 - SSE2_XSawp qdq, xmm0, xmm3, xmm5 - - movdqa xmm7, xmm0 - paddw xmm0, xmm5 - psubw xmm7, xmm5 - - SSE2_XSawp qdq, xmm0, xmm7, xmm1 - - ; Hadamard transform results are saved in xmm0 and xmm2 - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - - ; load top boundary samples: [a b c d] - mov eax, [esp+16];p_dec - sub eax, [esp+20];linesize_dec - movzx ecx, byte [eax] - movzx edx, byte [eax+1] - movzx esi, byte [eax+2] - movzx edi, byte [eax+3] - - ; get the transform results of top boundary samples: [a b c d] - add edx, ecx ; edx = a + b - add edi, esi ; edi = c + d - add ecx, ecx ; ecx = a + a - add esi, esi ; esi = c + c - sub ecx, edx ; ecx = a + a - a - b = a - b - sub esi, edi ; esi = c + c - c - d = c - d - add edi, edx ; edi = (a + b) + (c + d) - add edx, edx - sub edx, edi ; edx = (a + b) - (c + d) - add esi, ecx ; esi = (a - b) + (c - d) - add ecx, ecx - sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi] - - movdqa xmm6, xmm0 - movdqa xmm7, xmm2 - movd xmm5, edi ; store the edi for DC mode - pxor xmm3, xmm3 - pxor xmm4, xmm4 - pinsrw xmm3, edi, 0 - pinsrw xmm3, esi, 4 - psllw xmm3, 2 - pinsrw xmm4, edx, 0 - pinsrw xmm4, ecx, 4 - psllw xmm4, 2 - - ; get the satd of H - psubw xmm0, xmm3 - psubw xmm2, xmm4 - - WELS_AbsW xmm0, xmm1 - WELS_AbsW xmm2, xmm1 - paddusw xmm0, xmm2 - SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0 - - ; load left boundary samples: [a b c d]' - mov eax, [esp+16] - mov ebx, [esp+20] - movzx ecx, byte [eax-1] - movzx edx, byte [eax+ebx-1] - lea eax , [eax+2*ebx] - movzx esi, byte [eax-1] - movzx edi, byte [eax+ebx-1] - - ; get the transform results of left boundary samples: [a b c d]' - add edx, ecx ; edx = a + b - add edi, esi ; edi = c + d - add ecx, ecx ; ecx = a + a - add esi, esi ; esi = c + c - sub ecx, edx ; ecx = a + a - a - b = a - b - sub esi, edi ; esi = c + c - c - d = c - d - add edi, edx ; edi = (a + b) + (c + d) - add edx, edx - sub edx, edi ; edx = (a + b) - (c + d) - add esi, ecx ; esi = (a - b) + (c - d) - add ecx, ecx - sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]' - - ; store the transform results in xmm3 - movd xmm3, edi - pinsrw xmm3, edx, 1 - pinsrw xmm3, ecx, 2 - pinsrw xmm3, esi, 3 - psllw xmm3, 2 - - ; get the satd of V - movdqa xmm2, xmm6 - movdqa xmm4, xmm7 - psubw xmm2, xmm3 - WELS_AbsW xmm2, xmm1 - WELS_AbsW xmm4, xmm1 - paddusw xmm2, xmm4 - SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2 - - ; DC result is stored in xmm1 - add edi, 4 - movd xmm1, edi - paddw xmm1, xmm5 - psrlw xmm1, 3 - movdqa xmm5, xmm1 - psllw xmm1, 4 - - ; get the satd of DC - psubw xmm6, xmm1 - WELS_AbsW xmm6, xmm1 - WELS_AbsW xmm7, xmm1 - paddusw xmm6, xmm7 - SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6 - - ; comparing order: DC H V - mov edx, [esp+32] - movd eax, xmm6 - movd edi, xmm2 - movd esi, xmm0 - and eax, 0xffff - shr eax, 1 - and edi, 0xffff - shr edi, 1 - and esi, 0xffff - shr esi, 1 - add eax, [esp+40] - add edi, [esp+44] - add esi, [esp+48] - cmp ax, di - jg near not_dc - cmp ax, si - jg near not_dc_h - - ; for DC mode - movd ebx, xmm5 - imul ebx, 0x01010101 - movd xmm5, ebx - pshufd xmm5, xmm5, 0 - movdqa [edx], xmm5 - mov ebx, [esp+36] - mov dword [ebx], 0x02 - pop edi - pop esi - pop ebx - ret - -not_dc: - cmp di, si - jg near not_dc_h - - ; for H mode - SSE_DB_1_2REG xmm6, xmm7 - mov eax, [esp+16] - mov ebx, [esp+20] - movzx ecx, byte [eax-1] - movd xmm0, ecx - pmuludq xmm0, xmm6 - - movzx ecx, byte [eax+ebx-1] - movd xmm1, ecx - pmuludq xmm1, xmm6 -%if 1 - punpckldq xmm0, xmm1 -%else - unpcklps xmm0, xmm1 -%endif - lea eax, [eax+ebx*2] - movzx ecx, byte [eax-1] - movd xmm2, ecx - pmuludq xmm2, xmm6 - - movzx ecx, byte [eax+ebx-1] - movd xmm3, ecx - pmuludq xmm3, xmm6 -%if 1 - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 -%else - unpcklps xmm2, xmm3 - unpcklpd xmm0, xmm2 -%endif - movdqa [edx],xmm0 - - mov eax, edi - mov ebx, [esp+36] - mov dword [ebx], 0x01 - - pop edi - pop esi - pop ebx - ret -not_dc_h: - ; for V mode - mov eax, [esp+16] - sub eax, [esp+20] - movd xmm0, [eax] - pshufd xmm0, xmm0, 0 - movdqa [edx],xmm0 - - mov eax, esi - mov ebx, [esp+36] - mov dword [ebx], 0x00 - - pop edi - pop esi - pop ebx - ret -%endif - diff --git a/codec/processing/targets.mk b/codec/processing/targets.mk index 9017dfc4..c4c96ed7 100644 --- a/codec/processing/targets.mk +++ b/codec/processing/targets.mk @@ -23,7 +23,6 @@ ifeq ($(USE_ASM), Yes) PROCESSING_ASM_SRCS=\ $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\ $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\ - $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\ $(PROCESSING_SRCDIR)/./src/asm/vaa.asm\ PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)