From 8f9a5469beb962c22b6d8bbe78f01ec79fb33a55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Fri, 13 Dec 2013 09:40:57 +0200
Subject: [PATCH] Convert source files to Unix newlines

Most files were converted in ff6b669176, but some (non-C++ source files)
were left with Windows newlines.
---
 codec/decoder/core/asm/dct.asm             |  258 +-
 codec/decoder/core/asm/deblock.asm         | 4224 ++++++++++----------
 codec/decoder/core/asm/mc_chroma.asm       |  634 +--
 codec/encoder/core/asm/deblock.asm         | 4224 ++++++++++----------
 codec/encoder/core/asm/mc_chroma.asm       |  634 +--
 processing/build/linux/makefile            |  188 +-
 processing/src/asm/denoisefilter.asm       |  524 +--
 processing/src/asm/downsample_bilinear.asm | 2448 ++++++------
 processing/src/asm/intra_pred.asm          |  288 +-
 processing/src/asm/sad.asm                 |  160 +-
 processing/src/asm/vaa.asm                 | 3178 +++++++--------
 processing/src/common/WelsVP.def           |   70 +-
 testbin/layer2.cfg                         |   78 +-
 testbin/layer2_vd.cfg                      |   78 +-
 testbin/layer2_vd_rc.cfg                   |   78 +-
 testbin/welsenc.cfg                        |  126 +-
 testbin/welsenc_vd_1d.cfg                  |  126 +-
 testbin/welsenc_vd_rc.cfg                  |  126 +-
 18 files changed, 8721 insertions(+), 8721 deletions(-)

diff --git a/codec/decoder/core/asm/dct.asm b/codec/decoder/core/asm/dct.asm
index c30e4438..c493d0a7 100644
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -1,129 +1,129 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*     ?Redistributions of source code must retain the above copyright
-;*        notice, this list of conditions and the following disclaimer.
-;*
-;*     ?Redistributions in binary form must reproduce the above copyright
-;*        notice, this list of conditions and the following disclaimer in
-;*        the documentation and/or other materials provided with the
-;*        distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;* -;* -;* dct.asm -;* -;* Abstract -;* WelsDctFourT4_sse2 -;* -;* History -;* 8/4/2009 Created -;* -;* -;*************************************************************************/ - -%include "asm_inc.asm" - -BITS 32 - -;******************************************************************************* -; Macros and other preprocessor constants -;******************************************************************************* -%macro MMX_SumSubDiv2 3 - movq %3, %2 - psraw %3, $1 - paddw %3, %1 - psraw %1, $1 - psubw %1, %2 -%endmacro - -%macro MMX_SumSub 3 - movq %3, %2 - psubw %2, %1 - paddw %1, %3 -%endmacro - -%macro MMX_IDCT 6 - MMX_SumSub %4, %5, %6 - MMX_SumSubDiv2 %3, %2, %1 - MMX_SumSub %1, %4, %6 - MMX_SumSub %3, %5, %6 -%endmacro - - -%macro MMX_StoreDiff4P 5 - movd %2, %5 - punpcklbw %2, %4 - paddw %1, %3 - psraw %1, $6 - paddsw %1, %2 - packuswb %1, %2 - movd %5, %1 -%endmacro - -;******************************************************************************* -; Code -;******************************************************************************* - -SECTION .text - -WELS_EXTERN IdctResAddPred_mmx - -ALIGN 16 -;******************************************************************************* -; void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs ) -;******************************************************************************* - -IdctResAddPred_mmx: - -%define pushsize 0 -%define pPred esp+pushsize+4 -%define kiStride esp+pushsize+8 -%define pRs esp+pushsize+12 - - mov eax, [pRs ] - mov edx, [pPred ] - mov ecx, [kiStride] - movq mm0, [eax+ 0] - movq mm1, [eax+ 8] - movq mm2, [eax+16] - movq mm3, [eax+24] - - MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 - MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 - MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2 - MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 - - WELS_Zero mm7 - WELS_DW32 mm6 - - MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx] - MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx] - lea edx, [edx+2*ecx] - MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx] - MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx] - -%undef pushsize -%undef pPred -%undef kiStride -%undef pRs - emms - ret +;*! +;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* ?Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* ?Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* dct.asm +;* +;* Abstract +;* WelsDctFourT4_sse2 +;* +;* History +;* 8/4/2009 Created +;* +;* +;*************************************************************************/ + +%include "asm_inc.asm" + +BITS 32 + +;******************************************************************************* +; Macros and other preprocessor constants +;******************************************************************************* +%macro MMX_SumSubDiv2 3 + movq %3, %2 + psraw %3, $1 + paddw %3, %1 + psraw %1, $1 + psubw %1, %2 +%endmacro + +%macro MMX_SumSub 3 + movq %3, %2 + psubw %2, %1 + paddw %1, %3 +%endmacro + +%macro MMX_IDCT 6 + MMX_SumSub %4, %5, %6 + MMX_SumSubDiv2 %3, %2, %1 + MMX_SumSub %1, %4, %6 + MMX_SumSub %3, %5, %6 +%endmacro + + +%macro MMX_StoreDiff4P 5 + movd %2, %5 + punpcklbw %2, %4 + paddw %1, %3 + psraw %1, $6 + paddsw %1, %2 + packuswb %1, %2 + movd %5, %1 +%endmacro + +;******************************************************************************* +; Code +;******************************************************************************* + +SECTION .text + +WELS_EXTERN IdctResAddPred_mmx + +ALIGN 16 +;******************************************************************************* +; void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs ) +;******************************************************************************* + +IdctResAddPred_mmx: + +%define pushsize 0 +%define pPred esp+pushsize+4 +%define kiStride esp+pushsize+8 +%define pRs esp+pushsize+12 + + mov eax, [pRs ] + mov edx, [pPred ] + mov ecx, [kiStride] + movq mm0, [eax+ 0] + movq mm1, [eax+ 8] + movq mm2, [eax+16] + movq mm3, [eax+24] + + MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 + MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 + MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2 + MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 + + WELS_Zero mm7 + WELS_DW32 mm6 + + MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx] + MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx] + lea edx, [edx+2*ecx] + MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx] + MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx] + +%undef pushsize +%undef pPred +%undef kiStride +%undef pRs + emms + ret diff --git a/codec/decoder/core/asm/deblock.asm b/codec/decoder/core/asm/deblock.asm index 5614c254..ed82286c 100644 --- a/codec/decoder/core/asm/deblock.asm +++ b/codec/decoder/core/asm/deblock.asm @@ -1,2113 +1,2113 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. 
-;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* -;* deblock.asm -;* -;* Abstract -;* edge loop -;* -;* History -;* 08/07/2009 Created -;* -;* -;*************************************************************************/ -%include "asm_inc.asm" -BITS 32 - -;******************************************************************************* -; Macros and other preprocessor constants -;******************************************************************************* - -%ifdef FORMAT_COFF -SECTION .rodata pData -%else -SECTION .rodata align=16 -%endif - -SECTION .text - -;******************************************************************************** -; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, -; int32_t iAlpha, int32_t iBeta) -;******************************************************************************** -WELS_EXTERN DeblockChromaEq4V_sse2 - -ALIGN 16 -DeblockChromaEq4V_sse2: - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,68h - mov edx,[ebp+10h] ; iStride - mov eax,[ebp+8] ; pPixCb - mov ecx,[ebp+0Ch] ; pPixCr - movq xmm4,[ecx] - movq xmm5,[edx+ecx] - push esi - push edi - lea esi,[edx+edx] - mov edi,eax - sub edi,esi - movq xmm1,[edi] - mov edi,ecx - sub edi,esi - movq xmm2,[edi] - punpcklqdq xmm1,xmm2 - mov esi,eax - sub esi,edx - movq xmm2,[esi] - mov edi,ecx - sub edi,edx - movq xmm3,[edi] - punpcklqdq xmm2,xmm3 - movq xmm3,[eax] - punpcklqdq xmm3,xmm4 - movq xmm4,[edx+eax] - mov edx, [ebp + 14h] - punpcklqdq xmm4,xmm5 - movd xmm5,edx - mov edx, [ebp + 18h] - pxor xmm0,xmm0 - movdqa xmm6,xmm5 - punpcklwd xmm6,xmm5 - pshufd xmm5,xmm6,0 - movd xmm6,edx - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa xmm7,xmm1 - punpckhbw xmm1,xmm0 - punpcklbw xmm7,xmm0 - movdqa [esp+40h],xmm1 - movdqa [esp+60h],xmm7 - movdqa xmm7,xmm2 - punpcklbw xmm7,xmm0 - movdqa [esp+10h],xmm7 - movdqa xmm7,xmm3 - punpcklbw xmm7,xmm0 - punpckhbw xmm3,xmm0 - movdqa [esp+50h],xmm7 - movdqa xmm7,xmm4 - punpckhbw xmm4,xmm0 - punpckhbw xmm2,xmm0 - punpcklbw xmm7,xmm0 - movdqa [esp+30h],xmm3 - movdqa xmm3,[esp+10h] - movdqa xmm1,xmm3 - psubw xmm1,[esp+50h] - pabsw xmm1,xmm1 - movdqa [esp+20h],xmm4 - movdqa xmm0,xmm5 - pcmpgtw xmm0,xmm1 - movdqa xmm1,[esp+60h] - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - pand xmm0,xmm4 - movdqa xmm1,xmm7 - psubw xmm1,[esp+50h] - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - movdqa xmm1,xmm2 - psubw xmm1,[esp+30h] - pabsw xmm1,xmm1 - pcmpgtw xmm5,xmm1 - movdqa xmm1,[esp+40h] - pand xmm0,xmm4 - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - movdqa xmm1,[esp+20h] - psubw xmm1,[esp+30h] - pand xmm5,xmm4 - pabsw xmm1,xmm1 - pcmpgtw xmm6,xmm1 - pand xmm5,xmm6 - mov edx,2 - movsx edx,dx - movd xmm1,edx - movdqa 
xmm4,xmm1 - punpcklwd xmm4,xmm1 - pshufd xmm1,xmm4,0 - movdqa xmm4,[esp+60h] - movdqa xmm6,xmm4 - paddw xmm6,xmm4 - paddw xmm6,xmm3 - paddw xmm6,xmm7 - movdqa [esp+10h],xmm1 - paddw xmm6,[esp+10h] - psraw xmm6,2 - movdqa xmm4,xmm0 - pandn xmm4,xmm3 - movdqa xmm3,[esp+40h] - movdqa xmm1,xmm0 - pand xmm1,xmm6 - por xmm1,xmm4 - movdqa xmm6,xmm3 - paddw xmm6,xmm3 - movdqa xmm3,[esp+10h] - paddw xmm6,xmm2 - paddw xmm6,[esp+20h] - paddw xmm6,xmm3 - psraw xmm6,2 - movdqa xmm4,xmm5 - pand xmm4,xmm6 - movdqa xmm6,xmm5 - pandn xmm6,xmm2 - por xmm4,xmm6 - packuswb xmm1,xmm4 - movdqa xmm4,[esp+50h] - movdqa xmm6,xmm7 - paddw xmm6,xmm7 - paddw xmm6,xmm4 - paddw xmm6,[esp+60h] - paddw xmm6,xmm3 - psraw xmm6,2 - movdqa xmm2,xmm0 - pand xmm2,xmm6 - pandn xmm0,xmm4 - por xmm2,xmm0 - movdqa xmm0,[esp+20h] - movdqa xmm6,xmm0 - paddw xmm6,xmm0 - movdqa xmm0,[esp+30h] - paddw xmm6,xmm0 - paddw xmm6,[esp+40h] - movdqa xmm4,xmm5 - paddw xmm6,xmm3 - movq [esi],xmm1 - psraw xmm6,2 - pand xmm4,xmm6 - pandn xmm5,xmm0 - por xmm4,xmm5 - packuswb xmm2,xmm4 - movq [eax],xmm2 - psrldq xmm1,8 - movq [edi],xmm1 - pop edi - psrldq xmm2,8 - movq [ecx],xmm2 - pop esi - mov esp,ebp - pop ebp - ret - -;****************************************************************************** -; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, -; int32_t iAlpha, int32_t iBeta, int8_t * pTC); -;******************************************************************************* - -WELS_EXTERN DeblockChromaLt4V_sse2 - -DeblockChromaLt4V_sse2: - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,0E4h - push ebx - push esi - mov esi, [ebp+1Ch] ; pTC - movsx ebx, byte [esi+2] - push edi - movsx di,byte [esi+3] - mov word [esp+0Ch],bx - movsx bx,byte [esi+1] - movsx esi,byte [esi] - mov word [esp+0Eh],si - movzx esi,di - movd xmm1,esi - movzx esi,di - movd xmm2,esi - mov si,word [esp+0Ch] - mov edx, [ebp + 10h] - mov eax, [ebp + 08h] - movzx edi,si - movzx esi,si - mov ecx, [ebp + 0Ch] - movd xmm4,esi - movzx esi,bx - movd xmm5,esi - movd xmm3,edi - movzx esi,bx - movd xmm6,esi - mov si,word [esp+0Eh] - movzx edi,si - movzx esi,si - punpcklwd xmm6,xmm2 - pxor xmm0,xmm0 - movdqa [esp+40h],xmm0 - movd xmm7,edi - movd xmm0,esi - lea esi,[edx+edx] - mov edi,eax - sub edi,esi - punpcklwd xmm5,xmm1 - movdqa xmm1,[esp+40h] - punpcklwd xmm0,xmm4 - movq xmm4,[edx+ecx] - punpcklwd xmm7,xmm3 - movq xmm3,[eax] - punpcklwd xmm0,xmm6 - movq xmm6,[edi] - punpcklwd xmm7,xmm5 - punpcklwd xmm0,xmm7 - mov edi,ecx - sub edi,esi - movdqa xmm2,xmm1 - psubw xmm2,xmm0 - movdqa [esp+60h],xmm2 - movq xmm2, [edi] - punpcklqdq xmm6,xmm2 - mov esi,eax - sub esi,edx - movq xmm7,[esi] - mov edi,ecx - sub edi,edx - movq xmm2,[edi] - punpcklqdq xmm7,xmm2 - movq xmm2,[ecx] - punpcklqdq xmm3,xmm2 - movq xmm2,[edx+eax] - movsx edx,word [ebp + 14h] - punpcklqdq xmm2,xmm4 - movdqa [esp+0E0h],xmm2 - movd xmm2,edx - movsx edx,word [ebp + 18h] - movdqa xmm4,xmm2 - punpcklwd xmm4,xmm2 - movd xmm2,edx - movdqa xmm5,xmm2 - punpcklwd xmm5,xmm2 - pshufd xmm2,xmm5,0 - movdqa [esp+50h],xmm2 - movdqa xmm2,xmm6 - punpcklbw xmm2,xmm1 - movdqa [esp+0D0h],xmm3 - pshufd xmm4,xmm4,0 - movdqa [esp+30h],xmm2 - punpckhbw xmm6,xmm1 - movdqa [esp+80h],xmm6 - movdqa xmm6,[esp+0D0h] - punpckhbw xmm6,xmm1 - movdqa [esp+70h],xmm6 - movdqa xmm6, [esp+0E0h] - punpckhbw xmm6,xmm1 - movdqa [esp+90h],xmm6 - movdqa xmm5, [esp+0E0h] - movdqa xmm2,xmm7 - punpckhbw xmm7,xmm1 - punpcklbw xmm5,xmm1 - movdqa [esp+0A0h],xmm7 - punpcklbw xmm3,xmm1 - mov edx,4 - punpcklbw xmm2,xmm1 - movsx 
edx,dx - movd xmm6,edx - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa xmm7,[esp+30h] - movdqa [esp+20h],xmm6 - psubw xmm7,xmm5 - movdqa xmm6,xmm0 - pcmpgtw xmm6,xmm1 - movdqa xmm1,[esp+60h] - movdqa [esp+40h],xmm6 - movdqa xmm6,xmm3 - psubw xmm6,xmm2 - psllw xmm6,2 - paddw xmm6,xmm7 - paddw xmm6, [esp+20h] - movdqa xmm7, [esp+50h] - psraw xmm6,3 - pmaxsw xmm1,xmm6 - movdqa [esp+10h],xmm0 - movdqa xmm6, [esp+10h] - pminsw xmm6,xmm1 - movdqa [esp+10h],xmm6 - movdqa xmm1,xmm2 - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - movdqa xmm6,xmm4 - pcmpgtw xmm6,xmm1 - movdqa xmm1, [esp+30h] - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - pcmpgtw xmm7,xmm1 - movdqa xmm1,[esp+50h] - pand xmm6,xmm7 - movdqa xmm7,[esp+50h] - psubw xmm5,xmm3 - pabsw xmm5,xmm5 - pcmpgtw xmm1,xmm5 - movdqa xmm5,[esp+80h] - psubw xmm5,[esp+90h] - pand xmm6,xmm1 - pand xmm6,[esp+40h] - movdqa xmm1,[esp+10h] - pand xmm1,xmm6 - movdqa xmm6,[esp+70h] - movdqa [esp+30h],xmm1 - movdqa xmm1,[esp+0A0h] - psubw xmm6,xmm1 - psllw xmm6,2 - paddw xmm6,xmm5 - paddw xmm6,[esp+20h] - movdqa xmm5,[esp+60h] - psraw xmm6,3 - pmaxsw xmm5,xmm6 - pminsw xmm0,xmm5 - movdqa xmm5,[esp+70h] - movdqa xmm6,xmm1 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm4,xmm6 - movdqa xmm6,[esp+80h] - psubw xmm6,xmm1 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+90h] - pand xmm4,xmm7 - movdqa xmm7,[esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - pand xmm4,xmm7 - pand xmm4,[esp+40h] - pand xmm0,xmm4 - movdqa xmm4,[esp+30h] - paddw xmm2,xmm4 - paddw xmm1,xmm0 - packuswb xmm2,xmm1 - movq [esi],xmm2 - psubw xmm3,xmm4 - psubw xmm5,xmm0 - packuswb xmm3,xmm5 - movq [eax],xmm3 - psrldq xmm2,8 - movq [edi],xmm2 - pop edi - pop esi - psrldq xmm3,8 - movq [ecx],xmm3 - pop ebx - mov esp,ebp - pop ebp - ret - -;*************************************************************************** -; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, -; int32_t iAlpha, int32_t iBeta) -;*************************************************************************** - -WELS_EXTERN DeblockChromaEq4H_sse2 - -ALIGN 16 - -DeblockChromaEq4H_sse2: - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,0C8h - mov ecx,dword [ebp+8] - mov edx,dword [ebp+0Ch] - mov eax,dword [ebp+10h] - sub ecx,2 - sub edx,2 - push esi - lea esi,[eax+eax*2] - mov dword [esp+18h],ecx - mov dword [esp+4],edx - lea ecx,[ecx+eax*4] - lea edx,[edx+eax*4] - lea eax,[esp+7Ch] - push edi - mov dword [esp+14h],esi - mov dword [esp+18h],ecx - mov dword [esp+0Ch],edx - mov dword [esp+10h],eax - mov esi,dword [esp+1Ch] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+14h] - movd xmm0,dword [esi] - movd xmm1,dword [esi+ecx] - movd xmm2,dword [esi+ecx*2] - movd xmm3,dword [esi+edx] - mov esi,dword [esp+8] - movd xmm4,dword [esi] - movd xmm5,dword [esi+ecx] - movd xmm6,dword [esi+ecx*2] - movd xmm7,dword [esi+edx] - punpckldq xmm0,xmm4 - punpckldq xmm1,xmm5 - punpckldq xmm2,xmm6 - punpckldq xmm3,xmm7 - mov esi,dword [esp+18h] - mov edi,dword [esp+0Ch] - movd xmm4,dword [esi] - movd xmm5,dword [edi] - punpckldq xmm4,xmm5 - punpcklqdq xmm0,xmm4 - movd xmm4,dword [esi+ecx] - movd xmm5,dword [edi+ecx] - punpckldq xmm4,xmm5 - punpcklqdq xmm1,xmm4 - movd xmm4,dword [esi+ecx*2] - movd xmm5,dword [edi+ecx*2] - punpckldq xmm4,xmm5 - punpcklqdq xmm2,xmm4 - movd xmm4,dword [esi+edx] - movd xmm5,dword [edi+edx] - punpckldq xmm4,xmm5 - punpcklqdq xmm3,xmm4 - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - 
movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov edi,dword [esp+10h] - movdqa [edi],xmm0 - movdqa [edi+10h],xmm5 - movdqa [edi+20h],xmm1 - movdqa [edi+30h],xmm6 - movsx ecx,word [ebp+14h] - movsx edx,word [ebp+18h] - movdqa xmm6,[esp+80h] - movdqa xmm4,[esp+90h] - movdqa xmm5,[esp+0A0h] - movdqa xmm7,[esp+0B0h] - pxor xmm0,xmm0 - movd xmm1,ecx - movdqa xmm2,xmm1 - punpcklwd xmm2,xmm1 - pshufd xmm1,xmm2,0 - movd xmm2,edx - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm2,xmm3,0 - movdqa xmm3,xmm6 - punpckhbw xmm6,xmm0 - movdqa [esp+60h],xmm6 - movdqa xmm6,[esp+90h] - punpckhbw xmm6,xmm0 - movdqa [esp+30h],xmm6 - movdqa xmm6,[esp+0A0h] - punpckhbw xmm6,xmm0 - movdqa [esp+40h],xmm6 - movdqa xmm6,[esp+0B0h] - punpckhbw xmm6,xmm0 - movdqa [esp+70h],xmm6 - punpcklbw xmm7,xmm0 - punpcklbw xmm4,xmm0 - punpcklbw xmm5,xmm0 - punpcklbw xmm3,xmm0 - movdqa [esp+50h],xmm7 - movdqa xmm6,xmm4 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - movdqa xmm0,xmm1 - pcmpgtw xmm0,xmm6 - movdqa xmm6,xmm3 - psubw xmm6,xmm4 - pabsw xmm6,xmm6 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pand xmm0,xmm7 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+30h] - psubw xmm6,[esp+40h] - pabsw xmm6,xmm6 - pcmpgtw xmm1,xmm6 - movdqa xmm6,[esp+60h] - psubw xmm6,[esp+30h] - pabsw xmm6,xmm6 - pand xmm0,xmm7 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+70h] - psubw xmm6,[esp+40h] - pabsw xmm6,xmm6 - pand xmm1,xmm7 - pcmpgtw xmm2,xmm6 - pand xmm1,xmm2 - mov eax,2 - movsx ecx,ax - movd xmm2,ecx - movdqa xmm6,xmm2 - punpcklwd xmm6,xmm2 - pshufd xmm2,xmm6,0 - movdqa [esp+20h],xmm2 - movdqa xmm2,xmm3 - paddw xmm2,xmm3 - paddw xmm2,xmm4 - paddw xmm2,[esp+50h] - paddw xmm2,[esp+20h] - psraw xmm2,2 - movdqa xmm6,xmm0 - pand xmm6,xmm2 - movdqa xmm2,xmm0 - pandn xmm2,xmm4 - por xmm6,xmm2 - movdqa xmm2,[esp+60h] - movdqa xmm7,xmm2 - paddw xmm7,xmm2 - paddw xmm7,[esp+30h] - paddw xmm7,[esp+70h] - paddw xmm7,[esp+20h] - movdqa xmm4,xmm1 - movdqa xmm2,xmm1 - pandn xmm2,[esp+30h] - psraw xmm7,2 - pand xmm4,xmm7 - por xmm4,xmm2 - movdqa xmm2,[esp+50h] - packuswb xmm6,xmm4 - movdqa [esp+90h],xmm6 - movdqa xmm6,xmm2 - paddw xmm6,xmm2 - movdqa xmm2,[esp+20h] - paddw xmm6,xmm5 - paddw xmm6,xmm3 - movdqa xmm4,xmm0 - pandn xmm0,xmm5 - paddw xmm6,xmm2 - psraw xmm6,2 - pand xmm4,xmm6 - por xmm4,xmm0 - movdqa xmm0,[esp+70h] - movdqa xmm5,xmm0 - paddw xmm5,xmm0 - movdqa xmm0,[esp+40h] - paddw xmm5,xmm0 - paddw xmm5,[esp+60h] - movdqa xmm3,xmm1 - paddw xmm5,xmm2 - psraw xmm5,2 - pand xmm3,xmm5 - pandn xmm1,xmm0 - por xmm3,xmm1 - packuswb xmm4,xmm3 - movdqa [esp+0A0h],xmm4 - mov esi,dword [esp+10h] - movdqa xmm0,[esi] - movdqa xmm1,[esi+10h] - movdqa xmm2,[esi+20h] - movdqa xmm3,[esi+30h] - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq 
xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov esi,dword [esp+1Ch] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+14h] - mov edi,dword [esp+8] - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov esi,dword [esp+18h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov edi,dword [esp+0Ch] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - pop edi - pop esi - mov esp,ebp - pop ebp - ret - -;******************************************************************************* -; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, -; int32_t iAlpha, int32_t iBeta, int8_t * pTC); -;******************************************************************************* - -WELS_EXTERN DeblockChromaLt4H_sse2 - -ALIGN 16 - -DeblockChromaLt4H_sse2: - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,108h - mov ecx,dword [ebp+8] - mov edx,dword [ebp+0Ch] - mov eax,dword [ebp+10h] - sub ecx,2 - sub edx,2 - push esi - lea esi,[eax+eax*2] - mov dword [esp+10h],ecx - mov dword [esp+4],edx - lea ecx,[ecx+eax*4] - lea edx,[edx+eax*4] - lea eax,[esp+6Ch] - push edi - mov dword [esp+0Ch],esi - mov dword [esp+18h],ecx - mov dword [esp+10h],edx - mov dword [esp+1Ch],eax - mov esi,dword [esp+14h] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+0Ch] - movd xmm0,dword [esi] - movd xmm1,dword [esi+ecx] - movd xmm2,dword [esi+ecx*2] - movd xmm3,dword [esi+edx] - mov esi,dword [esp+8] - movd xmm4,dword [esi] - movd xmm5,dword [esi+ecx] - movd xmm6,dword [esi+ecx*2] - movd xmm7,dword [esi+edx] - punpckldq xmm0,xmm4 - punpckldq xmm1,xmm5 - punpckldq xmm2,xmm6 - punpckldq xmm3,xmm7 - mov esi,dword [esp+18h] - mov edi,dword [esp+10h] - movd xmm4,dword [esi] - movd xmm5,dword [edi] - punpckldq xmm4,xmm5 - punpcklqdq xmm0,xmm4 - movd xmm4,dword [esi+ecx] - movd xmm5,dword [edi+ecx] - punpckldq xmm4,xmm5 - punpcklqdq xmm1,xmm4 - movd xmm4,dword [esi+ecx*2] - movd xmm5,dword [edi+ecx*2] - punpckldq xmm4,xmm5 - punpcklqdq xmm2,xmm4 - movd xmm4,dword [esi+edx] - movd xmm5,dword [edi+edx] - punpckldq xmm4,xmm5 - punpcklqdq xmm3,xmm4 - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov edi,dword [esp+1Ch] - movdqa [edi],xmm0 - movdqa [edi+10h],xmm5 - movdqa [edi+20h],xmm1 - movdqa [edi+30h],xmm6 - mov eax,dword [ebp+1Ch] - movsx cx,byte [eax+3] - movsx dx,byte [eax+2] - movsx si,byte [eax+1] - movsx ax,byte [eax] - movzx edi,cx - movzx ecx,cx - movd xmm2,ecx - movzx ecx,dx - movzx edx,dx - movd xmm3,ecx - movd xmm4,edx - movzx ecx,si - movzx edx,si - movd xmm5,ecx - pxor xmm0,xmm0 - movd xmm6,edx - movzx ecx,ax - movdqa [esp+60h],xmm0 - movzx edx,ax - movsx eax,word [ebp+14h] - 
punpcklwd xmm6,xmm2 - movd xmm1,edi - movd xmm7,ecx - movsx ecx,word [ebp+18h] - movd xmm0,edx - punpcklwd xmm7,xmm3 - punpcklwd xmm5,xmm1 - movdqa xmm1,[esp+60h] - punpcklwd xmm7,xmm5 - movdqa xmm5,[esp+0A0h] - punpcklwd xmm0,xmm4 - punpcklwd xmm0,xmm6 - movdqa xmm6, [esp+70h] - punpcklwd xmm0,xmm7 - movdqa xmm7,[esp+80h] - movdqa xmm2,xmm1 - psubw xmm2,xmm0 - movdqa [esp+0D0h],xmm2 - movd xmm2,eax - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm4,xmm3,0 - movd xmm2,ecx - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm2,xmm3,0 - movdqa xmm3, [esp+90h] - movdqa [esp+50h],xmm2 - movdqa xmm2,xmm6 - punpcklbw xmm2,xmm1 - punpckhbw xmm6,xmm1 - movdqa [esp+40h],xmm2 - movdqa [esp+0B0h],xmm6 - movdqa xmm6,[esp+90h] - movdqa xmm2,xmm7 - punpckhbw xmm7,xmm1 - punpckhbw xmm6,xmm1 - punpcklbw xmm2,xmm1 - punpcklbw xmm3,xmm1 - punpcklbw xmm5,xmm1 - movdqa [esp+0F0h],xmm7 - movdqa [esp+0C0h],xmm6 - movdqa xmm6, [esp+0A0h] - punpckhbw xmm6,xmm1 - movdqa [esp+0E0h],xmm6 - mov edx,4 - movsx eax,dx - movd xmm6,eax - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa [esp+30h],xmm6 - movdqa xmm7, [esp+40h] - psubw xmm7,xmm5 - movdqa xmm6,xmm0 - pcmpgtw xmm6,xmm1 - movdqa [esp+60h],xmm6 - movdqa xmm1, [esp+0D0h] - movdqa xmm6,xmm3 - psubw xmm6,xmm2 - psllw xmm6,2 - paddw xmm6,xmm7 - paddw xmm6,[esp+30h] - psraw xmm6,3 - pmaxsw xmm1,xmm6 - movdqa xmm7,[esp+50h] - movdqa [esp+20h],xmm0 - movdqa xmm6, [esp+20h] - pminsw xmm6,xmm1 - movdqa [esp+20h],xmm6 - movdqa xmm6,xmm4 - movdqa xmm1,xmm2 - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - pcmpgtw xmm6,xmm1 - movdqa xmm1, [esp+40h] - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - pcmpgtw xmm7,xmm1 - movdqa xmm1, [esp+50h] - pand xmm6,xmm7 - movdqa xmm7, [esp+50h] - psubw xmm5,xmm3 - pabsw xmm5,xmm5 - pcmpgtw xmm1,xmm5 - movdqa xmm5, [esp+0B0h] - psubw xmm5,[esp+0E0h] - pand xmm6,xmm1 - pand xmm6, [esp+60h] - movdqa xmm1, [esp+20h] - pand xmm1,xmm6 - movdqa xmm6, [esp+0C0h] - movdqa [esp+40h],xmm1 - movdqa xmm1, [esp+0F0h] - psubw xmm6,xmm1 - psllw xmm6,2 - paddw xmm6,xmm5 - paddw xmm6, [esp+30h] - movdqa xmm5, [esp+0D0h] - psraw xmm6,3 - pmaxsw xmm5,xmm6 - pminsw xmm0,xmm5 - movdqa xmm5,[esp+0C0h] - movdqa xmm6,xmm1 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm4,xmm6 - movdqa xmm6,[esp+0B0h] - psubw xmm6,xmm1 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - movdqa xmm6, [esp+0E0h] - pand xmm4,xmm7 - movdqa xmm7, [esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - pand xmm4,xmm7 - pand xmm4,[esp+60h] - pand xmm0,xmm4 - movdqa xmm4, [esp+40h] - paddw xmm2,xmm4 - paddw xmm1,xmm0 - psubw xmm3,xmm4 - psubw xmm5,xmm0 - packuswb xmm2,xmm1 - packuswb xmm3,xmm5 - movdqa [esp+80h],xmm2 - movdqa [esp+90h],xmm3 - mov esi,dword [esp+1Ch] - movdqa xmm0, [esi] - movdqa xmm1, [esi+10h] - movdqa xmm2, [esi+20h] - movdqa xmm3, [esi+30h] - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov esi,dword [esp+14h] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+0Ch] - mov edi,dword [esp+8] - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - 
psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov esi,dword [esp+18h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov edi,dword [esp+10h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - pop edi - pop esi - mov esp,ebp - pop ebp - ret - - - -;******************************************************************************* -; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, -; int32_t iBeta, int8_t * pTC) -;******************************************************************************* - - -WELS_EXTERN DeblockLumaLt4V_sse2 - -ALIGN 16 - -DeblockLumaLt4V_sse2: - push ebp - mov ebp, esp - and esp, -16 ; fffffff0H - sub esp, 420 ; 000001a4H - mov eax, dword [ebp+8] - mov ecx, dword [ebp+12] - - pxor xmm0, xmm0 - push ebx - mov edx, dword [ebp+24] - movdqa [esp+424-384], xmm0 - push esi - - lea esi, [ecx+ecx*2] - push edi - mov edi, eax - sub edi, esi - movdqa xmm0, [edi] - - lea esi, [ecx+ecx] - movdqa [esp+432-208], xmm0 - mov edi, eax - sub edi, esi - movdqa xmm0, [edi] - movdqa [esp+448-208], xmm0 - - mov ebx, eax - sub ebx, ecx - movdqa xmm0, [ebx] - movdqa [esp+464-208], xmm0 - - movdqa xmm0, [eax] - - add ecx, eax - movdqa [esp+480-208], xmm0 - movdqa xmm0, [ecx] - mov dword [esp+432-404], ecx - - movsx ecx, word [ebp+16] - movdqa [esp+496-208], xmm0 - movdqa xmm0, [esi+eax] - - movsx si, byte [edx] - movdqa [esp+512-208], xmm0 - movd xmm0, ecx - movsx ecx, word [ebp+20] - movdqa xmm1, xmm0 - punpcklwd xmm1, xmm0 - pshufd xmm0, xmm1, 0 - movdqa [esp+432-112], xmm0 - movd xmm0, ecx - movsx cx, byte [edx+1] - movdqa xmm1, xmm0 - punpcklwd xmm1, xmm0 - mov dword [esp+432-408], ebx - movzx ebx, cx - pshufd xmm0, xmm1, 0 - movd xmm1, ebx - movzx ebx, cx - movd xmm2, ebx - movzx ebx, cx - movzx ecx, cx - movd xmm4, ecx - movzx ecx, si - movd xmm5, ecx - movzx ecx, si - movd xmm6, ecx - movzx ecx, si - movd xmm7, ecx - movzx ecx, si - movdqa [esp+432-336], xmm0 - movd xmm0, ecx - - movsx cx, byte [edx+3] - movsx dx, byte [edx+2] - movd xmm3, ebx - punpcklwd xmm0, xmm4 - movzx esi, cx - punpcklwd xmm6, xmm2 - punpcklwd xmm5, xmm1 - punpcklwd xmm0, xmm6 - punpcklwd xmm7, xmm3 - punpcklwd xmm7, xmm5 - punpcklwd xmm0, xmm7 - movdqa [esp+432-400], xmm0 - movd xmm0, esi - movzx esi, cx - movd xmm2, esi - movzx esi, cx - movzx ecx, cx - movd xmm4, ecx - movzx ecx, dx - movd xmm3, esi - movd xmm5, ecx - punpcklwd xmm5, xmm0 - - movdqa xmm0, [esp+432-384] - movzx ecx, dx - movd xmm6, ecx - movzx ecx, dx - movzx edx, dx - punpcklwd xmm6, xmm2 - movd xmm7, ecx - movd xmm1, edx - - movdqa xmm2, [esp+448-208] - punpcklbw xmm2, xmm0 - - mov ecx, 4 - movsx edx, cx - punpcklwd xmm7, xmm3 - punpcklwd xmm7, xmm5 - movdqa xmm5, [esp+496-208] - movdqa xmm3, [esp+464-208] - punpcklbw xmm5, xmm0 - movdqa [esp+432-240], xmm5 - movdqa xmm5, [esp+512-208] - punpcklbw xmm5, xmm0 - movdqa [esp+432-352], xmm5 - punpcklwd xmm1, xmm4 - movdqa xmm4, [esp+432-208] - punpcklwd xmm1, xmm6 - movdqa xmm6, [esp+480-208] - punpcklwd xmm1, xmm7 - punpcklbw xmm6, xmm0 - punpcklbw xmm3, xmm0 - punpcklbw xmm4, xmm0 - movdqa xmm7, xmm3 - psubw xmm7, xmm4 - pabsw xmm7, xmm7 - movdqa [esp+432-272], xmm4 - movdqa xmm4, [esp+432-336] - movdqa xmm5, 
xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-288], xmm5 - movdqa xmm7, xmm6 - psubw xmm7, [esp+432-352] - pabsw xmm7, xmm7 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-256], xmm5 - movdqa xmm5, xmm3 - pavgw xmm5, xmm6 - movdqa [esp+432-304], xmm5 - movdqa xmm5, [esp+432-400] - psubw xmm5, [esp+432-288] - psubw xmm5, [esp+432-256] - movdqa [esp+432-224], xmm5 - movdqa xmm5, xmm6 - psubw xmm5, xmm3 - movdqa [esp+432-32], xmm6 - psubw xmm6, [esp+432-240] - movdqa xmm7, xmm5 - movdqa [esp+432-384], xmm5 - movdqa xmm5, [esp+432-112] - pabsw xmm7, xmm7 - pcmpgtw xmm5, xmm7 - pabsw xmm6, xmm6 - movdqa xmm7, xmm4 - pcmpgtw xmm7, xmm6 - - pand xmm5, xmm7 - movdqa xmm6, xmm3 - psubw xmm6, xmm2 - pabsw xmm6, xmm6 - movdqa xmm7, xmm4 - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+432-400] - pand xmm5, xmm7 - movdqa xmm7, xmm6 - pcmpeqw xmm6, xmm0 - pcmpgtw xmm7, xmm0 - por xmm7, xmm6 - pand xmm5, xmm7 - movdqa [esp+432-320], xmm5 - movd xmm5, edx - movdqa xmm6, xmm5 - punpcklwd xmm6, xmm5 - pshufd xmm5, xmm6, 0 - movdqa [esp+432-336], xmm5 - movdqa xmm5, [esp+432-224] - movdqa [esp+432-368], xmm5 - movdqa xmm6, xmm0 - psubw xmm6, xmm5 - movdqa xmm5, [esp+432-384] - psllw xmm5, 2 - movdqa xmm7, xmm2 - psubw xmm7, [esp+432-240] - paddw xmm7, xmm5 - paddw xmm7, [esp+432-336] - movdqa xmm5, [esp+432-368] - psraw xmm7, 3 - pmaxsw xmm6, xmm7 - pminsw xmm5, xmm6 - - pand xmm5, [esp+432-320] - movdqa xmm6, [esp+432-400] - movdqa [esp+432-64], xmm5 - movdqa [esp+432-384], xmm6 - movdqa xmm5, xmm0 - psubw xmm5, xmm6 - movdqa [esp+432-368], xmm5 - movdqa xmm6, xmm5 - movdqa xmm5, [esp+432-272] - paddw xmm5, [esp+432-304] - movdqa xmm7, xmm2 - paddw xmm7, xmm2 - psubw xmm5, xmm7 - psraw xmm5, 1 - pmaxsw xmm6, xmm5 - movdqa xmm5, [esp+432-384] - pminsw xmm5, xmm6 - - pand xmm5, [esp+432-320] - pand xmm5, [esp+432-288] - movdqa xmm6, [esp+432-240] - movdqa [esp+432-96], xmm5 - movdqa xmm5, [esp+432-352] - paddw xmm5, [esp+432-304] - movdqa xmm7, xmm6 - paddw xmm7, xmm6 - movdqa xmm6, [esp+432-368] - psubw xmm5, xmm7 - - movdqa xmm7, [esp+496-208] - psraw xmm5, 1 - pmaxsw xmm6, xmm5 - movdqa xmm5, [esp+432-400] - pminsw xmm5, xmm6 - pand xmm5, [esp+432-320] - pand xmm5, [esp+432-256] - movdqa xmm6, [esp+448-208] - punpckhbw xmm7, xmm0 - movdqa [esp+432-352], xmm7 - - movdqa xmm7, [esp+512-208] - punpckhbw xmm6, xmm0 - movdqa [esp+432-48], xmm5 - movdqa xmm5, [esp+432-208] - movdqa [esp+432-368], xmm6 - movdqa xmm6, [esp+464-208] - punpckhbw xmm7, xmm0 - punpckhbw xmm5, xmm0 - movdqa [esp+432-384], xmm7 - punpckhbw xmm6, xmm0 - movdqa [esp+432-400], xmm6 - - movdqa xmm7, [esp+432-400] - movdqa xmm6, [esp+480-208] - psubw xmm7, xmm5 - movdqa [esp+432-16], xmm5 - pabsw xmm7, xmm7 - punpckhbw xmm6, xmm0 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-288], xmm5 - - movdqa xmm7, xmm6 - psubw xmm7, [esp+432-384] - pabsw xmm7, xmm7 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-256], xmm5 - - movdqa xmm5, [esp+432-400] - movdqa [esp+432-80], xmm6 - pavgw xmm5, xmm6 - movdqa [esp+432-304], xmm5 - - movdqa xmm5, xmm1 - psubw xmm5, [esp+432-288] - psubw xmm5, [esp+432-256] - movdqa [esp+432-224], xmm5 - movdqa xmm5, xmm6 - psubw xmm5, [esp+432-400] - psubw xmm6, [esp+432-352] - movdqa [esp+432-272], xmm5 - movdqa xmm7, xmm5 - movdqa xmm5, [esp+432-112] - pabsw xmm7, xmm7 - pcmpgtw xmm5, xmm7 - movdqa xmm7, xmm4 - pabsw xmm6, xmm6 - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+432-368] - - pand xmm5, xmm7 - movdqa xmm7, [esp+432-400] - psubw xmm7, xmm6 - psubw xmm6, [esp+432-352] - pabsw xmm7, xmm7 - 
pcmpgtw xmm4, xmm7 - pand xmm5, xmm4 - - paddw xmm2, [esp+432-96] - movdqa xmm4, xmm1 - pcmpgtw xmm4, xmm0 - movdqa xmm7, xmm1 - pcmpeqw xmm7, xmm0 - por xmm4, xmm7 - pand xmm5, xmm4 - movdqa xmm4, [esp+432-224] - movdqa [esp+432-320], xmm5 - movdqa xmm5, [esp+432-272] - movdqa xmm7, xmm0 - psubw xmm7, xmm4 - psubw xmm0, xmm1 - psllw xmm5, 2 - paddw xmm6, xmm5 - paddw xmm6, [esp+432-336] - movdqa xmm5, [esp+432-368] - movdqa [esp+432-336], xmm0 - psraw xmm6, 3 - pmaxsw xmm7, xmm6 - pminsw xmm4, xmm7 - pand xmm4, [esp+432-320] - movdqa xmm6, xmm0 - movdqa xmm0, [esp+432-16] - paddw xmm0, [esp+432-304] - movdqa [esp+432-272], xmm4 - movdqa xmm4, [esp+432-368] - paddw xmm4, xmm4 - psubw xmm0, xmm4 - - movdqa xmm4, [esp+432-64] - psraw xmm0, 1 - pmaxsw xmm6, xmm0 - movdqa xmm0, [esp+432-400] - movdqa xmm7, xmm1 - pminsw xmm7, xmm6 - movdqa xmm6, [esp+432-320] - pand xmm7, xmm6 - pand xmm7, [esp+432-288] - paddw xmm5, xmm7 - packuswb xmm2, xmm5 - movdqa xmm5, [esp+432-272] - paddw xmm0, xmm5 - paddw xmm3, xmm4 - packuswb xmm3, xmm0 - - movdqa xmm0, [esp+432-32] - psubw xmm0, xmm4 - movdqa xmm4, [esp+432-80] - psubw xmm4, xmm5 - - movdqa xmm5, [esp+432-240] - paddw xmm5, [esp+432-48] - packuswb xmm0, xmm4 - movdqa xmm4, [esp+432-384] - paddw xmm4, [esp+432-304] - movdqa [esp+480-208], xmm0 - movdqa xmm0, [esp+432-352] - movdqa xmm7, xmm0 - paddw xmm0, xmm0 - - mov ecx, dword [esp+432-408] - - mov edx, dword [esp+432-404] - psubw xmm4, xmm0 - movdqa xmm0, [esp+432-336] - movdqa [edi], xmm2 - psraw xmm4, 1 - pmaxsw xmm0, xmm4 - pminsw xmm1, xmm0 - movdqa xmm0, [esp+480-208] - - pop edi - pand xmm1, xmm6 - pand xmm1, [esp+428-256] - movdqa [ecx], xmm3 - paddw xmm7, xmm1 - pop esi - packuswb xmm5, xmm7 - movdqa [eax], xmm0 - movdqa [edx], xmm5 - pop ebx - mov esp, ebp - pop ebp - ret - - -;******************************************************************************* -; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, -; int32_t iBeta) -;******************************************************************************* - -WELS_EXTERN DeblockLumaEq4V_sse2 - -ALIGN 16 - -DeblockLumaEq4V_sse2: - - push ebp - mov ebp, esp - and esp, -16 ; fffffff0H - sub esp, 628 ; 00000274H - mov eax, dword [ebp+8] - mov ecx, dword [ebp+12] - push ebx - push esi - - lea edx, [ecx*4] - pxor xmm0, xmm0 - movdqa xmm2, xmm0 - - movdqa xmm0, [ecx+eax] - mov esi, eax - sub esi, edx - movdqa xmm3, [esi] - movdqa xmm5, [eax] - push edi - lea edi, [ecx+ecx] - lea ebx, [ecx+ecx*2] - mov dword [esp+640-600], edi - mov esi, eax - sub esi, edi - movdqa xmm1, [esi] - movdqa [esp+720-272], xmm0 - mov edi, eax - sub edi, ecx - movdqa xmm4, [edi] - add ecx, eax - mov dword [esp+640-596], ecx - - mov ecx, dword [esp+640-600] - movdqa xmm0, [ecx+eax] - movdqa [esp+736-272], xmm0 - - movdqa xmm0, [eax+ebx] - mov edx, eax - sub edx, ebx - - movsx ebx, word [ebp+16] - movdqa xmm6, [edx] - add ecx, eax - movdqa [esp+752-272], xmm0 - movd xmm0, ebx - - movsx ebx, word [ebp+20] - movdqa xmm7, xmm0 - punpcklwd xmm7, xmm0 - pshufd xmm0, xmm7, 0 - movdqa [esp+640-320], xmm0 - movd xmm0, ebx - movdqa xmm7, xmm0 - punpcklwd xmm7, xmm0 - pshufd xmm0, xmm7, 0 - - movdqa xmm7, [esp+736-272] - punpcklbw xmm7, xmm2 - movdqa [esp+640-416], xmm7 - movdqa [esp+640-512], xmm0 - movdqa xmm0, xmm1 - movdqa [esp+672-272], xmm1 - movdqa xmm1, xmm4 - movdqa [esp+704-272], xmm5 - punpcklbw xmm5, xmm2 - punpcklbw xmm1, xmm2 - - movdqa xmm7, xmm5 - psubw xmm7, xmm1 - pabsw xmm7, xmm7 - movdqa [esp+640-560], xmm7 - punpcklbw xmm0, xmm2 - 
movdqa [esp+688-272], xmm4 - movdqa xmm4, [esp+720-272] - movdqa [esp+640-480], xmm0 - - movdqa xmm7, xmm1 - psubw xmm7, xmm0 - - movdqa xmm0, [esp+640-512] - pabsw xmm7, xmm7 - punpcklbw xmm4, xmm2 - pcmpgtw xmm0, xmm7 - movdqa [esp+640-384], xmm4 - movdqa xmm7, xmm5 - psubw xmm7, xmm4 - movdqa xmm4, [esp+640-512] - movdqa [esp+656-272], xmm6 - punpcklbw xmm6, xmm2 - pabsw xmm7, xmm7 - movdqa [esp+640-48], xmm2 - movdqa [esp+640-368], xmm6 - movdqa [esp+640-144], xmm1 - movdqa [esp+640-400], xmm5 - pcmpgtw xmm4, xmm7 - pand xmm0, xmm4 - movdqa xmm4, [esp+640-320] - pcmpgtw xmm4, [esp+640-560] - pand xmm0, xmm4 - - mov ebx, 2 - movsx ebx, bx - movd xmm4, ebx - movdqa xmm7, xmm4 - punpcklwd xmm7, xmm4 - movdqa xmm4, [esp+640-320] - psraw xmm4, 2 - pshufd xmm7, xmm7, 0 - paddw xmm4, xmm7 - movdqa [esp+640-576], xmm4 - pcmpgtw xmm4, [esp+640-560] - movdqa [esp+640-560], xmm4 - - movdqa xmm4, [esp+640-512] - movdqa [esp+640-624], xmm7 - movdqa xmm7, xmm1 - psubw xmm7, xmm6 - pabsw xmm7, xmm7 - pcmpgtw xmm4, xmm7 - - pand xmm4, [esp+640-560] - movdqa [esp+640-544], xmm4 - movdqa xmm4, [esp+640-512] - movdqa xmm7, xmm5 - psubw xmm7, [esp+640-416] - pabsw xmm7, xmm7 - pcmpgtw xmm4, xmm7 - - pand xmm4, [esp+640-560] - movdqa [esp+640-560], xmm4 - - movdqa xmm4, [esp+640-544] - pandn xmm4, xmm6 - movdqa [esp+640-16], xmm4 - mov ebx, 4 - movsx ebx, bx - movd xmm4, ebx - movdqa xmm7, xmm4 - punpcklwd xmm7, xmm4 - movdqa xmm4, xmm3 - punpcklbw xmm4, xmm2 - psllw xmm4, 1 - paddw xmm4, xmm6 - paddw xmm4, xmm6 - paddw xmm4, xmm6 - paddw xmm4, [esp+640-480] - - movdqa xmm6, [esp+640-560] - pshufd xmm7, xmm7, 0 - paddw xmm4, xmm1 - movdqa [esp+640-592], xmm7 - paddw xmm4, xmm5 - paddw xmm4, xmm7 - movdqa xmm7, [esp+640-416] - pandn xmm6, xmm7 - movdqa [esp+640-80], xmm6 - movdqa xmm6, [esp+752-272] - punpcklbw xmm6, xmm2 - psllw xmm6, 1 - paddw xmm6, xmm7 - paddw xmm6, xmm7 - paddw xmm6, xmm7 - paddw xmm6, [esp+640-384] - - movdqa xmm7, [esp+640-480] - paddw xmm6, xmm5 - paddw xmm6, xmm1 - paddw xmm6, [esp+640-592] - psraw xmm6, 3 - pand xmm6, [esp+640-560] - movdqa [esp+640-112], xmm6 - movdqa xmm6, [esp+640-544] - pandn xmm6, xmm7 - movdqa [esp+640-336], xmm6 - movdqa xmm6, [esp+640-544] - movdqa [esp+640-528], xmm6 - movdqa xmm6, [esp+640-368] - paddw xmm6, xmm7 - movdqa xmm7, xmm1 - psraw xmm4, 3 - pand xmm4, [esp+640-544] - paddw xmm7, xmm5 - paddw xmm6, xmm7 - paddw xmm6, [esp+640-624] - movdqa xmm7, [esp+640-528] - - paddw xmm5, xmm1 - psraw xmm6, 2 - pand xmm7, xmm6 - - movdqa xmm6, [esp+640-384] - movdqa [esp+640-64], xmm7 - movdqa xmm7, [esp+640-560] - pandn xmm7, xmm6 - movdqa [esp+640-304], xmm7 - movdqa xmm7, [esp+640-560] - movdqa [esp+640-528], xmm7 - movdqa xmm7, [esp+640-416] - paddw xmm7, xmm6 - paddw xmm7, xmm5 - paddw xmm7, [esp+640-624] - movdqa xmm5, [esp+640-528] - psraw xmm7, 2 - pand xmm5, xmm7 - movdqa [esp+640-32], xmm5 - - movdqa xmm5, [esp+640-544] - movdqa [esp+640-528], xmm5 - movdqa xmm5, [esp+640-480] - movdqa xmm7, xmm5 - paddw xmm7, xmm5 - movdqa xmm5, xmm1 - paddw xmm5, xmm6 - paddw xmm6, [esp+640-592] - paddw xmm7, xmm5 - paddw xmm7, [esp+640-624] - movdqa xmm5, [esp+640-528] - psraw xmm7, 2 - pandn xmm5, xmm7 - movdqa xmm7, [esp+640-480] - paddw xmm7, xmm1 - paddw xmm7, [esp+640-400] - movdqa xmm1, [esp+640-544] - movdqa [esp+640-352], xmm5 - movdqa xmm5, [esp+640-368] - psllw xmm7, 1 - paddw xmm7, xmm6 - paddw xmm5, xmm7 - - movdqa xmm7, [esp+640-400] - psraw xmm5, 3 - pand xmm1, xmm5 - movdqa xmm5, [esp+640-480] - movdqa [esp+640-96], xmm1 - movdqa xmm1, 
[esp+640-560] - movdqa [esp+640-528], xmm1 - movdqa xmm1, [esp+640-384] - movdqa xmm6, xmm1 - paddw xmm6, xmm1 - paddw xmm1, [esp+640-400] - paddw xmm1, [esp+640-144] - paddw xmm7, xmm5 - paddw xmm5, [esp+640-592] - paddw xmm6, xmm7 - paddw xmm6, [esp+640-624] - movdqa xmm7, [esp+640-528] - psraw xmm6, 2 - psllw xmm1, 1 - paddw xmm1, xmm5 - - movdqa xmm5, [esp+656-272] - pandn xmm7, xmm6 - movdqa xmm6, [esp+640-416] - paddw xmm6, xmm1 - movdqa xmm1, [esp+640-560] - psraw xmm6, 3 - pand xmm1, xmm6 - - movdqa xmm6, [esp+704-272] - movdqa [esp+640-128], xmm1 - movdqa xmm1, [esp+672-272] - punpckhbw xmm1, xmm2 - movdqa [esp+640-448], xmm1 - movdqa xmm1, [esp+688-272] - punpckhbw xmm1, xmm2 - punpckhbw xmm6, xmm2 - movdqa [esp+640-288], xmm7 - punpckhbw xmm5, xmm2 - movdqa [esp+640-496], xmm1 - movdqa [esp+640-432], xmm6 - - movdqa xmm7, [esp+720-272] - punpckhbw xmm7, xmm2 - movdqa [esp+640-464], xmm7 - - movdqa xmm7, [esp+736-272] - punpckhbw xmm7, xmm2 - movdqa [esp+640-528], xmm7 - - movdqa xmm7, xmm6 - - psubw xmm6, [esp+640-464] - psubw xmm7, xmm1 - pabsw xmm7, xmm7 - movdqa [esp+640-560], xmm7 - por xmm4, [esp+640-16] - pabsw xmm6, xmm6 - movdqa xmm7, xmm1 - psubw xmm7, [esp+640-448] - - movdqa xmm1, [esp+640-512] - pabsw xmm7, xmm7 - pcmpgtw xmm1, xmm7 - movdqa xmm7, [esp+640-512] - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+640-320] - pand xmm1, xmm7 - movdqa xmm7, [esp+640-560] - pcmpgtw xmm6, xmm7 - pand xmm1, xmm6 - - movdqa xmm6, [esp+640-576] - pcmpgtw xmm6, xmm7 - - movdqa xmm7, [esp+640-496] - punpckhbw xmm3, xmm2 - movdqa [esp+640-560], xmm6 - movdqa xmm6, [esp+640-512] - psubw xmm7, xmm5 - pabsw xmm7, xmm7 - pcmpgtw xmm6, xmm7 - - pand xmm6, [esp+640-560] - movdqa xmm7, [esp+640-432] - psubw xmm7, [esp+640-528] - - psllw xmm3, 1 - movdqa [esp+640-544], xmm6 - movdqa xmm6, [esp+640-512] - - movdqa xmm2, [esp+640-544] - paddw xmm3, xmm5 - paddw xmm3, xmm5 - paddw xmm3, xmm5 - paddw xmm3, [esp+640-448] - paddw xmm3, [esp+640-496] - pabsw xmm7, xmm7 - pcmpgtw xmm6, xmm7 - pand xmm6, [esp+640-560] - movdqa [esp+640-560], xmm6 - - movdqa xmm6, xmm0 - pand xmm6, xmm4 - movdqa xmm4, xmm0 - pandn xmm4, [esp+640-368] - por xmm6, xmm4 - movdqa xmm4, [esp+640-432] - paddw xmm3, xmm4 - paddw xmm3, [esp+640-592] - psraw xmm3, 3 - pand xmm3, xmm2 - pandn xmm2, xmm5 - por xmm3, xmm2 - movdqa xmm7, xmm1 - pand xmm7, xmm3 - movdqa xmm3, [esp+640-64] - por xmm3, [esp+640-336] - movdqa xmm2, xmm1 - pandn xmm2, xmm5 - por xmm7, xmm2 - - movdqa xmm2, xmm0 - pand xmm2, xmm3 - movdqa xmm3, xmm0 - pandn xmm3, [esp+640-480] - por xmm2, xmm3 - packuswb xmm6, xmm7 - movdqa [esp+640-336], xmm2 - movdqa [esp+656-272], xmm6 - movdqa xmm6, [esp+640-544] - movdqa xmm2, xmm5 - paddw xmm2, [esp+640-448] - movdqa xmm3, xmm1 - movdqa xmm7, [esp+640-496] - paddw xmm7, xmm4 - paddw xmm2, xmm7 - paddw xmm2, [esp+640-624] - movdqa xmm7, [esp+640-544] - psraw xmm2, 2 - pand xmm6, xmm2 - movdqa xmm2, [esp+640-448] - pandn xmm7, xmm2 - por xmm6, xmm7 - pand xmm3, xmm6 - movdqa xmm6, xmm1 - pandn xmm6, xmm2 - paddw xmm2, [esp+640-496] - paddw xmm2, xmm4 - por xmm3, xmm6 - movdqa xmm6, [esp+640-336] - packuswb xmm6, xmm3 - psllw xmm2, 1 - movdqa [esp+672-272], xmm6 - movdqa xmm6, [esp+640-96] - por xmm6, [esp+640-352] - - movdqa xmm3, xmm0 - pand xmm3, xmm6 - movdqa xmm6, xmm0 - pandn xmm6, [esp+640-144] - por xmm3, xmm6 - movdqa xmm6, [esp+640-544] - movdqa [esp+640-352], xmm3 - movdqa xmm3, [esp+640-464] - paddw xmm3, [esp+640-592] - paddw xmm2, xmm3 - movdqa xmm3, [esp+640-448] - paddw xmm5, xmm2 - movdqa xmm2, 
[esp+640-496] - psraw xmm5, 3 - pand xmm6, xmm5 - movdqa xmm5, [esp+640-464] - paddw xmm2, xmm5 - paddw xmm5, [esp+640-432] - movdqa xmm4, xmm3 - paddw xmm4, xmm3 - paddw xmm4, xmm2 - paddw xmm4, [esp+640-624] - movdqa xmm2, [esp+640-544] - paddw xmm3, [esp+640-592] - psraw xmm4, 2 - pandn xmm2, xmm4 - por xmm6, xmm2 - movdqa xmm7, xmm1 - pand xmm7, xmm6 - movdqa xmm6, [esp+640-496] - movdqa xmm2, xmm1 - pandn xmm2, xmm6 - por xmm7, xmm2 - movdqa xmm2, [esp+640-352] - packuswb xmm2, xmm7 - movdqa [esp+688-272], xmm2 - movdqa xmm2, [esp+640-128] - por xmm2, [esp+640-288] - - movdqa xmm4, xmm0 - pand xmm4, xmm2 - paddw xmm5, xmm6 - movdqa xmm2, xmm0 - pandn xmm2, [esp+640-400] - por xmm4, xmm2 - movdqa xmm2, [esp+640-528] - psllw xmm5, 1 - paddw xmm5, xmm3 - movdqa xmm3, [esp+640-560] - paddw xmm2, xmm5 - psraw xmm2, 3 - movdqa [esp+640-288], xmm4 - movdqa xmm4, [esp+640-560] - pand xmm4, xmm2 - movdqa xmm2, [esp+640-464] - movdqa xmm5, xmm2 - paddw xmm5, xmm2 - movdqa xmm2, [esp+640-432] - paddw xmm2, [esp+640-448] - movdqa xmm7, xmm1 - paddw xmm5, xmm2 - paddw xmm5, [esp+640-624] - movdqa xmm6, [esp+640-560] - psraw xmm5, 2 - pandn xmm3, xmm5 - por xmm4, xmm3 - movdqa xmm3, [esp+640-32] - por xmm3, [esp+640-304] - pand xmm7, xmm4 - movdqa xmm4, [esp+640-432] - movdqa xmm5, [esp+640-464] - movdqa xmm2, xmm1 - pandn xmm2, xmm4 - paddw xmm4, [esp+640-496] - por xmm7, xmm2 - movdqa xmm2, [esp+640-288] - packuswb xmm2, xmm7 - movdqa [esp+704-272], xmm2 - - movdqa xmm2, xmm0 - pand xmm2, xmm3 - movdqa xmm3, xmm0 - pandn xmm3, [esp+640-384] - por xmm2, xmm3 - movdqa [esp+640-304], xmm2 - movdqa xmm2, [esp+640-528] - movdqa xmm3, xmm2 - paddw xmm3, [esp+640-464] - paddw xmm3, xmm4 - paddw xmm3, [esp+640-624] - psraw xmm3, 2 - pand xmm6, xmm3 - movdqa xmm3, [esp+640-560] - movdqa xmm4, xmm3 - pandn xmm4, xmm5 - por xmm6, xmm4 - movdqa xmm7, xmm1 - pand xmm7, xmm6 - movdqa xmm6, [esp+640-304] - movdqa xmm4, xmm1 - pandn xmm4, xmm5 - por xmm7, xmm4 - - movdqa xmm4, xmm0 - pandn xmm0, [esp+640-416] - packuswb xmm6, xmm7 - movdqa xmm7, [esp+640-112] - por xmm7, [esp+640-80] - pand xmm4, xmm7 - por xmm4, xmm0 - movdqa xmm0, [esp+752-272] - punpckhbw xmm0, [esp+640-48] - psllw xmm0, 1 - paddw xmm0, xmm2 - paddw xmm0, xmm2 - paddw xmm0, xmm2 - paddw xmm0, xmm5 - paddw xmm0, [esp+640-432] - paddw xmm0, [esp+640-496] - paddw xmm0, [esp+640-592] - psraw xmm0, 3 - pand xmm0, xmm3 - movdqa xmm7, xmm1 - pandn xmm3, xmm2 - por xmm0, xmm3 - pand xmm7, xmm0 - - movdqa xmm0, [esp+656-272] - movdqa [edx], xmm0 - - movdqa xmm0, [esp+672-272] - - mov edx, dword [esp+640-596] - movdqa [esi], xmm0 - movdqa xmm0, [esp+688-272] - movdqa [edi], xmm0 - movdqa xmm0, [esp+704-272] - - pop edi - pandn xmm1, xmm2 - movdqa [eax], xmm0 - por xmm7, xmm1 - pop esi - packuswb xmm4, xmm7 - movdqa [edx], xmm6 - movdqa [ecx], xmm4 - pop ebx - mov esp, ebp - pop ebp - ret - - -;******************************************************************************** -; -; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst); -; -;******************************************************************************** - -WELS_EXTERN DeblockLumaTransposeH2V_sse2 - -ALIGN 16 - -DeblockLumaTransposeH2V_sse2: - push ebp - push ebx - mov ebp, esp - and esp,0FFFFFFF0h - sub esp, 10h - - mov eax, [ebp + 0Ch] - mov ecx, [ebp + 10h] - lea edx, [eax + ecx * 8] - lea ebx, [ecx*3] - - movq xmm0, [eax] - movq xmm7, [edx] - punpcklqdq xmm0, xmm7 - movq xmm1, [eax + ecx] - movq xmm7, [edx + ecx] - punpcklqdq xmm1, xmm7 - movq xmm2, [eax + 
ecx*2] - movq xmm7, [edx + ecx*2] - punpcklqdq xmm2, xmm7 - movq xmm3, [eax + ebx] - movq xmm7, [edx + ebx] - punpcklqdq xmm3, xmm7 - - lea eax, [eax + ecx * 4] - lea edx, [edx + ecx * 4] - movq xmm4, [eax] - movq xmm7, [edx] - punpcklqdq xmm4, xmm7 - movq xmm5, [eax + ecx] - movq xmm7, [edx + ecx] - punpcklqdq xmm5, xmm7 - movq xmm6, [eax + ecx*2] - movq xmm7, [edx + ecx*2] - punpcklqdq xmm6, xmm7 - - movdqa [esp], xmm0 - movq xmm7, [eax + ebx] - movq xmm0, [edx + ebx] - punpcklqdq xmm7, xmm0 - movdqa xmm0, [esp] - - SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] - ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 - - mov eax, [ebp + 14h] - movdqa [eax], xmm4 - movdqa [eax + 10h], xmm2 - movdqa [eax + 20h], xmm3 - movdqa [eax + 30h], xmm7 - movdqa [eax + 40h], xmm5 - movdqa [eax + 50h], xmm1 - movdqa [eax + 60h], xmm6 - movdqa [eax + 70h], xmm0 - - mov esp, ebp - pop ebx - pop ebp - ret - - - -;******************************************************************************************* -; -; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc); -; -;******************************************************************************************* - -WELS_EXTERN DeblockLumaTransposeV2H_sse2 - -ALIGN 16 - -DeblockLumaTransposeV2H_sse2: - push ebp - mov ebp, esp - - and esp, 0FFFFFFF0h - sub esp, 10h - - mov eax, [ebp + 10h] - mov ecx, [ebp + 0Ch] - mov edx, [ebp + 08h] - - movdqa xmm0, [eax] - movdqa xmm1, [eax + 10h] - movdqa xmm2, [eax + 20h] - movdqa xmm3, [eax + 30h] - movdqa xmm4, [eax + 40h] - movdqa xmm5, [eax + 50h] - movdqa xmm6, [eax + 60h] - movdqa xmm7, [eax + 70h] - - SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] - ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 - - lea eax, [ecx * 3] - - movq [edx], xmm4 - movq [edx + ecx], xmm2 - movq [edx + ecx*2], xmm3 - movq [edx + eax], xmm7 - - lea edx, [edx + ecx*4] - movq [edx], xmm5 - movq [edx + ecx], xmm1 - movq [edx + ecx*2], xmm6 - movq [edx + eax], xmm0 - - psrldq xmm4, 8 - psrldq xmm2, 8 - psrldq xmm3, 8 - psrldq xmm7, 8 - psrldq xmm5, 8 - psrldq xmm1, 8 - psrldq xmm6, 8 - psrldq xmm0, 8 - - lea edx, [edx + ecx*4] - movq [edx], xmm4 - movq [edx + ecx], xmm2 - movq [edx + ecx*2], xmm3 - movq [edx + eax], xmm7 - - lea edx, [edx + ecx*4] - movq [edx], xmm5 - movq [edx + ecx], xmm1 - movq [edx + ecx*2], xmm6 - movq [edx + eax], xmm0 - - - mov esp, ebp - pop ebp +;*! +;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* deblock.asm +;* +;* Abstract +;* edge loop +;* +;* History +;* 08/07/2009 Created +;* +;* +;*************************************************************************/ +%include "asm_inc.asm" +BITS 32 + +;******************************************************************************* +; Macros and other preprocessor constants +;******************************************************************************* + +%ifdef FORMAT_COFF +SECTION .rodata pData +%else +SECTION .rodata align=16 +%endif + +SECTION .text + +;******************************************************************************** +; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, +; int32_t iAlpha, int32_t iBeta) +;******************************************************************************** +WELS_EXTERN DeblockChromaEq4V_sse2 + +ALIGN 16 +DeblockChromaEq4V_sse2: + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,68h + mov edx,[ebp+10h] ; iStride + mov eax,[ebp+8] ; pPixCb + mov ecx,[ebp+0Ch] ; pPixCr + movq xmm4,[ecx] + movq xmm5,[edx+ecx] + push esi + push edi + lea esi,[edx+edx] + mov edi,eax + sub edi,esi + movq xmm1,[edi] + mov edi,ecx + sub edi,esi + movq xmm2,[edi] + punpcklqdq xmm1,xmm2 + mov esi,eax + sub esi,edx + movq xmm2,[esi] + mov edi,ecx + sub edi,edx + movq xmm3,[edi] + punpcklqdq xmm2,xmm3 + movq xmm3,[eax] + punpcklqdq xmm3,xmm4 + movq xmm4,[edx+eax] + mov edx, [ebp + 14h] + punpcklqdq xmm4,xmm5 + movd xmm5,edx + mov edx, [ebp + 18h] + pxor xmm0,xmm0 + movdqa xmm6,xmm5 + punpcklwd xmm6,xmm5 + pshufd xmm5,xmm6,0 + movd xmm6,edx + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa xmm7,xmm1 + punpckhbw xmm1,xmm0 + punpcklbw xmm7,xmm0 + movdqa [esp+40h],xmm1 + movdqa [esp+60h],xmm7 + movdqa xmm7,xmm2 + punpcklbw xmm7,xmm0 + movdqa [esp+10h],xmm7 + movdqa xmm7,xmm3 + punpcklbw xmm7,xmm0 + punpckhbw xmm3,xmm0 + movdqa [esp+50h],xmm7 + movdqa xmm7,xmm4 + punpckhbw xmm4,xmm0 + punpckhbw xmm2,xmm0 + punpcklbw xmm7,xmm0 + movdqa [esp+30h],xmm3 + movdqa xmm3,[esp+10h] + movdqa xmm1,xmm3 + psubw xmm1,[esp+50h] + pabsw xmm1,xmm1 + movdqa [esp+20h],xmm4 + movdqa xmm0,xmm5 + pcmpgtw xmm0,xmm1 + movdqa xmm1,[esp+60h] + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + pand xmm0,xmm4 + movdqa xmm1,xmm7 + psubw xmm1,[esp+50h] + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + movdqa xmm1,xmm2 + psubw xmm1,[esp+30h] + pabsw xmm1,xmm1 + pcmpgtw xmm5,xmm1 + movdqa xmm1,[esp+40h] + pand xmm0,xmm4 + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + movdqa xmm1,[esp+20h] + psubw xmm1,[esp+30h] + pand xmm5,xmm4 + pabsw xmm1,xmm1 + pcmpgtw xmm6,xmm1 + pand xmm5,xmm6 + mov edx,2 + movsx edx,dx + movd xmm1,edx + movdqa xmm4,xmm1 + punpcklwd xmm4,xmm1 + pshufd xmm1,xmm4,0 + movdqa xmm4,[esp+60h] + movdqa xmm6,xmm4 + paddw xmm6,xmm4 + paddw xmm6,xmm3 + paddw xmm6,xmm7 + movdqa [esp+10h],xmm1 + paddw xmm6,[esp+10h] + psraw xmm6,2 + movdqa xmm4,xmm0 + pandn xmm4,xmm3 + movdqa 
xmm3,[esp+40h] + movdqa xmm1,xmm0 + pand xmm1,xmm6 + por xmm1,xmm4 + movdqa xmm6,xmm3 + paddw xmm6,xmm3 + movdqa xmm3,[esp+10h] + paddw xmm6,xmm2 + paddw xmm6,[esp+20h] + paddw xmm6,xmm3 + psraw xmm6,2 + movdqa xmm4,xmm5 + pand xmm4,xmm6 + movdqa xmm6,xmm5 + pandn xmm6,xmm2 + por xmm4,xmm6 + packuswb xmm1,xmm4 + movdqa xmm4,[esp+50h] + movdqa xmm6,xmm7 + paddw xmm6,xmm7 + paddw xmm6,xmm4 + paddw xmm6,[esp+60h] + paddw xmm6,xmm3 + psraw xmm6,2 + movdqa xmm2,xmm0 + pand xmm2,xmm6 + pandn xmm0,xmm4 + por xmm2,xmm0 + movdqa xmm0,[esp+20h] + movdqa xmm6,xmm0 + paddw xmm6,xmm0 + movdqa xmm0,[esp+30h] + paddw xmm6,xmm0 + paddw xmm6,[esp+40h] + movdqa xmm4,xmm5 + paddw xmm6,xmm3 + movq [esi],xmm1 + psraw xmm6,2 + pand xmm4,xmm6 + pandn xmm5,xmm0 + por xmm4,xmm5 + packuswb xmm2,xmm4 + movq [eax],xmm2 + psrldq xmm1,8 + movq [edi],xmm1 + pop edi + psrldq xmm2,8 + movq [ecx],xmm2 + pop esi + mov esp,ebp + pop ebp + ret + +;****************************************************************************** +; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, +; int32_t iAlpha, int32_t iBeta, int8_t * pTC); +;******************************************************************************* + +WELS_EXTERN DeblockChromaLt4V_sse2 + +DeblockChromaLt4V_sse2: + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,0E4h + push ebx + push esi + mov esi, [ebp+1Ch] ; pTC + movsx ebx, byte [esi+2] + push edi + movsx di,byte [esi+3] + mov word [esp+0Ch],bx + movsx bx,byte [esi+1] + movsx esi,byte [esi] + mov word [esp+0Eh],si + movzx esi,di + movd xmm1,esi + movzx esi,di + movd xmm2,esi + mov si,word [esp+0Ch] + mov edx, [ebp + 10h] + mov eax, [ebp + 08h] + movzx edi,si + movzx esi,si + mov ecx, [ebp + 0Ch] + movd xmm4,esi + movzx esi,bx + movd xmm5,esi + movd xmm3,edi + movzx esi,bx + movd xmm6,esi + mov si,word [esp+0Eh] + movzx edi,si + movzx esi,si + punpcklwd xmm6,xmm2 + pxor xmm0,xmm0 + movdqa [esp+40h],xmm0 + movd xmm7,edi + movd xmm0,esi + lea esi,[edx+edx] + mov edi,eax + sub edi,esi + punpcklwd xmm5,xmm1 + movdqa xmm1,[esp+40h] + punpcklwd xmm0,xmm4 + movq xmm4,[edx+ecx] + punpcklwd xmm7,xmm3 + movq xmm3,[eax] + punpcklwd xmm0,xmm6 + movq xmm6,[edi] + punpcklwd xmm7,xmm5 + punpcklwd xmm0,xmm7 + mov edi,ecx + sub edi,esi + movdqa xmm2,xmm1 + psubw xmm2,xmm0 + movdqa [esp+60h],xmm2 + movq xmm2, [edi] + punpcklqdq xmm6,xmm2 + mov esi,eax + sub esi,edx + movq xmm7,[esi] + mov edi,ecx + sub edi,edx + movq xmm2,[edi] + punpcklqdq xmm7,xmm2 + movq xmm2,[ecx] + punpcklqdq xmm3,xmm2 + movq xmm2,[edx+eax] + movsx edx,word [ebp + 14h] + punpcklqdq xmm2,xmm4 + movdqa [esp+0E0h],xmm2 + movd xmm2,edx + movsx edx,word [ebp + 18h] + movdqa xmm4,xmm2 + punpcklwd xmm4,xmm2 + movd xmm2,edx + movdqa xmm5,xmm2 + punpcklwd xmm5,xmm2 + pshufd xmm2,xmm5,0 + movdqa [esp+50h],xmm2 + movdqa xmm2,xmm6 + punpcklbw xmm2,xmm1 + movdqa [esp+0D0h],xmm3 + pshufd xmm4,xmm4,0 + movdqa [esp+30h],xmm2 + punpckhbw xmm6,xmm1 + movdqa [esp+80h],xmm6 + movdqa xmm6,[esp+0D0h] + punpckhbw xmm6,xmm1 + movdqa [esp+70h],xmm6 + movdqa xmm6, [esp+0E0h] + punpckhbw xmm6,xmm1 + movdqa [esp+90h],xmm6 + movdqa xmm5, [esp+0E0h] + movdqa xmm2,xmm7 + punpckhbw xmm7,xmm1 + punpcklbw xmm5,xmm1 + movdqa [esp+0A0h],xmm7 + punpcklbw xmm3,xmm1 + mov edx,4 + punpcklbw xmm2,xmm1 + movsx edx,dx + movd xmm6,edx + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa xmm7,[esp+30h] + movdqa [esp+20h],xmm6 + psubw xmm7,xmm5 + movdqa xmm6,xmm0 + pcmpgtw xmm6,xmm1 + movdqa xmm1,[esp+60h] + movdqa [esp+40h],xmm6 + movdqa xmm6,xmm3 + 
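
The DeblockChromaEq4V_sse2 routine above packs the corresponding Cb and Cr rows into one xmm register and applies the H.264 bS==4 chroma filter to both planes at once; the pcmpgtw chains build the alpha/beta eligibility mask and the paddw/psraw-by-2 run is the 3-tap filter itself. A scalar C sketch of the per-sample arithmetic it vectorizes (the helper name and layout are illustrative, not code from this patch):

#include <stdint.h>
#include <stdlib.h>

/* One chroma sample pair across a bS==4 edge; 'pix' points at q0 and the
 * p-samples sit in the rows above (vertical filtering across rows). */
static void chroma_eq4_sample(uint8_t *pix, int stride, int alpha, int beta) {
    int p1 = pix[-2 * stride], p0 = pix[-stride];
    int q0 = pix[0],           q1 = pix[stride];
    if (abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta) {
        pix[-stride] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2);  /* p0' */
        pix[0]       = (uint8_t)((2 * q1 + q0 + p1 + 2) >> 2);  /* q0' */
    }
}

Packing both planes into one register is also why the stores above split each result with psrldq: the low quadword goes back to the Cb row and the high quadword to the Cr row.
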
psubw xmm6,xmm2 + psllw xmm6,2 + paddw xmm6,xmm7 + paddw xmm6, [esp+20h] + movdqa xmm7, [esp+50h] + psraw xmm6,3 + pmaxsw xmm1,xmm6 + movdqa [esp+10h],xmm0 + movdqa xmm6, [esp+10h] + pminsw xmm6,xmm1 + movdqa [esp+10h],xmm6 + movdqa xmm1,xmm2 + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + movdqa xmm6,xmm4 + pcmpgtw xmm6,xmm1 + movdqa xmm1, [esp+30h] + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + pcmpgtw xmm7,xmm1 + movdqa xmm1,[esp+50h] + pand xmm6,xmm7 + movdqa xmm7,[esp+50h] + psubw xmm5,xmm3 + pabsw xmm5,xmm5 + pcmpgtw xmm1,xmm5 + movdqa xmm5,[esp+80h] + psubw xmm5,[esp+90h] + pand xmm6,xmm1 + pand xmm6,[esp+40h] + movdqa xmm1,[esp+10h] + pand xmm1,xmm6 + movdqa xmm6,[esp+70h] + movdqa [esp+30h],xmm1 + movdqa xmm1,[esp+0A0h] + psubw xmm6,xmm1 + psllw xmm6,2 + paddw xmm6,xmm5 + paddw xmm6,[esp+20h] + movdqa xmm5,[esp+60h] + psraw xmm6,3 + pmaxsw xmm5,xmm6 + pminsw xmm0,xmm5 + movdqa xmm5,[esp+70h] + movdqa xmm6,xmm1 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm4,xmm6 + movdqa xmm6,[esp+80h] + psubw xmm6,xmm1 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+90h] + pand xmm4,xmm7 + movdqa xmm7,[esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + pand xmm4,xmm7 + pand xmm4,[esp+40h] + pand xmm0,xmm4 + movdqa xmm4,[esp+30h] + paddw xmm2,xmm4 + paddw xmm1,xmm0 + packuswb xmm2,xmm1 + movq [esi],xmm2 + psubw xmm3,xmm4 + psubw xmm5,xmm0 + packuswb xmm3,xmm5 + movq [eax],xmm3 + psrldq xmm2,8 + movq [edi],xmm2 + pop edi + pop esi + psrldq xmm3,8 + movq [ecx],xmm3 + pop ebx + mov esp,ebp + pop ebp + ret + +;*************************************************************************** +; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, +; int32_t iAlpha, int32_t iBeta) +;*************************************************************************** + +WELS_EXTERN DeblockChromaEq4H_sse2 + +ALIGN 16 + +DeblockChromaEq4H_sse2: + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,0C8h + mov ecx,dword [ebp+8] + mov edx,dword [ebp+0Ch] + mov eax,dword [ebp+10h] + sub ecx,2 + sub edx,2 + push esi + lea esi,[eax+eax*2] + mov dword [esp+18h],ecx + mov dword [esp+4],edx + lea ecx,[ecx+eax*4] + lea edx,[edx+eax*4] + lea eax,[esp+7Ch] + push edi + mov dword [esp+14h],esi + mov dword [esp+18h],ecx + mov dword [esp+0Ch],edx + mov dword [esp+10h],eax + mov esi,dword [esp+1Ch] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+14h] + movd xmm0,dword [esi] + movd xmm1,dword [esi+ecx] + movd xmm2,dword [esi+ecx*2] + movd xmm3,dword [esi+edx] + mov esi,dword [esp+8] + movd xmm4,dword [esi] + movd xmm5,dword [esi+ecx] + movd xmm6,dword [esi+ecx*2] + movd xmm7,dword [esi+edx] + punpckldq xmm0,xmm4 + punpckldq xmm1,xmm5 + punpckldq xmm2,xmm6 + punpckldq xmm3,xmm7 + mov esi,dword [esp+18h] + mov edi,dword [esp+0Ch] + movd xmm4,dword [esi] + movd xmm5,dword [edi] + punpckldq xmm4,xmm5 + punpcklqdq xmm0,xmm4 + movd xmm4,dword [esi+ecx] + movd xmm5,dword [edi+ecx] + punpckldq xmm4,xmm5 + punpcklqdq xmm1,xmm4 + movd xmm4,dword [esi+ecx*2] + movd xmm5,dword [edi+ecx*2] + punpckldq xmm4,xmm5 + punpcklqdq xmm2,xmm4 + movd xmm4,dword [esi+edx] + movd xmm5,dword [edi+edx] + punpckldq xmm4,xmm5 + punpcklqdq xmm3,xmm4 + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa 
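
DeblockChromaLt4V_sse2 above is the normal-strength (bS < 4) chroma path: the psllw/paddw/psraw sequence computes the standard delta and the pmaxsw/pminsw pair clamps it to the per-edge limit taken from pTC. Roughly, in scalar C (helper names are illustrative; per the standard, the chroma clamp passed in is tc0 + 1, so edges with bS == 0 arrive with a non-positive value and are skipped):

#include <stdint.h>
#include <stdlib.h>

static int clip3(int lo, int hi, int v) { return v < lo ? lo : v > hi ? hi : v; }

static void chroma_lt4_sample(uint8_t *pix, int stride,
                              int alpha, int beta, int tc) {
    int p1 = pix[-2 * stride], p0 = pix[-stride];
    int q0 = pix[0],           q1 = pix[stride];
    if (tc <= 0)  /* bS == 0: edge not filtered */
        return;
    if (abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta) {
        int d = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
        pix[-stride] = (uint8_t)clip3(0, 255, p0 + d);  /* p0' */
        pix[0]       = (uint8_t)clip3(0, 255, q0 - d);  /* q0' */
    }
}
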
xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov edi,dword [esp+10h] + movdqa [edi],xmm0 + movdqa [edi+10h],xmm5 + movdqa [edi+20h],xmm1 + movdqa [edi+30h],xmm6 + movsx ecx,word [ebp+14h] + movsx edx,word [ebp+18h] + movdqa xmm6,[esp+80h] + movdqa xmm4,[esp+90h] + movdqa xmm5,[esp+0A0h] + movdqa xmm7,[esp+0B0h] + pxor xmm0,xmm0 + movd xmm1,ecx + movdqa xmm2,xmm1 + punpcklwd xmm2,xmm1 + pshufd xmm1,xmm2,0 + movd xmm2,edx + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd xmm2,xmm3,0 + movdqa xmm3,xmm6 + punpckhbw xmm6,xmm0 + movdqa [esp+60h],xmm6 + movdqa xmm6,[esp+90h] + punpckhbw xmm6,xmm0 + movdqa [esp+30h],xmm6 + movdqa xmm6,[esp+0A0h] + punpckhbw xmm6,xmm0 + movdqa [esp+40h],xmm6 + movdqa xmm6,[esp+0B0h] + punpckhbw xmm6,xmm0 + movdqa [esp+70h],xmm6 + punpcklbw xmm7,xmm0 + punpcklbw xmm4,xmm0 + punpcklbw xmm5,xmm0 + punpcklbw xmm3,xmm0 + movdqa [esp+50h],xmm7 + movdqa xmm6,xmm4 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + movdqa xmm0,xmm1 + pcmpgtw xmm0,xmm6 + movdqa xmm6,xmm3 + psubw xmm6,xmm4 + pabsw xmm6,xmm6 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pand xmm0,xmm7 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+30h] + psubw xmm6,[esp+40h] + pabsw xmm6,xmm6 + pcmpgtw xmm1,xmm6 + movdqa xmm6,[esp+60h] + psubw xmm6,[esp+30h] + pabsw xmm6,xmm6 + pand xmm0,xmm7 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+70h] + psubw xmm6,[esp+40h] + pabsw xmm6,xmm6 + pand xmm1,xmm7 + pcmpgtw xmm2,xmm6 + pand xmm1,xmm2 + mov eax,2 + movsx ecx,ax + movd xmm2,ecx + movdqa xmm6,xmm2 + punpcklwd xmm6,xmm2 + pshufd xmm2,xmm6,0 + movdqa [esp+20h],xmm2 + movdqa xmm2,xmm3 + paddw xmm2,xmm3 + paddw xmm2,xmm4 + paddw xmm2,[esp+50h] + paddw xmm2,[esp+20h] + psraw xmm2,2 + movdqa xmm6,xmm0 + pand xmm6,xmm2 + movdqa xmm2,xmm0 + pandn xmm2,xmm4 + por xmm6,xmm2 + movdqa xmm2,[esp+60h] + movdqa xmm7,xmm2 + paddw xmm7,xmm2 + paddw xmm7,[esp+30h] + paddw xmm7,[esp+70h] + paddw xmm7,[esp+20h] + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + pandn xmm2,[esp+30h] + psraw xmm7,2 + pand xmm4,xmm7 + por xmm4,xmm2 + movdqa xmm2,[esp+50h] + packuswb xmm6,xmm4 + movdqa [esp+90h],xmm6 + movdqa xmm6,xmm2 + paddw xmm6,xmm2 + movdqa xmm2,[esp+20h] + paddw xmm6,xmm5 + paddw xmm6,xmm3 + movdqa xmm4,xmm0 + pandn xmm0,xmm5 + paddw xmm6,xmm2 + psraw xmm6,2 + pand xmm4,xmm6 + por xmm4,xmm0 + movdqa xmm0,[esp+70h] + movdqa xmm5,xmm0 + paddw xmm5,xmm0 + movdqa xmm0,[esp+40h] + paddw xmm5,xmm0 + paddw xmm5,[esp+60h] + movdqa xmm3,xmm1 + paddw xmm5,xmm2 + psraw xmm5,2 + pand xmm3,xmm5 + pandn xmm1,xmm0 + por xmm3,xmm1 + packuswb xmm4,xmm3 + movdqa [esp+0A0h],xmm4 + mov esi,dword [esp+10h] + movdqa xmm0,[esi] + movdqa xmm1,[esi+10h] + movdqa xmm2,[esi+20h] + movdqa xmm3,[esi+30h] + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov esi,dword [esp+1Ch] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+14h] + mov edi,dword [esp+8] + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + 
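
The *_H variants, like the DeblockChromaEq4H_sse2 routine in progress here, filter across a vertical edge, so before any arithmetic they gather the four pixels straddling the edge from each row; once columns have become vectors, exactly the same filter code as the *_V path applies. A scalar sketch of that gather (hypothetical helper, with 'edge' pointing two pixels left of the edge):

#include <stdint.h>

static void gather_edge_columns(const uint8_t *edge, int stride, int rows,
                                uint8_t *p1, uint8_t *p0,
                                uint8_t *q0, uint8_t *q1) {
    for (int r = 0; r < rows; ++r) {
        const uint8_t *row = edge + r * stride;
        p1[r] = row[0]; p0[r] = row[1]; q0[r] = row[2]; q1[r] = row[3];
    }
}
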
psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov esi,dword [esp+18h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov edi,dword [esp+0Ch] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + pop edi + pop esi + mov esp,ebp + pop ebp + ret + +;******************************************************************************* +; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, +; int32_t iAlpha, int32_t iBeta, int8_t * pTC); +;******************************************************************************* + +WELS_EXTERN DeblockChromaLt4H_sse2 + +ALIGN 16 + +DeblockChromaLt4H_sse2: + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,108h + mov ecx,dword [ebp+8] + mov edx,dword [ebp+0Ch] + mov eax,dword [ebp+10h] + sub ecx,2 + sub edx,2 + push esi + lea esi,[eax+eax*2] + mov dword [esp+10h],ecx + mov dword [esp+4],edx + lea ecx,[ecx+eax*4] + lea edx,[edx+eax*4] + lea eax,[esp+6Ch] + push edi + mov dword [esp+0Ch],esi + mov dword [esp+18h],ecx + mov dword [esp+10h],edx + mov dword [esp+1Ch],eax + mov esi,dword [esp+14h] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+0Ch] + movd xmm0,dword [esi] + movd xmm1,dword [esi+ecx] + movd xmm2,dword [esi+ecx*2] + movd xmm3,dword [esi+edx] + mov esi,dword [esp+8] + movd xmm4,dword [esi] + movd xmm5,dword [esi+ecx] + movd xmm6,dword [esi+ecx*2] + movd xmm7,dword [esi+edx] + punpckldq xmm0,xmm4 + punpckldq xmm1,xmm5 + punpckldq xmm2,xmm6 + punpckldq xmm3,xmm7 + mov esi,dword [esp+18h] + mov edi,dword [esp+10h] + movd xmm4,dword [esi] + movd xmm5,dword [edi] + punpckldq xmm4,xmm5 + punpcklqdq xmm0,xmm4 + movd xmm4,dword [esi+ecx] + movd xmm5,dword [edi+ecx] + punpckldq xmm4,xmm5 + punpcklqdq xmm1,xmm4 + movd xmm4,dword [esi+ecx*2] + movd xmm5,dword [edi+ecx*2] + punpckldq xmm4,xmm5 + punpcklqdq xmm2,xmm4 + movd xmm4,dword [esi+edx] + movd xmm5,dword [edi+edx] + punpckldq xmm4,xmm5 + punpcklqdq xmm3,xmm4 + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov edi,dword [esp+1Ch] + movdqa [edi],xmm0 + movdqa [edi+10h],xmm5 + movdqa [edi+20h],xmm1 + movdqa [edi+30h],xmm6 + mov eax,dword [ebp+1Ch] + movsx cx,byte [eax+3] + movsx dx,byte [eax+2] + movsx si,byte [eax+1] + movsx ax,byte [eax] + movzx edi,cx + movzx ecx,cx + movd xmm2,ecx + movzx ecx,dx + movzx edx,dx + movd xmm3,ecx + movd xmm4,edx + movzx ecx,si + movzx edx,si + movd xmm5,ecx + pxor xmm0,xmm0 + movd xmm6,edx + movzx ecx,ax + movdqa [esp+60h],xmm0 + movzx edx,ax + movsx eax,word [ebp+14h] + punpcklwd xmm6,xmm2 + movd xmm1,edi + movd xmm7,ecx + movsx ecx,word [ebp+18h] + movd xmm0,edx + punpcklwd xmm7,xmm3 + punpcklwd xmm5,xmm1 + movdqa xmm1,[esp+60h] + punpcklwd xmm7,xmm5 + movdqa xmm5,[esp+0A0h] + punpcklwd xmm0,xmm4 + punpcklwd xmm0,xmm6 + movdqa xmm6, 
[esp+70h] + punpcklwd xmm0,xmm7 + movdqa xmm7,[esp+80h] + movdqa xmm2,xmm1 + psubw xmm2,xmm0 + movdqa [esp+0D0h],xmm2 + movd xmm2,eax + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd xmm4,xmm3,0 + movd xmm2,ecx + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd xmm2,xmm3,0 + movdqa xmm3, [esp+90h] + movdqa [esp+50h],xmm2 + movdqa xmm2,xmm6 + punpcklbw xmm2,xmm1 + punpckhbw xmm6,xmm1 + movdqa [esp+40h],xmm2 + movdqa [esp+0B0h],xmm6 + movdqa xmm6,[esp+90h] + movdqa xmm2,xmm7 + punpckhbw xmm7,xmm1 + punpckhbw xmm6,xmm1 + punpcklbw xmm2,xmm1 + punpcklbw xmm3,xmm1 + punpcklbw xmm5,xmm1 + movdqa [esp+0F0h],xmm7 + movdqa [esp+0C0h],xmm6 + movdqa xmm6, [esp+0A0h] + punpckhbw xmm6,xmm1 + movdqa [esp+0E0h],xmm6 + mov edx,4 + movsx eax,dx + movd xmm6,eax + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa [esp+30h],xmm6 + movdqa xmm7, [esp+40h] + psubw xmm7,xmm5 + movdqa xmm6,xmm0 + pcmpgtw xmm6,xmm1 + movdqa [esp+60h],xmm6 + movdqa xmm1, [esp+0D0h] + movdqa xmm6,xmm3 + psubw xmm6,xmm2 + psllw xmm6,2 + paddw xmm6,xmm7 + paddw xmm6,[esp+30h] + psraw xmm6,3 + pmaxsw xmm1,xmm6 + movdqa xmm7,[esp+50h] + movdqa [esp+20h],xmm0 + movdqa xmm6, [esp+20h] + pminsw xmm6,xmm1 + movdqa [esp+20h],xmm6 + movdqa xmm6,xmm4 + movdqa xmm1,xmm2 + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + pcmpgtw xmm6,xmm1 + movdqa xmm1, [esp+40h] + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + pcmpgtw xmm7,xmm1 + movdqa xmm1, [esp+50h] + pand xmm6,xmm7 + movdqa xmm7, [esp+50h] + psubw xmm5,xmm3 + pabsw xmm5,xmm5 + pcmpgtw xmm1,xmm5 + movdqa xmm5, [esp+0B0h] + psubw xmm5,[esp+0E0h] + pand xmm6,xmm1 + pand xmm6, [esp+60h] + movdqa xmm1, [esp+20h] + pand xmm1,xmm6 + movdqa xmm6, [esp+0C0h] + movdqa [esp+40h],xmm1 + movdqa xmm1, [esp+0F0h] + psubw xmm6,xmm1 + psllw xmm6,2 + paddw xmm6,xmm5 + paddw xmm6, [esp+30h] + movdqa xmm5, [esp+0D0h] + psraw xmm6,3 + pmaxsw xmm5,xmm6 + pminsw xmm0,xmm5 + movdqa xmm5,[esp+0C0h] + movdqa xmm6,xmm1 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm4,xmm6 + movdqa xmm6,[esp+0B0h] + psubw xmm6,xmm1 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + movdqa xmm6, [esp+0E0h] + pand xmm4,xmm7 + movdqa xmm7, [esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + pand xmm4,xmm7 + pand xmm4,[esp+60h] + pand xmm0,xmm4 + movdqa xmm4, [esp+40h] + paddw xmm2,xmm4 + paddw xmm1,xmm0 + psubw xmm3,xmm4 + psubw xmm5,xmm0 + packuswb xmm2,xmm1 + packuswb xmm3,xmm5 + movdqa [esp+80h],xmm2 + movdqa [esp+90h],xmm3 + mov esi,dword [esp+1Ch] + movdqa xmm0, [esi] + movdqa xmm1, [esi+10h] + movdqa xmm2, [esi+20h] + movdqa xmm3, [esi+30h] + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov esi,dword [esp+14h] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+0Ch] + mov edi,dword [esp+8] + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov esi,dword [esp+18h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + movd dword [esi],xmm0 + 
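
The movd/punpcklbw/punpcklwd/punpckldq/punpcklqdq cascade used by DeblockChromaLt4H_sse2 is a byte-matrix transpose: 16 rows (8 from Cb, 8 from Cr) of 4 edge pixels become 4 vectors of 16 lanes, and the mirrored cascade after filtering scatters the bytes back, 4 per row, via the movd stores and psrldq shifts above. The scalar equivalent is simply (illustrative helper):

#include <stdint.h>

/* 16 rows x 4 edge pixels in, 4 lanes x 16 rows out; the store side
 * runs the same loop with the arguments swapped. */
static void transpose_16x4(const uint8_t in[16][4], uint8_t out[4][16]) {
    for (int r = 0; r < 16; ++r)
        for (int c = 0; c < 4; ++c)
            out[c][r] = in[r][c];
}
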
movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov edi,dword [esp+10h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + pop edi + pop esi + mov esp,ebp + pop ebp + ret + + + +;******************************************************************************* +; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, +; int32_t iBeta, int8_t * pTC) +;******************************************************************************* + + +WELS_EXTERN DeblockLumaLt4V_sse2 + +ALIGN 16 + +DeblockLumaLt4V_sse2: + push ebp + mov ebp, esp + and esp, -16 ; fffffff0H + sub esp, 420 ; 000001a4H + mov eax, dword [ebp+8] + mov ecx, dword [ebp+12] + + pxor xmm0, xmm0 + push ebx + mov edx, dword [ebp+24] + movdqa [esp+424-384], xmm0 + push esi + + lea esi, [ecx+ecx*2] + push edi + mov edi, eax + sub edi, esi + movdqa xmm0, [edi] + + lea esi, [ecx+ecx] + movdqa [esp+432-208], xmm0 + mov edi, eax + sub edi, esi + movdqa xmm0, [edi] + movdqa [esp+448-208], xmm0 + + mov ebx, eax + sub ebx, ecx + movdqa xmm0, [ebx] + movdqa [esp+464-208], xmm0 + + movdqa xmm0, [eax] + + add ecx, eax + movdqa [esp+480-208], xmm0 + movdqa xmm0, [ecx] + mov dword [esp+432-404], ecx + + movsx ecx, word [ebp+16] + movdqa [esp+496-208], xmm0 + movdqa xmm0, [esi+eax] + + movsx si, byte [edx] + movdqa [esp+512-208], xmm0 + movd xmm0, ecx + movsx ecx, word [ebp+20] + movdqa xmm1, xmm0 + punpcklwd xmm1, xmm0 + pshufd xmm0, xmm1, 0 + movdqa [esp+432-112], xmm0 + movd xmm0, ecx + movsx cx, byte [edx+1] + movdqa xmm1, xmm0 + punpcklwd xmm1, xmm0 + mov dword [esp+432-408], ebx + movzx ebx, cx + pshufd xmm0, xmm1, 0 + movd xmm1, ebx + movzx ebx, cx + movd xmm2, ebx + movzx ebx, cx + movzx ecx, cx + movd xmm4, ecx + movzx ecx, si + movd xmm5, ecx + movzx ecx, si + movd xmm6, ecx + movzx ecx, si + movd xmm7, ecx + movzx ecx, si + movdqa [esp+432-336], xmm0 + movd xmm0, ecx + + movsx cx, byte [edx+3] + movsx dx, byte [edx+2] + movd xmm3, ebx + punpcklwd xmm0, xmm4 + movzx esi, cx + punpcklwd xmm6, xmm2 + punpcklwd xmm5, xmm1 + punpcklwd xmm0, xmm6 + punpcklwd xmm7, xmm3 + punpcklwd xmm7, xmm5 + punpcklwd xmm0, xmm7 + movdqa [esp+432-400], xmm0 + movd xmm0, esi + movzx esi, cx + movd xmm2, esi + movzx esi, cx + movzx ecx, cx + movd xmm4, ecx + movzx ecx, dx + movd xmm3, esi + movd xmm5, ecx + punpcklwd xmm5, xmm0 + + movdqa xmm0, [esp+432-384] + movzx ecx, dx + movd xmm6, ecx + movzx ecx, dx + movzx edx, dx + punpcklwd xmm6, xmm2 + movd xmm7, ecx + movd xmm1, edx + + movdqa xmm2, [esp+448-208] + punpcklbw xmm2, xmm0 + + mov ecx, 4 + movsx edx, cx + punpcklwd xmm7, xmm3 + punpcklwd xmm7, xmm5 + movdqa xmm5, [esp+496-208] + movdqa xmm3, [esp+464-208] + punpcklbw xmm5, xmm0 + movdqa [esp+432-240], xmm5 + movdqa xmm5, [esp+512-208] + punpcklbw xmm5, xmm0 + movdqa [esp+432-352], xmm5 + punpcklwd xmm1, xmm4 + movdqa xmm4, [esp+432-208] + punpcklwd xmm1, xmm6 + movdqa xmm6, [esp+480-208] + punpcklwd xmm1, xmm7 + punpcklbw xmm6, xmm0 + punpcklbw xmm3, xmm0 + punpcklbw xmm4, xmm0 + movdqa xmm7, xmm3 + psubw xmm7, xmm4 + pabsw xmm7, xmm7 + movdqa [esp+432-272], xmm4 + movdqa xmm4, [esp+432-336] + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-288], xmm5 + movdqa xmm7, xmm6 + psubw xmm7, [esp+432-352] + pabsw xmm7, xmm7 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-256], xmm5 + movdqa xmm5, xmm3 + pavgw xmm5, xmm6 + movdqa [esp+432-304], xmm5 + movdqa xmm5, 
[esp+432-400] + psubw xmm5, [esp+432-288] + psubw xmm5, [esp+432-256] + movdqa [esp+432-224], xmm5 + movdqa xmm5, xmm6 + psubw xmm5, xmm3 + movdqa [esp+432-32], xmm6 + psubw xmm6, [esp+432-240] + movdqa xmm7, xmm5 + movdqa [esp+432-384], xmm5 + movdqa xmm5, [esp+432-112] + pabsw xmm7, xmm7 + pcmpgtw xmm5, xmm7 + pabsw xmm6, xmm6 + movdqa xmm7, xmm4 + pcmpgtw xmm7, xmm6 + + pand xmm5, xmm7 + movdqa xmm6, xmm3 + psubw xmm6, xmm2 + pabsw xmm6, xmm6 + movdqa xmm7, xmm4 + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+432-400] + pand xmm5, xmm7 + movdqa xmm7, xmm6 + pcmpeqw xmm6, xmm0 + pcmpgtw xmm7, xmm0 + por xmm7, xmm6 + pand xmm5, xmm7 + movdqa [esp+432-320], xmm5 + movd xmm5, edx + movdqa xmm6, xmm5 + punpcklwd xmm6, xmm5 + pshufd xmm5, xmm6, 0 + movdqa [esp+432-336], xmm5 + movdqa xmm5, [esp+432-224] + movdqa [esp+432-368], xmm5 + movdqa xmm6, xmm0 + psubw xmm6, xmm5 + movdqa xmm5, [esp+432-384] + psllw xmm5, 2 + movdqa xmm7, xmm2 + psubw xmm7, [esp+432-240] + paddw xmm7, xmm5 + paddw xmm7, [esp+432-336] + movdqa xmm5, [esp+432-368] + psraw xmm7, 3 + pmaxsw xmm6, xmm7 + pminsw xmm5, xmm6 + + pand xmm5, [esp+432-320] + movdqa xmm6, [esp+432-400] + movdqa [esp+432-64], xmm5 + movdqa [esp+432-384], xmm6 + movdqa xmm5, xmm0 + psubw xmm5, xmm6 + movdqa [esp+432-368], xmm5 + movdqa xmm6, xmm5 + movdqa xmm5, [esp+432-272] + paddw xmm5, [esp+432-304] + movdqa xmm7, xmm2 + paddw xmm7, xmm2 + psubw xmm5, xmm7 + psraw xmm5, 1 + pmaxsw xmm6, xmm5 + movdqa xmm5, [esp+432-384] + pminsw xmm5, xmm6 + + pand xmm5, [esp+432-320] + pand xmm5, [esp+432-288] + movdqa xmm6, [esp+432-240] + movdqa [esp+432-96], xmm5 + movdqa xmm5, [esp+432-352] + paddw xmm5, [esp+432-304] + movdqa xmm7, xmm6 + paddw xmm7, xmm6 + movdqa xmm6, [esp+432-368] + psubw xmm5, xmm7 + + movdqa xmm7, [esp+496-208] + psraw xmm5, 1 + pmaxsw xmm6, xmm5 + movdqa xmm5, [esp+432-400] + pminsw xmm5, xmm6 + pand xmm5, [esp+432-320] + pand xmm5, [esp+432-256] + movdqa xmm6, [esp+448-208] + punpckhbw xmm7, xmm0 + movdqa [esp+432-352], xmm7 + + movdqa xmm7, [esp+512-208] + punpckhbw xmm6, xmm0 + movdqa [esp+432-48], xmm5 + movdqa xmm5, [esp+432-208] + movdqa [esp+432-368], xmm6 + movdqa xmm6, [esp+464-208] + punpckhbw xmm7, xmm0 + punpckhbw xmm5, xmm0 + movdqa [esp+432-384], xmm7 + punpckhbw xmm6, xmm0 + movdqa [esp+432-400], xmm6 + + movdqa xmm7, [esp+432-400] + movdqa xmm6, [esp+480-208] + psubw xmm7, xmm5 + movdqa [esp+432-16], xmm5 + pabsw xmm7, xmm7 + punpckhbw xmm6, xmm0 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-288], xmm5 + + movdqa xmm7, xmm6 + psubw xmm7, [esp+432-384] + pabsw xmm7, xmm7 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-256], xmm5 + + movdqa xmm5, [esp+432-400] + movdqa [esp+432-80], xmm6 + pavgw xmm5, xmm6 + movdqa [esp+432-304], xmm5 + + movdqa xmm5, xmm1 + psubw xmm5, [esp+432-288] + psubw xmm5, [esp+432-256] + movdqa [esp+432-224], xmm5 + movdqa xmm5, xmm6 + psubw xmm5, [esp+432-400] + psubw xmm6, [esp+432-352] + movdqa [esp+432-272], xmm5 + movdqa xmm7, xmm5 + movdqa xmm5, [esp+432-112] + pabsw xmm7, xmm7 + pcmpgtw xmm5, xmm7 + movdqa xmm7, xmm4 + pabsw xmm6, xmm6 + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+432-368] + + pand xmm5, xmm7 + movdqa xmm7, [esp+432-400] + psubw xmm7, xmm6 + psubw xmm6, [esp+432-352] + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 + pand xmm5, xmm4 + + paddw xmm2, [esp+432-96] + movdqa xmm4, xmm1 + pcmpgtw xmm4, xmm0 + movdqa xmm7, xmm1 + pcmpeqw xmm7, xmm0 + por xmm4, xmm7 + pand xmm5, xmm4 + movdqa xmm4, [esp+432-224] + movdqa [esp+432-320], xmm5 + movdqa xmm5, [esp+432-272] + 
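
DeblockLumaLt4V_sse2 implements the full normal-strength luma filter: the pavgw above computes the (p0 + q0 + 1) >> 1 term of the p1/q1 side taps, and the mask subtractions realize the tc = tc0 + ap + aq bump before the central delta is clamped. In scalar C (helper names are illustrative, not from this patch):

#include <stdint.h>
#include <stdlib.h>

static int clip3(int lo, int hi, int v) { return v < lo ? lo : v > hi ? hi : v; }

static void luma_lt4_sample(uint8_t *pix, int stride,
                            int alpha, int beta, int tc0) {
    int p2 = pix[-3 * stride], p1 = pix[-2 * stride], p0 = pix[-stride];
    int q0 = pix[0], q1 = pix[stride], q2 = pix[2 * stride];
    if (tc0 < 0 ||
        !(abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta))
        return;
    int ap  = abs(p2 - p0) < beta;   /* enables the p1 side tap */
    int aq  = abs(q2 - q0) < beta;   /* enables the q1 side tap */
    int avg = (p0 + q0 + 1) >> 1;    /* the pavgw term */
    if (ap)
        pix[-2 * stride] = (uint8_t)(p1 + clip3(-tc0, tc0, (p2 + avg - 2 * p1) >> 1));
    if (aq)
        pix[stride]      = (uint8_t)(q1 + clip3(-tc0, tc0, (q2 + avg - 2 * q1) >> 1));
    int tc = tc0 + ap + aq;
    int d  = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
    pix[-stride] = (uint8_t)clip3(0, 255, p0 + d);
    pix[0]       = (uint8_t)clip3(0, 255, q0 - d);
}
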
movdqa xmm7, xmm0 + psubw xmm7, xmm4 + psubw xmm0, xmm1 + psllw xmm5, 2 + paddw xmm6, xmm5 + paddw xmm6, [esp+432-336] + movdqa xmm5, [esp+432-368] + movdqa [esp+432-336], xmm0 + psraw xmm6, 3 + pmaxsw xmm7, xmm6 + pminsw xmm4, xmm7 + pand xmm4, [esp+432-320] + movdqa xmm6, xmm0 + movdqa xmm0, [esp+432-16] + paddw xmm0, [esp+432-304] + movdqa [esp+432-272], xmm4 + movdqa xmm4, [esp+432-368] + paddw xmm4, xmm4 + psubw xmm0, xmm4 + + movdqa xmm4, [esp+432-64] + psraw xmm0, 1 + pmaxsw xmm6, xmm0 + movdqa xmm0, [esp+432-400] + movdqa xmm7, xmm1 + pminsw xmm7, xmm6 + movdqa xmm6, [esp+432-320] + pand xmm7, xmm6 + pand xmm7, [esp+432-288] + paddw xmm5, xmm7 + packuswb xmm2, xmm5 + movdqa xmm5, [esp+432-272] + paddw xmm0, xmm5 + paddw xmm3, xmm4 + packuswb xmm3, xmm0 + + movdqa xmm0, [esp+432-32] + psubw xmm0, xmm4 + movdqa xmm4, [esp+432-80] + psubw xmm4, xmm5 + + movdqa xmm5, [esp+432-240] + paddw xmm5, [esp+432-48] + packuswb xmm0, xmm4 + movdqa xmm4, [esp+432-384] + paddw xmm4, [esp+432-304] + movdqa [esp+480-208], xmm0 + movdqa xmm0, [esp+432-352] + movdqa xmm7, xmm0 + paddw xmm0, xmm0 + + mov ecx, dword [esp+432-408] + + mov edx, dword [esp+432-404] + psubw xmm4, xmm0 + movdqa xmm0, [esp+432-336] + movdqa [edi], xmm2 + psraw xmm4, 1 + pmaxsw xmm0, xmm4 + pminsw xmm1, xmm0 + movdqa xmm0, [esp+480-208] + + pop edi + pand xmm1, xmm6 + pand xmm1, [esp+428-256] + movdqa [ecx], xmm3 + paddw xmm7, xmm1 + pop esi + packuswb xmm5, xmm7 + movdqa [eax], xmm0 + movdqa [edx], xmm5 + pop ebx + mov esp, ebp + pop ebp + ret + + +;******************************************************************************* +; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, +; int32_t iBeta) +;******************************************************************************* + +WELS_EXTERN DeblockLumaEq4V_sse2 + +ALIGN 16 + +DeblockLumaEq4V_sse2: + + push ebp + mov ebp, esp + and esp, -16 ; fffffff0H + sub esp, 628 ; 00000274H + mov eax, dword [ebp+8] + mov ecx, dword [ebp+12] + push ebx + push esi + + lea edx, [ecx*4] + pxor xmm0, xmm0 + movdqa xmm2, xmm0 + + movdqa xmm0, [ecx+eax] + mov esi, eax + sub esi, edx + movdqa xmm3, [esi] + movdqa xmm5, [eax] + push edi + lea edi, [ecx+ecx] + lea ebx, [ecx+ecx*2] + mov dword [esp+640-600], edi + mov esi, eax + sub esi, edi + movdqa xmm1, [esi] + movdqa [esp+720-272], xmm0 + mov edi, eax + sub edi, ecx + movdqa xmm4, [edi] + add ecx, eax + mov dword [esp+640-596], ecx + + mov ecx, dword [esp+640-600] + movdqa xmm0, [ecx+eax] + movdqa [esp+736-272], xmm0 + + movdqa xmm0, [eax+ebx] + mov edx, eax + sub edx, ebx + + movsx ebx, word [ebp+16] + movdqa xmm6, [edx] + add ecx, eax + movdqa [esp+752-272], xmm0 + movd xmm0, ebx + + movsx ebx, word [ebp+20] + movdqa xmm7, xmm0 + punpcklwd xmm7, xmm0 + pshufd xmm0, xmm7, 0 + movdqa [esp+640-320], xmm0 + movd xmm0, ebx + movdqa xmm7, xmm0 + punpcklwd xmm7, xmm0 + pshufd xmm0, xmm7, 0 + + movdqa xmm7, [esp+736-272] + punpcklbw xmm7, xmm2 + movdqa [esp+640-416], xmm7 + movdqa [esp+640-512], xmm0 + movdqa xmm0, xmm1 + movdqa [esp+672-272], xmm1 + movdqa xmm1, xmm4 + movdqa [esp+704-272], xmm5 + punpcklbw xmm5, xmm2 + punpcklbw xmm1, xmm2 + + movdqa xmm7, xmm5 + psubw xmm7, xmm1 + pabsw xmm7, xmm7 + movdqa [esp+640-560], xmm7 + punpcklbw xmm0, xmm2 + movdqa [esp+688-272], xmm4 + movdqa xmm4, [esp+720-272] + movdqa [esp+640-480], xmm0 + + movdqa xmm7, xmm1 + psubw xmm7, xmm0 + + movdqa xmm0, [esp+640-512] + pabsw xmm7, xmm7 + punpcklbw xmm4, xmm2 + pcmpgtw xmm0, xmm7 + movdqa [esp+640-384], xmm4 + movdqa xmm7, xmm5 + psubw 
xmm7, xmm4 + movdqa xmm4, [esp+640-512] + movdqa [esp+656-272], xmm6 + punpcklbw xmm6, xmm2 + pabsw xmm7, xmm7 + movdqa [esp+640-48], xmm2 + movdqa [esp+640-368], xmm6 + movdqa [esp+640-144], xmm1 + movdqa [esp+640-400], xmm5 + pcmpgtw xmm4, xmm7 + pand xmm0, xmm4 + movdqa xmm4, [esp+640-320] + pcmpgtw xmm4, [esp+640-560] + pand xmm0, xmm4 + + mov ebx, 2 + movsx ebx, bx + movd xmm4, ebx + movdqa xmm7, xmm4 + punpcklwd xmm7, xmm4 + movdqa xmm4, [esp+640-320] + psraw xmm4, 2 + pshufd xmm7, xmm7, 0 + paddw xmm4, xmm7 + movdqa [esp+640-576], xmm4 + pcmpgtw xmm4, [esp+640-560] + movdqa [esp+640-560], xmm4 + + movdqa xmm4, [esp+640-512] + movdqa [esp+640-624], xmm7 + movdqa xmm7, xmm1 + psubw xmm7, xmm6 + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 + + pand xmm4, [esp+640-560] + movdqa [esp+640-544], xmm4 + movdqa xmm4, [esp+640-512] + movdqa xmm7, xmm5 + psubw xmm7, [esp+640-416] + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 + + pand xmm4, [esp+640-560] + movdqa [esp+640-560], xmm4 + + movdqa xmm4, [esp+640-544] + pandn xmm4, xmm6 + movdqa [esp+640-16], xmm4 + mov ebx, 4 + movsx ebx, bx + movd xmm4, ebx + movdqa xmm7, xmm4 + punpcklwd xmm7, xmm4 + movdqa xmm4, xmm3 + punpcklbw xmm4, xmm2 + psllw xmm4, 1 + paddw xmm4, xmm6 + paddw xmm4, xmm6 + paddw xmm4, xmm6 + paddw xmm4, [esp+640-480] + + movdqa xmm6, [esp+640-560] + pshufd xmm7, xmm7, 0 + paddw xmm4, xmm1 + movdqa [esp+640-592], xmm7 + paddw xmm4, xmm5 + paddw xmm4, xmm7 + movdqa xmm7, [esp+640-416] + pandn xmm6, xmm7 + movdqa [esp+640-80], xmm6 + movdqa xmm6, [esp+752-272] + punpcklbw xmm6, xmm2 + psllw xmm6, 1 + paddw xmm6, xmm7 + paddw xmm6, xmm7 + paddw xmm6, xmm7 + paddw xmm6, [esp+640-384] + + movdqa xmm7, [esp+640-480] + paddw xmm6, xmm5 + paddw xmm6, xmm1 + paddw xmm6, [esp+640-592] + psraw xmm6, 3 + pand xmm6, [esp+640-560] + movdqa [esp+640-112], xmm6 + movdqa xmm6, [esp+640-544] + pandn xmm6, xmm7 + movdqa [esp+640-336], xmm6 + movdqa xmm6, [esp+640-544] + movdqa [esp+640-528], xmm6 + movdqa xmm6, [esp+640-368] + paddw xmm6, xmm7 + movdqa xmm7, xmm1 + psraw xmm4, 3 + pand xmm4, [esp+640-544] + paddw xmm7, xmm5 + paddw xmm6, xmm7 + paddw xmm6, [esp+640-624] + movdqa xmm7, [esp+640-528] + + paddw xmm5, xmm1 + psraw xmm6, 2 + pand xmm7, xmm6 + + movdqa xmm6, [esp+640-384] + movdqa [esp+640-64], xmm7 + movdqa xmm7, [esp+640-560] + pandn xmm7, xmm6 + movdqa [esp+640-304], xmm7 + movdqa xmm7, [esp+640-560] + movdqa [esp+640-528], xmm7 + movdqa xmm7, [esp+640-416] + paddw xmm7, xmm6 + paddw xmm7, xmm5 + paddw xmm7, [esp+640-624] + movdqa xmm5, [esp+640-528] + psraw xmm7, 2 + pand xmm5, xmm7 + movdqa [esp+640-32], xmm5 + + movdqa xmm5, [esp+640-544] + movdqa [esp+640-528], xmm5 + movdqa xmm5, [esp+640-480] + movdqa xmm7, xmm5 + paddw xmm7, xmm5 + movdqa xmm5, xmm1 + paddw xmm5, xmm6 + paddw xmm6, [esp+640-592] + paddw xmm7, xmm5 + paddw xmm7, [esp+640-624] + movdqa xmm5, [esp+640-528] + psraw xmm7, 2 + pandn xmm5, xmm7 + movdqa xmm7, [esp+640-480] + paddw xmm7, xmm1 + paddw xmm7, [esp+640-400] + movdqa xmm1, [esp+640-544] + movdqa [esp+640-352], xmm5 + movdqa xmm5, [esp+640-368] + psllw xmm7, 1 + paddw xmm7, xmm6 + paddw xmm5, xmm7 + + movdqa xmm7, [esp+640-400] + psraw xmm5, 3 + pand xmm1, xmm5 + movdqa xmm5, [esp+640-480] + movdqa [esp+640-96], xmm1 + movdqa xmm1, [esp+640-560] + movdqa [esp+640-528], xmm1 + movdqa xmm1, [esp+640-384] + movdqa xmm6, xmm1 + paddw xmm6, xmm1 + paddw xmm1, [esp+640-400] + paddw xmm1, [esp+640-144] + paddw xmm7, xmm5 + paddw xmm5, [esp+640-592] + paddw xmm6, xmm7 + paddw xmm6, [esp+640-624] + movdqa xmm7, [esp+640-528] 
+ psraw xmm6, 2 + psllw xmm1, 1 + paddw xmm1, xmm5 + + movdqa xmm5, [esp+656-272] + pandn xmm7, xmm6 + movdqa xmm6, [esp+640-416] + paddw xmm6, xmm1 + movdqa xmm1, [esp+640-560] + psraw xmm6, 3 + pand xmm1, xmm6 + + movdqa xmm6, [esp+704-272] + movdqa [esp+640-128], xmm1 + movdqa xmm1, [esp+672-272] + punpckhbw xmm1, xmm2 + movdqa [esp+640-448], xmm1 + movdqa xmm1, [esp+688-272] + punpckhbw xmm1, xmm2 + punpckhbw xmm6, xmm2 + movdqa [esp+640-288], xmm7 + punpckhbw xmm5, xmm2 + movdqa [esp+640-496], xmm1 + movdqa [esp+640-432], xmm6 + + movdqa xmm7, [esp+720-272] + punpckhbw xmm7, xmm2 + movdqa [esp+640-464], xmm7 + + movdqa xmm7, [esp+736-272] + punpckhbw xmm7, xmm2 + movdqa [esp+640-528], xmm7 + + movdqa xmm7, xmm6 + + psubw xmm6, [esp+640-464] + psubw xmm7, xmm1 + pabsw xmm7, xmm7 + movdqa [esp+640-560], xmm7 + por xmm4, [esp+640-16] + pabsw xmm6, xmm6 + movdqa xmm7, xmm1 + psubw xmm7, [esp+640-448] + + movdqa xmm1, [esp+640-512] + pabsw xmm7, xmm7 + pcmpgtw xmm1, xmm7 + movdqa xmm7, [esp+640-512] + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+640-320] + pand xmm1, xmm7 + movdqa xmm7, [esp+640-560] + pcmpgtw xmm6, xmm7 + pand xmm1, xmm6 + + movdqa xmm6, [esp+640-576] + pcmpgtw xmm6, xmm7 + + movdqa xmm7, [esp+640-496] + punpckhbw xmm3, xmm2 + movdqa [esp+640-560], xmm6 + movdqa xmm6, [esp+640-512] + psubw xmm7, xmm5 + pabsw xmm7, xmm7 + pcmpgtw xmm6, xmm7 + + pand xmm6, [esp+640-560] + movdqa xmm7, [esp+640-432] + psubw xmm7, [esp+640-528] + + psllw xmm3, 1 + movdqa [esp+640-544], xmm6 + movdqa xmm6, [esp+640-512] + + movdqa xmm2, [esp+640-544] + paddw xmm3, xmm5 + paddw xmm3, xmm5 + paddw xmm3, xmm5 + paddw xmm3, [esp+640-448] + paddw xmm3, [esp+640-496] + pabsw xmm7, xmm7 + pcmpgtw xmm6, xmm7 + pand xmm6, [esp+640-560] + movdqa [esp+640-560], xmm6 + + movdqa xmm6, xmm0 + pand xmm6, xmm4 + movdqa xmm4, xmm0 + pandn xmm4, [esp+640-368] + por xmm6, xmm4 + movdqa xmm4, [esp+640-432] + paddw xmm3, xmm4 + paddw xmm3, [esp+640-592] + psraw xmm3, 3 + pand xmm3, xmm2 + pandn xmm2, xmm5 + por xmm3, xmm2 + movdqa xmm7, xmm1 + pand xmm7, xmm3 + movdqa xmm3, [esp+640-64] + por xmm3, [esp+640-336] + movdqa xmm2, xmm1 + pandn xmm2, xmm5 + por xmm7, xmm2 + + movdqa xmm2, xmm0 + pand xmm2, xmm3 + movdqa xmm3, xmm0 + pandn xmm3, [esp+640-480] + por xmm2, xmm3 + packuswb xmm6, xmm7 + movdqa [esp+640-336], xmm2 + movdqa [esp+656-272], xmm6 + movdqa xmm6, [esp+640-544] + movdqa xmm2, xmm5 + paddw xmm2, [esp+640-448] + movdqa xmm3, xmm1 + movdqa xmm7, [esp+640-496] + paddw xmm7, xmm4 + paddw xmm2, xmm7 + paddw xmm2, [esp+640-624] + movdqa xmm7, [esp+640-544] + psraw xmm2, 2 + pand xmm6, xmm2 + movdqa xmm2, [esp+640-448] + pandn xmm7, xmm2 + por xmm6, xmm7 + pand xmm3, xmm6 + movdqa xmm6, xmm1 + pandn xmm6, xmm2 + paddw xmm2, [esp+640-496] + paddw xmm2, xmm4 + por xmm3, xmm6 + movdqa xmm6, [esp+640-336] + packuswb xmm6, xmm3 + psllw xmm2, 1 + movdqa [esp+672-272], xmm6 + movdqa xmm6, [esp+640-96] + por xmm6, [esp+640-352] + + movdqa xmm3, xmm0 + pand xmm3, xmm6 + movdqa xmm6, xmm0 + pandn xmm6, [esp+640-144] + por xmm3, xmm6 + movdqa xmm6, [esp+640-544] + movdqa [esp+640-352], xmm3 + movdqa xmm3, [esp+640-464] + paddw xmm3, [esp+640-592] + paddw xmm2, xmm3 + movdqa xmm3, [esp+640-448] + paddw xmm5, xmm2 + movdqa xmm2, [esp+640-496] + psraw xmm5, 3 + pand xmm6, xmm5 + movdqa xmm5, [esp+640-464] + paddw xmm2, xmm5 + paddw xmm5, [esp+640-432] + movdqa xmm4, xmm3 + paddw xmm4, xmm3 + paddw xmm4, xmm2 + paddw xmm4, [esp+640-624] + movdqa xmm2, [esp+640-544] + paddw xmm3, [esp+640-592] + psraw xmm4, 2 + pandn xmm2, xmm4 
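
DeblockLumaEq4V_sse2 is the bS==4 luma path. The psraw-by-2 plus 2 seen earlier builds the (alpha >> 2) + 2 threshold that chooses per pixel between the strong 4/5-tap filters and the weak 3-tap fallback; the pand/pandn/por triplets then blend the two results under that mask. The decision and taps in scalar form (illustrative helper):

#include <stdint.h>
#include <stdlib.h>

static void luma_eq4_sample(uint8_t *pix, int stride, int alpha, int beta) {
    int p3 = pix[-4*stride], p2 = pix[-3*stride], p1 = pix[-2*stride], p0 = pix[-stride];
    int q0 = pix[0], q1 = pix[stride], q2 = pix[2*stride], q3 = pix[3*stride];
    if (!(abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta))
        return;
    int strong = abs(p0 - q0) < (alpha >> 2) + 2;  /* the psraw+paddw threshold */
    if (strong && abs(p2 - p0) < beta) {
        pix[-stride]     = (uint8_t)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3);
        pix[-2 * stride] = (uint8_t)((p2 + p1 + p0 + q0 + 2) >> 2);
        pix[-3 * stride] = (uint8_t)((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3);
    } else {
        pix[-stride]     = (uint8_t)((2*p1 + p0 + q1 + 2) >> 2);
    }
    if (strong && abs(q2 - q0) < beta) {
        pix[0]          = (uint8_t)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3);
        pix[stride]     = (uint8_t)((q2 + q1 + q0 + p0 + 2) >> 2);
        pix[2 * stride] = (uint8_t)((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3);
    } else {
        pix[0]          = (uint8_t)((2*q1 + q0 + p1 + 2) >> 2);
    }
}
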
+ por xmm6, xmm2 + movdqa xmm7, xmm1 + pand xmm7, xmm6 + movdqa xmm6, [esp+640-496] + movdqa xmm2, xmm1 + pandn xmm2, xmm6 + por xmm7, xmm2 + movdqa xmm2, [esp+640-352] + packuswb xmm2, xmm7 + movdqa [esp+688-272], xmm2 + movdqa xmm2, [esp+640-128] + por xmm2, [esp+640-288] + + movdqa xmm4, xmm0 + pand xmm4, xmm2 + paddw xmm5, xmm6 + movdqa xmm2, xmm0 + pandn xmm2, [esp+640-400] + por xmm4, xmm2 + movdqa xmm2, [esp+640-528] + psllw xmm5, 1 + paddw xmm5, xmm3 + movdqa xmm3, [esp+640-560] + paddw xmm2, xmm5 + psraw xmm2, 3 + movdqa [esp+640-288], xmm4 + movdqa xmm4, [esp+640-560] + pand xmm4, xmm2 + movdqa xmm2, [esp+640-464] + movdqa xmm5, xmm2 + paddw xmm5, xmm2 + movdqa xmm2, [esp+640-432] + paddw xmm2, [esp+640-448] + movdqa xmm7, xmm1 + paddw xmm5, xmm2 + paddw xmm5, [esp+640-624] + movdqa xmm6, [esp+640-560] + psraw xmm5, 2 + pandn xmm3, xmm5 + por xmm4, xmm3 + movdqa xmm3, [esp+640-32] + por xmm3, [esp+640-304] + pand xmm7, xmm4 + movdqa xmm4, [esp+640-432] + movdqa xmm5, [esp+640-464] + movdqa xmm2, xmm1 + pandn xmm2, xmm4 + paddw xmm4, [esp+640-496] + por xmm7, xmm2 + movdqa xmm2, [esp+640-288] + packuswb xmm2, xmm7 + movdqa [esp+704-272], xmm2 + + movdqa xmm2, xmm0 + pand xmm2, xmm3 + movdqa xmm3, xmm0 + pandn xmm3, [esp+640-384] + por xmm2, xmm3 + movdqa [esp+640-304], xmm2 + movdqa xmm2, [esp+640-528] + movdqa xmm3, xmm2 + paddw xmm3, [esp+640-464] + paddw xmm3, xmm4 + paddw xmm3, [esp+640-624] + psraw xmm3, 2 + pand xmm6, xmm3 + movdqa xmm3, [esp+640-560] + movdqa xmm4, xmm3 + pandn xmm4, xmm5 + por xmm6, xmm4 + movdqa xmm7, xmm1 + pand xmm7, xmm6 + movdqa xmm6, [esp+640-304] + movdqa xmm4, xmm1 + pandn xmm4, xmm5 + por xmm7, xmm4 + + movdqa xmm4, xmm0 + pandn xmm0, [esp+640-416] + packuswb xmm6, xmm7 + movdqa xmm7, [esp+640-112] + por xmm7, [esp+640-80] + pand xmm4, xmm7 + por xmm4, xmm0 + movdqa xmm0, [esp+752-272] + punpckhbw xmm0, [esp+640-48] + psllw xmm0, 1 + paddw xmm0, xmm2 + paddw xmm0, xmm2 + paddw xmm0, xmm2 + paddw xmm0, xmm5 + paddw xmm0, [esp+640-432] + paddw xmm0, [esp+640-496] + paddw xmm0, [esp+640-592] + psraw xmm0, 3 + pand xmm0, xmm3 + movdqa xmm7, xmm1 + pandn xmm3, xmm2 + por xmm0, xmm3 + pand xmm7, xmm0 + + movdqa xmm0, [esp+656-272] + movdqa [edx], xmm0 + + movdqa xmm0, [esp+672-272] + + mov edx, dword [esp+640-596] + movdqa [esi], xmm0 + movdqa xmm0, [esp+688-272] + movdqa [edi], xmm0 + movdqa xmm0, [esp+704-272] + + pop edi + pandn xmm1, xmm2 + movdqa [eax], xmm0 + por xmm7, xmm1 + pop esi + packuswb xmm4, xmm7 + movdqa [edx], xmm6 + movdqa [ecx], xmm4 + pop ebx + mov esp, ebp + pop ebp + ret + + +;******************************************************************************** +; +; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst); +; +;******************************************************************************** + +WELS_EXTERN DeblockLumaTransposeH2V_sse2 + +ALIGN 16 + +DeblockLumaTransposeH2V_sse2: + push ebp + push ebx + mov ebp, esp + and esp,0FFFFFFF0h + sub esp, 10h + + mov eax, [ebp + 0Ch] + mov ecx, [ebp + 10h] + lea edx, [eax + ecx * 8] + lea ebx, [ecx*3] + + movq xmm0, [eax] + movq xmm7, [edx] + punpcklqdq xmm0, xmm7 + movq xmm1, [eax + ecx] + movq xmm7, [edx + ecx] + punpcklqdq xmm1, xmm7 + movq xmm2, [eax + ecx*2] + movq xmm7, [edx + ecx*2] + punpcklqdq xmm2, xmm7 + movq xmm3, [eax + ebx] + movq xmm7, [edx + ebx] + punpcklqdq xmm3, xmm7 + + lea eax, [eax + ecx * 4] + lea edx, [edx + ecx * 4] + movq xmm4, [eax] + movq xmm7, [edx] + punpcklqdq xmm4, xmm7 + movq xmm5, [eax + ecx] + movq xmm7, [edx + ecx] + 
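
DeblockLumaTransposeH2V_sse2 pairs row i with row i+8 via movq plus punpcklqdq so that SSE2_TransTwo8x8B can transpose two 8x8 byte blocks in a single pass; DeblockLumaTransposeV2H_sse2 undoes it after filtering. Transposing this way lets the horizontal-edge case reuse the vertical-edge kernel unchanged. What each 8x8 half accomplishes, in scalar terms (illustrative helper):

#include <stdint.h>

static void transpose_8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
    for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c)
            out[c][r] = in[r][c];
}
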
punpcklqdq xmm5, xmm7 + movq xmm6, [eax + ecx*2] + movq xmm7, [edx + ecx*2] + punpcklqdq xmm6, xmm7 + + movdqa [esp], xmm0 + movq xmm7, [eax + ebx] + movq xmm0, [edx + ebx] + punpcklqdq xmm7, xmm0 + movdqa xmm0, [esp] + + SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] + ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 + + mov eax, [ebp + 14h] + movdqa [eax], xmm4 + movdqa [eax + 10h], xmm2 + movdqa [eax + 20h], xmm3 + movdqa [eax + 30h], xmm7 + movdqa [eax + 40h], xmm5 + movdqa [eax + 50h], xmm1 + movdqa [eax + 60h], xmm6 + movdqa [eax + 70h], xmm0 + + mov esp, ebp + pop ebx + pop ebp + ret + + + +;******************************************************************************************* +; +; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc); +; +;******************************************************************************************* + +WELS_EXTERN DeblockLumaTransposeV2H_sse2 + +ALIGN 16 + +DeblockLumaTransposeV2H_sse2: + push ebp + mov ebp, esp + + and esp, 0FFFFFFF0h + sub esp, 10h + + mov eax, [ebp + 10h] + mov ecx, [ebp + 0Ch] + mov edx, [ebp + 08h] + + movdqa xmm0, [eax] + movdqa xmm1, [eax + 10h] + movdqa xmm2, [eax + 20h] + movdqa xmm3, [eax + 30h] + movdqa xmm4, [eax + 40h] + movdqa xmm5, [eax + 50h] + movdqa xmm6, [eax + 60h] + movdqa xmm7, [eax + 70h] + + SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] + ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 + + lea eax, [ecx * 3] + + movq [edx], xmm4 + movq [edx + ecx], xmm2 + movq [edx + ecx*2], xmm3 + movq [edx + eax], xmm7 + + lea edx, [edx + ecx*4] + movq [edx], xmm5 + movq [edx + ecx], xmm1 + movq [edx + ecx*2], xmm6 + movq [edx + eax], xmm0 + + psrldq xmm4, 8 + psrldq xmm2, 8 + psrldq xmm3, 8 + psrldq xmm7, 8 + psrldq xmm5, 8 + psrldq xmm1, 8 + psrldq xmm6, 8 + psrldq xmm0, 8 + + lea edx, [edx + ecx*4] + movq [edx], xmm4 + movq [edx + ecx], xmm2 + movq [edx + ecx*2], xmm3 + movq [edx + eax], xmm7 + + lea edx, [edx + ecx*4] + movq [edx], xmm5 + movq [edx + ecx], xmm1 + movq [edx + ecx*2], xmm6 + movq [edx + eax], xmm0 + + + mov esp, ebp + pop ebp ret \ No newline at end of file diff --git a/codec/decoder/core/asm/mc_chroma.asm b/codec/decoder/core/asm/mc_chroma.asm index 740063e4..649fc735 100644 --- a/codec/decoder/core/asm/mc_chroma.asm +++ b/codec/decoder/core/asm/mc_chroma.asm @@ -1,317 +1,317 @@ -;*! -;* \copy -;* Copyright (c) 2004-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* -;* mc_chroma.asm -;* -;* Abstract -;* mmx motion compensation for chroma -;* -;* History -;* 10/13/2004 Created -;* -;* -;*************************************************************************/ -%include "asm_inc.asm" - -BITS 32 - -;*********************************************************************** -; Local Data (Read Only) -;*********************************************************************** - -SECTION .rodata align=16 - -;*********************************************************************** -; Various memory constants (trigonometric values or rounding values) -;*********************************************************************** - -ALIGN 16 -h264_d0x20_sse2: - dw 32,32,32,32,32,32,32,32 -ALIGN 16 -h264_d0x20_mmx: - dw 32,32,32,32 - - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -ALIGN 16 -;******************************************************************************* -; void McChromaWidthEq4_mmx( uint8_t *src, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iHeigh ); -;******************************************************************************* -WELS_EXTERN McChromaWidthEq4_mmx -McChromaWidthEq4_mmx: - push esi - push edi - push ebx - - mov eax, [esp +12 + 20] - movd mm3, [eax] - WELS_Zero mm7 - punpcklbw mm3, mm3 - movq mm4, mm3 - punpcklwd mm3, mm3 - punpckhwd mm4, mm4 - - movq mm5, mm3 - punpcklbw mm3, mm7 - punpckhbw mm5, mm7 - - movq mm6, mm4 - punpcklbw mm4, mm7 - punpckhbw mm6, mm7 - - mov esi, [esp +12+ 4] - mov eax, [esp + 12 + 8] - mov edi, [esp + 12 + 12] - mov edx, [esp + 12 + 16] - mov ecx, [esp + 12 + 24] - - lea ebx, [esi + eax] - movd mm0, [esi] - movd mm1, [esi+1] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 -.xloop: - - pmullw mm0, mm3 - pmullw mm1, mm5 - paddw mm0, mm1 - - movd mm1, [ebx] - punpcklbw mm1, mm7 - movq mm2, mm1 - pmullw mm1, mm4 - paddw mm0, mm1 - - movd mm1, [ebx+1] - punpcklbw mm1, mm7 - movq mm7, mm1 - pmullw mm1,mm6 - paddw mm0, mm1 - movq mm1,mm7 - - paddw mm0, [h264_d0x20_mmx] - psrlw mm0, 6 - - WELS_Zero mm7 - packuswb mm0, mm7 - movd [edi], mm0 - - movq mm0, mm2 - - lea edi, [edi +edx ] - lea ebx, [ebx + eax] - - dec ecx - jnz near .xloop - WELSEMMS - pop ebx - pop edi - pop esi - ret - - -ALIGN 16 -;******************************************************************************* -; void McChromaWidthEq8_sse2( uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iheigh ); -;******************************************************************************* -WELS_EXTERN McChromaWidthEq8_sse2 -McChromaWidthEq8_sse2: - push esi - push edi - push ebx - - mov eax, [esp +12 + 20] - movd xmm3, [eax] - WELS_Zero xmm7 - punpcklbw xmm3, xmm3 - punpcklwd xmm3, xmm3 - - movdqa xmm4, xmm3 - punpckldq xmm3, xmm3 - punpckhdq xmm4, xmm4 - movdqa xmm5, xmm3 - movdqa 
xmm6, xmm4 - - punpcklbw xmm3, xmm7 - punpckhbw xmm5, xmm7 - punpcklbw xmm4, xmm7 - punpckhbw xmm6, xmm7 - - mov esi, [esp +12+ 4] - mov eax, [esp + 12 + 8] - mov edi, [esp + 12 + 12] - mov edx, [esp + 12 + 16] - mov ecx, [esp + 12 + 24] - - lea ebx, [esi + eax] - movq xmm0, [esi] - movq xmm1, [esi+1] - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 -.xloop: - - pmullw xmm0, xmm3 - pmullw xmm1, xmm5 - paddw xmm0, xmm1 - - movq xmm1, [ebx] - punpcklbw xmm1, xmm7 - movdqa xmm2, xmm1 - pmullw xmm1, xmm4 - paddw xmm0, xmm1 - - movq xmm1, [ebx+1] - punpcklbw xmm1, xmm7 - movdqa xmm7, xmm1 - pmullw xmm1, xmm6 - paddw xmm0, xmm1 - movdqa xmm1,xmm7 - - paddw xmm0, [h264_d0x20_sse2] - psrlw xmm0, 6 - - WELS_Zero xmm7 - packuswb xmm0, xmm7 - movq [edi], xmm0 - - movdqa xmm0, xmm2 - - lea edi, [edi +edx ] - lea ebx, [ebx + eax] - - dec ecx - jnz near .xloop - - pop ebx - pop edi - pop esi - ret - - - - -ALIGN 16 -;*********************************************************************** -; void McChromaWidthEq8_ssse3( uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iHeigh); -;*********************************************************************** -WELS_EXTERN McChromaWidthEq8_ssse3 -McChromaWidthEq8_ssse3: - push ebx - push esi - push edi - - mov eax, [esp + 12 + 20] - - pxor xmm7, xmm7 - movd xmm5, [eax] - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - movdqa xmm6, xmm5 - punpcklqdq xmm5, xmm5 - punpckhqdq xmm6, xmm6 - - mov eax, [esp + 12 + 4] - mov edx, [esp + 12 + 8] - mov esi, [esp + 12 + 12] - mov edi, [esp + 12 + 16] - mov ecx, [esp + 12 + 24] - - sub esi, edi - sub esi, edi - movdqa xmm7, [h264_d0x20_sse2] - - movdqu xmm0, [eax] - movdqa xmm1, xmm0 - psrldq xmm1, 1 - punpcklbw xmm0, xmm1 - -.hloop_chroma: - lea esi, [esi+2*edi] - - movdqu xmm2, [eax+edx] - movdqa xmm3, xmm2 - psrldq xmm3, 1 - punpcklbw xmm2, xmm3 - movdqa xmm4, xmm2 - - pmaddubsw xmm0, xmm5 - pmaddubsw xmm2, xmm6 - paddw xmm0, xmm2 - paddw xmm0, xmm7 - psrlw xmm0, 6 - packuswb xmm0, xmm0 - movq [esi],xmm0 - - lea eax, [eax+2*edx] - movdqu xmm2, [eax] - movdqa xmm3, xmm2 - psrldq xmm3, 1 - punpcklbw xmm2, xmm3 - movdqa xmm0, xmm2 - - pmaddubsw xmm4, xmm5 - pmaddubsw xmm2, xmm6 - paddw xmm4, xmm2 - paddw xmm4, xmm7 - psrlw xmm4, 6 - packuswb xmm4, xmm4 - movq [esi+edi],xmm4 - - sub ecx, 2 - jnz .hloop_chroma - pop edi - pop esi - pop ebx - - ret - - +;*! +;* \copy +;* Copyright (c) 2004-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* mc_chroma.asm +;* +;* Abstract +;* mmx motion compensation for chroma +;* +;* History +;* 10/13/2004 Created +;* +;* +;*************************************************************************/ +%include "asm_inc.asm" + +BITS 32 + +;*********************************************************************** +; Local Data (Read Only) +;*********************************************************************** + +SECTION .rodata align=16 + +;*********************************************************************** +; Various memory constants (trigonometric values or rounding values) +;*********************************************************************** + +ALIGN 16 +h264_d0x20_sse2: + dw 32,32,32,32,32,32,32,32 +ALIGN 16 +h264_d0x20_mmx: + dw 32,32,32,32 + + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +ALIGN 16 +;******************************************************************************* +; void McChromaWidthEq4_mmx( uint8_t *src, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iHeigh ); +;******************************************************************************* +WELS_EXTERN McChromaWidthEq4_mmx +McChromaWidthEq4_mmx: + push esi + push edi + push ebx + + mov eax, [esp +12 + 20] + movd mm3, [eax] + WELS_Zero mm7 + punpcklbw mm3, mm3 + movq mm4, mm3 + punpcklwd mm3, mm3 + punpckhwd mm4, mm4 + + movq mm5, mm3 + punpcklbw mm3, mm7 + punpckhbw mm5, mm7 + + movq mm6, mm4 + punpcklbw mm4, mm7 + punpckhbw mm6, mm7 + + mov esi, [esp +12+ 4] + mov eax, [esp + 12 + 8] + mov edi, [esp + 12 + 12] + mov edx, [esp + 12 + 16] + mov ecx, [esp + 12 + 24] + + lea ebx, [esi + eax] + movd mm0, [esi] + movd mm1, [esi+1] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 +.xloop: + + pmullw mm0, mm3 + pmullw mm1, mm5 + paddw mm0, mm1 + + movd mm1, [ebx] + punpcklbw mm1, mm7 + movq mm2, mm1 + pmullw mm1, mm4 + paddw mm0, mm1 + + movd mm1, [ebx+1] + punpcklbw mm1, mm7 + movq mm7, mm1 + pmullw mm1,mm6 + paddw mm0, mm1 + movq mm1,mm7 + + paddw mm0, [h264_d0x20_mmx] + psrlw mm0, 6 + + WELS_Zero mm7 + packuswb mm0, mm7 + movd [edi], mm0 + + movq mm0, mm2 + + lea edi, [edi +edx ] + lea ebx, [ebx + eax] + + dec ecx + jnz near .xloop + WELSEMMS + pop ebx + pop edi + pop esi + ret + + +ALIGN 16 +;******************************************************************************* +; void McChromaWidthEq8_sse2( uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iheigh ); +;******************************************************************************* +WELS_EXTERN McChromaWidthEq8_sse2 +McChromaWidthEq8_sse2: + push esi + push edi + push ebx + + mov eax, [esp +12 + 20] + movd xmm3, [eax] + WELS_Zero xmm7 + punpcklbw xmm3, xmm3 + punpcklwd xmm3, xmm3 + + movdqa xmm4, xmm3 + punpckldq xmm3, xmm3 + punpckhdq xmm4, xmm4 + movdqa xmm5, xmm3 + movdqa 
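
Both sides of this mc_chroma.asm hunk carry the same three routines (McChromaWidthEq4_mmx, McChromaWidthEq8_sse2, McChromaWidthEq8_ssse3); only the line endings differ. All of them evaluate the H.264 eighth-pel chroma interpolation: the four pABCD weights sum to 64 (A = (8-dx)(8-dy), B = dx(8-dy), C = (8-dx)dy, D = dx*dy), and h264_d0x20 supplies the +32 rounding before the shift by 6. A scalar C sketch, with the function name mine and the A,B,C,D byte order inferred from the splat sequence:

#include <stdint.h>

static void mc_chroma_c(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const uint8_t abcd[4], int width, int height) {
    int A = abcd[0], B = abcd[1], C = abcd[2], D = abcd[3];  /* sum to 64 */
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x)
            dst[x] = (uint8_t)((A * src[x] + B * src[x + 1] +
                                C * src[x + src_stride] +
                                D * src[x + src_stride + 1] + 32) >> 6);
        src += src_stride;
        dst += dst_stride;
    }
}

Width is 4 for the Eq4 routine and 8 for the Eq8 routines; the ssse3 version additionally unrolls two output rows per iteration and folds the two multiplies per row into pmaddubsw.
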
xmm6, xmm4 + + punpcklbw xmm3, xmm7 + punpckhbw xmm5, xmm7 + punpcklbw xmm4, xmm7 + punpckhbw xmm6, xmm7 + + mov esi, [esp +12+ 4] + mov eax, [esp + 12 + 8] + mov edi, [esp + 12 + 12] + mov edx, [esp + 12 + 16] + mov ecx, [esp + 12 + 24] + + lea ebx, [esi + eax] + movq xmm0, [esi] + movq xmm1, [esi+1] + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 +.xloop: + + pmullw xmm0, xmm3 + pmullw xmm1, xmm5 + paddw xmm0, xmm1 + + movq xmm1, [ebx] + punpcklbw xmm1, xmm7 + movdqa xmm2, xmm1 + pmullw xmm1, xmm4 + paddw xmm0, xmm1 + + movq xmm1, [ebx+1] + punpcklbw xmm1, xmm7 + movdqa xmm7, xmm1 + pmullw xmm1, xmm6 + paddw xmm0, xmm1 + movdqa xmm1,xmm7 + + paddw xmm0, [h264_d0x20_sse2] + psrlw xmm0, 6 + + WELS_Zero xmm7 + packuswb xmm0, xmm7 + movq [edi], xmm0 + + movdqa xmm0, xmm2 + + lea edi, [edi +edx ] + lea ebx, [ebx + eax] + + dec ecx + jnz near .xloop + + pop ebx + pop edi + pop esi + ret + + + + +ALIGN 16 +;*********************************************************************** +; void McChromaWidthEq8_ssse3( uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iHeigh); +;*********************************************************************** +WELS_EXTERN McChromaWidthEq8_ssse3 +McChromaWidthEq8_ssse3: + push ebx + push esi + push edi + + mov eax, [esp + 12 + 20] + + pxor xmm7, xmm7 + movd xmm5, [eax] + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + movdqa xmm6, xmm5 + punpcklqdq xmm5, xmm5 + punpckhqdq xmm6, xmm6 + + mov eax, [esp + 12 + 4] + mov edx, [esp + 12 + 8] + mov esi, [esp + 12 + 12] + mov edi, [esp + 12 + 16] + mov ecx, [esp + 12 + 24] + + sub esi, edi + sub esi, edi + movdqa xmm7, [h264_d0x20_sse2] + + movdqu xmm0, [eax] + movdqa xmm1, xmm0 + psrldq xmm1, 1 + punpcklbw xmm0, xmm1 + +.hloop_chroma: + lea esi, [esi+2*edi] + + movdqu xmm2, [eax+edx] + movdqa xmm3, xmm2 + psrldq xmm3, 1 + punpcklbw xmm2, xmm3 + movdqa xmm4, xmm2 + + pmaddubsw xmm0, xmm5 + pmaddubsw xmm2, xmm6 + paddw xmm0, xmm2 + paddw xmm0, xmm7 + psrlw xmm0, 6 + packuswb xmm0, xmm0 + movq [esi],xmm0 + + lea eax, [eax+2*edx] + movdqu xmm2, [eax] + movdqa xmm3, xmm2 + psrldq xmm3, 1 + punpcklbw xmm2, xmm3 + movdqa xmm0, xmm2 + + pmaddubsw xmm4, xmm5 + pmaddubsw xmm2, xmm6 + paddw xmm4, xmm2 + paddw xmm4, xmm7 + psrlw xmm4, 6 + packuswb xmm4, xmm4 + movq [esi+edi],xmm4 + + sub ecx, 2 + jnz .hloop_chroma + pop edi + pop esi + pop ebx + + ret + + diff --git a/codec/encoder/core/asm/deblock.asm b/codec/encoder/core/asm/deblock.asm index 5614c254..ed82286c 100644 --- a/codec/encoder/core/asm/deblock.asm +++ b/codec/encoder/core/asm/deblock.asm @@ -1,2113 +1,2113 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* -;* deblock.asm -;* -;* Abstract -;* edge loop -;* -;* History -;* 08/07/2009 Created -;* -;* -;*************************************************************************/ -%include "asm_inc.asm" -BITS 32 - -;******************************************************************************* -; Macros and other preprocessor constants -;******************************************************************************* - -%ifdef FORMAT_COFF -SECTION .rodata pData -%else -SECTION .rodata align=16 -%endif - -SECTION .text - -;******************************************************************************** -; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, -; int32_t iAlpha, int32_t iBeta) -;******************************************************************************** -WELS_EXTERN DeblockChromaEq4V_sse2 - -ALIGN 16 -DeblockChromaEq4V_sse2: - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,68h - mov edx,[ebp+10h] ; iStride - mov eax,[ebp+8] ; pPixCb - mov ecx,[ebp+0Ch] ; pPixCr - movq xmm4,[ecx] - movq xmm5,[edx+ecx] - push esi - push edi - lea esi,[edx+edx] - mov edi,eax - sub edi,esi - movq xmm1,[edi] - mov edi,ecx - sub edi,esi - movq xmm2,[edi] - punpcklqdq xmm1,xmm2 - mov esi,eax - sub esi,edx - movq xmm2,[esi] - mov edi,ecx - sub edi,edx - movq xmm3,[edi] - punpcklqdq xmm2,xmm3 - movq xmm3,[eax] - punpcklqdq xmm3,xmm4 - movq xmm4,[edx+eax] - mov edx, [ebp + 14h] - punpcklqdq xmm4,xmm5 - movd xmm5,edx - mov edx, [ebp + 18h] - pxor xmm0,xmm0 - movdqa xmm6,xmm5 - punpcklwd xmm6,xmm5 - pshufd xmm5,xmm6,0 - movd xmm6,edx - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa xmm7,xmm1 - punpckhbw xmm1,xmm0 - punpcklbw xmm7,xmm0 - movdqa [esp+40h],xmm1 - movdqa [esp+60h],xmm7 - movdqa xmm7,xmm2 - punpcklbw xmm7,xmm0 - movdqa [esp+10h],xmm7 - movdqa xmm7,xmm3 - punpcklbw xmm7,xmm0 - punpckhbw xmm3,xmm0 - movdqa [esp+50h],xmm7 - movdqa xmm7,xmm4 - punpckhbw xmm4,xmm0 - punpckhbw xmm2,xmm0 - punpcklbw xmm7,xmm0 - movdqa [esp+30h],xmm3 - movdqa xmm3,[esp+10h] - movdqa xmm1,xmm3 - psubw xmm1,[esp+50h] - pabsw xmm1,xmm1 - movdqa [esp+20h],xmm4 - movdqa xmm0,xmm5 - pcmpgtw xmm0,xmm1 - movdqa xmm1,[esp+60h] - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - pand xmm0,xmm4 - movdqa xmm1,xmm7 - psubw xmm1,[esp+50h] - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - movdqa xmm1,xmm2 - psubw xmm1,[esp+30h] - pabsw xmm1,xmm1 - pcmpgtw xmm5,xmm1 - movdqa xmm1,[esp+40h] - pand xmm0,xmm4 - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - movdqa xmm1,[esp+20h] - psubw xmm1,[esp+30h] - pand xmm5,xmm4 - pabsw xmm1,xmm1 - pcmpgtw xmm6,xmm1 - pand xmm5,xmm6 - mov edx,2 - movsx edx,dx - movd xmm1,edx - movdqa xmm4,xmm1 - punpcklwd xmm4,xmm1 - pshufd xmm1,xmm4,0 - movdqa xmm4,[esp+60h] - movdqa xmm6,xmm4 - paddw xmm6,xmm4 - paddw xmm6,xmm3 - paddw xmm6,xmm7 - movdqa [esp+10h],xmm1 - paddw xmm6,[esp+10h] - psraw xmm6,2 - movdqa xmm4,xmm0 - pandn xmm4,xmm3 - movdqa 
xmm3,[esp+40h] - movdqa xmm1,xmm0 - pand xmm1,xmm6 - por xmm1,xmm4 - movdqa xmm6,xmm3 - paddw xmm6,xmm3 - movdqa xmm3,[esp+10h] - paddw xmm6,xmm2 - paddw xmm6,[esp+20h] - paddw xmm6,xmm3 - psraw xmm6,2 - movdqa xmm4,xmm5 - pand xmm4,xmm6 - movdqa xmm6,xmm5 - pandn xmm6,xmm2 - por xmm4,xmm6 - packuswb xmm1,xmm4 - movdqa xmm4,[esp+50h] - movdqa xmm6,xmm7 - paddw xmm6,xmm7 - paddw xmm6,xmm4 - paddw xmm6,[esp+60h] - paddw xmm6,xmm3 - psraw xmm6,2 - movdqa xmm2,xmm0 - pand xmm2,xmm6 - pandn xmm0,xmm4 - por xmm2,xmm0 - movdqa xmm0,[esp+20h] - movdqa xmm6,xmm0 - paddw xmm6,xmm0 - movdqa xmm0,[esp+30h] - paddw xmm6,xmm0 - paddw xmm6,[esp+40h] - movdqa xmm4,xmm5 - paddw xmm6,xmm3 - movq [esi],xmm1 - psraw xmm6,2 - pand xmm4,xmm6 - pandn xmm5,xmm0 - por xmm4,xmm5 - packuswb xmm2,xmm4 - movq [eax],xmm2 - psrldq xmm1,8 - movq [edi],xmm1 - pop edi - psrldq xmm2,8 - movq [ecx],xmm2 - pop esi - mov esp,ebp - pop ebp - ret - -;****************************************************************************** -; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, -; int32_t iAlpha, int32_t iBeta, int8_t * pTC); -;******************************************************************************* - -WELS_EXTERN DeblockChromaLt4V_sse2 - -DeblockChromaLt4V_sse2: - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,0E4h - push ebx - push esi - mov esi, [ebp+1Ch] ; pTC - movsx ebx, byte [esi+2] - push edi - movsx di,byte [esi+3] - mov word [esp+0Ch],bx - movsx bx,byte [esi+1] - movsx esi,byte [esi] - mov word [esp+0Eh],si - movzx esi,di - movd xmm1,esi - movzx esi,di - movd xmm2,esi - mov si,word [esp+0Ch] - mov edx, [ebp + 10h] - mov eax, [ebp + 08h] - movzx edi,si - movzx esi,si - mov ecx, [ebp + 0Ch] - movd xmm4,esi - movzx esi,bx - movd xmm5,esi - movd xmm3,edi - movzx esi,bx - movd xmm6,esi - mov si,word [esp+0Eh] - movzx edi,si - movzx esi,si - punpcklwd xmm6,xmm2 - pxor xmm0,xmm0 - movdqa [esp+40h],xmm0 - movd xmm7,edi - movd xmm0,esi - lea esi,[edx+edx] - mov edi,eax - sub edi,esi - punpcklwd xmm5,xmm1 - movdqa xmm1,[esp+40h] - punpcklwd xmm0,xmm4 - movq xmm4,[edx+ecx] - punpcklwd xmm7,xmm3 - movq xmm3,[eax] - punpcklwd xmm0,xmm6 - movq xmm6,[edi] - punpcklwd xmm7,xmm5 - punpcklwd xmm0,xmm7 - mov edi,ecx - sub edi,esi - movdqa xmm2,xmm1 - psubw xmm2,xmm0 - movdqa [esp+60h],xmm2 - movq xmm2, [edi] - punpcklqdq xmm6,xmm2 - mov esi,eax - sub esi,edx - movq xmm7,[esi] - mov edi,ecx - sub edi,edx - movq xmm2,[edi] - punpcklqdq xmm7,xmm2 - movq xmm2,[ecx] - punpcklqdq xmm3,xmm2 - movq xmm2,[edx+eax] - movsx edx,word [ebp + 14h] - punpcklqdq xmm2,xmm4 - movdqa [esp+0E0h],xmm2 - movd xmm2,edx - movsx edx,word [ebp + 18h] - movdqa xmm4,xmm2 - punpcklwd xmm4,xmm2 - movd xmm2,edx - movdqa xmm5,xmm2 - punpcklwd xmm5,xmm2 - pshufd xmm2,xmm5,0 - movdqa [esp+50h],xmm2 - movdqa xmm2,xmm6 - punpcklbw xmm2,xmm1 - movdqa [esp+0D0h],xmm3 - pshufd xmm4,xmm4,0 - movdqa [esp+30h],xmm2 - punpckhbw xmm6,xmm1 - movdqa [esp+80h],xmm6 - movdqa xmm6,[esp+0D0h] - punpckhbw xmm6,xmm1 - movdqa [esp+70h],xmm6 - movdqa xmm6, [esp+0E0h] - punpckhbw xmm6,xmm1 - movdqa [esp+90h],xmm6 - movdqa xmm5, [esp+0E0h] - movdqa xmm2,xmm7 - punpckhbw xmm7,xmm1 - punpcklbw xmm5,xmm1 - movdqa [esp+0A0h],xmm7 - punpcklbw xmm3,xmm1 - mov edx,4 - punpcklbw xmm2,xmm1 - movsx edx,dx - movd xmm6,edx - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa xmm7,[esp+30h] - movdqa [esp+20h],xmm6 - psubw xmm7,xmm5 - movdqa xmm6,xmm0 - pcmpgtw xmm6,xmm1 - movdqa xmm1,[esp+60h] - movdqa [esp+40h],xmm6 - movdqa xmm6,xmm3 - 
psubw xmm6,xmm2 - psllw xmm6,2 - paddw xmm6,xmm7 - paddw xmm6, [esp+20h] - movdqa xmm7, [esp+50h] - psraw xmm6,3 - pmaxsw xmm1,xmm6 - movdqa [esp+10h],xmm0 - movdqa xmm6, [esp+10h] - pminsw xmm6,xmm1 - movdqa [esp+10h],xmm6 - movdqa xmm1,xmm2 - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - movdqa xmm6,xmm4 - pcmpgtw xmm6,xmm1 - movdqa xmm1, [esp+30h] - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - pcmpgtw xmm7,xmm1 - movdqa xmm1,[esp+50h] - pand xmm6,xmm7 - movdqa xmm7,[esp+50h] - psubw xmm5,xmm3 - pabsw xmm5,xmm5 - pcmpgtw xmm1,xmm5 - movdqa xmm5,[esp+80h] - psubw xmm5,[esp+90h] - pand xmm6,xmm1 - pand xmm6,[esp+40h] - movdqa xmm1,[esp+10h] - pand xmm1,xmm6 - movdqa xmm6,[esp+70h] - movdqa [esp+30h],xmm1 - movdqa xmm1,[esp+0A0h] - psubw xmm6,xmm1 - psllw xmm6,2 - paddw xmm6,xmm5 - paddw xmm6,[esp+20h] - movdqa xmm5,[esp+60h] - psraw xmm6,3 - pmaxsw xmm5,xmm6 - pminsw xmm0,xmm5 - movdqa xmm5,[esp+70h] - movdqa xmm6,xmm1 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm4,xmm6 - movdqa xmm6,[esp+80h] - psubw xmm6,xmm1 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+90h] - pand xmm4,xmm7 - movdqa xmm7,[esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - pand xmm4,xmm7 - pand xmm4,[esp+40h] - pand xmm0,xmm4 - movdqa xmm4,[esp+30h] - paddw xmm2,xmm4 - paddw xmm1,xmm0 - packuswb xmm2,xmm1 - movq [esi],xmm2 - psubw xmm3,xmm4 - psubw xmm5,xmm0 - packuswb xmm3,xmm5 - movq [eax],xmm3 - psrldq xmm2,8 - movq [edi],xmm2 - pop edi - pop esi - psrldq xmm3,8 - movq [ecx],xmm3 - pop ebx - mov esp,ebp - pop ebp - ret - -;*************************************************************************** -; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, -; int32_t iAlpha, int32_t iBeta) -;*************************************************************************** - -WELS_EXTERN DeblockChromaEq4H_sse2 - -ALIGN 16 - -DeblockChromaEq4H_sse2: - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,0C8h - mov ecx,dword [ebp+8] - mov edx,dword [ebp+0Ch] - mov eax,dword [ebp+10h] - sub ecx,2 - sub edx,2 - push esi - lea esi,[eax+eax*2] - mov dword [esp+18h],ecx - mov dword [esp+4],edx - lea ecx,[ecx+eax*4] - lea edx,[edx+eax*4] - lea eax,[esp+7Ch] - push edi - mov dword [esp+14h],esi - mov dword [esp+18h],ecx - mov dword [esp+0Ch],edx - mov dword [esp+10h],eax - mov esi,dword [esp+1Ch] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+14h] - movd xmm0,dword [esi] - movd xmm1,dword [esi+ecx] - movd xmm2,dword [esi+ecx*2] - movd xmm3,dword [esi+edx] - mov esi,dword [esp+8] - movd xmm4,dword [esi] - movd xmm5,dword [esi+ecx] - movd xmm6,dword [esi+ecx*2] - movd xmm7,dword [esi+edx] - punpckldq xmm0,xmm4 - punpckldq xmm1,xmm5 - punpckldq xmm2,xmm6 - punpckldq xmm3,xmm7 - mov esi,dword [esp+18h] - mov edi,dword [esp+0Ch] - movd xmm4,dword [esi] - movd xmm5,dword [edi] - punpckldq xmm4,xmm5 - punpcklqdq xmm0,xmm4 - movd xmm4,dword [esi+ecx] - movd xmm5,dword [edi+ecx] - punpckldq xmm4,xmm5 - punpcklqdq xmm1,xmm4 - movd xmm4,dword [esi+ecx*2] - movd xmm5,dword [edi+ecx*2] - punpckldq xmm4,xmm5 - punpcklqdq xmm2,xmm4 - movd xmm4,dword [esi+edx] - movd xmm5,dword [edi+edx] - punpckldq xmm4,xmm5 - punpcklqdq xmm3,xmm4 - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa 
xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov edi,dword [esp+10h] - movdqa [edi],xmm0 - movdqa [edi+10h],xmm5 - movdqa [edi+20h],xmm1 - movdqa [edi+30h],xmm6 - movsx ecx,word [ebp+14h] - movsx edx,word [ebp+18h] - movdqa xmm6,[esp+80h] - movdqa xmm4,[esp+90h] - movdqa xmm5,[esp+0A0h] - movdqa xmm7,[esp+0B0h] - pxor xmm0,xmm0 - movd xmm1,ecx - movdqa xmm2,xmm1 - punpcklwd xmm2,xmm1 - pshufd xmm1,xmm2,0 - movd xmm2,edx - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm2,xmm3,0 - movdqa xmm3,xmm6 - punpckhbw xmm6,xmm0 - movdqa [esp+60h],xmm6 - movdqa xmm6,[esp+90h] - punpckhbw xmm6,xmm0 - movdqa [esp+30h],xmm6 - movdqa xmm6,[esp+0A0h] - punpckhbw xmm6,xmm0 - movdqa [esp+40h],xmm6 - movdqa xmm6,[esp+0B0h] - punpckhbw xmm6,xmm0 - movdqa [esp+70h],xmm6 - punpcklbw xmm7,xmm0 - punpcklbw xmm4,xmm0 - punpcklbw xmm5,xmm0 - punpcklbw xmm3,xmm0 - movdqa [esp+50h],xmm7 - movdqa xmm6,xmm4 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - movdqa xmm0,xmm1 - pcmpgtw xmm0,xmm6 - movdqa xmm6,xmm3 - psubw xmm6,xmm4 - pabsw xmm6,xmm6 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pand xmm0,xmm7 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+30h] - psubw xmm6,[esp+40h] - pabsw xmm6,xmm6 - pcmpgtw xmm1,xmm6 - movdqa xmm6,[esp+60h] - psubw xmm6,[esp+30h] - pabsw xmm6,xmm6 - pand xmm0,xmm7 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+70h] - psubw xmm6,[esp+40h] - pabsw xmm6,xmm6 - pand xmm1,xmm7 - pcmpgtw xmm2,xmm6 - pand xmm1,xmm2 - mov eax,2 - movsx ecx,ax - movd xmm2,ecx - movdqa xmm6,xmm2 - punpcklwd xmm6,xmm2 - pshufd xmm2,xmm6,0 - movdqa [esp+20h],xmm2 - movdqa xmm2,xmm3 - paddw xmm2,xmm3 - paddw xmm2,xmm4 - paddw xmm2,[esp+50h] - paddw xmm2,[esp+20h] - psraw xmm2,2 - movdqa xmm6,xmm0 - pand xmm6,xmm2 - movdqa xmm2,xmm0 - pandn xmm2,xmm4 - por xmm6,xmm2 - movdqa xmm2,[esp+60h] - movdqa xmm7,xmm2 - paddw xmm7,xmm2 - paddw xmm7,[esp+30h] - paddw xmm7,[esp+70h] - paddw xmm7,[esp+20h] - movdqa xmm4,xmm1 - movdqa xmm2,xmm1 - pandn xmm2,[esp+30h] - psraw xmm7,2 - pand xmm4,xmm7 - por xmm4,xmm2 - movdqa xmm2,[esp+50h] - packuswb xmm6,xmm4 - movdqa [esp+90h],xmm6 - movdqa xmm6,xmm2 - paddw xmm6,xmm2 - movdqa xmm2,[esp+20h] - paddw xmm6,xmm5 - paddw xmm6,xmm3 - movdqa xmm4,xmm0 - pandn xmm0,xmm5 - paddw xmm6,xmm2 - psraw xmm6,2 - pand xmm4,xmm6 - por xmm4,xmm0 - movdqa xmm0,[esp+70h] - movdqa xmm5,xmm0 - paddw xmm5,xmm0 - movdqa xmm0,[esp+40h] - paddw xmm5,xmm0 - paddw xmm5,[esp+60h] - movdqa xmm3,xmm1 - paddw xmm5,xmm2 - psraw xmm5,2 - pand xmm3,xmm5 - pandn xmm1,xmm0 - por xmm3,xmm1 - packuswb xmm4,xmm3 - movdqa [esp+0A0h],xmm4 - mov esi,dword [esp+10h] - movdqa xmm0,[esi] - movdqa xmm1,[esi+10h] - movdqa xmm2,[esi+20h] - movdqa xmm3,[esi+30h] - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov esi,dword [esp+1Ch] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+14h] - mov edi,dword [esp+8] - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - 
psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov esi,dword [esp+18h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov edi,dword [esp+0Ch] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - pop edi - pop esi - mov esp,ebp - pop ebp - ret - -;******************************************************************************* -; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, -; int32_t iAlpha, int32_t iBeta, int8_t * pTC); -;******************************************************************************* - -WELS_EXTERN DeblockChromaLt4H_sse2 - -ALIGN 16 - -DeblockChromaLt4H_sse2: - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,108h - mov ecx,dword [ebp+8] - mov edx,dword [ebp+0Ch] - mov eax,dword [ebp+10h] - sub ecx,2 - sub edx,2 - push esi - lea esi,[eax+eax*2] - mov dword [esp+10h],ecx - mov dword [esp+4],edx - lea ecx,[ecx+eax*4] - lea edx,[edx+eax*4] - lea eax,[esp+6Ch] - push edi - mov dword [esp+0Ch],esi - mov dword [esp+18h],ecx - mov dword [esp+10h],edx - mov dword [esp+1Ch],eax - mov esi,dword [esp+14h] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+0Ch] - movd xmm0,dword [esi] - movd xmm1,dword [esi+ecx] - movd xmm2,dword [esi+ecx*2] - movd xmm3,dword [esi+edx] - mov esi,dword [esp+8] - movd xmm4,dword [esi] - movd xmm5,dword [esi+ecx] - movd xmm6,dword [esi+ecx*2] - movd xmm7,dword [esi+edx] - punpckldq xmm0,xmm4 - punpckldq xmm1,xmm5 - punpckldq xmm2,xmm6 - punpckldq xmm3,xmm7 - mov esi,dword [esp+18h] - mov edi,dword [esp+10h] - movd xmm4,dword [esi] - movd xmm5,dword [edi] - punpckldq xmm4,xmm5 - punpcklqdq xmm0,xmm4 - movd xmm4,dword [esi+ecx] - movd xmm5,dword [edi+ecx] - punpckldq xmm4,xmm5 - punpcklqdq xmm1,xmm4 - movd xmm4,dword [esi+ecx*2] - movd xmm5,dword [edi+ecx*2] - punpckldq xmm4,xmm5 - punpcklqdq xmm2,xmm4 - movd xmm4,dword [esi+edx] - movd xmm5,dword [edi+edx] - punpckldq xmm4,xmm5 - punpcklqdq xmm3,xmm4 - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov edi,dword [esp+1Ch] - movdqa [edi],xmm0 - movdqa [edi+10h],xmm5 - movdqa [edi+20h],xmm1 - movdqa [edi+30h],xmm6 - mov eax,dword [ebp+1Ch] - movsx cx,byte [eax+3] - movsx dx,byte [eax+2] - movsx si,byte [eax+1] - movsx ax,byte [eax] - movzx edi,cx - movzx ecx,cx - movd xmm2,ecx - movzx ecx,dx - movzx edx,dx - movd xmm3,ecx - movd xmm4,edx - movzx ecx,si - movzx edx,si - movd xmm5,ecx - pxor xmm0,xmm0 - movd xmm6,edx - movzx ecx,ax - movdqa [esp+60h],xmm0 - movzx edx,ax - movsx eax,word [ebp+14h] - punpcklwd xmm6,xmm2 - movd xmm1,edi - movd xmm7,ecx - movsx ecx,word [ebp+18h] - movd xmm0,edx - punpcklwd xmm7,xmm3 - punpcklwd xmm5,xmm1 - movdqa xmm1,[esp+60h] - punpcklwd xmm7,xmm5 - movdqa xmm5,[esp+0A0h] - punpcklwd xmm0,xmm4 - punpcklwd xmm0,xmm6 - movdqa xmm6, 
[esp+70h] - punpcklwd xmm0,xmm7 - movdqa xmm7,[esp+80h] - movdqa xmm2,xmm1 - psubw xmm2,xmm0 - movdqa [esp+0D0h],xmm2 - movd xmm2,eax - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm4,xmm3,0 - movd xmm2,ecx - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm2,xmm3,0 - movdqa xmm3, [esp+90h] - movdqa [esp+50h],xmm2 - movdqa xmm2,xmm6 - punpcklbw xmm2,xmm1 - punpckhbw xmm6,xmm1 - movdqa [esp+40h],xmm2 - movdqa [esp+0B0h],xmm6 - movdqa xmm6,[esp+90h] - movdqa xmm2,xmm7 - punpckhbw xmm7,xmm1 - punpckhbw xmm6,xmm1 - punpcklbw xmm2,xmm1 - punpcklbw xmm3,xmm1 - punpcklbw xmm5,xmm1 - movdqa [esp+0F0h],xmm7 - movdqa [esp+0C0h],xmm6 - movdqa xmm6, [esp+0A0h] - punpckhbw xmm6,xmm1 - movdqa [esp+0E0h],xmm6 - mov edx,4 - movsx eax,dx - movd xmm6,eax - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa [esp+30h],xmm6 - movdqa xmm7, [esp+40h] - psubw xmm7,xmm5 - movdqa xmm6,xmm0 - pcmpgtw xmm6,xmm1 - movdqa [esp+60h],xmm6 - movdqa xmm1, [esp+0D0h] - movdqa xmm6,xmm3 - psubw xmm6,xmm2 - psllw xmm6,2 - paddw xmm6,xmm7 - paddw xmm6,[esp+30h] - psraw xmm6,3 - pmaxsw xmm1,xmm6 - movdqa xmm7,[esp+50h] - movdqa [esp+20h],xmm0 - movdqa xmm6, [esp+20h] - pminsw xmm6,xmm1 - movdqa [esp+20h],xmm6 - movdqa xmm6,xmm4 - movdqa xmm1,xmm2 - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - pcmpgtw xmm6,xmm1 - movdqa xmm1, [esp+40h] - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - pcmpgtw xmm7,xmm1 - movdqa xmm1, [esp+50h] - pand xmm6,xmm7 - movdqa xmm7, [esp+50h] - psubw xmm5,xmm3 - pabsw xmm5,xmm5 - pcmpgtw xmm1,xmm5 - movdqa xmm5, [esp+0B0h] - psubw xmm5,[esp+0E0h] - pand xmm6,xmm1 - pand xmm6, [esp+60h] - movdqa xmm1, [esp+20h] - pand xmm1,xmm6 - movdqa xmm6, [esp+0C0h] - movdqa [esp+40h],xmm1 - movdqa xmm1, [esp+0F0h] - psubw xmm6,xmm1 - psllw xmm6,2 - paddw xmm6,xmm5 - paddw xmm6, [esp+30h] - movdqa xmm5, [esp+0D0h] - psraw xmm6,3 - pmaxsw xmm5,xmm6 - pminsw xmm0,xmm5 - movdqa xmm5,[esp+0C0h] - movdqa xmm6,xmm1 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm4,xmm6 - movdqa xmm6,[esp+0B0h] - psubw xmm6,xmm1 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - movdqa xmm6, [esp+0E0h] - pand xmm4,xmm7 - movdqa xmm7, [esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - pand xmm4,xmm7 - pand xmm4,[esp+60h] - pand xmm0,xmm4 - movdqa xmm4, [esp+40h] - paddw xmm2,xmm4 - paddw xmm1,xmm0 - psubw xmm3,xmm4 - psubw xmm5,xmm0 - packuswb xmm2,xmm1 - packuswb xmm3,xmm5 - movdqa [esp+80h],xmm2 - movdqa [esp+90h],xmm3 - mov esi,dword [esp+1Ch] - movdqa xmm0, [esi] - movdqa xmm1, [esi+10h] - movdqa xmm2, [esi+20h] - movdqa xmm3, [esi+30h] - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov esi,dword [esp+14h] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+0Ch] - mov edi,dword [esp+8] - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov esi,dword [esp+18h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - movd dword [esi],xmm0 - 
movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov edi,dword [esp+10h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - pop edi - pop esi - mov esp,ebp - pop ebp - ret - - - -;******************************************************************************* -; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, -; int32_t iBeta, int8_t * pTC) -;******************************************************************************* - - -WELS_EXTERN DeblockLumaLt4V_sse2 - -ALIGN 16 - -DeblockLumaLt4V_sse2: - push ebp - mov ebp, esp - and esp, -16 ; fffffff0H - sub esp, 420 ; 000001a4H - mov eax, dword [ebp+8] - mov ecx, dword [ebp+12] - - pxor xmm0, xmm0 - push ebx - mov edx, dword [ebp+24] - movdqa [esp+424-384], xmm0 - push esi - - lea esi, [ecx+ecx*2] - push edi - mov edi, eax - sub edi, esi - movdqa xmm0, [edi] - - lea esi, [ecx+ecx] - movdqa [esp+432-208], xmm0 - mov edi, eax - sub edi, esi - movdqa xmm0, [edi] - movdqa [esp+448-208], xmm0 - - mov ebx, eax - sub ebx, ecx - movdqa xmm0, [ebx] - movdqa [esp+464-208], xmm0 - - movdqa xmm0, [eax] - - add ecx, eax - movdqa [esp+480-208], xmm0 - movdqa xmm0, [ecx] - mov dword [esp+432-404], ecx - - movsx ecx, word [ebp+16] - movdqa [esp+496-208], xmm0 - movdqa xmm0, [esi+eax] - - movsx si, byte [edx] - movdqa [esp+512-208], xmm0 - movd xmm0, ecx - movsx ecx, word [ebp+20] - movdqa xmm1, xmm0 - punpcklwd xmm1, xmm0 - pshufd xmm0, xmm1, 0 - movdqa [esp+432-112], xmm0 - movd xmm0, ecx - movsx cx, byte [edx+1] - movdqa xmm1, xmm0 - punpcklwd xmm1, xmm0 - mov dword [esp+432-408], ebx - movzx ebx, cx - pshufd xmm0, xmm1, 0 - movd xmm1, ebx - movzx ebx, cx - movd xmm2, ebx - movzx ebx, cx - movzx ecx, cx - movd xmm4, ecx - movzx ecx, si - movd xmm5, ecx - movzx ecx, si - movd xmm6, ecx - movzx ecx, si - movd xmm7, ecx - movzx ecx, si - movdqa [esp+432-336], xmm0 - movd xmm0, ecx - - movsx cx, byte [edx+3] - movsx dx, byte [edx+2] - movd xmm3, ebx - punpcklwd xmm0, xmm4 - movzx esi, cx - punpcklwd xmm6, xmm2 - punpcklwd xmm5, xmm1 - punpcklwd xmm0, xmm6 - punpcklwd xmm7, xmm3 - punpcklwd xmm7, xmm5 - punpcklwd xmm0, xmm7 - movdqa [esp+432-400], xmm0 - movd xmm0, esi - movzx esi, cx - movd xmm2, esi - movzx esi, cx - movzx ecx, cx - movd xmm4, ecx - movzx ecx, dx - movd xmm3, esi - movd xmm5, ecx - punpcklwd xmm5, xmm0 - - movdqa xmm0, [esp+432-384] - movzx ecx, dx - movd xmm6, ecx - movzx ecx, dx - movzx edx, dx - punpcklwd xmm6, xmm2 - movd xmm7, ecx - movd xmm1, edx - - movdqa xmm2, [esp+448-208] - punpcklbw xmm2, xmm0 - - mov ecx, 4 - movsx edx, cx - punpcklwd xmm7, xmm3 - punpcklwd xmm7, xmm5 - movdqa xmm5, [esp+496-208] - movdqa xmm3, [esp+464-208] - punpcklbw xmm5, xmm0 - movdqa [esp+432-240], xmm5 - movdqa xmm5, [esp+512-208] - punpcklbw xmm5, xmm0 - movdqa [esp+432-352], xmm5 - punpcklwd xmm1, xmm4 - movdqa xmm4, [esp+432-208] - punpcklwd xmm1, xmm6 - movdqa xmm6, [esp+480-208] - punpcklwd xmm1, xmm7 - punpcklbw xmm6, xmm0 - punpcklbw xmm3, xmm0 - punpcklbw xmm4, xmm0 - movdqa xmm7, xmm3 - psubw xmm7, xmm4 - pabsw xmm7, xmm7 - movdqa [esp+432-272], xmm4 - movdqa xmm4, [esp+432-336] - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-288], xmm5 - movdqa xmm7, xmm6 - psubw xmm7, [esp+432-352] - pabsw xmm7, xmm7 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-256], xmm5 - movdqa xmm5, xmm3 - pavgw xmm5, xmm6 - movdqa [esp+432-304], xmm5 - movdqa xmm5, 
[esp+432-400] - psubw xmm5, [esp+432-288] - psubw xmm5, [esp+432-256] - movdqa [esp+432-224], xmm5 - movdqa xmm5, xmm6 - psubw xmm5, xmm3 - movdqa [esp+432-32], xmm6 - psubw xmm6, [esp+432-240] - movdqa xmm7, xmm5 - movdqa [esp+432-384], xmm5 - movdqa xmm5, [esp+432-112] - pabsw xmm7, xmm7 - pcmpgtw xmm5, xmm7 - pabsw xmm6, xmm6 - movdqa xmm7, xmm4 - pcmpgtw xmm7, xmm6 - - pand xmm5, xmm7 - movdqa xmm6, xmm3 - psubw xmm6, xmm2 - pabsw xmm6, xmm6 - movdqa xmm7, xmm4 - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+432-400] - pand xmm5, xmm7 - movdqa xmm7, xmm6 - pcmpeqw xmm6, xmm0 - pcmpgtw xmm7, xmm0 - por xmm7, xmm6 - pand xmm5, xmm7 - movdqa [esp+432-320], xmm5 - movd xmm5, edx - movdqa xmm6, xmm5 - punpcklwd xmm6, xmm5 - pshufd xmm5, xmm6, 0 - movdqa [esp+432-336], xmm5 - movdqa xmm5, [esp+432-224] - movdqa [esp+432-368], xmm5 - movdqa xmm6, xmm0 - psubw xmm6, xmm5 - movdqa xmm5, [esp+432-384] - psllw xmm5, 2 - movdqa xmm7, xmm2 - psubw xmm7, [esp+432-240] - paddw xmm7, xmm5 - paddw xmm7, [esp+432-336] - movdqa xmm5, [esp+432-368] - psraw xmm7, 3 - pmaxsw xmm6, xmm7 - pminsw xmm5, xmm6 - - pand xmm5, [esp+432-320] - movdqa xmm6, [esp+432-400] - movdqa [esp+432-64], xmm5 - movdqa [esp+432-384], xmm6 - movdqa xmm5, xmm0 - psubw xmm5, xmm6 - movdqa [esp+432-368], xmm5 - movdqa xmm6, xmm5 - movdqa xmm5, [esp+432-272] - paddw xmm5, [esp+432-304] - movdqa xmm7, xmm2 - paddw xmm7, xmm2 - psubw xmm5, xmm7 - psraw xmm5, 1 - pmaxsw xmm6, xmm5 - movdqa xmm5, [esp+432-384] - pminsw xmm5, xmm6 - - pand xmm5, [esp+432-320] - pand xmm5, [esp+432-288] - movdqa xmm6, [esp+432-240] - movdqa [esp+432-96], xmm5 - movdqa xmm5, [esp+432-352] - paddw xmm5, [esp+432-304] - movdqa xmm7, xmm6 - paddw xmm7, xmm6 - movdqa xmm6, [esp+432-368] - psubw xmm5, xmm7 - - movdqa xmm7, [esp+496-208] - psraw xmm5, 1 - pmaxsw xmm6, xmm5 - movdqa xmm5, [esp+432-400] - pminsw xmm5, xmm6 - pand xmm5, [esp+432-320] - pand xmm5, [esp+432-256] - movdqa xmm6, [esp+448-208] - punpckhbw xmm7, xmm0 - movdqa [esp+432-352], xmm7 - - movdqa xmm7, [esp+512-208] - punpckhbw xmm6, xmm0 - movdqa [esp+432-48], xmm5 - movdqa xmm5, [esp+432-208] - movdqa [esp+432-368], xmm6 - movdqa xmm6, [esp+464-208] - punpckhbw xmm7, xmm0 - punpckhbw xmm5, xmm0 - movdqa [esp+432-384], xmm7 - punpckhbw xmm6, xmm0 - movdqa [esp+432-400], xmm6 - - movdqa xmm7, [esp+432-400] - movdqa xmm6, [esp+480-208] - psubw xmm7, xmm5 - movdqa [esp+432-16], xmm5 - pabsw xmm7, xmm7 - punpckhbw xmm6, xmm0 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-288], xmm5 - - movdqa xmm7, xmm6 - psubw xmm7, [esp+432-384] - pabsw xmm7, xmm7 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-256], xmm5 - - movdqa xmm5, [esp+432-400] - movdqa [esp+432-80], xmm6 - pavgw xmm5, xmm6 - movdqa [esp+432-304], xmm5 - - movdqa xmm5, xmm1 - psubw xmm5, [esp+432-288] - psubw xmm5, [esp+432-256] - movdqa [esp+432-224], xmm5 - movdqa xmm5, xmm6 - psubw xmm5, [esp+432-400] - psubw xmm6, [esp+432-352] - movdqa [esp+432-272], xmm5 - movdqa xmm7, xmm5 - movdqa xmm5, [esp+432-112] - pabsw xmm7, xmm7 - pcmpgtw xmm5, xmm7 - movdqa xmm7, xmm4 - pabsw xmm6, xmm6 - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+432-368] - - pand xmm5, xmm7 - movdqa xmm7, [esp+432-400] - psubw xmm7, xmm6 - psubw xmm6, [esp+432-352] - pabsw xmm7, xmm7 - pcmpgtw xmm4, xmm7 - pand xmm5, xmm4 - - paddw xmm2, [esp+432-96] - movdqa xmm4, xmm1 - pcmpgtw xmm4, xmm0 - movdqa xmm7, xmm1 - pcmpeqw xmm7, xmm0 - por xmm4, xmm7 - pand xmm5, xmm4 - movdqa xmm4, [esp+432-224] - movdqa [esp+432-320], xmm5 - movdqa xmm5, [esp+432-272] - 
movdqa xmm7, xmm0 - psubw xmm7, xmm4 - psubw xmm0, xmm1 - psllw xmm5, 2 - paddw xmm6, xmm5 - paddw xmm6, [esp+432-336] - movdqa xmm5, [esp+432-368] - movdqa [esp+432-336], xmm0 - psraw xmm6, 3 - pmaxsw xmm7, xmm6 - pminsw xmm4, xmm7 - pand xmm4, [esp+432-320] - movdqa xmm6, xmm0 - movdqa xmm0, [esp+432-16] - paddw xmm0, [esp+432-304] - movdqa [esp+432-272], xmm4 - movdqa xmm4, [esp+432-368] - paddw xmm4, xmm4 - psubw xmm0, xmm4 - - movdqa xmm4, [esp+432-64] - psraw xmm0, 1 - pmaxsw xmm6, xmm0 - movdqa xmm0, [esp+432-400] - movdqa xmm7, xmm1 - pminsw xmm7, xmm6 - movdqa xmm6, [esp+432-320] - pand xmm7, xmm6 - pand xmm7, [esp+432-288] - paddw xmm5, xmm7 - packuswb xmm2, xmm5 - movdqa xmm5, [esp+432-272] - paddw xmm0, xmm5 - paddw xmm3, xmm4 - packuswb xmm3, xmm0 - - movdqa xmm0, [esp+432-32] - psubw xmm0, xmm4 - movdqa xmm4, [esp+432-80] - psubw xmm4, xmm5 - - movdqa xmm5, [esp+432-240] - paddw xmm5, [esp+432-48] - packuswb xmm0, xmm4 - movdqa xmm4, [esp+432-384] - paddw xmm4, [esp+432-304] - movdqa [esp+480-208], xmm0 - movdqa xmm0, [esp+432-352] - movdqa xmm7, xmm0 - paddw xmm0, xmm0 - - mov ecx, dword [esp+432-408] - - mov edx, dword [esp+432-404] - psubw xmm4, xmm0 - movdqa xmm0, [esp+432-336] - movdqa [edi], xmm2 - psraw xmm4, 1 - pmaxsw xmm0, xmm4 - pminsw xmm1, xmm0 - movdqa xmm0, [esp+480-208] - - pop edi - pand xmm1, xmm6 - pand xmm1, [esp+428-256] - movdqa [ecx], xmm3 - paddw xmm7, xmm1 - pop esi - packuswb xmm5, xmm7 - movdqa [eax], xmm0 - movdqa [edx], xmm5 - pop ebx - mov esp, ebp - pop ebp - ret - - -;******************************************************************************* -; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, -; int32_t iBeta) -;******************************************************************************* - -WELS_EXTERN DeblockLumaEq4V_sse2 - -ALIGN 16 - -DeblockLumaEq4V_sse2: - - push ebp - mov ebp, esp - and esp, -16 ; fffffff0H - sub esp, 628 ; 00000274H - mov eax, dword [ebp+8] - mov ecx, dword [ebp+12] - push ebx - push esi - - lea edx, [ecx*4] - pxor xmm0, xmm0 - movdqa xmm2, xmm0 - - movdqa xmm0, [ecx+eax] - mov esi, eax - sub esi, edx - movdqa xmm3, [esi] - movdqa xmm5, [eax] - push edi - lea edi, [ecx+ecx] - lea ebx, [ecx+ecx*2] - mov dword [esp+640-600], edi - mov esi, eax - sub esi, edi - movdqa xmm1, [esi] - movdqa [esp+720-272], xmm0 - mov edi, eax - sub edi, ecx - movdqa xmm4, [edi] - add ecx, eax - mov dword [esp+640-596], ecx - - mov ecx, dword [esp+640-600] - movdqa xmm0, [ecx+eax] - movdqa [esp+736-272], xmm0 - - movdqa xmm0, [eax+ebx] - mov edx, eax - sub edx, ebx - - movsx ebx, word [ebp+16] - movdqa xmm6, [edx] - add ecx, eax - movdqa [esp+752-272], xmm0 - movd xmm0, ebx - - movsx ebx, word [ebp+20] - movdqa xmm7, xmm0 - punpcklwd xmm7, xmm0 - pshufd xmm0, xmm7, 0 - movdqa [esp+640-320], xmm0 - movd xmm0, ebx - movdqa xmm7, xmm0 - punpcklwd xmm7, xmm0 - pshufd xmm0, xmm7, 0 - - movdqa xmm7, [esp+736-272] - punpcklbw xmm7, xmm2 - movdqa [esp+640-416], xmm7 - movdqa [esp+640-512], xmm0 - movdqa xmm0, xmm1 - movdqa [esp+672-272], xmm1 - movdqa xmm1, xmm4 - movdqa [esp+704-272], xmm5 - punpcklbw xmm5, xmm2 - punpcklbw xmm1, xmm2 - - movdqa xmm7, xmm5 - psubw xmm7, xmm1 - pabsw xmm7, xmm7 - movdqa [esp+640-560], xmm7 - punpcklbw xmm0, xmm2 - movdqa [esp+688-272], xmm4 - movdqa xmm4, [esp+720-272] - movdqa [esp+640-480], xmm0 - - movdqa xmm7, xmm1 - psubw xmm7, xmm0 - - movdqa xmm0, [esp+640-512] - pabsw xmm7, xmm7 - punpcklbw xmm4, xmm2 - pcmpgtw xmm0, xmm7 - movdqa [esp+640-384], xmm4 - movdqa xmm7, xmm5 - psubw 
xmm7, xmm4 - movdqa xmm4, [esp+640-512] - movdqa [esp+656-272], xmm6 - punpcklbw xmm6, xmm2 - pabsw xmm7, xmm7 - movdqa [esp+640-48], xmm2 - movdqa [esp+640-368], xmm6 - movdqa [esp+640-144], xmm1 - movdqa [esp+640-400], xmm5 - pcmpgtw xmm4, xmm7 - pand xmm0, xmm4 - movdqa xmm4, [esp+640-320] - pcmpgtw xmm4, [esp+640-560] - pand xmm0, xmm4 - - mov ebx, 2 - movsx ebx, bx - movd xmm4, ebx - movdqa xmm7, xmm4 - punpcklwd xmm7, xmm4 - movdqa xmm4, [esp+640-320] - psraw xmm4, 2 - pshufd xmm7, xmm7, 0 - paddw xmm4, xmm7 - movdqa [esp+640-576], xmm4 - pcmpgtw xmm4, [esp+640-560] - movdqa [esp+640-560], xmm4 - - movdqa xmm4, [esp+640-512] - movdqa [esp+640-624], xmm7 - movdqa xmm7, xmm1 - psubw xmm7, xmm6 - pabsw xmm7, xmm7 - pcmpgtw xmm4, xmm7 - - pand xmm4, [esp+640-560] - movdqa [esp+640-544], xmm4 - movdqa xmm4, [esp+640-512] - movdqa xmm7, xmm5 - psubw xmm7, [esp+640-416] - pabsw xmm7, xmm7 - pcmpgtw xmm4, xmm7 - - pand xmm4, [esp+640-560] - movdqa [esp+640-560], xmm4 - - movdqa xmm4, [esp+640-544] - pandn xmm4, xmm6 - movdqa [esp+640-16], xmm4 - mov ebx, 4 - movsx ebx, bx - movd xmm4, ebx - movdqa xmm7, xmm4 - punpcklwd xmm7, xmm4 - movdqa xmm4, xmm3 - punpcklbw xmm4, xmm2 - psllw xmm4, 1 - paddw xmm4, xmm6 - paddw xmm4, xmm6 - paddw xmm4, xmm6 - paddw xmm4, [esp+640-480] - - movdqa xmm6, [esp+640-560] - pshufd xmm7, xmm7, 0 - paddw xmm4, xmm1 - movdqa [esp+640-592], xmm7 - paddw xmm4, xmm5 - paddw xmm4, xmm7 - movdqa xmm7, [esp+640-416] - pandn xmm6, xmm7 - movdqa [esp+640-80], xmm6 - movdqa xmm6, [esp+752-272] - punpcklbw xmm6, xmm2 - psllw xmm6, 1 - paddw xmm6, xmm7 - paddw xmm6, xmm7 - paddw xmm6, xmm7 - paddw xmm6, [esp+640-384] - - movdqa xmm7, [esp+640-480] - paddw xmm6, xmm5 - paddw xmm6, xmm1 - paddw xmm6, [esp+640-592] - psraw xmm6, 3 - pand xmm6, [esp+640-560] - movdqa [esp+640-112], xmm6 - movdqa xmm6, [esp+640-544] - pandn xmm6, xmm7 - movdqa [esp+640-336], xmm6 - movdqa xmm6, [esp+640-544] - movdqa [esp+640-528], xmm6 - movdqa xmm6, [esp+640-368] - paddw xmm6, xmm7 - movdqa xmm7, xmm1 - psraw xmm4, 3 - pand xmm4, [esp+640-544] - paddw xmm7, xmm5 - paddw xmm6, xmm7 - paddw xmm6, [esp+640-624] - movdqa xmm7, [esp+640-528] - - paddw xmm5, xmm1 - psraw xmm6, 2 - pand xmm7, xmm6 - - movdqa xmm6, [esp+640-384] - movdqa [esp+640-64], xmm7 - movdqa xmm7, [esp+640-560] - pandn xmm7, xmm6 - movdqa [esp+640-304], xmm7 - movdqa xmm7, [esp+640-560] - movdqa [esp+640-528], xmm7 - movdqa xmm7, [esp+640-416] - paddw xmm7, xmm6 - paddw xmm7, xmm5 - paddw xmm7, [esp+640-624] - movdqa xmm5, [esp+640-528] - psraw xmm7, 2 - pand xmm5, xmm7 - movdqa [esp+640-32], xmm5 - - movdqa xmm5, [esp+640-544] - movdqa [esp+640-528], xmm5 - movdqa xmm5, [esp+640-480] - movdqa xmm7, xmm5 - paddw xmm7, xmm5 - movdqa xmm5, xmm1 - paddw xmm5, xmm6 - paddw xmm6, [esp+640-592] - paddw xmm7, xmm5 - paddw xmm7, [esp+640-624] - movdqa xmm5, [esp+640-528] - psraw xmm7, 2 - pandn xmm5, xmm7 - movdqa xmm7, [esp+640-480] - paddw xmm7, xmm1 - paddw xmm7, [esp+640-400] - movdqa xmm1, [esp+640-544] - movdqa [esp+640-352], xmm5 - movdqa xmm5, [esp+640-368] - psllw xmm7, 1 - paddw xmm7, xmm6 - paddw xmm5, xmm7 - - movdqa xmm7, [esp+640-400] - psraw xmm5, 3 - pand xmm1, xmm5 - movdqa xmm5, [esp+640-480] - movdqa [esp+640-96], xmm1 - movdqa xmm1, [esp+640-560] - movdqa [esp+640-528], xmm1 - movdqa xmm1, [esp+640-384] - movdqa xmm6, xmm1 - paddw xmm6, xmm1 - paddw xmm1, [esp+640-400] - paddw xmm1, [esp+640-144] - paddw xmm7, xmm5 - paddw xmm5, [esp+640-592] - paddw xmm6, xmm7 - paddw xmm6, [esp+640-624] - movdqa xmm7, [esp+640-528] 
- psraw xmm6, 2 - psllw xmm1, 1 - paddw xmm1, xmm5 - - movdqa xmm5, [esp+656-272] - pandn xmm7, xmm6 - movdqa xmm6, [esp+640-416] - paddw xmm6, xmm1 - movdqa xmm1, [esp+640-560] - psraw xmm6, 3 - pand xmm1, xmm6 - - movdqa xmm6, [esp+704-272] - movdqa [esp+640-128], xmm1 - movdqa xmm1, [esp+672-272] - punpckhbw xmm1, xmm2 - movdqa [esp+640-448], xmm1 - movdqa xmm1, [esp+688-272] - punpckhbw xmm1, xmm2 - punpckhbw xmm6, xmm2 - movdqa [esp+640-288], xmm7 - punpckhbw xmm5, xmm2 - movdqa [esp+640-496], xmm1 - movdqa [esp+640-432], xmm6 - - movdqa xmm7, [esp+720-272] - punpckhbw xmm7, xmm2 - movdqa [esp+640-464], xmm7 - - movdqa xmm7, [esp+736-272] - punpckhbw xmm7, xmm2 - movdqa [esp+640-528], xmm7 - - movdqa xmm7, xmm6 - - psubw xmm6, [esp+640-464] - psubw xmm7, xmm1 - pabsw xmm7, xmm7 - movdqa [esp+640-560], xmm7 - por xmm4, [esp+640-16] - pabsw xmm6, xmm6 - movdqa xmm7, xmm1 - psubw xmm7, [esp+640-448] - - movdqa xmm1, [esp+640-512] - pabsw xmm7, xmm7 - pcmpgtw xmm1, xmm7 - movdqa xmm7, [esp+640-512] - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+640-320] - pand xmm1, xmm7 - movdqa xmm7, [esp+640-560] - pcmpgtw xmm6, xmm7 - pand xmm1, xmm6 - - movdqa xmm6, [esp+640-576] - pcmpgtw xmm6, xmm7 - - movdqa xmm7, [esp+640-496] - punpckhbw xmm3, xmm2 - movdqa [esp+640-560], xmm6 - movdqa xmm6, [esp+640-512] - psubw xmm7, xmm5 - pabsw xmm7, xmm7 - pcmpgtw xmm6, xmm7 - - pand xmm6, [esp+640-560] - movdqa xmm7, [esp+640-432] - psubw xmm7, [esp+640-528] - - psllw xmm3, 1 - movdqa [esp+640-544], xmm6 - movdqa xmm6, [esp+640-512] - - movdqa xmm2, [esp+640-544] - paddw xmm3, xmm5 - paddw xmm3, xmm5 - paddw xmm3, xmm5 - paddw xmm3, [esp+640-448] - paddw xmm3, [esp+640-496] - pabsw xmm7, xmm7 - pcmpgtw xmm6, xmm7 - pand xmm6, [esp+640-560] - movdqa [esp+640-560], xmm6 - - movdqa xmm6, xmm0 - pand xmm6, xmm4 - movdqa xmm4, xmm0 - pandn xmm4, [esp+640-368] - por xmm6, xmm4 - movdqa xmm4, [esp+640-432] - paddw xmm3, xmm4 - paddw xmm3, [esp+640-592] - psraw xmm3, 3 - pand xmm3, xmm2 - pandn xmm2, xmm5 - por xmm3, xmm2 - movdqa xmm7, xmm1 - pand xmm7, xmm3 - movdqa xmm3, [esp+640-64] - por xmm3, [esp+640-336] - movdqa xmm2, xmm1 - pandn xmm2, xmm5 - por xmm7, xmm2 - - movdqa xmm2, xmm0 - pand xmm2, xmm3 - movdqa xmm3, xmm0 - pandn xmm3, [esp+640-480] - por xmm2, xmm3 - packuswb xmm6, xmm7 - movdqa [esp+640-336], xmm2 - movdqa [esp+656-272], xmm6 - movdqa xmm6, [esp+640-544] - movdqa xmm2, xmm5 - paddw xmm2, [esp+640-448] - movdqa xmm3, xmm1 - movdqa xmm7, [esp+640-496] - paddw xmm7, xmm4 - paddw xmm2, xmm7 - paddw xmm2, [esp+640-624] - movdqa xmm7, [esp+640-544] - psraw xmm2, 2 - pand xmm6, xmm2 - movdqa xmm2, [esp+640-448] - pandn xmm7, xmm2 - por xmm6, xmm7 - pand xmm3, xmm6 - movdqa xmm6, xmm1 - pandn xmm6, xmm2 - paddw xmm2, [esp+640-496] - paddw xmm2, xmm4 - por xmm3, xmm6 - movdqa xmm6, [esp+640-336] - packuswb xmm6, xmm3 - psllw xmm2, 1 - movdqa [esp+672-272], xmm6 - movdqa xmm6, [esp+640-96] - por xmm6, [esp+640-352] - - movdqa xmm3, xmm0 - pand xmm3, xmm6 - movdqa xmm6, xmm0 - pandn xmm6, [esp+640-144] - por xmm3, xmm6 - movdqa xmm6, [esp+640-544] - movdqa [esp+640-352], xmm3 - movdqa xmm3, [esp+640-464] - paddw xmm3, [esp+640-592] - paddw xmm2, xmm3 - movdqa xmm3, [esp+640-448] - paddw xmm5, xmm2 - movdqa xmm2, [esp+640-496] - psraw xmm5, 3 - pand xmm6, xmm5 - movdqa xmm5, [esp+640-464] - paddw xmm2, xmm5 - paddw xmm5, [esp+640-432] - movdqa xmm4, xmm3 - paddw xmm4, xmm3 - paddw xmm4, xmm2 - paddw xmm4, [esp+640-624] - movdqa xmm2, [esp+640-544] - paddw xmm3, [esp+640-592] - psraw xmm4, 2 - pandn xmm2, xmm4 
- por xmm6, xmm2 - movdqa xmm7, xmm1 - pand xmm7, xmm6 - movdqa xmm6, [esp+640-496] - movdqa xmm2, xmm1 - pandn xmm2, xmm6 - por xmm7, xmm2 - movdqa xmm2, [esp+640-352] - packuswb xmm2, xmm7 - movdqa [esp+688-272], xmm2 - movdqa xmm2, [esp+640-128] - por xmm2, [esp+640-288] - - movdqa xmm4, xmm0 - pand xmm4, xmm2 - paddw xmm5, xmm6 - movdqa xmm2, xmm0 - pandn xmm2, [esp+640-400] - por xmm4, xmm2 - movdqa xmm2, [esp+640-528] - psllw xmm5, 1 - paddw xmm5, xmm3 - movdqa xmm3, [esp+640-560] - paddw xmm2, xmm5 - psraw xmm2, 3 - movdqa [esp+640-288], xmm4 - movdqa xmm4, [esp+640-560] - pand xmm4, xmm2 - movdqa xmm2, [esp+640-464] - movdqa xmm5, xmm2 - paddw xmm5, xmm2 - movdqa xmm2, [esp+640-432] - paddw xmm2, [esp+640-448] - movdqa xmm7, xmm1 - paddw xmm5, xmm2 - paddw xmm5, [esp+640-624] - movdqa xmm6, [esp+640-560] - psraw xmm5, 2 - pandn xmm3, xmm5 - por xmm4, xmm3 - movdqa xmm3, [esp+640-32] - por xmm3, [esp+640-304] - pand xmm7, xmm4 - movdqa xmm4, [esp+640-432] - movdqa xmm5, [esp+640-464] - movdqa xmm2, xmm1 - pandn xmm2, xmm4 - paddw xmm4, [esp+640-496] - por xmm7, xmm2 - movdqa xmm2, [esp+640-288] - packuswb xmm2, xmm7 - movdqa [esp+704-272], xmm2 - - movdqa xmm2, xmm0 - pand xmm2, xmm3 - movdqa xmm3, xmm0 - pandn xmm3, [esp+640-384] - por xmm2, xmm3 - movdqa [esp+640-304], xmm2 - movdqa xmm2, [esp+640-528] - movdqa xmm3, xmm2 - paddw xmm3, [esp+640-464] - paddw xmm3, xmm4 - paddw xmm3, [esp+640-624] - psraw xmm3, 2 - pand xmm6, xmm3 - movdqa xmm3, [esp+640-560] - movdqa xmm4, xmm3 - pandn xmm4, xmm5 - por xmm6, xmm4 - movdqa xmm7, xmm1 - pand xmm7, xmm6 - movdqa xmm6, [esp+640-304] - movdqa xmm4, xmm1 - pandn xmm4, xmm5 - por xmm7, xmm4 - - movdqa xmm4, xmm0 - pandn xmm0, [esp+640-416] - packuswb xmm6, xmm7 - movdqa xmm7, [esp+640-112] - por xmm7, [esp+640-80] - pand xmm4, xmm7 - por xmm4, xmm0 - movdqa xmm0, [esp+752-272] - punpckhbw xmm0, [esp+640-48] - psllw xmm0, 1 - paddw xmm0, xmm2 - paddw xmm0, xmm2 - paddw xmm0, xmm2 - paddw xmm0, xmm5 - paddw xmm0, [esp+640-432] - paddw xmm0, [esp+640-496] - paddw xmm0, [esp+640-592] - psraw xmm0, 3 - pand xmm0, xmm3 - movdqa xmm7, xmm1 - pandn xmm3, xmm2 - por xmm0, xmm3 - pand xmm7, xmm0 - - movdqa xmm0, [esp+656-272] - movdqa [edx], xmm0 - - movdqa xmm0, [esp+672-272] - - mov edx, dword [esp+640-596] - movdqa [esi], xmm0 - movdqa xmm0, [esp+688-272] - movdqa [edi], xmm0 - movdqa xmm0, [esp+704-272] - - pop edi - pandn xmm1, xmm2 - movdqa [eax], xmm0 - por xmm7, xmm1 - pop esi - packuswb xmm4, xmm7 - movdqa [edx], xmm6 - movdqa [ecx], xmm4 - pop ebx - mov esp, ebp - pop ebp - ret - - -;******************************************************************************** -; -; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst); -; -;******************************************************************************** - -WELS_EXTERN DeblockLumaTransposeH2V_sse2 - -ALIGN 16 - -DeblockLumaTransposeH2V_sse2: - push ebp - push ebx - mov ebp, esp - and esp,0FFFFFFF0h - sub esp, 10h - - mov eax, [ebp + 0Ch] - mov ecx, [ebp + 10h] - lea edx, [eax + ecx * 8] - lea ebx, [ecx*3] - - movq xmm0, [eax] - movq xmm7, [edx] - punpcklqdq xmm0, xmm7 - movq xmm1, [eax + ecx] - movq xmm7, [edx + ecx] - punpcklqdq xmm1, xmm7 - movq xmm2, [eax + ecx*2] - movq xmm7, [edx + ecx*2] - punpcklqdq xmm2, xmm7 - movq xmm3, [eax + ebx] - movq xmm7, [edx + ebx] - punpcklqdq xmm3, xmm7 - - lea eax, [eax + ecx * 4] - lea edx, [edx + ecx * 4] - movq xmm4, [eax] - movq xmm7, [edx] - punpcklqdq xmm4, xmm7 - movq xmm5, [eax + ecx] - movq xmm7, [edx + ecx] - 
punpcklqdq xmm5, xmm7 - movq xmm6, [eax + ecx*2] - movq xmm7, [edx + ecx*2] - punpcklqdq xmm6, xmm7 - - movdqa [esp], xmm0 - movq xmm7, [eax + ebx] - movq xmm0, [edx + ebx] - punpcklqdq xmm7, xmm0 - movdqa xmm0, [esp] - - SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] - ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 - - mov eax, [ebp + 14h] - movdqa [eax], xmm4 - movdqa [eax + 10h], xmm2 - movdqa [eax + 20h], xmm3 - movdqa [eax + 30h], xmm7 - movdqa [eax + 40h], xmm5 - movdqa [eax + 50h], xmm1 - movdqa [eax + 60h], xmm6 - movdqa [eax + 70h], xmm0 - - mov esp, ebp - pop ebx - pop ebp - ret - - - -;******************************************************************************************* -; -; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc); -; -;******************************************************************************************* - -WELS_EXTERN DeblockLumaTransposeV2H_sse2 - -ALIGN 16 - -DeblockLumaTransposeV2H_sse2: - push ebp - mov ebp, esp - - and esp, 0FFFFFFF0h - sub esp, 10h - - mov eax, [ebp + 10h] - mov ecx, [ebp + 0Ch] - mov edx, [ebp + 08h] - - movdqa xmm0, [eax] - movdqa xmm1, [eax + 10h] - movdqa xmm2, [eax + 20h] - movdqa xmm3, [eax + 30h] - movdqa xmm4, [eax + 40h] - movdqa xmm5, [eax + 50h] - movdqa xmm6, [eax + 60h] - movdqa xmm7, [eax + 70h] - - SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] - ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 - - lea eax, [ecx * 3] - - movq [edx], xmm4 - movq [edx + ecx], xmm2 - movq [edx + ecx*2], xmm3 - movq [edx + eax], xmm7 - - lea edx, [edx + ecx*4] - movq [edx], xmm5 - movq [edx + ecx], xmm1 - movq [edx + ecx*2], xmm6 - movq [edx + eax], xmm0 - - psrldq xmm4, 8 - psrldq xmm2, 8 - psrldq xmm3, 8 - psrldq xmm7, 8 - psrldq xmm5, 8 - psrldq xmm1, 8 - psrldq xmm6, 8 - psrldq xmm0, 8 - - lea edx, [edx + ecx*4] - movq [edx], xmm4 - movq [edx + ecx], xmm2 - movq [edx + ecx*2], xmm3 - movq [edx + eax], xmm7 - - lea edx, [edx + ecx*4] - movq [edx], xmm5 - movq [edx + ecx], xmm1 - movq [edx + ecx*2], xmm6 - movq [edx + eax], xmm0 - - - mov esp, ebp - pop ebp +;*! +;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. 
+;* +;* +;* deblock.asm +;* +;* Abstract +;* edge loop +;* +;* History +;* 08/07/2009 Created +;* +;* +;*************************************************************************/ +%include "asm_inc.asm" +BITS 32 + +;******************************************************************************* +; Macros and other preprocessor constants +;******************************************************************************* + +%ifdef FORMAT_COFF +SECTION .rodata pData +%else +SECTION .rodata align=16 +%endif + +SECTION .text + +;******************************************************************************** +; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, +; int32_t iAlpha, int32_t iBeta) +;******************************************************************************** +WELS_EXTERN DeblockChromaEq4V_sse2 + +ALIGN 16 +DeblockChromaEq4V_sse2: + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,68h + mov edx,[ebp+10h] ; iStride + mov eax,[ebp+8] ; pPixCb + mov ecx,[ebp+0Ch] ; pPixCr + movq xmm4,[ecx] + movq xmm5,[edx+ecx] + push esi + push edi + lea esi,[edx+edx] + mov edi,eax + sub edi,esi + movq xmm1,[edi] + mov edi,ecx + sub edi,esi + movq xmm2,[edi] + punpcklqdq xmm1,xmm2 + mov esi,eax + sub esi,edx + movq xmm2,[esi] + mov edi,ecx + sub edi,edx + movq xmm3,[edi] + punpcklqdq xmm2,xmm3 + movq xmm3,[eax] + punpcklqdq xmm3,xmm4 + movq xmm4,[edx+eax] + mov edx, [ebp + 14h] + punpcklqdq xmm4,xmm5 + movd xmm5,edx + mov edx, [ebp + 18h] + pxor xmm0,xmm0 + movdqa xmm6,xmm5 + punpcklwd xmm6,xmm5 + pshufd xmm5,xmm6,0 + movd xmm6,edx + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa xmm7,xmm1 + punpckhbw xmm1,xmm0 + punpcklbw xmm7,xmm0 + movdqa [esp+40h],xmm1 + movdqa [esp+60h],xmm7 + movdqa xmm7,xmm2 + punpcklbw xmm7,xmm0 + movdqa [esp+10h],xmm7 + movdqa xmm7,xmm3 + punpcklbw xmm7,xmm0 + punpckhbw xmm3,xmm0 + movdqa [esp+50h],xmm7 + movdqa xmm7,xmm4 + punpckhbw xmm4,xmm0 + punpckhbw xmm2,xmm0 + punpcklbw xmm7,xmm0 + movdqa [esp+30h],xmm3 + movdqa xmm3,[esp+10h] + movdqa xmm1,xmm3 + psubw xmm1,[esp+50h] + pabsw xmm1,xmm1 + movdqa [esp+20h],xmm4 + movdqa xmm0,xmm5 + pcmpgtw xmm0,xmm1 + movdqa xmm1,[esp+60h] + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + pand xmm0,xmm4 + movdqa xmm1,xmm7 + psubw xmm1,[esp+50h] + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + movdqa xmm1,xmm2 + psubw xmm1,[esp+30h] + pabsw xmm1,xmm1 + pcmpgtw xmm5,xmm1 + movdqa xmm1,[esp+40h] + pand xmm0,xmm4 + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + movdqa xmm1,[esp+20h] + psubw xmm1,[esp+30h] + pand xmm5,xmm4 + pabsw xmm1,xmm1 + pcmpgtw xmm6,xmm1 + pand xmm5,xmm6 + mov edx,2 + movsx edx,dx + movd xmm1,edx + movdqa xmm4,xmm1 + punpcklwd xmm4,xmm1 + pshufd xmm1,xmm4,0 + movdqa xmm4,[esp+60h] + movdqa xmm6,xmm4 + paddw xmm6,xmm4 + paddw xmm6,xmm3 + paddw xmm6,xmm7 + movdqa [esp+10h],xmm1 + paddw xmm6,[esp+10h] + psraw xmm6,2 + movdqa xmm4,xmm0 + pandn xmm4,xmm3 + movdqa xmm3,[esp+40h] + movdqa xmm1,xmm0 + pand xmm1,xmm6 + por xmm1,xmm4 + movdqa xmm6,xmm3 + paddw xmm6,xmm3 + movdqa xmm3,[esp+10h] + paddw xmm6,xmm2 + paddw xmm6,[esp+20h] + paddw xmm6,xmm3 + psraw xmm6,2 + movdqa xmm4,xmm5 + pand xmm4,xmm6 + movdqa xmm6,xmm5 + pandn xmm6,xmm2 + por xmm4,xmm6 + packuswb xmm1,xmm4 + movdqa xmm4,[esp+50h] + movdqa xmm6,xmm7 + paddw xmm6,xmm7 + paddw xmm6,xmm4 + paddw xmm6,[esp+60h] + paddw xmm6,xmm3 + psraw xmm6,2 + movdqa xmm2,xmm0 + pand xmm2,xmm6 + pandn xmm0,xmm4 + por xmm2,xmm0 + movdqa xmm0,[esp+20h] + movdqa 
xmm6,xmm0 + paddw xmm6,xmm0 + movdqa xmm0,[esp+30h] + paddw xmm6,xmm0 + paddw xmm6,[esp+40h] + movdqa xmm4,xmm5 + paddw xmm6,xmm3 + movq [esi],xmm1 + psraw xmm6,2 + pand xmm4,xmm6 + pandn xmm5,xmm0 + por xmm4,xmm5 + packuswb xmm2,xmm4 + movq [eax],xmm2 + psrldq xmm1,8 + movq [edi],xmm1 + pop edi + psrldq xmm2,8 + movq [ecx],xmm2 + pop esi + mov esp,ebp + pop ebp + ret + +;****************************************************************************** +; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, +; int32_t iAlpha, int32_t iBeta, int8_t * pTC); +;******************************************************************************* + +WELS_EXTERN DeblockChromaLt4V_sse2 + +DeblockChromaLt4V_sse2: + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,0E4h + push ebx + push esi + mov esi, [ebp+1Ch] ; pTC + movsx ebx, byte [esi+2] + push edi + movsx di,byte [esi+3] + mov word [esp+0Ch],bx + movsx bx,byte [esi+1] + movsx esi,byte [esi] + mov word [esp+0Eh],si + movzx esi,di + movd xmm1,esi + movzx esi,di + movd xmm2,esi + mov si,word [esp+0Ch] + mov edx, [ebp + 10h] + mov eax, [ebp + 08h] + movzx edi,si + movzx esi,si + mov ecx, [ebp + 0Ch] + movd xmm4,esi + movzx esi,bx + movd xmm5,esi + movd xmm3,edi + movzx esi,bx + movd xmm6,esi + mov si,word [esp+0Eh] + movzx edi,si + movzx esi,si + punpcklwd xmm6,xmm2 + pxor xmm0,xmm0 + movdqa [esp+40h],xmm0 + movd xmm7,edi + movd xmm0,esi + lea esi,[edx+edx] + mov edi,eax + sub edi,esi + punpcklwd xmm5,xmm1 + movdqa xmm1,[esp+40h] + punpcklwd xmm0,xmm4 + movq xmm4,[edx+ecx] + punpcklwd xmm7,xmm3 + movq xmm3,[eax] + punpcklwd xmm0,xmm6 + movq xmm6,[edi] + punpcklwd xmm7,xmm5 + punpcklwd xmm0,xmm7 + mov edi,ecx + sub edi,esi + movdqa xmm2,xmm1 + psubw xmm2,xmm0 + movdqa [esp+60h],xmm2 + movq xmm2, [edi] + punpcklqdq xmm6,xmm2 + mov esi,eax + sub esi,edx + movq xmm7,[esi] + mov edi,ecx + sub edi,edx + movq xmm2,[edi] + punpcklqdq xmm7,xmm2 + movq xmm2,[ecx] + punpcklqdq xmm3,xmm2 + movq xmm2,[edx+eax] + movsx edx,word [ebp + 14h] + punpcklqdq xmm2,xmm4 + movdqa [esp+0E0h],xmm2 + movd xmm2,edx + movsx edx,word [ebp + 18h] + movdqa xmm4,xmm2 + punpcklwd xmm4,xmm2 + movd xmm2,edx + movdqa xmm5,xmm2 + punpcklwd xmm5,xmm2 + pshufd xmm2,xmm5,0 + movdqa [esp+50h],xmm2 + movdqa xmm2,xmm6 + punpcklbw xmm2,xmm1 + movdqa [esp+0D0h],xmm3 + pshufd xmm4,xmm4,0 + movdqa [esp+30h],xmm2 + punpckhbw xmm6,xmm1 + movdqa [esp+80h],xmm6 + movdqa xmm6,[esp+0D0h] + punpckhbw xmm6,xmm1 + movdqa [esp+70h],xmm6 + movdqa xmm6, [esp+0E0h] + punpckhbw xmm6,xmm1 + movdqa [esp+90h],xmm6 + movdqa xmm5, [esp+0E0h] + movdqa xmm2,xmm7 + punpckhbw xmm7,xmm1 + punpcklbw xmm5,xmm1 + movdqa [esp+0A0h],xmm7 + punpcklbw xmm3,xmm1 + mov edx,4 + punpcklbw xmm2,xmm1 + movsx edx,dx + movd xmm6,edx + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa xmm7,[esp+30h] + movdqa [esp+20h],xmm6 + psubw xmm7,xmm5 + movdqa xmm6,xmm0 + pcmpgtw xmm6,xmm1 + movdqa xmm1,[esp+60h] + movdqa [esp+40h],xmm6 + movdqa xmm6,xmm3 + psubw xmm6,xmm2 + psllw xmm6,2 + paddw xmm6,xmm7 + paddw xmm6, [esp+20h] + movdqa xmm7, [esp+50h] + psraw xmm6,3 + pmaxsw xmm1,xmm6 + movdqa [esp+10h],xmm0 + movdqa xmm6, [esp+10h] + pminsw xmm6,xmm1 + movdqa [esp+10h],xmm6 + movdqa xmm1,xmm2 + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + movdqa xmm6,xmm4 + pcmpgtw xmm6,xmm1 + movdqa xmm1, [esp+30h] + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + pcmpgtw xmm7,xmm1 + movdqa xmm1,[esp+50h] + pand xmm6,xmm7 + movdqa xmm7,[esp+50h] + psubw xmm5,xmm3 + pabsw xmm5,xmm5 + pcmpgtw xmm1,xmm5 + movdqa xmm5,[esp+80h] + psubw 
xmm5,[esp+90h] + pand xmm6,xmm1 + pand xmm6,[esp+40h] + movdqa xmm1,[esp+10h] + pand xmm1,xmm6 + movdqa xmm6,[esp+70h] + movdqa [esp+30h],xmm1 + movdqa xmm1,[esp+0A0h] + psubw xmm6,xmm1 + psllw xmm6,2 + paddw xmm6,xmm5 + paddw xmm6,[esp+20h] + movdqa xmm5,[esp+60h] + psraw xmm6,3 + pmaxsw xmm5,xmm6 + pminsw xmm0,xmm5 + movdqa xmm5,[esp+70h] + movdqa xmm6,xmm1 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm4,xmm6 + movdqa xmm6,[esp+80h] + psubw xmm6,xmm1 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+90h] + pand xmm4,xmm7 + movdqa xmm7,[esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + pand xmm4,xmm7 + pand xmm4,[esp+40h] + pand xmm0,xmm4 + movdqa xmm4,[esp+30h] + paddw xmm2,xmm4 + paddw xmm1,xmm0 + packuswb xmm2,xmm1 + movq [esi],xmm2 + psubw xmm3,xmm4 + psubw xmm5,xmm0 + packuswb xmm3,xmm5 + movq [eax],xmm3 + psrldq xmm2,8 + movq [edi],xmm2 + pop edi + pop esi + psrldq xmm3,8 + movq [ecx],xmm3 + pop ebx + mov esp,ebp + pop ebp + ret + +;*************************************************************************** +; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, +; int32_t iAlpha, int32_t iBeta) +;*************************************************************************** + +WELS_EXTERN DeblockChromaEq4H_sse2 + +ALIGN 16 + +DeblockChromaEq4H_sse2: + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,0C8h + mov ecx,dword [ebp+8] + mov edx,dword [ebp+0Ch] + mov eax,dword [ebp+10h] + sub ecx,2 + sub edx,2 + push esi + lea esi,[eax+eax*2] + mov dword [esp+18h],ecx + mov dword [esp+4],edx + lea ecx,[ecx+eax*4] + lea edx,[edx+eax*4] + lea eax,[esp+7Ch] + push edi + mov dword [esp+14h],esi + mov dword [esp+18h],ecx + mov dword [esp+0Ch],edx + mov dword [esp+10h],eax + mov esi,dword [esp+1Ch] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+14h] + movd xmm0,dword [esi] + movd xmm1,dword [esi+ecx] + movd xmm2,dword [esi+ecx*2] + movd xmm3,dword [esi+edx] + mov esi,dword [esp+8] + movd xmm4,dword [esi] + movd xmm5,dword [esi+ecx] + movd xmm6,dword [esi+ecx*2] + movd xmm7,dword [esi+edx] + punpckldq xmm0,xmm4 + punpckldq xmm1,xmm5 + punpckldq xmm2,xmm6 + punpckldq xmm3,xmm7 + mov esi,dword [esp+18h] + mov edi,dword [esp+0Ch] + movd xmm4,dword [esi] + movd xmm5,dword [edi] + punpckldq xmm4,xmm5 + punpcklqdq xmm0,xmm4 + movd xmm4,dword [esi+ecx] + movd xmm5,dword [edi+ecx] + punpckldq xmm4,xmm5 + punpcklqdq xmm1,xmm4 + movd xmm4,dword [esi+ecx*2] + movd xmm5,dword [edi+ecx*2] + punpckldq xmm4,xmm5 + punpcklqdq xmm2,xmm4 + movd xmm4,dword [esi+edx] + movd xmm5,dword [edi+edx] + punpckldq xmm4,xmm5 + punpcklqdq xmm3,xmm4 + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov edi,dword [esp+10h] + movdqa [edi],xmm0 + movdqa [edi+10h],xmm5 + movdqa [edi+20h],xmm1 + movdqa [edi+30h],xmm6 + movsx ecx,word [ebp+14h] + movsx edx,word [ebp+18h] + movdqa xmm6,[esp+80h] + movdqa xmm4,[esp+90h] + movdqa xmm5,[esp+0A0h] + movdqa xmm7,[esp+0B0h] + pxor xmm0,xmm0 + movd xmm1,ecx + movdqa xmm2,xmm1 + punpcklwd xmm2,xmm1 + pshufd xmm1,xmm2,0 + movd xmm2,edx + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd 
xmm2,xmm3,0 + movdqa xmm3,xmm6 + punpckhbw xmm6,xmm0 + movdqa [esp+60h],xmm6 + movdqa xmm6,[esp+90h] + punpckhbw xmm6,xmm0 + movdqa [esp+30h],xmm6 + movdqa xmm6,[esp+0A0h] + punpckhbw xmm6,xmm0 + movdqa [esp+40h],xmm6 + movdqa xmm6,[esp+0B0h] + punpckhbw xmm6,xmm0 + movdqa [esp+70h],xmm6 + punpcklbw xmm7,xmm0 + punpcklbw xmm4,xmm0 + punpcklbw xmm5,xmm0 + punpcklbw xmm3,xmm0 + movdqa [esp+50h],xmm7 + movdqa xmm6,xmm4 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + movdqa xmm0,xmm1 + pcmpgtw xmm0,xmm6 + movdqa xmm6,xmm3 + psubw xmm6,xmm4 + pabsw xmm6,xmm6 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pand xmm0,xmm7 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+30h] + psubw xmm6,[esp+40h] + pabsw xmm6,xmm6 + pcmpgtw xmm1,xmm6 + movdqa xmm6,[esp+60h] + psubw xmm6,[esp+30h] + pabsw xmm6,xmm6 + pand xmm0,xmm7 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+70h] + psubw xmm6,[esp+40h] + pabsw xmm6,xmm6 + pand xmm1,xmm7 + pcmpgtw xmm2,xmm6 + pand xmm1,xmm2 + mov eax,2 + movsx ecx,ax + movd xmm2,ecx + movdqa xmm6,xmm2 + punpcklwd xmm6,xmm2 + pshufd xmm2,xmm6,0 + movdqa [esp+20h],xmm2 + movdqa xmm2,xmm3 + paddw xmm2,xmm3 + paddw xmm2,xmm4 + paddw xmm2,[esp+50h] + paddw xmm2,[esp+20h] + psraw xmm2,2 + movdqa xmm6,xmm0 + pand xmm6,xmm2 + movdqa xmm2,xmm0 + pandn xmm2,xmm4 + por xmm6,xmm2 + movdqa xmm2,[esp+60h] + movdqa xmm7,xmm2 + paddw xmm7,xmm2 + paddw xmm7,[esp+30h] + paddw xmm7,[esp+70h] + paddw xmm7,[esp+20h] + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + pandn xmm2,[esp+30h] + psraw xmm7,2 + pand xmm4,xmm7 + por xmm4,xmm2 + movdqa xmm2,[esp+50h] + packuswb xmm6,xmm4 + movdqa [esp+90h],xmm6 + movdqa xmm6,xmm2 + paddw xmm6,xmm2 + movdqa xmm2,[esp+20h] + paddw xmm6,xmm5 + paddw xmm6,xmm3 + movdqa xmm4,xmm0 + pandn xmm0,xmm5 + paddw xmm6,xmm2 + psraw xmm6,2 + pand xmm4,xmm6 + por xmm4,xmm0 + movdqa xmm0,[esp+70h] + movdqa xmm5,xmm0 + paddw xmm5,xmm0 + movdqa xmm0,[esp+40h] + paddw xmm5,xmm0 + paddw xmm5,[esp+60h] + movdqa xmm3,xmm1 + paddw xmm5,xmm2 + psraw xmm5,2 + pand xmm3,xmm5 + pandn xmm1,xmm0 + por xmm3,xmm1 + packuswb xmm4,xmm3 + movdqa [esp+0A0h],xmm4 + mov esi,dword [esp+10h] + movdqa xmm0,[esi] + movdqa xmm1,[esi+10h] + movdqa xmm2,[esi+20h] + movdqa xmm3,[esi+30h] + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov esi,dword [esp+1Ch] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+14h] + mov edi,dword [esp+8] + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov esi,dword [esp+18h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov edi,dword [esp+0Ch] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword 
[edi+edx],xmm6 + pop edi + pop esi + mov esp,ebp + pop ebp + ret + +;******************************************************************************* +; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, +; int32_t iAlpha, int32_t iBeta, int8_t * pTC); +;******************************************************************************* + +WELS_EXTERN DeblockChromaLt4H_sse2 + +ALIGN 16 + +DeblockChromaLt4H_sse2: + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,108h + mov ecx,dword [ebp+8] + mov edx,dword [ebp+0Ch] + mov eax,dword [ebp+10h] + sub ecx,2 + sub edx,2 + push esi + lea esi,[eax+eax*2] + mov dword [esp+10h],ecx + mov dword [esp+4],edx + lea ecx,[ecx+eax*4] + lea edx,[edx+eax*4] + lea eax,[esp+6Ch] + push edi + mov dword [esp+0Ch],esi + mov dword [esp+18h],ecx + mov dword [esp+10h],edx + mov dword [esp+1Ch],eax + mov esi,dword [esp+14h] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+0Ch] + movd xmm0,dword [esi] + movd xmm1,dword [esi+ecx] + movd xmm2,dword [esi+ecx*2] + movd xmm3,dword [esi+edx] + mov esi,dword [esp+8] + movd xmm4,dword [esi] + movd xmm5,dword [esi+ecx] + movd xmm6,dword [esi+ecx*2] + movd xmm7,dword [esi+edx] + punpckldq xmm0,xmm4 + punpckldq xmm1,xmm5 + punpckldq xmm2,xmm6 + punpckldq xmm3,xmm7 + mov esi,dword [esp+18h] + mov edi,dword [esp+10h] + movd xmm4,dword [esi] + movd xmm5,dword [edi] + punpckldq xmm4,xmm5 + punpcklqdq xmm0,xmm4 + movd xmm4,dword [esi+ecx] + movd xmm5,dword [edi+ecx] + punpckldq xmm4,xmm5 + punpcklqdq xmm1,xmm4 + movd xmm4,dword [esi+ecx*2] + movd xmm5,dword [edi+ecx*2] + punpckldq xmm4,xmm5 + punpcklqdq xmm2,xmm4 + movd xmm4,dword [esi+edx] + movd xmm5,dword [edi+edx] + punpckldq xmm4,xmm5 + punpcklqdq xmm3,xmm4 + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov edi,dword [esp+1Ch] + movdqa [edi],xmm0 + movdqa [edi+10h],xmm5 + movdqa [edi+20h],xmm1 + movdqa [edi+30h],xmm6 + mov eax,dword [ebp+1Ch] + movsx cx,byte [eax+3] + movsx dx,byte [eax+2] + movsx si,byte [eax+1] + movsx ax,byte [eax] + movzx edi,cx + movzx ecx,cx + movd xmm2,ecx + movzx ecx,dx + movzx edx,dx + movd xmm3,ecx + movd xmm4,edx + movzx ecx,si + movzx edx,si + movd xmm5,ecx + pxor xmm0,xmm0 + movd xmm6,edx + movzx ecx,ax + movdqa [esp+60h],xmm0 + movzx edx,ax + movsx eax,word [ebp+14h] + punpcklwd xmm6,xmm2 + movd xmm1,edi + movd xmm7,ecx + movsx ecx,word [ebp+18h] + movd xmm0,edx + punpcklwd xmm7,xmm3 + punpcklwd xmm5,xmm1 + movdqa xmm1,[esp+60h] + punpcklwd xmm7,xmm5 + movdqa xmm5,[esp+0A0h] + punpcklwd xmm0,xmm4 + punpcklwd xmm0,xmm6 + movdqa xmm6, [esp+70h] + punpcklwd xmm0,xmm7 + movdqa xmm7,[esp+80h] + movdqa xmm2,xmm1 + psubw xmm2,xmm0 + movdqa [esp+0D0h],xmm2 + movd xmm2,eax + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd xmm4,xmm3,0 + movd xmm2,ecx + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd xmm2,xmm3,0 + movdqa xmm3, [esp+90h] + movdqa [esp+50h],xmm2 + movdqa xmm2,xmm6 + punpcklbw xmm2,xmm1 + punpckhbw xmm6,xmm1 + movdqa [esp+40h],xmm2 + movdqa [esp+0B0h],xmm6 + movdqa xmm6,[esp+90h] + movdqa xmm2,xmm7 + punpckhbw xmm7,xmm1 + punpckhbw xmm6,xmm1 + punpcklbw xmm2,xmm1 
+ punpcklbw xmm3,xmm1 + punpcklbw xmm5,xmm1 + movdqa [esp+0F0h],xmm7 + movdqa [esp+0C0h],xmm6 + movdqa xmm6, [esp+0A0h] + punpckhbw xmm6,xmm1 + movdqa [esp+0E0h],xmm6 + mov edx,4 + movsx eax,dx + movd xmm6,eax + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa [esp+30h],xmm6 + movdqa xmm7, [esp+40h] + psubw xmm7,xmm5 + movdqa xmm6,xmm0 + pcmpgtw xmm6,xmm1 + movdqa [esp+60h],xmm6 + movdqa xmm1, [esp+0D0h] + movdqa xmm6,xmm3 + psubw xmm6,xmm2 + psllw xmm6,2 + paddw xmm6,xmm7 + paddw xmm6,[esp+30h] + psraw xmm6,3 + pmaxsw xmm1,xmm6 + movdqa xmm7,[esp+50h] + movdqa [esp+20h],xmm0 + movdqa xmm6, [esp+20h] + pminsw xmm6,xmm1 + movdqa [esp+20h],xmm6 + movdqa xmm6,xmm4 + movdqa xmm1,xmm2 + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + pcmpgtw xmm6,xmm1 + movdqa xmm1, [esp+40h] + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + pcmpgtw xmm7,xmm1 + movdqa xmm1, [esp+50h] + pand xmm6,xmm7 + movdqa xmm7, [esp+50h] + psubw xmm5,xmm3 + pabsw xmm5,xmm5 + pcmpgtw xmm1,xmm5 + movdqa xmm5, [esp+0B0h] + psubw xmm5,[esp+0E0h] + pand xmm6,xmm1 + pand xmm6, [esp+60h] + movdqa xmm1, [esp+20h] + pand xmm1,xmm6 + movdqa xmm6, [esp+0C0h] + movdqa [esp+40h],xmm1 + movdqa xmm1, [esp+0F0h] + psubw xmm6,xmm1 + psllw xmm6,2 + paddw xmm6,xmm5 + paddw xmm6, [esp+30h] + movdqa xmm5, [esp+0D0h] + psraw xmm6,3 + pmaxsw xmm5,xmm6 + pminsw xmm0,xmm5 + movdqa xmm5,[esp+0C0h] + movdqa xmm6,xmm1 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm4,xmm6 + movdqa xmm6,[esp+0B0h] + psubw xmm6,xmm1 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + movdqa xmm6, [esp+0E0h] + pand xmm4,xmm7 + movdqa xmm7, [esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + pand xmm4,xmm7 + pand xmm4,[esp+60h] + pand xmm0,xmm4 + movdqa xmm4, [esp+40h] + paddw xmm2,xmm4 + paddw xmm1,xmm0 + psubw xmm3,xmm4 + psubw xmm5,xmm0 + packuswb xmm2,xmm1 + packuswb xmm3,xmm5 + movdqa [esp+80h],xmm2 + movdqa [esp+90h],xmm3 + mov esi,dword [esp+1Ch] + movdqa xmm0, [esi] + movdqa xmm1, [esi+10h] + movdqa xmm2, [esi+20h] + movdqa xmm3, [esi+30h] + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov esi,dword [esp+14h] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+0Ch] + mov edi,dword [esp+8] + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov esi,dword [esp+18h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov edi,dword [esp+10h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + pop edi + pop esi + mov esp,ebp + pop ebp + ret + + + +;******************************************************************************* +; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, +; int32_t iBeta, int8_t * pTC) 
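Note on the chroma routines above: DeblockChromaLt4H_sse2 (and its vertical sibling whose epilogue opens this hunk) are SIMD forms of the H.264 weak (bS < 4) chroma deblocking filter; the H variant first transposes the cross-edge columns into rows with the punpck ladder, runs the same arithmetic, and transposes back with the movd column stores. As a readability aid, here is a minimal scalar C sketch of the per-position arithmetic, following the H.264 spec formula; the helper names (clip3, clip_u8, chroma_weak_filter) are ours, not the project's, and the tc packing is inferred from the movsx/punpcklwd widening of pTC above.

    #include <stdint.h>
    #include <stdlib.h>

    static int clip3(int lo, int hi, int v) { return v < lo ? lo : v > hi ? hi : v; }
    static uint8_t clip_u8(int v) { return (uint8_t)clip3(0, 255, v); }

    /* Weak chroma filtering at one edge position p1 p0 | q0 q1.
     * tc is tc0 + 1, matching the widened pTC bytes; tc <= 0 means "leave alone". */
    static void chroma_weak_filter(uint8_t *p0, uint8_t *q0, uint8_t p1, uint8_t q1,
                                   int tc, int alpha, int beta)
    {
        if (tc <= 0) return;
        if (abs(*p0 - *q0) >= alpha) return;      /* the pcmpgtw alpha mask */
        if (abs(p1 - *p0) >= beta) return;        /* the two beta masks */
        if (abs(q1 - *q0) >= beta) return;
        int delta = clip3(-tc, tc, ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3);
        int np0 = *p0 + delta, nq0 = *q0 - delta;
        *p0 = clip_u8(np0);                       /* packuswb supplies this clamp above */
        *q0 = clip_u8(nq0);
    }

In the SIMD version all of these comparisons become pcmpgtw masks that are ANDed together and applied with pand, so blocked positions simply contribute a zero delta.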
+;******************************************************************************* + + +WELS_EXTERN DeblockLumaLt4V_sse2 + +ALIGN 16 + +DeblockLumaLt4V_sse2: + push ebp + mov ebp, esp + and esp, -16 ; fffffff0H + sub esp, 420 ; 000001a4H + mov eax, dword [ebp+8] + mov ecx, dword [ebp+12] + + pxor xmm0, xmm0 + push ebx + mov edx, dword [ebp+24] + movdqa [esp+424-384], xmm0 + push esi + + lea esi, [ecx+ecx*2] + push edi + mov edi, eax + sub edi, esi + movdqa xmm0, [edi] + + lea esi, [ecx+ecx] + movdqa [esp+432-208], xmm0 + mov edi, eax + sub edi, esi + movdqa xmm0, [edi] + movdqa [esp+448-208], xmm0 + + mov ebx, eax + sub ebx, ecx + movdqa xmm0, [ebx] + movdqa [esp+464-208], xmm0 + + movdqa xmm0, [eax] + + add ecx, eax + movdqa [esp+480-208], xmm0 + movdqa xmm0, [ecx] + mov dword [esp+432-404], ecx + + movsx ecx, word [ebp+16] + movdqa [esp+496-208], xmm0 + movdqa xmm0, [esi+eax] + + movsx si, byte [edx] + movdqa [esp+512-208], xmm0 + movd xmm0, ecx + movsx ecx, word [ebp+20] + movdqa xmm1, xmm0 + punpcklwd xmm1, xmm0 + pshufd xmm0, xmm1, 0 + movdqa [esp+432-112], xmm0 + movd xmm0, ecx + movsx cx, byte [edx+1] + movdqa xmm1, xmm0 + punpcklwd xmm1, xmm0 + mov dword [esp+432-408], ebx + movzx ebx, cx + pshufd xmm0, xmm1, 0 + movd xmm1, ebx + movzx ebx, cx + movd xmm2, ebx + movzx ebx, cx + movzx ecx, cx + movd xmm4, ecx + movzx ecx, si + movd xmm5, ecx + movzx ecx, si + movd xmm6, ecx + movzx ecx, si + movd xmm7, ecx + movzx ecx, si + movdqa [esp+432-336], xmm0 + movd xmm0, ecx + + movsx cx, byte [edx+3] + movsx dx, byte [edx+2] + movd xmm3, ebx + punpcklwd xmm0, xmm4 + movzx esi, cx + punpcklwd xmm6, xmm2 + punpcklwd xmm5, xmm1 + punpcklwd xmm0, xmm6 + punpcklwd xmm7, xmm3 + punpcklwd xmm7, xmm5 + punpcklwd xmm0, xmm7 + movdqa [esp+432-400], xmm0 + movd xmm0, esi + movzx esi, cx + movd xmm2, esi + movzx esi, cx + movzx ecx, cx + movd xmm4, ecx + movzx ecx, dx + movd xmm3, esi + movd xmm5, ecx + punpcklwd xmm5, xmm0 + + movdqa xmm0, [esp+432-384] + movzx ecx, dx + movd xmm6, ecx + movzx ecx, dx + movzx edx, dx + punpcklwd xmm6, xmm2 + movd xmm7, ecx + movd xmm1, edx + + movdqa xmm2, [esp+448-208] + punpcklbw xmm2, xmm0 + + mov ecx, 4 + movsx edx, cx + punpcklwd xmm7, xmm3 + punpcklwd xmm7, xmm5 + movdqa xmm5, [esp+496-208] + movdqa xmm3, [esp+464-208] + punpcklbw xmm5, xmm0 + movdqa [esp+432-240], xmm5 + movdqa xmm5, [esp+512-208] + punpcklbw xmm5, xmm0 + movdqa [esp+432-352], xmm5 + punpcklwd xmm1, xmm4 + movdqa xmm4, [esp+432-208] + punpcklwd xmm1, xmm6 + movdqa xmm6, [esp+480-208] + punpcklwd xmm1, xmm7 + punpcklbw xmm6, xmm0 + punpcklbw xmm3, xmm0 + punpcklbw xmm4, xmm0 + movdqa xmm7, xmm3 + psubw xmm7, xmm4 + pabsw xmm7, xmm7 + movdqa [esp+432-272], xmm4 + movdqa xmm4, [esp+432-336] + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-288], xmm5 + movdqa xmm7, xmm6 + psubw xmm7, [esp+432-352] + pabsw xmm7, xmm7 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-256], xmm5 + movdqa xmm5, xmm3 + pavgw xmm5, xmm6 + movdqa [esp+432-304], xmm5 + movdqa xmm5, [esp+432-400] + psubw xmm5, [esp+432-288] + psubw xmm5, [esp+432-256] + movdqa [esp+432-224], xmm5 + movdqa xmm5, xmm6 + psubw xmm5, xmm3 + movdqa [esp+432-32], xmm6 + psubw xmm6, [esp+432-240] + movdqa xmm7, xmm5 + movdqa [esp+432-384], xmm5 + movdqa xmm5, [esp+432-112] + pabsw xmm7, xmm7 + pcmpgtw xmm5, xmm7 + pabsw xmm6, xmm6 + movdqa xmm7, xmm4 + pcmpgtw xmm7, xmm6 + + pand xmm5, xmm7 + movdqa xmm6, xmm3 + psubw xmm6, xmm2 + pabsw xmm6, xmm6 + movdqa xmm7, xmm4 + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+432-400] + pand xmm5, 
xmm7 + movdqa xmm7, xmm6 + pcmpeqw xmm6, xmm0 + pcmpgtw xmm7, xmm0 + por xmm7, xmm6 + pand xmm5, xmm7 + movdqa [esp+432-320], xmm5 + movd xmm5, edx + movdqa xmm6, xmm5 + punpcklwd xmm6, xmm5 + pshufd xmm5, xmm6, 0 + movdqa [esp+432-336], xmm5 + movdqa xmm5, [esp+432-224] + movdqa [esp+432-368], xmm5 + movdqa xmm6, xmm0 + psubw xmm6, xmm5 + movdqa xmm5, [esp+432-384] + psllw xmm5, 2 + movdqa xmm7, xmm2 + psubw xmm7, [esp+432-240] + paddw xmm7, xmm5 + paddw xmm7, [esp+432-336] + movdqa xmm5, [esp+432-368] + psraw xmm7, 3 + pmaxsw xmm6, xmm7 + pminsw xmm5, xmm6 + + pand xmm5, [esp+432-320] + movdqa xmm6, [esp+432-400] + movdqa [esp+432-64], xmm5 + movdqa [esp+432-384], xmm6 + movdqa xmm5, xmm0 + psubw xmm5, xmm6 + movdqa [esp+432-368], xmm5 + movdqa xmm6, xmm5 + movdqa xmm5, [esp+432-272] + paddw xmm5, [esp+432-304] + movdqa xmm7, xmm2 + paddw xmm7, xmm2 + psubw xmm5, xmm7 + psraw xmm5, 1 + pmaxsw xmm6, xmm5 + movdqa xmm5, [esp+432-384] + pminsw xmm5, xmm6 + + pand xmm5, [esp+432-320] + pand xmm5, [esp+432-288] + movdqa xmm6, [esp+432-240] + movdqa [esp+432-96], xmm5 + movdqa xmm5, [esp+432-352] + paddw xmm5, [esp+432-304] + movdqa xmm7, xmm6 + paddw xmm7, xmm6 + movdqa xmm6, [esp+432-368] + psubw xmm5, xmm7 + + movdqa xmm7, [esp+496-208] + psraw xmm5, 1 + pmaxsw xmm6, xmm5 + movdqa xmm5, [esp+432-400] + pminsw xmm5, xmm6 + pand xmm5, [esp+432-320] + pand xmm5, [esp+432-256] + movdqa xmm6, [esp+448-208] + punpckhbw xmm7, xmm0 + movdqa [esp+432-352], xmm7 + + movdqa xmm7, [esp+512-208] + punpckhbw xmm6, xmm0 + movdqa [esp+432-48], xmm5 + movdqa xmm5, [esp+432-208] + movdqa [esp+432-368], xmm6 + movdqa xmm6, [esp+464-208] + punpckhbw xmm7, xmm0 + punpckhbw xmm5, xmm0 + movdqa [esp+432-384], xmm7 + punpckhbw xmm6, xmm0 + movdqa [esp+432-400], xmm6 + + movdqa xmm7, [esp+432-400] + movdqa xmm6, [esp+480-208] + psubw xmm7, xmm5 + movdqa [esp+432-16], xmm5 + pabsw xmm7, xmm7 + punpckhbw xmm6, xmm0 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-288], xmm5 + + movdqa xmm7, xmm6 + psubw xmm7, [esp+432-384] + pabsw xmm7, xmm7 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-256], xmm5 + + movdqa xmm5, [esp+432-400] + movdqa [esp+432-80], xmm6 + pavgw xmm5, xmm6 + movdqa [esp+432-304], xmm5 + + movdqa xmm5, xmm1 + psubw xmm5, [esp+432-288] + psubw xmm5, [esp+432-256] + movdqa [esp+432-224], xmm5 + movdqa xmm5, xmm6 + psubw xmm5, [esp+432-400] + psubw xmm6, [esp+432-352] + movdqa [esp+432-272], xmm5 + movdqa xmm7, xmm5 + movdqa xmm5, [esp+432-112] + pabsw xmm7, xmm7 + pcmpgtw xmm5, xmm7 + movdqa xmm7, xmm4 + pabsw xmm6, xmm6 + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+432-368] + + pand xmm5, xmm7 + movdqa xmm7, [esp+432-400] + psubw xmm7, xmm6 + psubw xmm6, [esp+432-352] + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 + pand xmm5, xmm4 + + paddw xmm2, [esp+432-96] + movdqa xmm4, xmm1 + pcmpgtw xmm4, xmm0 + movdqa xmm7, xmm1 + pcmpeqw xmm7, xmm0 + por xmm4, xmm7 + pand xmm5, xmm4 + movdqa xmm4, [esp+432-224] + movdqa [esp+432-320], xmm5 + movdqa xmm5, [esp+432-272] + movdqa xmm7, xmm0 + psubw xmm7, xmm4 + psubw xmm0, xmm1 + psllw xmm5, 2 + paddw xmm6, xmm5 + paddw xmm6, [esp+432-336] + movdqa xmm5, [esp+432-368] + movdqa [esp+432-336], xmm0 + psraw xmm6, 3 + pmaxsw xmm7, xmm6 + pminsw xmm4, xmm7 + pand xmm4, [esp+432-320] + movdqa xmm6, xmm0 + movdqa xmm0, [esp+432-16] + paddw xmm0, [esp+432-304] + movdqa [esp+432-272], xmm4 + movdqa xmm4, [esp+432-368] + paddw xmm4, xmm4 + psubw xmm0, xmm4 + + movdqa xmm4, [esp+432-64] + psraw xmm0, 1 + pmaxsw xmm6, xmm0 + movdqa xmm0, [esp+432-400] + movdqa 
xmm7, xmm1 + pminsw xmm7, xmm6 + movdqa xmm6, [esp+432-320] + pand xmm7, xmm6 + pand xmm7, [esp+432-288] + paddw xmm5, xmm7 + packuswb xmm2, xmm5 + movdqa xmm5, [esp+432-272] + paddw xmm0, xmm5 + paddw xmm3, xmm4 + packuswb xmm3, xmm0 + + movdqa xmm0, [esp+432-32] + psubw xmm0, xmm4 + movdqa xmm4, [esp+432-80] + psubw xmm4, xmm5 + + movdqa xmm5, [esp+432-240] + paddw xmm5, [esp+432-48] + packuswb xmm0, xmm4 + movdqa xmm4, [esp+432-384] + paddw xmm4, [esp+432-304] + movdqa [esp+480-208], xmm0 + movdqa xmm0, [esp+432-352] + movdqa xmm7, xmm0 + paddw xmm0, xmm0 + + mov ecx, dword [esp+432-408] + + mov edx, dword [esp+432-404] + psubw xmm4, xmm0 + movdqa xmm0, [esp+432-336] + movdqa [edi], xmm2 + psraw xmm4, 1 + pmaxsw xmm0, xmm4 + pminsw xmm1, xmm0 + movdqa xmm0, [esp+480-208] + + pop edi + pand xmm1, xmm6 + pand xmm1, [esp+428-256] + movdqa [ecx], xmm3 + paddw xmm7, xmm1 + pop esi + packuswb xmm5, xmm7 + movdqa [eax], xmm0 + movdqa [edx], xmm5 + pop ebx + mov esp, ebp + pop ebp + ret + + +;******************************************************************************* +; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, +; int32_t iBeta) +;******************************************************************************* + +WELS_EXTERN DeblockLumaEq4V_sse2 + +ALIGN 16 + +DeblockLumaEq4V_sse2: + + push ebp + mov ebp, esp + and esp, -16 ; fffffff0H + sub esp, 628 ; 00000274H + mov eax, dword [ebp+8] + mov ecx, dword [ebp+12] + push ebx + push esi + + lea edx, [ecx*4] + pxor xmm0, xmm0 + movdqa xmm2, xmm0 + + movdqa xmm0, [ecx+eax] + mov esi, eax + sub esi, edx + movdqa xmm3, [esi] + movdqa xmm5, [eax] + push edi + lea edi, [ecx+ecx] + lea ebx, [ecx+ecx*2] + mov dword [esp+640-600], edi + mov esi, eax + sub esi, edi + movdqa xmm1, [esi] + movdqa [esp+720-272], xmm0 + mov edi, eax + sub edi, ecx + movdqa xmm4, [edi] + add ecx, eax + mov dword [esp+640-596], ecx + + mov ecx, dword [esp+640-600] + movdqa xmm0, [ecx+eax] + movdqa [esp+736-272], xmm0 + + movdqa xmm0, [eax+ebx] + mov edx, eax + sub edx, ebx + + movsx ebx, word [ebp+16] + movdqa xmm6, [edx] + add ecx, eax + movdqa [esp+752-272], xmm0 + movd xmm0, ebx + + movsx ebx, word [ebp+20] + movdqa xmm7, xmm0 + punpcklwd xmm7, xmm0 + pshufd xmm0, xmm7, 0 + movdqa [esp+640-320], xmm0 + movd xmm0, ebx + movdqa xmm7, xmm0 + punpcklwd xmm7, xmm0 + pshufd xmm0, xmm7, 0 + + movdqa xmm7, [esp+736-272] + punpcklbw xmm7, xmm2 + movdqa [esp+640-416], xmm7 + movdqa [esp+640-512], xmm0 + movdqa xmm0, xmm1 + movdqa [esp+672-272], xmm1 + movdqa xmm1, xmm4 + movdqa [esp+704-272], xmm5 + punpcklbw xmm5, xmm2 + punpcklbw xmm1, xmm2 + + movdqa xmm7, xmm5 + psubw xmm7, xmm1 + pabsw xmm7, xmm7 + movdqa [esp+640-560], xmm7 + punpcklbw xmm0, xmm2 + movdqa [esp+688-272], xmm4 + movdqa xmm4, [esp+720-272] + movdqa [esp+640-480], xmm0 + + movdqa xmm7, xmm1 + psubw xmm7, xmm0 + + movdqa xmm0, [esp+640-512] + pabsw xmm7, xmm7 + punpcklbw xmm4, xmm2 + pcmpgtw xmm0, xmm7 + movdqa [esp+640-384], xmm4 + movdqa xmm7, xmm5 + psubw xmm7, xmm4 + movdqa xmm4, [esp+640-512] + movdqa [esp+656-272], xmm6 + punpcklbw xmm6, xmm2 + pabsw xmm7, xmm7 + movdqa [esp+640-48], xmm2 + movdqa [esp+640-368], xmm6 + movdqa [esp+640-144], xmm1 + movdqa [esp+640-400], xmm5 + pcmpgtw xmm4, xmm7 + pand xmm0, xmm4 + movdqa xmm4, [esp+640-320] + pcmpgtw xmm4, [esp+640-560] + pand xmm0, xmm4 + + mov ebx, 2 + movsx ebx, bx + movd xmm4, ebx + movdqa xmm7, xmm4 + punpcklwd xmm7, xmm4 + movdqa xmm4, [esp+640-320] + psraw xmm4, 2 + pshufd xmm7, xmm7, 0 + paddw xmm4, xmm7 + movdqa 
[esp+640-576], xmm4 + pcmpgtw xmm4, [esp+640-560] + movdqa [esp+640-560], xmm4 + + movdqa xmm4, [esp+640-512] + movdqa [esp+640-624], xmm7 + movdqa xmm7, xmm1 + psubw xmm7, xmm6 + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 + + pand xmm4, [esp+640-560] + movdqa [esp+640-544], xmm4 + movdqa xmm4, [esp+640-512] + movdqa xmm7, xmm5 + psubw xmm7, [esp+640-416] + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 + + pand xmm4, [esp+640-560] + movdqa [esp+640-560], xmm4 + + movdqa xmm4, [esp+640-544] + pandn xmm4, xmm6 + movdqa [esp+640-16], xmm4 + mov ebx, 4 + movsx ebx, bx + movd xmm4, ebx + movdqa xmm7, xmm4 + punpcklwd xmm7, xmm4 + movdqa xmm4, xmm3 + punpcklbw xmm4, xmm2 + psllw xmm4, 1 + paddw xmm4, xmm6 + paddw xmm4, xmm6 + paddw xmm4, xmm6 + paddw xmm4, [esp+640-480] + + movdqa xmm6, [esp+640-560] + pshufd xmm7, xmm7, 0 + paddw xmm4, xmm1 + movdqa [esp+640-592], xmm7 + paddw xmm4, xmm5 + paddw xmm4, xmm7 + movdqa xmm7, [esp+640-416] + pandn xmm6, xmm7 + movdqa [esp+640-80], xmm6 + movdqa xmm6, [esp+752-272] + punpcklbw xmm6, xmm2 + psllw xmm6, 1 + paddw xmm6, xmm7 + paddw xmm6, xmm7 + paddw xmm6, xmm7 + paddw xmm6, [esp+640-384] + + movdqa xmm7, [esp+640-480] + paddw xmm6, xmm5 + paddw xmm6, xmm1 + paddw xmm6, [esp+640-592] + psraw xmm6, 3 + pand xmm6, [esp+640-560] + movdqa [esp+640-112], xmm6 + movdqa xmm6, [esp+640-544] + pandn xmm6, xmm7 + movdqa [esp+640-336], xmm6 + movdqa xmm6, [esp+640-544] + movdqa [esp+640-528], xmm6 + movdqa xmm6, [esp+640-368] + paddw xmm6, xmm7 + movdqa xmm7, xmm1 + psraw xmm4, 3 + pand xmm4, [esp+640-544] + paddw xmm7, xmm5 + paddw xmm6, xmm7 + paddw xmm6, [esp+640-624] + movdqa xmm7, [esp+640-528] + + paddw xmm5, xmm1 + psraw xmm6, 2 + pand xmm7, xmm6 + + movdqa xmm6, [esp+640-384] + movdqa [esp+640-64], xmm7 + movdqa xmm7, [esp+640-560] + pandn xmm7, xmm6 + movdqa [esp+640-304], xmm7 + movdqa xmm7, [esp+640-560] + movdqa [esp+640-528], xmm7 + movdqa xmm7, [esp+640-416] + paddw xmm7, xmm6 + paddw xmm7, xmm5 + paddw xmm7, [esp+640-624] + movdqa xmm5, [esp+640-528] + psraw xmm7, 2 + pand xmm5, xmm7 + movdqa [esp+640-32], xmm5 + + movdqa xmm5, [esp+640-544] + movdqa [esp+640-528], xmm5 + movdqa xmm5, [esp+640-480] + movdqa xmm7, xmm5 + paddw xmm7, xmm5 + movdqa xmm5, xmm1 + paddw xmm5, xmm6 + paddw xmm6, [esp+640-592] + paddw xmm7, xmm5 + paddw xmm7, [esp+640-624] + movdqa xmm5, [esp+640-528] + psraw xmm7, 2 + pandn xmm5, xmm7 + movdqa xmm7, [esp+640-480] + paddw xmm7, xmm1 + paddw xmm7, [esp+640-400] + movdqa xmm1, [esp+640-544] + movdqa [esp+640-352], xmm5 + movdqa xmm5, [esp+640-368] + psllw xmm7, 1 + paddw xmm7, xmm6 + paddw xmm5, xmm7 + + movdqa xmm7, [esp+640-400] + psraw xmm5, 3 + pand xmm1, xmm5 + movdqa xmm5, [esp+640-480] + movdqa [esp+640-96], xmm1 + movdqa xmm1, [esp+640-560] + movdqa [esp+640-528], xmm1 + movdqa xmm1, [esp+640-384] + movdqa xmm6, xmm1 + paddw xmm6, xmm1 + paddw xmm1, [esp+640-400] + paddw xmm1, [esp+640-144] + paddw xmm7, xmm5 + paddw xmm5, [esp+640-592] + paddw xmm6, xmm7 + paddw xmm6, [esp+640-624] + movdqa xmm7, [esp+640-528] + psraw xmm6, 2 + psllw xmm1, 1 + paddw xmm1, xmm5 + + movdqa xmm5, [esp+656-272] + pandn xmm7, xmm6 + movdqa xmm6, [esp+640-416] + paddw xmm6, xmm1 + movdqa xmm1, [esp+640-560] + psraw xmm6, 3 + pand xmm1, xmm6 + + movdqa xmm6, [esp+704-272] + movdqa [esp+640-128], xmm1 + movdqa xmm1, [esp+672-272] + punpckhbw xmm1, xmm2 + movdqa [esp+640-448], xmm1 + movdqa xmm1, [esp+688-272] + punpckhbw xmm1, xmm2 + punpckhbw xmm6, xmm2 + movdqa [esp+640-288], xmm7 + punpckhbw xmm5, xmm2 + movdqa [esp+640-496], xmm1 + movdqa 
[esp+640-432], xmm6 + + movdqa xmm7, [esp+720-272] + punpckhbw xmm7, xmm2 + movdqa [esp+640-464], xmm7 + + movdqa xmm7, [esp+736-272] + punpckhbw xmm7, xmm2 + movdqa [esp+640-528], xmm7 + + movdqa xmm7, xmm6 + + psubw xmm6, [esp+640-464] + psubw xmm7, xmm1 + pabsw xmm7, xmm7 + movdqa [esp+640-560], xmm7 + por xmm4, [esp+640-16] + pabsw xmm6, xmm6 + movdqa xmm7, xmm1 + psubw xmm7, [esp+640-448] + + movdqa xmm1, [esp+640-512] + pabsw xmm7, xmm7 + pcmpgtw xmm1, xmm7 + movdqa xmm7, [esp+640-512] + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+640-320] + pand xmm1, xmm7 + movdqa xmm7, [esp+640-560] + pcmpgtw xmm6, xmm7 + pand xmm1, xmm6 + + movdqa xmm6, [esp+640-576] + pcmpgtw xmm6, xmm7 + + movdqa xmm7, [esp+640-496] + punpckhbw xmm3, xmm2 + movdqa [esp+640-560], xmm6 + movdqa xmm6, [esp+640-512] + psubw xmm7, xmm5 + pabsw xmm7, xmm7 + pcmpgtw xmm6, xmm7 + + pand xmm6, [esp+640-560] + movdqa xmm7, [esp+640-432] + psubw xmm7, [esp+640-528] + + psllw xmm3, 1 + movdqa [esp+640-544], xmm6 + movdqa xmm6, [esp+640-512] + + movdqa xmm2, [esp+640-544] + paddw xmm3, xmm5 + paddw xmm3, xmm5 + paddw xmm3, xmm5 + paddw xmm3, [esp+640-448] + paddw xmm3, [esp+640-496] + pabsw xmm7, xmm7 + pcmpgtw xmm6, xmm7 + pand xmm6, [esp+640-560] + movdqa [esp+640-560], xmm6 + + movdqa xmm6, xmm0 + pand xmm6, xmm4 + movdqa xmm4, xmm0 + pandn xmm4, [esp+640-368] + por xmm6, xmm4 + movdqa xmm4, [esp+640-432] + paddw xmm3, xmm4 + paddw xmm3, [esp+640-592] + psraw xmm3, 3 + pand xmm3, xmm2 + pandn xmm2, xmm5 + por xmm3, xmm2 + movdqa xmm7, xmm1 + pand xmm7, xmm3 + movdqa xmm3, [esp+640-64] + por xmm3, [esp+640-336] + movdqa xmm2, xmm1 + pandn xmm2, xmm5 + por xmm7, xmm2 + + movdqa xmm2, xmm0 + pand xmm2, xmm3 + movdqa xmm3, xmm0 + pandn xmm3, [esp+640-480] + por xmm2, xmm3 + packuswb xmm6, xmm7 + movdqa [esp+640-336], xmm2 + movdqa [esp+656-272], xmm6 + movdqa xmm6, [esp+640-544] + movdqa xmm2, xmm5 + paddw xmm2, [esp+640-448] + movdqa xmm3, xmm1 + movdqa xmm7, [esp+640-496] + paddw xmm7, xmm4 + paddw xmm2, xmm7 + paddw xmm2, [esp+640-624] + movdqa xmm7, [esp+640-544] + psraw xmm2, 2 + pand xmm6, xmm2 + movdqa xmm2, [esp+640-448] + pandn xmm7, xmm2 + por xmm6, xmm7 + pand xmm3, xmm6 + movdqa xmm6, xmm1 + pandn xmm6, xmm2 + paddw xmm2, [esp+640-496] + paddw xmm2, xmm4 + por xmm3, xmm6 + movdqa xmm6, [esp+640-336] + packuswb xmm6, xmm3 + psllw xmm2, 1 + movdqa [esp+672-272], xmm6 + movdqa xmm6, [esp+640-96] + por xmm6, [esp+640-352] + + movdqa xmm3, xmm0 + pand xmm3, xmm6 + movdqa xmm6, xmm0 + pandn xmm6, [esp+640-144] + por xmm3, xmm6 + movdqa xmm6, [esp+640-544] + movdqa [esp+640-352], xmm3 + movdqa xmm3, [esp+640-464] + paddw xmm3, [esp+640-592] + paddw xmm2, xmm3 + movdqa xmm3, [esp+640-448] + paddw xmm5, xmm2 + movdqa xmm2, [esp+640-496] + psraw xmm5, 3 + pand xmm6, xmm5 + movdqa xmm5, [esp+640-464] + paddw xmm2, xmm5 + paddw xmm5, [esp+640-432] + movdqa xmm4, xmm3 + paddw xmm4, xmm3 + paddw xmm4, xmm2 + paddw xmm4, [esp+640-624] + movdqa xmm2, [esp+640-544] + paddw xmm3, [esp+640-592] + psraw xmm4, 2 + pandn xmm2, xmm4 + por xmm6, xmm2 + movdqa xmm7, xmm1 + pand xmm7, xmm6 + movdqa xmm6, [esp+640-496] + movdqa xmm2, xmm1 + pandn xmm2, xmm6 + por xmm7, xmm2 + movdqa xmm2, [esp+640-352] + packuswb xmm2, xmm7 + movdqa [esp+688-272], xmm2 + movdqa xmm2, [esp+640-128] + por xmm2, [esp+640-288] + + movdqa xmm4, xmm0 + pand xmm4, xmm2 + paddw xmm5, xmm6 + movdqa xmm2, xmm0 + pandn xmm2, [esp+640-400] + por xmm4, xmm2 + movdqa xmm2, [esp+640-528] + psllw xmm5, 1 + paddw xmm5, xmm3 + movdqa xmm3, [esp+640-560] + paddw xmm2, xmm5 + 
psraw xmm2, 3 + movdqa [esp+640-288], xmm4 + movdqa xmm4, [esp+640-560] + pand xmm4, xmm2 + movdqa xmm2, [esp+640-464] + movdqa xmm5, xmm2 + paddw xmm5, xmm2 + movdqa xmm2, [esp+640-432] + paddw xmm2, [esp+640-448] + movdqa xmm7, xmm1 + paddw xmm5, xmm2 + paddw xmm5, [esp+640-624] + movdqa xmm6, [esp+640-560] + psraw xmm5, 2 + pandn xmm3, xmm5 + por xmm4, xmm3 + movdqa xmm3, [esp+640-32] + por xmm3, [esp+640-304] + pand xmm7, xmm4 + movdqa xmm4, [esp+640-432] + movdqa xmm5, [esp+640-464] + movdqa xmm2, xmm1 + pandn xmm2, xmm4 + paddw xmm4, [esp+640-496] + por xmm7, xmm2 + movdqa xmm2, [esp+640-288] + packuswb xmm2, xmm7 + movdqa [esp+704-272], xmm2 + + movdqa xmm2, xmm0 + pand xmm2, xmm3 + movdqa xmm3, xmm0 + pandn xmm3, [esp+640-384] + por xmm2, xmm3 + movdqa [esp+640-304], xmm2 + movdqa xmm2, [esp+640-528] + movdqa xmm3, xmm2 + paddw xmm3, [esp+640-464] + paddw xmm3, xmm4 + paddw xmm3, [esp+640-624] + psraw xmm3, 2 + pand xmm6, xmm3 + movdqa xmm3, [esp+640-560] + movdqa xmm4, xmm3 + pandn xmm4, xmm5 + por xmm6, xmm4 + movdqa xmm7, xmm1 + pand xmm7, xmm6 + movdqa xmm6, [esp+640-304] + movdqa xmm4, xmm1 + pandn xmm4, xmm5 + por xmm7, xmm4 + + movdqa xmm4, xmm0 + pandn xmm0, [esp+640-416] + packuswb xmm6, xmm7 + movdqa xmm7, [esp+640-112] + por xmm7, [esp+640-80] + pand xmm4, xmm7 + por xmm4, xmm0 + movdqa xmm0, [esp+752-272] + punpckhbw xmm0, [esp+640-48] + psllw xmm0, 1 + paddw xmm0, xmm2 + paddw xmm0, xmm2 + paddw xmm0, xmm2 + paddw xmm0, xmm5 + paddw xmm0, [esp+640-432] + paddw xmm0, [esp+640-496] + paddw xmm0, [esp+640-592] + psraw xmm0, 3 + pand xmm0, xmm3 + movdqa xmm7, xmm1 + pandn xmm3, xmm2 + por xmm0, xmm3 + pand xmm7, xmm0 + + movdqa xmm0, [esp+656-272] + movdqa [edx], xmm0 + + movdqa xmm0, [esp+672-272] + + mov edx, dword [esp+640-596] + movdqa [esi], xmm0 + movdqa xmm0, [esp+688-272] + movdqa [edi], xmm0 + movdqa xmm0, [esp+704-272] + + pop edi + pandn xmm1, xmm2 + movdqa [eax], xmm0 + por xmm7, xmm1 + pop esi + packuswb xmm4, xmm7 + movdqa [edx], xmm6 + movdqa [ecx], xmm4 + pop ebx + mov esp, ebp + pop ebp + ret + + +;******************************************************************************** +; +; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst); +; +;******************************************************************************** + +WELS_EXTERN DeblockLumaTransposeH2V_sse2 + +ALIGN 16 + +DeblockLumaTransposeH2V_sse2: + push ebp + push ebx + mov ebp, esp + and esp,0FFFFFFF0h + sub esp, 10h + + mov eax, [ebp + 0Ch] + mov ecx, [ebp + 10h] + lea edx, [eax + ecx * 8] + lea ebx, [ecx*3] + + movq xmm0, [eax] + movq xmm7, [edx] + punpcklqdq xmm0, xmm7 + movq xmm1, [eax + ecx] + movq xmm7, [edx + ecx] + punpcklqdq xmm1, xmm7 + movq xmm2, [eax + ecx*2] + movq xmm7, [edx + ecx*2] + punpcklqdq xmm2, xmm7 + movq xmm3, [eax + ebx] + movq xmm7, [edx + ebx] + punpcklqdq xmm3, xmm7 + + lea eax, [eax + ecx * 4] + lea edx, [edx + ecx * 4] + movq xmm4, [eax] + movq xmm7, [edx] + punpcklqdq xmm4, xmm7 + movq xmm5, [eax + ecx] + movq xmm7, [edx + ecx] + punpcklqdq xmm5, xmm7 + movq xmm6, [eax + ecx*2] + movq xmm7, [edx + ecx*2] + punpcklqdq xmm6, xmm7 + + movdqa [esp], xmm0 + movq xmm7, [eax + ebx] + movq xmm0, [edx + ebx] + punpcklqdq xmm7, xmm0 + movdqa xmm0, [esp] + + SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] + ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 + + mov eax, [ebp + 14h] + movdqa [eax], xmm4 + movdqa [eax + 10h], xmm2 + movdqa [eax + 20h], xmm3 + movdqa [eax + 30h], xmm7 + movdqa [eax + 40h], xmm5 + movdqa [eax + 50h], 
xmm1 + movdqa [eax + 60h], xmm6 + movdqa [eax + 70h], xmm0 + + mov esp, ebp + pop ebx + pop ebp + ret + + + +;******************************************************************************************* +; +; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc); +; +;******************************************************************************************* + +WELS_EXTERN DeblockLumaTransposeV2H_sse2 + +ALIGN 16 + +DeblockLumaTransposeV2H_sse2: + push ebp + mov ebp, esp + + and esp, 0FFFFFFF0h + sub esp, 10h + + mov eax, [ebp + 10h] + mov ecx, [ebp + 0Ch] + mov edx, [ebp + 08h] + + movdqa xmm0, [eax] + movdqa xmm1, [eax + 10h] + movdqa xmm2, [eax + 20h] + movdqa xmm3, [eax + 30h] + movdqa xmm4, [eax + 40h] + movdqa xmm5, [eax + 50h] + movdqa xmm6, [eax + 60h] + movdqa xmm7, [eax + 70h] + + SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] + ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 + + lea eax, [ecx * 3] + + movq [edx], xmm4 + movq [edx + ecx], xmm2 + movq [edx + ecx*2], xmm3 + movq [edx + eax], xmm7 + + lea edx, [edx + ecx*4] + movq [edx], xmm5 + movq [edx + ecx], xmm1 + movq [edx + ecx*2], xmm6 + movq [edx + eax], xmm0 + + psrldq xmm4, 8 + psrldq xmm2, 8 + psrldq xmm3, 8 + psrldq xmm7, 8 + psrldq xmm5, 8 + psrldq xmm1, 8 + psrldq xmm6, 8 + psrldq xmm0, 8 + + lea edx, [edx + ecx*4] + movq [edx], xmm4 + movq [edx + ecx], xmm2 + movq [edx + ecx*2], xmm3 + movq [edx + eax], xmm7 + + lea edx, [edx + ecx*4] + movq [edx], xmm5 + movq [edx + ecx], xmm1 + movq [edx + ecx*2], xmm6 + movq [edx + eax], xmm0 + + + mov esp, ebp + pop ebp ret \ No newline at end of file diff --git a/codec/encoder/core/asm/mc_chroma.asm b/codec/encoder/core/asm/mc_chroma.asm index 740063e4..649fc735 100644 --- a/codec/encoder/core/asm/mc_chroma.asm +++ b/codec/encoder/core/asm/mc_chroma.asm @@ -1,317 +1,317 @@ -;*! -;* \copy -;* Copyright (c) 2004-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. 
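That closes the encoder's deblock.asm. Before the mc_chroma.asm hunks, three notes on the luma routines that just ended. First, DeblockLumaLt4V_sse2 is the luma weak (bS < 4) filter: six lines around the edge are loaded (pPix-3*iStride through pPix+2*iStride), widened to 16-bit, and a clipped delta is applied to p0/q0, with p1/q1 additionally filtered only where |p2-p0| (resp. |q2-q0|) stays under beta; the pavgw in the body is the (p0+q0+1)>>1 term of the spec. A scalar sketch, reusing clip3/clip_u8/abs from the chroma sketch above, with spec-style names of our own choosing:

    /* Weak luma filtering at one position p2 p1 p0 | q0 q1 q2. */
    static void luma_weak_filter(uint8_t px[6], int tc0, int alpha, int beta)
    {
        int p2 = px[0], p1 = px[1], p0 = px[2], q0 = px[3], q1 = px[4], q2 = px[5];
        if (tc0 < 0) return;
        if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
            return;
        int fp1 = abs(p2 - p0) < beta;            /* filter p1 as well? */
        int fq1 = abs(q2 - q0) < beta;            /* filter q1 as well? */
        int tc  = tc0 + fp1 + fq1;                /* tc grows with each extra tap */
        int avg = (p0 + q0 + 1) >> 1;             /* the pavgw term */
        int delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
        px[2] = clip_u8(p0 + delta);
        px[3] = clip_u8(q0 - delta);
        if (fp1) px[1] = clip_u8(p1 + clip3(-tc0, tc0, (p2 + avg - 2 * p1) >> 1));
        if (fq1) px[4] = clip_u8(q1 + clip3(-tc0, tc0, (q2 + avg - 2 * q1) >> 1));
    }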
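Second, DeblockLumaEq4V_sse2 is the bS == 4 strong filter. The `mov ebx, 2` / `psraw xmm4, 2` / `paddw xmm4, xmm7` sequence in its body builds the spec's (iAlpha >> 2) + 2 threshold: only where |p0-q0| clears it, and |p2-p0| < beta, are three pixels per side replaced with the long 3/4/5-tap averages; otherwise p0 gets a light 3-tap filter. The p side in scalar form (q mirrors it; names again ours, not the project's):

    /* Strong (bS == 4) filtering of the p side; px holds p3 p2 p1 p0 q0 q1. */
    static void luma_strong_filter_p(uint8_t px[6], int alpha, int beta)
    {
        int p3 = px[0], p2 = px[1], p1 = px[2], p0 = px[3], q0 = px[4], q1 = px[5];
        if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
            return;
        if (abs(p0 - q0) < (alpha >> 2) + 2 && abs(p2 - p0) < beta) {
            px[3] = (uint8_t)((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3); /* p0' */
            px[2] = (uint8_t)((p2 + p1 + p0 + q0 + 2) >> 2);                  /* p1' */
            px[1] = (uint8_t)((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3);     /* p2' */
        } else {
            px[3] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2);  /* light 3-tap fallback */
        }
    }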
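Third, DeblockLumaTransposeH2V_sse2 and DeblockLumaTransposeV2H_sse2 exist so one filtering kernel can serve both edge orientations: gather the cross-edge tile, transpose it with the SSE2_TransTwo8x8B macro, filter, transpose back. Judging from the movq loads at eax and edx = eax + 8*stride, the data movement is a 16x8 byte tile turned into 8 rows of 16, modeled in plain C (our model, not the project's code):

    /* C model of the H2V gather: 16 source rows of 8 bytes become
     * 8 contiguous rows of 16 bytes in pDst. */
    static void transpose_16x8(const uint8_t *src, int stride, uint8_t dst[8][16]) {
        for (int r = 0; r < 16; ++r)
            for (int c = 0; c < 8; ++c)
                dst[c][r] = src[r * stride + c];
    }

V2H performs the inverse scatter, writing each 16-byte register back as two 8-byte columns via movq plus psrldq, as seen above.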
-;* -;* -;* mc_chroma.asm -;* -;* Abstract -;* mmx motion compensation for chroma -;* -;* History -;* 10/13/2004 Created -;* -;* -;*************************************************************************/ -%include "asm_inc.asm" - -BITS 32 - -;*********************************************************************** -; Local Data (Read Only) -;*********************************************************************** - -SECTION .rodata align=16 - -;*********************************************************************** -; Various memory constants (trigonometric values or rounding values) -;*********************************************************************** - -ALIGN 16 -h264_d0x20_sse2: - dw 32,32,32,32,32,32,32,32 -ALIGN 16 -h264_d0x20_mmx: - dw 32,32,32,32 - - -;============================================================================= -; Code -;============================================================================= - -SECTION .text - -ALIGN 16 -;******************************************************************************* -; void McChromaWidthEq4_mmx( uint8_t *src, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iHeigh ); -;******************************************************************************* -WELS_EXTERN McChromaWidthEq4_mmx -McChromaWidthEq4_mmx: - push esi - push edi - push ebx - - mov eax, [esp +12 + 20] - movd mm3, [eax] - WELS_Zero mm7 - punpcklbw mm3, mm3 - movq mm4, mm3 - punpcklwd mm3, mm3 - punpckhwd mm4, mm4 - - movq mm5, mm3 - punpcklbw mm3, mm7 - punpckhbw mm5, mm7 - - movq mm6, mm4 - punpcklbw mm4, mm7 - punpckhbw mm6, mm7 - - mov esi, [esp +12+ 4] - mov eax, [esp + 12 + 8] - mov edi, [esp + 12 + 12] - mov edx, [esp + 12 + 16] - mov ecx, [esp + 12 + 24] - - lea ebx, [esi + eax] - movd mm0, [esi] - movd mm1, [esi+1] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 -.xloop: - - pmullw mm0, mm3 - pmullw mm1, mm5 - paddw mm0, mm1 - - movd mm1, [ebx] - punpcklbw mm1, mm7 - movq mm2, mm1 - pmullw mm1, mm4 - paddw mm0, mm1 - - movd mm1, [ebx+1] - punpcklbw mm1, mm7 - movq mm7, mm1 - pmullw mm1,mm6 - paddw mm0, mm1 - movq mm1,mm7 - - paddw mm0, [h264_d0x20_mmx] - psrlw mm0, 6 - - WELS_Zero mm7 - packuswb mm0, mm7 - movd [edi], mm0 - - movq mm0, mm2 - - lea edi, [edi +edx ] - lea ebx, [ebx + eax] - - dec ecx - jnz near .xloop - WELSEMMS - pop ebx - pop edi - pop esi - ret - - -ALIGN 16 -;******************************************************************************* -; void McChromaWidthEq8_sse2( uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iheigh ); -;******************************************************************************* -WELS_EXTERN McChromaWidthEq8_sse2 -McChromaWidthEq8_sse2: - push esi - push edi - push ebx - - mov eax, [esp +12 + 20] - movd xmm3, [eax] - WELS_Zero xmm7 - punpcklbw xmm3, xmm3 - punpcklwd xmm3, xmm3 - - movdqa xmm4, xmm3 - punpckldq xmm3, xmm3 - punpckhdq xmm4, xmm4 - movdqa xmm5, xmm3 - movdqa xmm6, xmm4 - - punpcklbw xmm3, xmm7 - punpckhbw xmm5, xmm7 - punpcklbw xmm4, xmm7 - punpckhbw xmm6, xmm7 - - mov esi, [esp +12+ 4] - mov eax, [esp + 12 + 8] - mov edi, [esp + 12 + 12] - mov edx, [esp + 12 + 16] - mov ecx, [esp + 12 + 24] - - lea ebx, [esi + eax] - movq xmm0, [esi] - movq xmm1, [esi+1] - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 -.xloop: - - pmullw xmm0, xmm3 - pmullw xmm1, xmm5 - paddw xmm0, xmm1 - - movq xmm1, [ebx] - punpcklbw xmm1, xmm7 - movdqa xmm2, xmm1 - pmullw xmm1, xmm4 - paddw xmm0, xmm1 - - movq xmm1, [ebx+1] - 
punpcklbw xmm1, xmm7 - movdqa xmm7, xmm1 - pmullw xmm1, xmm6 - paddw xmm0, xmm1 - movdqa xmm1,xmm7 - - paddw xmm0, [h264_d0x20_sse2] - psrlw xmm0, 6 - - WELS_Zero xmm7 - packuswb xmm0, xmm7 - movq [edi], xmm0 - - movdqa xmm0, xmm2 - - lea edi, [edi +edx ] - lea ebx, [ebx + eax] - - dec ecx - jnz near .xloop - - pop ebx - pop edi - pop esi - ret - - - - -ALIGN 16 -;*********************************************************************** -; void McChromaWidthEq8_ssse3( uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; uint8_t *pABCD, -; int32_t iHeigh); -;*********************************************************************** -WELS_EXTERN McChromaWidthEq8_ssse3 -McChromaWidthEq8_ssse3: - push ebx - push esi - push edi - - mov eax, [esp + 12 + 20] - - pxor xmm7, xmm7 - movd xmm5, [eax] - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - movdqa xmm6, xmm5 - punpcklqdq xmm5, xmm5 - punpckhqdq xmm6, xmm6 - - mov eax, [esp + 12 + 4] - mov edx, [esp + 12 + 8] - mov esi, [esp + 12 + 12] - mov edi, [esp + 12 + 16] - mov ecx, [esp + 12 + 24] - - sub esi, edi - sub esi, edi - movdqa xmm7, [h264_d0x20_sse2] - - movdqu xmm0, [eax] - movdqa xmm1, xmm0 - psrldq xmm1, 1 - punpcklbw xmm0, xmm1 - -.hloop_chroma: - lea esi, [esi+2*edi] - - movdqu xmm2, [eax+edx] - movdqa xmm3, xmm2 - psrldq xmm3, 1 - punpcklbw xmm2, xmm3 - movdqa xmm4, xmm2 - - pmaddubsw xmm0, xmm5 - pmaddubsw xmm2, xmm6 - paddw xmm0, xmm2 - paddw xmm0, xmm7 - psrlw xmm0, 6 - packuswb xmm0, xmm0 - movq [esi],xmm0 - - lea eax, [eax+2*edx] - movdqu xmm2, [eax] - movdqa xmm3, xmm2 - psrldq xmm3, 1 - punpcklbw xmm2, xmm3 - movdqa xmm0, xmm2 - - pmaddubsw xmm4, xmm5 - pmaddubsw xmm2, xmm6 - paddw xmm4, xmm2 - paddw xmm4, xmm7 - psrlw xmm4, 6 - packuswb xmm4, xmm4 - movq [esi+edi],xmm4 - - sub ecx, 2 - jnz .hloop_chroma - pop edi - pop esi - pop ebx - - ret - - +;*! +;* \copy +;* Copyright (c) 2004-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. 
+;* +;* +;* mc_chroma.asm +;* +;* Abstract +;* mmx motion compensation for chroma +;* +;* History +;* 10/13/2004 Created +;* +;* +;*************************************************************************/ +%include "asm_inc.asm" + +BITS 32 + +;*********************************************************************** +; Local Data (Read Only) +;*********************************************************************** + +SECTION .rodata align=16 + +;*********************************************************************** +; Various memory constants (trigonometric values or rounding values) +;*********************************************************************** + +ALIGN 16 +h264_d0x20_sse2: + dw 32,32,32,32,32,32,32,32 +ALIGN 16 +h264_d0x20_mmx: + dw 32,32,32,32 + + +;============================================================================= +; Code +;============================================================================= + +SECTION .text + +ALIGN 16 +;******************************************************************************* +; void McChromaWidthEq4_mmx( uint8_t *src, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iHeigh ); +;******************************************************************************* +WELS_EXTERN McChromaWidthEq4_mmx +McChromaWidthEq4_mmx: + push esi + push edi + push ebx + + mov eax, [esp +12 + 20] + movd mm3, [eax] + WELS_Zero mm7 + punpcklbw mm3, mm3 + movq mm4, mm3 + punpcklwd mm3, mm3 + punpckhwd mm4, mm4 + + movq mm5, mm3 + punpcklbw mm3, mm7 + punpckhbw mm5, mm7 + + movq mm6, mm4 + punpcklbw mm4, mm7 + punpckhbw mm6, mm7 + + mov esi, [esp +12+ 4] + mov eax, [esp + 12 + 8] + mov edi, [esp + 12 + 12] + mov edx, [esp + 12 + 16] + mov ecx, [esp + 12 + 24] + + lea ebx, [esi + eax] + movd mm0, [esi] + movd mm1, [esi+1] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 +.xloop: + + pmullw mm0, mm3 + pmullw mm1, mm5 + paddw mm0, mm1 + + movd mm1, [ebx] + punpcklbw mm1, mm7 + movq mm2, mm1 + pmullw mm1, mm4 + paddw mm0, mm1 + + movd mm1, [ebx+1] + punpcklbw mm1, mm7 + movq mm7, mm1 + pmullw mm1,mm6 + paddw mm0, mm1 + movq mm1,mm7 + + paddw mm0, [h264_d0x20_mmx] + psrlw mm0, 6 + + WELS_Zero mm7 + packuswb mm0, mm7 + movd [edi], mm0 + + movq mm0, mm2 + + lea edi, [edi +edx ] + lea ebx, [ebx + eax] + + dec ecx + jnz near .xloop + WELSEMMS + pop ebx + pop edi + pop esi + ret + + +ALIGN 16 +;******************************************************************************* +; void McChromaWidthEq8_sse2( uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iheigh ); +;******************************************************************************* +WELS_EXTERN McChromaWidthEq8_sse2 +McChromaWidthEq8_sse2: + push esi + push edi + push ebx + + mov eax, [esp +12 + 20] + movd xmm3, [eax] + WELS_Zero xmm7 + punpcklbw xmm3, xmm3 + punpcklwd xmm3, xmm3 + + movdqa xmm4, xmm3 + punpckldq xmm3, xmm3 + punpckhdq xmm4, xmm4 + movdqa xmm5, xmm3 + movdqa xmm6, xmm4 + + punpcklbw xmm3, xmm7 + punpckhbw xmm5, xmm7 + punpcklbw xmm4, xmm7 + punpckhbw xmm6, xmm7 + + mov esi, [esp +12+ 4] + mov eax, [esp + 12 + 8] + mov edi, [esp + 12 + 12] + mov edx, [esp + 12 + 16] + mov ecx, [esp + 12 + 24] + + lea ebx, [esi + eax] + movq xmm0, [esi] + movq xmm1, [esi+1] + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 +.xloop: + + pmullw xmm0, xmm3 + pmullw xmm1, xmm5 + paddw xmm0, xmm1 + + movq xmm1, [ebx] + punpcklbw xmm1, xmm7 + movdqa xmm2, xmm1 + pmullw xmm1, xmm4 + paddw xmm0, xmm1 + + movq xmm1, [ebx+1] + 
punpcklbw xmm1, xmm7 + movdqa xmm7, xmm1 + pmullw xmm1, xmm6 + paddw xmm0, xmm1 + movdqa xmm1,xmm7 + + paddw xmm0, [h264_d0x20_sse2] + psrlw xmm0, 6 + + WELS_Zero xmm7 + packuswb xmm0, xmm7 + movq [edi], xmm0 + + movdqa xmm0, xmm2 + + lea edi, [edi +edx ] + lea ebx, [ebx + eax] + + dec ecx + jnz near .xloop + + pop ebx + pop edi + pop esi + ret + + + + +ALIGN 16 +;*********************************************************************** +; void McChromaWidthEq8_ssse3( uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; uint8_t *pABCD, +; int32_t iHeigh); +;*********************************************************************** +WELS_EXTERN McChromaWidthEq8_ssse3 +McChromaWidthEq8_ssse3: + push ebx + push esi + push edi + + mov eax, [esp + 12 + 20] + + pxor xmm7, xmm7 + movd xmm5, [eax] + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + movdqa xmm6, xmm5 + punpcklqdq xmm5, xmm5 + punpckhqdq xmm6, xmm6 + + mov eax, [esp + 12 + 4] + mov edx, [esp + 12 + 8] + mov esi, [esp + 12 + 12] + mov edi, [esp + 12 + 16] + mov ecx, [esp + 12 + 24] + + sub esi, edi + sub esi, edi + movdqa xmm7, [h264_d0x20_sse2] + + movdqu xmm0, [eax] + movdqa xmm1, xmm0 + psrldq xmm1, 1 + punpcklbw xmm0, xmm1 + +.hloop_chroma: + lea esi, [esi+2*edi] + + movdqu xmm2, [eax+edx] + movdqa xmm3, xmm2 + psrldq xmm3, 1 + punpcklbw xmm2, xmm3 + movdqa xmm4, xmm2 + + pmaddubsw xmm0, xmm5 + pmaddubsw xmm2, xmm6 + paddw xmm0, xmm2 + paddw xmm0, xmm7 + psrlw xmm0, 6 + packuswb xmm0, xmm0 + movq [esi],xmm0 + + lea eax, [eax+2*edx] + movdqu xmm2, [eax] + movdqa xmm3, xmm2 + psrldq xmm3, 1 + punpcklbw xmm2, xmm3 + movdqa xmm0, xmm2 + + pmaddubsw xmm4, xmm5 + pmaddubsw xmm2, xmm6 + paddw xmm4, xmm2 + paddw xmm4, xmm7 + psrlw xmm4, 6 + packuswb xmm4, xmm4 + movq [esi+edi],xmm4 + + sub ecx, 2 + jnz .hloop_chroma + pop edi + pop esi + pop ebx + + ret + + diff --git a/processing/build/linux/makefile b/processing/build/linux/makefile index db5b9e99..5297b662 100644 --- a/processing/build/linux/makefile +++ b/processing/build/linux/makefile @@ -1,94 +1,94 @@ -NASM = 1 -NAME = libwelsvp - -OUTDIR = ../../../bin/linux -BINDIR = ../../bin -OBJDIR = ../../obj -SRCDIRS = ../../src/asm \ - ../../src/common \ - ../../src/adaptivequantization \ - ../../src/backgounddetection \ - ../../src/denoise \ - ../../src/downsample \ - ../../src/scenechangedetection \ - ../../src/vaacalc \ - ../../src/complexityanalysis -SRCDIRS += ../../src/imagerotate - - -TARGETLIB = $(BINDIR)/$(NAME).so - -CC = $(shell which gcc) -AS = $(shell which nasm) -GCC = gcc -m32 - -CPPFLAGS = -Wall -g -O3 -ifeq ($(NASM), 1) -CPPFLAGS += -DX86_ASM -endif -ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/ -LDFLAGS = -lstdc++ -ldl - -SRCEXTS = .cpp -ifeq ($(NASM), 1) -SRCEXTS += .asm -endif -HDREXTS = .h -SOURCES = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS)))) -HEADERS = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS)))) -SRC_CPP = $(filter %.cpp,$(SOURCES)) -SRC_ASM = $(filter %.asm,$(SOURCES)) -OBJS = $(addsuffix .o, $(basename $(SOURCES))) -DEPS = $(OBJS:.o=.d) - -DEP_OPT = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \ - echo "-MM -MP"; else echo "-M"; fi ) -DEPEND_cpp.d = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS)) -DEPEND_asm.d = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS)) -COMPILE.cpp = $(GCC) $(CPPFLAGS) -c -COMPILE.asm = $(AS) $(ASMFLAGS) -LINK = $(GCC) $(LDFLAGS) - -.PHONY: all objs tags ctags clean distclean - -.SUFFIXES: - -all: $(TARGETLIB) - -%.d:%.cpp - @echo -n $(dir $<) > $@ - @$(DEPEND_cpp.d) $< 
>> $@ - -%.d:%.asm - @echo -n $(dir $<) > $@ - @$(DEPEND_asm.d) $< >> $@ - -objs:$(OBJS) - -%.o:%.cpp - $(COMPILE.cpp) $< -o $@ - -%.o:%.asm - $(COMPILE.asm) $< -o $@ - -tags: $(HEADERS) $(SOURCES) - etags $(HEADERS) $(SOURCES) - -ctags: $(HEADERS) $(SOURCES) - ctags $(HEADERS) $(SOURCES) - -$(TARGETLIB):$(OBJS) - @if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi - $(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@ - @echo produce the lib to $(TARGETLIB). - @if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi - @cp -f $(TARGETLIB) $(OUTDIR) - @cp -f $(TARGETLIB) ../../../testbin - @echo copy the lib to $(OUTDIR). - -clean: - rm -f $(OBJS) $(TARGETLIB) - -distclean: clean - rm -f $(DEPS) TAGS - +NASM = 1 +NAME = libwelsvp + +OUTDIR = ../../../bin/linux +BINDIR = ../../bin +OBJDIR = ../../obj +SRCDIRS = ../../src/asm \ + ../../src/common \ + ../../src/adaptivequantization \ + ../../src/backgounddetection \ + ../../src/denoise \ + ../../src/downsample \ + ../../src/scenechangedetection \ + ../../src/vaacalc \ + ../../src/complexityanalysis +SRCDIRS += ../../src/imagerotate + + +TARGETLIB = $(BINDIR)/$(NAME).so + +CC = $(shell which gcc) +AS = $(shell which nasm) +GCC = gcc -m32 + +CPPFLAGS = -Wall -g -O3 +ifeq ($(NASM), 1) +CPPFLAGS += -DX86_ASM +endif +ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/ +LDFLAGS = -lstdc++ -ldl + +SRCEXTS = .cpp +ifeq ($(NASM), 1) +SRCEXTS += .asm +endif +HDREXTS = .h +SOURCES = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS)))) +HEADERS = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS)))) +SRC_CPP = $(filter %.cpp,$(SOURCES)) +SRC_ASM = $(filter %.asm,$(SOURCES)) +OBJS = $(addsuffix .o, $(basename $(SOURCES))) +DEPS = $(OBJS:.o=.d) + +DEP_OPT = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \ + echo "-MM -MP"; else echo "-M"; fi ) +DEPEND_cpp.d = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS)) +DEPEND_asm.d = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS)) +COMPILE.cpp = $(GCC) $(CPPFLAGS) -c +COMPILE.asm = $(AS) $(ASMFLAGS) +LINK = $(GCC) $(LDFLAGS) + +.PHONY: all objs tags ctags clean distclean + +.SUFFIXES: + +all: $(TARGETLIB) + +%.d:%.cpp + @echo -n $(dir $<) > $@ + @$(DEPEND_cpp.d) $< >> $@ + +%.d:%.asm + @echo -n $(dir $<) > $@ + @$(DEPEND_asm.d) $< >> $@ + +objs:$(OBJS) + +%.o:%.cpp + $(COMPILE.cpp) $< -o $@ + +%.o:%.asm + $(COMPILE.asm) $< -o $@ + +tags: $(HEADERS) $(SOURCES) + etags $(HEADERS) $(SOURCES) + +ctags: $(HEADERS) $(SOURCES) + ctags $(HEADERS) $(SOURCES) + +$(TARGETLIB):$(OBJS) + @if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi + $(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@ + @echo produce the lib to $(TARGETLIB). + @if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi + @cp -f $(TARGETLIB) $(OUTDIR) + @cp -f $(TARGETLIB) ../../../testbin + @echo copy the lib to $(OUTDIR). + +clean: + rm -f $(OBJS) $(TARGETLIB) + +distclean: clean + rm -f $(DEPS) TAGS + diff --git a/processing/src/asm/denoisefilter.asm b/processing/src/asm/denoisefilter.asm index 3adc0cdd..968214d8 100644 --- a/processing/src/asm/denoisefilter.asm +++ b/processing/src/asm/denoisefilter.asm @@ -1,263 +1,263 @@ -;*! -;* \copy -;* Copyright (c) 2010-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. 
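With the makefile hunks done, one note on the mc_chroma.asm diff above them: McChromaWidthEq4_mmx, McChromaWidthEq8_sse2 and McChromaWidthEq8_ssse3 all evaluate the standard H.264 bilinear chroma interpolation. pABCD carries the four weights (the punpcklbw/punpcklwd fan-out splits them into A, B, C, D lanes), h264_d0x20 is the +32 rounding term, and psrlw 6 divides by the weight sum of 64. A scalar reference, with the A..D byte order inferred from that register fan-out:

    #include <stdint.h>

    /* out = (A*s00 + B*s01 + C*s10 + D*s11 + 32) >> 6, where A = (8-dx)(8-dy),
     * B = dx(8-dy), C = (8-dx)dy, D = dx*dy, so A+B+C+D == 64. */
    static void mc_chroma_ref(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride,
                              const uint8_t abcd[4], int width, int height)
    {
        int A = abcd[0], B = abcd[1], C = abcd[2], D = abcd[3];
        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < width; ++x)
                dst[x] = (uint8_t)((A * src[x] + B * src[x + 1] +
                                    C * src[src_stride + x] +
                                    D * src[src_stride + x + 1] + 32) >> 6);
            src += src_stride;
            dst += dst_stride;
        }
    }

Because the weights sum to 64, the accumulator never exceeds 64*255 + 32 = 16352, which is why the SIMD versions can stay in signed 16-bit pmullw/pmaddubsw lanes without overflow.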
-;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* -;* predenoise.asm -;* -;* Abstract -;* denoise for SVC2.1 -;* History -;* 4/13/2010 Created -;* 7/30/2010 Modified -;* -;* -;*************************************************************************/ -%include "asm_inc.asm" - -;*********************************************************************** -; Constant -;*********************************************************************** -SECTION .rodata align=16 - -sse2_32 times 8 dw 32 -sse2_20 times 8 dw 20 - - -BITS 32 -;*********************************************************************** -; Code -;*********************************************************************** -SECTION .text - -%macro WEIGHT_LINE 9 - movq %2, %9 - punpcklbw %2, %7 - movdqa %8, %2 - - movdqa %1, %6 - psubusb %1, %8 - psubusb %8, %6 - por %8, %1 ; ABS(curPixel - centerPixel); - - movdqa %1, %3 - psubusb %1, %8 - - pmullw %1, %1 - psrlw %1, 5 - pmullw %2, %1 - paddusw %4, %1 - paddusw %5, %2 -%endmacro - -%macro WEIGHT_LINE1_UV 4 - movdqa %2, %1 - punpcklbw %2, %4 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 1 - punpcklbw %2, %4 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 2 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 3 - punpcklbw %2, %4 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 4 - punpcklbw %2, %4 - paddw %3, %2 -%endmacro - -%macro WEIGHT_LINE2_UV 4 - movdqa %2, %1 - punpcklbw %2, %4 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 1 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 2 - punpcklbw %2, %4 - psllw %2, 2 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 3 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 4 - punpcklbw %2, %4 - paddw %3, %2 -%endmacro - -%macro WEIGHT_LINE3_UV 4 - movdqa %2, %1 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 1 - punpcklbw %2, %4 - psllw %2, 2 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 2 - punpcklbw %2, %4 - pmullw %2, [sse2_20] - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 3 - punpcklbw %2, %4 - psllw %2, 2 - paddw %3, %2 - - movdqa %2, %1 - psrldq %2, 4 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 -%endmacro - -ALIGN 16 -WELS_EXTERN BilateralLumaFilter8_sse2 -;*********************************************************************** -; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride); -;*********************************************************************** -; 1 2 3 -; 4 0 5 -; 6 7 8 -; 0: the center point -%define pushsize 4 -%define pixel esp + pushsize + 4 
-%define stride esp + pushsize + 8 -BilateralLumaFilter8_sse2: - push ebx - - pxor xmm7, xmm7 - mov eax, [pixel] - mov ebx, eax - movq xmm6, [eax] - punpcklbw xmm6, xmm7 - movdqa xmm3, [sse2_32] - pxor xmm4, xmm4 ; nTotWeight - pxor xmm5, xmm5 ; nSum - - dec eax - mov ecx, [stride] - - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 4 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 5 - - sub eax, ecx - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 1 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 2 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 3 - - lea eax, [eax + ecx * 2] - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 6 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 7 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 8 - - pcmpeqw xmm0, xmm0 - psrlw xmm0, 15 - psllw xmm0, 8 - psubusw xmm0, xmm4 - pmullw xmm0, xmm6 - paddusw xmm5, xmm0 - psrlw xmm5, 8 - packuswb xmm5, xmm5 - movq [ebx], xmm5 - - pop ebx - ret - -WELS_EXTERN WaverageChromaFilter8_sse2 -;*********************************************************************** -; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride); -;*********************************************************************** -;5x5 filter: -;1 1 2 1 1 -;1 2 4 2 1 -;2 4 20 4 2 -;1 2 4 2 1 -;1 1 2 1 1 - -ALIGN 16 -WaverageChromaFilter8_sse2: - mov edx, [esp + 4] ; pixels - mov ecx, [esp + 8] ; stride - - mov eax, ecx - add eax, eax - sub edx, eax ; pixels - 2 * stride - sub edx, 2 - - pxor xmm0, xmm0 - pxor xmm3, xmm3 - - movdqu xmm1, [edx] - WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 - - movdqu xmm1, [edx + ecx] - WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 - - add edx, eax - movdqu xmm1, [edx] - WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0 - - movdqu xmm1, [edx + ecx] - WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 - - movdqu xmm1, [edx + ecx * 2] - WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 - - psrlw xmm3, 6 - packuswb xmm3, xmm3 - movq [edx + 2], xmm3 - +;*! +;* \copy +;* Copyright (c) 2010-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. 
+;* +;* +;* predenoise.asm +;* +;* Abstract +;* denoise for SVC2.1 +;* History +;* 4/13/2010 Created +;* 7/30/2010 Modified +;* +;* +;*************************************************************************/ +%include "asm_inc.asm" + +;*********************************************************************** +; Constant +;*********************************************************************** +SECTION .rodata align=16 + +sse2_32 times 8 dw 32 +sse2_20 times 8 dw 20 + + +BITS 32 +;*********************************************************************** +; Code +;*********************************************************************** +SECTION .text + +%macro WEIGHT_LINE 9 + movq %2, %9 + punpcklbw %2, %7 + movdqa %8, %2 + + movdqa %1, %6 + psubusb %1, %8 + psubusb %8, %6 + por %8, %1 ; ABS(curPixel - centerPixel); + + movdqa %1, %3 + psubusb %1, %8 + + pmullw %1, %1 + psrlw %1, 5 + pmullw %2, %1 + paddusw %4, %1 + paddusw %5, %2 +%endmacro + +%macro WEIGHT_LINE1_UV 4 + movdqa %2, %1 + punpcklbw %2, %4 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 1 + punpcklbw %2, %4 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 2 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 3 + punpcklbw %2, %4 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 4 + punpcklbw %2, %4 + paddw %3, %2 +%endmacro + +%macro WEIGHT_LINE2_UV 4 + movdqa %2, %1 + punpcklbw %2, %4 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 1 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 2 + punpcklbw %2, %4 + psllw %2, 2 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 3 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 4 + punpcklbw %2, %4 + paddw %3, %2 +%endmacro + +%macro WEIGHT_LINE3_UV 4 + movdqa %2, %1 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 1 + punpcklbw %2, %4 + psllw %2, 2 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 2 + punpcklbw %2, %4 + pmullw %2, [sse2_20] + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 3 + punpcklbw %2, %4 + psllw %2, 2 + paddw %3, %2 + + movdqa %2, %1 + psrldq %2, 4 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 +%endmacro + +ALIGN 16 +WELS_EXTERN BilateralLumaFilter8_sse2 +;*********************************************************************** +; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride); +;*********************************************************************** +; 1 2 3 +; 4 0 5 +; 6 7 8 +; 0: the center point +%define pushsize 4 +%define pixel esp + pushsize + 4 +%define stride esp + pushsize + 8 +BilateralLumaFilter8_sse2: + push ebx + + pxor xmm7, xmm7 + mov eax, [pixel] + mov ebx, eax + movq xmm6, [eax] + punpcklbw xmm6, xmm7 + movdqa xmm3, [sse2_32] + pxor xmm4, xmm4 ; nTotWeight + pxor xmm5, xmm5 ; nSum + + dec eax + mov ecx, [stride] + + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 4 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 5 + + sub eax, ecx + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 1 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 2 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 3 + + lea eax, [eax + ecx * 2] + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 6 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 7 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 8 + + pcmpeqw xmm0, xmm0 + psrlw xmm0, 15 + psllw xmm0, 8 + 
psubusw xmm0, xmm4 + pmullw xmm0, xmm6 + paddusw xmm5, xmm0 + psrlw xmm5, 8 + packuswb xmm5, xmm5 + movq [ebx], xmm5 + + pop ebx + ret + +WELS_EXTERN WaverageChromaFilter8_sse2 +;*********************************************************************** +; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride); +;*********************************************************************** +;5x5 filter: +;1 1 2 1 1 +;1 2 4 2 1 +;2 4 20 4 2 +;1 2 4 2 1 +;1 1 2 1 1 + +ALIGN 16 +WaverageChromaFilter8_sse2: + mov edx, [esp + 4] ; pixels + mov ecx, [esp + 8] ; stride + + mov eax, ecx + add eax, eax + sub edx, eax ; pixels - 2 * stride + sub edx, 2 + + pxor xmm0, xmm0 + pxor xmm3, xmm3 + + movdqu xmm1, [edx] + WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 + + movdqu xmm1, [edx + ecx] + WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 + + add edx, eax + movdqu xmm1, [edx] + WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0 + + movdqu xmm1, [edx + ecx] + WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 + + movdqu xmm1, [edx + ecx * 2] + WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 + + psrlw xmm3, 6 + packuswb xmm3, xmm3 + movq [edx + 2], xmm3 + ret \ No newline at end of file diff --git a/processing/src/asm/downsample_bilinear.asm b/processing/src/asm/downsample_bilinear.asm index 44897f28..c6f4765e 100644 --- a/processing/src/asm/downsample_bilinear.asm +++ b/processing/src/asm/downsample_bilinear.asm @@ -1,1225 +1,1225 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. 
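The denoisefilter.asm hunks above hold two kernels. BilateralLumaFilter8_sse2 weights each 3x3 neighbor by a ramp that falls off with |neighbor - center| (the psubusb/por pair is a saturating absolute difference, and pmullw/psrlw 5 squares and scales the ramp) before blending the residual weight back onto the center. WaverageChromaFilter8_sse2 is the fixed 5x5 weighted average whose kernel is printed in its comment block; the weights sum to 64, matching the psrlw 6. The latter in scalar form (helper name ours):

    #include <stdint.h>

    /* 5x5 weighted average from the WaverageChromaFilter8 comment block. */
    static uint8_t waverage_chroma_ref(const uint8_t *p, int stride) {
        static const int w[5][5] = {
            {1, 1,  2, 1, 1},
            {1, 2,  4, 2, 1},
            {2, 4, 20, 4, 2},
            {1, 2,  4, 2, 1},
            {1, 1,  2, 1, 1},
        };
        int sum = 0;
        for (int dy = -2; dy <= 2; ++dy)
            for (int dx = -2; dx <= 2; ++dx)
                sum += w[dy + 2][dx + 2] * p[dy * stride + dx];
        return (uint8_t)(sum >> 6);   /* weights sum to 64 */
    }

The SIMD body builds the same sum a row at a time: the psllw shifts supply the 2x and 4x weights, and the single pmullw against sse2_20 supplies the center's 20.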
-;* -;* -;* upsampling.asm -;* -;* Abstract -;* SIMD for pixel domain down sampling -;* -;* History -;* 10/22/2009 Created -;* -;*************************************************************************/ -%include "asm_inc.asm" -BITS 32 - -;*********************************************************************** -; Macros and other preprocessor constants -;*********************************************************************** - - -;*********************************************************************** -; Some constants -;*********************************************************************** - -;*********************************************************************** -; Local Data (Read Only) -;*********************************************************************** - -SECTION .rodata align=16 - -;*********************************************************************** -; Various memory constants (trigonometric values or rounding values) -;*********************************************************************** - -ALIGN 16 -shufb_mask_low: - db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h -shufb_mask_high: - db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h - - -ALIGN 16 - -;*********************************************************************** -; Code -;*********************************************************************** - -SECTION .text - -WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse -;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); -;*********************************************************************** -ALIGN 16 -DyadicBilinearDownsamplerWidthx32_sse: - push ebx - push edx - push esi - push edi - push ebp - - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight - - sar ebp, $1 ; iSrcHeight >> 1 - -.yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $1 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 32 bytes -.xloops: - ; 1st part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E - ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movq mm0, [esi] ; 1st pSrc line - movq mm1, [esi+8] ; 1st pSrc line + 8 - movq mm2, [esi+ecx] ; 2nd pSrc line - movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8 - - ; to handle mm0, mm1, mm2, mm3 - pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B - pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B - punpcklbw mm4, mm5 ; d c D C b a B A - pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 - - pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B - pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B - punpcklbw mm5, mm6 ; h g H G f e F E - pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 - - pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B - pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B - punpcklbw mm6, mm7 ; l k L K j i J I - pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 - - pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 
11011000 B - pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B - punpcklbw mm7, mm0 ; p o P O n m N M - pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 - - ; to handle mm4, mm5, mm6, mm7 - movq mm0, mm4 ; - punpckldq mm0, mm5 ; H G F E D C B A - punpckhdq mm4, mm5 ; h g f e d c b a - - movq mm1, mm6 - punpckldq mm1, mm7 ; P O N M L K J I - punpckhdq mm6, mm7 ; p o n m l k j i - - ; avg within MB horizon width (16 x 2 lines) - pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 - pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 - pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once - - ; 2nd part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E - ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movq mm1, [esi+16] ; 1st pSrc line + 16 - movq mm2, [esi+24] ; 1st pSrc line + 24 - movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16 - movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24 - - ; to handle mm1, mm2, mm3, mm4 - pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B - pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B - punpcklbw mm5, mm6 ; d c D C b a B A - pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5 - - pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B - pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B - punpcklbw mm6, mm7 ; h g H G f e F E - pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6 - - pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B - pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B - punpcklbw mm7, mm1 ; l k L K j i J I - pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7 - - pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B - pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B - punpcklbw mm1, mm2 ; p o P O n m N M - pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1 - - ; to handle mm5, mm6, mm7, mm1 - movq mm2, mm5 - punpckldq mm2, mm6 ; H G F E D C B A - punpckhdq mm5, mm6 ; h g f e d c b a - - movq mm3, mm7 - punpckldq mm3, mm1 ; P O N M L K J I - punpckhdq mm7, mm1 ; p o n m l k j i - - ; avg within MB horizon width (16 x 2 lines) - pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 - pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 - pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part - - movq [edi ], mm0 - movq [edi+8], mm2 - - ; next SMB - lea esi, [esi+32] - lea edi, [edi+16] - - dec eax - jg near .xloops - - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - - dec ebp - jg near .yloops - - WELSEMMS - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret - -WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse -;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); -;*********************************************************************** -ALIGN 16 -DyadicBilinearDownsamplerWidthx16_sse: - push ebx - push edx - push esi - push edi - push ebp - - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, 
[esp+44] ; iSrcHeight - - sar ebp, $1 ; iSrcHeight >> 1 - -.yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $1 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 16 bytes -.xloops: - ; 1st part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E - ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movq mm0, [esi] ; 1st pSrc line - movq mm1, [esi+8] ; 1st pSrc line + 8 - movq mm2, [esi+ecx] ; 2nd pSrc line - movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8 - - ; to handle mm0, mm1, mm2, mm3 - pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B - pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B - punpcklbw mm4, mm5 ; d c D C b a B A - pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 - - pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B - pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B - punpcklbw mm5, mm6 ; h g H G f e F E - pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 - - pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B - pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B - punpcklbw mm6, mm7 ; l k L K j i J I - pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 - - pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B - pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B - punpcklbw mm7, mm0 ; p o P O n m N M - pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 - - ; to handle mm4, mm5, mm6, mm7 - movq mm0, mm4 ; - punpckldq mm0, mm5 ; H G F E D C B A - punpckhdq mm4, mm5 ; h g f e d c b a - - movq mm1, mm6 - punpckldq mm1, mm7 ; P O N M L K J I - punpckhdq mm6, mm7 ; p o n m l k j i - - ; avg within MB horizon width (16 x 2 lines) - pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 - pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 - pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once - - movq [edi ], mm0 - - ; next SMB - lea esi, [esi+16] - lea edi, [edi+8] - - dec eax - jg near .xloops - - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - - dec ebp - jg near .yloops - - WELSEMMS - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret - -WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse -;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); -;*********************************************************************** -ALIGN 16 -DyadicBilinearDownsamplerWidthx8_sse: - push ebx - push edx - push esi - push edi - push ebp - - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight - - sar ebp, $1 ; iSrcHeight >> 1 - -.yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $1 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 8 bytes -.xloops: - ; 1st part horizonal loop: x8 bytes - ; mem 
hi<- ->lo - ;1st Line Src: mm0: d D c C b B a A - ;2nd Line Src: mm1: h H g G f F e E - ;=> target: - ;: H G F E D C B A - ;: h g f e d c b a - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movq mm0, [esi] ; 1st pSrc line - movq mm1, [esi+ecx] ; 2nd pSrc line - - ; to handle mm0, mm1, mm2, mm3 - pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B - pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B - punpcklbw mm2, mm3 ; d c D C b a B A - pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4 - - pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B - pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B - punpcklbw mm4, mm5 ; h g H G f e F E - pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5 - - ; to handle mm2, mm4 - movq mm0, mm2 ; - punpckldq mm0, mm4 ; H G F E D C B A - punpckhdq mm2, mm4 ; h g f e d c b a - - ; avg within MB horizon width (16 x 2 lines) - pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2 - pshufw mm1, mm0, 04eh ; 01001110 B - pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once - - movd [edi], mm0 - - ; next unit - lea esi, [esi+8] - lea edi, [edi+4] - - dec eax - jg near .xloops - - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - - dec ebp - jg near .yloops - - WELSEMMS - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret - - - -; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse -WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3 -;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); -;*********************************************************************** -ALIGN 16 -DyadicBilinearDownsamplerWidthx32_ssse3: - push ebx - push edx - push esi - push edi - push ebp - - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight - - sar ebp, $1 ; iSrcHeight >> 1 - - movdqa xmm7, [shufb_mask_low] ; mask low - movdqa xmm6, [shufb_mask_high] ; mask high - -.yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $1 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 32 bytes -.xloops: - ; 1st part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A - ; xmm1: p P o O n N m M l L k K j J i I - ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A - ; xmm3: p P o O n N m M l L k K j J i I - ;=> target: - ;: P O N M L K J I H G F E D C B A - ;: p o n m l k j i h g f e d c b a - ;: P .. A - ;: p .. 
a - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movdqa xmm0, [esi] ; 1st_src_line - movdqa xmm1, [esi+16] ; 1st_src_line + 16 - movdqa xmm2, [esi+ecx] ; 2nd_src_line - movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16 - - ; packing & avg - movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A - pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A - pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - ; another implementation for xmm4 high bits -; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 -; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - pavgb xmm0, xmm4 - - movdqa xmm5, xmm1 - pshufb xmm1, xmm7 - pshufb xmm5, xmm6 -; psubb xmm5, xmm1 -; psrlw xmm5, 8 - pavgb xmm1, xmm5 - - movdqa xmm4, xmm2 - pshufb xmm2, xmm7 - pshufb xmm4, xmm6 -; psubb xmm4, xmm2 -; psrlw xmm4, 8 - pavgb xmm2, xmm4 - - movdqa xmm5, xmm3 - pshufb xmm3, xmm7 - pshufb xmm5, xmm6 -; psubb xmm5, xmm3 -; psrlw xmm5, 8 - pavgb xmm3, xmm5 - - packuswb xmm0, xmm1 - packuswb xmm2, xmm3 - pavgb xmm0, xmm2 - - ; write pDst - movdqa [edi], xmm0 - - ; next SMB - lea esi, [esi+32] - lea edi, [edi+16] - - dec eax - jg near .xloops - - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - - dec ebp - jg near .yloops - - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret - -WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3 -;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); -;*********************************************************************** -ALIGN 16 -DyadicBilinearDownsamplerWidthx16_ssse3: - push ebx - push edx - push esi - push edi - push ebp - - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight - - sar ebp, $1 ; iSrcHeight >> 1 - movdqa xmm7, [shufb_mask_low] ; mask low - movdqa xmm6, [shufb_mask_high] ; mask high - -.yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $1 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 16 bytes -.xloops: - ; horizonal loop: x16 bytes by source - ; mem hi<- ->lo - ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A - ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movdqa xmm0, [esi] ; 1st_src_line - movdqa xmm1, [esi+ecx] ; 2nd_src_line - - ; packing & avg - movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A - pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A - pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - ; another implementation for xmm2 high bits -; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 -; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - pavgb xmm0, xmm2 - - movdqa xmm3, xmm1 - pshufb xmm1, xmm7 - pshufb xmm3, xmm6 -; psubb xmm3, xmm1 -; psrlw xmm3, 8 - pavgb xmm1, xmm3 - - pavgb xmm0, xmm1 - packuswb xmm0, xmm1 - - ; write pDst - movq [edi], xmm0 - - ; next SMB - lea esi, [esi+16] - lea edi, [edi+8] - - dec eax - jg near .xloops - - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * 
iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - - dec ebp - jg near .yloops - - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret - -; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse -WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4 -;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); -;*********************************************************************** -ALIGN 16 -DyadicBilinearDownsamplerWidthx32_sse4: - push ebx - push edx - push esi - push edi - push ebp - - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight - - sar ebp, $1 ; iSrcHeight >> 1 - - movdqa xmm7, [shufb_mask_low] ; mask low - movdqa xmm6, [shufb_mask_high] ; mask high - -.yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $1 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 32 bytes -.xloops: - ; 1st part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A - ; xmm1: p P o O n N m M l L k K j J i I - ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A - ; xmm3: p P o O n N m M l L k K j J i I - ;=> target: - ;: P O N M L K J I H G F E D C B A - ;: p o n m l k j i h g f e d c b a - ;: P .. A - ;: p .. a - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movntdqa xmm0, [esi] ; 1st_src_line - movntdqa xmm1, [esi+16] ; 1st_src_line + 16 - movntdqa xmm2, [esi+ecx] ; 2nd_src_line - movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16 - - ; packing & avg - movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A - pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A - pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a -; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 -; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - pavgb xmm0, xmm4 - - movdqa xmm5, xmm1 - pshufb xmm1, xmm7 - pshufb xmm5, xmm6 -; psubb xmm5, xmm1 -; psrlw xmm5, 8 - pavgb xmm1, xmm5 - - movdqa xmm4, xmm2 - pshufb xmm2, xmm7 - pshufb xmm4, xmm6 -; psubb xmm4, xmm2 -; psrlw xmm4, 8 - pavgb xmm2, xmm4 - - movdqa xmm5, xmm3 - pshufb xmm3, xmm7 - pshufb xmm5, xmm6 -; psubb xmm5, xmm3 -; psrlw xmm5, 8 - pavgb xmm3, xmm5 - - packuswb xmm0, xmm1 - packuswb xmm2, xmm3 - pavgb xmm0, xmm2 - - ; write pDst - movdqa [edi], xmm0 - - ; next SMB - lea esi, [esi+32] - lea edi, [edi+16] - - dec eax - jg near .xloops - - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - - dec ebp - jg near .yloops - - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret - -WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4 -;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); -;*********************************************************************** -ALIGN 16 -DyadicBilinearDownsamplerWidthx16_sse4: - push ebx - push edx - push esi - push edi - push ebp - - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov 
esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight - - sar ebp, $1 ; iSrcHeight >> 1 - movdqa xmm7, [shufb_mask_low] ; mask low - movdqa xmm6, [shufb_mask_high] ; mask high - -.yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $1 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 16 bytes -.xloops: - ; horizonal loop: x16 bytes by source - ; mem hi<- ->lo - ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A - ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movntdqa xmm0, [esi] ; 1st_src_line - movntdqa xmm1, [esi+ecx] ; 2nd_src_line - - ; packing & avg - movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A - pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A - pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a -; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 -; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - pavgb xmm0, xmm2 - - movdqa xmm3, xmm1 - pshufb xmm1, xmm7 - pshufb xmm3, xmm6 -; psubb xmm3, xmm1 -; psrlw xmm3, 8 - pavgb xmm1, xmm3 - - pavgb xmm0, xmm1 - packuswb xmm0, xmm1 - - ; write pDst - movq [edi], xmm0 - - ; next SMB - lea esi, [esi+16] - lea edi, [edi+8] - - dec eax - jg near .xloops - - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - - dec ebp - jg near .yloops - - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret - - - - - -WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 -;************************************************************************************************************** -;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, -; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight, -; unsigned int uiScaleX, unsigned int uiScaleY ); -;{ -;************************************************************************************************************** - -ALIGN 16 -GeneralBilinearAccurateDownsampler_sse2: - push ebp - push esi - push edi - push ebx -%define pushsize 16 -%define localsize 28 -%define pDstData esp + pushsize + localsize + 4 -%define dwDstStride esp + pushsize + localsize + 8 -%define dwDstWidth esp + pushsize + localsize + 12 -%define dwDstHeight esp + pushsize + localsize + 16 -%define pSrcData esp + pushsize + localsize + 20 -%define dwSrcStride esp + pushsize + localsize + 24 -%define dwSrcWidth esp + pushsize + localsize + 28 -%define dwSrcHeight esp + pushsize + localsize + 32 -%define scale esp + 0 -%define uiScaleX esp + pushsize + localsize + 36 -%define uiScaleY esp + pushsize + localsize + 40 -%define tmpHeight esp + 12 -%define yInverse esp + 16 -%define xInverse esp + 20 -%define dstStep esp + 24 - sub esp, localsize - - pxor xmm0, xmm0 - mov edx, 32767 - mov eax, [uiScaleX] - and eax, 32767 - mov ebx, eax - neg ebx - and ebx, 32767 - movd xmm1, eax ; uinc(uiScaleX mod 32767) - movd xmm2, ebx ; -uinc - psllq xmm1, 32 - por xmm1, xmm2 ; 0 0 uinc -uinc (dword) - pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc - - mov eax, [uiScaleY] - and eax, 32767 - mov ebx, eax - neg ebx - and ebx, 32767 - movd xmm6, eax ; vinc(uiScaleY mod 32767) - movd xmm2, ebx ; 
-vinc - psllq xmm6, 32 - por xmm6, xmm2 ; 0 0 vinc -vinc (dword) - pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc - - mov edx, 40003fffh - movd xmm5, edx - punpcklwd xmm5, xmm0 ; 16384 16383 - pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 - - -DOWNSAMPLE: - - mov eax, [dwDstHeight] - mov edi, [pDstData] - mov edx, [dwDstStride] - mov ecx, [dwDstWidth] - sub edx, ecx - mov [dstStep], edx ; stride - width - dec eax - mov [tmpHeight], eax - mov eax, 16384 - mov [yInverse], eax - - pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 - -HEIGHT: - mov eax, [yInverse] - mov esi, [pSrcData] - shr eax, 15 - mul dword [dwSrcStride] - add esi, eax ; get current row address - mov ebp, esi - add ebp, [dwSrcStride] - - mov eax, 16384 - mov [xInverse], eax - mov ecx, [dwDstWidth] - dec ecx - - movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 - -WIDTH: - mov eax, [xInverse] - shr eax, 15 - - movd xmm1, [esi+eax] ; xxxxxxba - movd xmm2, [ebp+eax] ; xxxxxxdc - pxor xmm0, xmm0 - punpcklwd xmm1, xmm2 ; xxxxdcba - punpcklbw xmm1, xmm0 ; 0d0c0b0a - punpcklwd xmm1, xmm0 ; 000d000c000b000a - - movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv - pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 - movdqa xmm0, xmm2 - pmuludq xmm2, xmm1 - psrlq xmm0, 32 - psrlq xmm1, 32 - pmuludq xmm0, xmm1 - paddq xmm2, xmm0 - pshufd xmm1, xmm2, 00001110b - paddq xmm2, xmm1 - psrlq xmm2, 29 - - movd eax, xmm2 - inc eax - shr eax, 1 - mov [edi], al - inc edi - - mov eax, [uiScaleX] - add [xInverse], eax - - paddw xmm3, xmm7 ; inc u - psllw xmm3, 1 - psrlw xmm3, 1 - - loop WIDTH - -WIDTH_END: - mov eax, [xInverse] - shr eax, 15 - mov cl, [esi+eax] - mov [edi], cl - inc edi - - mov eax, [uiScaleY] - add [yInverse], eax - add edi, [dstStep] - - paddw xmm4, xmm6 ; inc v - psllw xmm4, 1 - psrlw xmm4, 1 - - dec dword [tmpHeight] - jg HEIGHT - - -LAST_ROW: - mov eax, [yInverse] - mov esi, [pSrcData] - shr eax, 15 - mul dword [dwSrcStride] - add esi, eax ; get current row address - - mov eax, 16384 - mov [xInverse], eax - mov ecx, [dwDstWidth] - -LAST_ROW_WIDTH: - mov eax, [xInverse] - shr eax, 15 - - mov al, [esi+eax] - mov [edi], al - inc edi - - mov eax, [uiScaleX] - add [xInverse], eax - - loop LAST_ROW_WIDTH - -LAST_ROW_END: - - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp -%undef pushsize -%undef localsize -%undef pSrcData -%undef dwSrcWidth -%undef dwSrcHeight -%undef dwSrcStride -%undef pDstData -%undef dwDstWidth -%undef dwDstHeight -%undef dwDstStride -%undef scale -%undef uiScaleX -%undef uiScaleY -%undef tmpHeight -%undef yInverse -%undef xInverse -%undef dstStep - ret - - - - -WELS_EXTERN GeneralBilinearFastDownsampler_sse2 -;************************************************************************************************************** -;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, -; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight, -; unsigned int uiScaleX, unsigned int uiScaleY ); -;{ -;************************************************************************************************************** - -ALIGN 16 -GeneralBilinearFastDownsampler_sse2: - push ebp - push esi - push edi - push ebx -%define pushsize 16 -%define localsize 28 -%define pDstData esp + pushsize + localsize + 4 -%define dwDstStride esp + pushsize + localsize + 8 -%define dwDstWidth esp + pushsize + localsize + 12 -%define dwDstHeight esp + pushsize + localsize + 16 -%define pSrcData esp + 
pushsize + localsize + 20 -%define dwSrcStride esp + pushsize + localsize + 24 -%define dwSrcWidth esp + pushsize + localsize + 28 -%define dwSrcHeight esp + pushsize + localsize + 32 -%define scale esp + 0 -%define uiScaleX esp + pushsize + localsize + 36 -%define uiScaleY esp + pushsize + localsize + 40 -%define tmpHeight esp + 12 -%define yInverse esp + 16 -%define xInverse esp + 20 -%define dstStep esp + 24 - sub esp, localsize - - pxor xmm0, xmm0 - mov edx, 65535 - mov eax, [uiScaleX] - and eax, edx - mov ebx, eax - neg ebx - and ebx, 65535 - movd xmm1, eax ; uinc(uiScaleX mod 65536) - movd xmm2, ebx ; -uinc - psllq xmm1, 32 - por xmm1, xmm2 ; 0 uinc 0 -uinc - pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc - - mov eax, [uiScaleY] - and eax, 32767 - mov ebx, eax - neg ebx - and ebx, 32767 - movd xmm6, eax ; vinc(uiScaleY mod 32767) - movd xmm2, ebx ; -vinc - psllq xmm6, 32 - por xmm6, xmm2 ; 0 vinc 0 -vinc - pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc - - mov edx, 80007fffh ; 32768 32767 - movd xmm5, edx - pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767 - mov ebx, 16384 - - -FAST_DOWNSAMPLE: - - mov eax, [dwDstHeight] - mov edi, [pDstData] - mov edx, [dwDstStride] - mov ecx, [dwDstWidth] - sub edx, ecx - mov [dstStep], edx ; stride - width - dec eax - mov [tmpHeight], eax - mov eax, 16384 - mov [yInverse], eax - - pshuflw xmm4, xmm5, 01010000b - psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383 - -FAST_HEIGHT: - mov eax, [yInverse] - mov esi, [pSrcData] - shr eax, 15 - mul dword [dwSrcStride] - add esi, eax ; get current row address - mov ebp, esi - add ebp, [dwSrcStride] - - mov eax, 32768 - mov [xInverse], eax - mov ecx, [dwDstWidth] - dec ecx - - movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767 - -FAST_WIDTH: - mov eax, [xInverse] - shr eax, 16 - - movd xmm1, [esi+eax] ; xxxxxxba - movd xmm2, [ebp+eax] ; xxxxxxdc - punpcklwd xmm1, xmm2 ; xxxxdcba - punpcklbw xmm1, xmm0 ; 0d0c0b0a - - movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv - pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 - pmaddwd xmm2, xmm1 - pshufd xmm1, xmm2, 00000001b - paddd xmm2, xmm1 - movd xmm1, ebx - paddd xmm2, xmm1 - psrld xmm2, 15 - - packuswb xmm2, xmm0 - movd eax, xmm2 - mov [edi], al - inc edi - - mov eax, [uiScaleX] - add [xInverse], eax - - paddw xmm3, xmm7 ; inc u - - loop FAST_WIDTH - -FAST_WIDTH_END: - mov eax, [xInverse] - shr eax, 16 - mov cl, [esi+eax] - mov [edi], cl - inc edi - - mov eax, [uiScaleY] - add [yInverse], eax - add edi, [dstStep] - - paddw xmm4, xmm6 ; inc v - psllw xmm4, 1 - psrlw xmm4, 1 - - dec dword [tmpHeight] - jg FAST_HEIGHT - - -FAST_LAST_ROW: - mov eax, [yInverse] - mov esi, [pSrcData] - shr eax, 15 - mul dword [dwSrcStride] - add esi, eax ; get current row address - - mov eax, 32768 - mov [xInverse], eax - mov ecx, [dwDstWidth] - -FAST_LAST_ROW_WIDTH: - mov eax, [xInverse] - shr eax, 16 - - mov al, [esi+eax] - mov [edi], al - inc edi - - mov eax, [uiScaleX] - add [xInverse], eax - - loop FAST_LAST_ROW_WIDTH - -FAST_LAST_ROW_END: - - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp -%undef pushsize -%undef localsize -%undef pSrcData -%undef dwSrcWidth -%undef dwSrcHeight -%undef dwSrcStride -%undef pDstData -%undef dwDstWidth -%undef dwDstHeight -%undef dwDstStride -%undef scale -%undef uiScaleX -%undef uiScaleY -%undef tmpHeight -%undef yInverse -%undef xInverse -%undef dstStep +;*! +;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. 
+;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* upsampling.asm +;* +;* Abstract +;* SIMD for pixel domain down sampling +;* +;* History +;* 10/22/2009 Created +;* +;*************************************************************************/ +%include "asm_inc.asm" +BITS 32 + +;*********************************************************************** +; Macros and other preprocessor constants +;*********************************************************************** + + +;*********************************************************************** +; Some constants +;*********************************************************************** + +;*********************************************************************** +; Local Data (Read Only) +;*********************************************************************** + +SECTION .rodata align=16 + +;*********************************************************************** +; Various memory constants (trigonometric values or rounding values) +;*********************************************************************** + +ALIGN 16 +shufb_mask_low: + db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h +shufb_mask_high: + db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h + + +ALIGN 16 + +;*********************************************************************** +; Code +;*********************************************************************** + +SECTION .text + +WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse +;*********************************************************************** +; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +ALIGN 16 +DyadicBilinearDownsamplerWidthx32_sse: + push ebx + push edx + push esi + push edi + push ebp + + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight + + sar ebp, $1 ; iSrcHeight >> 1 + +.yloops: + mov eax, [esp+40] ; iSrcWidth + sar eax, $1 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at 
ebx + sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 32 bytes +.xloops: + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E + ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm0, [esi] ; 1st pSrc line + movq mm1, [esi+8] ; 1st pSrc line + 8 + movq mm2, [esi+ecx] ; 2nd pSrc line + movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8 + + ; to handle mm0, mm1, mm2, mm3 + pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B + pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B + punpcklbw mm4, mm5 ; d c D C b a B A + pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 + + pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B + pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B + punpcklbw mm5, mm6 ; h g H G f e F E + pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 + + pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B + pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B + punpcklbw mm6, mm7 ; l k L K j i J I + pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 + + pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B + pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B + punpcklbw mm7, mm0 ; p o P O n m N M + pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 + + ; to handle mm4, mm5, mm6, mm7 + movq mm0, mm4 ; + punpckldq mm0, mm5 ; H G F E D C B A + punpckhdq mm4, mm5 ; h g f e d c b a + + movq mm1, mm6 + punpckldq mm1, mm7 ; P O N M L K J I + punpckhdq mm6, mm7 ; p o n m l k j i + + ; avg within MB horizon width (16 x 2 lines) + pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 + pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 + pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once + + ; 2nd part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E + ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm1, [esi+16] ; 1st pSrc line + 16 + movq mm2, [esi+24] ; 1st pSrc line + 24 + movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16 + movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24 + + ; to handle mm1, mm2, mm3, mm4 + pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B + pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B + punpcklbw mm5, mm6 ; d c D C b a B A + pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5 + + pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B + pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B + punpcklbw mm6, mm7 ; h g H G f e F E + pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6 + + pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B + pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B + punpcklbw mm7, mm1 ; l k L K j i J I + pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7 + + pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B + pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B + punpcklbw mm1, mm2 ; p o P O n m N M + pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1 + + ; to handle mm5, mm6, mm7, mm1 + movq mm2, mm5 + punpckldq mm2, mm6 ; H G F E D C B A + punpckhdq mm5, mm6 ; h g f e d c b a + + movq mm3, mm7 + punpckldq mm3, mm1 ; P O N M L K J I + 
punpckhdq mm7, mm1 ; p o n m l k j i + + ; avg within MB horizon width (16 x 2 lines) + pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 + pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 + pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part + + movq [edi ], mm0 + movq [edi+8], mm2 + + ; next SMB + lea esi, [esi+32] + lea edi, [edi+16] + + dec eax + jg near .xloops + + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + + dec ebp + jg near .yloops + + WELSEMMS + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret + +WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse +;*********************************************************************** +; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +ALIGN 16 +DyadicBilinearDownsamplerWidthx16_sse: + push ebx + push edx + push esi + push edi + push ebp + + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight + + sar ebp, $1 ; iSrcHeight >> 1 + +.yloops: + mov eax, [esp+40] ; iSrcWidth + sar eax, $1 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 16 bytes +.xloops: + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E + ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm0, [esi] ; 1st pSrc line + movq mm1, [esi+8] ; 1st pSrc line + 8 + movq mm2, [esi+ecx] ; 2nd pSrc line + movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8 + + ; to handle mm0, mm1, mm2, mm3 + pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B + pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B + punpcklbw mm4, mm5 ; d c D C b a B A + pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 + + pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B + pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B + punpcklbw mm5, mm6 ; h g H G f e F E + pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 + + pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B + pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B + punpcklbw mm6, mm7 ; l k L K j i J I + pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 + + pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B + pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B + punpcklbw mm7, mm0 ; p o P O n m N M + pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 + + ; to handle mm4, mm5, mm6, mm7 + movq mm0, mm4 ; + punpckldq mm0, mm5 ; H G F E D C B A + punpckhdq mm4, mm5 ; h g f e d c b a + + movq mm1, mm6 + punpckldq mm1, mm7 ; P O N M L K J I + punpckhdq mm6, mm7 ; p o n m l k j i + + ; avg within MB horizon width (16 x 2 lines) + pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 + pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 + pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once + + movq [edi ], mm0 + + ; next SMB + lea esi, [esi+16] + lea 
edi, [edi+8] + + dec eax + jg near .xloops + + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + + dec ebp + jg near .yloops + + WELSEMMS + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret + +WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse +;*********************************************************************** +; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +ALIGN 16 +DyadicBilinearDownsamplerWidthx8_sse: + push ebx + push edx + push esi + push edi + push ebp + + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight + + sar ebp, $1 ; iSrcHeight >> 1 + +.yloops: + mov eax, [esp+40] ; iSrcWidth + sar eax, $1 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 8 bytes +.xloops: + ; 1st part horizonal loop: x8 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A + ;2nd Line Src: mm1: h H g G f F e E + ;=> target: + ;: H G F E D C B A + ;: h g f e d c b a + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm0, [esi] ; 1st pSrc line + movq mm1, [esi+ecx] ; 2nd pSrc line + + ; to handle mm0, mm1, mm2, mm3 + pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B + pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B + punpcklbw mm2, mm3 ; d c D C b a B A + pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4 + + pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B + pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B + punpcklbw mm4, mm5 ; h g H G f e F E + pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5 + + ; to handle mm2, mm4 + movq mm0, mm2 ; + punpckldq mm0, mm4 ; H G F E D C B A + punpckhdq mm2, mm4 ; h g f e d c b a + + ; avg within MB horizon width (16 x 2 lines) + pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2 + pshufw mm1, mm0, 04eh ; 01001110 B + pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once + + movd [edi], mm0 + + ; next unit + lea esi, [esi+8] + lea edi, [edi+4] + + dec eax + jg near .xloops + + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + + dec ebp + jg near .yloops + + WELSEMMS + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret + + + +; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse +WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3 +;*********************************************************************** +; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +ALIGN 16 +DyadicBilinearDownsamplerWidthx32_ssse3: + push ebx + push edx + push esi + push edi + push ebp + + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight + + sar ebp, $1 
; iSrcHeight >> 1 + + movdqa xmm7, [shufb_mask_low] ; mask low + movdqa xmm6, [shufb_mask_high] ; mask high + +.yloops: + mov eax, [esp+40] ; iSrcWidth + sar eax, $1 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 32 bytes +.xloops: + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A + ; xmm1: p P o O n N m M l L k K j J i I + ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A + ; xmm3: p P o O n N m M l L k K j J i I + ;=> target: + ;: P O N M L K J I H G F E D C B A + ;: p o n m l k j i h g f e d c b a + ;: P .. A + ;: p .. a + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa xmm0, [esi] ; 1st_src_line + movdqa xmm1, [esi+16] ; 1st_src_line + 16 + movdqa xmm2, [esi+ecx] ; 2nd_src_line + movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16 + + ; packing & avg + movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A + pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A + pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + ; another implementation for xmm4 high bits +; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 +; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + pavgb xmm0, xmm4 + + movdqa xmm5, xmm1 + pshufb xmm1, xmm7 + pshufb xmm5, xmm6 +; psubb xmm5, xmm1 +; psrlw xmm5, 8 + pavgb xmm1, xmm5 + + movdqa xmm4, xmm2 + pshufb xmm2, xmm7 + pshufb xmm4, xmm6 +; psubb xmm4, xmm2 +; psrlw xmm4, 8 + pavgb xmm2, xmm4 + + movdqa xmm5, xmm3 + pshufb xmm3, xmm7 + pshufb xmm5, xmm6 +; psubb xmm5, xmm3 +; psrlw xmm5, 8 + pavgb xmm3, xmm5 + + packuswb xmm0, xmm1 + packuswb xmm2, xmm3 + pavgb xmm0, xmm2 + + ; write pDst + movdqa [edi], xmm0 + + ; next SMB + lea esi, [esi+32] + lea edi, [edi+16] + + dec eax + jg near .xloops + + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + + dec ebp + jg near .yloops + + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret + +WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3 +;*********************************************************************** +; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +ALIGN 16 +DyadicBilinearDownsamplerWidthx16_ssse3: + push ebx + push edx + push esi + push edi + push ebp + + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight + + sar ebp, $1 ; iSrcHeight >> 1 + movdqa xmm7, [shufb_mask_low] ; mask low + movdqa xmm6, [shufb_mask_high] ; mask high + +.yloops: + mov eax, [esp+40] ; iSrcWidth + sar eax, $1 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 16 bytes +.xloops: + ; horizonal loop: x16 bytes by source + ; mem hi<- ->lo + ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A + ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa xmm0, [esi] ; 1st_src_line + movdqa xmm1, [esi+ecx] ; 2nd_src_line + + ; packing & 
avg + movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A + pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A + pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + ; another implementation for xmm2 high bits +; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 +; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + pavgb xmm0, xmm2 + + movdqa xmm3, xmm1 + pshufb xmm1, xmm7 + pshufb xmm3, xmm6 +; psubb xmm3, xmm1 +; psrlw xmm3, 8 + pavgb xmm1, xmm3 + + pavgb xmm0, xmm1 + packuswb xmm0, xmm1 + + ; write pDst + movq [edi], xmm0 + + ; next SMB + lea esi, [esi+16] + lea edi, [edi+8] + + dec eax + jg near .xloops + + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + + dec ebp + jg near .yloops + + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret + +; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse +WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4 +;*********************************************************************** +; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +ALIGN 16 +DyadicBilinearDownsamplerWidthx32_sse4: + push ebx + push edx + push esi + push edi + push ebp + + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight + + sar ebp, $1 ; iSrcHeight >> 1 + + movdqa xmm7, [shufb_mask_low] ; mask low + movdqa xmm6, [shufb_mask_high] ; mask high + +.yloops: + mov eax, [esp+40] ; iSrcWidth + sar eax, $1 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 32 bytes +.xloops: + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A + ; xmm1: p P o O n N m M l L k K j J i I + ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A + ; xmm3: p P o O n N m M l L k K j J i I + ;=> target: + ;: P O N M L K J I H G F E D C B A + ;: p o n m l k j i h g f e d c b a + ;: P .. A + ;: p .. 
a + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movntdqa xmm0, [esi] ; 1st_src_line + movntdqa xmm1, [esi+16] ; 1st_src_line + 16 + movntdqa xmm2, [esi+ecx] ; 2nd_src_line + movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16 + + ; packing & avg + movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A + pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A + pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a +; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 +; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + pavgb xmm0, xmm4 + + movdqa xmm5, xmm1 + pshufb xmm1, xmm7 + pshufb xmm5, xmm6 +; psubb xmm5, xmm1 +; psrlw xmm5, 8 + pavgb xmm1, xmm5 + + movdqa xmm4, xmm2 + pshufb xmm2, xmm7 + pshufb xmm4, xmm6 +; psubb xmm4, xmm2 +; psrlw xmm4, 8 + pavgb xmm2, xmm4 + + movdqa xmm5, xmm3 + pshufb xmm3, xmm7 + pshufb xmm5, xmm6 +; psubb xmm5, xmm3 +; psrlw xmm5, 8 + pavgb xmm3, xmm5 + + packuswb xmm0, xmm1 + packuswb xmm2, xmm3 + pavgb xmm0, xmm2 + + ; write pDst + movdqa [edi], xmm0 + + ; next SMB + lea esi, [esi+32] + lea edi, [edi+16] + + dec eax + jg near .xloops + + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + + dec ebp + jg near .yloops + + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret + +WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4 +;*********************************************************************** +; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); +;*********************************************************************** +ALIGN 16 +DyadicBilinearDownsamplerWidthx16_sse4: + push ebx + push edx + push esi + push edi + push ebp + + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight + + sar ebp, $1 ; iSrcHeight >> 1 + movdqa xmm7, [shufb_mask_low] ; mask low + movdqa xmm6, [shufb_mask_high] ; mask high + +.yloops: + mov eax, [esp+40] ; iSrcWidth + sar eax, $1 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 16 bytes +.xloops: + ; horizonal loop: x16 bytes by source + ; mem hi<- ->lo + ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A + ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movntdqa xmm0, [esi] ; 1st_src_line + movntdqa xmm1, [esi+ecx] ; 2nd_src_line + + ; packing & avg + movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A + pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A + pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a +; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 +; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + pavgb xmm0, xmm2 + + movdqa xmm3, xmm1 + pshufb xmm1, xmm7 + pshufb xmm3, xmm6 +; psubb xmm3, xmm1 +; psrlw xmm3, 8 + pavgb xmm1, xmm3 + + pavgb xmm0, xmm1 + packuswb xmm0, xmm1 + + ; write pDst + movq [edi], xmm0 + + ; next SMB + lea esi, [esi+16] + lea edi, [edi+8] + + dec eax + jg near .xloops + + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + + dec 
ebp + jg near .yloops + + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret + + + + + +WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 +;************************************************************************************************************** +;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, +; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight, +; unsigned int uiScaleX, unsigned int uiScaleY ); +;{ +;************************************************************************************************************** + +ALIGN 16 +GeneralBilinearAccurateDownsampler_sse2: + push ebp + push esi + push edi + push ebx +%define pushsize 16 +%define localsize 28 +%define pDstData esp + pushsize + localsize + 4 +%define dwDstStride esp + pushsize + localsize + 8 +%define dwDstWidth esp + pushsize + localsize + 12 +%define dwDstHeight esp + pushsize + localsize + 16 +%define pSrcData esp + pushsize + localsize + 20 +%define dwSrcStride esp + pushsize + localsize + 24 +%define dwSrcWidth esp + pushsize + localsize + 28 +%define dwSrcHeight esp + pushsize + localsize + 32 +%define scale esp + 0 +%define uiScaleX esp + pushsize + localsize + 36 +%define uiScaleY esp + pushsize + localsize + 40 +%define tmpHeight esp + 12 +%define yInverse esp + 16 +%define xInverse esp + 20 +%define dstStep esp + 24 + sub esp, localsize + + pxor xmm0, xmm0 + mov edx, 32767 + mov eax, [uiScaleX] + and eax, 32767 + mov ebx, eax + neg ebx + and ebx, 32767 + movd xmm1, eax ; uinc(uiScaleX mod 32767) + movd xmm2, ebx ; -uinc + psllq xmm1, 32 + por xmm1, xmm2 ; 0 0 uinc -uinc (dword) + pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc + + mov eax, [uiScaleY] + and eax, 32767 + mov ebx, eax + neg ebx + and ebx, 32767 + movd xmm6, eax ; vinc(uiScaleY mod 32767) + movd xmm2, ebx ; -vinc + psllq xmm6, 32 + por xmm6, xmm2 ; 0 0 vinc -vinc (dword) + pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc + + mov edx, 40003fffh + movd xmm5, edx + punpcklwd xmm5, xmm0 ; 16384 16383 + pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 + + +DOWNSAMPLE: + + mov eax, [dwDstHeight] + mov edi, [pDstData] + mov edx, [dwDstStride] + mov ecx, [dwDstWidth] + sub edx, ecx + mov [dstStep], edx ; stride - width + dec eax + mov [tmpHeight], eax + mov eax, 16384 + mov [yInverse], eax + + pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 + +HEIGHT: + mov eax, [yInverse] + mov esi, [pSrcData] + shr eax, 15 + mul dword [dwSrcStride] + add esi, eax ; get current row address + mov ebp, esi + add ebp, [dwSrcStride] + + mov eax, 16384 + mov [xInverse], eax + mov ecx, [dwDstWidth] + dec ecx + + movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 + +WIDTH: + mov eax, [xInverse] + shr eax, 15 + + movd xmm1, [esi+eax] ; xxxxxxba + movd xmm2, [ebp+eax] ; xxxxxxdc + pxor xmm0, xmm0 + punpcklwd xmm1, xmm2 ; xxxxdcba + punpcklbw xmm1, xmm0 ; 0d0c0b0a + punpcklwd xmm1, xmm0 ; 000d000c000b000a + + movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv + pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 + movdqa xmm0, xmm2 + pmuludq xmm2, xmm1 + psrlq xmm0, 32 + psrlq xmm1, 32 + pmuludq xmm0, xmm1 + paddq xmm2, xmm0 + pshufd xmm1, xmm2, 00001110b + paddq xmm2, xmm1 + psrlq xmm2, 29 + + movd eax, xmm2 + inc eax + shr eax, 1 + mov [edi], al + inc edi + + mov eax, [uiScaleX] + add [xInverse], eax + + paddw xmm3, xmm7 ; inc u + psllw xmm3, 1 + psrlw xmm3, 1 + + loop WIDTH + +WIDTH_END: + mov eax, [xInverse] + shr eax, 
15 + mov cl, [esi+eax] + mov [edi], cl + inc edi + + mov eax, [uiScaleY] + add [yInverse], eax + add edi, [dstStep] + + paddw xmm4, xmm6 ; inc v + psllw xmm4, 1 + psrlw xmm4, 1 + + dec dword [tmpHeight] + jg HEIGHT + + +LAST_ROW: + mov eax, [yInverse] + mov esi, [pSrcData] + shr eax, 15 + mul dword [dwSrcStride] + add esi, eax ; get current row address + + mov eax, 16384 + mov [xInverse], eax + mov ecx, [dwDstWidth] + +LAST_ROW_WIDTH: + mov eax, [xInverse] + shr eax, 15 + + mov al, [esi+eax] + mov [edi], al + inc edi + + mov eax, [uiScaleX] + add [xInverse], eax + + loop LAST_ROW_WIDTH + +LAST_ROW_END: + + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp +%undef pushsize +%undef localsize +%undef pSrcData +%undef dwSrcWidth +%undef dwSrcHeight +%undef dwSrcStride +%undef pDstData +%undef dwDstWidth +%undef dwDstHeight +%undef dwDstStride +%undef scale +%undef uiScaleX +%undef uiScaleY +%undef tmpHeight +%undef yInverse +%undef xInverse +%undef dstStep + ret + + + + +WELS_EXTERN GeneralBilinearFastDownsampler_sse2 +;************************************************************************************************************** +;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, +; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight, +; unsigned int uiScaleX, unsigned int uiScaleY ); +;{ +;************************************************************************************************************** + +ALIGN 16 +GeneralBilinearFastDownsampler_sse2: + push ebp + push esi + push edi + push ebx +%define pushsize 16 +%define localsize 28 +%define pDstData esp + pushsize + localsize + 4 +%define dwDstStride esp + pushsize + localsize + 8 +%define dwDstWidth esp + pushsize + localsize + 12 +%define dwDstHeight esp + pushsize + localsize + 16 +%define pSrcData esp + pushsize + localsize + 20 +%define dwSrcStride esp + pushsize + localsize + 24 +%define dwSrcWidth esp + pushsize + localsize + 28 +%define dwSrcHeight esp + pushsize + localsize + 32 +%define scale esp + 0 +%define uiScaleX esp + pushsize + localsize + 36 +%define uiScaleY esp + pushsize + localsize + 40 +%define tmpHeight esp + 12 +%define yInverse esp + 16 +%define xInverse esp + 20 +%define dstStep esp + 24 + sub esp, localsize + + pxor xmm0, xmm0 + mov edx, 65535 + mov eax, [uiScaleX] + and eax, edx + mov ebx, eax + neg ebx + and ebx, 65535 + movd xmm1, eax ; uinc(uiScaleX mod 65536) + movd xmm2, ebx ; -uinc + psllq xmm1, 32 + por xmm1, xmm2 ; 0 uinc 0 -uinc + pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc + + mov eax, [uiScaleY] + and eax, 32767 + mov ebx, eax + neg ebx + and ebx, 32767 + movd xmm6, eax ; vinc(uiScaleY mod 32767) + movd xmm2, ebx ; -vinc + psllq xmm6, 32 + por xmm6, xmm2 ; 0 vinc 0 -vinc + pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc + + mov edx, 80007fffh ; 32768 32767 + movd xmm5, edx + pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767 + mov ebx, 16384 + + +FAST_DOWNSAMPLE: + + mov eax, [dwDstHeight] + mov edi, [pDstData] + mov edx, [dwDstStride] + mov ecx, [dwDstWidth] + sub edx, ecx + mov [dstStep], edx ; stride - width + dec eax + mov [tmpHeight], eax + mov eax, 16384 + mov [yInverse], eax + + pshuflw xmm4, xmm5, 01010000b + psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383 + +FAST_HEIGHT: + mov eax, [yInverse] + mov esi, [pSrcData] + shr eax, 15 + mul dword [dwSrcStride] + add esi, eax ; get current row address + mov ebp, esi + add ebp, [dwSrcStride] + + 
mov eax, 32768 + mov [xInverse], eax + mov ecx, [dwDstWidth] + dec ecx + + movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767 + +FAST_WIDTH: + mov eax, [xInverse] + shr eax, 16 + + movd xmm1, [esi+eax] ; xxxxxxba + movd xmm2, [ebp+eax] ; xxxxxxdc + punpcklwd xmm1, xmm2 ; xxxxdcba + punpcklbw xmm1, xmm0 ; 0d0c0b0a + + movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv + pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 + pmaddwd xmm2, xmm1 + pshufd xmm1, xmm2, 00000001b + paddd xmm2, xmm1 + movd xmm1, ebx + paddd xmm2, xmm1 + psrld xmm2, 15 + + packuswb xmm2, xmm0 + movd eax, xmm2 + mov [edi], al + inc edi + + mov eax, [uiScaleX] + add [xInverse], eax + + paddw xmm3, xmm7 ; inc u + + loop FAST_WIDTH + +FAST_WIDTH_END: + mov eax, [xInverse] + shr eax, 16 + mov cl, [esi+eax] + mov [edi], cl + inc edi + + mov eax, [uiScaleY] + add [yInverse], eax + add edi, [dstStep] + + paddw xmm4, xmm6 ; inc v + psllw xmm4, 1 + psrlw xmm4, 1 + + dec dword [tmpHeight] + jg FAST_HEIGHT + + +FAST_LAST_ROW: + mov eax, [yInverse] + mov esi, [pSrcData] + shr eax, 15 + mul dword [dwSrcStride] + add esi, eax ; get current row address + + mov eax, 32768 + mov [xInverse], eax + mov ecx, [dwDstWidth] + +FAST_LAST_ROW_WIDTH: + mov eax, [xInverse] + shr eax, 16 + + mov al, [esi+eax] + mov [edi], al + inc edi + + mov eax, [uiScaleX] + add [xInverse], eax + + loop FAST_LAST_ROW_WIDTH + +FAST_LAST_ROW_END: + + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp +%undef pushsize +%undef localsize +%undef pSrcData +%undef dwSrcWidth +%undef dwSrcHeight +%undef dwSrcStride +%undef pDstData +%undef dwDstWidth +%undef dwDstHeight +%undef dwDstStride +%undef scale +%undef uiScaleX +%undef uiScaleY +%undef tmpHeight +%undef yInverse +%undef xInverse +%undef dstStep ret \ No newline at end of file diff --git a/processing/src/asm/intra_pred.asm b/processing/src/asm/intra_pred.asm index 905adbed..c8ed004f 100644 --- a/processing/src/asm/intra_pred.asm +++ b/processing/src/asm/intra_pred.asm @@ -1,145 +1,145 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. 
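
For reference while reading the hunk above: GeneralBilinearAccurateDownsampler_sse2 and GeneralBilinearFastDownsampler_sse2 implement the same fixed-point bilinear walk over the source image, differing only in fraction width (15 vs. 16 bits for the horizontal step). A minimal scalar C sketch of that arithmetic, assuming the accurate variant's 15-bit fractions and omitting the nearest-sample special casing of the last row/column that the assembly performs; the function name is illustrative, not from this source:

    #include <stdint.h>

    /* Each destination pixel is a 2x2 weighted average of its source
     * neighborhood; xInverse/yInverse walk the source in 15-bit fixed
     * point, advancing by uiScaleX/uiScaleY per output pixel/row. */
    static void BilinearDownsamplerRef(uint8_t *pDst, int iDstStride,
                                       int iDstWidth, int iDstHeight,
                                       const uint8_t *pSrc, int iSrcStride,
                                       uint32_t uiScaleX, uint32_t uiScaleY) {
      uint32_t yInverse = 1u << 14;            /* start at half a step, as the asm does */
      for (int j = 0; j < iDstHeight; j++) {
        const uint8_t *pRow  = pSrc + (yInverse >> 15) * iSrcStride;
        const uint8_t *pNext = pRow + iSrcStride;
        uint32_t v = yInverse & 0x7fff;        /* fractional y, 15 bits */
        uint32_t xInverse = 1u << 14;
        for (int i = 0; i < iDstWidth; i++) {
          uint32_t x = xInverse >> 15;
          uint32_t u = xInverse & 0x7fff;      /* fractional x, 15 bits */
          uint64_t acc = (uint64_t)(32768 - u) * (32768 - v) * pRow[x]
                       + (uint64_t)u           * (32768 - v) * pRow[x + 1]
                       + (uint64_t)(32768 - u) * v           * pNext[x]
                       + (uint64_t)u           * v           * pNext[x + 1];
          pDst[i] = (uint8_t)((acc + (1u << 29)) >> 30);  /* round off 15+15 fraction bits */
          xInverse += uiScaleX;
        }
        pDst += iDstStride;
        yInverse += uiScaleY;
      }
    }
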
-;* -;* -;* intra_pred.asm -;* -;* Abstract -;* sse2 function for intra predict operations -;* -;* History -;* 18/09/2009 Created -;* -;* -;*************************************************************************/ -%include "../../src/asm/asm_inc.asm" - -BITS 32 -;*********************************************************************** -; Local Data (Read Only) -;*********************************************************************** - -%ifdef FORMAT_COFF -SECTION .rodata data -%else -SECTION .rodata align=16 -%endif - - -align 16 -mmx_01bytes: times 16 db 1 - -;*********************************************************************** -; macros -;*********************************************************************** -%macro COPY_16_TIMES 2 - movdqa %2, [%1-16] - psrldq %2, 15 - pmuludq %2, [mmx_01bytes] - pshufd %2, %2, 0 -%endmacro - -%macro COPY_16_TIMESS 3 - movdqa %2, [%1+%3-16] - psrldq %2, 15 - pmuludq %2, [mmx_01bytes] - pshufd %2, %2, 0 -%endmacro - -;*********************************************************************** -; Code -;*********************************************************************** - -SECTION .text - -;*********************************************************************** -; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); -;*********************************************************************** - -%macro SSE2_PRED_H_16X16_TWO_LINE 1 - lea eax, [eax+ecx*2] - - COPY_16_TIMES eax, xmm0 - movdqa [edx+%1], xmm0 - COPY_16_TIMESS eax, xmm0, ecx - movdqa [edx+%1+0x10], xmm0 -%endmacro - -WELS_EXTERN WelsI16x16LumaPredH_sse2 -WelsI16x16LumaPredH_sse2: - mov edx, [esp+4] ; pred - mov eax, [esp+8] ; pRef - mov ecx, [esp+12] ; stride - - COPY_16_TIMES eax, xmm0 - movdqa [edx], xmm0 - COPY_16_TIMESS eax, xmm0, ecx - movdqa [edx+0x10], xmm0 - - SSE2_PRED_H_16X16_TWO_LINE 0x20 - SSE2_PRED_H_16X16_TWO_LINE 0x40 - SSE2_PRED_H_16X16_TWO_LINE 0x60 - SSE2_PRED_H_16X16_TWO_LINE 0x80 - SSE2_PRED_H_16X16_TWO_LINE 0xa0 - SSE2_PRED_H_16X16_TWO_LINE 0xc0 - SSE2_PRED_H_16X16_TWO_LINE 0xe0 - - ret - -;*********************************************************************** -; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); -;*********************************************************************** -WELS_EXTERN WelsI16x16LumaPredV_sse2 -WelsI16x16LumaPredV_sse2: - mov edx, [esp+4] ; pred - mov eax, [esp+8] ; pRef - mov ecx, [esp+12] ; stride - - sub eax, ecx - movdqa xmm0, [eax] - - movdqa [edx], xmm0 - movdqa [edx+10h], xmm0 - movdqa [edx+20h], xmm0 - movdqa [edx+30h], xmm0 - movdqa [edx+40h], xmm0 - movdqa [edx+50h], xmm0 - movdqa [edx+60h], xmm0 - movdqa [edx+70h], xmm0 - movdqa [edx+80h], xmm0 - movdqa [edx+90h], xmm0 - movdqa [edx+160], xmm0 - movdqa [edx+176], xmm0 - movdqa [edx+192], xmm0 - movdqa [edx+208], xmm0 - movdqa [edx+224], xmm0 - movdqa [edx+240], xmm0 - +;*! +;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. 
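
The two predictors in this file are simple: WelsI16x16LumaPredH_sse2 broadcasts each row's left neighbor across the row, and WelsI16x16LumaPredV_sse2 copies the row above into all sixteen rows of a contiguous 16x16 prediction buffer. A scalar sketch under those assumptions (the `_ref` names are illustrative, not from this source):

    #include <stdint.h>
    #include <string.h>

    /* pred is a contiguous 16x16 buffer; pRef points at the current block,
     * with reconstructed context one byte to the left / one stride above. */
    static void I16x16LumaPredH_ref(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
      for (int y = 0; y < 16; y++)         /* replicate the left neighbor */
        memset(pred + (y << 4), pRef[y * stride - 1], 16);
    }

    static void I16x16LumaPredV_ref(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
      for (int y = 0; y < 16; y++)         /* copy the row above into every row */
        memcpy(pred + (y << 4), pRef - stride, 16);
    }
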
+;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* intra_pred.asm +;* +;* Abstract +;* sse2 function for intra predict operations +;* +;* History +;* 18/09/2009 Created +;* +;* +;*************************************************************************/ +%include "../../src/asm/asm_inc.asm" + +BITS 32 +;*********************************************************************** +; Local Data (Read Only) +;*********************************************************************** + +%ifdef FORMAT_COFF +SECTION .rodata data +%else +SECTION .rodata align=16 +%endif + + +align 16 +mmx_01bytes: times 16 db 1 + +;*********************************************************************** +; macros +;*********************************************************************** +%macro COPY_16_TIMES 2 + movdqa %2, [%1-16] + psrldq %2, 15 + pmuludq %2, [mmx_01bytes] + pshufd %2, %2, 0 +%endmacro + +%macro COPY_16_TIMESS 3 + movdqa %2, [%1+%3-16] + psrldq %2, 15 + pmuludq %2, [mmx_01bytes] + pshufd %2, %2, 0 +%endmacro + +;*********************************************************************** +; Code +;*********************************************************************** + +SECTION .text + +;*********************************************************************** +; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); +;*********************************************************************** + +%macro SSE2_PRED_H_16X16_TWO_LINE 1 + lea eax, [eax+ecx*2] + + COPY_16_TIMES eax, xmm0 + movdqa [edx+%1], xmm0 + COPY_16_TIMESS eax, xmm0, ecx + movdqa [edx+%1+0x10], xmm0 +%endmacro + +WELS_EXTERN WelsI16x16LumaPredH_sse2 +WelsI16x16LumaPredH_sse2: + mov edx, [esp+4] ; pred + mov eax, [esp+8] ; pRef + mov ecx, [esp+12] ; stride + + COPY_16_TIMES eax, xmm0 + movdqa [edx], xmm0 + COPY_16_TIMESS eax, xmm0, ecx + movdqa [edx+0x10], xmm0 + + SSE2_PRED_H_16X16_TWO_LINE 0x20 + SSE2_PRED_H_16X16_TWO_LINE 0x40 + SSE2_PRED_H_16X16_TWO_LINE 0x60 + SSE2_PRED_H_16X16_TWO_LINE 0x80 + SSE2_PRED_H_16X16_TWO_LINE 0xa0 + SSE2_PRED_H_16X16_TWO_LINE 0xc0 + SSE2_PRED_H_16X16_TWO_LINE 0xe0 + + ret + +;*********************************************************************** +; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); +;*********************************************************************** +WELS_EXTERN WelsI16x16LumaPredV_sse2 +WelsI16x16LumaPredV_sse2: + mov edx, [esp+4] ; pred + mov eax, [esp+8] ; pRef + mov ecx, [esp+12] ; stride + + sub eax, ecx + movdqa xmm0, [eax] + + movdqa [edx], xmm0 + movdqa [edx+10h], xmm0 + movdqa [edx+20h], xmm0 + movdqa [edx+30h], xmm0 + movdqa [edx+40h], xmm0 + movdqa [edx+50h], xmm0 + movdqa [edx+60h], xmm0 + movdqa [edx+70h], xmm0 + movdqa [edx+80h], xmm0 + movdqa [edx+90h], xmm0 + movdqa [edx+160], 
xmm0 + movdqa [edx+176], xmm0 + movdqa [edx+192], xmm0 + movdqa [edx+208], xmm0 + movdqa [edx+224], xmm0 + movdqa [edx+240], xmm0 + ret \ No newline at end of file diff --git a/processing/src/asm/sad.asm b/processing/src/asm/sad.asm index 977e5e72..cb517323 100644 --- a/processing/src/asm/sad.asm +++ b/processing/src/asm/sad.asm @@ -1,79 +1,79 @@ -;*! -;* \copy -;* Copyright (c) 2009-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* -;* pixel_sse2.asm -;* -;* Abstract -;* WelsSampleSad8x8_sse21 -;* -;* History -;* 8/5/2009 Created -;* -;* -;*************************************************************************/ - -%include "asm_inc.asm" - -BITS 32 - -;*********************************************************************** -; Macros and other preprocessor constants -;*********************************************************************** - -%macro SAD_8x4 0 - movq xmm0, [eax] - movq xmm1, [eax+ebx] - lea eax, [eax+2*ebx] - movhps xmm0, [eax] - movhps xmm1, [eax+ebx] - - movq xmm2, [ecx] - movq xmm3, [ecx+edx] - lea ecx, [ecx+2*edx] - movhps xmm2, [ecx] - movhps xmm3, [ecx+edx] - psadbw xmm0, xmm2 - psadbw xmm1, xmm3 - paddw xmm6, xmm0 - paddw xmm6, xmm1 -%endmacro - - - -%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline -and %1, 0x1f|(%3>>1) -cmp %1, (32-%2)|(%3>>1) -%endmacro - - +;*! +;* \copy +;* Copyright (c) 2009-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* +;* pixel_sse2.asm +;* +;* Abstract +;* WelsSampleSad8x8_sse21 +;* +;* History +;* 8/5/2009 Created +;* +;* +;*************************************************************************/ + +%include "asm_inc.asm" + +BITS 32 + +;*********************************************************************** +; Macros and other preprocessor constants +;*********************************************************************** + +%macro SAD_8x4 0 + movq xmm0, [eax] + movq xmm1, [eax+ebx] + lea eax, [eax+2*ebx] + movhps xmm0, [eax] + movhps xmm1, [eax+ebx] + + movq xmm2, [ecx] + movq xmm3, [ecx+edx] + lea ecx, [ecx+2*edx] + movhps xmm2, [ecx] + movhps xmm3, [ecx+edx] + psadbw xmm0, xmm2 + psadbw xmm1, xmm3 + paddw xmm6, xmm0 + paddw xmm6, xmm1 +%endmacro + + + +%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline +and %1, 0x1f|(%3>>1) +cmp %1, (32-%2)|(%3>>1) +%endmacro + + %macro SSE2_GetSad8x4 0 movq xmm0, [eax] movq xmm1, [eax+ebx] @@ -90,12 +90,12 @@ cmp %1, (32-%2)|(%3>>1) psadbw xmm1, xmm3 paddw xmm6, xmm0 paddw xmm6, xmm1 -%endmacro +%endmacro -;*********************************************************************** -; Code -;*********************************************************************** +;*********************************************************************** +; Code +;*********************************************************************** SECTION .text WELS_EXTERN WelsSampleSad8x8_sse21 diff --git a/processing/src/asm/vaa.asm b/processing/src/asm/vaa.asm index 36b078ea..62fb2882 100644 --- a/processing/src/asm/vaa.asm +++ b/processing/src/asm/vaa.asm @@ -1,1589 +1,1589 @@ -;*! -;* \copy -;* Copyright (c) 2010-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* -;* vaa.asm -;* -;* Abstract -;* sse2 for pVaa routines -;* -;* History -;* 04/14/2010 Created -;* -;*************************************************************************/ -%include "asm_inc.asm" -BITS 32 - -;*********************************************************************** -; Macros and other preprocessor constants -;*********************************************************************** - -;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2 -; movdqa %1, %2 -; punpcklbw %1, %3 -; punpckhbw %2, %3 -; paddw %1, %2 -; pmaddwd %1, %4 -; pshufd %2, %1, 04Eh ; 01001110 B -; paddd %1, %2 -; pshufd %2, %1, 0B1h ; 10110001 B -; paddd %1, %2 -;%endmacro ; END OF SUM_SSE2 - -; by comparing it outperforms than phaddw(SSSE3) sets -%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp - ; @sum_8x2 begin - pshufd %2, %1, 04Eh ; 01001110 B - paddw %1, %2 - pshuflw %2, %1, 04Eh ; 01001110 B - paddw %1, %2 - pshuflw %2, %1, 0B1h ; 10110001 B - paddw %1, %2 - ; end of @sum_8x2 -%endmacro ; END of SUM_WORD_8x2_SSE2 - -%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero - movdqa %1, %2 - punpcklbw %1, %3 - punpckhbw %2, %3 - pmaddwd %1, %1 - pmaddwd %2, %2 - paddd %1, %2 - pshufd %2, %1, 04Eh ; 01001110 B - paddd %1, %2 - pshufd %2, %1, 0B1h ; 10110001 B - paddd %1, %2 -%endmacro ; END OF SUM_SQR_SSE2 - -%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 - movdqa %1, [esi ] ; line 0 - movdqa %2, [esi+ecx] ; line 1 - movdqa %3, %1 - punpcklbw %1, xmm7 - punpckhbw %3, xmm7 - movdqa %4, %2 - punpcklbw %4, xmm7 - punpckhbw %2, xmm7 - paddw %1, %4 - paddw %2, %3 - movdqa %3, [esi+ebx] ; line 2 - movdqa %4, [esi+edx] ; line 3 - movdqa %5, %3 - punpcklbw %3, xmm7 - punpckhbw %5, xmm7 - movdqa %6, %4 - punpcklbw %6, xmm7 - punpckhbw %4, xmm7 - paddw %3, %6 - paddw %4, %5 - paddw %1, %3 ; block 0, 1 - paddw %2, %4 ; block 2, 3 - pshufd %3, %1, 0B1h - pshufd %4, %2, 0B1h - paddw %1, %3 - paddw %2, %4 - movdqa %3, %1 - movdqa %4, %2 - pshuflw %5, %1, 0B1h - pshufhw %6, %3, 0B1h - paddw %1, %5 - paddw %3, %6 - pshuflw %5, %2, 0B1h - pshufhw %6, %4, 0B1h - paddw %2, %5 - paddw %4, %6 - punpcklwd %1, %2 - punpckhwd %3, %4 - punpcklwd %1, %3 - psraw %1, $4 -%endmacro - -%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 - movdqa %1, [esi ] ; line 0 - movdqa %2, [esi+ecx] ; line 1 - movdqa %3, %1 - punpcklbw %1, xmm7 - punpckhbw %3, xmm7 - movdqa %4, %2 - punpcklbw %4, xmm7 - punpckhbw %2, xmm7 - paddw %1, %4 - paddw %2, %3 - movdqa %3, [esi+ebx] ; line 2 - movdqa %4, [esi+edx] ; line 3 - movdqa %5, %3 - punpcklbw %3, xmm7 - punpckhbw %5, xmm7 - movdqa %6, %4 - punpcklbw %6, xmm7 - punpckhbw %4, xmm7 - paddw %3, %6 - paddw %4, %5 - paddw %1, %3 ; block 0, 1 - paddw %2, %4 ; block 2, 3 - phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; .. - phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; .... 
- psraw %1, $4 -%endmacro - -%macro WELS_SAD_16x2_SSE2 0 - movdqa xmm1, [esi] - movdqa xmm2, [edi] - movdqa xmm3, [esi+ebx] - movdqa xmm4, [edi+ebx] - psadbw xmm1, xmm2 - psadbw xmm3, xmm4 - paddd xmm6, xmm1 - paddd xmm6, xmm3 - lea esi, [esi+ebx*2] - lea edi, [edi+ebx*2] -%endmacro - -%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0 - movdqa xmm1, [esi] - movdqa xmm2, [edi] - movdqa xmm3, xmm1 - psadbw xmm3, xmm2 - paddd xmm6, xmm3 - - movdqa xmm3, xmm1 - psadbw xmm3, xmm0 - paddd xmm5, xmm3 - - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm4, xmm1 - paddd xmm4, xmm2 - - add esi, ebx - add edi, ebx -%endmacro - -%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0 - movdqa xmm1, [esi] - movdqa xmm2, [edi] - movdqa xmm3, xmm1 - psadbw xmm3, xmm2 - paddd xmm7, xmm3 ; sad - - movdqa xmm3, xmm1 - pmaxub xmm3, xmm2 - pminub xmm2, xmm1 - psubb xmm3, xmm2 ; diff - - movdqa xmm2, xmm1 - psadbw xmm2, xmm0 - paddd xmm6, xmm2 ; sum - - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm5, xmm1 - paddd xmm5, xmm2 ; sqsum - - movdqa xmm1, xmm3 - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd xmm4, xmm1 - paddd xmm4, xmm3 ; sqdiff - - add esi, ebx - add edi, ebx -%endmacro - -%macro WELS_SAD_SD_MAD_16x1_SSE2 4 -%define sad_reg %1 -%define sum_cur_reg %2 -%define sum_ref_reg %3 -%define mad_reg %4 - movdqa xmm1, [esi] - movdqa xmm2, [edi] - movdqa xmm3, xmm1 - psadbw xmm3, xmm0 - paddd sum_cur_reg, xmm3 ; sum_cur - movdqa xmm3, xmm2 - psadbw xmm3, xmm0 - paddd sum_ref_reg, xmm3 ; sum_ref - - movdqa xmm3, xmm1 - pmaxub xmm3, xmm2 - pminub xmm2, xmm1 - psubb xmm3, xmm2 ; abs diff - pmaxub mad_reg, xmm3 ; max abs diff - - psadbw xmm3, xmm0 - paddd sad_reg, xmm3 ; sad - - add esi, ebx - add edi, ebx -%endmacro - - -%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used -%define max_reg %1 - movdqa xmm1, max_reg - psrldq xmm1, 4 - pmaxub max_reg, xmm1 - movdqa xmm1, max_reg - psrldq xmm1, 2 - pmaxub max_reg, xmm1 - movdqa xmm1, max_reg - psrldq xmm1, 1 - pmaxub max_reg, xmm1 -%endmacro - -%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4 -%define sad_reg %1 -%define sum_reg %2 -%define mad_reg %3 -%define sqdiff_reg %4 - movdqa xmm1, [esi] - movdqa xmm2, xmm1 - movdqa xmm3, xmm1 - punpcklbw xmm2, xmm0 - punpckhbw xmm3, xmm0 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - paddd xmm2, xmm3 - movdqa xmm3, xmm2 - psllq xmm2, 32 - psrlq xmm3, 32 - psllq xmm3, 32 - paddd xmm2, xmm3 - paddd sad_reg, xmm2 ; sqsum - - movdqa xmm2, [edi] - movdqa xmm3, xmm1 - psadbw xmm3, xmm0 - paddd sum_reg, xmm3 ; sum_cur - movdqa xmm3, xmm2 - psadbw xmm3, xmm0 - pslldq xmm3, 4 - paddd sum_reg, xmm3 ; sum_ref - - movdqa xmm3, xmm1 - pmaxub xmm3, xmm2 - pminub xmm2, xmm1 - psubb xmm3, xmm2 ; abs diff - pmaxub mad_reg, xmm3 ; max abs diff - - movdqa xmm1, xmm3 - psadbw xmm3, xmm0 - paddd sad_reg, xmm3 ; sad - - movdqa xmm3, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd sqdiff_reg, xmm1 - paddd sqdiff_reg, xmm3 ; sqdiff - - add esi, ebx - add edi, ebx -%endmacro - - -;*********************************************************************** -; Local Data (Read Only) -;*********************************************************************** - -;SECTION .rodata align=16 - -;ALIGN 16 -;pack1_8x2: -; dw 1, 1, 1, 1, 1, 1, 1, 1 - -;*********************************************************************** -; Code 
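
All of the SAD-style macros here (SAD_8x4 in sad.asm above, WELS_SAD_16x2_SSE2 and its relatives in this file) accumulate the quantity psadbw computes, and the same accumulation underlies rc_sad_frame_sse2 and the VAACalc* routines later in the file. A scalar C sketch of it, with an illustrative function name:

    #include <stdint.h>

    /* Sum of absolute differences over a width x height block,
     * the scalar equivalent of repeated psadbw accumulation. */
    static uint32_t SadBlock_ref(const uint8_t *cur, int32_t curStride,
                                 const uint8_t *ref, int32_t refStride,
                                 int width, int height) {
      uint32_t sad = 0;
      for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
          int d = cur[x] - ref[x];
          sad += (uint32_t)(d < 0 ? -d : d);
        }
        cur += curStride;
        ref += refStride;
      }
      return sad;
    }
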
-;*********************************************************************** - -SECTION .text - -WELS_EXTERN rc_sad_frame_sse2 -;*********************************************************************** -; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride ); -;*********************************************************************** -ALIGN 16 -rc_sad_frame_sse2: - push esi - push edi - push ebp - push ebx - push edx - - mov esi, [esp+24] - mov edi, [esp+28] - mov ebx, [esp+32] - mov ecx, [esp+36] - mov edx, [esp+40] - pxor xmm0, xmm0 -.hloop: - mov eax, ebx - mov ebp, $0 -.wloop: - movdqa xmm1, [esi+ebp] - movdqa xmm2, [edi+ebp] - psadbw xmm1, xmm2 - pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float - paddd xmm1, xmm2 - paddd xmm0, xmm1 - add ebp, 010h - dec eax - jnz near .wloop - lea esi, [esi+edx] - lea edi, [edi+edx] - dec ecx - jnz near .hloop - - movd eax, xmm0 - pop edx - pop ebx - pop ebp - pop edi - pop esi - ret - - -WELS_EXTERN SampleVariance16x16_sse2 -;*********************************************************************** -; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture ); -;*********************************************************************** -ALIGN 16 -SampleVariance16x16_sse2: - push esi - push edi - push ebx - - sub esp, 16 - %define SUM [esp] - %define SUM_CUR [esp+4] - %define SQR [esp+8] - %define SQR_CUR [esp+12] - %define PUSH_SIZE 28 ; 12 + 16 - - mov edi, [esp+PUSH_SIZE+4] ; y_ref - mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride - mov esi, [esp+PUSH_SIZE+12] ; y_src - mov eax, [esp+PUSH_SIZE+16] ; y_src_stride - mov ecx, 010h ; height = 16 - - pxor xmm7, xmm7 - movdqu SUM, xmm7 - -.hloops: - movdqa xmm0, [edi] ; y_ref - movdqa xmm1, [esi] ; y_src - movdqa xmm2, xmm0 ; store first for future process - movdqa xmm3, xmm1 - ; sum += diff; - movdqa xmm4, xmm0 - psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] - ; to be continued for sum - pshufd xmm5, xmm4, 0C6h ; 11000110 B - paddw xmm4, xmm5 - movd ebx, xmm4 - add SUM, ebx - - ; sqr += diff * diff; - pmaxub xmm0, xmm1 - pminub xmm1, xmm2 - psubb xmm0, xmm1 ; diff - SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero - movd ebx, xmm1 - add SQR, ebx - - ; sum_cur += y_src[x]; - movdqa xmm0, xmm3 ; cur_orig - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 - punpckhbw xmm1, xmm7 - paddw xmm0, xmm1 ; 8x2 - SUM_WORD_8x2_SSE2 xmm0, xmm1 - movd ebx, xmm0 - and ebx, 0ffffh - add SUM_CUR, ebx - - ; sqr_cur += y_src[x] * y_src[x]; - SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero - movd ebx, xmm0 - add SQR_CUR, ebx - - lea edi, [edi+edx] - lea esi, [esi+eax] - dec ecx - jnz near .hloops - - mov ebx, 0 - mov bx, word SUM - sar ebx, 8 - imul ebx, ebx - mov ecx, SQR - sar ecx, 8 - sub ecx, ebx - mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture - mov [edi], cx ; to store uiMotionIndex - mov ebx, 0 - mov bx, word SUM_CUR - sar ebx, 8 - imul ebx, ebx - mov ecx, SQR_CUR - sar ecx, 8 - sub ecx, ebx - mov [edi+2], cx ; to store uiTextureIndex - - %undef SUM - %undef SUM_CUR - %undef SQR - %undef SQR_CUR - %undef PUSH_SIZE - - add esp, 16 - pop ebx - pop edi - pop esi - - ret - -; , 6/7/2010 - -%ifndef NO_DYNAMIC_VP -WELS_EXTERN AnalysisVaaInfoIntra_sse2 -;*********************************************************************** -; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t linesize ); -;*********************************************************************** 
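
SampleVariance16x16_sse2 reduces each accumulated quantity to a variance via E[x^2] - E[x]^2, with >>8 standing in for division by the 256 samples of the macroblock. A scalar model of that computation, assuming the SMotionTextureUnit layout implied by the two 16-bit stores above and ignoring the asm's word truncations (the `_ref` name is illustrative):

    #include <stdint.h>

    typedef struct { uint16_t uiMotionIndex, uiTextureIndex; } SMotionTextureUnit;

    static void SampleVariance16x16_ref(const uint8_t *y_ref, int32_t y_ref_stride,
                                        const uint8_t *y_src, int32_t y_src_stride,
                                        SMotionTextureUnit *pMotionTexture) {
      uint32_t sum_d = 0, sqr_d = 0, sum_c = 0, sqr_c = 0;
      for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
          int d = y_ref[x] - y_src[x];
          if (d < 0) d = -d;
          sum_d += (uint32_t)d;                  /* motion: |ref - src| statistics */
          sqr_d += (uint32_t)(d * d);
          sum_c += y_src[x];                     /* texture: source-pixel statistics */
          sqr_c += (uint32_t)y_src[x] * y_src[x];
        }
        y_ref += y_ref_stride;
        y_src += y_src_stride;
      }
      pMotionTexture->uiMotionIndex  = (uint16_t)((sqr_d >> 8) - (sum_d >> 8) * (sum_d >> 8));
      pMotionTexture->uiTextureIndex = (uint16_t)((sqr_c >> 8) - (sum_c >> 8) * (sum_c >> 8));
    }
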
-ALIGN 16 -AnalysisVaaInfoIntra_sse2: - push ebx - push edx - push esi - push edi - push ebp - - mov ebp, esp - and ebp, 0fh - sub esp, ebp - sub esp, 32 - %define PUSH_SIZE 52 ; 20 + 32 - - mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y - mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize - - mov ebx, ecx - sal ebx, $1 ; linesize x 2 [ebx] - mov edx, ebx - add edx, ecx ; linesize x 3 [edx] - mov eax, ebx - sal eax, $1 ; linesize x 4 [eax] - - pxor xmm7, xmm7 - - ; loops - VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [esp], xmm0 - - lea esi, [esi+eax] - VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [esp+8], xmm0 - - lea esi, [esi+eax] - VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [esp+16], xmm0 - - lea esi, [esi+eax] - VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [esp+24], xmm0 - - movdqa xmm0, [esp] ; block 0~7 - movdqa xmm1, [esp+16] ; block 8~15 - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - SUM_WORD_8x2_SSE2 xmm0, xmm3 - - pmullw xmm1, xmm1 - pmullw xmm2, xmm2 - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - punpcklwd xmm1, xmm7 - punpckhwd xmm3, xmm7 - punpcklwd xmm2, xmm7 - punpckhwd xmm4, xmm7 - paddd xmm1, xmm2 - paddd xmm3, xmm4 - paddd xmm1, xmm3 - pshufd xmm2, xmm1, 01Bh - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0B1h - paddd xmm1, xmm2 - - movd ebx, xmm0 - and ebx, 0ffffh ; effective low word truncated - mov ecx, ebx - imul ebx, ecx - sar ebx, $4 - movd eax, xmm1 - sub eax, ebx - - %undef PUSH_SIZE - add esp, 32 - add esp, ebp - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret - -WELS_EXTERN AnalysisVaaInfoIntra_ssse3 -;*********************************************************************** -; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t linesize ); -;*********************************************************************** -ALIGN 16 -AnalysisVaaInfoIntra_ssse3: - push ebx - push edx - push esi - push edi - push ebp - - mov ebp, esp - and ebp, 0fh - sub esp, ebp - sub esp, 32 - %define PUSH_SIZE 52 ; 20 + 32 - - mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y - mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize - - mov ebx, ecx - sal ebx, $1 ; linesize x 2 [ebx] - mov edx, ebx - add edx, ecx ; linesize x 3 [edx] - mov eax, ebx - sal eax, $1 ; linesize x 4 [eax] - - pxor xmm7, xmm7 - - ; loops - VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [esp], xmm0 - - lea esi, [esi+eax] - VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 - movq [esp+8], xmm1 - - lea esi, [esi+eax] - VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [esp+16], xmm0 - - lea esi, [esi+eax] - VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 - movq [esp+24], xmm1 - - movdqa xmm0, [esp] ; block 0~7 - movdqa xmm1, [esp+16] ; block 8~15 - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets - - pmullw xmm1, xmm1 - pmullw xmm2, xmm2 - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - punpcklwd xmm1, xmm7 - punpckhwd xmm3, xmm7 - punpcklwd xmm2, xmm7 - punpckhwd xmm4, xmm7 - paddd xmm1, xmm2 - paddd xmm3, xmm4 - paddd xmm1, xmm3 - pshufd xmm2, xmm1, 01Bh - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0B1h - paddd xmm1, xmm2 - - movd ebx, xmm0 - and ebx, 0ffffh ; effective low work truncated - mov ecx, ebx - imul ebx, ecx - sar ebx, $4 - movd eax, xmm1 - sub eax, ebx - - %undef PUSH_SIZE - add esp, 32 - add esp, ebp - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret -%endif - - - -WELS_EXTERN abs_difference_mbrow_sse2 
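
AnalysisVaaInfoIntra_sse2/_ssse3 average each 4x4 sub-block of the 16x16 region (the VAA_AVG_BLOCK macros), then return sum(a_i^2) - (sum a_i)^2 / 16, i.e. sixteen times the variance of the sixteen block means. A scalar sketch with an illustrative name:

    #include <stdint.h>

    static int32_t AnalysisVaaInfoIntra_ref(const uint8_t *pDataY, int32_t linesize) {
      int32_t avg[16];
      for (int by = 0; by < 4; by++) {
        for (int bx = 0; bx < 4; bx++) {
          int32_t s = 0;
          const uint8_t *p = pDataY + (by * 4) * linesize + bx * 4;
          for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
              s += p[y * linesize + x];
          avg[by * 4 + bx] = s >> 4;       /* 4x4 block mean */
        }
      }
      int32_t sum = 0, sqr = 0;
      for (int i = 0; i < 16; i++) {
        sum += avg[i];
        sqr += avg[i] * avg[i];
      }
      return sqr - ((sum * sum) >> 4);     /* 16 * variance of the block means */
    }
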
-;************************************************************************************************************* -;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride, -; int32_t gom_pixel_num, int32_t *pSum) -;************************************************************************************************************* -ALIGN 16 -abs_difference_mbrow_sse2: -%define ref_orig esp + pushsize + 4 -%define cur_orig esp + pushsize + 8 -%define iPicStride esp + pushsize + 12 -%define gom_pixel_num esp + pushsize + 16 -%define pSum esp + pushsize + 20 -%define pushsize 12 - push esi - push edi - push ebx - mov esi, [ref_orig] - mov edi, [cur_orig] - mov ebx, [iPicStride] - mov eax, [gom_pixel_num] - mov ecx, 16 ;MB_WIDTH_LUMA - pxor xmm0, xmm0 -mb_width_loop_p: - mov edx, esi - add edx, eax ; end address -gom_row_loop_p: - movdqa xmm1, [esi] - movdqa xmm2, [edi] - psadbw xmm1, xmm2 - paddd xmm0, xmm1 - add esi, 16 - add edi, 16 - cmp esi, edx - jl gom_row_loop_p - - sub esi, eax - sub edi, eax - add esi, ebx - add edi, ebx - loop mb_width_loop_p - - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddd xmm1, xmm0 - movd eax, xmm1 - mov edx, [pSum] ; pSum - add [edx], eax - -%undef ref_orig -%undef cur_orig -%undef iPicStride -%undef gom_pixel_num -%undef pSum -%undef pushsize - pop ebx - pop edi - pop esi - ret - - - - -WELS_EXTERN sum_sqrsum_mbrow_sse2 -;************************************************************************************************************* -;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride, -; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum) -;************************************************************************************************************* -ALIGN 16 -sum_sqrsum_mbrow_sse2: -%define cur_orig esp + pushsize + 4 -%define iPicStride esp + pushsize + 8 -%define gom_pixel_num esp + pushsize + 12 -%define pSum esp + pushsize + 16 -%define pSqrSum esp + pushsize + 20 -%define pushsize 8 - push esi - push ebx - mov esi, [cur_orig] - mov eax, [gom_pixel_num] - mov ebx, [iPicStride] - mov ecx, 16 ;MB_WIDTH_LUMA - pxor xmm0, xmm0 ; zero - pxor xmm1, xmm1 ; sum - pxor xmm2, xmm2 ; sqr sum -mb_width_loop_i: - mov edx, esi - add edx, eax ; end address -gom_row_loop_i: - movdqa xmm3, [esi] - movdqa xmm4, xmm3 - psadbw xmm4, xmm0 - paddd xmm1, xmm4 - movdqa xmm4, xmm3 - punpcklbw xmm4, xmm0 - punpckhbw xmm3, xmm0 - pmaddwd xmm4, xmm4 - pmaddwd xmm3, xmm3 - paddd xmm2, xmm3 - paddd xmm2, xmm4 - add esi, 16 - cmp esi, edx - jl gom_row_loop_i - - sub esi, eax - add esi, ebx - loop mb_width_loop_i - - movdqa xmm3, xmm1 - psrldq xmm3, 8 - paddd xmm1, xmm3 - movd eax, xmm1 - mov edx, [pSum] - add [edx], eax - - movdqa xmm3, xmm2 - psrldq xmm3, 8 - paddd xmm2, xmm3 - movdqa xmm3, xmm2 - psrldq xmm3, 4 - paddd xmm2, xmm3 - movd eax, xmm2 - mov edx, [pSqrSum] - add [edx], eax - - -%undef cur_orig -%undef iPicStride -%undef gom_pixel_num -%undef pSum -%undef pSqrSum -%undef pushsize - pop ebx - pop esi - ret - - - -WELS_EXTERN VAACalcSad_sse2 -;************************************************************************************************************* -;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight -; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) -;************************************************************************************************************* - - -ALIGN 16 -VAACalcSad_sse2: -%define cur_data esp + pushsize + 4 -%define ref_data esp + pushsize + 8 -%define 
iPicWidth esp + pushsize + 12 -%define iPicHeight esp + pushsize + 16 -%define iPicStride esp + pushsize + 20 -%define psadframe esp + pushsize + 24 -%define psad8x8 esp + pushsize + 28 -%define pushsize 12 - push esi - push edi - push ebx - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov edx, [psad8x8] - mov eax, ebx - - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - pxor xmm0, xmm0 - pxor xmm7, xmm7 ; iFrameSad -height_loop: - mov ecx, dword [iPicWidth] - push esi - push edi -width_loop: - pxor xmm6, xmm6 ; - WELS_SAD_16x2_SSE2 - WELS_SAD_16x2_SSE2 - WELS_SAD_16x2_SSE2 - WELS_SAD_16x2_SSE2 - paddd xmm7, xmm6 - movd [edx], xmm6 - psrldq xmm6, 8 - movd [edx+4], xmm6 - - pxor xmm6, xmm6 - WELS_SAD_16x2_SSE2 - WELS_SAD_16x2_SSE2 - WELS_SAD_16x2_SSE2 - WELS_SAD_16x2_SSE2 - paddd xmm7, xmm6 - movd [edx+8], xmm6 - psrldq xmm6, 8 - movd [edx+12], xmm6 - - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 - - dec ecx - jnz width_loop - - pop edi - pop esi - add esi, eax - add edi, eax - - dec dword [iPicHeight] - jnz height_loop - - mov edx, [psadframe] - movdqa xmm5, xmm7 - psrldq xmm7, 8 - paddd xmm7, xmm5 - movd [edx], xmm7 - -%undef cur_data -%undef ref_data -%undef iPicWidth -%undef iPicHeight -%undef iPicStride -%undef psadframe -%undef psad8x8 -%undef pushsize - pop ebx - pop edi - pop esi - ret - - -WELS_EXTERN VAACalcSadVar_sse2 -;************************************************************************************************************* -;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight -; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16) -;************************************************************************************************************* - - -ALIGN 16 -VAACalcSadVar_sse2: -%define localsize 8 -%define cur_data esp + pushsize + localsize + 4 -%define ref_data esp + pushsize + localsize + 8 -%define iPicWidth esp + pushsize + localsize + 12 -%define iPicHeight esp + pushsize + localsize + 16 -%define iPicStride esp + pushsize + localsize + 20 -%define psadframe esp + pushsize + localsize + 24 -%define psad8x8 esp + pushsize + localsize + 28 -%define psum16x16 esp + pushsize + localsize + 32 -%define psqsum16x16 esp + pushsize + localsize + 36 -%define tmp_esi esp + 0 -%define tmp_edi esp + 4 -%define pushsize 16 - push ebp - push esi - push edi - push ebx - sub esp, localsize - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov edx, [psad8x8] - mov eax, ebx - - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - pxor xmm0, xmm0 - pxor xmm7, xmm7 ; iFrameSad -var_height_loop: - mov ecx, dword [iPicWidth] - mov [tmp_esi], esi - mov [tmp_edi], edi -var_width_loop: - pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 - pxor xmm5, xmm5 ; pSum16x16 - pxor xmm4, xmm4 ; sqsum_16x16 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - paddd xmm7, xmm6 - movd [edx], xmm6 - psrldq xmm6, 8 - movd [edx+4], xmm6 - - pxor xmm6, xmm6 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - 
WELS_SAD_SUM_SQSUM_16x1_SSE2 - WELS_SAD_SUM_SQSUM_16x1_SSE2 - paddd xmm7, xmm6 - movd [edx+8], xmm6 - psrldq xmm6, 8 - movd [edx+12], xmm6 - - mov ebp, [psum16x16] - movdqa xmm1, xmm5 - psrldq xmm1, 8 - paddd xmm5, xmm1 - movd [ebp], xmm5 - add dword [psum16x16], 4 - - movdqa xmm5, xmm4 - psrldq xmm5, 8 - paddd xmm4, xmm5 - movdqa xmm3, xmm4 - psrldq xmm3, 4 - paddd xmm4, xmm3 - - mov ebp, [psqsum16x16] - movd [ebp], xmm4 - add dword [psqsum16x16], 4 - - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 - - dec ecx - jnz var_width_loop - - mov esi, [tmp_esi] - mov edi, [tmp_edi] - add esi, eax - add edi, eax - - dec dword [iPicHeight] - jnz var_height_loop - - mov edx, [psadframe] - movdqa xmm5, xmm7 - psrldq xmm7, 8 - paddd xmm7, xmm5 - movd [edx], xmm7 - - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp -%undef cur_data -%undef ref_data -%undef iPicWidth -%undef iPicHeight -%undef iPicStride -%undef psadframe -%undef psad8x8 -%undef psum16x16 -%undef psqsum16x16 -%undef tmp_esi -%undef tmp_edi -%undef pushsize -%undef localsize - ret - - - -WELS_EXTERN VAACalcSadSsd_sse2 -;************************************************************************************************************* -;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, -; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16) -;************************************************************************************************************* - - -ALIGN 16 -VAACalcSadSsd_sse2: -%define localsize 12 -%define cur_data esp + pushsize + localsize + 4 -%define ref_data esp + pushsize + localsize + 8 -%define iPicWidth esp + pushsize + localsize + 12 -%define iPicHeight esp + pushsize + localsize + 16 -%define iPicStride esp + pushsize + localsize + 20 -%define psadframe esp + pushsize + localsize + 24 -%define psad8x8 esp + pushsize + localsize + 28 -%define psum16x16 esp + pushsize + localsize + 32 -%define psqsum16x16 esp + pushsize + localsize + 36 -%define psqdiff16x16 esp + pushsize + localsize + 40 -%define tmp_esi esp + 0 -%define tmp_edi esp + 4 -%define tmp_sadframe esp + 8 -%define pushsize 16 - push ebp - push esi - push edi - push ebx - sub esp, localsize - mov ecx, [iPicWidth] - mov ecx, [iPicHeight] - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov edx, [psad8x8] - mov eax, ebx - - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - mov ecx, [iPicWidth] - mov ecx, [iPicHeight] - pxor xmm0, xmm0 - movd [tmp_sadframe], xmm0 -sqdiff_height_loop: - mov ecx, dword [iPicWidth] - mov [tmp_esi], esi - mov [tmp_edi], edi -sqdiff_width_loop: - pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 - pxor xmm6, xmm6 ; pSum16x16 - pxor xmm5, xmm5 ; sqsum_16x16 four dword - pxor xmm4, xmm4 ; sqdiff_16x16 four Dword - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - movdqa xmm1, xmm7 - movd [edx], xmm7 - psrldq xmm7, 8 - paddd xmm1, xmm7 - movd [edx+4], xmm7 - movd ebp, xmm1 - add [tmp_sadframe], ebp - - pxor xmm7, xmm7 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 - movdqa xmm1, xmm7 - movd [edx+8], xmm7 - psrldq xmm7, 8 - paddd xmm1, xmm7 - movd [edx+12], xmm7 - movd ebp, xmm1 - add [tmp_sadframe], ebp - - mov ebp, [psum16x16] - movdqa xmm1, xmm6 - psrldq xmm1, 8 - paddd xmm6, xmm1 - movd [ebp], xmm6 - add dword [psum16x16], 4 - - mov ebp, [psqsum16x16] - pshufd xmm6, xmm5, 14 ;00001110 - paddd xmm6, xmm5 - pshufd xmm5, xmm6, 1 ;00000001 - paddd xmm5, xmm6 - movd [ebp], xmm5 - add dword [psqsum16x16], 4 - - mov ebp, [psqdiff16x16] - pshufd xmm5, xmm4, 14 ; 00001110 - paddd xmm5, xmm4 - pshufd xmm4, xmm5, 1 ; 00000001 - paddd xmm4, xmm5 - movd [ebp], xmm4 - add dword [psqdiff16x16], 4 - - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 - - dec ecx - jnz sqdiff_width_loop - - mov esi, [tmp_esi] - mov edi, [tmp_edi] - add esi, eax - add edi, eax - - dec dword [iPicHeight] - jnz sqdiff_height_loop - - mov ebx, [tmp_sadframe] - mov eax, [psadframe] - mov [eax], ebx - - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp -%undef cur_data -%undef ref_data -%undef iPicWidth -%undef iPicHeight -%undef iPicStride -%undef psadframe -%undef psad8x8 -%undef psum16x16 -%undef psqsum16x16 -%undef psqdiff16x16 -%undef tmp_esi -%undef tmp_edi -%undef tmp_sadframe -%undef pushsize -%undef localsize - ret - - - - - -WELS_EXTERN VAACalcSadBgd_sse2 -;************************************************************************************************************* -;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, -; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8) -;************************************************************************************************************* - - -ALIGN 16 -VAACalcSadBgd_sse2: -%define localsize 12 -%define cur_data esp + pushsize + localsize + 4 -%define ref_data esp + pushsize + localsize + 8 -%define iPicWidth esp + pushsize + localsize + 12 -%define iPicHeight esp + pushsize + localsize + 16 -%define iPicStride esp + pushsize + localsize + 20 -%define psadframe esp + pushsize + localsize + 24 -%define psad8x8 esp + pushsize + localsize + 28 -%define p_sd8x8 esp + pushsize + localsize + 32 -%define p_mad8x8 esp + pushsize + localsize + 36 -%define tmp_esi esp + 0 -%define tmp_edi esp + 4 -%define tmp_ecx esp + 8 -%define pushsize 16 - push ebp - push esi - push edi - push ebx - sub esp, localsize - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov eax, ebx - - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - xor ebp, ebp - pxor xmm0, xmm0 -bgd_height_loop: - mov ecx, dword [iPicWidth] - mov [tmp_esi], esi - mov [tmp_edi], edi -bgd_width_loop: - pxor xmm7, xmm7 ; pSad8x8 - pxor xmm6, xmm6 ; sum_cur_8x8 - pxor xmm5, xmm5 ; sum_ref_8x8 - pxor xmm4, xmm4 ; pMad8x8 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - - - mov edx, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm4 - - ;movdqa xmm1, xmm4 - ;punpcklbw xmm1, xmm0 - ;punpcklwd xmm1, xmm0 
- ;movd [edx], xmm1 - ;punpckhbw xmm4, xmm0 - ;punpcklwd xmm4, xmm0 - ;movd [edx+4], xmm4 - ;add edx, 8 - ;mov [p_mad8x8], edx - mov [tmp_ecx], ecx - movhlps xmm1, xmm4 - movd ecx, xmm4 - mov [edx], cl - movd ecx, xmm1 - mov [edx+1],cl - add edx, 2 - mov [p_mad8x8], edx - - - pslldq xmm7, 4 - pslldq xmm6, 4 - pslldq xmm5, 4 - - - pxor xmm4, xmm4 ; pMad8x8 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - - mov edx, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm4 - - ;movdqa xmm1, xmm4 - ;punpcklbw xmm1, xmm0 - ;punpcklwd xmm1, xmm0 - ;movd [edx], xmm1 - ;punpckhbw xmm4, xmm0 - ;punpcklwd xmm4, xmm0 - ;movd [edx+4], xmm4 - ;add edx, 8 - ;mov [p_mad8x8], edx - movhlps xmm1, xmm4 - movd ecx, xmm4 - mov [edx], cl - movd ecx, xmm1 - mov [edx+1],cl - add edx, 2 - mov [p_mad8x8], edx - - ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 - - mov edx, [psad8x8] - pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 - movdqa [edx], xmm1 - add edx, 16 - mov [psad8x8], edx ; sad8x8 - - paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 - pshufd xmm2, xmm1, 00000011b - paddd xmm1, xmm2 - movd edx, xmm1 - add ebp, edx ; sad frame - - mov edx, [p_sd8x8] - psubd xmm6, xmm5 - pshufd xmm1, xmm6, 10001101b - movdqa [edx], xmm1 - add edx, 16 - mov [p_sd8x8], edx - - - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 - - mov ecx, [tmp_ecx] - dec ecx - jnz bgd_width_loop - - mov esi, [tmp_esi] - mov edi, [tmp_edi] - add esi, eax - add edi, eax - - dec dword [iPicHeight] - jnz bgd_height_loop - - mov edx, [psadframe] - mov [edx], ebp - - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp -%undef cur_data -%undef ref_data -%undef iPicWidth -%undef iPicHeight -%undef iPicStride -%undef psadframe -%undef psad8x8 -%undef p_sd8x8 -%undef p_mad8x8 -%undef tmp_esi -%undef tmp_edi -%undef pushsize -%undef localsize - ret - - - -WELS_EXTERN VAACalcSadSsdBgd_sse2 -;************************************************************************************************************* -;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, -; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, -; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8) -;************************************************************************************************************* - - -ALIGN 16 -VAACalcSadSsdBgd_sse2: -%define localsize 16 -%define cur_data esp + pushsize + localsize + 4 -%define ref_data esp + pushsize + localsize + 8 -%define iPicWidth esp + pushsize + localsize + 12 -%define iPicHeight esp + pushsize + localsize + 16 -%define iPicStride esp + pushsize + localsize + 20 -%define psadframe esp + pushsize + localsize + 24 -%define psad8x8 esp + pushsize + localsize + 28 -%define psum16x16 esp + pushsize + localsize + 32 -%define psqsum16x16 esp + pushsize + localsize + 36 -%define psqdiff16x16 esp + pushsize + localsize + 40 -%define p_sd8x8 esp + pushsize + localsize + 44 -%define p_mad8x8 esp + pushsize + localsize + 48 -%define tmp_esi esp + 0 -%define tmp_edi esp + 4 -%define tmp_sadframe esp + 8 -%define tmp_ecx esp + 12 -%define pushsize 16 - push ebp - push esi - push edi - push ebx - sub 
esp, localsize - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov eax, ebx - - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - pxor xmm0, xmm0 - movd [tmp_sadframe], xmm0 -sqdiff_bgd_height_loop: - mov ecx, dword [iPicWidth] - mov [tmp_esi], esi - mov [tmp_edi], edi -sqdiff_bgd_width_loop: - pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 - pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 - pxor xmm5, xmm5 ; pMad8x8 - pxor xmm4, xmm4 ; sqdiff_16x16 four Dword - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - - mov edx, [psad8x8] - movdqa xmm2, xmm7 - pshufd xmm1, xmm2, 00001110b - movd [edx], xmm2 - movd [edx+4], xmm1 - add edx, 8 - mov [psad8x8], edx ; sad8x8 - - paddd xmm1, xmm2 - movd edx, xmm1 - add [tmp_sadframe], edx ; iFrameSad - - mov edx, [psum16x16] - movdqa xmm1, xmm6 - pshufd xmm2, xmm1, 00001110b - paddd xmm1, xmm2 - movd [edx], xmm1 ; sum - - mov edx, [p_sd8x8] - pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 - psubd xmm6, xmm1 ; 00 diff1 00 diff0 - pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 - movq [edx], xmm1 - add edx, 8 - mov [p_sd8x8], edx - - mov edx, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm5 - ;movdqa xmm1, xmm5 - ;punpcklbw xmm1, xmm0 - ;punpcklwd xmm1, xmm0 - ;movd [edx], xmm1 - ;punpckhbw xmm5, xmm0 - ;punpcklwd xmm5, xmm0 - ;movd [edx+4], xmm5 - ;add edx, 8 - ;mov [p_mad8x8], edx - mov [tmp_ecx], ecx - movhlps xmm1, xmm5 - movd ecx, xmm5 - mov [edx], cl - movd ecx, xmm1 - mov [edx+1],cl - add edx, 2 - mov [p_mad8x8], edx - - psrlq xmm7, 32 - psllq xmm7, 32 ; clear sad - pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 - pxor xmm5, xmm5 ; pMad8x8 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 - - mov edx, [psad8x8] - movdqa xmm2, xmm7 - pshufd xmm1, xmm2, 00001110b - movd [edx], xmm2 - movd [edx+4], xmm1 - add edx, 8 - mov [psad8x8], edx ; sad8x8 - - paddd xmm1, xmm2 - movd edx, xmm1 - add [tmp_sadframe], edx ; iFrameSad - - mov edx, [psum16x16] - movdqa xmm1, xmm6 - pshufd xmm2, xmm1, 00001110b - paddd xmm1, xmm2 - movd ebp, xmm1 ; sum - add [edx], ebp - add edx, 4 - mov [psum16x16], edx - - mov edx, [psqsum16x16] - psrlq xmm7, 32 - pshufd xmm2, xmm7, 00001110b - paddd xmm2, xmm7 - movd [edx], xmm2 ; sqsum - add edx, 4 - mov [psqsum16x16], edx - - mov edx, [p_sd8x8] - pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 - psubd xmm6, xmm1 ; 00 diff1 00 diff0 - pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 - movq [edx], xmm1 - add edx, 8 - mov [p_sd8x8], edx - - mov edx, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm5 - ;movdqa xmm1, xmm5 - ;punpcklbw xmm1, xmm0 - ;punpcklwd xmm1, xmm0 - ;movd [edx], xmm1 - 
;punpckhbw xmm5, xmm0 - ;punpcklwd xmm5, xmm0 - ;movd [edx+4], xmm5 - ;add edx, 8 - ;mov [p_mad8x8], edx - movhlps xmm1, xmm5 - movd ecx, xmm5 - mov [edx], cl - movd ecx, xmm1 - mov [edx+1],cl - add edx, 2 - mov [p_mad8x8], edx - - mov edx, [psqdiff16x16] - pshufd xmm1, xmm4, 00001110b - paddd xmm4, xmm1 - pshufd xmm1, xmm4, 00000001b - paddd xmm4, xmm1 - movd [edx], xmm4 - add edx, 4 - mov [psqdiff16x16], edx - - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 - - mov ecx, [tmp_ecx] - dec ecx - jnz sqdiff_bgd_width_loop - - mov esi, [tmp_esi] - mov edi, [tmp_edi] - add esi, eax - add edi, eax - - dec dword [iPicHeight] - jnz sqdiff_bgd_height_loop - - mov edx, [psadframe] - mov ebp, [tmp_sadframe] - mov [edx], ebp - - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp -%undef cur_data -%undef ref_data -%undef iPicWidth -%undef iPicHeight -%undef iPicStride -%undef psadframe -%undef psad8x8 -%undef psum16x16 -%undef psqsum16x16 -%undef psqdiff16x16 -%undef p_sd8x8 -%undef p_mad8x8 -%undef tmp_esi -%undef tmp_edi -%undef pushsize -%undef localsize - ret +;*! +;* \copy +;* Copyright (c) 2010-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. +;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. 
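
The background-detection routines in this file (VAACalcSadBgd_sse2, VAACalcSadSsdBgd_sse2) emit three per-8x8 statistics: the SAD, the signed sum difference cur - ref, and the maximum absolute pixel difference. A scalar sketch for one 8x8 block, assuming a single shared stride as the frame loops above use (the `_ref` name is illustrative):

    #include <stdint.h>

    static void BlockSadSdMad8x8_ref(const uint8_t *cur, const uint8_t *ref,
                                     int32_t stride, int32_t *pSad,
                                     int32_t *pSd, uint8_t *pMad) {
      int32_t sad = 0, sum_cur = 0, sum_ref = 0;
      uint8_t mad = 0;
      for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
          int d = cur[x] - ref[x];
          int ad = d < 0 ? -d : d;
          sad += ad;                          /* psadbw accumulation */
          sum_cur += cur[x];
          sum_ref += ref[x];
          if (ad > mad) mad = (uint8_t)ad;    /* pmaxub over abs diffs */
        }
        cur += stride;
        ref += stride;
      }
      *pSad = sad;
      *pSd  = sum_cur - sum_ref;              /* signed difference for background detection */
      *pMad = mad;
    }
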
+;* +;* +;* vaa.asm +;* +;* Abstract +;* sse2 for pVaa routines +;* +;* History +;* 04/14/2010 Created +;* +;*************************************************************************/ +%include "asm_inc.asm" +BITS 32 + +;*********************************************************************** +; Macros and other preprocessor constants +;*********************************************************************** + +;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2 +; movdqa %1, %2 +; punpcklbw %1, %3 +; punpckhbw %2, %3 +; paddw %1, %2 +; pmaddwd %1, %4 +; pshufd %2, %1, 04Eh ; 01001110 B +; paddd %1, %2 +; pshufd %2, %1, 0B1h ; 10110001 B +; paddd %1, %2 +;%endmacro ; END OF SUM_SSE2 + +; by comparing it outperforms than phaddw(SSSE3) sets +%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp + ; @sum_8x2 begin + pshufd %2, %1, 04Eh ; 01001110 B + paddw %1, %2 + pshuflw %2, %1, 04Eh ; 01001110 B + paddw %1, %2 + pshuflw %2, %1, 0B1h ; 10110001 B + paddw %1, %2 + ; end of @sum_8x2 +%endmacro ; END of SUM_WORD_8x2_SSE2 + +%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero + movdqa %1, %2 + punpcklbw %1, %3 + punpckhbw %2, %3 + pmaddwd %1, %1 + pmaddwd %2, %2 + paddd %1, %2 + pshufd %2, %1, 04Eh ; 01001110 B + paddd %1, %2 + pshufd %2, %1, 0B1h ; 10110001 B + paddd %1, %2 +%endmacro ; END OF SUM_SQR_SSE2 + +%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 + movdqa %1, [esi ] ; line 0 + movdqa %2, [esi+ecx] ; line 1 + movdqa %3, %1 + punpcklbw %1, xmm7 + punpckhbw %3, xmm7 + movdqa %4, %2 + punpcklbw %4, xmm7 + punpckhbw %2, xmm7 + paddw %1, %4 + paddw %2, %3 + movdqa %3, [esi+ebx] ; line 2 + movdqa %4, [esi+edx] ; line 3 + movdqa %5, %3 + punpcklbw %3, xmm7 + punpckhbw %5, xmm7 + movdqa %6, %4 + punpcklbw %6, xmm7 + punpckhbw %4, xmm7 + paddw %3, %6 + paddw %4, %5 + paddw %1, %3 ; block 0, 1 + paddw %2, %4 ; block 2, 3 + pshufd %3, %1, 0B1h + pshufd %4, %2, 0B1h + paddw %1, %3 + paddw %2, %4 + movdqa %3, %1 + movdqa %4, %2 + pshuflw %5, %1, 0B1h + pshufhw %6, %3, 0B1h + paddw %1, %5 + paddw %3, %6 + pshuflw %5, %2, 0B1h + pshufhw %6, %4, 0B1h + paddw %2, %5 + paddw %4, %6 + punpcklwd %1, %2 + punpckhwd %3, %4 + punpcklwd %1, %3 + psraw %1, $4 +%endmacro + +%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 + movdqa %1, [esi ] ; line 0 + movdqa %2, [esi+ecx] ; line 1 + movdqa %3, %1 + punpcklbw %1, xmm7 + punpckhbw %3, xmm7 + movdqa %4, %2 + punpcklbw %4, xmm7 + punpckhbw %2, xmm7 + paddw %1, %4 + paddw %2, %3 + movdqa %3, [esi+ebx] ; line 2 + movdqa %4, [esi+edx] ; line 3 + movdqa %5, %3 + punpcklbw %3, xmm7 + punpckhbw %5, xmm7 + movdqa %6, %4 + punpcklbw %6, xmm7 + punpckhbw %4, xmm7 + paddw %3, %6 + paddw %4, %5 + paddw %1, %3 ; block 0, 1 + paddw %2, %4 ; block 2, 3 + phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; .. + phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; .... 
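+ ; note: the two phaddw passes above fold the per-column word sums (each
+ ; already summed over the four rows) into one 16-pixel sum per 4x4
+ ; sub-block; the psraw below then divides each block sum by 16 to yield
+ ; the per-block average.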
+ psraw %1, $4 +%endmacro + +%macro WELS_SAD_16x2_SSE2 0 + movdqa xmm1, [esi] + movdqa xmm2, [edi] + movdqa xmm3, [esi+ebx] + movdqa xmm4, [edi+ebx] + psadbw xmm1, xmm2 + psadbw xmm3, xmm4 + paddd xmm6, xmm1 + paddd xmm6, xmm3 + lea esi, [esi+ebx*2] + lea edi, [edi+ebx*2] +%endmacro + +%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0 + movdqa xmm1, [esi] + movdqa xmm2, [edi] + movdqa xmm3, xmm1 + psadbw xmm3, xmm2 + paddd xmm6, xmm3 + + movdqa xmm3, xmm1 + psadbw xmm3, xmm0 + paddd xmm5, xmm3 + + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm4, xmm1 + paddd xmm4, xmm2 + + add esi, ebx + add edi, ebx +%endmacro + +%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0 + movdqa xmm1, [esi] + movdqa xmm2, [edi] + movdqa xmm3, xmm1 + psadbw xmm3, xmm2 + paddd xmm7, xmm3 ; sad + + movdqa xmm3, xmm1 + pmaxub xmm3, xmm2 + pminub xmm2, xmm1 + psubb xmm3, xmm2 ; diff + + movdqa xmm2, xmm1 + psadbw xmm2, xmm0 + paddd xmm6, xmm2 ; sum + + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm5, xmm1 + paddd xmm5, xmm2 ; sqsum + + movdqa xmm1, xmm3 + punpcklbw xmm1, xmm0 + punpckhbw xmm3, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm3, xmm3 + paddd xmm4, xmm1 + paddd xmm4, xmm3 ; sqdiff + + add esi, ebx + add edi, ebx +%endmacro + +%macro WELS_SAD_SD_MAD_16x1_SSE2 4 +%define sad_reg %1 +%define sum_cur_reg %2 +%define sum_ref_reg %3 +%define mad_reg %4 + movdqa xmm1, [esi] + movdqa xmm2, [edi] + movdqa xmm3, xmm1 + psadbw xmm3, xmm0 + paddd sum_cur_reg, xmm3 ; sum_cur + movdqa xmm3, xmm2 + psadbw xmm3, xmm0 + paddd sum_ref_reg, xmm3 ; sum_ref + + movdqa xmm3, xmm1 + pmaxub xmm3, xmm2 + pminub xmm2, xmm1 + psubb xmm3, xmm2 ; abs diff + pmaxub mad_reg, xmm3 ; max abs diff + + psadbw xmm3, xmm0 + paddd sad_reg, xmm3 ; sad + + add esi, ebx + add edi, ebx +%endmacro + + +%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used +%define max_reg %1 + movdqa xmm1, max_reg + psrldq xmm1, 4 + pmaxub max_reg, xmm1 + movdqa xmm1, max_reg + psrldq xmm1, 2 + pmaxub max_reg, xmm1 + movdqa xmm1, max_reg + psrldq xmm1, 1 + pmaxub max_reg, xmm1 +%endmacro + +%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4 +%define sad_reg %1 +%define sum_reg %2 +%define mad_reg %3 +%define sqdiff_reg %4 + movdqa xmm1, [esi] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 + punpcklbw xmm2, xmm0 + punpckhbw xmm3, xmm0 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + paddd xmm2, xmm3 + movdqa xmm3, xmm2 + psllq xmm2, 32 + psrlq xmm3, 32 + psllq xmm3, 32 + paddd xmm2, xmm3 + paddd sad_reg, xmm2 ; sqsum + + movdqa xmm2, [edi] + movdqa xmm3, xmm1 + psadbw xmm3, xmm0 + paddd sum_reg, xmm3 ; sum_cur + movdqa xmm3, xmm2 + psadbw xmm3, xmm0 + pslldq xmm3, 4 + paddd sum_reg, xmm3 ; sum_ref + + movdqa xmm3, xmm1 + pmaxub xmm3, xmm2 + pminub xmm2, xmm1 + psubb xmm3, xmm2 ; abs diff + pmaxub mad_reg, xmm3 ; max abs diff + + movdqa xmm1, xmm3 + psadbw xmm3, xmm0 + paddd sad_reg, xmm3 ; sad + + movdqa xmm3, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm3, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm3, xmm3 + paddd sqdiff_reg, xmm1 + paddd sqdiff_reg, xmm3 ; sqdiff + + add esi, ebx + add edi, ebx +%endmacro + + +;*********************************************************************** +; Local Data (Read Only) +;*********************************************************************** + +;SECTION .rodata align=16 + +;ALIGN 16 +;pack1_8x2: +; dw 1, 1, 1, 1, 1, 1, 1, 1 + +;*********************************************************************** +; Code 
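As a reading aid for the statistic macros above: each WELS_SAD_*_16x1 macro consumes one 16-byte row of the current and reference pictures and advances both row pointers by the stride. A minimal scalar sketch of what a single WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 step accumulates (the function and parameter names below are illustrative, not part of the source):

    #include <stdint.h>

    /* Scalar model of one WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 step: one
     * 16-byte row is read from both pictures, four accumulators are
     * updated, and both row pointers advance by the stride.
     * All names here are illustrative, not part of the source. */
    static void sad_sum_sqsum_sqdiff_16x1(const uint8_t **cur, const uint8_t **ref,
                                          int32_t stride, int32_t *sad, int32_t *sum,
                                          int32_t *sqsum, int32_t *sqdiff)
    {
        for (int x = 0; x < 16; x++) {
            int32_t c = (*cur)[x], r = (*ref)[x];
            int32_t d = c > r ? c - r : r - c; /* pmaxub/pminub/psubb      */
            *sad    += d;                      /* psadbw cur, ref          */
            *sum    += c;                      /* psadbw cur, zero         */
            *sqsum  += c * c;                  /* punpck + pmaddwd         */
            *sqdiff += d * d;                  /* punpck + pmaddwd on diff */
        }
        *cur += stride;                        /* add esi, ebx */
        *ref += stride;                        /* add edi, ebx */
    }

The driver routines below invoke such a macro eight times per group of rows, so a 16x16 macroblock costs two groups of eight: SADs are emitted per 8x8 block, while sum, square sum and squared difference are emitted per 16x16 block.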
+;*********************************************************************** + +SECTION .text + +WELS_EXTERN rc_sad_frame_sse2 +;*********************************************************************** +; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride ); +;*********************************************************************** +ALIGN 16 +rc_sad_frame_sse2: + push esi + push edi + push ebp + push ebx + push edx + + mov esi, [esp+24] + mov edi, [esp+28] + mov ebx, [esp+32] + mov ecx, [esp+36] + mov edx, [esp+40] + pxor xmm0, xmm0 +.hloop: + mov eax, ebx + mov ebp, $0 +.wloop: + movdqa xmm1, [esi+ebp] + movdqa xmm2, [edi+ebp] + psadbw xmm1, xmm2 + pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float + paddd xmm1, xmm2 + paddd xmm0, xmm1 + add ebp, 010h + dec eax + jnz near .wloop + lea esi, [esi+edx] + lea edi, [edi+edx] + dec ecx + jnz near .hloop + + movd eax, xmm0 + pop edx + pop ebx + pop ebp + pop edi + pop esi + ret + + +WELS_EXTERN SampleVariance16x16_sse2 +;*********************************************************************** +; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture ); +;*********************************************************************** +ALIGN 16 +SampleVariance16x16_sse2: + push esi + push edi + push ebx + + sub esp, 16 + %define SUM [esp] + %define SUM_CUR [esp+4] + %define SQR [esp+8] + %define SQR_CUR [esp+12] + %define PUSH_SIZE 28 ; 12 + 16 + + mov edi, [esp+PUSH_SIZE+4] ; y_ref + mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride + mov esi, [esp+PUSH_SIZE+12] ; y_src + mov eax, [esp+PUSH_SIZE+16] ; y_src_stride + mov ecx, 010h ; height = 16 + + pxor xmm7, xmm7 + movdqu SUM, xmm7 + +.hloops: + movdqa xmm0, [edi] ; y_ref + movdqa xmm1, [esi] ; y_src + movdqa xmm2, xmm0 ; store first for future process + movdqa xmm3, xmm1 + ; sum += diff; + movdqa xmm4, xmm0 + psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] + ; to be continued for sum + pshufd xmm5, xmm4, 0C6h ; 11000110 B + paddw xmm4, xmm5 + movd ebx, xmm4 + add SUM, ebx + + ; sqr += diff * diff; + pmaxub xmm0, xmm1 + pminub xmm1, xmm2 + psubb xmm0, xmm1 ; diff + SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero + movd ebx, xmm1 + add SQR, ebx + + ; sum_cur += y_src[x]; + movdqa xmm0, xmm3 ; cur_orig + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 + punpckhbw xmm1, xmm7 + paddw xmm0, xmm1 ; 8x2 + SUM_WORD_8x2_SSE2 xmm0, xmm1 + movd ebx, xmm0 + and ebx, 0ffffh + add SUM_CUR, ebx + + ; sqr_cur += y_src[x] * y_src[x]; + SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero + movd ebx, xmm0 + add SQR_CUR, ebx + + lea edi, [edi+edx] + lea esi, [esi+eax] + dec ecx + jnz near .hloops + + mov ebx, 0 + mov bx, word SUM + sar ebx, 8 + imul ebx, ebx + mov ecx, SQR + sar ecx, 8 + sub ecx, ebx + mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture + mov [edi], cx ; to store uiMotionIndex + mov ebx, 0 + mov bx, word SUM_CUR + sar ebx, 8 + imul ebx, ebx + mov ecx, SQR_CUR + sar ecx, 8 + sub ecx, ebx + mov [edi+2], cx ; to store uiTextureIndex + + %undef SUM + %undef SUM_CUR + %undef SQR + %undef SQR_CUR + %undef PUSH_SIZE + + add esp, 16 + pop ebx + pop edi + pop esi + + ret + +; , 6/7/2010 + +%ifndef NO_DYNAMIC_VP +WELS_EXTERN AnalysisVaaInfoIntra_sse2 +;*********************************************************************** +; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t linesize ); +;*********************************************************************** 
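Before the intra-analysis routines that follow, note what SampleVariance16x16_sse2 above actually stores: both halfwords are population variances over the 256 luma samples, one of the frame difference |ref - src| and one of the current block itself. A minimal scalar sketch, assuming SMotionTextureUnit holds the two 16-bit fields named in the comments above:

    #include <stdint.h>

    typedef struct {
        uint16_t uiMotionIndex;  /* variance of |ref - src| over the 16x16 block */
        uint16_t uiTextureIndex; /* variance of src itself over the 16x16 block  */
    } SMotionTextureUnit;        /* two 16-bit fields, per the stores above      */

    static void SampleVariance16x16_c(const uint8_t *y_ref, int32_t y_ref_stride,
                                      const uint8_t *y_src, int32_t y_src_stride,
                                      SMotionTextureUnit *pMotionTexture)
    {
        uint32_t sum = 0, sqr = 0, sum_cur = 0, sqr_cur = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++) {
                int32_t r = y_ref[x], c = y_src[x];
                int32_t d = r > c ? r - c : c - r;
                sum     += (uint32_t)d;  sqr     += (uint32_t)(d * d);
                sum_cur += (uint32_t)c;  sqr_cur += (uint32_t)(c * c);
            }
            y_ref += y_ref_stride;
            y_src += y_src_stride;
        }
        /* E[x^2] - E[x]^2 with /256 done by shifts; the asm truncates the
         * sums to 16 bits first, which cannot overflow for 256 pixels.   */
        pMotionTexture->uiMotionIndex  =
            (uint16_t)((sqr >> 8) - (sum >> 8) * (sum >> 8));
        pMotionTexture->uiTextureIndex =
            (uint16_t)((sqr_cur >> 8) - (sum_cur >> 8) * (sum_cur >> 8));
    }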
+ALIGN 16 +AnalysisVaaInfoIntra_sse2: + push ebx + push edx + push esi + push edi + push ebp + + mov ebp, esp + and ebp, 0fh + sub esp, ebp + sub esp, 32 + %define PUSH_SIZE 52 ; 20 + 32 + + mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y + mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize + + mov ebx, ecx + sal ebx, $1 ; linesize x 2 [ebx] + mov edx, ebx + add edx, ecx ; linesize x 3 [edx] + mov eax, ebx + sal eax, $1 ; linesize x 4 [eax] + + pxor xmm7, xmm7 + + ; loops + VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [esp], xmm0 + + lea esi, [esi+eax] + VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [esp+8], xmm0 + + lea esi, [esi+eax] + VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [esp+16], xmm0 + + lea esi, [esi+eax] + VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [esp+24], xmm0 + + movdqa xmm0, [esp] ; block 0~7 + movdqa xmm1, [esp+16] ; block 8~15 + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + SUM_WORD_8x2_SSE2 xmm0, xmm3 + + pmullw xmm1, xmm1 + pmullw xmm2, xmm2 + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 + punpcklwd xmm1, xmm7 + punpckhwd xmm3, xmm7 + punpcklwd xmm2, xmm7 + punpckhwd xmm4, xmm7 + paddd xmm1, xmm2 + paddd xmm3, xmm4 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 01Bh + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0B1h + paddd xmm1, xmm2 + + movd ebx, xmm0 + and ebx, 0ffffh ; effective low word truncated + mov ecx, ebx + imul ebx, ecx + sar ebx, $4 + movd eax, xmm1 + sub eax, ebx + + %undef PUSH_SIZE + add esp, 32 + add esp, ebp + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret + +WELS_EXTERN AnalysisVaaInfoIntra_ssse3 +;*********************************************************************** +; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t linesize ); +;*********************************************************************** +ALIGN 16 +AnalysisVaaInfoIntra_ssse3: + push ebx + push edx + push esi + push edi + push ebp + + mov ebp, esp + and ebp, 0fh + sub esp, ebp + sub esp, 32 + %define PUSH_SIZE 52 ; 20 + 32 + + mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y + mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize + + mov ebx, ecx + sal ebx, $1 ; linesize x 2 [ebx] + mov edx, ebx + add edx, ecx ; linesize x 3 [edx] + mov eax, ebx + sal eax, $1 ; linesize x 4 [eax] + + pxor xmm7, xmm7 + + ; loops + VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [esp], xmm0 + + lea esi, [esi+eax] + VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 + movq [esp+8], xmm1 + + lea esi, [esi+eax] + VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [esp+16], xmm0 + + lea esi, [esi+eax] + VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 + movq [esp+24], xmm1 + + movdqa xmm0, [esp] ; block 0~7 + movdqa xmm1, [esp+16] ; block 8~15 + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets + + pmullw xmm1, xmm1 + pmullw xmm2, xmm2 + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 + punpcklwd xmm1, xmm7 + punpckhwd xmm3, xmm7 + punpcklwd xmm2, xmm7 + punpckhwd xmm4, xmm7 + paddd xmm1, xmm2 + paddd xmm3, xmm4 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 01Bh + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0B1h + paddd xmm1, xmm2 + + movd ebx, xmm0 + and ebx, 0ffffh ; effective low work truncated + mov ecx, ebx + imul ebx, ecx + sar ebx, $4 + movd eax, xmm1 + sub eax, ebx + + %undef PUSH_SIZE + add esp, 32 + add esp, ebp + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret +%endif + + + +WELS_EXTERN abs_difference_mbrow_sse2 
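AnalysisVaaInfoIntra_sse2/_ssse3 above reduce a 16x16 luma block to the sixteen means of its 4x4 sub-blocks and return their spread (sixteen times the variance of the means). A scalar sketch with illustrative names; the asm truncates the sum to 16 bits before squaring, which is harmless for valid pixel data:

    #include <stdint.h>

    /* Scalar model of AnalysisVaaInfoIntra_sse2/_ssse3: the spread of the
     * sixteen 4x4 sub-block means of one 16x16 luma block (illustrative). */
    static int32_t analysis_vaa_info_intra_c(const uint8_t *pDataY, int32_t linesize)
    {
        int32_t avg[16];
        for (int b = 0; b < 16; b++) {            /* 4x4 grid of 4x4 blocks */
            const uint8_t *p = pDataY + (b / 4) * 4 * linesize + (b % 4) * 4;
            int32_t s = 0;
            for (int y = 0; y < 4; y++)
                for (int x = 0; x < 4; x++)
                    s += p[y * linesize + x];
            avg[b] = s >> 4;                      /* psraw %1, $4 */
        }
        int32_t sum = 0, sqsum = 0;
        for (int i = 0; i < 16; i++) {
            sum   += avg[i];
            sqsum += avg[i] * avg[i];
        }
        return sqsum - ((sum * sum) >> 4);        /* 16 * variance of the means */
    }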
+;************************************************************************************************************* +;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride, +; int32_t gom_pixel_num, int32_t *pSum) +;************************************************************************************************************* +ALIGN 16 +abs_difference_mbrow_sse2: +%define ref_orig esp + pushsize + 4 +%define cur_orig esp + pushsize + 8 +%define iPicStride esp + pushsize + 12 +%define gom_pixel_num esp + pushsize + 16 +%define pSum esp + pushsize + 20 +%define pushsize 12 + push esi + push edi + push ebx + mov esi, [ref_orig] + mov edi, [cur_orig] + mov ebx, [iPicStride] + mov eax, [gom_pixel_num] + mov ecx, 16 ;MB_WIDTH_LUMA + pxor xmm0, xmm0 +mb_width_loop_p: + mov edx, esi + add edx, eax ; end address +gom_row_loop_p: + movdqa xmm1, [esi] + movdqa xmm2, [edi] + psadbw xmm1, xmm2 + paddd xmm0, xmm1 + add esi, 16 + add edi, 16 + cmp esi, edx + jl gom_row_loop_p + + sub esi, eax + sub edi, eax + add esi, ebx + add edi, ebx + loop mb_width_loop_p + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddd xmm1, xmm0 + movd eax, xmm1 + mov edx, [pSum] ; pSum + add [edx], eax + +%undef ref_orig +%undef cur_orig +%undef iPicStride +%undef gom_pixel_num +%undef pSum +%undef pushsize + pop ebx + pop edi + pop esi + ret + + + + +WELS_EXTERN sum_sqrsum_mbrow_sse2 +;************************************************************************************************************* +;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride, +; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum) +;************************************************************************************************************* +ALIGN 16 +sum_sqrsum_mbrow_sse2: +%define cur_orig esp + pushsize + 4 +%define iPicStride esp + pushsize + 8 +%define gom_pixel_num esp + pushsize + 12 +%define pSum esp + pushsize + 16 +%define pSqrSum esp + pushsize + 20 +%define pushsize 8 + push esi + push ebx + mov esi, [cur_orig] + mov eax, [gom_pixel_num] + mov ebx, [iPicStride] + mov ecx, 16 ;MB_WIDTH_LUMA + pxor xmm0, xmm0 ; zero + pxor xmm1, xmm1 ; sum + pxor xmm2, xmm2 ; sqr sum +mb_width_loop_i: + mov edx, esi + add edx, eax ; end address +gom_row_loop_i: + movdqa xmm3, [esi] + movdqa xmm4, xmm3 + psadbw xmm4, xmm0 + paddd xmm1, xmm4 + movdqa xmm4, xmm3 + punpcklbw xmm4, xmm0 + punpckhbw xmm3, xmm0 + pmaddwd xmm4, xmm4 + pmaddwd xmm3, xmm3 + paddd xmm2, xmm3 + paddd xmm2, xmm4 + add esi, 16 + cmp esi, edx + jl gom_row_loop_i + + sub esi, eax + add esi, ebx + loop mb_width_loop_i + + movdqa xmm3, xmm1 + psrldq xmm3, 8 + paddd xmm1, xmm3 + movd eax, xmm1 + mov edx, [pSum] + add [edx], eax + + movdqa xmm3, xmm2 + psrldq xmm3, 8 + paddd xmm2, xmm3 + movdqa xmm3, xmm2 + psrldq xmm3, 4 + paddd xmm2, xmm3 + movd eax, xmm2 + mov edx, [pSqrSum] + add [edx], eax + + +%undef cur_orig +%undef iPicStride +%undef gom_pixel_num +%undef pSum +%undef pSqrSum +%undef pushsize + pop ebx + pop esi + ret + + + +WELS_EXTERN VAACalcSad_sse2 +;************************************************************************************************************* +;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight +; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) +;************************************************************************************************************* + + +ALIGN 16 +VAACalcSad_sse2: +%define cur_data esp + pushsize + 4 +%define ref_data esp + pushsize + 8 +%define 
iPicWidth esp + pushsize + 12 +%define iPicHeight esp + pushsize + 16 +%define iPicStride esp + pushsize + 20 +%define psadframe esp + pushsize + 24 +%define psad8x8 esp + pushsize + 28 +%define pushsize 12 + push esi + push edi + push ebx + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov edx, [psad8x8] + mov eax, ebx + + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + pxor xmm0, xmm0 + pxor xmm7, xmm7 ; iFrameSad +height_loop: + mov ecx, dword [iPicWidth] + push esi + push edi +width_loop: + pxor xmm6, xmm6 ; + WELS_SAD_16x2_SSE2 + WELS_SAD_16x2_SSE2 + WELS_SAD_16x2_SSE2 + WELS_SAD_16x2_SSE2 + paddd xmm7, xmm6 + movd [edx], xmm6 + psrldq xmm6, 8 + movd [edx+4], xmm6 + + pxor xmm6, xmm6 + WELS_SAD_16x2_SSE2 + WELS_SAD_16x2_SSE2 + WELS_SAD_16x2_SSE2 + WELS_SAD_16x2_SSE2 + paddd xmm7, xmm6 + movd [edx+8], xmm6 + psrldq xmm6, 8 + movd [edx+12], xmm6 + + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 + + dec ecx + jnz width_loop + + pop edi + pop esi + add esi, eax + add edi, eax + + dec dword [iPicHeight] + jnz height_loop + + mov edx, [psadframe] + movdqa xmm5, xmm7 + psrldq xmm7, 8 + paddd xmm7, xmm5 + movd [edx], xmm7 + +%undef cur_data +%undef ref_data +%undef iPicWidth +%undef iPicHeight +%undef iPicStride +%undef psadframe +%undef psad8x8 +%undef pushsize + pop ebx + pop edi + pop esi + ret + + +WELS_EXTERN VAACalcSadVar_sse2 +;************************************************************************************************************* +;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight +; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16) +;************************************************************************************************************* + + +ALIGN 16 +VAACalcSadVar_sse2: +%define localsize 8 +%define cur_data esp + pushsize + localsize + 4 +%define ref_data esp + pushsize + localsize + 8 +%define iPicWidth esp + pushsize + localsize + 12 +%define iPicHeight esp + pushsize + localsize + 16 +%define iPicStride esp + pushsize + localsize + 20 +%define psadframe esp + pushsize + localsize + 24 +%define psad8x8 esp + pushsize + localsize + 28 +%define psum16x16 esp + pushsize + localsize + 32 +%define psqsum16x16 esp + pushsize + localsize + 36 +%define tmp_esi esp + 0 +%define tmp_edi esp + 4 +%define pushsize 16 + push ebp + push esi + push edi + push ebx + sub esp, localsize + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov edx, [psad8x8] + mov eax, ebx + + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + pxor xmm0, xmm0 + pxor xmm7, xmm7 ; iFrameSad +var_height_loop: + mov ecx, dword [iPicWidth] + mov [tmp_esi], esi + mov [tmp_edi], edi +var_width_loop: + pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 + pxor xmm5, xmm5 ; pSum16x16 + pxor xmm4, xmm4 ; sqsum_16x16 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + paddd xmm7, xmm6 + movd [edx], xmm6 + psrldq xmm6, 8 + movd [edx+4], xmm6 + + pxor xmm6, xmm6 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + 
WELS_SAD_SUM_SQSUM_16x1_SSE2 + WELS_SAD_SUM_SQSUM_16x1_SSE2 + paddd xmm7, xmm6 + movd [edx+8], xmm6 + psrldq xmm6, 8 + movd [edx+12], xmm6 + + mov ebp, [psum16x16] + movdqa xmm1, xmm5 + psrldq xmm1, 8 + paddd xmm5, xmm1 + movd [ebp], xmm5 + add dword [psum16x16], 4 + + movdqa xmm5, xmm4 + psrldq xmm5, 8 + paddd xmm4, xmm5 + movdqa xmm3, xmm4 + psrldq xmm3, 4 + paddd xmm4, xmm3 + + mov ebp, [psqsum16x16] + movd [ebp], xmm4 + add dword [psqsum16x16], 4 + + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 + + dec ecx + jnz var_width_loop + + mov esi, [tmp_esi] + mov edi, [tmp_edi] + add esi, eax + add edi, eax + + dec dword [iPicHeight] + jnz var_height_loop + + mov edx, [psadframe] + movdqa xmm5, xmm7 + psrldq xmm7, 8 + paddd xmm7, xmm5 + movd [edx], xmm7 + + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp +%undef cur_data +%undef ref_data +%undef iPicWidth +%undef iPicHeight +%undef iPicStride +%undef psadframe +%undef psad8x8 +%undef psum16x16 +%undef psqsum16x16 +%undef tmp_esi +%undef tmp_edi +%undef pushsize +%undef localsize + ret + + + +WELS_EXTERN VAACalcSadSsd_sse2 +;************************************************************************************************************* +;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, +; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16) +;************************************************************************************************************* + + +ALIGN 16 +VAACalcSadSsd_sse2: +%define localsize 12 +%define cur_data esp + pushsize + localsize + 4 +%define ref_data esp + pushsize + localsize + 8 +%define iPicWidth esp + pushsize + localsize + 12 +%define iPicHeight esp + pushsize + localsize + 16 +%define iPicStride esp + pushsize + localsize + 20 +%define psadframe esp + pushsize + localsize + 24 +%define psad8x8 esp + pushsize + localsize + 28 +%define psum16x16 esp + pushsize + localsize + 32 +%define psqsum16x16 esp + pushsize + localsize + 36 +%define psqdiff16x16 esp + pushsize + localsize + 40 +%define tmp_esi esp + 0 +%define tmp_edi esp + 4 +%define tmp_sadframe esp + 8 +%define pushsize 16 + push ebp + push esi + push edi + push ebx + sub esp, localsize + mov ecx, [iPicWidth] + mov ecx, [iPicHeight] + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov edx, [psad8x8] + mov eax, ebx + + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + mov ecx, [iPicWidth] + mov ecx, [iPicHeight] + pxor xmm0, xmm0 + movd [tmp_sadframe], xmm0 +sqdiff_height_loop: + mov ecx, dword [iPicWidth] + mov [tmp_esi], esi + mov [tmp_edi], edi +sqdiff_width_loop: + pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 + pxor xmm6, xmm6 ; pSum16x16 + pxor xmm5, xmm5 ; sqsum_16x16 four dword + pxor xmm4, xmm4 ; sqdiff_16x16 four Dword + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + movdqa xmm1, xmm7 + movd [edx], xmm7 + psrldq xmm7, 8 + paddd xmm1, xmm7 + movd [edx+4], xmm7 + movd ebp, xmm1 + add [tmp_sadframe], ebp + + pxor xmm7, xmm7 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 
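+ ; xmm7 was cleared above, so this second run of eight 16x1 steps gathers
+ ; the SADs of the lower two 8x8 blocks (rows 8..15), while xmm6/xmm5/xmm4
+ ; keep accumulating the 16x16 sum, square sum and squared difference.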
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 + movdqa xmm1, xmm7 + movd [edx+8], xmm7 + psrldq xmm7, 8 + paddd xmm1, xmm7 + movd [edx+12], xmm7 + movd ebp, xmm1 + add [tmp_sadframe], ebp + + mov ebp, [psum16x16] + movdqa xmm1, xmm6 + psrldq xmm1, 8 + paddd xmm6, xmm1 + movd [ebp], xmm6 + add dword [psum16x16], 4 + + mov ebp, [psqsum16x16] + pshufd xmm6, xmm5, 14 ;00001110 + paddd xmm6, xmm5 + pshufd xmm5, xmm6, 1 ;00000001 + paddd xmm5, xmm6 + movd [ebp], xmm5 + add dword [psqsum16x16], 4 + + mov ebp, [psqdiff16x16] + pshufd xmm5, xmm4, 14 ; 00001110 + paddd xmm5, xmm4 + pshufd xmm4, xmm5, 1 ; 00000001 + paddd xmm4, xmm5 + movd [ebp], xmm4 + add dword [psqdiff16x16], 4 + + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 + + dec ecx + jnz sqdiff_width_loop + + mov esi, [tmp_esi] + mov edi, [tmp_edi] + add esi, eax + add edi, eax + + dec dword [iPicHeight] + jnz sqdiff_height_loop + + mov ebx, [tmp_sadframe] + mov eax, [psadframe] + mov [eax], ebx + + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp +%undef cur_data +%undef ref_data +%undef iPicWidth +%undef iPicHeight +%undef iPicStride +%undef psadframe +%undef psad8x8 +%undef psum16x16 +%undef psqsum16x16 +%undef psqdiff16x16 +%undef tmp_esi +%undef tmp_edi +%undef tmp_sadframe +%undef pushsize +%undef localsize + ret + + + + + +WELS_EXTERN VAACalcSadBgd_sse2 +;************************************************************************************************************* +;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, +; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8) +;************************************************************************************************************* + + +ALIGN 16 +VAACalcSadBgd_sse2: +%define localsize 12 +%define cur_data esp + pushsize + localsize + 4 +%define ref_data esp + pushsize + localsize + 8 +%define iPicWidth esp + pushsize + localsize + 12 +%define iPicHeight esp + pushsize + localsize + 16 +%define iPicStride esp + pushsize + localsize + 20 +%define psadframe esp + pushsize + localsize + 24 +%define psad8x8 esp + pushsize + localsize + 28 +%define p_sd8x8 esp + pushsize + localsize + 32 +%define p_mad8x8 esp + pushsize + localsize + 36 +%define tmp_esi esp + 0 +%define tmp_edi esp + 4 +%define tmp_ecx esp + 8 +%define pushsize 16 + push ebp + push esi + push edi + push ebx + sub esp, localsize + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov eax, ebx + + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + xor ebp, ebp + pxor xmm0, xmm0 +bgd_height_loop: + mov ecx, dword [iPicWidth] + mov [tmp_esi], esi + mov [tmp_edi], edi +bgd_width_loop: + pxor xmm7, xmm7 ; pSad8x8 + pxor xmm6, xmm6 ; sum_cur_8x8 + pxor xmm5, xmm5 ; sum_ref_8x8 + pxor xmm4, xmm4 ; pMad8x8 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + + + mov edx, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm4 + + ;movdqa xmm1, xmm4 + ;punpcklbw xmm1, xmm0 + ;punpcklwd xmm1, xmm0 
+ ;movd [edx], xmm1 + ;punpckhbw xmm4, xmm0 + ;punpcklwd xmm4, xmm0 + ;movd [edx+4], xmm4 + ;add edx, 8 + ;mov [p_mad8x8], edx + mov [tmp_ecx], ecx + movhlps xmm1, xmm4 + movd ecx, xmm4 + mov [edx], cl + movd ecx, xmm1 + mov [edx+1],cl + add edx, 2 + mov [p_mad8x8], edx + + + pslldq xmm7, 4 + pslldq xmm6, 4 + pslldq xmm5, 4 + + + pxor xmm4, xmm4 ; pMad8x8 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + + mov edx, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm4 + + ;movdqa xmm1, xmm4 + ;punpcklbw xmm1, xmm0 + ;punpcklwd xmm1, xmm0 + ;movd [edx], xmm1 + ;punpckhbw xmm4, xmm0 + ;punpcklwd xmm4, xmm0 + ;movd [edx+4], xmm4 + ;add edx, 8 + ;mov [p_mad8x8], edx + movhlps xmm1, xmm4 + movd ecx, xmm4 + mov [edx], cl + movd ecx, xmm1 + mov [edx+1],cl + add edx, 2 + mov [p_mad8x8], edx + + ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 + + mov edx, [psad8x8] + pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 + movdqa [edx], xmm1 + add edx, 16 + mov [psad8x8], edx ; sad8x8 + + paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 + pshufd xmm2, xmm1, 00000011b + paddd xmm1, xmm2 + movd edx, xmm1 + add ebp, edx ; sad frame + + mov edx, [p_sd8x8] + psubd xmm6, xmm5 + pshufd xmm1, xmm6, 10001101b + movdqa [edx], xmm1 + add edx, 16 + mov [p_sd8x8], edx + + + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 + + mov ecx, [tmp_ecx] + dec ecx + jnz bgd_width_loop + + mov esi, [tmp_esi] + mov edi, [tmp_edi] + add esi, eax + add edi, eax + + dec dword [iPicHeight] + jnz bgd_height_loop + + mov edx, [psadframe] + mov [edx], ebp + + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp +%undef cur_data +%undef ref_data +%undef iPicWidth +%undef iPicHeight +%undef iPicStride +%undef psadframe +%undef psad8x8 +%undef p_sd8x8 +%undef p_mad8x8 +%undef tmp_esi +%undef tmp_edi +%undef pushsize +%undef localsize + ret + + + +WELS_EXTERN VAACalcSadSsdBgd_sse2 +;************************************************************************************************************* +;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, +; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, +; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8) +;************************************************************************************************************* + + +ALIGN 16 +VAACalcSadSsdBgd_sse2: +%define localsize 16 +%define cur_data esp + pushsize + localsize + 4 +%define ref_data esp + pushsize + localsize + 8 +%define iPicWidth esp + pushsize + localsize + 12 +%define iPicHeight esp + pushsize + localsize + 16 +%define iPicStride esp + pushsize + localsize + 20 +%define psadframe esp + pushsize + localsize + 24 +%define psad8x8 esp + pushsize + localsize + 28 +%define psum16x16 esp + pushsize + localsize + 32 +%define psqsum16x16 esp + pushsize + localsize + 36 +%define psqdiff16x16 esp + pushsize + localsize + 40 +%define p_sd8x8 esp + pushsize + localsize + 44 +%define p_mad8x8 esp + pushsize + localsize + 48 +%define tmp_esi esp + 0 +%define tmp_edi esp + 4 +%define tmp_sadframe esp + 8 +%define tmp_ecx esp + 12 +%define pushsize 16 + push ebp + push esi + push edi + push ebx + sub 
esp, localsize + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov eax, ebx + + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + pxor xmm0, xmm0 + movd [tmp_sadframe], xmm0 +sqdiff_bgd_height_loop: + mov ecx, dword [iPicWidth] + mov [tmp_esi], esi + mov [tmp_edi], edi +sqdiff_bgd_width_loop: + pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 + pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 + pxor xmm5, xmm5 ; pMad8x8 + pxor xmm4, xmm4 ; sqdiff_16x16 four Dword + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + + mov edx, [psad8x8] + movdqa xmm2, xmm7 + pshufd xmm1, xmm2, 00001110b + movd [edx], xmm2 + movd [edx+4], xmm1 + add edx, 8 + mov [psad8x8], edx ; sad8x8 + + paddd xmm1, xmm2 + movd edx, xmm1 + add [tmp_sadframe], edx ; iFrameSad + + mov edx, [psum16x16] + movdqa xmm1, xmm6 + pshufd xmm2, xmm1, 00001110b + paddd xmm1, xmm2 + movd [edx], xmm1 ; sum + + mov edx, [p_sd8x8] + pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 + psubd xmm6, xmm1 ; 00 diff1 00 diff0 + pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 + movq [edx], xmm1 + add edx, 8 + mov [p_sd8x8], edx + + mov edx, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm5 + ;movdqa xmm1, xmm5 + ;punpcklbw xmm1, xmm0 + ;punpcklwd xmm1, xmm0 + ;movd [edx], xmm1 + ;punpckhbw xmm5, xmm0 + ;punpcklwd xmm5, xmm0 + ;movd [edx+4], xmm5 + ;add edx, 8 + ;mov [p_mad8x8], edx + mov [tmp_ecx], ecx + movhlps xmm1, xmm5 + movd ecx, xmm5 + mov [edx], cl + movd ecx, xmm1 + mov [edx+1],cl + add edx, 2 + mov [p_mad8x8], edx + + psrlq xmm7, 32 + psllq xmm7, 32 ; clear sad + pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 + pxor xmm5, xmm5 ; pMad8x8 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 + + mov edx, [psad8x8] + movdqa xmm2, xmm7 + pshufd xmm1, xmm2, 00001110b + movd [edx], xmm2 + movd [edx+4], xmm1 + add edx, 8 + mov [psad8x8], edx ; sad8x8 + + paddd xmm1, xmm2 + movd edx, xmm1 + add [tmp_sadframe], edx ; iFrameSad + + mov edx, [psum16x16] + movdqa xmm1, xmm6 + pshufd xmm2, xmm1, 00001110b + paddd xmm1, xmm2 + movd ebp, xmm1 ; sum + add [edx], ebp + add edx, 4 + mov [psum16x16], edx + + mov edx, [psqsum16x16] + psrlq xmm7, 32 + pshufd xmm2, xmm7, 00001110b + paddd xmm2, xmm7 + movd [edx], xmm2 ; sqsum + add edx, 4 + mov [psqsum16x16], edx + + mov edx, [p_sd8x8] + pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 + psubd xmm6, xmm1 ; 00 diff1 00 diff0 + pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 + movq [edx], xmm1 + add edx, 8 + mov [p_sd8x8], edx + + mov edx, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm5 + ;movdqa xmm1, xmm5 + ;punpcklbw xmm1, xmm0 + ;punpcklwd xmm1, xmm0 + ;movd [edx], xmm1 + 
;punpckhbw xmm5, xmm0 + ;punpcklwd xmm5, xmm0 + ;movd [edx+4], xmm5 + ;add edx, 8 + ;mov [p_mad8x8], edx + movhlps xmm1, xmm5 + movd ecx, xmm5 + mov [edx], cl + movd ecx, xmm1 + mov [edx+1],cl + add edx, 2 + mov [p_mad8x8], edx + + mov edx, [psqdiff16x16] + pshufd xmm1, xmm4, 00001110b + paddd xmm4, xmm1 + pshufd xmm1, xmm4, 00000001b + paddd xmm4, xmm1 + movd [edx], xmm4 + add edx, 4 + mov [psqdiff16x16], edx + + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 + + mov ecx, [tmp_ecx] + dec ecx + jnz sqdiff_bgd_width_loop + + mov esi, [tmp_esi] + mov edi, [tmp_edi] + add esi, eax + add edi, eax + + dec dword [iPicHeight] + jnz sqdiff_bgd_height_loop + + mov edx, [psadframe] + mov ebp, [tmp_sadframe] + mov [edx], ebp + + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp +%undef cur_data +%undef ref_data +%undef iPicWidth +%undef iPicHeight +%undef iPicStride +%undef psadframe +%undef psad8x8 +%undef psum16x16 +%undef psqsum16x16 +%undef psqdiff16x16 +%undef p_sd8x8 +%undef p_mad8x8 +%undef tmp_esi +%undef tmp_edi +%undef pushsize +%undef localsize + ret diff --git a/processing/src/common/WelsVP.def b/processing/src/common/WelsVP.def index 650c2d73..8371658b 100644 --- a/processing/src/common/WelsVP.def +++ b/processing/src/common/WelsVP.def @@ -1,36 +1,36 @@ -;*! -;* \copy -;* Copyright (c) 2011-2013, Cisco Systems -;* All rights reserved. -;* -;* Redistribution and use in source and binary forms, with or without -;* modification, are permitted provided that the following conditions -;* are met: -;* -;* * Redistributions of source code must retain the above copyright -;* notice, this list of conditions and the following disclaimer. -;* -;* * Redistributions in binary form must reproduce the above copyright -;* notice, this list of conditions and the following disclaimer in -;* the documentation and/or other materials provided with the -;* distribution. -;* -;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -;* POSSIBILITY OF SUCH DAMAGE. -;* -;* - -LIBRARY welsvp.dll -EXPORTS - CreateVpInterface PRIVATE +;*! +;* \copy +;* Copyright (c) 2011-2013, Cisco Systems +;* All rights reserved. +;* +;* Redistribution and use in source and binary forms, with or without +;* modification, are permitted provided that the following conditions +;* are met: +;* +;* * Redistributions of source code must retain the above copyright +;* notice, this list of conditions and the following disclaimer. +;* +;* * Redistributions in binary form must reproduce the above copyright +;* notice, this list of conditions and the following disclaimer in +;* the documentation and/or other materials provided with the +;* distribution. 
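For completeness on the background-detection variants above (VAACalcSadBgd_sse2 and VAACalcSadSsdBgd_sse2): per 8x8 block they additionally emit the sum difference between current and reference and the maximum absolute pixel difference, stored as a single byte. A scalar sketch of those extra outputs, with illustrative names:

    #include <stdint.h>

    /* Scalar model of the extra per-8x8 outputs of VAACalcSadBgd_sse2 /
     * VAACalcSadSsdBgd_sse2 (illustrative names). */
    static void bgd_stats_8x8(const uint8_t *cur, const uint8_t *ref, int32_t stride,
                              int32_t *sad, int32_t *sd, uint8_t *mad)
    {
        int32_t s = 0, sum_cur = 0, sum_ref = 0;
        uint8_t m = 0;
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++) {
                int32_t c = cur[x], r = ref[x];
                uint8_t d = (uint8_t)(c > r ? c - r : r - c);
                s += d;                /* psadbw on the abs-diff bytes    */
                if (d > m) m = d;      /* pmaxub, then WELS_MAX_REG_SSE2  */
                sum_cur += c;          /* psadbw cur, zero                */
                sum_ref += r;          /* psadbw ref, zero                */
            }
            cur += stride;
            ref += stride;
        }
        *sad = s;                      /* -> psad8x8                       */
        *sd  = sum_cur - sum_ref;      /* -> p_sd8x8 (current - reference) */
        *mad = m;                      /* -> p_mad8x8, one byte per block  */
    }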
+;* +;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;* POSSIBILITY OF SUCH DAMAGE. +;* +;* + +LIBRARY welsvp.dll +EXPORTS + CreateVpInterface PRIVATE DestroyVpInterface PRIVATE \ No newline at end of file diff --git a/testbin/layer2.cfg b/testbin/layer2.cfg index a2d8d8c7..7c13ae15 100644 --- a/testbin/layer2.cfg +++ b/testbin/layer2.cfg @@ -1,39 +1,39 @@ -# Layer Configuration File - - -#============================== INPUT / OUTPUT ============================== -SourceWidth 320 # Input frame width -SourceHeight 192 # Input frame height -FrameRateIn 12 # Input frame rate [Hz] -FrameRateOut 12 # Output frame rate [Hz] -InputFile CiscoVT2people_320x192_12fps.yuv # Input file -ReconFile rec_layer2.yuv # Reconstructed file - -#============================== CODING ============================== -ProfileIdc 66 # value of profile_idc (or 0 for auto detection) - -InitialQP 24 # Quantization parameters for base quality layer -#================================ RATE CONTROL =============================== -SpatialBitrate 600 # Unit: kbps, controled by DisableRC also -#============================== MultiSlice Slice Argument ============================== -# for S/M Slice(s) mode settings -SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below; -SliceSize 1500 -SliceNum 1 # multiple slices number specified - -SlicesAssign0 960 # count number of MBs in slice #0 -SlicesAssign1 0 # count number of MBs in slice #1 -SlicesAssign2 0 # count number of MBs in slice #2 -SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing -SlicesAssign4 0 # count number of MBs in slice #4 -SlicesAssign5 0 # count number of MBs in slice #5 -SlicesAssign6 0 # count number of MBs in slice #6 -SlicesAssign7 0 # count number of MBs in slice #7 - -### DESIGN OF SLICE MODE #### -# 0 SM_SINGLE_SLICE | SliceNum==1 -# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread -# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved. 
-# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Typical of single row of mbs each slice?+ slice size constraint which including re-encoding -# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame) - +# Layer Configuration File + + +#============================== INPUT / OUTPUT ============================== +SourceWidth 320 # Input frame width +SourceHeight 192 # Input frame height +FrameRateIn 12 # Input frame rate [Hz] +FrameRateOut 12 # Output frame rate [Hz] +InputFile CiscoVT2people_320x192_12fps.yuv # Input file +ReconFile rec_layer2.yuv # Reconstructed file + +#============================== CODING ============================== +ProfileIdc 66 # value of profile_idc (or 0 for auto detection) + +InitialQP 24 # Quantization parameters for base quality layer +#================================ RATE CONTROL =============================== +SpatialBitrate 600 # Unit: kbps, controled by DisableRC also +#============================== MultiSlice Slice Argument ============================== +# for S/M Slice(s) mode settings +SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below; +SliceSize 1500 +SliceNum 1 # multiple slices number specified + +SlicesAssign0 960 # count number of MBs in slice #0 +SlicesAssign1 0 # count number of MBs in slice #1 +SlicesAssign2 0 # count number of MBs in slice #2 +SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing +SlicesAssign4 0 # count number of MBs in slice #4 +SlicesAssign5 0 # count number of MBs in slice #5 +SlicesAssign6 0 # count number of MBs in slice #6 +SlicesAssign7 0 # count number of MBs in slice #7 + +### DESIGN OF SLICE MODE #### +# 0 SM_SINGLE_SLICE | SliceNum==1 +# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread +# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved. 
+# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Typical of single row of mbs each slice?+ slice size constraint which including re-encoding +# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame) + diff --git a/testbin/layer2_vd.cfg b/testbin/layer2_vd.cfg index 0112846b..3c59b970 100644 --- a/testbin/layer2_vd.cfg +++ b/testbin/layer2_vd.cfg @@ -1,39 +1,39 @@ -# Layer Configuration File - - -#============================== INPUT / OUTPUT ============================== -SourceWidth 320 # Input frame width -SourceHeight 192 # Input frame height -FrameRateIn 12 # Input frame rate [Hz] -FrameRateOut 12 # Output frame rate [Hz] -InputFile CiscoVT2people_320x192_12fps.yuv # Input file -ReconFile rec_layer2.yuv # Reconstructed file - -#============================== CODING ============================== -ProfileIdc 66 # value of profile_idc (or 0 for auto detection) - -InitialQP 24 # Quantization parameters for base quality layer -#================================ RATE CONTROL =============================== -SpatialBitrate 600 # Unit: kbps, controled by DisableRC also -#============================== MultiSlice Slice Argument ============================== -# for S/M Slice(s) mode settings -SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below; -SliceSize 1500 -SliceNum 1 # multiple slices number specified - -SlicesAssign0 960 # count number of MBs in slice #0 -SlicesAssign1 0 # count number of MBs in slice #1 -SlicesAssign2 0 # count number of MBs in slice #2 -SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing -SlicesAssign4 0 # count number of MBs in slice #4 -SlicesAssign5 0 # count number of MBs in slice #5 -SlicesAssign6 0 # count number of MBs in slice #6 -SlicesAssign7 0 # count number of MBs in slice #7 - -### DESIGN OF SLICE MODE, 100804, Sijia #### -# 0 SM_SINGLE_SLICE | SliceNum==1 -# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread -# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved. -# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. 
Typical of single row of mbs each slice?+ slice size constraint which including re-encoding -# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame) - +# Layer Configuration File + + +#============================== INPUT / OUTPUT ============================== +SourceWidth 320 # Input frame width +SourceHeight 192 # Input frame height +FrameRateIn 12 # Input frame rate [Hz] +FrameRateOut 12 # Output frame rate [Hz] +InputFile CiscoVT2people_320x192_12fps.yuv # Input file +ReconFile rec_layer2.yuv # Reconstructed file + +#============================== CODING ============================== +ProfileIdc 66 # value of profile_idc (or 0 for auto detection) + +InitialQP 24 # Quantization parameters for base quality layer +#================================ RATE CONTROL =============================== +SpatialBitrate 600 # Unit: kbps, controled by DisableRC also +#============================== MultiSlice Slice Argument ============================== +# for S/M Slice(s) mode settings +SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below; +SliceSize 1500 +SliceNum 1 # multiple slices number specified + +SlicesAssign0 960 # count number of MBs in slice #0 +SlicesAssign1 0 # count number of MBs in slice #1 +SlicesAssign2 0 # count number of MBs in slice #2 +SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing +SlicesAssign4 0 # count number of MBs in slice #4 +SlicesAssign5 0 # count number of MBs in slice #5 +SlicesAssign6 0 # count number of MBs in slice #6 +SlicesAssign7 0 # count number of MBs in slice #7 + +### DESIGN OF SLICE MODE, 100804, Sijia #### +# 0 SM_SINGLE_SLICE | SliceNum==1 +# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread +# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved. +# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. 
Typical of single row of mbs each slice?+ slice size constraint which including re-encoding +# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame) + diff --git a/testbin/layer2_vd_rc.cfg b/testbin/layer2_vd_rc.cfg index 0112846b..3c59b970 100644 --- a/testbin/layer2_vd_rc.cfg +++ b/testbin/layer2_vd_rc.cfg @@ -1,39 +1,39 @@ -# Layer Configuration File - - -#============================== INPUT / OUTPUT ============================== -SourceWidth 320 # Input frame width -SourceHeight 192 # Input frame height -FrameRateIn 12 # Input frame rate [Hz] -FrameRateOut 12 # Output frame rate [Hz] -InputFile CiscoVT2people_320x192_12fps.yuv # Input file -ReconFile rec_layer2.yuv # Reconstructed file - -#============================== CODING ============================== -ProfileIdc 66 # value of profile_idc (or 0 for auto detection) - -InitialQP 24 # Quantization parameters for base quality layer -#================================ RATE CONTROL =============================== -SpatialBitrate 600 # Unit: kbps, controled by DisableRC also -#============================== MultiSlice Slice Argument ============================== -# for S/M Slice(s) mode settings -SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below; -SliceSize 1500 -SliceNum 1 # multiple slices number specified - -SlicesAssign0 960 # count number of MBs in slice #0 -SlicesAssign1 0 # count number of MBs in slice #1 -SlicesAssign2 0 # count number of MBs in slice #2 -SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing -SlicesAssign4 0 # count number of MBs in slice #4 -SlicesAssign5 0 # count number of MBs in slice #5 -SlicesAssign6 0 # count number of MBs in slice #6 -SlicesAssign7 0 # count number of MBs in slice #7 - -### DESIGN OF SLICE MODE, 100804, Sijia #### -# 0 SM_SINGLE_SLICE | SliceNum==1 -# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread -# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved. -# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. 
Typical of single row of mbs each slice?+ slice size constraint which including re-encoding -# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame) - +# Layer Configuration File + + +#============================== INPUT / OUTPUT ============================== +SourceWidth 320 # Input frame width +SourceHeight 192 # Input frame height +FrameRateIn 12 # Input frame rate [Hz] +FrameRateOut 12 # Output frame rate [Hz] +InputFile CiscoVT2people_320x192_12fps.yuv # Input file +ReconFile rec_layer2.yuv # Reconstructed file + +#============================== CODING ============================== +ProfileIdc 66 # value of profile_idc (or 0 for auto detection) + +InitialQP 24 # Quantization parameters for base quality layer +#================================ RATE CONTROL =============================== +SpatialBitrate 600 # Unit: kbps, controled by DisableRC also +#============================== MultiSlice Slice Argument ============================== +# for S/M Slice(s) mode settings +SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below; +SliceSize 1500 +SliceNum 1 # multiple slices number specified + +SlicesAssign0 960 # count number of MBs in slice #0 +SlicesAssign1 0 # count number of MBs in slice #1 +SlicesAssign2 0 # count number of MBs in slice #2 +SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing +SlicesAssign4 0 # count number of MBs in slice #4 +SlicesAssign5 0 # count number of MBs in slice #5 +SlicesAssign6 0 # count number of MBs in slice #6 +SlicesAssign7 0 # count number of MBs in slice #7 + +### DESIGN OF SLICE MODE, 100804, Sijia #### +# 0 SM_SINGLE_SLICE | SliceNum==1 +# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread +# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved. +# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding +# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame) + diff --git a/testbin/welsenc.cfg b/testbin/welsenc.cfg index 66c79a3e..09cccd52 100644 --- a/testbin/welsenc.cfg +++ b/testbin/welsenc.cfg @@ -1,63 +1,63 @@ -# Cisco Scalable H.264/AVC Extension Encoder Configuration File - -#============================== GENERAL ============================== -OutputFile test.264 # Bitstream file -MaxFrameRate 30 # Maximum frame rate [Hz] -FramesToBeEncoded -1 # Number of frames (at input frame rate) - -GOPSize 4 # GOP Size (at maximum frame rate), 16 -IntraPeriod 0 # Intra Period ( multipler of GoP size or -1) -EnableSpsPpsIDAddition 1 - -EnableFrameCropping 1 # enable frame cropping flag - -#============================== LOOP FILTER ============================== -LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off, - # 2: on except for slice boundaries, - # 3: two stage. slice boundries on in second stage - # 4: Luma on but Chroma off (w.r.t. idc=0) - # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2) - # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. 
diff --git a/testbin/welsenc.cfg b/testbin/welsenc.cfg
index 66c79a3e..09cccd52 100644
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile           test.264     # Bitstream file
-MaxFrameRate         30           # Maximum frame rate [Hz]
-FramesToBeEncoded    -1           # Number of frames (at input frame rate)
-
-GOPSize              4            # GOP size (at maximum frame rate)
-IntraPeriod          0            # Intra period (multiplier of GOP size, or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping  1            # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0            # Loop filter idc (0: on, 1: off,
-                                  # 2: on except for slice boundaries,
-                                  # 3: two-stage, slice boundaries on in second stage,
-                                  # 4: Luma on but Chroma off (w.r.t. idc=0),
-                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
-                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0         # AlphaOffset, valid range: -6..+6
-LoopFilterBetaOffset 0            # BetaOffset, valid range: -6..+6
-
-InterLayerLoopFilterDisableIDC 0  # filter idc for inter-layer deblocking (0: on, 1: off,
-                                  # 2: on except for slice boundaries,
-                                  # 3: two-stage, slice boundaries on in second stage,
-                                  # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
-                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
-                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0  # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0     # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc    1            # 0: auto (decided dynamically inside the encoder); 1: multi-threading disabled; >1: number of threads
-
-#============================== RATE CONTROL ==============================
-EnableRC             1            # Enable rate control (1: enable, 0: disable)
-TargetBitrate        5000         # Unit: kbps, also controlled by EnableRC
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise        0            # Enable denoising (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1      # Enable scene change detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1       # BGD control (1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 1      # Enable adaptive quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 0         # Enable long term reference (1: enable, 0: disable)
-LtrMarkPeriod        30           # Long term reference marking period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl  0            # Control flag for adding prefix units (0: off, 1: on)
-                                  # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-                                  # Can be set to 0 (disabled) when no inter spatial layer prediction is used
-NumLayers            1            # Number of layers
-//LayerCfg           layer0.cfg   # Layer 0 configuration file
-//LayerCfg           layer1.cfg   # Layer 1 configuration file
-LayerCfg             layer2.cfg   # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile           test.264     # Bitstream file
+MaxFrameRate         30           # Maximum frame rate [Hz]
+FramesToBeEncoded    -1           # Number of frames (at input frame rate)
+
+GOPSize              4            # GOP size (at maximum frame rate)
+IntraPeriod          0            # Intra period (multiplier of GOP size, or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping  1            # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0            # Loop filter idc (0: on, 1: off,
+                                  # 2: on except for slice boundaries,
+                                  # 3: two-stage, slice boundaries on in second stage,
+                                  # 4: Luma on but Chroma off (w.r.t. idc=0),
+                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0         # AlphaOffset, valid range: -6..+6
+LoopFilterBetaOffset 0            # BetaOffset, valid range: -6..+6
+
+InterLayerLoopFilterDisableIDC 0  # filter idc for inter-layer deblocking (0: on, 1: off,
+                                  # 2: on except for slice boundaries,
+                                  # 3: two-stage, slice boundaries on in second stage,
+                                  # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0  # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0     # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc    1            # 0: auto (decided dynamically inside the encoder); 1: multi-threading disabled; >1: number of threads
+
+#============================== RATE CONTROL ==============================
+EnableRC             1            # Enable rate control (1: enable, 0: disable)
+TargetBitrate        5000         # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise        0            # Enable denoising (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1      # Enable scene change detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1       # BGD control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 1      # Enable adaptive quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 0         # Enable long term reference (1: enable, 0: disable)
+LtrMarkPeriod        30           # Long term reference marking period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl  0            # Control flag for adding prefix units (0: off, 1: on)
+                                  # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+                                  # Can be set to 0 (disabled) when no inter spatial layer prediction is used
+NumLayers            1            # Number of layers
+//LayerCfg           layer0.cfg   # Layer 0 configuration file
+//LayerCfg           layer1.cfg   # Layer 1 configuration file
+LayerCfg             layer2.cfg   # Layer 2 configuration file
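All of the welsenc*.cfg files in this patch share one trivial line format: a key token, a value token, and an optional '#' comment, with '//' used to comment out whole LayerCfg lines. A self-contained sketch of a reader for that format (illustrative only; the console tool ships its own parser):

    #include <fstream>
    #include <map>
    #include <sstream>
    #include <string>

    // Minimal sketch of a "Key Value  # comment" reader for the cfg format
    // shown above. Lines starting with '#' are skipped once the comment is
    // stripped; keys starting with "//" (disabled LayerCfg entries) are
    // ignored as well.
    std::map<std::string, std::string> ReadCfg (const char* path) {
      std::map<std::string, std::string> kv;
      std::ifstream in (path);
      std::string line;
      while (std::getline (in, line)) {
        std::string::size_type hash = line.find ('#');
        if (hash != std::string::npos)
          line.erase (hash);                  // strip trailing comment
        std::istringstream ls (line);
        std::string key, value;
        if (ls >> key >> value && key.compare (0, 2, "//") != 0)
          kv[key] = value;                    // e.g. kv["TargetBitrate"] == "5000"
      }
      return kv;
    }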
diff --git a/testbin/welsenc_vd_1d.cfg b/testbin/welsenc_vd_1d.cfg
index bdd90222..41d8f773 100644
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile           test_vd_1d.264  # Bitstream file
-MaxFrameRate         30           # Maximum frame rate [Hz]
-FramesToBeEncoded    -1           # Number of frames (at input frame rate)
-
-GOPSize              4            # GOP size (at maximum frame rate)
-IntraPeriod          0            # Intra period (multiplier of GOP size, or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping  1            # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0            # Loop filter idc (0: on, 1: off,
-                                  # 2: on except for slice boundaries,
-                                  # 3: two-stage, slice boundaries on in second stage,
-                                  # 4: Luma on but Chroma off (w.r.t. idc=0),
-                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
-                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0         # AlphaOffset, valid range: -6..+6
-LoopFilterBetaOffset 0            # BetaOffset, valid range: -6..+6
-
-InterLayerLoopFilterDisableIDC 0  # filter idc for inter-layer deblocking (0: on, 1: off,
-                                  # 2: on except for slice boundaries,
-                                  # 3: two-stage, slice boundaries on in second stage,
-                                  # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
-                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
-                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0  # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0     # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc    1            # 0: auto (decided dynamically inside the encoder); 1: multi-threading disabled; >1: number of threads
-
-#============================== RATE CONTROL ==============================
-EnableRC             0            # Enable rate control (1: enable, 0: disable)
-TargetBitrate        5000         # Unit: kbps, also controlled by EnableRC
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise        0            # Enable denoising (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1      # Enable scene change detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1       # BGD control (1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 0      # Enable adaptive quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 1         # Enable long term reference (1: enable, 0: disable)
-LtrMarkPeriod        30           # Long term reference marking period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl  0            # Control flag for adding prefix units (0: off, 1: on)
-                                  # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-                                  # Can be set to 0 (disabled) when no inter spatial layer prediction is used
-NumLayers            1            # Number of layers
-//LayerCfg           layer0_vd.cfg  # Layer 0 configuration file
-//LayerCfg           layer1_vd.cfg  # Layer 1 configuration file
-LayerCfg             layer2_vd.cfg  # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile           test_vd_1d.264  # Bitstream file
+MaxFrameRate         30           # Maximum frame rate [Hz]
+FramesToBeEncoded    -1           # Number of frames (at input frame rate)
+
+GOPSize              4            # GOP size (at maximum frame rate)
+IntraPeriod          0            # Intra period (multiplier of GOP size, or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping  1            # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0            # Loop filter idc (0: on, 1: off,
+                                  # 2: on except for slice boundaries,
+                                  # 3: two-stage, slice boundaries on in second stage,
+                                  # 4: Luma on but Chroma off (w.r.t. idc=0),
+                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0         # AlphaOffset, valid range: -6..+6
+LoopFilterBetaOffset 0            # BetaOffset, valid range: -6..+6
+
+InterLayerLoopFilterDisableIDC 0  # filter idc for inter-layer deblocking (0: on, 1: off,
+                                  # 2: on except for slice boundaries,
+                                  # 3: two-stage, slice boundaries on in second stage,
+                                  # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0  # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0     # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc    1            # 0: auto (decided dynamically inside the encoder); 1: multi-threading disabled; >1: number of threads
+
+#============================== RATE CONTROL ==============================
+EnableRC             0            # Enable rate control (1: enable, 0: disable)
+TargetBitrate        5000         # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise        0            # Enable denoising (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1      # Enable scene change detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1       # BGD control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 0      # Enable adaptive quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 1         # Enable long term reference (1: enable, 0: disable)
+LtrMarkPeriod        30           # Long term reference marking period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl  0            # Control flag for adding prefix units (0: off, 1: on)
+                                  # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+                                  # Can be set to 0 (disabled) when no inter spatial layer prediction is used
+NumLayers            1            # Number of layers
+//LayerCfg           layer0_vd.cfg  # Layer 0 configuration file
+//LayerCfg           layer1_vd.cfg  # Layer 1 configuration file
+LayerCfg             layer2_vd.cfg  # Layer 2 configuration file
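The idc tables above reduce to three per-edge questions: is deblocking enabled at all, does the edge lie on a slice boundary, and is the plane luma or chroma. A hedged sketch of that decision logic (the two-stage behavior of idc 3/6 is collapsed to its end state; this is not the WelsEnc implementation):

    // Decide whether to deblock one edge, following the idc table in the
    // comments above (values 0-6). Only the per-edge on/off question is
    // answered; staging for idc 3/6 is omitted. Sketch, not encoder code.
    bool ShouldFilterEdge (int idc, bool onSliceBoundary, bool isChroma) {
      switch (idc) {
        case 0: return true;                           // all edges filtered
        case 1: return false;                          // deblocking off
        case 2: return !onSliceBoundary;               // skip slice boundaries
        case 3: return true;                           // two-stage: boundaries in stage 2
        case 4: return !isChroma;                      // luma only (else as idc=0)
        case 5: return !isChroma && !onSliceBoundary;  // luma only (as idc=2)
        case 6: return !isChroma;                      // luma only, two-stage (as idc=3)
        default: return true;
      }
    }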
diff --git a/testbin/welsenc_vd_rc.cfg b/testbin/welsenc_vd_rc.cfg
index ed981149..9351c045 100644
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile           test_vd_rc.264  # Bitstream file
-MaxFrameRate         30           # Maximum frame rate [Hz]
-FramesToBeEncoded    -1           # Number of frames (at input frame rate); -1: all frames
-
-GOPSize              8            # GOP size (at maximum frame rate)
-IntraPeriod          0            # Intra period (multiplier of GOP size, or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping  1            # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0            # Loop filter idc (0: on, 1: off,
-                                  # 2: on except for slice boundaries,
-                                  # 3: two-stage, slice boundaries on in second stage,
-                                  # 4: Luma on but Chroma off (w.r.t. idc=0),
-                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
-                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0         # AlphaOffset, valid range: -6..+6
-LoopFilterBetaOffset 0            # BetaOffset, valid range: -6..+6
-
-InterLayerLoopFilterDisableIDC 0  # filter idc for inter-layer deblocking (0: on, 1: off,
-                                  # 2: on except for slice boundaries,
-                                  # 3: two-stage, slice boundaries on in second stage,
-                                  # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
-                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
-                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0  # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0     # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc    1            # 0: auto (decided dynamically inside the encoder); 1: multi-threading disabled; >1: number of threads
-
-#============================== RATE CONTROL ==============================
-EnableRC             1            # Enable rate control (1: enable, 0: disable)
-TargetBitrate        600          # Unit: kbps, also controlled by EnableRC
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise        1            # Enable denoising (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1      # Enable scene change detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1       # BGD control (1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 1      # Enable adaptive quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 1         # Enable long term reference (1: enable, 0: disable)
-LtrMarkPeriod        30           # Long term reference marking period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl  0            # Control flag for adding prefix units (0: off, 1: on)
-                                  # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-                                  # Can be set to 0 (disabled) when no inter spatial layer prediction is used
-NumLayers            1            # Number of layers
-//LayerCfg           layer0_vd.cfg  # Layer 0 configuration file
-//LayerCfg           layer1_vd.cfg  # Layer 1 configuration file
-LayerCfg             layer2_vd_rc.cfg  # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile           test_vd_rc.264  # Bitstream file
+MaxFrameRate         30           # Maximum frame rate [Hz]
+FramesToBeEncoded    -1           # Number of frames (at input frame rate); -1: all frames
+
+GOPSize              8            # GOP size (at maximum frame rate)
+IntraPeriod          0            # Intra period (multiplier of GOP size, or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping  1            # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0            # Loop filter idc (0: on, 1: off,
+                                  # 2: on except for slice boundaries,
+                                  # 3: two-stage, slice boundaries on in second stage,
+                                  # 4: Luma on but Chroma off (w.r.t. idc=0),
+                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0         # AlphaOffset, valid range: -6..+6
+LoopFilterBetaOffset 0            # BetaOffset, valid range: -6..+6
+
+InterLayerLoopFilterDisableIDC 0  # filter idc for inter-layer deblocking (0: on, 1: off,
+                                  # 2: on except for slice boundaries,
+                                  # 3: two-stage, slice boundaries on in second stage,
+                                  # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+                                  # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                  # 6: Luma on in two stages, slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0  # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0     # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc    1            # 0: auto (decided dynamically inside the encoder); 1: multi-threading disabled; >1: number of threads
+
+#============================== RATE CONTROL ==============================
+EnableRC             1            # Enable rate control (1: enable, 0: disable)
+TargetBitrate        600          # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise        1            # Enable denoising (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1      # Enable scene change detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1       # BGD control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 1      # Enable adaptive quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 1         # Enable long term reference (1: enable, 0: disable)
+LtrMarkPeriod        30           # Long term reference marking period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl  0            # Control flag for adding prefix units (0: off, 1: on)
+                                  # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+                                  # Can be set to 0 (disabled) when no inter spatial layer prediction is used
+NumLayers            1            # Number of layers
+//LayerCfg           layer0_vd.cfg  # Layer 0 configuration file
+//LayerCfg           layer1_vd.cfg  # Layer 1 configuration file
+LayerCfg             layer2_vd_rc.cfg  # Layer 2 configuration file
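As a sanity check on the rate-control numbers in welsenc_vd_rc.cfg: TargetBitrate 600 at MaxFrameRate 30 leaves an average budget of 600/30 = 20 kbit, i.e. roughly 2500 bytes per frame, the same order of magnitude as the 1500-byte SliceSize constraint in the layer configs. Illustrative arithmetic only:

    #include <cstdio>

    // Average per-frame budget implied by the rate-control settings in
    // welsenc_vd_rc.cfg (TargetBitrate 600 kbps, MaxFrameRate 30 Hz).
    int main () {
      const int kTargetKbps = 600;   // TargetBitrate, kbps
      const int kFps = 30;           // MaxFrameRate, Hz
      const double kbitPerFrame = static_cast<double> (kTargetKbps) / kFps;
      std::printf ("%.1f kbit (~%.0f bytes) per frame on average\n",
                   kbitPerFrame, kbitPerFrame * 1000.0 / 8.0);
      return 0;  // prints: 20.0 kbit (~2500 bytes) per frame on average
    }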