
Most of it was removed in ff6b669176 from C++ source files, but other files were left unchanged.
;*!
;* \copy
;*     Copyright (c) 2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  dct.asm
;*
;*  Abstract
;*      WelsDctFourT4_sse2
;*
;*  History
;*      8/4/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

BITS 32

SECTION .rodata align=16

;***********************************************************************
; Constant
;***********************************************************************

align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
        dw 10, 13, 10, 13, 13, 16, 13, 16,
        dw 11, 14, 11, 14, 14, 18, 14, 18,
        dw 11, 14, 11, 14, 14, 18, 14, 18,
        dw 13, 16, 13, 16, 16, 20, 16, 20,
        dw 13, 16, 13, 16, 16, 20, 16, 20,
        dw 14, 18, 14, 18, 18, 23, 18, 23,
        dw 14, 18, 14, 18, 18, 23, 18, 23,
        dw 16, 20, 16, 20, 20, 25, 20, 25,
        dw 16, 20, 16, 20, 20, 25, 20, 25,
        dw 18, 23, 18, 23, 23, 29, 23, 29,
        dw 18, 23, 18, 23, 23, 29, 23, 29
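
; H.264 dequantisation scale table: each pair of identical 8-word rows is
; the 4x4 matrix for one QP remainder (0..5), built from the value triples
; (a,b,c) = (10,13,16), (11,14,18), (13,16,20), (14,18,23), (16,20,25),
; (18,23,29), laid out as a,b,a,b / b,c,b,c.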


;***********************************************************************
; MMX functions
;***********************************************************************

; Load 4 pixels from %3 and from %4, zero-extend bytes to words (%5 must
; hold zero), and compute the difference %1 = [%3] - [%4].
%macro MMX_LoadDiff4P 5
    movd        %1, [%3]
    movd        %2, [%4]
    punpcklbw   %1, %5
    punpcklbw   %2, %5
    psubw       %1, %2
%endmacro

%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
    MMX_LoadDiff4P %1, %9, %5,    %7,    %10
    MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
    lea     %5, [%5+2*%6]
    lea     %7, [%7+2*%8]
    MMX_LoadDiff4P %3, %9, %5,    %7,    %10
    MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
%endmacro

; %1 = 2*%1 + %2, %3 = %1 - 2*%2 (on the original values; forward-transform
; odd-part butterfly)
%macro MMX_SumSubMul2 3
    movq    %3, %1
    psllw   %1, $1
    paddw   %1, %2
    psllw   %2, $1
    psubw   %3, %2
%endmacro

; %3 = %1 + (%2>>1), %1 = (%1>>1) - %2 (inverse-transform odd-part butterfly)
%macro MMX_SumSubDiv2 3
    movq    %3, %2
    psraw   %3, $1
    paddw   %3, %1
    psraw   %1, $1
    psubw   %1, %2
%endmacro

; Butterfly: %1 = %1 + %2, %2 = %2 - %1 (on the original values; %3 is
; clobbered with the old %2)
%macro MMX_SumSub 3
    movq    %3, %2
    psubw   %2, %1
    paddw   %1, %3
%endmacro

%macro MMX_DCT 6
    MMX_SumSub      %4, %1, %6
    MMX_SumSub      %3, %2, %6
    MMX_SumSub      %3, %4, %6
    MMX_SumSubMul2  %1, %2, %5
%endmacro

%macro MMX_IDCT 6
    MMX_SumSub      %4, %5, %6
    MMX_SumSubDiv2  %3, %2, %1
    MMX_SumSub      %1, %4, %6
    MMX_SumSub      %3, %5, %6
%endmacro

%macro MMX_StoreDiff4P 6
    movd        %2, %6
    punpcklbw   %2, %4
    paddw       %1, %3
    psraw       %1, $6
    paddsw      %1, %2
    packuswb    %1, %2
    movd        %5, %1
%endmacro

ALIGN 16
;***********************************************************************
; void __cdecl WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
WELS_EXTERN WelsDctT4_mmx
WelsDctT4_mmx:
    push    ebx
    mov     eax, [esp+12]   ; pix1
    mov     ebx, [esp+16]   ; i_pix1
    mov     ecx, [esp+20]   ; pix2
    mov     edx, [esp+24]   ; i_pix2

    WELS_Zero mm7

    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7

    MMX_DCT         mm1, mm2, mm3, mm4, mm5, mm6
    MMX_Trans4x4W   mm3, mm1, mm4, mm5, mm2

    MMX_DCT         mm3, mm5, mm2, mm4, mm1, mm6
    MMX_Trans4x4W   mm2, mm3, mm4, mm1, mm5

    mov     eax, [esp+ 8]   ; pDct
    movq    [eax+ 0], mm2
    movq    [eax+ 8], mm1
    movq    [eax+16], mm5
    movq    [eax+24], mm4

    WELSEMMS
    pop     ebx
    ret


;***********************************************************************
; void __cdecl WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
;***********************************************************************
WELS_EXTERN WelsIDctT4Rec_mmx
WelsIDctT4Rec_mmx:
    push    ebx
%define pushsize 4
%define p_dst   esp+pushsize+4
%define i_dst   esp+pushsize+8
%define p_pred  esp+pushsize+12
%define i_pred  esp+pushsize+16
%define pDct    esp+pushsize+20

    mov     eax, [pDct]
    movq    mm0, [eax+ 0]
    movq    mm1, [eax+ 8]
    movq    mm2, [eax+16]
    movq    mm3, [eax+24]
    mov     edx, [p_dst]
    mov     ecx, [i_dst]
    mov     eax, [p_pred]
    mov     ebx, [i_pred]

    MMX_Trans4x4W   mm0, mm1, mm2, mm3, mm4
    MMX_IDCT        mm1, mm2, mm3, mm4, mm0, mm6
    MMX_Trans4x4W   mm1, mm3, mm0, mm4, mm2
    MMX_IDCT        mm3, mm0, mm4, mm2, mm1, mm6

    WELS_Zero   mm7
    WELS_DW32   mm6

    MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx], [eax]
    MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
    lea     edx, [edx+2*ecx]
    lea     eax, [eax+2*ebx]
    MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx], [eax]
    MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]

    WELSEMMS
%undef pushsize
%undef p_dst
%undef i_dst
%undef p_pred
%undef i_pred
%undef pDct
    pop     ebx
    ret


;***********************************************************************
; SSE2 functions
;***********************************************************************

; Pair the quadwords of the four source registers (SSE2_XSawp, defined in
; asm_inc.asm) and store 64 bytes of coefficients at %1.
%macro SSE2_Store4x8p 6
    SSE2_XSawp qdq, %2, %3, %6
    SSE2_XSawp qdq, %4, %5, %3
    MOVDQ [%1+0x00], %2
    MOVDQ [%1+0x10], %4
    MOVDQ [%1+0x20], %6
    MOVDQ [%1+0x30], %3
%endmacro

; Counterpart of SSE2_Store4x8p: reload 64 bytes of coefficients from %1
; into the register layout the transform expects.
%macro SSE2_Load4x8p 6
    MOVDQ %2, [%1+0x00]
    MOVDQ %4, [%1+0x10]
    MOVDQ %6, [%1+0x20]
    MOVDQ %3, [%1+0x30]
    SSE2_XSawp qdq, %4, %3, %5
    SSE2_XSawp qdq, %2, %6, %3
%endmacro

; %1 = 2*%1 + %2, %3 = %1 - 2*%2 (wider version of MMX_SumSubMul2)
%macro SSE2_SumSubMul2 3
    movdqa  %3, %1
    paddw   %1, %1
    paddw   %1, %2
    psubw   %3, %2
    psubw   %3, %2
%endmacro

; %1 = %1 + (%2>>1), %4 = (%1>>1) - %2; %3 is clobbered with the old %2
%macro SSE2_SumSubDiv2 4
    movdqa  %4, %1
    movdqa  %3, %2
    psraw   %2, $1
    psraw   %4, $1
    paddw   %1, %2
    psubw   %4, %3
%endmacro

; Six-argument form: round (%3 = vector of 32s), scale by >>6, add the
; 8-pixel prediction %6 and store the reconstructed row to %5 (%4 must be
; zero).
%macro SSE2_StoreDiff8p 6
    paddw       %1, %3
    psraw       %1, $6
    movq        %2, %6
    punpcklbw   %2, %4
    paddsw      %2, %1
    packuswb    %2, %2
    movq        %5, %2
%endmacro

; Five-argument form: as above but without the rounding step, for callers
; that have already applied (x+32)>>6 (see SSE2_Load8DC).
%macro SSE2_StoreDiff8p 5
    movq        %2, %5
    punpcklbw   %2, %3
    paddsw      %2, %1
    packuswb    %2, %2
    movq        %4, %2
%endmacro

%macro SSE2_Load8DC 6
    movdqa      %1, %6  ; %1 = dc0 dc1
    paddw       %1, %5
    psraw       %1, $6  ; (dc + 32) >> 6

    movdqa      %2, %1
    psrldq      %2, 4
    punpcklwd   %2, %2
    punpckldq   %2, %2  ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3

    movdqa      %3, %1
    psrldq      %3, 8
    punpcklwd   %3, %3
    punpckldq   %3, %3  ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5

    movdqa      %4, %1
    psrldq      %4, 12
    punpcklwd   %4, %4
    punpckldq   %4, %4  ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7

    punpcklwd   %1, %1
    punpckldq   %1, %1  ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
%endmacro
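
; SSE2_Load8DC rounds eight DC levels from %6 by (dc+32)>>6 (with %5
; holding a vector of 32s) and broadcasts each level to four words, so one
; register provides the reconstruction bias for two horizontally adjacent
; 4x4 blocks in a row.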

%macro SSE2_DCT 6
    SSE2_SumSub     %6, %3, %5
    SSE2_SumSub     %1, %2, %5
    SSE2_SumSub     %3, %2, %5
    SSE2_SumSubMul2 %6, %1, %4
%endmacro

%macro SSE2_IDCT 7
    SSE2_SumSub     %7, %2, %6
    SSE2_SumSubDiv2 %1, %3, %5, %4
    SSE2_SumSub     %2, %1, %5
    SSE2_SumSub     %7, %4, %5
%endmacro

;***********************************************************************
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
WELS_EXTERN WelsDctFourT4_sse2
ALIGN 16
WelsDctFourT4_sse2:
    push    ebx
    push    esi
    mov     esi, [esp+12]   ; pDct
    mov     eax, [esp+16]   ; pix1
    mov     ebx, [esp+20]   ; i_pix1
    mov     ecx, [esp+24]   ; pix2
    mov     edx, [esp+28]   ; i_pix2

    pxor    xmm7, xmm7

    ; Load 4x8
    SSE2_LoadDiff8P xmm0, xmm6, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P xmm1, xmm6, xmm7, [eax+ebx], [ecx+edx]
    lea     eax, [eax + 2 * ebx]
    lea     ecx, [ecx + 2 * edx]
    SSE2_LoadDiff8P xmm2, xmm6, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]

    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0

    SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5

    lea     eax, [eax + 2 * ebx]
    lea     ecx, [ecx + 2 * edx]

    ; Load 4x8
    SSE2_LoadDiff8P xmm0, xmm6, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P xmm1, xmm6, xmm7, [eax+ebx], [ecx+edx]
    lea     eax, [eax + 2 * ebx]
    lea     ecx, [ecx + 2 * edx]
    SSE2_LoadDiff8P xmm2, xmm6, xmm7, [eax], [ecx]
    SSE2_LoadDiff8P xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]

    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0

    lea     esi, [esi+64]
    SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5

    pop     esi
    pop     ebx
    ret


%define rec         esp + pushsize + 4
%define stride      esp + pushsize + 8
%define pred        esp + pushsize + 12
%define pred_stride esp + pushsize + 16
%define rs          esp + pushsize + 20
;***********************************************************************
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
ALIGN 16
WelsIDctFourT4Rec_sse2:
%define pushsize 8
    push    ebx
    push    esi

    mov     eax, [rec]
    mov     ebx, [stride]
    mov     ecx, [pred]
    mov     edx, [pred_stride]
    mov     esi, [rs]

    ; Load 4x8
    SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5

    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

    WELS_Zero   xmm7
    WELS_DW32   xmm6

    SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [eax], [ecx]
    SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]
    lea     eax, [eax + 2 * ebx]
    lea     ecx, [ecx + 2 * edx]
    SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [eax], [ecx]
    SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]

    add     esi, 64
    lea     eax, [eax + 2 * ebx]
    lea     ecx, [ecx + 2 * edx]
    SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5

    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

    WELS_Zero   xmm7
    WELS_DW32   xmm6

    SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [eax], [ecx]
    SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]
    lea     eax, [eax + 2 * ebx]
    lea     ecx, [ecx + 2 * edx]
    SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [eax], [ecx]
    SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]

    pop     esi
    pop     ebx
    ret

%macro SSE2_StoreDiff4x8p 8
    SSE2_StoreDiff8p %1, %3, %4, [%5],          [%6]
    SSE2_StoreDiff8p %1, %3, %4, [%5 + %7],     [%6 + %8]
    SSE2_StoreDiff8p %2, %3, %4, [%5 + 8],      [%6 + 8]
    SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
%endmacro
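
; SSE2_StoreDiff4x8p reconstructs a 16x2 pixel strip: %1 supplies the left
; eight pixels of both rows and %2 the right eight, via the 5-argument
; SSE2_StoreDiff8p (the DC terms were already rounded by SSE2_Load8DC, so
; no further rounding is applied here).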

;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
ALIGN 16
%define pushsize 8
%define luma_dc esp + pushsize + 20
WelsIDctRecI16x16Dc_sse2:
    push    esi
    push    edi

    mov     ecx, [luma_dc]
    mov     eax, [rec]
    mov     edx, [stride]
    mov     esi, [pred]
    mov     edi, [pred_stride]
    pxor    xmm7, xmm7
    WELS_DW32 xmm6

    SSE2_Load8DC        xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
    SSE2_StoreDiff4x8p  xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi

    lea     eax, [eax + 2 * edx]
    lea     esi, [esi + 2 * edi]
    SSE2_StoreDiff4x8p  xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi

    lea     eax, [eax + 2 * edx]
    lea     esi, [esi + 2 * edi]
    SSE2_StoreDiff4x8p  xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi

    lea     eax, [eax + 2 * edx]
    lea     esi, [esi + 2 * edi]
    SSE2_StoreDiff4x8p  xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi

    SSE2_Load8DC        xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
    lea     eax, [eax + 2 * edx]
    lea     esi, [esi + 2 * edi]
    SSE2_StoreDiff4x8p  xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi

    lea     eax, [eax + 2 * edx]
    lea     esi, [esi + 2 * edi]
    SSE2_StoreDiff4x8p  xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi

    lea     eax, [eax + 2 * edx]
    lea     esi, [esi + 2 * edi]
    SSE2_StoreDiff4x8p  xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi

    lea     eax, [eax + 2 * edx]
    lea     esi, [esi + 2 * edi]
    SSE2_StoreDiff4x8p  xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi

    pop     edi
    pop     esi
    ret


; Dword butterfly: %2 = %1 + %2, %1 = %1 - %2 (on the original values;
; %3 is clobbered with the old %2)
%macro SSE2_SumSubD 3
    movdqa  %3, %2
    paddd   %2, %1
    psubd   %1, %3
%endmacro

; %1 = (%1 + %2 + 1) >> 1, %4 = (%1 - %2 + 1) >> 1, with %3 holding 1s
%macro SSE2_SumSubDiv2D 4
    paddd   %1, %2
    paddd   %1, %3
    psrad   %1, 1
    movdqa  %4, %1
    psubd   %4, %2
%endmacro
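
; One shift serves both outputs because
;   ((a + b + 1) >> 1) - b == (a - b + 1) >> 1
; which is exactly the pair of rounded half-sums and half-differences
; noted in the "pOut" comments at the call sites below.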

%macro SSE2_Load4Col 5
    movsx   edx, WORD [%5]
    movd    %1, edx
    movsx   edx, WORD [%5 + 0x20]
    movd    %2, edx
    punpckldq %1, %2
    movsx   edx, WORD [%5 + 0x80]
    movd    %3, edx
    movsx   edx, WORD [%5 + 0xa0]
    movd    %4, edx
    punpckldq %3, %4
    punpcklqdq %1, %3
%endmacro
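
; Each 4x4 coefficient block takes 0x20 bytes (16 words), so SSE2_Load4Col
; gathers one sign-extended DC term (the block's first coefficient) from
; four blocks; combined with the 0x40/0x100/0x140 bases at the call sites,
; this appears to collect the 16 luma DC values of a macroblock, assuming
; the blocks are stored consecutively in pDct.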

;***********************************************************************
; void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
WelsHadamardT4Dc_sse2:
    mov     eax, [esp + 4]  ; luma_dc
    mov     ecx, [esp + 8]  ; pDct

    SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, ecx
    SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, ecx + 0x40
    SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, ecx + 0x100
    SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, ecx + 0x140

    SSE2_SumSubD xmm1, xmm2, xmm7
    SSE2_SumSubD xmm3, xmm4, xmm7
    SSE2_SumSubD xmm2, xmm4, xmm7
    SSE2_SumSubD xmm1, xmm3, xmm7

    SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5    ; pOut: xmm4,xmm3,xmm5,xmm1

    SSE2_SumSubD xmm4, xmm3, xmm7
    SSE2_SumSubD xmm5, xmm1, xmm7

    WELS_DD1 xmm6
    SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0    ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
    SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1    ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
    SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2    ; pOut: xmm3,xmm4,xmm2,xmm1

    ; Pack the 32-bit results back to int16 and store the 4x4
    ; Hadamard-transformed DC block.
    packssdw xmm3, xmm4
    packssdw xmm2, xmm1
    movdqa  [eax+ 0], xmm3
    movdqa  [eax+16], xmm2

    ret