openh264/codec/encoder/core/asm/dct.asm

580 lines
16 KiB
NASM
Raw Normal View History

2013-12-09 13:51:09 +01:00
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* ?Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* ?Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* dct.asm
;*
;* Abstract
;* WelsDctFourT4_sse2
;*
;* History
;* 8/4/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
SECTION .rodata align=16
;***********************************************************************
; Constant
;***********************************************************************
2013-12-09 13:51:09 +01:00
align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
2013-12-09 13:51:09 +01:00
dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 11, 14, 11, 14, 14, 18, 14, 18,
2013-12-09 13:51:09 +01:00
dw 11, 14, 11, 14, 14, 18, 14, 18,
dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 14, 18, 14, 18, 18, 23, 18, 23,
2013-12-09 13:51:09 +01:00
dw 14, 18, 14, 18, 18, 23, 18, 23,
dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 18, 23, 18, 23, 23, 29, 23, 29,
2013-12-09 13:51:09 +01:00
dw 18, 23, 18, 23, 23, 29, 23, 29
2013-12-09 13:51:09 +01:00
;***********************************************************************
; MMX functions
;***********************************************************************
2013-12-09 13:51:09 +01:00
%macro MMX_LoadDiff4P 5
movd %1, [%3]
movd %2, [%4]
punpcklbw %1, %5
punpcklbw %2, %5
psubw %1, %2
%endmacro
%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
MMX_LoadDiff4P %1, %9, %5, %7, %10
MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
lea %5, [%5+2*%6]
lea %7, [%7+2*%8]
MMX_LoadDiff4P %3, %9, %5, %7, %10
MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
%endmacro
%macro MMX_SumSubMul2 3
movq %3, %1
psllw %1, $1
paddw %1, %2
psllw %2, $1
psubw %3, %2
%endmacro
%macro MMX_SumSubDiv2 3
movq %3, %2
psraw %3, $1
paddw %3, %1
psraw %1, $1
psubw %1, %2
%endmacro
%macro MMX_SumSub 3
movq %3, %2
psubw %2, %1
paddw %1, %3
%endmacro
%macro MMX_DCT 6
MMX_SumSub %4, %1, %6
MMX_SumSub %3, %2, %6
MMX_SumSub %3, %4, %6
MMX_SumSubMul2 %1, %2, %5
2013-12-09 13:51:09 +01:00
%endmacro
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1
MMX_SumSub %1, %4, %6
MMX_SumSub %3, %5, %6
%endmacro
%macro MMX_StoreDiff4P 6
movd %2, %6
punpcklbw %2, %4
paddw %1, %3
psraw %1, $6
paddsw %1, %2
packuswb %1, %2
movd %5, %1
%endmacro
2014-01-03 07:49:45 +01:00
SECTION .text
2013-12-09 13:51:09 +01:00
ALIGN 16
;***********************************************************************
; void __cdecl WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
WELS_EXTERN WelsDctT4_mmx
WelsDctT4_mmx:
2014-01-03 07:49:45 +01:00
;push ebx
;mov eax, [esp+12] ; pix1
;mov ebx, [esp+16] ; i_pix1
;mov ecx, [esp+20] ; pix2
;mov edx, [esp+24] ; i_pix2
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
movsx r2, r2d
movsx r4, r4d
%endif
2013-12-09 13:51:09 +01:00
WELS_Zero mm7
2014-01-03 07:49:45 +01:00
MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
2013-12-09 13:51:09 +01:00
MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
2013-12-09 13:51:09 +01:00
MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
2013-12-09 13:51:09 +01:00
MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
2014-01-03 07:49:45 +01:00
;mov eax, [esp+ 8] ; pDct
movq [r0+ 0], mm2
movq [r0+ 8], mm1
movq [r0+16], mm5
movq [r0+24], mm4
WELSEMMS
LOAD_5_PARA_POP
;pop ebx
2013-12-09 13:51:09 +01:00
ret
;***********************************************************************
; void __cdecl WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
;***********************************************************************
WELS_EXTERN WelsIDctT4Rec_mmx
WelsIDctT4Rec_mmx:
2014-01-03 07:49:45 +01:00
;push ebx
;%define pushsize 4
;%define p_dst esp+pushsize+4
;%define i_dst esp+pushsize+8
;%define p_pred esp+pushsize+12
;%define i_pred esp+pushsize+16
;%define pDct esp+pushsize+20
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
%endif
; mov eax, [pDct ]
movq mm0, [r4+ 0]
movq mm1, [r4+ 8]
movq mm2, [r4+16]
movq mm3, [r4+24]
;mov edx, [p_dst ] ; r0
;mov ecx, [i_dst ] ; r1
;mov eax, [p_pred] ; r2
;mov ebx, [i_pred] ; r3
2013-12-09 13:51:09 +01:00
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
WELS_Zero mm7
WELS_DW32 mm6
2014-01-03 07:49:45 +01:00
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
2013-12-09 13:51:09 +01:00
WELSEMMS
2014-01-03 07:49:45 +01:00
LOAD_5_PARA_POP
;%undef pushsize
;%undef p_dst
;%undef i_dst
;%undef p_pred
;%undef i_pred
;%undef pDct
; pop ebx
2013-12-09 13:51:09 +01:00
ret
;***********************************************************************
; SSE2 functions
;***********************************************************************
%macro SSE2_Store4x8p 6
SSE2_XSawp qdq, %2, %3, %6
SSE2_XSawp qdq, %4, %5, %3
MOVDQ [%1+0x00], %2
MOVDQ [%1+0x10], %4
MOVDQ [%1+0x20], %6
MOVDQ [%1+0x30], %3
2013-12-09 13:51:09 +01:00
%endmacro
%macro SSE2_Load4x8p 6
MOVDQ %2, [%1+0x00]
MOVDQ %4, [%1+0x10]
MOVDQ %6, [%1+0x20]
MOVDQ %3, [%1+0x30]
2013-12-09 13:51:09 +01:00
SSE2_XSawp qdq, %4, %3, %5
SSE2_XSawp qdq, %2, %6, %3
%endmacro
%macro SSE2_SumSubMul2 3
movdqa %3, %1
paddw %1, %1
paddw %1, %2
psubw %3, %2
psubw %3, %2
%endmacro
%macro SSE2_SumSubDiv2 4
movdqa %4, %1
movdqa %3, %2
psraw %2, $1
psraw %4, $1
paddw %1, %2
psubw %4, %3
%endmacro
%macro SSE2_StoreDiff8p 6
paddw %1, %3
psraw %1, $6
movq %2, %6
punpcklbw %2, %4
paddsw %2, %1
packuswb %2, %2
movq %5, %2
%endmacro
%macro SSE2_StoreDiff8p 5
movq %2, %5
punpcklbw %2, %3
paddsw %2, %1
packuswb %2, %2
movq %4, %2
%endmacro
%macro SSE2_Load8DC 6
movdqa %1, %6 ; %1 = dc0 dc1
2013-12-09 13:51:09 +01:00
paddw %1, %5
psraw %1, $6 ; (dc + 32) >> 6
2013-12-09 13:51:09 +01:00
movdqa %2, %1
psrldq %2, 4
punpcklwd %2, %2
punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
2013-12-09 13:51:09 +01:00
movdqa %3, %1
psrldq %3, 8
punpcklwd %3, %3
punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
2013-12-09 13:51:09 +01:00
movdqa %4, %1
psrldq %4, 12
punpcklwd %4, %4
punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
2013-12-09 13:51:09 +01:00
punpcklwd %1, %1
punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
2013-12-09 13:51:09 +01:00
%endmacro
%macro SSE2_DCT 6
SSE2_SumSub %6, %3, %5
SSE2_SumSub %1, %2, %5
SSE2_SumSub %3, %2, %5
SSE2_SumSubMul2 %6, %1, %4
2013-12-09 13:51:09 +01:00
%endmacro
%macro SSE2_IDCT 7
SSE2_SumSub %7, %2, %6
SSE2_SumSubDiv2 %1, %3, %5, %4
SSE2_SumSub %2, %1, %5
2013-12-09 13:51:09 +01:00
SSE2_SumSub %7, %4, %5
%endmacro
;***********************************************************************
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
WELS_EXTERN WelsDctFourT4_sse2
ALIGN 16
WelsDctFourT4_sse2:
2014-01-03 07:49:45 +01:00
;push ebx
;push esi
;mov esi, [esp+12]
;mov eax, [esp+16] ; pix1
;mov ebx, [esp+20] ; i_pix1
;mov ecx, [esp+24] ; pix2
;mov edx, [esp+28] ; i_pix2
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
movsx r2, r2d
movsx r4, r4d
%endif
2013-12-09 13:51:09 +01:00
pxor xmm7, xmm7
;Load 4x8
2014-01-03 07:49:45 +01:00
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
2013-12-09 13:51:09 +01:00
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
2013-12-09 13:51:09 +01:00
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
2014-01-03 07:49:45 +01:00
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
2014-01-03 07:49:45 +01:00
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
2013-12-09 13:51:09 +01:00
;Load 4x8
2014-01-03 07:49:45 +01:00
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
2013-12-09 13:51:09 +01:00
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
2013-12-09 13:51:09 +01:00
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
2014-01-03 07:49:45 +01:00
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
2014-01-03 07:49:45 +01:00
;pop esi
;pop ebx
LOAD_5_PARA_POP
2013-12-09 13:51:09 +01:00
ret
2014-01-03 07:49:45 +01:00
;%define rec esp + pushsize + 4
;%define stride esp + pushsize + 8
;%define pred esp + pushsize + 12
;%define pred_stride esp + pushsize + 16
;%define rs esp + pushsize + 20
2013-12-09 13:51:09 +01:00
;***********************************************************************
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
ALIGN 16
WelsIDctFourT4Rec_sse2:
2014-01-03 07:49:45 +01:00
;%define pushsize 8
; push ebx
; push esi
; mov eax, [rec]
; mov ebx, [stride]
; mov ecx, [pred]
; mov edx, [pred_stride]
; mov esi, [rs]
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
%endif
2013-12-09 13:51:09 +01:00
;Load 4x8
2014-01-03 07:49:45 +01:00
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
2013-12-09 13:51:09 +01:00
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
2013-12-09 13:51:09 +01:00
WELS_Zero xmm7
WELS_DW32 xmm6
2014-01-03 07:49:45 +01:00
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
2014-01-03 07:49:45 +01:00
add r4, 64
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
2013-12-09 13:51:09 +01:00
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
2013-12-09 13:51:09 +01:00
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
2014-01-03 07:49:45 +01:00
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
LOAD_5_PARA_POP
; pop esi
; pop ebx
2013-12-09 13:51:09 +01:00
ret
2013-12-09 13:51:09 +01:00
%macro SSE2_StoreDiff4x8p 8
SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
2013-12-09 13:51:09 +01:00
SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
2013-12-09 13:51:09 +01:00
%endmacro
2013-12-09 13:51:09 +01:00
;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
ALIGN 16
2014-01-03 07:49:45 +01:00
;%define pushsize 8
;%define luma_dc esp + pushsize + 20
2013-12-09 13:51:09 +01:00
WelsIDctRecI16x16Dc_sse2:
2014-01-03 07:49:45 +01:00
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
%endif
; push esi
; push edi
;mov ecx, [luma_dc] ; r4
;mov eax, [rec] ; r0
;mov edx, [stride] ; r1
;mov esi, [pred]; r2
;mov edi, [pred_stride]; r3
2013-12-09 13:51:09 +01:00
pxor xmm7, xmm7
WELS_DW32 xmm6
2014-01-03 07:49:45 +01:00
SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
2014-01-03 07:49:45 +01:00
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
2014-01-03 07:49:45 +01:00
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
2014-01-03 07:49:45 +01:00
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
2014-01-03 07:49:45 +01:00
SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
2014-01-03 07:49:45 +01:00
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
2014-01-03 07:49:45 +01:00
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
2014-01-03 07:49:45 +01:00
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
LOAD_5_PARA_POP
;pop edi
;pop esi
2013-12-09 13:51:09 +01:00
ret
%macro SSE2_SumSubD 3
movdqa %3, %2
paddd %2, %1
psubd %1, %3
%endmacro
%macro SSE2_SumSubDiv2D 4
paddd %1, %2
paddd %1, %3
psrad %1, 1
movdqa %4, %1
psubd %4, %2
%endmacro
%macro SSE2_Load4Col 5
2014-01-03 07:49:45 +01:00
movsx r2, WORD[%5]
movd %1, r2d
movsx r2, WORD[%5 + 0x20]
movd %2, r2d
2013-12-09 13:51:09 +01:00
punpckldq %1, %2
2014-01-03 07:49:45 +01:00
movsx r2, WORD[%5 + 0x80]
movd %3, r2d
movsx r2, WORD[%5 + 0xa0]
movd %4, r2d
2013-12-09 13:51:09 +01:00
punpckldq %3, %4
punpcklqdq %1, %3
%endmacro
2013-12-09 13:51:09 +01:00
;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
WelsHadamardT4Dc_sse2:
2014-01-03 07:49:45 +01:00
;mov eax, [esp + 4] ; luma_dc
;mov ecx, [esp + 8] ; pDct
%assign push_num 0
LOAD_2_PARA
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
2013-12-09 13:51:09 +01:00
SSE2_SumSubD xmm1, xmm2, xmm7
SSE2_SumSubD xmm3, xmm4, xmm7
SSE2_SumSubD xmm2, xmm4, xmm7
SSE2_SumSubD xmm1, xmm3, xmm7
2013-12-09 13:51:09 +01:00
SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
2013-12-09 13:51:09 +01:00
SSE2_SumSubD xmm4, xmm3, xmm7
SSE2_SumSubD xmm5, xmm1, xmm7
WELS_DD1 xmm6
2013-12-09 13:51:09 +01:00
SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
packssdw xmm3, xmm4
packssdw xmm2, xmm1
2014-01-03 07:49:45 +01:00
movdqa [r0+ 0], xmm3
movdqa [r0+16], xmm2
ret