2013-12-09 13:51:09 +01:00
|
|
|
;*!
|
|
|
|
;* \copy
|
|
|
|
;* Copyright (c) 2009-2013, Cisco Systems
|
|
|
|
;* All rights reserved.
|
|
|
|
;*
|
|
|
|
;* Redistribution and use in source and binary forms, with or without
|
|
|
|
;* modification, are permitted provided that the following conditions
|
|
|
|
;* are met:
|
|
|
|
;*
|
|
|
|
;*     * Redistributions of source code must retain the above copyright
|
|
|
|
;* notice, this list of conditions and the following disclaimer.
|
|
|
|
;*
|
|
|
|
;*     * Redistributions in binary form must reproduce the above copyright
|
|
|
|
;* notice, this list of conditions and the following disclaimer in
|
|
|
|
;* the documentation and/or other materials provided with the
|
|
|
|
;* distribution.
|
|
|
|
;*
|
|
|
|
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
|
|
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
|
|
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
|
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
|
|
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
|
|
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
|
|
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
;* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
;*
|
|
|
|
;*
|
|
|
|
;* dct.asm
|
|
|
|
;*
|
|
|
|
;* Abstract
|
|
|
|
;* WelsDctFourT4_sse2
|
|
|
|
;*
|
|
|
|
;* History
|
|
|
|
;* 8/4/2009 Created
|
|
|
|
;*
|
|
|
|
;*
|
|
|
|
;*************************************************************************/
|
|
|
|
|
|
|
|
%include "asm_inc.asm"
|
|
|
|
|
|
|
|
SECTION .rodata align=16
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
; Constant
|
2013-12-13 09:06:44 +01:00
|
|
|
;***********************************************************************
|
|
|
|
|
2013-12-09 13:51:09 +01:00
|
|
|
align 16
; SSE2_DeQuant8: six distinct rows of eight 16-bit words; every row is
; emitted twice back to back (12 rows, 192 bytes in total).
; NOTE(review): presumably dequantisation scale factors (guess from the name);
; nothing in the visible part of this file references the table.
SSE2_DeQuant8:
    times 2 dw 10, 13, 10, 13, 13, 16, 13, 16
    times 2 dw 11, 14, 11, 14, 14, 18, 14, 18
    times 2 dw 13, 16, 13, 16, 16, 20, 16, 20
    times 2 dw 14, 18, 14, 18, 18, 23, 18, 23
    times 2 dw 16, 20, 16, 20, 20, 25, 20, 25
    times 2 dw 18, 23, 18, 23, 23, 29, 23, 29
|
2013-12-13 09:06:44 +01:00
|
|
|
|
2013-12-09 13:51:09 +01:00
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
; MMX functions
|
2013-12-13 09:06:44 +01:00
|
|
|
;***********************************************************************
|
2013-12-09 13:51:09 +01:00
|
|
|
|
|
|
|
; MMX_LoadDiff4P  dst, tmp, addr1, addr2, zero
;   dst = widen(4 bytes at [addr1]) - widen(4 bytes at [addr2]), as 4 words.
;   %1 dst  (mm) : receives the word differences
;   %2 tmp  (mm) : clobbered (ends up holding the widened bytes of [addr2])
;   %3, %4       : address expressions, used as [%3] and [%4]
;   %5 zero (mm) : must hold zero so that punpcklbw zero-extends the bytes
%macro MMX_LoadDiff4P 5
    movd        %1, [%3]            ; 4 bytes from the first source
    punpcklbw   %1, %5              ; bytes -> words
    movd        %2, [%4]            ; 4 bytes from the second source
    punpcklbw   %2, %5              ; bytes -> words
    psubw       %1, %2              ; dst = first - second
%endmacro
|
|
|
|
|
|
|
|
; MMX_LoadDiff4x4P  d0, d1, d2, d3, pix1, stride1, pix2, stride2, tmp, zero
;   Loads four rows of 4-pixel differences (pix1 - pix2) into d0..d3 using
;   MMX_LoadDiff4P.
;   %1-%4  (mm)  : row differences, one row of 4 words each
;   %5, %7 (gpr) : row pointers; both are ADVANCED by two strides on exit
;   %6, %8 (gpr) : strides for %5 and %7 respectively
;   %9     (mm)  : scratch
;   %10    (mm)  : zero register
%macro MMX_LoadDiff4x4P 10
    ; rows 0 and 1
    MMX_LoadDiff4P  %1, %9, %5,    %7,    %10
    MMX_LoadDiff4P  %2, %9, %5+%6, %7+%8, %10

    ; step both pointers down two rows
    lea     %5, [%5 + 2*%6]
    lea     %7, [%7 + 2*%8]

    ; rows 2 and 3
    MMX_LoadDiff4P  %3, %9, %5,    %7,    %10
    MMX_LoadDiff4P  %4, %9, %5+%6, %7+%8, %10
%endmacro
|
|
|
|
|
|
|
|
; MMX_SumSubMul2  a, b, t      (all mm registers, 16-bit lanes)
;   on exit:  a = 2*a + b
;             b = 2*b            (b is modified)
;             t = a_in - 2*b_in
%macro MMX_SumSubMul2 3
    movq    %3, %1                  ; t = a
    paddw   %1, %1                  ; a = 2a  (same lanes as a 1-bit left shift)
    paddw   %1, %2                  ; a = 2a + b
    paddw   %2, %2                  ; b = 2b
    psubw   %3, %2                  ; t = a_in - 2b
%endmacro
|
|
|
|
|
|
|
|
; MMX_SumSubDiv2  a, b, t      (all mm registers, signed 16-bit lanes)
;   on exit:  t = a + (b >> 1)
;             a = (a >> 1) - b
;             b is left unchanged
%macro MMX_SumSubDiv2 3
    movq    %3, %2                  ; t = b
    psraw   %3, 1                   ; t = b >> 1 (arithmetic)
    paddw   %3, %1                  ; t = a + (b >> 1)
    psraw   %1, 1                   ; a = a >> 1 (arithmetic)
    psubw   %1, %2                  ; a = (a >> 1) - b
%endmacro
|
|
|
|
|
|
|
|
; MMX_SumSub  a, b, t          (all mm registers, 16-bit lanes)
;   on exit:  a = a + b
;             b = b - a_in
;             t = b_in
%macro MMX_SumSub 3
    movq    %3, %2                  ; keep the original b
    psubw   %2, %1                  ; b = b - a
    paddw   %1, %3                  ; a = a + b_in
%endmacro
|
|
|
|
|
|
|
|
; MMX_DCT  s0, s1, s2, s3, out, tmp     (all mm registers)
;   One 4-point butterfly pass over the inputs s0..s3 (%1..%4).
;   On exit:
;       %3 = (s0 + s3) + (s1 + s2)
;       %1 = 2*(s0 - s3) + (s1 - s2)
;       %4 = (s0 + s3) - (s1 + s2)
;       %5 = (s0 - s3) - 2*(s1 - s2)
;   %2 and %6 are clobbered.
%macro MMX_DCT 6
    MMX_SumSub      %4, %1, %6      ; %4 = s0 + s3,  %1 = s0 - s3
    MMX_SumSub      %3, %2, %6      ; %3 = s1 + s2,  %2 = s1 - s2
    MMX_SumSub      %3, %4, %6      ; %3 = sum of sums, %4 = difference of sums
    MMX_SumSubMul2  %1, %2, %5      ; %1 = 2*%1 + %2, %5 = %1 - 2*%2
%endmacro
|
|
|
|
|
|
|
|
; MMX_IDCT  out, x2, x3, x4, x5, tmp    (all mm registers)
;   One 4-point inverse butterfly pass. %1 is written before it is read, so
;   it is a pure output; %6 is scratch; %2 is only read.
;   With e = x4 + x5, f = x5 - x4, g = x3 + (x2 >> 1), h = (x3 >> 1) - x2:
;       %1 = g + e      %4 = e - g
;       %3 = h + f      %5 = f - h
%macro MMX_IDCT 6
    MMX_SumSub      %4, %5, %6      ; %4 = e,  %5 = f
    MMX_SumSubDiv2  %3, %2, %1      ; %1 = g,  %3 = h
    MMX_SumSub      %1, %4, %6      ; %1 = g + e,  %4 = e - g
    MMX_SumSub      %3, %5, %6      ; %3 = h + f,  %5 = f - h
%endmacro
|
|
|
|
|
|
|
|
; MMX_StoreDiff4P  res, tmp, round, zero, dst_mem, pred_mem
;   dst[0..3] = saturate_u8( ((res + round) >> 6) + pred[0..3] )
;   %1 res   (mm)  : 4 residual words; clobbered
;   %2 tmp   (mm)  : scratch, receives the widened prediction bytes
;   %3 round (mm)  : value added before the arithmetic shift by 6
;   %4 zero  (mm)  : must hold zero (used to zero-extend the prediction bytes)
;   %5 dst_mem     : memory operand that receives 4 bytes
;   %6 pred_mem    : memory operand from which 4 prediction bytes are read
%macro MMX_StoreDiff4P 6
    paddw       %1, %3              ; add rounding term
    psraw       %1, 6               ; scale residual down
    movd        %2, %6              ; fetch 4 prediction bytes
    punpcklbw   %2, %4              ; bytes -> words
    paddsw      %1, %2              ; residual + prediction (signed saturate)
    packuswb    %1, %2              ; clamp to 0..255, words -> bytes
    movd        %5, %1              ; write the low 4 bytes
%endmacro
|
2014-01-03 07:49:45 +01:00
|
|
|
SECTION .text
|
2013-12-09 13:51:09 +01:00
|
|
|
ALIGN 16
|
|
|
|
;***********************************************************************
; void __cdecl WelsDctT4_mmx( int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;
;   Transforms the 4x4 block of differences (pix1 - pix2): two
;   butterfly-plus-transpose passes, then writes the 16 resulting 16-bit
;   values (32 bytes) to pDct.
;
;   Register roles after LOAD_5_PARA (macro defined outside this file chunk;
;   it is expected to place the five arguments in r0..r4):
;       r0 = pDct      r1 = pix1      r2 = i_pix1 (stride)
;       r3 = pix2      r4 = i_pix2 (stride)
;   r1 and r3 are advanced by two rows while loading.
;   mm0-mm7 are used; WELSEMMS is invoked before returning.
;***********************************************************************
WELS_EXTERN WelsDctT4_mmx
WelsDctT4_mmx:
    %assign push_num 0
    LOAD_5_PARA
%ifndef X86_32
    ; strides are int32_t: widen them before they are used in addressing
    movsxd  r2, r2d
    movsxd  r4, r4d
%endif
    WELS_Zero   mm7

    ; mm1..mm4 = rows 0..3 of (pix1 - pix2), 4 words each
    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7

    ; first pass: MMX_DCT leaves its results in mm3, mm1, mm4, mm5
    MMX_DCT         mm1, mm2, mm3, mm4, mm5, mm6
    MMX_Trans4x4W   mm3, mm1, mm4, mm5, mm2

    ; second pass on the transposed data
    MMX_DCT         mm3, mm5, mm2, mm4, mm1, mm6
    MMX_Trans4x4W   mm2, mm3, mm4, mm1, mm5

    ; four rows of four coefficients
    movq    [r0 +  0], mm2
    movq    [r0 +  8], mm1
    movq    [r0 + 16], mm5
    movq    [r0 + 24], mm4

    WELSEMMS
    LOAD_5_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
;***********************************************************************
; void __cdecl WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
;
;   Reads 16 coefficients (32 bytes) from rs, runs two transpose-plus-
;   inverse-butterfly passes, then for each of the 4 rows stores
;       rec = saturate_u8( ((residual + round) >> 6) + pred )
;   (see MMX_StoreDiff4P; the rounding term comes from WELS_DW32).
;
;   Register roles after LOAD_5_PARA (macro defined outside this file chunk;
;   it is expected to place the five arguments in r0..r4):
;       r0 = rec       r1 = stride
;       r2 = pred      r3 = pred_stride
;       r4 = rs
;   r0 and r2 are advanced by two rows. mm0-mm7 are used; WELSEMMS is
;   invoked before returning.
;***********************************************************************
WELS_EXTERN WelsIDctT4Rec_mmx
ALIGN 16
WelsIDctT4Rec_mmx:
    %assign push_num 0
    LOAD_5_PARA
%ifndef X86_32
    ; strides are int32_t: widen them before they are used in addressing
    movsxd  r1, r1d
    movsxd  r3, r3d
%endif
    ; load the four coefficient rows
    movq    mm0, [r4 +  0]
    movq    mm1, [r4 +  8]
    movq    mm2, [r4 + 16]
    movq    mm3, [r4 + 24]

    MMX_Trans4x4W   mm0, mm1, mm2, mm3, mm4
    MMX_IDCT        mm1, mm2, mm3, mm4, mm0, mm6
    MMX_Trans4x4W   mm1, mm3, mm0, mm4, mm2
    MMX_IDCT        mm3, mm0, mm4, mm2, mm1, mm6

    WELS_Zero   mm7                 ; zero, for byte -> word widening
    WELS_DW32   mm6                 ; rounding term for the >> 6

    ; rows 0 and 1
    MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0],      [r2]
    MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0 + r1], [r2 + r3]
    lea     r0, [r0 + 2*r1]
    lea     r2, [r2 + 2*r3]
    ; rows 2 and 3
    MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0],      [r2]
    MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0 + r1], [r2 + r3]

    WELSEMMS
    LOAD_5_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
;***********************************************************************
|
|
|
|
; SSE2 functions
|
|
|
|
;***********************************************************************
|
|
|
|
; SSE2_Store4x8p  base, x0, x1, x2, x3, tmp
;   Re-pairs the 64-bit halves of four xmm registers with SSE2_XSawp qdq
;   (macro defined outside this file chunk) and stores the four results at
;   base+0, +16, +32 and +48 through MOVDQ (also defined elsewhere).
;   %1      : base address register
;   %2..%5  : xmm inputs; %2, %3 and %4 are overwritten
;   %6      : xmm scratch, overwritten
%macro SSE2_Store4x8p 6
    SSE2_XSawp  qdq, %2, %3, %6
    SSE2_XSawp  qdq, %4, %5, %3
    MOVDQ   [%1 +  0], %2
    MOVDQ   [%1 + 16], %4
    MOVDQ   [%1 + 32], %6
    MOVDQ   [%1 + 48], %3
%endmacro
|
|
|
|
|
|
|
|
; SSE2_Load4x8p  base, x0, x1, x2, x3, tmp
;   Loads four 16-byte groups from base+0, +16, +32 and +48 through MOVDQ
;   and re-pairs their 64-bit halves with SSE2_XSawp qdq (both macros are
;   defined outside this file chunk).
;   %1      : base address register
;   %2..%6  : xmm registers, all overwritten
%macro SSE2_Load4x8p 6
    MOVDQ   %2, [%1 +  0]
    MOVDQ   %4, [%1 + 16]
    MOVDQ   %6, [%1 + 32]
    MOVDQ   %3, [%1 + 48]
    SSE2_XSawp  qdq, %4, %3, %5
    SSE2_XSawp  qdq, %2, %6, %3
%endmacro
|
|
|
|
|
|
|
|
; SSE2_SumSubMul2  a, b, t     (all xmm registers, 16-bit lanes)
;   on exit:  a = 2*a + b
;             t = a_in - 2*b
;             b is left unchanged
%macro SSE2_SumSubMul2 3
    movdqa  %3, %1                  ; t = a
    psubw   %3, %2                  ; t = a - b
    psubw   %3, %2                  ; t = a - 2b
    paddw   %1, %1                  ; a = 2a
    paddw   %1, %2                  ; a = 2a + b
%endmacro
|
|
|
|
|
|
|
|
; SSE2_SumSubDiv2  a, b, t, d  (all xmm registers, signed 16-bit lanes)
;   on exit:  a = a + (b >> 1)
;             b = b >> 1           (b is modified)
;             t = b_in
;             d = (a_in >> 1) - b_in
%macro SSE2_SumSubDiv2 4
    movdqa  %3, %2                  ; t = b
    psraw   %2, 1                   ; b = b >> 1 (arithmetic)
    movdqa  %4, %1                  ; d = a
    paddw   %1, %2                  ; a = a + (b >> 1)
    psraw   %4, 1                   ; d = a_in >> 1 (arithmetic)
    psubw   %4, %3                  ; d = (a_in >> 1) - b_in
%endmacro
|
|
|
|
|
|
|
|
; SSE2_StoreDiff8p (6-operand form)  res, tmp, round, zero, dst_mem, pred_mem
;   dst[0..7] = saturate_u8( ((res + round) >> 6) + pred[0..7] )
;   %1 res   (xmm) : 8 residual words; left holding (res + round) >> 6
;   %2 tmp   (xmm) : scratch
;   %3 round (xmm) : value added before the arithmetic shift by 6
;   %4 zero  (xmm) : must hold zero (used to zero-extend the prediction bytes)
;   %5 dst_mem     : memory operand that receives 8 bytes
;   %6 pred_mem    : memory operand from which 8 prediction bytes are read
%macro SSE2_StoreDiff8p 6
    movq        %2, %6              ; fetch 8 prediction bytes
    punpcklbw   %2, %4              ; bytes -> words
    paddw       %1, %3              ; add rounding term
    psraw       %1, 6               ; scale residual down
    paddsw      %2, %1              ; prediction + residual (signed saturate)
    packuswb    %2, %2              ; clamp to 0..255, words -> bytes
    movq        %5, %2              ; write 8 bytes
%endmacro
|
|
|
|
|
|
|
|
; SSE2_StoreDiff8p (5-operand form)  res, tmp, zero, dst_mem, pred_mem
;   dst[0..7] = saturate_u8( res + pred[0..7] )
;   Unlike the 6-operand form, the residual is used as-is (no rounding, no
;   shift) and %1 is not modified.
;   %1 res  (xmm) : 8 residual words, read only
;   %2 tmp  (xmm) : scratch
;   %3 zero (xmm) : must hold zero (used to zero-extend the prediction bytes)
;   %4 dst_mem    : memory operand that receives 8 bytes
;   %5 pred_mem   : memory operand from which 8 prediction bytes are read
%macro SSE2_StoreDiff8p 5
    movq        %2, %5              ; fetch 8 prediction bytes
    punpcklbw   %2, %3              ; bytes -> words
    paddsw      %2, %1              ; prediction + residual (signed saturate)
    packuswb    %2, %2              ; clamp to 0..255, words -> bytes
    movq        %4, %2              ; write 8 bytes
%endmacro
|
|
|
|
|
|
|
|
; SSE2_Load8DC  o0, o1, o2, o3, round, src_mem
;   Loads eight 16-bit DC values dc0..dc7 with an aligned 16-byte load,
;   computes (dc + round) >> 6 for each, and broadcasts them four times each:
;       %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
;       %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
;       %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
;       %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
;   %5 (xmm)  : rounding term, read only
;   %6        : 16-byte aligned memory operand (loaded with movdqa)
%macro SSE2_Load8DC 6
    movdqa      %1, %6
    paddw       %1, %5
    psraw       %1, 6               ; (dc + round) >> 6

    ; one copy of the scaled DCs per output register
    movdqa      %2, %1
    movdqa      %3, %1
    movdqa      %4, %1

    ; bring the wanted pair of words down to the bottom of each copy
    psrldq      %2, 4               ; dc2 dc3 ...
    psrldq      %3, 8               ; dc4 dc5 ...
    psrldq      %4, 12              ; dc6 dc7 ...

    ; duplicate each word, then each dword: a b .. -> a a a a b b b b
    punpcklwd   %1, %1
    punpcklwd   %2, %2
    punpcklwd   %3, %3
    punpcklwd   %4, %4
    punpckldq   %1, %1
    punpckldq   %2, %2
    punpckldq   %3, %3
    punpckldq   %4, %4
%endmacro
|
|
|
|
|
|
|
|
; SSE2_DCT  %1 .. %6           (all xmm registers)
;   One butterfly pass built from three SSE2_SumSub steps and one
;   SSE2_SumSubMul2 step. SSE2_SumSub is defined outside this file chunk, so
;   which operand receives the sum and which the difference is not visible
;   here.
;   %5 is the scratch operand of every SSE2_SumSub; %4 receives the third
;   (difference) output of SSE2_SumSubMul2.
%macro SSE2_DCT 6
    SSE2_SumSub     %6, %3, %5
    SSE2_SumSub     %1, %2, %5
    SSE2_SumSub     %3, %2, %5
    SSE2_SumSubMul2 %6, %1, %4      ; %6 = 2*%6 + %1,  %4 = %6 - 2*%1
%endmacro
|
|
|
|
|
|
|
|
; SSE2_IDCT  %1 .. %7          (all xmm registers)
;   One inverse butterfly pass built from SSE2_SumSub (defined outside this
;   file chunk) and SSE2_SumSubDiv2.
;   %4 and %5 are written by SSE2_SumSubDiv2 before being read, so they carry
;   no input; %6 is scratch for the first SSE2_SumSub only.
%macro SSE2_IDCT 7
    SSE2_SumSub     %7, %2, %6
    SSE2_SumSubDiv2 %1, %3, %5, %4  ; %1 = %1 + (%3 >> 1),  %4 = (%1 >> 1) - %3
    SSE2_SumSub     %2, %1, %5
    SSE2_SumSub     %7, %4, %5
%endmacro
|
|
|
|
|
|
|
|
;***********************************************************************
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;
;   Processes eight rows of (pix1 - pix2) differences in two groups of four
;   rows. Each group goes through two SSE2_DCT + SSE2_TransTwo4x4W passes and
;   is written with SSE2_Store4x8p: the first group at pDct, the second at
;   pDct + 64 bytes.
;
;   Register roles after LOAD_5_PARA (macro defined outside this file chunk;
;   it is expected to place the five arguments in r0..r4):
;       r0 = pDct      r1 = pix1      r2 = i_pix1 (stride)
;       r3 = pix2      r4 = i_pix2 (stride)
;   r0, r1 and r3 are modified. xmm0-xmm7 are used.
;   NOTE(review): xmm6 and xmm7 are written and not saved here; they are
;   callee-saved under the Microsoft x64 convention - confirm how Win64
;   builds deal with that.
;***********************************************************************
WELS_EXTERN WelsDctFourT4_sse2
ALIGN 16
WelsDctFourT4_sse2:
    %assign push_num 0
    LOAD_5_PARA
%ifndef X86_32
    ; strides are int32_t: widen them before they are used in addressing
    movsxd  r2, r2d
    movsxd  r4, r4d
%endif
    pxor    xmm7, xmm7              ; zero operand for SSE2_LoadDiff8P

    ; ---- rows 0..3 ----
    SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1],      [r3]
    SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1 + r2], [r3 + r4]
    lea     r1, [r1 + 2 * r2]
    lea     r3, [r3 + 2 * r4]
    SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1],      [r3]
    SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1 + r2], [r3 + r4]

    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0

    SSE2_Store4x8p  r0, xmm4, xmm2, xmm3, xmm0, xmm5

    ; ---- rows 4..7 ----
    lea     r1, [r1 + 2 * r2]
    lea     r3, [r3 + 2 * r4]

    SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1],      [r3]
    SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1 + r2], [r3 + r4]
    lea     r1, [r1 + 2 * r2]
    lea     r3, [r3 + 2 * r4]
    SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1],      [r3]
    SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1 + r2], [r3 + r4]

    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0

    lea     r0, [r0 + 64]           ; second 64-byte half of the output
    SSE2_Store4x8p  r0, xmm4, xmm2, xmm3, xmm0, xmm5

    LOAD_5_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
2014-01-03 07:49:45 +01:00
|
|
|
;%define rec esp + pushsize + 4
|
|
|
|
;%define stride esp + pushsize + 8
|
|
|
|
;%define pred esp + pushsize + 12
|
|
|
|
;%define pred_stride esp + pushsize + 16
|
|
|
|
;%define rs esp + pushsize + 20
|
2013-12-09 13:51:09 +01:00
|
|
|
;***********************************************************************
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;
;   Reconstructs eight rows of eight pixels in two groups of four rows. For
;   each group, 64 bytes of coefficients are loaded from rs (SSE2_Load4x8p),
;   passed twice through SSE2_TransTwo4x4W + SSE2_IDCT, and stored as
;       rec = saturate_u8( ((residual + round) >> 6) + pred )
;   (6-operand SSE2_StoreDiff8p; the rounding term comes from WELS_DW32).
;
;   Register roles after LOAD_5_PARA (macro defined outside this file chunk;
;   it is expected to place the five arguments in r0..r4):
;       r0 = rec       r1 = stride
;       r2 = pred      r3 = pred_stride
;       r4 = rs
;   r0, r2 and r4 are modified. xmm0-xmm7 are used.
;   NOTE(review): xmm6 and xmm7 are written and not saved here; they are
;   callee-saved under the Microsoft x64 convention - confirm how Win64
;   builds deal with that.
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
ALIGN 16
WelsIDctFourT4Rec_sse2:
    %assign push_num 0
    LOAD_5_PARA
%ifndef X86_32
    ; strides are int32_t: widen them before they are used in addressing
    movsxd  r1, r1d
    movsxd  r3, r3d
%endif

    ; ---- rows 0..3 ----
    SSE2_Load4x8p   r4, xmm0, xmm1, xmm4, xmm2, xmm5

    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

    WELS_Zero   xmm7                ; zero, for byte -> word widening
    WELS_DW32   xmm6                ; rounding term (xmm6 was scratch above)

    SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0],      [r2]
    SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0],      [r2]
    SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]

    ; ---- rows 4..7 ----
    add     r4, 64                  ; next 64 bytes of coefficients
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_Load4x8p   r4, xmm0, xmm1, xmm4, xmm2, xmm5

    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

    WELS_Zero   xmm7
    WELS_DW32   xmm6                ; reload: SSE2_IDCT used xmm6 as scratch

    SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0],      [r2]
    SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0],      [r2]
    SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]

    LOAD_5_PARA_POP
    ret
|
2013-12-13 09:06:44 +01:00
|
|
|
|
2013-12-09 13:51:09 +01:00
|
|
|
; SSE2_StoreDiff4x8p  resL, resR, tmp, zero, dst, pred, dst_stride, pred_stride
;   Reconstructs a 16-pixel-wide, 2-row strip with the 5-operand
;   SSE2_StoreDiff8p: columns 0..7 of both rows use the residual words in %1,
;   columns 8..15 of both rows use those in %2.
;   %1, %2 (xmm) : residual words, read only
;   %3     (xmm) : scratch
;   %4     (xmm) : zero register
;   %5, %6 (gpr) : destination / prediction row pointers (not modified)
;   %7, %8 (gpr) : destination / prediction strides
%macro SSE2_StoreDiff4x8p 8
    ; left 8 columns: row 0, then row 1
    SSE2_StoreDiff8p    %1, %3, %4, [%5],           [%6]
    SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],      [%6 + %8]
    ; right 8 columns: row 0, then row 1
    SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],       [%6 + 8]
    SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],  [%6 + %8 + 8]
%endmacro
|
2013-12-13 09:06:44 +01:00
|
|
|
|
2013-12-09 13:51:09 +01:00
|
|
|
;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;
;   DC-only reconstruction of a 16x16 area. Sixteen DC values are read from
;   dct_dc in two aligned 16-byte loads; each is scaled to (dc + round) >> 6
;   and broadcast over four columns (SSE2_Load8DC). Every broadcast row is
;   then added to four consecutive rows of pred and stored to rec with
;   unsigned-byte saturation (SSE2_StoreDiff4x8p handles two rows per call).
;
;   Register roles after LOAD_5_PARA (macro defined outside this file chunk;
;   it is expected to place the five arguments in r0..r4):
;       r0 = rec       r1 = stride
;       r2 = pred      r3 = pred_stride
;       r4 = dct_dc    (must be 16-byte aligned: loaded with movdqa)
;   r0 and r2 are advanced two rows at a time. xmm0-xmm3 and xmm5-xmm7 are
;   used.
;   NOTE(review): xmm6 and xmm7 are written and not saved here; they are
;   callee-saved under the Microsoft x64 convention - confirm how Win64
;   builds deal with that.
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
ALIGN 16
WelsIDctRecI16x16Dc_sse2:
    %assign push_num 0
    LOAD_5_PARA
%ifndef X86_32
    ; strides are int32_t: widen them before they are used in addressing
    movsxd  r1, r1d
    movsxd  r3, r3d
%endif
    pxor        xmm7, xmm7          ; zero, for byte -> word widening
    WELS_DW32   xmm6                ; rounding term for SSE2_Load8DC

    ; ---- DC values 0..7: rows 0..7 ----
    SSE2_Load8DC        xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
    SSE2_StoreDiff4x8p  xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3      ; rows 0-1

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p  xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3      ; rows 2-3

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p  xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3      ; rows 4-5

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p  xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3      ; rows 6-7

    ; ---- DC values 8..15: rows 8..15 ----
    SSE2_Load8DC        xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p  xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3      ; rows 8-9

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p  xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3      ; rows 10-11

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p  xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3      ; rows 12-13

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p  xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3      ; rows 14-15

    LOAD_5_PARA_POP
    ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
; SSE2_SumSubD  a, b, t        (all xmm registers, 32-bit lanes)
;   on exit:  b = a + b
;             a = a - b_in
;             t = b_in
%macro SSE2_SumSubD 3
    movdqa  %3, %2                  ; keep the original b
    paddd   %2, %1                  ; b = b + a
    psubd   %1, %3                  ; a = a - b_in
%endmacro
|
|
|
|
|
|
|
|
; SSE2_SumSubDiv2D  a, b, round, d   (all xmm registers, signed 32-bit lanes)
;   on exit:  a = (a + b + round) >> 1        (arithmetic shift)
;             d = a_out - b
;   b and round are read only.
%macro SSE2_SumSubDiv2D 4
    paddd   %1, %2                  ; a + b
    paddd   %1, %3                  ; ... + round
    psrad   %1, 1                   ; halve
    movdqa  %4, %1
    psubd   %4, %2                  ; d = halved sum - b
%endmacro
|
|
|
|
; SSE2_Load4Col  dst, t1, t2, t3, addr
;   Gathers four 16-bit values from [addr], [addr+0x20], [addr+0x80] and
;   [addr+0xa0], sign-extends each, and packs them in that order as the four
;   32-bit lanes of dst (lowest lane first).
;   %1 dst (xmm)     : result
;   %2..%4 (xmm)     : scratch, overwritten
;   %5               : address expression
;   Clobbers r2, which is used as the integer scratch register.
%macro SSE2_Load4Col 5
    ; lanes 0 and 1
    movsx       r2, WORD [%5]
    movd        %1, r2d
    movsx       r2, WORD [%5 + 0x20]
    movd        %2, r2d
    punpckldq   %1, %2

    ; lanes 2 and 3
    movsx       r2, WORD [%5 + 0x80]
    movd        %3, r2d
    movsx       r2, WORD [%5 + 0xa0]
    movd        %4, r2d
    punpckldq   %3, %4

    punpcklqdq  %1, %3              ; dst = lane0 lane1 lane2 lane3
%endmacro
|
2013-12-13 09:06:44 +01:00
|
|
|
|
2013-12-09 13:51:09 +01:00
|
|
|
;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;
;   Gathers 16 16-bit values from pDct (SSE2_Load4Col at byte offsets
;   0, 0x40, 0x100 and 0x140, each macro call reading +0, +0x20, +0x80,
;   +0xa0), widens them to 32 bits, applies two sum/difference butterfly
;   passes separated by a 4x4 dword transpose, halves the results with
;   rounding in the second pass, transposes again and stores the 16 results
;   as saturated 16-bit words (32 bytes) at luma_dc.
;
;   Register roles after LOAD_2_PARA (macro defined outside this file chunk;
;   it is expected to place the two arguments in r0 and r1):
;       r0 = luma_dc   (must be 16-byte aligned: written with movdqa)
;       r1 = pDct
;   r2 is clobbered by SSE2_Load4Col. xmm0-xmm7 are used.
;   NOTE(review): xmm6 and xmm7 are written and not saved here; they are
;   callee-saved under the Microsoft x64 convention - confirm how Win64
;   builds deal with that.
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
ALIGN 16
WelsHadamardT4Dc_sse2:
    %assign push_num 0
    LOAD_2_PARA

    ; gather the 16 inputs, four per register, as 32-bit lanes
    SSE2_Load4Col   xmm1, xmm5, xmm6, xmm0, r1
    SSE2_Load4Col   xmm2, xmm5, xmm6, xmm0, r1 + 0x40
    SSE2_Load4Col   xmm3, xmm5, xmm6, xmm0, r1 + 0x100
    SSE2_Load4Col   xmm4, xmm5, xmm6, xmm0, r1 + 0x140

    ; first butterfly pass
    SSE2_SumSubD    xmm1, xmm2, xmm7
    SSE2_SumSubD    xmm3, xmm4, xmm7
    SSE2_SumSubD    xmm2, xmm4, xmm7
    SSE2_SumSubD    xmm1, xmm3, xmm7

    SSE2_Trans4x4D  xmm4, xmm2, xmm1, xmm3, xmm5   ; pOut: xmm4,xmm3,xmm5,xmm1

    ; second butterfly pass, with rounding halve
    SSE2_SumSubD    xmm4, xmm3, xmm7
    SSE2_SumSubD    xmm5, xmm1, xmm7

    WELS_DD1        xmm6                            ; rounding term for the halving step
    SSE2_SumSubDiv2D    xmm3, xmm1, xmm6, xmm0      ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
    SSE2_SumSubDiv2D    xmm4, xmm5, xmm6, xmm1      ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
    SSE2_Trans4x4D  xmm3, xmm0, xmm1, xmm4, xmm2   ; pOut: xmm3,xmm4,xmm2,xmm1

    ; narrow 32-bit lanes back to signed 16-bit with saturation and store
    packssdw    xmm3, xmm4
    packssdw    xmm2, xmm1
    movdqa      [r0 +  0], xmm3
    movdqa      [r0 + 16], xmm2

    ret
|