Sindre Aamås 98042f1600 [Decoder] Use encoder x86 IDCT routines
Move asm routines to common. Delete obsolete decoder routines.

Use wider routines where applicable.

~1.07x overall faster decode on a quick 720p30 4Mbps test on Haswell.
2016-03-09 10:41:42 +01:00

109 lines
3.7 KiB
NASM

;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* ?Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* ?Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* dct.asm
;*
;* History
;* 8/4/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
SECTION .text
;***********************************************************************
; SSE2 functions
;***********************************************************************
%macro SSE2_SumSubD 3
movdqa %3, %2
paddd %2, %1
psubd %1, %3
%endmacro
%macro SSE2_SumSubDiv2D 4
paddd %1, %2
paddd %1, %3
psrad %1, 1
movdqa %4, %1
psubd %4, %2
%endmacro
%macro SSE2_Load4Col 5
movsx r2, WORD[%5]
movd %1, r2d
movsx r2, WORD[%5 + 0x20]
movd %2, r2d
punpckldq %1, %2
movsx r2, WORD[%5 + 0x80]
movd %3, r2d
movsx r2, WORD[%5 + 0xa0]
movd %4, r2d
punpckldq %3, %4
punpcklqdq %1, %3
%endmacro
;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
%assign push_num 0
LOAD_2_PARA
PUSH_XMM 8
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
SSE2_SumSubD xmm1, xmm2, xmm7
SSE2_SumSubD xmm3, xmm4, xmm7
SSE2_SumSubD xmm2, xmm4, xmm7
SSE2_SumSubD xmm1, xmm3, xmm7
SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
SSE2_SumSubD xmm4, xmm3, xmm7
SSE2_SumSubD xmm5, xmm1, xmm7
WELS_DD1 xmm6
SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
packssdw xmm3, xmm4
packssdw xmm2, xmm1
movdqa [r0+ 0], xmm3
movdqa [r0+16], xmm2
POP_XMM
ret