p1 and p2 are int32_t. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: James Almer <jamrial@gmail.com>
		
			
				
	
	
		
			117 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
			
		
		
	
	
			117 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
;******************************************************************************
;* TAK DSP SIMD optimizations
;*
;* Copyright (C) 2015 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 | 
						|
 | 
						|
%include "libavutil/x86/x86util.asm"
 | 
						|
 | 
						|
SECTION_RODATA

; Four packed dwords, each 128. tak_decorrelate_sf loads this vector and adds
; it to the product right before its arithmetic shift right by 8.
pd_128: dd 128, 128, 128, 128

SECTION .text
 | 
						|
 | 
						|
INIT_XMM sse2
;------------------------------------------------------------------------------
; tak_decorrelate_ls(p1, p2, length)
;
; Per 32-bit element:  p2[i] = p1[i] + p2[i]        (p1 is only read)
;
;   p1, p2  pointers to 32-bit integer samples. Every access goes through
;           mova or a packed memory operand, so both buffers are presumably
;           expected to be vector-aligned -- confirm against the caller.
;   length  number of elements (32-bit value).
;
; The counter is tested only after a pass, and each pass covers 2*mmsize
; bytes (8 dwords with XMM registers). At least one pass always runs and the
; element count is effectively rounded up to whole passes.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
    shl     lengthd, 2                          ; elements -> bytes
    add     p2q, lengthq                        ; p2 -> end of buffer
    add     p1q, lengthq                        ; p1 -> end of buffer
    neg     lengthq                             ; negative offset, runs up to 0
.next_pair:
    ; the sum is commutative: start from p2 and add p1 from memory
    mova    m1, [p2q+lengthq+mmsize]
    mova    m0, [p2q+lengthq]
    paddd   m1, [p1q+lengthq+mmsize]
    paddd   m0, [p1q+lengthq]
    mova    [p2q+lengthq], m0
    mova    [p2q+lengthq+mmsize], m1
    add     lengthq, mmsize*2
    jl      .next_pair                          ; loop while offset < 0
    REP_RET
 | 
						|
 | 
						|
;------------------------------------------------------------------------------
; tak_decorrelate_sr(p1, p2, length)
;
; Per 32-bit element:  p1[i] = p2[i] - p1[i]        (p2 is only read)
;
;   p1, p2  pointers to 32-bit integer samples. Every access goes through
;           mova or a packed memory operand, so both buffers are presumably
;           expected to be vector-aligned -- confirm against the caller.
;   length  number of elements (32-bit value).
;
; The counter is tested only after a pass, and each pass covers 2*mmsize
; bytes (8 dwords with XMM registers). At least one pass always runs and the
; element count is effectively rounded up to whole passes.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
    shl     lengthd, 2                          ; elements -> bytes
    add     p2q, lengthq                        ; p2 -> end of buffer
    add     p1q, lengthq                        ; p1 -> end of buffer
    neg     lengthq                             ; negative offset, runs up to 0

.next_pair:
    mova    m1, [p2q+lengthq+mmsize]            ; minuend comes from p2
    mova    m0, [p2q+lengthq]
    psubd   m1, [p1q+lengthq+mmsize]            ; p2 - p1
    psubd   m0, [p1q+lengthq]
    mova    [p1q+lengthq], m0                   ; result overwrites p1
    mova    [p1q+lengthq+mmsize], m1
    add     lengthq, mmsize*2
    jl      .next_pair                          ; loop while offset < 0
    REP_RET
 | 
						|
 | 
						|
;------------------------------------------------------------------------------
; tak_decorrelate_sm(p1, p2, length)
;
; Per 32-bit element, with a = p1[i] and b = p2[i]:
;     a     -= b >> 1           (arithmetic shift)
;     p1[i]  = a
;     p2[i]  = a + b
;
;   p1, p2  pointers to 32-bit integer samples, both rewritten. Every access
;           goes through mova, so both buffers are presumably expected to be
;           vector-aligned -- confirm against the caller.
;   length  number of elements (32-bit value).
;
; The counter is tested only after a pass, and each pass covers 2*mmsize
; bytes (8 dwords with XMM registers). At least one pass always runs and the
; element count is effectively rounded up to whole passes.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
    shl     lengthd, 2                          ; elements -> bytes
    add     p2q, lengthq                        ; p2 -> end of buffer
    add     p1q, lengthq                        ; p1 -> end of buffer
    neg     lengthq                             ; negative offset, runs up to 0

.next_pair:
    ; read everything for this pass before anything is written back
    mova    m0, [p1q+lengthq]                   ; a, first vector
    mova    m3, [p1q+lengthq+mmsize]            ; a, second vector
    mova    m1, [p2q+lengthq]                   ; b, first vector
    mova    m4, [p2q+lengthq+mmsize]            ; b, second vector

    ; first vector
    mova    m2, m1
    psrad   m2, 1                               ; b >> 1
    psubd   m0, m2                              ; a -= b >> 1
    paddd   m1, m0                              ; b += updated a

    ; second vector, same steps
    mova    m5, m4
    psrad   m5, 1
    psubd   m3, m5
    paddd   m4, m3

    mova    [p1q+lengthq], m0
    mova    [p2q+lengthq], m1
    mova    [p1q+lengthq+mmsize], m3
    mova    [p2q+lengthq+mmsize], m4
    add     lengthq, mmsize*2
    jl      .next_pair                          ; loop while offset < 0
    REP_RET
 | 
						|
 | 
						|
INIT_XMM sse4
;------------------------------------------------------------------------------
; tak_decorrelate_sf(p1, p2, length, dshift, dfactor)
;
; Per 32-bit element, with a = p1[i] and b = p2[i]:
;     b     = ((((b >> dshift) * dfactor) + 128) >> 8) << dshift
;     p1[i] = b - a                                 (p2 is only read)
; Right shifts are arithmetic (psrad), the multiply keeps the low 32 bits
; of each product (pmulld), and the left shift is pslld.
;
;   p1, p2   pointers to 32-bit integer samples. Accesses use mova or a
;            packed memory operand, so both buffers are presumably expected
;            to be vector-aligned -- confirm against the caller.
;   length   number of elements (32-bit value).
;   dshift   shift count, applied to all lanes through m2.
;   dfactor  multiplier, broadcast to every lane of m3.
;
; The counter is tested only after a pass, and each pass covers mmsize bytes
; (4 dwords with XMM registers). At least one pass always runs and the
; element count is effectively rounded up to whole passes.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
    ; loop-invariant operands
    mova    m4, [pd_128]                        ; +128 in every lane
    movd    m3, dfactorm
    movd    m2, dshiftm                         ; shift count for psrad/pslld
    pshufd  m3, m3, 0                           ; dfactor in every lane

    shl     lengthd, 2                          ; elements -> bytes
    add     p2q, lengthq                        ; p2 -> end of buffer
    add     p1q, lengthq                        ; p1 -> end of buffer
    neg     lengthq                             ; negative offset, runs up to 0

.next_vec:
    mova    m1, [p2q+lengthq]                   ; b
    psrad   m1, m2                              ; b >> dshift
    pmulld  m1, m3                              ; * dfactor
    paddd   m1, m4                              ; + 128
    psrad   m1, 8                               ; >> 8
    pslld   m1, m2                              ; << dshift
    psubd   m1, [p1q+lengthq]                   ; b - a
    mova    [p1q+lengthq], m1
    add     lengthq, mmsize
    jl      .next_vec                           ; loop while offset < 0
    REP_RET
 |