ffmpeg/libavcodec/arm/dcadsp_vfp.S

221 lines
7.0 KiB
ArmAsm
Raw Normal View History

/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
POUT .req a1
PIN .req a2
PCOEF .req a3
DECIFACTOR .req a4
OLDFPSCR .req a4
COUNTER .req ip
SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
IN0 .req s4
IN1 .req s5
IN2 .req s6
IN3 .req s7
IN4 .req s0
IN5 .req s1
IN6 .req s2
IN7 .req s3
COEF0 .req s8 @ coefficient elements
COEF1 .req s9
COEF2 .req s10
COEF3 .req s11
COEF4 .req s12
COEF5 .req s13
COEF6 .req s14
COEF7 .req s15
ACCUM0 .req s16 @ double-buffered multiply-accumulate results
ACCUM4 .req s20
POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
POST1 .req s25
POST2 .req s26
POST3 .req s27
.macro inner_loop decifactor, dir, tail, head
.ifc "\dir","up"
.set X, 0
.set Y, 4
.else
.set X, 4*JMAX*4 - 4
.set Y, -4
.endif
.ifnc "\head",""
vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
.endif
.ifnc "\tail",""
vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
.endif
.ifnc "\head",""
vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
.endif
.ifnc "\tail",""
vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
.endif
.ifnc "\head",""
vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
.ifc "\tail",""
vmul.f ACCUM4, COEF4, IN1 @ vector operation
.endif
vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
.ifnc "\tail",""
vmul.f ACCUM4, COEF4, IN1 @ vector operation
.endif
vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
.endif
.ifnc "\tail",""
vstmia POUT!, {POST0-POST3}
.endif
.ifnc "\head",""
vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
.if \decifactor == 32
vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
.endif
.endif
.endm
.macro dca_lfe_fir decifactor
.if \decifactor == 32
.set JMAX, 8
vpush {s16-s31}
vmov SCALE32, s0 @ duplicate scalar across vector
vldr IN4, [PIN, #-4*4]
vldr IN5, [PIN, #-5*4]
vldr IN6, [PIN, #-6*4]
vldr IN7, [PIN, #-7*4]
.else
.set JMAX, 4
vpush {s16-s27}
.endif
mov COUNTER, #\decifactor/4 - 1
inner_loop \decifactor, up,, head
1: add PCOEF, PCOEF, #4*JMAX*4
subs COUNTER, COUNTER, #1
inner_loop \decifactor, up, tail, head
bne 1b
inner_loop \decifactor, up, tail
mov COUNTER, #\decifactor/4 - 1
inner_loop \decifactor, down,, head
1: sub PCOEF, PCOEF, #4*JMAX*4
subs COUNTER, COUNTER, #1
inner_loop \decifactor, down, tail, head
bne 1b
inner_loop \decifactor, down, tail
.if \decifactor == 32
vpop {s16-s31}
.else
vpop {s16-s27}
.endif
fmxr FPSCR, OLDFPSCR
bx lr
.endm
/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
* int decifactor, float scale)
*/
function ff_dca_lfe_fir_vfp, export=1
teq DECIFACTOR, #32
fmrx OLDFPSCR, FPSCR
ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, ip
NOVFP vldr s0, [sp]
vldr IN0, [PIN, #-0*4]
vldr IN1, [PIN, #-1*4]
vldr IN2, [PIN, #-2*4]
vldr IN3, [PIN, #-3*4]
beq 32f
64: dca_lfe_fir 64
.ltorg
32: dca_lfe_fir 32
endfunc
.unreq POUT
.unreq PIN
.unreq PCOEF
.unreq DECIFACTOR
.unreq OLDFPSCR
.unreq COUNTER
.unreq SCALE32
.unreq SCALE64
.unreq IN0
.unreq IN1
.unreq IN2
.unreq IN3
.unreq IN4
.unreq IN5
.unreq IN6
.unreq IN7
.unreq COEF0
.unreq COEF1
.unreq COEF2
.unreq COEF3
.unreq COEF4
.unreq COEF5
.unreq COEF6
.unreq COEF7
.unreq ACCUM0
.unreq ACCUM4
.unreq POST0
.unreq POST1
.unreq POST2
.unreq POST3