arm64: port synth_filter_float_neon from arm
~25% faster DTS decoding overall. The checkasm CPU cycle numbers are not that useful since synth_filter_float() calls FFTContext.imdct_half().

                          cortex-a57  cortex-a53
synth_filter_float_c:         1866.2      3490.9
synth_filter_float_neon:       915.0      1531.5

With fftc.imdct_half forced to imdct_half_neon:

                          cortex-a57  cortex-a53
synth_filter_float_c:         1718.4      3025.3
synth_filter_float_neon:       926.2      1530.1
This commit is contained in:
parent
c33c1fa8af
commit
705f5e5e15
@ -16,7 +16,8 @@ OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
|
||||
|
||||
ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
|
||||
|
||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \
|
||||
aarch64/synth_filter_neon.o
|
||||
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
|
||||
|
@ -27,4 +27,7 @@
|
||||
#define CELT_TMP 0x10
|
||||
#define CELT_TWIDDLE (CELT_TMP + 0x8) // loaded as pair
|
||||
|
||||
/* FFTContext */
|
||||
#define IMDCT_HALF 0x48
|
||||
|
||||
#endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */
|
||||
|
@ -22,7 +22,15 @@
|
||||
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/internal.h"
|
||||
#include "libavcodec/dcadsp.h"
|
||||
#include "libavcodec/fft.h"
|
||||
|
||||
#include "asm-offsets.h"
|
||||
|
||||
#if HAVE_NEON || HAVE_VFP
|
||||
AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
|
||||
#endif
|
||||
|
||||
void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
|
||||
void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
|
||||
@ -49,3 +57,11 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
|
||||
s->decode_hf = ff_decode_hf_neon;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Select the AArch64 implementation of the synthesis filter.
 *
 * Queries the runtime CPU flags and, when NEON is reported, points
 * s->synth_filter_float at ff_synth_filter_float_neon. Without NEON
 * the context is left untouched.
 *
 * @param s synthesis filter context whose function pointer may be replaced
 */
av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
{
    const int flags = av_get_cpu_flags();

    /* Nothing to install unless the CPU reports NEON support. */
    if (!have_neon(flags))
        return;

    s->synth_filter_float = ff_synth_filter_float_neon;
}
|
||||
|
119
libavcodec/aarch64/synth_filter_neon.S
Normal file
119
libavcodec/aarch64/synth_filter_neon.S
Normal file
@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "asm-offsets.h"
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
// inner_loop: one multiply-accumulate step over four float lanes.
//
// Loads one vector through each of x8-x11 (set up by the caller as
// synth_buf positions) and one through each of x4-x7 (window positions),
// post-incrementing every pointer by x15 bytes, then updates the
// accumulators:
//   v4 -= v24 * reverse(v28)
//   v5 += v25 * v29
//   v6 += v26 * v30
//   v7 += v27 * reverse(v31)
// where reverse() flips the order of the four lanes (rev64 followed by
// ext #8).
//
// None of the instructions used here write the condition flags, so a
// cmp/subs placed before the macro can still be tested by a branch placed
// after it. Scratch registers: v24-v31.
.macro inner_loop
|
||||
ld1 {v29.4s}, [x9], x15
|
||||
ld1 {v28.4s}, [x8], x15
|
||||
ld1 {v30.4s}, [x10], x15
|
||||
ld1 {v31.4s}, [x11], x15
|
||||
rev64 v28.4s, v28.4s // swap lanes inside each 64-bit half
|
||||
ld1 {v24.4s}, [x4], x15
|
||||
ld1 {v25.4s}, [x5], x15
|
||||
rev64 v31.4s, v31.4s // swap lanes inside each 64-bit half
|
||||
ld1 {v26.4s}, [x6], x15
|
||||
fmla v5.4s, v25.4s, v29.4s
|
||||
ld1 {v27.4s}, [x7], x15
|
||||
ext v28.16b, v28.16b, v28.16b, #8 // swap the halves: v28 is now fully lane-reversed
|
||||
ext v31.16b, v31.16b, v31.16b, #8 // swap the halves: v31 is now fully lane-reversed
|
||||
fmla v6.4s, v26.4s, v30.4s
|
||||
fmls v4.4s, v24.4s, v28.4s
|
||||
fmla v7.4s, v27.4s, v31.4s
|
||||
.endm
|
||||
|
||||
// ff_synth_filter_float_neon
//
// Register use on entry, as read by the code below:
//   x0  FFTContext; the imdct_half function pointer is loaded from
//       offset IMDCT_HALF
//   x1  synth_buf base
//   x2  pointer to the int synth_buf_offset (read, then updated)
//   x3  synth_buf_2
//   x4  window
//   x5  out
//   x6  in (handed to imdct_half as its third argument)
//   s0  scale applied to the values stored to out
// NOTE(review): the C prototype is not visible in this file; confirm the
// mapping against the declaration.
//
// Outline:
//   1. Call imdct_half(x0, synth_buf + offset, in) and store
//      (offset - 32) & 511 back through x2.
//   2. Run four outer iterations. Each one accumulates four lanes with
//      inner_loop, stores a * scale and b * scale to out and out + 16, and
//      stores the new c and d sums to synth_buf_2 and synth_buf_2 + 16.
//
// A 64-byte stack frame keeps x3, x4, x5, the adjusted synth_buf pointer,
// the offset with its low six bits cleared, x30 and s0 across the call.
function ff_synth_filter_float_neon, export=1
|
||||
ldr w7, [x2] // *synth_buf_offset
|
||||
ldr x9, [x0, #IMDCT_HALF] // imdct_half function pointer
|
||||
sxtw x7, w7
|
||||
stp x3, x4, [sp, #-64]! // allocate the frame; save synth_buf_2, window
|
||||
add x1, x1, x7, lsl #2 // synth_buf
|
||||
sub w8, w7, #32
|
||||
stp x5, x1, [sp, #16] // save out, synth_buf + offset
|
||||
bic x7, x7, #63 // offset with the low six bits cleared
|
||||
and w8, w8, #511 // (offset - 32) & 511
|
||||
stp x7, x30, [sp, #32] // save rounded offset, return address
|
||||
str w8, [x2] // write the updated offset back
|
||||
str s0, [sp, #48] // save scale; v0 is not preserved across calls (AAPCS64)
|
||||
|
||||
mov x2, x6 // in
|
||||
|
||||
blr x9 // imdct_half(x0, synth_buf + offset, in)
|
||||
|
||||
ldp x2, x4, [sp] // synth_buf_2, window
|
||||
ldp x13, x9, [sp, #16] // out, synth_buf
|
||||
ldp x0, x30, [sp, #32] // synth_buf_offset & ~63, return address
|
||||
ldr s0, [sp, #48] // scale
|
||||
|
||||
add x3, x2, #16*4 // synth_buf_2 + 16
|
||||
add x14, x13, #16*4 // out + 16
|
||||
add x8, x9, #12*4 // synth_buf + 12; these loads are lane-reversed in inner_loop
|
||||
mov x15, #64*4 // inner_loop pointer stride: 64 floats
|
||||
mov x1, #4 // outer iteration count
|
||||
1:
|
||||
add x10, x9, #16*4 // synth_buf
|
||||
add x11, x8, #16*4
|
||||
add x5, x4, #16*4 // window
|
||||
add x6, x4, #32*4
|
||||
add x7, x4, #48*4
|
||||
|
||||
ld1 {v4.4s}, [x2] // a
|
||||
ld1 {v5.4s}, [x3] // b
|
||||
movi v6.4s, #0 // c
|
||||
movi v7.4s, #0 // d
|
||||
|
||||
mov x12, #512
|
||||
// First pass: step x12 down by 64 and repeat while it is still greater
|
||||
// than the rounded offset in x0. The flags set by cmp survive inner_loop.
|
||||
2:
|
||||
sub x12, x12, #64
|
||||
cmp x12, x0
|
||||
inner_loop
|
||||
b.gt 2b
|
||||
|
||||
// Move the synth_buf pointers back by 512 floats before the second pass.
|
||||
// x10/x11 are only adjusted when a second pass will actually run.
|
||||
sub x8, x8, #512*4
|
||||
sub x9, x9, #512*4
|
||||
cbz x12, 4f
|
||||
sub x10, x10, #512*4
|
||||
sub x11, x11, #512*4
|
||||
// Second pass: continue until x12 reaches zero.
|
||||
3:
|
||||
subs x12, x12, #64
|
||||
inner_loop
|
||||
b.gt 3b
|
||||
4:
|
||||
subs x1, x1, #1 // flags are consumed by b.le below; the fmul/st1 in between leave them alone
|
||||
fmul v4.4s, v4.4s, v0.s[0] // a * scale
|
||||
fmul v5.4s, v5.4s, v0.s[0] // b * scale
|
||||
st1 {v6.4s}, [x2], #16 // c -> synth_buf_2
|
||||
st1 {v7.4s}, [x3], #16 // d -> synth_buf_2 + 16
|
||||
st1 {v4.4s}, [x13], #16 // -> out
|
||||
st1 {v5.4s}, [x14], #16 // -> out + 16
|
||||
b.le 10f
|
||||
|
||||
// Assuming the offset is within 0..511, the two passes ran inner_loop eight
|
||||
// times in total, advancing x4 by 512 floats; net effect below is +4 floats.
|
||||
sub x4, x4, #508*4 // window
|
||||
add x9, x9, #4*4 // synth_buf
|
||||
sub x8, x8, #4*4 // synth_buf
|
||||
b 1b
|
||||
|
||||
10:
|
||||
add sp, sp, #64 // release the frame; x30 was already reloaded after the call
|
||||
ret
|
||||
endfunc
|
@ -60,6 +60,10 @@ av_cold void ff_synth_filter_init(SynthFilterContext *c)
|
||||
{
|
||||
c->synth_filter_float = synth_filter_float;
|
||||
|
||||
if (ARCH_ARM) ff_synth_filter_init_arm(c);
|
||||
if (ARCH_X86) ff_synth_filter_init_x86(c);
|
||||
if (ARCH_AARCH64)
|
||||
ff_synth_filter_init_aarch64(c);
|
||||
if (ARCH_ARM)
|
||||
ff_synth_filter_init_arm(c);
|
||||
if (ARCH_X86)
|
||||
ff_synth_filter_init_x86(c);
|
||||
}
|
||||
|
@ -32,6 +32,7 @@ typedef struct SynthFilterContext {
|
||||
} SynthFilterContext;
|
||||
|
||||
void ff_synth_filter_init(SynthFilterContext *c);
|
||||
void ff_synth_filter_init_aarch64(SynthFilterContext *c);
|
||||
void ff_synth_filter_init_arm(SynthFilterContext *c);
|
||||
void ff_synth_filter_init_x86(SynthFilterContext *c);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user