arm: Add VFP-accelerated version of int32_to_float_fmul_scalar

Before After Mean StdDev Mean StdDev Change This function 1175.0 4.4 366.2 18.3 +220.8% Overall 19285.5 292.0 18420.5 489.1 +4.7% Signed-off-by: Martin Storsjö <martin@martin.st>
2013-07-15 18:28:10 +01:00 · 2013-07-15 18:28:10 +01:00 · ce9ed10ac2
commit ce9ed10ac2
parent 41ef1d360b
2 changed files with 48 additions and 0 deletions
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@ -28,6 +28,9 @@
 void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
                                        float mul, int len);
 void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
                                       float mul, int len);
 void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
@ -38,6 +41,13 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
    int cpu_flags = av_get_cpu_flags();
    if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) {
        if (!have_vfpv3(cpu_flags)) {
            // This function doesn't use anything armv6 specific in itself,
            // but ff_float_to_int16_vfp which is in the same assembly source
            // file does, thus the whole file requires armv6 to be built.
            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
        }
        c->float_to_int16 = ff_float_to_int16_vfp;
    }
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
 *
 * This file is part of Libav.
 *
@ -76,3 +77,40 @@ function ff_float_to_int16_vfp, export=1
        vpop            {d8-d11}
        pop             {r4-r8,pc}
 endfunc
 /**
 * ARM VFP optimised int32 to float conversion.
 * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
 * (16 bytes alignment is best for BCM2835), little-endian.
 */
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
 function ff_int32_to_float_fmul_scalar_vfp, export=1
 VFP     tmp     .req    a4
 VFP     len     .req    a3
 NOVFP   tmp     .req    a3
 NOVFP   len     .req    a4
 NOVFP   vmov    s0, a3
        ldr     tmp, =0x03070000           @ RunFast mode, short vectors of length 8, stride 1
        fmrx    ip, FPSCR
        fmxr    FPSCR, tmp
 1:
        vldmia          a2!, {s8-s15}
        vcvt.f32.s32    s8, s8
        vcvt.f32.s32    s9, s9
        vcvt.f32.s32    s10, s10
        vcvt.f32.s32    s11, s11
        vcvt.f32.s32    s12, s12
        vcvt.f32.s32    s13, s13
        vcvt.f32.s32    s14, s14
        vcvt.f32.s32    s15, s15
        vmul.f32        s8, s8, s0
        subs            len, len, #8
        vstmia          a1!, {s8-s11}
        vstmia          a1!, {s12-s15}
        bne             1b
        fmxr    FPSCR, ip
        bx      lr
 endfunc
        .unreq  tmp
        .unreq  len