/* ffmpeg/libavutil/x86/float_dsp_init.c */

/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
#include "asm.h"

/*
 * Prototypes for the optimized routines that ff_float_dsp_init_x86() installs
 * into AVFloatDSPContext. None of them is defined in this part of the file;
 * they are selected under the EXTERNAL_SSE / EXTERNAL_SSE2 / EXTERNAL_AVX
 * checks, so presumably they live in external assembly (confirm against the
 * build files). The suffix names the instruction set each variant is gated on.
 */

/* Candidates for fdsp->vector_fmul. */
void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
                        int len);
void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
                        int len);

/* Candidates for fdsp->vector_fmac_scalar. */
void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
                               int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
                               int len);

/* Candidate for fdsp->vector_fmul_scalar (only an SSE variant is declared). */
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
                               int len);

/* Candidates for fdsp->vector_dmul_scalar (double precision). */
void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
                                double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
                               double mul, int len);

/* Candidates for fdsp->vector_fmul_add. */
void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

/* Candidates for fdsp->vector_fmul_reverse. */
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

/* Candidate for fdsp->scalarproduct_float (only an SSE variant is declared). */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

/* Candidate for fdsp->butterflies_float (only an SSE variant is declared). */
void ff_butterflies_float_sse(float *src0, float *src1, int len);

#if HAVE_6REGS && HAVE_INLINE_ASM
/**
 * Windowed multiply of two input vectors, AMD 3DNow!-extended inline asm.
 *
 * Two byte offsets walk the buffers from both ends at once: i starts at the
 * first element and moves forward, j starts at the last pair and moves
 * backward. Each iteration handles two floats at each end and stores
 *   front pair: src0 * win_back  - src1 * win_front
 *   back  pair: src0 * win_front + src1 * win_back
 * where the "back" operands are read in reversed element order (pswapd).
 *
 * @param dst  output; the address arithmetic covers 2 * len floats
 * @param src0 first input, len floats are read
 * @param src1 second input, len floats are read (from the end backwards)
 * @param win  window; 2 * len floats are read
 * @param len  element count; the loop body always runs at least once and
 *             advances 2 floats per iteration, so the buffers are covered
 *             exactly only when len is a positive multiple of 2
 *
 * NOTE(review): mm0-mm5 are overwritten without being listed as clobbers;
 * only the memory side effect is declared.
 */
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    x86_reg i = -len * 4;       /* forward byte offset, relative to base + len */
    x86_reg j =  len * 4 - 8;   /* backward byte offset of the last float pair */
    __asm__ volatile (
        "1:                             \n"
        "pswapd (%5, %1), %%mm1         \n"
        "movq   (%5, %0), %%mm0         \n"
        "pswapd (%4, %1), %%mm5         \n"
        "movq   (%3, %0), %%mm4         \n"
        "movq      %%mm0, %%mm2         \n"
        "movq      %%mm1, %%mm3         \n"
        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2         \n"
        "pfsub     %%mm0, %%mm1         \n"
        "pswapd    %%mm2, %%mm2         \n"
        "movq      %%mm1, (%2, %0)      \n"
        "movq      %%mm2, (%2, %1)      \n"
        "sub          $8, %1            \n"
        "add          $8, %0            \n"
        "jl           1b                \n" // loop while i is still negative
        "femms                          \n" // leave MMX state before returning
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
        /* The asm loads and stores through the pointers above, which are
         * passed as plain registers; tell the compiler memory is touched so
         * it does not cache or reorder accesses across this statement. */
        : "memory"
    );
}

/**
 * Windowed multiply of two input vectors, SSE inline asm.
 *
 * Two byte offsets walk the buffers from both ends at once: i starts at the
 * first element and moves forward, j starts at the last group of four and
 * moves backward. Each iteration handles four floats at each end and stores
 *   front group: src0 * win_back  - src1 * win_front
 *   back  group: src0 * win_front + src1 * win_back
 * where the "back" operands are element-reversed with shufps $0x1b.
 *
 * All loads and stores use movaps, so every address formed from dst, src0,
 * src1 and win must be 16-byte aligned.
 *
 * @param dst  output; the address arithmetic covers 2 * len floats
 * @param src0 first input, len floats are read
 * @param src1 second input, len floats are read (from the end backwards)
 * @param win  window; 2 * len floats are read
 * @param len  element count; the loop body always runs at least once and
 *             advances 4 floats per iteration, so the buffers are covered
 *             exactly only when len is a positive multiple of 4
 *
 * NOTE(review): xmm0-xmm5 are overwritten without being listed as clobbers;
 * only the memory side effect is declared.
 */
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;       /* forward byte offset, relative to base + len */
    x86_reg j =  len * 4 - 16;  /* backward byte offset of the last 4 floats   */
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n" // reverse the back window group
        "shufps $0x1b, %%xmm5, %%xmm5   \n" // reverse the src1 group
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n" // back to memory order for the store
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n" // loop while i is still negative
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
        /* The asm loads and stores through the pointers above, which are
         * passed as plain registers; tell the compiler memory is touched so
         * it does not cache or reorder accesses across this statement. */
        : "memory"
    );
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */

/**
 * Fill in the x86-specific entries of a float DSP context.
 *
 * The CPU flags are queried once and each feature check overwrites the
 * function pointers it has an implementation for. Checks run in a fixed
 * order, so when several match, the assignment made last is the one that
 * stays (SSE over 3DNow!ext for the window function, AVX over SSE/SSE2 for
 * the routines that have an AVX variant).
 *
 * @param fdsp context whose function pointers are updated in place
 */
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
    const int cpu_flags = av_get_cpu_flags();

#if HAVE_6REGS && HAVE_INLINE_ASM
    /* Inline-asm window kernels; the SSE check comes second and so wins. */
    if (INLINE_AMD3DNOWEXT(cpu_flags))
        fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
    if (INLINE_SSE(cpu_flags))
        fdsp->vector_fmul_window = vector_fmul_window_sse;
#endif

    /* Single-precision routines gated on SSE. */
    if (EXTERNAL_SSE(cpu_flags)) {
        fdsp->vector_fmul         = ff_vector_fmul_sse;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_sse;
        fdsp->vector_fmul_scalar  = ff_vector_fmul_scalar_sse;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_sse;
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
        fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
        fdsp->butterflies_float   = ff_butterflies_float_sse;
    }

    /* Double-precision scalar multiply is gated on SSE2. */
    if (EXTERNAL_SSE2(cpu_flags))
        fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;

    /* AVX variants replace the pointers set above where one is declared. */
    if (EXTERNAL_AVX(cpu_flags)) {
        fdsp->vector_fmul         = ff_vector_fmul_avx;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_avx;
        fdsp->vector_dmul_scalar  = ff_vector_dmul_scalar_avx;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_avx;
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
    }
}
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`/*`
Merge remote-tracking branch 'qatar/master' * qatar/master: float_dsp: ppc: add a separate header for Altivec function prototypes ARM: fix float_dsp breakage from d5a7229 Add a float DSP framework to libavutil PPC: Move types_altivec.h and util_altivec.h from libavcodec to libavutil ARM: Move asm.S from libavcodec to libavutil vc1dsp: mark put/avg_vc1_mspel_mc() always_inline Merged-by: Michael Niedermayer <michaelni@gmx.at> 2012-06-08 23:02:54 +02:00			`* This file is part of FFmpeg.`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`*`
Merge remote-tracking branch 'qatar/master' * qatar/master: float_dsp: ppc: add a separate header for Altivec function prototypes ARM: fix float_dsp breakage from d5a7229 Add a float DSP framework to libavutil PPC: Move types_altivec.h and util_altivec.h from libavcodec to libavutil ARM: Move asm.S from libavcodec to libavutil vc1dsp: mark put/avg_vc1_mspel_mc() always_inline Merged-by: Michael Niedermayer <michaelni@gmx.at> 2012-06-08 23:02:54 +02:00			`* FFmpeg is free software; you can redistribute it and/or`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
Merge remote-tracking branch 'qatar/master' * qatar/master: float_dsp: ppc: add a separate header for Altivec function prototypes ARM: fix float_dsp breakage from d5a7229 Add a float DSP framework to libavutil PPC: Move types_altivec.h and util_altivec.h from libavcodec to libavutil ARM: Move asm.S from libavcodec to libavutil vc1dsp: mark put/avg_vc1_mspel_mc() always_inline Merged-by: Michael Niedermayer <michaelni@gmx.at> 2012-06-08 23:02:54 +02:00			`* FFmpeg is distributed in the hope that it will be useful,`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Merge remote-tracking branch 'qatar/master' * qatar/master: float_dsp: ppc: add a separate header for Altivec function prototypes ARM: fix float_dsp breakage from d5a7229 Add a float DSP framework to libavutil PPC: Move types_altivec.h and util_altivec.h from libavcodec to libavutil ARM: Move asm.S from libavcodec to libavutil vc1dsp: mark put/avg_vc1_mspel_mc() always_inline Merged-by: Michael Niedermayer <michaelni@gmx.at> 2012-06-08 23:02:54 +02:00			`* License along with FFmpeg; if not, write to the Free Software`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "config.h"`

			`#include "libavutil/cpu.h"`
			`#include "libavutil/float_dsp.h"`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 19:01:05 +02:00			`#include "cpu.h"`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 05:47:30 +01:00			`#include "asm.h"`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00
cosmetics: Remove unnecessary extern keywords from function declarations 2013-03-26 18:41:24 +01:00			`void ff_vector_fmul_sse(float dst, const float src0, const float *src1,`
			`int len);`
			`void ff_vector_fmul_avx(float dst, const float src0, const float *src1,`
			`int len);`

			`void ff_vector_fmac_scalar_sse(float dst, const float src, float mul,`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`int len);`
cosmetics: Remove unnecessary extern keywords from function declarations 2013-03-26 18:41:24 +01:00			`void ff_vector_fmac_scalar_avx(float dst, const float src, float mul,`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`int len);`

cosmetics: Remove unnecessary extern keywords from function declarations 2013-03-26 18:41:24 +01:00			`void ff_vector_fmul_scalar_sse(float dst, const float src, float mul,`
			`int len);`
x86: float_dsp: add SSE version of vector_fmul_scalar() 2012-09-23 00:41:25 +02:00
cosmetics: Remove unnecessary extern keywords from function declarations 2013-03-26 18:41:24 +01:00			`void ff_vector_dmul_scalar_sse2(double dst, const double src,`
			`double mul, int len);`
			`void ff_vector_dmul_scalar_avx(double dst, const double src,`
			`double mul, int len);`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 21:00:53 +02:00
floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 2013-01-20 07:26:58 +01:00			`void ff_vector_fmul_add_sse(float dst, const float src0, const float *src1,`
			`const float *src2, int len);`
			`void ff_vector_fmul_add_avx(float dst, const float src0, const float *src1,`
			`const float *src2, int len);`

floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 2013-01-20 22:20:30 +01:00			`void ff_vector_fmul_reverse_sse(float dst, const float src0,`
			`const float *src1, int len);`
			`void ff_vector_fmul_reverse_avx(float dst, const float src0,`
			`const float *src1, int len);`

floatdsp: move scalarproduct_float from dsputil to avfloatdsp. This makes the aac decoder and all voice codecs independent of dsputil. 2013-01-21 00:41:52 +01:00			`float ff_scalarproduct_float_sse(const float v1, const float v2, int order);`

x86: float dsp: butterflies_float SSE 97c -> 49c Some codecs could benefit from more unrolling, but AAC doesn't. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-04-12 21:07:01 +02:00			`void ff_butterflies_float_sse(float src0, float src1, int len);`

float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 2013-01-17 17:58:25 +01:00			`#if HAVE_6REGS && HAVE_INLINE_ASM`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 05:47:30 +01:00			`static void vector_fmul_window_3dnowext(float dst, const float src0,`
			`const float src1, const float win,`
			`int len)`
			`{`
			`x86_reg i = -len * 4;`
			`x86_reg j = len * 4 - 8;`
			`__asm__ volatile (`
			`"1: \n"`
			`"pswapd (%5, %1), %%mm1 \n"`
			`"movq (%5, %0), %%mm0 \n"`
			`"pswapd (%4, %1), %%mm5 \n"`
			`"movq (%3, %0), %%mm4 \n"`
			`"movq %%mm0, %%mm2 \n"`
			`"movq %%mm1, %%mm3 \n"`
			`"pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]`
			`"pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]`
			`"pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]`
			`"pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]`
			`"pfadd %%mm3, %%mm2 \n"`
			`"pfsub %%mm0, %%mm1 \n"`
			`"pswapd %%mm2, %%mm2 \n"`
			`"movq %%mm1, (%2, %0) \n"`
			`"movq %%mm2, (%2, %1) \n"`
			`"sub $8, %1 \n"`
			`"add $8, %0 \n"`
			`"jl 1b \n"`
			`"femms \n"`
			`: "+r"(i), "+r"(j)`
			`: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)`
			`);`
			`}`

			`static void vector_fmul_window_sse(float dst, const float src0,`
			`const float src1, const float win, int len)`
			`{`
			`x86_reg i = -len * 4;`
			`x86_reg j = len * 4 - 16;`
			`__asm__ volatile (`
			`"1: \n"`
			`"movaps (%5, %1), %%xmm1 \n"`
			`"movaps (%5, %0), %%xmm0 \n"`
			`"movaps (%4, %1), %%xmm5 \n"`
			`"movaps (%3, %0), %%xmm4 \n"`
			`"shufps $0x1b, %%xmm1, %%xmm1 \n"`
			`"shufps $0x1b, %%xmm5, %%xmm5 \n"`
			`"movaps %%xmm0, %%xmm2 \n"`
			`"movaps %%xmm1, %%xmm3 \n"`
			`"mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]`
			`"mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]`
			`"mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]`
			`"mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]`
			`"addps %%xmm3, %%xmm2 \n"`
			`"subps %%xmm0, %%xmm1 \n"`
			`"shufps $0x1b, %%xmm2, %%xmm2 \n"`
			`"movaps %%xmm1, (%2, %0) \n"`
			`"movaps %%xmm2, (%2, %1) \n"`
			`"sub $16, %1 \n"`
			`"add $16, %0 \n"`
			`"jl 1b \n"`
			`: "+r"(i), "+r"(j)`
			`: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)`
			`);`
			`}`
float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 2013-01-17 17:58:25 +01:00			`#endif /* HAVE_6REGS && HAVE_INLINE_ASM */`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 05:47:30 +01:00
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)`
			`{`
			`int mm_flags = av_get_cpu_flags();`

float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 2013-01-17 17:58:25 +01:00			`#if HAVE_6REGS && HAVE_INLINE_ASM`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 05:47:30 +01:00			`if (INLINE_AMD3DNOWEXT(mm_flags)) {`
			`fdsp->vector_fmul_window = vector_fmul_window_3dnowext;`
			`}`
			`if (INLINE_SSE(mm_flags)) {`
			`fdsp->vector_fmul_window = vector_fmul_window_sse;`
			`}`
			`#endif`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 19:01:05 +02:00			`if (EXTERNAL_SSE(mm_flags)) {`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`fdsp->vector_fmul = ff_vector_fmul_sse;`
float_dsp: add x86-optimized functions for vector_fmac_scalar() 2012-06-09 05:20:59 +02:00			`fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;`
x86: float_dsp: add SSE version of vector_fmul_scalar() 2012-09-23 00:41:25 +02:00			`fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;`
floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 2013-01-20 07:26:58 +01:00			`fdsp->vector_fmul_add = ff_vector_fmul_add_sse;`
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 2013-01-20 22:20:30 +01:00			`fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;`
floatdsp: move scalarproduct_float from dsputil to avfloatdsp. This makes the aac decoder and all voice codecs independent of dsputil. 2013-01-21 00:41:52 +01:00			`fdsp->scalarproduct_float = ff_scalarproduct_float_sse;`
x86: float dsp: butterflies_float SSE 97c -> 49c Some codecs could benefit from more unrolling, but AAC doesn't. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-04-12 21:07:01 +02:00			`fdsp->butterflies_float = ff_butterflies_float_sse;`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`}`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 21:00:53 +02:00			`if (EXTERNAL_SSE2(mm_flags)) {`
			`fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;`
			`}`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 19:01:05 +02:00			`if (EXTERNAL_AVX(mm_flags)) {`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`fdsp->vector_fmul = ff_vector_fmul_avx;`
float_dsp: add x86-optimized functions for vector_fmac_scalar() 2012-06-09 05:20:59 +02:00			`fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 21:00:53 +02:00			`fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;`
floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 2013-01-20 07:26:58 +01:00			`fdsp->vector_fmul_add = ff_vector_fmul_add_avx;`
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 2013-01-20 22:20:30 +01:00			`fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 18:58:41 +02:00			`}`
			`}`