ffmpeg/libavcodec/x86/fft_sse.c

/*
 * FFT/MDCT transform with SSE optimizations
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"

/*
 * Sign-bit mask: bit 31 set in each of four 32-bit lanes.
 * ff_imdct_calc_sse() loads it with movaps (hence the 16-byte alignment) and
 * XORs it into a vector to flip the sign of every lane.
 *
 * The lanes are unsigned and built with 1U << 31: shifting a signed 1 into
 * bit 31 of a 32-bit int is undefined behaviour in C.
 */
DECLARE_ALIGNED(16, static const unsigned int, m1m1m1m1)[4] =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };

/*
 * External routines: no definition appears in this file (presumably the
 * hand-written assembly FFT kernels -- confirm against the build).
 * Of the two, only the _interleave variant is called from the code visible
 * here, by ff_fft_calc_sse().
 */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);

/**
 * Run the SSE FFT kernel on z.
 *
 * Hands z and s->nbits to ff_fft_dispatch_interleave_sse(). For transforms of
 * at most 16 points an extra pass then walks the array in 32-byte steps and
 * interleaves each pair of adjacent 16-byte vectors A, B into
 * {A0,B0,A1,B1} and {A2,B2,A3,B3}, writing them back in place.
 *
 * @param s  FFT context; only s->nbits is read here
 * @param z  buffer of 1 << s->nbits elements, accessed with aligned 16-byte
 *           loads/stores (movaps), so it must be 16-byte aligned
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    const int npoints = 1 << s->nbits;
    x86_reg off;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    /* Only transforms of up to 16 points get the extra pass below. */
    if (npoints > 16)
        return;

    /*
     * off is a negative byte offset from the end pointer z+npoints that is
     * counted up to zero. -8*npoints covers the whole array provided
     * sizeof(FFTComplex) == 8 (assumed -- confirm against fft.h).
     */
    off = -8*npoints;
    __asm__ volatile(
        "1: \n"
        "movaps     (%[off],%[end]), %%xmm0 \n"
        "movaps      %%xmm0, %%xmm1 \n"
        "unpcklps 16(%[off],%[end]), %%xmm0 \n"
        "unpckhps 16(%[off],%[end]), %%xmm1 \n"
        "movaps      %%xmm0,   (%[off],%[end]) \n"
        "movaps      %%xmm1, 16(%[off],%[end]) \n"
        "add $32, %[off] \n"
        "jl 1b \n"
        : [off] "+r"(off)
        : [end] "r"(z+npoints)
        : "memory"
    );
}

/**
 * Reorder z according to s->revtab, going through s->tmp_buf.
 *
 * Each iteration does one aligned 16-byte load starting at z[pos] and stores
 * the low 8 bytes to tmp_buf[revtab[pos]] and the high 8 bytes to
 * tmp_buf[revtab[pos+1]]. That moves z[pos] and z[pos+1] if FFTComplex is
 * 8 bytes (assumed -- confirm). The scratch buffer is then copied back
 * over z.
 *
 * @param s  FFT context supplying nbits, revtab and tmp_buf
 * @param z  buffer of 1 << s->nbits elements; read with movaps, so it must be
 *           16-byte aligned
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    const int count = 1 << s->nbits;
    int pos = 0;

    while (pos < count) {
        __asm__ volatile(
            "movaps %[pair], %%xmm0 \n"
            "movlps %%xmm0, %[lo] \n"
            "movhps %%xmm0, %[hi] \n"
            : [lo] "=m"(s->tmp_buf[s->revtab[pos]]),
              [hi] "=m"(s->tmp_buf[s->revtab[pos+1]])
            : [pair] "m"(z[pos])
        );
        pos += 2;
    }

    memcpy(z, s->tmp_buf, count*sizeof(FFTComplex));
}

/**
 * Full inverse MDCT built on top of the half transform.
 *
 * ff_imdct_half_sse() is called with output + n/4 as its destination; the asm
 * loop afterwards fills the two outer quarters of output from the two inner
 * ones, one 16-byte vector per side per iteration:
 *   - first quarter  = second quarter, element order reversed and sign flipped
 *   - fourth quarter = third quarter, element order reversed
 * (The quarter boundaries assume sizeof(FFTSample) == 4, because j and k are
 * byte offsets derived directly from n -- confirm against fft.h.)
 *
 * @param s       context; s->mdct_size is read here and s is passed on to
 *                ff_imdct_half_sse()
 * @param output  destination buffer; accessed with movaps at output + n/4 and
 *                output + 3*n/4 plus multiples of 16 bytes, so those addresses
 *                must be 16-byte aligned
 * @param input   passed unchanged to ff_imdct_half_sse()
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = s->mdct_size;
    long n4 = n >> 2;

    ff_imdct_half_sse(s, output+n4, input);

    /* Byte offsets: j counts up from -n to 0, k counts down from n-16. */
    j = -n;
    k = n-16;
    /*
     * The "memory" clobber is required: the loop reads what
     * ff_imdct_half_sse() just stored and writes output through plain
     * register operands, so without it the compiler is not told that memory
     * is accessed and may reorder or cache around the asm (e.g. after
     * inlining).
     *
     * NOTE(review): xmm0, xmm1 and xmm7 are modified but not listed as
     * clobbers; naming them requires SSE to be enabled for this translation
     * unit, so that is left to the project's build configuration.
     */
    __asm__ volatile(
        "movaps %4, %%xmm7 \n"                  /* sign-bit mask */
        "1: \n"
        "movaps       (%2,%1), %%xmm0 \n"
        "movaps       (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"       /* reverse the four lanes */
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps         %%xmm7, %%xmm0 \n"       /* flip the sign of each lane */
        "movaps        %%xmm1, (%3,%1) \n"
        "movaps        %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3),
         "m"(*m1m1m1m1)
        :"memory"
    );
}
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00			`/*`
			`* FFT/MDCT transform with SSE optimizations`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:33:34 +02:00			`* Copyright (c) 2008 Loren Merritt`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 17:30:46 +02:00			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 17:30:46 +02:00			`* version 2.1 of the License, or (at your option) any later version.`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 17:30:46 +02:00			`* FFmpeg is distributed in the hope that it will be useful,`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 17:30:46 +02:00			`* License along with FFmpeg; if not, write to the Free Software`
Update licensing information: The FSF changed postal address. Originally committed as revision 4842 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-01-12 23:43:26 +01:00			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00			`*/`
Use full path for #includes from another directory. Originally committed as revision 13098 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-05-09 13:56:36 +02:00
			`#include "libavutil/x86_cpu.h"`
			`#include "libavcodec/dsputil.h"`
Move per-arch fft init bits into the corresponding subdirs Originally committed as revision 19864 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-09-15 23:14:14 +02:00			`#include "fft.h"`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00
Move array specifiers outside DECLARE_ALIGNED() invocations Originally committed as revision 21377 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-01-22 04:25:11 +01:00			`DECLARE_ALIGNED(16, static const int, m1m1m1m1)[4] =`
sse implementation of imdct. patch mostly by Zuxy Meng (zuxy dot meng at gmail dot com) Originally committed as revision 6311 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-09-21 18:37:39 +02:00			`{ 1 << 31, 1 << 31, 1 << 31, 1 << 31 };`

split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:26:58 +02:00			`void ff_fft_dispatch_sse(FFTComplex *z, int nbits);`
			`void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00
fft_() renamed into ff_fft_() patch by (Gildas Bazin <gbazin at altern dot org>) Originally committed as revision 2882 to svn://svn.ffmpeg.org/ffmpeg/trunk 2004-03-13 22:43:24 +01:00			`void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00			`{`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:26:58 +02:00			`int n = 1 << s->nbits;`

			`ff_fft_dispatch_interleave_sse(z, s->nbits);`

			`if(n <= 16) {`
			`x86_reg i = -8*n;`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-10-16 15:34:09 +02:00			`__asm__ volatile(`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:26:58 +02:00			`"1: \n"`
			`"movaps (%0,%1), %%xmm0 \n"`
			`"movaps %%xmm0, %%xmm1 \n"`
			`"unpcklps 16(%0,%1), %%xmm0 \n"`
			`"unpckhps 16(%0,%1), %%xmm1 \n"`
			`"movaps %%xmm0, (%0,%1) \n"`
			`"movaps %%xmm1, 16(%0,%1) \n"`
			`"add $32, %0 \n"`
			`"jl 1b \n"`
			`:"+r"(i)`
			`:"r"(z+n)`
			`:"memory"`
			`);`
			`}`
			`}`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:26:58 +02:00			`void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)`
			`{`
			`int n = 1 << s->nbits;`
			`int i;`
			`for(i=0; i<n; i+=2) {`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-10-16 15:34:09 +02:00			`__asm__ volatile(`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:26:58 +02:00			`"movaps %2, %%xmm0 \n"`
			`"movlps %%xmm0, %0 \n"`
			`"movhps %%xmm0, %1 \n"`
			`:"=m"(s->tmp_buf[s->revtab[i]]),`
			`"=m"(s->tmp_buf[s->revtab[i+1]])`
			`:"m"(z[i])`
			`);`
			`}`
			`memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-10-28 01:34:08 +01:00			`}`
added define for builtins use - inverse fix by Romain Dolbeau Originally committed as revision 1410 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 18:41:43 +01:00
Merge FFTContext and MDCTContext Originally committed as revision 19931 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-09-20 19:30:20 +02:00			`void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 17:03:58 +02:00			`{`
			`x86_reg j, k;`
imdct/x86: Use "s->mdct_size" instead of "1 << s->mdct_bits". It generates smaller cleaner code. Originally committed as revision 24887 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-23 17:51:09 +02:00			`long n = s->mdct_size;`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:33:34 +02:00			`long n4 = n >> 2;`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 17:03:58 +02:00
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:33:34 +02:00			`ff_imdct_half_sse(s, output+n4, input);`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 17:03:58 +02:00
			`j = -n;`
			`k = n-16;`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-10-16 15:34:09 +02:00			`__asm__ volatile(`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:33:34 +02:00			`"movaps %4, %%xmm7 \n"`
			`"1: \n"`
			`"movaps (%2,%1), %%xmm0 \n"`
			`"movaps (%3,%0), %%xmm1 \n"`
			`"shufps $0x1b, %%xmm0, %%xmm0 \n"`
			`"shufps $0x1b, %%xmm1, %%xmm1 \n"`
			`"xorps %%xmm7, %%xmm0 \n"`
			`"movaps %%xmm1, (%3,%1) \n"`
			`"movaps %%xmm0, (%2,%0) \n"`
			`"sub $16, %1 \n"`
			`"add $16, %0 \n"`
			`"jl 1b \n"`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 17:03:58 +02:00			`:"+r"(j), "+r"(k)`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-08-12 02:33:34 +02:00			`:"r"(output+n4), "r"(output+n4*3),`
			`"m"(*m1m1m1m1)`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-13 17:03:58 +02:00			`);`
			`}`