Extended 3DNow! (3dnow2) implementation of the IMDCT.
Vorbis and WMA decoding become 6% faster. Originally committed as revision 5954 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
2c5ad5fd74
commit
bcfa3e58ee
@ -594,6 +594,8 @@ void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
|
|||||||
FFTSample type */
|
FFTSample type */
|
||||||
typedef float FFTSample;
|
typedef float FFTSample;
|
||||||
|
|
||||||
|
struct MDCTContext;
|
||||||
|
|
||||||
typedef struct FFTComplex {
|
typedef struct FFTComplex {
|
||||||
FFTSample re, im;
|
FFTSample re, im;
|
||||||
} FFTComplex;
|
} FFTComplex;
|
||||||
@ -605,6 +607,8 @@ typedef struct FFTContext {
|
|||||||
FFTComplex *exptab;
|
FFTComplex *exptab;
|
||||||
FFTComplex *exptab1; /* only used by SSE code */
|
FFTComplex *exptab1; /* only used by SSE code */
|
||||||
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
|
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
|
||||||
|
void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
|
||||||
|
const FFTSample *input, FFTSample *tmp);
|
||||||
} FFTContext;
|
} FFTContext;
|
||||||
|
|
||||||
int ff_fft_init(FFTContext *s, int nbits, int inverse);
|
int ff_fft_init(FFTContext *s, int nbits, int inverse);
|
||||||
@ -635,6 +639,8 @@ typedef struct MDCTContext {
|
|||||||
int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
|
int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
|
||||||
void ff_imdct_calc(MDCTContext *s, FFTSample *output,
|
void ff_imdct_calc(MDCTContext *s, FFTSample *output,
|
||||||
const FFTSample *input, FFTSample *tmp);
|
const FFTSample *input, FFTSample *tmp);
|
||||||
|
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
|
||||||
|
const FFTSample *input, FFTSample *tmp);
|
||||||
void ff_mdct_calc(MDCTContext *s, FFTSample *out,
|
void ff_mdct_calc(MDCTContext *s, FFTSample *out,
|
||||||
const FFTSample *input, FFTSample *tmp);
|
const FFTSample *input, FFTSample *tmp);
|
||||||
void ff_mdct_end(MDCTContext *s);
|
void ff_mdct_end(MDCTContext *s);
|
||||||
|
@ -54,6 +54,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
|
|||||||
s->exptab[i].im = s1;
|
s->exptab[i].im = s1;
|
||||||
}
|
}
|
||||||
s->fft_calc = ff_fft_calc_c;
|
s->fft_calc = ff_fft_calc_c;
|
||||||
|
s->imdct_calc = ff_imdct_calc;
|
||||||
s->exptab1 = NULL;
|
s->exptab1 = NULL;
|
||||||
|
|
||||||
/* compute constant table for HAVE_SSE version */
|
/* compute constant table for HAVE_SSE version */
|
||||||
@ -62,11 +63,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
|
|||||||
int has_vectors = 0;
|
int has_vectors = 0;
|
||||||
|
|
||||||
#if defined(HAVE_MMX)
|
#if defined(HAVE_MMX)
|
||||||
#ifdef HAVE_MM3DNOW
|
|
||||||
has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2);
|
has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2);
|
||||||
#else
|
|
||||||
has_vectors = mm_support() & (MM_SSE | MM_SSE2);
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE)
|
#if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE)
|
||||||
has_vectors = mm_support() & MM_ALTIVEC;
|
has_vectors = mm_support() & MM_ALTIVEC;
|
||||||
@ -98,6 +95,8 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
|
|||||||
} while (nblocks != 0);
|
} while (nblocks != 0);
|
||||||
av_freep(&s->exptab);
|
av_freep(&s->exptab);
|
||||||
#if defined(HAVE_MMX)
|
#if defined(HAVE_MMX)
|
||||||
|
if (has_vectors & MM_3DNOWEXT)
|
||||||
|
s->imdct_calc = ff_imdct_calc_3dn2;
|
||||||
#ifdef HAVE_MM3DNOW
|
#ifdef HAVE_MM3DNOW
|
||||||
if (has_vectors & MM_3DNOWEXT)
|
if (has_vectors & MM_3DNOWEXT)
|
||||||
/* 3DNowEx for Athlon(XP) */
|
/* 3DNowEx for Athlon(XP) */
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* FFT/MDCT transform with Extended 3DNow! optimizations
|
* FFT/MDCT transform with Extended 3DNow! optimizations
|
||||||
* Copyright (c) 2006 Zuxy MENG Jie.
|
* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
|
||||||
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
|
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
|
||||||
*
|
*
|
||||||
* This library is free software; you can redistribute it and/or
|
* This library is free software; you can redistribute it and/or
|
||||||
@ -134,3 +134,84 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
 * Inverse MDCT implemented with Extended 3DNow! inline assembly
 * (uses pswapd and pfpnacc).
 *
 * s      - MDCT context; supplies nbits, the tcos/tsin twiddle tables and
 *          the FFT bit-reversal table s->fft.revtab.
 * output - receives n = 1 << s->nbits samples; the final loop stores to
 *          output[0 .. n-1] in four interleaved streams.
 * input  - n/2 coefficients are read (input[0 .. n/2-1]).
 * tmp    - scratch buffer reinterpreted as an FFTComplex array; indexed as
 *          z[revtab[k]] and z[k] for k < n/4 (presumably it must hold at
 *          least n/4 complex values - confirm against revtab's range).
 *
 * The routine leaves the MMX state cleared (emms) before returning.
 */
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
|
||||||
|
const FFTSample *input, FFTSample *tmp)
|
||||||
|
{
|
||||||
|
int k, n8, n4, n2, n;
|
||||||
|
const uint16_t *revtab = s->fft.revtab;
|
||||||
|
const FFTSample *tcos = s->tcos;
|
||||||
|
const FFTSample *tsin = s->tsin;
|
||||||
|
const FFTSample *in1, *in2;
|
||||||
|
FFTComplex *z = (FFTComplex *)tmp;
|
||||||
|
|
||||||
|
n = 1 << s->nbits;
|
||||||
|
n2 = n >> 1;
|
||||||
|
n4 = n >> 2;
|
||||||
|
n8 = n >> 3;
|
||||||
|
|
||||||
|
/* pre rotation */
|
||||||
|
in1 = input;
|
||||||
|
||||||
|
in2 = input + n2 - 1;
|
||||||
|
/*
 * in1 walks forward over the even input samples, in2 walks backward from
 * the last sample. Each iteration packs mm0 = {in2[-2k], in1[2k]} and
 * mm1 = {tcos[k], tsin[k]}, then pfmul/pswapd/pfpnacc form the complex
 * product (in2 + i*in1) * (tcos + i*tsin), which is stored at the
 * bit-reversed slot z[revtab[k]] so the FFT can run in place.
 */
for(k = 0; k < n4; k++) {
|
||||||
|
asm volatile(
|
||||||
|
"movd %1, %%mm0 \n\t"
|
||||||
|
"movd %3, %%mm1 \n\t"
|
||||||
|
"punpckldq %2, %%mm0 \n\t"
|
||||||
|
"punpckldq %4, %%mm1 \n\t"
|
||||||
|
"movq %%mm0, %%mm2 \n\t"
|
||||||
|
"pfmul %%mm1, %%mm0 \n\t"
|
||||||
|
"pswapd %%mm1, %%mm1 \n\t"
|
||||||
|
"pfmul %%mm1, %%mm2 \n\t"
|
||||||
|
"pfpnacc %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, %0 \n\t"
|
||||||
|
:"=m"(z[revtab[k]])
|
||||||
|
:"m"(in2[-2*k]), "m"(in1[2*k]),
|
||||||
|
"m"(tcos[k]), "m"(tsin[k])
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
ff_fft_calc(&s->fft, z);
|
||||||
|
|
||||||
|
/* post rotation + reordering */
|
||||||
/*
 * Same multiply sequence as the pre-rotation, applied in place:
 * z[k] is replaced by z[k] * (tcos[k] + i*tsin[k]).
 */
|
for(k = 0; k < n4; k++) {
|
||||||
|
asm volatile(
|
||||||
|
"movq %0, %%mm0 \n\t"
|
||||||
|
"movd %1, %%mm1 \n\t"
|
||||||
|
"punpckldq %2, %%mm1 \n\t"
|
||||||
|
"movq %%mm0, %%mm2 \n\t"
|
||||||
|
"pfmul %%mm1, %%mm0 \n\t"
|
||||||
|
"pswapd %%mm1, %%mm1 \n\t"
|
||||||
|
"pfmul %%mm1, %%mm2 \n\t"
|
||||||
|
"pfpnacc %%mm2, %%mm0 \n\t"
|
||||||
|
"movq %%mm0, %0 \n\t"
|
||||||
|
:"+m"(z[k])
|
||||||
|
:"m"(tcos[k]), "m"(tsin[k])
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
/*
 * mm7 = sign-bit mask in the low dword only; pxor with it negates the low
 * float of a register and leaves the high float untouched. mm7 is loaded
 * in a separate asm statement and relied upon by the loop below.
 * NOTE(review): 1<<31 shifts into the sign bit of a signed int, which is
 * undefined behavior in ISO C; 1U<<31 would be the well-defined spelling -
 * confirm before changing.
 */
|
asm volatile("movd %0, %%mm7" ::"r"(1<<31));
|
||||||
|
/*
 * Reordering: each iteration reads z[n8+k] and z[n8-1-k] and writes four
 * float pairs, two growing upward (output[2k], output[n2+2k]) and two
 * growing downward (output[n2-2-2k], output[n-2-2k]). The per-store
 * comments give the exact contents of each pair.
 */
for(k = 0; k < n8; k++) {
|
||||||
|
asm volatile(
|
||||||
|
"movq %4, %%mm0 \n\t"
|
||||||
|
"pswapd %5, %%mm1 \n\t"
|
||||||
|
"movq %%mm0, %%mm2 \n\t"
|
||||||
|
"pxor %%mm7, %%mm2 \n\t"
|
||||||
|
"punpckldq %%mm1, %%mm2 \n\t"
|
||||||
|
"pswapd %%mm2, %%mm3 \n\t"
|
||||||
|
"punpckhdq %%mm1, %%mm0 \n\t"
|
||||||
|
"pswapd %%mm0, %%mm4 \n\t"
|
||||||
|
"pxor %%mm7, %%mm0 \n\t"
|
||||||
|
"pxor %%mm7, %%mm4 \n\t"
|
||||||
|
"movq %%mm0, %0 \n\t" // { -z[n8+k].im, z[n8-1-k].re }
|
||||||
|
"movq %%mm4, %1 \n\t" // { -z[n8-1-k].re, z[n8+k].im }
|
||||||
|
"movq %%mm2, %2 \n\t" // { -z[n8+k].re, z[n8-1-k].im }
|
||||||
|
"movq %%mm3, %3 \n\t" // { z[n8-1-k].im, -z[n8+k].re }
|
||||||
|
:"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
|
||||||
|
"=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
|
||||||
|
:"m"(z[n8+k]), "m"(z[n8-1-k])
|
||||||
|
:"memory"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
/* Clear the MMX state so later x87 floating-point code works correctly. */
asm volatile("emms");
|
||||||
|
}
|
||||||
|
@ -1598,7 +1598,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) {
|
|||||||
|
|
||||||
saved_start=vc->saved_start;
|
saved_start=vc->saved_start;
|
||||||
|
|
||||||
ff_imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
|
vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
|
||||||
|
|
||||||
if (vc->modes[mode_number].blockflag) {
|
if (vc->modes[mode_number].blockflag) {
|
||||||
// -- overlap/add
|
// -- overlap/add
|
||||||
|
@ -1113,7 +1113,7 @@ static int wma_decode_block(WMADecodeContext *s)
|
|||||||
|
|
||||||
n = s->block_len;
|
n = s->block_len;
|
||||||
n4 = s->block_len / 2;
|
n4 = s->block_len / 2;
|
||||||
ff_imdct_calc(&s->mdct_ctx[bsize],
|
s->mdct_ctx[bsize].fft.imdct_calc(&s->mdct_ctx[bsize],
|
||||||
output, s->coefs[ch], s->mdct_tmp);
|
output, s->coefs[ch], s->mdct_tmp);
|
||||||
|
|
||||||
/* XXX: optimize all that by build the window and
|
/* XXX: optimize all that by build the window and
|
||||||
|
Loading…
x
Reference in New Issue
Block a user