simd downmix
13% faster ac3 if downmixing. Originally committed as revision 14742 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
45d9d61889
commit
ac2e556456
@ -632,27 +632,25 @@ static inline void do_imdct(AC3DecodeContext *s, int channels)
|
|||||||
/**
|
/**
|
||||||
* Downmix the output to mono or stereo.
|
* Downmix the output to mono or stereo.
|
||||||
*/
|
*/
|
||||||
static av_noinline void ac3_downmix(AC3DecodeContext *s,
|
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
|
||||||
float samples[AC3_MAX_CHANNELS][256])
|
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
float v0, v1;
|
float v0, v1;
|
||||||
|
if(out_ch == 2) {
|
||||||
if(s->output_mode == AC3_CHMODE_STEREO) {
|
for(i=0; i<len; i++) {
|
||||||
for(i=0; i<256; i++) {
|
|
||||||
v0 = v1 = 0.0f;
|
v0 = v1 = 0.0f;
|
||||||
for(j=0; j<s->fbw_channels; j++) {
|
for(j=0; j<in_ch; j++) {
|
||||||
v0 += samples[j][i] * s->downmix_coeffs[j][0];
|
v0 += samples[j][i] * matrix[j][0];
|
||||||
v1 += samples[j][i] * s->downmix_coeffs[j][1];
|
v1 += samples[j][i] * matrix[j][1];
|
||||||
}
|
}
|
||||||
samples[0][i] = v0;
|
samples[0][i] = v0;
|
||||||
samples[1][i] = v1;
|
samples[1][i] = v1;
|
||||||
}
|
}
|
||||||
} else if(s->output_mode == AC3_CHMODE_MONO) {
|
} else if(out_ch == 1) {
|
||||||
for(i=0; i<256; i++) {
|
for(i=0; i<len; i++) {
|
||||||
v0 = 0.0f;
|
v0 = 0.0f;
|
||||||
for(j=0; j<s->fbw_channels; j++)
|
for(j=0; j<in_ch; j++)
|
||||||
v0 += samples[j][i] * s->downmix_coeffs[j][0];
|
v0 += samples[j][i] * matrix[j][0];
|
||||||
samples[0][i] = v0;
|
samples[0][i] = v0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1018,17 +1016,16 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
|
|||||||
do_imdct(s, s->channels);
|
do_imdct(s, s->channels);
|
||||||
|
|
||||||
if(downmix_output) {
|
if(downmix_output) {
|
||||||
ac3_downmix(s, s->output);
|
s->dsp.ac3_downmix(s->output, s->downmix_coeffs, s->out_channels, s->fbw_channels, 256);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if(downmix_output) {
|
if(downmix_output) {
|
||||||
ac3_downmix(s, s->transform_coeffs+1);
|
s->dsp.ac3_downmix(s->transform_coeffs+1, s->downmix_coeffs, s->out_channels, s->fbw_channels, 256);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(downmix_output && !s->downmixed) {
|
if(downmix_output && !s->downmixed) {
|
||||||
s->downmixed = 1;
|
s->downmixed = 1;
|
||||||
// FIXME delay[] is half the size of the other downmixes
|
s->dsp.ac3_downmix(s->delay, s->downmix_coeffs, s->out_channels, s->fbw_channels, 128);
|
||||||
ac3_downmix(s, s->delay);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
do_imdct(s, s->out_channels);
|
do_imdct(s, s->out_channels);
|
||||||
|
@ -41,6 +41,9 @@ void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, in
|
|||||||
/* vorbis.c */
|
/* vorbis.c */
|
||||||
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
|
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
|
||||||
|
|
||||||
|
/* ac3dec.c */
|
||||||
|
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
|
||||||
|
|
||||||
/* flacenc.c */
|
/* flacenc.c */
|
||||||
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
|
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
|
||||||
|
|
||||||
@ -4476,6 +4479,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
|||||||
#ifdef CONFIG_VORBIS_DECODER
|
#ifdef CONFIG_VORBIS_DECODER
|
||||||
c->vorbis_inverse_coupling = vorbis_inverse_coupling;
|
c->vorbis_inverse_coupling = vorbis_inverse_coupling;
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef CONFIG_AC3_DECODER
|
||||||
|
c->ac3_downmix = ff_ac3_downmix_c;
|
||||||
|
#endif
|
||||||
#ifdef CONFIG_FLAC_ENCODER
|
#ifdef CONFIG_FLAC_ENCODER
|
||||||
c->flac_compute_autocorr = ff_flac_compute_autocorr;
|
c->flac_compute_autocorr = ff_flac_compute_autocorr;
|
||||||
#endif
|
#endif
|
||||||
|
@ -360,6 +360,7 @@ typedef struct DSPContext {
|
|||||||
|
|
||||||
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
|
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
|
||||||
void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
|
void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
|
||||||
|
void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
|
||||||
/* no alignment needed */
|
/* no alignment needed */
|
||||||
void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
|
void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
|
||||||
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
|
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
|
||||||
|
@ -1842,6 +1842,105 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Instruction selectors: IF1(x) expands to x, IF0(x) expands to nothing.
 * They are passed as the mono/stereo arguments of the mixing macros so that
 * individual asm lines are either emitted or dropped.
 */
#define IF1(x) x
|
||||||
|
#define IF0(x)
|
||||||
|
|
||||||
|
/*
 * MIX5(mono, stereo): SSE loop that mixes five rows of samples in place into
 * one row (mono) or two rows (stereo), four floats per iteration.
 *
 * Names taken from the expansion site: i, samples, len, matrix.
 *   - i is a byte offset added to samples[0]+len; 16 is added each iteration
 *     and the loop repeats while the result is still negative.
 *   - Rows are addressed 0x400 bytes apart (one float[256] row).
 *   - Only three coefficients are read, at byte offsets 0, 8 and 24 from
 *     matrix (matrix[0][0], matrix[1][0], matrix[3][0]); each is broadcast to
 *     all four lanes with shufps.
 *
 * Rows 0 and 2 are scaled by matrix[0][0], row 1 by matrix[1][0], rows 3 and
 * 4 by matrix[3][0]. Then:
 *   stereo: row 0 = r0 + r1 + r3, row 1 = r2 + r1 + r4
 *   mono:   row 0 = r0 + r1 + r2 + r3 + r4
 *
 * All loads and stores use movaps, so every accessed address must be 16-byte
 * aligned.
 * NOTE(review): xmm0-xmm7 are written but the clobber list only names
 * "memory" -- confirm this is acceptable for the compilers in use.
 */
#define MIX5(mono,stereo)\
|
||||||
|
asm volatile(\
|
||||||
|
"movss 0(%2), %%xmm5 \n"\
|
||||||
|
"movss 8(%2), %%xmm6 \n"\
|
||||||
|
"movss 24(%2), %%xmm7 \n"\
|
||||||
|
"shufps $0, %%xmm5, %%xmm5 \n"\
|
||||||
|
"shufps $0, %%xmm6, %%xmm6 \n"\
|
||||||
|
"shufps $0, %%xmm7, %%xmm7 \n"\
|
||||||
|
"1: \n"\
|
||||||
|
"movaps (%0,%1), %%xmm0 \n"\
|
||||||
|
"movaps 0x400(%0,%1), %%xmm1 \n"\
|
||||||
|
"movaps 0x800(%0,%1), %%xmm2 \n"\
|
||||||
|
"movaps 0xc00(%0,%1), %%xmm3 \n"\
|
||||||
|
"movaps 0x1000(%0,%1), %%xmm4 \n"\
|
||||||
|
"mulps %%xmm5, %%xmm0 \n"\
|
||||||
|
"mulps %%xmm6, %%xmm1 \n"\
|
||||||
|
"mulps %%xmm5, %%xmm2 \n"\
|
||||||
|
"mulps %%xmm7, %%xmm3 \n"\
|
||||||
|
"mulps %%xmm7, %%xmm4 \n"\
|
||||||
|
stereo("addps %%xmm1, %%xmm0 \n")\
|
||||||
|
"addps %%xmm1, %%xmm2 \n"\
|
||||||
|
"addps %%xmm3, %%xmm0 \n"\
|
||||||
|
"addps %%xmm4, %%xmm2 \n"\
|
||||||
|
mono("addps %%xmm2, %%xmm0 \n")\
|
||||||
|
"movaps %%xmm0, (%0,%1) \n"\
|
||||||
|
stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
|
||||||
|
"add $16, %0 \n"\
|
||||||
|
"jl 1b \n"\
|
||||||
|
:"+&r"(i)\
|
||||||
|
:"r"(samples[0]+len), "r"(matrix)\
|
||||||
|
:"memory"\
|
||||||
|
);
|
||||||
|
|
||||||
|
/*
 * MIX_MISC(stereo): generic SSE downmix loop for an arbitrary number of input
 * rows, writing one output row, or two when the stereo argument emits its
 * instructions. Works in place, four floats per outer iteration.
 *
 * Names taken from the expansion site: i, j, k, samples, len, matrix_simd,
 * in_ch.
 *   %0 = i  byte offset added to samples[0]+len; advanced by 16 while negative
 *   %1 = j  scratch pointer walking the input rows, 1024 bytes per row
 *   %2 = k  byte offset into the coefficient table, starting at
 *           -32*(in_ch-1) and advanced by 32 while negative
 *   %4 = matrix_simd+in_ch, so (%4,%2) starts at the entry for row 1
 *
 * Row 0 is multiplied by xmm6 (and by xmm7 for the second output). These two
 * registers are NOT loaded inside this asm statement; they must already hold
 * the broadcast row-0 coefficients when the macro is expanded.
 * NOTE(review): this relies on xmm6/xmm7 surviving between separate asm
 * statements, and no xmm register is named in the clobber list -- confirm.
 *
 * The inner loop then accumulates row*coefficient for each further row, using
 * the table entry at +0 for the first output and at +16 for the second.
 * NOTE(review): the inner body runs at least once before its condition is
 * tested, so in_ch == 1 would read one entry past the table and one row past
 * row 0 -- presumably callers never pass in_ch == 1; verify.
 */
#define MIX_MISC(stereo)\
|
||||||
|
asm volatile(\
|
||||||
|
"1: \n"\
|
||||||
|
"movaps (%3,%0), %%xmm0 \n"\
|
||||||
|
stereo("movaps %%xmm0, %%xmm1 \n")\
|
||||||
|
"mulps %%xmm6, %%xmm0 \n"\
|
||||||
|
stereo("mulps %%xmm7, %%xmm1 \n")\
|
||||||
|
"lea 1024(%3,%0), %1 \n"\
|
||||||
|
"mov %5, %2 \n"\
|
||||||
|
"2: \n"\
|
||||||
|
"movaps (%1), %%xmm2 \n"\
|
||||||
|
stereo("movaps %%xmm2, %%xmm3 \n")\
|
||||||
|
"mulps (%4,%2), %%xmm2 \n"\
|
||||||
|
stereo("mulps 16(%4,%2), %%xmm3 \n")\
|
||||||
|
"addps %%xmm2, %%xmm0 \n"\
|
||||||
|
stereo("addps %%xmm3, %%xmm1 \n")\
|
||||||
|
"add $1024, %1 \n"\
|
||||||
|
"add $32, %2 \n"\
|
||||||
|
"jl 2b \n"\
|
||||||
|
"movaps %%xmm0, (%3,%0) \n"\
|
||||||
|
stereo("movaps %%xmm1, 1024(%3,%0) \n")\
|
||||||
|
"add $16, %0 \n"\
|
||||||
|
"jl 1b \n"\
|
||||||
|
:"+&r"(i), "=&r"(j), "=&r"(k)\
|
||||||
|
:"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
|
||||||
|
:"memory"\
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
 * SSE implementation of the downmix: combines in_ch rows of samples into
 * out_ch rows, in place, using the per-row coefficients in matrix.
 *
 * @param samples rows of 256 floats; the first out_ch rows are overwritten.
 *                Accessed with movaps, so rows must be 16-byte aligned.
 * @param matrix  one {first output, second output} coefficient pair per row
 * @param out_ch  number of output rows; 2 selects the two-output paths, any
 *                other value falls through to the single-output paths
 * @param in_ch   number of input rows
 * @param len     number of samples per row to process; the loops step four
 *                floats at a time, so presumably a multiple of 4 -- TODO confirm
 *
 * Two fast paths handle five input rows with a restricted coefficient
 * pattern via MIX5; everything else goes through MIX_MISC.
 */
static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
|
||||||
|
{
|
||||||
|
/* View the float coefficients as ints so they can be tested bit-for-bit
 * (OR for "all zero bits", XOR for "identical bits").
 * NOTE(review): reading floats through an int pointer -- check against the
 * project's aliasing assumptions. */
int (*matrix_cmp)[2] = (int(*)[2])matrix;
|
||||||
|
intptr_t i,j,k;
|
||||||
|
|
||||||
|
/* Negative byte offset; the asm loops add 16 to it until it stops being negative. */
i = -len*sizeof(float);
|
||||||
|
/* 5 -> 2 fast path: requires matrix[0][1], [2][0], [3][1], [4][0] to be all
 * zero bits, matrix[1][0] == matrix[1][1] and matrix[0][0] == matrix[2][1].
 * NOTE(review): MIX5 scales rows 3 and 4 both by matrix[3][0], but this test
 * does not compare matrix[3][0] with matrix[4][1] -- confirm callers always
 * supply equal values there. */
if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
|
||||||
|
MIX5(IF0,IF1);
|
||||||
|
/* 5 -> 1 fast path: rows 0/2 and rows 3/4 must share their first coefficient. */
} else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
|
||||||
|
MIX5(IF1,IF0);
|
||||||
|
} else {
|
||||||
|
/* Per-row table with each coefficient replicated into four lanes. */
DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]);
|
||||||
|
/* Byte size of matrix; the loop below counts it down by 8 (one row) to 0. */
j = 2*in_ch*sizeof(float);
|
||||||
|
/* Fill matrix_simd from the last row to row 0. The final iteration (j == 0)
 * leaves the broadcast matrix[0][0] in xmm6 and matrix[0][1] in xmm7, which
 * MIX_MISC then uses for row 0 without reloading them. */
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"sub $8, %0 \n"
|
||||||
|
"movss (%2,%0), %%xmm6 \n"
|
||||||
|
"movss 4(%2,%0), %%xmm7 \n"
|
||||||
|
"shufps $0, %%xmm6, %%xmm6 \n"
|
||||||
|
"shufps $0, %%xmm7, %%xmm7 \n"
|
||||||
|
"movaps %%xmm6, (%1,%0,4) \n"
|
||||||
|
"movaps %%xmm7, 16(%1,%0,4) \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
:"+&r"(j)
|
||||||
|
:"r"(matrix_simd), "r"(matrix)
|
||||||
|
:"memory"
|
||||||
|
);
|
||||||
|
if(out_ch == 2) {
|
||||||
|
MIX_MISC(IF1);
|
||||||
|
} else {
|
||||||
|
MIX_MISC(IF0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void vector_fmul_3dnow(float *dst, const float *src, int len){
|
static void vector_fmul_3dnow(float *dst, const float *src, int len){
|
||||||
x86_reg i = (len-4)*4;
|
x86_reg i = (len-4)*4;
|
||||||
asm volatile(
|
asm volatile(
|
||||||
@ -2682,6 +2781,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
|||||||
}
|
}
|
||||||
if(mm_flags & MM_SSE){
|
if(mm_flags & MM_SSE){
|
||||||
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
|
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
|
||||||
|
c->ac3_downmix = ac3_downmix_sse;
|
||||||
c->vector_fmul = vector_fmul_sse;
|
c->vector_fmul = vector_fmul_sse;
|
||||||
c->vector_fmul_reverse = vector_fmul_reverse_sse;
|
c->vector_fmul_reverse = vector_fmul_reverse_sse;
|
||||||
c->vector_fmul_add_add = vector_fmul_add_add_sse;
|
c->vector_fmul_add_add = vector_fmul_add_add_sse;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user