4cb6964244
The vector dequantization has a test inside its loop that prevents an effective SIMD implementation. By moving that test out of the loop, the loop can be DSPized. Therefore, modify the current DSP implementation accordingly. In particular, the DSP implementation no longer has to handle null loop sizes.

The decode_hf implementations have the following timings on x86 Arrandale:

          C    SSE   SSE2   SSE4
win32:  260    162    119    104
win64:  242    N/A     89     72

The ARM NEON optimizations follow in a later patch as external asm. The now-unused check for the y modifier in ARM inline asm is removed from configure.
86 lines
3.2 KiB
C
86 lines
3.2 KiB
C
/*
|
|
* Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
|
|
*
|
|
* This file is part of Libav.
|
|
*
|
|
* Libav is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* Libav is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with Libav; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "libavutil/attributes.h"
|
|
#include "libavutil/cpu.h"
|
|
#include "libavutil/x86/cpu.h"
|
|
#include "libavcodec/dcadsp.h"
|
|
|
|
/*
 * decode_hf kernels, one per instruction-set level (SSE, SSE2, SSE4).
 *
 * The three declarations share one signature; ff_dcadsp_init_x86() picks
 * one of them for DCADSPContext.decode_hf depending on the CPU flags.
 * The definitions are not in this file (presumably external x86 assembly
 * -- confirm against the matching .asm source).
 *
 * NOTE(review): start/end look like a subband index range into dst,
 * vq_num and scale; the accompanying commit note says the DSP routine no
 * longer has to cope with an empty range -- confirm against the caller.
 */
void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8],
                      const int vq_num[DCA_SUBBANDS],
                      const int8_t hf_vq[1024][32],
                      intptr_t vq_offset,
                      int scale[DCA_SUBBANDS][2],
                      intptr_t start, intptr_t end);

void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8],
                       const int vq_num[DCA_SUBBANDS],
                       const int8_t hf_vq[1024][32],
                       intptr_t vq_offset,
                       int scale[DCA_SUBBANDS][2],
                       intptr_t start, intptr_t end);

void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8],
                       const int vq_num[DCA_SUBBANDS],
                       const int8_t hf_vq[1024][32],
                       intptr_t vq_offset,
                       int scale[DCA_SUBBANDS][2],
                       intptr_t start, intptr_t end);
|
|
/*
 * SSE routines that ff_dcadsp_init_x86() stores in DCADSPContext.lfe_fir[0]
 * and DCADSPContext.lfe_fir[1] respectively.  Both read from `in` and
 * `coefs` (const) and take a writable `out`; they are defined outside
 * this file.
 */
void ff_dca_lfe_fir0_sse(float *out,
                         const float *in,
                         const float *coefs);
void ff_dca_lfe_fir1_sse(float *out,
                         const float *in,
                         const float *coefs);
|
|
|
|
/**
 * Install the x86-optimised routines into a DCA DSP context.
 *
 * The CPU flags are queried once; each supported instruction-set level
 * then overwrites the function pointers set by the previous, lower level,
 * so the last matching check wins (SSE, then SSE2, then SSE4).
 *
 * @param s context whose decode_hf and lfe_fir[] pointers are updated;
 *          pointers for which no check matches are left untouched
 */
av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
    int flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(flags)) {
        s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
        s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
        /* The plain-SSE decode_hf is only hooked up on 32-bit builds. */
#if ARCH_X86_32
        s->decode_hf  = ff_decode_hf_sse;
#endif
    }

    /* Higher levels replace decode_hf only; lfe_fir[] keeps the SSE code. */
    if (EXTERNAL_SSE2(flags)) {
        s->decode_hf = ff_decode_hf_sse2;
    }

    if (EXTERNAL_SSE4(flags)) {
        s->decode_hf = ff_decode_hf_sse4;
    }
}
|
|
|
|
/*
 * SSE2 inner routine called by synth_filter_sse2() after the half IMDCT.
 * It is defined outside this file (presumably external x86 assembly --
 * confirm against the matching .asm source).
 */
void ff_synth_filter_inner_sse2(float *synth_buf_ptr,
                                float synth_buf2[32],
                                const float window[512],
                                float out[32],
                                intptr_t offset,
                                float scale);
|
|
|
|
/**
 * SSE2 flavour of the float synthesis filter.
 *
 * Runs imdct->imdct_half() on `in`, targeting the slot of the synthesis
 * buffer selected by the current offset, passes that slot on to
 * ff_synth_filter_inner_sse2(), and finally steps the stored offset back
 * by 32, wrapped into the range 0..511.
 *
 * @param imdct            transform context providing imdct_half()
 * @param synth_buf_ptr    base of the synthesis buffer
 * @param synth_buf_offset in/out: current offset into synth_buf_ptr,
 *                         updated on return
 * @param synth_buf2       forwarded unchanged to the inner routine
 * @param window           forwarded unchanged to the inner routine
 * @param out              forwarded unchanged to the inner routine
 * @param in               input handed to imdct_half()
 * @param scale            forwarded unchanged to the inner routine
 */
static void synth_filter_sse2(FFTContext *imdct,
                              float *synth_buf_ptr, int *synth_buf_offset,
                              float synth_buf2[32], const float window[512],
                              float out[32], const float in[32], float scale)
{
    const int cur_offset = *synth_buf_offset;
    float *slot          = synth_buf_ptr + cur_offset;

    imdct->imdct_half(imdct, slot, in);
    ff_synth_filter_inner_sse2(slot, synth_buf2, window, out,
                               cur_offset, scale);

    /* Step back by 32; the mask keeps the result within 0..511. */
    *synth_buf_offset = (cur_offset - 32) & 511;
}
|
|
|
|
/**
 * Install the x86-optimised synthesis filter into the given context.
 *
 * @param s context whose synth_filter_float pointer is set to
 *          synth_filter_sse2 when the SSE2 check passes; otherwise it is
 *          left untouched
 */
av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
{
    int flags = av_get_cpu_flags();

    if (!EXTERNAL_SSE2(flags)) {
        return;
    }

    s->synth_filter_float = synth_filter_sse2;
}
|