x86: sbrdsp: implement SSE2 hf_apply_noise
From 233 to 105 cycles on Arrandale and Win64. Replacing the multiplication by s_m[m] with a pand and a pxor using appropriate vectors is slower. Unrolling is a 15-cycle win. An SSE version was 4 cycles slower. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
committed by
Michael Niedermayer
parent
380cfce2b2
commit
76c7277385
@@ -38,6 +38,19 @@ void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
|
||||
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
|
||||
void ff_sbr_qmf_pre_shuffle_sse2(float *z);
|
||||
|
||||
void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
|
||||
const float *q_filt, int noise,
|
||||
int kx, int m_max);
|
||||
void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
|
||||
const float *q_filt, int noise,
|
||||
int kx, int m_max);
|
||||
void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
|
||||
const float *q_filt, int noise,
|
||||
int kx, int m_max);
|
||||
void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
|
||||
const float *q_filt, int noise,
|
||||
int kx, int m_max);
|
||||
|
||||
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
|
||||
{
|
||||
int mm_flags = av_get_cpu_flags();
|
||||
@@ -55,5 +68,9 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
|
||||
if (EXTERNAL_SSE2(mm_flags)) {
|
||||
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
|
||||
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
|
||||
s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
|
||||
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
|
||||
s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
|
||||
s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user