lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX
The current SSE version is slower than the MMX version on both Athlon64 and Sandy Bridge, whereas the new SSE4 and AVX versions are faster than the MMX version on Sandy Bridge.
This commit is contained in:
parent
0b45334a58
commit
5cc6d5244d
@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
|
||||
mova m3, [srcq+src3q]
|
||||
mova m4, [srcq+src4q]
|
||||
mova m5, [srcq+src5q]
|
||||
%if cpuflag(sse)
|
||||
%if cpuflag(sse4)
|
||||
SBUTTERFLYPS 0, 1, 6
|
||||
SBUTTERFLYPS 2, 3, 6
|
||||
SBUTTERFLYPS 4, 5, 6
|
||||
|
||||
movaps m6, m4
|
||||
shufps m4, m0, q3210
|
||||
blendps m6, m4, m0, 1100b
|
||||
movlhps m0, m2
|
||||
movhlps m6, m2
|
||||
movaps [dstq ], m0
|
||||
movaps [dstq+16], m4
|
||||
movaps [dstq+32], m6
|
||||
|
||||
movaps m6, m5
|
||||
shufps m5, m1, q3210
|
||||
movhlps m4, m2
|
||||
blendps m2, m5, m1, 1100b
|
||||
movlhps m1, m3
|
||||
movhlps m6, m3
|
||||
movhlps m5, m3
|
||||
|
||||
movaps [dstq ], m0
|
||||
movaps [dstq+16], m6
|
||||
movaps [dstq+32], m4
|
||||
movaps [dstq+48], m1
|
||||
movaps [dstq+64], m5
|
||||
movaps [dstq+80], m6
|
||||
movaps [dstq+64], m2
|
||||
movaps [dstq+80], m5
|
||||
%else ; mmx
|
||||
SBUTTERFLY dq, 0, 1, 6
|
||||
SBUTTERFLY dq, 2, 3, 6
|
||||
@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
|
||||
|
||||
INIT_MMX mmx
|
||||
CONV_FLTP_TO_FLT_6CH
|
||||
INIT_XMM sse
|
||||
INIT_XMM sse4
|
||||
CONV_FLTP_TO_FLT_6CH
|
||||
%if HAVE_AVX
|
||||
INIT_XMM avx
|
||||
CONV_FLTP_TO_FLT_6CH
|
||||
%endif
|
||||
|
@ -22,8 +22,9 @@
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavresample/audio_convert.h"
|
||||
|
||||
extern void ff_conv_fltp_to_flt_6ch_mmx(float *dst, float *const *src, int len);
|
||||
extern void ff_conv_fltp_to_flt_6ch_sse(float *dst, float *const *src, int len);
|
||||
extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
|
||||
extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
|
||||
extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
|
||||
|
||||
av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
|
||||
{
|
||||
@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
|
||||
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
|
||||
6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
|
||||
}
|
||||
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
|
||||
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
|
||||
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
|
||||
6, 16, 4, "SSE", ff_conv_fltp_to_flt_6ch_sse);
|
||||
6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
|
||||
}
|
||||
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
|
||||
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
|
||||
6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -42,10 +42,9 @@
|
||||
%endmacro
|
||||
|
||||
%macro SBUTTERFLYPS 3
|
||||
movaps m%3, m%1
|
||||
unpcklps m%1, m%2
|
||||
unpckhps m%3, m%2
|
||||
SWAP %2, %3
|
||||
unpcklps m%3, m%1, m%2
|
||||
unpckhps m%1, m%1, m%2
|
||||
SWAP %1, %3, %2
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE4x4B 5
|
||||
|
Loading…
x
Reference in New Issue
Block a user