lavr: add x86-optimized mixing functions
Adds optimized functions for mixing 3 through 8 input channels to 1 and 2 output channels in fltp or s16p format with flt coeffs.
This commit is contained in:
parent
79687079a9
commit
2f096bb10e
@ -246,9 +246,10 @@ static int handle_buffered_output(AVAudioResampleContext *avr,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int avresample_convert(AVAudioResampleContext *avr, void **output,
|
||||
int out_plane_size, int out_samples, void **input,
|
||||
int in_plane_size, int in_samples)
|
||||
int attribute_align_arg avresample_convert(AVAudioResampleContext *avr,
|
||||
void **output, int out_plane_size,
|
||||
int out_samples, void **input,
|
||||
int in_plane_size, int in_samples)
|
||||
{
|
||||
AudioData input_buffer;
|
||||
AudioData output_buffer;
|
||||
|
@ -226,3 +226,296 @@ MIX_1_TO_2_S16P_FLT
|
||||
INIT_XMM avx
|
||||
MIX_1_TO_2_S16P_FLT
|
||||
%endif
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
|
||||
; int len, int out_ch, int in_ch);
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
|
||||
; define some names to make the code clearer
|
||||
%assign in_channels %1
|
||||
%assign out_channels %2
|
||||
%assign stereo out_channels - 1
|
||||
%ifidn %3, s16p
|
||||
%assign is_s16 1
|
||||
%else
|
||||
%assign is_s16 0
|
||||
%endif
|
||||
|
||||
; determine how many matrix elements must go on the stack vs. mmregs
|
||||
%assign matrix_elements in_channels * out_channels
|
||||
%if is_s16
|
||||
%if stereo
|
||||
%assign needed_mmregs 7
|
||||
%else
|
||||
%assign needed_mmregs 5
|
||||
%endif
|
||||
%else
|
||||
%if stereo
|
||||
%assign needed_mmregs 4
|
||||
%else
|
||||
%assign needed_mmregs 3
|
||||
%endif
|
||||
%endif
|
||||
%assign matrix_elements_mm num_mmregs - needed_mmregs
|
||||
%if matrix_elements < matrix_elements_mm
|
||||
%assign matrix_elements_mm matrix_elements
|
||||
%endif
|
||||
%if matrix_elements_mm < matrix_elements
|
||||
%assign matrix_elements_stack matrix_elements - matrix_elements_mm
|
||||
%else
|
||||
%assign matrix_elements_stack 0
|
||||
%endif
|
||||
|
||||
cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7
|
||||
|
||||
; get aligned stack space if needed
|
||||
%if matrix_elements_stack > 0
|
||||
%if mmsize == 32
|
||||
%assign bkpreg %1 + 1
|
||||
%define bkpq r %+ bkpreg %+ q
|
||||
mov bkpq, rsp
|
||||
and rsp, ~(mmsize-1)
|
||||
sub rsp, matrix_elements_stack * mmsize
|
||||
%else
|
||||
%assign pad matrix_elements_stack * mmsize + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize))
|
||||
SUB rsp, pad
|
||||
%endif
|
||||
%endif
|
||||
|
||||
; load matrix pointers
|
||||
%define matrix0q r1q
|
||||
%define matrix1q r3q
|
||||
%if stereo
|
||||
mov matrix1q, [matrix0q+gprsize]
|
||||
%endif
|
||||
mov matrix0q, [matrix0q]
|
||||
|
||||
; define matrix coeff names
|
||||
%assign %%i 0
|
||||
%assign %%j needed_mmregs
|
||||
%rep in_channels
|
||||
%if %%i >= matrix_elements_mm
|
||||
CAT_XDEFINE mx_stack_0_, %%i, 1
|
||||
CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
|
||||
%else
|
||||
CAT_XDEFINE mx_stack_0_, %%i, 0
|
||||
CAT_XDEFINE mx_0_, %%i, m %+ %%j
|
||||
%assign %%j %%j+1
|
||||
%endif
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
%if stereo
|
||||
%assign %%i 0
|
||||
%rep in_channels
|
||||
%if in_channels + %%i >= matrix_elements_mm
|
||||
CAT_XDEFINE mx_stack_1_, %%i, 1
|
||||
CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
|
||||
%else
|
||||
CAT_XDEFINE mx_stack_1_, %%i, 0
|
||||
CAT_XDEFINE mx_1_, %%i, m %+ %%j
|
||||
%assign %%j %%j+1
|
||||
%endif
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
%endif
|
||||
|
||||
; load/splat matrix coeffs
|
||||
%assign %%i 0
|
||||
%rep in_channels
|
||||
%if mx_stack_0_ %+ %%i
|
||||
VBROADCASTSS m0, [matrix0q+4*%%i]
|
||||
mova mx_0_ %+ %%i, m0
|
||||
%else
|
||||
VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
|
||||
%endif
|
||||
%if stereo
|
||||
%if mx_stack_1_ %+ %%i
|
||||
VBROADCASTSS m0, [matrix1q+4*%%i]
|
||||
mova mx_1_ %+ %%i, m0
|
||||
%else
|
||||
VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
|
||||
%endif
|
||||
%endif
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
|
||||
; load channel pointers to registers as offsets from the first channel pointer
|
||||
%if ARCH_X86_64
|
||||
movsxd lenq, r2d
|
||||
%endif
|
||||
shl lenq, 2-is_s16
|
||||
%assign %%i 1
|
||||
%rep (in_channels - 1)
|
||||
%if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
|
||||
mov src5q, [src0q+%%i*gprsize]
|
||||
add src5q, lenq
|
||||
mov src %+ %%i %+ m, src5q
|
||||
%else
|
||||
mov src %+ %%i %+ q, [src0q+%%i*gprsize]
|
||||
add src %+ %%i %+ q, lenq
|
||||
%endif
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
mov src0q, [src0q]
|
||||
add src0q, lenq
|
||||
neg lenq
|
||||
.loop
|
||||
; for x86-32 with 7-8 channels we do not have enough gp registers for all src
|
||||
; pointers, so we have to load some of them from the stack each time
|
||||
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
|
||||
%if is_s16
|
||||
; mix with s16p input
|
||||
mova m0, [src0q+lenq]
|
||||
S16_TO_S32_SX 0, 1
|
||||
cvtdq2ps m0, m0
|
||||
cvtdq2ps m1, m1
|
||||
%if stereo
|
||||
mulps m2, m0, mx_1_0
|
||||
mulps m3, m1, mx_1_0
|
||||
%endif
|
||||
mulps m0, m0, mx_0_0
|
||||
mulps m1, m1, mx_0_0
|
||||
%assign %%i 1
|
||||
%rep (in_channels - 1)
|
||||
%if copy_src_from_stack
|
||||
%define src_ptr src5q
|
||||
%else
|
||||
%define src_ptr src %+ %%i %+ q
|
||||
%endif
|
||||
%if stereo
|
||||
%if copy_src_from_stack
|
||||
mov src_ptr, src %+ %%i %+ m
|
||||
%endif
|
||||
mova m4, [src_ptr+lenq]
|
||||
S16_TO_S32_SX 4, 5
|
||||
cvtdq2ps m4, m4
|
||||
cvtdq2ps m5, m5
|
||||
fmaddps m2, m4, mx_1_ %+ %%i, m2, m6
|
||||
fmaddps m3, m5, mx_1_ %+ %%i, m3, m6
|
||||
fmaddps m0, m4, mx_0_ %+ %%i, m0, m4
|
||||
fmaddps m1, m5, mx_0_ %+ %%i, m1, m5
|
||||
%else
|
||||
%if copy_src_from_stack
|
||||
mov src_ptr, src %+ %%i %+ m
|
||||
%endif
|
||||
mova m2, [src_ptr+lenq]
|
||||
S16_TO_S32_SX 2, 3
|
||||
cvtdq2ps m2, m2
|
||||
cvtdq2ps m3, m3
|
||||
fmaddps m0, m2, mx_0_ %+ %%i, m0, m4
|
||||
fmaddps m1, m3, mx_0_ %+ %%i, m1, m4
|
||||
%endif
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
%if stereo
|
||||
cvtps2dq m2, m2
|
||||
cvtps2dq m3, m3
|
||||
packssdw m2, m3
|
||||
mova [src1q+lenq], m2
|
||||
%endif
|
||||
cvtps2dq m0, m0
|
||||
cvtps2dq m1, m1
|
||||
packssdw m0, m1
|
||||
mova [src0q+lenq], m0
|
||||
%else
|
||||
; mix with fltp input
|
||||
%if stereo || mx_stack_0_0
|
||||
mova m0, [src0q+lenq]
|
||||
%endif
|
||||
%if stereo
|
||||
mulps m1, m0, mx_1_0
|
||||
%endif
|
||||
%if stereo || mx_stack_0_0
|
||||
mulps m0, m0, mx_0_0
|
||||
%else
|
||||
mulps m0, [src0q+lenq], mx_0_0
|
||||
%endif
|
||||
%assign %%i 1
|
||||
%rep (in_channels - 1)
|
||||
%if copy_src_from_stack
|
||||
%define src_ptr src5q
|
||||
mov src_ptr, src %+ %%i %+ m
|
||||
%else
|
||||
%define src_ptr src %+ %%i %+ q
|
||||
%endif
|
||||
; avoid extra load for mono if matrix is in a mm register
|
||||
%if stereo || mx_stack_0_ %+ %%i
|
||||
mova m2, [src_ptr+lenq]
|
||||
%endif
|
||||
%if stereo
|
||||
fmaddps m1, m2, mx_1_ %+ %%i, m1, m3
|
||||
%endif
|
||||
%if stereo || mx_stack_0_ %+ %%i
|
||||
fmaddps m0, m2, mx_0_ %+ %%i, m0, m2
|
||||
%else
|
||||
fmaddps m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
|
||||
%endif
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
mova [src0q+lenq], m0
|
||||
%if stereo
|
||||
mova [src1q+lenq], m1
|
||||
%endif
|
||||
%endif
|
||||
|
||||
add lenq, mmsize
|
||||
jl .loop
|
||||
; restore stack pointer
|
||||
%if matrix_elements_stack > 0
|
||||
%if mmsize == 32
|
||||
mov rsp, bkpq
|
||||
%else
|
||||
ADD rsp, pad
|
||||
%endif
|
||||
%endif
|
||||
; zero ymm high halves
|
||||
%if mmsize == 32
|
||||
vzeroupper
|
||||
%endif
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
|
||||
%assign %%i 3
|
||||
%rep 6
|
||||
INIT_XMM sse
|
||||
MIX_3_8_TO_1_2_FLT %%i, 1, fltp
|
||||
MIX_3_8_TO_1_2_FLT %%i, 2, fltp
|
||||
INIT_XMM sse2
|
||||
MIX_3_8_TO_1_2_FLT %%i, 1, s16p
|
||||
MIX_3_8_TO_1_2_FLT %%i, 2, s16p
|
||||
INIT_XMM sse4
|
||||
MIX_3_8_TO_1_2_FLT %%i, 1, s16p
|
||||
MIX_3_8_TO_1_2_FLT %%i, 2, s16p
|
||||
; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
|
||||
%if HAVE_AVX
|
||||
%if ARCH_X86_64 || %%i < 6
|
||||
INIT_YMM avx
|
||||
%else
|
||||
INIT_XMM avx
|
||||
%endif
|
||||
MIX_3_8_TO_1_2_FLT %%i, 1, fltp
|
||||
MIX_3_8_TO_1_2_FLT %%i, 2, fltp
|
||||
INIT_XMM avx
|
||||
MIX_3_8_TO_1_2_FLT %%i, 1, s16p
|
||||
MIX_3_8_TO_1_2_FLT %%i, 2, s16p
|
||||
%endif
|
||||
%if HAVE_FMA4
|
||||
%if ARCH_X86_64 || %%i < 6
|
||||
INIT_YMM fma4
|
||||
%else
|
||||
INIT_XMM fma4
|
||||
%endif
|
||||
MIX_3_8_TO_1_2_FLT %%i, 1, fltp
|
||||
MIX_3_8_TO_1_2_FLT %%i, 2, fltp
|
||||
INIT_XMM fma4
|
||||
MIX_3_8_TO_1_2_FLT %%i, 1, s16p
|
||||
MIX_3_8_TO_1_2_FLT %%i, 2, s16p
|
||||
%endif
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
MIX_3_8_TO_1_2_FLT_FUNCS
|
||||
|
@ -47,6 +47,129 @@ extern void ff_mix_1_to_2_s16p_flt_sse4(int16_t **src, float **matrix, int len,
|
||||
extern void ff_mix_1_to_2_s16p_flt_avx (int16_t **src, float **matrix, int len,
|
||||
int out_ch, int in_ch);
|
||||
|
||||
#define DEFINE_MIX_3_8_TO_1_2(chan) \
|
||||
extern void ff_mix_ ## chan ## _to_1_fltp_flt_sse(float **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
extern void ff_mix_ ## chan ## _to_2_fltp_flt_sse(float **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
\
|
||||
extern void ff_mix_ ## chan ## _to_1_s16p_flt_sse2(int16_t **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
extern void ff_mix_ ## chan ## _to_2_s16p_flt_sse2(int16_t **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
\
|
||||
extern void ff_mix_ ## chan ## _to_1_s16p_flt_sse4(int16_t **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
extern void ff_mix_ ## chan ## _to_2_s16p_flt_sse4(int16_t **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
\
|
||||
extern void ff_mix_ ## chan ## _to_1_fltp_flt_avx(float **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
extern void ff_mix_ ## chan ## _to_2_fltp_flt_avx(float **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
\
|
||||
extern void ff_mix_ ## chan ## _to_1_s16p_flt_avx(int16_t **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
extern void ff_mix_ ## chan ## _to_2_s16p_flt_avx(int16_t **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
\
|
||||
extern void ff_mix_ ## chan ## _to_1_fltp_flt_fma4(float **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
extern void ff_mix_ ## chan ## _to_2_fltp_flt_fma4(float **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
\
|
||||
extern void ff_mix_ ## chan ## _to_1_s16p_flt_fma4(int16_t **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch); \
|
||||
extern void ff_mix_ ## chan ## _to_2_s16p_flt_fma4(int16_t **src, \
|
||||
float **matrix, int len, \
|
||||
int out_ch, int in_ch);
|
||||
|
||||
DEFINE_MIX_3_8_TO_1_2(3)
|
||||
DEFINE_MIX_3_8_TO_1_2(4)
|
||||
DEFINE_MIX_3_8_TO_1_2(5)
|
||||
DEFINE_MIX_3_8_TO_1_2(6)
|
||||
DEFINE_MIX_3_8_TO_1_2(7)
|
||||
DEFINE_MIX_3_8_TO_1_2(8)
|
||||
|
||||
#define SET_MIX_3_8_TO_1_2(chan) \
|
||||
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 1, 16, 4, "SSE", \
|
||||
ff_mix_ ## chan ## _to_1_fltp_flt_sse); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 2, 16, 4, "SSE", \
|
||||
ff_mix_## chan ##_to_2_fltp_flt_sse); \
|
||||
} \
|
||||
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 1, 16, 8, "SSE2", \
|
||||
ff_mix_ ## chan ## _to_1_s16p_flt_sse2); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 2, 16, 8, "SSE2", \
|
||||
ff_mix_ ## chan ## _to_2_s16p_flt_sse2); \
|
||||
} \
|
||||
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) { \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 1, 16, 8, "SSE4", \
|
||||
ff_mix_ ## chan ## _to_1_s16p_flt_sse4); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 2, 16, 8, "SSE4", \
|
||||
ff_mix_ ## chan ## _to_2_s16p_flt_sse4); \
|
||||
} \
|
||||
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { \
|
||||
int ptr_align = 32; \
|
||||
int smp_align = 8; \
|
||||
if (ARCH_X86_32 || chan >= 6) { \
|
||||
ptr_align = 16; \
|
||||
smp_align = 4; \
|
||||
} \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 1, ptr_align, smp_align, "AVX", \
|
||||
ff_mix_ ## chan ## _to_1_fltp_flt_avx); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 2, ptr_align, smp_align, "AVX", \
|
||||
ff_mix_ ## chan ## _to_2_fltp_flt_avx); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 1, 16, 8, "AVX", \
|
||||
ff_mix_ ## chan ## _to_1_s16p_flt_avx); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 2, 16, 8, "AVX", \
|
||||
ff_mix_ ## chan ## _to_2_s16p_flt_avx); \
|
||||
} \
|
||||
if (mm_flags & AV_CPU_FLAG_FMA4 && HAVE_FMA4) { \
|
||||
int ptr_align = 32; \
|
||||
int smp_align = 8; \
|
||||
if (ARCH_X86_32 || chan >= 6) { \
|
||||
ptr_align = 16; \
|
||||
smp_align = 4; \
|
||||
} \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 1, ptr_align, smp_align, "FMA4", \
|
||||
ff_mix_ ## chan ## _to_1_fltp_flt_fma4); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 2, ptr_align, smp_align, "FMA4", \
|
||||
ff_mix_ ## chan ## _to_2_fltp_flt_fma4); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 1, 16, 8, "FMA4", \
|
||||
ff_mix_ ## chan ## _to_1_s16p_flt_fma4); \
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
|
||||
chan, 2, 16, 8, "FMA4", \
|
||||
ff_mix_ ## chan ## _to_2_s16p_flt_fma4); \
|
||||
}
|
||||
|
||||
av_cold void ff_audio_mix_init_x86(AudioMix *am)
|
||||
{
|
||||
#if HAVE_YASM
|
||||
@ -80,5 +203,12 @@ av_cold void ff_audio_mix_init_x86(AudioMix *am)
|
||||
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
|
||||
1, 2, 16, 8, "AVX", ff_mix_1_to_2_s16p_flt_avx);
|
||||
}
|
||||
|
||||
SET_MIX_3_8_TO_1_2(3)
|
||||
SET_MIX_3_8_TO_1_2(4)
|
||||
SET_MIX_3_8_TO_1_2(5)
|
||||
SET_MIX_3_8_TO_1_2(6)
|
||||
SET_MIX_3_8_TO_1_2(7)
|
||||
SET_MIX_3_8_TO_1_2(8)
|
||||
#endif
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user