Add float_interleave() to FmtConvertContext with x86-optimized versions.

Partially based on patches by clsid2 in ffdshow-tryout.
ff_float_interleave6() x86 improvements by Loren Merritt.
This commit is contained in:
Justin Ruggles 2011-04-24 17:50:17 -04:00
parent f907ad9b85
commit 32f8fb8ecf
4 changed files with 200 additions and 0 deletions

View File

@ -56,11 +56,31 @@ static void float_to_int16_interleave_c(int16_t *dst, const float **src,
} }
} }
/**
 * Interleave per-channel (planar) float arrays into one output array.
 *
 * Sample i of channel c is written to dst[i * channels + c].
 *
 * @param dst      destination array; must have room for len * channels floats
 * @param src      array of `channels` pointers, each pointing to len floats
 * @param len      number of samples to take from each channel
 * @param channels number of channels; nothing is written when it is <= 0
 */
void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
                           int channels)
{
    unsigned int i;

    if (channels == 2) {
        /* Index in size_t so that 2*i cannot wrap for very large len. */
        for (i = 0; i < len; i++) {
            dst[2 * (size_t)i]     = src[0][i];
            dst[2 * (size_t)i + 1] = src[1][i];
        }
    } else if (channels == 1 && len < INT_MAX / sizeof(float)) {
        /* Single channel: already "interleaved", a plain copy suffices.
         * Larger lengths fall through to the generic loop below. */
        memcpy(dst, src[0], len * sizeof(float));
    } else {
        int c;

        for (c = 0; c < channels; c++) {
            /* The output position is kept in size_t: a signed int index
             * would overflow once len * channels exceeds INT_MAX. */
            size_t j = (size_t)c;

            for (i = 0; i < len; i++, j += (size_t)channels)
                dst[j] = src[c][i];
        }
    }
}
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{ {
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
c->float_to_int16 = float_to_int16_c; c->float_to_int16 = float_to_int16_c;
c->float_to_int16_interleave = float_to_int16_interleave_c; c->float_to_int16_interleave = float_to_int16_interleave_c;
c->float_interleave = ff_float_interleave_c;
if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx); if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);

View File

@ -68,8 +68,17 @@ typedef struct FmtConvertContext {
*/ */
void (*float_to_int16_interleave)(int16_t *dst, const float **src, void (*float_to_int16_interleave)(int16_t *dst, const float **src,
long len, int channels); long len, int channels);
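/**
* Convert multiple arrays of float to an array of interleaved float.
*
* @param dst destination array of interleaved float
* @param src source array of float arrays, one for each channel
* @param len number of elements to convert from each channel
* @param channels number of channels
*/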
void (*float_interleave)(float *dst, const float **src, unsigned int len,
int channels);
} FmtConvertContext; } FmtConvertContext;
/**
 * Generic C implementation of float interleaving.
 *
 * Declared here so that other translation units (e.g. the x86 wrappers) can
 * call it for channel counts they have no specialised routine for.
 * Sample i of channel c is stored at dst[i * channels + c].
 *
 * @param dst      destination array of interleaved float
 * @param src      source array of float arrays, one for each channel
 * @param len      number of elements to convert from each channel
 * @param channels number of channels
 */
void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
int channels);
void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);

View File

@ -20,6 +20,7 @@
;****************************************************************************** ;******************************************************************************
%include "x86inc.asm" %include "x86inc.asm"
%include "x86util.asm"
section .text align=16 section .text align=16
@ -89,3 +90,143 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd %undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2 FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi %undef cvtps2pi
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
; BUTTERFLYPS a, b, tmp
; Interleave the packed single-precision floats of m<a> and m<b>:
;   low result  = unpcklps(a, b) -> left in m<a>
;   high result = unpckhps(a, b) -> computed in m<tmp>
; SWAP then exchanges the register names b and tmp, so after the macro m<b>
; refers to the high result and m<tmp> refers to the register that held the
; old b (free to be reused as scratch).
%macro BUTTERFLYPS 3
; keep a copy of a, since unpcklps overwrites its destination
movaps m%3, m%1
unpcklps m%1, m%2
unpckhps m%3, m%2
SWAP %2, %3
%endmacro
; FLOAT_INTERLEAVE6 suffix, nregs
;   %1 = function name suffix (mmx or sse), also selects the loop body
;   %2 = value forwarded as the 4th cglobal parameter
; Emits float_interleave6_<suffix>, which interleaves the six float arrays
; src[0]..src[5] into dst. Each loop iteration consumes mmsize bytes
; (mmsize/4 floats) from every channel and writes mmsize*6 bytes to dst.
; The loop body always runs at least once and repeats while the remaining
; count is > 0, so len is presumably expected to be a non-zero multiple of
; mmsize/4 -- TODO confirm against callers.
; In the comments below a..f denote channels 0..5 and the digit the sample
; index within the current iteration.
%macro FLOAT_INTERLEAVE6 2
cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
; Only the first two arguments are named/loaded as dst and src; len (the
; third argument) is handled by hand. On x86-64 it is copied out of r2d into
; r10d before r2 is reused below as src1; otherwise it is accessed in memory
; through r2m.
%ifdef ARCH_X86_64
%define lend r10d
mov lend, r2d
%else
%define lend dword r2m
%endif
; Load the channel pointers src[1]..src[5], then src[0] last because it
; overwrites the pointer to the pointer array itself.
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
; Turn src1..src5 into byte offsets relative to src[0], so that advancing
; srcq alone advances the read position of all six channels.
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
%ifidn %1, sse
; Load 4 floats from each channel (movaps: aligned 16-byte accesses).
movaps m0, [srcq]
movaps m1, [srcq+src1q]
movaps m2, [srcq+src2q]
movaps m3, [srcq+src3q]
movaps m4, [srcq+src4q]
movaps m5, [srcq+src5q]
; Pairwise interleave. Afterwards:
;   m0 = a0 b0 a1 b1   m1 = a2 b2 a3 b3
;   m2 = c0 d0 c1 d1   m3 = c2 d2 c3 d3
;   m4 = e0 f0 e1 f1   m5 = e2 f2 e3 f3
BUTTERFLYPS 0, 1, 6
BUTTERFLYPS 2, 3, 6
BUTTERFLYPS 4, 5, 6
; Samples 0 and 1 of all six channels:
;   m4 = e0 f0 a1 b1   (low half of m4, high half of m0)
;   m0 = a0 b0 c0 d0   (low half of m0, low half of m2)
;   m6 = c1 d1 e1 f1   (high half of m2, high half of old m4)
movaps m6, m4
shufps m4, m0, 0xe4
movlhps m0, m2
movhlps m6, m2
movaps [dstq ], m0
movaps [dstq+16], m4
movaps [dstq+32], m6
; Same pattern for samples 2 and 3:
;   m1 = a2 b2 c2 d2   m5 = e2 f2 a3 b3   m6 = c3 d3 e3 f3
movaps m6, m5
shufps m5, m1, 0xe4
movlhps m1, m3
movhlps m6, m3
movaps [dstq+48], m1
movaps [dstq+64], m5
movaps [dstq+80], m6
%else ; mmx
; Load 2 floats from each channel.
movq m0, [srcq]
movq m1, [srcq+src1q]
movq m2, [srcq+src2q]
movq m3, [srcq+src3q]
movq m4, [srcq+src4q]
movq m5, [srcq+src5q]
; Dword interleave of each channel pair. Judging by the store order below,
; SBUTTERFLY (from x86util.asm) leaves the sample-0 pairs in m0/m2/m4 and
; the sample-1 pairs in m1/m3/m5 -- verify against x86util.asm.
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
SBUTTERFLY dq, 4, 5, 6
movq [dstq ], m0
movq [dstq+ 8], m2
movq [dstq+16], m4
movq [dstq+24], m1
movq [dstq+32], m3
movq [dstq+40], m5
%endif
; Advance by one register's worth of input per channel and six registers'
; worth of output; lend counts remaining samples per channel.
add srcq, mmsize
add dstq, mmsize*6
sub lend, mmsize/4
jg .loop
%ifidn %1, mmx
; Leave MMX state before returning to the caller.
emms
%endif
REP_RET
%endmacro
; Instantiate float_interleave6_mmx and float_interleave6_sse.
; The second macro argument is forwarded to cglobal; the sse body uses
; registers m0..m6, hence 7.
INIT_MMX
FLOAT_INTERLEAVE6 mmx, 0
INIT_XMM
FLOAT_INTERLEAVE6 sse, 7
;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
; FLOAT_INTERLEAVE2 suffix, nregs
;   %1 = function name suffix (mmx or sse)
;   %2 = value forwarded as the 4th cglobal parameter
; Emits float_interleave2_<suffix>, which interleaves the two float arrays
; src[0] and src[1] into dst. MOVPS, PUNPCKLDQ and PUNPCKHDQ must be defined
; by the caller to the move/unpack instructions matching the register size.
; Each iteration consumes two registers (mmsize/2 floats) per channel and
; writes four registers to dst. The loop body always runs at least once and
; repeats while the remaining count is > 0, so len is presumably expected to
; be a non-zero multiple of mmsize/2 -- TODO confirm against callers.
%macro FLOAT_INTERLEAVE2 2
cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
    ; Load both channel pointers; src[0] last because it overwrites srcq.
    mov     src1q, [srcq+gprsize]
    mov      srcq, [srcq        ]
    ; Keep channel 1 as a byte offset from channel 0 so that only srcq has
    ; to be advanced inside the loop.
    sub     src1q, srcq
.loop:
    ; a = channel 0, b = channel 1; two registers of each per iteration.
    MOVPS      m0, [srcq             ]
    MOVPS      m1, [srcq+src1q       ]
    MOVPS      m3, [srcq      +mmsize]
    MOVPS      m4, [srcq+src1q+mmsize]

    ; m0 = low interleave, m2 = high interleave of the first register pair
    MOVPS      m2, m0
    PUNPCKLDQ  m0, m1
    PUNPCKHDQ  m2, m1

    ; m3 = low interleave, m1 = high interleave of the second register pair
    MOVPS      m1, m3
    PUNPCKLDQ  m3, m4
    PUNPCKHDQ  m1, m4

    MOVPS [dstq         ], m0
    MOVPS [dstq+1*mmsize], m2
    MOVPS [dstq+2*mmsize], m3
    MOVPS [dstq+3*mmsize], m1

    add      srcq, mmsize*2
    add      dstq, mmsize*4
    sub      lend, mmsize/2
    jg .loop
%ifidn %1, mmx
    ; Leave MMX state before returning to the caller.
    emms
%endif
    REP_RET
%endmacro
; Instantiate float_interleave2_mmx and float_interleave2_sse.
; The macro body is written in terms of MOVPS/PUNPCKLDQ/PUNPCKHDQ, which are
; mapped here to the instructions of the selected instruction set.

; MMX version: 64-bit moves and integer dword unpacks.
INIT_MMX
%define MOVPS movq
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 mmx, 0
; SSE version: aligned 128-bit moves and float unpacks; the body uses
; registers m0..m4, hence 5.
INIT_XMM
%define MOVPS movaps
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 sse, 5

View File

@ -235,11 +235,40 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
float_to_int16_interleave_3dnow(dst, src, len, channels); float_to_int16_interleave_3dnow(dst, src, len, channels);
} }
/*
 * Fixed-channel-count interleavers for 2 and 6 channels, in MMX and SSE
 * flavours. They are only declared here; presumably these are the routines
 * implemented in the accompanying x86 assembly file -- verify the symbol
 * naming there.
 * dst receives the interleaved output, src is the array of per-channel
 * input pointers and len is the number of samples per channel.
 */
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
/**
 * Interleave planar float channels, using the MMX routines when possible.
 *
 * Two- and six-channel input is handed to the dedicated MMX functions; any
 * other channel count is processed by the generic C implementation.
 *
 * @param dst      destination array of interleaved float
 * @param src      source array of float arrays, one for each channel
 * @param len      number of elements to convert from each channel
 * @param channels number of channels
 */
static void float_interleave_mmx(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    switch (channels) {
    case 2:
        ff_float_interleave2_mmx(dst, src, len);
        break;
    case 6:
        ff_float_interleave6_mmx(dst, src, len);
        break;
    default:
        ff_float_interleave_c(dst, src, len, channels);
        break;
    }
}
/**
 * Interleave planar float channels, using the SSE routines when possible.
 *
 * Two- and six-channel input is handed to the dedicated SSE functions; any
 * other channel count is processed by the generic C implementation.
 *
 * @param dst      destination array of interleaved float
 * @param src      source array of float arrays, one for each channel
 * @param len      number of elements to convert from each channel
 * @param channels number of channels
 */
static void float_interleave_sse(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    switch (channels) {
    case 2:
        ff_float_interleave2_sse(dst, src, len);
        break;
    case 6:
        ff_float_interleave6_sse(dst, src, len);
        break;
    default:
        ff_float_interleave_c(dst, src, len, channels);
        break;
    }
}
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{ {
int mm_flags = av_get_cpu_flags(); int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) { if (mm_flags & AV_CPU_FLAG_MMX) {
c->float_interleave = float_interleave_mmx;
if(mm_flags & AV_CPU_FLAG_3DNOW){ if(mm_flags & AV_CPU_FLAG_3DNOW){
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@ -256,6 +285,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
c->float_to_int16 = float_to_int16_sse; c->float_to_int16 = float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse;
c->float_interleave = float_interleave_sse;
} }
if(mm_flags & AV_CPU_FLAG_SSE2){ if(mm_flags & AV_CPU_FLAG_SSE2){
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;