twinvq: add SSE/AVX optimized sum/difference stereo interleaving
This commit is contained in:
parent
7b966566da
commit
9d06037d48
@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reference C implementation of the interleaving sum/difference butterfly.
 *
 * For every index i in [0, len) this stores src0[i] + src1[i] at dst[2*i]
 * and src0[i] - src1[i] at dst[2*i + 1].
 *
 * @param dst  output vector, receives 2 * len floats
 * @param src0 first input vector
 * @param src1 second input vector
 * @param len  number of elements read from each input vector; nothing is
 *             written when it is zero or negative
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int remaining;

    for (remaining = len; remaining > 0; remaining--) {
        /* Fetch both operands before storing, then emit the (sum, diff) pair. */
        const float a = *src0++;
        const float b = *src1++;

        *dst++ = a + b;
        *dst++ = a - b;
    }
}
|
||||||
|
|
||||||
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
|
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
|
||||||
{
|
{
|
||||||
float p = 0.0;
|
float p = 0.0;
|
||||||
@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
|||||||
c->vector_clip_int32 = vector_clip_int32_c;
|
c->vector_clip_int32 = vector_clip_int32_c;
|
||||||
c->scalarproduct_float = scalarproduct_float_c;
|
c->scalarproduct_float = scalarproduct_float_c;
|
||||||
c->butterflies_float = butterflies_float_c;
|
c->butterflies_float = butterflies_float_c;
|
||||||
|
c->butterflies_float_interleave = butterflies_float_interleave_c;
|
||||||
c->vector_fmul_scalar = vector_fmul_scalar_c;
|
c->vector_fmul_scalar = vector_fmul_scalar_c;
|
||||||
c->vector_fmac_scalar = vector_fmac_scalar_c;
|
c->vector_fmac_scalar = vector_fmac_scalar_c;
|
||||||
|
|
||||||
|
@ -453,6 +453,23 @@ typedef struct DSPContext {
|
|||||||
*/
|
*/
|
||||||
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
|
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate the sum and difference of two vectors of floats and interleave
|
||||||
|
* results into a separate output vector of floats, with each sum
|
||||||
|
* positioned before the corresponding difference.
|
||||||
|
*
|
||||||
|
* @param dst output vector (receives 2 * len floats)
|
||||||
|
* constraints: 16-byte aligned
|
||||||
|
* @param src0 first input vector
|
||||||
|
* constraints: 32-byte aligned
|
||||||
|
* @param src1 second input vector
|
||||||
|
* constraints: 32-byte aligned
|
||||||
|
* @param len number of elements in each input vector
|
||||||
|
* constraints: multiple of 8
|
||||||
|
*/
|
||||||
|
void (*butterflies_float_interleave)(float *dst, const float *src0,
|
||||||
|
const float *src1, int len);
|
||||||
|
|
||||||
/* (I)DCT */
|
/* (I)DCT */
|
||||||
void (*fdct)(DCTELEM *block/* align 16*/);
|
void (*fdct)(DCTELEM *block/* align 16*/);
|
||||||
void (*fdct248)(DCTELEM *block/* align 16*/);
|
void (*fdct248)(DCTELEM *block/* align 16*/);
|
||||||
|
@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
|
|||||||
float *out)
|
float *out)
|
||||||
{
|
{
|
||||||
const ModeTab *mtab = tctx->mtab;
|
const ModeTab *mtab = tctx->mtab;
|
||||||
|
int size1, size2;
|
||||||
float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
|
float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
|
||||||
int i, j;
|
int i;
|
||||||
|
|
||||||
for (i = 0; i < tctx->avctx->channels; i++) {
|
for (i = 0; i < tctx->avctx->channels; i++) {
|
||||||
imdct_and_window(tctx, ftype, wtype,
|
imdct_and_window(tctx, ftype, wtype,
|
||||||
@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
|
|||||||
i);
|
i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size2 = tctx->last_block_pos[0];
|
||||||
|
size1 = mtab->size - size2;
|
||||||
if (tctx->avctx->channels == 2) {
|
if (tctx->avctx->channels == 2) {
|
||||||
for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
|
tctx->dsp.butterflies_float_interleave(out, prev_buf,
|
||||||
float f1 = prev_buf[ i];
|
&prev_buf[2*mtab->size],
|
||||||
float f2 = prev_buf[2*mtab->size + i];
|
size1);
|
||||||
out[2*i ] = f1 + f2;
|
|
||||||
out[2*i + 1] = f1 - f2;
|
out += 2 * size1;
|
||||||
}
|
|
||||||
for (j = 0; i < mtab->size; j++,i++) {
|
tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame,
|
||||||
float f1 = tctx->curr_frame[ j];
|
&tctx->curr_frame[2*mtab->size],
|
||||||
float f2 = tctx->curr_frame[2*mtab->size + j];
|
size2);
|
||||||
out[2*i ] = f1 + f2;
|
|
||||||
out[2*i + 1] = f1 - f2;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
memcpy(out, prev_buf,
|
memcpy(out, prev_buf, size1 * sizeof(*out));
|
||||||
(mtab->size - tctx->last_block_pos[0]) * sizeof(*out));
|
|
||||||
|
|
||||||
out += mtab->size - tctx->last_block_pos[0];
|
out += size1;
|
||||||
|
|
||||||
memcpy(out, tctx->curr_frame,
|
memcpy(out, tctx->curr_frame, size2 * sizeof(*out));
|
||||||
(tctx->last_block_pos[0]) * sizeof(*out));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -2424,6 +2424,11 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min
|
|||||||
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min,
|
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min,
|
||||||
int32_t max, unsigned int len);
|
int32_t max, unsigned int len);
|
||||||
|
|
||||||
|
extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
|
||||||
|
const float *src1, int len);
|
||||||
|
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
|
||||||
|
const float *src1, int len);
|
||||||
|
|
||||||
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
int mm_flags = av_get_cpu_flags();
|
int mm_flags = av_get_cpu_flags();
|
||||||
@ -2868,6 +2873,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
|||||||
c->vector_clipf = vector_clipf_sse;
|
c->vector_clipf = vector_clipf_sse;
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
c->scalarproduct_float = ff_scalarproduct_float_sse;
|
c->scalarproduct_float = ff_scalarproduct_float_sse;
|
||||||
|
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
|
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
|
||||||
@ -2925,6 +2931,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
|||||||
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
|
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
|
||||||
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
|
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
|
||||||
}
|
}
|
||||||
|
c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -1129,3 +1129,51 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
|
|||||||
%else
|
%else
|
||||||
VECTOR_CLIP_INT32 6, 1, 0, 0
|
VECTOR_CLIP_INT32 6, 1, 0, 0
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void ff_butterflies_float_interleave(float *dst, const float *src0,
|
||||||
|
; const float *src1, int len);
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
; Emits butterflies_float_interleave for the instruction set selected by the
; preceding INIT_XMM / INIT_YMM. Each loop iteration loads one vector from
; src0 and one from src1, computes their element-wise sum and difference,
; and stores the results to dst interleaved as sum, difference, sum, ...
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
|
||||||
|
; NOTE(review): per the x86inc cglobal convention this should declare
; 4 arguments, 4 general-purpose registers and 3 vector registers (m0-m2).
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
|
||||||
|
%ifdef ARCH_X86_64
|
||||||
|
; len is a 32-bit int; sign-extend it before using it as a 64-bit offset.
movsxd lenq, lend
|
||||||
|
%endif
|
||||||
|
; Nothing to do for len == 0.
test lenq, lenq
|
||||||
|
jz .end
|
||||||
|
; Convert the element count into a byte count (4 bytes per float).
shl lenq, 2
|
||||||
|
; Point src0/src1 just past their last input byte; dst advances twice as
; far because every input element produces two output elements.
lea src0q, [src0q + lenq]
|
||||||
|
lea src1q, [src1q + lenq]
|
||||||
|
lea dstq, [ dstq + 2*lenq]
|
||||||
|
; From here on lenq is a negative byte offset that counts up towards zero.
neg lenq
|
||||||
|
.loop:
|
||||||
|
mova m0, [src0q + lenq]
|
||||||
|
mova m1, [src1q + lenq]
|
||||||
|
; m2 = src0 - src1 (differences)
subps m2, m0, m1
|
||||||
|
; m0 = src0 + src1 (sums)
addps m0, m0, m1
|
||||||
|
; Interleave sums with differences: m1 takes the low elements of each
; 128-bit lane, m0 the high elements.
unpcklps m1, m0, m2
|
||||||
|
unpckhps m0, m0, m2
|
||||||
|
%if cpuflag(avx)
|
||||||
|
; The unpack instructions work within each 128-bit lane, so the four
; 16-byte pieces are stored in the order m1 low, m0 low, m1 high, m0 high.
; NOTE(review): 16-byte stores are presumably used because dst is only
; guaranteed 16-byte alignment -- confirm against the DSPContext docs.
vextractf128 [dstq + 2*lenq ], m1, 0
|
||||||
|
vextractf128 [dstq + 2*lenq + 16], m0, 0
|
||||||
|
vextractf128 [dstq + 2*lenq + 32], m1, 1
|
||||||
|
vextractf128 [dstq + 2*lenq + 48], m0, 1
|
||||||
|
%else
|
||||||
|
mova [dstq + 2*lenq ], m1
|
||||||
|
mova [dstq + 2*lenq + mmsize], m0
|
||||||
|
%endif
|
||||||
|
; Advance by one vector of input bytes; keep looping while the offset is
; still negative.
add lenq, mmsize
|
||||||
|
jl .loop
|
||||||
|
%if mmsize == 32
|
||||||
|
; ymm registers were used: clear their upper halves before returning.
vzeroupper
|
||||||
|
RET
|
||||||
|
%endif
|
||||||
|
; Reached via the len == 0 early exit, and by fall-through when mmsize != 32.
.end:
|
||||||
|
; NOTE(review): REP_RET is presumably the x86inc return macro intended for a
; return that is itself a branch target -- confirm in x86inc.asm.
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_XMM sse
|
||||||
|
BUTTERFLIES_FLOAT_INTERLEAVE
|
||||||
|
INIT_YMM avx
|
||||||
|
BUTTERFLIES_FLOAT_INTERLEAVE
|
||||||
|
Loading…
x
Reference in New Issue
Block a user