Drop unused args from vector_fmul_add_add, simplify code, and rename

The src3 and step arguments to vector_fmul_add_add() are always zero
and one, respectively.  This removes these arguments from the function,
simplifies the code accordingly, and renames the function to better
match the new operation.

Originally committed as revision 20061 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Måns Rullgård
2009-09-27 16:51:54 +00:00
parent 7f5c14210b
commit 952e872198
5 changed files with 18 additions and 142 deletions

View File

@@ -2125,34 +2125,9 @@ static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *
);
}
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
const float *src2, int src3, int len, int step){
static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
const float *src2, int len){
x86_reg i = (len-4)*4;
if(step == 2 && src3 == 0){
dst += (len-4)*2;
__asm__ volatile(
"1: \n\t"
"movq (%2,%0), %%mm0 \n\t"
"movq 8(%2,%0), %%mm1 \n\t"
"pfmul (%3,%0), %%mm0 \n\t"
"pfmul 8(%3,%0), %%mm1 \n\t"
"pfadd (%4,%0), %%mm0 \n\t"
"pfadd 8(%4,%0), %%mm1 \n\t"
"movd %%mm0, (%1) \n\t"
"movd %%mm1, 16(%1) \n\t"
"psrlq $32, %%mm0 \n\t"
"psrlq $32, %%mm1 \n\t"
"movd %%mm0, 8(%1) \n\t"
"movd %%mm1, 24(%1) \n\t"
"sub $32, %1 \n\t"
"sub $16, %0 \n\t"
"jge 1b \n\t"
:"+r"(i), "+r"(dst)
:"r"(src0), "r"(src1), "r"(src2)
:"memory"
);
}
else if(step == 1 && src3 == 0){
__asm__ volatile(
"1: \n\t"
"movq (%2,%0), %%mm0 \n\t"
@@ -2169,47 +2144,11 @@ static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float
:"r"(dst), "r"(src0), "r"(src1), "r"(src2)
:"memory"
);
}
else
ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
__asm__ volatile("femms");
}
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
const float *src2, int src3, int len, int step){
static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
const float *src2, int len){
x86_reg i = (len-8)*4;
if(step == 2 && src3 == 0){
dst += (len-8)*2;
__asm__ volatile(
"1: \n\t"
"movaps (%2,%0), %%xmm0 \n\t"
"movaps 16(%2,%0), %%xmm1 \n\t"
"mulps (%3,%0), %%xmm0 \n\t"
"mulps 16(%3,%0), %%xmm1 \n\t"
"addps (%4,%0), %%xmm0 \n\t"
"addps 16(%4,%0), %%xmm1 \n\t"
"movss %%xmm0, (%1) \n\t"
"movss %%xmm1, 32(%1) \n\t"
"movhlps %%xmm0, %%xmm2 \n\t"
"movhlps %%xmm1, %%xmm3 \n\t"
"movss %%xmm2, 16(%1) \n\t"
"movss %%xmm3, 48(%1) \n\t"
"shufps $0xb1, %%xmm0, %%xmm0 \n\t"
"shufps $0xb1, %%xmm1, %%xmm1 \n\t"
"movss %%xmm0, 8(%1) \n\t"
"movss %%xmm1, 40(%1) \n\t"
"movhlps %%xmm0, %%xmm2 \n\t"
"movhlps %%xmm1, %%xmm3 \n\t"
"movss %%xmm2, 24(%1) \n\t"
"movss %%xmm3, 56(%1) \n\t"
"sub $64, %1 \n\t"
"sub $32, %0 \n\t"
"jge 1b \n\t"
:"+r"(i), "+r"(dst)
:"r"(src0), "r"(src1), "r"(src2)
:"memory"
);
}
else if(step == 1 && src3 == 0){
__asm__ volatile(
"1: \n\t"
"movaps (%2,%0), %%xmm0 \n\t"
@@ -2226,9 +2165,6 @@ static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *
:"r"(dst), "r"(src0), "r"(src1), "r"(src2)
:"memory"
);
}
else
ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
@@ -3077,7 +3013,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->ac3_downmix = ac3_downmix_sse;
c->vector_fmul = vector_fmul_sse;
c->vector_fmul_reverse = vector_fmul_reverse_sse;
c->vector_fmul_add_add = vector_fmul_add_add_sse;
c->vector_fmul_add = vector_fmul_add_sse;
c->vector_fmul_window = vector_fmul_window_sse;
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
c->vector_clipf = vector_clipf_sse;
@@ -3085,7 +3021,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->float_to_int16_interleave = float_to_int16_interleave_sse;
}
if(mm_flags & FF_MM_3DNOW)
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
if(mm_flags & FF_MM_SSE2){
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = float_to_int16_sse2;