SSE2 optimizations for Monkey's Audio decoder vector functions
Originally committed as revision 14161 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
3a8322b133
commit
d7e1fc4254
@ -2092,6 +2092,79 @@ extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_st
|
||||
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
|
||||
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
|
||||
|
||||
|
||||
static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
|
||||
{
|
||||
x86_reg o = -(order << 1);
|
||||
v1 += order;
|
||||
v2 += order;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movdqu (%1,%2), %%xmm0 \n\t"
|
||||
"movdqu 16(%1,%2), %%xmm1 \n\t"
|
||||
"paddw (%0,%2), %%xmm0 \n\t"
|
||||
"paddw 16(%0,%2), %%xmm1 \n\t"
|
||||
"movdqa %%xmm0, (%0,%2) \n\t"
|
||||
"movdqa %%xmm1, 16(%0,%2) \n\t"
|
||||
"add $32, %2 \n\t"
|
||||
"js 1b \n\t"
|
||||
: "+r"(v1), "+r"(v2), "+r"(o)
|
||||
);
|
||||
}
|
||||
|
||||
static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
|
||||
{
|
||||
x86_reg o = -(order << 1);
|
||||
v1 += order;
|
||||
v2 += order;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movdqa (%0,%2), %%xmm0 \n\t"
|
||||
"movdqa 16(%0,%2), %%xmm2 \n\t"
|
||||
"movdqu (%1,%2), %%xmm1 \n\t"
|
||||
"movdqu 16(%1,%2), %%xmm3 \n\t"
|
||||
"psubw %%xmm1, %%xmm0 \n\t"
|
||||
"psubw %%xmm3, %%xmm2 \n\t"
|
||||
"movdqa %%xmm0, (%0,%2) \n\t"
|
||||
"movdqa %%xmm2, 16(%0,%2) \n\t"
|
||||
"add $32, %2 \n\t"
|
||||
"js 1b \n\t"
|
||||
: "+r"(v1), "+r"(v2), "+r"(o)
|
||||
);
|
||||
}
|
||||
|
||||
static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
|
||||
{
|
||||
int res = 0;
|
||||
DECLARE_ALIGNED_16(int64_t, sh);
|
||||
x86_reg o = -(order << 1);
|
||||
|
||||
v1 += order;
|
||||
v2 += order;
|
||||
sh = shift;
|
||||
asm volatile(
|
||||
"pxor %%xmm7, %%xmm7 \n\t"
|
||||
"1: \n\t"
|
||||
"movdqu (%0,%3), %%xmm0 \n\t"
|
||||
"movdqu 16(%0,%3), %%xmm1 \n\t"
|
||||
"pmaddwd (%1,%3), %%xmm0 \n\t"
|
||||
"pmaddwd 16(%1,%3), %%xmm1 \n\t"
|
||||
"paddd %%xmm0, %%xmm7 \n\t"
|
||||
"paddd %%xmm1, %%xmm7 \n\t"
|
||||
"add $32, %3 \n\t"
|
||||
"js 1b \n\t"
|
||||
"movhlps %%xmm7, %%xmm2 \n\t"
|
||||
"paddd %%xmm2, %%xmm7 \n\t"
|
||||
"psrad %4, %%xmm7 \n\t"
|
||||
"pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
|
||||
"paddd %%xmm2, %%xmm7 \n\t"
|
||||
"movd %%xmm7, %2 \n\t"
|
||||
: "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
|
||||
: "m"(sh)
|
||||
);
|
||||
return res;
|
||||
}
|
||||
|
||||
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
{
|
||||
mm_flags = mm_support();
|
||||
@ -2463,6 +2536,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
}
|
||||
if(mm_flags & MM_3DNOW)
|
||||
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
|
||||
if(mm_flags & MM_SSE2){
|
||||
c->add_int16 = add_int16_sse2;
|
||||
c->sub_int16 = sub_int16_sse2;
|
||||
c->scalarproduct_int16 = scalarproduct_int16_sse2;
|
||||
}
|
||||
}
|
||||
|
||||
if (ENABLE_ENCODERS)
|
||||
|
Loading…
Reference in New Issue
Block a user