diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index dd6061cb09..9cf2866ef3 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2092,6 +2092,79 @@ extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_st
 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride,
                                          uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
+
+static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+    x86_reg o = -(order << 1);
+    v1 += order;
+    v2 += order;
+    asm volatile(
+        "1:                          \n\t"
+        "movdqu   (%1,%2),   %%xmm0  \n\t"
+        "movdqu 16(%1,%2),   %%xmm1  \n\t"
+        "paddw    (%0,%2),   %%xmm0  \n\t"
+        "paddw  16(%0,%2),   %%xmm1  \n\t"
+        "movdqa   %%xmm0,    (%0,%2) \n\t"
+        "movdqa   %%xmm1,  16(%0,%2) \n\t"
+        "add      $32,       %2      \n\t"
+        "js 1b                       \n\t"
+        : "+r"(v1), "+r"(v2), "+r"(o)
+    );
+}
+
+static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+    x86_reg o = -(order << 1);
+    v1 += order;
+    v2 += order;
+    asm volatile(
+        "1:                          \n\t"
+        "movdqa   (%0,%2),   %%xmm0  \n\t"
+        "movdqa 16(%0,%2),   %%xmm2  \n\t"
+        "movdqu   (%1,%2),   %%xmm1  \n\t"
+        "movdqu 16(%1,%2),   %%xmm3  \n\t"
+        "psubw    %%xmm1,    %%xmm0  \n\t"
+        "psubw    %%xmm3,    %%xmm2  \n\t"
+        "movdqa   %%xmm0,    (%0,%2) \n\t"
+        "movdqa   %%xmm2,  16(%0,%2) \n\t"
+        "add      $32,       %2      \n\t"
+        "js 1b                       \n\t"
+        : "+r"(v1), "+r"(v2), "+r"(o)
+    );
+}
+
+static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+{
+    int res = 0;
+    DECLARE_ALIGNED_16(int64_t, sh);
+    x86_reg o = -(order << 1);
+
+    v1 += order;
+    v2 += order;
+    sh = shift;
+    asm volatile(
+        "pxor      %%xmm7,  %%xmm7        \n\t"
+        "1:                               \n\t"
+        "movdqu    (%0,%3), %%xmm0        \n\t"
+        "movdqu  16(%0,%3), %%xmm1        \n\t"
+        "pmaddwd   (%1,%3), %%xmm0        \n\t"
+        "pmaddwd 16(%1,%3), %%xmm1        \n\t"
+        "paddd     %%xmm0,  %%xmm7        \n\t"
+        "paddd     %%xmm1,  %%xmm7        \n\t"
+        "add       $32,     %3            \n\t"
+        "js 1b                            \n\t"
+        "movhlps   %%xmm7,  %%xmm2        \n\t"
+        "paddd     %%xmm2,  %%xmm7        \n\t"
+        "psrad     %4,      %%xmm7        \n\t"
+        "pshuflw   $0x4E,   %%xmm7,%%xmm2 \n\t"
+        "paddd     %%xmm2,  %%xmm7        \n\t"
+        "movd      %%xmm7,  %2            \n\t"
+        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
+        : "m"(sh)
+    );
+    return res;
+}
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -2463,6 +2536,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         }
         if(mm_flags & MM_3DNOW)
             c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+        if(mm_flags & MM_SSE2){
+            c->add_int16 = add_int16_sse2;
+            c->sub_int16 = sub_int16_sse2;
+            c->scalarproduct_int16 = scalarproduct_int16_sse2;
+        }
     }
 
     if (ENABLE_ENCODERS)
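
Note for reviewers: all three kernels use the usual dsputil loop idiom of
advancing the pointers past the end of the vectors and counting a negative
byte offset up toward zero ("add $32, ...; js 1b"), so one register serves
as both array index and loop counter. The asm also implies constraints on
callers: order must be a multiple of 16 int16_t elements, since every
iteration consumes 32 bytes; add_int16/sub_int16 need v1 16-byte aligned
(it is accessed with movdqa), and scalarproduct_int16 needs v2 16-byte
aligned (it is the memory operand of pmaddwd), while the movdqu operand in
each function may be unaligned.

For reference, below is a minimal scalar model of what the kernels compute.
This is a sketch, not the actual C fallbacks in dsputil.c, and the function
names are illustrative. Note in particular that scalarproduct_int16_sse2
applies the arithmetic shift to two folded partial sums before the final
add, so its result can differ slightly from this model when shift > 0.

#include <stdint.h>

static void add_int16_ref(int16_t *v1, const int16_t *v2, int order)
{
    int i;
    for (i = 0; i < order; i++)   /* v1 += v2, elementwise */
        v1[i] += v2[i];
}

static void sub_int16_ref(int16_t *v1, const int16_t *v2, int order)
{
    int i;
    for (i = 0; i < order; i++)   /* v1 -= v2, elementwise */
        v1[i] -= v2[i];
}

static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                       int order, int shift)
{
    int i;
    int32_t res = 0;              /* the asm keeps four dword accumulators */
    for (i = 0; i < order; i++)   /* dot product in 32-bit precision */
        res += v1[i] * v2[i];
    return res >> shift;          /* the asm shifts two partial sums instead */
}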