diff --git a/libavcodec/acelp_pitch_delay.c b/libavcodec/acelp_pitch_delay.c
index 395247dd2a..214a272c32 100644
--- a/libavcodec/acelp_pitch_delay.c
+++ b/libavcodec/acelp_pitch_delay.c
@@ -106,7 +106,7 @@ int16_t ff_acelp_decode_gain_code(
         mr_energy += quant_energy[i] * ma_prediction_coeff[i];

     mr_energy = gain_corr_factor * exp(M_LN10 / (20 << 23) * mr_energy) /
-                sqrt(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size, 0));
+                sqrt(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size));
     return mr_energy >> 12;
 }

diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 68e5b3ed42..b2931fe525 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -171,8 +171,7 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,

 void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);

-int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len,
-                                    int shift);
+int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);

 int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3, int len, int mul);
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 8bb58afb18..ea479bb580 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -29,31 +29,7 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.i16        q1,  #0
         vmov.i16        q2,  #0
         vmov.i16        q3,  #0
-        negs            r3,  r3
-        beq             2f
-
-        vdup.s32        q12, r3
-1:      vld1.16         {d16-d17}, [r0]!
-        vld1.16         {d20-d21}, [r1,:128]!
-        vmull.s16       q12, d16, d20
-        vld1.16         {d18-d19}, [r0]!
-        vmull.s16       q13, d17, d21
-        vld1.16         {d22-d23}, [r1,:128]!
-        vmull.s16       q14, d18, d22
-        vmull.s16       q15, d19, d23
-        vshl.s32        q8,  q12, q12
-        vshl.s32        q9,  q13, q12
-        vadd.s32        q0,  q0,  q8
-        vshl.s32        q10, q14, q12
-        vadd.s32        q1,  q1,  q9
-        vshl.s32        q11, q15, q12
-        vadd.s32        q2,  q2,  q10
-        vadd.s32        q3,  q3,  q11
-        subs            r2,  r2,  #16
-        bne             1b
-        b               3f
-
-2:      vld1.16         {d16-d17}, [r0]!
+1:      vld1.16         {d16-d17}, [r0]!
         vld1.16         {d20-d21}, [r1,:128]!
         vmlal.s16       q0,  d16, d20
         vld1.16         {d18-d19}, [r0]!
@@ -62,9 +38,9 @@ function ff_scalarproduct_int16_neon, export=1
         vmlal.s16       q2,  d18, d22
         vmlal.s16       q3,  d19, d23
         subs            r2,  r2,  #16
-        bne             2b
+        bne             1b

-3:      vpadd.s32       d16, d0,  d1
+        vpadd.s32       d16, d0,  d1
         vpadd.s32       d17, d2,  d3
         vpadd.s32       d10, d4,  d5
         vpadd.s32       d11, d6,  d7
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 29c5976596..f5b7d076d1 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2559,12 +2559,12 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
     }
 }

-static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
+static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
 {
     int res = 0;

     while (order--)
-        res += (*v1++ * *v2++) >> shift;
+        res += *v1++ * *v2++;

     return res;
 }
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 0a6165685e..aa026e15f5 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -536,9 +536,8 @@ typedef struct DSPContext {
     /**
      * Calculate scalar product of two vectors.
      * @param len length of vectors, should be multiple of 16
-     * @param shift number of bits to discard from product
      */
-    int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len, int shift);
+    int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len);
     /* ape functions */
     /**
      * Calculate scalar product of v1 and v2,
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 09940d147d..da08bdab50 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -35,13 +35,12 @@ pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 SECTION_TEXT

 %macro SCALARPRODUCT 1
-; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
-cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
+cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
     shl orderq, 1
     add v1q, orderq
     add v2q, orderq
     neg orderq
-    movd m3, shiftm
     pxor m2, m2
 .loop:
     movu m0, [v1q + orderq]
@@ -55,10 +54,8 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
 %if mmsize == 16
     movhlps m0, m2
     paddd   m2, m0
-    psrad   m2, m3
     pshuflw m0, m2, 0x4e
 %else
-    psrad   m2, m3
     pshufw  m0, m2, 0x4e
 %endif
     paddd   m2, m0
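The change removes the unused shift argument from scalarproduct_int16 everywhere: the C fallback in dsputil.c becomes a plain sum of products, and the only call site touched here, the ACELP gain-code decoder, simply drops its trailing 0. As a standalone illustration (not FFmpeg code; fc_norm and the sample values are made up), the resulting call pattern looks like this:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone sketch of the simplified reference implementation after this
 * patch; it mirrors scalarproduct_int16_c in libavcodec/dsputil.c but is
 * built on its own here, outside the DSPContext machinery. */
static int32_t scalarproduct_int16(const int16_t *v1, const int16_t *v2, int order)
{
    int32_t res = 0;

    while (order--)
        res += *v1++ * *v2++;
    return res;
}

/* Illustrative stand-in for the acelp_pitch_delay.c call site: the Euclidean
 * norm of the fixed-codebook vector is the square root of its dot product
 * with itself, so the old trailing shift argument (always 0 there) simply
 * goes away.  fc_norm is a hypothetical name, not FFmpeg's. */
static double fc_norm(const int16_t *fc_v, int subframe_size)
{
    return sqrt(scalarproduct_int16(fc_v, fc_v, subframe_size));
}

int main(void)
{
    /* len should be a multiple of 16, per the DSPContext documentation. */
    int16_t fc_v[16] = { 3, -4, 0, 0, 5, 12, 0, 0, 8, -6, 0, 0, 1, 0, 0, 0 };

    printf("fixed-codebook norm: %f\n", fc_norm(fc_v, 16));
    return 0;
}

Only the sqrt(dot(fc_v, fc_v)) denominator of the gain computation is sketched here; the MA prediction and exp() scaling around it in ff_acelp_decode_gain_code are unchanged by this patch.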
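For completeness, a small self-contained comparison of the old and new semantics (again illustrative C, not FFmpeg code; dot_old and dot_new are made-up names). With shift == 0, as passed at the call site above, the two are identical; a nonzero shift applied per product is not the same as shifting the final sum, so that behaviour could not have been recreated by the caller anyway:

#include <stdint.h>
#include <stdio.h>

/* Old behaviour: each 16x16-bit product is shifted before accumulation. */
static int32_t dot_old(const int16_t *v1, const int16_t *v2, int order, int shift)
{
    int32_t res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;
    return res;
}

/* New behaviour after this patch: plain sum of products. */
static int32_t dot_new(const int16_t *v1, const int16_t *v2, int order)
{
    int32_t res = 0;

    while (order--)
        res += *v1++ * *v2++;
    return res;
}

int main(void)
{
    int16_t a[16] = { 0 }, b[16] = { 0 };

    a[0] = a[1] = 1;
    b[0] = b[1] = 1;

    /* shift == 0: both give 2, which is why dropping the parameter is safe
     * for callers that always passed 0. */
    printf("shift 0: old=%d new=%d\n", dot_old(a, b, 16, 0), dot_new(a, b, 16));

    /* shift == 1: per-product shifting gives 0, shifting the sum gives 1,
     * so the two formulations are not interchangeable in general. */
    printf("shift 1: old=%d (new>>1)=%d\n", dot_old(a, b, 16, 1), dot_new(a, b, 16) >> 1);
    return 0;
}

The SIMD versions shrink accordingly: the x86 entry point now declares 3 xmm registers instead of 4 and drops the movd/psrad pair, and the NEON routine loses the entire vmull/vshl branch in favour of the single vmlal loop.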