Optimized the function WebRtcSpl_ScaleAndAddVectorsWithRound() for ARM-NEON platforms, and refactored it for generic C.

We moved it out of ilbc_specific_functions.c, since it is not used only in iLBC. Passed the unit test. Review URL: https://webrtc-codereview.appspot.com/426009 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1904 4adac7df-926f-26a2-2b94-8c16560cd09d
2012-03-16 16:29:37 +00:00
parent 7e26ad3828
commit bb966ca835
5 changed files with 147 additions and 56 deletions
--- a/src/common_audio/signal_processing/Android.mk
+++ b/src/common_audio/signal_processing/Android.mk
@@ -57,7 +57,8 @@ ifeq ($(ARCH_ARM_HAVE_NEON),true)
 LOCAL_SRC_FILES += \
    cross_correlation_neon.s \
    downsample_fast_neon.s \
-    min_max_operations_neon.s
+    min_max_operations_neon.s \
    vector_scaling_operations_neon.s
 LOCAL_CFLAGS += \
    $(MY_ARM_CFLAGS_NEON)
 else
--- a/src/common_audio/signal_processing/ilbc_specific_functions.c
+++ b/src/common_audio/signal_processing/ilbc_specific_functions.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -11,35 +11,16 @@
 /*
 * This file contains implementations of the iLBC specific functions
 * WebRtcSpl_ScaleAndAddVectorsWithRound()
 * WebRtcSpl_ReverseOrderMultArrayElements()
 * WebRtcSpl_ElementwiseVectorMult()
 * WebRtcSpl_AddVectorsAndShift()
 * WebRtcSpl_AddAffineVectorToVector()
 * WebRtcSpl_AffineTransformVector()
 *
 * The description header can be found in signal_processing_library.h
 *
 */
 #include "signal_processing_library.h"
 /*
  * Scales two vectors, adds them with rounding and shifts the sum right:
  *
  *   out[i] = (vector1[i] * scale1 + vector2[i] * scale2 + roundVal)
  *                >> right_shifts,   for 0 <= i < vector_length,
  *
  * where roundVal = (1 << right_shifts) >> 1.
  *
  * No argument validation is performed and nothing is returned.
  */
 void WebRtcSpl_ScaleAndAddVectorsWithRound(WebRtc_Word16 *vector1, WebRtc_Word16 scale1,
                                            WebRtc_Word16 *vector2, WebRtc_Word16 scale2,
                                            WebRtc_Word16 right_shifts, WebRtc_Word16 *out,
                                            WebRtc_Word16 vector_length)
 {
    int i;
    /* Held in an int rather than a 16-bit type: 1 << 15 does not fit in a
     * signed 16-bit variable, so a 16-bit holder yields a wrong rounding term
     * once right_shifts reaches 15. */
    int roundVal = (1 << right_shifts) >> 1;

    for (i = 0; i < vector_length; i++)
    {
        out[i] = (WebRtc_Word16)((WEBRTC_SPL_MUL_16_16(vector1[i], scale1)
                + WEBRTC_SPL_MUL_16_16(vector2[i], scale2) + roundVal) >> right_shifts);
    }
 }
 void WebRtcSpl_ReverseOrderMultArrayElements(WebRtc_Word16 *out, G_CONST WebRtc_Word16 *in,
                                             G_CONST WebRtc_Word16 *win,
                                             WebRtc_Word16 vector_length,
--- a/src/common_audio/signal_processing/include/signal_processing_library.h
+++ b/src/common_audio/signal_processing/include/signal_processing_library.h
@@ -268,17 +268,37 @@ void WebRtcSpl_ScaleAndAddVectors(G_CONST WebRtc_Word16* in_vector1,
                                  WebRtc_Word16 gain2, int right_shifts2,
                                  WebRtc_Word16* out_vector,
                                  int vector_length);
 // Performs the vector operation:
 //   out_vector[k] = ((scale1 * in_vector1[k]) + (scale2 * in_vector2[k])
 //        + round_value) >> right_shifts,
 //   where  round_value = (1 << right_shifts) >> 1.
 //
 // Input:
 //      - in_vector1       : Input vector 1
 //      - in_vector1_scale : Gain to be used for vector 1
 //      - in_vector2       : Input vector 2
 //      - in_vector2_scale : Gain to be used for vector 2
 //      - right_shifts     : Number of right bit shifts to be applied
 //      - length           : Number of elements in the input vectors
 //
 // Output:
 //      - out_vector       : Output vector
 // Return value            : 0 if OK, -1 if (in_vector1 == NULL
 //                           || in_vector2 == NULL || out_vector == NULL
 //                           || length <= 0 || right_shifts < 0).
 int WebRtcSpl_ScaleAndAddVectorsWithRound(const int16_t* in_vector1,
                                          int16_t in_vector1_scale,
                                          const int16_t* in_vector2,
                                          int16_t in_vector2_scale,
                                          int right_shifts,
                                          int16_t* out_vector,
                                          int length);
 // End: Vector scaling operations.
 // iLBC specific functions. Implementations in ilbc_specific_functions.c.
 // Description at bottom of file.
 void WebRtcSpl_ScaleAndAddVectorsWithRound(WebRtc_Word16* in_vector1,
                                           WebRtc_Word16 scale1,
                                           WebRtc_Word16* in_vector2,
                                           WebRtc_Word16 scale2,
                                           WebRtc_Word16 right_shifts,
                                           WebRtc_Word16* out_vector,
                                           WebRtc_Word16 vector_length);
 void WebRtcSpl_ReverseOrderMultArrayElements(WebRtc_Word16* out_vector,
                                             G_CONST WebRtc_Word16* in_vector,
                                             G_CONST WebRtc_Word16* window,
@@ -991,30 +1011,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
 //      - out_vector    : Output vector
 //
 //
 // WebRtcSpl_ScaleAndAddVectorsWithRound(...)
 //
 // Performs the vector operation:
 //
 //  out_vector[k] = ((scale1*in_vector1[k]) + (scale2*in_vector2[k])
 //                      + round_value) >> right_shifts
 //
 //      where:
 //
 //  round_value = (1<<right_shifts)>>1
 //
 // Input:
 //      - in_vector1    : Input vector 1
 //      - scale1        : Gain to be used for vector 1
 //      - in_vector2    : Input vector 2
 //      - scale2        : Gain to be used for vector 2
 //      - right_shifts  : Number of right bit shifts to be applied
 //      - vector_length : Number of elements in the input vectors
 //
 // Output:
 //      - out_vector    : Output vector
 //
 //
 // WebRtcSpl_ReverseOrderMultArrayElements(...)
 //
--- a/src/common_audio/signal_processing/vector_scaling_operations.c
+++ b/src/common_audio/signal_processing/vector_scaling_operations.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -17,9 +17,7 @@
 * WebRtcSpl_ScaleVector()
 * WebRtcSpl_ScaleVectorWithSat()
 * WebRtcSpl_ScaleAndAddVectors()
- *
+ * WebRtcSpl_ScaleAndAddVectorsWithRound()
 * The description header can be found in signal_processing_library.h
 *
 */
 #include "signal_processing_library.h"
@@ -149,3 +147,30 @@ void WebRtcSpl_ScaleAndAddVectors(G_CONST WebRtc_Word16 *in1, WebRtc_Word16 gain
                + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(gain2, *in2ptr++, shift2);
    }
 }
 #if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
 // Scales two vectors, adds them with rounding and shifts the sum right:
 //
 //   out_vector[i] = (in_vector1[i] * in_vector1_scale
 //                    + in_vector2[i] * in_vector2_scale
 //                    + round_value) >> right_shifts,   for 0 <= i < length,
 //
 // where round_value = (1 << right_shifts) >> 1.
 //
 // Returns 0 on success, or -1 (without touching out_vector) if any pointer
 // is NULL, length <= 0 or right_shifts < 0.
 int WebRtcSpl_ScaleAndAddVectorsWithRound(const int16_t* in_vector1,
                                           int16_t in_vector1_scale,
                                           const int16_t* in_vector2,
                                           int16_t in_vector2_scale,
                                           int right_shifts,
                                           int16_t* out_vector,
                                           int length) {
   int i = 0;
   int round_value = 0;

   if (in_vector1 == NULL || in_vector2 == NULL || out_vector == NULL ||
       length <= 0 || right_shifts < 0) {
     return -1;
   }

   // Computed only after validation: shifting by a negative count is
   // undefined behaviour, so it must not be evaluated for rejected input.
   round_value = (1 << right_shifts) >> 1;

   for (i = 0; i < length; i++) {
     out_vector[i] = (int16_t)((
         WEBRTC_SPL_MUL_16_16(in_vector1[i], in_vector1_scale)
         + WEBRTC_SPL_MUL_16_16(in_vector2[i], in_vector2_scale)
         + round_value) >> right_shifts);
   }

   return 0;
 }
 #endif
--- a/src/common_audio/signal_processing/vector_scaling_operations_neon.s
+++ b/src/common_audio/signal_processing/vector_scaling_operations_neon.s
@@ -0,0 +1,88 @@
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ vector_scaling_operations_neon.s
@ This file contains the function WebRtcSpl_ScaleAndAddVectorsWithRound(),
@ optimized for ARM Neon platform. Output is bit-exact with the reference
@ C code in vector_scaling_operations.c.
 .arch armv7-a
 .fpu neon
 .align  2
 .global WebRtcSpl_ScaleAndAddVectorsWithRound
 @ int WebRtcSpl_ScaleAndAddVectorsWithRound(const int16_t* in_vector1,
 @                                           int16_t in_vector1_scale,
 @                                           const int16_t* in_vector2,
 @                                           int16_t in_vector2_scale,
 @                                           int right_shifts,
 @                                           int16_t* out_vector,
 @                                           int length);
 @
 @ out_vector[k] = (in_vector1[k] * in_vector1_scale
 @                  + in_vector2[k] * in_vector2_scale
 @                  + round_value) >> right_shifts,
 @ where round_value = (1 << right_shifts) >> 1.
 @
 @ In:   r0 = in_vector1, r1 = in_vector1_scale,
 @       r2 = in_vector2, r3 = in_vector2_scale,
 @       stack: right_shifts, out_vector, length (in that order).
 @ Out:  r0 = 0 on success; -1 if a pointer is NULL, length <= 0 or
 @       right_shifts < 0 (nothing is written in that case).
 @ Uses: r4 = length / tail count, r5 = out_vector, r6 = right_shifts,
 @       r7, r8 = scratch, r9 = round_value (r4-r9 saved and restored);
 @       q0-q3, q12-q15 and the flags are clobbered.
 WebRtcSpl_ScaleAndAddVectorsWithRound:
 .fnstart
  push {r4-r9}                @ 6 registers = 24 bytes: stack args now start at sp + 24.

  ldr r6, [sp, #24]           @ right_shifts (declared int: load the whole word)
  ldr r5, [sp, #28]           @ out_vector
  ldr r4, [sp, #32]           @ length

  @ Reject the same inputs as the reference C code.
  cmp r0, #0
  beq RETURN_ERROR            @ in_vector1 == NULL
  cmp r2, #0
  beq RETURN_ERROR            @ in_vector2 == NULL
  cmp r5, #0
  beq RETURN_ERROR            @ out_vector == NULL
  cmp r4, #0
  ble RETURN_ERROR            @ length <= 0
  cmp r6, #0
  blt RETURN_ERROR            @ right_shifts < 0

  cmp r4, #8
  blt SET_ROUND_VALUE         @ Fewer than 8 elements: scalar loop only.

  vdup.16 d26, r1             @ in_vector1_scale
  vdup.16 d27, r3             @ in_vector2_scale

  @ Neon instructions can only right shift by an immediate value. To shift
  @ right by a register value, we have to do a left shift by the negative
  @ value. vrshl reads the shift count from the low byte of each 32-bit lane,
  @ which the 16-bit duplication below fills with -right_shifts.
  rsb r7, r6, #0
  vdup.16 q12, r7             @ -right_shifts

  bic r7, r4, #7              @ Counter for LOOP_UNROLLED_BY_8: length / 8 * 8.

 LOOP_UNROLLED_BY_8:
  vld1.16 {d28, d29}, [r0]!   @ in_vector1[]
  vld1.16 {d30, d31}, [r2]!   @ in_vector2[]
  vmull.s16 q0, d28, d26
  vmull.s16 q1, d29, d26
  vmull.s16 q2, d30, d27
  vmull.s16 q3, d31, d27
  vadd.s32 q0, q2
  vadd.s32 q1, q3
  vrshl.s32 q0, q12           @ Round shift right by right_shifts.
  vrshl.s32 q1, q12
  vmovn.i32 d0, q0            @ Cast to 16 bit values.
  vmovn.i32 d1, q1
  subs r7, #8
  vst1.16 {d0, d1}, [r5]!     @ Does not touch the flags set by subs.
  bgt LOOP_UNROLLED_BY_8

  @ Only the elements left over by the vector loop remain. The mask must be 7
  @ (length % 8); a wider mask would re-run the scalar loop past the end of
  @ the buffers.
  ands r4, #7                 @ Counter for LOOP_NO_UNROLLING: length % 8.
  beq END

 SET_ROUND_VALUE:
  mov r9, #1
  lsl r9, r6
  lsr r9, #1                  @ round_value = (1 << right_shifts) >> 1

 LOOP_NO_UNROLLING:
  ldrh  r7, [r0], #2          @ smulbb treats the low halfword as signed.
  ldrh  r8, [r2], #2
  smulbb r7, r7, r1
  smulbb r8, r8, r3
  subs r4, #1
  add r7, r9
  add r7, r8
  asr r7, r6
  strh r7, [r5], #2
  bne LOOP_NO_UNROLLING

 END:
  mov r0, #0                  @ Success.
  pop {r4-r9}
  bx  lr

 RETURN_ERROR:
  mvn r0, #0                  @ Return -1.
  pop {r4-r9}
  bx  lr
 .fnend