Optimized function WebRtcSpl_ScaleAndAddVectorsWithRound() for ARM-NEON platforms, and refactored it for generic C.

We moved it out of ilbc_specific_functions.c, since it's used not only in iLBC. Passed the unit test. Review URL: https://webrtc-codereview.appspot.com/426009 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1904 4adac7df-926f-26a2-2b94-8c16560cd09d
2012-03-16 16:29:37 +00:00
parent 7e26ad3828
commit bb966ca835
5 changed files with 147 additions and 56 deletions
--- a/src/common_audio/signal_processing/Android.mk
+++ b/src/common_audio/signal_processing/Android.mk
@@ -57,7 +57,8 @@ ifeq ($(ARCH_ARM_HAVE_NEON),true)
 LOCAL_SRC_FILES += \
    cross_correlation_neon.s \
    downsample_fast_neon.s \
-    min_max_operations_neon.s
+    min_max_operations_neon.s \
+    vector_scaling_operations_neon.s
 LOCAL_CFLAGS += \
    $(MY_ARM_CFLAGS_NEON)
 else
--- a/src/common_audio/signal_processing/ilbc_specific_functions.c
+++ b/src/common_audio/signal_processing/ilbc_specific_functions.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -11,35 +11,16 @@

 /*
 * This file contains implementations of the iLBC specific functions
- * WebRtcSpl_ScaleAndAddVectorsWithRound()
 * WebRtcSpl_ReverseOrderMultArrayElements()
 * WebRtcSpl_ElementwiseVectorMult()
 * WebRtcSpl_AddVectorsAndShift()
 * WebRtcSpl_AddAffineVectorToVector()
 * WebRtcSpl_AffineTransformVector()
 *
- * The description header can be found in signal_processing_library.h
- *
 */

 #include "signal_processing_library.h"

-void WebRtcSpl_ScaleAndAddVectorsWithRound(WebRtc_Word16 *vector1, WebRtc_Word16 scale1,
-                                           WebRtc_Word16 *vector2, WebRtc_Word16 scale2,
-                                           WebRtc_Word16 right_shifts, WebRtc_Word16 *out,
-                                           WebRtc_Word16 vector_length)
-{
-    int i;
-    WebRtc_Word16 roundVal;
-    roundVal = 1 << right_shifts;
-    roundVal = roundVal >> 1;
-    for (i = 0; i < vector_length; i++)
-    {
-        out[i] = (WebRtc_Word16)((WEBRTC_SPL_MUL_16_16(vector1[i], scale1)
-                + WEBRTC_SPL_MUL_16_16(vector2[i], scale2) + roundVal) >> right_shifts);
-    }
-}
-
 void WebRtcSpl_ReverseOrderMultArrayElements(WebRtc_Word16 *out, G_CONST WebRtc_Word16 *in,
                                             G_CONST WebRtc_Word16 *win,
                                             WebRtc_Word16 vector_length,
--- a/src/common_audio/signal_processing/include/signal_processing_library.h
+++ b/src/common_audio/signal_processing/include/signal_processing_library.h
@@ -268,17 +268,37 @@ void WebRtcSpl_ScaleAndAddVectors(G_CONST WebRtc_Word16* in_vector1,
                                  WebRtc_Word16 gain2, int right_shifts2,
                                  WebRtc_Word16* out_vector,
                                  int vector_length);
+
+// Scales two 16-bit vectors, adds them, and shifts the sum right with
+// rounding. Performs the vector operation:
+//   out_vector[k] = ((scale1 * in_vector1[k]) + (scale2 * in_vector2[k])
+//        + round_value) >> right_shifts,
+//   where  round_value = (1 << right_shifts) >> 1
+//   (so round_value is 0 when right_shifts is 0).
+// The shifted result is truncated to 16 bits when stored in out_vector.
+//
+// Input:
+//      - in_vector1       : Input vector 1
+//      - in_vector1_scale : Gain to be used for vector 1
+//      - in_vector2       : Input vector 2
+//      - in_vector2_scale : Gain to be used for vector 2
+//      - right_shifts     : Number of right bit shifts to be applied
+//      - length           : Number of elements in the input vectors
+//
+// Output:
+//      - out_vector       : Output vector
+// Return value            : 0 if OK, -1 if (in_vector1 == NULL
+//                           || in_vector2 == NULL || out_vector == NULL
+//                           || length <= 0 || right_shifts < 0).
+int WebRtcSpl_ScaleAndAddVectorsWithRound(const int16_t* in_vector1,
+                                          int16_t in_vector1_scale,
+                                          const int16_t* in_vector2,
+                                          int16_t in_vector2_scale,
+                                          int right_shifts,
+                                          int16_t* out_vector,
+                                          int length);
+
 // End: Vector scaling operations.

 // iLBC specific functions. Implementations in ilbc_specific_functions.c.
 // Description at bottom of file.
-void WebRtcSpl_ScaleAndAddVectorsWithRound(WebRtc_Word16* in_vector1,
-                                           WebRtc_Word16 scale1,
-                                           WebRtc_Word16* in_vector2,
-                                           WebRtc_Word16 scale2,
-                                           WebRtc_Word16 right_shifts,
-                                           WebRtc_Word16* out_vector,
-                                           WebRtc_Word16 vector_length);
 void WebRtcSpl_ReverseOrderMultArrayElements(WebRtc_Word16* out_vector,
                                             G_CONST WebRtc_Word16* in_vector,
                                             G_CONST WebRtc_Word16* window,
@@ -991,30 +1011,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
 //      - out_vector    : Output vector
 //

-//
-// WebRtcSpl_ScaleAndAddVectorsWithRound(...)
-//
-// Performs the vector operation:
-//
-//  out_vector[k] = ((scale1*in_vector1[k]) + (scale2*in_vector2[k])
-//                      + round_value) >> right_shifts
-//
-//      where:
-//
-//  round_value = (1<<right_shifts)>>1
-//
-// Input:
-//      - in_vector1    : Input vector 1
-//      - scale1        : Gain to be used for vector 1
-//      - in_vector2    : Input vector 2
-//      - scale2        : Gain to be used for vector 2
-//      - right_shifts  : Number of right bit shifts to be applied
-//      - vector_length : Number of elements in the input vectors
-//
-// Output:
-//      - out_vector    : Output vector
-//
-
 //
 // WebRtcSpl_ReverseOrderMultArrayElements(...)
 //
--- a/src/common_audio/signal_processing/vector_scaling_operations.c
+++ b/src/common_audio/signal_processing/vector_scaling_operations.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -17,9 +17,7 @@
 * WebRtcSpl_ScaleVector()
 * WebRtcSpl_ScaleVectorWithSat()
 * WebRtcSpl_ScaleAndAddVectors()
- *
- * The description header can be found in signal_processing_library.h
- *
+ * WebRtcSpl_ScaleAndAddVectorsWithRound()
 */

 #include "signal_processing_library.h"
@@ -149,3 +147,30 @@ void WebRtcSpl_ScaleAndAddVectors(G_CONST WebRtc_Word16 *in1, WebRtc_Word16 gain
                + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(gain2, *in2ptr++, shift2);
    }
 }
+
+// Generic C implementation. It is compiled out when both WEBRTC_ANDROID and
+// WEBRTC_ARCH_ARM_NEON are defined; that configuration builds
+// vector_scaling_operations_neon.s instead.
+#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
+// Computes, for every k in [0, length):
+//   out_vector[k] = (int16_t)((in_vector1[k] * in_vector1_scale
+//                              + in_vector2[k] * in_vector2_scale
+//                              + round_value) >> right_shifts),
+// where round_value = (1 << right_shifts) >> 1.
+//
+// Returns 0 on success, or -1 (without touching out_vector) if any pointer
+// is NULL, length <= 0, or right_shifts < 0.
+int WebRtcSpl_ScaleAndAddVectorsWithRound(const int16_t* in_vector1,
+                                          int16_t in_vector1_scale,
+                                          const int16_t* in_vector2,
+                                          int16_t in_vector2_scale,
+                                          int right_shifts,
+                                          int16_t* out_vector,
+                                          int length) {
+  int i = 0;
+  int round_value = 0;
+
+  if (in_vector1 == NULL || in_vector2 == NULL || out_vector == NULL ||
+      length <= 0 || right_shifts < 0) {
+    return -1;
+  }
+
+  // Derive the rounding term only after right_shifts has been validated:
+  // shifting by a negative count is undefined behavior in C.
+  round_value = (1 << right_shifts) >> 1;
+
+  for (i = 0; i < length; i++) {
+    out_vector[i] = (int16_t)((
+        WEBRTC_SPL_MUL_16_16(in_vector1[i], in_vector1_scale)
+        + WEBRTC_SPL_MUL_16_16(in_vector2[i], in_vector2_scale)
+        + round_value) >> right_shifts);
+  }
+
+  return 0;
+}
+#endif
--- a/src/common_audio/signal_processing/vector_scaling_operations_neon.s
+++ b/src/common_audio/signal_processing/vector_scaling_operations_neon.s
@@ -0,0 +1,88 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ vector_scaling_operations_neon.s
+@ This file contains the function WebRtcSpl_ScaleAndAddVectorsWithRound(),
+@ optimized for ARM Neon platform. Output is bit-exact with the reference
+@ C code in vector_scaling_operations.c.
+
+.arch armv7-a
+.fpu neon
+
+.align  2
+.global WebRtcSpl_ScaleAndAddVectorsWithRound
+
+@ int WebRtcSpl_ScaleAndAddVectorsWithRound(const int16_t* in_vector1,
+@                                           int16_t in_vector1_scale,
+@                                           const int16_t* in_vector2,
+@                                           int16_t in_vector2_scale,
+@                                           int right_shifts,
+@                                           int16_t* out_vector,
+@                                           int length);
+@
+@ In:    r0 = in_vector1, r1 = in_vector1_scale,
+@        r2 = in_vector2, r3 = in_vector2_scale,
+@        stack (at entry): [sp, #0] = right_shifts, [sp, #4] = out_vector,
+@                          [sp, #8] = length.
+@ Out:   r0 = 0 on success; -1 if any pointer is NULL, length <= 0 or
+@        right_shifts < 0 (same contract as the reference C code).
+@ Uses:  r4 = remaining length, r5 = out_vector, r6 = right_shifts,
+@        r7, r8 = scratch, r9 = round_value; r4-r9 are saved and restored.
+@        NEON: q0-q3, q12-q15.
+WebRtcSpl_ScaleAndAddVectorsWithRound:
+.fnstart
+
+  push {r4-r9}                @ 24 bytes, so stack arguments start at sp + 24.
+
+  ldr r4, [sp, #32]           @ length
+  ldr r5, [sp, #28]           @ out_vector
+  ldr r6, [sp, #24]           @ right_shifts (a full int, not a halfword)
+
+  @ Argument validation, mirroring the reference C implementation.
+  cmp r0, #0
+  beq INVALID_ARGUMENT        @ in_vector1 == NULL
+  cmp r2, #0
+  beq INVALID_ARGUMENT        @ in_vector2 == NULL
+  cmp r5, #0
+  beq INVALID_ARGUMENT        @ out_vector == NULL
+  cmp r4, #0
+  ble INVALID_ARGUMENT        @ length <= 0
+  cmp r6, #0
+  blt INVALID_ARGUMENT        @ right_shifts < 0
+
+  cmp r4, #8
+  blt SET_ROUND_VALUE         @ Fewer than 8 elements: scalar loop only.
+
+  vdup.16 d26, r1             @ in_vector1_scale
+  vdup.16 d27, r3             @ in_vector2_scale
+
+  @ Neon instructions can only shift right by an immediate value. To shift
+  @ right by a register value, we do a left shift by the negated value.
+  rsb r7, r6, #0
+  vdup.16 q12, r7             @ -right_shifts
+
+  bic r7, r4, #7              @ Counter for LOOP_UNROLLED_BY_8: length / 8 * 8.
+
+LOOP_UNROLLED_BY_8:
+  vld1.16 {d28, d29}, [r0]!   @ in_vector1[]
+  vld1.16 {d30, d31}, [r2]!   @ in_vector2[]
+  vmull.s16 q0, d28, d26
+  vmull.s16 q1, d29, d26
+  vmull.s16 q2, d30, d27
+  vmull.s16 q3, d31, d27
+  vadd.s32 q0, q2
+  vadd.s32 q1, q3
+  vrshl.s32 q0, q12           @ Round shift right by right_shifts.
+  vrshl.s32 q1, q12
+  vmovn.i32 d0, q0            @ Cast to 16 bit values.
+  vmovn.i32 d1, q1
+  subs r7, #8
+  vst1.16 {d0, d1}, [r5]!
+  bgt LOOP_UNROLLED_BY_8
+
+  @ Only the 0..7 leftover elements may go through the scalar loop. The mask
+  @ must be 7: a wider mask would re-process elements past the buffer ends.
+  ands r4, #7                 @ Counter for LOOP_NO_UNROLLING: length % 8.
+  beq END
+
+SET_ROUND_VALUE:
+  mov r9, #1
+  lsl r9, r6
+  lsr r9, #1                  @ round_value = (1 << right_shifts) >> 1
+
+LOOP_NO_UNROLLING:
+  ldrh  r7, [r0], #2          @ in_vector1[i]
+  ldrh  r8, [r2], #2          @ in_vector2[i]
+  smulbb r7, r7, r1           @ Signed 16x16 multiply of the bottom halfwords.
+  smulbb r8, r8, r3
+  subs r4, #1                 @ Flags consumed by the bne below.
+  add r7, r9
+  add r7, r8
+  asr r7, r6
+  strh r7, [r5], #2
+  bne LOOP_NO_UNROLLING
+
+END:
+  mov r0, #0                  @ Return 0: success.
+  pop {r4-r9}
+  bx  lr
+
+INVALID_ARGUMENT:
+  mvn r0, #0                  @ Return -1: invalid argument.
+  pop {r4-r9}
+  bx  lr
+
+.fnend