Optimized function WebRtcSpl_DownsampleFast for ARM-NEON platform.

Review URL: https://webrtc-codereview.appspot.com/371001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1629 4adac7df-926f-26a2-2b94-8c16560cd09d
2012-02-07 18:03:11 +00:00 · 2012-02-07 18:03:11 +00:00 · 551fcc04ec
commit 551fcc04ec
parent 236e842bca
5 changed files with 290 additions and 90 deletions
--- a/src/common_audio/signal_processing/Android.mk
+++ b/src/common_audio/signal_processing/Android.mk
@ -24,7 +24,6 @@ LOCAL_SRC_FILES := \
    copy_set_operations.c \
    division_operations.c \
    dot_product_with_scale.c \
-    downsample_fast.c \
    energy.c \
    filter_ar.c \
    filter_ma_fast_q12.c \
@ -58,12 +57,14 @@ LOCAL_C_INCLUDES := \
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
 LOCAL_SRC_FILES += \
    min_max_operations_neon.c \
-    cross_correlation_neon.s
+    cross_correlation_neon.s \
+    downsample_fast_neon.s
 LOCAL_CFLAGS += \
    $(MY_ARM_CFLAGS_NEON)
 else
 LOCAL_SRC_FILES += \
-    cross_correlation.c
+    cross_correlation.c \
+    downsample_fast.c
 endif

 ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
--- a/src/common_audio/signal_processing/downsample_fast.c
+++ b/src/common_audio/signal_processing/downsample_fast.c
@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@ -8,52 +8,40 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
-/*
- * This file contains the function WebRtcSpl_DownsampleFast().
- * The description header can be found in signal_processing_library.h
- *
- */
-
 #include "signal_processing_library.h"

-int WebRtcSpl_DownsampleFast(WebRtc_Word16 *in_ptr, WebRtc_Word16 in_length,
-                             WebRtc_Word16 *out_ptr, WebRtc_Word16 out_length,
-                             WebRtc_Word16 *B, WebRtc_Word16 B_length, WebRtc_Word16 factor,
-                             WebRtc_Word16 delay)
-{
-    WebRtc_Word32 o;
-    int i, j;
+// TODO(Bjornv): Change the function parameter order to WebRTC code style.
+int WebRtcSpl_DownsampleFast(const int16_t* data_in,
+                             int data_in_length,
+                             int16_t* data_out,
+                             int data_out_length,
+                             const int16_t* __restrict coefficients,
+                             int coefficients_length,
+                             int factor,
+                             int delay) {
+  int i = 0;
+  int j = 0;
+  int32_t out_s32 = 0;
+  int endpos = delay + factor * (data_out_length - 1) + 1;

-    WebRtc_Word16 *downsampled_ptr = out_ptr;
-    WebRtc_Word16 *b_ptr;
-    WebRtc_Word16 *x_ptr;
-    WebRtc_Word16 endpos = delay
-            + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(factor, (out_length - 1)) + 1;
+  // Return error if any of the running conditions is not met.
+  if (data_out_length <= 0 || coefficients_length <= 0
+                           || data_in_length < endpos) {
+    return -1;
+  }

-    if (in_length < endpos)
-    {
-        return -1;
+  for (i = delay; i < endpos; i += factor) {
+    out_s32 = 2048;  // Round value, 0.5 in Q12.
+
+    for (j = 0; j < coefficients_length; j++) {
+      out_s32 += coefficients[j] * data_in[i - j];  // Q12.
    }

-    for (i = delay; i < endpos; i += factor)
-    {
-        b_ptr = &B[0];
-        x_ptr = &in_ptr[i];
+    out_s32 >>= 12;  // Q0.

-        o = (WebRtc_Word32)2048; // Round val
+    // Saturate and store the output.
+    *data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
+  }

-        for (j = 0; j < B_length; j++)
-        {
-            o += WEBRTC_SPL_MUL_16_16(*b_ptr++, *x_ptr--);
-        }
-
-        o = WEBRTC_SPL_RSHIFT_W32(o, 12);
-
-        // If output is higher than 32768, saturate it. Same with negative side
-
-        *downsampled_ptr++ = WebRtcSpl_SatW32ToW16(o);
-    }
-
-    return 0;
+  return 0;
 }
--- a/src/common_audio/signal_processing/downsample_fast_neon.s
+++ b/src/common_audio/signal_processing/downsample_fast_neon.s
@ -0,0 +1,222 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ This file contains the function WebRtcSpl_DownsampleFast(), optimized for
+@ ARM Neon platform. The description header can be found in
+@ signal_processing_library.h
+@
+@ The reference C code is in file downsample_fast.c. Bit-exact.
+
+.arch armv7-a
+.fpu neon
+
+.align  2
+.global WebRtcSpl_DownsampleFast
+
+WebRtcSpl_DownsampleFast:
+
+.fnstart
+
+.save {r4-r11}
+  push {r4-r11}
+
+  cmp r3, #0                                @ data_out_length <= 0?
+  movle r0, #-1
+  ble END
+
+  ldrsh r12, [sp, #44]
+  ldr r5, [sp, #40]                         @ r5: factor
+  add r4, r12, #1                           @ r4: delay + 1
+  sub r3, r3, #1                            @ r3: data_out_length - 1
+  smulbb r3, r5, r3
+  ldr r8, [sp, #32]                         @ &coefficients[0]
+  mov r9, r12                               @ Iteration counter for outer loops.
+  add r3, r4                                @ delay + factor * (out_length-1) +1
+
+  cmp r3, r1                                @ data_in_length < endpos?
+  movgt r0, #-1
+  bgt END
+
+  @ Initializations.
+  sub r3, r5, asl #3
+  add r11, r0, r12, asl #1                  @ &data_in[delay]
+  ldr r0, [sp, #36]                         @ coefficients_length
+  add r3, r5                                @ endpos - factor * 7
+
+  cmp r0, #0                                @ coefficients_length <= 0 ?
+  movle r0, #-1
+  ble END
+
+  add r8, r0, asl #1                        @ &coefficients[coefficients_length]
+  cmp r9, r3
+  bge POST_LOOP_ENDPOS                      @ Branch when fewer than 8 iterations remain.
+
+@
+@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
+@
+  mov r4, #-2
+
+  @ Direct program flow to the right channel.
+
+  @ r10 is an offset to &data_in[] in the loop. After an iteration, we need to
+  @ move the pointer back to original after advancing 16 bytes by a vld1, and
+  @ then move 2 bytes forward to increment one more sample.
+  cmp r5, #2
+  moveq r10, #-14
+  beq LOOP_ENDPOS_FACTOR2                   @ Branch when factor == 2
+
+  @ Similar here, for r10, we need to move the pointer back to original after
+  @ advancing 32 bytes, then move 2 bytes forward to increment one sample.
+  cmp r5, #4
+  moveq r10, #-30
+  beq LOOP_ENDPOS_FACTOR4                   @ Branch when factor == 4
+
+  @ For r10, we need to move the pointer back to original after advancing
+  @ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample.
+  mov r10, r5, asl #4
+  rsb r10, #2
+  add r10, r5, asl #1
+  lsl r5, #1                                @ r5 = factor * sizeof(data_in)
+
+@ The general case (factor != 2 && factor != 4)
+LOOP_ENDPOS_GENERAL:
+  @ Initializations.
+  vmov.i32 q2, #2048
+  vmov.i32 q3, #2048
+  sub r7, r8, #2
+  sub r12, r0, #1                           @ coefficients_length - 1
+  sub r1, r11, r12, asl #1                  @ &data_in[i - j]
+
+LOOP_COEFF_LENGTH_GENERAL:
+  vld1.16 {d2[], d3[]}, [r7], r4            @ coefficients[j]
+  vld1.16 d0[0], [r1], r5                   @ data_in[i - j]
+  vld1.16 d0[1], [r1], r5                   @ data_in[i + factor - j]
+  vld1.16 d0[2], [r1], r5                   @ data_in[i + factor * 2 - j]
+  vld1.16 d0[3], [r1], r5                   @ data_in[i + factor * 3 - j]
+  vld1.16 d1[0], [r1], r5                   @ data_in[i + factor * 4 - j]
+  vld1.16 d1[1], [r1], r5                   @ data_in[i + factor * 5 - j]
+  vld1.16 d1[2], [r1], r5                   @ data_in[i + factor * 6 - j]
+  vld1.16 d1[3], [r1], r10                  @ data_in[i + factor * 7 - j]
+  subs r12, #1
+  vmlal.s16 q2, d0, d2
+  vmlal.s16 q3, d1, d3
+  bge LOOP_COEFF_LENGTH_GENERAL
+
+  @ Shift, saturate, and store the result.
+  vqshrn.s32 d0, q2, #12
+  vqshrn.s32 d1, q3, #12
+  vst1.16 {d0, d1}, [r2]!
+
+  add r11, r5, asl #3                       @ r11 -> &data_in[i + factor * 8]
+  add r9, r5, asl #2                        @ Counter i = delay + factor * 8.
+  cmp r9, r3                                @ i < endpos - factor * 7 ?
+  blt LOOP_ENDPOS_GENERAL
+  asr r5, #1                                @ Restore r5 to the value of factor.
+  b POST_LOOP_ENDPOS
+
+@ The case for factor == 2.
+LOOP_ENDPOS_FACTOR2:
+  @ Initializations.
+  vmov.i32 q2, #2048
+  vmov.i32 q3, #2048
+  sub r7, r8, #2
+  sub r12, r0, #1                           @ coefficients_length - 1
+  sub r1, r11, r12, asl #1                  @ &data_in[i - j]
+
+LOOP_COEFF_LENGTH_FACTOR2:
+  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j]
+  vld2.16 {d0, d1}, [r1]!                   @ data_in[]
+  vld2.16 {d2, d3}, [r1], r10               @ data_in[]
+  subs r12, #1
+  vmlal.s16 q2, d0, d16
+  vmlal.s16 q3, d2, d17
+  bge LOOP_COEFF_LENGTH_FACTOR2
+
+  @ Shift, saturate, and store the result.
+  vqshrn.s32 d0, q2, #12
+  vqshrn.s32 d1, q3, #12
+  vst1.16 {d0, d1}, [r2]!
+
+  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
+  add r9, r5, asl #3                        @ Counter i = delay + factor * 8.
+  cmp r9, r3                                @ i < endpos - factor * 7 ?
+  blt LOOP_ENDPOS_FACTOR2
+  b POST_LOOP_ENDPOS
+
+@ The case for factor == 4.
+LOOP_ENDPOS_FACTOR4:
+  @ Initializations.
+  vmov.i32 q2, #2048
+  vmov.i32 q3, #2048
+  sub r7, r8, #2
+  sub r12, r0, #1                           @ coefficients_length - 1
+  sub r1, r11, r12, asl #1                  @ &data_in[i - j]
+
+LOOP_COEFF_LENGTH_FACTOR4:
+  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j]
+  vld4.16 {d0, d1, d2, d3}, [r1]!           @ data_in[]
+  vld4.16 {d18, d19, d20, d21}, [r1], r10   @ data_in[]
+  subs r12, #1
+  vmlal.s16 q2, d0, d16
+  vmlal.s16 q3, d18, d17
+  bge LOOP_COEFF_LENGTH_FACTOR4
+
+  @ Shift, saturate, and store the result.
+  vqshrn.s32 d0, q2, #12
+  vqshrn.s32 d1, q3, #12
+  vst1.16 {d0, d1}, [r2]!
+
+  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
+  add r9, r5, asl #3                        @ Counter i = delay + factor * 8.
+  cmp r9, r3                                @ i < endpos - factor * 7 ?
+  blt LOOP_ENDPOS_FACTOR4
+
+@
+@ Second part, do the remaining iterations (if any).
+@
+
+POST_LOOP_ENDPOS:
+  add r3, r5, asl #3
+  sub r3, r5                                @ Restore r3 to endpos.
+  cmp r9, r3
+  movge r0, #0
+  bge END
+
+LOOP2_ENDPOS:
+  @ Initializations.
+  mov r7, r8
+  sub r12, r0, #1                           @ coefficients_length - 1
+  sub r6, r11, r12, asl #1                  @ &data_in[i - j]
+
+  mov r1, #2048
+
+LOOP2_COEFF_LENGTH:
+  ldrsh r4, [r7, #-2]!                      @ coefficients[j]
+  ldrsh r10, [r6], #2                       @ data_in[i - j]
+  smlabb r1, r4, r10, r1
+  subs r12, #1
+  bge LOOP2_COEFF_LENGTH
+
+  @ Shift, saturate, and store the result.
+  ssat r1, #16, r1, asr #12
+  strh r1, [r2], #2
+
+  add r11, r5, asl #1                       @ r11 -> &data_in[i + factor]
+  add r9, r5                                @ Counter i = delay + factor.
+  cmp r9, r3                                @ i < endpos?
+  blt LOOP2_ENDPOS
+
+  mov r0, #0
+
+END:
+  pop {r4-r11}
+  bx  lr
+
+.fnend
--- a/src/common_audio/signal_processing/filter_ar_fast_q12.c
+++ b/src/common_audio/signal_processing/filter_ar_fast_q12.c
@ -13,9 +13,9 @@

 // TODO(bjornv): Change the return type to report errors.

-void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
+void WebRtcSpl_FilterARFastQ12(const int16_t* data_in,
                               int16_t* data_out,
-                               int16_t* __restrict coefficients,
+                               const int16_t* __restrict coefficients,
                               int coefficients_length,
                               int data_length) {
  int i = 0;
--- a/src/common_audio/signal_processing/include/signal_processing_library.h
+++ b/src/common_audio/signal_processing/include/signal_processing_library.h
@ -386,35 +386,46 @@ void WebRtcSpl_FilterMAFastQ12(WebRtc_Word16* in_vector,
                               WebRtc_Word16 ma_coef_length,
                               WebRtc_Word16 vector_length);

-// WebRtcSpl_FilterARFastQ12(...)
-//
 // Performs a AR filtering on a vector in Q12
-//
 // Input:
-//      - data_in                : Input samples
-//      - data_out               : State information in positions
-//                                   data_out[-order] .. data_out[-1]
-//      - coefficients           : Filter coefficients (in Q12)
-//      - coefficients_length    : Number of coefficients (order+1)
-//      - data_length            : Number of samples to be filtered
-//
+//      - data_in            : Input samples
+//      - data_out           : State information in positions
+//                               data_out[-order] .. data_out[-1]
+//      - coefficients       : Filter coefficients (in Q12)
+//      - coefficients_length: Number of coefficients (order+1)
+//      - data_length        : Number of samples to be filtered
 // Output:
-//      - data_out               : Filtered samples
-
-void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
+//      - data_out           : Filtered samples
+void WebRtcSpl_FilterARFastQ12(const int16_t* data_in,
                               int16_t* data_out,
-                               int16_t* __restrict coefficients,
+                               const int16_t* __restrict coefficients,
                               int coefficients_length,
                               int data_length);

-int WebRtcSpl_DownsampleFast(WebRtc_Word16* in_vector,
-                             WebRtc_Word16 in_vector_length,
-                             WebRtc_Word16* out_vector,
-                             WebRtc_Word16 out_vector_length,
-                             WebRtc_Word16* ma_coef,
-                             WebRtc_Word16 ma_coef_length,
-                             WebRtc_Word16 factor,
-                             WebRtc_Word16 delay);
+// Performs a MA down sampling filter on a vector
+// Input:
+//      - data_in            : Input samples (state in positions
+//                               data_in[-order] .. data_in[-1])
+//      - data_in_length     : Number of samples in |data_in| to be filtered.
+//                               This must be at least
+//                               |delay| + |factor|*(|data_out_length|-1) + 1
+//      - data_out_length    : Number of down sampled samples desired
+//      - coefficients       : Filter coefficients (in Q12)
+//      - coefficients_length: Number of coefficients (order+1)
+//      - factor             : Decimation factor
+//      - delay              : Delay of filter (compensated for in data_out)
+// Output:
+//      - data_out           : Filtered samples
+// Return value              : 0 if OK, -1 if |data_in| is too short, or if
+//                             |data_out_length| or |coefficients_length| <= 0
+int WebRtcSpl_DownsampleFast(const int16_t* data_in,
+                             int data_in_length,
+                             int16_t* data_out,
+                             int data_out_length,
+                             const int16_t* __restrict coefficients,
+                             int coefficients_length,
+                             int factor,
+                             int delay);
+
 // End: Filter operations.

 // FFT operations
@ -1454,28 +1465,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
 //      - out_vector        : Filtered samples
 //

-//
-// WebRtcSpl_DownsampleFast(...)
-//
-// Performs a MA down sampling filter on a vector
-//
-// Input:
-//      - in_vector         : Input samples (state in positions
-//                            in_vector[-order] .. in_vector[-1])
-//      - in_vector_length  : Number of samples in |in_vector| to be filtered.
-//                            This must be at least
-//                            |delay| + |factor|*(|out_vector_length|-1) + 1)
-//      - out_vector_length : Number of down sampled samples desired
-//      - ma_coef           : Filter coefficients (in Q12)
-//      - ma_coef_length    : Number of B coefficients (order+1)
-//      - factor            : Decimation factor
-//      - delay             : Delay of filter (compensated for in out_vector)
-//
-// Output:
-//      - out_vector        : Filtered samples
-//
-// Return value             : 0 if OK, -1 if |in_vector| is too short
-//

 //
 // WebRtcSpl_DotProductWithScale(...)