Optimized function WebRtcSpl_FilterARFastQ12 for ARM platform.

Speed close to doubled for an offline test in NetEq. Bit exact. Review URL: https://webrtc-codereview.appspot.com/346001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1520 4adac7df-926f-26a2-2b94-8c16560cd09d
2012-01-24 02:12:49 +00:00
parent 6da8eeb946
commit 4bc24c4afa
4 changed files with 280 additions and 56 deletions
--- a/src/common_audio/signal_processing/Android.mk
+++ b/src/common_audio/signal_processing/Android.mk
@@ -1,4 +1,4 @@
-# Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+# Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 #
 # Use of this source code is governed by a BSD-style license
 # that can be found in the LICENSE file in the root of the source
@@ -27,7 +27,6 @@ LOCAL_SRC_FILES := \
    downsample_fast.c \
    energy.c \
    filter_ar.c \
-    filter_ar_fast_q12.c \
    filter_ma_fast_q12.c \
    get_hanning_window.c \
    get_scaling_square.c \
@@ -68,6 +67,14 @@ LOCAL_SRC_FILES += \
    cross_correlation.c
 endif

+ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
+LOCAL_SRC_FILES += \
+    filter_ar_fast_q12_armv7.s
+else
+LOCAL_SRC_FILES += \
+    filter_ar_fast_q12.c
+endif
+
 LOCAL_SHARED_LIBRARIES := libstlport

 ifeq ($(TARGET_OS)-$(TARGET_SIMULATOR),linux-true)
--- a/src/common_audio/signal_processing/filter_ar_fast_q12.c
+++ b/src/common_audio/signal_processing/filter_ar_fast_q12.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -7,43 +7,37 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
-
-
-/*
- * This file contains the function WebRtcSpl_FilterARFastQ12().
- * The description header can be found in signal_processing_library.h
- *
- */
+#include <assert.h>

 #include "signal_processing_library.h"

-void WebRtcSpl_FilterARFastQ12(WebRtc_Word16 *in, WebRtc_Word16 *out, WebRtc_Word16 *A,
-                               WebRtc_Word16 A_length, WebRtc_Word16 length)
-{
-    WebRtc_Word32 o;
-    int i, j;
+// TODO(bjornv): Change the return type to report errors.

-    WebRtc_Word16 *x_ptr = &in[0];
-    WebRtc_Word16 *filtered_ptr = &out[0];
+void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
+                               int16_t* data_out,
+                               int16_t* __restrict coefficients,
+                               int coefficients_length,
+                               int data_length) {
+  int i = 0;
+  int j = 0;

-    for (i = 0; i < length; i++)
-    {
-        // Calculate filtered[i]
-        G_CONST WebRtc_Word16 *a_ptr = &A[0];
-        WebRtc_Word16 *state_ptr = &out[i - 1];
+  assert(data_length > 0);
+  assert(coefficients_length > 1);

-        o = WEBRTC_SPL_MUL_16_16(*x_ptr++, *a_ptr++);
+  for (i = 0; i < data_length; i++) {
+    int32_t output = 0;
+    int32_t sum = 0;

-        for (j = 1; j < A_length; j++)
-        {
-            o -= WEBRTC_SPL_MUL_16_16(*a_ptr++,*state_ptr--);
-        }
-
-        // Saturate the output
-        o = WEBRTC_SPL_SAT((WebRtc_Word32)134215679, o, (WebRtc_Word32)-134217728);
-
-        *filtered_ptr++ = (WebRtc_Word16)((o + (WebRtc_Word32)2048) >> 12);
+    for (j = coefficients_length - 1; j > 0; j--) {
+      sum += coefficients[j] * data_out[i - j];
    }

-    return;
+    output = coefficients[0] * data_in[i];
+    output -= sum;
+
+    // Saturate and store the output.
+    output = WEBRTC_SPL_SAT(134215679, output, -134217728);
+    data_out[i] = (int16_t)((output + 2048) >> 12);
+  }
 }
+
--- a/src/common_audio/signal_processing/filter_ar_fast_q12_armv7.s
+++ b/src/common_audio/signal_processing/filter_ar_fast_q12_armv7.s
@@ -0,0 +1,223 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for
+@ ARMv7  platform. The description header can be found in
+@ signal_processing_library.h
+@
+@ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and
+@ the reference C code at end of this file.
+
+@ Assumptions:
+@ (1) data_length > 0
+@ (2) coefficients_length > 1
+
+@ Register usage:
+@
+@ r0:  &data_in[i]
+@ r1:  &data_out[i], for result ouput
+@ r2:  &coefficients[0]
+@ r3:  coefficients_length
+@ r4:  Iteration counter for the outer loop.
+@ r5:  data_out[j] as multiplication inputs
+@ r6:  Calculated value for output data_out[]; interation counter for inner loop
+@ r7:  Partial sum of a filtering multiplication results
+@ r8:  Partial sum of a filtering multiplication results
+@ r9:  &data_out[], for filtering input; data_in[i]
+@ r10: coefficients[j]
+@ r11: Scratch
+@ r12: &coefficients[j]
+
+.arch armv7-a
+
+.align  2
+.global WebRtcSpl_FilterARFastQ12
+
+WebRtcSpl_FilterARFastQ12:
+
+.fnstart
+
+.save {r4-r11}
+  push {r4-r11}
+
+  ldrsh r12, [sp, #32]         @ data_length
+  subs r4, r12, #1
+  beq ODD_LENGTH               @ jump if data_length == 1
+
+LOOP_LENGTH:
+  add r12, r2, r3, lsl #1
+  sub r12, #4                  @ &coefficients[coefficients_length - 2]
+  sub r9, r1, r3, lsl #1
+  add r9, #2                   @ &data_out[i - coefficients_length + 1]
+  ldr r5, [r9], #4             @ data_out[i - coefficients_length + {1,2}]
+
+  mov r7, #0                   @ sum1
+  mov r8, #0                   @ sum2
+  subs r6, r3, #3              @ Iteration counter for inner loop.
+  beq ODD_A_LENGTH             @ branch if coefficients_length == 3
+  blt POST_LOOP_A_LENGTH       @ branch if coefficients_length == 2
+
+LOOP_A_LENGTH:
+  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
+  subs r6, #2
+  smlatt r8, r10, r5, r8       @ sum2 += coefficients[j] * data_out[i - j + 1];
+  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
+  smlabt r7, r10, r5, r7       @ coefficients[j - 1] * data_out[i - j + 1];
+  ldr r5, [r9], #4             @ data_out[i - j + 2],  data_out[i - j + 3]
+  smlabb r8, r10, r5, r8       @ coefficients[j - 1] * data_out[i - j + 2];
+  bgt LOOP_A_LENGTH
+  blt POST_LOOP_A_LENGTH
+
+ODD_A_LENGTH:
+  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[2]
+  sub r12, #2                  @ &coefficients[0]
+  smlabb r7, r10, r5, r7       @ sum1 += coefficients[2] * data_out[i - 2];
+  smlabt r8, r10, r5, r8       @ sum2 += coefficients[2] * data_out[i - 1];
+  ldr r5, [r9, #-2]            @ data_out[i - 1],  data_out[i]
+
+POST_LOOP_A_LENGTH:
+  ldr r10, [r12]               @ coefficients[0], coefficients[1]
+  smlatb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];
+
+  ldr r9, [r0], #4             @ data_in[i], data_in[i + 1]
+  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
+  sub r6, r7                   @ output1 -= sum1;
+
+  sbfx r11, r6, #12, #16
+  ssat r7, #16, r6, asr #12
+  cmp r7, r11
+  addeq r6, r6, #2048
+  ssat r6, #16, r6, asr #12
+  strh r6, [r1], #2            @ Store data_out[i]
+
+  smlatb r8, r10, r6, r8       @ sum2 += coefficients[1] * data_out[i];
+  smulbt r6, r10, r9           @ output2 = coefficients[0] * data_in[i + 1];
+  sub r6, r8                   @ output1 -= sum1;
+
+  sbfx r11, r6, #12, #16
+  ssat r7, #16, r6, asr #12
+  cmp r7, r11
+  addeq r6, r6, #2048
+  ssat r6, #16, r6, asr #12
+  strh r6, [r1], #2            @ Store data_out[i + 1]
+
+  subs r4, #2
+  bgt LOOP_LENGTH
+  blt END                      @ For even data_length, it's done. Jump to END.
+
+@ Process i = data_length -1, for the case of an odd length.
+ODD_LENGTH:
+  add r12, r2, r3, lsl #1
+  sub r12, #4                  @ &coefficients[coefficients_length - 2]
+  sub r9, r1, r3, lsl #1
+  add r9, #2                   @ &data_out[i - coefficients_length + 1]
+  mov r7, #0                   @ sum1
+  mov r8, #0                   @ sum1
+  subs r6, r3, #2              @ inner loop counter
+  beq EVEN_A_LENGTH            @ branch if coefficients_length == 2
+
+LOOP2_A_LENGTH:
+  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
+  ldr r5, [r9], #4             @ data_out[i - j],  data_out[i - j + 1]
+  subs r6, #2
+  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
+  smlabt r8, r10, r5, r8       @ coefficients[j - 1] * data_out[i - j + 1];
+  bgt LOOP2_A_LENGTH
+  addlt r12, #2
+  blt POST_LOOP2_A_LENGTH
+
+EVEN_A_LENGTH:
+  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[1]
+  ldrsh r5, [r9]               @ data_out[i - 1]
+  smlabb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];
+
+POST_LOOP2_A_LENGTH:
+  ldrsh r10, [r12]             @ Filter coefficients coefficients[0]
+  ldrsh r9, [r0]               @ data_in[i]
+  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
+  sub r6, r7                   @ output1 -= sum1;
+  sub r6, r8                   @ output1 -= sum1;
+  sbfx r8, r6, #12, #16
+  ssat r7, #16, r6, asr #12
+  cmp r7, r8
+  addeq r6, r6, #2048
+  ssat r6, #16, r6, asr #12
+  strh r6, [r1]                @ Store the data_out[i]
+
+END:
+  pop {r4-r11}
+  bx  lr
+
+.fnend
+
+
+@Reference C code:
+@
+@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
+@                               int16_t* data_out,
+@                               int16_t* __restrict coefficients,
+@                               int coefficients_length,
+@                               int data_length) {
+@  int i = 0;
+@  int j = 0;
+@
+@  for (i = 0; i < data_length - 1; i += 2) {
+@    int32_t output1 = 0;
+@    int32_t sum1 = 0;
+@    int32_t output2 = 0;
+@    int32_t sum2 = 0;
+@
+@    for (j = coefficients_length - 1; j > 2; j -= 2) {
+@      sum1 += coefficients[j]      * data_out[i - j];
+@      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
+@      sum2 += coefficients[j]     * data_out[i - j + 1];
+@      sum2 += coefficients[j - 1] * data_out[i - j + 2];
+@    }
+@
+@    if (j == 2) {
+@      sum1 += coefficients[2] * data_out[i - 2];
+@      sum2 += coefficients[2] * data_out[i - 1];
+@    }
+@
+@    sum1 += coefficients[1] * data_out[i - 1];
+@    output1 = coefficients[0] * data_in[i];
+@    output1 -= sum1;
+@    // Saturate and store the output.
+@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
+@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
+@
+@    sum2 += coefficients[1] * data_out[i];
+@    output2 = coefficients[0] * data_in[i + 1];
+@    output2 -= sum2;
+@    // Saturate and store the output.
+@    output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
+@    data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
+@  }
+@
+@  if (i == data_length - 1) {
+@    int32_t output1 = 0;
+@    int32_t sum1 = 0;
+@
+@    for (j = coefficients_length - 1; j > 1; j -= 2) {
+@      sum1 += coefficients[j]      * data_out[i - j];
+@      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
+@    }
+@
+@    if (j == 1) {
+@      sum1 += coefficients[1] * data_out[i - 1];
+@    }
+@
+@    output1 = coefficients[0] * data_in[i];
+@    output1 -= sum1;
+@    // Saturate and store the output.
+@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
+@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
+@  }
+@}
--- a/src/common_audio/signal_processing/include/signal_processing_library.h
+++ b/src/common_audio/signal_processing/include/signal_processing_library.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -385,11 +385,28 @@ void WebRtcSpl_FilterMAFastQ12(WebRtc_Word16* in_vector,
                               WebRtc_Word16* ma_coef,
                               WebRtc_Word16 ma_coef_length,
                               WebRtc_Word16 vector_length);
-void WebRtcSpl_FilterARFastQ12(WebRtc_Word16* in_vector,
-                               WebRtc_Word16* out_vector,
-                               WebRtc_Word16* ar_coef,
-                               WebRtc_Word16 ar_coef_length,
-                               WebRtc_Word16 vector_length);
+
+// WebRtcSpl_FilterARFastQ12(...)
+//
+// Performs a AR filtering on a vector in Q12
+//
+// Input:
+//      - data_in                : Input samples
+//      - data_out               : State information in positions
+//                                   data_out[-order] .. data_out[-1]
+//      - coefficients           : Filter coefficients (in Q12)
+//      - coefficients_length    : Number of coefficients (order+1)
+//      - data_length            : Number of samples to be filtered
+//
+// Output:
+//      - data_out               : Filtered samples
+
+void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
+                               int16_t* data_out,
+                               int16_t* __restrict coefficients,
+                               int coefficients_length,
+                               int data_length);
+
 int WebRtcSpl_DownsampleFast(WebRtc_Word16* in_vector,
                             WebRtc_Word16 in_vector_length,
                             WebRtc_Word16* out_vector,
@@ -1437,23 +1454,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
 //      - out_vector        : Filtered samples
 //

-//
-// WebRtcSpl_FilterARFastQ12(...)
-//
-// Performs a AR filtering on a vector in Q12
-//
-// Input:
-//      - in_vector         : Input samples
-//      - out_vector        : State information in positions
-//                            out_vector[-order] .. out_vector[-1]
-//      - ar_coef           : Filter coefficients (in Q12)
-//      - ar_coef_length    : Number of B coefficients (order+1)
-//      - vector_length     : Number of samples to be filtered
-//
-// Output:
-//      - out_vector        : Filtered samples
-//
-
 //
 // WebRtcSpl_DownsampleFast(...)
 //