From 1786436eb27822ec00980c8fe2c5ecdfd789dac5 Mon Sep 17 00:00:00 2001
From: "kma@webrtc.org" <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>
Date: Wed, 14 Nov 2012 18:44:24 +0000
Subject: [PATCH] Pure Neon assembly coding for WebRtcIsacfix_AutocorrNeon() in
 iSAC-Fix. Review URL: https://webrtc-codereview.appspot.com/939018

git-svn-id: http://webrtc.googlecode.com/svn/trunk@3098 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 .../codecs/isac/fix/source/Android.mk         |   3 +-
 .../codecs/isac/fix/source/codec.h            |   7 +
 .../codecs/isac/fix/source/filters.c          |  18 +-
 .../codecs/isac/fix/source/filters_neon.S     | 145 +++++++++++++++
 .../codecs/isac/fix/source/filters_neon.c     | 167 ------------------
 .../isac/fix/source/filters_unittest.cc       |  69 ++++++++
 .../codecs/isac/fix/source/isacfix.gypi       |   2 +-
 .../codecs/isac/isacfix_test.gypi             |   3 +-
 8 files changed, 232 insertions(+), 182 deletions(-)
 create mode 100644 webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
 delete mode 100644 webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.c
 create mode 100644 webrtc/modules/audio_coding/codecs/isac/fix/source/filters_unittest.cc

diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/Android.mk b/webrtc/modules/audio_coding/codecs/isac/fix/source/Android.mk
index bd2a91df0..e7745ea4a 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/Android.mk
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/Android.mk
@@ -87,7 +87,8 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := libwebrtc_isacfix_neon
 LOCAL_MODULE_TAGS := optional
 LOCAL_SRC_FILES := \
-    filters_neon.c \
+    filterbanks_neon.S \
+    filters_neon.S \
     lattice_neon.S \
     lpc_masking_model_neon.S
 
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h b/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h
index 1b4f98723..516fb4491 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h
@@ -21,6 +21,9 @@
 
 #include "structs.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 int WebRtcIsacfix_EstimateBandwidth(BwEstimatorstr   *bwest_str,
                                     Bitstr_dec       *streamdata,
@@ -176,4 +179,8 @@ typedef void (*FilterMaLoopFix)(int16_t input0,
                                 int32_t* ptr2);
 extern FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters.c
index 6ee047753..a5ebd392d 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters.c
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters.c
@@ -8,19 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-/*
- * filters.c
- *
- * This file contains function WebRtcIsacfix_AutocorrC,
- * AllpassFilterForDec32, and WebRtcIsacfix_DecimateAllpass32
- *
- */
+#include <assert.h>
 
-#include <string.h>
-
-#include "pitch_estimator.h"
-#include "lpc_masking_model.h"
-#include "codec.h"
+#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
 
 // Autocorrelation function in fixed point.
 // NOTE! Different from SPLIB-version in how it scales the signal.
@@ -36,6 +26,10 @@ int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
   uint32_t temp = 0;
   int64_t prod = 0;
 
+  // The ARM assembly code assumptoins.
+  assert(N % 4 == 0);
+  assert(N >= 8);
+
   // Calculate r[0].
   for (i = 0; i < N; i++) {
     prod += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
new file mode 100644
index 000000000..feb93c93f
--- /dev/null
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
@@ -0,0 +1,145 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Reference code in filters.c. Output is bit-exact.
+
+#include "settings.h"
+
+.global WebRtcIsacfix_AutocorrNeon
+.align  2
+
+@ int WebRtcIsacfix_AutocorrNeon(
+@     WebRtc_Word32* __restrict r,
+@     const WebRtc_Word16* __restrict x,
+@     WebRtc_Word16 N,
+@     WebRtc_Word16 order,
+@     WebRtc_Word16* __restrict scale);
+
+WebRtcIsacfix_AutocorrNeon:
+  push       {r3 - r12}
+
+  @ Constant initializations
+  mov        r4, #33
+  vmov.i32   d0, #0
+  vmov.i32   q8, #0
+  vmov.i32   d29, #0               @ Initialize (-scale).
+  vmov.u8    d30, #255             @ Initialize d30 as -1.
+  vmov.i32   d0[0], r4             @ d0: 00000033 (low), 00000000 (high)
+  vmov.i32   d25, #32
+
+  mov        r5, r1                @ x
+  mov        r6, r2                @ N
+
+@ Generate the first coefficient r0.
+LOOP_R0:
+  vld1.16    {d18}, [r5]!          @ x[]
+  subs       r6, r6, #4
+  vmull.s16  q9, d18, d18
+  vpadal.s32 q8, q9
+  bgt        LOOP_R0
+
+  vadd.i64   d16, d16, d17
+
+  @ Calculate scaling (the value of shifting).
+  vmov       d17, d16
+
+  @ Check overflow and determine the value for 'scale'.
+  @ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and
+  @ lower 32-bit words. Note that we don't care about the value of the upper
+  @ word in d17.
+
+  @ Check the case of 1 bit overflow. If it occurs store the results for
+  @ scale and r[0] in d17 and d29.
+
+  vshr.u64   d3, d16, #1
+  vclt.s32   d1, d16, #0           @ < 0 ?
+  vbit       d17, d3, d1           @ For r[0]
+  vbit       d29, d30, d1          @ -scale = -1
+
+  @ For the case of more than 1 bit overflow. If it occurs overwrite the
+  @ results for scale and r[0] in d17 and d29.
+  vclz.s32   d5, d16               @ Leading zeros of the two 32 bit words.
+  vshr.s64   d26, d5, #32          @ Keep only the upper 32 bits.
+  vsub.i64   d31, d26, d0          @ zeros - 33
+  vshl.i64   d27, d26, #32
+  vorr       d27, d26              @ Duplicate the high word with its low one.
+  vshl.u64   d2, d16, d31          @ Shift by (-scale).
+  vclt.s32   d1, d27, d25          @ < 32 ?
+  vbit       d17, d2, d1           @ For r[0]
+  vbit       d29, d31, d1          @ -scale
+
+  vst1.32    d17[0], [r0]!         @ r[0]
+  mov        r5, #1                @ outer loop counter
+
+@ Generate rest of the coefficients
+LOOP_R:
+  vmov.i32   q8, #0                @ Initialize the accumulation result.
+  vmov.i32   q9, #0                @ Initialize the accumulation result.
+  mov        r7, r1                @ &x[0]
+  add        r6, r7, r5, lsl #1    @ x[i]
+  sub        r12, r2, r5           @ N - i
+  lsr        r8, r12, #3           @ inner loop counter
+  sub        r12, r8, lsl #3       @ Leftover samples to be processed
+
+LOOP_8X_SAMPLES:                   @ Multiple of 8 samples
+  vld1.16    {d20, d21}, [r7]!     @ x[0, ...]
+  vld1.16    {d22, d23}, [r6]!     @ x[i, ...]
+  vmull.s16  q12, d20, d22
+  vmull.s16  q13, d21, d23
+  subs       r8, #1
+  vpadal.s32 q8, q12
+  vpadal.s32 q9, q13
+  bgt        LOOP_8X_SAMPLES
+
+  cmp r12, #4
+  blt REST_SAMPLES
+
+Four_SAMPLES:
+  vld1.16    d20, [r7]!
+  vld1.16    d22, [r6]!
+  vmull.s16  q12, d20, d22
+  vpadal.s32 q8, q12
+  sub r12, #4
+
+REST_SAMPLES:
+  mov        r8, #0                @ Initialize lower word of the accumulation.
+  mov        r4, #0                @ Initialize upper word of the accumulation.
+  cmp r12, #0
+  ble SUMUP
+
+LOOP_REST_SAMPLES:
+  ldrh       r9, [r7], #2          @ x[0, ...]
+  ldrh       r10, [r6], #2         @ x[i, ...]
+  smulbb     r11, r9, r10
+  adds       r8, r8, r11           @ lower word of the accumulation.
+  adc        r4, r4, r11, asr #31  @ upper word of the accumulation.
+  subs       r12, #1
+  bgt        LOOP_REST_SAMPLES
+
+@ Added the multiplication results together and do a shift.
+SUMUP:
+  vadd.i64   d16, d17
+  vadd.i64   d18, d19
+  vadd.i64   d18, d16
+  vmov       d17, r8, r4
+  vadd.i64   d18, d17
+  vshl.s64   d18, d29              @ Shift left by (-scale).
+  vst1.32    d18[0], [r0]!         @ r[i]
+
+  add        r5, #1
+  cmp        r5, r3
+  ble        LOOP_R
+
+  vneg.s32   d29, d29              @ Get value for 'scale'.
+  ldr        r2, [sp, #40]         @ &scale
+  add        r0, r3, #1            @ return (order + 1)
+  vst1.s16   d29[0], [r2]          @ Store 'scale'
+
+  pop        {r3 - r12}
+  bx         lr
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.c
deleted file mode 100644
index 93143fe43..000000000
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-/*
- * filters_neon.c
- *
- * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for
- * ARM Neon platform.
- *
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "codec.h"
-
-// Autocorrelation function in fixed point.
-// NOTE! Different from SPLIB-version in how it scales the signal.
-int WebRtcIsacfix_AutocorrNeon(
-    WebRtc_Word32* __restrict r,
-    const WebRtc_Word16* __restrict x,
-    WebRtc_Word16 N,
-    WebRtc_Word16 order,
-    WebRtc_Word16* __restrict scale) {
-
-  // The 1st for loop assumed N % 4 == 0.
-  assert(N % 4 == 0);
-
-  int i = 0;
-  int zeros_low = 0;
-  int zeros_high = 0;
-  int16_t scaling = 0;
-  int32_t sum = 0;
-
-  // Step 1, calculate r[0] and how much scaling is needed.
-
-  int16x4_t reg16x4;
-  int64x1_t reg64x1a;
-  int64x1_t reg64x1b;
-  int32x4_t reg32x4;
-  int64x2_t reg64x2 = vdupq_n_s64(0); // zeros
-
-  // Loop over the samples and do:
-  // sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
-  for (i = 0; i < N; i += 4) {
-    reg16x4 = vld1_s16(&x[i]);
-    reg32x4 = vmull_s16(reg16x4, reg16x4);
-    reg64x2 = vpadalq_s32(reg64x2, reg32x4);
-  }
-  reg64x1a = vget_low_s64(reg64x2);
-  reg64x1b = vget_high_s64(reg64x2);
-  reg64x1a = vadd_s64(reg64x1a, reg64x1b);
-
-  // Calculate the value of shifting (scaling).
-  __asm__ __volatile__(
-    "vmov %[z_l], %[z_h], %P[reg]\n\t"
-    "clz %[z_l], %[z_l]\n\t"
-    "clz %[z_h], %[z_h]\n\t"
-    :[z_l]"+r"(zeros_low),
-     [z_h]"+r"(zeros_high)
-    :[reg]"w"(reg64x1a)
-  );
-  if (zeros_high != 32) {
-    scaling = (32 - zeros_high + 1);
-  } else if (zeros_low == 0) {
-    scaling = 1;
-  }
-  reg64x1b = -scaling;
-  reg64x1a = vshl_s64(reg64x1a, reg64x1b);
-
-  // Record the result.
-  r[0] = (int32_t)vget_lane_s64(reg64x1a, 0);
-
-
-  // Step 2, perform the actual correlation calculation.
-
-  /* Original C code (for the rest of the function):
-  for (i = 1; i < order + 1; i++)  {
-    prod = 0;
-    for (j = 0; j < N - i; j++) {
-      prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
-    }
-    sum = (int32_t)(prod >> scaling);
-    r[i] = sum;
-  }
-  */
-
-  for (i = 1; i < order + 1; i++) {
-    int32_t prod_lower = 0;
-    int32_t prod_upper = 0;
-    const int16_t* ptr0 = &x[0];
-    const int16_t* ptr1 = &x[i];
-    int32_t tmp = 0;
-
-    // Initialize the sum (q9) to zero.
-    __asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9");
-
-    // Calculate the major block of the samples (a multiple of 8).
-    for (; ptr0 < &x[N - i - 7];) {
-      __asm__ __volatile__(
-        "vld1.16 {d20, d21}, [%[ptr0]]!\n\t"
-        "vld1.16 {d22, d23}, [%[ptr1]]!\n\t"
-        "vmull.s16 q12, d20, d22\n\t"
-        "vmull.s16 q13, d21, d23\n\t"
-        "vpadal.s32 q9, q12\n\t"
-        "vpadal.s32 q9, q13\n\t"
-
-        // Specify constraints.
-        :[ptr0]"+r"(ptr0),
-        [ptr1]"+r"(ptr1)
-        :
-        :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27"
-      );
-    }
-
-    // Calculate the rest of the samples.
-    for (; ptr0 < &x[N - i]; ptr0++, ptr1++) {
-      __asm__ __volatile__(
-        "smulbb %[tmp], %[ptr0], %[ptr1]\n\t"
-        "adds %[prod_lower], %[prod_lower], %[tmp]\n\t"
-        "adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t"
-
-        // Specify constraints.
-        :[prod_lower]"+r"(prod_lower),
-        [prod_upper]"+r"(prod_upper),
-        [tmp]"+r"(tmp)
-        :[ptr0]"r"(*ptr0),
-        [ptr1]"r"(*ptr1)
-      );
-    }
-
-    // Sum the results up, and do shift.
-    __asm__ __volatile__(
-      "vadd.i64 d18, d19\n\t"
-      "vmov.32 d17[0], %[prod_lower]\n\t"
-      "vmov.32 d17[1], %[prod_upper]\n\t"
-      "vadd.i64 d17, d18\n\t"
-      "mov %[tmp], %[scaling], asr #31\n\t"
-      "vmov.32 d16, %[scaling], %[tmp]\n\t"
-      "vshl.s64 d17, d16\n\t"
-      "vmov.32 %[sum], d17[0]\n\t"
-
-      // Specify constraints.
-      :[sum]"=r"(sum),
-      [tmp]"+r"(tmp)
-      :[prod_upper]"r"(prod_upper),
-      [prod_lower]"r"(prod_lower),
-      [scaling]"r"(-scaling)
-      :"d16", "d17", "d18", "d19"
-    );
-
-    // Record the result.
-    r[i] = sum;
-  }
-
-  // Record the result.
-  *scale = scaling;
-
-  return(order + 1);
-}
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_unittest.cc b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_unittest.cc
new file mode 100644
index 000000000..e070789e4
--- /dev/null
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_unittest.cc
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "gtest/gtest.h"
+#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
+#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
+#include "webrtc/typedefs.h"
+
+class FiltersTest : public testing::Test {
+ protected:
+  // Pass a function pointer to the Tester function.
+  void FiltersTester(AutocorrFix WebRtcIsacfix_AutocorrFixFunction) {
+    const int kOrder = 12;
+    const int kBuffer = 40;
+    int16_t scale = 0;
+    int32_t r_buffer[kOrder + 2] = {0};
+
+    // Test an overflow case.
+    const int16_t x_buffer_0[kBuffer] = {0, 0, 3010, 22351, 21106, 16969, -2095,
+        -664, 3513, -30980, 32767, -23839, 13335, 20289, -6831, 339, -17207,
+        32767, 4959, 6177, 32767, 16599, -4747, 20504, 3513, -30980, 32767,
+        -23839, 13335, 20289, 0, -16969, -2095, -664, 3513, 31981, 32767,
+        -13839, 23336, 30281};
+    const int32_t r_expected_0[kOrder + 2] = {1872498461, -224288754, 203789985,
+        483400487, -208272635, 2436500, 137785322, 266600814, -208486262,
+        329510080, 137949184, -161738972, -26894267, 237630192};
+
+    WebRtcIsacfix_AutocorrFixFunction(r_buffer, x_buffer_0,
+                                      kBuffer, kOrder + 1, &scale);
+    for (int i = 0; i < kOrder + 2; i++) {
+      EXPECT_EQ(r_expected_0[i], r_buffer[i]);
+    }
+    EXPECT_EQ(3, scale);
+
+    // Test a no-overflow case.
+    const int16_t x_buffer_1[kBuffer] = {0, 0, 300, 21, 206, 169, -295,
+        -664, 3513, -300, 327, -29, 15, 289, -6831, 339, -107,
+        37, 59, 6177, 327, 169, -4747, 204, 313, -980, 767,
+        -9, 135, 289, 0, -6969, -2095, -664, 0, 1, 7,
+        -39, 236, 281};
+    const int32_t r_expected_1[kOrder + 2] = {176253864, 8126617, 1983287,
+        -26196788, -3487363, -42839676, -24644043, 3469813, 30559879, 31905045,
+        5101567, 29328896, -55787438, -13163978};
+
+    WebRtcIsacfix_AutocorrFixFunction(r_buffer, x_buffer_1,
+                                      kBuffer, kOrder + 1, &scale);
+    for (int i = 0; i < kOrder + 2; i++) {
+      EXPECT_EQ(r_expected_1[i], r_buffer[i]);
+    }
+    EXPECT_EQ(0, scale);
+  }
+};
+
+TEST_F(FiltersTest, AutocorrFixTest) {
+  FiltersTester(WebRtcIsacfix_AutocorrC);
+#ifdef WEBRTC_DETECT_ARM_NEON
+  if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
+    FiltersTester(WebRtcIsacfix_AutocorrNeon);
+  }
+#elif defined(WEBRTC_ARCH_ARM_NEON)
+  FiltersTester(WebRtcIsacfix_AutocorrNeon);
+#endif
+}
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
index 499ecda84..866e8e621 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
@@ -97,8 +97,8 @@
             '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
           ],
           'sources': [
+            'filters_neon.S',
             'filterbanks_neon.S',
-            'filters_neon.c',
             'lattice_neon.S',
             'lpc_masking_model_neon.S',
           ],
diff --git a/webrtc/modules/audio_coding/codecs/isac/isacfix_test.gypi b/webrtc/modules/audio_coding/codecs/isac/isacfix_test.gypi
index 861e2422e..04bdecdcf 100644
--- a/webrtc/modules/audio_coding/codecs/isac/isacfix_test.gypi
+++ b/webrtc/modules/audio_coding/codecs/isac/isacfix_test.gypi
@@ -32,8 +32,9 @@
         '<(webrtc_root)/test/test.gyp:test_support_main',
       ],
       'sources': [
-        'fix/source/lpc_masking_model_unittest.cc',
+        'fix/source/filters_unittest.cc',
         'fix/source/filterbanks_unittest.cc',
+        'fix/source/lpc_masking_model_unittest.cc',
       ],
     },
   ],