From 1786436eb27822ec00980c8fe2c5ecdfd789dac5 Mon Sep 17 00:00:00 2001 From: "kma@webrtc.org" Date: Wed, 14 Nov 2012 18:44:24 +0000 Subject: [PATCH] Pure Neon assembly coding for WebRtcIsacfix_AutocorrNeon() in iSAC-Fix. Review URL: https://webrtc-codereview.appspot.com/939018 git-svn-id: http://webrtc.googlecode.com/svn/trunk@3098 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../codecs/isac/fix/source/Android.mk | 3 +- .../codecs/isac/fix/source/codec.h | 7 + .../codecs/isac/fix/source/filters.c | 18 +- .../codecs/isac/fix/source/filters_neon.S | 145 +++++++++++++++ .../codecs/isac/fix/source/filters_neon.c | 167 ------------------ .../isac/fix/source/filters_unittest.cc | 69 ++++++++ .../codecs/isac/fix/source/isacfix.gypi | 2 +- .../codecs/isac/isacfix_test.gypi | 3 +- 8 files changed, 232 insertions(+), 182 deletions(-) create mode 100644 webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S delete mode 100644 webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.c create mode 100644 webrtc/modules/audio_coding/codecs/isac/fix/source/filters_unittest.cc diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/Android.mk b/webrtc/modules/audio_coding/codecs/isac/fix/source/Android.mk index bd2a91df0..e7745ea4a 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/Android.mk +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/Android.mk @@ -87,7 +87,8 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := libwebrtc_isacfix_neon LOCAL_MODULE_TAGS := optional LOCAL_SRC_FILES := \ - filters_neon.c \ + filterbanks_neon.S \ + filters_neon.S \ lattice_neon.S \ lpc_masking_model_neon.S diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h b/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h index 1b4f98723..516fb4491 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h @@ -21,6 +21,9 @@ #include "structs.h" +#ifdef 
__cplusplus +extern "C" { +#endif int WebRtcIsacfix_EstimateBandwidth(BwEstimatorstr *bwest_str, Bitstr_dec *streamdata, @@ -176,4 +179,8 @@ typedef void (*FilterMaLoopFix)(int16_t input0, int32_t* ptr2); extern FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix; +#ifdef __cplusplus +} // extern "C" +#endif + #endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */ diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters.c index 6ee047753..a5ebd392d 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters.c +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters.c @@ -8,19 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -/* - * filters.c - * - * This file contains function WebRtcIsacfix_AutocorrC, - * AllpassFilterForDec32, and WebRtcIsacfix_DecimateAllpass32 - * - */ +#include <assert.h> -#include - -#include "pitch_estimator.h" -#include "lpc_masking_model.h" -#include "codec.h" +#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h" // Autocorrelation function in fixed point. // NOTE! Different from SPLIB-version in how it scales the signal. @@ -36,6 +26,10 @@ int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r, uint32_t temp = 0; int64_t prod = 0; + // The ARM assembly code assumptions. + assert(N % 4 == 0); + assert(N >= 8); + // Calculate r[0]. for (i = 0; i < N; i++) { prod += WEBRTC_SPL_MUL_16_16(x[i], x[i]); diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S new file mode 100644 index 000000000..feb93c93f --- /dev/null +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S @@ -0,0 +1,145 @@ +@ +@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. 
An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ +@ Reference code in filters.c. Output is bit-exact. + +#include "settings.h" + +.global WebRtcIsacfix_AutocorrNeon +.align 2 + +@ int WebRtcIsacfix_AutocorrNeon( +@ WebRtc_Word32* __restrict r, +@ const WebRtc_Word16* __restrict x, +@ WebRtc_Word16 N, +@ WebRtc_Word16 order, +@ WebRtc_Word16* __restrict scale); + +WebRtcIsacfix_AutocorrNeon: + push {r3 - r12} + + @ Constant initializations + mov r4, #33 + vmov.i32 d0, #0 + vmov.i32 q8, #0 + vmov.i32 d29, #0 @ Initialize (-scale). + vmov.u8 d30, #255 @ Initialize d30 as -1. + vmov.i32 d0[0], r4 @ d0: 33 (low word), 0 (high word) + vmov.i32 d25, #32 + + mov r5, r1 @ x + mov r6, r2 @ N + +@ Generate the first coefficient r[0]. +LOOP_R0: + vld1.16 {d18}, [r5]! @ x[] + subs r6, r6, #4 + vmull.s16 q9, d18, d18 + vpadal.s32 q8, q9 + bgt LOOP_R0 + + vadd.i64 d16, d16, d17 + + @ Calculate scaling (the value of shifting). + vmov d17, d16 + + @ Check overflow and determine the value for 'scale'. + @ vclz cannot handle 64-bit data, so we have to do vclz on both the upper and + @ lower 32-bit words. Note that we don't care about the value of the upper + @ word in d17. + + @ Check the case of 1 bit overflow. If it occurs store the results for + @ r[0] and (-scale) in d17 and d29. + + vshr.u64 d3, d16, #1 + vclt.s32 d1, d16, #0 @ < 0 ? + vbit d17, d3, d1 @ For r[0] + vbit d29, d30, d1 @ -scale = -1 + + @ For the case of more than 1 bit overflow. If it occurs overwrite the + @ results for r[0] and (-scale) in d17 and d29. + vclz.s32 d5, d16 @ Leading zeros of the two 32 bit words. + vshr.s64 d26, d5, #32 @ Keep only the upper 32 bits. + vsub.i64 d31, d26, d0 @ zeros - 33 + vshl.i64 d27, d26, #32 + vorr d27, d26 @ Duplicate the high word with its low one. + vshl.u64 d2, d16, d31 @ Shift by (-scale). + vclt.s32 d1, d27, d25 @ < 32 ? 
+ vbit d17, d2, d1 @ For r[0] + vbit d29, d31, d1 @ -scale + + vst1.32 d17[0], [r0]! @ r[0] + mov r5, #1 @ outer loop counter + +@ Generate the rest of the coefficients +LOOP_R: + vmov.i32 q8, #0 @ Initialize the accumulation result. + vmov.i32 q9, #0 @ Initialize the accumulation result. + mov r7, r1 @ &x[0] + add r6, r7, r5, lsl #1 @ &x[i] + sub r12, r2, r5 @ N - i + lsr r8, r12, #3 @ inner loop counter + sub r12, r8, lsl #3 @ Leftover samples to be processed + +LOOP_8X_SAMPLES: @ Multiple of 8 samples + vld1.16 {d20, d21}, [r7]! @ x[0, ...] + vld1.16 {d22, d23}, [r6]! @ x[i, ...] + vmull.s16 q12, d20, d22 + vmull.s16 q13, d21, d23 + subs r8, #1 + vpadal.s32 q8, q12 + vpadal.s32 q9, q13 + bgt LOOP_8X_SAMPLES + + cmp r12, #4 + blt REST_SAMPLES + +Four_SAMPLES: + vld1.16 d20, [r7]! + vld1.16 d22, [r6]! + vmull.s16 q12, d20, d22 + vpadal.s32 q8, q12 + sub r12, #4 + +REST_SAMPLES: + mov r8, #0 @ Initialize lower word of the accumulation. + mov r4, #0 @ Initialize upper word of the accumulation. + cmp r12, #0 + ble SUMUP + +LOOP_REST_SAMPLES: + ldrh r9, [r7], #2 @ x[0, ...] + ldrh r10, [r6], #2 @ x[i, ...] + smulbb r11, r9, r10 + adds r8, r8, r11 @ lower word of the accumulation. + adc r4, r4, r11, asr #31 @ upper word of the accumulation. + subs r12, #1 + bgt LOOP_REST_SAMPLES + +@ Add the multiplication results together and do a shift. +SUMUP: + vadd.i64 d16, d17 + vadd.i64 d18, d19 + vadd.i64 d18, d16 + vmov d17, r8, r4 + vadd.i64 d18, d17 + vshl.s64 d18, d29 @ Shift left by (-scale). + vst1.32 d18[0], [r0]! @ r[i] + + add r5, #1 + cmp r5, r3 + ble LOOP_R + + vneg.s32 d29, d29 @ Get value for 'scale'. 
+ ldr r2, [sp, #40] @ &scale + add r0, r3, #1 @ return (order + 1) + vst1.s16 d29[0], [r2] @ Store 'scale' + + pop {r3 - r12} + bx lr diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.c deleted file mode 100644 index 93143fe43..000000000 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -/* - * filters_neon.c - * - * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for - * ARM Neon platform. - * - */ - -#include -#include - -#include "codec.h" - -// Autocorrelation function in fixed point. -// NOTE! Different from SPLIB-version in how it scales the signal. -int WebRtcIsacfix_AutocorrNeon( - WebRtc_Word32* __restrict r, - const WebRtc_Word16* __restrict x, - WebRtc_Word16 N, - WebRtc_Word16 order, - WebRtc_Word16* __restrict scale) { - - // The 1st for loop assumed N % 4 == 0. - assert(N % 4 == 0); - - int i = 0; - int zeros_low = 0; - int zeros_high = 0; - int16_t scaling = 0; - int32_t sum = 0; - - // Step 1, calculate r[0] and how much scaling is needed. 
- - int16x4_t reg16x4; - int64x1_t reg64x1a; - int64x1_t reg64x1b; - int32x4_t reg32x4; - int64x2_t reg64x2 = vdupq_n_s64(0); // zeros - - // Loop over the samples and do: - // sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]); - for (i = 0; i < N; i += 4) { - reg16x4 = vld1_s16(&x[i]); - reg32x4 = vmull_s16(reg16x4, reg16x4); - reg64x2 = vpadalq_s32(reg64x2, reg32x4); - } - reg64x1a = vget_low_s64(reg64x2); - reg64x1b = vget_high_s64(reg64x2); - reg64x1a = vadd_s64(reg64x1a, reg64x1b); - - // Calculate the value of shifting (scaling). - __asm__ __volatile__( - "vmov %[z_l], %[z_h], %P[reg]\n\t" - "clz %[z_l], %[z_l]\n\t" - "clz %[z_h], %[z_h]\n\t" - :[z_l]"+r"(zeros_low), - [z_h]"+r"(zeros_high) - :[reg]"w"(reg64x1a) - ); - if (zeros_high != 32) { - scaling = (32 - zeros_high + 1); - } else if (zeros_low == 0) { - scaling = 1; - } - reg64x1b = -scaling; - reg64x1a = vshl_s64(reg64x1a, reg64x1b); - - // Record the result. - r[0] = (int32_t)vget_lane_s64(reg64x1a, 0); - - - // Step 2, perform the actual correlation calculation. - - /* Original C code (for the rest of the function): - for (i = 1; i < order + 1; i++) { - prod = 0; - for (j = 0; j < N - i; j++) { - prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]); - } - sum = (int32_t)(prod >> scaling); - r[i] = sum; - } - */ - - for (i = 1; i < order + 1; i++) { - int32_t prod_lower = 0; - int32_t prod_upper = 0; - const int16_t* ptr0 = &x[0]; - const int16_t* ptr1 = &x[i]; - int32_t tmp = 0; - - // Initialize the sum (q9) to zero. - __asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9"); - - // Calculate the major block of the samples (a multiple of 8). - for (; ptr0 < &x[N - i - 7];) { - __asm__ __volatile__( - "vld1.16 {d20, d21}, [%[ptr0]]!\n\t" - "vld1.16 {d22, d23}, [%[ptr1]]!\n\t" - "vmull.s16 q12, d20, d22\n\t" - "vmull.s16 q13, d21, d23\n\t" - "vpadal.s32 q9, q12\n\t" - "vpadal.s32 q9, q13\n\t" - - // Specify constraints. 
- :[ptr0]"+r"(ptr0), - [ptr1]"+r"(ptr1) - : - :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27" - ); - } - - // Calculate the rest of the samples. - for (; ptr0 < &x[N - i]; ptr0++, ptr1++) { - __asm__ __volatile__( - "smulbb %[tmp], %[ptr0], %[ptr1]\n\t" - "adds %[prod_lower], %[prod_lower], %[tmp]\n\t" - "adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t" - - // Specify constraints. - :[prod_lower]"+r"(prod_lower), - [prod_upper]"+r"(prod_upper), - [tmp]"+r"(tmp) - :[ptr0]"r"(*ptr0), - [ptr1]"r"(*ptr1) - ); - } - - // Sum the results up, and do shift. - __asm__ __volatile__( - "vadd.i64 d18, d19\n\t" - "vmov.32 d17[0], %[prod_lower]\n\t" - "vmov.32 d17[1], %[prod_upper]\n\t" - "vadd.i64 d17, d18\n\t" - "mov %[tmp], %[scaling], asr #31\n\t" - "vmov.32 d16, %[scaling], %[tmp]\n\t" - "vshl.s64 d17, d16\n\t" - "vmov.32 %[sum], d17[0]\n\t" - - // Specify constraints. - :[sum]"=r"(sum), - [tmp]"+r"(tmp) - :[prod_upper]"r"(prod_upper), - [prod_lower]"r"(prod_lower), - [scaling]"r"(-scaling) - :"d16", "d17", "d18", "d19" - ); - - // Record the result. - r[i] = sum; - } - - // Record the result. - *scale = scaling; - - return(order + 1); -} diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_unittest.cc b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_unittest.cc new file mode 100644 index 000000000..e070789e4 --- /dev/null +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_unittest.cc @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "gtest/gtest.h" +#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h" +#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h" +#include "webrtc/typedefs.h" + +class FiltersTest : public testing::Test { + protected: + // Pass a function pointer to the Tester function. + void FiltersTester(AutocorrFix WebRtcIsacfix_AutocorrFixFunction) { + const int kOrder = 12; + const int kBuffer = 40; + int16_t scale = 0; + int32_t r_buffer[kOrder + 2] = {0}; + + // Test an overflow case. + const int16_t x_buffer_0[kBuffer] = {0, 0, 3010, 22351, 21106, 16969, -2095, + -664, 3513, -30980, 32767, -23839, 13335, 20289, -6831, 339, -17207, + 32767, 4959, 6177, 32767, 16599, -4747, 20504, 3513, -30980, 32767, + -23839, 13335, 20289, 0, -16969, -2095, -664, 3513, 31981, 32767, + -13839, 23336, 30281}; + const int32_t r_expected_0[kOrder + 2] = {1872498461, -224288754, 203789985, + 483400487, -208272635, 2436500, 137785322, 266600814, -208486262, + 329510080, 137949184, -161738972, -26894267, 237630192}; + + WebRtcIsacfix_AutocorrFixFunction(r_buffer, x_buffer_0, + kBuffer, kOrder + 1, &scale); + for (int i = 0; i < kOrder + 2; i++) { + EXPECT_EQ(r_expected_0[i], r_buffer[i]); + } + EXPECT_EQ(3, scale); + + // Test a no-overflow case. 
+ const int16_t x_buffer_1[kBuffer] = {0, 0, 300, 21, 206, 169, -295, + -664, 3513, -300, 327, -29, 15, 289, -6831, 339, -107, + 37, 59, 6177, 327, 169, -4747, 204, 313, -980, 767, + -9, 135, 289, 0, -6969, -2095, -664, 0, 1, 7, + -39, 236, 281}; + const int32_t r_expected_1[kOrder + 2] = {176253864, 8126617, 1983287, + -26196788, -3487363, -42839676, -24644043, 3469813, 30559879, 31905045, + 5101567, 29328896, -55787438, -13163978}; + + WebRtcIsacfix_AutocorrFixFunction(r_buffer, x_buffer_1, + kBuffer, kOrder + 1, &scale); + for (int i = 0; i < kOrder + 2; i++) { + EXPECT_EQ(r_expected_1[i], r_buffer[i]); + } + EXPECT_EQ(0, scale); + } +}; + +TEST_F(FiltersTest, AutocorrFixTest) { + FiltersTester(WebRtcIsacfix_AutocorrC); +#ifdef WEBRTC_DETECT_ARM_NEON + if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) { + FiltersTester(WebRtcIsacfix_AutocorrNeon); + } +#elif defined(WEBRTC_ARCH_ARM_NEON) + FiltersTester(WebRtcIsacfix_AutocorrNeon); +#endif +} diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi index 499ecda84..866e8e621 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi @@ -97,8 +97,8 @@ '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing', ], 'sources': [ + 'filters_neon.S', 'filterbanks_neon.S', - 'filters_neon.c', 'lattice_neon.S', 'lpc_masking_model_neon.S', ], diff --git a/webrtc/modules/audio_coding/codecs/isac/isacfix_test.gypi b/webrtc/modules/audio_coding/codecs/isac/isacfix_test.gypi index 861e2422e..04bdecdcf 100644 --- a/webrtc/modules/audio_coding/codecs/isac/isacfix_test.gypi +++ b/webrtc/modules/audio_coding/codecs/isac/isacfix_test.gypi @@ -32,8 +32,9 @@ '<(webrtc_root)/test/test.gyp:test_support_main', ], 'sources': [ - 'fix/source/lpc_masking_model_unittest.cc', + 'fix/source/filters_unittest.cc', 'fix/source/filterbanks_unittest.cc', + 
'fix/source/lpc_masking_model_unittest.cc', ], }, ],