diff --git a/webrtc/modules/audio_processing/aec/Android.mk b/webrtc/modules/audio_processing/aec/Android.mk index 3ad52b966..181e87d9a 100644 --- a/webrtc/modules/audio_processing/aec/Android.mk +++ b/webrtc/modules/audio_processing/aec/Android.mk @@ -47,3 +47,12 @@ ifndef NDK_ROOT include external/stlport/libstlport.mk endif include $(BUILD_STATIC_LIBRARY) + +######################### +# Build the neon library. +ifeq ($(WEBRTC_BUILD_NEON_LIBS),true) + +LOCAL_SRC_FILES += \ + aec_core_neon.c + +endif # ifeq ($(WEBRTC_BUILD_NEON_LIBS),true) diff --git a/webrtc/modules/audio_processing/aec/aec_core.c b/webrtc/modules/audio_processing/aec/aec_core.c index 2d2116a43..207c6dc3b 100644 --- a/webrtc/modules/audio_processing/aec/aec_core.c +++ b/webrtc/modules/audio_processing/aec/aec_core.c @@ -67,7 +67,7 @@ static const float sqrtHanning[65] = { // Matlab code to produce table: // weightCurve = [0 ; 0.3 * sqrt(linspace(0,1,64))' + 0.1]; // fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', weightCurve); -const float WebRtcAec_weightCurve[65] = { +ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65] = { 0.0000f, 0.1000f, 0.1378f, 0.1535f, 0.1655f, 0.1756f, 0.1845f, 0.1926f, 0.2000f, 0.2069f, 0.2134f, 0.2195f, 0.2254f, 0.2309f, 0.2363f, 0.2414f, 0.2464f, 0.2512f, 0.2558f, 0.2604f, 0.2648f, 0.2690f, 0.2732f, 0.2773f, @@ -81,7 +81,7 @@ const float WebRtcAec_weightCurve[65] = { // Matlab code to produce table: // overDriveCurve = [sqrt(linspace(0,1,65))' + 1]; // fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', overDriveCurve); -const float WebRtcAec_overDriveCurve[65] = { +ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65] = { 1.0000f, 1.1250f, 1.1768f, 1.2165f, 1.2500f, 1.2795f, 1.3062f, 1.3307f, 1.3536f, 1.3750f, 1.3953f, 1.4146f, 1.4330f, 1.4507f, 1.4677f, 1.4841f, 1.5000f, 1.5154f, 1.5303f, 1.5449f, 1.5590f, 1.5728f, 1.5863f, 1.5995f, @@ -582,6 +582,10 @@ int WebRtcAec_InitAec(AecCore* aec, int sampFreq) { WebRtcAec_InitAec_mips(); #endif +#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) + WebRtcAec_InitAec_neon(); +#endif + aec_rdft_init(); return 0; diff --git a/webrtc/modules/audio_processing/aec/aec_core.h b/webrtc/modules/audio_processing/aec/aec_core.h index cd2acfe69..93bfed466 100644 --- a/webrtc/modules/audio_processing/aec/aec_core.h +++ b/webrtc/modules/audio_processing/aec/aec_core.h @@ -57,6 +57,9 @@ void WebRtcAec_InitAec_SSE2(void); #if defined(MIPS_FPU_LE) void WebRtcAec_InitAec_mips(void); #endif +#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) +void WebRtcAec_InitAec_neon(void); +#endif void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend); void WebRtcAec_ProcessFrame(AecCore* aec, diff --git a/webrtc/modules/audio_processing/aec/aec_core_neon.c b/webrtc/modules/audio_processing/aec/aec_core_neon.c new file mode 100644 index 000000000..d751a4b17 --- /dev/null +++ b/webrtc/modules/audio_processing/aec/aec_core_neon.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * The core AEC algorithm, neon version of speed-critical functions. + * + * Based on aec_core_sse2.c. + */ + +#include "webrtc/modules/audio_processing/aec/aec_core.h" + +#include +#include + +#include "webrtc/modules/audio_processing/aec/aec_core_internal.h" +#include "webrtc/modules/audio_processing/aec/aec_rdft.h" + +enum { kShiftExponentIntoTopMantissa = 8 }; +enum { kFloatExponentShift = 23 }; + +extern const float WebRtcAec_weightCurve[65]; +extern const float WebRtcAec_overDriveCurve[65]; + +static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) { + // a^b = exp2(b * log2(a)) + // exp2(x) and log2(x) are calculated using polynomial approximations. + float32x4_t log2_a, b_log2_a, a_exp_b; + + // Calculate log2(x), x = a. + { + // To calculate log2(x), we decompose x like this: + // x = y * 2^n + // n is an integer + // y is in the [1.0, 2.0) range + // + // log2(x) = log2(y) + n + // n can be evaluated by playing with float representation. + // log2(y) in a small range can be approximated, this code uses an order + // five polynomial approximation. The coefficients have been + // estimated with the Remez algorithm and the resulting + // polynomial has a maximum relative error of 0.00086%. + + // Compute n. + // This is done by masking the exponent, shifting it into the top bit of + // the mantissa, putting eight into the biased exponent (to shift/ + // compensate the fact that the exponent has been shifted in the top/ + // fractional part and finally getting rid of the implicit leading one + // from the mantissa by substracting it out. + const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); + const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); + const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); + const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a), + vec_float_exponent_mask); + const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); + const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); + const float32x4_t n = + vsubq_f32(vreinterpretq_f32_u32(n_0), + vreinterpretq_f32_u32(vec_implicit_leading_one)); + // Compute y. + const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); + const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); + const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a), + vec_mantissa_mask); + const float32x4_t y = + vreinterpretq_f32_u32(vorrq_u32(mantissa, + vec_zero_biased_exponent_is_one)); + // Approximate log2(y) ~= (y - 1) * pol5(y). + // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 + const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); + const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); + const float32x4_t C3 = vdupq_n_f32(-1.2315303f); + const float32x4_t C2 = vdupq_n_f32(2.5988452f); + const float32x4_t C1 = vdupq_n_f32(-3.3241990f); + const float32x4_t C0 = vdupq_n_f32(3.1157899f); + float32x4_t pol5_y = C5; + pol5_y = vmlaq_f32(C4, y, pol5_y); + pol5_y = vmlaq_f32(C3, y, pol5_y); + pol5_y = vmlaq_f32(C2, y, pol5_y); + pol5_y = vmlaq_f32(C1, y, pol5_y); + pol5_y = vmlaq_f32(C0, y, pol5_y); + const float32x4_t y_minus_one = + vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one)); + const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y); + + // Combine parts. + log2_a = vaddq_f32(n, log2_y); + } + + // b * log2(a) + b_log2_a = vmulq_f32(b, log2_a); + + // Calculate exp2(x), x = b * log2(a). + { + // To calculate 2^x, we decompose x like this: + // x = n + y + // n is an integer, the value of x - 0.5 rounded down, therefore + // y is in the [0.5, 1.5) range + // + // 2^x = 2^n * 2^y + // 2^n can be evaluated by playing with float representation. + // 2^y in a small range can be approximated, this code uses an order two + // polynomial approximation. The coefficients have been estimated + // with the Remez algorithm and the resulting polynomial has a + // maximum relative error of 0.17%. + // To avoid over/underflow, we reduce the range of input to ]-127, 129]. + const float32x4_t max_input = vdupq_n_f32(129.f); + const float32x4_t min_input = vdupq_n_f32(-126.99999f); + const float32x4_t x_min = vminq_f32(b_log2_a, max_input); + const float32x4_t x_max = vmaxq_f32(x_min, min_input); + // Compute n. + const float32x4_t half = vdupq_n_f32(0.5f); + const float32x4_t x_minus_half = vsubq_f32(x_max, half); + const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half); + + // Compute 2^n. + const int32x4_t float_exponent_bias = vdupq_n_s32(127); + const int32x4_t two_n_exponent = + vaddq_s32(x_minus_half_floor, float_exponent_bias); + const float32x4_t two_n = + vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift)); + // Compute y. + const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor)); + + // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. + const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f); + const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f); + const float32x4_t C0 = vdupq_n_f32(1.0017247f); + float32x4_t exp2_y = C2; + exp2_y = vmlaq_f32(C1, y, exp2_y); + exp2_y = vmlaq_f32(C0, y, exp2_y); + + // Combine parts. + a_exp_b = vmulq_f32(exp2_y, two_n); + } + + return a_exp_b; +} + +static void OverdriveAndSuppressNEON(AecCore* aec, + float hNl[PART_LEN1], + const float hNlFb, + float efw[2][PART_LEN1]) { + int i; + const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb); + const float32x4_t vec_one = vdupq_n_f32(1.0f); + const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f); + const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm); + + // vectorized code (four at once) + for (i = 0; i + 3 < PART_LEN1; i += 4) { + // Weight subbands + float32x4_t vec_hNl = vld1q_f32(&hNl[i]); + const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]); + const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb); + const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve, + vec_hNlFb); + const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve); + const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve, + vec_hNl); + const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger), + vreinterpretq_u32_f32(vec_hNl)); + const float32x4_t vec_one_weightCurve_add = + vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl); + const uint32x4_t vec_if1 = + vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add)); + + vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1)); + + { + const float32x4_t vec_overDriveCurve = + vld1q_f32(&WebRtcAec_overDriveCurve[i]); + const float32x4_t vec_overDriveSm_overDriveCurve = + vmulq_f32(vec_overDriveSm, vec_overDriveCurve); + vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve); + vst1q_f32(&hNl[i], vec_hNl); + } + + // Suppress error signal + { + float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]); + float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]); + vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl); + vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl); + + // Ooura fft returns incorrect sign on imaginary component. It matters + // here because we are making an additive change with comfort noise. + vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one); + vst1q_f32(&efw[0][i], vec_efw_re); + vst1q_f32(&efw[1][i], vec_efw_im); + } + } + + // scalar code for the remaining items. + for (; i < PART_LEN1; i++) { + // Weight subbands + if (hNl[i] > hNlFb) { + hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + + (1 - WebRtcAec_weightCurve[i]) * hNl[i]; + } + + hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]); + + // Suppress error signal + efw[0][i] *= hNl[i]; + efw[1][i] *= hNl[i]; + + // Ooura fft returns incorrect sign on imaginary component. It matters + // here because we are making an additive change with comfort noise. + efw[1][i] *= -1; + } +} + +void WebRtcAec_InitAec_neon(void) { + WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON; +} + diff --git a/webrtc/modules/audio_processing/audio_processing.gypi b/webrtc/modules/audio_processing/audio_processing.gypi index c73bb2a47..b1d18c5b0 100644 --- a/webrtc/modules/audio_processing/audio_processing.gypi +++ b/webrtc/modules/audio_processing/audio_processing.gypi @@ -199,6 +199,7 @@ '<(webrtc_root)/common_audio/common_audio.gyp:common_audio', ], 'sources': [ + 'aec/aec_core_neon.c', 'aecm/aecm_core_neon.c', 'ns/nsx_core_neon.c', ],