Neon version of OverdriveAndSuppress()

audioproc reports the average frame time going from 279us to 255us with the test data used.

the output does not match the c version, but the difference seen is +-1.

Performance gain on Nexus7: 8.8%

BUG=3131
TESTED=trybots and manually
R=bjornv@webrtc.org, cd@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/19539004

Patch from Scott LaVarnway <slavarnw@gmail.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@6433 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
bjornv@webrtc.org 2014-06-13 14:50:23 +00:00
parent db56390f7e
commit af6f02f7bd
5 changed files with 242 additions and 2 deletions

View File

@ -47,3 +47,12 @@ ifndef NDK_ROOT
include external/stlport/libstlport.mk
endif
include $(BUILD_STATIC_LIBRARY)
#########################
# Build the neon library.
ifeq ($(WEBRTC_BUILD_NEON_LIBS),true)
LOCAL_SRC_FILES += \
aec_core_neon.c
endif # ifeq ($(WEBRTC_BUILD_NEON_LIBS),true)

View File

@ -67,7 +67,7 @@ static const float sqrtHanning[65] = {
// Matlab code to produce table:
// weightCurve = [0 ; 0.3 * sqrt(linspace(0,1,64))' + 0.1];
// fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', weightCurve);
const float WebRtcAec_weightCurve[65] = {
ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65] = {
0.0000f, 0.1000f, 0.1378f, 0.1535f, 0.1655f, 0.1756f, 0.1845f, 0.1926f,
0.2000f, 0.2069f, 0.2134f, 0.2195f, 0.2254f, 0.2309f, 0.2363f, 0.2414f,
0.2464f, 0.2512f, 0.2558f, 0.2604f, 0.2648f, 0.2690f, 0.2732f, 0.2773f,
@ -81,7 +81,7 @@ const float WebRtcAec_weightCurve[65] = {
// Matlab code to produce table:
// overDriveCurve = [sqrt(linspace(0,1,65))' + 1];
// fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', overDriveCurve);
const float WebRtcAec_overDriveCurve[65] = {
ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65] = {
1.0000f, 1.1250f, 1.1768f, 1.2165f, 1.2500f, 1.2795f, 1.3062f, 1.3307f,
1.3536f, 1.3750f, 1.3953f, 1.4146f, 1.4330f, 1.4507f, 1.4677f, 1.4841f,
1.5000f, 1.5154f, 1.5303f, 1.5449f, 1.5590f, 1.5728f, 1.5863f, 1.5995f,
@ -582,6 +582,10 @@ int WebRtcAec_InitAec(AecCore* aec, int sampFreq) {
WebRtcAec_InitAec_mips();
#endif
#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
WebRtcAec_InitAec_neon();
#endif
aec_rdft_init();
return 0;

View File

@ -57,6 +57,9 @@ void WebRtcAec_InitAec_SSE2(void);
#if defined(MIPS_FPU_LE)
void WebRtcAec_InitAec_mips(void);
#endif
#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
void WebRtcAec_InitAec_neon(void);
#endif
void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend);
void WebRtcAec_ProcessFrame(AecCore* aec,

View File

@ -0,0 +1,223 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* The core AEC algorithm, neon version of speed-critical functions.
*
* Based on aec_core_sse2.c.
*/
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include <arm_neon.h>
#include <math.h>
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
enum { kShiftExponentIntoTopMantissa = 8 };
enum { kFloatExponentShift = 23 };
extern const float WebRtcAec_weightCurve[65];
extern const float WebRtcAec_overDriveCurve[65];
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
// a^b = exp2(b * log2(a))
// exp2(x) and log2(x) are calculated using polynomial approximations.
float32x4_t log2_a, b_log2_a, a_exp_b;
// Calculate log2(x), x = a.
{
// To calculate log2(x), we decompose x like this:
// x = y * 2^n
// n is an integer
// y is in the [1.0, 2.0) range
//
// log2(x) = log2(y) + n
// n can be evaluated by playing with float representation.
// log2(y) in a small range can be approximated, this code uses an order
// five polynomial approximation. The coefficients have been
// estimated with the Remez algorithm and the resulting
// polynomial has a maximum relative error of 0.00086%.
// Compute n.
// This is done by masking the exponent, shifting it into the top bit of
// the mantissa, putting eight into the biased exponent (to shift/
// compensate the fact that the exponent has been shifted in the top/
// fractional part and finally getting rid of the implicit leading one
// from the mantissa by substracting it out.
const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
vec_float_exponent_mask);
const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
const float32x4_t n =
vsubq_f32(vreinterpretq_f32_u32(n_0),
vreinterpretq_f32_u32(vec_implicit_leading_one));
// Compute y.
const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
vec_mantissa_mask);
const float32x4_t y =
vreinterpretq_f32_u32(vorrq_u32(mantissa,
vec_zero_biased_exponent_is_one));
// Approximate log2(y) ~= (y - 1) * pol5(y).
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
const float32x4_t C2 = vdupq_n_f32(2.5988452f);
const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
const float32x4_t C0 = vdupq_n_f32(3.1157899f);
float32x4_t pol5_y = C5;
pol5_y = vmlaq_f32(C4, y, pol5_y);
pol5_y = vmlaq_f32(C3, y, pol5_y);
pol5_y = vmlaq_f32(C2, y, pol5_y);
pol5_y = vmlaq_f32(C1, y, pol5_y);
pol5_y = vmlaq_f32(C0, y, pol5_y);
const float32x4_t y_minus_one =
vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);
// Combine parts.
log2_a = vaddq_f32(n, log2_y);
}
// b * log2(a)
b_log2_a = vmulq_f32(b, log2_a);
// Calculate exp2(x), x = b * log2(a).
{
// To calculate 2^x, we decompose x like this:
// x = n + y
// n is an integer, the value of x - 0.5 rounded down, therefore
// y is in the [0.5, 1.5) range
//
// 2^x = 2^n * 2^y
// 2^n can be evaluated by playing with float representation.
// 2^y in a small range can be approximated, this code uses an order two
// polynomial approximation. The coefficients have been estimated
// with the Remez algorithm and the resulting polynomial has a
// maximum relative error of 0.17%.
// To avoid over/underflow, we reduce the range of input to ]-127, 129].
const float32x4_t max_input = vdupq_n_f32(129.f);
const float32x4_t min_input = vdupq_n_f32(-126.99999f);
const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
const float32x4_t x_max = vmaxq_f32(x_min, min_input);
// Compute n.
const float32x4_t half = vdupq_n_f32(0.5f);
const float32x4_t x_minus_half = vsubq_f32(x_max, half);
const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);
// Compute 2^n.
const int32x4_t float_exponent_bias = vdupq_n_s32(127);
const int32x4_t two_n_exponent =
vaddq_s32(x_minus_half_floor, float_exponent_bias);
const float32x4_t two_n =
vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
// Compute y.
const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));
// Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
const float32x4_t C0 = vdupq_n_f32(1.0017247f);
float32x4_t exp2_y = C2;
exp2_y = vmlaq_f32(C1, y, exp2_y);
exp2_y = vmlaq_f32(C0, y, exp2_y);
// Combine parts.
a_exp_b = vmulq_f32(exp2_y, two_n);
}
return a_exp_b;
}
static void OverdriveAndSuppressNEON(AecCore* aec,
float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]) {
int i;
const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
const float32x4_t vec_one = vdupq_n_f32(1.0f);
const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);
// vectorized code (four at once)
for (i = 0; i + 3 < PART_LEN1; i += 4) {
// Weight subbands
float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve,
vec_hNlFb);
const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve,
vec_hNl);
const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger),
vreinterpretq_u32_f32(vec_hNl));
const float32x4_t vec_one_weightCurve_add =
vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
const uint32x4_t vec_if1 =
vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));
vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));
{
const float32x4_t vec_overDriveCurve =
vld1q_f32(&WebRtcAec_overDriveCurve[i]);
const float32x4_t vec_overDriveSm_overDriveCurve =
vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
vst1q_f32(&hNl[i], vec_hNl);
}
// Suppress error signal
{
float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);
// Ooura fft returns incorrect sign on imaginary component. It matters
// here because we are making an additive change with comfort noise.
vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
vst1q_f32(&efw[0][i], vec_efw_re);
vst1q_f32(&efw[1][i], vec_efw_im);
}
}
// scalar code for the remaining items.
for (; i < PART_LEN1; i++) {
// Weight subbands
if (hNl[i] > hNlFb) {
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
}
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
// Suppress error signal
efw[0][i] *= hNl[i];
efw[1][i] *= hNl[i];
// Ooura fft returns incorrect sign on imaginary component. It matters
// here because we are making an additive change with comfort noise.
efw[1][i] *= -1;
}
}
void WebRtcAec_InitAec_neon(void) {
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
}

View File

@ -199,6 +199,7 @@
'<(webrtc_root)/common_audio/common_audio.gyp:common_audio',
],
'sources': [
'aec/aec_core_neon.c',
'aecm/aecm_core_neon.c',
'ns/nsx_core_neon.c',
],