Refactored Neon code for AECM module, by using pure assembly code.

Bit exact. Review URL: https://webrtc-codereview.appspot.com/447008 git-svn-id: http://webrtc.googlecode.com/svn/trunk@2382 4adac7df-926f-26a2-2b94-8c16560cd09d
2012-06-07 16:17:17 +00:00 · 2012-06-07 16:17:17 +00:00 · f85b35a2f4
commit f85b35a2f4
parent 38506ef4d3
7 changed files with 576 additions and 114 deletions
--- a/src/modules/audio_processing/aecm/Android.mk
+++ b/src/modules/audio_processing/aecm/Android.mk
@ -56,7 +56,21 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := libwebrtc_aecm_neon
 LOCAL_MODULE_TAGS := optional

-LOCAL_SRC_FILES := aecm_core_neon.c
+GEN := $(LOCAL_PATH)/aecm_core_neon_offsets.h
+
+# Generate the header aecm_core_neon_offsets.h from aecm_core_neon_offsets.c;
+# the header is included by the assembly file aecm_core_neon.S.
+$(GEN): $(LOCAL_PATH)/../../../../src/build/generate_asm_header.py \
+            $(intermediates)/aecm_core_neon_offsets.S
+	@python $^ $@ offset_aecm_
+
+$(intermediates)/aecm_core_neon_offsets.S: \
+	    $(LOCAL_PATH)/aecm_core_neon_offsets.c
+	@$(TARGET_CC) $(addprefix -I, $(LOCAL_INCLUDES)) $(addprefix -isystem ,\
+            $(TARGET_C_INCLUDES)) -S -o $@ $^
+
+LOCAL_GENERATED_SOURCES := $(GEN)
+LOCAL_SRC_FILES := aecm_core_neon.S

 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
@ -70,6 +84,8 @@ LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/../../.. \
    $(LOCAL_PATH)/../../../common_audio/signal_processing/include

+LOCAL_INCLUDES := $(LOCAL_C_INCLUDES)
+
 ifndef NDK_ROOT
 include external/stlport/libstlport.mk
 endif
--- a/src/modules/audio_processing/aecm/aecm_core.c
+++ b/src/modules/audio_processing/aecm/aecm_core.c
@ -11,6 +11,7 @@
 #include "aecm_core.h"

 #include <assert.h>
+#include <stddef.h>
 #include <stdlib.h>

 #include "cpu_features_wrapper.h"
@ -197,6 +198,15 @@ static const WebRtc_Word16 kSinTable[] = {
 static const WebRtc_Word16 kNoiseEstQDomain = 15;
 static const WebRtc_Word16 kNoiseEstIncCount = 5;

+// TODO(andrew): put this into general WebRTC so other modules can use it.
+// Compile-time assertion: declares a char array named static_assert_<name>
+// whose size is 1 when |boolean_cond| holds and -1 otherwise, so a false
+// condition is rejected by the compiler (negative array size).
+#define WEBRTC_STATIC_ASSERT(name, boolean_cond) \
+  static char const static_assert_##name[(boolean_cond) ? 1 : -1] = {'!'}
+
+// Check at compile time that PART_LEN is a multiple of 16. The Neon assembly
+// code relies on this (its loops are unrolled by up to 16 elements), so check
+// the assembly files before changing PART_LEN.
+WEBRTC_STATIC_ASSERT(PART_LEN, PART_LEN % 16 == 0);
+
 static void ComfortNoise(AecmCore_t* aecm,
                         const WebRtc_UWord16* dfa,
                         complex16_t* out,
@ -395,6 +405,18 @@ static void WindowAndFFTC(WebRtc_Word16* fft,
    }
 }

+// Points the five AECM function pointers (WindowAndFFT, InverseFFTAndWindow,
+// CalcLinearEnergies, StoreAdaptiveChannel, ResetAdaptiveChannel) at their
+// ARM Neon implementations. Compiled only when WEBRTC_DETECT_ARM_NEON or
+// WEBRTC_ARCH_ARM_NEON is defined; it is called from WebRtcAecm_InitCore.
+#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
+static void WebRtcAecm_InitNeon(void)
+{
+  WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon;
+  WebRtcAecm_InverseFFTAndWindow = WebRtcAecm_InverseFFTAndWindowNeon;
+  WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon;
+  WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon;
+  WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon;
+}
+#endif
+
 static void InverseFFTAndWindowC(AecmCore_t* aecm,
                                 WebRtc_Word16* fft,
                                 complex16_t* efw,
@ -673,7 +695,7 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
    uint64_t features = WebRtc_GetCPUFeaturesARM();
    if ((features & kCPUFeatureNEON) != 0)
    {
-        WebRtcAecm_InitNeon();
+      WebRtcAecm_InitNeon();
    }
 #elif defined(WEBRTC_ARCH_ARM_NEON)
    WebRtcAecm_InitNeon();
@ -1850,7 +1872,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
            {
                hnl[i] = 0;
            }
-    
+
            // Remove outliers
            if (numPosCoef < 3)
            {
--- a/src/modules/audio_processing/aecm/aecm_core.h
+++ b/src/modules/audio_processing/aecm/aecm_core.h
@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@ -10,92 +10,13 @@

 // Performs echo control (suppression) with fft routines in fixed-point

-#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_MAIN_SOURCE_AECM_CORE_H_
-#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_MAIN_SOURCE_AECM_CORE_H_
-
-#define AECM_DYNAMIC_Q // turn on/off dynamic Q-domain
-//#define AECM_WITH_ABS_APPROX
-//#define AECM_SHORT                // for 32 sample partition length (otherwise 64)
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_

 #include "typedefs.h"
 #include "signal_processing_library.h"

-// Algorithm parameters
-
-#define FRAME_LEN       80              // Total frame length, 10 ms
-#ifdef AECM_SHORT
-
-#define PART_LEN        32              // Length of partition
-#define PART_LEN_SHIFT  6               // Length of (PART_LEN * 2) in base 2
-
-#else
-
-#define PART_LEN        64              // Length of partition
-#define PART_LEN_SHIFT  7               // Length of (PART_LEN * 2) in base 2
-
-#endif
-
-#define PART_LEN1       (PART_LEN + 1)  // Unique fft coefficients
-#define PART_LEN2       (PART_LEN << 1) // Length of partition * 2
-#define PART_LEN4       (PART_LEN << 2) // Length of partition * 4
-#define FAR_BUF_LEN     PART_LEN4       // Length of buffers
-#define MAX_DELAY 100
-
-// Counter parameters
-#ifdef AECM_SHORT
-
-#define CONV_LEN        1024            // Convergence length used at startup
-#else
-
-#define CONV_LEN        512             // Convergence length used at startup
-#endif
-
-#define CONV_LEN2       (CONV_LEN << 1) // Convergence length * 2 used at startup
-// Energy parameters
-#define MAX_BUF_LEN     64              // History length of energy signals
-
-#define FAR_ENERGY_MIN  1025            // Lowest Far energy level: At least 2 in energy
-#define FAR_ENERGY_DIFF 929             // Allowed difference between max and min
-
-#define ENERGY_DEV_OFFSET       0       // The energy error offset in Q8
-#define ENERGY_DEV_TOL  400             // The energy estimation tolerance in Q8
-#define FAR_ENERGY_VAD_REGION   230     // Far VAD tolerance region
-// Stepsize parameters
-#define MU_MIN          10              // Min stepsize 2^-MU_MIN (far end energy dependent)
-#define MU_MAX          1               // Max stepsize 2^-MU_MAX (far end energy dependent)
-#define MU_DIFF         9               // MU_MIN - MU_MAX
-// Channel parameters
-#define MIN_MSE_COUNT   20              // Min number of consecutive blocks with enough far end
-                                        // energy to compare channel estimates
-#define MIN_MSE_DIFF    29              // The ratio between adapted and stored channel to
-                                        // accept a new storage (0.8 in Q-MSE_RESOLUTION)
-#define MSE_RESOLUTION  5               // MSE parameter resolution
-#define RESOLUTION_CHANNEL16    12      // W16 Channel in Q-RESOLUTION_CHANNEL16
-#define RESOLUTION_CHANNEL32    28      // W32 Channel in Q-RESOLUTION_CHANNEL
-#define CHANNEL_VAD     16              // Minimum energy in frequency band to update channel
-// Suppression gain parameters: SUPGAIN_ parameters in Q-(RESOLUTION_SUPGAIN)
-#define RESOLUTION_SUPGAIN      8       // Channel in Q-(RESOLUTION_SUPGAIN)
-#define SUPGAIN_DEFAULT (1 << RESOLUTION_SUPGAIN)   // Default suppression gain
-#define SUPGAIN_ERROR_PARAM_A   3072    // Estimation error parameter (Maximum gain) (8 in Q8)
-#define SUPGAIN_ERROR_PARAM_B   1536    // Estimation error parameter (Gain before going down)
-#define SUPGAIN_ERROR_PARAM_D   SUPGAIN_DEFAULT // Estimation error parameter
-                                                // (Should be the same as Default) (1 in Q8)
-#define SUPGAIN_EPC_DT  200             // = SUPGAIN_ERROR_PARAM_C * ENERGY_DEV_TOL
-// Defines for "check delay estimation"
-#define CORR_WIDTH      31              // Number of samples to correlate over.
-#define CORR_MAX        16              // Maximum correlation offset
-#define CORR_MAX_BUF    63
-#define CORR_DEV        4
-#define CORR_MAX_LEVEL  20
-#define CORR_MAX_LOW    4
-#define CORR_BUF_LEN    (CORR_MAX << 1) + 1
-// Note that CORR_WIDTH + 2*CORR_MAX <= MAX_BUF_LEN
-
-#define ONE_Q14         (1 << 14)
-
-// NLP defines
-#define NLP_COMP_LOW    3277            // 0.2 in Q14
-#define NLP_COMP_HIGH   ONE_Q14         // 1 in Q14
+#include "aecm_defines.h"

 extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[];

@ -368,8 +289,33 @@ typedef void (*InverseFFTAndWindow)(
    const WebRtc_Word16* nearendClean);
 extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;

-// Initialization of the above function pointers for ARM Neon.
-void WebRtcAecm_InitNeon(void);
+// For the above function pointers, functions for generic platforms are declared
+// and defined as static in file aecm_core.c, while those for ARM Neon platforms
+// are declared below and defined in file aecm_core_neon.S.
+#if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON)
+void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
+                                 const WebRtc_Word16* time_signal,
+                                 complex16_t* freq_signal,
+                                 int time_signal_scaling);

+void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
+                                        WebRtc_Word16* fft,
+                                        complex16_t* efw,
+                                        WebRtc_Word16* output,
+                                        const WebRtc_Word16* nearendClean);
+
+void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
+                                       const WebRtc_UWord16* far_spectrum,
+                                       WebRtc_Word32* echo_est,
+                                       WebRtc_UWord32* far_energy,
+                                       WebRtc_UWord32* echo_energy_adapt,
+                                       WebRtc_UWord32* echo_energy_stored);
+
+void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
+                                         const WebRtc_UWord16* far_spectrum,
+                                         WebRtc_Word32* echo_est);
+
+void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
+#endif

 #endif
--- a/src/modules/audio_processing/aecm/aecm_core_neon.S
+++ b/src/modules/audio_processing/aecm/aecm_core_neon.S
@ -0,0 +1,361 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ aecm_core_neon.S
+@ This file contains some functions in AECM, optimized for ARM Neon
+@ platforms. Reference C code is in file aecm_core.c. The results are
+@ bit-exact with the C code.
+
+.arch armv7-a
+.fpu neon
+
+#include "aecm_defines.h"
+#include "aecm_core_neon_offsets.h"
+
+.extern WebRtcAecm_kSqrtHanning
+
+.global WebRtcAecm_WindowAndFFTNeon
+.global WebRtcAecm_InverseFFTAndWindowNeon
+.global WebRtcAecm_CalcLinearEnergiesNeon
+.global WebRtcAecm_StoreAdaptiveChannelNeon
+.global WebRtcAecm_ResetAdaptiveChannelNeon
+
+@ void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
+@                                  const WebRtc_Word16* time_signal,
+@                                  complex16_t* freq_signal,
+@                                  int time_signal_scaling);
+@
+@ Shifts |time_signal| by |time_signal_scaling|, multiplies it by the
+@ sqrt-Hanning window (Q14) and writes the result into |fft| as interleaved
+@ (value, 0) pairs. Then runs WebRtcSpl_ComplexBitReverse and
+@ WebRtcSpl_ComplexFFT on |fft| and copies fft[] to |freq_signal| with every
+@ odd (imaginary) element negated.
+@
+@ On entry: r0 = fft, r1 = time_signal, r2 = freq_signal,
+@ r3 = time_signal_scaling. r4 (&fft[0]) and r5 (freq_signal) are kept in
+@ callee-saved registers across the two calls.
+.align  2
+WebRtcAecm_WindowAndFFTNeon:
+.fnstart
+@ r6 is not used by this function. It is saved only so that an even number
+@ of registers is pushed and sp stays 8-byte aligned at the calls below, as
+@ the ARM procedure call standard requires at public interfaces.
+.save {r4, r5, r6, lr}
+  push {r4, r5, r6, lr}
+
+  vdup.16 d16, r3                            @ time_signal_scaling in all lanes.
+  mov r5, r2                                 @ The calls below may change r2.
+
+  vmov.i16 d21, #0                           @ For imaginary parts of |fft|.
+  vmov.i16 d27, #0                           @ For imaginary parts of |fft|.
+  ldr r2, =WebRtcAecm_kSqrtHanning
+  adr lr, kSqrtHanningReversed
+  add r4, r0, #(PART_LEN2 * 2)               @ &fft[PART_LEN2]
+  add r12, r1, #(PART_LEN * 2)               @ &time_signal[PART_LEN]
+  mov r3, #(PART_LEN / 4)                    @ Loop counter, unrolled by 4
+
+LOOP_PART_LEN:
+  vld1.16 d0, [r1, :64]!                     @ time_signal[i]
+  vld1.16 d22, [r12, :64]!                   @ time_signal[i + PART_LEN]
+  vld1.16 d17, [r2, :64]!                    @ WebRtcAecm_kSqrtHanning[i]
+  vld1.16 d23, [lr, :64]!                    @ kSqrtHanningReversed[i]
+  vshl.s16  d18, d0, d16
+  vshl.s16  d22, d22, d16
+  vmull.s16 q9, d18, d17
+  vmull.s16 q12, d22, d23
+  subs r3, #1
+  vshrn.i32 d20, q9, #14
+  vshrn.i32 d26, q12, #14
+  vst2.16 {d20, d21}, [r0, :128]!            @ fft[j]
+  vst2.16 {d26, d27}, [r4, :128]!            @ fft[PART_LEN2 + j]
+  bgt LOOP_PART_LEN
+
+  @ r0 has advanced by PART_LEN2 * 2 bytes in the loop above.
+  sub r4, r0, #(PART_LEN2 * 2)               @ r4 points to fft[0]
+  mov r0, r4
+  mov r1, #7                                 @ Equals PART_LEN_SHIFT for PART_LEN == 64.
+  bl  WebRtcSpl_ComplexBitReverse
+
+  mov r0, r4
+  mov r1, #7
+  mov r2, #1
+  bl  WebRtcSpl_ComplexFFT
+
+  mov r3, #(PART_LEN * 2 / 16)               @ Loop counter, unrolled by 16.
+
+LOOP_PART_LEN2:
+  @ freq_signal[i].real = fft[j];
+  @ freq_signal[i].imag = - fft[j+1];
+  @ vld2 de-interleaves: d20-d21 get the even elements, d22-d23 the odd ones.
+  vld2.16 {d20, d21, d22, d23}, [r4, :256]!
+  subs r3, #1
+  vneg.s16 d22, d22
+  vneg.s16 d23, d23
+  vst2.16 {d20, d21, d22, d23}, [r5, :256]!
+  bgt LOOP_PART_LEN2
+
+  pop {r4, r5, r6, pc}
+.fnend
+
+@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
+@                                         WebRtc_Word16* fft,
+@                                         complex16_t* efw,
+@                                         WebRtc_Word16* output,
+@                                         const WebRtc_Word16* nearendClean);
+@
+@ Builds the input of the inverse FFT in |fft| from |efw|, runs
+@ WebRtcSpl_ComplexBitReverse and WebRtcSpl_ComplexIFFT on it, windows the
+@ result with the sqrt-Hanning tables, adds aecm->outBuf to produce |output|,
+@ refills aecm->outBuf, and finally moves the second half of aecm->xBuf,
+@ aecm->dBufNoisy and (if |nearendClean| is not NULL) aecm->dBufClean to the
+@ first half of each buffer.
+@
+@ On entry: r0 = aecm, r1 = fft, r2 = efw, r3 = output; nearendClean is the
+@ fifth argument and is read from the stack.
+.align  2
+WebRtcAecm_InverseFFTAndWindowNeon:
+.fnstart
+.save {r4-r8, lr}
+  push {r4-r8, lr}
+
+  @ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT
+  @ and WebRtcSpl_ComplexBitReverse, so keep copies in callee-saved registers:
+  @ r4 = fft, r5 = aecm, r7 = output.
+  mov r4, r1
+  mov r5, r0
+  mov r7, r3
+
+  add r3, r1, #((PART_LEN4 - 6) * 2)         @ &fft[PART_LEN4 - 6]
+  mov r6, #(PART_LEN / 4)                    @ Loop counter, unrolled by 4
+  add r12, r2, #(PART_LEN * 4)               @ &efw[PART_LEN]
+  mov r8, #-16                               @ Post-index step for r3 (bytes).
+
+LOOP_PRE_IFFT:
+  @ Each iteration reads 4 complex values of |efw| (d20 = first halfwords,
+  @ d21 = second halfwords of each pair), writes them to fft[] at r1 with the
+  @ second halfword negated, and writes them unnegated, in reversed element
+  @ order, at r3, which walks downwards.
+  vld2.16 {q10}, [r2, :128]!
+  vmov q11, q10
+  vneg.s16 d23, d23
+  vst2.16 {d22, d23}, [r1, :128]!
+  vrev64.16 q10, q10
+  subs r6, #1
+  vst2.16 {q10}, [r3], r8
+  bgt LOOP_PRE_IFFT
+
+  @  fft[PART_LEN2] = efw[PART_LEN].real;
+  @  fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
+  @ r6 is 0 here (the loop counter ran out), so ssub16 negates both halfwords
+  @ of r8; pkhbt then keeps the bottom halfword of r8 and the top halfword of
+  @ the negated copy.
+  ldr r8, [r12]
+  ssub16 r2, r6, r8
+  mov r1, #(PART_LEN2 * 2)
+  pkhbt r8, r8, r2
+  str r8, [r4, r1]
+
+  mov r0, r4
+  mov r1, #7                                 @ Equals PART_LEN_SHIFT for PART_LEN == 64.
+  bl  WebRtcSpl_ComplexBitReverse
+
+  mov r0, r4
+  mov r1, #7
+  mov r2, #1
+  bl  WebRtcSpl_ComplexIFFT                  @ r0 is used as outCFFT further down.
+
+  mov r1, r4
+  mov r2, r4
+  mov r3, #(PART_LEN * 2 / 8)                @ Loop counter, unrolled by 8.
+
+LOOP_GET_REAL_VALUES:
+  @ Compact fft[] in place: keep the even-indexed elements (q10) and drop
+  @ the odd-indexed ones (q11).
+  vld2.16 {q10, q11}, [r2, :256]!
+  subs r3, #1
+  vst1.16 {q10}, [r1, :128]!
+  bgt LOOP_GET_REAL_VALUES
+
+  ldr r6, =offset_aecm_outBuf
+  ldr r12, =offset_aecm_dfaCleanQDomain
+  ldr r8, [r5, r6]                           @ &aecm->outBuf[0]
+  ldrsh r2, [r5, r12]                        @ aecm->dfaCleanQDomain (16-bit value)
+
+  adr r12, kSqrtHanningReversed
+  ldr r6, =WebRtcAecm_kSqrtHanning
+  rsb r0, r2, r0                             @ outCFFT - aecm->dfaCleanQDomain
+  vdup.32 q9, r0                             @ Shift amount for vshl below.
+  add r0, r4, #(PART_LEN * 2)                @ &fft[PART_LEN]
+  mov r3, #(PART_LEN / 4)                    @ Loop counter, unrolled by 4.
+
+LOOP_POST_IFFT:
+  vld1.16 d16, [r4, :64]                     @ fft[i];
+  vld1.16 d17, [r6, :64]!                    @ WebRtcAecm_kSqrtHanning[i]
+  vld1.16 d20, [r8, :64]                     @ aecm->outBuf[i]
+  vmull.s16 q8, d16, d17
+  vmovl.s16 q10, d20
+  vrshr.s32 q8, q8, #14                      @ Rounding shift (first half).
+  vld1.16 d0, [r0, :64]!                     @ fft[PART_LEN + i]
+  vshl.s32 q8, q8, q9
+  vld1.16 d1, [r12, :64]!                    @ kSqrtHanningReversed[i]
+  vadd.i32 q8, q10
+  vmull.s16 q0, d0, d1
+  vqshrn.s32 d16, q8, #0                     @ Saturate to 16 bits.
+  vshr.s32 q0, q0, #14                       @ Non-rounding shift (second half).
+  vst1.16 d16, [r4, :64]!                    @ fft[i];
+  vshl.s32 q0, q0, q9
+  vst1.16 d16, [r7, :64]!                    @ output[i]
+  vqshrn.s32 d0, q0, #0                      @ Saturate to 16 bits.
+  subs r3, #1
+  vst1.16 d0, [r8, :64]!                     @ aecm->outBuf[i]
+  bgt LOOP_POST_IFFT
+
+  ldr r3, =offset_aecm_xBuf
+  ldr r12, =offset_aecm_dBufNoisy
+  ldr r3, [r5, r3]                           @ &aecm->xBuf[0]
+  ldr r1, [r5, r12]                          @ &aecm->dBufNoisy[0]
+  add r2, r3, #(PART_LEN * 2)                @ &aecm->xBuf[PART_LEN]
+  add r0, r1, #(PART_LEN * 2)                @ &aecm->dBufNoisy[PART_LEN]
+  mov r4, #(PART_LEN / 16)                   @ Loop counter, unrolled by 16.
+
+LOOP_COPY:
+  @ Move the second half of xBuf and dBufNoisy to the first half.
+  vld1.16 {q10, q11}, [r2, :256]!
+  vld1.16 {q12, q13}, [r0, :256]!
+  subs r4, #1
+  vst1.16 {q10, q11}, [r3, :256]!
+  vst1.16 {q12, q13}, [r1, :256]!
+  bgt LOOP_COPY
+
+  @ nearendClean is the fifth argument: it sits just above the six registers
+  @ (24 bytes) pushed at entry.
+  ldr r2, [sp, #24]
+  cmp r2, #0                                  @ Check if (nearendClean != NULL).
+  beq END
+
+  ldr r4, =offset_aecm_dBufClean
+  ldr r1, [r5, r4]                            @ &aecm->dBufClean[0]
+  add r0, r1, #(PART_LEN * 2)                 @ &aecm->dBufClean[PART_LEN]
+
+  @ Move the second half of dBufClean to the first half. This copy is fully
+  @ unrolled as 4 x 32 bytes, i.e. 64 halfwords; unlike the loops above it
+  @ does not scale with PART_LEN and is only correct for PART_LEN == 64.
+  vld1.16 {q10, q11}, [r0, :256]!
+  vld1.16 {q12, q13}, [r0, :256]!
+  vst1.16 {q10, q11}, [r1, :256]!
+  vst1.16 {q12, q13}, [r1, :256]!
+  vld1.16 {q10, q11}, [r0, :256]!
+  vld1.16 {q12, q13}, [r0, :256]!
+  vst1.16 {q10, q11}, [r1, :256]!
+  vst1.16 {q12, q13}, [r1, :256]!
+
+END:
+  pop {r4-r8, pc}
+.fnend
+
+@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
+@                                        const WebRtc_UWord16* far_spectrum,
+@                                        WebRtc_Word32* echo_est,
+@                                        WebRtc_UWord32* far_energy,
+@                                        WebRtc_UWord32* echo_energy_adapt,
+@                                        WebRtc_UWord32* echo_energy_stored);
+@
+@ For i = 0 .. PART_LEN (PART_LEN elements in the vector loop plus one scalar
+@ element at the end):
+@   echo_est[i]          = aecm->channelStored[i] * far_spectrum[i]
+@   *far_energy          = sum of far_spectrum[i]
+@   *echo_energy_stored  = sum of echo_est[i]
+@   *echo_energy_adapt   = sum of aecm->channelAdapt16[i] * far_spectrum[i]
+@ The three sums are written to (not added to) the output locations.
+@
+@ On entry: r0 = aecm, r1 = far_spectrum, r2 = echo_est, r3 = far_energy;
+@ echo_energy_adapt and echo_energy_stored are read from the stack.
+.align  2
+WebRtcAecm_CalcLinearEnergiesNeon:
+.fnstart
+.save {r4-r7}
+  push {r4-r7}
+
+  vmov.i32 q14, #0                           @ Accumulator for far_energy.
+  vmov.i32 q8,  #0                           @ Accumulator for echo_energy_stored.
+  vmov.i32 q9,  #0                           @ Accumulator for echo_energy_adapt.
+
+  ldr r7, =offset_aecm_channelStored
+  ldr r5, =offset_aecm_channelAdapt16
+
+  mov r4, r2
+  mov r12, #(PART_LEN / 8)                   @  Loop counter, unrolled by 8.
+  ldr r6, [r0, r7]                           @ &aecm->channelStored[0]
+  ldr r7, [r0, r5]                           @ &aecm->channelAdapt16[0]
+
+LOOP_CALC_LINEAR_ENERGIES:
+  vld1.16 {d26, d27}, [r1]!                  @ far_spectrum[i]
+  vld1.16 {d24, d25}, [r6, :128]!            @ aecm->channelStored[i]
+  vld1.16 {d0, d1}, [r7, :128]!              @ aecm->channelAdapt16[i]
+  vaddw.u16 q14, q14, d26
+  vmull.u16 q10, d26, d24
+  vmull.u16 q11, d27, d25
+  vaddw.u16 q14, q14, d27
+  vmull.u16 q1, d26, d0
+  vst1.32 {q10, q11}, [r4, :256]!            @ echo_est[i]
+  vadd.u32 q8, q10
+  vmull.u16 q2, d27, d1
+  vadd.u32 q8, q11
+  vadd.u32 q9, q1
+  subs r12, #1
+  vadd.u32 q9, q2
+  bgt LOOP_CALC_LINEAR_ENERGIES
+
+  @ Reduce each 4-lane accumulator to a single 32-bit sum in lane 0.
+  vadd.u32 d28, d29
+  vpadd.u32 d28, d28
+  vmov.32 r12, d28[0]                        @ far_energy partial sum
+  vadd.u32 d18, d19
+  vpadd.u32 d18, d18
+  vmov.32 r5, d18[0]                         @ echo_energy_adapt_r
+  vadd.u32 d16, d17
+  vpadd.u32 d16, d16
+
+  @ Scalar handling of the last element (index PART_LEN).
+  ldrh  r1, [r1]                             @ far_spectrum[i]
+  add r12, r12, r1
+  str r12, [r3]                              @ far_energy
+  vmov.32 r2, d16[0]                         @ echo_energy_stored partial sum
+
+  ldrsh r12, [r6]                            @ aecm->channelStored[i]
+  ldrh  r6, [r7]                             @ aecm->channelAdapt16[i]
+  mul r0, r12, r1
+  mla r1, r6, r1, r5
+  add r2, r2, r0
+  str r0, [r4]                               @ echo_est[i]
+  @ Stack arguments sit above the four registers (16 bytes) pushed at entry:
+  @ echo_energy_adapt at sp + 16, echo_energy_stored at sp + 20.
+  ldr r4, [sp, #20]                          @ echo_energy_stored
+  str r2, [r4]
+  ldr r3, [sp, #16]                          @ echo_energy_adapt
+  str r1, [r3]
+
+  pop {r4-r7}
+  bx  lr
+.fnend
+
+@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
+@                                          const uint16_t* far_spectrum,
+@                                          int32_t* echo_est);
+@
+@ For i = 0 .. PART_LEN (PART_LEN elements in the vector loop plus one scalar
+@ element at the end):
+@   aecm->channelStored[i] = aecm->channelAdapt16[i]
+@   echo_est[i]            = aecm->channelAdapt16[i] * far_spectrum[i]
+@
+@ On entry: r0 = aecm, r1 = far_spectrum, r2 = echo_est. Only caller-saved
+@ registers are used, so nothing is pushed.
+.align  2
+WebRtcAecm_StoreAdaptiveChannelNeon:
+.fnstart
+  ldr r3, =offset_aecm_channelAdapt16
+  ldr r12, =offset_aecm_channelStored
+  ldr r3, [r0, r3]                           @ &aecm->channelAdapt16[0]
+  ldr r0, [r0, r12]                          @ &aecm->channelStored[0]
+  mov r12, #(PART_LEN / 8)                   @ Loop counter, unrolled by 8.
+
+LOOP_STORE_ADAPTIVE_CHANNEL:
+  vld1.16 {d24, d25}, [r3, :128]!            @ aecm->channelAdapt16[i]
+  vld1.16 {d26, d27}, [r1]!                  @ far_spectrum[i]
+  vst1.16 {d24, d25}, [r0, :128]!            @ aecm->channelStored[i]
+  vmull.u16 q10, d26, d24
+  vmull.u16 q11, d27, d25
+  vst1.16 {q10, q11}, [r2, :256]!            @ echo_est[i]
+  subs r12, #1
+  bgt LOOP_STORE_ADAPTIVE_CHANNEL
+
+  @ Scalar handling of the last element (index PART_LEN).
+  ldrsh  r12, [r3]                           @ aecm->channelAdapt16[i]
+  strh  r12, [r0]                            @ aecm->channelStored[i]
+  ldrh  r1, [r1]                             @ far_spectrum[i]
+  mul r3, r1, r12
+  str r3, [r2]                               @ echo_est[i]
+
+  bx  lr
+.fnend
+
+@ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
+@
+@ For i = 0 .. PART_LEN (PART_LEN elements in the vector loop plus one scalar
+@ element at the end):
+@   aecm->channelAdapt16[i] = aecm->channelStored[i]
+@   aecm->channelAdapt32[i] = aecm->channelStored[i] << 16
+@
+@ On entry: r0 = aecm. Only caller-saved registers are used, so nothing is
+@ pushed.
+.align  2
+WebRtcAecm_ResetAdaptiveChannelNeon:
+.fnstart
+  ldr r1, =offset_aecm_channelAdapt16
+  ldr r2, =offset_aecm_channelAdapt32
+  @ Unlike the two offsets above, this one is encoded as a 16-bit immediate.
+  movw r3, #offset_aecm_channelStored
+  ldr r1, [r0, r1]                           @ &aecm->channelAdapt16[0]
+  ldr r2, [r0, r2]                           @ &aecm->channelAdapt32[0]
+  ldr r0, [r0, r3]                           @ &aecm->channelStored[0]
+  mov r3, #(PART_LEN / 8)                    @ Loop counter, unrolled by 8.
+
+LOOP_RESET_ADAPTIVE_CHANNEL:
+  vld1.16 {d24, d25}, [r0, :128]!            @ aecm->channelStored[i]
+  subs r3, #1
+  vst1.16 {d24, d25}, [r1, :128]!            @ aecm->channelAdapt16[i]
+  vshll.s16 q10, d24, #16                    @ Widen to 32 bits and shift left by 16.
+  vshll.s16 q11, d25, #16
+  vst1.16 {q10, q11}, [r2, :256]!            @ aecm->channelAdapt32[i]
+  bgt LOOP_RESET_ADAPTIVE_CHANNEL
+
+  @ Scalar handling of the last element (index PART_LEN).
+  ldrh  r0, [r0]                             @ aecm->channelStored[i]
+  strh  r0, [r1]                             @ aecm->channelAdapt16[i]
+  mov r0, r0, asl #16
+  str r0, [r2]                               @ aecm->channelAdapt32[i]
+
+  bx  lr
+.fnend
+
+  @ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning,
+  @ the order was reversed and one useless element (0) was removed.
+  @ The table has 64 halfword entries and is 8-byte aligned (.align 3)
+  @ because it is read with 64-bit aligned loads (vld1.16 with :64).
+.align  3
+kSqrtHanningReversed:
+  .hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
+  .hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
+  .hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
+  .hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
+  .hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
+  .hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399
--- a/src/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/src/modules/audio_processing/aecm/aecm_core_neon.c
@ -34,10 +34,10 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) =
  1594,  1196,  798,   399
 };

-static void WindowAndFFTNeon(WebRtc_Word16* fft,
-                             const WebRtc_Word16* time_signal,
-                             complex16_t* freq_signal,
-                             int time_signal_scaling) {
+void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
+                                 const WebRtc_Word16* time_signal,
+                                 complex16_t* freq_signal,
+                                 int time_signal_scaling) {
  int i, j;

  int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
@ -86,11 +86,11 @@ static void WindowAndFFTNeon(WebRtc_Word16* fft,
  }
 }

-static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
-                                    WebRtc_Word16* fft,
-                                    complex16_t* efw,
-                                    WebRtc_Word16* output,
-                                    const WebRtc_Word16* nearendClean) {
+void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
+                                        WebRtc_Word16* fft,
+                                        complex16_t* efw,
+                                        WebRtc_Word16* output,
+                                        const WebRtc_Word16* nearendClean) {
  int i, j, outCFFT;

  // Synthesis
@ -186,12 +186,12 @@ static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
  }
 }

-static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
-                                   const WebRtc_UWord16* far_spectrum,
-                                   WebRtc_Word32* echo_est,
-                                   WebRtc_UWord32* far_energy,
-                                   WebRtc_UWord32* echo_energy_adapt,
-                                   WebRtc_UWord32* echo_energy_stored) {
+void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
+                                       const WebRtc_UWord16* far_spectrum,
+                                       WebRtc_Word32* echo_est,
+                                       WebRtc_UWord32* far_energy,
+                                       WebRtc_UWord32* echo_energy_adapt,
+                                       WebRtc_UWord32* echo_energy_stored) {
  int i;

  register WebRtc_UWord32 far_energy_r;
@ -249,9 +249,9 @@ static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
      aecm->channelAdapt16[i], far_spectrum[i]);
 }

-static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
-                                     const WebRtc_UWord16* far_spectrum,
-                                     WebRtc_Word32* echo_est) {
+void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
+                                         const WebRtc_UWord16* far_spectrum,
+                                         WebRtc_Word32* echo_est) {
  int i;

  // During startup we store the channel every block.
@ -271,7 +271,7 @@ static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
 }

-static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
+void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
  int i;

  for (i = 0; i < PART_LEN - 7; i += 8) {
@ -292,10 +292,3 @@ static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
      (WebRtc_Word32)aecm->channelStored[i], 16);
 }

-void WebRtcAecm_InitNeon(void) {
-  WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
-  WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
-  WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
-  WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
-  WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
-}
--- a/src/modules/audio_processing/aecm/aecm_core_neon_offsets.c
+++ b/src/modules/audio_processing/aecm/aecm_core_neon_offsets.c
@ -0,0 +1,26 @@
+
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "aecm_core.h"
+
+#include <stddef.h>
+
+// Byte offsets of the AecmCore_t members that the ARM assembly code accesses.
+// Each offset is held in a global int named offset_aecm_<member>. The build
+// compiles this file to assembly and feeds the result to
+// generate_asm_header.py, which produces aecm_core_neon_offsets.h with the
+// offsets as constant defines; that header is only used by the ARM assembly
+// code.
+int offset_aecm_dfaCleanQDomain = offsetof(AecmCore_t, dfaCleanQDomain);
+int offset_aecm_outBuf = offsetof(AecmCore_t, outBuf);
+int offset_aecm_xBuf = offsetof(AecmCore_t, xBuf);
+int offset_aecm_dBufNoisy = offsetof(AecmCore_t, dBufNoisy);
+int offset_aecm_dBufClean = offsetof(AecmCore_t, dBufClean);
+int offset_aecm_channelStored = offsetof(AecmCore_t, channelStored);
+int offset_aecm_channelAdapt16 = offsetof(AecmCore_t, channelAdapt16);
+int offset_aecm_channelAdapt32 = offsetof(AecmCore_t, channelAdapt32);
+
--- a/src/modules/audio_processing/aecm/aecm_defines.h
+++ b/src/modules/audio_processing/aecm/aecm_defines.h
@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_DEFINES_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_DEFINES_H_
+
+#define AECM_DYNAMIC_Q                 /* Turn on/off dynamic Q-domain. */
+
+/* #define AECM_SHORT                   For 32 sample partition length. */
+
+/* Algorithm parameters */
+#define FRAME_LEN       80             /* Total frame length, 10 ms. */
+
+#ifdef AECM_SHORT
+#define PART_LEN        32             /* Length of partition. */
+#define PART_LEN_SHIFT  6              /* Length of (PART_LEN * 2) in base 2. */
+#else
+#define PART_LEN        64             /* Length of partition. */
+#define PART_LEN_SHIFT  7              /* Length of (PART_LEN * 2) in base 2. */
+#endif
+
+#define PART_LEN1       (PART_LEN + 1)  /* Unique fft coefficients. */
+#define PART_LEN2       (PART_LEN << 1) /* Length of partition * 2. */
+#define PART_LEN4       (PART_LEN << 2) /* Length of partition * 4. */
+#define FAR_BUF_LEN     PART_LEN4       /* Length of buffers. */
+#define MAX_DELAY       100
+
+/* Counter parameters */
+#ifdef AECM_SHORT
+#define CONV_LEN        1024         /* Convergence length used at startup. */
+#else
+#define CONV_LEN        512          /* Convergence length used at startup. */
+#endif
+#define CONV_LEN2       (CONV_LEN << 1) /* Used at startup. */
+
+/* Energy parameters */
+#define MAX_BUF_LEN     64           /* History length of energy signals. */
+#define FAR_ENERGY_MIN  1025         /* Lowest Far energy level: At least 2 */
+                                     /* in energy. */
+#define FAR_ENERGY_DIFF 929          /* Allowed difference between max */
+                                     /* and min. */
+#define ENERGY_DEV_OFFSET       0    /* The energy error offset in Q8. */
+#define ENERGY_DEV_TOL  400          /* The energy estimation tolerance (Q8). */
+#define FAR_ENERGY_VAD_REGION   230  /* Far VAD tolerance region. */
+
+/* Stepsize parameters */
+#define MU_MIN          10          /* Min stepsize 2^-MU_MIN (far end energy */
+                                    /* dependent). */
+#define MU_MAX          1           /* Max stepsize 2^-MU_MAX (far end energy */
+                                    /* dependent). */
+#define MU_DIFF         9           /* MU_MIN - MU_MAX */
+
+/* Channel parameters */
+#define MIN_MSE_COUNT   20 /* Min number of consecutive blocks with enough */
+                           /* far end energy to compare channel estimates. */
+#define MIN_MSE_DIFF    29 /* The ratio between adapted and stored channel to */
+                           /* accept a new storage (0.8 in Q-MSE_RESOLUTION). */
+#define MSE_RESOLUTION  5           /* MSE parameter resolution. */
+#define RESOLUTION_CHANNEL16    12  /* W16 Channel in Q-RESOLUTION_CHANNEL16. */
+#define RESOLUTION_CHANNEL32    28  /* W32 Channel in Q-RESOLUTION_CHANNEL. */
+#define CHANNEL_VAD     16          /* Minimum energy in frequency band */
+                                    /* to update channel. */
+
+/* Suppression gain parameters: SUPGAIN parameters in Q-(RESOLUTION_SUPGAIN). */
+#define RESOLUTION_SUPGAIN      8     /* Channel in Q-(RESOLUTION_SUPGAIN). */
+#define SUPGAIN_DEFAULT (1 << RESOLUTION_SUPGAIN)  /* Default. */
+#define SUPGAIN_ERROR_PARAM_A   3072  /* Estimation error parameter */
+                                      /* (Maximum gain) (8 in Q8). */
+#define SUPGAIN_ERROR_PARAM_B   1536  /* Estimation error parameter */
+                                      /* (Gain before going down). */
+#define SUPGAIN_ERROR_PARAM_D   SUPGAIN_DEFAULT /* Estimation error parameter */
+                                /* (Should be the same as Default) (1 in Q8). */
+#define SUPGAIN_EPC_DT  200     /* SUPGAIN_ERROR_PARAM_C * ENERGY_DEV_TOL */
+
+/* Defines for "check delay estimation" */
+#define CORR_WIDTH      31      /* Number of samples to correlate over. */
+#define CORR_MAX        16      /* Maximum correlation offset. */
+#define CORR_MAX_BUF    63
+#define CORR_DEV        4
+#define CORR_MAX_LEVEL  20
+#define CORR_MAX_LOW    4
+/* 2 * CORR_MAX + 1. The whole expansion is parenthesized so that the macro */
+/* keeps its value inside larger expressions such as 2 * CORR_BUF_LEN. */
+#define CORR_BUF_LEN    ((CORR_MAX << 1) + 1)
+/* Note that CORR_WIDTH + 2*CORR_MAX <= MAX_BUF_LEN. */
+
+#define ONE_Q14         (1 << 14)
+
+/* NLP defines */
+#define NLP_COMP_LOW    3277    /* 0.2 in Q14 */
+#define NLP_COMP_HIGH   ONE_Q14 /* 1 in Q14 */
+
+#endif