For Android ARMv7 platforms, added a feature of dynamically detecting the existence of Neon,

and when it's present, switch to some functions optimized for Neon at run time. Review URL: http://webrtc-codereview.appspot.com/268002 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1096 4adac7df-926f-26a2-2b94-8c16560cd09d
2011-12-03 18:34:50 +00:00 · 2011-12-03 18:34:50 +00:00 · b59c031660
commit b59c031660
parent ae7017d588
13 changed files with 1274 additions and 796 deletions
--- a/Android.mk
+++ b/Android.mk
@ -54,6 +54,7 @@ include $(MY_WEBRTC_ROOT_PATH)/libvpx.mk
 LOCAL_PATH := $(call my-dir)
 include $(CLEAR_VARS)
 include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk
 LOCAL_ARM_MODE := arm
 LOCAL_MODULE := libwebrtc_audio_preprocessing
@ -71,6 +72,17 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
    libwebrtc_aecm \
    libwebrtc_system_wrappers
 # Add Neon libraries.
 ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS)))
 LOCAL_WHOLE_STATIC_LIBRARIES += \
    libwebrtc_aecm_neon \
    libwebrtc_ns_neon
 else ifeq ($(ARCH_ARM_HAVE_NEON),true)
 LOCAL_WHOLE_STATIC_LIBRARIES += \
    libwebrtc_aecm_neon \
    libwebrtc_ns_neon
 endif
 LOCAL_STATIC_LIBRARIES := \
    libprotobuf-cpp-2.3.0-lite
--- a/android-webrtc.mk
+++ b/android-webrtc.mk
@ -21,8 +21,9 @@ MY_WEBRTC_COMMON_DEFS := \
 #    '-DWEBRTC_MODULE_UTILITY_VIDEO' [module media_file] [module utility]
 ifeq ($(TARGET_ARCH),arm)
 MY_WEBRTC_COMMON_DEFS += \
    '-DWEBRTC_ARM_INLINE_CALLS' \
    '-DWEBRTC_ARCH_ARM'
 #    '-DWEBRTC_DETECT_ARM_NEON' # only used in a build configuration without Neon
 # TODO(kma): figure out if the above define could be moved to NDK build only.
 # TODO(kma): test if the code under next two macros works with generic GCC compilers
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
--- a/src/modules/audio_processing/aecm/Android.mk
+++ b/src/modules/audio_processing/aecm/Android.mk
@ -6,6 +6,9 @@
 # in the file PATENTS.  All contributing project authors may
 # be found in the AUTHORS file in the root of the source tree.
 #############################
 # Build the non-neon library.
 LOCAL_PATH := $(call my-dir)
 include $(CLEAR_VARS)
@ -21,21 +24,16 @@ LOCAL_SRC_FILES := \
    aecm_core.c
 # Flags passed to both C and C++ files.
-LOCAL_CFLAGS := \
+LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS)
    $(MY_WEBRTC_COMMON_DEFS)
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
 LOCAL_SRC_FILES += \
    aecm_core_neon.c
 LOCAL_CFLAGS += \
    $(MY_ARM_CFLAGS_NEON)
 endif
 LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/interface \
    $(LOCAL_PATH)/../utility \
    $(LOCAL_PATH)/../../.. \
-    $(LOCAL_PATH)/../../../common_audio/signal_processing/include
+    $(LOCAL_PATH)/../../../common_audio/signal_processing/include \
    $(LOCAL_PATH)/../../../system_wrappers/interface
 LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers
 LOCAL_SHARED_LIBRARIES := \
    libcutils \
@ -46,3 +44,31 @@ ifndef NDK_ROOT
 include external/stlport/libstlport.mk
 endif
 include $(BUILD_STATIC_LIBRARY)
 #########################
 # Build the neon library.
 include $(CLEAR_VARS)
 LOCAL_ARM_MODE := arm
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := libwebrtc_aecm_neon
 LOCAL_MODULE_TAGS := optional
 LOCAL_SRC_FILES := aecm_core_neon.c
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
    $(MY_WEBRTC_COMMON_DEFS) \
    -mfpu=neon \
    -flax-vector-conversions
 LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/interface \
    $(LOCAL_PATH)/../../.. \
    $(LOCAL_PATH)/../../../common_audio/signal_processing/include
 ifndef NDK_ROOT
 include external/stlport/libstlport.mk
 endif
 include $(BUILD_STATIC_LIBRARY)
--- a/src/modules/audio_processing/aecm/aecm_core.c
+++ b/src/modules/audio_processing/aecm/aecm_core.c
@ -13,8 +13,9 @@
 #include <assert.h>
 #include <stdlib.h>
-#include "echo_control_mobile.h"
+#include "cpu_features_wrapper.h"
 #include "delay_estimator_wrapper.h"
 #include "echo_control_mobile.h"
 #include "ring_buffer.h"
 #include "typedefs.h"
@ -263,6 +264,13 @@ static const uint16_t* AlignedFarend(AecmCore_t* self, int* far_q, int delay) {
 HANDLE logFile = NULL;
 #endif
 // Declare function pointers.
 CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
 StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
 ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
 WindowAndFFT WebRtcAecm_WindowAndFFT;
 InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
 int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
 {
    AecmCore_t *aecm = malloc(sizeof(AecmCore_t));
@ -346,6 +354,194 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const WebRtc_Word16* echo_pat
    aecm->mseChannelCount = 0;
 }
 static void WindowAndFFTC(WebRtc_Word16* fft,
                          const WebRtc_Word16* time_signal,
                          complex16_t* freq_signal,
                          int time_signal_scaling)
 {
    int i, j;
    memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
    // FFT of signal
    for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
    {
        // Window time domain signal and insert into real part of
        // transformation array |fft|
        fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
            (time_signal[i] << time_signal_scaling),
            WebRtcAecm_kSqrtHanning[i],
            14);
        fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
            (time_signal[i + PART_LEN] << time_signal_scaling),
            WebRtcAecm_kSqrtHanning[PART_LEN - i],
            14);
        // Inserting zeros in imaginary parts not necessary since we
        // initialized the array with all zeros
    }
    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
    WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
    // Take only the first PART_LEN2 samples
    for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
    {
        freq_signal[i].real = fft[j];
        // The imaginary part has to switch sign
        freq_signal[i].imag = - fft[j+1];
    }
 }
 static void InverseFFTAndWindowC(AecmCore_t* aecm,
                                 WebRtc_Word16* fft,
                                 complex16_t* efw,
                                 WebRtc_Word16* output,
                                 const WebRtc_Word16* nearendClean)
 {
    int i, j, outCFFT;
    WebRtc_Word32 tmp32no1;
    // Synthesis
    for (i = 1; i < PART_LEN; i++)
    {
        j = WEBRTC_SPL_LSHIFT_W32(i, 1);
        fft[j] = efw[i].real;
        // mirrored data, even
        fft[PART_LEN4 - j] = efw[i].real;
        fft[j + 1] = -efw[i].imag;
        //mirrored data, odd
        fft[PART_LEN4 - (j - 1)] = efw[i].imag;
    }
    fft[0] = efw[0].real;
    fft[1] = -efw[0].imag;
    fft[PART_LEN2] = efw[PART_LEN].real;
    fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
    // inverse FFT, result should be scaled with outCFFT
    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
    outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
    //take only the real values and scale with outCFFT
    for (i = 0; i < PART_LEN2; i++)
    {
        j = WEBRTC_SPL_LSHIFT_W32(i, 1);
        fft[i] = fft[j];
    }
    for (i = 0; i < PART_LEN; i++)
    {
        fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                fft[i],
                WebRtcAecm_kSqrtHanning[i],
                14);
        tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
                outCFFT - aecm->dfaCleanQDomain);
        fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
                tmp32no1 + aecm->outBuf[i],
                WEBRTC_SPL_WORD16_MIN);
        output[i] = fft[i];
        tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
                fft[PART_LEN + i],
                WebRtcAecm_kSqrtHanning[PART_LEN - i],
                14);
        tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
                outCFFT - aecm->dfaCleanQDomain);
        aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
                WEBRTC_SPL_WORD16_MAX,
                tmp32no1,
                WEBRTC_SPL_WORD16_MIN);
    }
 #ifdef ARM_WINM_LOG_
    // measure tick end
    QueryPerformanceCounter((LARGE_INTEGER*)&end);
    diff__ = ((end - start) * 1000) / (freq/1000);
    milliseconds = (unsigned int)(diff__ & 0xffffffff);
    WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
 #endif
    // Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
    memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
    memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
    if (nearendClean != NULL)
    {
        memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
    }
 }
 static void CalcLinearEnergiesC(AecmCore_t* aecm,
                                const WebRtc_UWord16* far_spectrum,
                                WebRtc_Word32* echo_est,
                                WebRtc_UWord32* far_energy,
                                WebRtc_UWord32* echo_energy_adapt,
                                WebRtc_UWord32* echo_energy_stored)
 {
    int i;
    // Get energy for the delayed far end signal and estimated
    // echo using both stored and adapted channels.
    for (i = 0; i < PART_LEN1; i++)
    {
        echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
                                           far_spectrum[i]);
        (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
        (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
                                          far_spectrum[i]);
        (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
    }
 }
 static void StoreAdaptiveChannelC(AecmCore_t* aecm,
                                  const WebRtc_UWord16* far_spectrum,
                                  WebRtc_Word32* echo_est)
 {
    int i;
    // During startup we store the channel every block.
    memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
    // Recalculate echo estimate
    for (i = 0; i < PART_LEN; i += 4)
    {
        echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
                                           far_spectrum[i]);
        echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
                                           far_spectrum[i + 1]);
        echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
                                           far_spectrum[i + 2]);
        echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
                                           far_spectrum[i + 3]);
    }
    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
                                       far_spectrum[i]);
 }
 static void ResetAdaptiveChannelC(AecmCore_t* aecm)
 {
    int i;
    // The stored channel has a significantly lower MSE than the adaptive one for
    // two consecutive calculations. Reset the adaptive channel.
    memcpy(aecm->channelAdapt16, aecm->channelStored,
           sizeof(WebRtc_Word16) * PART_LEN1);
    // Restore the W32 channel
    for (i = 0; i < PART_LEN; i += 4)
    {
        aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
                (WebRtc_Word32)aecm->channelStored[i], 16);
        aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
                (WebRtc_Word32)aecm->channelStored[i + 1], 16);
        aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
                (WebRtc_Word32)aecm->channelStored[i + 2], 16);
        aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
                (WebRtc_Word32)aecm->channelStored[i + 3], 16);
    }
    aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
 }
 // WebRtcAecm_InitCore(...)
 //
 // This function initializes the AECM instant created with WebRtcAecm_CreateCore(...)
@ -463,6 +659,23 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
    assert(PART_LEN % 16 == 0);
    // Initialize function pointers.
    WebRtcAecm_WindowAndFFT = WindowAndFFTC;
    WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC;
    WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesC;
    WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelC;
    WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelC;
 #ifdef WEBRTC_DETECT_ARM_NEON
    uint64_t features = WebRtc_GetCPUFeaturesARM();
    if ((features & kCPUFeatureNEON) != 0)
    {
        WebRtcAecm_InitNeon();
    }
 #elif defined(WEBRTC_ARCH_ARM_NEON)
    WebRtcAecm_InitNeon();
 #endif
    return 0;
 }
@ -1890,194 +2103,3 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far
    aecm->farBufReadPos += readLen;
 }
 #if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
 void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
                    const WebRtc_Word16* time_signal,
                    complex16_t* freq_signal,
                    int time_signal_scaling)
 {
    int i, j;
    memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
    // FFT of signal
    for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
    {
        // Window time domain signal and insert into real part of
        // transformation array |fft|
        fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
            (time_signal[i] << time_signal_scaling),
            WebRtcAecm_kSqrtHanning[i],
            14);
        fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
            (time_signal[i + PART_LEN] << time_signal_scaling),
            WebRtcAecm_kSqrtHanning[PART_LEN - i],
            14);
        // Inserting zeros in imaginary parts not necessary since we
        // initialized the array with all zeros
    }
    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
    WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
    // Take only the first PART_LEN2 samples
    for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
    {
        freq_signal[i].real = fft[j];
        // The imaginary part has to switch sign
        freq_signal[i].imag = - fft[j+1];
    }
 }
 void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
                        WebRtc_Word16* fft,
                        complex16_t* efw,
                        WebRtc_Word16* output,
                        const WebRtc_Word16* nearendClean)
 {
    int i, j, outCFFT;
    WebRtc_Word32 tmp32no1;
    // Synthesis
    for (i = 1; i < PART_LEN; i++)
    {
        j = WEBRTC_SPL_LSHIFT_W32(i, 1);
        fft[j] = efw[i].real;
        // mirrored data, even
        fft[PART_LEN4 - j] = efw[i].real;
        fft[j + 1] = -efw[i].imag;
        //mirrored data, odd
        fft[PART_LEN4 - (j - 1)] = efw[i].imag;
    }
    fft[0] = efw[0].real;
    fft[1] = -efw[0].imag;
    fft[PART_LEN2] = efw[PART_LEN].real;
    fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
    // inverse FFT, result should be scaled with outCFFT
    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
    outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
    //take only the real values and scale with outCFFT
    for (i = 0; i < PART_LEN2; i++)
    {
        j = WEBRTC_SPL_LSHIFT_W32(i, 1);
        fft[i] = fft[j];
    }
    for (i = 0; i < PART_LEN; i++)
    {
        fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                fft[i],
                WebRtcAecm_kSqrtHanning[i],
                14);
        tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
                outCFFT - aecm->dfaCleanQDomain);
        fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
                tmp32no1 + aecm->outBuf[i],
                WEBRTC_SPL_WORD16_MIN);
        output[i] = fft[i];
        tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
                fft[PART_LEN + i],
                WebRtcAecm_kSqrtHanning[PART_LEN - i],
                14);
        tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
                outCFFT - aecm->dfaCleanQDomain);
        aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
                WEBRTC_SPL_WORD16_MAX,
                tmp32no1,
                WEBRTC_SPL_WORD16_MIN);
    }
 #ifdef ARM_WINM_LOG_
    // measure tick end
    QueryPerformanceCounter((LARGE_INTEGER*)&end);
    diff__ = ((end - start) * 1000) / (freq/1000);
    milliseconds = (unsigned int)(diff__ & 0xffffffff);
    WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
 #endif
    // Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
    memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
    memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
    if (nearendClean != NULL)
    {
        memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
    }
 }
 void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
                                   const WebRtc_UWord16* far_spectrum,
                                   WebRtc_Word32* echo_est,
                                   WebRtc_UWord32* far_energy,
                                   WebRtc_UWord32* echo_energy_adapt,
                                   WebRtc_UWord32* echo_energy_stored)
 {
    int i;
    // Get energy for the delayed far end signal and estimated
    // echo using both stored and adapted channels.
    for (i = 0; i < PART_LEN1; i++)
    {
        echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
                                           far_spectrum[i]);
        (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
        (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
                                          far_spectrum[i]);
        (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
    }
 }
 void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
                                     const WebRtc_UWord16* far_spectrum,
                                     WebRtc_Word32* echo_est)
 {
    int i;
    // During startup we store the channel every block.
    memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
    // Recalculate echo estimate
    for (i = 0; i < PART_LEN; i += 4)
    {
        echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
                                           far_spectrum[i]);
        echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
                                           far_spectrum[i + 1]);
        echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
                                           far_spectrum[i + 2]);
        echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
                                           far_spectrum[i + 3]);
    }
    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
                                       far_spectrum[i]);
 }
 void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
 {
    int i;
    // The stored channel has a significantly lower MSE than the adaptive one for
    // two consecutive calculations. Reset the adaptive channel.
    memcpy(aecm->channelAdapt16, aecm->channelStored,
           sizeof(WebRtc_Word16) * PART_LEN1);
    // Restore the W32 channel
    for (i = 0; i < PART_LEN; i += 4)
    {
        aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
                (WebRtc_Word32)aecm->channelStored[i], 16);
        aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
                (WebRtc_Word32)aecm->channelStored[i + 1], 16);
        aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
                (WebRtc_Word32)aecm->channelStored[i + 2], 16);
        aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
                (WebRtc_Word32)aecm->channelStored[i + 3], 16);
    }
    aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
 }
 #endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
--- a/src/modules/audio_processing/aecm/aecm_core.h
+++ b/src/modules/audio_processing/aecm/aecm_core.h
@ -332,32 +332,44 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co
 void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend,
                              const int farLen, const int knownDelay);
-///////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
-// Some internal functions shared by ARM NEON and generic C code:
+// Some function pointers, for internal functions shared by ARM NEON and 
 // generic C code.
 //
-
+typedef void (*CalcLinearEnergies)(
-void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
+    AecmCore_t* aecm,
    const WebRtc_UWord16* far_spectrum,
    WebRtc_Word32* echoEst,
    WebRtc_UWord32* far_energy,
    WebRtc_UWord32* echo_energy_adapt,
    WebRtc_UWord32* echo_energy_stored);
 extern CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
-void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
+typedef void (*StoreAdaptiveChannel)(
    AecmCore_t* aecm,
    const WebRtc_UWord16* far_spectrum,
    WebRtc_Word32* echo_est);
 extern StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
-void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
+typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm);
 extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
-void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
+typedef void (*WindowAndFFT)(
    WebRtc_Word16* fft,
    const WebRtc_Word16* time_signal,
    complex16_t* freq_signal,
    int time_signal_scaling);
 extern WindowAndFFT WebRtcAecm_WindowAndFFT;
-void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
+typedef void (*InverseFFTAndWindow)(
-                                    WebRtc_Word16* fft,
+    AecmCore_t* aecm,
-                                    complex16_t* efw,
+    WebRtc_Word16* fft, complex16_t* efw,
    WebRtc_Word16* output,
    const WebRtc_Word16* nearendClean);
 extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
 // Initialization of the above function pointers for ARM Neon.
 void WebRtcAecm_InitNeon(void);
 #endif
--- a/src/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/src/modules/audio_processing/aecm/aecm_core_neon.c
@ -7,7 +7,6 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
 #include "aecm_core.h"
@ -16,7 +15,7 @@
 // Square root of Hanning window in Q14.
-static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {       
+static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = {
  16384, 16373, 16354, 16325, 
  16286, 16237, 16179, 16111,
  16034, 15947, 15851, 15746,
@ -35,18 +34,16 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8)))
  1594,  1196,  798,   399
 };
-void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
+static void WindowAndFFTNeon(WebRtc_Word16* fft,
                             const WebRtc_Word16* time_signal,
                             complex16_t* freq_signal,
-                             int time_signal_scaling)
+                             int time_signal_scaling) {
 {
  int i, j;
  int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
  __asm__("vmov.i16 d21, #0" ::: "d21");
-    for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
+  for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
    {
    int16x4_t tmp16x4_0;
    int16x4_t tmp16x4_1;
    int32x4_t tmp32x4_0;
@ -80,8 +77,7 @@ void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
  WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
  // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
-    for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16)
+  for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) {
    {
    __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
    __asm__("vneg.s16 d22, d22" : : : "q10");
    __asm__("vneg.s16 d23, d23" : : : "q11");
@ -90,18 +86,16 @@ void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
  }
 }
-void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
+static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
                                    WebRtc_Word16* fft,
                                    complex16_t* efw,
                                    WebRtc_Word16* output,
-                        const WebRtc_Word16* nearendClean)
+                                    const WebRtc_Word16* nearendClean) {
 {
  int i, j, outCFFT;
  WebRtc_Word32 tmp32no1;
  // Synthesis
-    for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
+  for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
    {
    // We overwrite two more elements in fft[], but it's ok.
    __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
    __asm__("vmov q11, q10" : : : "q10", "q11");
@ -121,8 +115,7 @@ void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
  outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
  // Take only the real values and scale with outCFFT.
-    for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16)
+  for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) {
    {
    __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
    __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
  }
@ -130,8 +123,7 @@ void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
  int32x4_t tmp32x4_2;
  __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
      (outCFFT - aecm->dfaCleanQDomain)));
-    for (i = 0; i < PART_LEN; i += 4)
+  for (i = 0; i < PART_LEN; i += 4) {
    {
    int16x4_t tmp16x4_0;
    int16x4_t tmp16x4_1;
    int32x4_t tmp32x4_0;
@ -174,22 +166,19 @@ void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
  }
  // Copy the current block to the old position (outBuf is shifted elsewhere).
-    for (i = 0; i < PART_LEN; i += 16)
+  for (i = 0; i < PART_LEN; i += 16) {
    {
    __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
            "r"(&aecm->xBuf[i + PART_LEN]) : "q10");
    __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
  }
-    for (i = 0; i < PART_LEN; i += 16)
+  for (i = 0; i < PART_LEN; i += 16) {
    {
    __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
            "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
    __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
            "r"(&aecm->dBufNoisy[i]): "q10");
  }
  if (nearendClean != NULL) {
-        for (i = 0; i < PART_LEN; i += 16)
+    for (i = 0; i < PART_LEN; i += 16) {
        {
      __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
              "r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
      __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
@ -198,13 +187,12 @@ void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
  }
 }
-void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
+static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
                                   const WebRtc_UWord16* far_spectrum,
                                   WebRtc_Word32* echo_est,
                                   WebRtc_UWord32* far_energy,
                                   WebRtc_UWord32* echo_energy_adapt,
-                                   WebRtc_UWord32* echo_energy_stored)
+                                   WebRtc_UWord32* echo_energy_stored) {
 {
  int i;
  register WebRtc_UWord32 far_energy_r;
@ -216,8 +204,7 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
  __asm__("vmov.i32 q8,  #0" : : : "q8"); // echo_energy_stored
  __asm__("vmov.i32 q9,  #0" : : : "q9"); // echo_energy_adapt
-    for(i = 0; i < PART_LEN -7; i += 8)
+  for (i = 0; i < PART_LEN - 7; i += 8) {
    {
    // far_energy += (WebRtc_UWord32)(far_spectrum[i]);
    __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
    __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
@ -264,16 +251,14 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
      aecm->channelAdapt16[i], far_spectrum[i]);
 }
-void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
+static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
                                     const WebRtc_UWord16* far_spectrum,
-                                     WebRtc_Word32* echo_est)
+                                     WebRtc_Word32* echo_est) {
 {
  int i;
  // During startup we store the channel every block.
  // Recalculate echo estimate.
-    for(i = 0; i < PART_LEN -7; i += 8)
+  for (i = 0; i < PART_LEN - 7; i += 8) {
    {
    // aecm->channelStored[i] = acem->channelAdapt16[i];
    // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
    __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
@ -288,12 +273,10 @@ void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
 }
-void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
+static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
 {
  int i;
-    for(i = 0; i < PART_LEN -7; i += 8)
+  for (i = 0; i < PART_LEN - 7; i += 8) {
    {
    // aecm->channelAdapt16[i] = aecm->channelStored[i];
    // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
    //                           aecm->channelStored[i], 16);
@ -311,4 +294,10 @@ void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
      (WebRtc_Word32)aecm->channelStored[i], 16);
 }
-#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
+void WebRtcAecm_InitNeon(void) {
  WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
  WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
  WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
  WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
  WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
 }
--- a/src/modules/audio_processing/ns/Android.mk
+++ b/src/modules/audio_processing/ns/Android.mk
@ -6,6 +6,8 @@
 # in the file PATENTS.  All contributing project authors may
 # be found in the AUTHORS file in the root of the source tree.
 #############################
 # Build the non-neon library.
 LOCAL_PATH := $(call my-dir)
 include $(CLEAR_VARS)
@ -20,25 +22,20 @@ LOCAL_SRC_FILES := \
    noise_suppression_x.c \
    nsx_core.c
-# floating point
+# Files for floating point.
 # noise_suppression.c ns_core.c 
 # Flags passed to both C and C++ files.
-LOCAL_CFLAGS := \
+LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS)
    $(MY_WEBRTC_COMMON_DEFS)
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
 LOCAL_SRC_FILES += \
    nsx_core_neon.c
 LOCAL_CFLAGS += \
    $(MY_ARM_CFLAGS_NEON)
 endif
 LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/interface \
    $(LOCAL_PATH)/../utility \
    $(LOCAL_PATH)/../../.. \
-    $(LOCAL_PATH)/../../../common_audio/signal_processing/include 
+    $(LOCAL_PATH)/../../../common_audio/signal_processing/include \
    $(LOCAL_PATH)/../../../system_wrappers/interface
 LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers
 LOCAL_SHARED_LIBRARIES := \
    libcutils \
@ -49,3 +46,31 @@ ifndef NDK_ROOT
 include external/stlport/libstlport.mk
 endif
 include $(BUILD_STATIC_LIBRARY)
 #############################
 # Build the neon library.
 include $(CLEAR_VARS)
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := libwebrtc_ns_neon
 LOCAL_MODULE_TAGS := optional
 LOCAL_GENERATED_SOURCES :=
 LOCAL_SRC_FILES := nsx_core_neon.c
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
    $(MY_WEBRTC_COMMON_DEFS) \
    -mfpu=neon \
    -flax-vector-conversions
 LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/interface \
    $(LOCAL_PATH)/../../.. \
    $(LOCAL_PATH)/../../../common_audio/signal_processing/include
 ifndef NDK_ROOT
 include external/stlport/libstlport.mk
 endif
 include $(BUILD_STATIC_LIBRARY)
--- a/src/modules/audio_processing/ns/nsx_core.c
+++ b/src/modules/audio_processing/ns/nsx_core.c
@ -16,6 +16,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include "cpu_features_wrapper.h"
 #include "nsx_core.h"
 // Skip first frequency bins during estimation. (0 <= value < 64)
@ -426,6 +427,271 @@ static const WebRtc_Word16 kDeterminantEstMatrix[66] = {
  355,    330
 };
 // Declare function pointers.
 NoiseEstimation WebRtcNsx_NoiseEstimation;
 PrepareSpectrum WebRtcNsx_PrepareSpectrum;
 SynthesisUpdate WebRtcNsx_SynthesisUpdate;
 AnalysisUpdate WebRtcNsx_AnalysisUpdate;
 Denormalize WebRtcNsx_Denormalize;
 CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
 // Update the noise estimation information.
 static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
  WebRtc_Word32 tmp32no1 = 0;
  WebRtc_Word32 tmp32no2 = 0;
  WebRtc_Word16 tmp16 = 0;
  const WebRtc_Word16 kExp2Const = 11819; // Q13
  int i = 0;
  tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
                                   inst->magnLen);
  // Guarantee a Q-domain as high as possible and still fit in int16
  inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                   kExp2Const, tmp16, 21);
  for (i = 0; i < inst->magnLen; i++) {
    // inst->quantile[i]=exp(inst->lquantile[offset+i]);
    // in Q21
    tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
                                    inst->noiseEstLogQuantile[offset + i]);
    tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
    tmp16 -= 21;// shift 21 to get result in Q0
    tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
    if (tmp16 < 0) {
      tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
    } else {
      tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
    }
    inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
  }
 }
 // Noise Estimation
 static void NoiseEstimationC(NsxInst_t* inst,
                             uint16_t* magn,
                             uint32_t* noise,
                             int16_t* q_noise) {
  WebRtc_Word32 numerator = FACTOR_Q16;
  WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
  WebRtc_Word16 countProd, delta, zeros, frac;
  WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
  const int16_t log2_const = 22713; // Q15
  const int16_t width_factor = 21845;
  int i, s, offset;
  tabind = inst->stages - inst->normData;
  assert(tabind < 9);
  assert(tabind > -9);
  if (tabind < 0) {
    logval = -WebRtcNsx_kLogTable[-tabind];
  } else {
    logval = WebRtcNsx_kLogTable[tabind];
  }
  // lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
  // magn is in Q(-stages), and the real lmagn values are:
  // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
  // lmagn in Q8
  for (i = 0; i < inst->magnLen; i++) {
    if (magn[i]) {
      zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
      frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
                              & 0x7FFFFFFF) >> 23);
      // log2(magn(i))
      assert(frac < 256);
      log2 = (WebRtc_Word16)(((31 - zeros) << 8)
                             + WebRtcNsx_kLogTableFrac[frac]);
      // log2(magn(i))*log(2)
      lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
      // + log(2^stages)
      lmagn[i] += logval;
    } else {
      lmagn[i] = logval;//0;
    }
  }
  // loop over simultaneous estimates
  for (s = 0; s < SIMULT; s++) {
    offset = s * inst->magnLen;
    // Get counter values from state
    counter = inst->noiseEstCounter[s];
    assert(counter < 201);
    countDiv = WebRtcNsx_kCounterDiv[counter];
    countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
    // quant_est(...)
    for (i = 0; i < inst->magnLen; i++) {
      // compute delta
      if (inst->noiseEstDensity[offset + i] > 512) {
        delta = WebRtcSpl_DivW32W16ResW16(numerator,
                                          inst->noiseEstDensity[offset + i]);
      } else {
        delta = FACTOR_Q7;
        if (inst->blockIndex < END_STARTUP_LONG) {
          // Smaller step size during startup. This prevents from using
          // unrealistic values causing overflow.
          delta = FACTOR_Q7_STARTUP;
        }
      }
      // update log quantile estimate
      tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
      if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
        // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
        // CounterDiv=1/(inst->counter[s]+1) in Q15
        tmp16 += 2;
        tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
        inst->noiseEstLogQuantile[offset + i] += tmp16no1;
      } else {
        tmp16 += 1;
        tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
        // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
        tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
        inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
        if (inst->noiseEstLogQuantile[offset + i] < logval) {
          // This is the smallest fixed point representation we can
          // have, hence we limit the output.
          inst->noiseEstLogQuantile[offset + i] = logval;
        }
      }
      // update density estimate
      if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
          < WIDTH_Q8) {
        tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                     inst->noiseEstDensity[offset + i], countProd, 15);
        tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                     width_factor, countDiv, 15);
        inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
      }
    } // end loop over magnitude spectrum
    if (counter >= END_STARTUP_LONG) {
      inst->noiseEstCounter[s] = 0;
      if (inst->blockIndex >= END_STARTUP_LONG) {
        UpdateNoiseEstimate(inst, offset);
      }
    }
    inst->noiseEstCounter[s]++;
  } // end loop over simultaneous estimates
  // Sequentially update the noise during startup
  if (inst->blockIndex < END_STARTUP_LONG) {
    UpdateNoiseEstimate(inst, offset);
  }
  for (i = 0; i < inst->magnLen; i++) {
    noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
  }
  (*q_noise) = (WebRtc_Word16)inst->qNoise;
 }
 // Filter the data in the frequency domain, and create spectrum.
 static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) {
  int i = 0, j = 0;
  int16_t tmp16 = 0;
  for (i = 0; i < inst->magnLen; i++) {
    inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
        (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
    inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
        (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
  }
  freq_buf[0] = inst->real[0];
  freq_buf[1] = -inst->imag[0];
  for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
    tmp16 = (inst->anaLen << 1) - j;
    freq_buf[j] = inst->real[i];
    freq_buf[j + 1] = -inst->imag[i];
    freq_buf[tmp16] = inst->real[i];
    freq_buf[tmp16 + 1] = inst->imag[i];
  }
  freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
  freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
 }
 // Denormalize the input buffer.
 static __inline void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) {
  int i = 0, j = 0;
  int32_t tmp32 = 0;
  for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
    tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
                                 factor - inst->normData);
    inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
  }
 }
 // For the noise supression process, synthesis, read out fully processed
 // segment, and update synthesis buffer.
 static void SynthesisUpdateC(NsxInst_t* inst,
                             int16_t* out_frame,
                             int16_t gain_factor) {
  int i = 0;
  int16_t tmp16a = 0;
  int16_t tmp16b = 0;
  int32_t tmp32 = 0;
  // synthesis
  for (i = 0; i < inst->anaLen; i++) {
    tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                 inst->window[i], inst->real[i], 14); // Q0, window in Q14
    tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
    // Down shift with rounding
    tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
                                                      tmp16b); // Q0
  }
  // read out fully processed segment
  for (i = 0; i < inst->blockLen10ms; i++) {
    out_frame[i] = inst->synthesisBuffer[i]; // Q0
  }
  // update synthesis buffer
  WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
                        inst->synthesisBuffer + inst->blockLen10ms,
                        inst->anaLen - inst->blockLen10ms);
  WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
      + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
 }
 // Update analysis buffer for lower band, and window data before FFT.
 static void AnalysisUpdateC(NsxInst_t* inst,
                            int16_t* out,
                            int16_t* new_speech) {
  int i = 0;
  // For lower band update analysis buffer.
  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
                        inst->analysisBuffer + inst->blockLen10ms,
                        inst->anaLen - inst->blockLen10ms);
  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
      + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
  // Window data before FFT.
  for (i = 0; i < inst->anaLen; i++) {
    out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
               inst->window[i], inst->analysisBuffer[i], 14); // Q0
  }
 }
 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
 // zeros, and normalize it.
 static __inline void CreateComplexBufferC(NsxInst_t* inst,
                                          int16_t* in,
                                          int16_t* out) {
  int i = 0, j = 0;
  for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
    out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
    out[j + 1] = 0; // Insert zeros in imaginary part
  }
 }
 void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst,
                                           WebRtc_Word16 pink_noise_exp_avg,
                                           WebRtc_Word32 pink_noise_num_avg,
@ -600,6 +866,24 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs) {
  inst->file5 = fopen("file5.pcm", "wb");
 #endif
  // Initialize function pointers.
  WebRtcNsx_NoiseEstimation = NoiseEstimationC;
  WebRtcNsx_PrepareSpectrum = PrepareSpectrumC;
  WebRtcNsx_SynthesisUpdate = SynthesisUpdateC;
  WebRtcNsx_AnalysisUpdate = AnalysisUpdateC;
  WebRtcNsx_Denormalize = DenormalizeC;
  WebRtcNsx_CreateComplexBuffer = CreateComplexBufferC;
 #ifdef WEBRTC_DETECT_ARM_NEON
    uint64_t features = WebRtc_GetCPUFeaturesARM();
    if ((features & kCPUFeatureNEON) != 0)
    {
        WebRtcNsx_InitNeon();
    }
 #elif defined(WEBRTC_ARCH_ARM_NEON)
    WebRtcNsx_InitNeon();
 #endif
  inst->initFlag = 1;
  return 0;
@ -2157,263 +2441,4 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram
  return 0;
 }
 #if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
 // Update the noise estimation information.
 static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
  WebRtc_Word32 tmp32no1 = 0;
  WebRtc_Word32 tmp32no2 = 0;
  WebRtc_Word16 tmp16 = 0;
  const WebRtc_Word16 kExp2Const = 11819; // Q13
  int i = 0;
  tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
                                   inst->magnLen);
  // Guarantee a Q-domain as high as possible and still fit in int16
  inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                   kExp2Const, tmp16, 21);
  for (i = 0; i < inst->magnLen; i++) {
    // inst->quantile[i]=exp(inst->lquantile[offset+i]);
    // in Q21
    tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
                                    inst->noiseEstLogQuantile[offset + i]);
    tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
    tmp16 -= 21;// shift 21 to get result in Q0
    tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
    if (tmp16 < 0) {
      tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
    } else {
      tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
    }
    inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
  }
 }
 // Noise Estimation
 void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
                               uint16_t* magn,
                               uint32_t* noise,
                               int16_t* q_noise) {
  WebRtc_Word32 numerator = FACTOR_Q16;
  WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
  WebRtc_Word16 countProd, delta, zeros, frac;
  WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
  const int16_t log2_const = 22713; // Q15
  const int16_t width_factor = 21845;
  int i, s, offset;
  tabind = inst->stages - inst->normData;
  assert(tabind < 9);
  assert(tabind > -9);
  if (tabind < 0) {
    logval = -WebRtcNsx_kLogTable[-tabind];
  } else {
    logval = WebRtcNsx_kLogTable[tabind];
  }
  // lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
  // magn is in Q(-stages), and the real lmagn values are:
  // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
  // lmagn in Q8
  for (i = 0; i < inst->magnLen; i++) {
    if (magn[i]) {
      zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
      frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
                              & 0x7FFFFFFF) >> 23);
      // log2(magn(i))
      assert(frac < 256);
      log2 = (WebRtc_Word16)(((31 - zeros) << 8)
                             + WebRtcNsx_kLogTableFrac[frac]);
      // log2(magn(i))*log(2)
      lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
      // + log(2^stages)
      lmagn[i] += logval;
    } else {
      lmagn[i] = logval;//0;
    }
  }
  // loop over simultaneous estimates
  for (s = 0; s < SIMULT; s++) {
    offset = s * inst->magnLen;
    // Get counter values from state
    counter = inst->noiseEstCounter[s];
    assert(counter < 201);
    countDiv = WebRtcNsx_kCounterDiv[counter];
    countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
    // quant_est(...)
    for (i = 0; i < inst->magnLen; i++) {
      // compute delta
      if (inst->noiseEstDensity[offset + i] > 512) {
        delta = WebRtcSpl_DivW32W16ResW16(numerator,
                                          inst->noiseEstDensity[offset + i]);
      } else {
        delta = FACTOR_Q7;
        if (inst->blockIndex < END_STARTUP_LONG) {
          // Smaller step size during startup. This prevents from using
          // unrealistic values causing overflow.
          delta = FACTOR_Q7_STARTUP;
        }
      }
      // update log quantile estimate
      tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
      if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
        // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
        // CounterDiv=1/(inst->counter[s]+1) in Q15
        tmp16 += 2;
        tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
        inst->noiseEstLogQuantile[offset + i] += tmp16no1;
      } else {
        tmp16 += 1;
        tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
        // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
        tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
        inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
        if (inst->noiseEstLogQuantile[offset + i] < logval) {
          // This is the smallest fixed point representation we can
          // have, hence we limit the output.
          inst->noiseEstLogQuantile[offset + i] = logval;
        }
      }
      // update density estimate
      if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
          < WIDTH_Q8) {
        tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                     inst->noiseEstDensity[offset + i], countProd, 15);
        tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                     width_factor, countDiv, 15);
        inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
      }
    } // end loop over magnitude spectrum
    if (counter >= END_STARTUP_LONG) {
      inst->noiseEstCounter[s] = 0;
      if (inst->blockIndex >= END_STARTUP_LONG) {
        UpdateNoiseEstimate(inst, offset);
      }
    }
    inst->noiseEstCounter[s]++;
  } // end loop over simultaneous estimates
  // Sequentially update the noise during startup
  if (inst->blockIndex < END_STARTUP_LONG) {
    UpdateNoiseEstimate(inst, offset);
  }
  for (i = 0; i < inst->magnLen; i++) {
    noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
  }
  (*q_noise) = (WebRtc_Word16)inst->qNoise;
 }
 // Filter the data in the frequency domain, and create spectrum.
 void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
  int i = 0, j = 0;
  int16_t tmp16 = 0;
  for (i = 0; i < inst->magnLen; i++) {
    inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
        (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
    inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
        (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
  }
  freq_buf[0] = inst->real[0];
  freq_buf[1] = -inst->imag[0];
  for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
    tmp16 = (inst->anaLen << 1) - j;
    freq_buf[j] = inst->real[i];
    freq_buf[j + 1] = -inst->imag[i];
    freq_buf[tmp16] = inst->real[i];
    freq_buf[tmp16 + 1] = inst->imag[i];
  }
  freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
  freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
 }
 // Denormalize the input buffer.
 __inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
  int i = 0, j = 0;
  int32_t tmp32 = 0;
  for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
    tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
                                 factor - inst->normData);
    inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
  }
 }
 // For the noise supression process, synthesis, read out fully processed
 // segment, and update synthesis buffer.
 void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
                               int16_t* out_frame,
                               int16_t gain_factor) {
  int i = 0;
  int16_t tmp16a = 0;
  int16_t tmp16b = 0;
  int32_t tmp32 = 0;
  // synthesis
  for (i = 0; i < inst->anaLen; i++) {
    tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                 inst->window[i], inst->real[i], 14); // Q0, window in Q14
    tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
    // Down shift with rounding
    tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
    inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
                                                      tmp16b); // Q0
  }
  // read out fully processed segment
  for (i = 0; i < inst->blockLen10ms; i++) {
    out_frame[i] = inst->synthesisBuffer[i]; // Q0
  }
  // update synthesis buffer
  WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
                        inst->synthesisBuffer + inst->blockLen10ms,
                        inst->anaLen - inst->blockLen10ms);
  WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
      + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
 }
 // Update analysis buffer for lower band, and window data before FFT.
 void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
                              int16_t* out,
                              int16_t* new_speech) {
  int i = 0;
  // For lower band update analysis buffer.
  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
                        inst->analysisBuffer + inst->blockLen10ms,
                        inst->anaLen - inst->blockLen10ms);
  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
      + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
  // Window data before FFT.
  for (i = 0; i < inst->anaLen; i++) {
    out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
               inst->window[i], inst->analysisBuffer[i], 14); // Q0
  }
 }
 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
 // zeros, and normalize it.
 __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
                                            int16_t* in,
                                            int16_t* out) {
  int i = 0, j = 0;
  for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
    out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
    out[j + 1] = 0; // Insert zeros in imaginary part
  }
 }
 #endif  // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
--- a/src/modules/audio_processing/ns/nsx_core.h
+++ b/src/modules/audio_processing/ns/nsx_core.h
@ -165,40 +165,51 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst,
                          short* outFrameHigh);
 /****************************************************************************
- * Internal functions and variable declarations shared with optimized code.
+ * Some function pointers, for internal functions shared by ARM NEON and 
 * generic C code.
 */
 // Noise Estimation.
-void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
+typedef void (*NoiseEstimation)(NsxInst_t* inst,
                                uint16_t* magn,
                                uint32_t* noise,
                                int16_t* q_noise);
 extern NoiseEstimation WebRtcNsx_NoiseEstimation;
 // Filter the data in the frequency domain, and create spectrum.
-void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst,
+typedef void (*PrepareSpectrum)(NsxInst_t* inst,
                                int16_t* freq_buff);
 extern PrepareSpectrum WebRtcNsx_PrepareSpectrum;
 // For the noise supression process, synthesis, read out fully processed
 // segment, and update synthesis buffer.
-void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
+typedef void (*SynthesisUpdate)(NsxInst_t* inst,
                                int16_t* out_frame,
                                int16_t gain_factor);
 extern SynthesisUpdate WebRtcNsx_SynthesisUpdate;
 // Update analysis buffer for lower band, and window data before FFT.
-void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
+typedef void (*AnalysisUpdate)(NsxInst_t* inst,
                               int16_t* out,
                               int16_t* new_speech);
 extern AnalysisUpdate WebRtcNsx_AnalysisUpdate;
 // Denormalize the input buffer.
-__inline void WebRtcNsx_Denormalize(NsxInst_t* inst,
+typedef void (*Denormalize)(NsxInst_t* inst,
                            int16_t* in,
                            int factor);
 extern Denormalize WebRtcNsx_Denormalize;
 // Create a complex number buffer, as the intput interleaved with zeros,
 // and normalize it.
-__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
+typedef void (*CreateComplexBuffer)(NsxInst_t* inst,
                                    int16_t* in,
                                    int16_t* out);
 extern CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
 /****************************************************************************
 * Initialization of the above function pointers for ARM Neon.
 */
 void WebRtcNsx_InitNeon(void);
 extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
 extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];
--- a/src/modules/audio_processing/ns/nsx_core_neon.c
+++ b/src/modules/audio_processing/ns/nsx_core_neon.c
@ -8,15 +8,13 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #if defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
 #include "nsx_core.h"
 #include <arm_neon.h>
 #include <assert.h>
 // Update the noise estimation information.
-static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
+static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) {
  int i = 0;
  const int16_t kExp2Const = 11819; // Q13
  int16_t* ptr_noiseEstLogQuantile = NULL;
@ -94,7 +92,7 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
 }
 // Noise Estimation
-void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
+static void NoiseEstimationNeon(NsxInst_t* inst,
                                uint16_t* magn,
                                uint32_t* noise,
                                int16_t* q_noise) {
@ -302,7 +300,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
    if (counter >= END_STARTUP_LONG) {
      inst->noiseEstCounter[s] = 0;
      if (inst->blockIndex >= END_STARTUP_LONG) {
-        UpdateNoiseEstimate(inst, offset);
+        UpdateNoiseEstimateNeon(inst, offset);
      }
    }
    inst->noiseEstCounter[s]++;
@ -311,7 +309,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
  // Sequentially update the noise during startup
  if (inst->blockIndex < END_STARTUP_LONG) {
-    UpdateNoiseEstimate(inst, offset);
+    UpdateNoiseEstimateNeon(inst, offset);
  }
  for (i = 0; i < inst->magnLen; i++) {
@ -321,7 +319,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
 }
 // Filter the data in the frequency domain, and create spectrum.
-void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
+static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
  // (1) Filtering.
@ -338,7 +336,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
  uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
  // Filter the rest in the frequency domain.
-  for (; ptr_real < &inst->real[inst->magnLen - 1]; ) {
+  for (; ptr_real < &inst->real[inst->magnLen - 1];) {
    // Loop unrolled once. Both pointers are incremented by 4 twice.
    __asm__ __volatile__(
      "vld1.16 d20, [%[ptr_real]]\n\t"
@ -400,7 +398,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
  int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
  ptr_real = &inst->real[1];
  ptr_imag = &inst->imag[1];
-  for (; ptr_real < &inst->real[inst->anaLen2 - 11]; ) {
+  for (; ptr_real < &inst->real[inst->anaLen2 - 11];) {
    // Loop unrolled once. All pointers are incremented twice.
    __asm__ __volatile__(
      "vld1.16 d22, [%[ptr_real]]!\n\t"
@ -456,13 +454,13 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
 }
 // Denormalize the input buffer.
-__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
+static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) {
  int16_t* ptr_real = &inst->real[0];
  int16_t* ptr_in = &in[0];
  __asm__ __volatile__("vdup.32 q10, %0" ::
                       "r"((int32_t)(factor - inst->normData)) : "q10");
-  for (; ptr_real < &inst->real[inst->anaLen]; ) {
+  for (; ptr_real < &inst->real[inst->anaLen];) {
    // Loop unrolled once. Both pointers are incremented.
    __asm__ __volatile__(
@ -495,7 +493,7 @@ __inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
 // For the noise supress process, synthesis, read out fully processed segment,
 // and update synthesis buffer.
-void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
+static void SynthesisUpdateNeon(NsxInst_t* inst,
                                int16_t* out_frame,
                                int16_t gain_factor) {
  int16_t* ptr_real = &inst->real[0];
@ -505,7 +503,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
  // synthesis
  __asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
  // Loop unrolled once. All pointers are incremented in the assembly code.
-  for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
+  for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
    __asm__ __volatile__(
      // Load variables.
      "vld1.16 d22, [%[ptr_real]]!\n\t"
@ -553,7 +551,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
  int16_t* ptr_out = &out_frame[0];
  ptr_syn = &inst->synthesisBuffer[0];
  // read out fully processed segment
-  for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms]; ) {
+  for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) {
    // Loop unrolled once. Both pointers are incremented in the assembly code.
    __asm__ __volatile__(
      // out_frame[i] = inst->synthesisBuffer[i]; // Q0
@ -575,7 +573,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
  //                      inst->anaLen - inst->blockLen10ms);
  ptr_out = &inst->synthesisBuffer[0],
  ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
-  for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
+  for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
    // Loop unrolled once. Both pointers are incremented in the assembly code.
    __asm__ __volatile__(
      "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
@ -593,7 +591,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
  // WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
  //    + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
  __asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
-  for (; ptr_out < &inst->synthesisBuffer[inst->anaLen]; ) {
+  for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) {
    // Loop unrolled once. Pointer is incremented in the assembly code.
    __asm__ __volatile__(
      "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
@ -606,7 +604,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
 }
 // Update analysis buffer for lower band, and window data before FFT.
-void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
+static void AnalysisUpdateNeon(NsxInst_t* inst,
                               int16_t* out,
                               int16_t* new_speech) {
@ -617,7 +615,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
  // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
  //                      inst->analysisBuffer + inst->blockLen10ms,
  //                      inst->anaLen - inst->blockLen10ms);
-  for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms]; ) {
+  for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
    // Loop unrolled once, so both pointers are incremented by 8 twice.
    __asm__ __volatile__(
      "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
@ -633,7 +631,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
  // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
  //    + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
-  for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen]; ) {
+  for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
    // Loop unrolled once, so both pointers are incremented by 8 twice.
    __asm__ __volatile__(
      "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
@ -651,7 +649,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
  int16_t* ptr_window = &inst->window[0];
  ptr_out = &out[0];
  ptr_ana = &inst->analysisBuffer[0];
-  for (; ptr_out < &out[inst->anaLen]; ) {
+  for (; ptr_out < &out[inst->anaLen];) {
    // Loop unrolled once, so all pointers are incremented by 4 twice.
    __asm__ __volatile__(
@ -683,7 +681,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
 // zeros, and normalize it.
-__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
+static __inline void CreateComplexBufferNeon(NsxInst_t* inst,
                                             int16_t* in,
                                             int16_t* out) {
  int16_t* ptr_out = &out[0];
@ -691,7 +689,7 @@ __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
  __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
  __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
-  for (; ptr_in < &in[inst->anaLen]; ) {
+  for (; ptr_in < &in[inst->anaLen];) {
    // Loop unrolled once, so ptr_in is incremented by 8 twice,
    // and ptr_out is incremented by 8 four times.
@ -724,4 +722,12 @@ __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
    );
  }
 }
-#endif // defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
+
 void WebRtcNsx_InitNeon(void) {
  WebRtcNsx_NoiseEstimation = NoiseEstimationNeon;
  WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon;
  WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon;
  WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon;
  WebRtcNsx_Denormalize = DenormalizeNeon;
  WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon;
 }
--- a/src/system_wrappers/interface/cpu_features_wrapper.h
+++ b/src/system_wrappers/interface/cpu_features_wrapper.h
@ -15,18 +15,33 @@
 extern "C" {
 #endif
-// list of features.
+#include <typedefs.h>
 // List of features in x86.
 typedef enum {
  kSSE2,
  kSSE3
 } CPUFeature;
 // List of features in ARM.
 enum {
  kCPUFeatureARMv7       = (1 << 0),
  kCPUFeatureVFPv3       = (1 << 1),
  kCPUFeatureNEON        = (1 << 2),
  kCPUFeatureLDREXSTREX  = (1 << 3)
 };
 typedef int (*WebRtc_CPUInfo)(CPUFeature feature);
 // returns true if the CPU supports the feature.
 extern WebRtc_CPUInfo WebRtc_GetCPUInfo;
 // No CPU feature is available => straight C path.
 extern WebRtc_CPUInfo WebRtc_GetCPUInfoNoASM;
 // Return the features in an ARM device.
 // It detects the features in the hardware platform, and returns supported 
 // values in the above enum definition as a bitmask.
 extern uint64_t WebRtc_GetCPUFeaturesARM(void);
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/system_wrappers/source/Android.mk
+++ b/src/system_wrappers/source/Android.mk
@ -25,6 +25,7 @@ LOCAL_SRC_FILES := \
    condition_variable.cc \
    cpu_dummy.cc \
    cpu_features.cc \
    cpu_features_arm.c \
    cpu_info.cc \
    critical_section.cc \
    event.cc \
--- a/src/system_wrappers/source/cpu_features_arm.c
+++ b/src/system_wrappers/source/cpu_features_arm.c
@ -0,0 +1,333 @@
 /*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 // This file is derived from Android's NDK package r7, located at
 // <ndk>/sources/android/cpufeatures/ (downloadable from
 // http://developer.android.com/sdk/ndk/index.html).
 #include "cpu_features_wrapper.h"
 #include <fcntl.h>
 #include <errno.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 // Define CPU family.
 typedef enum {
  CPU_FAMILY_UNKNOWN = 0,
  CPU_FAMILY_ARM,
  CPU_FAMILY_X86,
  CPU_FAMILY_MAX  // Do not remove.
 } CpuFamily;
 static pthread_once_t g_once;
 static CpuFamily g_cpuFamily;
 static uint64_t g_cpuFeatures;
 static int g_cpuCount;
 static const int cpufeatures_debug = 0;
 #ifdef __arm__
 #  define DEFAULT_CPU_FAMILY  CPU_FAMILY_ARM
 #elif defined __i386__
 #  define DEFAULT_CPU_FAMILY  CPU_FAMILY_X86
 #else
 #  define DEFAULT_CPU_FAMILY  CPU_FAMILY_UNKNOWN
 #endif
 #define  D(...) \
  do { \
    if (cpufeatures_debug) { \
      printf(__VA_ARGS__); fflush(stdout); \
    } \
  } while (0)
 /* Read the content of /proc/cpuinfo into a user-provided buffer.
 * Return the length of the data, or -1 on error. Does *not*
 * zero-terminate the content. Will not read more
 * than 'buffsize' bytes.
 */
 static int read_file(const char*  pathname, char*  buffer, size_t  buffsize) {
  int  fd, len;
  fd = open(pathname, O_RDONLY);
  if (fd < 0)
    return -1;
  do {
    len = read(fd, buffer, buffsize);
  } while (len < 0 && errno == EINTR);
  close(fd);
  return len;
 }
 /* Extract the content of a the first occurence of a given field in
 * the content of /proc/cpuinfo and return it as a heap-allocated
 * string that must be freed by the caller.
 *
 * Return NULL if not found
 */
 static char* extract_cpuinfo_field(char* buffer, int buflen, const char* field) {
  int  fieldlen = strlen(field);
  char* bufend = buffer + buflen;
  char* result = NULL;
  int len, ignore;
  const char* p, *q;
  /* Look for first field occurence, and ensures it starts the line.
   */
  p = buffer;
  bufend = buffer + buflen;
  for (;;) {
    p = memmem(p, bufend - p, field, fieldlen);
    if (p == NULL)
      goto EXIT;
    if (p == buffer || p[-1] == '\n')
      break;
    p += fieldlen;
  }
  /* Skip to the first column followed by a space */
  p += fieldlen;
  p  = memchr(p, ':', bufend - p);
  if (p == NULL || p[1] != ' ')
    goto EXIT;
  /* Find the end of the line */
  p += 2;
  q = memchr(p, '\n', bufend - p);
  if (q == NULL)
    q = bufend;
  /* Copy the line into a heap-allocated buffer */
  len = q - p;
  result = malloc(len + 1);
  if (result == NULL)
    goto EXIT;
  memcpy(result, p, len);
  result[len] = '\0';
 EXIT:
  return result;
 }
 /* Count the number of occurences of a given field prefix in /proc/cpuinfo.
 */
 static int count_cpuinfo_field(char* buffer, int buflen, const char* field) {
  int fieldlen = strlen(field);
  const char* p = buffer;
  const char* bufend = buffer + buflen;
  const char* q;
  int count = 0;
  for (;;) {
    const char* q;
    p = memmem(p, bufend - p, field, fieldlen);
    if (p == NULL)
      break;
    /* Ensure that the field is at the start of a line */
    if (p > buffer && p[-1] != '\n') {
      p += fieldlen;
      continue;
    }
    /* skip any whitespace */
    q = p + fieldlen;
    while (q < bufend && (*q == ' ' || *q == '\t'))
      q++;
    /* we must have a colon now */
    if (q < bufend && *q == ':') {
      count += 1;
      q ++;
    }
    p = q;
  }
  return count;
 }
 /* Like strlen(), but for constant string literals */
 #define STRLEN_CONST(x)  ((sizeof(x)-1)
 /* Checks that a space-separated list of items contains one given 'item'.
 * Returns 1 if found, 0 otherwise.
 */
 static int has_list_item(const char* list, const char* item) {
  const char*  p = list;
  int itemlen = strlen(item);
  if (list == NULL)
    return 0;
  while (*p) {
    const char*  q;
    /* skip spaces */
    while (*p == ' ' || *p == '\t')
      p++;
    /* find end of current list item */
    q = p;
    while (*q && *q != ' ' && *q != '\t')
      q++;
    if (itemlen == q - p && !memcmp(p, item, itemlen))
      return 1;
    /* skip to next item */
    p = q;
  }
  return 0;
 }
 static void cpuInit(void) {
  char cpuinfo[4096];
  int  cpuinfo_len;
  g_cpuFamily   = DEFAULT_CPU_FAMILY;
  g_cpuFeatures = 0;
  g_cpuCount    = 1;
  cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof cpuinfo);
  D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
    cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
  if (cpuinfo_len < 0) { /* should not happen */
    return;
  }
  /* Count the CPU cores, the value may be 0 for single-core CPUs */
  g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "processor");
  if (g_cpuCount == 0) {
    g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "Processor");
    if (g_cpuCount == 0) {
      g_cpuCount = 1;
    }
  }
  D("found cpuCount = %d\n", g_cpuCount);
 #ifdef __arm__
  {
    char*  features = NULL;
    char*  architecture = NULL;
    /* Extract architecture from the "CPU Architecture" field.
     * The list is well-known, unlike the the output of
     * the 'Processor' field which can vary greatly.
     *
     * See the definition of the 'proc_arch' array in
     * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
     * same file.
     */
    char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
                                          "CPU architecture");
    if (cpuArch != NULL) {
      char*  end;
      long   archNumber;
      int    hasARMv7 = 0;
      D("found cpuArch = '%s'\n", cpuArch);
      /* read the initial decimal number, ignore the rest */
      archNumber = strtol(cpuArch, &end, 10);
      /* Here we assume that ARMv8 will be upwards compatible with v7
          * in the future. Unfortunately, there is no 'Features' field to
          * indicate that Thumb-2 is supported.
          */
      if (end > cpuArch && archNumber >= 7) {
        hasARMv7 = 1;
      }
      /* Unfortunately, it seems that certain ARMv6-based CPUs
       * report an incorrect architecture number of 7!
       *
       * We try to correct this by looking at the 'elf_format'
       * field reported by the 'Processor' field, which is of the
       * form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
       * an ARMv6-one.
       */
      if (hasARMv7) {
        char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
                                              "Processor");
        if (cpuProc != NULL) {
          D("found cpuProc = '%s'\n", cpuProc);
          if (has_list_item(cpuProc, "(v6l)")) {
            D("CPU processor and architecture mismatch!!\n");
            hasARMv7 = 0;
          }
          free(cpuProc);
        }
      }
      if (hasARMv7) {
        g_cpuFeatures |= kCPUFeatureARMv7;
      }
      /* The LDREX / STREX instructions are available from ARMv6 */
      if (archNumber >= 6) {
        g_cpuFeatures |= kCPUFeatureLDREXSTREX;
      }
      free(cpuArch);
    }
    /* Extract the list of CPU features from 'Features' field */
    char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
                                              "Features");
    if (cpuFeatures != NULL) {
      D("found cpuFeatures = '%s'\n", cpuFeatures);
      if (has_list_item(cpuFeatures, "vfpv3"))
        g_cpuFeatures |= kCPUFeatureVFPv3;
      else if (has_list_item(cpuFeatures, "vfpv3d16"))
        g_cpuFeatures |= kCPUFeatureVFPv3;
      if (has_list_item(cpuFeatures, "neon")) {
        /* Note: Certain kernels only report neon but not vfpv3
            *       in their features list. However, ARM mandates
            *       that if Neon is implemented, so must be VFPv3
            *       so always set the flag.
            */
        g_cpuFeatures |= kCPUFeatureNEON |
                         kCPUFeatureVFPv3;
      }
      free(cpuFeatures);
    }
  }
 #endif  // __arm__
 #ifdef __i386__
  g_cpuFamily = CPU_FAMILY_X86;
 #endif
 }
 uint64_t WebRtc_GetCPUFeaturesARM(void) {
  pthread_once(&g_once, cpuInit);
  return g_cpuFeatures;
 }