1st check-in for AECM Neon optimization.

Review URL: http://webrtc-codereview.appspot.com/104001

git-svn-id: http://webrtc.googlecode.com/svn/trunk@359 4adac7df-926f-26a2-2b94-8c16560cd09d
kma@google.com 2011-08-13 06:33:38 +00:00
parent 4033e1245d
commit 1959e6fcb5
4 changed files with 399 additions and 113 deletions

Android.mk

@ -25,6 +25,11 @@ LOCAL_SRC_FILES := \
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS)
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += aecm_core_neon.c
LOCAL_CFLAGS += $(CFLAGS_NEON)
endif
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/../interface \
$(LOCAL_PATH)/../../../utility \

aecm_core.c

@ -31,7 +31,7 @@ FILE *testfile;
#ifdef AECM_SHORT
// Square root of Hanning window in Q14
static const WebRtc_Word16 kSqrtHanning[] =
const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] =
{
0, 804, 1606, 2404, 3196, 3981, 4756, 5520,
6270, 7005, 7723, 8423, 9102, 9760, 10394, 11003,
@ -43,12 +43,15 @@ static const WebRtc_Word16 kSqrtHanning[] =
#else
// Square root of Hanning window in Q14
static const WebRtc_Word16 kSqrtHanning[] = {0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364,
8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335,
12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918,
15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237,
16286, 16325, 16354, 16373, 16384};
const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] __attribute__ ((aligned (8))) =
{
0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364,
8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335,
12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918,
15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237,
16286, 16325, 16354, 16373, 16384
};
#endif
@ -98,11 +101,6 @@ static const WebRtc_Word16 kNoiseEstIncCount = 5;
HANDLE logFile = NULL;
#endif
static void WebRtcAecm_ComfortNoise(AecmCore_t* const aecm, const WebRtc_UWord16 * const dfa,
WebRtc_Word16 * const outReal,
WebRtc_Word16 * const outImag,
const WebRtc_Word16 * const lambda);
int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
{
AecmCore_t *aecm = malloc(sizeof(AecmCore_t));
@ -147,6 +145,18 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
return -1;
}
// Init some aecm pointers. 16-byte alignment is only necessary for Neon code currently.
aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 15) & ~ 15);
aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 15) & ~ 15);
aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 15) & ~ 15);
aecm->outBuf = (WebRtc_Word16*) (((uintptr_t)aecm->outBuf_buf + 15) & ~ 15);
aecm->channelStored = (WebRtc_Word16*) (((uintptr_t)
aecm->channelStored_buf + 15) & ~ 15);
aecm->channelAdapt16 = (WebRtc_Word16*) (((uintptr_t)
aecm->channelAdapt16_buf + 15) & ~ 15);
aecm->channelAdapt32 = (WebRtc_Word32*) (((uintptr_t)
aecm->channelAdapt32_buf + 31) & ~ 31);
return 0;
}
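The expression (((uintptr_t) p + 15) & ~15) rounds a pointer up to the next 16-byte boundary; the extra 8 WebRtc_Word16 elements (16 bytes) reserved in each *_buf array guarantee the rounded pointer still covers the logical buffer length. A minimal standalone sketch of the idiom (hypothetical helper, not part of this patch):

#include <stdint.h>

/* Hypothetical helper showing the round-up-to-16-bytes idiom used in
 * WebRtcAecm_CreateCore(). Adding 15 and clearing the low 4 bits moves the
 * address to the next multiple of 16 (e.g. 0x1004 -> 0x1010) and leaves an
 * already aligned address unchanged (0x1010 -> 0x1010). */
static int16_t* AlignTo16Bytes(int16_t* p)
{
    return (int16_t*) (((uintptr_t) p + 15) & ~(uintptr_t) 15);
}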
@ -209,10 +219,10 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
WebRtcApm_InitBuffer(aecm->nearCleanFrameBuf);
WebRtcApm_InitBuffer(aecm->outFrameBuf);
memset(aecm->xBuf, 0, sizeof(aecm->xBuf));
memset(aecm->dBufClean, 0, sizeof(aecm->dBufClean));
memset(aecm->dBufNoisy, 0, sizeof(aecm->dBufNoisy));
memset(aecm->outBuf, 0, sizeof(aecm->outBuf));
memset(aecm->xBuf_buf, 0, sizeof(aecm->xBuf_buf));
memset(aecm->dBufClean_buf, 0, sizeof(aecm->dBufClean_buf));
memset(aecm->dBufNoisy_buf, 0, sizeof(aecm->dBufNoisy_buf));
memset(aecm->outBuf_buf, 0, sizeof(aecm->outBuf_buf));
aecm->seed = 666;
aecm->totCount = 0;
@ -287,6 +297,8 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
aecm->supGainErrParamDiffAB = SUPGAIN_ERROR_PARAM_A - SUPGAIN_ERROR_PARAM_B;
aecm->supGainErrParamDiffBD = SUPGAIN_ERROR_PARAM_B - SUPGAIN_ERROR_PARAM_D;
assert(PART_LEN % 16 == 0);
return 0;
}
@ -481,18 +493,8 @@ void WebRtcAecm_CalcEnergies(AecmCore_t * aecm,
aecm->nearLogEnergy[0] = tmp16;
// END: Get log of near end energy
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
for (i = 0; i < PART_LEN1; i++)
{
// Get estimated echo energies for adaptive channel and stored channel
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
tmpFar += (WebRtc_UWord32)(far_spectrum[i]);
tmpAdapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
far_spectrum[i]);
tmpStored += (WebRtc_UWord32)echoEst[i];
}
WebRtcAecm_CalcLinearEnergies(aecm, far_spectrum, echoEst, &tmpFar, &tmpAdapt, &tmpStored);
// Shift buffers
memmove(aecm->echoAdaptLogEnergy + 1, aecm->echoAdaptLogEnergy,
sizeof(WebRtc_Word16) * (MAX_BUF_LEN - 1));
@ -814,22 +816,9 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
// Determine if we should store or restore the channel
if ((aecm->startupState == 0) & (aecm->currentVADValue))
{
// During startup we store the channel every block.
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
// During startup we store the channel every block,
// and recalculate the echo estimate.
WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst);
} else
{
if (aecm->farLogEnergy < aecm->farEnergyMSE)
@ -865,43 +854,14 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
{
// The stored channel has a significantly lower MSE than the adaptive one for
// two consecutive calculations. Reset the adaptive channel.
memcpy(aecm->channelAdapt16, aecm->channelStored,
sizeof(WebRtc_Word16) * PART_LEN1);
// Restore the W32 channel
for (i = 0; i < PART_LEN; i += 4)
{
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
}
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
WebRtcAecm_ResetAdaptiveChannel(aecm);
} else if (((MIN_MSE_DIFF * mseStored) > (mseAdapt << MSE_RESOLUTION)) & (mseAdapt
< aecm->mseThreshold) & (aecm->mseAdaptOld < aecm->mseThreshold))
{
// The adaptive channel has a significantly lower MSE than the stored one.
// The MSE for the adaptive channel has also been low for two consecutive
// calculations. Store the adaptive channel.
memcpy(aecm->channelStored, aecm->channelAdapt16,
sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst);
// Update threshold
if (aecm->mseThreshold == WEBRTC_SPL_WORD32_MAX)
@ -1032,7 +992,9 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
WebRtc_Word32 tmp32no1;
WebRtc_Word32 tmp32no2;
WebRtc_Word16 fft[PART_LEN4];
// In fft_buf, +8 for 16-byte alignment, and +2 to make some loops safe.
WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 8];
WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 15) & ~15);
WebRtc_Word16 tmp16no1;
WebRtc_Word16 tmp16no2;
@ -1048,23 +1010,7 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
#endif
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
kSqrtHanning[i],
14);
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[PART_LEN + i] << time_signal_scaling),
kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
WebRtcAecm_PrepareFft(fft, time_signal, time_signal_scaling);
// Fourier transformation of time domain signal.
// The result is scaled with 1/PART_LEN2, that is, the result is in Q(-6)
@ -1187,7 +1133,10 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
WebRtc_UWord32 tmpU32;
WebRtc_Word32 tmp32no1;
WebRtc_Word32 echoEst32[PART_LEN1];
// +8 for 32-byte alignment.
WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8];
WebRtc_Word32 *echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31);
WebRtc_UWord16 xfa[PART_LEN1];
WebRtc_UWord16 dfaNoisy[PART_LEN1];
@ -1540,9 +1489,9 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
for (i = 0; i < PART_LEN1; i++)
{
efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i],
hnl[i], 14));
hnl[i], 14));
efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i],
hnl[i], 14));
hnl[i], 14));
}
}
@ -1595,7 +1544,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
{
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
fft[i],
kSqrtHanning[i],
WebRtcAecm_kSqrtHanning[i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
outCFFT - aecm->dfaCleanQDomain);
@ -1606,7 +1555,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
fft[PART_LEN + i],
kSqrtHanning[PART_LEN - i],
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
outCFFT - aecm->dfaCleanQDomain);
@ -1623,7 +1572,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
milliseconds = (unsigned int)(diff__ & 0xffffffff);
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
#endif
// Copy the current block to the old position (outBuf is shifted elsewhere)
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
if (nearendClean != NULL)
@ -1634,6 +1583,105 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
return 0;
}
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
int time_signal_scaling)
{
int i, j;
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[i],
14);
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[PART_LEN + i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
}
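Since the window coefficients are in Q14, the multiply-and-shift keeps each windowed sample in its input Q-domain. A minimal fixed-point sketch (hypothetical helper; it assumes WEBRTC_SPL_MUL_16_16_RSFT(a, b, 14) is a 16x16 multiply followed by an arithmetic right shift of 14):

#include <stdint.h>

/* Hypothetical illustration of the Q14 windowing in WebRtcAecm_PrepareFft().
 * time_signal_scaling is chosen elsewhere (WebRtcSpl_NormW16) so the shifted
 * sample still fits in 16 bits. With the peak coefficient 16384 (1.0 in Q14)
 * the sample passes through unchanged: (1000 * 16384) >> 14 == 1000. */
static int16_t WindowSampleQ14(int16_t sample, int scaling, int16_t win_q14)
{
    int16_t scaled = (int16_t) (sample << scaling);
    return (int16_t) (((int32_t) scaled * win_q14) >> 14);
}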
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
for (i = 0; i < PART_LEN1; i++)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
far_spectrum[i]);
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
}
}
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
// During startup we store the channel every block.
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
}
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
{
int i;
// The stored channel has a significantly lower MSE than the adaptive one for
// two consecutive calculations. Reset the adaptive channel.
memcpy(aecm->channelAdapt16, aecm->channelStored,
sizeof(WebRtc_Word16) * PART_LEN1);
// Restore the W32 channel
for (i = 0; i < PART_LEN; i += 4)
{
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
}
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
}
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
// Generate comfort noise and add to output signal.
//
// \param[in] aecm Handle of the AECM instance.
@ -1642,11 +1690,11 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
// \param[in,out] outImag Imaginary part of the output signal (Q[aecm->dfaQDomain]).
// \param[in] lambda Suppression gain with which to scale the noise level (Q14).
//
static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm,
const WebRtc_UWord16 * const dfa,
WebRtc_Word16 * const outReal,
WebRtc_Word16 * const outImag,
const WebRtc_Word16 * const lambda)
void WebRtcAecm_ComfortNoise(AecmCore_t * aecm,
const WebRtc_UWord16* dfa,
WebRtc_Word16* outReal,
WebRtc_Word16* outImag,
const WebRtc_Word16* lambda)
{
WebRtc_Word16 i;
WebRtc_Word16 tmp16;
@ -1792,7 +1840,8 @@ static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm,
#endif
}
void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * const farend,
void WebRtcAecm_BufferFarFrame(AecmCore_t* const aecm,
const WebRtc_Word16* const farend,
const int farLen)
{
int writeLen = farLen, writePos = 0;

aecm_core.h

@ -97,6 +97,8 @@
#define NLP_COMP_LOW 3277 // 0.2 in Q14
#define NLP_COMP_HIGH ONE_Q14 // 1 in Q14
extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[];
typedef struct
{
int farBufWritePos;
@ -110,11 +112,6 @@ typedef struct
void *nearCleanFrameBuf;
void *outFrameBuf;
WebRtc_Word16 xBuf[PART_LEN2]; // farend
WebRtc_Word16 dBufClean[PART_LEN2]; // nearend
WebRtc_Word16 dBufNoisy[PART_LEN2]; // nearend
WebRtc_Word16 outBuf[PART_LEN];
WebRtc_Word16 farBuf[FAR_BUF_LEN];
WebRtc_Word16 mult;
@ -139,9 +136,26 @@ typedef struct
WebRtc_Word16 echoAdaptLogEnergy[MAX_BUF_LEN];
WebRtc_Word16 echoStoredLogEnergy[MAX_BUF_LEN];
WebRtc_Word16 channelAdapt16[PART_LEN1];
WebRtc_Word32 channelAdapt32[PART_LEN1];
WebRtc_Word16 channelStored[PART_LEN1];
// The extra 16 or 32 bytes in the following buffers are for the alignment required by the
// Neon code. It's designed this way since the current GCC compiler can't properly align a
// buffer on 16- or 32-byte boundaries.
WebRtc_Word16 channelStored_buf[PART_LEN1 + 8];
WebRtc_Word16 channelAdapt16_buf[PART_LEN1 + 8];
WebRtc_Word32 channelAdapt32_buf[PART_LEN1 + 8];
WebRtc_Word16 xBuf_buf[PART_LEN2 + 8]; // farend
WebRtc_Word16 dBufClean_buf[PART_LEN2 + 8]; // nearend
WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 8]; // nearend
WebRtc_Word16 outBuf_buf[PART_LEN + 8];
// Pointers to the above buffers
WebRtc_Word16 *channelStored;
WebRtc_Word16 *channelAdapt16;
WebRtc_Word32 *channelAdapt32;
WebRtc_Word16 *xBuf;
WebRtc_Word16 *dBufClean;
WebRtc_Word16 *dBufNoisy;
WebRtc_Word16 *outBuf;
WebRtc_Word32 echoFilt[PART_LEN1];
WebRtc_Word16 nearFilt[PART_LEN1];
WebRtc_Word32 noiseEst[PART_LEN1];
@ -308,4 +322,27 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co
void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend,
const int farLen, const int knownDelay);
///////////////////////////////////////////////////////////////////////////////////////////////
// Some internal functions shared by ARM NEON and generic C code:
//
WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t * aecm);
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echoEst,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored);
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est);
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
int time_signal_scaling);
#endif

aecm_core_neon.c (new file)

@ -0,0 +1,195 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
#include "aecm_core.h"
#include <arm_neon.h>
#include <assert.h>
#include <stdlib.h>
#include "aecm_delay_estimator.h"
#include "echo_control_mobile.h"
#include "ring_buffer.h"
#include "typedefs.h"
// Square root of Hanning window in Q14, stored in reverse order so the loop below can
// read it with increasing indices.
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {
16384, 16373, 16354, 16325,
16286, 16237, 16179, 16111,
16034, 15947, 15851, 15746,
15631, 15506, 15373, 15231,
15079, 14918, 14749, 14571,
14384, 14189, 13985, 13773,
13553, 13325, 13089, 12845,
12594, 12335, 12068, 11795,
11514, 11227, 10933, 10633,
10326, 10013, 9695, 9370,
9040, 8705, 8364, 8019,
7668, 7313, 6954, 6591,
6224, 5853, 5478, 5101,
4720, 4337, 3951, 3562,
3172, 2780, 2386, 1990,
1594, 1196, 798, 399
};
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echoEst,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
register WebRtc_UWord32 far_energy_r;
register WebRtc_UWord32 echo_energy_stored_r;
register WebRtc_UWord32 echo_energy_adapt_r;
uint32x4_t tmp32x4_0;
__asm__("vmov.i32 q14, #0" : : : "q14"); //far_energy
__asm__("vmov.i32 q8, #0" : : : "q8"); //echo_energy_stored
__asm__("vmov.i32 q9, #0" : : : "q9"); //echo_energy_adapt
for(i = 0; i < PART_LEN -7; i += 8)
{
//far_energy += (WebRtc_UWord32)(far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
// Get estimated echo energies for adaptive channel and stored channel
//echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echoEst[i]): "q10", "q11");
//echo_energy_stored += (WebRtc_UWord32)echoEst[i];
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
//echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vadd.u32 q9, q10" : : : "q9", "q15");
__asm__("vadd.u32 q9, q11" : : : "q9", "q11");
}
__asm__("vadd.u32 d28, d29" : : : "q14");
__asm__("vpadd.u32 d28, d28" : : : "q14");
__asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
__asm__("vadd.u32 d18, d19" : : : "q9");
__asm__("vpadd.u32 d18, d18" : : : "q9");
__asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
__asm__("vadd.u32 d16, d17" : : : "q8");
__asm__("vpadd.u32 d16, d16" : : : "q8");
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
// Get estimated echo energies for adaptive channel and stored channel
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echoEst[i];
*far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
*echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
aecm->channelAdapt16[i], far_spectrum[i]);
}
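The vadd.u32 / vpadd.u32 / vmov.32 sequence above collapses each 128-bit accumulator into a scalar. The same reduction written with arm_neon.h intrinsics, as a hypothetical helper (not part of this patch):

#include <arm_neon.h>
#include <stdint.h>

/* Sum the four 32-bit lanes of a Neon accumulator: add the two halves,
 * pairwise-add the result, then extract lane 0. */
static uint32_t HorizontalSumU32(uint32x4_t acc)
{
    uint32x2_t sum2 = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
    sum2 = vpadd_u32(sum2, sum2);
    return vget_lane_u32(sum2, 0);
}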
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
// During startup we store the channel every block.
// Recalculate echo estimate.
for(i = 0; i < PART_LEN -7; i += 8)
{
// aecm->channelStored[i] = aecm->channelAdapt16[i];
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&echo_est[i]) : "q10", "q11");
}
aecm->channelStored[i] = aecm->channelAdapt16[i];
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
}
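For comparison, the same loop expressed with arm_neon.h intrinsics rather than inline assembly; a sketch only (not part of this patch), assuming aecm_core.h provides AecmCore_t, PART_LEN and WEBRTC_SPL_MUL_16_U16, and that the channel buffers are the aligned pointers set up in WebRtcAecm_CreateCore():

#include <arm_neon.h>
#include "aecm_core.h"

static void StoreAdaptiveChannelIntrinsics(AecmCore_t* aecm,
                                           const WebRtc_UWord16* far_spectrum,
                                           WebRtc_Word32* echo_est)
{
    int i;
    for (i = 0; i < PART_LEN - 7; i += 8)
    {
        // Copy 8 taps of the adaptive channel into the stored channel.
        uint16x8_t far_v = vld1q_u16((const uint16_t*) &far_spectrum[i]);
        int16x8_t adapt_v = vld1q_s16(&aecm->channelAdapt16[i]);
        vst1q_s16(&aecm->channelStored[i], adapt_v);
        // Recalculate the echo estimate: channelStored[i] * far_spectrum[i].
        uint16x8_t adapt_u = vreinterpretq_u16_s16(adapt_v);
        uint32x4_t est_lo = vmull_u16(vget_low_u16(adapt_u), vget_low_u16(far_v));
        uint32x4_t est_hi = vmull_u16(vget_high_u16(adapt_u), vget_high_u16(far_v));
        vst1q_s32(&echo_est[i], vreinterpretq_s32_u32(est_lo));
        vst1q_s32(&echo_est[i + 4], vreinterpretq_s32_u32(est_hi));
    }
    // Last bin (i == PART_LEN), handled as in the scalar tail above.
    aecm->channelStored[i] = aecm->channelAdapt16[i];
    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
}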
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
{
int i;
for(i = 0; i < PART_LEN -7; i += 8)
{
// aecm->channelAdapt16[i] = aecm->channelStored[i];
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
// aecm->channelStored[i], 16);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelStored[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
__asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->channelAdapt32[i]): "q10", "q11");
}
aecm->channelAdapt16[i] = aecm->channelStored[i];
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
}
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
int time_signal_scaling)
{
int i, j;
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
__asm__("vmov.i16 d21, #0" ::: "d21");
for(i = 0, j = 0; i < PART_LEN-3; i += 4, j += 8)
{
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
/* Window near end */
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
// (time_signal[PART_LEN + i] << time_signal_scaling),
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[PART_LEN + i]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
}
}
#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
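The vst2.16 store in WebRtcAecm_PrepareFft() interleaves the windowed real samples (d20) with the zero vector kept in d21, so the imaginary slots of |fft| are written as zeros in the same instruction. A hypothetical intrinsics rendering of just that store pattern (not part of this patch):

#include <arm_neon.h>
#include <stdint.h>

/* Write 4 real samples to the even indices of a complex int16 buffer and
 * zero the 4 interleaved imaginary slots in a single vst2 store. */
static void StoreRealWithZeroImag(int16_t* complex_out, int16x4_t real4)
{
    int16x4x2_t pair;
    pair.val[0] = real4;          /* real parts -> fft[j], fft[j + 2], ... */
    pair.val[1] = vdup_n_s16(0);  /* imag parts -> fft[j + 1], fft[j + 3], ... */
    vst2_s16(complex_out, pair);
}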