From 1959e6fcb54da8d9979af50eeda83f282b75351b Mon Sep 17 00:00:00 2001 From: "kma@google.com" Date: Sat, 13 Aug 2011 06:33:38 +0000 Subject: [PATCH] 1st check-in for AECM Neon optimization. Review URL: http://webrtc-codereview.appspot.com/104001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@359 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../aecm/main/source/Android.mk | 5 + .../aecm/main/source/aecm_core.c | 259 +++++++++++------- .../aecm/main/source/aecm_core.h | 53 +++- .../aecm/main/source/aecm_core_neon.c | 195 +++++++++++++ 4 files changed, 399 insertions(+), 113 deletions(-) create mode 100644 src/modules/audio_processing/aecm/main/source/aecm_core_neon.c diff --git a/src/modules/audio_processing/aecm/main/source/Android.mk b/src/modules/audio_processing/aecm/main/source/Android.mk index c0d24788c..b295061fc 100644 --- a/src/modules/audio_processing/aecm/main/source/Android.mk +++ b/src/modules/audio_processing/aecm/main/source/Android.mk @@ -25,6 +25,11 @@ LOCAL_SRC_FILES := \ LOCAL_CFLAGS := \ $(MY_WEBRTC_COMMON_DEFS) +ifeq ($(ARCH_ARM_HAVE_NEON),true) + LOCAL_SRC_FILES += aecm_core_neon.c + LOCAL_CFLAGS += $(CFLAGS_NEON) +endif + LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/../interface \ $(LOCAL_PATH)/../../../utility \ diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core.c b/src/modules/audio_processing/aecm/main/source/aecm_core.c index 842dd2da1..8947f53fe 100644 --- a/src/modules/audio_processing/aecm/main/source/aecm_core.c +++ b/src/modules/audio_processing/aecm/main/source/aecm_core.c @@ -31,7 +31,7 @@ FILE *testfile; #ifdef AECM_SHORT // Square root of Hanning window in Q14 -static const WebRtc_Word16 kSqrtHanning[] = +const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] = { 0, 804, 1606, 2404, 3196, 3981, 4756, 5520, 6270, 7005, 7723, 8423, 9102, 9760, 10394, 11003, @@ -43,12 +43,15 @@ static const WebRtc_Word16 kSqrtHanning[] = #else // Square root of Hanning window in Q14 -static const WebRtc_Word16 kSqrtHanning[] = {0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172, - 3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364, - 8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335, - 12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918, - 15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237, - 16286, 16325, 16354, 16373, 16384}; +const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] __attribute__ ((aligned (8))) = +{ + 0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172, + 3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364, + 8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335, + 12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918, + 15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237, + 16286, 16325, 16354, 16373, 16384 +}; #endif @@ -98,11 +101,6 @@ static const WebRtc_Word16 kNoiseEstIncCount = 5; HANDLE logFile = NULL; #endif -static void WebRtcAecm_ComfortNoise(AecmCore_t* const aecm, const WebRtc_UWord16 * const dfa, - WebRtc_Word16 * const outReal, - WebRtc_Word16 * const outImag, - const WebRtc_Word16 * const lambda); - int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) { AecmCore_t *aecm = malloc(sizeof(AecmCore_t)); @@ -147,6 +145,18 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) return -1; } + // Init some aecm pointers. 16-byte alignment is only necessary for Neon code currently. 
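+  // Each *_buf array is over-allocated by a few elements (see aecm_core.h);
+  // (addr + 15) & ~15 rounds the buffer start up to the next 16-byte boundary
+  // (e.g. 0x1002 -> 0x1010), and channelAdapt32 uses (addr + 31) & ~31 for a
+  // 32-byte boundary, so the Neon loads/stores can assume aligned pointers.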
+ aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 15) & ~ 15); + aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 15) & ~ 15); + aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 15) & ~ 15); + aecm->outBuf = (WebRtc_Word16*) (((uintptr_t)aecm->outBuf_buf + 15) & ~ 15); + aecm->channelStored = (WebRtc_Word16*) (((uintptr_t) + aecm->channelStored_buf + 15) & ~ 15); + aecm->channelAdapt16 = (WebRtc_Word16*) (((uintptr_t) + aecm->channelAdapt16_buf + 15) & ~ 15); + aecm->channelAdapt32 = (WebRtc_Word32*) (((uintptr_t) + aecm->channelAdapt32_buf + 31) & ~ 31); + return 0; } @@ -209,10 +219,10 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq) WebRtcApm_InitBuffer(aecm->nearCleanFrameBuf); WebRtcApm_InitBuffer(aecm->outFrameBuf); - memset(aecm->xBuf, 0, sizeof(aecm->xBuf)); - memset(aecm->dBufClean, 0, sizeof(aecm->dBufClean)); - memset(aecm->dBufNoisy, 0, sizeof(aecm->dBufNoisy)); - memset(aecm->outBuf, 0, sizeof(aecm->outBuf)); + memset(aecm->xBuf_buf, 0, sizeof(aecm->xBuf_buf)); + memset(aecm->dBufClean_buf, 0, sizeof(aecm->dBufClean_buf)); + memset(aecm->dBufNoisy_buf, 0, sizeof(aecm->dBufNoisy_buf)); + memset(aecm->outBuf_buf, 0, sizeof(aecm->outBuf_buf)); aecm->seed = 666; aecm->totCount = 0; @@ -287,6 +297,8 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq) aecm->supGainErrParamDiffAB = SUPGAIN_ERROR_PARAM_A - SUPGAIN_ERROR_PARAM_B; aecm->supGainErrParamDiffBD = SUPGAIN_ERROR_PARAM_B - SUPGAIN_ERROR_PARAM_D; + assert(PART_LEN % 16 == 0); + return 0; } @@ -481,18 +493,8 @@ void WebRtcAecm_CalcEnergies(AecmCore_t * aecm, aecm->nearLogEnergy[0] = tmp16; // END: Get log of near end energy - // Get energy for the delayed far end signal and estimated - // echo using both stored and adapted channels. - for (i = 0; i < PART_LEN1; i++) - { - // Get estimated echo energies for adaptive channel and stored channel - echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - tmpFar += (WebRtc_UWord32)(far_spectrum[i]); - tmpAdapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], - far_spectrum[i]); - tmpStored += (WebRtc_UWord32)echoEst[i]; - } + WebRtcAecm_CalcLinearEnergies(aecm, far_spectrum, echoEst, &tmpFar, &tmpAdapt, &tmpStored); + // Shift buffers memmove(aecm->echoAdaptLogEnergy + 1, aecm->echoAdaptLogEnergy, sizeof(WebRtc_Word16) * (MAX_BUF_LEN - 1)); @@ -814,22 +816,9 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm, // Determine if we should store or restore the channel if ((aecm->startupState == 0) & (aecm->currentVADValue)) { - // During startup we store the channel every block. 
- memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); - // Recalculate echo estimate - for (i = 0; i < PART_LEN; i += 4) - { - echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], - far_spectrum[i + 1]); - echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], - far_spectrum[i + 2]); - echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], - far_spectrum[i + 3]); - } - echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); + // During startup we store the channel every block, + // and we recalculate echo estimate + WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst); } else { if (aecm->farLogEnergy < aecm->farEnergyMSE) @@ -865,43 +854,14 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm, { // The stored channel has a significantly lower MSE than the adaptive one for // two consecutive calculations. Reset the adaptive channel. - memcpy(aecm->channelAdapt16, aecm->channelStored, - sizeof(WebRtc_Word16) * PART_LEN1); - // Restore the W32 channel - for (i = 0; i < PART_LEN; i += 4) - { - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i], 16); - aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 1], 16); - aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 2], 16); - aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 3], 16); - } - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); - + WebRtcAecm_ResetAdaptiveChannel(aecm); } else if (((MIN_MSE_DIFF * mseStored) > (mseAdapt << MSE_RESOLUTION)) & (mseAdapt < aecm->mseThreshold) & (aecm->mseAdaptOld < aecm->mseThreshold)) { // The adaptive channel has a significantly lower MSE than the stored one. // The MSE for the adaptive channel has also been low for two consecutive // calculations. Store the adaptive channel. - memcpy(aecm->channelStored, aecm->channelAdapt16, - sizeof(WebRtc_Word16) * PART_LEN1); - // Recalculate echo estimate - for (i = 0; i < PART_LEN; i += 4) - { - echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], - far_spectrum[i + 1]); - echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], - far_spectrum[i + 2]); - echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], - far_spectrum[i + 3]); - } - echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst); // Update threshold if (aecm->mseThreshold == WEBRTC_SPL_WORD32_MAX) @@ -1032,7 +992,9 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, WebRtc_Word32 tmp32no1; WebRtc_Word32 tmp32no2; - WebRtc_Word16 fft[PART_LEN4]; + // In fft_buf, +8 for 16-byte alignment, and +2 to make some loops safe. 
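+    // (The 8 extra WebRtc_Word16 are 16 bytes of slack, which is exactly what the
+    // (addr + 15) & ~15 rounding below may skip over.)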
+ WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 8]; + WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 15) & ~15); WebRtc_Word16 tmp16no1; WebRtc_Word16 tmp16no2; @@ -1048,23 +1010,7 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, time_signal_scaling = WebRtcSpl_NormW16(tmp16no1); #endif - memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); - // FFT of signal - for (i = 0, j = 0; i < PART_LEN; i++, j += 2) - { - // Window time domain signal and insert into real part of - // transformation array |fft| - fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[i] << time_signal_scaling), - kSqrtHanning[i], - 14); - fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[PART_LEN + i] << time_signal_scaling), - kSqrtHanning[PART_LEN - i], - 14); - // Inserting zeros in imaginary parts not necessary since we - // initialized the array with all zeros - } + WebRtcAecm_PrepareFft(fft, time_signal, time_signal_scaling); // Fourier transformation of time domain signal. // The result is scaled with 1/PART_LEN2, that is, the result is in Q(-6) @@ -1187,7 +1133,10 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, WebRtc_UWord32 tmpU32; WebRtc_Word32 tmp32no1; - WebRtc_Word32 echoEst32[PART_LEN1]; + + // +8 for 32-byte alignment. + WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8]; + WebRtc_Word32 *echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31); WebRtc_UWord16 xfa[PART_LEN1]; WebRtc_UWord16 dfaNoisy[PART_LEN1]; @@ -1540,9 +1489,9 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, for (i = 0; i < PART_LEN1; i++) { efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i], - hnl[i], 14)); + hnl[i], 14)); efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i], - hnl[i], 14)); + hnl[i], 14)); } } @@ -1595,7 +1544,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, { fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( fft[i], - kSqrtHanning[i], + WebRtcAecm_kSqrtHanning[i], 14); tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], outCFFT - aecm->dfaCleanQDomain); @@ -1606,7 +1555,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( fft[PART_LEN + i], - kSqrtHanning[PART_LEN - i], + WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); @@ -1623,7 +1572,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, milliseconds = (unsigned int)(diff__ & 0xffffffff); WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL); #endif - // Copy the current block to the old position (outBuf is shifted elsewhere) + // Copy the current block to the old position (aecm->outBuf is shifted elsewhere) memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); if (nearendClean != NULL) @@ -1634,6 +1583,105 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, return 0; } +#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) + +void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + int time_signal_scaling) +{ + int i, j; + + memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); + // FFT of signal + for (i = 0, j = 0; i < PART_LEN; i++, j += 2) + { + // Window time domain signal and 
insert into real part of + // transformation array |fft| + fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[i] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[i], + 14); + fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[PART_LEN + i] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[PART_LEN - i], + 14); + // Inserting zeros in imaginary parts not necessary since we + // initialized the array with all zeros + } +} + +void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est, + WebRtc_UWord32* far_energy, + WebRtc_UWord32* echo_energy_adapt, + WebRtc_UWord32* echo_energy_stored) +{ + int i; + + // Get energy for the delayed far end signal and estimated + // echo using both stored and adapted channels. + for (i = 0; i < PART_LEN1; i++) + { + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); + (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]); + (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], + far_spectrum[i]); + (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i]; + } +} + +void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est) +{ + int i; + + // During startup we store the channel every block. + memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); + // Recalculate echo estimate + for (i = 0; i < PART_LEN; i += 4) + { + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); + echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], + far_spectrum[i + 1]); + echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], + far_spectrum[i + 2]); + echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], + far_spectrum[i + 3]); + } + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); +} + +void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm) +{ + int i; + + // The stored channel has a significantly lower MSE than the adaptive one for + // two consecutive calculations. Reset the adaptive channel. + memcpy(aecm->channelAdapt16, aecm->channelStored, + sizeof(WebRtc_Word16) * PART_LEN1); + // Restore the W32 channel + for (i = 0; i < PART_LEN; i += 4) + { + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i], 16); + aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 1], 16); + aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 2], 16); + aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 3], 16); + } + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); +} + +#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) + + // Generate comfort noise and add to output signal. // // \param[in] aecm Handle of the AECM instance. @@ -1642,11 +1690,11 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, // \param[in,out] outImag Imaginary part of the output signal (Q[aecm->dfaQDomain]). // \param[in] lambda Suppression gain with which to scale the noise level (Q14). 
// -static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm, - const WebRtc_UWord16 * const dfa, - WebRtc_Word16 * const outReal, - WebRtc_Word16 * const outImag, - const WebRtc_Word16 * const lambda) +void WebRtcAecm_ComfortNoise(AecmCore_t * aecm, + const WebRtc_UWord16* dfa, + WebRtc_Word16* outReal, + WebRtc_Word16* outImag, + const WebRtc_Word16* lambda) { WebRtc_Word16 i; WebRtc_Word16 tmp16; @@ -1792,7 +1840,8 @@ static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm, #endif } -void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * const farend, +void WebRtcAecm_BufferFarFrame(AecmCore_t* const aecm, + const WebRtc_Word16* const farend, const int farLen) { int writeLen = farLen, writePos = 0; diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core.h b/src/modules/audio_processing/aecm/main/source/aecm_core.h index 05f4c8597..1050dee16 100644 --- a/src/modules/audio_processing/aecm/main/source/aecm_core.h +++ b/src/modules/audio_processing/aecm/main/source/aecm_core.h @@ -97,6 +97,8 @@ #define NLP_COMP_LOW 3277 // 0.2 in Q14 #define NLP_COMP_HIGH ONE_Q14 // 1 in Q14 +extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[]; + typedef struct { int farBufWritePos; @@ -110,11 +112,6 @@ typedef struct void *nearCleanFrameBuf; void *outFrameBuf; - WebRtc_Word16 xBuf[PART_LEN2]; // farend - WebRtc_Word16 dBufClean[PART_LEN2]; // nearend - WebRtc_Word16 dBufNoisy[PART_LEN2]; // nearend - WebRtc_Word16 outBuf[PART_LEN]; - WebRtc_Word16 farBuf[FAR_BUF_LEN]; WebRtc_Word16 mult; @@ -139,9 +136,26 @@ typedef struct WebRtc_Word16 echoAdaptLogEnergy[MAX_BUF_LEN]; WebRtc_Word16 echoStoredLogEnergy[MAX_BUF_LEN]; - WebRtc_Word16 channelAdapt16[PART_LEN1]; - WebRtc_Word32 channelAdapt32[PART_LEN1]; - WebRtc_Word16 channelStored[PART_LEN1]; + // The extra 16 or 32 bytes in the following buffers are for alignment based Neon code. + // It's designed this way since the current GCC compiler can't align a buffer in 16 or 32 + // byte boundaries properly. 
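+    // (+8 WebRtc_Word16 elements add 16 spare bytes and +8 WebRtc_Word32 elements add
+    // 32 spare bytes; WebRtcAecm_CreateCore() rounds the pointers declared further
+    // down up to 16- or 32-byte boundaries within that slack.)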
+ WebRtc_Word16 channelStored_buf[PART_LEN1 + 8]; + WebRtc_Word16 channelAdapt16_buf[PART_LEN1 + 8]; + WebRtc_Word32 channelAdapt32_buf[PART_LEN1 + 8]; + WebRtc_Word16 xBuf_buf[PART_LEN2 + 8]; // farend + WebRtc_Word16 dBufClean_buf[PART_LEN2 + 8]; // nearend + WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 8]; // nearend + WebRtc_Word16 outBuf_buf[PART_LEN + 8]; + + // Pointers to the above buffers + WebRtc_Word16 *channelStored; + WebRtc_Word16 *channelAdapt16; + WebRtc_Word32 *channelAdapt32; + WebRtc_Word16 *xBuf; + WebRtc_Word16 *dBufClean; + WebRtc_Word16 *dBufNoisy; + WebRtc_Word16 *outBuf; + WebRtc_Word32 echoFilt[PART_LEN1]; WebRtc_Word16 nearFilt[PART_LEN1]; WebRtc_Word32 noiseEst[PART_LEN1]; @@ -308,4 +322,27 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend, const int farLen, const int knownDelay); +/////////////////////////////////////////////////////////////////////////////////////////////// +// Some internal functions shared by ARM NEON and generic C code: +// + +WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t * aecm); + +void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echoEst, + WebRtc_UWord32* far_energy, + WebRtc_UWord32* echo_energy_adapt, + WebRtc_UWord32* echo_energy_stored); + +void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est); + +void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm); + +void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + int time_signal_scaling); + #endif diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c b/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c new file mode 100644 index 000000000..cfac49ad6 --- /dev/null +++ b/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) + +#include "aecm_core.h" + +#include +#include +#include + +#include "aecm_delay_estimator.h" +#include "echo_control_mobile.h" +#include "ring_buffer.h" +#include "typedefs.h" + +// Square root of Hanning window in Q14 +static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = { + 16384, 16373, 16354, 16325, + 16286, 16237, 16179, 16111, + 16034, 15947, 15851, 15746, + 15631, 15506, 15373, 15231, + 15079, 14918, 14749, 14571, + 14384, 14189, 13985, 13773, + 13553, 13325, 13089, 12845, + 12594, 12335, 12068, 11795, + 11514, 11227, 10933, 10633, + 10326, 10013, 9695, 9370, + 9040, 8705, 8364, 8019, + 7668, 7313, 6954, 6591, + 6224, 5853, 5478, 5101, + 4720, 4337, 3951, 3562, + 3172, 2780, 2386, 1990, + 1594, 1196, 798, 399 +}; + +void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echoEst, + WebRtc_UWord32* far_energy, + WebRtc_UWord32* echo_energy_adapt, + WebRtc_UWord32* echo_energy_stored) +{ + int i; + + register WebRtc_UWord32 far_energy_r; + register WebRtc_UWord32 echo_energy_stored_r; + register WebRtc_UWord32 echo_energy_adapt_r; + uint32x4_t tmp32x4_0; + + __asm__("vmov.i32 q14, #0" : : : "q14"); //far_energy + __asm__("vmov.i32 q8, #0" : : : "q8"); //echo_energy_stored + __asm__("vmov.i32 q9, #0" : : : "q9"); //echo_energy_adapt + + for(i = 0; i < PART_LEN -7; i += 8) + { + //far_energy += (WebRtc_UWord32)(far_spectrum[i]); + __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); + __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); + __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); + + // Get estimated echo energies for adaptive channel and stored channel + //echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echoEst[i]): "q10", "q11"); + + //echo_energy_stored += (WebRtc_UWord32)echoEst[i]; + __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); + __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); + + //echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], far_spectrum[i]); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vadd.u32 q9, q10" : : : "q9", "q15"); + __asm__("vadd.u32 q9, q11" : : : "q9", "q11"); + } + + __asm__("vadd.u32 d28, d29" : : : "q14"); + __asm__("vpadd.u32 d28, d28" : : : "q14"); + __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); + + __asm__("vadd.u32 d18, d19" : : : "q9"); + __asm__("vpadd.u32 d18, d18" : : : "q9"); + __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); + + __asm__("vadd.u32 d16, d17" : : : "q8"); + __asm__("vpadd.u32 d16, d16" : : : "q8"); + __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); + + // Get estimated echo energies for adaptive channel and stored channel + echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echoEst[i]; + *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]); + *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16( + 
aecm->channelAdapt16[i], far_spectrum[i]); +} + +void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est) +{ + int i; + + // During startup we store the channel every block. + // Recalculate echo estimate. + for(i = 0; i < PART_LEN -7; i += 8) + { + // aecm->channelStored[i] = acem->channelAdapt16[i]; + // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&echo_est[i]) : "q10", "q11"); + } + aecm->channelStored[i] = aecm->channelAdapt16[i]; + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); +} + +void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm) +{ + int i; + + for(i = 0; i < PART_LEN -7; i += 8) + { + // aecm->channelAdapt16[i] = aecm->channelStored[i]; + // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) + // aecm->channelStored[i], 16); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : + "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vst1.16 {d24, d25}, [%0, :128]" : : + "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); + __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->channelAdapt32[i]): "q10", "q11"); + } + aecm->channelAdapt16[i] = aecm->channelStored[i]; + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i], 16); +} + +void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + int time_signal_scaling) +{ + int i, j; + int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); + __asm__("vmov.i16 d21, #0" ::: "d21"); + + for(i = 0, j = 0; i < PART_LEN-3; i += 4, j += 8) + { + int16x4_t tmp16x4_0; + int16x4_t tmp16x4_1; + int32x4_t tmp32x4_0; + + /* Window near end */ + // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] + // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); + tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); + + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); + tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); + + __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); + __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); + + // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + // (time_signal[PART_LEN + i] << time_signal_scaling), + // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[PART_LEN + i])); + tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); + + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); + tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); + + __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); + __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); + } +} + +#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) +
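The Neon loops above handle eight spectrum bins per iteration (the trip count of
i < PART_LEN - 7 covers bins 0 through PART_LEN - 1, and WebRtcAecm_InitCore()
asserts PART_LEN % 16 == 0), while the last of the PART_LEN1 bins is finished by
the plain-C statements after each loop. A minimal scalar sketch of that
8-wide-loop-plus-tail structure, not part of the patch: process_8() and
calc_echo_est() are illustrative names only, and the plain multiply stands in for
WEBRTC_SPL_MUL_16_U16().

#include <stdint.h>

#define PART_LEN 64                       /* partition length, as in aecm_core.h */
#define PART_LEN1 (PART_LEN + 1)          /* number of unique spectrum bins */

/* Stand-in for one 8-bin Neon iteration (hypothetical helper). */
static void process_8(const int16_t* ch, const uint16_t* far, int32_t* est)
{
    int j;
    for (j = 0; j < 8; j++)
    {
        est[j] = (int32_t) ch[j] * (int32_t) far[j];
    }
}

/* Scalar outline of the vector loop plus one-bin tail used above. */
static void calc_echo_est(const int16_t* channel,
                          const uint16_t* far_spectrum,
                          int32_t* echo_est)
{
    int i;
    for (i = 0; i < PART_LEN - 7; i += 8)  /* bins 0 .. PART_LEN - 1 */
    {
        process_8(&channel[i], &far_spectrum[i], &echo_est[i]);
    }
    /* i == PART_LEN here: the last bin is computed without Neon. */
    echo_est[i] = (int32_t) channel[i] * (int32_t) far_spectrum[i];
}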