1st check-in for AECM Neon optimization.
Review URL: http://webrtc-codereview.appspot.com/104001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@359 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
		| @@ -25,6 +25,11 @@ LOCAL_SRC_FILES := \ | ||||
| LOCAL_CFLAGS := \ | ||||
|     $(MY_WEBRTC_COMMON_DEFS) | ||||
|  | ||||
| ifeq ($(ARCH_ARM_HAVE_NEON),true) | ||||
|     LOCAL_SRC_FILES += aecm_core_neon.c | ||||
|     LOCAL_CFLAGS += $(CFLAGS_NEON) | ||||
| endif | ||||
|  | ||||
| LOCAL_C_INCLUDES := \ | ||||
|     $(LOCAL_PATH)/../interface \ | ||||
|     $(LOCAL_PATH)/../../../utility \ | ||||
|   | ||||
| @@ -31,7 +31,7 @@ FILE *testfile; | ||||
| #ifdef AECM_SHORT | ||||
|  | ||||
| // Square root of Hanning window in Q14 | ||||
| static const WebRtc_Word16 kSqrtHanning[] = | ||||
| const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] = | ||||
| { | ||||
|     0, 804, 1606, 2404, 3196, 3981, 4756, 5520, | ||||
|     6270, 7005, 7723, 8423, 9102, 9760, 10394, 11003, | ||||
| @@ -43,12 +43,15 @@ static const WebRtc_Word16 kSqrtHanning[] = | ||||
| #else | ||||
|  | ||||
| // Square root of Hanning window in Q14 | ||||
| static const WebRtc_Word16 kSqrtHanning[] = {0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172, | ||||
|         3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364, | ||||
|         8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335, | ||||
|         12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918, | ||||
|         15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237, | ||||
|         16286, 16325, 16354, 16373, 16384}; | ||||
| const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] __attribute__ ((aligned (8))) = | ||||
| { | ||||
|     0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172, | ||||
|     3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364, | ||||
|     8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335, | ||||
|     12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918, | ||||
|     15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237, | ||||
|     16286, 16325, 16354, 16373, 16384 | ||||
| }; | ||||
|  | ||||
| #endif | ||||
|  | ||||
| @@ -98,11 +101,6 @@ static const WebRtc_Word16 kNoiseEstIncCount = 5; | ||||
| HANDLE logFile = NULL; | ||||
| #endif | ||||
|  | ||||
| static void WebRtcAecm_ComfortNoise(AecmCore_t* const aecm, const WebRtc_UWord16 * const dfa, | ||||
|                                     WebRtc_Word16 * const outReal, | ||||
|                                     WebRtc_Word16 * const outImag, | ||||
|                                     const WebRtc_Word16 * const lambda); | ||||
|  | ||||
| int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) | ||||
| { | ||||
|     AecmCore_t *aecm = malloc(sizeof(AecmCore_t)); | ||||
| @@ -147,6 +145,18 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) | ||||
|         return -1; | ||||
|     } | ||||
|  | ||||
|     // Init some aecm pointers. 16-byte alignment is only necessary for Neon code currently. | ||||
|     aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 15) & ~ 15); | ||||
|     aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 15) & ~ 15); | ||||
|     aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 15) & ~ 15); | ||||
|     aecm->outBuf = (WebRtc_Word16*) (((uintptr_t)aecm->outBuf_buf + 15) & ~ 15); | ||||
|     aecm->channelStored = (WebRtc_Word16*) (((uintptr_t) | ||||
|                                              aecm->channelStored_buf + 15) & ~ 15); | ||||
|     aecm->channelAdapt16 = (WebRtc_Word16*) (((uintptr_t) | ||||
|                                               aecm->channelAdapt16_buf + 15) & ~ 15); | ||||
|     aecm->channelAdapt32 = (WebRtc_Word32*) (((uintptr_t) | ||||
|                                               aecm->channelAdapt32_buf + 31) & ~ 31); | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
| @@ -209,10 +219,10 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq) | ||||
|     WebRtcApm_InitBuffer(aecm->nearCleanFrameBuf); | ||||
|     WebRtcApm_InitBuffer(aecm->outFrameBuf); | ||||
|  | ||||
|     memset(aecm->xBuf, 0, sizeof(aecm->xBuf)); | ||||
|     memset(aecm->dBufClean, 0, sizeof(aecm->dBufClean)); | ||||
|     memset(aecm->dBufNoisy, 0, sizeof(aecm->dBufNoisy)); | ||||
|     memset(aecm->outBuf, 0, sizeof(aecm->outBuf)); | ||||
|     memset(aecm->xBuf_buf, 0, sizeof(aecm->xBuf_buf)); | ||||
|     memset(aecm->dBufClean_buf, 0, sizeof(aecm->dBufClean_buf)); | ||||
|     memset(aecm->dBufNoisy_buf, 0, sizeof(aecm->dBufNoisy_buf)); | ||||
|     memset(aecm->outBuf_buf, 0, sizeof(aecm->outBuf_buf)); | ||||
|  | ||||
|     aecm->seed = 666; | ||||
|     aecm->totCount = 0; | ||||
| @@ -287,6 +297,8 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq) | ||||
|     aecm->supGainErrParamDiffAB = SUPGAIN_ERROR_PARAM_A - SUPGAIN_ERROR_PARAM_B; | ||||
|     aecm->supGainErrParamDiffBD = SUPGAIN_ERROR_PARAM_B - SUPGAIN_ERROR_PARAM_D; | ||||
|  | ||||
|     assert(PART_LEN % 16 == 0); | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
| @@ -481,18 +493,8 @@ void WebRtcAecm_CalcEnergies(AecmCore_t * aecm, | ||||
|     aecm->nearLogEnergy[0] = tmp16; | ||||
|     // END: Get log of near end energy | ||||
|  | ||||
|     // Get energy for the delayed far end signal and estimated | ||||
|     // echo using both stored and adapted channels. | ||||
|     for (i = 0; i < PART_LEN1; i++) | ||||
|     { | ||||
|         // Get estimated echo energies for adaptive channel and stored channel | ||||
|         echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], | ||||
|                                            far_spectrum[i]); | ||||
|         tmpFar += (WebRtc_UWord32)(far_spectrum[i]); | ||||
|         tmpAdapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], | ||||
|                                           far_spectrum[i]); | ||||
|         tmpStored += (WebRtc_UWord32)echoEst[i]; | ||||
|     } | ||||
|     WebRtcAecm_CalcLinearEnergies(aecm, far_spectrum, echoEst, &tmpFar, &tmpAdapt, &tmpStored); | ||||
|  | ||||
|     // Shift buffers | ||||
|     memmove(aecm->echoAdaptLogEnergy + 1, aecm->echoAdaptLogEnergy, | ||||
|             sizeof(WebRtc_Word16) * (MAX_BUF_LEN - 1)); | ||||
| @@ -814,22 +816,9 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm, | ||||
|     // Determine if we should store or restore the channel | ||||
|     if ((aecm->startupState == 0) & (aecm->currentVADValue)) | ||||
|     { | ||||
|         // During startup we store the channel every block. | ||||
|         memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); | ||||
|         // Recalculate echo estimate | ||||
|         for (i = 0; i < PART_LEN; i += 4) | ||||
|         { | ||||
|             echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], | ||||
|                                                far_spectrum[i]); | ||||
|             echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], | ||||
|                                                far_spectrum[i + 1]); | ||||
|             echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], | ||||
|                                                far_spectrum[i + 2]); | ||||
|             echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], | ||||
|                                                far_spectrum[i + 3]); | ||||
|         } | ||||
|         echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], | ||||
|                                            far_spectrum[i]); | ||||
|         // During startup we store the channel every block, | ||||
|         // and we recalculate echo estimate | ||||
|         WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst); | ||||
|     } else | ||||
|     { | ||||
|         if (aecm->farLogEnergy < aecm->farEnergyMSE) | ||||
| @@ -865,43 +854,14 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm, | ||||
|             { | ||||
|                 // The stored channel has a significantly lower MSE than the adaptive one for | ||||
|                 // two consecutive calculations. Reset the adaptive channel. | ||||
|                 memcpy(aecm->channelAdapt16, aecm->channelStored, | ||||
|                        sizeof(WebRtc_Word16) * PART_LEN1); | ||||
|                 // Restore the W32 channel | ||||
|                 for (i = 0; i < PART_LEN; i += 4) | ||||
|                 { | ||||
|                     aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( | ||||
|                             (WebRtc_Word32)aecm->channelStored[i], 16); | ||||
|                     aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( | ||||
|                             (WebRtc_Word32)aecm->channelStored[i + 1], 16); | ||||
|                     aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( | ||||
|                             (WebRtc_Word32)aecm->channelStored[i + 2], 16); | ||||
|                     aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( | ||||
|                             (WebRtc_Word32)aecm->channelStored[i + 3], 16); | ||||
|                 } | ||||
|                 aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); | ||||
|  | ||||
|                 WebRtcAecm_ResetAdaptiveChannel(aecm); | ||||
|             } else if (((MIN_MSE_DIFF * mseStored) > (mseAdapt << MSE_RESOLUTION)) & (mseAdapt | ||||
|                     < aecm->mseThreshold) & (aecm->mseAdaptOld < aecm->mseThreshold)) | ||||
|             { | ||||
|                 // The adaptive channel has a significantly lower MSE than the stored one. | ||||
|                 // The MSE for the adaptive channel has also been low for two consecutive | ||||
|                 // calculations. Store the adaptive channel. | ||||
|                 memcpy(aecm->channelStored, aecm->channelAdapt16, | ||||
|                        sizeof(WebRtc_Word16) * PART_LEN1); | ||||
|                 // Recalculate echo estimate | ||||
|                 for (i = 0; i < PART_LEN; i += 4) | ||||
|                 { | ||||
|                     echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], | ||||
|                                                        far_spectrum[i]); | ||||
|                     echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], | ||||
|                                                        far_spectrum[i + 1]); | ||||
|                     echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], | ||||
|                                                        far_spectrum[i + 2]); | ||||
|                     echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], | ||||
|                                                        far_spectrum[i + 3]); | ||||
|                 } | ||||
|                 echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); | ||||
|                 WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst); | ||||
|  | ||||
|                 // Update threshold | ||||
|                 if (aecm->mseThreshold == WEBRTC_SPL_WORD32_MAX) | ||||
| @@ -1032,7 +992,9 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, | ||||
|     WebRtc_Word32 tmp32no1; | ||||
|     WebRtc_Word32 tmp32no2; | ||||
|  | ||||
|     WebRtc_Word16 fft[PART_LEN4]; | ||||
|     // In fft_buf, +8 for 16-byte alignment, and +2 to make some loops safe. | ||||
|     WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 8]; | ||||
|     WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 15) & ~15); | ||||
|  | ||||
|     WebRtc_Word16 tmp16no1; | ||||
|     WebRtc_Word16 tmp16no2; | ||||
| @@ -1048,23 +1010,7 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, | ||||
|     time_signal_scaling = WebRtcSpl_NormW16(tmp16no1); | ||||
| #endif | ||||
|  | ||||
|     memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); | ||||
|     // FFT of signal | ||||
|     for (i = 0, j = 0; i < PART_LEN; i++, j += 2) | ||||
|     { | ||||
|         // Window time domain signal and insert into real part of | ||||
|         // transformation array |fft| | ||||
|         fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( | ||||
|             (time_signal[i] << time_signal_scaling), | ||||
|             kSqrtHanning[i], | ||||
|             14); | ||||
|         fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( | ||||
|             (time_signal[PART_LEN + i] << time_signal_scaling), | ||||
|             kSqrtHanning[PART_LEN - i], | ||||
|             14); | ||||
|         // Inserting zeros in imaginary parts not necessary since we | ||||
|         // initialized the array with all zeros | ||||
|     } | ||||
|     WebRtcAecm_PrepareFft(fft, time_signal, time_signal_scaling); | ||||
|  | ||||
|     // Fourier transformation of time domain signal. | ||||
|     // The result is scaled with 1/PART_LEN2, that is, the result is in Q(-6) | ||||
| @@ -1187,7 +1133,10 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, | ||||
|     WebRtc_UWord32 tmpU32; | ||||
|  | ||||
|     WebRtc_Word32 tmp32no1; | ||||
|     WebRtc_Word32 echoEst32[PART_LEN1]; | ||||
|  | ||||
|     // +8 for 32-byte alignment. | ||||
|     WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8]; | ||||
|     WebRtc_Word32 *echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31); | ||||
|  | ||||
|     WebRtc_UWord16 xfa[PART_LEN1]; | ||||
|     WebRtc_UWord16 dfaNoisy[PART_LEN1]; | ||||
| @@ -1540,9 +1489,9 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, | ||||
|         for (i = 0; i < PART_LEN1; i++) | ||||
|         { | ||||
|             efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i], | ||||
|                                                                             hnl[i], 14)); | ||||
|                                                                            hnl[i], 14)); | ||||
|             efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i], | ||||
|                                                                             hnl[i], 14)); | ||||
|                                                                            hnl[i], 14)); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -1595,7 +1544,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, | ||||
|     { | ||||
|         fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( | ||||
|                 fft[i], | ||||
|                 kSqrtHanning[i], | ||||
|                 WebRtcAecm_kSqrtHanning[i], | ||||
|                 14); | ||||
|         tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], | ||||
|                 outCFFT - aecm->dfaCleanQDomain); | ||||
| @@ -1606,7 +1555,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, | ||||
|  | ||||
|         tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( | ||||
|                 fft[PART_LEN + i], | ||||
|                 kSqrtHanning[PART_LEN - i], | ||||
|                 WebRtcAecm_kSqrtHanning[PART_LEN - i], | ||||
|                 14); | ||||
|         tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, | ||||
|                 outCFFT - aecm->dfaCleanQDomain); | ||||
| @@ -1623,7 +1572,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, | ||||
|     milliseconds = (unsigned int)(diff__ & 0xffffffff); | ||||
|     WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL); | ||||
| #endif | ||||
|     // Copy the current block to the old position (outBuf is shifted elsewhere) | ||||
|     // Copy the current block to the old position (aecm->outBuf is shifted elsewhere) | ||||
|     memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); | ||||
|     memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); | ||||
|     if (nearendClean != NULL) | ||||
| @@ -1634,6 +1583,105 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
| #if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) | ||||
|  | ||||
| void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, | ||||
|                            const WebRtc_Word16* time_signal, | ||||
|                            int time_signal_scaling) | ||||
| { | ||||
|     int i, j; | ||||
|  | ||||
|     memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); | ||||
|     // FFT of signal | ||||
|     for (i = 0, j = 0; i < PART_LEN; i++, j += 2) | ||||
|     { | ||||
|         // Window time domain signal and insert into real part of | ||||
|         // transformation array |fft| | ||||
|         fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( | ||||
|             (time_signal[i] << time_signal_scaling), | ||||
|             WebRtcAecm_kSqrtHanning[i], | ||||
|             14); | ||||
|         fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( | ||||
|             (time_signal[PART_LEN + i] << time_signal_scaling), | ||||
|             WebRtcAecm_kSqrtHanning[PART_LEN - i], | ||||
|             14); | ||||
|         // Inserting zeros in imaginary parts not necessary since we | ||||
|         // initialized the array with all zeros | ||||
|     } | ||||
| } | ||||
|  | ||||
| void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, | ||||
|                                    const WebRtc_UWord16* far_spectrum, | ||||
|                                    WebRtc_Word32* echo_est, | ||||
|                                    WebRtc_UWord32* far_energy, | ||||
|                                    WebRtc_UWord32* echo_energy_adapt, | ||||
|                                    WebRtc_UWord32* echo_energy_stored) | ||||
| { | ||||
|     int i; | ||||
|  | ||||
|     // Get energy for the delayed far end signal and estimated | ||||
|     // echo using both stored and adapted channels. | ||||
|     for (i = 0; i < PART_LEN1; i++) | ||||
|     { | ||||
|         echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], | ||||
|                                            far_spectrum[i]); | ||||
|         (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]); | ||||
|         (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], | ||||
|                                           far_spectrum[i]); | ||||
|         (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i]; | ||||
|     } | ||||
| } | ||||
|  | ||||
| void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, | ||||
|                                      const WebRtc_UWord16* far_spectrum, | ||||
|                                      WebRtc_Word32* echo_est) | ||||
| { | ||||
|     int i; | ||||
|  | ||||
|     // During startup we store the channel every block. | ||||
|     memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); | ||||
|     // Recalculate echo estimate | ||||
|     for (i = 0; i < PART_LEN; i += 4) | ||||
|     { | ||||
|         echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], | ||||
|                                            far_spectrum[i]); | ||||
|         echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], | ||||
|                                            far_spectrum[i + 1]); | ||||
|         echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], | ||||
|                                            far_spectrum[i + 2]); | ||||
|         echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], | ||||
|                                            far_spectrum[i + 3]); | ||||
|     } | ||||
|     echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], | ||||
|                                        far_spectrum[i]); | ||||
| } | ||||
|  | ||||
| void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm) | ||||
| { | ||||
|     int i; | ||||
|  | ||||
|     // The stored channel has a significantly lower MSE than the adaptive one for | ||||
|     // two consecutive calculations. Reset the adaptive channel. | ||||
|     memcpy(aecm->channelAdapt16, aecm->channelStored, | ||||
|            sizeof(WebRtc_Word16) * PART_LEN1); | ||||
|     // Restore the W32 channel | ||||
|     for (i = 0; i < PART_LEN; i += 4) | ||||
|     { | ||||
|         aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( | ||||
|                 (WebRtc_Word32)aecm->channelStored[i], 16); | ||||
|         aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( | ||||
|                 (WebRtc_Word32)aecm->channelStored[i + 1], 16); | ||||
|         aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( | ||||
|                 (WebRtc_Word32)aecm->channelStored[i + 2], 16); | ||||
|         aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( | ||||
|                 (WebRtc_Word32)aecm->channelStored[i + 3], 16); | ||||
|     } | ||||
|     aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); | ||||
| } | ||||
|  | ||||
| #endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) | ||||
|  | ||||
|  | ||||
| // Generate comfort noise and add to output signal. | ||||
| // | ||||
| // \param[in]     aecm     Handle of the AECM instance. | ||||
| @@ -1642,11 +1690,11 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, | ||||
| // \param[in,out] outImag Imaginary part of the output signal (Q[aecm->dfaQDomain]). | ||||
| // \param[in]     lambda  Suppression gain with which to scale the noise level (Q14). | ||||
| // | ||||
| static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm, | ||||
|                                     const WebRtc_UWord16 * const dfa, | ||||
|                                     WebRtc_Word16 * const outReal, | ||||
|                                     WebRtc_Word16 * const outImag, | ||||
|                                     const WebRtc_Word16 * const lambda) | ||||
| void WebRtcAecm_ComfortNoise(AecmCore_t * aecm, | ||||
|                              const WebRtc_UWord16* dfa, | ||||
|                              WebRtc_Word16* outReal, | ||||
|                              WebRtc_Word16* outImag, | ||||
|                              const WebRtc_Word16* lambda) | ||||
| { | ||||
|     WebRtc_Word16 i; | ||||
|     WebRtc_Word16 tmp16; | ||||
| @@ -1792,7 +1840,8 @@ static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm, | ||||
| #endif | ||||
| } | ||||
|  | ||||
| void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * const farend, | ||||
| void WebRtcAecm_BufferFarFrame(AecmCore_t* const aecm, | ||||
|                                const WebRtc_Word16* const farend, | ||||
|                                const int farLen) | ||||
| { | ||||
|     int writeLen = farLen, writePos = 0; | ||||
|   | ||||
| @@ -97,6 +97,8 @@ | ||||
| #define NLP_COMP_LOW    3277            // 0.2 in Q14 | ||||
| #define NLP_COMP_HIGH   ONE_Q14         // 1 in Q14 | ||||
|  | ||||
| extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[]; | ||||
|  | ||||
| typedef struct | ||||
| { | ||||
|     int farBufWritePos; | ||||
| @@ -110,11 +112,6 @@ typedef struct | ||||
|     void *nearCleanFrameBuf; | ||||
|     void *outFrameBuf; | ||||
|  | ||||
|     WebRtc_Word16 xBuf[PART_LEN2]; // farend | ||||
|     WebRtc_Word16 dBufClean[PART_LEN2]; // nearend | ||||
|     WebRtc_Word16 dBufNoisy[PART_LEN2]; // nearend | ||||
|     WebRtc_Word16 outBuf[PART_LEN]; | ||||
|  | ||||
|     WebRtc_Word16 farBuf[FAR_BUF_LEN]; | ||||
|  | ||||
|     WebRtc_Word16 mult; | ||||
| @@ -139,9 +136,26 @@ typedef struct | ||||
|     WebRtc_Word16 echoAdaptLogEnergy[MAX_BUF_LEN]; | ||||
|     WebRtc_Word16 echoStoredLogEnergy[MAX_BUF_LEN]; | ||||
|  | ||||
|     WebRtc_Word16 channelAdapt16[PART_LEN1]; | ||||
|     WebRtc_Word32 channelAdapt32[PART_LEN1]; | ||||
|     WebRtc_Word16 channelStored[PART_LEN1]; | ||||
|     // The extra 16 or 32 bytes in the following buffers are for alignment based Neon code. | ||||
|     // It's designed this way since the current GCC compiler can't align a buffer in 16 or 32 | ||||
|     // byte boundaries properly. | ||||
|     WebRtc_Word16 channelStored_buf[PART_LEN1 + 8]; | ||||
|     WebRtc_Word16 channelAdapt16_buf[PART_LEN1 + 8]; | ||||
|     WebRtc_Word32 channelAdapt32_buf[PART_LEN1 + 8]; | ||||
|     WebRtc_Word16 xBuf_buf[PART_LEN2 + 8]; // farend | ||||
|     WebRtc_Word16 dBufClean_buf[PART_LEN2 + 8]; // nearend | ||||
|     WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 8]; // nearend | ||||
|     WebRtc_Word16 outBuf_buf[PART_LEN + 8]; | ||||
|  | ||||
|     // Pointers to the above buffers | ||||
|     WebRtc_Word16 *channelStored; | ||||
|     WebRtc_Word16 *channelAdapt16; | ||||
|     WebRtc_Word32 *channelAdapt32; | ||||
|     WebRtc_Word16 *xBuf; | ||||
|     WebRtc_Word16 *dBufClean; | ||||
|     WebRtc_Word16 *dBufNoisy; | ||||
|     WebRtc_Word16 *outBuf; | ||||
|  | ||||
|     WebRtc_Word32 echoFilt[PART_LEN1]; | ||||
|     WebRtc_Word16 nearFilt[PART_LEN1]; | ||||
|     WebRtc_Word32 noiseEst[PART_LEN1]; | ||||
| @@ -308,4 +322,27 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co | ||||
| void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend, | ||||
|                               const int farLen, const int knownDelay); | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Some internal functions shared by ARM NEON and generic C code: | ||||
| // | ||||
|  | ||||
| WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t * aecm); | ||||
|  | ||||
| void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, | ||||
|                                    const WebRtc_UWord16* far_spectrum, | ||||
|                                    WebRtc_Word32* echoEst, | ||||
|                                    WebRtc_UWord32* far_energy, | ||||
|                                    WebRtc_UWord32* echo_energy_adapt, | ||||
|                                    WebRtc_UWord32* echo_energy_stored); | ||||
|  | ||||
| void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, | ||||
|                                      const WebRtc_UWord16* far_spectrum, | ||||
|                                      WebRtc_Word32* echo_est); | ||||
|  | ||||
| void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm); | ||||
|  | ||||
| void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, | ||||
|                            const WebRtc_Word16* time_signal, | ||||
|                            int time_signal_scaling); | ||||
|  | ||||
| #endif | ||||
|   | ||||
							
								
								
									
										195
									
								
								src/modules/audio_processing/aecm/main/source/aecm_core_neon.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										195
									
								
								src/modules/audio_processing/aecm/main/source/aecm_core_neon.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,195 @@ | ||||
| /* | ||||
|  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | ||||
|  * | ||||
|  *  Use of this source code is governed by a BSD-style license | ||||
|  *  that can be found in the LICENSE file in the root of the source | ||||
|  *  tree. An additional intellectual property rights grant can be found | ||||
|  *  in the file PATENTS.  All contributing project authors may | ||||
|  *  be found in the AUTHORS file in the root of the source tree. | ||||
|  */ | ||||
| #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) | ||||
|  | ||||
| #include "aecm_core.h" | ||||
|  | ||||
| #include <arm_neon.h> | ||||
| #include <assert.h> | ||||
| #include <stdlib.h> | ||||
|  | ||||
| #include "aecm_delay_estimator.h" | ||||
| #include "echo_control_mobile.h" | ||||
| #include "ring_buffer.h" | ||||
| #include "typedefs.h" | ||||
|  | ||||
| // Square root of Hanning window in Q14 | ||||
| static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {        | ||||
|      16384, 16373, 16354, 16325,   | ||||
|      16286, 16237, 16179, 16111,   | ||||
|      16034, 15947, 15851, 15746,   | ||||
|      15631, 15506, 15373, 15231,   | ||||
|      15079, 14918, 14749, 14571,   | ||||
|      14384, 14189, 13985, 13773,   | ||||
|      13553, 13325, 13089, 12845,   | ||||
|      12594, 12335, 12068, 11795,   | ||||
|      11514, 11227, 10933, 10633,   | ||||
|      10326, 10013, 9695,  9370,    | ||||
|      9040,  8705,  8364,  8019,    | ||||
|      7668,  7313,  6954,  6591,    | ||||
|      6224,  5853,  5478,  5101,    | ||||
|      4720,  4337,  3951,  3562,    | ||||
|      3172,  2780,  2386,  1990,    | ||||
|      1594,  1196,  798,   399 | ||||
| }; | ||||
|  | ||||
| void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, | ||||
|                                    const WebRtc_UWord16* far_spectrum, | ||||
|                                    WebRtc_Word32* echoEst, | ||||
|                                    WebRtc_UWord32* far_energy, | ||||
|                                    WebRtc_UWord32* echo_energy_adapt, | ||||
|                                    WebRtc_UWord32* echo_energy_stored) | ||||
| { | ||||
|     int i; | ||||
|  | ||||
|     register WebRtc_UWord32 far_energy_r; | ||||
|     register WebRtc_UWord32 echo_energy_stored_r; | ||||
|     register WebRtc_UWord32 echo_energy_adapt_r; | ||||
|     uint32x4_t tmp32x4_0; | ||||
|  | ||||
|     __asm__("vmov.i32 q14, #0" : : : "q14"); //far_energy | ||||
|     __asm__("vmov.i32 q8,  #0" : : : "q8"); //echo_energy_stored | ||||
|     __asm__("vmov.i32 q9,  #0" : : : "q9"); //echo_energy_adapt | ||||
|  | ||||
|     for(i = 0; i < PART_LEN -7; i += 8) | ||||
|     { | ||||
|         //far_energy += (WebRtc_UWord32)(far_spectrum[i]); | ||||
|         __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); | ||||
|         __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); | ||||
|         __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); | ||||
|  | ||||
|         // Get estimated echo energies for adaptive channel and stored channel | ||||
|         //echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); | ||||
|         __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); | ||||
|         __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); | ||||
|         __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); | ||||
|         __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echoEst[i]): "q10", "q11"); | ||||
|  | ||||
|         //echo_energy_stored += (WebRtc_UWord32)echoEst[i]; | ||||
|         __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); | ||||
|         __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); | ||||
|  | ||||
|         //echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], far_spectrum[i]); | ||||
|         __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); | ||||
|         __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); | ||||
|         __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); | ||||
|         __asm__("vadd.u32 q9, q10" : : : "q9", "q15"); | ||||
|         __asm__("vadd.u32 q9, q11" : : : "q9", "q11"); | ||||
|     } | ||||
|  | ||||
|     __asm__("vadd.u32 d28, d29" : : : "q14"); | ||||
|     __asm__("vpadd.u32 d28, d28" : : : "q14"); | ||||
|     __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); | ||||
|  | ||||
|     __asm__("vadd.u32 d18, d19" : : : "q9"); | ||||
|     __asm__("vpadd.u32 d18, d18" : : : "q9"); | ||||
|     __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); | ||||
|  | ||||
|     __asm__("vadd.u32 d16, d17" : : : "q8"); | ||||
|     __asm__("vpadd.u32 d16, d16" : : : "q8"); | ||||
|     __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); | ||||
|  | ||||
|     // Get estimated echo energies for adaptive channel and stored channel | ||||
|     echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); | ||||
|     *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echoEst[i]; | ||||
|     *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]); | ||||
|     *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16( | ||||
|         aecm->channelAdapt16[i], far_spectrum[i]); | ||||
| } | ||||
|  | ||||
| void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, | ||||
|                                      const WebRtc_UWord16* far_spectrum, | ||||
|                                      WebRtc_Word32* echo_est) | ||||
| { | ||||
|     int i; | ||||
|  | ||||
|     // During startup we store the channel every block. | ||||
|     // Recalculate echo estimate. | ||||
|     for(i = 0; i < PART_LEN -7; i += 8) | ||||
|     { | ||||
|         // aecm->channelStored[i] = acem->channelAdapt16[i]; | ||||
|         // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); | ||||
|         __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); | ||||
|         __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); | ||||
|         __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); | ||||
|         __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); | ||||
|         __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); | ||||
|         __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : | ||||
|                                "r"(&echo_est[i]) : "q10", "q11"); | ||||
|     } | ||||
|     aecm->channelStored[i] = aecm->channelAdapt16[i]; | ||||
|     echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); | ||||
| } | ||||
|  | ||||
| void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm) | ||||
| { | ||||
|     int i; | ||||
|  | ||||
|     for(i = 0; i < PART_LEN -7; i += 8) | ||||
|     { | ||||
|         // aecm->channelAdapt16[i] = aecm->channelStored[i]; | ||||
|         // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) | ||||
|         //                           aecm->channelStored[i], 16); | ||||
|         __asm__("vld1.16 {d24, d25}, [%0, :128]" : : | ||||
|                         "r"(&aecm->channelStored[i]) : "q12"); | ||||
|         __asm__("vst1.16 {d24, d25}, [%0, :128]" : : | ||||
|                         "r"(&aecm->channelAdapt16[i]) : "q12"); | ||||
|         __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); | ||||
|         __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); | ||||
|         __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : | ||||
|                         "r"(&aecm->channelAdapt32[i]): "q10", "q11"); | ||||
|     } | ||||
|     aecm->channelAdapt16[i] = aecm->channelStored[i]; | ||||
|     aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( | ||||
|             (WebRtc_Word32)aecm->channelStored[i], 16); | ||||
| } | ||||
|  | ||||
| void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, | ||||
|                            const WebRtc_Word16* time_signal, | ||||
|                            int time_signal_scaling) | ||||
| { | ||||
|     int i, j; | ||||
|     int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); | ||||
|     __asm__("vmov.i16 d21, #0" ::: "d21"); | ||||
|  | ||||
|     for(i = 0, j = 0; i < PART_LEN-3; i += 4, j += 8) | ||||
|     { | ||||
|         int16x4_t tmp16x4_0; | ||||
|         int16x4_t tmp16x4_1; | ||||
|         int32x4_t tmp32x4_0; | ||||
|  | ||||
|         /* Window near end */ | ||||
|         // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] | ||||
|         //       << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); | ||||
|         __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); | ||||
|         tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); | ||||
|  | ||||
|         __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); | ||||
|         tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); | ||||
|  | ||||
|         __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); | ||||
|         __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); | ||||
|  | ||||
|         // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( | ||||
|         //      (time_signal[PART_LEN + i] << time_signal_scaling), | ||||
|         //       WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); | ||||
|         __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[PART_LEN + i])); | ||||
|         tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); | ||||
|  | ||||
|         __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); | ||||
|         tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); | ||||
|  | ||||
|         __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); | ||||
|         __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); | ||||
|     } | ||||
| } | ||||
|  | ||||
| #endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) | ||||
|  | ||||
		Reference in New Issue
	
	Block a user
	 kma@google.com
					kma@google.com