1st check-in for AECM Neon optimization.
Review URL: http://webrtc-codereview.appspot.com/104001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@359 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
4033e1245d
commit
1959e6fcb5
@ -25,6 +25,11 @@ LOCAL_SRC_FILES := \
|
||||
LOCAL_CFLAGS := \
|
||||
$(MY_WEBRTC_COMMON_DEFS)
|
||||
|
||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||
LOCAL_SRC_FILES += aecm_core_neon.c
|
||||
LOCAL_CFLAGS += $(CFLAGS_NEON)
|
||||
endif
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(LOCAL_PATH)/../interface \
|
||||
$(LOCAL_PATH)/../../../utility \
|
||||
|
@ -31,7 +31,7 @@ FILE *testfile;
|
||||
#ifdef AECM_SHORT
|
||||
|
||||
// Square root of Hanning window in Q14
|
||||
static const WebRtc_Word16 kSqrtHanning[] =
|
||||
const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] =
|
||||
{
|
||||
0, 804, 1606, 2404, 3196, 3981, 4756, 5520,
|
||||
6270, 7005, 7723, 8423, 9102, 9760, 10394, 11003,
|
||||
@ -43,12 +43,15 @@ static const WebRtc_Word16 kSqrtHanning[] =
|
||||
#else
|
||||
|
||||
// Square root of Hanning window in Q14
|
||||
static const WebRtc_Word16 kSqrtHanning[] = {0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
|
||||
3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364,
|
||||
8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335,
|
||||
12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918,
|
||||
15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237,
|
||||
16286, 16325, 16354, 16373, 16384};
|
||||
const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] __attribute__ ((aligned (8))) =
|
||||
{
|
||||
0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
|
||||
3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364,
|
||||
8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335,
|
||||
12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918,
|
||||
15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237,
|
||||
16286, 16325, 16354, 16373, 16384
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@ -98,11 +101,6 @@ static const WebRtc_Word16 kNoiseEstIncCount = 5;
|
||||
HANDLE logFile = NULL;
|
||||
#endif
|
||||
|
||||
static void WebRtcAecm_ComfortNoise(AecmCore_t* const aecm, const WebRtc_UWord16 * const dfa,
|
||||
WebRtc_Word16 * const outReal,
|
||||
WebRtc_Word16 * const outImag,
|
||||
const WebRtc_Word16 * const lambda);
|
||||
|
||||
int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
|
||||
{
|
||||
AecmCore_t *aecm = malloc(sizeof(AecmCore_t));
|
||||
@ -147,6 +145,18 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Init some aecm pointers. 16-byte alignment is only necessary for Neon code currently.
|
||||
aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 15) & ~ 15);
|
||||
aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 15) & ~ 15);
|
||||
aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 15) & ~ 15);
|
||||
aecm->outBuf = (WebRtc_Word16*) (((uintptr_t)aecm->outBuf_buf + 15) & ~ 15);
|
||||
aecm->channelStored = (WebRtc_Word16*) (((uintptr_t)
|
||||
aecm->channelStored_buf + 15) & ~ 15);
|
||||
aecm->channelAdapt16 = (WebRtc_Word16*) (((uintptr_t)
|
||||
aecm->channelAdapt16_buf + 15) & ~ 15);
|
||||
aecm->channelAdapt32 = (WebRtc_Word32*) (((uintptr_t)
|
||||
aecm->channelAdapt32_buf + 31) & ~ 31);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -209,10 +219,10 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
|
||||
WebRtcApm_InitBuffer(aecm->nearCleanFrameBuf);
|
||||
WebRtcApm_InitBuffer(aecm->outFrameBuf);
|
||||
|
||||
memset(aecm->xBuf, 0, sizeof(aecm->xBuf));
|
||||
memset(aecm->dBufClean, 0, sizeof(aecm->dBufClean));
|
||||
memset(aecm->dBufNoisy, 0, sizeof(aecm->dBufNoisy));
|
||||
memset(aecm->outBuf, 0, sizeof(aecm->outBuf));
|
||||
memset(aecm->xBuf_buf, 0, sizeof(aecm->xBuf_buf));
|
||||
memset(aecm->dBufClean_buf, 0, sizeof(aecm->dBufClean_buf));
|
||||
memset(aecm->dBufNoisy_buf, 0, sizeof(aecm->dBufNoisy_buf));
|
||||
memset(aecm->outBuf_buf, 0, sizeof(aecm->outBuf_buf));
|
||||
|
||||
aecm->seed = 666;
|
||||
aecm->totCount = 0;
|
||||
@ -287,6 +297,8 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
|
||||
aecm->supGainErrParamDiffAB = SUPGAIN_ERROR_PARAM_A - SUPGAIN_ERROR_PARAM_B;
|
||||
aecm->supGainErrParamDiffBD = SUPGAIN_ERROR_PARAM_B - SUPGAIN_ERROR_PARAM_D;
|
||||
|
||||
assert(PART_LEN % 16 == 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -481,18 +493,8 @@ void WebRtcAecm_CalcEnergies(AecmCore_t * aecm,
|
||||
aecm->nearLogEnergy[0] = tmp16;
|
||||
// END: Get log of near end energy
|
||||
|
||||
// Get energy for the delayed far end signal and estimated
|
||||
// echo using both stored and adapted channels.
|
||||
for (i = 0; i < PART_LEN1; i++)
|
||||
{
|
||||
// Get estimated echo energies for adaptive channel and stored channel
|
||||
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
tmpFar += (WebRtc_UWord32)(far_spectrum[i]);
|
||||
tmpAdapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
|
||||
far_spectrum[i]);
|
||||
tmpStored += (WebRtc_UWord32)echoEst[i];
|
||||
}
|
||||
WebRtcAecm_CalcLinearEnergies(aecm, far_spectrum, echoEst, &tmpFar, &tmpAdapt, &tmpStored);
|
||||
|
||||
// Shift buffers
|
||||
memmove(aecm->echoAdaptLogEnergy + 1, aecm->echoAdaptLogEnergy,
|
||||
sizeof(WebRtc_Word16) * (MAX_BUF_LEN - 1));
|
||||
@ -814,22 +816,9 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
|
||||
// Determine if we should store or restore the channel
|
||||
if ((aecm->startupState == 0) & (aecm->currentVADValue))
|
||||
{
|
||||
// During startup we store the channel every block.
|
||||
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Recalculate echo estimate
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
|
||||
far_spectrum[i + 1]);
|
||||
echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
|
||||
far_spectrum[i + 2]);
|
||||
echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
|
||||
far_spectrum[i + 3]);
|
||||
}
|
||||
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
// During startup we store the channel every block,
|
||||
// and we recalculate echo estimate
|
||||
WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst);
|
||||
} else
|
||||
{
|
||||
if (aecm->farLogEnergy < aecm->farEnergyMSE)
|
||||
@ -865,43 +854,14 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
|
||||
{
|
||||
// The stored channel has a significantly lower MSE than the adaptive one for
|
||||
// two consecutive calculations. Reset the adaptive channel.
|
||||
memcpy(aecm->channelAdapt16, aecm->channelStored,
|
||||
sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Restore the W32 channel
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
|
||||
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
|
||||
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
|
||||
}
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
|
||||
WebRtcAecm_ResetAdaptiveChannel(aecm);
|
||||
} else if (((MIN_MSE_DIFF * mseStored) > (mseAdapt << MSE_RESOLUTION)) & (mseAdapt
|
||||
< aecm->mseThreshold) & (aecm->mseAdaptOld < aecm->mseThreshold))
|
||||
{
|
||||
// The adaptive channel has a significantly lower MSE than the stored one.
|
||||
// The MSE for the adaptive channel has also been low for two consecutive
|
||||
// calculations. Store the adaptive channel.
|
||||
memcpy(aecm->channelStored, aecm->channelAdapt16,
|
||||
sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Recalculate echo estimate
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
|
||||
far_spectrum[i + 1]);
|
||||
echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
|
||||
far_spectrum[i + 2]);
|
||||
echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
|
||||
far_spectrum[i + 3]);
|
||||
}
|
||||
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst);
|
||||
|
||||
// Update threshold
|
||||
if (aecm->mseThreshold == WEBRTC_SPL_WORD32_MAX)
|
||||
@ -1032,7 +992,9 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
|
||||
WebRtc_Word32 tmp32no1;
|
||||
WebRtc_Word32 tmp32no2;
|
||||
|
||||
WebRtc_Word16 fft[PART_LEN4];
|
||||
// In fft_buf, +8 for 16-byte alignment, and +2 to make some loops safe.
|
||||
WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 8];
|
||||
WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 15) & ~15);
|
||||
|
||||
WebRtc_Word16 tmp16no1;
|
||||
WebRtc_Word16 tmp16no2;
|
||||
@ -1048,23 +1010,7 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
|
||||
time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
|
||||
#endif
|
||||
|
||||
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
|
||||
// FFT of signal
|
||||
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
|
||||
{
|
||||
// Window time domain signal and insert into real part of
|
||||
// transformation array |fft|
|
||||
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
(time_signal[i] << time_signal_scaling),
|
||||
kSqrtHanning[i],
|
||||
14);
|
||||
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
(time_signal[PART_LEN + i] << time_signal_scaling),
|
||||
kSqrtHanning[PART_LEN - i],
|
||||
14);
|
||||
// Inserting zeros in imaginary parts not necessary since we
|
||||
// initialized the array with all zeros
|
||||
}
|
||||
WebRtcAecm_PrepareFft(fft, time_signal, time_signal_scaling);
|
||||
|
||||
// Fourier transformation of time domain signal.
|
||||
// The result is scaled with 1/PART_LEN2, that is, the result is in Q(-6)
|
||||
@ -1187,7 +1133,10 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
|
||||
WebRtc_UWord32 tmpU32;
|
||||
|
||||
WebRtc_Word32 tmp32no1;
|
||||
WebRtc_Word32 echoEst32[PART_LEN1];
|
||||
|
||||
// +8 for 32-byte alignment.
|
||||
WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8];
|
||||
WebRtc_Word32 *echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31);
|
||||
|
||||
WebRtc_UWord16 xfa[PART_LEN1];
|
||||
WebRtc_UWord16 dfaNoisy[PART_LEN1];
|
||||
@ -1540,9 +1489,9 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
|
||||
for (i = 0; i < PART_LEN1; i++)
|
||||
{
|
||||
efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i],
|
||||
hnl[i], 14));
|
||||
hnl[i], 14));
|
||||
efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i],
|
||||
hnl[i], 14));
|
||||
hnl[i], 14));
|
||||
}
|
||||
}
|
||||
|
||||
@ -1595,7 +1544,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
|
||||
{
|
||||
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
fft[i],
|
||||
kSqrtHanning[i],
|
||||
WebRtcAecm_kSqrtHanning[i],
|
||||
14);
|
||||
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
|
||||
outCFFT - aecm->dfaCleanQDomain);
|
||||
@ -1606,7 +1555,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
|
||||
|
||||
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
fft[PART_LEN + i],
|
||||
kSqrtHanning[PART_LEN - i],
|
||||
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
||||
14);
|
||||
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
|
||||
outCFFT - aecm->dfaCleanQDomain);
|
||||
@ -1623,7 +1572,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
|
||||
milliseconds = (unsigned int)(diff__ & 0xffffffff);
|
||||
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
|
||||
#endif
|
||||
// Copy the current block to the old position (outBuf is shifted elsewhere)
|
||||
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
|
||||
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||
if (nearendClean != NULL)
|
||||
@ -1634,6 +1583,105 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
||||
|
||||
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
int time_signal_scaling)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
|
||||
// FFT of signal
|
||||
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
|
||||
{
|
||||
// Window time domain signal and insert into real part of
|
||||
// transformation array |fft|
|
||||
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
(time_signal[i] << time_signal_scaling),
|
||||
WebRtcAecm_kSqrtHanning[i],
|
||||
14);
|
||||
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
(time_signal[PART_LEN + i] << time_signal_scaling),
|
||||
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
||||
14);
|
||||
// Inserting zeros in imaginary parts not necessary since we
|
||||
// initialized the array with all zeros
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored)
|
||||
{
|
||||
int i;
|
||||
|
||||
// Get energy for the delayed far end signal and estimated
|
||||
// echo using both stored and adapted channels.
|
||||
for (i = 0; i < PART_LEN1; i++)
|
||||
{
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
|
||||
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
|
||||
far_spectrum[i]);
|
||||
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est)
|
||||
{
|
||||
int i;
|
||||
|
||||
// During startup we store the channel every block.
|
||||
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Recalculate echo estimate
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
|
||||
far_spectrum[i + 1]);
|
||||
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
|
||||
far_spectrum[i + 2]);
|
||||
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
|
||||
far_spectrum[i + 3]);
|
||||
}
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
}
|
||||
|
||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
|
||||
{
|
||||
int i;
|
||||
|
||||
// The stored channel has a significantly lower MSE than the adaptive one for
|
||||
// two consecutive calculations. Reset the adaptive channel.
|
||||
memcpy(aecm->channelAdapt16, aecm->channelStored,
|
||||
sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Restore the W32 channel
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
|
||||
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
|
||||
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
|
||||
}
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
}
|
||||
|
||||
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
||||
|
||||
|
||||
// Generate comfort noise and add to output signal.
|
||||
//
|
||||
// \param[in] aecm Handle of the AECM instance.
|
||||
@ -1642,11 +1690,11 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
|
||||
// \param[in,out] outImag Imaginary part of the output signal (Q[aecm->dfaQDomain]).
|
||||
// \param[in] lambda Suppression gain with which to scale the noise level (Q14).
|
||||
//
|
||||
static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm,
|
||||
const WebRtc_UWord16 * const dfa,
|
||||
WebRtc_Word16 * const outReal,
|
||||
WebRtc_Word16 * const outImag,
|
||||
const WebRtc_Word16 * const lambda)
|
||||
void WebRtcAecm_ComfortNoise(AecmCore_t * aecm,
|
||||
const WebRtc_UWord16* dfa,
|
||||
WebRtc_Word16* outReal,
|
||||
WebRtc_Word16* outImag,
|
||||
const WebRtc_Word16* lambda)
|
||||
{
|
||||
WebRtc_Word16 i;
|
||||
WebRtc_Word16 tmp16;
|
||||
@ -1792,7 +1840,8 @@ static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm,
|
||||
#endif
|
||||
}
|
||||
|
||||
void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * const farend,
|
||||
void WebRtcAecm_BufferFarFrame(AecmCore_t* const aecm,
|
||||
const WebRtc_Word16* const farend,
|
||||
const int farLen)
|
||||
{
|
||||
int writeLen = farLen, writePos = 0;
|
||||
|
@ -97,6 +97,8 @@
|
||||
#define NLP_COMP_LOW 3277 // 0.2 in Q14
|
||||
#define NLP_COMP_HIGH ONE_Q14 // 1 in Q14
|
||||
|
||||
extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[];
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int farBufWritePos;
|
||||
@ -110,11 +112,6 @@ typedef struct
|
||||
void *nearCleanFrameBuf;
|
||||
void *outFrameBuf;
|
||||
|
||||
WebRtc_Word16 xBuf[PART_LEN2]; // farend
|
||||
WebRtc_Word16 dBufClean[PART_LEN2]; // nearend
|
||||
WebRtc_Word16 dBufNoisy[PART_LEN2]; // nearend
|
||||
WebRtc_Word16 outBuf[PART_LEN];
|
||||
|
||||
WebRtc_Word16 farBuf[FAR_BUF_LEN];
|
||||
|
||||
WebRtc_Word16 mult;
|
||||
@ -139,9 +136,26 @@ typedef struct
|
||||
WebRtc_Word16 echoAdaptLogEnergy[MAX_BUF_LEN];
|
||||
WebRtc_Word16 echoStoredLogEnergy[MAX_BUF_LEN];
|
||||
|
||||
WebRtc_Word16 channelAdapt16[PART_LEN1];
|
||||
WebRtc_Word32 channelAdapt32[PART_LEN1];
|
||||
WebRtc_Word16 channelStored[PART_LEN1];
|
||||
// The extra 16 or 32 bytes in the following buffers are for alignment based Neon code.
|
||||
// It's designed this way since the current GCC compiler can't align a buffer in 16 or 32
|
||||
// byte boundaries properly.
|
||||
WebRtc_Word16 channelStored_buf[PART_LEN1 + 8];
|
||||
WebRtc_Word16 channelAdapt16_buf[PART_LEN1 + 8];
|
||||
WebRtc_Word32 channelAdapt32_buf[PART_LEN1 + 8];
|
||||
WebRtc_Word16 xBuf_buf[PART_LEN2 + 8]; // farend
|
||||
WebRtc_Word16 dBufClean_buf[PART_LEN2 + 8]; // nearend
|
||||
WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 8]; // nearend
|
||||
WebRtc_Word16 outBuf_buf[PART_LEN + 8];
|
||||
|
||||
// Pointers to the above buffers
|
||||
WebRtc_Word16 *channelStored;
|
||||
WebRtc_Word16 *channelAdapt16;
|
||||
WebRtc_Word32 *channelAdapt32;
|
||||
WebRtc_Word16 *xBuf;
|
||||
WebRtc_Word16 *dBufClean;
|
||||
WebRtc_Word16 *dBufNoisy;
|
||||
WebRtc_Word16 *outBuf;
|
||||
|
||||
WebRtc_Word32 echoFilt[PART_LEN1];
|
||||
WebRtc_Word16 nearFilt[PART_LEN1];
|
||||
WebRtc_Word32 noiseEst[PART_LEN1];
|
||||
@ -308,4 +322,27 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co
|
||||
void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend,
|
||||
const int farLen, const int knownDelay);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Some internal functions shared by ARM NEON and generic C code:
|
||||
//
|
||||
|
||||
WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t * aecm);
|
||||
|
||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echoEst,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored);
|
||||
|
||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est);
|
||||
|
||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
|
||||
|
||||
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
int time_signal_scaling);
|
||||
|
||||
#endif
|
||||
|
195
src/modules/audio_processing/aecm/main/source/aecm_core_neon.c
Normal file
195
src/modules/audio_processing/aecm/main/source/aecm_core_neon.c
Normal file
@ -0,0 +1,195 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
|
||||
|
||||
#include "aecm_core.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "aecm_delay_estimator.h"
|
||||
#include "echo_control_mobile.h"
|
||||
#include "ring_buffer.h"
|
||||
#include "typedefs.h"
|
||||
|
||||
// Square root of Hanning window in Q14
|
||||
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {
|
||||
16384, 16373, 16354, 16325,
|
||||
16286, 16237, 16179, 16111,
|
||||
16034, 15947, 15851, 15746,
|
||||
15631, 15506, 15373, 15231,
|
||||
15079, 14918, 14749, 14571,
|
||||
14384, 14189, 13985, 13773,
|
||||
13553, 13325, 13089, 12845,
|
||||
12594, 12335, 12068, 11795,
|
||||
11514, 11227, 10933, 10633,
|
||||
10326, 10013, 9695, 9370,
|
||||
9040, 8705, 8364, 8019,
|
||||
7668, 7313, 6954, 6591,
|
||||
6224, 5853, 5478, 5101,
|
||||
4720, 4337, 3951, 3562,
|
||||
3172, 2780, 2386, 1990,
|
||||
1594, 1196, 798, 399
|
||||
};
|
||||
|
||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echoEst,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored)
|
||||
{
|
||||
int i;
|
||||
|
||||
register WebRtc_UWord32 far_energy_r;
|
||||
register WebRtc_UWord32 echo_energy_stored_r;
|
||||
register WebRtc_UWord32 echo_energy_adapt_r;
|
||||
uint32x4_t tmp32x4_0;
|
||||
|
||||
__asm__("vmov.i32 q14, #0" : : : "q14"); //far_energy
|
||||
__asm__("vmov.i32 q8, #0" : : : "q8"); //echo_energy_stored
|
||||
__asm__("vmov.i32 q9, #0" : : : "q9"); //echo_energy_adapt
|
||||
|
||||
for(i = 0; i < PART_LEN -7; i += 8)
|
||||
{
|
||||
//far_energy += (WebRtc_UWord32)(far_spectrum[i]);
|
||||
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
||||
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
|
||||
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
|
||||
|
||||
// Get estimated echo energies for adaptive channel and stored channel
|
||||
//echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echoEst[i]): "q10", "q11");
|
||||
|
||||
//echo_energy_stored += (WebRtc_UWord32)echoEst[i];
|
||||
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
|
||||
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
|
||||
|
||||
//echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vadd.u32 q9, q10" : : : "q9", "q15");
|
||||
__asm__("vadd.u32 q9, q11" : : : "q9", "q11");
|
||||
}
|
||||
|
||||
__asm__("vadd.u32 d28, d29" : : : "q14");
|
||||
__asm__("vpadd.u32 d28, d28" : : : "q14");
|
||||
__asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
|
||||
|
||||
__asm__("vadd.u32 d18, d19" : : : "q9");
|
||||
__asm__("vpadd.u32 d18, d18" : : : "q9");
|
||||
__asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
|
||||
|
||||
__asm__("vadd.u32 d16, d17" : : : "q8");
|
||||
__asm__("vpadd.u32 d16, d16" : : : "q8");
|
||||
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
|
||||
|
||||
// Get estimated echo energies for adaptive channel and stored channel
|
||||
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echoEst[i];
|
||||
*far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
|
||||
*echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
|
||||
aecm->channelAdapt16[i], far_spectrum[i]);
|
||||
}
|
||||
|
||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est)
|
||||
{
|
||||
int i;
|
||||
|
||||
// During startup we store the channel every block.
|
||||
// Recalculate echo estimate.
|
||||
for(i = 0; i < PART_LEN -7; i += 8)
|
||||
{
|
||||
// aecm->channelStored[i] = acem->channelAdapt16[i];
|
||||
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&echo_est[i]) : "q10", "q11");
|
||||
}
|
||||
aecm->channelStored[i] = aecm->channelAdapt16[i];
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
}
|
||||
|
||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
|
||||
{
|
||||
int i;
|
||||
|
||||
for(i = 0; i < PART_LEN -7; i += 8)
|
||||
{
|
||||
// aecm->channelAdapt16[i] = aecm->channelStored[i];
|
||||
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
|
||||
// aecm->channelStored[i], 16);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : :
|
||||
"r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vst1.16 {d24, d25}, [%0, :128]" : :
|
||||
"r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
|
||||
__asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->channelAdapt32[i]): "q10", "q11");
|
||||
}
|
||||
aecm->channelAdapt16[i] = aecm->channelStored[i];
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
}
|
||||
|
||||
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
int time_signal_scaling)
|
||||
{
|
||||
int i, j;
|
||||
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
|
||||
__asm__("vmov.i16 d21, #0" ::: "d21");
|
||||
|
||||
for(i = 0, j = 0; i < PART_LEN-3; i += 4, j += 8)
|
||||
{
|
||||
int16x4_t tmp16x4_0;
|
||||
int16x4_t tmp16x4_1;
|
||||
int32x4_t tmp32x4_0;
|
||||
|
||||
/* Window near end */
|
||||
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
|
||||
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
|
||||
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
|
||||
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
|
||||
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
|
||||
|
||||
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
|
||||
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
|
||||
|
||||
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
// (time_signal[PART_LEN + i] << time_signal_scaling),
|
||||
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[PART_LEN + i]));
|
||||
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
|
||||
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
|
||||
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
|
||||
|
||||
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
|
||||
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
|
||||
}
|
||||
}
|
||||
|
||||
#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
|
||||
|
Loading…
x
Reference in New Issue
Block a user