1st check-in for AECM Neon optimization.

Review URL: http://webrtc-codereview.appspot.com/104001

git-svn-id: http://webrtc.googlecode.com/svn/trunk@359 4adac7df-926f-26a2-2b94-8c16560cd09d
kma@google.com 2011-08-13 06:33:38 +00:00
parent 4033e1245d
commit 1959e6fcb5
4 changed files with 399 additions and 113 deletions

Android.mk

@ -25,6 +25,11 @@ LOCAL_SRC_FILES := \
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS)
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += aecm_core_neon.c
LOCAL_CFLAGS += $(CFLAGS_NEON)
endif
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/../interface \
$(LOCAL_PATH)/../../../utility \

aecm_core.c

@ -31,7 +31,7 @@ FILE *testfile;
#ifdef AECM_SHORT
// Square root of Hanning window in Q14
static const WebRtc_Word16 kSqrtHanning[] =
const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] =
{
0, 804, 1606, 2404, 3196, 3981, 4756, 5520,
6270, 7005, 7723, 8423, 9102, 9760, 10394, 11003,
@ -43,12 +43,15 @@ static const WebRtc_Word16 kSqrtHanning[] =
#else
// Square root of Hanning window in Q14
static const WebRtc_Word16 kSqrtHanning[] = {0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364,
8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335,
12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918,
15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237,
16286, 16325, 16354, 16373, 16384};
const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] __attribute__ ((aligned (8))) =
{
0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364,
8705, 9040, 9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514, 11795, 12068, 12335,
12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189, 14384, 14571, 14749, 14918,
15079, 15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034, 16111, 16179, 16237,
16286, 16325, 16354, 16373, 16384
};
#endif
@ -98,11 +101,6 @@ static const WebRtc_Word16 kNoiseEstIncCount = 5;
HANDLE logFile = NULL;
#endif
static void WebRtcAecm_ComfortNoise(AecmCore_t* const aecm, const WebRtc_UWord16 * const dfa,
WebRtc_Word16 * const outReal,
WebRtc_Word16 * const outImag,
const WebRtc_Word16 * const lambda);
int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
{
AecmCore_t *aecm = malloc(sizeof(AecmCore_t));
@ -147,6 +145,18 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
return -1;
}
// Init some aecm pointers. 16-byte alignment is only necessary for Neon code currently.
aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 15) & ~ 15);
aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 15) & ~ 15);
aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 15) & ~ 15);
aecm->outBuf = (WebRtc_Word16*) (((uintptr_t)aecm->outBuf_buf + 15) & ~ 15);
aecm->channelStored = (WebRtc_Word16*) (((uintptr_t)
aecm->channelStored_buf + 15) & ~ 15);
aecm->channelAdapt16 = (WebRtc_Word16*) (((uintptr_t)
aecm->channelAdapt16_buf + 15) & ~ 15);
aecm->channelAdapt32 = (WebRtc_Word32*) (((uintptr_t)
aecm->channelAdapt32_buf + 31) & ~ 31);
return 0;
}
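The expression (((uintptr_t) p + 15) & ~15) rounds a pointer up to the next 16-byte boundary; the extra 8 WebRtc_Word16 elements (16 bytes) reserved in each *_buf array guarantee the rounded pointer still covers the logical buffer length. A minimal standalone sketch of the idiom (hypothetical helper, not part of this patch):

#include <stdint.h>

/* Hypothetical helper showing the round-up-to-16-bytes idiom used in
 * WebRtcAecm_CreateCore(). Adding 15 and clearing the low 4 bits moves the
 * address to the next multiple of 16 (e.g. 0x1004 -> 0x1010) and leaves an
 * already aligned address unchanged (0x1010 -> 0x1010). */
static int16_t* AlignTo16Bytes(int16_t* p)
{
    return (int16_t*) (((uintptr_t) p + 15) & ~(uintptr_t) 15);
}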
@ -209,10 +219,10 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
WebRtcApm_InitBuffer(aecm->nearCleanFrameBuf);
WebRtcApm_InitBuffer(aecm->outFrameBuf);
memset(aecm->xBuf, 0, sizeof(aecm->xBuf));
memset(aecm->dBufClean, 0, sizeof(aecm->dBufClean));
memset(aecm->dBufNoisy, 0, sizeof(aecm->dBufNoisy));
memset(aecm->outBuf, 0, sizeof(aecm->outBuf));
memset(aecm->xBuf_buf, 0, sizeof(aecm->xBuf_buf));
memset(aecm->dBufClean_buf, 0, sizeof(aecm->dBufClean_buf));
memset(aecm->dBufNoisy_buf, 0, sizeof(aecm->dBufNoisy_buf));
memset(aecm->outBuf_buf, 0, sizeof(aecm->outBuf_buf));
aecm->seed = 666;
aecm->totCount = 0;
@ -287,6 +297,8 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
aecm->supGainErrParamDiffAB = SUPGAIN_ERROR_PARAM_A - SUPGAIN_ERROR_PARAM_B;
aecm->supGainErrParamDiffBD = SUPGAIN_ERROR_PARAM_B - SUPGAIN_ERROR_PARAM_D;
assert(PART_LEN % 16 == 0);
return 0;
}
@ -481,18 +493,8 @@ void WebRtcAecm_CalcEnergies(AecmCore_t * aecm,
aecm->nearLogEnergy[0] = tmp16;
// END: Get log of near end energy
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
for (i = 0; i < PART_LEN1; i++)
{
// Get estimated echo energies for adaptive channel and stored channel
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
tmpFar += (WebRtc_UWord32)(far_spectrum[i]);
tmpAdapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
far_spectrum[i]);
tmpStored += (WebRtc_UWord32)echoEst[i];
}
WebRtcAecm_CalcLinearEnergies(aecm, far_spectrum, echoEst, &tmpFar, &tmpAdapt, &tmpStored);
// Shift buffers
memmove(aecm->echoAdaptLogEnergy + 1, aecm->echoAdaptLogEnergy,
sizeof(WebRtc_Word16) * (MAX_BUF_LEN - 1));
@ -814,22 +816,9 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
// Determine if we should store or restore the channel
if ((aecm->startupState == 0) & (aecm->currentVADValue))
{
// During startup we store the channel every block.
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
// During startup we store the channel every block,
// and recalculate the echo estimate.
WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst);
} else
{
if (aecm->farLogEnergy < aecm->farEnergyMSE)
@ -865,43 +854,14 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
{
// The stored channel has a significantly lower MSE than the adaptive one for
// two consecutive calculations. Reset the adaptive channel.
memcpy(aecm->channelAdapt16, aecm->channelStored,
sizeof(WebRtc_Word16) * PART_LEN1);
// Restore the W32 channel
for (i = 0; i < PART_LEN; i += 4)
{
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
}
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
WebRtcAecm_ResetAdaptiveChannel(aecm);
} else if (((MIN_MSE_DIFF * mseStored) > (mseAdapt << MSE_RESOLUTION)) & (mseAdapt
< aecm->mseThreshold) & (aecm->mseAdaptOld < aecm->mseThreshold))
{
// The adaptive channel has a significantly lower MSE than the stored one.
// The MSE for the adaptive channel has also been low for two consecutive
// calculations. Store the adaptive channel.
memcpy(aecm->channelStored, aecm->channelAdapt16,
sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echoEst[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echoEst[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echoEst[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
WebRtcAecm_StoreAdaptiveChannel(aecm, far_spectrum, echoEst);
// Update threshold
if (aecm->mseThreshold == WEBRTC_SPL_WORD32_MAX)
@ -1032,7 +992,9 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
WebRtc_Word32 tmp32no1;
WebRtc_Word32 tmp32no2;
WebRtc_Word16 fft[PART_LEN4];
// In fft_buf, +8 for 16-byte alignment, and +2 to make some loops safe.
WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 8];
WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 15) & ~15);
WebRtc_Word16 tmp16no1;
WebRtc_Word16 tmp16no2;
@ -1048,23 +1010,7 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
#endif
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
kSqrtHanning[i],
14);
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[PART_LEN + i] << time_signal_scaling),
kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
WebRtcAecm_PrepareFft(fft, time_signal, time_signal_scaling);
// Fourier transformation of time domain signal.
// The result is scaled with 1/PART_LEN2, that is, the result is in Q(-6)
@ -1187,7 +1133,10 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
WebRtc_UWord32 tmpU32;
WebRtc_Word32 tmp32no1;
WebRtc_Word32 echoEst32[PART_LEN1];
// +8 for 32-byte alignment.
WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8];
WebRtc_Word32 *echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31);
WebRtc_UWord16 xfa[PART_LEN1];
WebRtc_UWord16 dfaNoisy[PART_LEN1];
@ -1540,9 +1489,9 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
for (i = 0; i < PART_LEN1; i++)
{
efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i],
hnl[i], 14));
hnl[i], 14));
efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i],
hnl[i], 14));
hnl[i], 14));
}
}
@ -1595,7 +1544,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
{
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
fft[i],
kSqrtHanning[i],
WebRtcAecm_kSqrtHanning[i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
outCFFT - aecm->dfaCleanQDomain);
@ -1606,7 +1555,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
fft[PART_LEN + i],
kSqrtHanning[PART_LEN - i],
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
outCFFT - aecm->dfaCleanQDomain);
@ -1623,7 +1572,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
milliseconds = (unsigned int)(diff__ & 0xffffffff);
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
#endif
// Copy the current block to the old position (outBuf is shifted elsewhere)
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
if (nearendClean != NULL)
@ -1634,6 +1583,105 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
return 0;
}
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
int time_signal_scaling)
{
int i, j;
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[i],
14);
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[PART_LEN + i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
}
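Since the window coefficients are in Q14, the multiply-and-shift keeps each windowed sample in its input Q-domain. A minimal fixed-point sketch (hypothetical helper; it assumes WEBRTC_SPL_MUL_16_16_RSFT(a, b, 14) is a 16x16 multiply followed by an arithmetic right shift of 14):

#include <stdint.h>

/* Hypothetical illustration of the Q14 windowing in WebRtcAecm_PrepareFft().
 * time_signal_scaling is chosen elsewhere (WebRtcSpl_NormW16) so the shifted
 * sample still fits in 16 bits. With the peak coefficient 16384 (1.0 in Q14)
 * the sample passes through unchanged: (1000 * 16384) >> 14 == 1000. */
static int16_t WindowSampleQ14(int16_t sample, int scaling, int16_t win_q14)
{
    int16_t scaled = (int16_t) (sample << scaling);
    return (int16_t) (((int32_t) scaled * win_q14) >> 14);
}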
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
for (i = 0; i < PART_LEN1; i++)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
far_spectrum[i]);
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
}
}
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
// During startup we store the channel every block.
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
}
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
{
int i;
// The stored channel has a significantly lower MSE than the adaptive one for
// two consecutive calculations. Reset the adaptive channel.
memcpy(aecm->channelAdapt16, aecm->channelStored,
sizeof(WebRtc_Word16) * PART_LEN1);
// Restore the W32 channel
for (i = 0; i < PART_LEN; i += 4)
{
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
}
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
}
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
// Generate comfort noise and add to output signal.
//
// \param[in] aecm Handle of the AECM instance.
@ -1642,11 +1690,11 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
// \param[in,out] outImag Imaginary part of the output signal (Q[aecm->dfaQDomain]).
// \param[in] lambda Suppression gain with which to scale the noise level (Q14).
//
static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm,
const WebRtc_UWord16 * const dfa,
WebRtc_Word16 * const outReal,
WebRtc_Word16 * const outImag,
const WebRtc_Word16 * const lambda)
void WebRtcAecm_ComfortNoise(AecmCore_t * aecm,
const WebRtc_UWord16* dfa,
WebRtc_Word16* outReal,
WebRtc_Word16* outImag,
const WebRtc_Word16* lambda)
{
WebRtc_Word16 i;
WebRtc_Word16 tmp16;
@ -1792,7 +1840,8 @@ static void WebRtcAecm_ComfortNoise(AecmCore_t * const aecm,
#endif
}
void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * const farend,
void WebRtcAecm_BufferFarFrame(AecmCore_t* const aecm,
const WebRtc_Word16* const farend,
const int farLen)
{
int writeLen = farLen, writePos = 0;

aecm_core.h

@ -97,6 +97,8 @@
#define NLP_COMP_LOW 3277 // 0.2 in Q14
#define NLP_COMP_HIGH ONE_Q14 // 1 in Q14
extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[];
typedef struct
{
int farBufWritePos;
@ -110,11 +112,6 @@ typedef struct
void *nearCleanFrameBuf;
void *outFrameBuf;
WebRtc_Word16 xBuf[PART_LEN2]; // farend
WebRtc_Word16 dBufClean[PART_LEN2]; // nearend
WebRtc_Word16 dBufNoisy[PART_LEN2]; // nearend
WebRtc_Word16 outBuf[PART_LEN];
WebRtc_Word16 farBuf[FAR_BUF_LEN];
WebRtc_Word16 mult;
@ -139,9 +136,26 @@ typedef struct
WebRtc_Word16 echoAdaptLogEnergy[MAX_BUF_LEN];
WebRtc_Word16 echoStoredLogEnergy[MAX_BUF_LEN];
WebRtc_Word16 channelAdapt16[PART_LEN1];
WebRtc_Word32 channelAdapt32[PART_LEN1];
WebRtc_Word16 channelStored[PART_LEN1];
// The extra 16 or 32 bytes in the following buffers are for the alignment required by the
// Neon code. It's designed this way since the current GCC compiler can't properly align a
// buffer on 16- or 32-byte boundaries.
WebRtc_Word16 channelStored_buf[PART_LEN1 + 8];
WebRtc_Word16 channelAdapt16_buf[PART_LEN1 + 8];
WebRtc_Word32 channelAdapt32_buf[PART_LEN1 + 8];
WebRtc_Word16 xBuf_buf[PART_LEN2 + 8]; // farend
WebRtc_Word16 dBufClean_buf[PART_LEN2 + 8]; // nearend
WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 8]; // nearend
WebRtc_Word16 outBuf_buf[PART_LEN + 8];
// Pointers to the above buffers
WebRtc_Word16 *channelStored;
WebRtc_Word16 *channelAdapt16;
WebRtc_Word32 *channelAdapt32;
WebRtc_Word16 *xBuf;
WebRtc_Word16 *dBufClean;
WebRtc_Word16 *dBufNoisy;
WebRtc_Word16 *outBuf;
WebRtc_Word32 echoFilt[PART_LEN1];
WebRtc_Word16 nearFilt[PART_LEN1];
WebRtc_Word32 noiseEst[PART_LEN1];
@ -308,4 +322,27 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co
void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend,
const int farLen, const int knownDelay);
///////////////////////////////////////////////////////////////////////////////////////////////
// Some internal functions shared by ARM NEON and generic C code:
//
WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t * aecm);
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echoEst,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored);
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est);
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
int time_signal_scaling);
#endif

aecm_core_neon.c (new file)

@ -0,0 +1,195 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
#include "aecm_core.h"
#include <arm_neon.h>
#include <assert.h>
#include <stdlib.h>
#include "aecm_delay_estimator.h"
#include "echo_control_mobile.h"
#include "ring_buffer.h"
#include "typedefs.h"
// Square root of Hanning window in Q14, stored in reverse order so the loop below can
// read it with increasing indices.
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {
16384, 16373, 16354, 16325,
16286, 16237, 16179, 16111,
16034, 15947, 15851, 15746,
15631, 15506, 15373, 15231,
15079, 14918, 14749, 14571,
14384, 14189, 13985, 13773,
13553, 13325, 13089, 12845,
12594, 12335, 12068, 11795,
11514, 11227, 10933, 10633,
10326, 10013, 9695, 9370,
9040, 8705, 8364, 8019,
7668, 7313, 6954, 6591,
6224, 5853, 5478, 5101,
4720, 4337, 3951, 3562,
3172, 2780, 2386, 1990,
1594, 1196, 798, 399
};
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echoEst,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
register WebRtc_UWord32 far_energy_r;
register WebRtc_UWord32 echo_energy_stored_r;
register WebRtc_UWord32 echo_energy_adapt_r;
uint32x4_t tmp32x4_0;
__asm__("vmov.i32 q14, #0" : : : "q14"); //far_energy
__asm__("vmov.i32 q8, #0" : : : "q8"); //echo_energy_stored
__asm__("vmov.i32 q9, #0" : : : "q9"); //echo_energy_adapt
for(i = 0; i < PART_LEN -7; i += 8)
{
//far_energy += (WebRtc_UWord32)(far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
// Get estimated echo energies for adaptive channel and stored channel
//echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echoEst[i]): "q10", "q11");
//echo_energy_stored += (WebRtc_UWord32)echoEst[i];
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
//echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vadd.u32 q9, q10" : : : "q9", "q15");
__asm__("vadd.u32 q9, q11" : : : "q9", "q11");
}
__asm__("vadd.u32 d28, d29" : : : "q14");
__asm__("vpadd.u32 d28, d28" : : : "q14");
__asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
__asm__("vadd.u32 d18, d19" : : : "q9");
__asm__("vpadd.u32 d18, d18" : : : "q9");
__asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
__asm__("vadd.u32 d16, d17" : : : "q8");
__asm__("vpadd.u32 d16, d16" : : : "q8");
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
// Get estimated echo energies for adaptive channel and stored channel
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echoEst[i];
*far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
*echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
aecm->channelAdapt16[i], far_spectrum[i]);
}
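The vadd.u32 / vpadd.u32 / vmov.32 sequence above collapses each 128-bit accumulator into a scalar. The same reduction written with arm_neon.h intrinsics, as a hypothetical helper (not part of this patch):

#include <arm_neon.h>
#include <stdint.h>

/* Sum the four 32-bit lanes of a Neon accumulator: add the two halves,
 * pairwise-add the result, then extract lane 0. */
static uint32_t HorizontalSumU32(uint32x4_t acc)
{
    uint32x2_t sum2 = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
    sum2 = vpadd_u32(sum2, sum2);
    return vget_lane_u32(sum2, 0);
}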
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
// During startup we store the channel every block.
// Recalculate echo estimate.
for(i = 0; i < PART_LEN -7; i += 8)
{
// aecm->channelStored[i] = aecm->channelAdapt16[i];
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&echo_est[i]) : "q10", "q11");
}
aecm->channelStored[i] = aecm->channelAdapt16[i];
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
}
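For comparison, the same loop expressed with arm_neon.h intrinsics rather than inline assembly; a sketch only (not part of this patch), assuming aecm_core.h provides AecmCore_t, PART_LEN and WEBRTC_SPL_MUL_16_U16, and that the channel buffers are the aligned pointers set up in WebRtcAecm_CreateCore():

#include <arm_neon.h>
#include "aecm_core.h"

static void StoreAdaptiveChannelIntrinsics(AecmCore_t* aecm,
                                           const WebRtc_UWord16* far_spectrum,
                                           WebRtc_Word32* echo_est)
{
    int i;
    for (i = 0; i < PART_LEN - 7; i += 8)
    {
        // Copy 8 taps of the adaptive channel into the stored channel.
        uint16x8_t far_v = vld1q_u16((const uint16_t*) &far_spectrum[i]);
        int16x8_t adapt_v = vld1q_s16(&aecm->channelAdapt16[i]);
        vst1q_s16(&aecm->channelStored[i], adapt_v);
        // Recalculate the echo estimate: channelStored[i] * far_spectrum[i].
        uint16x8_t adapt_u = vreinterpretq_u16_s16(adapt_v);
        uint32x4_t est_lo = vmull_u16(vget_low_u16(adapt_u), vget_low_u16(far_v));
        uint32x4_t est_hi = vmull_u16(vget_high_u16(adapt_u), vget_high_u16(far_v));
        vst1q_s32(&echo_est[i], vreinterpretq_s32_u32(est_lo));
        vst1q_s32(&echo_est[i + 4], vreinterpretq_s32_u32(est_hi));
    }
    // Last bin (i == PART_LEN), handled as in the scalar tail above.
    aecm->channelStored[i] = aecm->channelAdapt16[i];
    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
}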
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
{
int i;
for(i = 0; i < PART_LEN -7; i += 8)
{
// aecm->channelAdapt16[i] = aecm->channelStored[i];
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
// aecm->channelStored[i], 16);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelStored[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
__asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->channelAdapt32[i]): "q10", "q11");
}
aecm->channelAdapt16[i] = aecm->channelStored[i];
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
}
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
int time_signal_scaling)
{
int i, j;
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
__asm__("vmov.i16 d21, #0" ::: "d21");
for(i = 0, j = 0; i < PART_LEN-3; i += 4, j += 8)
{
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
/* Window near end */
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
// (time_signal[PART_LEN + i] << time_signal_scaling),
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[PART_LEN + i]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
}
}
#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
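The vst2.16 store in WebRtcAecm_PrepareFft() interleaves the windowed real samples (d20) with the zero vector kept in d21, so the imaginary slots of |fft| are written as zeros in the same instruction. A hypothetical intrinsics rendering of just that store pattern (not part of this patch):

#include <arm_neon.h>
#include <stdint.h>

/* Write 4 real samples to the even indices of a complex int16 buffer and
 * zero the 4 interleaved imaginary slots in a single vst2 store. */
static void StoreRealWithZeroImag(int16_t* complex_out, int16x4_t real4)
{
    int16x4x2_t pair;
    pair.val[0] = real4;          /* real parts -> fft[j], fft[j + 2], ... */
    pair.val[1] = vdup_n_s16(0);  /* imag parts -> fft[j + 1], fft[j + 3], ... */
    vst2_s16(complex_out, pair);
}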