Second check-in.

Review URL: http://webrtc-codereview.appspot.com/112002

git-svn-id: http://webrtc.googlecode.com/svn/trunk@372 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@google.com 2011-08-16 03:28:28 +00:00
parent 3d48d5b107
commit 8dd7466b52
3 changed files with 472 additions and 321 deletions

View File

@ -28,6 +28,14 @@ FILE *dfile;
FILE *testfile;
#endif
#ifdef _MSC_VER // visual c++
#define ALIGN8_BEG __declspec(align(8))
#define ALIGN8_END
#else // gcc or icc
#define ALIGN8_BEG
#define ALIGN8_END __attribute__((aligned(8)))
#endif
#ifdef AECM_SHORT
// Square root of Hanning window in Q14
@ -43,7 +51,7 @@ const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] =
#else
// Square root of Hanning window in Q14
const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] =
const ALIGN8_BEG WebRtc_Word16 WebRtcAecm_kSqrtHanning[] ALIGN8_END =
{
0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364,
@ -97,12 +105,13 @@ static const WebRtc_Word16 kChannelStored16kHz[PART_LEN1] = {
static const WebRtc_Word16 kNoiseEstQDomain = 15;
static const WebRtc_Word16 kNoiseEstIncCount = 5;
static void ComfortNoise(AecmCore_t * aecm,
static void ComfortNoise(AecmCore_t* aecm,
const WebRtc_UWord16* dfa,
WebRtc_Word16* outReal,
WebRtc_Word16* outImag,
complex16_t* out,
const WebRtc_Word16* lambda);
static WebRtc_Word16 CalcSuppressionGain(AecmCore_t * const aecm);
#ifdef ARM_WINM_LOG
HANDLE logFile = NULL;
#endif
@ -151,10 +160,11 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
return -1;
}
// Init some aecm pointers. 16-byte alignment is only necessary for Neon code currently.
aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 15) & ~ 15);
aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 15) & ~ 15);
aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 15) & ~ 15);
// Init some aecm pointers. 16 and 32 byte alignment is only necessary
// for Neon code currently.
aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 31) & ~ 31);
aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 31) & ~ 31);
aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 31) & ~ 31);
aecm->outBuf = (WebRtc_Word16*) (((uintptr_t)aecm->outBuf_buf + 15) & ~ 15);
aecm->channelStored = (WebRtc_Word16*) (((uintptr_t)
aecm->channelStored_buf + 15) & ~ 15);
@ -345,7 +355,9 @@ int WebRtcAecm_ProcessFrame(AecmCore_t * aecm,
WebRtc_Word16 farBlock[PART_LEN];
WebRtc_Word16 nearNoisyBlock[PART_LEN];
WebRtc_Word16 nearCleanBlock[PART_LEN];
WebRtc_Word16 outBlock[PART_LEN];
WebRtc_Word16 outBlock_buf[PART_LEN + 8]; // Align |outBlock| to a 16-byte boundary.
WebRtc_Word16* outBlock = (WebRtc_Word16*) (((uintptr_t) outBlock_buf + 15) & ~ 15);
WebRtc_Word16 farFrame[FRAME_LEN];
int size = 0;
@ -892,7 +904,7 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
// END: Determine if we should store or reset channel estimate.
}
// WebRtcAecm_CalcSuppressionGain(...)
// CalcSuppressionGain(...)
//
// This function calculates the suppression gain that is used in the Wiener filter.
//
@ -902,7 +914,7 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
// level (Q14).
//
//
WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t* aecm)
static WebRtc_Word16 CalcSuppressionGain(AecmCore_t * const aecm)
{
WebRtc_Word32 tmp32no1;
@ -985,8 +997,7 @@ WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t* aecm)
// return value The Q-domain of current frequency values
//
static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
WebRtc_Word16* freq_signal_real,
WebRtc_Word16* freq_signal_imag,
complex16_t* freq_signal,
WebRtc_UWord16* freq_signal_abs,
WebRtc_UWord32* freq_signal_sum_abs)
{
@ -998,9 +1009,9 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
WebRtc_Word32 tmp32no1;
WebRtc_Word32 tmp32no2;
// In fft_buf, +8 for 16-byte alignment, and +2 to make some loops safe.
WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 8];
WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 15) & ~15);
// In fft_buf, +16 for 32-byte alignment.
WebRtc_Word16 fft_buf[PART_LEN4 + 16];
WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 31) & ~31);
WebRtc_Word16 tmp16no1;
WebRtc_Word16 tmp16no2;
@ -1016,46 +1027,30 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
#endif
WebRtcAecm_PrepareFft(fft, time_signal, time_signal_scaling);
// Fourier transformation of time domain signal.
// The result is scaled with 1/PART_LEN2, that is, the result is in Q(-6)
// for PART_LEN = 32
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
ret = WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
// Take only the first PART_LEN2 samples
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
{
freq_signal_real[i] = fft[j];
// The imaginary part has to switch sign
freq_signal_imag[i] = - fft[j+1];
}
WebRtcAecm_WindowAndFFT(fft, time_signal, freq_signal, time_signal_scaling);
// Extract imaginary and real part, calculate the magnitude for all frequency bins
freq_signal_imag[0] = 0;
freq_signal_imag[PART_LEN] = 0;
freq_signal_real[PART_LEN] = fft[PART_LEN2];
freq_signal[0].imag = 0;
freq_signal[PART_LEN].imag = 0;
freq_signal[PART_LEN].real = fft[PART_LEN2];
freq_signal_abs[0] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
freq_signal_real[0]);
freq_signal[0].real);
freq_signal_abs[PART_LEN] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
freq_signal_real[PART_LEN]);
freq_signal[PART_LEN].real);
(*freq_signal_sum_abs) = (WebRtc_UWord32)(freq_signal_abs[0]) +
(WebRtc_UWord32)(freq_signal_abs[PART_LEN]);
for (i = 1; i < PART_LEN; i++)
{
if (freq_signal_real[i] == 0)
if (freq_signal[i].real == 0)
{
freq_signal_abs[i] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
freq_signal_imag[i]);
freq_signal[i].imag);
}
else if (freq_signal_imag[i] == 0)
else if (freq_signal[i].imag == 0)
{
freq_signal_abs[i] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
freq_signal_real[i]);
freq_signal[i].real);
}
else
{
@ -1066,8 +1061,8 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
// The parameters alpha and beta are stored in Q15
#ifdef AECM_WITH_ABS_APPROX
tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal_real[i]);
tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal_imag[i]);
tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
if(tmp16no1 > tmp16no2)
{
@ -1103,13 +1098,13 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
(WebRtc_UWord16)tmp16no2;
#else
#ifdef WEBRTC_ARCH_ARM_V7A
__asm__("smulbb %0, %1, %2" : "=r"(tmp32no1) : "r"(freq_signal_real[i]),
"r"(freq_signal_real[i]));
__asm__("smlabb %0, %1, %2, %3" :: "r"(tmp32no2), "r"(freq_signal_imag[i]),
"r"(freq_signal_imag[i]), "r"(tmp32no1));
__asm__("smulbb %0, %1, %2" : "=r"(tmp32no1) : "r"(freq_signal[i].real),
"r"(freq_signal[i].real));
__asm__("smlabb %0, %1, %2, %3" :: "r"(tmp32no2), "r"(freq_signal[i].imag),
"r"(freq_signal[i].imag), "r"(tmp32no1));
#else
tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal_real[i]);
tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal_imag[i]);
tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
tmp32no1 = WEBRTC_SPL_MUL_16_16(tmp16no1, tmp16no1);
tmp32no2 = WEBRTC_SPL_MUL_16_16(tmp16no2, tmp16no2);
tmp32no2 = WEBRTC_SPL_ADD_SAT_W32(tmp32no1, tmp32no2);
@ -1125,7 +1120,8 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
return time_signal_scaling;
}
int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
const WebRtc_Word16 * farend,
const WebRtc_Word16 * nearendNoisy,
const WebRtc_Word16 * nearendClean,
WebRtc_Word16 * output)
@ -1140,10 +1136,6 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
WebRtc_Word32 tmp32no1;
// +8 for 32-byte alignment.
WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8];
WebRtc_Word32 *echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31);
WebRtc_UWord16 xfa[PART_LEN1];
WebRtc_UWord16 dfaNoisy[PART_LEN1];
WebRtc_UWord16 dfaClean[PART_LEN1];
@ -1151,11 +1143,18 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
const WebRtc_UWord16* far_spectrum_ptr = NULL;
int outCFFT;
WebRtc_Word16 fft[PART_LEN4];
WebRtc_Word16 dfwReal[PART_LEN1];
WebRtc_Word16 dfwImag[PART_LEN1];
WebRtc_Word16 efwReal[PART_LEN1];
WebRtc_Word16 efwImag[PART_LEN1];
// 32 byte aligned buffers (with +8 or +16).
// TODO (kma): define fft with complex16_t.
WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe.
WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8];
WebRtc_Word32 dfw_buf[PART_LEN1 + 8];
WebRtc_Word32 efw_buf[PART_LEN1 + 8];
WebRtc_Word16* fft = (WebRtc_Word16*) (((uintptr_t) fft_buf + 31) & ~ 31);
WebRtc_Word32* echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31);
complex16_t* dfw = (complex16_t*) (((uintptr_t) dfw_buf + 31) & ~ 31);
complex16_t* efw = (complex16_t*) (((uintptr_t) efw_buf + 31) & ~ 31);
WebRtc_Word16 hnl[PART_LEN1];
WebRtc_Word16 numPosCoef = 0;
WebRtc_Word16 nlpGain = ONE_Q14;
@ -1206,15 +1205,13 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
// Transform far end signal from time domain to frequency domain.
zerosXBuf = TimeToFrequencyDomain(aecm->xBuf,
dfwReal,
dfwImag,
dfw,
xfa,
&xfaSum);
// Transform noisy near end signal from time domain to frequency domain.
zerosDBufNoisy = TimeToFrequencyDomain(aecm->dBufNoisy,
dfwReal,
dfwImag,
dfw,
dfaNoisy,
&dfaNoisySum);
aecm->dfaNoisyQDomainOld = aecm->dfaNoisyQDomain;
@ -1231,8 +1228,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
{
// Transform clean near end signal from time domain to frequency domain.
zerosDBufClean = TimeToFrequencyDomain(aecm->dBufClean,
dfwReal,
dfwImag,
dfw,
dfaClean,
&dfaCleanSum);
aecm->dfaCleanQDomainOld = aecm->dfaCleanQDomain;
@ -1300,7 +1296,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
// This is the channel estimation algorithm.
// It is base on NLMS but has a variable step length, which was calculated above.
WebRtcAecm_UpdateChannel(aecm, far_spectrum_ptr, zerosXBuf, dfaNoisy, mu, echoEst32);
supGain = WebRtcAecm_CalcSuppressionGain(aecm);
supGain = CalcSuppressionGain(aecm);
#ifdef ARM_WINM_LOG_
// measure tick end
@ -1483,9 +1479,9 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
}
// multiply with Wiener coefficients
efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i],
efw[i].real = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real,
hnl[i], 14));
efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i],
efw[i].imag = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag,
hnl[i], 14));
}
}
@ -1494,16 +1490,16 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
// multiply with Wiener coefficients
for (i = 0; i < PART_LEN1; i++)
{
efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i],
efw[i].real = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real,
hnl[i], 14));
efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i],
efw[i].imag = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag,
hnl[i], 14));
}
}
if (aecm->cngMode == AecmTrue)
{
ComfortNoise(aecm, ptrDfaClean, efwReal, efwImag, hnl);
ComfortNoise(aecm, ptrDfaClean, efw, hnl);
}
#ifdef ARM_WINM_LOG_
@ -1516,177 +1512,11 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
QueryPerformanceCounter((LARGE_INTEGER*)&start);
#endif
// Synthesis
for (i = 1; i < PART_LEN; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[j] = efwReal[i];
// mirrored data, even
fft[PART_LEN4 - j] = efwReal[i];
fft[j + 1] = -efwImag[i];
//mirrored data, odd
fft[PART_LEN4 - (j - 1)] = efwImag[i];
}
fft[0] = efwReal[0];
fft[1] = -efwImag[0];
fft[PART_LEN2] = efwReal[PART_LEN];
fft[PART_LEN2 + 1] = -efwImag[PART_LEN];
// inverse FFT, result should be scaled with outCFFT
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
//take only the real values and scale with outCFFT
for (i = 0; i < PART_LEN2; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[i] = fft[j];
}
for (i = 0; i < PART_LEN; i++)
{
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
fft[i],
WebRtcAecm_kSqrtHanning[i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
outCFFT - aecm->dfaCleanQDomain);
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
tmp32no1 + aecm->outBuf[i],
WEBRTC_SPL_WORD16_MIN);
output[i] = fft[i];
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
fft[PART_LEN + i],
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
outCFFT - aecm->dfaCleanQDomain);
aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
WEBRTC_SPL_WORD16_MAX,
tmp32no1,
WEBRTC_SPL_WORD16_MIN);
}
#ifdef ARM_WINM_LOG_
// measure tick end
QueryPerformanceCounter((LARGE_INTEGER*)&end);
diff__ = ((end - start) * 1000) / (freq/1000);
milliseconds = (unsigned int)(diff__ & 0xffffffff);
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
#endif
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
if (nearendClean != NULL)
{
memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
}
WebRtcAecm_InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
return 0;
}
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
// Prepares the complex FFT input buffer from a time-domain block.
// Zeroes |fft| (PART_LEN4 samples), then windows two consecutive PART_LEN
// segments of |time_signal| — each sample left-shifted by
// |time_signal_scaling| bits — with the square-root Hanning window (Q14,
// rounded down by the >>14 in WEBRTC_SPL_MUL_16_16_RSFT) and stores them
// into the even (real) slots of the interleaved |fft| array. The odd
// (imaginary) slots stay zero from the initial memset.
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
int time_signal_scaling)
{
int i, j;
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[i],
14);
// Second half of the block uses the window reversed
// (WebRtcAecm_kSqrtHanning[PART_LEN - i]).
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[PART_LEN + i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
}
// Computes, per frequency bin, the echo estimate from the stored channel
// (written to |echo_est|) while accumulating three scalar energies:
// the sum of the far-end spectrum, the echo energy using the adaptive
// channel, and the echo energy using the stored channel.
// NOTE(review): the three energy outputs are accumulated with +=, not
// reset here — callers presumably zero them first; confirm at call sites.
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
for (i = 0; i < PART_LEN1; i++)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
far_spectrum[i]);
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
}
}
// Copies the adaptive (16-bit) channel into the stored channel and
// recomputes |echo_est| for all PART_LEN1 bins from the newly stored
// channel. The loop is manually unrolled by four over the first PART_LEN
// bins; the trailing statement handles the final bin (PART_LEN1 ==
// PART_LEN + 1, and PART_LEN is a multiple of 4, so |i| == PART_LEN
// after the loop).
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
// During startup we store the channel every block.
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
// Last bin (i == PART_LEN here).
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
}
// Resets the adaptive channel back to the stored channel: copies the
// 16-bit stored channel over the adaptive one, then rebuilds the 32-bit
// adaptive channel by left-shifting each stored value 16 bits (Q16
// representation of the same values). Unrolled by four with a trailing
// statement for the last of the PART_LEN1 bins.
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
{
int i;
// The stored channel has a significantly lower MSE than the adaptive one for
// two consecutive calculations. Reset the adaptive channel.
memcpy(aecm->channelAdapt16, aecm->channelStored,
sizeof(WebRtc_Word16) * PART_LEN1);
// Restore the W32 channel
for (i = 0; i < PART_LEN; i += 4)
{
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
}
// Last bin (i == PART_LEN here).
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
}
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
// Generate comfort noise and add to output signal.
//
@ -1696,10 +1526,9 @@ void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
// \param[in,out] out Output signal (real and imaginary parts, Q[aecm->dfaQDomain]).
// \param[in] lambda Suppression gain with which to scale the noise level (Q14).
//
static void ComfortNoise(AecmCore_t * aecm,
static void ComfortNoise(AecmCore_t* aecm,
const WebRtc_UWord16* dfa,
WebRtc_Word16* outReal,
WebRtc_Word16* outImag,
complex16_t* out,
const WebRtc_Word16* lambda)
{
WebRtc_Word16 i;
@ -1827,22 +1656,22 @@ static void ComfortNoise(AecmCore_t * aecm,
#if (!defined ARM_WINM) && (!defined ARM9E_GCC) && (!defined ANDROID_AECOPT)
for (i = 0; i < PART_LEN1; i++)
{
outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]);
outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]);
out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]);
out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]);
}
#else
for (i = 0; i < PART_LEN1 -1; )
{
outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]);
outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]);
out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]);
out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]);
i++;
outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]);
outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]);
out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]);
out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]);
i++;
}
outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]);
outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]);
out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]);
out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]);
#endif
}
@ -1906,3 +1735,196 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far
sizeof(WebRtc_Word16) * readLen);
aecm->farBufReadPos += readLen;
}
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
// Windows the time-domain block with the square-root Hanning window
// (scaled up by |time_signal_scaling| bits first), runs the forward
// complex FFT in place in the interleaved |fft| work buffer, and copies
// the first PART_LEN2 interleaved samples into |freq_signal| with the
// imaginary part negated (complex conjugate).
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling)
{
int i, j;
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[i],
14);
// Second half of the block uses the reversed window.
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i + PART_LEN] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
// Forward transform in place; bit-reversal reordering first.
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
// Take only the first PART_LEN2 samples
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
{
freq_signal[i].real = fft[j];
// The imaginary part has to switch sign
freq_signal[i].imag = - fft[j+1];
}
}
// Reconstructs a conjugate-symmetric spectrum in |fft| from the half
// spectrum |efw|, runs the inverse complex FFT, windows the real result
// with the square-root Hanning window, overlap-adds with aecm->outBuf
// into |output| (with 16-bit saturation), stores the tail for the next
// block in aecm->outBuf, and finally shifts the time-domain history
// buffers (xBuf, dBufNoisy, and dBufClean when a clean signal is used)
// by one block.
// NOTE(review): the result scaling uses aecm->dfaCleanQDomain even for
// the noisy-only path — matches the original code; confirm intent.
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean)
{
int i, j, outCFFT;
WebRtc_Word32 tmp32no1;
// Synthesis
for (i = 1; i < PART_LEN; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[j] = efw[i].real;
// mirrored data, even
fft[PART_LEN4 - j] = efw[i].real;
fft[j + 1] = -efw[i].imag;
//mirrored data, odd
fft[PART_LEN4 - (j - 1)] = efw[i].imag;
}
// DC and Nyquist bins (imaginary parts negated like the rest).
fft[0] = efw[0].real;
fft[1] = -efw[0].imag;
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
// inverse FFT, result should be scaled with outCFFT
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
//take only the real values and scale with outCFFT
for (i = 0; i < PART_LEN2; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[i] = fft[j];
}
for (i = 0; i < PART_LEN; i++)
{
// Window first half, scale by Q-domain difference, overlap-add
// with the saved tail from the previous block, saturate to 16 bits.
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
fft[i],
WebRtcAecm_kSqrtHanning[i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
outCFFT - aecm->dfaCleanQDomain);
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
tmp32no1 + aecm->outBuf[i],
WEBRTC_SPL_WORD16_MIN);
output[i] = fft[i];
// Window second half with the reversed window and save it as the
// overlap tail for the next block.
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
fft[PART_LEN + i],
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
outCFFT - aecm->dfaCleanQDomain);
aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
WEBRTC_SPL_WORD16_MAX,
tmp32no1,
WEBRTC_SPL_WORD16_MIN);
}
#ifdef ARM_WINM_LOG_
// measure tick end
// NOTE(review): |end|, |start|, |freq|, |diff__|, |milliseconds| and
// |temp| are not declared in this function — presumably file-scope
// when ARM_WINM_LOG_ is defined; verify before enabling.
QueryPerformanceCounter((LARGE_INTEGER*)&end);
diff__ = ((end - start) * 1000) / (freq/1000);
milliseconds = (unsigned int)(diff__ & 0xffffffff);
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
#endif
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
if (nearendClean != NULL)
{
memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
}
}
// Computes, per frequency bin, the echo estimate from the stored channel
// (written to |echo_est|) while accumulating three scalar energies:
// the sum of the far-end spectrum, the echo energy using the adaptive
// channel, and the echo energy using the stored channel.
// NOTE(review): the three energy outputs are accumulated with +=, not
// reset here — callers presumably zero them first; confirm at call sites.
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
for (i = 0; i < PART_LEN1; i++)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
far_spectrum[i]);
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
}
}
// Copies the adaptive (16-bit) channel into the stored channel and
// recomputes |echo_est| for all PART_LEN1 bins from the newly stored
// channel. Unrolled by four over the first PART_LEN bins; the trailing
// statement handles the final bin (PART_LEN1 == PART_LEN + 1).
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
// During startup we store the channel every block.
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
// Last bin (i == PART_LEN here).
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
}
// Resets the adaptive channel back to the stored channel: copies the
// 16-bit stored channel over the adaptive one, then rebuilds the 32-bit
// adaptive channel by left-shifting each stored value 16 bits. Unrolled
// by four with a trailing statement for the last of the PART_LEN1 bins.
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
{
int i;
// The stored channel has a significantly lower MSE than the adaptive one for
// two consecutive calculations. Reset the adaptive channel.
memcpy(aecm->channelAdapt16, aecm->channelStored,
sizeof(WebRtc_Word16) * PART_LEN1);
// Restore the W32 channel
for (i = 0; i < PART_LEN; i += 4)
{
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
}
// Last bin (i == PART_LEN here).
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
}
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))

View File

@ -99,6 +99,11 @@
extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[];
typedef struct {
WebRtc_Word16 real;
WebRtc_Word16 imag;
} complex16_t;
typedef struct
{
int farBufWritePos;
@ -142,9 +147,9 @@ typedef struct
WebRtc_Word16 channelStored_buf[PART_LEN1 + 8];
WebRtc_Word16 channelAdapt16_buf[PART_LEN1 + 8];
WebRtc_Word32 channelAdapt32_buf[PART_LEN1 + 8];
WebRtc_Word16 xBuf_buf[PART_LEN2 + 8]; // farend
WebRtc_Word16 dBufClean_buf[PART_LEN2 + 8]; // nearend
WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 8]; // nearend
WebRtc_Word16 xBuf_buf[PART_LEN2 + 16]; // farend
WebRtc_Word16 dBufClean_buf[PART_LEN2 + 16]; // nearend
WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 16]; // nearend
WebRtc_Word16 outBuf_buf[PART_LEN + 8];
// Pointers to the above buffers
@ -326,9 +331,7 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far
// Some internal functions shared by ARM NEON and generic C code:
//
WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t * aecm);
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echoEst,
WebRtc_UWord32* far_energy,
@ -341,8 +344,15 @@ void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
int time_signal_scaling);
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling);
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean);
#endif

View File

@ -13,14 +13,9 @@
#include <arm_neon.h>
#include <assert.h>
#include <stdlib.h>
#include "aecm_delay_estimator.h"
#include "echo_control_mobile.h"
#include "ring_buffer.h"
#include "typedefs.h"
// Square root of Hanning window in Q14
// Square root of Hanning window in Q14.
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {
16384, 16373, 16354, 16325,
16286, 16237, 16179, 16111,
@ -40,9 +35,172 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8)))
1594, 1196, 798, 399
};
void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
// NEON variant of WindowAndFFT: windows the time-domain block with the
// square-root Hanning window (forward half, and the precomputed reversed
// table for the second half), four samples per iteration, writing the
// windowed values into the real slots of the interleaved |fft| buffer.
// d21 is set to zero once up front and stored via vst2 together with the
// results so the imaginary slots are zeroed without a separate memset.
// After the forward FFT, the first PART_LEN2 samples are copied to
// |freq_signal| with both imaginary lanes negated (conjugation).
// NOTE(review): the :64/:128/:256 alignment hints require the buffers to
// have the corresponding alignment — guaranteed by the 16/32-byte
// aligned buffer setup in the callers; verify if call sites change.
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling)
{
int i, j;
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
__asm__("vmov.i16 d21, #0" ::: "d21");
for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
{
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
/* Window near end */
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
// (time_signal[PART_LEN + i] << time_signal_scaling),
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
}
// Forward transform in place; bit-reversal reordering first.
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
// Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16)
{
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
__asm__("vneg.s16 d22, d22" : : : "q10");
__asm__("vneg.s16 d23, d23" : : : "q11");
__asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&freq_signal[i].real): "q10", "q11");
}
}
// NEON variant of InverseFFTAndWindow: rebuilds the conjugate-symmetric
// spectrum from |efw|, inverse-FFTs in place, windows the real result
// with the square-root Hanning window, overlap-adds with aecm->outBuf
// into |output| (16-bit saturation via vqshrn), saves the windowed tail
// in aecm->outBuf, and shifts the time-domain history buffers by one
// block with 256-bit NEON copies.
// NOTE(review): |tmp32no1| is declared but never used in this NEON
// version — candidate for removal in a follow-up.
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean)
{
int i, j, outCFFT;
WebRtc_Word32 tmp32no1;
// Synthesis
for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
{
// We overwrite two more elements in fft[], but it's ok.
__asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
__asm__("vmov q11, q10" : : : "q10", "q11");
__asm__("vneg.s16 d23, d23" : : : "q11");
__asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11");
__asm__("vrev64.16 q10, q10" : : : "q10");
__asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10");
}
// Nyquist bin, written after the loop (the loop's mirrored stores may
// have clobbered these two slots).
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
// Inverse FFT, result should be scaled with outCFFT.
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
// Take only the real values and scale with outCFFT.
for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16)
{
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
__asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
}
// Broadcast the Q-domain shift (outCFFT - dfaCleanQDomain) to all lanes;
// used as a per-lane variable shift below (vshl handles negative counts
// as right shifts).
int32x4_t tmp32x4_2;
__asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
(outCFFT - aecm->dfaCleanQDomain)));
for (i = 0; i < PART_LEN; i += 4)
{
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
int32x4_t tmp32x4_1;
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// fft[i], WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i]));
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
// outCFFT - aecm->dfaCleanQDomain);
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
// tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
// output[i] = fft[i];
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
__asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
__asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i]));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
// tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
// fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i]));
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
}
// Copy the current block to the old position (outBuf is shifted elsewhere).
for (i = 0; i < PART_LEN; i += 16)
{
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->xBuf[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
}
for (i = 0; i < PART_LEN; i += 16)
{
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufNoisy[i]): "q10");
}
if (nearendClean != NULL) {
for (i = 0; i < PART_LEN; i += 16)
{
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufClean[i]): "q10");
}
}
}
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echoEst,
WebRtc_Word32* echo_est,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
@ -54,29 +212,31 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
register WebRtc_UWord32 echo_energy_adapt_r;
uint32x4_t tmp32x4_0;
__asm__("vmov.i32 q14, #0" : : : "q14"); //far_energy
__asm__("vmov.i32 q8, #0" : : : "q8"); //echo_energy_stored
__asm__("vmov.i32 q9, #0" : : : "q9"); //echo_energy_adapt
__asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy
__asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
__asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
for(i = 0; i < PART_LEN -7; i += 8)
{
//far_energy += (WebRtc_UWord32)(far_spectrum[i]);
// far_energy += (WebRtc_UWord32)(far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
// Get estimated echo energies for adaptive channel and stored channel
//echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
// Get estimated echo energies for adaptive channel and stored channel.
// echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echoEst[i]): "q10", "q11");
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
"q10", "q11");
//echo_energy_stored += (WebRtc_UWord32)echoEst[i];
// echo_energy_stored += (WebRtc_UWord32)echoEst[i];
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
//echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], far_spectrum[i]);
// echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
// aecm->channelAdapt16[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
@ -96,9 +256,9 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
__asm__("vpadd.u32 d16, d16" : : : "q8");
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
// Get estimated echo energies for adaptive channel and stored channel
echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echoEst[i];
// Get estimated echo energies for adaptive channel and stored channel.
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i];
*far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
*echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
aecm->channelAdapt16[i], far_spectrum[i]);
@ -128,7 +288,7 @@ void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
}
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
{
int i;
@ -151,45 +311,4 @@ void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
(WebRtc_Word32)aecm->channelStored[i], 16);
}
// Applies the Q14 square-root-Hanning analysis window to a time-domain
// block and lays the result out in |fft| as interleaved {real, imag}
// pairs with every imaginary part set to zero, i.e. the input format
// expected by the complex FFT. NEON version: 4 samples per iteration.
//
// fft                 - (o) FFT input buffer, written as interleaved
//                           real/imaginary WebRtc_Word16 values
//                           (indices 0..2*PART_LEN-1 and
//                           PART_LEN2..PART_LEN2+2*PART_LEN-1 are written).
// time_signal         - (i) time-domain input, 2 * PART_LEN samples.
// time_signal_scaling - (i) left-shift applied to each input sample
//                           before windowing (vshl shifts left for
//                           non-negative shift counts).
void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
                           const WebRtc_Word16* time_signal,
                           int time_signal_scaling)
{
    int i, j;
    int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);

    // d21 is zeroed once here and never written again in the loop; the
    // vst2.16 stores below interleave it with d20, so it supplies the
    // zero imaginary parts of every output pair.
    __asm__("vmov.i16 d21, #0" ::: "d21");

    // First half: window the first PART_LEN samples with the rising part
    // of the sqrt-Hanning window. Each iteration consumes 4 input samples
    // and produces 8 interleaved output values (hence j advances by 8).
    for(i = 0, j = 0; i < PART_LEN-3; i += 4, j += 8)
    {
        int16x4_t tmp16x4_0;
        int16x4_t tmp16x4_1;
        int32x4_t tmp32x4_0;

        /* Window near end */
        // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
        //        << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
        // The :64 alignment hint requires time_signal[i] to be 8-byte
        // aligned, which holds because i is a multiple of 4 and the base
        // buffers are aligned (see the ALIGN8/16-byte setup in this file).
        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
        tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
        tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
        // Narrow the 32-bit products back to 16 bits, dropping the Q14
        // scaling of the window (truncating shift, matching the scalar
        // WEBRTC_SPL_MUL_16_16_RSFT reference above).
        __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
        // Interleaved store: d20 = windowed samples (real), d21 = zeros
        // (imaginary) -> fft[j], fft[j+1], ... alternate real/zero.
        __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");

        // Second half: window samples PART_LEN..2*PART_LEN-1 with the
        // falling part of the window.
        // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
        //        (time_signal[PART_LEN + i] << time_signal_scaling),
        //        WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[PART_LEN + i]));
        tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
        // kSqrtHanningReversed presumably holds WebRtcAecm_kSqrtHanning in
        // reverse order so the vector load ascends while the scalar
        // reference indexes [PART_LEN - i] -- TODO(review): confirm
        // against its definition (declared outside this chunk).
        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
        tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
        __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
        __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
    }
}
#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)