diff --git a/webrtc/modules/audio_processing/aec/aec_core_sse2.c b/webrtc/modules/audio_processing/aec/aec_core_sse2.c index b1bffcbb9..4d9b4efe3 100644 --- a/webrtc/modules/audio_processing/aec/aec_core_sse2.c +++ b/webrtc/modules/audio_processing/aec/aec_core_sse2.c @@ -16,7 +16,6 @@ #include #include // memset -#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" #include "webrtc/modules/audio_processing/aec/aec_common.h" #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" #include "webrtc/modules/audio_processing/aec/aec_rdft.h" @@ -420,312 +419,9 @@ static void OverdriveAndSuppressSSE2(AecCore* aec, } } -__inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { - // A+B C+D - sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); - // A+B+C+D A+B+C+D - sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); - _mm_store_ss(dst, sum); -} -static int PartitionDelay(const AecCore* aec) { - // Measures the energy in each filter partition and returns the partition with - // highest energy. - // TODO(bjornv): Spread computational cost by computing one partition per - // block? - float wfEnMax = 0; - int i; - int delay = 0; - - for (i = 0; i < aec->num_partitions; i++) { - int j; - int pos = i * PART_LEN1; - float wfEn = 0; - __m128 vec_wfEn = _mm_set1_ps(0.0f); - // vectorized code (four at once) - for (j = 0; j + 3 < PART_LEN1; j += 4) { - const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); - const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); - vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0)); - vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1)); - } - _mm_add_ps_4x1(vec_wfEn, &wfEn); - - // scalar code for the remaining items. - for (; j < PART_LEN1; j++) { - wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] + - aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j]; - } - - if (wfEn > wfEnMax) { - wfEnMax = wfEn; - delay = i; - } - } - return delay; -} - -// Updates the following smoothed Power Spectral Densities (PSD): -// - sd : near-end -// - se : residual echo -// - sx : far-end -// - sde : cross-PSD of near-end and residual echo -// - sxd : cross-PSD of near-end and far-end -// -// In addition to updating the PSDs, also the filter diverge state is determined -// upon actions are taken. -static void SmoothedPSD(AecCore* aec, - float efw[2][PART_LEN1], - float dfw[2][PART_LEN1], - float xfw[2][PART_LEN1]) { - // Power estimate smoothing coefficients. - const float* ptrGCoh = aec->extended_filter_enabled - ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] - : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; - int i; - float sdSum = 0, seSum = 0; - const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); - const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); - const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); - __m128 vec_sdSum = _mm_set1_ps(0.0f); - __m128 vec_seSum = _mm_set1_ps(0.0f); - - for (i = 0; i + 3 < PART_LEN1; i += 4) { - const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); - const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); - const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); - const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); - const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); - const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); - __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); - __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); - __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); - __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); - __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); - __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); - vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); - vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); - vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); - vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); - vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); - vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); - vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); - _mm_storeu_ps(&aec->sd[i], vec_sd); - _mm_storeu_ps(&aec->se[i], vec_se); - _mm_storeu_ps(&aec->sx[i], vec_sx); - - { - const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); - const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); - __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, - _MM_SHUFFLE(2, 0, 2, 0)); - __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, - _MM_SHUFFLE(3, 1, 3, 1)); - __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); - __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); - vec_a = _mm_mul_ps(vec_a, vec_GCoh0); - vec_b = _mm_mul_ps(vec_b, vec_GCoh0); - vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011, - _mm_mul_ps(vec_dfw1, vec_efw1)); - vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110, - _mm_mul_ps(vec_dfw1, vec_efw0)); - vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); - vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); - _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); - _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); - } - - { - const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); - const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); - __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, - _MM_SHUFFLE(2, 0, 2, 0)); - __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, - _MM_SHUFFLE(3, 1, 3, 1)); - __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); - __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); - vec_a = _mm_mul_ps(vec_a, vec_GCoh0); - vec_b = _mm_mul_ps(vec_b, vec_GCoh0); - vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011, - _mm_mul_ps(vec_dfw1, vec_xfw1)); - vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110, - _mm_mul_ps(vec_dfw1, vec_xfw0)); - vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); - vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); - _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); - _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); - } - - vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); - vec_seSum = _mm_add_ps(vec_seSum, vec_se); - } - - _mm_add_ps_4x1(vec_sdSum, &sdSum); - _mm_add_ps_4x1(vec_seSum, &seSum); - - for (; i < PART_LEN1; i++) { - aec->sd[i] = ptrGCoh[0] * aec->sd[i] + - ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); - aec->se[i] = ptrGCoh[0] * aec->se[i] + - ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); - // We threshold here to protect against the ill-effects of a zero farend. - // The threshold is not arbitrarily chosen, but balances protection and - // adverse interaction with the algorithm's tuning. - // TODO(bjornv): investigate further why this is so sensitive. - aec->sx[i] = - ptrGCoh[0] * aec->sx[i] + - ptrGCoh[1] * WEBRTC_SPL_MAX( - xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], - WebRtcAec_kMinFarendPSD); - - aec->sde[i][0] = - ptrGCoh[0] * aec->sde[i][0] + - ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); - aec->sde[i][1] = - ptrGCoh[0] * aec->sde[i][1] + - ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); - - aec->sxd[i][0] = - ptrGCoh[0] * aec->sxd[i][0] + - ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); - aec->sxd[i][1] = - ptrGCoh[0] * aec->sxd[i][1] + - ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); - - sdSum += aec->sd[i]; - seSum += aec->se[i]; - } - - // Divergent filter safeguard. - aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; - - if (aec->divergeState) - memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1); - - // Reset if error is significantly larger than nearend (13 dB). - if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum)) - memset(aec->wfBuf, 0, sizeof(aec->wfBuf)); -} - -// Window time domain data to be used by the fft. -__inline static void WindowData(float* x_windowed, const float* x) { - int i; - for (i = 0; i < PART_LEN; i += 4) { - const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); - const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); - const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); - // A B C D - __m128 vec_sqrtHanning_rev = - _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); - // D C B A - vec_sqrtHanning_rev = - _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, - _MM_SHUFFLE(0, 1, 2, 3)); - _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); - _mm_storeu_ps(&x_windowed[PART_LEN + i], - _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); - } -} - -// Puts fft output data into a complex valued array. -__inline static void StoreAsComplex(const float* data, - float data_complex[2][PART_LEN1]) { - int i; - for (i = 0; i < PART_LEN; i += 4) { - const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); - const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); - const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, - _MM_SHUFFLE(2, 0, 2, 0)); - const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, - _MM_SHUFFLE(3, 1, 3, 1)); - _mm_storeu_ps(&data_complex[0][i], vec_a); - _mm_storeu_ps(&data_complex[1][i], vec_b); - } - // fix beginning/end values - data_complex[1][0] = 0; - data_complex[1][PART_LEN] = 0; - data_complex[0][0] = data[0]; - data_complex[0][PART_LEN] = data[1]; -} - -static void SubbandCoherenceSSE2(AecCore* aec, - float efw[2][PART_LEN1], - float xfw[2][PART_LEN1], - float* fft, - float* cohde, - float* cohxd) { - float dfw[2][PART_LEN1]; - int i; - - if (aec->delayEstCtr == 0) - aec->delayIdx = PartitionDelay(aec); - - // Use delayed far. - memcpy(xfw, - aec->xfwBuf + aec->delayIdx * PART_LEN1, - sizeof(xfw[0][0]) * 2 * PART_LEN1); - - // Windowed near fft - WindowData(fft, aec->dBuf); - aec_rdft_forward_128(fft); - StoreAsComplex(fft, dfw); - - // Windowed error fft - WindowData(fft, aec->eBuf); - aec_rdft_forward_128(fft); - StoreAsComplex(fft, efw); - - SmoothedPSD(aec, efw, dfw, xfw); - - { - const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); - - // Subband coherence - for (i = 0; i + 3 < PART_LEN1; i += 4) { - const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); - const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); - const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); - const __m128 vec_sdse = _mm_add_ps(vec_1eminus10, - _mm_mul_ps(vec_sd, vec_se)); - const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10, - _mm_mul_ps(vec_sd, vec_sx)); - const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); - const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); - const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); - const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); - const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, - _MM_SHUFFLE(2, 0, 2, 0)); - const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, - _MM_SHUFFLE(3, 1, 3, 1)); - const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, - _MM_SHUFFLE(2, 0, 2, 0)); - const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, - _MM_SHUFFLE(3, 1, 3, 1)); - __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); - __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); - vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); - vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); - vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); - vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); - _mm_storeu_ps(&cohde[i], vec_cohde); - _mm_storeu_ps(&cohxd[i], vec_cohxd); - } - - // scalar code for the remaining items. - for (; i < PART_LEN1; i++) { - cohde[i] = - (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / - (aec->sd[i] * aec->se[i] + 1e-10f); - cohxd[i] = - (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / - (aec->sx[i] * aec->sd[i] + 1e-10f); - } - } -} - void WebRtcAec_InitAec_SSE2(void) { WebRtcAec_FilterFar = FilterFarSSE2; WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; - WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; }