Revert 6860 "SSE2 version of SubbandCoherence()"
> SSE2 version of SubbandCoherence() > > The performance gain on an x86 laptop (Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz) > reported by audioproc is ~3.3% > > The output is bit exact. > > R=bjornv@webrtc.org, cd@webrtc.org > > Review URL: https://webrtc-codereview.appspot.com/18779004 > > Patch from Scott LaVarnway <slavarnw@gmail.com>. TBR=bjornv@webrtc.org Review URL: https://webrtc-codereview.appspot.com/16289004 git-svn-id: http://webrtc.googlecode.com/svn/trunk@6861 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
		| @@ -16,7 +16,6 @@ | ||||
| #include <math.h> | ||||
| #include <string.h>  // memset | ||||
|  | ||||
| #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" | ||||
| #include "webrtc/modules/audio_processing/aec/aec_common.h" | ||||
| #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" | ||||
| #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | ||||
| @@ -420,312 +419,9 @@ static void OverdriveAndSuppressSSE2(AecCore* aec, | ||||
|   } | ||||
| } | ||||
|  | ||||
| __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { | ||||
|   // A+B C+D | ||||
|   sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); | ||||
|   // A+B+C+D A+B+C+D | ||||
|   sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); | ||||
|   _mm_store_ss(dst, sum); | ||||
| } | ||||
| static int PartitionDelay(const AecCore* aec) { | ||||
|   // Measures the energy in each filter partition and returns the partition with | ||||
|   // highest energy. | ||||
|   // TODO(bjornv): Spread computational cost by computing one partition per | ||||
|   // block? | ||||
|   float wfEnMax = 0; | ||||
|   int i; | ||||
|   int delay = 0; | ||||
|  | ||||
|   for (i = 0; i < aec->num_partitions; i++) { | ||||
|     int j; | ||||
|     int pos = i * PART_LEN1; | ||||
|     float wfEn = 0; | ||||
|     __m128 vec_wfEn = _mm_set1_ps(0.0f); | ||||
|     // vectorized code (four at once) | ||||
|     for (j = 0; j + 3 < PART_LEN1; j += 4) { | ||||
|       const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); | ||||
|       const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); | ||||
|       vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0)); | ||||
|       vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1)); | ||||
|     } | ||||
|     _mm_add_ps_4x1(vec_wfEn, &wfEn); | ||||
|  | ||||
|     // scalar code for the remaining items. | ||||
|     for (; j < PART_LEN1; j++) { | ||||
|       wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] + | ||||
|               aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j]; | ||||
|     } | ||||
|  | ||||
|     if (wfEn > wfEnMax) { | ||||
|       wfEnMax = wfEn; | ||||
|       delay = i; | ||||
|     } | ||||
|   } | ||||
|   return delay; | ||||
| } | ||||
|  | ||||
| // Updates the following smoothed  Power Spectral Densities (PSD): | ||||
| //  - sd  : near-end | ||||
| //  - se  : residual echo | ||||
| //  - sx  : far-end | ||||
| //  - sde : cross-PSD of near-end and residual echo | ||||
| //  - sxd : cross-PSD of near-end and far-end | ||||
| // | ||||
| // In addition to updating the PSDs, also the filter diverge state is determined | ||||
| // upon actions are taken. | ||||
| static void SmoothedPSD(AecCore* aec, | ||||
|                         float efw[2][PART_LEN1], | ||||
|                         float dfw[2][PART_LEN1], | ||||
|                         float xfw[2][PART_LEN1]) { | ||||
|   // Power estimate smoothing coefficients. | ||||
|   const float* ptrGCoh = aec->extended_filter_enabled | ||||
|       ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] | ||||
|       : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; | ||||
|   int i; | ||||
|   float sdSum = 0, seSum = 0; | ||||
|   const __m128 vec_15 =  _mm_set1_ps(WebRtcAec_kMinFarendPSD); | ||||
|   const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); | ||||
|   const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); | ||||
|   __m128 vec_sdSum = _mm_set1_ps(0.0f); | ||||
|   __m128 vec_seSum = _mm_set1_ps(0.0f); | ||||
|  | ||||
|   for (i = 0; i + 3 < PART_LEN1; i += 4) { | ||||
|     const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); | ||||
|     const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); | ||||
|     const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); | ||||
|     const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); | ||||
|     const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); | ||||
|     const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); | ||||
|     __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); | ||||
|     __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); | ||||
|     __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); | ||||
|     __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); | ||||
|     __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); | ||||
|     __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); | ||||
|     vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); | ||||
|     vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); | ||||
|     vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); | ||||
|     vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); | ||||
|     vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); | ||||
|     vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); | ||||
|     vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); | ||||
|     _mm_storeu_ps(&aec->sd[i], vec_sd); | ||||
|     _mm_storeu_ps(&aec->se[i], vec_se); | ||||
|     _mm_storeu_ps(&aec->sx[i], vec_sx); | ||||
|  | ||||
|     { | ||||
|       const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); | ||||
|       const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | ||||
|       __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, | ||||
|                                     _MM_SHUFFLE(2, 0, 2, 0)); | ||||
|       __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, | ||||
|                                     _MM_SHUFFLE(3, 1, 3, 1)); | ||||
|       __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); | ||||
|       __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); | ||||
|       vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | ||||
|       vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | ||||
|       vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011, | ||||
|                                   _mm_mul_ps(vec_dfw1, vec_efw1)); | ||||
|       vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110, | ||||
|                                   _mm_mul_ps(vec_dfw1, vec_efw0)); | ||||
|       vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); | ||||
|       vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); | ||||
|       _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | ||||
|       _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | ||||
|     } | ||||
|  | ||||
|     { | ||||
|       const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | ||||
|       const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | ||||
|       __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, | ||||
|                                     _MM_SHUFFLE(2, 0, 2, 0)); | ||||
|       __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, | ||||
|                                     _MM_SHUFFLE(3, 1, 3, 1)); | ||||
|       __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); | ||||
|       __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); | ||||
|       vec_a = _mm_mul_ps(vec_a, vec_GCoh0); | ||||
|       vec_b = _mm_mul_ps(vec_b, vec_GCoh0); | ||||
|       vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011, | ||||
|                                   _mm_mul_ps(vec_dfw1, vec_xfw1)); | ||||
|       vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110, | ||||
|                                   _mm_mul_ps(vec_dfw1, vec_xfw0)); | ||||
|       vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); | ||||
|       vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); | ||||
|       _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); | ||||
|       _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); | ||||
|     } | ||||
|  | ||||
|     vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); | ||||
|     vec_seSum = _mm_add_ps(vec_seSum, vec_se); | ||||
|   } | ||||
|  | ||||
|   _mm_add_ps_4x1(vec_sdSum, &sdSum); | ||||
|   _mm_add_ps_4x1(vec_seSum, &seSum); | ||||
|  | ||||
|   for (; i < PART_LEN1; i++) { | ||||
|     aec->sd[i] = ptrGCoh[0] * aec->sd[i] + | ||||
|                  ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); | ||||
|     aec->se[i] = ptrGCoh[0] * aec->se[i] + | ||||
|                  ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); | ||||
|     // We threshold here to protect against the ill-effects of a zero farend. | ||||
|     // The threshold is not arbitrarily chosen, but balances protection and | ||||
|     // adverse interaction with the algorithm's tuning. | ||||
|     // TODO(bjornv): investigate further why this is so sensitive. | ||||
|     aec->sx[i] = | ||||
|         ptrGCoh[0] * aec->sx[i] + | ||||
|         ptrGCoh[1] * WEBRTC_SPL_MAX( | ||||
|             xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], | ||||
|             WebRtcAec_kMinFarendPSD); | ||||
|  | ||||
|     aec->sde[i][0] = | ||||
|         ptrGCoh[0] * aec->sde[i][0] + | ||||
|         ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); | ||||
|     aec->sde[i][1] = | ||||
|         ptrGCoh[0] * aec->sde[i][1] + | ||||
|         ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); | ||||
|  | ||||
|     aec->sxd[i][0] = | ||||
|         ptrGCoh[0] * aec->sxd[i][0] + | ||||
|         ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); | ||||
|     aec->sxd[i][1] = | ||||
|         ptrGCoh[0] * aec->sxd[i][1] + | ||||
|         ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); | ||||
|  | ||||
|     sdSum += aec->sd[i]; | ||||
|     seSum += aec->se[i]; | ||||
|   } | ||||
|  | ||||
|   // Divergent filter safeguard. | ||||
|   aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; | ||||
|  | ||||
|   if (aec->divergeState) | ||||
|     memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1); | ||||
|  | ||||
|   // Reset if error is significantly larger than nearend (13 dB). | ||||
|   if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum)) | ||||
|     memset(aec->wfBuf, 0, sizeof(aec->wfBuf)); | ||||
| } | ||||
|  | ||||
| // Window time domain data to be used by the fft. | ||||
| __inline static void WindowData(float* x_windowed, const float* x) { | ||||
|   int i; | ||||
|   for (i = 0; i < PART_LEN; i += 4) { | ||||
|     const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); | ||||
|     const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); | ||||
|     const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); | ||||
|     // A B C D | ||||
|     __m128 vec_sqrtHanning_rev = | ||||
|         _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); | ||||
|     // D C B A | ||||
|     vec_sqrtHanning_rev = | ||||
|         _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, | ||||
|                        _MM_SHUFFLE(0, 1, 2, 3)); | ||||
|     _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); | ||||
|     _mm_storeu_ps(&x_windowed[PART_LEN + i], | ||||
|                   _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Puts fft output data into a complex valued array. | ||||
| __inline static void StoreAsComplex(const float* data, | ||||
|                                     float data_complex[2][PART_LEN1]) { | ||||
|   int i; | ||||
|   for (i = 0; i < PART_LEN; i += 4) { | ||||
|     const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); | ||||
|     const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); | ||||
|     const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, | ||||
|                                         _MM_SHUFFLE(2, 0, 2, 0)); | ||||
|     const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, | ||||
|                                         _MM_SHUFFLE(3, 1, 3, 1)); | ||||
|     _mm_storeu_ps(&data_complex[0][i], vec_a); | ||||
|     _mm_storeu_ps(&data_complex[1][i], vec_b); | ||||
|   } | ||||
|   // fix beginning/end values | ||||
|   data_complex[1][0] = 0; | ||||
|   data_complex[1][PART_LEN] = 0; | ||||
|   data_complex[0][0] = data[0]; | ||||
|   data_complex[0][PART_LEN] = data[1]; | ||||
| } | ||||
|  | ||||
| static void SubbandCoherenceSSE2(AecCore* aec, | ||||
|                                  float efw[2][PART_LEN1], | ||||
|                                  float xfw[2][PART_LEN1], | ||||
|                                  float* fft, | ||||
|                                  float* cohde, | ||||
|                                  float* cohxd) { | ||||
|   float dfw[2][PART_LEN1]; | ||||
|   int i; | ||||
|  | ||||
|   if (aec->delayEstCtr == 0) | ||||
|     aec->delayIdx = PartitionDelay(aec); | ||||
|  | ||||
|   // Use delayed far. | ||||
|   memcpy(xfw, | ||||
|          aec->xfwBuf + aec->delayIdx * PART_LEN1, | ||||
|          sizeof(xfw[0][0]) * 2 * PART_LEN1); | ||||
|  | ||||
|   // Windowed near fft | ||||
|   WindowData(fft, aec->dBuf); | ||||
|   aec_rdft_forward_128(fft); | ||||
|   StoreAsComplex(fft, dfw); | ||||
|  | ||||
|   // Windowed error fft | ||||
|   WindowData(fft, aec->eBuf); | ||||
|   aec_rdft_forward_128(fft); | ||||
|   StoreAsComplex(fft, efw); | ||||
|  | ||||
|   SmoothedPSD(aec, efw, dfw, xfw); | ||||
|  | ||||
|   { | ||||
|     const __m128 vec_1eminus10 =  _mm_set1_ps(1e-10f); | ||||
|  | ||||
|     // Subband coherence | ||||
|     for (i = 0; i + 3 < PART_LEN1; i += 4) { | ||||
|       const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); | ||||
|       const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); | ||||
|       const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); | ||||
|       const __m128 vec_sdse = _mm_add_ps(vec_1eminus10, | ||||
|                                          _mm_mul_ps(vec_sd, vec_se)); | ||||
|       const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10, | ||||
|                                          _mm_mul_ps(vec_sd, vec_sx)); | ||||
|       const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); | ||||
|       const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); | ||||
|       const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); | ||||
|       const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); | ||||
|       const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, | ||||
|                                               _MM_SHUFFLE(2, 0, 2, 0)); | ||||
|       const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, | ||||
|                                               _MM_SHUFFLE(3, 1, 3, 1)); | ||||
|       const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, | ||||
|                                               _MM_SHUFFLE(2, 0, 2, 0)); | ||||
|       const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, | ||||
|                                               _MM_SHUFFLE(3, 1, 3, 1)); | ||||
|       __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); | ||||
|       __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); | ||||
|       vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); | ||||
|       vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); | ||||
|       vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); | ||||
|       vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); | ||||
|       _mm_storeu_ps(&cohde[i], vec_cohde); | ||||
|       _mm_storeu_ps(&cohxd[i], vec_cohxd); | ||||
|     } | ||||
|  | ||||
|     // scalar code for the remaining items. | ||||
|     for (; i < PART_LEN1; i++) { | ||||
|       cohde[i] = | ||||
|           (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / | ||||
|           (aec->sd[i] * aec->se[i] + 1e-10f); | ||||
|       cohxd[i] = | ||||
|           (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / | ||||
|           (aec->sx[i] * aec->sd[i] + 1e-10f); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| void WebRtcAec_InitAec_SSE2(void) { | ||||
|   WebRtcAec_FilterFar = FilterFarSSE2; | ||||
|   WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | ||||
|   WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | ||||
|   WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | ||||
|   WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 bjornv@webrtc.org
					bjornv@webrtc.org