Revert 6860 "SSE2 version of SubbandCoherence()"
> SSE2 version of SubbandCoherence() > > The performance gain on a x86 laptop (Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz) > reported by audioproc is ~3.3% > > The output is bit exact. > > R=bjornv@webrtc.org, cd@webrtc.org > > Review URL: https://webrtc-codereview.appspot.com/18779004 > > Patch from Scott LaVarnway <slavarnw@gmail.com>. TBR=bjornv@webrtc.org Review URL: https://webrtc-codereview.appspot.com/16289004 git-svn-id: http://webrtc.googlecode.com/svn/trunk@6861 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
		| @@ -16,7 +16,6 @@ | |||||||
| #include <math.h> | #include <math.h> | ||||||
| #include <string.h>  // memset | #include <string.h>  // memset | ||||||
|  |  | ||||||
| #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" |  | ||||||
| #include "webrtc/modules/audio_processing/aec/aec_common.h" | #include "webrtc/modules/audio_processing/aec/aec_common.h" | ||||||
| #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" | #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" | ||||||
| #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | ||||||
| @@ -420,312 +419,9 @@ static void OverdriveAndSuppressSSE2(AecCore* aec, | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) { |  | ||||||
|   // Horizontal sum of the four lanes of sum, stored to *dst. Step 1: add lanes 2 and 3 onto lanes 0 and 1 (two partial sums). |  | ||||||
|   sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2))); |  | ||||||
|   // Step 2: add lane 1 onto lane 0, so lane 0 holds the sum of all four lanes; only lane 0 is stored. |  | ||||||
|   sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); |  | ||||||
|   _mm_store_ss(dst, sum); |  | ||||||
| } |  | ||||||
| static int PartitionDelay(const AecCore* aec) { |  | ||||||
|   // Sums wfBuf[0]^2 + wfBuf[1]^2 over the PART_LEN1 bins of each of the aec->num_partitions filter partitions and |  | ||||||
|   // returns the index of the partition with the largest sum (0 when no sum is greater than zero). |  | ||||||
|   // TODO(bjornv): Spread computational cost by computing one partition per |  | ||||||
|   // block? |  | ||||||
|   float wfEnMax = 0; |  | ||||||
|   int i; |  | ||||||
|   int delay = 0; |  | ||||||
|  |  | ||||||
|   for (i = 0; i < aec->num_partitions; i++) { |  | ||||||
|     int j; |  | ||||||
|     int pos = i * PART_LEN1; |  | ||||||
|     float wfEn = 0; |  | ||||||
|     __m128 vec_wfEn = _mm_set1_ps(0.0f); |  | ||||||
|     // Vectorized part: accumulates four bins per iteration into four partial sums (unaligned loads). |  | ||||||
|     for (j = 0; j + 3 < PART_LEN1; j += 4) { |  | ||||||
|       const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); |  | ||||||
|       const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); |  | ||||||
|       vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0)); |  | ||||||
|       vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1)); |  | ||||||
|     } |  | ||||||
|     _mm_add_ps_4x1(vec_wfEn, &wfEn); |  | ||||||
|  |  | ||||||
|     // Scalar tail: handles the bins left over when PART_LEN1 is not a multiple of four. |  | ||||||
|     for (; j < PART_LEN1; j++) { |  | ||||||
|       wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] + |  | ||||||
|               aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j]; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     if (wfEn > wfEnMax) { |  | ||||||
|       wfEnMax = wfEn; |  | ||||||
|       delay = i; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   return delay; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Updates the following smoothed Power Spectral Densities (PSD): |  | ||||||
| //  - sd  : near-end |  | ||||||
| //  - se  : residual echo |  | ||||||
| //  - sx  : far-end |  | ||||||
| //  - sde : cross-PSD of near-end and residual echo |  | ||||||
| //  - sxd : cross-PSD of near-end and far-end |  | ||||||
| // |  | ||||||
| // In addition to updating the PSDs, this function determines the filter |  | ||||||
| // divergence state and acts on it (see the end of the function body). |  | ||||||
| static void SmoothedPSD(AecCore* aec, |  | ||||||
|                         float efw[2][PART_LEN1], |  | ||||||
|                         float dfw[2][PART_LEN1], |  | ||||||
|                         float xfw[2][PART_LEN1]) { |  | ||||||
|   // Power estimate smoothing coefficients: ptrGCoh[0] weights the previous estimate, ptrGCoh[1] weights the new value. |  | ||||||
|   const float* ptrGCoh = aec->extended_filter_enabled |  | ||||||
|       ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] |  | ||||||
|       : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; |  | ||||||
|   int i; |  | ||||||
|   float sdSum = 0, seSum = 0; |  | ||||||
|   const __m128 vec_15 =  _mm_set1_ps(WebRtcAec_kMinFarendPSD); |  | ||||||
|   const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); |  | ||||||
|   const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); |  | ||||||
|   __m128 vec_sdSum = _mm_set1_ps(0.0f); |  | ||||||
|   __m128 vec_seSum = _mm_set1_ps(0.0f); |  | ||||||
|  |  | ||||||
|   for (i = 0; i + 3 < PART_LEN1; i += 4) { |  | ||||||
|     const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); |  | ||||||
|     const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); |  | ||||||
|     const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); |  | ||||||
|     const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); |  | ||||||
|     const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); |  | ||||||
|     const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); |  | ||||||
|     __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); |  | ||||||
|     __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); |  | ||||||
|     __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); |  | ||||||
|     __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); |  | ||||||
|     __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); |  | ||||||
|     __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); |  | ||||||
|     vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); |  | ||||||
|     vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); |  | ||||||
|     vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); |  | ||||||
|     vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); |  | ||||||
|     vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); |  | ||||||
|     vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); |  | ||||||
|     vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); |  | ||||||
|     _mm_storeu_ps(&aec->sd[i], vec_sd); |  | ||||||
|     _mm_storeu_ps(&aec->se[i], vec_se); |  | ||||||
|     _mm_storeu_ps(&aec->sx[i], vec_sx); |  | ||||||
|  |  | ||||||
|     { |  | ||||||
|       const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); |  | ||||||
|       const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); |  | ||||||
|       __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, |  | ||||||
|                                     _MM_SHUFFLE(2, 0, 2, 0)); |  | ||||||
|       __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, |  | ||||||
|                                     _MM_SHUFFLE(3, 1, 3, 1)); |  | ||||||
|       __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); |  | ||||||
|       __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); |  | ||||||
|       vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |  | ||||||
|       vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |  | ||||||
|       vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011, |  | ||||||
|                                   _mm_mul_ps(vec_dfw1, vec_efw1)); |  | ||||||
|       vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110, |  | ||||||
|                                   _mm_mul_ps(vec_dfw1, vec_efw0)); |  | ||||||
|       vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); |  | ||||||
|       vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); |  | ||||||
|       _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |  | ||||||
|       _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     { |  | ||||||
|       const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); |  | ||||||
|       const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); |  | ||||||
|       __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, |  | ||||||
|                                     _MM_SHUFFLE(2, 0, 2, 0)); |  | ||||||
|       __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, |  | ||||||
|                                     _MM_SHUFFLE(3, 1, 3, 1)); |  | ||||||
|       __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); |  | ||||||
|       __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); |  | ||||||
|       vec_a = _mm_mul_ps(vec_a, vec_GCoh0); |  | ||||||
|       vec_b = _mm_mul_ps(vec_b, vec_GCoh0); |  | ||||||
|       vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011, |  | ||||||
|                                   _mm_mul_ps(vec_dfw1, vec_xfw1)); |  | ||||||
|       vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110, |  | ||||||
|                                   _mm_mul_ps(vec_dfw1, vec_xfw0)); |  | ||||||
|       vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); |  | ||||||
|       vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); |  | ||||||
|       _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); |  | ||||||
|       _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); |  | ||||||
|     vec_seSum = _mm_add_ps(vec_seSum, vec_se); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   _mm_add_ps_4x1(vec_sdSum, &sdSum); |  | ||||||
|   _mm_add_ps_4x1(vec_seSum, &seSum); |  | ||||||
|  |  | ||||||
|   for (; i < PART_LEN1; i++) { |  | ||||||
|     aec->sd[i] = ptrGCoh[0] * aec->sd[i] + |  | ||||||
|                  ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); |  | ||||||
|     aec->se[i] = ptrGCoh[0] * aec->se[i] + |  | ||||||
|                  ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); |  | ||||||
|     // We threshold here to protect against the ill-effects of a zero farend. |  | ||||||
|     // The threshold is not arbitrarily chosen, but balances protection and |  | ||||||
|     // adverse interaction with the algorithm's tuning. |  | ||||||
|     // TODO(bjornv): investigate further why this is so sensitive. |  | ||||||
|     aec->sx[i] = |  | ||||||
|         ptrGCoh[0] * aec->sx[i] + |  | ||||||
|         ptrGCoh[1] * WEBRTC_SPL_MAX( |  | ||||||
|             xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], |  | ||||||
|             WebRtcAec_kMinFarendPSD); |  | ||||||
|  |  | ||||||
|     aec->sde[i][0] = |  | ||||||
|         ptrGCoh[0] * aec->sde[i][0] + |  | ||||||
|         ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); |  | ||||||
|     aec->sde[i][1] = |  | ||||||
|         ptrGCoh[0] * aec->sde[i][1] + |  | ||||||
|         ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); |  | ||||||
|  |  | ||||||
|     aec->sxd[i][0] = |  | ||||||
|         ptrGCoh[0] * aec->sxd[i][0] + |  | ||||||
|         ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); |  | ||||||
|     aec->sxd[i][1] = |  | ||||||
|         ptrGCoh[0] * aec->sxd[i][1] + |  | ||||||
|         ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); |  | ||||||
|  |  | ||||||
|     sdSum += aec->sd[i]; |  | ||||||
|     seSum += aec->se[i]; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Divergent filter safeguard: flag divergence when seSum (scaled by 1.05 if already flagged) exceeds sdSum. |  | ||||||
|   aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; |  | ||||||
|  |  | ||||||
|   if (aec->divergeState) |  | ||||||
|     memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1); |  | ||||||
|  |  | ||||||
|   // Reset (zero wfBuf) if error is significantly larger than nearend (13 dB); skipped when the extended filter is enabled. |  | ||||||
|   if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum)) |  | ||||||
|     memset(aec->wfBuf, 0, sizeof(aec->wfBuf)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Windows time domain data for the fft: the first PART_LEN samples of x are multiplied by WebRtcAec_sqrtHanning in forward order, the next PART_LEN samples by the same table read backwards from index PART_LEN. |  | ||||||
| __inline static void WindowData(float* x_windowed, const float* x) { |  | ||||||
|   int i; |  | ||||||
|   for (i = 0; i < PART_LEN; i += 4) { |  | ||||||
|     const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]); |  | ||||||
|     const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]); |  | ||||||
|     const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]); |  | ||||||
|     // Load the four table entries ending at index PART_LEN - i: A B C D |  | ||||||
|     __m128 vec_sqrtHanning_rev = |  | ||||||
|         _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]); |  | ||||||
|     // Reverse the lane order: D C B A |  | ||||||
|     vec_sqrtHanning_rev = |  | ||||||
|         _mm_shuffle_ps(vec_sqrtHanning_rev, vec_sqrtHanning_rev, |  | ||||||
|                        _MM_SHUFFLE(0, 1, 2, 3)); |  | ||||||
|     _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning)); |  | ||||||
|     _mm_storeu_ps(&x_windowed[PART_LEN + i], |  | ||||||
|                   _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev)); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Puts fft output data into a complex valued array: even-indexed values of data go to data_complex[0], odd-indexed values to data_complex[1]. |  | ||||||
| __inline static void StoreAsComplex(const float* data, |  | ||||||
|                                     float data_complex[2][PART_LEN1]) { |  | ||||||
|   int i; |  | ||||||
|   for (i = 0; i < PART_LEN; i += 4) { |  | ||||||
|     const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]); |  | ||||||
|     const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]); |  | ||||||
|     const __m128 vec_a = _mm_shuffle_ps(vec_fft0, vec_fft4, |  | ||||||
|                                         _MM_SHUFFLE(2, 0, 2, 0)); |  | ||||||
|     const __m128 vec_b = _mm_shuffle_ps(vec_fft0, vec_fft4, |  | ||||||
|                                         _MM_SHUFFLE(3, 1, 3, 1)); |  | ||||||
|     _mm_storeu_ps(&data_complex[0][i], vec_a); |  | ||||||
|     _mm_storeu_ps(&data_complex[1][i], vec_b); |  | ||||||
|   } |  | ||||||
|   // Fix beginning/end values: row 1 is zero at bins 0 and PART_LEN; row 0 takes data[0] at bin 0 and data[1] at bin PART_LEN. |  | ||||||
|   data_complex[1][0] = 0; |  | ||||||
|   data_complex[1][PART_LEN] = 0; |  | ||||||
|   data_complex[0][0] = data[0]; |  | ||||||
|   data_complex[0][PART_LEN] = data[1]; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| static void SubbandCoherenceSSE2(AecCore* aec, |  | ||||||
|                                  float efw[2][PART_LEN1], |  | ||||||
|                                  float xfw[2][PART_LEN1], |  | ||||||
|                                  float* fft, |  | ||||||
|                                  float* cohde, |  | ||||||
|                                  float* cohxd) { |  | ||||||
|   float dfw[2][PART_LEN1]; |  | ||||||
|   int i; |  | ||||||
|  |  | ||||||
|   if (aec->delayEstCtr == 0) |  | ||||||
|     aec->delayIdx = PartitionDelay(aec); |  | ||||||
|  |  | ||||||
|   // Use delayed far: copy 2 * PART_LEN1 floats from aec->xfwBuf, offset by aec->delayIdx * PART_LEN1, into xfw. NOTE(review): the pointer step depends on xfwBuf's declared type (not visible here) - confirm it addresses the intended partition. |  | ||||||
|   memcpy(xfw, |  | ||||||
|          aec->xfwBuf + aec->delayIdx * PART_LEN1, |  | ||||||
|          sizeof(xfw[0][0]) * 2 * PART_LEN1); |  | ||||||
|  |  | ||||||
|   // Windowed near fft: window aec->dBuf into fft, transform in place, unpack into dfw. |  | ||||||
|   WindowData(fft, aec->dBuf); |  | ||||||
|   aec_rdft_forward_128(fft); |  | ||||||
|   StoreAsComplex(fft, dfw); |  | ||||||
|  |  | ||||||
|   // Windowed error fft: window aec->eBuf into fft, transform in place, unpack into efw. |  | ||||||
|   WindowData(fft, aec->eBuf); |  | ||||||
|   aec_rdft_forward_128(fft); |  | ||||||
|   StoreAsComplex(fft, efw); |  | ||||||
|  |  | ||||||
|   SmoothedPSD(aec, efw, dfw, xfw); |  | ||||||
|  |  | ||||||
|   { |  | ||||||
|     const __m128 vec_1eminus10 =  _mm_set1_ps(1e-10f); |  | ||||||
|  |  | ||||||
|     // Subband coherence, four bins per iteration: cohde = (sde[0]^2 + sde[1]^2) / (sd * se + 1e-10), cohxd = (sxd[0]^2 + sxd[1]^2) / (sd * sx + 1e-10). |  | ||||||
|     for (i = 0; i + 3 < PART_LEN1; i += 4) { |  | ||||||
|       const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); |  | ||||||
|       const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); |  | ||||||
|       const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); |  | ||||||
|       const __m128 vec_sdse = _mm_add_ps(vec_1eminus10, |  | ||||||
|                                          _mm_mul_ps(vec_sd, vec_se)); |  | ||||||
|       const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10, |  | ||||||
|                                          _mm_mul_ps(vec_sd, vec_sx)); |  | ||||||
|       const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); |  | ||||||
|       const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); |  | ||||||
|       const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); |  | ||||||
|       const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); |  | ||||||
|       const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, |  | ||||||
|                                               _MM_SHUFFLE(2, 0, 2, 0)); |  | ||||||
|       const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, |  | ||||||
|                                               _MM_SHUFFLE(3, 1, 3, 1)); |  | ||||||
|       const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, |  | ||||||
|                                               _MM_SHUFFLE(2, 0, 2, 0)); |  | ||||||
|       const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, |  | ||||||
|                                               _MM_SHUFFLE(3, 1, 3, 1)); |  | ||||||
|       __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); |  | ||||||
|       __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); |  | ||||||
|       vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); |  | ||||||
|       vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); |  | ||||||
|       vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); |  | ||||||
|       vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); |  | ||||||
|       _mm_storeu_ps(&cohde[i], vec_cohde); |  | ||||||
|       _mm_storeu_ps(&cohxd[i], vec_cohxd); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Scalar tail: same formulas for the bins left over when PART_LEN1 is not a multiple of four. |  | ||||||
|     for (; i < PART_LEN1; i++) { |  | ||||||
|       cohde[i] = |  | ||||||
|           (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / |  | ||||||
|           (aec->sd[i] * aec->se[i] + 1e-10f); |  | ||||||
|       cohxd[i] = |  | ||||||
|           (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / |  | ||||||
|           (aec->sx[i] * aec->sd[i] + 1e-10f); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Points the WebRtcAec_* function pointers at the SSE2 implementations defined in this file. | // Points the WebRtcAec_* function pointers at the SSE2 implementations defined in this file. | ||||||
| void WebRtcAec_InitAec_SSE2(void) { | void WebRtcAec_InitAec_SSE2(void) { | ||||||
|   WebRtcAec_FilterFar = FilterFarSSE2; |   WebRtcAec_FilterFar = FilterFarSSE2; | ||||||
|   WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |   WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | ||||||
|   WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |   WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | ||||||
|   WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |   WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | ||||||
|   WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |  | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 bjornv@webrtc.org
					bjornv@webrtc.org