Optimization of "overdrive and suppress":

* float accuracy pow function, vectorized pow approximation, general vectorization.
* 10.2% AEC overall speedup for the straight C path.
* 16.1% AEC overall speedup for the SSE2 path.

Review URL: http://webrtc-codereview.appspot.com/24016

git-svn-id: http://webrtc.googlecode.com/svn/trunk@72 4adac7df-926f-26a2-2b94-8c16560cd09d
2011-06-13 18:56:48 +00:00 · 2011-06-13 18:56:48 +00:00 · 5af7a804ea
commit 5af7a804ea
parent 706b7258f5
3 changed files with 273 additions and 45 deletions
--- a/modules/audio_processing/aec/main/source/aec_core.c
+++ b/modules/audio_processing/aec/main/source/aec_core.c
@ -82,7 +82,7 @@ static const float sqrtHanning[65] = {
 weightCurve = [0 ; 0.3 * sqrt(linspace(0,1,64))' + 0.1];
 fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', weightCurve);
 */
-static const float weightCurve[65] = {
+const float WebRtcAec_weightCurve[65] = {
    0.0000f, 0.1000f, 0.1378f, 0.1535f, 0.1655f, 0.1756f,
    0.1845f, 0.1926f, 0.2000f, 0.2069f, 0.2134f, 0.2195f,
    0.2254f, 0.2309f, 0.2363f, 0.2414f, 0.2464f, 0.2512f,
@ -100,7 +100,7 @@ static const float weightCurve[65] = {
 overDriveCurve = [sqrt(linspace(0,1,65))' + 1];
 fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', overDriveCurve);
 */
-static const float overDriveCurve[65] = {
+const float WebRtcAec_overDriveCurve[65] = {
    1.0000f, 1.1250f, 1.1768f, 1.2165f, 1.2500f, 1.2795f,
    1.3062f, 1.3307f, 1.3536f, 1.3750f, 1.3953f, 1.4146f,
    1.4330f, 1.4507f, 1.4677f, 1.4841f, 1.5000f, 1.5154f,
@ -128,7 +128,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
 static void GetHighbandGain(const float *lambda, float *nlpGainHband);

 // Comfort_noise also computes noise for H band returned in comfortNoiseHband
-static void ComfortNoise(aec_t *aec, complex_t *efw,
+static void ComfortNoise(aec_t *aec, float efw[2][PART_LEN1],
                                  complex_t *comfortNoiseHband,
                                  const float *noisePow, const float *lambda);

@ -314,9 +314,32 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
  }
 }

+static void OverdriveAndSuppress(aec_t *aec, float hNl[PART_LEN1],
+                                 const float hNlFb,
+                                 float efw[2][PART_LEN1]) {  // hNl, efw updated
+  int i;
+  for (i = 0; i < PART_LEN1; i++) {
+    // Weight subbands: a gain above hNlFb is blended toward hNlFb.
+    if (hNl[i] > hNlFb) {
+      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
+          (1 - WebRtcAec_weightCurve[i]) * hNl[i];
+    }
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+
+    // Suppress error signal: scale efw[0] (real) and efw[1] (imaginary) by hNl.
+    efw[0][i] *= hNl[i];
+    efw[1][i] *= hNl[i];
+
+    // Ooura fft returns incorrect sign on imaginary component. It matters here
+    // because we are making an additive change with comfort noise.
+    efw[1][i] *= -1;
+  }
+}
+
 WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
 WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
 WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
+WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;

 int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
 {
@ -444,6 +467,7 @@ int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
    WebRtcAec_FilterFar = FilterFar;
    WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
    WebRtcAec_FilterAdaptation = FilterAdaptation;
+    WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
    if (WebRtc_GetCPUInfo(kSSE2)) {
 #if defined(__SSE2__)
      WebRtcAec_InitAec_SSE2();
@ -753,7 +777,8 @@ static void ProcessBlock(aec_t *aec, const short *farend,

 static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, short *outputH)
 {
-    complex_t dfw[PART_LEN1], efw[PART_LEN1], xfw[PART_LEN1];
+    float efw[2][PART_LEN1], dfw[2][PART_LEN1];
+    complex_t xfw[PART_LEN1];
    complex_t comfortNoiseHband[PART_LEN1];
    float fft[PART_LEN2];
    float scale, dtmp;
@ -841,13 +866,13 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
    }
    rdft(PART_LEN2, 1, fft, ip, wfft);

-    dfw[0][1] = 0;
-    dfw[PART_LEN][1] = 0;
+    dfw[1][0] = 0;
+    dfw[1][PART_LEN] = 0;
    dfw[0][0] = fft[0];
-    dfw[PART_LEN][0] = fft[1];
+    dfw[0][PART_LEN] = fft[1];
    for (i = 1; i < PART_LEN; i++) {
-        dfw[i][0] = fft[2 * i];
-        dfw[i][1] = fft[2 * i + 1];
+        dfw[0][i] = fft[2 * i];
+        dfw[1][i] = fft[2 * i + 1];
    }

    // Windowed error fft
@ -856,21 +881,21 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
        fft[PART_LEN + i] = aec->eBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
    }
    rdft(PART_LEN2, 1, fft, ip, wfft);
-    efw[0][1] = 0;
-    efw[PART_LEN][1] = 0;
+    efw[1][0] = 0;
+    efw[1][PART_LEN] = 0;
    efw[0][0] = fft[0];
-    efw[PART_LEN][0] = fft[1];
+    efw[0][PART_LEN] = fft[1];
    for (i = 1; i < PART_LEN; i++) {
-        efw[i][0] = fft[2 * i];
-        efw[i][1] = fft[2 * i + 1];
+        efw[0][i] = fft[2 * i];
+        efw[1][i] = fft[2 * i + 1];
    }

    // Smoothed PSD
    for (i = 0; i < PART_LEN1; i++) {
        aec->sd[i] = ptrGCoh[0] * aec->sd[i] + ptrGCoh[1] *
-            (dfw[i][0] * dfw[i][0] + dfw[i][1] * dfw[i][1]);
+            (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
        aec->se[i] = ptrGCoh[0] * aec->se[i] + ptrGCoh[1] *
-            (efw[i][0] * efw[i][0] + efw[i][1] * efw[i][1]);
+            (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
        // We threshold here to protect against the ill-effects of a zero farend.
        // The threshold is not arbitrarily chosen, but balances protection and
        // adverse interaction with the algorithm's tuning.
@ -879,14 +904,14 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
            WEBRTC_SPL_MAX(xfw[i][0] * xfw[i][0] + xfw[i][1] * xfw[i][1], 15);

        aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] + ptrGCoh[1] *
-            (dfw[i][0] * efw[i][0] + dfw[i][1] * efw[i][1]);
+            (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
        aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] + ptrGCoh[1] *
-            (dfw[i][0] * efw[i][1] - dfw[i][1] * efw[i][0]);
+            (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

        aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] + ptrGCoh[1] *
-            (dfw[i][0] * xfw[i][0] + dfw[i][1] * xfw[i][1]);
+            (dfw[0][i] * xfw[i][0] + dfw[1][i] * xfw[i][1]);
        aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] + ptrGCoh[1] *
-            (dfw[i][0] * xfw[i][1] - dfw[i][1] * xfw[i][0]);
+            (dfw[0][i] * xfw[i][1] - dfw[1][i] * xfw[i][0]);

        sdSum += aec->sd[i];
        seSum += aec->se[i];
@ -1007,29 +1032,13 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,

    // Smooth the overdrive.
    if (aec->overDrive < aec->overDriveSm) {
-        aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive;
+      aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive;
    }
    else {
-        aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive;
-    }
-
-    for (i = 0; i < PART_LEN1; i++) {
-        // Weight subbands
-        if (hNl[i] > hNlFb) {
-            hNl[i] = weightCurve[i] * hNlFb + (1 - weightCurve[i]) * hNl[i];
-        }
-
-        hNl[i] = (float)pow(hNl[i], aec->overDriveSm * overDriveCurve[i]);
-
-        // Suppress error signal
-        efw[i][0] *= hNl[i];
-        efw[i][1] *= hNl[i];
-
-        // Ooura fft returns incorrect sign on imaginary component.
-        // It matters here because we are making an additive change with comfort noise.
-        efw[i][1] *= -1;
+      aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive;
    }

+    WebRtcAec_OverdriveAndSuppress(aec, hNl, hNlFb, efw);

 #ifdef G167
    if (aec->cnToggle) {
@ -1042,11 +1051,11 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,

    // Inverse error fft.
    fft[0] = efw[0][0];
-    fft[1] = efw[PART_LEN][0];
+    fft[1] = efw[0][PART_LEN];
    for (i = 1; i < PART_LEN; i++) {
-        fft[2*i] = efw[i][0];
+        fft[2*i] = efw[0][i];
        // Sign change required by Ooura fft.
-        fft[2*i + 1] = -efw[i][1];
+        fft[2*i + 1] = -efw[1][i];
    }
    rdft(PART_LEN2, -1, fft, ip, wfft);

@ -1126,7 +1135,7 @@ static void GetHighbandGain(const float *lambda, float *nlpGainHband)
    nlpGainHband[0] /= (float)(PART_LEN1 - 1 - freqAvgIc);
 }

-static void ComfortNoise(aec_t *aec, complex_t *efw,
+static void ComfortNoise(aec_t *aec, float efw[2][PART_LEN1],
    complex_t *comfortNoiseHband, const float *noisePow, const float *lambda)
 {
    int i, num;
@ -1159,8 +1168,8 @@ static void ComfortNoise(aec_t *aec, complex_t *efw,
        // This is the proper weighting to match the background noise power
        tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
        //tmp = 1 - lambda[i];
-        efw[i][0] += tmp * u[i][0];
-        efw[i][1] += tmp * u[i][1];
+        efw[0][i] += tmp * u[i][0];
+        efw[1][i] += tmp * u[i][1];
    }

    // For H band comfort noise
--- a/modules/audio_processing/aec/main/source/aec_core.h
+++ b/modules/audio_processing/aec/main/source/aec_core.h
@ -176,6 +176,9 @@ typedef void (*WebRtcAec_FilterAdaptation_t)
  (aec_t *aec, float *fft, float ef[2][PART_LEN1], int ip[IP_LEN],
   float wfft[W_LEN]);
 extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
+typedef void (*WebRtcAec_OverdriveAndSuppress_t)
+  (aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);
+extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;

 int WebRtcAec_CreateAec(aec_t **aec);
 int WebRtcAec_FreeAec(aec_t *aec);
--- a/modules/audio_processing/aec/main/source/aec_core_sse2.c
+++ b/modules/audio_processing/aec/main/source/aec_core_sse2.c
@ -210,10 +210,226 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
  }
 }

+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+static __m128 mm_pow_ps(__m128 a, __m128 b)
+{
+  // a^b = exp2(b * log2(a)), with exp2(x) and log2(x) calculated using
+  //   polynomial approximations. The sign bit of a is masked off below.
+  __m128 log2_a, b_log2_a, a_exp_b;
+
+  // Calculate log2(x), x = a.
+  {
+    // To calculate log2(x), we decompose x like this:
+    //   x = y * 2^n
+    //     n is an integer
+    //     y is in the [1.0, 2.0) range
+    //
+    //   log2(x) = log2(y) + n
+    //     n       can be evaluated by playing with float representation.
+    //     log2(y) in a small range can be approximated, this code uses an order
+    //             five polynomial approximation. The coefficients have been
+    //             estimated with the Remez algorithm and the resulting
+    //             polynomial has a maximum relative error of 0.00086%.
+
+    // Compute n.
+    //    This is done by masking the exponent, shifting it into the top bits
+    //    of the mantissa, putting eight into the biased exponent (to
+    //    compensate for the exponent having been shifted into the fractional
+    //    part) and finally removing the implicit leading one and the exponent
+    //    bias by subtracting 383.0f (= 256 + 127, i.e. 0x43BF8000) out.
+    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
+        {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
+    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
+        {0x43800000, 0x43800000, 0x43800000, 0x43800000};
+    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
+        {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
+    static const int shift_exponent_into_top_mantissa = 8;
+    const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
+    const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n,
+        shift_exponent_into_top_mantissa);  // GCC-style vector casts.
+    const __m128 n_0 = _mm_or_ps(
+        (__m128)n_1, *((__m128 *)eight_biased_exponent));
+    const __m128 n   = _mm_sub_ps(n_0,  *((__m128 *)implicit_leading_one));
+
+    // Compute y.
+    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
+        {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
+    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
+        {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
+    const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask));
+    const __m128 y        = _mm_or_ps(
+        mantissa,  *((__m128 *)zero_biased_exponent_is_one));
+
+    // Approximate log2(y) ~= (y - 1) * pol5(y).
+    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
+    static const ALIGN16_BEG float ALIGN16_END C5[4] =
+        {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
+    static const ALIGN16_BEG float ALIGN16_END C4[4] =
+        {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
+    static const ALIGN16_BEG float ALIGN16_END C3[4] =
+        {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
+    static const ALIGN16_BEG float ALIGN16_END C2[4] =
+        {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
+    static const ALIGN16_BEG float ALIGN16_END C1[4] =
+        {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
+    static const ALIGN16_BEG float ALIGN16_END C0[4] =
+        {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
+    const __m128 pol5_y_0 = _mm_mul_ps(y,        *((__m128 *)C5));
+    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4));
+    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
+    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3));
+    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
+    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2));
+    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
+    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1));
+    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
+    const __m128 pol5_y   = _mm_add_ps(pol5_y_8, *((__m128 *)C0));
+    const __m128 y_minus_one = _mm_sub_ps(
+        y, *((__m128 *)zero_biased_exponent_is_one));
+    const __m128 log2_y = _mm_mul_ps(y_minus_one ,  pol5_y);
+
+    // Combine parts.
+    log2_a = _mm_add_ps(n, log2_y);
+  }
+
+  // b * log2(a)
+  b_log2_a = _mm_mul_ps(b, log2_a);
+
+  // Calculate exp2(x), x = b * log2(a).
+  {
+    // To calculate 2^x, we decompose x like this:
+    //   x = n + y
+    //     n is an integer: x - 0.5 converted by _mm_cvtps_epi32, which rounds
+    //     per MXCSR (to nearest by default, giving y in [0, 1]) - TODO confirm
+    //
+    //   2^x = 2^n * 2^y
+    //     2^n can be evaluated by playing with float representation.
+    //     2^y in a small range can be approximated, this code uses an order two
+    //         polynomial approximation. The coefficients have been estimated
+    //         with the Remez algorithm and the resulting polynomial has a
+    //         maximum relative error of 0.17%.
+
+    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
+    static const ALIGN16_BEG float max_input[4] ALIGN16_END =
+        {129.f, 129.f, 129.f, 129.f};
+    static const ALIGN16_BEG float min_input[4] ALIGN16_END =
+        {-126.99999f, -126.99999f, -126.99999f, -126.99999f};
+    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input));
+    const __m128 x_max = _mm_max_ps(x_min,    *((__m128 *)min_input));
+    // Compute n.
+    static const ALIGN16_BEG float half[4] ALIGN16_END =
+        {0.5f, 0.5f, 0.5f, 0.5f};
+    const __m128  x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half));
+    const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
+    // Compute 2^n.
+    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
+        {127, 127, 127, 127};
+    static const int float_exponent_shift = 23;
+    const __m128i two_n_exponent = _mm_add_epi32(
+        x_minus_half_floor, *((__m128i *)float_exponent_bias));
+    const __m128  two_n = (__m128)_mm_slli_epi32(
+        two_n_exponent, float_exponent_shift);
+    // Compute y.
+    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
+    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
+    static const ALIGN16_BEG float C2[4] ALIGN16_END =
+        {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
+    static const ALIGN16_BEG float C1[4] ALIGN16_END =
+        {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
+    static const ALIGN16_BEG float C0[4] ALIGN16_END =
+        {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
+    const __m128 exp2_y_0 = _mm_mul_ps(y,        *((__m128 *)C2));
+    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1));
+    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
+    const __m128 exp2_y   = _mm_add_ps(exp2_y_2, *((__m128 *)C0));
+
+    // Combine parts.
+    a_exp_b = _mm_mul_ps(exp2_y, two_n);
+  }
+  return a_exp_b;
+}
+
+extern const float WebRtcAec_weightCurve[65];
+extern const float WebRtcAec_overDriveCurve[65];
+
+static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
+                                     const float hNlFb,
+                                     float efw[2][PART_LEN1]) {
+  int i;
+  const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
+  const __m128 vec_one = _mm_set1_ps(1.0f);
+  const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
+  const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
+  // vectorized code (four bins at once), using unaligned loads and stores
+  for (i = 0; i + 3 < PART_LEN1; i+=4) {
+    // Weight subbands: branch-free select, lanes with hNl > hNlFb get the blend
+    __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
+    const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
+    const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
+    const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(
+        vec_weightCurve, vec_hNlFb);
+    const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
+    const __m128 vec_one_weightCurve_hNl = _mm_mul_ps(
+        vec_one_weightCurve, vec_hNl);
+    const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
+    const __m128 vec_if1 = _mm_and_ps(
+        bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
+    vec_hNl = _mm_or_ps(vec_if0, vec_if1);
+
+    {
+      const __m128 vec_overDriveCurve = _mm_loadu_ps(
+          &WebRtcAec_overDriveCurve[i]);
+      const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps(
+          vec_overDriveSm, vec_overDriveCurve);
+      vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
+      _mm_storeu_ps(&hNl[i], vec_hNl);
+    }
+
+    // Suppress error signal: scale real (efw[0]) and imaginary (efw[1]) parts
+    {
+      __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
+      __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
+      vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
+      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);
+
+      // Ooura fft returns incorrect sign on imaginary component. It matters
+      // here because we are making an additive change with comfort noise.
+      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
+      _mm_storeu_ps(&efw[0][i], vec_efw_re);
+      _mm_storeu_ps(&efw[1][i], vec_efw_im);
+    }
+  }
+  // scalar code for the remaining items (uses powf, not the approximation).
+  for (; i < PART_LEN1; i++) {
+    // Weight subbands: a gain above hNlFb is blended toward hNlFb.
+    if (hNl[i] > hNlFb) {
+      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
+          (1 - WebRtcAec_weightCurve[i]) * hNl[i];
+    }
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+
+    // Suppress error signal: scale both components by the final gain.
+    efw[0][i] *= hNl[i];
+    efw[1][i] *= hNl[i];
+
+    // Ooura fft returns incorrect sign on imaginary component. It matters
+    // here because we are making an additive change with comfort noise.
+    efw[1][i] *= -1;
+  }
+}
+
 void WebRtcAec_InitAec_SSE2(void) {  // Point the AEC hooks at the SSE2 variants.
 WebRtcAec_FilterFar = FilterFarSSE2;
 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
+  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
 }

 #endif   //__SSE2__