From 181f543de48245607f0d84ee03422960dc760f95 Mon Sep 17 00:00:00 2001
From: "cduvivier@google.com"
 <cduvivier@google.com@4adac7df-926f-26a2-2b94-8c16560cd09d>
Date: Fri, 24 Jun 2011 18:22:47 +0000
Subject: [PATCH] AEC specific version of " Real Discrete Fourier Transform".

Lots of AEC CPU usage is coming from calls to 'rdft'. To optimize this,
deep changes (modification of memory layout, ...) have to be done and it
is not practical to do them in an utility library. Most of these changes
will occur in subsequent CLs.

The new file 'aec_core_rdft.c' is a copy of 'modules/audio_processing/
utility/fft4g.c' whose size has been significantly reduced by removing
all code non-necessary to compute rdft. The main entry point and utility
functions have also been modified to take into account the fact that all
'rdft' calls performed by AEC have a length of 128. This yields:
* 1.8% AEC overall speedup for the straight C path.
* 2.3% AEC overall speedup for the SSE2 path.
Review URL: http://webrtc-codereview.appspot.com/44008

git-svn-id: http://webrtc.googlecode.com/svn/trunk@126 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 LICENSE_THIRD_PARTY                           |   1 +
 .../audio_processing/aec/main/source/aec.gyp  |   1 +
 .../aec/main/source/aec_core.c                |  22 +-
 .../aec/main/source/aec_core.h                |   5 +-
 .../aec/main/source/aec_core_rdft.c           | 511 ++++++++++++++++++
 .../aec/main/source/aec_core_sse2.c           |   4 +-
 6 files changed, 530 insertions(+), 14 deletions(-)
 create mode 100644 modules/audio_processing/aec/main/source/aec_core_rdft.c

diff --git a/LICENSE_THIRD_PARTY b/LICENSE_THIRD_PARTY
index 2086d2844..31d27e8a0 100644
--- a/LICENSE_THIRD_PARTY
+++ b/LICENSE_THIRD_PARTY
@@ -15,6 +15,7 @@ modules/audio_device/main/source/Mac/portaudio/pa_memorybarrier.h
 modules/audio_device/main/source/Mac/portaudio/pa_ringbuffer.h
 modules/audio_device/main/source/Mac/portaudio/pa_ringbuffer.c
 modules/audio_processing/utility/fft4g.c
+modules/audio_processing/aec/main/source/aec_core_rdft.c
 system_wrappers/interface/fix_interlocked_exchange_pointer_windows.h
 system_wrappers/source/condition_variable_windows.cc
 system_wrappers/source/spreadsortlib/constants.hpp
diff --git a/modules/audio_processing/aec/main/source/aec.gyp b/modules/audio_processing/aec/main/source/aec.gyp
index 96f9bd75b..ed33d7f71 100644
--- a/modules/audio_processing/aec/main/source/aec.gyp
+++ b/modules/audio_processing/aec/main/source/aec.gyp
@@ -31,6 +31,7 @@
         'echo_cancellation.c',
         'aec_core.c',
         'aec_core_sse2.c',
+        'aec_core_rdft.c',
         'aec_core.h',
         'resampler.c',
         'resampler.h',
diff --git a/modules/audio_processing/aec/main/source/aec_core.c b/modules/audio_processing/aec/main/source/aec_core.c
index 3303a8a88..eb8a90fc1 100644
--- a/modules/audio_processing/aec/main/source/aec_core.c
+++ b/modules/audio_processing/aec/main/source/aec_core.c
@@ -291,7 +291,7 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
                    -aec->xfBuf[1][xPos + PART_LEN],
                    ef[0][PART_LEN], ef[1][PART_LEN]);
 
-    rdft(PART_LEN2, -1, fft, ip, wfft);
+    aec_rdft_128(-1, fft, ip, wfft);
     memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
 
     // fft scaling
@@ -301,7 +301,7 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
         fft[j] *= scale;
       }
     }
-    rdft(PART_LEN2, 1, fft, ip, wfft);
+    aec_rdft_128(1, fft, ip, wfft);
 
     aec->wfBuf[0][pos] += fft[0];
     aec->wfBuf[0][pos + PART_LEN] += fft[1];
@@ -613,7 +613,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
 
     // Setting this on the first call initializes work arrays.
     ip[0] = 0;
-    rdft(PART_LEN2, 1, fft, ip, wfft);
+    aec_rdft_128(1, fft, ip, wfft);
 
     // Far fft
     xf[1][0] = 0;
@@ -628,7 +628,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
 
     // Near fft
     memcpy(fft, aec->dBuf, sizeof(float) * PART_LEN2);
-    rdft(PART_LEN2, 1, fft, ip, wfft);
+    aec_rdft_128(1, fft, ip, wfft);
     df[0][1] = 0;
     df[PART_LEN][1] = 0;
     df[0][0] = fft[0];
@@ -704,7 +704,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
         fft[2 * i] = yf[0][i];
         fft[2 * i + 1] = yf[1][i];
     }
-    rdft(PART_LEN2, -1, fft, ip, wfft);
+    aec_rdft_128(-1, fft, ip, wfft);
 
     scale = 2.0f / PART_LEN2;
     for (i = 0; i < PART_LEN; i++) {
@@ -719,7 +719,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
     memcpy(aec->eBuf + PART_LEN, e, sizeof(float) * PART_LEN);
     memset(fft, 0, sizeof(float) * PART_LEN);
     memcpy(fft + PART_LEN, e, sizeof(float) * PART_LEN);
-    rdft(PART_LEN2, 1, fft, ip, wfft);
+    aec_rdft_128(1, fft, ip, wfft);
 
     ef[1][0] = 0;
     ef[1][PART_LEN] = 0;
@@ -842,7 +842,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
         fft[i] = aec->xBuf[i] * sqrtHanning[i];
         fft[PART_LEN + i] = aec->xBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
     }
-    rdft(PART_LEN2, 1, fft, ip, wfft);
+    aec_rdft_128(1, fft, ip, wfft);
 
     xfw[0][1] = 0;
     xfw[PART_LEN][1] = 0;
@@ -864,7 +864,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
         fft[i] = aec->dBuf[i] * sqrtHanning[i];
         fft[PART_LEN + i] = aec->dBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
     }
-    rdft(PART_LEN2, 1, fft, ip, wfft);
+    aec_rdft_128(1, fft, ip, wfft);
 
     dfw[1][0] = 0;
     dfw[1][PART_LEN] = 0;
@@ -880,7 +880,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
         fft[i] = aec->eBuf[i] * sqrtHanning[i];
         fft[PART_LEN + i] = aec->eBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
     }
-    rdft(PART_LEN2, 1, fft, ip, wfft);
+    aec_rdft_128(1, fft, ip, wfft);
     efw[1][0] = 0;
     efw[1][PART_LEN] = 0;
     efw[0][0] = fft[0];
@@ -1057,7 +1057,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
         // Sign change required by Ooura fft.
         fft[2*i + 1] = -efw[1][i];
     }
-    rdft(PART_LEN2, -1, fft, ip, wfft);
+    aec_rdft_128(-1, fft, ip, wfft);
 
     // Overlap and add to obtain output.
     scale = 2.0f / PART_LEN2;
@@ -1089,7 +1089,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
                 fft[2*i] = comfortNoiseHband[i][0];
                 fft[2*i + 1] = comfortNoiseHband[i][1];
             }
-            rdft(PART_LEN2, -1, fft, ip, wfft);
+            aec_rdft_128(-1, fft, ip, wfft);
             scale = 2.0f / PART_LEN2;
         }
 
diff --git a/modules/audio_processing/aec/main/source/aec_core.h b/modules/audio_processing/aec/main/source/aec_core.h
index ae043038a..2a619085d 100644
--- a/modules/audio_processing/aec/main/source/aec_core.h
+++ b/modules/audio_processing/aec/main/source/aec_core.h
@@ -16,7 +16,6 @@
 #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
 
 #include <stdio.h>
-#include "fft4g.h"
 #include "typedefs.h"
 #include "signal_processing_library.h"
 
@@ -191,5 +190,9 @@ void WebRtcAec_ProcessFrame(aec_t *aec, const short *farend,
                        short *out, short *outH,
                        int knownDelay);
 
+// aec_core_rdft.c
+void aec_rdft_128(int, float *, int *, float *);
+
+
 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
 
diff --git a/modules/audio_processing/aec/main/source/aec_core_rdft.c b/modules/audio_processing/aec/main/source/aec_core_rdft.c
new file mode 100644
index 000000000..3f4d30049
--- /dev/null
+++ b/modules/audio_processing/aec/main/source/aec_core_rdft.c
@@ -0,0 +1,511 @@
+/*
+ * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+ * Copyright Takuya OOURA, 1996-2001
+ *
+ * You may use, copy, modify and distribute this code for any purpose (include
+ * commercial use) and without fee. Please refer to this package when you modify
+ * this code.
+ *
+ * Changes by the WebRTC authors:
+ *    - Trivial type modifications.
+ *    - Minimal code subset to do rdft of length 128.
+ *    - Optimizations because of known length.
+ *
+ *  All changes are covered by the WebRTC license and IP grant:
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "aec_core.h"
+
+static void bitrv2_32or128(int n, int *ip, float *a) {
+  // n is 32 or 128
+  int j, j1, k, k1, m, m2;
+  float xr, xi, yr, yi;
+
+  ip[0] = 0;
+  {
+    int l = n;
+    m = 1;
+    while ((m << 3) < l) {
+      l >>= 1;
+      for (j = 0; j < m; j++) {
+        ip[m + j] = ip[j] + l;
+      }
+      m <<= 1;
+    }
+  }
+  m2 = 2 * m;
+  for (k = 0; k < m; k++) {
+    for (j = 0; j < k; j++) {
+      j1 = 2 * j + ip[k];
+      k1 = 2 * k + ip[j];
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += m2;
+      k1 += 2 * m2;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += m2;
+      k1 -= m2;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += m2;
+      k1 += 2 * m2;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+    }
+    j1 = 2 * k + m2 + ip[k];
+    k1 = j1 + m2;
+    xr = a[j1];
+    xi = a[j1 + 1];
+    yr = a[k1];
+    yi = a[k1 + 1];
+    a[j1] = yr;
+    a[j1 + 1] = yi;
+    a[k1] = xr;
+    a[k1 + 1] = xi;
+  }
+}
+
+static void makewt(int *ip, float *w) {
+  const int nw = 32;
+  int j, nwh;
+  float delta, x, y;
+
+  ip[0] = nw;
+  ip[1] = 1;
+  nwh = nw >> 1;
+  delta = atanf(1.0f) / nwh;
+  w[0] = 1;
+  w[1] = 0;
+  w[nwh] = cosf(delta * nwh);
+  w[nwh + 1] = w[nwh];
+  for (j = 2; j < nwh; j += 2) {
+    x = cosf(delta * j);
+    y = sinf(delta * j);
+    w[j] = x;
+    w[j + 1] = y;
+    w[nw - j] = y;
+    w[nw - j + 1] = x;
+  }
+  bitrv2_32or128(nw, ip + 2, w);
+}
+
+static void makect_32(int *ip, float *c) {
+  const int nc = 32;
+  int j, nch;
+  float delta;
+
+  ip[1] = nc;
+  nch = nc >> 1;
+  delta = atanf(1.0f) / nch;
+  c[0] = cosf(delta * nch);
+  c[nch] = 0.5f * c[0];
+  for (j = 1; j < nch; j++) {
+    c[j] = 0.5f * cosf(delta * j);
+    c[nc - j] = 0.5f * sinf(delta * j);
+  }
+}
+
+static void cft1st_128(float *a, float *w) {
+  const int n = 128;
+  int j, k1, k2;
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  x0r = a[0] + a[2];
+  x0i = a[1] + a[3];
+  x1r = a[0] - a[2];
+  x1i = a[1] - a[3];
+  x2r = a[4] + a[6];
+  x2i = a[5] + a[7];
+  x3r = a[4] - a[6];
+  x3i = a[5] - a[7];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[4] = x0r - x2r;
+  a[5] = x0i - x2i;
+  a[2] = x1r - x3i;
+  a[3] = x1i + x3r;
+  a[6] = x1r + x3i;
+  a[7] = x1i - x3r;
+  wk1r = w[2];
+  x0r = a[8] + a[10];
+  x0i = a[9] + a[11];
+  x1r = a[8] - a[10];
+  x1i = a[9] - a[11];
+  x2r = a[12] + a[14];
+  x2i = a[13] + a[15];
+  x3r = a[12] - a[14];
+  x3i = a[13] - a[15];
+  a[8] = x0r + x2r;
+  a[9] = x0i + x2i;
+  a[12] = x2i - x0i;
+  a[13] = x0r - x2r;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[10] = wk1r * (x0r - x0i);
+  a[11] = wk1r * (x0r + x0i);
+  x0r = x3i + x1r;
+  x0i = x3r - x1i;
+  a[14] = wk1r * (x0i - x0r);
+  a[15] = wk1r * (x0i + x0r);
+  k1 = 0;
+  for (j = 16; j < n; j += 16) {
+    k1 += 2;
+    k2 = 2 * k1;
+    wk2r = w[k1];
+    wk2i = w[k1 + 1];
+    wk1r = w[k2];
+    wk1i = w[k2 + 1];
+    wk3r = wk1r - 2 * wk2i * wk1i;
+    wk3i = 2 * wk2i * wk1r - wk1i;
+    x0r = a[j] + a[j + 2];
+    x0i = a[j + 1] + a[j + 3];
+    x1r = a[j] - a[j + 2];
+    x1i = a[j + 1] - a[j + 3];
+    x2r = a[j + 4] + a[j + 6];
+    x2i = a[j + 5] + a[j + 7];
+    x3r = a[j + 4] - a[j + 6];
+    x3i = a[j + 5] - a[j + 7];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    x0r -= x2r;
+    x0i -= x2i;
+    a[j + 4] = wk2r * x0r - wk2i * x0i;
+    a[j + 5] = wk2r * x0i + wk2i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j + 2] = wk1r * x0r - wk1i * x0i;
+    a[j + 3] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j + 6] = wk3r * x0r - wk3i * x0i;
+    a[j + 7] = wk3r * x0i + wk3i * x0r;
+    wk1r = w[k2 + 2];
+    wk1i = w[k2 + 3];
+    wk3r = wk1r - 2 * wk2r * wk1i;
+    wk3i = 2 * wk2r * wk1r - wk1i;
+    x0r = a[j + 8] + a[j + 10];
+    x0i = a[j + 9] + a[j + 11];
+    x1r = a[j + 8] - a[j + 10];
+    x1i = a[j + 9] - a[j + 11];
+    x2r = a[j + 12] + a[j + 14];
+    x2i = a[j + 13] + a[j + 15];
+    x3r = a[j + 12] - a[j + 14];
+    x3i = a[j + 13] - a[j + 15];
+    a[j + 8] = x0r + x2r;
+    a[j + 9] = x0i + x2i;
+    x0r -= x2r;
+    x0i -= x2i;
+    a[j + 12] = -wk2i * x0r - wk2r * x0i;
+    a[j + 13] = -wk2i * x0i + wk2r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j + 10] = wk1r * x0r - wk1i * x0i;
+    a[j + 11] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j + 14] = wk3r * x0r - wk3i * x0i;
+    a[j + 15] = wk3r * x0i + wk3i * x0r;
+  }
+}
+
+static void cftmdl_128(int l, float *a, float *w) {
+  const int n = 128;
+  int j, j1, j2, j3, k, k1, k2, m, m2;
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  m = l << 2;
+  for (j = 0; j < l; j += 2) {
+    j1 = j + l;
+    j2 = j1 + l;
+    j3 = j2 + l;
+    x0r = a[j] + a[j1];
+    x0i = a[j + 1] + a[j1 + 1];
+    x1r = a[j] - a[j1];
+    x1i = a[j + 1] - a[j1 + 1];
+    x2r = a[j2] + a[j3];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2] - a[j3];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    a[j2] = x0r - x2r;
+    a[j2 + 1] = x0i - x2i;
+    a[j1] = x1r - x3i;
+    a[j1 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+  }
+  wk1r = w[2];
+  for (j = m; j < l + m; j += 2) {
+    j1 = j + l;
+    j2 = j1 + l;
+    j3 = j2 + l;
+    x0r = a[j] + a[j1];
+    x0i = a[j + 1] + a[j1 + 1];
+    x1r = a[j] - a[j1];
+    x1i = a[j + 1] - a[j1 + 1];
+    x2r = a[j2] + a[j3];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2] - a[j3];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    a[j2] = x2i - x0i;
+    a[j2 + 1] = x0r - x2r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j1] = wk1r * (x0r - x0i);
+    a[j1 + 1] = wk1r * (x0r + x0i);
+    x0r = x3i + x1r;
+    x0i = x3r - x1i;
+    a[j3] = wk1r * (x0i - x0r);
+    a[j3 + 1] = wk1r * (x0i + x0r);
+  }
+  k1 = 0;
+  m2 = 2 * m;
+  for (k = m2; k < n; k += m2) {
+    k1 += 2;
+    k2 = 2 * k1;
+    wk2r = w[k1];
+    wk2i = w[k1 + 1];
+    wk1r = w[k2];
+    wk1i = w[k2 + 1];
+    wk3r = wk1r - 2 * wk2i * wk1i;
+    wk3i = 2 * wk2i * wk1r - wk1i;
+    for (j = k; j < l + k; j += 2) {
+      j1 = j + l;
+      j2 = j1 + l;
+      j3 = j2 + l;
+      x0r = a[j] + a[j1];
+      x0i = a[j + 1] + a[j1 + 1];
+      x1r = a[j] - a[j1];
+      x1i = a[j + 1] - a[j1 + 1];
+      x2r = a[j2] + a[j3];
+      x2i = a[j2 + 1] + a[j3 + 1];
+      x3r = a[j2] - a[j3];
+      x3i = a[j2 + 1] - a[j3 + 1];
+      a[j] = x0r + x2r;
+      a[j + 1] = x0i + x2i;
+      x0r -= x2r;
+      x0i -= x2i;
+      a[j2] = wk2r * x0r - wk2i * x0i;
+      a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j1] = wk1r * x0r - wk1i * x0i;
+      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3] = wk3r * x0r - wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+    }
+    wk1r = w[k2 + 2];
+    wk1i = w[k2 + 3];
+    wk3r = wk1r - 2 * wk2r * wk1i;
+    wk3i = 2 * wk2r * wk1r - wk1i;
+    for (j = k + m; j < l + (k + m); j += 2) {
+      j1 = j + l;
+      j2 = j1 + l;
+      j3 = j2 + l;
+      x0r = a[j] + a[j1];
+      x0i = a[j + 1] + a[j1 + 1];
+      x1r = a[j] - a[j1];
+      x1i = a[j + 1] - a[j1 + 1];
+      x2r = a[j2] + a[j3];
+      x2i = a[j2 + 1] + a[j3 + 1];
+      x3r = a[j2] - a[j3];
+      x3i = a[j2 + 1] - a[j3 + 1];
+      a[j] = x0r + x2r;
+      a[j + 1] = x0i + x2i;
+      x0r -= x2r;
+      x0i -= x2i;
+      a[j2] = -wk2i * x0r - wk2r * x0i;
+      a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j1] = wk1r * x0r - wk1i * x0i;
+      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3] = wk3r * x0r - wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+    }
+  }
+}
+
+static void cftfsub_128(float *a, float *w) {
+  int j, j1, j2, j3, l;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  cft1st_128(a, w);
+  cftmdl_128(8, a, w);
+  l = 32;
+  for (j = 0; j < l; j += 2) {
+    j1 = j + l;
+    j2 = j1 + l;
+    j3 = j2 + l;
+    x0r = a[j] + a[j1];
+    x0i = a[j + 1] + a[j1 + 1];
+    x1r = a[j] - a[j1];
+    x1i = a[j + 1] - a[j1 + 1];
+    x2r = a[j2] + a[j3];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2] - a[j3];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    a[j2] = x0r - x2r;
+    a[j2 + 1] = x0i - x2i;
+    a[j1] = x1r - x3i;
+    a[j1 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+  }
+}
+
+static void cftbsub_128(float *a, float *w) {
+  int j, j1, j2, j3, l;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  cft1st_128(a, w);
+  cftmdl_128(8, a, w);
+  l = 32;
+
+  for (j = 0; j < l; j += 2) {
+    j1 = j + l;
+    j2 = j1 + l;
+    j3 = j2 + l;
+    x0r = a[j] + a[j1];
+    x0i = -a[j + 1] - a[j1 + 1];
+    x1r = a[j] - a[j1];
+    x1i = -a[j + 1] + a[j1 + 1];
+    x2r = a[j2] + a[j3];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2] - a[j3];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i - x2i;
+    a[j2] = x0r - x2r;
+    a[j2 + 1] = x0i + x2i;
+    a[j1] = x1r - x3i;
+    a[j1 + 1] = x1i - x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i + x3r;
+  }
+}
+
+static void rftfsub_128(float *a, int nc, float *c) {
+  const int n = 128;
+  int j, k, kk, ks, m;
+  float wkr, wki, xr, xi, yr, yi;
+
+  m = n >> 1;
+  ks = 2 * nc / m;
+  kk = 0;
+  for (j = 2; j < m; j += 2) {
+    k = n - j;
+    kk += ks;
+    wkr = 0.5f - c[nc - kk];
+    wki = c[kk];
+    xr = a[j] - a[k];
+    xi = a[j + 1] + a[k + 1];
+    yr = wkr * xr - wki * xi;
+    yi = wkr * xi + wki * xr;
+    a[j] -= yr;
+    a[j + 1] -= yi;
+    a[k] += yr;
+    a[k + 1] -= yi;
+  }
+}
+
+static void rftbsub_128(float *a, int nc, float *c) {
+  const int n = 128;
+  int j, k, kk, ks, m;
+  float wkr, wki, xr, xi, yr, yi;
+
+  a[1] = -a[1];
+  m = n >> 1;
+  ks = 2 * nc / m;
+  kk = 0;
+  for (j = 2; j < m; j += 2) {
+    k = n - j;
+    kk += ks;
+    wkr = 0.5f - c[nc - kk];
+    wki = c[kk];
+    xr = a[j] - a[k];
+    xi = a[j + 1] + a[k + 1];
+    yr = wkr * xr + wki * xi;
+    yi = wkr * xi - wki * xr;
+    a[j] -= yr;
+    a[j + 1] = yi - a[j + 1];
+    a[k] += yr;
+    a[k + 1] = yi - a[k + 1];
+  }
+  a[m + 1] = -a[m + 1];
+}
+
+void aec_rdft_128(int isgn, float *a, int *ip, float *w)
+{
+  const int n = 128;
+  int nw, nc;
+  float xi;
+
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(ip, w);
+  }
+  nc = ip[1];
+  if (n > (nc << 2)) {
+    nc = n >> 2;
+    makect_32(ip, w + nw);
+  }
+  if (isgn >= 0) {
+    bitrv2_32or128(n, ip + 2, a);
+    cftfsub_128(a, w);
+    rftfsub_128(a, nc, w + nw);
+    xi = a[0] - a[1];
+    a[0] += a[1];
+    a[1] = xi;
+  } else {
+    a[1] = 0.5f * (a[0] - a[1]);
+    a[0] -= a[1];
+    rftbsub_128(a, nc, w + nw);
+    bitrv2_32or128(n, ip + 2, a);
+    cftbsub_128(a, w);
+  }
+}
diff --git a/modules/audio_processing/aec/main/source/aec_core_sse2.c b/modules/audio_processing/aec/main/source/aec_core_sse2.c
index d7f30b58c..c020e7bb4 100644
--- a/modules/audio_processing/aec/main/source/aec_core_sse2.c
+++ b/modules/audio_processing/aec/main/source/aec_core_sse2.c
@@ -174,7 +174,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
                    -aec->xfBuf[1][xPos + PART_LEN],
                    ef[0][PART_LEN], ef[1][PART_LEN]);
 
-    rdft(PART_LEN2, -1, fft, ip, wfft);
+    aec_rdft_128(-1, fft, ip, wfft);
     memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);
 
     // fft scaling
@@ -187,7 +187,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
         _mm_storeu_ps(&fft[j], fft_scale);
       }
     }
-    rdft(PART_LEN2, 1, fft, ip, wfft);
+    aec_rdft_128(1, fft, ip, wfft);
 
     {
       float wt1 = aec->wfBuf[1][pos];