Optimization/cleanup of 'aec_rdft' initialization (constants, LUT, ...):

* 2.7% AEC overall speedup for the straight C path.
* 3.5% AEC overall speedup for the SSE2 path.

Review URL: http://webrtc-codereview.appspot.com/60001

git-svn-id: http://webrtc.googlecode.com/svn/trunk@152 4adac7df-926f-26a2-2b94-8c16560cd09d
2011-07-06 18:32:59 +00:00 · 2011-07-06 18:32:59 +00:00 · fae3b31707
commit fae3b31707
parent 7c4469bf61
6 changed files with 78 additions and 80 deletions
--- a/modules/audio_processing/aec/main/source/aec_core.c
+++ b/modules/audio_processing/aec/main/source/aec_core.c
@ -123,8 +123,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
 static void BufferFar(aec_t *aec, const short *farend, int farLen);
 static void FetchFar(aec_t *aec, short *farend, int farLen, int knownDelay);

-static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
-                         short *outputH);
+static void NonLinearProcessing(aec_t *aec, short *output, short *outputH);

 static void GetHighbandGain(const float *lambda, float *nlpGainHband);

@ -256,8 +255,7 @@ static void ScaleErrorSignal(aec_t *aec, float ef[2][PART_LEN1])
  }
 }

-static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
-                      int ip[IP_LEN], float wfft[W_LEN]) {
+static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
  int i, j;
  for (i = 0; i < NR_PART; i++) {
    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
@ -292,7 +290,7 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
                   -aec->xfBuf[1][xPos + PART_LEN],
                   ef[0][PART_LEN], ef[1][PART_LEN]);

-    aec_rdft_128(-1, fft, ip, wfft);
+    aec_rdft_128(-1, fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
@ -302,7 +300,7 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
        fft[j] *= scale;
      }
    }
-    aec_rdft_128(1, fft, ip, wfft);
+    aec_rdft_128(1, fft);

    aec->wfBuf[0][pos] += fft[0];
    aec->wfBuf[0][pos + PART_LEN] += fft[1];
@ -574,8 +572,6 @@ static void ProcessBlock(aec_t *aec, const short *farend,
    float fft[PART_LEN2];
    float xf[2][PART_LEN1], yf[2][PART_LEN1], ef[2][PART_LEN1];
    complex_t df[PART_LEN1];
-    int ip[IP_LEN];
-    float wfft[W_LEN];

    const float gPow[2] = {0.9f, 0.1f};

@ -613,9 +609,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
        memcpy(aec->dBufH + PART_LEN, dH, sizeof(float) * PART_LEN);
    }

-    // Setting this on the first call initializes work arrays.
-    ip[0] = 0;
-    aec_rdft_128(1, fft, ip, wfft);
+    aec_rdft_128(1, fft);

    // Far fft
    xf[1][0] = 0;
@ -630,7 +624,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,

    // Near fft
    memcpy(fft, aec->dBuf, sizeof(float) * PART_LEN2);
-    aec_rdft_128(1, fft, ip, wfft);
+    aec_rdft_128(1, fft);
    df[0][1] = 0;
    df[PART_LEN][1] = 0;
    df[0][0] = fft[0];
@ -706,7 +700,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
        fft[2 * i] = yf[0][i];
        fft[2 * i + 1] = yf[1][i];
    }
-    aec_rdft_128(-1, fft, ip, wfft);
+    aec_rdft_128(-1, fft);

    scale = 2.0f / PART_LEN2;
    for (i = 0; i < PART_LEN; i++) {
@ -721,7 +715,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
    memcpy(aec->eBuf + PART_LEN, e, sizeof(float) * PART_LEN);
    memset(fft, 0, sizeof(float) * PART_LEN);
    memcpy(fft + PART_LEN, e, sizeof(float) * PART_LEN);
-    aec_rdft_128(1, fft, ip, wfft);
+    aec_rdft_128(1, fft);

    ef[1][0] = 0;
    ef[1][PART_LEN] = 0;
@ -738,12 +732,12 @@ static void ProcessBlock(aec_t *aec, const short *farend,
    if (aec->adaptToggle) {
 #endif
        // Filter adaptation
-        WebRtcAec_FilterAdaptation(aec, fft, ef, ip, wfft);
+        WebRtcAec_FilterAdaptation(aec, fft, ef);
 #ifdef G167
    }
 #endif

-    NonLinearProcessing(aec, ip, wfft, output, outputH);
+    NonLinearProcessing(aec, output, outputH);

 #if defined(AEC_DEBUG) || defined(G167)
    for (i = 0; i < PART_LEN; i++) {
@ -777,7 +771,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
 #endif
 }

-static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, short *outputH)
+static void NonLinearProcessing(aec_t *aec, short *output, short *outputH)
 {
    float efw[2][PART_LEN1], dfw[2][PART_LEN1];
    complex_t xfw[PART_LEN1];
@ -844,7 +838,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
        fft[i] = aec->xBuf[i] * sqrtHanning[i];
        fft[PART_LEN + i] = aec->xBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
    }
-    aec_rdft_128(1, fft, ip, wfft);
+    aec_rdft_128(1, fft);

    xfw[0][1] = 0;
    xfw[PART_LEN][1] = 0;
@ -866,7 +860,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
        fft[i] = aec->dBuf[i] * sqrtHanning[i];
        fft[PART_LEN + i] = aec->dBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
    }
-    aec_rdft_128(1, fft, ip, wfft);
+    aec_rdft_128(1, fft);

    dfw[1][0] = 0;
    dfw[1][PART_LEN] = 0;
@ -882,7 +876,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
        fft[i] = aec->eBuf[i] * sqrtHanning[i];
        fft[PART_LEN + i] = aec->eBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
    }
-    aec_rdft_128(1, fft, ip, wfft);
+    aec_rdft_128(1, fft);
    efw[1][0] = 0;
    efw[1][PART_LEN] = 0;
    efw[0][0] = fft[0];
@ -1059,7 +1053,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
        // Sign change required by Ooura fft.
        fft[2*i + 1] = -efw[1][i];
    }
-    aec_rdft_128(-1, fft, ip, wfft);
+    aec_rdft_128(-1, fft);

    // Overlap and add to obtain output.
    scale = 2.0f / PART_LEN2;
@ -1091,7 +1085,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
                fft[2*i] = comfortNoiseHband[i][0];
                fft[2*i + 1] = comfortNoiseHband[i][1];
            }
-            aec_rdft_128(-1, fft, ip, wfft);
+            aec_rdft_128(-1, fft);
            scale = 2.0f / PART_LEN2;
        }

--- a/modules/audio_processing/aec/main/source/aec_core.h
+++ b/modules/audio_processing/aec/main/source/aec_core.h
@ -172,8 +172,7 @@ extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
 #define IP_LEN PART_LEN // this must be at least ceil(2 + sqrt(PART_LEN))
 #define W_LEN PART_LEN
 typedef void (*WebRtcAec_FilterAdaptation_t)
-  (aec_t *aec, float *fft, float ef[2][PART_LEN1], int ip[IP_LEN],
-   float wfft[W_LEN]);
+  (aec_t *aec, float *fft, float ef[2][PART_LEN1]);
 extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
 typedef void (*WebRtcAec_OverdriveAndSuppress_t)
  (aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);
--- a/modules/audio_processing/aec/main/source/aec_core_sse2.c
+++ b/modules/audio_processing/aec/main/source/aec_core_sse2.c
@ -126,8 +126,7 @@ static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
  }
 }

-static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
-                          int ip[IP_LEN], float wfft[W_LEN]) {
+static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
  int i, j;
  for (i = 0; i < NR_PART; i++) {
    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
@ -175,7 +174,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
                   -aec->xfBuf[1][xPos + PART_LEN],
                   ef[0][PART_LEN], ef[1][PART_LEN]);

-    aec_rdft_128(-1, fft, ip, wfft);
+    aec_rdft_128(-1, fft);
    memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);

    // fft scaling
@ -188,7 +187,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
        _mm_storeu_ps(&fft[j], fft_scale);
      }
    }
-    aec_rdft_128(1, fft, ip, wfft);
+    aec_rdft_128(1, fft);

    {
      float wt1 = aec->wfBuf[1][pos];
--- a/modules/audio_processing/aec/main/source/aec_rdft.c
+++ b/modules/audio_processing/aec/main/source/aec_rdft.c
@ -24,6 +24,9 @@
 #include "aec_rdft.h"
 #include "system_wrappers/interface/cpu_features_wrapper.h"

+float rdft_w[64];
+static int ip[16];
+
 static void bitrv2_32or128(int n, int *ip, float *a) {
  // n is 32 or 128
  int j, j1, k, k1, m, m2;
@ -98,7 +101,7 @@ static void bitrv2_32or128(int n, int *ip, float *a) {
  }
 }

-static void makewt(int *ip, float *w) {
+static void makewt_32() {
  const int nw = 32;
  int j, nwh;
  float delta, x, y;
@ -107,22 +110,23 @@ static void makewt(int *ip, float *w) {
  ip[1] = 1;
  nwh = nw >> 1;
  delta = atanf(1.0f) / nwh;
-  w[0] = 1;
-  w[1] = 0;
-  w[nwh] = cosf(delta * nwh);
-  w[nwh + 1] = w[nwh];
+  rdft_w[0] = 1;
+  rdft_w[1] = 0;
+  rdft_w[nwh] = cosf(delta * nwh);
+  rdft_w[nwh + 1] = rdft_w[nwh];
  for (j = 2; j < nwh; j += 2) {
    x = cosf(delta * j);
    y = sinf(delta * j);
-    w[j] = x;
-    w[j + 1] = y;
-    w[nw - j] = y;
-    w[nw - j + 1] = x;
+    rdft_w[j] = x;
+    rdft_w[j + 1] = y;
+    rdft_w[nw - j] = y;
+    rdft_w[nw - j + 1] = x;
  }
-  bitrv2_32or128(nw, ip + 2, w);
+  bitrv2_32or128(nw, ip + 2, rdft_w);
 }

-static void makect_32(int *ip, float *c) {
+static void makect_32() {
+  float *c = rdft_w + 32;
  const int nc = 32;
  int j, nch;
  float delta;
@ -138,7 +142,7 @@ static void makect_32(int *ip, float *c) {
  }
 }

-static void cft1st_128(float *a, float *w) {
+static void cft1st_128(float *a) {
  const int n = 128;
  int j, k1, k2;
  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
@ -160,7 +164,7 @@ static void cft1st_128(float *a, float *w) {
  a[3] = x1i + x3r;
  a[6] = x1r + x3i;
  a[7] = x1i - x3r;
-  wk1r = w[2];
+  wk1r = rdft_w[2];
  x0r = a[8] + a[10];
  x0i = a[9] + a[11];
  x1r = a[8] - a[10];
@ -185,10 +189,10 @@ static void cft1st_128(float *a, float *w) {
  for (j = 16; j < n; j += 16) {
    k1 += 2;
    k2 = 2 * k1;
-    wk2r = w[k1];
-    wk2i = w[k1 + 1];
-    wk1r = w[k2];
-    wk1i = w[k2 + 1];
+    wk2r = rdft_w[k1];
+    wk2i = rdft_w[k1 + 1];
+    wk1r = rdft_w[k2];
+    wk1i = rdft_w[k2 + 1];
    wk3r = wk1r - 2 * wk2i * wk1i;
    wk3i = 2 * wk2i * wk1r - wk1i;
    x0r = a[j] + a[j + 2];
@ -213,8 +217,8 @@ static void cft1st_128(float *a, float *w) {
    x0i = x1i - x3r;
    a[j + 6] = wk3r * x0r - wk3i * x0i;
    a[j + 7] = wk3r * x0i + wk3i * x0r;
-    wk1r = w[k2 + 2];
-    wk1i = w[k2 + 3];
+    wk1r = rdft_w[k2 + 2];
+    wk1i = rdft_w[k2 + 3];
    wk3r = wk1r - 2 * wk2r * wk1i;
    wk3i = 2 * wk2r * wk1r - wk1i;
    x0r = a[j + 8] + a[j + 10];
@ -242,7 +246,7 @@ static void cft1st_128(float *a, float *w) {
  }
 }

-static void cftmdl_128(int l, float *a, float *w) {
+static void cftmdl_128(int l, float *a) {
  const int n = 128;
  int j, j1, j2, j3, k, k1, k2, m, m2;
  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
@ -270,7 +274,7 @@ static void cftmdl_128(int l, float *a, float *w) {
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i - x3r;
  }
-  wk1r = w[2];
+  wk1r = rdft_w[2];
  for (j = m; j < l + m; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
@ -301,10 +305,10 @@ static void cftmdl_128(int l, float *a, float *w) {
  for (k = m2; k < n; k += m2) {
    k1 += 2;
    k2 = 2 * k1;
-    wk2r = w[k1];
-    wk2i = w[k1 + 1];
-    wk1r = w[k2];
-    wk1i = w[k2 + 1];
+    wk2r = rdft_w[k1];
+    wk2i = rdft_w[k1 + 1];
+    wk1r = rdft_w[k2];
+    wk1i = rdft_w[k2 + 1];
    wk3r = wk1r - 2 * wk2i * wk1i;
    wk3i = 2 * wk2i * wk1r - wk1i;
    for (j = k; j < l + k; j += 2) {
@ -334,8 +338,8 @@ static void cftmdl_128(int l, float *a, float *w) {
      a[j3] = wk3r * x0r - wk3i * x0i;
      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
    }
-    wk1r = w[k2 + 2];
-    wk1i = w[k2 + 3];
+    wk1r = rdft_w[k2 + 2];
+    wk1i = rdft_w[k2 + 3];
    wk3r = wk1r - 2 * wk2r * wk1i;
    wk3i = 2 * wk2r * wk1r - wk1i;
    for (j = k + m; j < l + (k + m); j += 2) {
@ -368,12 +372,12 @@ static void cftmdl_128(int l, float *a, float *w) {
  }
 }

-static void cftfsub_128(float *a, float *w) {
+static void cftfsub_128(float *a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

-  cft1st_128(a, w);
-  cftmdl_128(8, a, w);
+  cft1st_128(a);
+  cftmdl_128(8, a);
  l = 32;
  for (j = 0; j < l; j += 2) {
    j1 = j + l;
@ -398,12 +402,12 @@ static void cftfsub_128(float *a, float *w) {
  }
 }

-static void cftbsub_128(float *a, float *w) {
+static void cftbsub_128(float *a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

-  cft1st_128(a, w);
-  cftmdl_128(8, a, w);
+  cft1st_128(a);
+  cftmdl_128(8, a);
  l = 32;

  for (j = 0; j < l; j += 2) {
@ -429,7 +433,8 @@ static void cftbsub_128(float *a, float *w) {
  }
 }

-static void rftfsub_128_C(float *a, float *c) {
+static void rftfsub_128_C(float *a) {
+  const float *c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

@ -449,7 +454,8 @@ static void rftfsub_128_C(float *a, float *c) {
  }
 }

-static void rftbsub_128_C(float *a, float *c) {
+static void rftbsub_128_C(float *a) {
+  const float *c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

@ -471,33 +477,25 @@ static void rftbsub_128_C(float *a, float *c) {
  a[65] = -a[65];
 }

-void aec_rdft_128(int isgn, float *a, int *ip, float *w)
-{
+void aec_rdft_128(int isgn, float *a) {
  const int n = 128;
  int nw;
  float xi;

  nw = ip[0];
-  if (n > (nw << 2)) {
-    nw = n >> 2;
-    makewt(ip, w);
-  }
-  if (n > (ip[1] << 2)) {
-    makect_32(ip, w + nw);
-  }
  if (isgn >= 0) {
    bitrv2_32or128(n, ip + 2, a);
-    cftfsub_128(a, w);
-    rftfsub_128(a, w + nw);
+    cftfsub_128(a);
+    rftfsub_128(a);
    xi = a[0] - a[1];
    a[0] += a[1];
    a[1] = xi;
  } else {
    a[1] = 0.5f * (a[0] - a[1]);
    a[0] -= a[1];
-    rftbsub_128(a, w + nw);
+    rftbsub_128(a);
    bitrv2_32or128(n, ip + 2, a);
-    cftbsub_128(a, w);
+    cftbsub_128(a);
  }
 }

@ -513,4 +511,7 @@ void aec_rdft_init(void) {
    aec_rdft_init_sse2();
 #endif
  }
+  // init library constants.
+  makewt_32();
+  makect_32();
 }
--- a/modules/audio_processing/aec/main/source/aec_rdft.h
+++ b/modules/audio_processing/aec/main/source/aec_rdft.h
@ -8,12 +8,15 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+// constants shared by all paths (C, SSE2).
+extern float rdft_w[64];
+
 // code path selection function pointers
-typedef void (*rft_sub_128_t)(float *a, float *c);
+typedef void (*rft_sub_128_t)(float *a);
 extern rft_sub_128_t rftfsub_128;
 extern rft_sub_128_t rftbsub_128;

 // entry points
 void aec_rdft_init(void);
 void aec_rdft_init_sse2(void);
-void aec_rdft_128(int, float *, int *, float *);
+void aec_rdft_128(int isgn, float *a);
--- a/modules/audio_processing/aec/main/source/aec_rdft_sse2.c
+++ b/modules/audio_processing/aec/main/source/aec_rdft_sse2.c
@ -20,7 +20,8 @@
 # define ALIGN16_END __attribute__((aligned(16)))
 #endif

-static void rftfsub_128_SSE2(float *a, float *c) {
+static void rftfsub_128_SSE2(float *a) {
+  const float *c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

@ -109,7 +110,8 @@ static void rftfsub_128_SSE2(float *a, float *c) {
  }
 }

-static void rftbsub_128_SSE2(float *a, float *c) {
+static void rftbsub_128_SSE2(float *a) {
+  const float *c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;