
* scalar optimization, vectorization (including new file for SSE2 code and path selection mechanism). * 0.5% AEC overall speedup for the straight C path. * 3.0% AEC overall speedup for the SSE2 path. Review URL: http://webrtc-codereview.appspot.com/46005 git-svn-id: http://webrtc.googlecode.com/svn/trunk@134 4adac7df-926f-26a2-2b94-8c16560cd09d
521 lines
12 KiB
C
521 lines
12 KiB
C
/*
|
|
* http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
|
|
* Copyright Takuya OOURA, 1996-2001
|
|
*
|
|
* You may use, copy, modify and distribute this code for any purpose (include
|
|
* commercial use) and without fee. Please refer to this package when you modify
|
|
* this code.
|
|
*
|
|
* Changes by the WebRTC authors:
|
|
* - Trivial type modifications.
|
|
* - Minimal code subset to do rdft of length 128.
|
|
* - Optimizations because of known length.
|
|
*
|
|
* All changes are covered by the WebRTC license and IP grant:
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include <math.h>
|
|
|
|
#include "aec_rdft.h"
|
|
#include "system_wrappers/interface/cpu_features_wrapper.h"
|
|
|
|
static void bitrv2_32or128(int n, int *ip, float *a) {
|
|
// n is 32 or 128
|
|
int j, j1, k, k1, m, m2;
|
|
float xr, xi, yr, yi;
|
|
|
|
ip[0] = 0;
|
|
{
|
|
int l = n;
|
|
m = 1;
|
|
while ((m << 3) < l) {
|
|
l >>= 1;
|
|
for (j = 0; j < m; j++) {
|
|
ip[m + j] = ip[j] + l;
|
|
}
|
|
m <<= 1;
|
|
}
|
|
}
|
|
m2 = 2 * m;
|
|
for (k = 0; k < m; k++) {
|
|
for (j = 0; j < k; j++) {
|
|
j1 = 2 * j + ip[k];
|
|
k1 = 2 * k + ip[j];
|
|
xr = a[j1];
|
|
xi = a[j1 + 1];
|
|
yr = a[k1];
|
|
yi = a[k1 + 1];
|
|
a[j1] = yr;
|
|
a[j1 + 1] = yi;
|
|
a[k1] = xr;
|
|
a[k1 + 1] = xi;
|
|
j1 += m2;
|
|
k1 += 2 * m2;
|
|
xr = a[j1];
|
|
xi = a[j1 + 1];
|
|
yr = a[k1];
|
|
yi = a[k1 + 1];
|
|
a[j1] = yr;
|
|
a[j1 + 1] = yi;
|
|
a[k1] = xr;
|
|
a[k1 + 1] = xi;
|
|
j1 += m2;
|
|
k1 -= m2;
|
|
xr = a[j1];
|
|
xi = a[j1 + 1];
|
|
yr = a[k1];
|
|
yi = a[k1 + 1];
|
|
a[j1] = yr;
|
|
a[j1 + 1] = yi;
|
|
a[k1] = xr;
|
|
a[k1 + 1] = xi;
|
|
j1 += m2;
|
|
k1 += 2 * m2;
|
|
xr = a[j1];
|
|
xi = a[j1 + 1];
|
|
yr = a[k1];
|
|
yi = a[k1 + 1];
|
|
a[j1] = yr;
|
|
a[j1 + 1] = yi;
|
|
a[k1] = xr;
|
|
a[k1 + 1] = xi;
|
|
}
|
|
j1 = 2 * k + m2 + ip[k];
|
|
k1 = j1 + m2;
|
|
xr = a[j1];
|
|
xi = a[j1 + 1];
|
|
yr = a[k1];
|
|
yi = a[k1 + 1];
|
|
a[j1] = yr;
|
|
a[j1 + 1] = yi;
|
|
a[k1] = xr;
|
|
a[k1 + 1] = xi;
|
|
}
|
|
}
|
|
|
|
static void makewt(int *ip, float *w) {
|
|
const int nw = 32;
|
|
int j, nwh;
|
|
float delta, x, y;
|
|
|
|
ip[0] = nw;
|
|
ip[1] = 1;
|
|
nwh = nw >> 1;
|
|
delta = atanf(1.0f) / nwh;
|
|
w[0] = 1;
|
|
w[1] = 0;
|
|
w[nwh] = cosf(delta * nwh);
|
|
w[nwh + 1] = w[nwh];
|
|
for (j = 2; j < nwh; j += 2) {
|
|
x = cosf(delta * j);
|
|
y = sinf(delta * j);
|
|
w[j] = x;
|
|
w[j + 1] = y;
|
|
w[nw - j] = y;
|
|
w[nw - j + 1] = x;
|
|
}
|
|
bitrv2_32or128(nw, ip + 2, w);
|
|
}
|
|
|
|
static void makect_32(int *ip, float *c) {
|
|
const int nc = 32;
|
|
int j, nch;
|
|
float delta;
|
|
|
|
ip[1] = nc;
|
|
nch = nc >> 1;
|
|
delta = atanf(1.0f) / nch;
|
|
c[0] = cosf(delta * nch);
|
|
c[nch] = 0.5f * c[0];
|
|
for (j = 1; j < nch; j++) {
|
|
c[j] = 0.5f * cosf(delta * j);
|
|
c[nc - j] = 0.5f * sinf(delta * j);
|
|
}
|
|
}
|
|
|
|
static void cft1st_128(float *a, float *w) {
|
|
const int n = 128;
|
|
int j, k1, k2;
|
|
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
|
|
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
|
|
|
x0r = a[0] + a[2];
|
|
x0i = a[1] + a[3];
|
|
x1r = a[0] - a[2];
|
|
x1i = a[1] - a[3];
|
|
x2r = a[4] + a[6];
|
|
x2i = a[5] + a[7];
|
|
x3r = a[4] - a[6];
|
|
x3i = a[5] - a[7];
|
|
a[0] = x0r + x2r;
|
|
a[1] = x0i + x2i;
|
|
a[4] = x0r - x2r;
|
|
a[5] = x0i - x2i;
|
|
a[2] = x1r - x3i;
|
|
a[3] = x1i + x3r;
|
|
a[6] = x1r + x3i;
|
|
a[7] = x1i - x3r;
|
|
wk1r = w[2];
|
|
x0r = a[8] + a[10];
|
|
x0i = a[9] + a[11];
|
|
x1r = a[8] - a[10];
|
|
x1i = a[9] - a[11];
|
|
x2r = a[12] + a[14];
|
|
x2i = a[13] + a[15];
|
|
x3r = a[12] - a[14];
|
|
x3i = a[13] - a[15];
|
|
a[8] = x0r + x2r;
|
|
a[9] = x0i + x2i;
|
|
a[12] = x2i - x0i;
|
|
a[13] = x0r - x2r;
|
|
x0r = x1r - x3i;
|
|
x0i = x1i + x3r;
|
|
a[10] = wk1r * (x0r - x0i);
|
|
a[11] = wk1r * (x0r + x0i);
|
|
x0r = x3i + x1r;
|
|
x0i = x3r - x1i;
|
|
a[14] = wk1r * (x0i - x0r);
|
|
a[15] = wk1r * (x0i + x0r);
|
|
k1 = 0;
|
|
for (j = 16; j < n; j += 16) {
|
|
k1 += 2;
|
|
k2 = 2 * k1;
|
|
wk2r = w[k1];
|
|
wk2i = w[k1 + 1];
|
|
wk1r = w[k2];
|
|
wk1i = w[k2 + 1];
|
|
wk3r = wk1r - 2 * wk2i * wk1i;
|
|
wk3i = 2 * wk2i * wk1r - wk1i;
|
|
x0r = a[j] + a[j + 2];
|
|
x0i = a[j + 1] + a[j + 3];
|
|
x1r = a[j] - a[j + 2];
|
|
x1i = a[j + 1] - a[j + 3];
|
|
x2r = a[j + 4] + a[j + 6];
|
|
x2i = a[j + 5] + a[j + 7];
|
|
x3r = a[j + 4] - a[j + 6];
|
|
x3i = a[j + 5] - a[j + 7];
|
|
a[j] = x0r + x2r;
|
|
a[j + 1] = x0i + x2i;
|
|
x0r -= x2r;
|
|
x0i -= x2i;
|
|
a[j + 4] = wk2r * x0r - wk2i * x0i;
|
|
a[j + 5] = wk2r * x0i + wk2i * x0r;
|
|
x0r = x1r - x3i;
|
|
x0i = x1i + x3r;
|
|
a[j + 2] = wk1r * x0r - wk1i * x0i;
|
|
a[j + 3] = wk1r * x0i + wk1i * x0r;
|
|
x0r = x1r + x3i;
|
|
x0i = x1i - x3r;
|
|
a[j + 6] = wk3r * x0r - wk3i * x0i;
|
|
a[j + 7] = wk3r * x0i + wk3i * x0r;
|
|
wk1r = w[k2 + 2];
|
|
wk1i = w[k2 + 3];
|
|
wk3r = wk1r - 2 * wk2r * wk1i;
|
|
wk3i = 2 * wk2r * wk1r - wk1i;
|
|
x0r = a[j + 8] + a[j + 10];
|
|
x0i = a[j + 9] + a[j + 11];
|
|
x1r = a[j + 8] - a[j + 10];
|
|
x1i = a[j + 9] - a[j + 11];
|
|
x2r = a[j + 12] + a[j + 14];
|
|
x2i = a[j + 13] + a[j + 15];
|
|
x3r = a[j + 12] - a[j + 14];
|
|
x3i = a[j + 13] - a[j + 15];
|
|
a[j + 8] = x0r + x2r;
|
|
a[j + 9] = x0i + x2i;
|
|
x0r -= x2r;
|
|
x0i -= x2i;
|
|
a[j + 12] = -wk2i * x0r - wk2r * x0i;
|
|
a[j + 13] = -wk2i * x0i + wk2r * x0r;
|
|
x0r = x1r - x3i;
|
|
x0i = x1i + x3r;
|
|
a[j + 10] = wk1r * x0r - wk1i * x0i;
|
|
a[j + 11] = wk1r * x0i + wk1i * x0r;
|
|
x0r = x1r + x3i;
|
|
x0i = x1i - x3r;
|
|
a[j + 14] = wk3r * x0r - wk3i * x0i;
|
|
a[j + 15] = wk3r * x0i + wk3i * x0r;
|
|
}
|
|
}
|
|
|
|
static void cftmdl_128(int l, float *a, float *w) {
|
|
const int n = 128;
|
|
int j, j1, j2, j3, k, k1, k2, m, m2;
|
|
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
|
|
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
|
|
|
m = l << 2;
|
|
for (j = 0; j < l; j += 2) {
|
|
j1 = j + l;
|
|
j2 = j1 + l;
|
|
j3 = j2 + l;
|
|
x0r = a[j] + a[j1];
|
|
x0i = a[j + 1] + a[j1 + 1];
|
|
x1r = a[j] - a[j1];
|
|
x1i = a[j + 1] - a[j1 + 1];
|
|
x2r = a[j2] + a[j3];
|
|
x2i = a[j2 + 1] + a[j3 + 1];
|
|
x3r = a[j2] - a[j3];
|
|
x3i = a[j2 + 1] - a[j3 + 1];
|
|
a[j] = x0r + x2r;
|
|
a[j + 1] = x0i + x2i;
|
|
a[j2] = x0r - x2r;
|
|
a[j2 + 1] = x0i - x2i;
|
|
a[j1] = x1r - x3i;
|
|
a[j1 + 1] = x1i + x3r;
|
|
a[j3] = x1r + x3i;
|
|
a[j3 + 1] = x1i - x3r;
|
|
}
|
|
wk1r = w[2];
|
|
for (j = m; j < l + m; j += 2) {
|
|
j1 = j + l;
|
|
j2 = j1 + l;
|
|
j3 = j2 + l;
|
|
x0r = a[j] + a[j1];
|
|
x0i = a[j + 1] + a[j1 + 1];
|
|
x1r = a[j] - a[j1];
|
|
x1i = a[j + 1] - a[j1 + 1];
|
|
x2r = a[j2] + a[j3];
|
|
x2i = a[j2 + 1] + a[j3 + 1];
|
|
x3r = a[j2] - a[j3];
|
|
x3i = a[j2 + 1] - a[j3 + 1];
|
|
a[j] = x0r + x2r;
|
|
a[j + 1] = x0i + x2i;
|
|
a[j2] = x2i - x0i;
|
|
a[j2 + 1] = x0r - x2r;
|
|
x0r = x1r - x3i;
|
|
x0i = x1i + x3r;
|
|
a[j1] = wk1r * (x0r - x0i);
|
|
a[j1 + 1] = wk1r * (x0r + x0i);
|
|
x0r = x3i + x1r;
|
|
x0i = x3r - x1i;
|
|
a[j3] = wk1r * (x0i - x0r);
|
|
a[j3 + 1] = wk1r * (x0i + x0r);
|
|
}
|
|
k1 = 0;
|
|
m2 = 2 * m;
|
|
for (k = m2; k < n; k += m2) {
|
|
k1 += 2;
|
|
k2 = 2 * k1;
|
|
wk2r = w[k1];
|
|
wk2i = w[k1 + 1];
|
|
wk1r = w[k2];
|
|
wk1i = w[k2 + 1];
|
|
wk3r = wk1r - 2 * wk2i * wk1i;
|
|
wk3i = 2 * wk2i * wk1r - wk1i;
|
|
for (j = k; j < l + k; j += 2) {
|
|
j1 = j + l;
|
|
j2 = j1 + l;
|
|
j3 = j2 + l;
|
|
x0r = a[j] + a[j1];
|
|
x0i = a[j + 1] + a[j1 + 1];
|
|
x1r = a[j] - a[j1];
|
|
x1i = a[j + 1] - a[j1 + 1];
|
|
x2r = a[j2] + a[j3];
|
|
x2i = a[j2 + 1] + a[j3 + 1];
|
|
x3r = a[j2] - a[j3];
|
|
x3i = a[j2 + 1] - a[j3 + 1];
|
|
a[j] = x0r + x2r;
|
|
a[j + 1] = x0i + x2i;
|
|
x0r -= x2r;
|
|
x0i -= x2i;
|
|
a[j2] = wk2r * x0r - wk2i * x0i;
|
|
a[j2 + 1] = wk2r * x0i + wk2i * x0r;
|
|
x0r = x1r - x3i;
|
|
x0i = x1i + x3r;
|
|
a[j1] = wk1r * x0r - wk1i * x0i;
|
|
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
|
|
x0r = x1r + x3i;
|
|
x0i = x1i - x3r;
|
|
a[j3] = wk3r * x0r - wk3i * x0i;
|
|
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
|
|
}
|
|
wk1r = w[k2 + 2];
|
|
wk1i = w[k2 + 3];
|
|
wk3r = wk1r - 2 * wk2r * wk1i;
|
|
wk3i = 2 * wk2r * wk1r - wk1i;
|
|
for (j = k + m; j < l + (k + m); j += 2) {
|
|
j1 = j + l;
|
|
j2 = j1 + l;
|
|
j3 = j2 + l;
|
|
x0r = a[j] + a[j1];
|
|
x0i = a[j + 1] + a[j1 + 1];
|
|
x1r = a[j] - a[j1];
|
|
x1i = a[j + 1] - a[j1 + 1];
|
|
x2r = a[j2] + a[j3];
|
|
x2i = a[j2 + 1] + a[j3 + 1];
|
|
x3r = a[j2] - a[j3];
|
|
x3i = a[j2 + 1] - a[j3 + 1];
|
|
a[j] = x0r + x2r;
|
|
a[j + 1] = x0i + x2i;
|
|
x0r -= x2r;
|
|
x0i -= x2i;
|
|
a[j2] = -wk2i * x0r - wk2r * x0i;
|
|
a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
|
|
x0r = x1r - x3i;
|
|
x0i = x1i + x3r;
|
|
a[j1] = wk1r * x0r - wk1i * x0i;
|
|
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
|
|
x0r = x1r + x3i;
|
|
x0i = x1i - x3r;
|
|
a[j3] = wk3r * x0r - wk3i * x0i;
|
|
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void cftfsub_128(float *a, float *w) {
|
|
int j, j1, j2, j3, l;
|
|
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
|
|
|
cft1st_128(a, w);
|
|
cftmdl_128(8, a, w);
|
|
l = 32;
|
|
for (j = 0; j < l; j += 2) {
|
|
j1 = j + l;
|
|
j2 = j1 + l;
|
|
j3 = j2 + l;
|
|
x0r = a[j] + a[j1];
|
|
x0i = a[j + 1] + a[j1 + 1];
|
|
x1r = a[j] - a[j1];
|
|
x1i = a[j + 1] - a[j1 + 1];
|
|
x2r = a[j2] + a[j3];
|
|
x2i = a[j2 + 1] + a[j3 + 1];
|
|
x3r = a[j2] - a[j3];
|
|
x3i = a[j2 + 1] - a[j3 + 1];
|
|
a[j] = x0r + x2r;
|
|
a[j + 1] = x0i + x2i;
|
|
a[j2] = x0r - x2r;
|
|
a[j2 + 1] = x0i - x2i;
|
|
a[j1] = x1r - x3i;
|
|
a[j1 + 1] = x1i + x3r;
|
|
a[j3] = x1r + x3i;
|
|
a[j3 + 1] = x1i - x3r;
|
|
}
|
|
}
|
|
|
|
static void cftbsub_128(float *a, float *w) {
|
|
int j, j1, j2, j3, l;
|
|
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
|
|
|
|
cft1st_128(a, w);
|
|
cftmdl_128(8, a, w);
|
|
l = 32;
|
|
|
|
for (j = 0; j < l; j += 2) {
|
|
j1 = j + l;
|
|
j2 = j1 + l;
|
|
j3 = j2 + l;
|
|
x0r = a[j] + a[j1];
|
|
x0i = -a[j + 1] - a[j1 + 1];
|
|
x1r = a[j] - a[j1];
|
|
x1i = -a[j + 1] + a[j1 + 1];
|
|
x2r = a[j2] + a[j3];
|
|
x2i = a[j2 + 1] + a[j3 + 1];
|
|
x3r = a[j2] - a[j3];
|
|
x3i = a[j2 + 1] - a[j3 + 1];
|
|
a[j] = x0r + x2r;
|
|
a[j + 1] = x0i - x2i;
|
|
a[j2] = x0r - x2r;
|
|
a[j2 + 1] = x0i + x2i;
|
|
a[j1] = x1r - x3i;
|
|
a[j1 + 1] = x1i - x3r;
|
|
a[j3] = x1r + x3i;
|
|
a[j3 + 1] = x1i + x3r;
|
|
}
|
|
}
|
|
|
|
static void rftfsub_128_C(float *a, float *c) {
|
|
int j1, j2, k1, k2;
|
|
float wkr, wki, xr, xi, yr, yi;
|
|
|
|
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
|
|
k2 = 128 - j2;
|
|
k1 = 32 - j1;
|
|
wkr = 0.5f - c[k1];
|
|
wki = c[j1];
|
|
xr = a[j2 + 0] - a[k2 + 0];
|
|
xi = a[j2 + 1] + a[k2 + 1];
|
|
yr = wkr * xr - wki * xi;
|
|
yi = wkr * xi + wki * xr;
|
|
a[j2 + 0] -= yr;
|
|
a[j2 + 1] -= yi;
|
|
a[k2 + 0] += yr;
|
|
a[k2 + 1] -= yi;
|
|
}
|
|
}
|
|
|
|
static void rftbsub_128(float *a, int nc, float *c) {
|
|
const int n = 128;
|
|
int j, k, kk, ks, m;
|
|
float wkr, wki, xr, xi, yr, yi;
|
|
|
|
a[1] = -a[1];
|
|
m = n >> 1;
|
|
ks = 2 * nc / m;
|
|
kk = 0;
|
|
for (j = 2; j < m; j += 2) {
|
|
k = n - j;
|
|
kk += ks;
|
|
wkr = 0.5f - c[nc - kk];
|
|
wki = c[kk];
|
|
xr = a[j] - a[k];
|
|
xi = a[j + 1] + a[k + 1];
|
|
yr = wkr * xr + wki * xi;
|
|
yi = wkr * xi - wki * xr;
|
|
a[j] -= yr;
|
|
a[j + 1] = yi - a[j + 1];
|
|
a[k] += yr;
|
|
a[k + 1] = yi - a[k + 1];
|
|
}
|
|
a[m + 1] = -a[m + 1];
|
|
}
|
|
|
|
void aec_rdft_128(int isgn, float *a, int *ip, float *w)
|
|
{
|
|
const int n = 128;
|
|
int nw, nc;
|
|
float xi;
|
|
|
|
nw = ip[0];
|
|
if (n > (nw << 2)) {
|
|
nw = n >> 2;
|
|
makewt(ip, w);
|
|
}
|
|
nc = ip[1];
|
|
if (n > (nc << 2)) {
|
|
nc = n >> 2;
|
|
makect_32(ip, w + nw);
|
|
}
|
|
if (isgn >= 0) {
|
|
bitrv2_32or128(n, ip + 2, a);
|
|
cftfsub_128(a, w);
|
|
rftfsub_128(a, w + nw);
|
|
xi = a[0] - a[1];
|
|
a[0] += a[1];
|
|
a[1] = xi;
|
|
} else {
|
|
a[1] = 0.5f * (a[0] - a[1]);
|
|
a[0] -= a[1];
|
|
rftbsub_128(a, nc, w + nw);
|
|
bitrv2_32or128(n, ip + 2, a);
|
|
cftbsub_128(a, w);
|
|
}
|
|
}
|
|
|
|
// code path selection
|
|
rftfsub_128_t rftfsub_128;
|
|
|
|
void aec_rdft_init(void) {
|
|
rftfsub_128 = rftfsub_128_C;
|
|
if (WebRtc_GetCPUInfo(kSSE2)) {
|
|
#if defined(__SSE2__)
|
|
aec_rdft_init_sse2();
|
|
#endif
|
|
}
|
|
}
|