Optimization/cleanup of 'aec_rdft' initialization (constants, LUT, ...):

* 2.7% AEC overall speedup for the straight C path.
* 3.5% AEC overall speedup for the SSE2 path.
Review URL: http://webrtc-codereview.appspot.com/60001

git-svn-id: http://webrtc.googlecode.com/svn/trunk@152 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
cduvivier@google.com 2011-07-06 18:32:59 +00:00
parent 7c4469bf61
commit fae3b31707
6 changed files with 78 additions and 80 deletions

View File

@ -123,8 +123,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
static void BufferFar(aec_t *aec, const short *farend, int farLen);
static void FetchFar(aec_t *aec, short *farend, int farLen, int knownDelay);
static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
short *outputH);
static void NonLinearProcessing(aec_t *aec, short *output, short *outputH);
static void GetHighbandGain(const float *lambda, float *nlpGainHband);
@ -256,8 +255,7 @@ static void ScaleErrorSignal(aec_t *aec, float ef[2][PART_LEN1])
}
}
static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
int ip[IP_LEN], float wfft[W_LEN]) {
static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
int i, j;
for (i = 0; i < NR_PART; i++) {
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
@ -292,7 +290,7 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN], ef[1][PART_LEN]);
aec_rdft_128(-1, fft, ip, wfft);
aec_rdft_128(-1, fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
// fft scaling
@ -302,7 +300,7 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
fft[j] *= scale;
}
}
aec_rdft_128(1, fft, ip, wfft);
aec_rdft_128(1, fft);
aec->wfBuf[0][pos] += fft[0];
aec->wfBuf[0][pos + PART_LEN] += fft[1];
@ -574,8 +572,6 @@ static void ProcessBlock(aec_t *aec, const short *farend,
float fft[PART_LEN2];
float xf[2][PART_LEN1], yf[2][PART_LEN1], ef[2][PART_LEN1];
complex_t df[PART_LEN1];
int ip[IP_LEN];
float wfft[W_LEN];
const float gPow[2] = {0.9f, 0.1f};
@ -613,9 +609,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
memcpy(aec->dBufH + PART_LEN, dH, sizeof(float) * PART_LEN);
}
// Setting this on the first call initializes work arrays.
ip[0] = 0;
aec_rdft_128(1, fft, ip, wfft);
aec_rdft_128(1, fft);
// Far fft
xf[1][0] = 0;
@ -630,7 +624,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
// Near fft
memcpy(fft, aec->dBuf, sizeof(float) * PART_LEN2);
aec_rdft_128(1, fft, ip, wfft);
aec_rdft_128(1, fft);
df[0][1] = 0;
df[PART_LEN][1] = 0;
df[0][0] = fft[0];
@ -706,7 +700,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
fft[2 * i] = yf[0][i];
fft[2 * i + 1] = yf[1][i];
}
aec_rdft_128(-1, fft, ip, wfft);
aec_rdft_128(-1, fft);
scale = 2.0f / PART_LEN2;
for (i = 0; i < PART_LEN; i++) {
@ -721,7 +715,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
memcpy(aec->eBuf + PART_LEN, e, sizeof(float) * PART_LEN);
memset(fft, 0, sizeof(float) * PART_LEN);
memcpy(fft + PART_LEN, e, sizeof(float) * PART_LEN);
aec_rdft_128(1, fft, ip, wfft);
aec_rdft_128(1, fft);
ef[1][0] = 0;
ef[1][PART_LEN] = 0;
@ -738,12 +732,12 @@ static void ProcessBlock(aec_t *aec, const short *farend,
if (aec->adaptToggle) {
#endif
// Filter adaptation
WebRtcAec_FilterAdaptation(aec, fft, ef, ip, wfft);
WebRtcAec_FilterAdaptation(aec, fft, ef);
#ifdef G167
}
#endif
NonLinearProcessing(aec, ip, wfft, output, outputH);
NonLinearProcessing(aec, output, outputH);
#if defined(AEC_DEBUG) || defined(G167)
for (i = 0; i < PART_LEN; i++) {
@ -777,7 +771,7 @@ static void ProcessBlock(aec_t *aec, const short *farend,
#endif
}
static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, short *outputH)
static void NonLinearProcessing(aec_t *aec, short *output, short *outputH)
{
float efw[2][PART_LEN1], dfw[2][PART_LEN1];
complex_t xfw[PART_LEN1];
@ -844,7 +838,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
fft[i] = aec->xBuf[i] * sqrtHanning[i];
fft[PART_LEN + i] = aec->xBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
}
aec_rdft_128(1, fft, ip, wfft);
aec_rdft_128(1, fft);
xfw[0][1] = 0;
xfw[PART_LEN][1] = 0;
@ -866,7 +860,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
fft[i] = aec->dBuf[i] * sqrtHanning[i];
fft[PART_LEN + i] = aec->dBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
}
aec_rdft_128(1, fft, ip, wfft);
aec_rdft_128(1, fft);
dfw[1][0] = 0;
dfw[1][PART_LEN] = 0;
@ -882,7 +876,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
fft[i] = aec->eBuf[i] * sqrtHanning[i];
fft[PART_LEN + i] = aec->eBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
}
aec_rdft_128(1, fft, ip, wfft);
aec_rdft_128(1, fft);
efw[1][0] = 0;
efw[1][PART_LEN] = 0;
efw[0][0] = fft[0];
@ -1059,7 +1053,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
// Sign change required by Ooura fft.
fft[2*i + 1] = -efw[1][i];
}
aec_rdft_128(-1, fft, ip, wfft);
aec_rdft_128(-1, fft);
// Overlap and add to obtain output.
scale = 2.0f / PART_LEN2;
@ -1091,7 +1085,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
fft[2*i] = comfortNoiseHband[i][0];
fft[2*i + 1] = comfortNoiseHband[i][1];
}
aec_rdft_128(-1, fft, ip, wfft);
aec_rdft_128(-1, fft);
scale = 2.0f / PART_LEN2;
}

View File

@ -172,8 +172,7 @@ extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
#define IP_LEN PART_LEN // this must be at least ceil(2 + sqrt(PART_LEN))
#define W_LEN PART_LEN
typedef void (*WebRtcAec_FilterAdaptation_t)
(aec_t *aec, float *fft, float ef[2][PART_LEN1], int ip[IP_LEN],
float wfft[W_LEN]);
(aec_t *aec, float *fft, float ef[2][PART_LEN1]);
extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
typedef void (*WebRtcAec_OverdriveAndSuppress_t)
(aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);

View File

@ -126,8 +126,7 @@ static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
}
}
static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
int ip[IP_LEN], float wfft[W_LEN]) {
static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
int i, j;
for (i = 0; i < NR_PART; i++) {
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
@ -175,7 +174,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN], ef[1][PART_LEN]);
aec_rdft_128(-1, fft, ip, wfft);
aec_rdft_128(-1, fft);
memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);
// fft scaling
@ -188,7 +187,7 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
_mm_storeu_ps(&fft[j], fft_scale);
}
}
aec_rdft_128(1, fft, ip, wfft);
aec_rdft_128(1, fft);
{
float wt1 = aec->wfBuf[1][pos];

View File

@ -24,6 +24,9 @@
#include "aec_rdft.h"
#include "system_wrappers/interface/cpu_features_wrapper.h"
float rdft_w[64];
static int ip[16];
static void bitrv2_32or128(int n, int *ip, float *a) {
// n is 32 or 128
int j, j1, k, k1, m, m2;
@ -98,7 +101,7 @@ static void bitrv2_32or128(int n, int *ip, float *a) {
}
}
static void makewt(int *ip, float *w) {
static void makewt_32() {
const int nw = 32;
int j, nwh;
float delta, x, y;
@ -107,22 +110,23 @@ static void makewt(int *ip, float *w) {
ip[1] = 1;
nwh = nw >> 1;
delta = atanf(1.0f) / nwh;
w[0] = 1;
w[1] = 0;
w[nwh] = cosf(delta * nwh);
w[nwh + 1] = w[nwh];
rdft_w[0] = 1;
rdft_w[1] = 0;
rdft_w[nwh] = cosf(delta * nwh);
rdft_w[nwh + 1] = rdft_w[nwh];
for (j = 2; j < nwh; j += 2) {
x = cosf(delta * j);
y = sinf(delta * j);
w[j] = x;
w[j + 1] = y;
w[nw - j] = y;
w[nw - j + 1] = x;
rdft_w[j] = x;
rdft_w[j + 1] = y;
rdft_w[nw - j] = y;
rdft_w[nw - j + 1] = x;
}
bitrv2_32or128(nw, ip + 2, w);
bitrv2_32or128(nw, ip + 2, rdft_w);
}
static void makect_32(int *ip, float *c) {
static void makect_32() {
float *c = rdft_w + 32;
const int nc = 32;
int j, nch;
float delta;
@ -138,7 +142,7 @@ static void makect_32(int *ip, float *c) {
}
}
static void cft1st_128(float *a, float *w) {
static void cft1st_128(float *a) {
const int n = 128;
int j, k1, k2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
@ -160,7 +164,7 @@ static void cft1st_128(float *a, float *w) {
a[3] = x1i + x3r;
a[6] = x1r + x3i;
a[7] = x1i - x3r;
wk1r = w[2];
wk1r = rdft_w[2];
x0r = a[8] + a[10];
x0i = a[9] + a[11];
x1r = a[8] - a[10];
@ -185,10 +189,10 @@ static void cft1st_128(float *a, float *w) {
for (j = 16; j < n; j += 16) {
k1 += 2;
k2 = 2 * k1;
wk2r = w[k1];
wk2i = w[k1 + 1];
wk1r = w[k2];
wk1i = w[k2 + 1];
wk2r = rdft_w[k1];
wk2i = rdft_w[k1 + 1];
wk1r = rdft_w[k2];
wk1i = rdft_w[k2 + 1];
wk3r = wk1r - 2 * wk2i * wk1i;
wk3i = 2 * wk2i * wk1r - wk1i;
x0r = a[j] + a[j + 2];
@ -213,8 +217,8 @@ static void cft1st_128(float *a, float *w) {
x0i = x1i - x3r;
a[j + 6] = wk3r * x0r - wk3i * x0i;
a[j + 7] = wk3r * x0i + wk3i * x0r;
wk1r = w[k2 + 2];
wk1i = w[k2 + 3];
wk1r = rdft_w[k2 + 2];
wk1i = rdft_w[k2 + 3];
wk3r = wk1r - 2 * wk2r * wk1i;
wk3i = 2 * wk2r * wk1r - wk1i;
x0r = a[j + 8] + a[j + 10];
@ -242,7 +246,7 @@ static void cft1st_128(float *a, float *w) {
}
}
static void cftmdl_128(int l, float *a, float *w) {
static void cftmdl_128(int l, float *a) {
const int n = 128;
int j, j1, j2, j3, k, k1, k2, m, m2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
@ -270,7 +274,7 @@ static void cftmdl_128(int l, float *a, float *w) {
a[j3] = x1r + x3i;
a[j3 + 1] = x1i - x3r;
}
wk1r = w[2];
wk1r = rdft_w[2];
for (j = m; j < l + m; j += 2) {
j1 = j + l;
j2 = j1 + l;
@ -301,10 +305,10 @@ static void cftmdl_128(int l, float *a, float *w) {
for (k = m2; k < n; k += m2) {
k1 += 2;
k2 = 2 * k1;
wk2r = w[k1];
wk2i = w[k1 + 1];
wk1r = w[k2];
wk1i = w[k2 + 1];
wk2r = rdft_w[k1];
wk2i = rdft_w[k1 + 1];
wk1r = rdft_w[k2];
wk1i = rdft_w[k2 + 1];
wk3r = wk1r - 2 * wk2i * wk1i;
wk3i = 2 * wk2i * wk1r - wk1i;
for (j = k; j < l + k; j += 2) {
@ -334,8 +338,8 @@ static void cftmdl_128(int l, float *a, float *w) {
a[j3] = wk3r * x0r - wk3i * x0i;
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
}
wk1r = w[k2 + 2];
wk1i = w[k2 + 3];
wk1r = rdft_w[k2 + 2];
wk1i = rdft_w[k2 + 3];
wk3r = wk1r - 2 * wk2r * wk1i;
wk3i = 2 * wk2r * wk1r - wk1i;
for (j = k + m; j < l + (k + m); j += 2) {
@ -368,12 +372,12 @@ static void cftmdl_128(int l, float *a, float *w) {
}
}
static void cftfsub_128(float *a, float *w) {
static void cftfsub_128(float *a) {
int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
cft1st_128(a, w);
cftmdl_128(8, a, w);
cft1st_128(a);
cftmdl_128(8, a);
l = 32;
for (j = 0; j < l; j += 2) {
j1 = j + l;
@ -398,12 +402,12 @@ static void cftfsub_128(float *a, float *w) {
}
}
static void cftbsub_128(float *a, float *w) {
static void cftbsub_128(float *a) {
int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
cft1st_128(a, w);
cftmdl_128(8, a, w);
cft1st_128(a);
cftmdl_128(8, a);
l = 32;
for (j = 0; j < l; j += 2) {
@ -429,7 +433,8 @@ static void cftbsub_128(float *a, float *w) {
}
}
static void rftfsub_128_C(float *a, float *c) {
static void rftfsub_128_C(float *a) {
const float *c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
@ -449,7 +454,8 @@ static void rftfsub_128_C(float *a, float *c) {
}
}
static void rftbsub_128_C(float *a, float *c) {
static void rftbsub_128_C(float *a) {
const float *c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
@ -471,33 +477,25 @@ static void rftbsub_128_C(float *a, float *c) {
a[65] = -a[65];
}
void aec_rdft_128(int isgn, float *a, int *ip, float *w)
{
void aec_rdft_128(int isgn, float *a) {
const int n = 128;
int nw;
float xi;
nw = ip[0];
if (n > (nw << 2)) {
nw = n >> 2;
makewt(ip, w);
}
if (n > (ip[1] << 2)) {
makect_32(ip, w + nw);
}
if (isgn >= 0) {
bitrv2_32or128(n, ip + 2, a);
cftfsub_128(a, w);
rftfsub_128(a, w + nw);
cftfsub_128(a);
rftfsub_128(a);
xi = a[0] - a[1];
a[0] += a[1];
a[1] = xi;
} else {
a[1] = 0.5f * (a[0] - a[1]);
a[0] -= a[1];
rftbsub_128(a, w + nw);
rftbsub_128(a);
bitrv2_32or128(n, ip + 2, a);
cftbsub_128(a, w);
cftbsub_128(a);
}
}
@ -513,4 +511,7 @@ void aec_rdft_init(void) {
aec_rdft_init_sse2();
#endif
}
// init library constants.
makewt_32();
makect_32();
}

View File

@ -8,12 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
// constants shared by all paths (C, SSE2).
extern float rdft_w[64];
// code path selection function pointers
typedef void (*rft_sub_128_t)(float *a, float *c);
typedef void (*rft_sub_128_t)(float *a);
extern rft_sub_128_t rftfsub_128;
extern rft_sub_128_t rftbsub_128;
// entry points
void aec_rdft_init(void);
void aec_rdft_init_sse2(void);
void aec_rdft_128(int, float *, int *, float *);
void aec_rdft_128(int isgn, float *a);

View File

@ -20,7 +20,8 @@
# define ALIGN16_END __attribute__((aligned(16)))
#endif
static void rftfsub_128_SSE2(float *a, float *c) {
static void rftfsub_128_SSE2(float *a) {
const float *c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
@ -109,7 +110,8 @@ static void rftfsub_128_SSE2(float *a, float *c) {
}
}
static void rftbsub_128_SSE2(float *a, float *c) {
static void rftbsub_128_SSE2(float *a) {
const float *c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;