Optimization of "overdrive and suppress":

* float accuracy pow function, vectorized pow approximation, general
  vectorization.
* 10.2% AEC overall speedup for the straight C path.
* 16.1% AEC overall speedup for the SSE2 path.
Review URL: http://webrtc-codereview.appspot.com/24016

git-svn-id: http://webrtc.googlecode.com/svn/trunk@72 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
cduvivier@google.com 2011-06-13 18:56:48 +00:00
parent 706b7258f5
commit 5af7a804ea
3 changed files with 273 additions and 45 deletions

View File

@ -82,7 +82,7 @@ static const float sqrtHanning[65] = {
weightCurve = [0 ; 0.3 * sqrt(linspace(0,1,64))' + 0.1];
fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', weightCurve);
*/
static const float weightCurve[65] = {
const float WebRtcAec_weightCurve[65] = {
0.0000f, 0.1000f, 0.1378f, 0.1535f, 0.1655f, 0.1756f,
0.1845f, 0.1926f, 0.2000f, 0.2069f, 0.2134f, 0.2195f,
0.2254f, 0.2309f, 0.2363f, 0.2414f, 0.2464f, 0.2512f,
@ -100,7 +100,7 @@ static const float weightCurve[65] = {
overDriveCurve = [sqrt(linspace(0,1,65))' + 1];
fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', overDriveCurve);
*/
static const float overDriveCurve[65] = {
const float WebRtcAec_overDriveCurve[65] = {
1.0000f, 1.1250f, 1.1768f, 1.2165f, 1.2500f, 1.2795f,
1.3062f, 1.3307f, 1.3536f, 1.3750f, 1.3953f, 1.4146f,
1.4330f, 1.4507f, 1.4677f, 1.4841f, 1.5000f, 1.5154f,
@ -128,7 +128,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
static void GetHighbandGain(const float *lambda, float *nlpGainHband);
// Comfort_noise also computes noise for H band returned in comfortNoiseHband
static void ComfortNoise(aec_t *aec, complex_t *efw,
static void ComfortNoise(aec_t *aec, float efw[2][PART_LEN1],
complex_t *comfortNoiseHband,
const float *noisePow, const float *lambda);
@ -314,9 +314,32 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
}
}
static void OverdriveAndSuppress(aec_t *aec, float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]) {
int i;
for (i = 0; i < PART_LEN1; i++) {
// Weight subbands
if (hNl[i] > hNlFb) {
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
}
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
// Suppress error signal
efw[0][i] *= hNl[i];
efw[1][i] *= hNl[i];
// Ooura fft returns incorrect sign on imaginary component. It matters here
// because we are making an additive change with comfort noise.
efw[1][i] *= -1;
}
}
WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
{
@ -444,6 +467,7 @@ int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
WebRtcAec_FilterFar = FilterFar;
WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
WebRtcAec_FilterAdaptation = FilterAdaptation;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
if (WebRtc_GetCPUInfo(kSSE2)) {
#if defined(__SSE2__)
WebRtcAec_InitAec_SSE2();
@ -753,7 +777,8 @@ static void ProcessBlock(aec_t *aec, const short *farend,
static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, short *outputH)
{
complex_t dfw[PART_LEN1], efw[PART_LEN1], xfw[PART_LEN1];
float efw[2][PART_LEN1], dfw[2][PART_LEN1];
complex_t xfw[PART_LEN1];
complex_t comfortNoiseHband[PART_LEN1];
float fft[PART_LEN2];
float scale, dtmp;
@ -841,13 +866,13 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
}
rdft(PART_LEN2, 1, fft, ip, wfft);
dfw[0][1] = 0;
dfw[PART_LEN][1] = 0;
dfw[1][0] = 0;
dfw[1][PART_LEN] = 0;
dfw[0][0] = fft[0];
dfw[PART_LEN][0] = fft[1];
dfw[0][PART_LEN] = fft[1];
for (i = 1; i < PART_LEN; i++) {
dfw[i][0] = fft[2 * i];
dfw[i][1] = fft[2 * i + 1];
dfw[0][i] = fft[2 * i];
dfw[1][i] = fft[2 * i + 1];
}
// Windowed error fft
@ -856,21 +881,21 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
fft[PART_LEN + i] = aec->eBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
}
rdft(PART_LEN2, 1, fft, ip, wfft);
efw[0][1] = 0;
efw[PART_LEN][1] = 0;
efw[1][0] = 0;
efw[1][PART_LEN] = 0;
efw[0][0] = fft[0];
efw[PART_LEN][0] = fft[1];
efw[0][PART_LEN] = fft[1];
for (i = 1; i < PART_LEN; i++) {
efw[i][0] = fft[2 * i];
efw[i][1] = fft[2 * i + 1];
efw[0][i] = fft[2 * i];
efw[1][i] = fft[2 * i + 1];
}
// Smoothed PSD
for (i = 0; i < PART_LEN1; i++) {
aec->sd[i] = ptrGCoh[0] * aec->sd[i] + ptrGCoh[1] *
(dfw[i][0] * dfw[i][0] + dfw[i][1] * dfw[i][1]);
(dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
aec->se[i] = ptrGCoh[0] * aec->se[i] + ptrGCoh[1] *
(efw[i][0] * efw[i][0] + efw[i][1] * efw[i][1]);
(efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
// We threshold here to protect against the ill-effects of a zero farend.
// The threshold is not arbitrarily chosen, but balances protection and
// adverse interaction with the algorithm's tuning.
@ -879,14 +904,14 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
WEBRTC_SPL_MAX(xfw[i][0] * xfw[i][0] + xfw[i][1] * xfw[i][1], 15);
aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] + ptrGCoh[1] *
(dfw[i][0] * efw[i][0] + dfw[i][1] * efw[i][1]);
(dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] + ptrGCoh[1] *
(dfw[i][0] * efw[i][1] - dfw[i][1] * efw[i][0]);
(dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] + ptrGCoh[1] *
(dfw[i][0] * xfw[i][0] + dfw[i][1] * xfw[i][1]);
(dfw[0][i] * xfw[i][0] + dfw[1][i] * xfw[i][1]);
aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] + ptrGCoh[1] *
(dfw[i][0] * xfw[i][1] - dfw[i][1] * xfw[i][0]);
(dfw[0][i] * xfw[i][1] - dfw[1][i] * xfw[i][0]);
sdSum += aec->sd[i];
seSum += aec->se[i];
@ -1007,29 +1032,13 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
// Smooth the overdrive.
if (aec->overDrive < aec->overDriveSm) {
aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive;
aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive;
}
else {
aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive;
}
for (i = 0; i < PART_LEN1; i++) {
// Weight subbands
if (hNl[i] > hNlFb) {
hNl[i] = weightCurve[i] * hNlFb + (1 - weightCurve[i]) * hNl[i];
}
hNl[i] = (float)pow(hNl[i], aec->overDriveSm * overDriveCurve[i]);
// Suppress error signal
efw[i][0] *= hNl[i];
efw[i][1] *= hNl[i];
// Ooura fft returns incorrect sign on imaginary component.
// It matters here because we are making an additive change with comfort noise.
efw[i][1] *= -1;
aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive;
}
WebRtcAec_OverdriveAndSuppress(aec, hNl, hNlFb, efw);
#ifdef G167
if (aec->cnToggle) {
@ -1042,11 +1051,11 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
// Inverse error fft.
fft[0] = efw[0][0];
fft[1] = efw[PART_LEN][0];
fft[1] = efw[0][PART_LEN];
for (i = 1; i < PART_LEN; i++) {
fft[2*i] = efw[i][0];
fft[2*i] = efw[0][i];
// Sign change required by Ooura fft.
fft[2*i + 1] = -efw[i][1];
fft[2*i + 1] = -efw[1][i];
}
rdft(PART_LEN2, -1, fft, ip, wfft);
@ -1126,7 +1135,7 @@ static void GetHighbandGain(const float *lambda, float *nlpGainHband)
nlpGainHband[0] /= (float)(PART_LEN1 - 1 - freqAvgIc);
}
static void ComfortNoise(aec_t *aec, complex_t *efw,
static void ComfortNoise(aec_t *aec, float efw[2][PART_LEN1],
complex_t *comfortNoiseHband, const float *noisePow, const float *lambda)
{
int i, num;
@ -1159,8 +1168,8 @@ static void ComfortNoise(aec_t *aec, complex_t *efw,
// This is the proper weighting to match the background noise power
tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
//tmp = 1 - lambda[i];
efw[i][0] += tmp * u[i][0];
efw[i][1] += tmp * u[i][1];
efw[0][i] += tmp * u[i][0];
efw[1][i] += tmp * u[i][1];
}
// For H band comfort noise

View File

@ -176,6 +176,9 @@ typedef void (*WebRtcAec_FilterAdaptation_t)
(aec_t *aec, float *fft, float ef[2][PART_LEN1], int ip[IP_LEN],
float wfft[W_LEN]);
extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
typedef void (*WebRtcAec_OverdriveAndSuppress_t)
(aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);
extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
int WebRtcAec_CreateAec(aec_t **aec);
int WebRtcAec_FreeAec(aec_t *aec);

View File

@ -210,10 +210,226 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
}
}
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
static __m128 mm_pow_ps(__m128 a, __m128 b)
{
// a^b = exp2(b * log2(a))
// exp2(x) and log2(x) are calculated using polynomial approximations.
__m128 log2_a, b_log2_a, a_exp_b;
// Calculate log2(x), x = a.
{
// To calculate log2(x), we decompose x like this:
// x = y * 2^n
// n is an integer
// y is in the [1.0, 2.0) range
//
// log2(x) = log2(y) + n
// n can be evaluated by playing with float representation.
// log2(y) in a small range can be approximated, this code uses an order
// five polynomial approximation. The coefficients have been
// estimated with the Remez algorithm and the resulting
// polynomial has a maximum relative error of 0.00086%.
// Compute n.
// This is done by masking the exponent, shifting it into the top bit of
// the mantissa, putting eight into the biased exponent (to shift/
// compensate the fact that the exponent has been shifted in the top/
// fractional part and finally getting rid of the implicit leading one
// from the mantissa by substracting it out.
static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
{0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
{0x43800000, 0x43800000, 0x43800000, 0x43800000};
static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
{0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
static const int shift_exponent_into_top_mantissa = 8;
const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n,
shift_exponent_into_top_mantissa);
const __m128 n_0 = _mm_or_ps(
(__m128)n_1, *((__m128 *)eight_biased_exponent));
const __m128 n = _mm_sub_ps(n_0, *((__m128 *)implicit_leading_one));
// Compute y.
static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
{0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
{0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask));
const __m128 y = _mm_or_ps(
mantissa, *((__m128 *)zero_biased_exponent_is_one));
// Approximate log2(y) ~= (y - 1) * pol5(y).
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
static const ALIGN16_BEG float ALIGN16_END C5[4] =
{-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
static const ALIGN16_BEG float ALIGN16_END C4[4] =
{3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
static const ALIGN16_BEG float ALIGN16_END C3[4] =
{-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
static const ALIGN16_BEG float ALIGN16_END C2[4] =
{2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
static const ALIGN16_BEG float ALIGN16_END C1[4] =
{-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
static const ALIGN16_BEG float ALIGN16_END C0[4] =
{3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128 *)C5));
const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4));
const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3));
const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2));
const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1));
const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128 *)C0));
const __m128 y_minus_one = _mm_sub_ps(
y, *((__m128 *)zero_biased_exponent_is_one));
const __m128 log2_y = _mm_mul_ps(y_minus_one , pol5_y);
// Combine parts.
log2_a = _mm_add_ps(n, log2_y);
}
// b * log2(a)
b_log2_a = _mm_mul_ps(b, log2_a);
// Calculate exp2(x), x = b * log2(a).
{
// To calculate 2^x, we decompose x like this:
// x = n + y
// n is an integer, the value of x - 0.5 rounded down, therefore
// y is in the [0.5, 1.5) range
//
// 2^x = 2^n * 2^y
// 2^n can be evaluated by playing with float representation.
// 2^y in a small range can be approximated, this code uses an order two
// polynomial approximation. The coefficients have been estimated
// with the Remez algorithm and the resulting polynomial has a
// maximum relative error of 0.17%.
// To avoid over/underflow, we reduce the range of input to ]-127, 129].
static const ALIGN16_BEG float max_input[4] ALIGN16_END =
{129.f, 129.f, 129.f, 129.f};
static const ALIGN16_BEG float min_input[4] ALIGN16_END =
{-126.99999f, -126.99999f, -126.99999f, -126.99999f};
const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input));
const __m128 x_max = _mm_max_ps(x_min, *((__m128 *)min_input));
// Compute n.
static const ALIGN16_BEG float half[4] ALIGN16_END =
{0.5f, 0.5f, 0.5f, 0.5f};
const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half));
const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
// Compute 2^n.
static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
{127, 127, 127, 127};
static const int float_exponent_shift = 23;
const __m128i two_n_exponent = _mm_add_epi32(
x_minus_half_floor, *((__m128i *)float_exponent_bias));
const __m128 two_n = (__m128)_mm_slli_epi32(
two_n_exponent, float_exponent_shift);
// Compute y.
const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
// Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
static const ALIGN16_BEG float C2[4] ALIGN16_END =
{3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
static const ALIGN16_BEG float C1[4] ALIGN16_END =
{6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
static const ALIGN16_BEG float C0[4] ALIGN16_END =
{1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128 *)C2));
const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1));
const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128 *)C0));
// Combine parts.
a_exp_b = _mm_mul_ps(exp2_y, two_n);
}
return a_exp_b;
}
extern const float WebRtcAec_weightCurve[65];
extern const float WebRtcAec_overDriveCurve[65];
static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]) {
int i;
const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
const __m128 vec_one = _mm_set1_ps(1.0f);
const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
// vectorized code (four at once)
for (i = 0; i + 3 < PART_LEN1; i+=4) {
// Weight subbands
__m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(
vec_weightCurve, vec_hNlFb);
const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
const __m128 vec_one_weightCurve_hNl = _mm_mul_ps(
vec_one_weightCurve, vec_hNl);
const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
const __m128 vec_if1 = _mm_and_ps(
bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
vec_hNl = _mm_or_ps(vec_if0, vec_if1);
{
const __m128 vec_overDriveCurve = _mm_loadu_ps(
&WebRtcAec_overDriveCurve[i]);
const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps(
vec_overDriveSm, vec_overDriveCurve);
vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
_mm_storeu_ps(&hNl[i], vec_hNl);
}
// Suppress error signal
{
__m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
__m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);
// Ooura fft returns incorrect sign on imaginary component. It matters
// here because we are making an additive change with comfort noise.
vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
_mm_storeu_ps(&efw[0][i], vec_efw_re);
_mm_storeu_ps(&efw[1][i], vec_efw_im);
}
}
// scalar code for the remaining items.
for (; i < PART_LEN1; i++) {
// Weight subbands
if (hNl[i] > hNlFb) {
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
}
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
// Suppress error signal
efw[0][i] *= hNl[i];
efw[1][i] *= hNl[i];
// Ooura fft returns incorrect sign on imaginary component. It matters
// here because we are making an additive change with comfort noise.
efw[1][i] *= -1;
}
}
void WebRtcAec_InitAec_SSE2(void) {
WebRtcAec_FilterFar = FilterFarSSE2;
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
}
#endif //__SSE2__