Optimization of "overdrive and suppress":
* float accuracy pow function, vectorized pow approximation, general vectorization. * 10.2% AEC overall speedup for the straight C path. * 16.1% AEC overall speedup for the SSE2 path. Review URL: http://webrtc-codereview.appspot.com/24016 git-svn-id: http://webrtc.googlecode.com/svn/trunk@72 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
706b7258f5
commit
5af7a804ea
@ -82,7 +82,7 @@ static const float sqrtHanning[65] = {
|
||||
weightCurve = [0 ; 0.3 * sqrt(linspace(0,1,64))' + 0.1];
|
||||
fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', weightCurve);
|
||||
*/
|
||||
static const float weightCurve[65] = {
|
||||
const float WebRtcAec_weightCurve[65] = {
|
||||
0.0000f, 0.1000f, 0.1378f, 0.1535f, 0.1655f, 0.1756f,
|
||||
0.1845f, 0.1926f, 0.2000f, 0.2069f, 0.2134f, 0.2195f,
|
||||
0.2254f, 0.2309f, 0.2363f, 0.2414f, 0.2464f, 0.2512f,
|
||||
@ -100,7 +100,7 @@ static const float weightCurve[65] = {
|
||||
overDriveCurve = [sqrt(linspace(0,1,65))' + 1];
|
||||
fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', overDriveCurve);
|
||||
*/
|
||||
static const float overDriveCurve[65] = {
|
||||
const float WebRtcAec_overDriveCurve[65] = {
|
||||
1.0000f, 1.1250f, 1.1768f, 1.2165f, 1.2500f, 1.2795f,
|
||||
1.3062f, 1.3307f, 1.3536f, 1.3750f, 1.3953f, 1.4146f,
|
||||
1.4330f, 1.4507f, 1.4677f, 1.4841f, 1.5000f, 1.5154f,
|
||||
@ -128,7 +128,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
|
||||
static void GetHighbandGain(const float *lambda, float *nlpGainHband);
|
||||
|
||||
// Comfort_noise also computes noise for H band returned in comfortNoiseHband
|
||||
static void ComfortNoise(aec_t *aec, complex_t *efw,
|
||||
static void ComfortNoise(aec_t *aec, float efw[2][PART_LEN1],
|
||||
complex_t *comfortNoiseHband,
|
||||
const float *noisePow, const float *lambda);
|
||||
|
||||
@ -314,9 +314,32 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1],
|
||||
}
|
||||
}
|
||||
|
||||
// Plain C implementation of the "overdrive and suppress" step.
//
// For every frequency bin:
//   1. gains in hNl that exceed hNlFb are blended towards hNlFb using
//      WebRtcAec_weightCurve,
//   2. the gain is raised to the power overDriveSm * WebRtcAec_overDriveCurve,
//   3. both components of the error spectrum efw are scaled by that gain and
//      the imaginary component is negated.
//
// hNl and efw are updated in place; aec is only read (aec->overDriveSm).
static void OverdriveAndSuppress(aec_t *aec, float hNl[PART_LEN1],
                                 const float hNlFb,
                                 float efw[2][PART_LEN1]) {
  int bin;
  for (bin = 0; bin < PART_LEN1; bin++) {
    const float weight = WebRtcAec_weightCurve[bin];
    float gain = hNl[bin];

    // Weight subbands: only gains above the feedback level are pulled down.
    if (gain > hNlFb) {
      gain = weight * hNlFb + (1 - weight) * gain;
    }
    hNl[bin] = powf(gain, aec->overDriveSm * WebRtcAec_overDriveCurve[bin]);

    // Suppress error signal.
    efw[0][bin] *= hNl[bin];
    efw[1][bin] *= hNl[bin];

    // Ooura fft returns incorrect sign on imaginary component. It matters here
    // because we are making an additive change with comfort noise.
    efw[1][bin] *= -1;
  }
}
|
||||
|
||||
WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
|
||||
WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
|
||||
WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
|
||||
WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
|
||||
|
||||
int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
|
||||
{
|
||||
@ -444,6 +467,7 @@ int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
|
||||
WebRtcAec_FilterFar = FilterFar;
|
||||
WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
|
||||
WebRtcAec_FilterAdaptation = FilterAdaptation;
|
||||
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
|
||||
if (WebRtc_GetCPUInfo(kSSE2)) {
|
||||
#if defined(__SSE2__)
|
||||
WebRtcAec_InitAec_SSE2();
|
||||
@ -753,7 +777,8 @@ static void ProcessBlock(aec_t *aec, const short *farend,
|
||||
|
||||
static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, short *outputH)
|
||||
{
|
||||
complex_t dfw[PART_LEN1], efw[PART_LEN1], xfw[PART_LEN1];
|
||||
float efw[2][PART_LEN1], dfw[2][PART_LEN1];
|
||||
complex_t xfw[PART_LEN1];
|
||||
complex_t comfortNoiseHband[PART_LEN1];
|
||||
float fft[PART_LEN2];
|
||||
float scale, dtmp;
|
||||
@ -841,13 +866,13 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
|
||||
}
|
||||
rdft(PART_LEN2, 1, fft, ip, wfft);
|
||||
|
||||
dfw[0][1] = 0;
|
||||
dfw[PART_LEN][1] = 0;
|
||||
dfw[1][0] = 0;
|
||||
dfw[1][PART_LEN] = 0;
|
||||
dfw[0][0] = fft[0];
|
||||
dfw[PART_LEN][0] = fft[1];
|
||||
dfw[0][PART_LEN] = fft[1];
|
||||
for (i = 1; i < PART_LEN; i++) {
|
||||
dfw[i][0] = fft[2 * i];
|
||||
dfw[i][1] = fft[2 * i + 1];
|
||||
dfw[0][i] = fft[2 * i];
|
||||
dfw[1][i] = fft[2 * i + 1];
|
||||
}
|
||||
|
||||
// Windowed error fft
|
||||
@ -856,21 +881,21 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
|
||||
fft[PART_LEN + i] = aec->eBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i];
|
||||
}
|
||||
rdft(PART_LEN2, 1, fft, ip, wfft);
|
||||
efw[0][1] = 0;
|
||||
efw[PART_LEN][1] = 0;
|
||||
efw[1][0] = 0;
|
||||
efw[1][PART_LEN] = 0;
|
||||
efw[0][0] = fft[0];
|
||||
efw[PART_LEN][0] = fft[1];
|
||||
efw[0][PART_LEN] = fft[1];
|
||||
for (i = 1; i < PART_LEN; i++) {
|
||||
efw[i][0] = fft[2 * i];
|
||||
efw[i][1] = fft[2 * i + 1];
|
||||
efw[0][i] = fft[2 * i];
|
||||
efw[1][i] = fft[2 * i + 1];
|
||||
}
|
||||
|
||||
// Smoothed PSD
|
||||
for (i = 0; i < PART_LEN1; i++) {
|
||||
aec->sd[i] = ptrGCoh[0] * aec->sd[i] + ptrGCoh[1] *
|
||||
(dfw[i][0] * dfw[i][0] + dfw[i][1] * dfw[i][1]);
|
||||
(dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
|
||||
aec->se[i] = ptrGCoh[0] * aec->se[i] + ptrGCoh[1] *
|
||||
(efw[i][0] * efw[i][0] + efw[i][1] * efw[i][1]);
|
||||
(efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
|
||||
// We threshold here to protect against the ill-effects of a zero farend.
|
||||
// The threshold is not arbitrarily chosen, but balances protection and
|
||||
// adverse interaction with the algorithm's tuning.
|
||||
@ -879,14 +904,14 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
|
||||
WEBRTC_SPL_MAX(xfw[i][0] * xfw[i][0] + xfw[i][1] * xfw[i][1], 15);
|
||||
|
||||
aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] + ptrGCoh[1] *
|
||||
(dfw[i][0] * efw[i][0] + dfw[i][1] * efw[i][1]);
|
||||
(dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
|
||||
aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] + ptrGCoh[1] *
|
||||
(dfw[i][0] * efw[i][1] - dfw[i][1] * efw[i][0]);
|
||||
(dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
|
||||
|
||||
aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] + ptrGCoh[1] *
|
||||
(dfw[i][0] * xfw[i][0] + dfw[i][1] * xfw[i][1]);
|
||||
(dfw[0][i] * xfw[i][0] + dfw[1][i] * xfw[i][1]);
|
||||
aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] + ptrGCoh[1] *
|
||||
(dfw[i][0] * xfw[i][1] - dfw[i][1] * xfw[i][0]);
|
||||
(dfw[0][i] * xfw[i][1] - dfw[1][i] * xfw[i][0]);
|
||||
|
||||
sdSum += aec->sd[i];
|
||||
seSum += aec->se[i];
|
||||
@ -1007,29 +1032,13 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
|
||||
|
||||
// Smooth the overdrive.
|
||||
if (aec->overDrive < aec->overDriveSm) {
|
||||
aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive;
|
||||
aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive;
|
||||
}
|
||||
else {
|
||||
aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive;
|
||||
}
|
||||
|
||||
for (i = 0; i < PART_LEN1; i++) {
|
||||
// Weight subbands
|
||||
if (hNl[i] > hNlFb) {
|
||||
hNl[i] = weightCurve[i] * hNlFb + (1 - weightCurve[i]) * hNl[i];
|
||||
}
|
||||
|
||||
hNl[i] = (float)pow(hNl[i], aec->overDriveSm * overDriveCurve[i]);
|
||||
|
||||
// Suppress error signal
|
||||
efw[i][0] *= hNl[i];
|
||||
efw[i][1] *= hNl[i];
|
||||
|
||||
// Ooura fft returns incorrect sign on imaginary component.
|
||||
// It matters here because we are making an additive change with comfort noise.
|
||||
efw[i][1] *= -1;
|
||||
aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive;
|
||||
}
|
||||
|
||||
WebRtcAec_OverdriveAndSuppress(aec, hNl, hNlFb, efw);
|
||||
|
||||
#ifdef G167
|
||||
if (aec->cnToggle) {
|
||||
@ -1042,11 +1051,11 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output,
|
||||
|
||||
// Inverse error fft.
|
||||
fft[0] = efw[0][0];
|
||||
fft[1] = efw[PART_LEN][0];
|
||||
fft[1] = efw[0][PART_LEN];
|
||||
for (i = 1; i < PART_LEN; i++) {
|
||||
fft[2*i] = efw[i][0];
|
||||
fft[2*i] = efw[0][i];
|
||||
// Sign change required by Ooura fft.
|
||||
fft[2*i + 1] = -efw[i][1];
|
||||
fft[2*i + 1] = -efw[1][i];
|
||||
}
|
||||
rdft(PART_LEN2, -1, fft, ip, wfft);
|
||||
|
||||
@ -1126,7 +1135,7 @@ static void GetHighbandGain(const float *lambda, float *nlpGainHband)
|
||||
nlpGainHband[0] /= (float)(PART_LEN1 - 1 - freqAvgIc);
|
||||
}
|
||||
|
||||
static void ComfortNoise(aec_t *aec, complex_t *efw,
|
||||
static void ComfortNoise(aec_t *aec, float efw[2][PART_LEN1],
|
||||
complex_t *comfortNoiseHband, const float *noisePow, const float *lambda)
|
||||
{
|
||||
int i, num;
|
||||
@ -1159,8 +1168,8 @@ static void ComfortNoise(aec_t *aec, complex_t *efw,
|
||||
// This is the proper weighting to match the background noise power
|
||||
tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
|
||||
//tmp = 1 - lambda[i];
|
||||
efw[i][0] += tmp * u[i][0];
|
||||
efw[i][1] += tmp * u[i][1];
|
||||
efw[0][i] += tmp * u[i][0];
|
||||
efw[1][i] += tmp * u[i][1];
|
||||
}
|
||||
|
||||
// For H band comfort noise
|
||||
|
@ -176,6 +176,9 @@ typedef void (*WebRtcAec_FilterAdaptation_t)
|
||||
(aec_t *aec, float *fft, float ef[2][PART_LEN1], int ip[IP_LEN],
|
||||
float wfft[W_LEN]);
|
||||
extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
|
||||
typedef void (*WebRtcAec_OverdriveAndSuppress_t)
|
||||
(aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]);
|
||||
extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
|
||||
|
||||
int WebRtcAec_CreateAec(aec_t **aec);
|
||||
int WebRtcAec_FreeAec(aec_t *aec);
|
||||
|
@ -210,10 +210,226 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1],
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

// Approximates a^b for four packed floats at once.
//
//   a^b = exp2(b * log2(a))
//
// exp2(x) and log2(x) are calculated using polynomial approximations.
//
// Parameters:
//   a - bases. Only the exponent and mantissa bits are used; the sign bit is
//       masked away, and zero/denormal/inf/NaN inputs get no special
//       handling, so callers are expected to pass positive normal values.
//   b - exponents.
// Returns the four approximated powers.
//
// Bit-level reinterpretation between __m128 and __m128i goes through the
// _mm_castps_si128 / _mm_castsi128_ps intrinsics rather than C-style vector
// casts, which are a GCC extension and do not compile with Visual C++.
static __m128 mm_pow_ps(__m128 a, __m128 b)
{
  __m128 log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bits
    //    of the mantissa, and putting eight into the biased exponent (to
    //    compensate for the fact that the exponent has been shifted into the
    //    fractional part). The resulting float has the value 256 + e, where e
    //    is the biased exponent of the input. Finally 383 (0x43BF8000) is
    //    subtracted, which removes both the implicit leading one (256) and
    //    the exponent bias (127), leaving n = e - 127.
    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
        {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
        {0x43800000, 0x43800000, 0x43800000, 0x43800000};
    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
        {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
    static const int shift_exponent_into_top_mantissa = 8;
    const __m128 two_n = _mm_and_ps(
        a, *((const __m128 *)float_exponent_mask));
    const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
        _mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
    const __m128 n_0 = _mm_or_ps(
        n_1, *((const __m128 *)eight_biased_exponent));
    const __m128 n = _mm_sub_ps(
        n_0, *((const __m128 *)implicit_leading_one));

    // Compute y: keep the mantissa and force the exponent to zero (biased
    // 127), which yields a value in [1.0, 2.0).
    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
        {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
        {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
    const __m128 mantissa = _mm_and_ps(
        a, *((const __m128 *)mantissa_mask));
    const __m128 y = _mm_or_ps(
        mantissa, *((const __m128 *)zero_biased_exponent_is_one));

    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    // The polynomial is evaluated with Horner's scheme.
    static const ALIGN16_BEG float C5[4] ALIGN16_END =
        {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
    static const ALIGN16_BEG float C4[4] ALIGN16_END =
        {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
    static const ALIGN16_BEG float C3[4] ALIGN16_END =
        {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
    static const ALIGN16_BEG float C2[4] ALIGN16_END =
        {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
    static const ALIGN16_BEG float C1[4] ALIGN16_END =
        {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
    static const ALIGN16_BEG float C0[4] ALIGN16_END =
        {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
    const __m128 pol5_y_0 = _mm_mul_ps(y, *((const __m128 *)C5));
    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((const __m128 *)C4));
    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((const __m128 *)C3));
    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((const __m128 *)C2));
    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((const __m128 *)C1));
    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
    const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((const __m128 *)C0));
    const __m128 y_minus_one = _mm_sub_ps(
        y, *((const __m128 *)zero_biased_exponent_is_one));
    const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = _mm_add_ps(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = _mm_mul_ps(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 converted with
    //       _mm_cvtps_epi32, i.e. rounded according to the current MXCSR
    //       rounding mode (round-to-nearest by default), therefore
    //     y is in the [0, 1] range under the default rounding mode
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    static const ALIGN16_BEG float max_input[4] ALIGN16_END =
        {129.f, 129.f, 129.f, 129.f};
    static const ALIGN16_BEG float min_input[4] ALIGN16_END =
        {-126.99999f, -126.99999f, -126.99999f, -126.99999f};
    const __m128 x_min = _mm_min_ps(
        b_log2_a, *((const __m128 *)max_input));
    const __m128 x_max = _mm_max_ps(x_min, *((const __m128 *)min_input));
    // Compute n.
    static const ALIGN16_BEG float half[4] ALIGN16_END =
        {0.5f, 0.5f, 0.5f, 0.5f};
    const __m128 x_minus_half = _mm_sub_ps(x_max, *((const __m128 *)half));
    const __m128i n_int = _mm_cvtps_epi32(x_minus_half);
    // Compute 2^n: add the exponent bias and move the result into the
    // exponent field of a float.
    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
        {127, 127, 127, 127};
    static const int float_exponent_shift = 23;
    const __m128i two_n_exponent = _mm_add_epi32(
        n_int, *((const __m128i *)float_exponent_bias));
    const __m128 two_n = _mm_castsi128_ps(_mm_slli_epi32(
        two_n_exponent, float_exponent_shift));
    // Compute y.
    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(n_int));
    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    static const ALIGN16_BEG float C2[4] ALIGN16_END =
        {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
    static const ALIGN16_BEG float C1[4] ALIGN16_END =
        {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
    static const ALIGN16_BEG float C0[4] ALIGN16_END =
        {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
    const __m128 exp2_y_0 = _mm_mul_ps(y, *((const __m128 *)C2));
    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((const __m128 *)C1));
    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
    const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((const __m128 *)C0));

    // Combine parts.
    a_exp_b = _mm_mul_ps(exp2_y, two_n);
  }
  return a_exp_b;
}
|
||||
|
||||
extern const float WebRtcAec_weightCurve[65];
|
||||
extern const float WebRtcAec_overDriveCurve[65];
|
||||
|
||||
// SSE2 implementation of the "overdrive and suppress" step.
//
// Performs the same per-bin work as the scalar tail loop below, four bins at a
// time: gains above hNlFb are blended towards hNlFb with
// WebRtcAec_weightCurve, raised to the power
// overDriveSm * WebRtcAec_overDriveCurve (via mm_pow_ps in the vector path),
// and then applied to both components of efw, with the imaginary component
// negated. Bins that do not fill a whole group of four are handled by the
// scalar loop using powf. hNl and efw are updated in place.
static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  const __m128 fb4 = _mm_set1_ps(hNlFb);
  const __m128 one4 = _mm_set1_ps(1.0f);
  const __m128 neg_one4 = _mm_set1_ps(-1.0f);
  const __m128 overdrive4 = _mm_set1_ps(aec->overDriveSm);
  int k = 0;

  // Vector path: process groups of four bins.
  while (k + 3 < PART_LEN1) {
    // Weight subbands. The comparison mask selects, per lane, between the
    // untouched gain and the blended gain.
    const __m128 gain_in = _mm_loadu_ps(&hNl[k]);
    const __m128 weight = _mm_loadu_ps(&WebRtcAec_weightCurve[k]);
    const __m128 above_fb = _mm_cmpgt_ps(gain_in, fb4);
    const __m128 weight_fb = _mm_mul_ps(weight, fb4);
    const __m128 one_minus_weight = _mm_sub_ps(one4, weight);
    const __m128 rest_gain = _mm_mul_ps(one_minus_weight, gain_in);
    const __m128 kept = _mm_andnot_ps(above_fb, gain_in);
    const __m128 blended = _mm_and_ps(above_fb,
                                      _mm_add_ps(weight_fb, rest_gain));
    const __m128 weighted = _mm_or_ps(kept, blended);

    // Overdrive.
    const __m128 curve = _mm_loadu_ps(&WebRtcAec_overDriveCurve[k]);
    const __m128 exponent = _mm_mul_ps(overdrive4, curve);
    const __m128 gain = mm_pow_ps(weighted, exponent);
    __m128 re, im;

    _mm_storeu_ps(&hNl[k], gain);

    // Suppress error signal.
    re = _mm_loadu_ps(&efw[0][k]);
    im = _mm_loadu_ps(&efw[1][k]);
    re = _mm_mul_ps(re, gain);
    im = _mm_mul_ps(im, gain);

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    im = _mm_mul_ps(im, neg_one4);
    _mm_storeu_ps(&efw[0][k], re);
    _mm_storeu_ps(&efw[1][k], im);

    k += 4;
  }

  // Scalar path for the bins left over after the last full group of four.
  while (k < PART_LEN1) {
    const float weight = WebRtcAec_weightCurve[k];
    float gain = hNl[k];

    // Weight subbands.
    if (gain > hNlFb) {
      gain = weight * hNlFb + (1 - weight) * gain;
    }
    hNl[k] = powf(gain, aec->overDriveSm * WebRtcAec_overDriveCurve[k]);

    // Suppress error signal.
    efw[0][k] *= hNl[k];
    efw[1][k] *= hNl[k];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][k] *= -1;

    k++;
  }
}
|
||||
|
||||
void WebRtcAec_InitAec_SSE2(void) {
|
||||
WebRtcAec_FilterFar = FilterFarSSE2;
|
||||
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
|
||||
WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
|
||||
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
|
||||
}
|
||||
|
||||
#endif //__SSE2__
|
||||
|
Loading…
x
Reference in New Issue
Block a user