VAD refactoring: Code style changes of local function.
Changes applied to local GmmProbability():
* Replaced shift macros with shift operations
* Indentation and braces
* Removed redundant code
* Removed unnecessary type casts
* Name changes
* Adjusted comments

Tested with vad_unittests and audioproc_unittest

BUG=None
TEST=None
Review URL: https://webrtc-codereview.appspot.com/475002

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1991 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
cac787842c
commit
2273f325b2
@ -91,409 +91,385 @@ static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
|
|||||||
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
|
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
|
||||||
|
|
||||||
// Calculates the probabilities for both speech and background noise using
|
// Calculates the probabilities for both speech and background noise using
|
||||||
// Gaussian Mixture Models. A hypothesis-test is performed to decide which type
|
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
|
||||||
// of signal is most probable.
|
// type of signal is most probable.
|
||||||
//
|
//
|
||||||
// - inst [i/o] : Pointer to VAD instance
|
// - self [i/o] : Pointer to VAD instance
|
||||||
// - feature_vector [i] : Feature vector = log10(energy in frequency band)
|
// - feature_vector [i] : Feature vector = log10(energy in frequency band)
|
||||||
// - total_power [i] : Total power in audio frame.
|
// - total_power [i] : Total power in audio frame.
|
||||||
// - frame_length [i] : Number of input samples
|
// - frame_length [i] : Number of input samples
|
||||||
//
|
//
|
||||||
// - returns : the VAD decision (0 - noise, 1 - speech).
|
// - returns : the VAD decision (0 - noise, 1 - speech).
|
||||||
static int16_t GmmProbability(VadInstT *inst, int16_t *feature_vector,
|
static int16_t GmmProbability(VadInstT* self, int16_t* feature_vector,
|
||||||
int16_t total_power, int frame_length)
|
int16_t total_power, int frame_length) {
|
||||||
{
|
int n, k;
|
||||||
int n, k;
|
int16_t backval;
|
||||||
int16_t backval;
|
int16_t h0, h1;
|
||||||
int16_t h0, h1;
|
int16_t ratvec, xval;
|
||||||
int16_t ratvec, xval;
|
int16_t vadflag = 0;
|
||||||
int16_t vadflag;
|
int16_t shifts0, shifts1;
|
||||||
int16_t shifts0, shifts1;
|
int16_t tmp16, tmp16_1, tmp16_2;
|
||||||
int16_t tmp16, tmp16_1, tmp16_2;
|
int16_t diff;
|
||||||
int16_t diff, nr, pos;
|
int nr, pos;
|
||||||
int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
|
int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
|
||||||
int16_t delt, ndelt;
|
int16_t delt, ndelt;
|
||||||
int16_t maxspe, maxmu;
|
int16_t maxspe, maxmu;
|
||||||
int16_t deltaN[kTableSize], deltaS[kTableSize];
|
int16_t deltaN[kTableSize], deltaS[kTableSize];
|
||||||
int16_t ngprvec[kTableSize], sgprvec[kTableSize];
|
int16_t ngprvec[kTableSize], sgprvec[kTableSize];
|
||||||
int32_t h0test, h1test;
|
int32_t h0test, h1test;
|
||||||
int32_t tmp32_1, tmp32_2;
|
int32_t tmp32_1, tmp32_2;
|
||||||
int32_t dotVal;
|
int32_t dotVal;
|
||||||
int32_t nmid, smid;
|
int32_t nmid, smid;
|
||||||
int32_t probn[kNumGaussians], probs[kNumGaussians];
|
int32_t probn[kNumGaussians], probs[kNumGaussians];
|
||||||
int16_t *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr,
|
int16_t *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr;
|
||||||
*sstd1ptr, *sstd2ptr;
|
int16_t *nstd1ptr, *nstd2ptr, *sstd1ptr, *sstd2ptr;
|
||||||
int16_t overhead1, overhead2, individualTest, totalTest;
|
int16_t overhead1, overhead2, individualTest, totalTest;
|
||||||
|
|
||||||
// Set the thresholds to different values based on frame length
|
// Set various thresholds based on frame lengths (80, 160 or 240 samples).
|
||||||
if (frame_length == 80)
|
if (frame_length == 80) {
|
||||||
{
|
overhead1 = self->over_hang_max_1[0];
|
||||||
// 80 input samples
|
overhead2 = self->over_hang_max_2[0];
|
||||||
overhead1 = inst->over_hang_max_1[0];
|
individualTest = self->individual[0];
|
||||||
overhead2 = inst->over_hang_max_2[0];
|
totalTest = self->total[0];
|
||||||
individualTest = inst->individual[0];
|
} else if (frame_length == 160) {
|
||||||
totalTest = inst->total[0];
|
overhead1 = self->over_hang_max_1[1];
|
||||||
} else if (frame_length == 160)
|
overhead2 = self->over_hang_max_2[1];
|
||||||
{
|
individualTest = self->individual[1];
|
||||||
// 160 input samples
|
totalTest = self->total[1];
|
||||||
overhead1 = inst->over_hang_max_1[1];
|
} else {
|
||||||
overhead2 = inst->over_hang_max_2[1];
|
overhead1 = self->over_hang_max_1[2];
|
||||||
individualTest = inst->individual[1];
|
overhead2 = self->over_hang_max_2[2];
|
||||||
totalTest = inst->total[1];
|
individualTest = self->individual[2];
|
||||||
} else
|
totalTest = self->total[2];
|
||||||
{
|
}
|
||||||
// 240 input samples
|
|
||||||
overhead1 = inst->over_hang_max_1[2];
|
if (total_power > kMinEnergy) {
|
||||||
overhead2 = inst->over_hang_max_2[2];
|
// We have a signal present.
|
||||||
individualTest = inst->individual[2];
|
// Set pointers to the Gaussian parameters.
|
||||||
totalTest = inst->total[2];
|
nmean1ptr = &self->noise_means[0];
|
||||||
|
nmean2ptr = &self->noise_means[kNumChannels];
|
||||||
|
smean1ptr = &self->speech_means[0];
|
||||||
|
smean2ptr = &self->speech_means[kNumChannels];
|
||||||
|
nstd1ptr = &self->noise_stds[0];
|
||||||
|
nstd2ptr = &self->noise_stds[kNumChannels];
|
||||||
|
sstd1ptr = &self->speech_stds[0];
|
||||||
|
sstd2ptr = &self->speech_stds[kNumChannels];
|
||||||
|
|
||||||
|
dotVal = 0;
|
||||||
|
for (n = 0; n < kNumChannels; n++) {
|
||||||
|
// Perform for all channels.
|
||||||
|
pos = (n << 1);
|
||||||
|
xval = feature_vector[n];
|
||||||
|
|
||||||
|
// Probability for Noise, Q7 * Q20 = Q27.
|
||||||
|
tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++,
|
||||||
|
&deltaN[pos]);
|
||||||
|
probn[0] = kNoiseDataWeights[n] * tmp32_1;
|
||||||
|
tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
|
||||||
|
&deltaN[pos + 1]);
|
||||||
|
probn[1] = kNoiseDataWeights[n + kNumChannels] * tmp32_1;
|
||||||
|
h0test = probn[0] + probn[1]; // Q27
|
||||||
|
h0 = (int16_t) (h0test >> 12); // Q15
|
||||||
|
|
||||||
|
// Probability for Speech.
|
||||||
|
tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++,
|
||||||
|
&deltaS[pos]);
|
||||||
|
probs[0] = kSpeechDataWeights[n] * tmp32_1;
|
||||||
|
tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
|
||||||
|
&deltaS[pos + 1]);
|
||||||
|
probs[1] = kSpeechDataWeights[n + kNumChannels] * tmp32_1;
|
||||||
|
h1test = probs[0] + probs[1]; // Q27
|
||||||
|
h1 = (int16_t) (h1test >> 12); // Q15
|
||||||
|
|
||||||
|
// Calculate the log likelihood ratio. Approximate log2(H1/H0) with
|
||||||
|
// |shifts0| - |shifts1|.
|
||||||
|
shifts0 = WebRtcSpl_NormW32(h0test);
|
||||||
|
shifts1 = WebRtcSpl_NormW32(h1test);
|
||||||
|
|
||||||
|
if ((h0test > 0) && (h1test > 0)) {
|
||||||
|
ratvec = shifts0 - shifts1;
|
||||||
|
} else if (h1test > 0) {
|
||||||
|
ratvec = 31 - shifts1;
|
||||||
|
} else if (h0test > 0) {
|
||||||
|
ratvec = shifts0 - 31;
|
||||||
|
} else {
|
||||||
|
ratvec = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// VAD decision with spectrum weighting.
|
||||||
|
dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]);
|
||||||
|
|
||||||
|
// Individual channel test.
|
||||||
|
if ((ratvec << 2) > individualTest) {
|
||||||
|
vadflag = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Probabilities used when updating model.
|
||||||
|
if (h0 > 0) {
|
||||||
|
tmp32_1 = probn[0] & 0xFFFFF000; // Q27
|
||||||
|
tmp32_2 = (tmp32_1 << 2); // Q29
|
||||||
|
ngprvec[pos] = (int16_t) WebRtcSpl_DivW32W16(tmp32_2, h0); // Q14
|
||||||
|
ngprvec[pos + 1] = 16384 - ngprvec[pos];
|
||||||
|
} else {
|
||||||
|
ngprvec[pos] = 16384;
|
||||||
|
ngprvec[pos + 1] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Probabilities used when updating model.
|
||||||
|
if (h1 > 0) {
|
||||||
|
tmp32_1 = probs[0] & 0xFFFFF000;
|
||||||
|
tmp32_2 = (tmp32_1 << 2);
|
||||||
|
sgprvec[pos] = (int16_t) WebRtcSpl_DivW32W16(tmp32_2, h1);
|
||||||
|
sgprvec[pos + 1] = 16384 - sgprvec[pos];
|
||||||
|
} else {
|
||||||
|
sgprvec[pos] = 0;
|
||||||
|
sgprvec[pos + 1] = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (total_power > kMinEnergy)
|
// Overall test.
|
||||||
{ // If signal present at all
|
if (dotVal >= totalTest) {
|
||||||
|
vadflag |= 1;
|
||||||
// Set pointers to the gaussian parameters
|
|
||||||
nmean1ptr = &inst->noise_means[0];
|
|
||||||
nmean2ptr = &inst->noise_means[kNumChannels];
|
|
||||||
smean1ptr = &inst->speech_means[0];
|
|
||||||
smean2ptr = &inst->speech_means[kNumChannels];
|
|
||||||
nstd1ptr = &inst->noise_stds[0];
|
|
||||||
nstd2ptr = &inst->noise_stds[kNumChannels];
|
|
||||||
sstd1ptr = &inst->speech_stds[0];
|
|
||||||
sstd2ptr = &inst->speech_stds[kNumChannels];
|
|
||||||
|
|
||||||
vadflag = 0;
|
|
||||||
dotVal = 0;
|
|
||||||
for (n = 0; n < kNumChannels; n++)
|
|
||||||
{ // For all channels
|
|
||||||
|
|
||||||
pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
|
|
||||||
xval = feature_vector[n];
|
|
||||||
|
|
||||||
// Probability for Noise, Q7 * Q20 = Q27
|
|
||||||
tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++,
|
|
||||||
&deltaN[pos]);
|
|
||||||
probn[0] = (int32_t)(kNoiseDataWeights[n] * tmp32_1);
|
|
||||||
tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
|
|
||||||
&deltaN[pos + 1]);
|
|
||||||
probn[1] = (int32_t)(kNoiseDataWeights[n + kNumChannels] * tmp32_1);
|
|
||||||
h0test = probn[0] + probn[1]; // Q27
|
|
||||||
h0 = (int16_t)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15
|
|
||||||
|
|
||||||
// Probability for Speech
|
|
||||||
tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++,
|
|
||||||
&deltaS[pos]);
|
|
||||||
probs[0] = (int32_t)(kSpeechDataWeights[n] * tmp32_1);
|
|
||||||
tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
|
|
||||||
&deltaS[pos + 1]);
|
|
||||||
probs[1] = (int32_t)(kSpeechDataWeights[n + kNumChannels] * tmp32_1);
|
|
||||||
h1test = probs[0] + probs[1]; // Q27
|
|
||||||
h1 = (int16_t)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15
|
|
||||||
|
|
||||||
// Get likelihood ratio. Approximate log2(H1/H0) with shifts0 - shifts1
|
|
||||||
shifts0 = WebRtcSpl_NormW32(h0test);
|
|
||||||
shifts1 = WebRtcSpl_NormW32(h1test);
|
|
||||||
|
|
||||||
if ((h0test > 0) && (h1test > 0))
|
|
||||||
{
|
|
||||||
ratvec = shifts0 - shifts1;
|
|
||||||
} else if (h1test > 0)
|
|
||||||
{
|
|
||||||
ratvec = 31 - shifts1;
|
|
||||||
} else if (h0test > 0)
|
|
||||||
{
|
|
||||||
ratvec = shifts0 - 31;
|
|
||||||
} else
|
|
||||||
{
|
|
||||||
ratvec = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// VAD decision with spectrum weighting
|
|
||||||
dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]);
|
|
||||||
|
|
||||||
// Individual channel test
|
|
||||||
if ((ratvec << 2) > individualTest)
|
|
||||||
{
|
|
||||||
vadflag = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Probabilities used when updating model
|
|
||||||
if (h0 > 0)
|
|
||||||
{
|
|
||||||
tmp32_1 = probn[0] & 0xFFFFF000; // Q27
|
|
||||||
tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29
|
|
||||||
ngprvec[pos] = (int16_t)WebRtcSpl_DivW32W16(tmp32_2, h0);
|
|
||||||
ngprvec[pos + 1] = 16384 - ngprvec[pos];
|
|
||||||
} else
|
|
||||||
{
|
|
||||||
ngprvec[pos] = 16384;
|
|
||||||
ngprvec[pos + 1] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Probabilities used when updating model
|
|
||||||
if (h1 > 0)
|
|
||||||
{
|
|
||||||
tmp32_1 = probs[0] & 0xFFFFF000;
|
|
||||||
tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2);
|
|
||||||
sgprvec[pos] = (int16_t)WebRtcSpl_DivW32W16(tmp32_2, h1);
|
|
||||||
sgprvec[pos + 1] = 16384 - sgprvec[pos];
|
|
||||||
} else
|
|
||||||
{
|
|
||||||
sgprvec[pos] = 0;
|
|
||||||
sgprvec[pos + 1] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Overall test
|
|
||||||
if (dotVal >= totalTest)
|
|
||||||
{
|
|
||||||
vadflag |= 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set pointers to the means and standard deviations.
|
|
||||||
nmean1ptr = &inst->noise_means[0];
|
|
||||||
smean1ptr = &inst->speech_means[0];
|
|
||||||
nstd1ptr = &inst->noise_stds[0];
|
|
||||||
sstd1ptr = &inst->speech_stds[0];
|
|
||||||
|
|
||||||
maxspe = 12800;
|
|
||||||
|
|
||||||
// Update the model's parameters
|
|
||||||
for (n = 0; n < kNumChannels; n++)
|
|
||||||
{
|
|
||||||
|
|
||||||
pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
|
|
||||||
|
|
||||||
// Get min value in past which is used for long term correction
|
|
||||||
backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4
|
|
||||||
|
|
||||||
// Compute the "global" mean, that is the sum of the two means weighted
|
|
||||||
nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
|
|
||||||
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+kNumChannels],
|
|
||||||
*(nmean1ptr+kNumChannels));
|
|
||||||
tmp16_1 = (int16_t)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8
|
|
||||||
|
|
||||||
for (k = 0; k < kNumGaussians; k++)
|
|
||||||
{
|
|
||||||
|
|
||||||
nr = pos + k;
|
|
||||||
|
|
||||||
nmean2ptr = nmean1ptr + k * kNumChannels;
|
|
||||||
smean2ptr = smean1ptr + k * kNumChannels;
|
|
||||||
nstd2ptr = nstd1ptr + k * kNumChannels;
|
|
||||||
sstd2ptr = sstd1ptr + k * kNumChannels;
|
|
||||||
nmk = *nmean2ptr;
|
|
||||||
smk = *smean2ptr;
|
|
||||||
nsk = *nstd2ptr;
|
|
||||||
ssk = *sstd2ptr;
|
|
||||||
|
|
||||||
// Update noise mean vector if the frame consists of noise only
|
|
||||||
nmk2 = nmk;
|
|
||||||
if (!vadflag)
|
|
||||||
{
|
|
||||||
// deltaN = (x-mu)/sigma^2
|
|
||||||
// ngprvec[k] = probn[k]/(probn[0] + probn[1])
|
|
||||||
|
|
||||||
delt = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr],
|
|
||||||
deltaN[nr], 11); // Q14*Q11
|
|
||||||
nmk2 = nmk + (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delt,
|
|
||||||
kNoiseUpdateConst,
|
|
||||||
22); // Q7+(Q14*Q15>>22)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Long term correction of the noise mean
|
|
||||||
ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4);
|
|
||||||
ndelt -= tmp16_1; // Q8 - Q8
|
|
||||||
nmk3 = nmk2 + (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ndelt,
|
|
||||||
kBackEta,
|
|
||||||
9); // Q7+(Q8*Q8)>>9
|
|
||||||
|
|
||||||
// Control that the noise mean does not drift to much
|
|
||||||
tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7);
|
|
||||||
if (nmk3 < tmp16)
|
|
||||||
nmk3 = tmp16;
|
|
||||||
tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7);
|
|
||||||
if (nmk3 > tmp16)
|
|
||||||
nmk3 = tmp16;
|
|
||||||
*nmean2ptr = nmk3;
|
|
||||||
|
|
||||||
if (vadflag)
|
|
||||||
{
|
|
||||||
// Update speech mean vector:
|
|
||||||
// deltaS = (x-mu)/sigma^2
|
|
||||||
// sgprvec[k] = probn[k]/(probn[0] + probn[1])
|
|
||||||
|
|
||||||
delt = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr],
|
|
||||||
deltaS[nr],
|
|
||||||
11); // (Q14*Q11)>>11=Q14
|
|
||||||
tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delt,
|
|
||||||
kSpeechUpdateConst,
|
|
||||||
21) + 1;
|
|
||||||
smk2 = smk + (tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22)
|
|
||||||
|
|
||||||
// Control that the speech mean does not drift to much
|
|
||||||
maxmu = maxspe + 640;
|
|
||||||
if (smk2 < kMinimumMean[k])
|
|
||||||
smk2 = kMinimumMean[k];
|
|
||||||
if (smk2 > maxmu)
|
|
||||||
smk2 = maxmu;
|
|
||||||
|
|
||||||
*smean2ptr = smk2;
|
|
||||||
|
|
||||||
// (Q7>>3) = Q4
|
|
||||||
tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3);
|
|
||||||
|
|
||||||
tmp16 = feature_vector[n] - tmp16; // Q4
|
|
||||||
tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3);
|
|
||||||
tmp32_2 = tmp32_1 - (int32_t)4096; // Q12
|
|
||||||
tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2);
|
|
||||||
tmp32_1 = (int32_t)(tmp16 * tmp32_2);// (Q15>>3)*(Q14>>2)=Q12*Q12=Q24
|
|
||||||
|
|
||||||
tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20
|
|
||||||
|
|
||||||
// 0.1 * Q20 / Q7 = Q13
|
|
||||||
if (tmp32_2 > 0)
|
|
||||||
tmp16 = (int16_t)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10);
|
|
||||||
else
|
|
||||||
{
|
|
||||||
tmp16 = (int16_t)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10);
|
|
||||||
tmp16 = -tmp16;
|
|
||||||
}
|
|
||||||
// divide by 4 giving an update factor of 0.025
|
|
||||||
tmp16 += 128; // Rounding
|
|
||||||
ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
|
|
||||||
// Division with 8 plus Q7
|
|
||||||
if (ssk < kMinStd)
|
|
||||||
ssk = kMinStd;
|
|
||||||
*sstd2ptr = ssk;
|
|
||||||
} else
|
|
||||||
{
|
|
||||||
// Update GMM variance vectors
|
|
||||||
// deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4
|
|
||||||
tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3);
|
|
||||||
|
|
||||||
// (Q15>>3) * (Q14>>2) = Q12 * Q12 = Q24
|
|
||||||
tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096;
|
|
||||||
tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2);
|
|
||||||
tmp32_2 = (int32_t)(tmp16 * tmp32_1);
|
|
||||||
tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14);
|
|
||||||
// Q20 * approx 0.001 (2^-10=0.0009766)
|
|
||||||
|
|
||||||
// Q20 / Q7 = Q13
|
|
||||||
tmp16 = (int16_t)WebRtcSpl_DivW32W16(tmp32_1, nsk);
|
|
||||||
if (tmp32_1 > 0)
|
|
||||||
tmp16 = (int16_t)WebRtcSpl_DivW32W16(tmp32_1, nsk);
|
|
||||||
else
|
|
||||||
{
|
|
||||||
tmp16 = (int16_t)WebRtcSpl_DivW32W16(-tmp32_1, nsk);
|
|
||||||
tmp16 = -tmp16;
|
|
||||||
}
|
|
||||||
tmp16 += 32; // Rounding
|
|
||||||
nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);
|
|
||||||
|
|
||||||
if (nsk < kMinStd)
|
|
||||||
nsk = kMinStd;
|
|
||||||
|
|
||||||
*nstd2ptr = nsk;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Separate models if they are too close - nmid in Q14
|
|
||||||
nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
|
|
||||||
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+kNumChannels], *nmean2ptr);
|
|
||||||
|
|
||||||
// smid in Q14
|
|
||||||
smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
|
|
||||||
smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+kNumChannels], *smean2ptr);
|
|
||||||
|
|
||||||
// diff = "global" speech mean - "global" noise mean
|
|
||||||
diff = (int16_t)WEBRTC_SPL_RSHIFT_W32(smid, 9);
|
|
||||||
tmp16 = (int16_t)WEBRTC_SPL_RSHIFT_W32(nmid, 9);
|
|
||||||
diff -= tmp16;
|
|
||||||
|
|
||||||
if (diff < kMinimumDifference[n])
|
|
||||||
{
|
|
||||||
|
|
||||||
tmp16 = kMinimumDifference[n] - diff; // Q5
|
|
||||||
|
|
||||||
// tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7
|
|
||||||
// tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7
|
|
||||||
tmp16_1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2);
|
|
||||||
tmp16_2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2);
|
|
||||||
|
|
||||||
// First Gauss, speech model
|
|
||||||
tmp16 = tmp16_1 + *smean1ptr;
|
|
||||||
*smean1ptr = tmp16;
|
|
||||||
smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]);
|
|
||||||
|
|
||||||
// Second Gauss, speech model
|
|
||||||
tmp16 = tmp16_1 + *smean2ptr;
|
|
||||||
*smean2ptr = tmp16;
|
|
||||||
smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+kNumChannels]);
|
|
||||||
|
|
||||||
// First Gauss, noise model
|
|
||||||
tmp16 = *nmean1ptr - tmp16_2;
|
|
||||||
*nmean1ptr = tmp16;
|
|
||||||
|
|
||||||
nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]);
|
|
||||||
|
|
||||||
// Second Gauss, noise model
|
|
||||||
tmp16 = *nmean2ptr - tmp16_2;
|
|
||||||
*nmean2ptr = tmp16;
|
|
||||||
nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+kNumChannels]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Control that the speech & noise means do not drift to much
|
|
||||||
maxspe = kMaximumSpeech[n];
|
|
||||||
tmp16_2 = (int16_t)WEBRTC_SPL_RSHIFT_W32(smid, 7);
|
|
||||||
if (tmp16_2 > maxspe)
|
|
||||||
{ // Upper limit of speech model
|
|
||||||
tmp16_2 -= maxspe;
|
|
||||||
|
|
||||||
*smean1ptr -= tmp16_2;
|
|
||||||
*smean2ptr -= tmp16_2;
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp16_2 = (int16_t)WEBRTC_SPL_RSHIFT_W32(nmid, 7);
|
|
||||||
if (tmp16_2 > kMaximumNoise[n])
|
|
||||||
{
|
|
||||||
tmp16_2 -= kMaximumNoise[n];
|
|
||||||
|
|
||||||
*nmean1ptr -= tmp16_2;
|
|
||||||
*nmean2ptr -= tmp16_2;
|
|
||||||
}
|
|
||||||
|
|
||||||
nmean1ptr++;
|
|
||||||
smean1ptr++;
|
|
||||||
nstd1ptr++;
|
|
||||||
sstd1ptr++;
|
|
||||||
}
|
|
||||||
inst->frame_counter++;
|
|
||||||
} else
|
|
||||||
{
|
|
||||||
vadflag = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Hangover smoothing
|
// Set pointers to the means and standard deviations.
|
||||||
if (!vadflag)
|
nmean1ptr = &self->noise_means[0];
|
||||||
{
|
smean1ptr = &self->speech_means[0];
|
||||||
if (inst->over_hang > 0)
|
nstd1ptr = &self->noise_stds[0];
|
||||||
{
|
sstd1ptr = &self->speech_stds[0];
|
||||||
vadflag = 2 + inst->over_hang;
|
|
||||||
inst->over_hang = inst->over_hang - 1;
|
maxspe = 12800;
|
||||||
|
|
||||||
|
// Update the model parameters.
|
||||||
|
for (n = 0; n < kNumChannels; n++) {
|
||||||
|
pos = (n << 1);
|
||||||
|
|
||||||
|
// Get minimum value in past which is used for long term correction.
|
||||||
|
backval = WebRtcVad_FindMinimum(self, feature_vector[n], n); // Q4
|
||||||
|
|
||||||
|
// Compute the "global" mean, that is the sum of the two means weighted.
|
||||||
|
nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
|
||||||
|
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n + kNumChannels],
|
||||||
|
*(nmean1ptr + kNumChannels));
|
||||||
|
tmp16_1 = (int16_t) (nmid >> 6); // Q8
|
||||||
|
|
||||||
|
for (k = 0; k < kNumGaussians; k++) {
|
||||||
|
nr = pos + k;
|
||||||
|
|
||||||
|
nmean2ptr = nmean1ptr + k * kNumChannels;
|
||||||
|
smean2ptr = smean1ptr + k * kNumChannels;
|
||||||
|
nstd2ptr = nstd1ptr + k * kNumChannels;
|
||||||
|
sstd2ptr = sstd1ptr + k * kNumChannels;
|
||||||
|
nmk = *nmean2ptr;
|
||||||
|
smk = *smean2ptr;
|
||||||
|
nsk = *nstd2ptr;
|
||||||
|
ssk = *sstd2ptr;
|
||||||
|
|
||||||
|
// Update noise mean vector if the frame consists of noise only.
|
||||||
|
nmk2 = nmk;
|
||||||
|
if (!vadflag) {
|
||||||
|
// deltaN = (x-mu)/sigma^2
|
||||||
|
// ngprvec[k] = probn[k]/(probn[0] + probn[1])
|
||||||
|
|
||||||
|
// (Q14 * Q11 >> 11) = Q14.
|
||||||
|
delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr], deltaN[nr],
|
||||||
|
11);
|
||||||
|
// Q7 + (Q14 * Q15 >> 22) = Q7.
|
||||||
|
nmk2 = nmk + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
|
||||||
|
kNoiseUpdateConst,
|
||||||
|
22);
|
||||||
}
|
}
|
||||||
inst->num_of_speech = 0;
|
|
||||||
} else
|
// Long term correction of the noise mean.
|
||||||
{
|
// Q8 - Q8 = Q8.
|
||||||
inst->num_of_speech = inst->num_of_speech + 1;
|
ndelt = (backval << 4) - tmp16_1;
|
||||||
if (inst->num_of_speech > kMaxSpeechFrames)
|
// Q7 + (Q8 * Q8) >> 9 = Q7.
|
||||||
{
|
nmk3 = nmk2 + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ndelt, kBackEta, 9);
|
||||||
inst->num_of_speech = kMaxSpeechFrames;
|
|
||||||
inst->over_hang = overhead2;
|
// Control that the noise mean does not drift too much.
|
||||||
} else
|
tmp16 = (int16_t) ((k + 5) << 7);
|
||||||
inst->over_hang = overhead1;
|
if (nmk3 < tmp16) {
|
||||||
|
nmk3 = tmp16;
|
||||||
|
}
|
||||||
|
tmp16 = (int16_t) ((72 + k - n) << 7);
|
||||||
|
if (nmk3 > tmp16) {
|
||||||
|
nmk3 = tmp16;
|
||||||
|
}
|
||||||
|
*nmean2ptr = nmk3;
|
||||||
|
|
||||||
|
if (vadflag) {
|
||||||
|
// Update speech mean vector:
|
||||||
|
// |deltaS| = (x-mu)/sigma^2
|
||||||
|
// sgprvec[k] = probs[k]/(probs[0] + probs[1])
|
||||||
|
|
||||||
|
// (Q14 * Q11) >> 11 = Q14.
|
||||||
|
delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr], deltaS[nr],
|
||||||
|
11);
|
||||||
|
// Q14 * Q15 >> 21 = Q8.
|
||||||
|
tmp16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt, kSpeechUpdateConst,
|
||||||
|
21);
|
||||||
|
// Q7 + (Q8 >> 1) = Q7. With rounding.
|
||||||
|
smk2 = smk + ((tmp16 + 1) >> 1);
|
||||||
|
|
||||||
|
// Control that the speech mean does not drift too much.
|
||||||
|
maxmu = maxspe + 640;
|
||||||
|
if (smk2 < kMinimumMean[k]) {
|
||||||
|
smk2 = kMinimumMean[k];
|
||||||
|
}
|
||||||
|
if (smk2 > maxmu) {
|
||||||
|
smk2 = maxmu;
|
||||||
|
}
|
||||||
|
*smean2ptr = smk2; // Q7.
|
||||||
|
|
||||||
|
// (Q7 >> 3) = Q4. With rounding.
|
||||||
|
tmp16 = ((smk + 4) >> 3);
|
||||||
|
|
||||||
|
tmp16 = feature_vector[n] - tmp16; // Q4
|
||||||
|
// (Q11 * Q4 >> 3) = Q12.
|
||||||
|
tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3);
|
||||||
|
tmp32_2 = tmp32_1 - 4096;
|
||||||
|
tmp16 = (sgprvec[nr] >> 2);
|
||||||
|
// (Q14 >> 2) * Q12 = Q24.
|
||||||
|
tmp32_1 = tmp16 * tmp32_2;
|
||||||
|
|
||||||
|
tmp32_2 = (tmp32_1 >> 4); // Q20
|
||||||
|
|
||||||
|
// 0.1 * Q20 / Q7 = Q13.
|
||||||
|
if (tmp32_2 > 0) {
|
||||||
|
tmp16 = (int16_t) WebRtcSpl_DivW32W16(tmp32_2, ssk * 10);
|
||||||
|
} else {
|
||||||
|
tmp16 = (int16_t) WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10);
|
||||||
|
tmp16 = -tmp16;
|
||||||
|
}
|
||||||
|
// Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
|
||||||
|
// Note that division by 4 equals shift by 2, hence,
|
||||||
|
// (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
|
||||||
|
tmp16 += 128; // Rounding.
|
||||||
|
ssk += (tmp16 >> 8);
|
||||||
|
if (ssk < kMinStd) {
|
||||||
|
ssk = kMinStd;
|
||||||
|
}
|
||||||
|
*sstd2ptr = ssk;
|
||||||
|
} else {
|
||||||
|
// Update GMM variance vectors.
|
||||||
|
// deltaN * (feature_vector[n] - nmk) - 1
|
||||||
|
// Q4 - (Q7 >> 3) = Q4.
|
||||||
|
tmp16 = feature_vector[n] - (nmk >> 3);
|
||||||
|
// (Q11 * Q4 >> 3) = Q12.
|
||||||
|
tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096;
|
||||||
|
|
||||||
|
// (Q14 >> 2) * Q12 = Q24.
|
||||||
|
tmp16 = ((ngprvec[nr] + 2) >> 2);
|
||||||
|
tmp32_2 = tmp16 * tmp32_1;
|
||||||
|
// Q20 * approx 0.001 (2^-10=0.0009766), hence,
|
||||||
|
// (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
|
||||||
|
tmp32_1 = (tmp32_2 >> 14);
|
||||||
|
|
||||||
|
// Q20 / Q7 = Q13.
|
||||||
|
if (tmp32_1 > 0) {
|
||||||
|
tmp16 = (int16_t) WebRtcSpl_DivW32W16(tmp32_1, nsk);
|
||||||
|
} else {
|
||||||
|
tmp16 = (int16_t) WebRtcSpl_DivW32W16(-tmp32_1, nsk);
|
||||||
|
tmp16 = -tmp16;
|
||||||
|
}
|
||||||
|
tmp16 += 32; // Rounding
|
||||||
|
nsk += (tmp16 >> 6); // Q13 >> 6 = Q7.
|
||||||
|
if (nsk < kMinStd) {
|
||||||
|
nsk = kMinStd;
|
||||||
|
}
|
||||||
|
*nstd2ptr = nsk;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Separate models if they are too close.
|
||||||
|
// |nmid| in Q14 (= Q7 * Q7).
|
||||||
|
nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
|
||||||
|
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n + kNumChannels],
|
||||||
|
*nmean2ptr);
|
||||||
|
|
||||||
|
// |smid| in Q14 (= Q7 * Q7).
|
||||||
|
smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
|
||||||
|
smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n + kNumChannels],
|
||||||
|
*smean2ptr);
|
||||||
|
|
||||||
|
// |diff| = "global" speech mean - "global" noise mean.
|
||||||
|
// (Q14 >> 9) - (Q14 >> 9) = Q5.
|
||||||
|
diff = (int16_t) (smid >> 9) - (int16_t) (nmid >> 9);
|
||||||
|
if (diff < kMinimumDifference[n]) {
|
||||||
|
tmp16 = kMinimumDifference[n] - diff;
|
||||||
|
|
||||||
|
// |tmp16_1| = ~0.8 * (kMinimumDifference - diff) in Q7.
|
||||||
|
// |tmp16_2| = ~0.2 * (kMinimumDifference - diff) in Q7.
|
||||||
|
tmp16_1 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2);
|
||||||
|
tmp16_2 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2);
|
||||||
|
|
||||||
|
// First Gaussian, speech model.
|
||||||
|
tmp16 = tmp16_1 + *smean1ptr;
|
||||||
|
*smean1ptr = tmp16;
|
||||||
|
smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]);
|
||||||
|
|
||||||
|
// Second Gaussian, speech model.
|
||||||
|
tmp16 = tmp16_1 + *smean2ptr;
|
||||||
|
*smean2ptr = tmp16;
|
||||||
|
smid += WEBRTC_SPL_MUL_16_16(tmp16,
|
||||||
|
kSpeechDataWeights[n + kNumChannels]);
|
||||||
|
|
||||||
|
// First Gaussian, noise model.
|
||||||
|
tmp16 = *nmean1ptr - tmp16_2;
|
||||||
|
*nmean1ptr = tmp16;
|
||||||
|
nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]);
|
||||||
|
|
||||||
|
// Second Gaussian, noise model.
|
||||||
|
tmp16 = *nmean2ptr - tmp16_2;
|
||||||
|
*nmean2ptr = tmp16;
|
||||||
|
nmid += WEBRTC_SPL_MUL_16_16(tmp16,
|
||||||
|
kNoiseDataWeights[n + kNumChannels]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Control that the speech & noise means do not drift too much.
|
||||||
|
maxspe = kMaximumSpeech[n];
|
||||||
|
tmp16_2 = (int16_t) (smid >> 7);
|
||||||
|
if (tmp16_2 > maxspe) {
|
||||||
|
// Upper limit of speech model.
|
||||||
|
tmp16_2 -= maxspe;
|
||||||
|
|
||||||
|
*smean1ptr -= tmp16_2;
|
||||||
|
*smean2ptr -= tmp16_2;
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp16_2 = (int16_t) (nmid >> 7);
|
||||||
|
if (tmp16_2 > kMaximumNoise[n]) {
|
||||||
|
tmp16_2 -= kMaximumNoise[n];
|
||||||
|
|
||||||
|
*nmean1ptr -= tmp16_2;
|
||||||
|
*nmean2ptr -= tmp16_2;
|
||||||
|
}
|
||||||
|
|
||||||
|
nmean1ptr++;
|
||||||
|
smean1ptr++;
|
||||||
|
nstd1ptr++;
|
||||||
|
sstd1ptr++;
|
||||||
}
|
}
|
||||||
return vadflag;
|
self->frame_counter++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Smooth with respect to transition hysteresis.
|
||||||
|
if (!vadflag) {
|
||||||
|
if (self->over_hang > 0) {
|
||||||
|
vadflag = 2 + self->over_hang;
|
||||||
|
self->over_hang--;
|
||||||
|
}
|
||||||
|
self->num_of_speech = 0;
|
||||||
|
} else {
|
||||||
|
self->num_of_speech++;
|
||||||
|
if (self->num_of_speech > kMaxSpeechFrames) {
|
||||||
|
self->num_of_speech = kMaxSpeechFrames;
|
||||||
|
self->over_hang = overhead2;
|
||||||
|
} else {
|
||||||
|
self->over_hang = overhead1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return vadflag;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize the VAD. Set aggressiveness mode to default value.
|
// Initialize the VAD. Set aggressiveness mode to default value.
|
||||||
|
Loading…
Reference in New Issue
Block a user