diff --git a/src/common_audio/vad/vad_core.c b/src/common_audio/vad/vad_core.c
index ef6665885..daf9fa0cc 100644
--- a/src/common_audio/vad/vad_core.c
+++ b/src/common_audio/vad/vad_core.c
@@ -91,409 +91,385 @@ static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
 static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
 
 // Calculates the probabilities for both speech and background noise using
-// Gaussian Mixture Models. A hypothesis-test is performed to decide which type
-// of signal is most probable.
+// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
+// type of signal is most probable.
 //
-// - inst           [i/o] : Pointer to VAD instance
+// - self           [i/o] : Pointer to VAD instance
 // - feature_vector [i]   : Feature vector = log10(energy in frequency band)
 // - total_power    [i]   : Total power in audio frame.
 // - frame_length   [i]   : Number of input samples
 //
 // - returns              : the VAD decision (0 - noise, 1 - speech).
-static int16_t GmmProbability(VadInstT *inst, int16_t *feature_vector,
-                              int16_t total_power, int frame_length)
-{
-    int n, k;
-    int16_t backval;
-    int16_t h0, h1;
-    int16_t ratvec, xval;
-    int16_t vadflag;
-    int16_t shifts0, shifts1;
-    int16_t tmp16, tmp16_1, tmp16_2;
-    int16_t diff, nr, pos;
-    int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
-    int16_t delt, ndelt;
-    int16_t maxspe, maxmu;
-    int16_t deltaN[kTableSize], deltaS[kTableSize];
-    int16_t ngprvec[kTableSize], sgprvec[kTableSize];
-    int32_t h0test, h1test;
-    int32_t tmp32_1, tmp32_2;
-    int32_t dotVal;
-    int32_t nmid, smid;
-    int32_t probn[kNumGaussians], probs[kNumGaussians];
-    int16_t *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr,
-        *sstd1ptr, *sstd2ptr;
-    int16_t overhead1, overhead2, individualTest, totalTest;
+static int16_t GmmProbability(VadInstT* self, int16_t* feature_vector,
+                              int16_t total_power, int frame_length) {
+  int n, k;
+  int16_t backval;
+  int16_t h0, h1;
+  int16_t ratvec, xval;
+  int16_t vadflag = 0;
+  int16_t shifts0, shifts1;
+  int16_t tmp16, tmp16_1, tmp16_2;
+  int16_t diff;
+  int nr, pos;
+  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
+  int16_t delt, ndelt;
+  int16_t maxspe, maxmu;
+  int16_t deltaN[kTableSize], deltaS[kTableSize];
+  int16_t ngprvec[kTableSize], sgprvec[kTableSize];
+  int32_t h0test, h1test;
+  int32_t tmp32_1, tmp32_2;
+  int32_t dotVal;
+  int32_t nmid, smid;
+  int32_t probn[kNumGaussians], probs[kNumGaussians];
+  int16_t *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr;
+  int16_t *nstd1ptr, *nstd2ptr, *sstd1ptr, *sstd2ptr;
+  int16_t overhead1, overhead2, individualTest, totalTest;
 
-    // Set the thresholds to different values based on frame length
-    if (frame_length == 80)
-    {
-        // 80 input samples
-        overhead1 = inst->over_hang_max_1[0];
-        overhead2 = inst->over_hang_max_2[0];
-        individualTest = inst->individual[0];
-        totalTest = inst->total[0];
-    } else if (frame_length == 160)
-    {
-        // 160 input samples
-        overhead1 = inst->over_hang_max_1[1];
-        overhead2 = inst->over_hang_max_2[1];
-        individualTest = inst->individual[1];
-        totalTest = inst->total[1];
-    } else
-    {
-        // 240 input samples
-        overhead1 = inst->over_hang_max_1[2];
-        overhead2 = inst->over_hang_max_2[2];
-        individualTest = inst->individual[2];
-        totalTest = inst->total[2];
+  // Set various thresholds based on frame lengths (80, 160 or 240 samples).
+  if (frame_length == 80) {
+    overhead1 = self->over_hang_max_1[0];
+    overhead2 = self->over_hang_max_2[0];
+    individualTest = self->individual[0];
+    totalTest = self->total[0];
+  } else if (frame_length == 160) {
+    overhead1 = self->over_hang_max_1[1];
+    overhead2 = self->over_hang_max_2[1];
+    individualTest = self->individual[1];
+    totalTest = self->total[1];
+  } else {
+    overhead1 = self->over_hang_max_1[2];
+    overhead2 = self->over_hang_max_2[2];
+    individualTest = self->individual[2];
+    totalTest = self->total[2];
+  }
+
+  if (total_power > kMinEnergy) {
+    // We have a signal present.
+    // Set pointers to the Gaussian parameters.
+    nmean1ptr = &self->noise_means[0];
+    nmean2ptr = &self->noise_means[kNumChannels];
+    smean1ptr = &self->speech_means[0];
+    smean2ptr = &self->speech_means[kNumChannels];
+    nstd1ptr = &self->noise_stds[0];
+    nstd2ptr = &self->noise_stds[kNumChannels];
+    sstd1ptr = &self->speech_stds[0];
+    sstd2ptr = &self->speech_stds[kNumChannels];
+
+    dotVal = 0;
+    for (n = 0; n < kNumChannels; n++) {
+      // Perform for all channels.
+      pos = (n << 1);
+      xval = feature_vector[n];
+
+      // Probability for Noise, Q7 * Q20 = Q27.
+      tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++,
+                                              &deltaN[pos]);
+      probn[0] = kNoiseDataWeights[n] * tmp32_1;
+      tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
+                                              &deltaN[pos + 1]);
+      probn[1] = kNoiseDataWeights[n + kNumChannels] * tmp32_1;
+      h0test = probn[0] + probn[1];  // Q27
+      h0 = (int16_t) (h0test >> 12);  // Q15
+
+      // Probability for Speech.
+      tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++,
+                                              &deltaS[pos]);
+      probs[0] = kSpeechDataWeights[n] * tmp32_1;
+      tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
+                                              &deltaS[pos + 1]);
+      probs[1] = kSpeechDataWeights[n + kNumChannels] * tmp32_1;
+      h1test = probs[0] + probs[1];  // Q27
+      h1 = (int16_t) (h1test >> 12);  // Q15
+
+      // Calculate the log likelihood ratio. Approximate log2(H1/H0) with
+      // |shifts0| - |shifts1|.
+      shifts0 = WebRtcSpl_NormW32(h0test);
+      shifts1 = WebRtcSpl_NormW32(h1test);
+
+      if ((h0test > 0) && (h1test > 0)) {
+        ratvec = shifts0 - shifts1;
+      } else if (h1test > 0) {
+        ratvec = 31 - shifts1;
+      } else if (h0test > 0) {
+        ratvec = shifts0 - 31;
+      } else {
+        ratvec = 0;
+      }
+
+      // VAD decision with spectrum weighting.
+      dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]);
+
+      // Individual channel test.
+      if ((ratvec << 2) > individualTest) {
+        vadflag = 1;
+      }
+
+      // Probabilities used when updating model.
+      if (h0 > 0) {
+        tmp32_1 = probn[0] & 0xFFFFF000;  // Q27
+        tmp32_2 = (tmp32_1 << 2);  // Q29
+        ngprvec[pos] = (int16_t) WebRtcSpl_DivW32W16(tmp32_2, h0);  // Q14
+        ngprvec[pos + 1] = 16384 - ngprvec[pos];
+      } else {
+        ngprvec[pos] = 16384;
+        ngprvec[pos + 1] = 0;
+      }
+
+      // Probabilities used when updating model.
+      if (h1 > 0) {
+        tmp32_1 = probs[0] & 0xFFFFF000;
+        tmp32_2 = (tmp32_1 << 2);
+        sgprvec[pos] = (int16_t) WebRtcSpl_DivW32W16(tmp32_2, h1);
+        sgprvec[pos + 1] = 16384 - sgprvec[pos];
+      } else {
+        sgprvec[pos] = 0;
+        sgprvec[pos + 1] = 0;
+      }
     }
 
-    if (total_power > kMinEnergy)
-    { // If signal present at all
-
-        // Set pointers to the gaussian parameters
-        nmean1ptr = &inst->noise_means[0];
-        nmean2ptr = &inst->noise_means[kNumChannels];
-        smean1ptr = &inst->speech_means[0];
-        smean2ptr = &inst->speech_means[kNumChannels];
-        nstd1ptr = &inst->noise_stds[0];
-        nstd2ptr = &inst->noise_stds[kNumChannels];
-        sstd1ptr = &inst->speech_stds[0];
-        sstd2ptr = &inst->speech_stds[kNumChannels];
-
-        vadflag = 0;
-        dotVal = 0;
-        for (n = 0; n < kNumChannels; n++)
-        { // For all channels
-
-            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
-            xval = feature_vector[n];
-
-            // Probability for Noise, Q7 * Q20 = Q27
-            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++,
-                                                    &deltaN[pos]);
-            probn[0] = (int32_t)(kNoiseDataWeights[n] * tmp32_1);
-            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
-                                                    &deltaN[pos + 1]);
-            probn[1] = (int32_t)(kNoiseDataWeights[n + kNumChannels] * tmp32_1);
-            h0test = probn[0] + probn[1]; // Q27
-            h0 = (int16_t)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15
-
-            // Probability for Speech
-            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++,
-                                                    &deltaS[pos]);
-            probs[0] = (int32_t)(kSpeechDataWeights[n] * tmp32_1);
-            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
-                                                    &deltaS[pos + 1]);
-            probs[1] = (int32_t)(kSpeechDataWeights[n + kNumChannels] * tmp32_1);
-            h1test = probs[0] + probs[1]; // Q27
-            h1 = (int16_t)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15
-
-            // Get likelihood ratio. Approximate log2(H1/H0) with shifts0 - shifts1
-            shifts0 = WebRtcSpl_NormW32(h0test);
-            shifts1 = WebRtcSpl_NormW32(h1test);
-
-            if ((h0test > 0) && (h1test > 0))
-            {
-                ratvec = shifts0 - shifts1;
-            } else if (h1test > 0)
-            {
-                ratvec = 31 - shifts1;
-            } else if (h0test > 0)
-            {
-                ratvec = shifts0 - 31;
-            } else
-            {
-                ratvec = 0;
-            }
-
-            // VAD decision with spectrum weighting
-            dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]);
-
-            // Individual channel test
-            if ((ratvec << 2) > individualTest)
-            {
-                vadflag = 1;
-            }
-
-            // Probabilities used when updating model
-            if (h0 > 0)
-            {
-                tmp32_1 = probn[0] & 0xFFFFF000; // Q27
-                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29
-                ngprvec[pos] = (int16_t)WebRtcSpl_DivW32W16(tmp32_2, h0);
-                ngprvec[pos + 1] = 16384 - ngprvec[pos];
-            } else
-            {
-                ngprvec[pos] = 16384;
-                ngprvec[pos + 1] = 0;
-            }
-
-            // Probabilities used when updating model
-            if (h1 > 0)
-            {
-                tmp32_1 = probs[0] & 0xFFFFF000;
-                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2);
-                sgprvec[pos] = (int16_t)WebRtcSpl_DivW32W16(tmp32_2, h1);
-                sgprvec[pos + 1] = 16384 - sgprvec[pos];
-            } else
-            {
-                sgprvec[pos] = 0;
-                sgprvec[pos + 1] = 0;
-            }
-        }
-
-        // Overall test
-        if (dotVal >= totalTest)
-        {
-            vadflag |= 1;
-        }
-
-        // Set pointers to the means and standard deviations.
-        nmean1ptr = &inst->noise_means[0];
-        smean1ptr = &inst->speech_means[0];
-        nstd1ptr = &inst->noise_stds[0];
-        sstd1ptr = &inst->speech_stds[0];
-
-        maxspe = 12800;
-
-        // Update the model's parameters
-        for (n = 0; n < kNumChannels; n++)
-        {
-
-            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
-
-            // Get min value in past which is used for long term correction
-            backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4
-
-            // Compute the "global" mean, that is the sum of the two means weighted
-            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
-            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+kNumChannels],
-                                         *(nmean1ptr+kNumChannels));
-            tmp16_1 = (int16_t)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8
-
-            for (k = 0; k < kNumGaussians; k++)
-            {
-
-                nr = pos + k;
-
-                nmean2ptr = nmean1ptr + k * kNumChannels;
-                smean2ptr = smean1ptr + k * kNumChannels;
-                nstd2ptr = nstd1ptr + k * kNumChannels;
-                sstd2ptr = sstd1ptr + k * kNumChannels;
-                nmk = *nmean2ptr;
-                smk = *smean2ptr;
-                nsk = *nstd2ptr;
-                ssk = *sstd2ptr;
-
-                // Update noise mean vector if the frame consists of noise only
-                nmk2 = nmk;
-                if (!vadflag)
-                {
-                    // deltaN = (x-mu)/sigma^2
-                    // ngprvec[k] = probn[k]/(probn[0] + probn[1])
-
-                    delt = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr],
-                                                              deltaN[nr], 11); // Q14*Q11
-                    nmk2 = nmk + (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delt,
-                                                                    kNoiseUpdateConst,
-                                                                    22); // Q7+(Q14*Q15>>22)
-                }
-
-                // Long term correction of the noise mean
-                ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4);
-                ndelt -= tmp16_1; // Q8 - Q8
-                nmk3 = nmk2 + (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ndelt,
-                                                                 kBackEta,
-                                                                 9); // Q7+(Q8*Q8)>>9
-
-                // Control that the noise mean does not drift to much
-                tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7);
-                if (nmk3 < tmp16)
-                    nmk3 = tmp16;
-                tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7);
-                if (nmk3 > tmp16)
-                    nmk3 = tmp16;
-                *nmean2ptr = nmk3;
-
-                if (vadflag)
-                {
-                    // Update speech mean vector:
-                    // deltaS = (x-mu)/sigma^2
-                    // sgprvec[k] = probn[k]/(probn[0] + probn[1])
-
-                    delt = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr],
-                                                              deltaS[nr],
-                                                              11); // (Q14*Q11)>>11=Q14
-                    tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delt,
-                                                               kSpeechUpdateConst,
-                                                               21) + 1;
-                    smk2 = smk + (tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22)
-
-                    // Control that the speech mean does not drift to much
-                    maxmu = maxspe + 640;
-                    if (smk2 < kMinimumMean[k])
-                        smk2 = kMinimumMean[k];
-                    if (smk2 > maxmu)
-                        smk2 = maxmu;
-
-                    *smean2ptr = smk2;
-
-                    // (Q7>>3) = Q4
-                    tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3);
-
-                    tmp16 = feature_vector[n] - tmp16; // Q4
-                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3);
-                    tmp32_2 = tmp32_1 - (int32_t)4096; // Q12
-                    tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2);
-                    tmp32_1 = (int32_t)(tmp16 * tmp32_2);// (Q15>>3)*(Q14>>2)=Q12*Q12=Q24
-
-                    tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20
-
-                    // 0.1 * Q20 / Q7 = Q13
-                    if (tmp32_2 > 0)
-                        tmp16 = (int16_t)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10);
-                    else
-                    {
-                        tmp16 = (int16_t)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10);
-                        tmp16 = -tmp16;
-                    }
-                    // divide by 4 giving an update factor of 0.025
-                    tmp16 += 128; // Rounding
-                    ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
-                    // Division with 8 plus Q7
-                    if (ssk < kMinStd)
-                        ssk = kMinStd;
-                    *sstd2ptr = ssk;
-                } else
-                {
-                    // Update GMM variance vectors
-                    // deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4
-                    tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3);
-
-                    // (Q15>>3) * (Q14>>2) = Q12 * Q12 = Q24
-                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096;
-                    tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2);
-                    tmp32_2 = (int32_t)(tmp16 * tmp32_1);
-                    tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14);
-                    // Q20 * approx 0.001 (2^-10=0.0009766)
-
-                    // Q20 / Q7 = Q13
-                    tmp16 = (int16_t)WebRtcSpl_DivW32W16(tmp32_1, nsk);
-                    if (tmp32_1 > 0)
-                        tmp16 = (int16_t)WebRtcSpl_DivW32W16(tmp32_1, nsk);
-                    else
-                    {
-                        tmp16 = (int16_t)WebRtcSpl_DivW32W16(-tmp32_1, nsk);
-                        tmp16 = -tmp16;
-                    }
-                    tmp16 += 32; // Rounding
-                    nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);
-
-                    if (nsk < kMinStd)
-                        nsk = kMinStd;
-
-                    *nstd2ptr = nsk;
-                }
-            }
-
-            // Separate models if they are too close - nmid in Q14
-            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
-            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+kNumChannels], *nmean2ptr);
-
-            // smid in Q14
-            smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
-            smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+kNumChannels], *smean2ptr);
-
-            // diff = "global" speech mean - "global" noise mean
-            diff = (int16_t)WEBRTC_SPL_RSHIFT_W32(smid, 9);
-            tmp16 = (int16_t)WEBRTC_SPL_RSHIFT_W32(nmid, 9);
-            diff -= tmp16;
-
-            if (diff < kMinimumDifference[n])
-            {
-
-                tmp16 = kMinimumDifference[n] - diff; // Q5
-
-                // tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7
-                // tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7
-                tmp16_1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2);
-                tmp16_2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2);
-
-                // First Gauss, speech model
-                tmp16 = tmp16_1 + *smean1ptr;
-                *smean1ptr = tmp16;
-                smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]);
-
-                // Second Gauss, speech model
-                tmp16 = tmp16_1 + *smean2ptr;
-                *smean2ptr = tmp16;
-                smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+kNumChannels]);
-
-                // First Gauss, noise model
-                tmp16 = *nmean1ptr - tmp16_2;
-                *nmean1ptr = tmp16;
-
-                nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]);
-
-                // Second Gauss, noise model
-                tmp16 = *nmean2ptr - tmp16_2;
-                *nmean2ptr = tmp16;
-                nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+kNumChannels]);
-            }
-
-            // Control that the speech & noise means do not drift to much
-            maxspe = kMaximumSpeech[n];
-            tmp16_2 = (int16_t)WEBRTC_SPL_RSHIFT_W32(smid, 7);
-            if (tmp16_2 > maxspe)
-            { // Upper limit of speech model
-                tmp16_2 -= maxspe;
-
-                *smean1ptr -= tmp16_2;
-                *smean2ptr -= tmp16_2;
-            }
-
-            tmp16_2 = (int16_t)WEBRTC_SPL_RSHIFT_W32(nmid, 7);
-            if (tmp16_2 > kMaximumNoise[n])
-            {
-                tmp16_2 -= kMaximumNoise[n];
-
-                *nmean1ptr -= tmp16_2;
-                *nmean2ptr -= tmp16_2;
-            }
-
-            nmean1ptr++;
-            smean1ptr++;
-            nstd1ptr++;
-            sstd1ptr++;
-        }
-        inst->frame_counter++;
-    } else
-    {
-        vadflag = 0;
+    // Overall test.
+    if (dotVal >= totalTest) {
+      vadflag |= 1;
     }
 
-    // Hangover smoothing
-    if (!vadflag)
-    {
-        if (inst->over_hang > 0)
-        {
-            vadflag = 2 + inst->over_hang;
-            inst->over_hang = inst->over_hang - 1;
+    // Set pointers to the means and standard deviations.
+    nmean1ptr = &self->noise_means[0];
+    smean1ptr = &self->speech_means[0];
+    nstd1ptr = &self->noise_stds[0];
+    sstd1ptr = &self->speech_stds[0];
+
+    maxspe = 12800;
+
+    // Update the model parameters.
+    for (n = 0; n < kNumChannels; n++) {
+      pos = (n << 1);
+
+      // Get minimum value in past which is used for long term correction.
+      backval = WebRtcVad_FindMinimum(self, feature_vector[n], n);  // Q4
+
+      // Compute the "global" mean, that is the sum of the two means weighted.
+      nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);  // Q7 * Q7
+      nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n + kNumChannels],
+                                   *(nmean1ptr + kNumChannels));
+      tmp16_1 = (int16_t) (nmid >> 6);  // Q8
+
+      for (k = 0; k < kNumGaussians; k++) {
+        nr = pos + k;
+
+        nmean2ptr = nmean1ptr + k * kNumChannels;
+        smean2ptr = smean1ptr + k * kNumChannels;
+        nstd2ptr = nstd1ptr + k * kNumChannels;
+        sstd2ptr = sstd1ptr + k * kNumChannels;
+        nmk = *nmean2ptr;
+        smk = *smean2ptr;
+        nsk = *nstd2ptr;
+        ssk = *sstd2ptr;
+
+        // Update noise mean vector if the frame consists of noise only.
+        nmk2 = nmk;
+        if (!vadflag) {
+          // deltaN = (x-mu)/sigma^2
+          // ngprvec[k] = probn[k]/(probn[0] + probn[1])
+
+          // (Q14 * Q11 >> 11) = Q14.
+          delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr], deltaN[nr],
+                                                     11);
+          // Q7 + (Q14 * Q15 >> 22) = Q7.
+          nmk2 = nmk + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
+                                                           kNoiseUpdateConst,
+                                                           22);
         }
-        inst->num_of_speech = 0;
-    } else
-    {
-        inst->num_of_speech = inst->num_of_speech + 1;
-        if (inst->num_of_speech > kMaxSpeechFrames)
-        {
-            inst->num_of_speech = kMaxSpeechFrames;
-            inst->over_hang = overhead2;
-        } else
-            inst->over_hang = overhead1;
+
+        // Long term correction of the noise mean.
+        // Q8 - Q8 = Q8.
+        ndelt = (backval << 4) - tmp16_1;
+        // Q7 + (Q8 * Q8) >> 9 = Q7.
+        nmk3 = nmk2 + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ndelt, kBackEta, 9);
+
+        // Control that the noise mean does not drift too much.
+        tmp16 = (int16_t) ((k + 5) << 7);
+        if (nmk3 < tmp16) {
+          nmk3 = tmp16;
+        }
+        tmp16 = (int16_t) ((72 + k - n) << 7);
+        if (nmk3 > tmp16) {
+          nmk3 = tmp16;
+        }
+        *nmean2ptr = nmk3;
+
+        if (vadflag) {
+          // Update speech mean vector:
+          // |deltaS| = (x-mu)/sigma^2
+          // sgprvec[k] = probn[k]/(probn[0] + probn[1])
+
+          // (Q14 * Q11) >> 11 = Q14.
+          delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr], deltaS[nr],
+                                                     11);
+          // Q14 * Q15 >> 21 = Q8.
+          tmp16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt, kSpeechUpdateConst,
+                                                      21);
+          // Q7 + (Q8 >> 1) = Q7. With rounding.
+          smk2 = smk + ((tmp16 + 1) >> 1);
+
+          // Control that the speech mean does not drift too much.
+          maxmu = maxspe + 640;
+          if (smk2 < kMinimumMean[k]) {
+            smk2 = kMinimumMean[k];
+          }
+          if (smk2 > maxmu) {
+            smk2 = maxmu;
+          }
+          *smean2ptr = smk2;  // Q7.
+
+          // (Q7 >> 3) = Q4. With rounding.
+          tmp16 = ((smk + 4) >> 3);
+
+          tmp16 = feature_vector[n] - tmp16;  // Q4
+          // (Q11 * Q4 >> 3) = Q12.
+          tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3);
+          tmp32_2 = tmp32_1 - 4096;
+          tmp16 = (sgprvec[nr] >> 2);
+          // (Q14 >> 2) * Q12 = Q24.
+          tmp32_1 = tmp16 * tmp32_2;
+
+          tmp32_2 = (tmp32_1 >> 4);  // Q20
+
+          // 0.1 * Q20 / Q7 = Q13.
+          if (tmp32_2 > 0) {
+            tmp16 = (int16_t) WebRtcSpl_DivW32W16(tmp32_2, ssk * 10);
+          } else {
+            tmp16 = (int16_t) WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10);
+            tmp16 = -tmp16;
+          }
+          // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
+          // Note that division by 4 equals shift by 2, hence,
+          // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
+          tmp16 += 128;  // Rounding.
+          ssk += (tmp16 >> 8);
+          if (ssk < kMinStd) {
+            ssk = kMinStd;
+          }
+          *sstd2ptr = ssk;
+        } else {
+          // Update GMM variance vectors.
+          // deltaN * (feature_vector[n] - nmk) - 1
+          // Q4 - (Q7 >> 3) = Q4.
+          tmp16 = feature_vector[n] - (nmk >> 3);
+          // (Q11 * Q4 >> 3) = Q12.
+          tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096;
+
+          // (Q14 >> 2) * Q12 = Q24.
+          tmp16 = ((ngprvec[nr] + 2) >> 2);
+          tmp32_2 = tmp16 * tmp32_1;
+          // Q20 * approx 0.001 (2^-10=0.0009766), hence,
+          // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
+          tmp32_1 = (tmp32_2 >> 14);
+
+          // Q20 / Q7 = Q13.
+          if (tmp32_1 > 0) {
+            tmp16 = (int16_t) WebRtcSpl_DivW32W16(tmp32_1, nsk);
+          } else {
+            tmp16 = (int16_t) WebRtcSpl_DivW32W16(-tmp32_1, nsk);
+            tmp16 = -tmp16;
+          }
+          tmp16 += 32;  // Rounding.
+          nsk += (tmp16 >> 6);  // Q13 >> 6 = Q7.
+          if (nsk < kMinStd) {
+            nsk = kMinStd;
+          }
+          *nstd2ptr = nsk;
+        }
+      }
+
+      // Separate models if they are too close.
+      // |nmid| in Q14 (= Q7 * Q7).
+      nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
+      nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n + kNumChannels],
+                                   *nmean2ptr);
+
+      // |smid| in Q14 (= Q7 * Q7).
+      smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
+      smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n + kNumChannels],
+                                   *smean2ptr);
+
+      // |diff| = "global" speech mean - "global" noise mean.
+      // (Q14 >> 9) - (Q14 >> 9) = Q5.
+      diff = (int16_t) (smid >> 9) - (int16_t) (nmid >> 9);
+      if (diff < kMinimumDifference[n]) {
+        tmp16 = kMinimumDifference[n] - diff;
+
+        // |tmp16_1| = ~0.8 * (kMinimumDifference - diff) in Q7.
+        // |tmp16_2| = ~0.2 * (kMinimumDifference - diff) in Q7.
+        tmp16_1 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2);
+        tmp16_2 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2);
+
+        // First Gaussian, speech model.
+        tmp16 = tmp16_1 + *smean1ptr;
+        *smean1ptr = tmp16;
+        smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]);
+
+        // Second Gaussian, speech model.
+        tmp16 = tmp16_1 + *smean2ptr;
+        *smean2ptr = tmp16;
+        smid += WEBRTC_SPL_MUL_16_16(tmp16,
+                                     kSpeechDataWeights[n + kNumChannels]);
+
+        // First Gaussian, noise model.
+        tmp16 = *nmean1ptr - tmp16_2;
+        *nmean1ptr = tmp16;
+        nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]);
+
+        // Second Gaussian, noise model.
+        tmp16 = *nmean2ptr - tmp16_2;
+        *nmean2ptr = tmp16;
+        nmid += WEBRTC_SPL_MUL_16_16(tmp16,
+                                     kNoiseDataWeights[n + kNumChannels]);
+      }
+
+      // Control that the speech & noise means do not drift too much.
+      maxspe = kMaximumSpeech[n];
+      tmp16_2 = (int16_t) (smid >> 7);
+      if (tmp16_2 > maxspe) {
+        // Upper limit of speech model.
+        tmp16_2 -= maxspe;
+
+        *smean1ptr -= tmp16_2;
+        *smean2ptr -= tmp16_2;
+      }
+
+      tmp16_2 = (int16_t) (nmid >> 7);
+      if (tmp16_2 > kMaximumNoise[n]) {
+        tmp16_2 -= kMaximumNoise[n];
+
+        *nmean1ptr -= tmp16_2;
+        *nmean2ptr -= tmp16_2;
+      }
+
+      nmean1ptr++;
+      smean1ptr++;
+      nstd1ptr++;
+      sstd1ptr++;
     }
-    return vadflag;
+    self->frame_counter++;
+  }
+
+  // Smooth with respect to transition hysteresis.
+  if (!vadflag) {
+    if (self->over_hang > 0) {
+      vadflag = 2 + self->over_hang;
+      self->over_hang--;
+    }
+    self->num_of_speech = 0;
+  } else {
+    self->num_of_speech++;
+    if (self->num_of_speech > kMaxSpeechFrames) {
+      self->num_of_speech = kMaxSpeechFrames;
+      self->over_hang = overhead2;
+    } else {
+      self->over_hang = overhead1;
+    }
+  }
+  return vadflag;
 }
 
 // Initialize the VAD. Set aggressiveness mode to default value.
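
The log2(H1/H0) approximation in the hypothesis test above is worth illustrating on its own. WebRtcSpl_NormW32() returns the number of left shifts needed to normalize a positive 32-bit value, which for positive inputs equals 30 minus the integer part of log2, so the difference of two norms approximates the difference of the two exponents. A minimal sketch of the trick; NormW32() here is a simplified stand-in for the SPL routine, not the library implementation:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for WebRtcSpl_NormW32(): the number of left shifts
 * needed so bit 30 becomes the highest set bit. For x > 0 this equals
 * 30 - floor(log2(x)); for x <= 0 we return 0, as the caller guards it. */
static int NormW32(int32_t x) {
  int shifts = 0;
  if (x <= 0) return 0;
  while (x < 0x40000000) {
    x <<= 1;
    shifts++;
  }
  return shifts;
}

/* Integer approximation of log2(h1 / h0), mirroring the
 * shifts0 - shifts1 logic in GmmProbability(). */
static int LogLikelihoodRatio(int32_t h0, int32_t h1) {
  int shifts0 = NormW32(h0);
  int shifts1 = NormW32(h1);
  if (h0 > 0 && h1 > 0) return shifts0 - shifts1;
  if (h1 > 0) return 31 - shifts1;  /* h0 == 0: ratio is effectively infinite. */
  if (h0 > 0) return shifts0 - 31;  /* h1 == 0: ratio is effectively zero. */
  return 0;
}

int main(void) {
  /* log2(4096 / 512) = 3; the norm difference recovers the exponent gap. */
  printf("%d\n", LogLikelihoodRatio(512, 4096));  /* prints 3 */
  return 0;
}

Costing only two normalizations and a subtraction, this keeps the per-channel test cheap enough to run on every 10-30 ms frame without a log table.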
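The Q-format bookkeeping in the comments (Q7, Q11, Q14, Q15) is plain fixed-point scaling: a Qn integer holds value * 2^n, a product of Qa and Qb values is Q(a+b), and a right shift by s drops it to Q(a+b-s). A worked sketch of the noise-mean update arithmetic under those conventions; the input values are hypothetical examples chosen for round numbers, not constants taken from the library:

#include <stdint.h>
#include <stdio.h>

/* (a * b) >> shift in 32-bit intermediate precision, the pattern behind
 * the WEBRTC_SPL_MUL_16_16_RSFT macro. */
static int16_t MulRsft(int16_t a, int16_t b, int shift) {
  return (int16_t)(((int32_t)a * b) >> shift);
}

int main(void) {
  /* Hypothetical values in the same Q formats as GmmProbability(). */
  int16_t ngprvec = 16384;          /* 1.0 in Q14: all weight on Gaussian 0. */
  int16_t deltaN = 1024;            /* 0.5 in Q11. */
  int16_t noise_update_const = 655; /* ~0.02 in Q15 (example value). */
  int16_t nmk = 640;                /* 5.0 in Q7: current noise mean. */

  /* Q14 * Q11 >> 11 = Q14. */
  int16_t delt = MulRsft(ngprvec, deltaN, 11);
  /* Q7 + (Q14 * Q15 >> 22) = Q7. */
  int16_t nmk2 = nmk + MulRsft(delt, noise_update_const, 22);

  /* Prints delt = 8192 (0.5 in Q14) and nmk2 = 641 (~5.008 in Q7). */
  printf("delt (Q14) = %d, updated mean (Q7) = %d\n", delt, nmk2);
  return 0;
}

The shift amounts are exactly the difference between the product's Q format and the target format, which is why the comments in the diff track formats line by line.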
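Finally, the hangover smoothing at the end of GmmProbability() can be isolated: frames classified as noise right after speech are still reported as active, which keeps the decision from chattering at word boundaries. A self-contained sketch of the same state machine under the assumptions that struct and parameter names here are illustrative, not the library's API:

#include <stdio.h>

/* Per-stream state, mirroring the over_hang / num_of_speech fields of
 * VadInstT (names illustrative). */
typedef struct {
  int over_hang;
  int num_of_speech;
} HangoverState;

/* raw_vad is the frame's GMM decision (0 or 1). Returns 0 for noise,
 * 1 for speech, and values >= 2 for hangover-extended activity, matching
 * the vadflag = 2 + over_hang convention in the diff. */
static int Smooth(HangoverState* s, int raw_vad,
                  int overhead1, int overhead2, int max_speech_frames) {
  int vadflag = raw_vad;
  if (!vadflag) {
    if (s->over_hang > 0) {
      vadflag = 2 + s->over_hang;  /* Still active during hangover. */
      s->over_hang--;
    }
    s->num_of_speech = 0;
  } else {
    s->num_of_speech++;
    if (s->num_of_speech > max_speech_frames) {
      s->num_of_speech = max_speech_frames;
      s->over_hang = overhead2;  /* Different hangover after a long burst. */
    } else {
      s->over_hang = overhead1;
    }
  }
  return vadflag;
}

int main(void) {
  HangoverState s = {0, 0};
  int frames[] = {1, 1, 0, 0, 0};
  for (int i = 0; i < 5; i++) {
    /* Prints 1, 1, 3, 0, 0: the first noise frame after speech is
     * reported as active (3) before the detector releases. */
    printf("frame %d -> %d\n", i, Smooth(&s, frames[i], 1, 3, 6));
  }
  return 0;
}

This also explains why the frame-length-dependent overhead1/overhead2 thresholds are loaded at the top of the function: the hangover is measured in frames, so shorter frames need larger counts to cover the same span of time.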