VAD refactoring: Removed macro file.
In this CL we've replaced the VAD macros with static const or enum. Priority=low BUG= TEST=vad_unittest Review URL: https://webrtc-codereview.appspot.com/453004 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1913 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
ac9fd8af09
commit
a496b03c78
@ -12,7 +12,6 @@
|
||||
|
||||
#include "signal_processing_library.h"
|
||||
#include "typedefs.h"
|
||||
#include "vad_defines.h"
|
||||
#include "vad_filterbank.h"
|
||||
#include "vad_gmm.h"
|
||||
#include "vad_sp.h"
|
||||
@ -66,6 +65,31 @@ static const int16_t kMinStd = 384;
|
||||
static const short kDefaultMode = 0;
|
||||
static const int kInitCheck = 42;
|
||||
|
||||
// Constants used in WebRtcVad_set_mode_core().
|
||||
//
|
||||
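// Thresholds for different frame lengths; index 0, 1 and 2 correspond to
// 10 ms, 20 ms and 30 ms frames. kOverHangMax1* is the overhang for short
// speech bursts and kOverHangMax2* the overhang for long speech bursts.
// kLocalThreshold* and kGlobalThreshold* are copied to |individual| and
// |total| respectively.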
|
||||
//
|
||||
// Mode 0, Quality.
|
||||
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
|
||||
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
|
||||
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
|
||||
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
|
||||
// Mode 1, Low bitrate.
|
||||
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
|
||||
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
|
||||
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
|
||||
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
|
||||
// Mode 2, Aggressive.
|
||||
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
|
||||
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
|
||||
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
|
||||
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
|
||||
// Mode 3, Very aggressive.
|
||||
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
|
||||
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
|
||||
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
|
||||
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
|
||||
|
||||
// Calculates the probabilities for both speech and background noise using
|
||||
// Gaussian Mixture Models. A hypothesis-test is performed to decide which type
|
||||
// of signal is most probable.
|
||||
@ -90,13 +114,13 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
|
||||
WebRtc_Word16 delt, ndelt;
|
||||
WebRtc_Word16 maxspe, maxmu;
|
||||
WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES];
|
||||
WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES];
|
||||
WebRtc_Word16 deltaN[kTableSize], deltaS[kTableSize];
|
||||
WebRtc_Word16 ngprvec[kTableSize], sgprvec[kTableSize];
|
||||
WebRtc_Word32 h0test, h1test;
|
||||
WebRtc_Word32 tmp32_1, tmp32_2;
|
||||
WebRtc_Word32 dotVal;
|
||||
WebRtc_Word32 nmid, smid;
|
||||
WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS];
|
||||
WebRtc_Word32 probn[kNumGaussians], probs[kNumGaussians];
|
||||
WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr,
|
||||
*sstd1ptr, *sstd2ptr;
|
||||
WebRtc_Word16 overhead1, overhead2, individualTest, totalTest;
|
||||
@ -125,22 +149,22 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
totalTest = inst->total[2];
|
||||
}
|
||||
|
||||
if (total_power > MIN_ENERGY)
|
||||
if (total_power > kMinEnergy)
|
||||
{ // If signal present at all
|
||||
|
||||
// Set pointers to the gaussian parameters
|
||||
nmean1ptr = &inst->noise_means[0];
|
||||
nmean2ptr = &inst->noise_means[NUM_CHANNELS];
|
||||
nmean2ptr = &inst->noise_means[kNumChannels];
|
||||
smean1ptr = &inst->speech_means[0];
|
||||
smean2ptr = &inst->speech_means[NUM_CHANNELS];
|
||||
smean2ptr = &inst->speech_means[kNumChannels];
|
||||
nstd1ptr = &inst->noise_stds[0];
|
||||
nstd2ptr = &inst->noise_stds[NUM_CHANNELS];
|
||||
nstd2ptr = &inst->noise_stds[kNumChannels];
|
||||
sstd1ptr = &inst->speech_stds[0];
|
||||
sstd2ptr = &inst->speech_stds[NUM_CHANNELS];
|
||||
sstd2ptr = &inst->speech_stds[kNumChannels];
|
||||
|
||||
vadflag = 0;
|
||||
dotVal = 0;
|
||||
for (n = 0; n < NUM_CHANNELS; n++)
|
||||
for (n = 0; n < kNumChannels; n++)
|
||||
{ // For all channels
|
||||
|
||||
pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
|
||||
@ -152,7 +176,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1);
|
||||
tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
|
||||
&deltaN[pos + 1]);
|
||||
probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1);
|
||||
probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + kNumChannels] * tmp32_1);
|
||||
h0test = probn[0] + probn[1]; // Q27
|
||||
h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15
|
||||
|
||||
@ -162,7 +186,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1);
|
||||
tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
|
||||
&deltaS[pos + 1]);
|
||||
probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1);
|
||||
probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + kNumChannels] * tmp32_1);
|
||||
h1test = probs[0] + probs[1]; // Q27
|
||||
h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15
|
||||
|
||||
@ -235,7 +259,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
maxspe = 12800;
|
||||
|
||||
// Update the model's parameters
|
||||
for (n = 0; n < NUM_CHANNELS; n++)
|
||||
for (n = 0; n < kNumChannels; n++)
|
||||
{
|
||||
|
||||
pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
|
||||
@ -245,19 +269,19 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
|
||||
// Compute the "global" mean, that is the sum of the two means weighted
|
||||
nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
|
||||
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS],
|
||||
*(nmean1ptr+NUM_CHANNELS));
|
||||
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+kNumChannels],
|
||||
*(nmean1ptr+kNumChannels));
|
||||
tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8
|
||||
|
||||
for (k = 0; k < NUM_MODELS; k++)
|
||||
for (k = 0; k < kNumGaussians; k++)
|
||||
{
|
||||
|
||||
nr = pos + k;
|
||||
|
||||
nmean2ptr = nmean1ptr + k * NUM_CHANNELS;
|
||||
smean2ptr = smean1ptr + k * NUM_CHANNELS;
|
||||
nstd2ptr = nstd1ptr + k * NUM_CHANNELS;
|
||||
sstd2ptr = sstd1ptr + k * NUM_CHANNELS;
|
||||
nmean2ptr = nmean1ptr + k * kNumChannels;
|
||||
smean2ptr = smean1ptr + k * kNumChannels;
|
||||
nstd2ptr = nstd1ptr + k * kNumChannels;
|
||||
sstd2ptr = sstd1ptr + k * kNumChannels;
|
||||
nmk = *nmean2ptr;
|
||||
smk = *smean2ptr;
|
||||
nsk = *nstd2ptr;
|
||||
@ -376,11 +400,11 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
|
||||
// Separate models if they are too close - nmid in Q14
|
||||
nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
|
||||
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr);
|
||||
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+kNumChannels], *nmean2ptr);
|
||||
|
||||
// smid in Q14
|
||||
smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
|
||||
smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr);
|
||||
smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+kNumChannels], *smean2ptr);
|
||||
|
||||
// diff = "global" speech mean - "global" noise mean
|
||||
diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9);
|
||||
@ -405,7 +429,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
// Second Gauss, speech model
|
||||
tmp16 = tmp16_1 + *smean2ptr;
|
||||
*smean2ptr = tmp16;
|
||||
smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]);
|
||||
smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+kNumChannels]);
|
||||
|
||||
// First Gauss, noise model
|
||||
tmp16 = *nmean1ptr - tmp16_2;
|
||||
@ -416,7 +440,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
// Second Gauss, noise model
|
||||
tmp16 = *nmean2ptr - tmp16_2;
|
||||
*nmean2ptr = tmp16;
|
||||
nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]);
|
||||
nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+kNumChannels]);
|
||||
}
|
||||
|
||||
// Control that the speech & noise means do not drift too much
|
||||
@ -491,7 +515,7 @@ int WebRtcVad_InitCore(VadInstT* self) {
|
||||
sizeof(self->downsampling_filter_states));
|
||||
|
||||
// Read initial PDF parameters.
|
||||
for (i = 0; i < NUM_TABLE_VALUES; i++) {
|
||||
for (i = 0; i < kTableSize; i++) {
|
||||
self->noise_means[i] = kNoiseDataMeans[i];
|
||||
self->speech_means[i] = kSpeechDataMeans[i];
|
||||
self->noise_stds[i] = kNoiseDataStds[i];
|
||||
@ -499,7 +523,7 @@ int WebRtcVad_InitCore(VadInstT* self) {
|
||||
}
|
||||
|
||||
// Initialize Index and Minimum value vectors.
|
||||
for (i = 0; i < 16 * NUM_CHANNELS; i++) {
|
||||
for (i = 0; i < 16 * kNumChannels; i++) {
|
||||
self->low_value_vector[i] = 10000;
|
||||
self->index_vector[i] = 0;
|
||||
}
|
||||
@ -512,7 +536,7 @@ int WebRtcVad_InitCore(VadInstT* self) {
|
||||
memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
|
||||
|
||||
// Initialize mean value memory, for WebRtcVad_FindMinimum().
|
||||
for (i = 0; i < NUM_CHANNELS; i++) {
|
||||
for (i = 0; i < kNumChannels; i++) {
|
||||
self->mean_value[i] = 1600;
|
||||
}
|
||||
|
||||
@ -527,83 +551,60 @@ int WebRtcVad_InitCore(VadInstT* self) {
|
||||
}
|
||||
|
||||
// Set aggressiveness mode
|
||||
int WebRtcVad_set_mode_core(VadInstT *inst, int mode)
|
||||
{
|
||||
int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
|
||||
int return_value = 0;
|
||||
|
||||
if (mode == 0)
|
||||
{
|
||||
// Quality mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
|
||||
switch (mode) {
|
||||
case 0:
|
||||
// Quality mode.
|
||||
memcpy(self->over_hang_max_1, kOverHangMax1Q,
|
||||
sizeof(self->over_hang_max_1));
|
||||
memcpy(self->over_hang_max_2, kOverHangMax2Q,
|
||||
sizeof(self->over_hang_max_2));
|
||||
memcpy(self->individual, kLocalThresholdQ,
|
||||
sizeof(self->individual));
|
||||
memcpy(self->total, kGlobalThresholdQ,
|
||||
sizeof(self->total));
|
||||
break;
|
||||
case 1:
|
||||
// Low bitrate mode.
|
||||
memcpy(self->over_hang_max_1, kOverHangMax1LBR,
|
||||
sizeof(self->over_hang_max_1));
|
||||
memcpy(self->over_hang_max_2, kOverHangMax2LBR,
|
||||
sizeof(self->over_hang_max_2));
|
||||
memcpy(self->individual, kLocalThresholdLBR,
|
||||
sizeof(self->individual));
|
||||
memcpy(self->total, kGlobalThresholdLBR,
|
||||
sizeof(self->total));
|
||||
break;
|
||||
case 2:
|
||||
// Aggressive mode.
|
||||
memcpy(self->over_hang_max_1, kOverHangMax1AGG,
|
||||
sizeof(self->over_hang_max_1));
|
||||
memcpy(self->over_hang_max_2, kOverHangMax2AGG,
|
||||
sizeof(self->over_hang_max_2));
|
||||
memcpy(self->individual, kLocalThresholdAGG,
|
||||
sizeof(self->individual));
|
||||
memcpy(self->total, kGlobalThresholdAGG,
|
||||
sizeof(self->total));
|
||||
break;
|
||||
case 3:
|
||||
// Very aggressive mode.
|
||||
memcpy(self->over_hang_max_1, kOverHangMax1VAG,
|
||||
sizeof(self->over_hang_max_1));
|
||||
memcpy(self->over_hang_max_2, kOverHangMax2VAG,
|
||||
sizeof(self->over_hang_max_2));
|
||||
memcpy(self->individual, kLocalThresholdVAG,
|
||||
sizeof(self->individual));
|
||||
memcpy(self->total, kGlobalThresholdVAG,
|
||||
sizeof(self->total));
|
||||
break;
|
||||
default:
|
||||
return_value = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_Q;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_Q;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_Q;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_Q;
|
||||
inst->total[1] = TOTAL_20MS_Q;
|
||||
inst->total[2] = TOTAL_30MS_Q;
|
||||
} else if (mode == 1)
|
||||
{
|
||||
// Low bitrate mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_LBR;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_LBR;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_LBR;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_LBR;
|
||||
inst->total[1] = TOTAL_20MS_LBR;
|
||||
inst->total[2] = TOTAL_30MS_LBR;
|
||||
} else if (mode == 2)
|
||||
{
|
||||
// Aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_AGG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_AGG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_AGG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_AGG;
|
||||
inst->total[1] = TOTAL_20MS_AGG;
|
||||
inst->total[2] = TOTAL_30MS_AGG;
|
||||
} else if (mode == 3)
|
||||
{
|
||||
// Very aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_VAG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_VAG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_VAG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_VAG;
|
||||
inst->total[1] = TOTAL_20MS_VAG;
|
||||
inst->total[2] = TOTAL_30MS_VAG;
|
||||
} else
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
return return_value;
|
||||
}
|
||||
|
||||
// Calculate VAD decision by first extracting feature values and then calculate
|
||||
@ -650,7 +651,7 @@ WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame
|
||||
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
|
||||
int frame_length)
|
||||
{
|
||||
WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power;
|
||||
WebRtc_Word16 feature_vector[kNumChannels], total_power;
|
||||
|
||||
// Get power in the bands
|
||||
total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
|
||||
|
@ -17,26 +17,30 @@
|
||||
#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||
|
||||
#include "typedefs.h"
|
||||
#include "vad_defines.h"
|
||||
|
||||
enum { kNumChannels = 6 }; // Number of frequency bands (named channels).
|
||||
enum { kNumGaussians = 2 }; // Number of Gaussians per channel in the GMM.
|
||||
enum { kTableSize = kNumChannels * kNumGaussians };  // Entries per GMM parameter table: one per channel and Gaussian.
|
||||
enum { kMinEnergy = 10 }; // Minimum energy required to trigger audio signal.
|
||||
|
||||
typedef struct VadInstT_
|
||||
{
|
||||
|
||||
WebRtc_Word16 vad;
|
||||
WebRtc_Word32 downsampling_filter_states[4];
|
||||
WebRtc_Word16 noise_means[NUM_TABLE_VALUES];
|
||||
WebRtc_Word16 speech_means[NUM_TABLE_VALUES];
|
||||
WebRtc_Word16 noise_stds[NUM_TABLE_VALUES];
|
||||
WebRtc_Word16 speech_stds[NUM_TABLE_VALUES];
|
||||
WebRtc_Word16 noise_means[kTableSize];
|
||||
WebRtc_Word16 speech_means[kTableSize];
|
||||
WebRtc_Word16 noise_stds[kTableSize];
|
||||
WebRtc_Word16 speech_stds[kTableSize];
|
||||
// TODO(bjornv): Change to |frame_count|.
|
||||
WebRtc_Word32 frame_counter;
|
||||
WebRtc_Word16 over_hang; // Over Hang
|
||||
WebRtc_Word16 num_of_speech;
|
||||
// TODO(bjornv): Change to |age_vector|.
|
||||
WebRtc_Word16 index_vector[16 * NUM_CHANNELS];
|
||||
WebRtc_Word16 low_value_vector[16 * NUM_CHANNELS];
|
||||
WebRtc_Word16 index_vector[16 * kNumChannels];
|
||||
WebRtc_Word16 low_value_vector[16 * kNumChannels];
|
||||
// TODO(bjornv): Change to |median|.
|
||||
WebRtc_Word16 mean_value[NUM_CHANNELS];
|
||||
WebRtc_Word16 mean_value[kNumChannels];
|
||||
WebRtc_Word16 upper_state[5];
|
||||
WebRtc_Word16 lower_state[5];
|
||||
WebRtc_Word16 hp_filter_state[4];
|
||||
@ -75,7 +79,7 @@ int WebRtcVad_InitCore(VadInstT* self);
|
||||
* -1 - Error
|
||||
*/
|
||||
|
||||
int WebRtcVad_set_mode_core(VadInstT* inst, int mode);
|
||||
int WebRtcVad_set_mode_core(VadInstT* self, int mode);
|
||||
|
||||
/****************************************************************************
|
||||
* WebRtcVad_CalcVad32khz(...)
|
||||
|
@ -1,93 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* This header file includes the macros used in VAD.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_VAD_DEFINES_H_
|
||||
#define WEBRTC_VAD_DEFINES_H_
|
||||
|
||||
#define NUM_CHANNELS 6 // Six frequency bands
|
||||
#define NUM_MODELS 2 // Number of Gaussian models
|
||||
#define NUM_TABLE_VALUES NUM_CHANNELS * NUM_MODELS
|
||||
|
||||
#define MIN_ENERGY 10
|
||||
#define ALPHA1 6553 // 0.2 in Q15
|
||||
#define ALPHA2 32439 // 0.99 in Q15
|
||||
// Mode 0, Quality thresholds - Different thresholds for the different frame lengths
|
||||
#define INDIVIDUAL_10MS_Q 24
|
||||
#define INDIVIDUAL_20MS_Q 21 // (log10(2)*66)<<2 ~=16
|
||||
#define INDIVIDUAL_30MS_Q 24
|
||||
|
||||
#define TOTAL_10MS_Q 57
|
||||
#define TOTAL_20MS_Q 48
|
||||
#define TOTAL_30MS_Q 57
|
||||
|
||||
#define OHMAX1_10MS_Q 8 // Max Overhang 1
|
||||
#define OHMAX2_10MS_Q 14 // Max Overhang 2
|
||||
#define OHMAX1_20MS_Q 4 // Max Overhang 1
|
||||
#define OHMAX2_20MS_Q 7 // Max Overhang 2
|
||||
#define OHMAX1_30MS_Q 3
|
||||
#define OHMAX2_30MS_Q 5
|
||||
|
||||
// Mode 1, Low bitrate thresholds - Different thresholds for the different frame lengths
|
||||
#define INDIVIDUAL_10MS_LBR 37
|
||||
#define INDIVIDUAL_20MS_LBR 32
|
||||
#define INDIVIDUAL_30MS_LBR 37
|
||||
|
||||
#define TOTAL_10MS_LBR 100
|
||||
#define TOTAL_20MS_LBR 80
|
||||
#define TOTAL_30MS_LBR 100
|
||||
|
||||
#define OHMAX1_10MS_LBR 8 // Max Overhang 1
|
||||
#define OHMAX2_10MS_LBR 14 // Max Overhang 2
|
||||
#define OHMAX1_20MS_LBR 4
|
||||
#define OHMAX2_20MS_LBR 7
|
||||
|
||||
#define OHMAX1_30MS_LBR 3
|
||||
#define OHMAX2_30MS_LBR 5
|
||||
|
||||
// Mode 2, Aggressive thresholds - Different thresholds for the different frame lengths
|
||||
#define INDIVIDUAL_10MS_AGG 82
|
||||
#define INDIVIDUAL_20MS_AGG 78
|
||||
#define INDIVIDUAL_30MS_AGG 82
|
||||
|
||||
#define TOTAL_10MS_AGG 285 //580
|
||||
#define TOTAL_20MS_AGG 260
|
||||
#define TOTAL_30MS_AGG 285
|
||||
|
||||
#define OHMAX1_10MS_AGG 6 // Max Overhang 1
|
||||
#define OHMAX2_10MS_AGG 9 // Max Overhang 2
|
||||
#define OHMAX1_20MS_AGG 3
|
||||
#define OHMAX2_20MS_AGG 5
|
||||
|
||||
#define OHMAX1_30MS_AGG 2
|
||||
#define OHMAX2_30MS_AGG 3
|
||||
|
||||
// Mode 3, Very aggressive thresholds - Different thresholds for the different frame lengths
|
||||
#define INDIVIDUAL_10MS_VAG 94
|
||||
#define INDIVIDUAL_20MS_VAG 94
|
||||
#define INDIVIDUAL_30MS_VAG 94
|
||||
|
||||
#define TOTAL_10MS_VAG 1100 //1700
|
||||
#define TOTAL_20MS_VAG 1050
|
||||
#define TOTAL_30MS_VAG 1100
|
||||
|
||||
#define OHMAX1_10MS_VAG 6 // Max Overhang 1
|
||||
#define OHMAX2_10MS_VAG 9 // Max Overhang 2
|
||||
#define OHMAX1_20MS_VAG 3
|
||||
#define OHMAX2_20MS_VAG 5
|
||||
|
||||
#define OHMAX1_30MS_VAG 2
|
||||
#define OHMAX2_30MS_VAG 3
|
||||
|
||||
#endif // WEBRTC_VAD_DEFINES_H_
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
@ -14,7 +14,6 @@
|
||||
|
||||
#include "signal_processing_library.h"
|
||||
#include "typedefs.h"
|
||||
#include "vad_defines.h"
|
||||
|
||||
// Constants used in LogOfEnergy().
|
||||
static const int16_t kLogConst = 24660; // 160*log10(2) in Q9.
|
||||
@ -151,7 +150,7 @@ static void SplitFilter(const int16_t* data_in, int data_length,
|
||||
// - total_energy [i/o] : An external energy updated with the energy of
|
||||
// |data_in|.
|
||||
// NOTE: |total_energy| is only updated if
|
||||
// |total_energy| <= MIN_ENERGY.
|
||||
// |total_energy| <= |kMinEnergy|.
|
||||
// - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4.
|
||||
static void LogOfEnergy(const int16_t* data_in, int data_length,
|
||||
int16_t offset, int16_t* total_energy,
|
||||
@ -228,18 +227,18 @@ static void LogOfEnergy(const int16_t* data_in, int data_length,
|
||||
*log_energy += offset;
|
||||
|
||||
// Update the approximate |total_energy| with the energy of |data_in|, if
|
||||
// |total_energy| has not exceeded MIN_ENERGY. |total_energy| is used as an
|
||||
// |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
|
||||
// energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
|
||||
if (*total_energy <= MIN_ENERGY) {
|
||||
if (*total_energy <= kMinEnergy) {
|
||||
if (tot_rshifts >= 0) {
|
||||
// We know by construction that the |energy| > MIN_ENERGY in Q0, so add an
|
||||
// arbitrary value such that |total_energy| exceeds MIN_ENERGY.
|
||||
*total_energy += MIN_ENERGY + 1;
|
||||
// We know by construction that the |energy| > |kMinEnergy| in Q0, so add
|
||||
// an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
|
||||
*total_energy += kMinEnergy + 1;
|
||||
} else {
|
||||
// By construction |energy| is represented by 15 bits, hence any number of
|
||||
// right shifted |energy| will fit in an int16_t. In addition, adding the
|
||||
// value to |total_energy| is wrap around safe as long as
|
||||
// MIN_ENERGY < 8192.
|
||||
// |kMinEnergy| < 8192.
|
||||
*total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
|
||||
}
|
||||
}
|
||||
@ -266,7 +265,7 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
|
||||
|
||||
assert(data_length >= 0);
|
||||
assert(data_length <= 240);
|
||||
assert(4 < NUM_CHANNELS - 1); // Checking maximum |frequency_band|.
|
||||
assert(4 < kNumChannels - 1); // Checking maximum |frequency_band|.
|
||||
|
||||
// Split at 2000 Hz and downsample.
|
||||
SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
@ -19,7 +19,7 @@
|
||||
#include "vad_core.h"
|
||||
|
||||
// Takes |data_length| samples of |data_in| and calculates the logarithm of the
|
||||
// energy of each of the |NUM_CHANNELS| = 6 frequency bands used by the VAD:
|
||||
// energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
|
||||
// 80 Hz - 250 Hz
|
||||
// 250 Hz - 500 Hz
|
||||
// 500 Hz - 1000 Hz
|
||||
@ -30,7 +30,7 @@
|
||||
// The values are given in Q4 and written to |features|. Further, an approximate
|
||||
// overall energy is returned. The return value is used in
|
||||
// WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above
|
||||
// the threshold MIN_ENERGY.
|
||||
// the threshold |kMinEnergy|.
|
||||
//
|
||||
// - self [i/o] : State information of the VAD.
|
||||
// - data_in [i] : Input audio data, for feature extraction.
|
||||
|
@ -16,7 +16,6 @@
|
||||
|
||||
extern "C" {
|
||||
#include "vad_core.h"
|
||||
#include "vad_defines.h"
|
||||
#include "vad_filterbank.h"
|
||||
}
|
||||
|
||||
@ -27,14 +26,14 @@ enum { kNumValidFrameLengths = 3 };
|
||||
TEST_F(VadTest, vad_filterbank) {
|
||||
VadInstT* self = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
|
||||
static const int16_t kReference[kNumValidFrameLengths] = { 48, 11, 11 };
|
||||
static const int16_t kFeatures[kNumValidFrameLengths * NUM_CHANNELS] = {
|
||||
static const int16_t kFeatures[kNumValidFrameLengths * kNumChannels] = {
|
||||
1213, 759, 587, 462, 434, 272,
|
||||
1479, 1385, 1291, 1200, 1103, 1099,
|
||||
1732, 1692, 1681, 1629, 1436, 1436
|
||||
};
|
||||
static const int16_t kOffsetVector[NUM_CHANNELS] = {
|
||||
static const int16_t kOffsetVector[kNumChannels] = {
|
||||
368, 368, 272, 176, 176, 176 };
|
||||
int16_t features[NUM_CHANNELS];
|
||||
int16_t features[kNumChannels];
|
||||
|
||||
// Construct a speech signal that will trigger the VAD in all modes. It is
|
||||
// known that (i * i) will wrap around, but that doesn't matter in this case.
|
||||
@ -50,8 +49,8 @@ TEST_F(VadTest, vad_filterbank) {
|
||||
EXPECT_EQ(kReference[frame_length_index],
|
||||
WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
|
||||
features));
|
||||
for (int k = 0; k < NUM_CHANNELS; ++k) {
|
||||
EXPECT_EQ(kFeatures[k + frame_length_index * NUM_CHANNELS],
|
||||
for (int k = 0; k < kNumChannels; ++k) {
|
||||
EXPECT_EQ(kFeatures[k + frame_length_index * kNumChannels],
|
||||
features[k]);
|
||||
}
|
||||
frame_length_index++;
|
||||
@ -66,7 +65,7 @@ TEST_F(VadTest, vad_filterbank) {
|
||||
if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) {
|
||||
EXPECT_EQ(0, WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
|
||||
features));
|
||||
for (int k = 0; k < NUM_CHANNELS; ++k) {
|
||||
for (int k = 0; k < kNumChannels; ++k) {
|
||||
EXPECT_EQ(kOffsetVector[k], features[k]);
|
||||
}
|
||||
}
|
||||
@ -82,7 +81,7 @@ TEST_F(VadTest, vad_filterbank) {
|
||||
ASSERT_EQ(0, WebRtcVad_InitCore(self));
|
||||
EXPECT_EQ(0, WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
|
||||
features));
|
||||
for (int k = 0; k < NUM_CHANNELS; ++k) {
|
||||
for (int k = 0; k < kNumChannels; ++k) {
|
||||
EXPECT_EQ(kOffsetVector[k], features[k]);
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
@ -14,11 +14,13 @@
|
||||
|
||||
#include "signal_processing_library.h"
|
||||
#include "typedefs.h"
|
||||
#include "vad_defines.h"
|
||||
#include "vad_core.h"
|
||||
|
||||
// Allpass filter coefficients, upper and lower, in Q13.
|
||||
// Upper: 0.64, Lower: 0.17.
|
||||
static const int16_t kAllPassCoefsQ13[2] = { 5243, 1392 }; // Q13
|
||||
static const int16_t kAllPassCoefsQ13[2] = { 5243, 1392 }; // Q13.
|
||||
static const int16_t kSmoothingDown = 6553; // 0.2 in Q15.
|
||||
static const int16_t kSmoothingUp = 32439; // 0.99 in Q15.
|
||||
|
||||
// TODO(bjornv): Move this function to vad_filterbank.c.
|
||||
// Downsampling filter based on splitting filter and allpass functions.
|
||||
@ -72,7 +74,7 @@ int16_t WebRtcVad_FindMinimum(VadInstT* self,
|
||||
int16_t* value_ptr = &self->low_value_vector[offset];
|
||||
int16_t *p1, *p2, *p3;
|
||||
|
||||
assert(channel < NUM_CHANNELS);
|
||||
assert(channel < kNumChannels);
|
||||
|
||||
// Each value in |low_value_vector| is getting 1 loop older.
|
||||
// Update age of each value in |age_ptr|, and remove old values.
|
||||
@ -167,9 +169,9 @@ int16_t WebRtcVad_FindMinimum(VadInstT* self,
|
||||
// Smooth the median value.
|
||||
if (self->frame_counter > 0) {
|
||||
if (current_median < self->mean_value[channel]) {
|
||||
alpha = (int16_t) ALPHA1; // 0.2 in Q15.
|
||||
alpha = kSmoothingDown; // 0.2 in Q15.
|
||||
} else {
|
||||
alpha = (int16_t) ALPHA2; // 0.99 in Q15.
|
||||
alpha = kSmoothingUp; // 0.99 in Q15.
|
||||
}
|
||||
}
|
||||
tmp32 = WEBRTC_SPL_MUL_16_16(alpha + 1, self->mean_value[channel]);
|
||||
|
@ -16,7 +16,6 @@
|
||||
|
||||
extern "C" {
|
||||
#include "vad_core.h"
|
||||
#include "vad_defines.h"
|
||||
#include "vad_sp.h"
|
||||
}
|
||||
|
||||
@ -63,7 +62,7 @@ TEST_F(VadTest, vad_sp) {
|
||||
// ordered.
|
||||
for (int16_t i = 0; i < 16; ++i) {
|
||||
int16_t value = 500 * (i + 1);
|
||||
for (int j = 0; j < NUM_CHANNELS; ++j) {
|
||||
for (int j = 0; j < kNumChannels; ++j) {
|
||||
// Use values both above and below initialized value.
|
||||
EXPECT_EQ(kReferenceMin[i], WebRtcVad_FindMinimum(self, value, j));
|
||||
EXPECT_EQ(kReferenceMin[i + 16], WebRtcVad_FindMinimum(self, 12000, j));
|
||||
|
Loading…
x
Reference in New Issue
Block a user