VAD refactoring: Removed macro file.

In this CL we've replaced the VAD macros with static const variables or enums, and removed the macro header file (vad_defines.h).

Priority=low

BUG=
TEST=vad_unittest

Review URL: https://webrtc-codereview.appspot.com/453004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1913 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
bjornv@webrtc.org 2012-03-20 12:53:06 +00:00
parent ac9fd8af09
commit a496b03c78
8 changed files with 145 additions and 234 deletions

View File

@ -12,7 +12,6 @@
#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_defines.h"
#include "vad_filterbank.h"
#include "vad_gmm.h"
#include "vad_sp.h"
@ -66,6 +65,31 @@ static const int16_t kMinStd = 384;
static const short kDefaultMode = 0;
static const int kInitCheck = 42;
// Constants used in WebRtcVad_set_mode_core().
//
// One table per aggressiveness mode and parameter. Each table holds the values
// for the different frame lengths, indexed as [0] = 10 ms, [1] = 20 ms and
// [2] = 30 ms.
//  - kOverHangMax1*    : copied to |over_hang_max_1| (overhang, short speech
//                        burst).
//  - kOverHangMax2*    : copied to |over_hang_max_2| (overhang, long speech
//                        burst).
//  - kLocalThreshold*  : copied to |individual|.
//  - kGlobalThreshold* : copied to |total|.
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models. A hypothesis-test is performed to decide which type
// of signal is most probable.
@ -90,13 +114,13 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
WebRtc_Word16 delt, ndelt;
WebRtc_Word16 maxspe, maxmu;
WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES];
WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES];
WebRtc_Word16 deltaN[kTableSize], deltaS[kTableSize];
WebRtc_Word16 ngprvec[kTableSize], sgprvec[kTableSize];
WebRtc_Word32 h0test, h1test;
WebRtc_Word32 tmp32_1, tmp32_2;
WebRtc_Word32 dotVal;
WebRtc_Word32 nmid, smid;
WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS];
WebRtc_Word32 probn[kNumGaussians], probs[kNumGaussians];
WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr,
*sstd1ptr, *sstd2ptr;
WebRtc_Word16 overhead1, overhead2, individualTest, totalTest;
@ -125,22 +149,22 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
totalTest = inst->total[2];
}
if (total_power > MIN_ENERGY)
if (total_power > kMinEnergy)
{ // If signal present at all
// Set pointers to the gaussian parameters
nmean1ptr = &inst->noise_means[0];
nmean2ptr = &inst->noise_means[NUM_CHANNELS];
nmean2ptr = &inst->noise_means[kNumChannels];
smean1ptr = &inst->speech_means[0];
smean2ptr = &inst->speech_means[NUM_CHANNELS];
smean2ptr = &inst->speech_means[kNumChannels];
nstd1ptr = &inst->noise_stds[0];
nstd2ptr = &inst->noise_stds[NUM_CHANNELS];
nstd2ptr = &inst->noise_stds[kNumChannels];
sstd1ptr = &inst->speech_stds[0];
sstd2ptr = &inst->speech_stds[NUM_CHANNELS];
sstd2ptr = &inst->speech_stds[kNumChannels];
vadflag = 0;
dotVal = 0;
for (n = 0; n < NUM_CHANNELS; n++)
for (n = 0; n < kNumChannels; n++)
{ // For all channels
pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
@ -152,7 +176,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1);
tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
&deltaN[pos + 1]);
probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1);
probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + kNumChannels] * tmp32_1);
h0test = probn[0] + probn[1]; // Q27
h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15
@ -162,7 +186,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1);
tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
&deltaS[pos + 1]);
probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1);
probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + kNumChannels] * tmp32_1);
h1test = probs[0] + probs[1]; // Q27
h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15
@ -235,7 +259,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
maxspe = 12800;
// Update the model's parameters
for (n = 0; n < NUM_CHANNELS; n++)
for (n = 0; n < kNumChannels; n++)
{
pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
@ -245,19 +269,19 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
// Compute the "global" mean, that is the sum of the two means weighted
nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS],
*(nmean1ptr+NUM_CHANNELS));
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+kNumChannels],
*(nmean1ptr+kNumChannels));
tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8
for (k = 0; k < NUM_MODELS; k++)
for (k = 0; k < kNumGaussians; k++)
{
nr = pos + k;
nmean2ptr = nmean1ptr + k * NUM_CHANNELS;
smean2ptr = smean1ptr + k * NUM_CHANNELS;
nstd2ptr = nstd1ptr + k * NUM_CHANNELS;
sstd2ptr = sstd1ptr + k * NUM_CHANNELS;
nmean2ptr = nmean1ptr + k * kNumChannels;
smean2ptr = smean1ptr + k * kNumChannels;
nstd2ptr = nstd1ptr + k * kNumChannels;
sstd2ptr = sstd1ptr + k * kNumChannels;
nmk = *nmean2ptr;
smk = *smean2ptr;
nsk = *nstd2ptr;
@ -376,11 +400,11 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
// Separate models if they are too close - nmid in Q14
nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr);
nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+kNumChannels], *nmean2ptr);
// smid in Q14
smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr);
smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+kNumChannels], *smean2ptr);
// diff = "global" speech mean - "global" noise mean
diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9);
@ -405,7 +429,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
// Second Gauss, speech model
tmp16 = tmp16_1 + *smean2ptr;
*smean2ptr = tmp16;
smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]);
smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+kNumChannels]);
// First Gauss, noise model
tmp16 = *nmean1ptr - tmp16_2;
@ -416,7 +440,7 @@ static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
// Second Gauss, noise model
tmp16 = *nmean2ptr - tmp16_2;
*nmean2ptr = tmp16;
nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]);
nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+kNumChannels]);
}
// Control that the speech & noise means do not drift too much
@ -491,7 +515,7 @@ int WebRtcVad_InitCore(VadInstT* self) {
sizeof(self->downsampling_filter_states));
// Read initial PDF parameters.
for (i = 0; i < NUM_TABLE_VALUES; i++) {
for (i = 0; i < kTableSize; i++) {
self->noise_means[i] = kNoiseDataMeans[i];
self->speech_means[i] = kSpeechDataMeans[i];
self->noise_stds[i] = kNoiseDataStds[i];
@ -499,7 +523,7 @@ int WebRtcVad_InitCore(VadInstT* self) {
}
// Initialize Index and Minimum value vectors.
for (i = 0; i < 16 * NUM_CHANNELS; i++) {
for (i = 0; i < 16 * kNumChannels; i++) {
self->low_value_vector[i] = 10000;
self->index_vector[i] = 0;
}
@ -512,7 +536,7 @@ int WebRtcVad_InitCore(VadInstT* self) {
memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
// Initialize mean value memory, for WebRtcVad_FindMinimum().
for (i = 0; i < NUM_CHANNELS; i++) {
for (i = 0; i < kNumChannels; i++) {
self->mean_value[i] = 1600;
}
@ -527,83 +551,60 @@ int WebRtcVad_InitCore(VadInstT* self) {
}
// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT *inst, int mode)
{
int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
int return_value = 0;
if (mode == 0)
{
// Quality mode
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
switch (mode) {
case 0:
// Quality mode.
memcpy(self->over_hang_max_1, kOverHangMax1Q,
sizeof(self->over_hang_max_1));
memcpy(self->over_hang_max_2, kOverHangMax2Q,
sizeof(self->over_hang_max_2));
memcpy(self->individual, kLocalThresholdQ,
sizeof(self->individual));
memcpy(self->total, kGlobalThresholdQ,
sizeof(self->total));
break;
case 1:
// Low bitrate mode.
memcpy(self->over_hang_max_1, kOverHangMax1LBR,
sizeof(self->over_hang_max_1));
memcpy(self->over_hang_max_2, kOverHangMax2LBR,
sizeof(self->over_hang_max_2));
memcpy(self->individual, kLocalThresholdLBR,
sizeof(self->individual));
memcpy(self->total, kGlobalThresholdLBR,
sizeof(self->total));
break;
case 2:
// Aggressive mode.
memcpy(self->over_hang_max_1, kOverHangMax1AGG,
sizeof(self->over_hang_max_1));
memcpy(self->over_hang_max_2, kOverHangMax2AGG,
sizeof(self->over_hang_max_2));
memcpy(self->individual, kLocalThresholdAGG,
sizeof(self->individual));
memcpy(self->total, kGlobalThresholdAGG,
sizeof(self->total));
break;
case 3:
// Very aggressive mode.
memcpy(self->over_hang_max_1, kOverHangMax1VAG,
sizeof(self->over_hang_max_1));
memcpy(self->over_hang_max_2, kOverHangMax2VAG,
sizeof(self->over_hang_max_2));
memcpy(self->individual, kLocalThresholdVAG,
sizeof(self->individual));
memcpy(self->total, kGlobalThresholdVAG,
sizeof(self->total));
break;
default:
return_value = -1;
break;
}
inst->individual[0] = INDIVIDUAL_10MS_Q;
inst->individual[1] = INDIVIDUAL_20MS_Q;
inst->individual[2] = INDIVIDUAL_30MS_Q;
inst->total[0] = TOTAL_10MS_Q;
inst->total[1] = TOTAL_20MS_Q;
inst->total[2] = TOTAL_30MS_Q;
} else if (mode == 1)
{
// Low bitrate mode
inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst
inst->individual[0] = INDIVIDUAL_10MS_LBR;
inst->individual[1] = INDIVIDUAL_20MS_LBR;
inst->individual[2] = INDIVIDUAL_30MS_LBR;
inst->total[0] = TOTAL_10MS_LBR;
inst->total[1] = TOTAL_20MS_LBR;
inst->total[2] = TOTAL_30MS_LBR;
} else if (mode == 2)
{
// Aggressive mode
inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst
inst->individual[0] = INDIVIDUAL_10MS_AGG;
inst->individual[1] = INDIVIDUAL_20MS_AGG;
inst->individual[2] = INDIVIDUAL_30MS_AGG;
inst->total[0] = TOTAL_10MS_AGG;
inst->total[1] = TOTAL_20MS_AGG;
inst->total[2] = TOTAL_30MS_AGG;
} else if (mode == 3)
{
// Very aggressive mode
inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst
inst->individual[0] = INDIVIDUAL_10MS_VAG;
inst->individual[1] = INDIVIDUAL_20MS_VAG;
inst->individual[2] = INDIVIDUAL_30MS_VAG;
inst->total[0] = TOTAL_10MS_VAG;
inst->total[1] = TOTAL_20MS_VAG;
inst->total[2] = TOTAL_30MS_VAG;
} else
{
return -1;
}
return 0;
return return_value;
}
// Calculate VAD decision by first extracting feature values and then calculate
@ -650,7 +651,7 @@ WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
int frame_length)
{
WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power;
WebRtc_Word16 feature_vector[kNumChannels], total_power;
// Get power in the bands
total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,

View File

@ -17,26 +17,30 @@
#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
#include "typedefs.h"
#include "vad_defines.h"
// Compile-time sizes and limits used by the VAD.
enum {
  // Number of frequency bands (named channels).
  kNumChannels = 6,
  // Number of Gaussians per channel in the GMM.
  kNumGaussians = 2,
  // Number of entries in each GMM parameter table.
  kTableSize = kNumChannels * kNumGaussians,
  // Minimum energy required to trigger audio signal.
  kMinEnergy = 10
};
typedef struct VadInstT_
{
WebRtc_Word16 vad;
WebRtc_Word32 downsampling_filter_states[4];
WebRtc_Word16 noise_means[NUM_TABLE_VALUES];
WebRtc_Word16 speech_means[NUM_TABLE_VALUES];
WebRtc_Word16 noise_stds[NUM_TABLE_VALUES];
WebRtc_Word16 speech_stds[NUM_TABLE_VALUES];
WebRtc_Word16 noise_means[kTableSize];
WebRtc_Word16 speech_means[kTableSize];
WebRtc_Word16 noise_stds[kTableSize];
WebRtc_Word16 speech_stds[kTableSize];
// TODO(bjornv): Change to |frame_count|.
WebRtc_Word32 frame_counter;
WebRtc_Word16 over_hang; // Over Hang
WebRtc_Word16 num_of_speech;
// TODO(bjornv): Change to |age_vector|.
WebRtc_Word16 index_vector[16 * NUM_CHANNELS];
WebRtc_Word16 low_value_vector[16 * NUM_CHANNELS];
WebRtc_Word16 index_vector[16 * kNumChannels];
WebRtc_Word16 low_value_vector[16 * kNumChannels];
// TODO(bjornv): Change to |median|.
WebRtc_Word16 mean_value[NUM_CHANNELS];
WebRtc_Word16 mean_value[kNumChannels];
WebRtc_Word16 upper_state[5];
WebRtc_Word16 lower_state[5];
WebRtc_Word16 hp_filter_state[4];
@ -75,7 +79,7 @@ int WebRtcVad_InitCore(VadInstT* self);
* -1 - Error
*/
int WebRtcVad_set_mode_core(VadInstT* inst, int mode);
int WebRtcVad_set_mode_core(VadInstT* self, int mode);
/****************************************************************************
* WebRtcVad_CalcVad32khz(...)

View File

@ -1,93 +0,0 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This header file includes the macros used in VAD.
*/
#ifndef WEBRTC_VAD_DEFINES_H_
#define WEBRTC_VAD_DEFINES_H_
#define NUM_CHANNELS 6 // Six frequency bands
#define NUM_MODELS 2 // Number of Gaussian models
// Parenthesized so that the product stays a single operand wherever the macro
// is expanded (e.g. in |x / NUM_TABLE_VALUES|).
#define NUM_TABLE_VALUES (NUM_CHANNELS * NUM_MODELS)
#define MIN_ENERGY 10
#define ALPHA1 6553 // 0.2 in Q15
#define ALPHA2 32439 // 0.99 in Q15
// Mode 0, Quality thresholds - Different thresholds for the different frame lengths
#define INDIVIDUAL_10MS_Q 24
#define INDIVIDUAL_20MS_Q 21 // (log10(2)*66)<<2 ~=16
#define INDIVIDUAL_30MS_Q 24
#define TOTAL_10MS_Q 57
#define TOTAL_20MS_Q 48
#define TOTAL_30MS_Q 57
#define OHMAX1_10MS_Q 8 // Max Overhang 1
#define OHMAX2_10MS_Q 14 // Max Overhang 2
#define OHMAX1_20MS_Q 4 // Max Overhang 1
#define OHMAX2_20MS_Q 7 // Max Overhang 2
#define OHMAX1_30MS_Q 3
#define OHMAX2_30MS_Q 5
// Mode 1, Low bitrate thresholds - Different thresholds for the different frame lengths
#define INDIVIDUAL_10MS_LBR 37
#define INDIVIDUAL_20MS_LBR 32
#define INDIVIDUAL_30MS_LBR 37
#define TOTAL_10MS_LBR 100
#define TOTAL_20MS_LBR 80
#define TOTAL_30MS_LBR 100
#define OHMAX1_10MS_LBR 8 // Max Overhang 1
#define OHMAX2_10MS_LBR 14 // Max Overhang 2
#define OHMAX1_20MS_LBR 4
#define OHMAX2_20MS_LBR 7
#define OHMAX1_30MS_LBR 3
#define OHMAX2_30MS_LBR 5
// Mode 2, Aggressive thresholds - Different thresholds for the different frame lengths
#define INDIVIDUAL_10MS_AGG 82
#define INDIVIDUAL_20MS_AGG 78
#define INDIVIDUAL_30MS_AGG 82
#define TOTAL_10MS_AGG 285 //580
#define TOTAL_20MS_AGG 260
#define TOTAL_30MS_AGG 285
#define OHMAX1_10MS_AGG 6 // Max Overhang 1
#define OHMAX2_10MS_AGG 9 // Max Overhang 2
#define OHMAX1_20MS_AGG 3
#define OHMAX2_20MS_AGG 5
#define OHMAX1_30MS_AGG 2
#define OHMAX2_30MS_AGG 3
// Mode 3, Very aggressive thresholds - Different thresholds for the different frame lengths
#define INDIVIDUAL_10MS_VAG 94
#define INDIVIDUAL_20MS_VAG 94
#define INDIVIDUAL_30MS_VAG 94
#define TOTAL_10MS_VAG 1100 //1700
#define TOTAL_20MS_VAG 1050
#define TOTAL_30MS_VAG 1100
#define OHMAX1_10MS_VAG 6 // Max Overhang 1
#define OHMAX2_10MS_VAG 9 // Max Overhang 2
#define OHMAX1_20MS_VAG 3
#define OHMAX2_20MS_VAG 5
#define OHMAX1_30MS_VAG 2
#define OHMAX2_30MS_VAG 3
#endif // WEBRTC_VAD_DEFINES_H_

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -14,7 +14,6 @@
#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_defines.h"
// Constants used in LogOfEnergy().
static const int16_t kLogConst = 24660; // 160*log10(2) in Q9.
@ -151,7 +150,7 @@ static void SplitFilter(const int16_t* data_in, int data_length,
// - total_energy [i/o] : An external energy updated with the energy of
// |data_in|.
// NOTE: |total_energy| is only updated if
// |total_energy| <= MIN_ENERGY.
// |total_energy| <= |kMinEnergy|.
// - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4.
static void LogOfEnergy(const int16_t* data_in, int data_length,
int16_t offset, int16_t* total_energy,
@ -228,18 +227,18 @@ static void LogOfEnergy(const int16_t* data_in, int data_length,
*log_energy += offset;
// Update the approximate |total_energy| with the energy of |data_in|, if
// |total_energy| has not exceeded MIN_ENERGY. |total_energy| is used as an
// |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
// energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
if (*total_energy <= MIN_ENERGY) {
if (*total_energy <= kMinEnergy) {
if (tot_rshifts >= 0) {
// We know by construction that the |energy| > MIN_ENERGY in Q0, so add an
// arbitrary value such that |total_energy| exceeds MIN_ENERGY.
*total_energy += MIN_ENERGY + 1;
// We know by construction that the |energy| > |kMinEnergy| in Q0, so add
// an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
*total_energy += kMinEnergy + 1;
} else {
// By construction |energy| is represented by 15 bits, hence any number of
// right shifted |energy| will fit in an int16_t. In addition, adding the
// value to |total_energy| is wrap around safe as long as
// MIN_ENERGY < 8192.
// |kMinEnergy| < 8192.
*total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
}
}
@ -266,7 +265,7 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
assert(data_length >= 0);
assert(data_length <= 240);
assert(4 < NUM_CHANNELS - 1); // Checking maximum |frequency_band|.
assert(4 < kNumChannels - 1); // Checking maximum |frequency_band|.
// Split at 2000 Hz and downsample.
SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -19,7 +19,7 @@
#include "vad_core.h"
// Takes |data_length| samples of |data_in| and calculates the logarithm of the
// energy of each of the |NUM_CHANNELS| = 6 frequency bands used by the VAD:
// energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
// 80 Hz - 250 Hz
// 250 Hz - 500 Hz
// 500 Hz - 1000 Hz
@ -30,7 +30,7 @@
// The values are given in Q4 and written to |features|. Further, an approximate
// overall energy is returned. The return value is used in
// WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above
// the threshold MIN_ENERGY.
// the threshold |kMinEnergy|.
//
// - self [i/o] : State information of the VAD.
// - data_in [i] : Input audio data, for feature extraction.

View File

@ -16,7 +16,6 @@
extern "C" {
#include "vad_core.h"
#include "vad_defines.h"
#include "vad_filterbank.h"
}
@ -27,14 +26,14 @@ enum { kNumValidFrameLengths = 3 };
TEST_F(VadTest, vad_filterbank) {
VadInstT* self = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
static const int16_t kReference[kNumValidFrameLengths] = { 48, 11, 11 };
static const int16_t kFeatures[kNumValidFrameLengths * NUM_CHANNELS] = {
static const int16_t kFeatures[kNumValidFrameLengths * kNumChannels] = {
1213, 759, 587, 462, 434, 272,
1479, 1385, 1291, 1200, 1103, 1099,
1732, 1692, 1681, 1629, 1436, 1436
};
static const int16_t kOffsetVector[NUM_CHANNELS] = {
static const int16_t kOffsetVector[kNumChannels] = {
368, 368, 272, 176, 176, 176 };
int16_t features[NUM_CHANNELS];
int16_t features[kNumChannels];
// Construct a speech signal that will trigger the VAD in all modes. It is
// known that (i * i) will wrap around, but that doesn't matter in this case.
@ -50,8 +49,8 @@ TEST_F(VadTest, vad_filterbank) {
EXPECT_EQ(kReference[frame_length_index],
WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
features));
for (int k = 0; k < NUM_CHANNELS; ++k) {
EXPECT_EQ(kFeatures[k + frame_length_index * NUM_CHANNELS],
for (int k = 0; k < kNumChannels; ++k) {
EXPECT_EQ(kFeatures[k + frame_length_index * kNumChannels],
features[k]);
}
frame_length_index++;
@ -66,7 +65,7 @@ TEST_F(VadTest, vad_filterbank) {
if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) {
EXPECT_EQ(0, WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
features));
for (int k = 0; k < NUM_CHANNELS; ++k) {
for (int k = 0; k < kNumChannels; ++k) {
EXPECT_EQ(kOffsetVector[k], features[k]);
}
}
@ -82,7 +81,7 @@ TEST_F(VadTest, vad_filterbank) {
ASSERT_EQ(0, WebRtcVad_InitCore(self));
EXPECT_EQ(0, WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
features));
for (int k = 0; k < NUM_CHANNELS; ++k) {
for (int k = 0; k < kNumChannels; ++k) {
EXPECT_EQ(kOffsetVector[k], features[k]);
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -14,11 +14,13 @@
#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_defines.h"
#include "vad_core.h"
// Allpass filter coefficients, upper and lower, in Q13.
// Upper: 0.64, Lower: 0.17.
static const int16_t kAllPassCoefsQ13[2] = { 5243, 1392 }; // Q13
static const int16_t kAllPassCoefsQ13[2] = { 5243, 1392 }; // Q13.
static const int16_t kSmoothingDown = 6553; // 0.2 in Q15.
static const int16_t kSmoothingUp = 32439; // 0.99 in Q15.
// TODO(bjornv): Move this function to vad_filterbank.c.
// Downsampling filter based on splitting filter and allpass functions.
@ -72,7 +74,7 @@ int16_t WebRtcVad_FindMinimum(VadInstT* self,
int16_t* value_ptr = &self->low_value_vector[offset];
int16_t *p1, *p2, *p3;
assert(channel < NUM_CHANNELS);
assert(channel < kNumChannels);
// Each value in |low_value_vector| is getting 1 loop older.
// Update age of each value in |age_ptr|, and remove old values.
@ -167,9 +169,9 @@ int16_t WebRtcVad_FindMinimum(VadInstT* self,
// Smooth the median value.
if (self->frame_counter > 0) {
if (current_median < self->mean_value[channel]) {
alpha = (int16_t) ALPHA1; // 0.2 in Q15.
alpha = kSmoothingDown; // 0.2 in Q15.
} else {
alpha = (int16_t) ALPHA2; // 0.99 in Q15.
alpha = kSmoothingUp; // 0.99 in Q15.
}
}
tmp32 = WEBRTC_SPL_MUL_16_16(alpha + 1, self->mean_value[channel]);

View File

@ -16,7 +16,6 @@
extern "C" {
#include "vad_core.h"
#include "vad_defines.h"
#include "vad_sp.h"
}
@ -63,7 +62,7 @@ TEST_F(VadTest, vad_sp) {
// ordered.
for (int16_t i = 0; i < 16; ++i) {
int16_t value = 500 * (i + 1);
for (int j = 0; j < NUM_CHANNELS; ++j) {
for (int j = 0; j < kNumChannels; ++j) {
// Use values both above and below initialized value.
EXPECT_EQ(kReferenceMin[i], WebRtcVad_FindMinimum(self, value, j));
EXPECT_EQ(kReferenceMin[i + 16], WebRtcVad_FindMinimum(self, 12000, j));