diff --git a/src/common_audio/vad/vad_filterbank.c b/src/common_audio/vad/vad_filterbank.c index 63eef5b2b..4477c8856 100644 --- a/src/common_audio/vad/vad_filterbank.c +++ b/src/common_audio/vad/vad_filterbank.c @@ -19,26 +19,32 @@ #include "typedefs.h" #include "vad_defines.h" -// Constant 160*log10(2) in Q9 +// Constant 160*log10(2) in Q9. static const int16_t kLogConst = 24660; -// Coefficients used by WebRtcVad_HpOutput, Q14 +// Coefficients used by HighPassFilter, Q14. static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 }; static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 }; -// Allpass filter coefficients, upper and lower, in Q15 +// Allpass filter coefficients, upper and lower, in Q15. // Upper: 0.64, Lower: 0.17 static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 }; -// Adjustment for division with two in WebRtcVad_SplitFilter +// Adjustment for division with two in SplitFilter. static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 }; -void WebRtcVad_HpOutput(int16_t* in_vector, - int in_vector_length, - int16_t* filter_state, - int16_t* out_vector) { +// High pass filtering, with a cut-off frequency at 80 Hz, if the |in_vector| is +// sampled at 500 Hz. +// +// - in_vector [i] : Input audio data sampled at 500 Hz. +// - in_vector_length [i] : Length of input and output data. +// - filter_state [i/o] : State of the filter. +// - out_vector [o] : Output audio data in the frequency interval +// 80 - 250 Hz. +static void HighPassFilter(const int16_t* in_vector, int in_vector_length, + int16_t* filter_state, int16_t* out_vector) { int i; - int16_t* in_ptr = in_vector; + const int16_t* in_ptr = in_vector; int16_t* out_ptr = out_vector; int32_t tmp32 = 0; @@ -70,11 +76,19 @@ void WebRtcVad_HpOutput(int16_t* in_vector, } } -void WebRtcVad_Allpass(int16_t* in_vector, - int16_t filter_coefficients, - int vector_length, - int16_t* filter_state, - int16_t* out_vector) { +// All pass filtering of |in_vector|, used before splitting the signal into two +// frequency bands (low pass vs high pass). +// Note that |in_vector| and |out_vector| can NOT correspond to the same +// address. +// +// - in_vector [i] : Input audio signal given in Q0. +// - vector_length [i] : Length of input and output data. +// - filter_coefficient [i] : Given in Q15. +// - filter_state [i/o] : State of the filter given in Q(-1). +// - out_vector [o] : Output audio signal given in Q(-1). +static void AllPassFilter(const int16_t* in_vector, int vector_length, + int16_t filter_coefficient, int16_t* filter_state, + int16_t* out_vector) { // The filter can only cause overflow (in the w16 output variable) // if more than 4 consecutive input numbers are of maximum value and // has the the same sign as the impulse responses first taps. @@ -87,11 +101,11 @@ void WebRtcVad_Allpass(int16_t* in_vector, int32_t state32 = WEBRTC_SPL_LSHIFT_W32((int32_t) (*filter_state), 16); // Q31 for (i = 0; i < vector_length; i++) { - tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficients, (*in_vector)); + tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficient, (*in_vector)); tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32, 16); *out_vector++ = tmp16; in32 = WEBRTC_SPL_LSHIFT_W32(((int32_t) (*in_vector)), 14); - state32 = in32 - WEBRTC_SPL_MUL_16_16(filter_coefficients, tmp16); + state32 = in32 - WEBRTC_SPL_MUL_16_16(filter_coefficient, tmp16); state32 = WEBRTC_SPL_LSHIFT_W32(state32, 1); in_vector += 2; } @@ -99,23 +113,32 @@ void WebRtcVad_Allpass(int16_t* in_vector, *filter_state = (int16_t) WEBRTC_SPL_RSHIFT_W32(state32, 16); } -void WebRtcVad_SplitFilter(int16_t* in_vector, - int in_vector_length, - int16_t* upper_state, - int16_t* lower_state, - int16_t* out_vector_hp, - int16_t* out_vector_lp) { +// Splits |in_vector| into |out_vector_hp| and |out_vector_lp| corresponding to +// an upper (high pass) part and a lower (low pass) part respectively. +// +// - in_vector [i] : Input audio data to be split into two frequency +// bands. +// - in_vector_length [i] : Length of |in_vector|. +// - upper_state [i/o] : State of the upper filter, given in Q(-1). +// - lower_state [i/o] : State of the lower filter, given in Q(-1). +// - out_vector_hp [o] : Output audio data of the upper half of the +// spectrum. The length is |in_vector_length| / 2. +// - out_vector_lp [o] : Output audio data of the lower half of the +// spectrum. The length is |in_vector_length| / 2. +static void SplitFilter(const int16_t* in_vector, int in_vector_length, + int16_t* upper_state, int16_t* lower_state, + int16_t* out_vector_hp, int16_t* out_vector_lp) { int16_t tmp_out; int i; int half_length = WEBRTC_SPL_RSHIFT_W16(in_vector_length, 1); // All-pass filtering upper branch - WebRtcVad_Allpass(&in_vector[0], kAllPassCoefsQ15[0], half_length, - upper_state, out_vector_hp); + AllPassFilter(&in_vector[0], half_length, kAllPassCoefsQ15[0], upper_state, + out_vector_hp); // All-pass filtering lower branch - WebRtcVad_Allpass(&in_vector[1], kAllPassCoefsQ15[1], half_length, - lower_state, out_vector_lp); + AllPassFilter(&in_vector[1], half_length, kAllPassCoefsQ15[1], lower_state, + out_vector_lp); // Make LP and HP signals for (i = 0; i < half_length; i++) { @@ -125,113 +148,24 @@ void WebRtcVad_SplitFilter(int16_t* in_vector, } } -int16_t WebRtcVad_get_features(VadInstT* inst, - int16_t* in_vector, - int frame_size, - int16_t* out_vector) { - int16_t power = 0; - // We expect |frame_size| to be 80, 160 or 240 samples, which corresponds to - // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will - // have at most 120 samples after the first split and at most 60 samples after - // the second split. - int16_t hp_120[120], lp_120[120]; - int16_t hp_60[60], lp_60[60]; - // Initialize variables for the first SplitFilter(). - int length = frame_size; - int frequency_band = 0; - int16_t* in_ptr = in_vector; - int16_t* hp_out_ptr = hp_120; - int16_t* lp_out_ptr = lp_120; - - // Split at 2000 Hz and downsample - WebRtcVad_SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], - &inst->lower_state[frequency_band], hp_out_ptr, - lp_out_ptr); - - // Split at 3000 Hz and downsample - frequency_band = 1; - in_ptr = hp_120; - hp_out_ptr = hp_60; - lp_out_ptr = lp_60; - length = WEBRTC_SPL_RSHIFT_W16(frame_size, 1); - - WebRtcVad_SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], - &inst->lower_state[frequency_band], hp_out_ptr, - lp_out_ptr); - - // Energy in 3000 Hz - 4000 Hz - length = WEBRTC_SPL_RSHIFT_W16(length, 1); - WebRtcVad_LogOfEnergy(hp_60, length, kOffsetVector[5], &power, - &out_vector[5]); - - // Energy in 2000 Hz - 3000 Hz - WebRtcVad_LogOfEnergy(lp_60, length, kOffsetVector[4], &power, - &out_vector[4]); - - // Split at 1000 Hz and downsample - frequency_band = 2; - in_ptr = lp_120; - hp_out_ptr = hp_60; - lp_out_ptr = lp_60; - length = WEBRTC_SPL_RSHIFT_W16(frame_size, 1); - WebRtcVad_SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], - &inst->lower_state[frequency_band], hp_out_ptr, - lp_out_ptr); - - // Energy in 1000 Hz - 2000 Hz - length = WEBRTC_SPL_RSHIFT_W16(length, 1); - WebRtcVad_LogOfEnergy(hp_60, length, kOffsetVector[3], &power, - &out_vector[3]); - - // Split at 500 Hz - frequency_band = 3; - in_ptr = lp_60; - hp_out_ptr = hp_120; - lp_out_ptr = lp_120; - - WebRtcVad_SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], - &inst->lower_state[frequency_band], hp_out_ptr, - lp_out_ptr); - - // Energy in 500 Hz - 1000 Hz - length = WEBRTC_SPL_RSHIFT_W16(length, 1); - WebRtcVad_LogOfEnergy(hp_120, length, kOffsetVector[2], &power, - &out_vector[2]); - - // Split at 250 Hz - frequency_band = 4; - in_ptr = lp_120; - hp_out_ptr = hp_60; - lp_out_ptr = lp_60; - - WebRtcVad_SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], - &inst->lower_state[frequency_band], hp_out_ptr, - lp_out_ptr); - - // Energy in 250 Hz - 500 Hz - length = WEBRTC_SPL_RSHIFT_W16(length, 1); - WebRtcVad_LogOfEnergy(hp_60, length, kOffsetVector[1], &power, - &out_vector[1]); - - // Remove DC and LFs - WebRtcVad_HpOutput(lp_60, length, inst->hp_filter_state, hp_120); - - // Power in 80 Hz - 250 Hz - WebRtcVad_LogOfEnergy(hp_120, length, kOffsetVector[0], &power, - &out_vector[0]); - - return power; -} - -void WebRtcVad_LogOfEnergy(int16_t* vector, - int vector_length, - int16_t offset, - int16_t* power, - int16_t* log_energy) { +// Calculates the energy in dB of |in_vector|, and also updates an overall +// |power| if necessary. +// +// - in_vector [i] : Input audio data for energy calculation. +// - vector_length [i] : Length of input data. +// - offset [i] : Offset value added to |log_energy|. +// - power [i/o] : Signal power updated with the energy from +// |in_vector|. +// NOTE: |power| is only updated if +// |power| < MIN_ENERGY. +// - log_energy [o] : 10 * log10("energy of |in_vector|") given in Q4. +static void LogOfEnergy(const int16_t* in_vector, int vector_length, + int16_t offset, int16_t* power, int16_t* log_energy) { int shfts = 0, shfts2 = 0; int16_t energy_s16 = 0; int16_t zeros = 0, frac = 0, log2 = 0; - int32_t energy = WebRtcSpl_Energy(vector, vector_length, &shfts); + int32_t energy = WebRtcSpl_Energy((int16_t*) in_vector, vector_length, + &shfts); if (energy > 0) { @@ -276,3 +210,88 @@ void WebRtcVad_LogOfEnergy(int16_t* vector, } } } + +int16_t WebRtcVad_get_features(VadInstT* inst, const int16_t* in_vector, + int frame_size, int16_t* out_vector) { + int16_t power = 0; + // We expect |frame_size| to be 80, 160 or 240 samples, which corresponds to + // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will + // have at most 120 samples after the first split and at most 60 samples after + // the second split. + int16_t hp_120[120], lp_120[120]; + int16_t hp_60[60], lp_60[60]; + // Initialize variables for the first SplitFilter(). + int length = frame_size; + int frequency_band = 0; + const int16_t* in_ptr = in_vector; + int16_t* hp_out_ptr = hp_120; + int16_t* lp_out_ptr = lp_120; + + // Split at 2000 Hz and downsample + SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], + &inst->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Split at 3000 Hz and downsample + frequency_band = 1; + in_ptr = hp_120; + hp_out_ptr = hp_60; + lp_out_ptr = lp_60; + length = WEBRTC_SPL_RSHIFT_W16(frame_size, 1); + + SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], + &inst->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Energy in 3000 Hz - 4000 Hz + length = WEBRTC_SPL_RSHIFT_W16(length, 1); + LogOfEnergy(hp_60, length, kOffsetVector[5], &power, &out_vector[5]); + + // Energy in 2000 Hz - 3000 Hz + LogOfEnergy(lp_60, length, kOffsetVector[4], &power, &out_vector[4]); + + // Split at 1000 Hz and downsample + frequency_band = 2; + in_ptr = lp_120; + hp_out_ptr = hp_60; + lp_out_ptr = lp_60; + length = WEBRTC_SPL_RSHIFT_W16(frame_size, 1); + SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], + &inst->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Energy in 1000 Hz - 2000 Hz + length = WEBRTC_SPL_RSHIFT_W16(length, 1); + LogOfEnergy(hp_60, length, kOffsetVector[3], &power, &out_vector[3]); + + // Split at 500 Hz + frequency_band = 3; + in_ptr = lp_60; + hp_out_ptr = hp_120; + lp_out_ptr = lp_120; + + SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], + &inst->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Energy in 500 Hz - 1000 Hz + length = WEBRTC_SPL_RSHIFT_W16(length, 1); + LogOfEnergy(hp_120, length, kOffsetVector[2], &power, &out_vector[2]); + + // Split at 250 Hz + frequency_band = 4; + in_ptr = lp_120; + hp_out_ptr = hp_60; + lp_out_ptr = lp_60; + + SplitFilter(in_ptr, length, &inst->upper_state[frequency_band], + &inst->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Energy in 250 Hz - 500 Hz + length = WEBRTC_SPL_RSHIFT_W16(length, 1); + LogOfEnergy(hp_60, length, kOffsetVector[1], &power, &out_vector[1]); + + // Remove DC and LFs + HighPassFilter(lp_60, length, inst->hp_filter_state, hp_120); + + // Power in 80 Hz - 250 Hz + LogOfEnergy(hp_120, length, kOffsetVector[0], &power, &out_vector[0]); + + return power; +} diff --git a/src/common_audio/vad/vad_filterbank.h b/src/common_audio/vad/vad_filterbank.h index 1285c47dd..c774b135b 100644 --- a/src/common_audio/vad/vad_filterbank.h +++ b/src/common_audio/vad/vad_filterbank.h @@ -19,89 +19,19 @@ #include "typedefs.h" #include "vad_core.h" -// TODO(bjornv): Move local functions to vad_filterbank.c and make static. -/**************************************************************************** - * WebRtcVad_HpOutput(...) - * - * This function removes DC from the lowest frequency band - * - * Input: - * - in_vector : Samples in the frequency interval 0 - 250 Hz - * - in_vector_length : Length of input and output vector - * - filter_state : Current state of the filter - * - * Output: - * - out_vector : Samples in the frequency interval 80 - 250 Hz - * - filter_state : Updated state of the filter - * - */ -void WebRtcVad_HpOutput(int16_t* in_vector, - int in_vector_length, - int16_t* filter_state, - int16_t* out_vector); - -/**************************************************************************** - * WebRtcVad_Allpass(...) - * - * This function is used when before splitting a speech file into - * different frequency bands - * - * Note! Do NOT let the arrays in_vector and out_vector correspond to the same address. - * - * Input: - * - in_vector : (Q0) - * - filter_coefficients : (Q15) - * - vector_length : Length of input and output vector - * - filter_state : Current state of the filter (Q(-1)) - * - * Output: - * - out_vector : Output speech signal (Q(-1)) - * - filter_state : Updated state of the filter (Q(-1)) - * - */ -void WebRtcVad_Allpass(int16_t* in_vector, - int16_t filter_coefficients, - int vector_length, - int16_t* filter_state, - int16_t* outw16); - -/**************************************************************************** - * WebRtcVad_SplitFilter(...) - * - * This function is used when before splitting a speech file into - * different frequency bands - * - * Input: - * - in_vector : Input signal to be split into two frequency bands. - * - upper_state : Current state of the upper filter - * - lower_state : Current state of the lower filter - * - in_vector_length : Length of input vector - * - * Output: - * - out_vector_hp : Upper half of the spectrum - * - out_vector_lp : Lower half of the spectrum - * - upper_state : Updated state of the upper filter - * - lower_state : Updated state of the lower filter - * - */ -void WebRtcVad_SplitFilter(int16_t* in_vector, - int in_vector_length, - int16_t* upper_state, - int16_t* lower_state, - int16_t* out_vector_hp, - int16_t* out_vector_lp); - +// TODO(bjornv): Rename to CalcFeatures() or similar. Update at the same time +// comments and parameter order. /**************************************************************************** * WebRtcVad_get_features(...) * - * This function is used to get the logarithm of the power of each of the + * This function is used to get the logarithm of the power of each of the * 6 frequency bands used by the VAD: * 80 Hz - 250 Hz * 250 Hz - 500 Hz * 500 Hz - 1000 Hz * 1000 Hz - 2000 Hz * 2000 Hz - 3000 Hz - * 3000 Hz - 4000 Hz + * 3000 Hz - 4000 Hz * * Input: * - inst : Pointer to VAD instance @@ -110,35 +40,13 @@ void WebRtcVad_SplitFilter(int16_t* in_vector, * * Output: * - out_vector : 10*log10(power in each freq. band), Q4 - * + * * Return: total power in the signal (NOTE! This value is not exact since it * is only used in a comparison. */ int16_t WebRtcVad_get_features(VadInstT* inst, - int16_t* in_vector, + const int16_t* in_vector, int frame_size, int16_t* out_vector); -/**************************************************************************** - * WebRtcVad_LogOfEnergy(...) - * - * This function is used to get the logarithm of the power of one frequency band. - * - * Input: - * - vector : Input speech samples for one frequency band - * - offset : Offset value for the current frequency band - * - vector_length : Length of input vector - * - * Output: - * - log_energy : 10*log10(energy); - * - power : Update total power in speech frame. NOTE! This value - * is not exact since it is only used in a comparison. - * - */ -void WebRtcVad_LogOfEnergy(int16_t* vector, - int vector_length, - int16_t offset, - int16_t* power, - int16_t* log_energy); - #endif // WEBRTC_COMMON_AUDIO_VAD_VAD_FILTERBANK_H_