From 40ea5106f6294f4bab791c89d5634bd130dcb96b Mon Sep 17 00:00:00 2001 From: "bjornv@webrtc.org" Date: Thu, 12 Jan 2012 12:47:42 +0000 Subject: [PATCH] Refactoring vad_filterbank Made internal function LogOfEnergy() more efficient. Includes - Name change "vector" -> "data" - Complete refactor of LogOfEnergy() - Removed lint warning Major changes: * Removed unnecessary variables * Reduced number of shifts * Removed one norm calculation TEST=vad_unittests, audioproc_unittest Review URL: http://webrtc-codereview.appspot.com/347004 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1407 4adac7df-926f-26a2-2b94-8c16560cd09d --- src/common_audio/vad/vad_filterbank.c | 235 ++++++++++-------- src/common_audio/vad/vad_filterbank.h | 12 +- .../vad/vad_filterbank_unittest.cc | 23 +- 3 files changed, 154 insertions(+), 116 deletions(-) diff --git a/src/common_audio/vad/vad_filterbank.c b/src/common_audio/vad/vad_filterbank.c index 40cae8927..2f5db448c 100644 --- a/src/common_audio/vad/vad_filterbank.c +++ b/src/common_audio/vad/vad_filterbank.c @@ -16,8 +16,9 @@ #include "typedefs.h" #include "vad_defines.h" -// Constant 160*log10(2) in Q9. -static const int16_t kLogConst = 24660; +// Constants used in LogOfEnergy(). +static const int16_t kLogConst = 24660; // 160*log10(2) in Q9. +static const int16_t kLogEnergyIntPart = 14336; // 14 in Q10 // Coefficients used by HighPassFilter, Q14. static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 }; @@ -30,19 +31,19 @@ static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 }; // Adjustment for division with two in SplitFilter. static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 }; -// High pass filtering, with a cut-off frequency at 80 Hz, if the |in_vector| is +// High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is // sampled at 500 Hz. // -// - in_vector [i] : Input audio data sampled at 500 Hz. -// - in_vector_length [i] : Length of input and output data. -// - filter_state [i/o] : State of the filter. -// - out_vector [o] : Output audio data in the frequency interval -// 80 - 250 Hz. -static void HighPassFilter(const int16_t* in_vector, int in_vector_length, - int16_t* filter_state, int16_t* out_vector) { +// - data_in [i] : Input audio data sampled at 500 Hz. +// - data_length [i] : Length of input and output data. +// - filter_state [i/o] : State of the filter. +// - data_out [o] : Output audio data in the frequency interval +// 80 - 250 Hz. +static void HighPassFilter(const int16_t* data_in, int data_length, + int16_t* filter_state, int16_t* data_out) { int i; - const int16_t* in_ptr = in_vector; - int16_t* out_ptr = out_vector; + const int16_t* in_ptr = data_in; + int16_t* out_ptr = data_out; int32_t tmp32 = 0; @@ -54,7 +55,7 @@ static void HighPassFilter(const int16_t* in_vector, int in_vector_length, // The all-pole section has a max amplification of a single sample of: 1.9931 // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532 - for (i = 0; i < in_vector_length; i++) { + for (i = 0; i < data_length; i++) { // All-zero section (filter coefficients in Q14). tmp32 = WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[0], *in_ptr); tmp32 += WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[1], filter_state[0]); @@ -71,19 +72,18 @@ static void HighPassFilter(const int16_t* in_vector, int in_vector_length, } } -// All pass filtering of |in_vector|, used before splitting the signal into two +// All pass filtering of |data_in|, used before splitting the signal into two // frequency bands (low pass vs high pass). -// Note that |in_vector| and |out_vector| can NOT correspond to the same -// address. +// Note that |data_in| and |data_out| can NOT correspond to the same address. // -// - in_vector [i] : Input audio signal given in Q0. -// - vector_length [i] : Length of input and output data. +// - data_in [i] : Input audio signal given in Q0. +// - data_length [i] : Length of input and output data. // - filter_coefficient [i] : Given in Q15. // - filter_state [i/o] : State of the filter given in Q(-1). -// - out_vector [o] : Output audio signal given in Q(-1). -static void AllPassFilter(const int16_t* in_vector, int vector_length, +// - data_out [o] : Output audio signal given in Q(-1). +static void AllPassFilter(const int16_t* data_in, int data_length, int16_t filter_coefficient, int16_t* filter_state, - int16_t* out_vector) { + int16_t* data_out) { // The filter can only cause overflow (in the w16 output variable) // if more than 4 consecutive input numbers are of maximum value and // has the the same sign as the impulse responses first taps. @@ -95,120 +95,159 @@ static void AllPassFilter(const int16_t* in_vector, int vector_length, int32_t tmp32 = 0; int32_t state32 = ((int32_t) (*filter_state) << 16); // Q15 - for (i = 0; i < vector_length; i++) { - tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficient, *in_vector); + for (i = 0; i < data_length; i++) { + tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficient, *data_in); tmp16 = (int16_t) (tmp32 >> 16); // Q(-1) - *out_vector++ = tmp16; - state32 = (((int32_t) (*in_vector)) << 14); // Q14 + *data_out++ = tmp16; + state32 = (((int32_t) (*data_in)) << 14); // Q14 state32 -= WEBRTC_SPL_MUL_16_16(filter_coefficient, tmp16); // Q14 state32 <<= 1; // Q15. - in_vector += 2; + data_in += 2; } *filter_state = (int16_t) (state32 >> 16); // Q(-1) } -// Splits |in_vector| into |out_vector_hp| and |out_vector_lp| corresponding to +// Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to // an upper (high pass) part and a lower (low pass) part respectively. // -// - in_vector [i] : Input audio data to be split into two frequency -// bands. -// - in_vector_length [i] : Length of |in_vector|. -// - upper_state [i/o] : State of the upper filter, given in Q(-1). -// - lower_state [i/o] : State of the lower filter, given in Q(-1). -// - out_vector_hp [o] : Output audio data of the upper half of the -// spectrum. The length is |in_vector_length| / 2. -// - out_vector_lp [o] : Output audio data of the lower half of the -// spectrum. The length is |in_vector_length| / 2. -static void SplitFilter(const int16_t* in_vector, int in_vector_length, +// - data_in [i] : Input audio data to be split into two frequency bands. +// - data_length [i] : Length of |data_in|. +// - upper_state [i/o] : State of the upper filter, given in Q(-1). +// - lower_state [i/o] : State of the lower filter, given in Q(-1). +// - hp_data_out [o] : Output audio data of the upper half of the spectrum. +// The length is |data_length| / 2. +// - lp_data_out [o] : Output audio data of the lower half of the spectrum. +// The length is |data_length| / 2. +static void SplitFilter(const int16_t* data_in, int data_length, int16_t* upper_state, int16_t* lower_state, - int16_t* out_vector_hp, int16_t* out_vector_lp) { + int16_t* hp_data_out, int16_t* lp_data_out) { int i; - int half_length = in_vector_length >> 1; // Downsampling by 2. + int half_length = data_length >> 1; // Downsampling by 2. int16_t tmp_out; // All-pass filtering upper branch. - AllPassFilter(&in_vector[0], half_length, kAllPassCoefsQ15[0], upper_state, - out_vector_hp); + AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state, + hp_data_out); // All-pass filtering lower branch. - AllPassFilter(&in_vector[1], half_length, kAllPassCoefsQ15[1], lower_state, - out_vector_lp); + AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state, + lp_data_out); // Make LP and HP signals. for (i = 0; i < half_length; i++) { - tmp_out = *out_vector_hp; - *out_vector_hp++ -= *out_vector_lp; - *out_vector_lp++ += tmp_out; + tmp_out = *hp_data_out; + *hp_data_out++ -= *lp_data_out; + *lp_data_out++ += tmp_out; } } -// Calculates the energy in dB of |in_vector|, and also updates an overall -// |power| if necessary. +// Calculates the energy of |data_in| in dB, and also updates an overall +// |total_energy| if necessary. // -// - in_vector [i] : Input audio data for energy calculation. -// - vector_length [i] : Length of input data. -// - offset [i] : Offset value added to |log_energy|. -// - power [i/o] : Signal power updated with the energy from -// |in_vector|. -// NOTE: |power| is only updated if -// |power| < MIN_ENERGY. -// - log_energy [o] : 10 * log10("energy of |in_vector|") given in Q4. -static void LogOfEnergy(const int16_t* in_vector, int vector_length, - int16_t offset, int16_t* power, int16_t* log_energy) { - int shfts = 0, shfts2 = 0; - int16_t energy_s16 = 0; - int16_t zeros = 0, frac = 0, log2 = 0; - int32_t energy = WebRtcSpl_Energy((int16_t*) in_vector, vector_length, - &shfts); +// - data_in [i] : Input audio data for energy calculation. +// - data_length [i] : Length of input data. +// - offset [i] : Offset value added to |log_energy|. +// - total_energy [i/o] : An external energy updated with the energy of +// |data_in|. +// NOTE: |total_energy| is only updated if +// |total_energy| <= MIN_ENERGY. +// - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4. +static void LogOfEnergy(const int16_t* data_in, int data_length, + int16_t offset, int16_t* total_energy, + int16_t* log_energy) { + // |tot_rshifts| accumulates the number of right shifts performed on |energy|. + int tot_rshifts = 0; + // The |energy| will be normalized to 15 bits. We use unsigned integer because + // we eventually will mask out the fractional part. + uint32_t energy = 0; - if (energy > 0) { + assert(data_in != NULL); + assert(data_length > 0); - shfts2 = 16 - WebRtcSpl_NormW32(energy); - shfts += shfts2; - // "shfts" is the total number of right shifts that has been done to - // energy_s16. - energy_s16 = (int16_t) WEBRTC_SPL_SHIFT_W32(energy, -shfts2); + energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length, + &tot_rshifts); - // Find: - // 160*log10(energy_s16*2^shfts) = 160*log10(2)*log2(energy_s16*2^shfts) = - // 160*log10(2)*(log2(energy_s16) + log2(2^shfts)) = - // 160*log10(2)*(log2(energy_s16) + shfts) + if (energy != 0) { + // By construction, normalizing to 15 bits is equivalent with 17 leading + // zeros of an unsigned 32 bit value. + int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy); + // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is + // (14 << 10), which is what we initialize |log2_energy| with. For a more + // detailed derivations, see below. + int16_t log2_energy = kLogEnergyIntPart; - zeros = WebRtcSpl_NormU32(energy_s16); - frac = (int16_t) (((uint32_t) ((int32_t) (energy_s16) << zeros) - & 0x7FFFFFFF) >> 21); - log2 = (int16_t) (((31 - zeros) << 10) + frac); + tot_rshifts += normalizing_rshifts; + // Normalize |energy| to 15 bits. + // |tot_rshifts| is now the total number of right shifts performed on + // |energy| after normalization. This means that |energy| is in + // Q(-tot_rshifts). + if (normalizing_rshifts < 0) { + energy <<= -normalizing_rshifts; + } else { + energy >>= normalizing_rshifts; + } - *log_energy = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(kLogConst, log2, 19) - + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(shfts, kLogConst, 9); + // Calculate the energy of |data_in| in dB, in Q4. + // + // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") = + // 160 * log10(|energy| * 2^|tot_rshifts|) = + // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) = + // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) = + // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) = + // |kLogConst| * (|log2_energy| + |tot_rshifts|) + // + // We know by construction that |energy| is normalized to 15 bits. Hence, + // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15. + // Further, we'd like |log2_energy| in Q10 + // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) = + // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) = + // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~= + // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) = + // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4) + // + // Note that frac_Q15 = (|energy| & 0x00003FFF) + + // Calculate and add the fractional part to |log2_energy|. + log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4); + + // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0. + // Note that we in our derivation above have accounted for an output in Q4. + *log_energy = (int16_t) (WEBRTC_SPL_MUL_16_16_RSFT( + kLogConst, log2_energy, 19) + + WEBRTC_SPL_MUL_16_16_RSFT(tot_rshifts, kLogConst, 9)); if (*log_energy < 0) { *log_energy = 0; } } else { - *log_energy = 0; - shfts = -15; - energy_s16 = 0; + *log_energy = offset; + return; } *log_energy += offset; - // Total power in frame - if (*power <= MIN_ENERGY) { - if (shfts > 0) { - *power += MIN_ENERGY + 1; - } else if (WEBRTC_SPL_SHIFT_W16(energy_s16, shfts) > MIN_ENERGY) { - *power += MIN_ENERGY + 1; + // Update the approximate |total_energy| with the energy of |data_in|, if + // |total_energy| has not exceeded MIN_ENERGY. |total_energy| is used as an + // energy indicator in WebRtcVad_GmmProbability() in vad_core.c. + if (*total_energy <= MIN_ENERGY) { + if (tot_rshifts >= 0) { + // We know by construction that the |energy| > MIN_ENERGY in Q0, so add an + // arbitrary value such that |total_energy| exceeds MIN_ENERGY. + *total_energy += MIN_ENERGY + 1; } else { - *power += WEBRTC_SPL_SHIFT_W16(energy_s16, shfts); + // By construction |energy| is represented by 15 bits, hence any number of + // right shifted |energy| will fit in an int16_t. In addition, adding the + // value to |total_energy| is wrap around safe as long as + // MIN_ENERGY < 8192. + *total_energy += (int16_t) (energy >> -tot_rshifts); // Q0. } } } int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, - int data_length, int16_t* data_out) { - int16_t power = 0; + int data_length, int16_t* features) { + int16_t total_energy = 0; // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will // have at most 120 samples after the first split and at most 60 samples after @@ -244,10 +283,10 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, // Energy in 3000 Hz - 4000 Hz. length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz. - LogOfEnergy(hp_60, length, kOffsetVector[5], &power, &data_out[5]); + LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]); // Energy in 2000 Hz - 3000 Hz. - LogOfEnergy(lp_60, length, kOffsetVector[4], &power, &data_out[4]); + LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]); // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample. frequency_band = 2; @@ -260,7 +299,7 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, // Energy in 1000 Hz - 2000 Hz. length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz. - LogOfEnergy(hp_60, length, kOffsetVector[3], &power, &data_out[3]); + LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]); // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample. frequency_band = 3; @@ -272,7 +311,7 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, // Energy in 500 Hz - 1000 Hz. length >>= 1; // |data_length| / 8 <=> bandwidth = 500 Hz. - LogOfEnergy(hp_120, length, kOffsetVector[2], &power, &data_out[2]); + LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]); // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample. frequency_band = 4; @@ -284,13 +323,13 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, // Energy in 250 Hz - 500 Hz. length >>= 1; // |data_length| / 16 <=> bandwidth = 250 Hz. - LogOfEnergy(hp_60, length, kOffsetVector[1], &power, &data_out[1]); + LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]); // Remove 0 Hz - 80 Hz, by high pass filtering the lower band. HighPassFilter(lp_60, length, self->hp_filter_state, hp_120); // Energy in 80 Hz - 250 Hz. - LogOfEnergy(hp_120, length, kOffsetVector[0], &power, &data_out[0]); + LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]); - return power; + return total_energy; } diff --git a/src/common_audio/vad/vad_filterbank.h b/src/common_audio/vad/vad_filterbank.h index 668036a32..0c5c00c6a 100644 --- a/src/common_audio/vad/vad_filterbank.h +++ b/src/common_audio/vad/vad_filterbank.h @@ -19,7 +19,7 @@ #include "vad_core.h" // Takes |data_length| samples of |data_in| and calculates the logarithm of the -// power of each of the |NUM_CHANNELS| = 6 frequency bands used by the VAD: +// energy of each of the |NUM_CHANNELS| = 6 frequency bands used by the VAD: // 80 Hz - 250 Hz // 250 Hz - 500 Hz // 500 Hz - 1000 Hz @@ -27,18 +27,18 @@ // 2000 Hz - 3000 Hz // 3000 Hz - 4000 Hz // -// The values are given in Q4 and written to |data_out|. Further, an approximate -// overall power is returned. The return value is used in +// The values are given in Q4 and written to |features|. Further, an approximate +// overall energy is returned. The return value is used in // WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above // the threshold MIN_ENERGY. // // - self [i/o] : State information of the VAD. // - data_in [i] : Input audio data, for feature extraction. // - data_length [i] : Audio data size, in number of samples. -// - data_out [o] : 10 * log10(power in each frequency band), Q4. -// - returns : Total power of the signal (NOTE! This value is not +// - features [o] : 10 * log10(energy in each frequency band), Q4. +// - returns : Total energy of the signal (NOTE! This value is not // exact. It is only used in a comparison.) int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, - int data_length, int16_t* data_out); + int data_length, int16_t* features); #endif // WEBRTC_COMMON_AUDIO_VAD_VAD_FILTERBANK_H_ diff --git a/src/common_audio/vad/vad_filterbank_unittest.cc b/src/common_audio/vad/vad_filterbank_unittest.cc index 48c89d0ef..04fddce59 100644 --- a/src/common_audio/vad/vad_filterbank_unittest.cc +++ b/src/common_audio/vad/vad_filterbank_unittest.cc @@ -25,17 +25,16 @@ namespace { enum { kNumValidFrameLengths = 3 }; TEST_F(VadTest, vad_filterbank) { - VadInstT* self = (VadInstT*) malloc(sizeof(VadInstT)); - static const int16_t kReference[kNumValidFrameLengths] = { 15, 11, 11 }; - static const int16_t kReferencePowers[kNumValidFrameLengths * NUM_CHANNELS] = - { + VadInstT* self = reinterpret_cast(malloc(sizeof(VadInstT))); + static const int16_t kReference[kNumValidFrameLengths] = { 48, 11, 11 }; + static const int16_t kFeatures[kNumValidFrameLengths * NUM_CHANNELS] = { 1213, 759, 587, 462, 434, 272, 1479, 1385, 1291, 1200, 1103, 1099, 1732, 1692, 1681, 1629, 1436, 1436 }; static const int16_t kOffsetVector[NUM_CHANNELS] = { 368, 368, 272, 176, 176, 176 }; - int16_t data_out[NUM_CHANNELS]; + int16_t features[NUM_CHANNELS]; // Construct a speech signal that will trigger the VAD in all modes. It is // known that (i * i) will wrap around, but that doesn't matter in this case. @@ -50,10 +49,10 @@ TEST_F(VadTest, vad_filterbank) { if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) { EXPECT_EQ(kReference[frame_length_index], WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j], - data_out)); + features)); for (int k = 0; k < NUM_CHANNELS; ++k) { - EXPECT_EQ(kReferencePowers[k + frame_length_index * NUM_CHANNELS], - data_out[k]); + EXPECT_EQ(kFeatures[k + frame_length_index * NUM_CHANNELS], + features[k]); } frame_length_index++; } @@ -66,9 +65,9 @@ TEST_F(VadTest, vad_filterbank) { for (size_t j = 0; j < kFrameLengthsSize; ++j) { if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) { EXPECT_EQ(0, WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j], - data_out)); + features)); for (int k = 0; k < NUM_CHANNELS; ++k) { - EXPECT_EQ(kOffsetVector[k], data_out[k]); + EXPECT_EQ(kOffsetVector[k], features[k]); } } } @@ -82,9 +81,9 @@ TEST_F(VadTest, vad_filterbank) { if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) { ASSERT_EQ(0, WebRtcVad_InitCore(self, 0)); EXPECT_EQ(0, WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j], - data_out)); + features)); for (int k = 0; k < NUM_CHANNELS; ++k) { - EXPECT_EQ(kOffsetVector[k], data_out[k]); + EXPECT_EQ(kOffsetVector[k], features[k]); } } }