Refactoring vad_filterbank

Made internal function LogOfEnergy() more efficient.
Includes
- Name change "vector" -> "data"
- Complete refactor of LogOfEnergy()
- Removed lint warning

Major changes:
* Removed unnecessary variables
* Reduced number of shifts
* Removed one norm calculation


TEST=vad_unittests, audioproc_unittest
Review URL: http://webrtc-codereview.appspot.com/347004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1407 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
bjornv@webrtc.org 2012-01-12 12:47:42 +00:00
parent b39a3b4a7a
commit 40ea5106f6
3 changed files with 154 additions and 116 deletions

View File

@ -16,8 +16,9 @@
#include "typedefs.h"
#include "vad_defines.h"
// Constant 160*log10(2) in Q9.
static const int16_t kLogConst = 24660;
// Constants used in LogOfEnergy().
static const int16_t kLogConst = 24660; // 160*log10(2) in Q9.
static const int16_t kLogEnergyIntPart = 14336; // 14 in Q10
// Coefficients used by HighPassFilter, Q14.
static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 };
@ -30,19 +31,19 @@ static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 };
// Adjustment for division with two in SplitFilter.
static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 };
// High pass filtering, with a cut-off frequency at 80 Hz, if the |in_vector| is
// High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
// sampled at 500 Hz.
//
// - in_vector [i] : Input audio data sampled at 500 Hz.
// - in_vector_length [i] : Length of input and output data.
// - filter_state [i/o] : State of the filter.
// - out_vector [o] : Output audio data in the frequency interval
// 80 - 250 Hz.
static void HighPassFilter(const int16_t* in_vector, int in_vector_length,
int16_t* filter_state, int16_t* out_vector) {
// - data_in [i] : Input audio data sampled at 500 Hz.
// - data_length [i] : Length of input and output data.
// - filter_state [i/o] : State of the filter.
// - data_out [o] : Output audio data in the frequency interval
// 80 - 250 Hz.
static void HighPassFilter(const int16_t* data_in, int data_length,
int16_t* filter_state, int16_t* data_out) {
int i;
const int16_t* in_ptr = in_vector;
int16_t* out_ptr = out_vector;
const int16_t* in_ptr = data_in;
int16_t* out_ptr = data_out;
int32_t tmp32 = 0;
@ -54,7 +55,7 @@ static void HighPassFilter(const int16_t* in_vector, int in_vector_length,
// The all-pole section has a max amplification of a single sample of: 1.9931
// Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
for (i = 0; i < in_vector_length; i++) {
for (i = 0; i < data_length; i++) {
// All-zero section (filter coefficients in Q14).
tmp32 = WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[0], *in_ptr);
tmp32 += WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[1], filter_state[0]);
@ -71,19 +72,18 @@ static void HighPassFilter(const int16_t* in_vector, int in_vector_length,
}
}
// All pass filtering of |in_vector|, used before splitting the signal into two
// All pass filtering of |data_in|, used before splitting the signal into two
// frequency bands (low pass vs high pass).
// Note that |in_vector| and |out_vector| can NOT correspond to the same
// address.
// Note that |data_in| and |data_out| can NOT correspond to the same address.
//
// - in_vector [i] : Input audio signal given in Q0.
// - vector_length [i] : Length of input and output data.
// - data_in [i] : Input audio signal given in Q0.
// - data_length [i] : Length of input and output data.
// - filter_coefficient [i] : Given in Q15.
// - filter_state [i/o] : State of the filter given in Q(-1).
// - out_vector [o] : Output audio signal given in Q(-1).
static void AllPassFilter(const int16_t* in_vector, int vector_length,
// - data_out [o] : Output audio signal given in Q(-1).
static void AllPassFilter(const int16_t* data_in, int data_length,
int16_t filter_coefficient, int16_t* filter_state,
int16_t* out_vector) {
int16_t* data_out) {
// The filter can only cause overflow (in the w16 output variable)
// if more than 4 consecutive input numbers are of maximum value and
// have the same sign as the impulse response's first taps.
@ -95,120 +95,159 @@ static void AllPassFilter(const int16_t* in_vector, int vector_length,
int32_t tmp32 = 0;
int32_t state32 = ((int32_t) (*filter_state) << 16); // Q15
for (i = 0; i < vector_length; i++) {
tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficient, *in_vector);
for (i = 0; i < data_length; i++) {
tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficient, *data_in);
tmp16 = (int16_t) (tmp32 >> 16); // Q(-1)
*out_vector++ = tmp16;
state32 = (((int32_t) (*in_vector)) << 14); // Q14
*data_out++ = tmp16;
state32 = (((int32_t) (*data_in)) << 14); // Q14
state32 -= WEBRTC_SPL_MUL_16_16(filter_coefficient, tmp16); // Q14
state32 <<= 1; // Q15.
in_vector += 2;
data_in += 2;
}
*filter_state = (int16_t) (state32 >> 16); // Q(-1)
}
// Splits |in_vector| into |out_vector_hp| and |out_vector_lp| corresponding to
// Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
// an upper (high pass) part and a lower (low pass) part respectively.
//
// - in_vector [i] : Input audio data to be split into two frequency
// bands.
// - in_vector_length [i] : Length of |in_vector|.
// - upper_state [i/o] : State of the upper filter, given in Q(-1).
// - lower_state [i/o] : State of the lower filter, given in Q(-1).
// - out_vector_hp [o] : Output audio data of the upper half of the
// spectrum. The length is |in_vector_length| / 2.
// - out_vector_lp [o] : Output audio data of the lower half of the
// spectrum. The length is |in_vector_length| / 2.
static void SplitFilter(const int16_t* in_vector, int in_vector_length,
// - data_in [i] : Input audio data to be split into two frequency bands.
// - data_length [i] : Length of |data_in|.
// - upper_state [i/o] : State of the upper filter, given in Q(-1).
// - lower_state [i/o] : State of the lower filter, given in Q(-1).
// - hp_data_out [o] : Output audio data of the upper half of the spectrum.
// The length is |data_length| / 2.
// - lp_data_out [o] : Output audio data of the lower half of the spectrum.
// The length is |data_length| / 2.
static void SplitFilter(const int16_t* data_in, int data_length,
int16_t* upper_state, int16_t* lower_state,
int16_t* out_vector_hp, int16_t* out_vector_lp) {
int16_t* hp_data_out, int16_t* lp_data_out) {
int i;
int half_length = in_vector_length >> 1; // Downsampling by 2.
int half_length = data_length >> 1; // Downsampling by 2.
int16_t tmp_out;
// All-pass filtering upper branch.
AllPassFilter(&in_vector[0], half_length, kAllPassCoefsQ15[0], upper_state,
out_vector_hp);
AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
hp_data_out);
// All-pass filtering lower branch.
AllPassFilter(&in_vector[1], half_length, kAllPassCoefsQ15[1], lower_state,
out_vector_lp);
AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
lp_data_out);
// Make LP and HP signals.
for (i = 0; i < half_length; i++) {
tmp_out = *out_vector_hp;
*out_vector_hp++ -= *out_vector_lp;
*out_vector_lp++ += tmp_out;
tmp_out = *hp_data_out;
*hp_data_out++ -= *lp_data_out;
*lp_data_out++ += tmp_out;
}
}
// Calculates the energy in dB of |in_vector|, and also updates an overall
// |power| if necessary.
// Calculates the energy of |data_in| in dB, and also updates an overall
// |total_energy| if necessary.
//
// - in_vector [i] : Input audio data for energy calculation.
// - vector_length [i] : Length of input data.
// - offset [i] : Offset value added to |log_energy|.
// - power [i/o] : Signal power updated with the energy from
// |in_vector|.
// NOTE: |power| is only updated if
// |power| < MIN_ENERGY.
// - log_energy [o] : 10 * log10("energy of |in_vector|") given in Q4.
static void LogOfEnergy(const int16_t* in_vector, int vector_length,
int16_t offset, int16_t* power, int16_t* log_energy) {
int shfts = 0, shfts2 = 0;
int16_t energy_s16 = 0;
int16_t zeros = 0, frac = 0, log2 = 0;
int32_t energy = WebRtcSpl_Energy((int16_t*) in_vector, vector_length,
&shfts);
// - data_in [i] : Input audio data for energy calculation.
// - data_length [i] : Length of input data.
// - offset [i] : Offset value added to |log_energy|.
// - total_energy [i/o] : An external energy updated with the energy of
// |data_in|.
// NOTE: |total_energy| is only updated if
// |total_energy| <= MIN_ENERGY.
// - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4.
static void LogOfEnergy(const int16_t* data_in, int data_length,
int16_t offset, int16_t* total_energy,
int16_t* log_energy) {
// |tot_rshifts| accumulates the number of right shifts performed on |energy|.
int tot_rshifts = 0;
// The |energy| will be normalized to 15 bits. We use unsigned integer because
// we eventually will mask out the fractional part.
uint32_t energy = 0;
if (energy > 0) {
assert(data_in != NULL);
assert(data_length > 0);
shfts2 = 16 - WebRtcSpl_NormW32(energy);
shfts += shfts2;
// "shfts" is the total number of right shifts that has been done to
// energy_s16.
energy_s16 = (int16_t) WEBRTC_SPL_SHIFT_W32(energy, -shfts2);
energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length,
&tot_rshifts);
// Find:
// 160*log10(energy_s16*2^shfts) = 160*log10(2)*log2(energy_s16*2^shfts) =
// 160*log10(2)*(log2(energy_s16) + log2(2^shfts)) =
// 160*log10(2)*(log2(energy_s16) + shfts)
if (energy != 0) {
// By construction, normalizing to 15 bits is equivalent to having 17 leading
// zeros in an unsigned 32 bit value.
int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
// In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
// (14 << 10), which is what we initialize |log2_energy| with. For a more
// detailed derivation, see below.
int16_t log2_energy = kLogEnergyIntPart;
zeros = WebRtcSpl_NormU32(energy_s16);
frac = (int16_t) (((uint32_t) ((int32_t) (energy_s16) << zeros)
& 0x7FFFFFFF) >> 21);
log2 = (int16_t) (((31 - zeros) << 10) + frac);
tot_rshifts += normalizing_rshifts;
// Normalize |energy| to 15 bits.
// |tot_rshifts| is now the total number of right shifts performed on
// |energy| after normalization. This means that |energy| is in
// Q(-tot_rshifts).
if (normalizing_rshifts < 0) {
energy <<= -normalizing_rshifts;
} else {
energy >>= normalizing_rshifts;
}
*log_energy = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(kLogConst, log2, 19)
+ (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(shfts, kLogConst, 9);
// Calculate the energy of |data_in| in dB, in Q4.
//
// 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
// 160 * log10(|energy| * 2^|tot_rshifts|) =
// 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
// 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
// (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
// |kLogConst| * (|log2_energy| + |tot_rshifts|)
//
// We know by construction that |energy| is normalized to 15 bits. Hence,
// |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
// Further, we'd like |log2_energy| in Q10
// log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
// 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
// 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
// (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
// (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
//
// Note that frac_Q15 = (|energy| & 0x00003FFF)
// Calculate and add the fractional part to |log2_energy|.
log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);
// |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
// Note that our derivation above already accounts for an output in Q4.
*log_energy = (int16_t) (WEBRTC_SPL_MUL_16_16_RSFT(
kLogConst, log2_energy, 19) +
WEBRTC_SPL_MUL_16_16_RSFT(tot_rshifts, kLogConst, 9));
if (*log_energy < 0) {
*log_energy = 0;
}
} else {
*log_energy = 0;
shfts = -15;
energy_s16 = 0;
*log_energy = offset;
return;
}
*log_energy += offset;
// Total power in frame
if (*power <= MIN_ENERGY) {
if (shfts > 0) {
*power += MIN_ENERGY + 1;
} else if (WEBRTC_SPL_SHIFT_W16(energy_s16, shfts) > MIN_ENERGY) {
*power += MIN_ENERGY + 1;
// Update the approximate |total_energy| with the energy of |data_in|, if
// |total_energy| has not exceeded MIN_ENERGY. |total_energy| is used as an
// energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
if (*total_energy <= MIN_ENERGY) {
if (tot_rshifts >= 0) {
// We know by construction that the |energy| > MIN_ENERGY in Q0, so add an
// arbitrary value such that |total_energy| exceeds MIN_ENERGY.
*total_energy += MIN_ENERGY + 1;
} else {
*power += WEBRTC_SPL_SHIFT_W16(energy_s16, shfts);
// By construction |energy| is represented by 15 bits, hence |energy| right
// shifted by any number of bits will fit in an int16_t. In addition, adding
// the value to |total_energy| is wrap-around safe as long as
// MIN_ENERGY < 8192.
*total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
}
}
}
int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
int data_length, int16_t* data_out) {
int16_t power = 0;
int data_length, int16_t* features) {
int16_t total_energy = 0;
// We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
// 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
// have at most 120 samples after the first split and at most 60 samples after
@ -244,10 +283,10 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
// Energy in 3000 Hz - 4000 Hz.
length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
LogOfEnergy(hp_60, length, kOffsetVector[5], &power, &data_out[5]);
LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);
// Energy in 2000 Hz - 3000 Hz.
LogOfEnergy(lp_60, length, kOffsetVector[4], &power, &data_out[4]);
LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);
// For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
frequency_band = 2;
@ -260,7 +299,7 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
// Energy in 1000 Hz - 2000 Hz.
length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
LogOfEnergy(hp_60, length, kOffsetVector[3], &power, &data_out[3]);
LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);
// For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
frequency_band = 3;
@ -272,7 +311,7 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
// Energy in 500 Hz - 1000 Hz.
length >>= 1; // |data_length| / 8 <=> bandwidth = 500 Hz.
LogOfEnergy(hp_120, length, kOffsetVector[2], &power, &data_out[2]);
LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);
// For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
frequency_band = 4;
@ -284,13 +323,13 @@ int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
// Energy in 250 Hz - 500 Hz.
length >>= 1; // |data_length| / 16 <=> bandwidth = 250 Hz.
LogOfEnergy(hp_60, length, kOffsetVector[1], &power, &data_out[1]);
LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);
// Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
HighPassFilter(lp_60, length, self->hp_filter_state, hp_120);
// Energy in 80 Hz - 250 Hz.
LogOfEnergy(hp_120, length, kOffsetVector[0], &power, &data_out[0]);
LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);
return power;
return total_energy;
}

View File

@ -19,7 +19,7 @@
#include "vad_core.h"
// Takes |data_length| samples of |data_in| and calculates the logarithm of the
// power of each of the |NUM_CHANNELS| = 6 frequency bands used by the VAD:
// energy of each of the |NUM_CHANNELS| = 6 frequency bands used by the VAD:
// 80 Hz - 250 Hz
// 250 Hz - 500 Hz
// 500 Hz - 1000 Hz
@ -27,18 +27,18 @@
// 2000 Hz - 3000 Hz
// 3000 Hz - 4000 Hz
//
// The values are given in Q4 and written to |data_out|. Further, an approximate
// overall power is returned. The return value is used in
// The values are given in Q4 and written to |features|. Further, an approximate
// overall energy is returned. The return value is used in
// WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above
// the threshold MIN_ENERGY.
//
// - self [i/o] : State information of the VAD.
// - data_in [i] : Input audio data, for feature extraction.
// - data_length [i] : Audio data size, in number of samples.
// - data_out [o] : 10 * log10(power in each frequency band), Q4.
// - returns : Total power of the signal (NOTE! This value is not
// - features [o] : 10 * log10(energy in each frequency band), Q4.
// - returns : Total energy of the signal (NOTE! This value is not
// exact. It is only used in a comparison.)
int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
int data_length, int16_t* data_out);
int data_length, int16_t* features);
#endif // WEBRTC_COMMON_AUDIO_VAD_VAD_FILTERBANK_H_

View File

@ -25,17 +25,16 @@ namespace {
enum { kNumValidFrameLengths = 3 };
TEST_F(VadTest, vad_filterbank) {
VadInstT* self = (VadInstT*) malloc(sizeof(VadInstT));
static const int16_t kReference[kNumValidFrameLengths] = { 15, 11, 11 };
static const int16_t kReferencePowers[kNumValidFrameLengths * NUM_CHANNELS] =
{
VadInstT* self = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
static const int16_t kReference[kNumValidFrameLengths] = { 48, 11, 11 };
static const int16_t kFeatures[kNumValidFrameLengths * NUM_CHANNELS] = {
1213, 759, 587, 462, 434, 272,
1479, 1385, 1291, 1200, 1103, 1099,
1732, 1692, 1681, 1629, 1436, 1436
};
static const int16_t kOffsetVector[NUM_CHANNELS] = {
368, 368, 272, 176, 176, 176 };
int16_t data_out[NUM_CHANNELS];
int16_t features[NUM_CHANNELS];
// Construct a speech signal that will trigger the VAD in all modes. It is
// known that (i * i) will wrap around, but that doesn't matter in this case.
@ -50,10 +49,10 @@ TEST_F(VadTest, vad_filterbank) {
if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) {
EXPECT_EQ(kReference[frame_length_index],
WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
data_out));
features));
for (int k = 0; k < NUM_CHANNELS; ++k) {
EXPECT_EQ(kReferencePowers[k + frame_length_index * NUM_CHANNELS],
data_out[k]);
EXPECT_EQ(kFeatures[k + frame_length_index * NUM_CHANNELS],
features[k]);
}
frame_length_index++;
}
@ -66,9 +65,9 @@ TEST_F(VadTest, vad_filterbank) {
for (size_t j = 0; j < kFrameLengthsSize; ++j) {
if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) {
EXPECT_EQ(0, WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
data_out));
features));
for (int k = 0; k < NUM_CHANNELS; ++k) {
EXPECT_EQ(kOffsetVector[k], data_out[k]);
EXPECT_EQ(kOffsetVector[k], features[k]);
}
}
}
@ -82,9 +81,9 @@ TEST_F(VadTest, vad_filterbank) {
if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) {
ASSERT_EQ(0, WebRtcVad_InitCore(self, 0));
EXPECT_EQ(0, WebRtcVad_CalculateFeatures(self, speech, kFrameLengths[j],
data_out));
features));
for (int k = 0; k < NUM_CHANNELS; ++k) {
EXPECT_EQ(kOffsetVector[k], data_out[k]);
EXPECT_EQ(kOffsetVector[k], features[k]);
}
}
}