diff --git a/src/common_audio/vad/vad_core.h b/src/common_audio/vad/vad_core.h index 544caf5ab..cad6ca4a7 100644 --- a/src/common_audio/vad/vad_core.h +++ b/src/common_audio/vad/vad_core.h @@ -28,11 +28,14 @@ typedef struct VadInstT_ WebRtc_Word16 speech_means[NUM_TABLE_VALUES]; WebRtc_Word16 noise_stds[NUM_TABLE_VALUES]; WebRtc_Word16 speech_stds[NUM_TABLE_VALUES]; + // TODO(bjornv): Change to |frame_count|. WebRtc_Word32 frame_counter; WebRtc_Word16 over_hang; // Over Hang WebRtc_Word16 num_of_speech; + // TODO(bjornv): Change to |age_vector|. WebRtc_Word16 index_vector[16 * NUM_CHANNELS]; WebRtc_Word16 low_value_vector[16 * NUM_CHANNELS]; + // TODO(bjornv): Change to |median|. WebRtc_Word16 mean_value[NUM_CHANNELS]; WebRtc_Word16 upper_state[5]; WebRtc_Word16 lower_state[5]; diff --git a/src/common_audio/vad/vad_sp.c b/src/common_audio/vad/vad_sp.c index 620ab9772..4fface3a6 100644 --- a/src/common_audio/vad/vad_sp.c +++ b/src/common_audio/vad/vad_sp.c @@ -8,229 +8,174 @@ * be found in the AUTHORS file in the root of the source tree. */ - -/* - * This file includes the implementation of the VAD internal calls for - * Downsampling and FindMinimum. - * For function call descriptions; See vad_sp.h. - */ - #include "vad_sp.h" +#include <assert.h> + #include "signal_processing_library.h" #include "typedefs.h" #include "vad_defines.h" -// Allpass filter coefficients, upper and lower, in Q13 -// Upper: 0.64, Lower: 0.17 -static const WebRtc_Word16 kAllPassCoefsQ13[2] = {5243, 1392}; // Q13 +// Allpass filter coefficients, upper and lower, in Q13. +// Upper: 0.64, Lower: 0.17. 
+static const int16_t kAllPassCoefsQ13[2] = { 5243, 1392 }; // Q13 -// Downsampling filter based on the splitting filter and the allpass functions -// in vad_filterbank.c -void WebRtcVad_Downsampling(WebRtc_Word16* signal_in, - WebRtc_Word16* signal_out, - WebRtc_Word32* filter_state, - int inlen) -{ - WebRtc_Word16 tmp16_1, tmp16_2; - WebRtc_Word32 tmp32_1, tmp32_2; - int n, halflen; +// TODO(bjornv): Move this function to vad_filterbank.c. +// Downsampling filter based on splitting filter and allpass functions. +void WebRtcVad_Downsampling(int16_t* signal_in, + int16_t* signal_out, + int32_t* filter_state, + int in_length) { + int16_t tmp16_1 = 0, tmp16_2 = 0; + int32_t tmp32_1 = filter_state[0]; + int32_t tmp32_2 = filter_state[1]; + int n = 0; + int half_length = (in_length >> 1); // Downsampling by 2 gives half length. - // Downsampling by 2 and get two branches - halflen = WEBRTC_SPL_RSHIFT_W16(inlen, 1); + // Filter coefficients in Q13, filter state in Q0. + for (n = 0; n < half_length; n++) { + // All-pass filtering upper branch. 
+ tmp16_1 = (int16_t) ((tmp32_1 >> 1) + + WEBRTC_SPL_MUL_16_16_RSFT(kAllPassCoefsQ13[0], *signal_in, 14)); + *signal_out = tmp16_1; + tmp32_1 = (int32_t) (*signal_in++) - + WEBRTC_SPL_MUL_16_16_RSFT(kAllPassCoefsQ13[0], tmp16_1, 12); - tmp32_1 = filter_state[0]; - tmp32_2 = filter_state[1]; - - // Filter coefficients in Q13, filter state in Q0 - for (n = 0; n < halflen; n++) - { - // All-pass filtering upper branch - tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(tmp32_1, 1) - + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((kAllPassCoefsQ13[0]), - *signal_in, 14); - *signal_out = tmp16_1; - tmp32_1 = (WebRtc_Word32)(*signal_in++) - - (WebRtc_Word32)WEBRTC_SPL_MUL_16_16_RSFT((kAllPassCoefsQ13[0]), tmp16_1, 12); - - // All-pass filtering lower branch - tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(tmp32_2, 1) - + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((kAllPassCoefsQ13[1]), - *signal_in, 14); - *signal_out++ += tmp16_2; - tmp32_2 = (WebRtc_Word32)(*signal_in++) - - (WebRtc_Word32)WEBRTC_SPL_MUL_16_16_RSFT((kAllPassCoefsQ13[1]), tmp16_2, 12); - } - filter_state[0] = tmp32_1; - filter_state[1] = tmp32_2; + // All-pass filtering lower branch. + tmp16_2 = (int16_t) ((tmp32_2 >> 1) + + WEBRTC_SPL_MUL_16_16_RSFT(kAllPassCoefsQ13[1], *signal_in, 14)); + *signal_out++ += tmp16_2; + tmp32_2 = (int32_t) (*signal_in++) - + WEBRTC_SPL_MUL_16_16_RSFT(kAllPassCoefsQ13[1], tmp16_2, 12); + } + // Store the filter states. + filter_state[0] = tmp32_1; + filter_state[1] = tmp32_2; } -WebRtc_Word16 WebRtcVad_FindMinimum(VadInstT* inst, - WebRtc_Word16 x, - int n) -{ - int i, j, k, II = -1, offset; - WebRtc_Word16 meanV, alpha; - WebRtc_Word32 tmp32, tmp32_1; - WebRtc_Word16 *valptr, *idxptr, *p1, *p2, *p3; +// Inserts |feature_value| into |low_value_vector|, if it is one of the 16 +// smallest values the last 100 frames. Then calculates and returns the median +// of the five smallest values. 
+int16_t WebRtcVad_FindMinimum(VadInstT* self, + int16_t feature_value, + int channel) { + int i = 0, j = 0; + int position = -1; + // Offset to beginning of the 16 minimum values in memory. + int offset = (channel << 4); + int16_t current_median = 1600; + int16_t alpha = 0; + int32_t tmp32 = 0; + // Pointer to memory for the 16 minimum values and the age of each value of + // the |channel|. + int16_t* age_ptr = &self->index_vector[offset]; + int16_t* value_ptr = &self->low_value_vector[offset]; + int16_t *p1, *p2, *p3; - // Offset to beginning of the 16 minimum values in memory - offset = WEBRTC_SPL_LSHIFT_W16(n, 4); + assert(channel < NUM_CHANNELS); - // Pointer to memory for the 16 minimum values and the age of each value - idxptr = &inst->index_vector[offset]; - valptr = &inst->low_value_vector[offset]; + // Each value in |low_value_vector| is getting 1 loop older. + // Update age of each value in |age_ptr|, and remove old values. + for (i = 0; i < 16; i++) { + p3 = age_ptr + i; + if (*p3 != 100) { + *p3 += 1; + } else { + p1 = value_ptr + i + 1; + p2 = p3 + 1; + for (j = i; j < 16; j++) { + *(value_ptr + j) = *p1++; + *(age_ptr + j) = *p2++; + } + *(age_ptr + 15) = 101; + *(value_ptr + 15) = 10000; + } + } - // Each value in low_value_vector is getting 1 loop older. - // Update age of each value in indexVal, and remove old values. - for (i = 0; i < 16; i++) - { - p3 = idxptr + i; - if (*p3 != 100) - { - *p3 += 1; - } else - { - p1 = valptr + i + 1; - p2 = p3 + 1; - for (j = i; j < 16; j++) - { - *(valptr + j) = *p1++; - *(idxptr + j) = *p2++; - } - *(idxptr + 15) = 101; - *(valptr + 15) = 10000; + // Check if |feature_value| is smaller than any of the values in + // |low_value_vector|. If so, find the |position| where to insert the new + // value. 
+ if (feature_value < *(value_ptr + 7)) { + if (feature_value < *(value_ptr + 3)) { + if (feature_value < *(value_ptr + 1)) { + if (feature_value < *value_ptr) { + position = 0; + } else { + position = 1; } + } else if (feature_value < *(value_ptr + 2)) { + position = 2; + } else { + position = 3; + } + } else if (feature_value < *(value_ptr + 5)) { + if (feature_value < *(value_ptr + 4)) { + position = 4; + } else { + position = 5; + } + } else if (feature_value < *(value_ptr + 6)) { + position = 6; + } else { + position = 7; } - - // Check if x smaller than any of the values in low_value_vector. - // If so, find position. - if (x < *(valptr + 7)) - { - if (x < *(valptr + 3)) - { - if (x < *(valptr + 1)) - { - if (x < *valptr) - { - II = 0; - } else - { - II = 1; - } - } else if (x < *(valptr + 2)) - { - II = 2; - } else - { - II = 3; - } - } else if (x < *(valptr + 5)) - { - if (x < *(valptr + 4)) - { - II = 4; - } else - { - II = 5; - } - } else if (x < *(valptr + 6)) - { - II = 6; - } else - { - II = 7; - } - } else if (x < *(valptr + 15)) - { - if (x < *(valptr + 11)) - { - if (x < *(valptr + 9)) - { - if (x < *(valptr + 8)) - { - II = 8; - } else - { - II = 9; - } - } else if (x < *(valptr + 10)) - { - II = 10; - } else - { - II = 11; - } - } else if (x < *(valptr + 13)) - { - if (x < *(valptr + 12)) - { - II = 12; - } else - { - II = 13; - } - } else if (x < *(valptr + 14)) - { - II = 14; - } else - { - II = 15; + } else if (feature_value < *(value_ptr + 15)) { + if (feature_value < *(value_ptr + 11)) { + if (feature_value < *(value_ptr + 9)) { + if (feature_value < *(value_ptr + 8)) { + position = 8; + } else { + position = 9; } + } else if (feature_value < *(value_ptr + 10)) { + position = 10; + } else { + position = 11; + } + } else if (feature_value < *(value_ptr + 13)) { + if (feature_value < *(value_ptr + 12)) { + position = 12; + } else { + position = 13; + } + } else if (feature_value < *(value_ptr + 14)) { + position = 14; + } else { + position = 
15; } + } - // Put new min value on right position and shift bigger values up - if (II > -1) - { - for (i = 15; i > II; i--) - { - k = i - 1; - *(valptr + i) = *(valptr + k); - *(idxptr + i) = *(idxptr + k); - } - *(valptr + II) = x; - *(idxptr + II) = 1; + // If we have a new small value, put it in the correct position and shift + // larger values up. + if (position > -1) { + for (i = 15; i > position; i--) { + j = i - 1; + *(value_ptr + i) = *(value_ptr + j); + *(age_ptr + i) = *(age_ptr + j); } + *(value_ptr + position) = feature_value; + *(age_ptr + position) = 1; + } - meanV = 0; - if ((inst->frame_counter) > 4) - { - j = 5; - } else - { - j = inst->frame_counter; + // Get |current_median|. + if (self->frame_counter > 2) { + current_median = *(value_ptr + 2); + } else if (self->frame_counter > 0) { + current_median = *value_ptr; + } + + // Smooth the median value. + if (self->frame_counter > 0) { + if (current_median < self->mean_value[channel]) { + alpha = (int16_t) ALPHA1; // 0.2 in Q15. + } else { + alpha = (int16_t) ALPHA2; // 0.99 in Q15. 
} + } + tmp32 = WEBRTC_SPL_MUL_16_16(alpha + 1, self->mean_value[channel]); + tmp32 += WEBRTC_SPL_MUL_16_16(WEBRTC_SPL_WORD16_MAX - alpha, current_median); + tmp32 += 16384; + self->mean_value[channel] = (int16_t) (tmp32 >> 15); - if (j > 2) - { - meanV = *(valptr + 2); - } else if (j > 0) - { - meanV = *valptr; - } else - { - meanV = 1600; - } - - if (inst->frame_counter > 0) - { - if (meanV < inst->mean_value[n]) - { - alpha = (WebRtc_Word16)ALPHA1; // 0.2 in Q15 - } else - { - alpha = (WebRtc_Word16)ALPHA2; // 0.99 in Q15 - } - } else - { - alpha = 0; - } - - tmp32 = WEBRTC_SPL_MUL_16_16((alpha+1), inst->mean_value[n]); - tmp32_1 = WEBRTC_SPL_MUL_16_16(WEBRTC_SPL_WORD16_MAX - alpha, meanV); - tmp32 += tmp32_1; - tmp32 += 16384; - inst->mean_value[n] = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(tmp32, 15); - - return inst->mean_value[n]; + return self->mean_value[channel]; } diff --git a/src/common_audio/vad/vad_sp.h b/src/common_audio/vad/vad_sp.h index ae15c11ad..95c3b4c89 100644 --- a/src/common_audio/vad/vad_sp.h +++ b/src/common_audio/vad/vad_sp.h @@ -9,52 +9,46 @@ */ -/* - * This header file includes the VAD internal calls for Downsampling and FindMinimum. - * Specific function calls are given below. - */ +// This file includes specific signal processing tools used in vad_core.c. -#ifndef WEBRTC_VAD_SP_H_ -#define WEBRTC_VAD_SP_H_ +#ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_SP_H_ +#define WEBRTC_COMMON_AUDIO_VAD_VAD_SP_H_ +#include "typedefs.h" #include "vad_core.h" -/**************************************************************************** - * WebRtcVad_Downsampling(...) - * - * Downsamples the signal a factor 2, eg. 
32->16 or 16->8 - * - * Input: - * - signal_in : Input signal - * - in_length : Length of input signal in samples - * - * Input & Output: - * - filter_state : Filter state for first all-pass filters - * - * Output: - * - signal_out : Downsampled signal (of length len/2) - */ -void WebRtcVad_Downsampling(WebRtc_Word16* signal_in, - WebRtc_Word16* signal_out, - WebRtc_Word32* filter_state, +// Downsamples the signal by a factor of 2, e.g. 32->16 or 16->8. +// +// Inputs: +// - signal_in : Input signal. +// - in_length : Length of input signal in samples. +// +// Input & Output: +// - filter_state : Current filter states of the two all-pass filters. The +// |filter_state| is updated after all samples have been +// processed. +// +// Output: +// - signal_out : Downsampled signal (of length |in_length| / 2). +void WebRtcVad_Downsampling(int16_t* signal_in, + int16_t* signal_out, + int32_t* filter_state, int in_length); -/**************************************************************************** - * WebRtcVad_FindMinimum(...) - * - * Find the five lowest values of x in 100 frames long window. Return a mean - * value of these five values. - * - * Input: - * - feature_value : Feature value - * - channel : Channel number - * - * Input & Output: - * - inst : State information - * - * Output: - * return value : Weighted minimum value for a moving window. - */ -WebRtc_Word16 WebRtcVad_FindMinimum(VadInstT* inst, WebRtc_Word16 feature_value, int channel); +// Updates and returns the smoothed feature minimum. As minimum we use the +// median of the five smallest feature values in a 100 frames long window. +// +// Inputs: +// - feature_value : New feature value to update with. +// - channel : Channel number. +// +// Input & Output: +// - handle : State information of the VAD. +// +// Returns: +// : Smoothed minimum value for a moving window. 
+int16_t WebRtcVad_FindMinimum(VadInstT* handle, + int16_t feature_value, + int channel); -#endif // WEBRTC_VAD_SP_H_ +#endif // WEBRTC_COMMON_AUDIO_VAD_VAD_SP_H_ diff --git a/src/common_audio/vad/vad_unittest.cc b/src/common_audio/vad/vad_unittest.cc index 66e5ffad5..54a397a30 100644 --- a/src/common_audio/vad/vad_unittest.cc +++ b/src/common_audio/vad/vad_unittest.cc @@ -15,12 +15,12 @@ #include "typedefs.h" #include "webrtc_vad.h" -#ifdef __cplusplus -extern "C" -{ +// TODO(bjornv): Move the internal unit tests to separate files. +extern "C" { +#include "vad_core.h" #include "vad_gmm.h" +#include "vad_sp.h" } -#endif namespace webrtc { namespace { @@ -28,11 +28,12 @@ const int16_t kModes[] = { 0, 1, 2, 3 }; const size_t kModesSize = sizeof(kModes) / sizeof(*kModes); // Rates we support. -const int16_t kRates[] = { 8000, 16000, 32000 }; +const int16_t kRates[] = { 8000, 12000, 16000, 24000, 32000 }; const size_t kRatesSize = sizeof(kRates) / sizeof(*kRates); // Frame lengths we support. const int16_t kMaxFrameLength = 960; -const int16_t kFrameLengths[] = { 80, 160, 240, 320, 480, 640, 960 }; +const int16_t kFrameLengths[] = { 80, 120, 160, 240, 320, 480, 640, + kMaxFrameLength }; const size_t kFrameLengthsSize = sizeof(kFrameLengths) / sizeof(*kFrameLengths); // Returns true if the rate and frame length combination is valid. @@ -182,6 +183,51 @@ TEST_F(VadTest, GMMTests) { EXPECT_EQ(13440, delta); } +TEST_F(VadTest, SPTests) { + VadInstT* handle = (VadInstT*) malloc(sizeof(VadInstT)); + int16_t zeros[kMaxFrameLength] = { 0 }; + int32_t state[2] = { 0 }; + int16_t data_in[kMaxFrameLength]; + int16_t data_out[kMaxFrameLength]; + + const int16_t kReferenceMin[32] = { + 1600, 720, 509, 512, 532, 552, 570, 588, + 606, 624, 642, 659, 675, 691, 707, 723, + 1600, 544, 502, 522, 542, 561, 579, 597, + 615, 633, 651, 667, 683, 699, 715, 731 + }; + + // Construct a speech signal that will trigger the VAD in all modes. 
It is + // known that (i * i) will wrap around, but that doesn't matter in this case. + for (int16_t i = 0; i < kMaxFrameLength; ++i) { + data_in[i] = (i * i); + } + // Input values all zeros, expect all zeros out. + WebRtcVad_Downsampling(zeros, data_out, state, (int) kMaxFrameLength); + EXPECT_EQ(0, state[0]); + EXPECT_EQ(0, state[1]); + for (int16_t i = 0; i < kMaxFrameLength / 2; ++i) { + EXPECT_EQ(0, data_out[i]); + } + // Make a simple non-zero data test. + WebRtcVad_Downsampling(data_in, data_out, state, (int) kMaxFrameLength); + EXPECT_EQ(207, state[0]); + EXPECT_EQ(2270, state[1]); + + ASSERT_EQ(0, WebRtcVad_InitCore(handle, 0)); + for (int16_t i = 0; i < 16; ++i) { + int16_t value = 500 * (i + 1); + for (int j = 0; j < NUM_CHANNELS; ++j) { + // Use values both above and below initialized value. + EXPECT_EQ(kReferenceMin[i], WebRtcVad_FindMinimum(handle, value, j)); + EXPECT_EQ(kReferenceMin[i + 16], WebRtcVad_FindMinimum(handle, 12000, j)); + } + handle->frame_counter++; + } + + free(handle); +} + // TODO(bjornv): Add a process test, run on file. } // namespace