From ef433579cb8319e93f75a27bde4b348c87537278 Mon Sep 17 00:00:00 2001 From: "tina.legrand@webrtc.org" Date: Mon, 15 Oct 2012 17:46:19 +0000 Subject: [PATCH] Adding support for 48 kHz input to VAD. This CL adds support for 48 kHz sampling frequency in the VAD, by adding downsampling from 48 to 8 kHz. BUG= TEST=vad_unittest Review URL: https://webrtc-codereview.appspot.com/855010 git-svn-id: http://webrtc.googlecode.com/svn/trunk@2926 4adac7df-926f-26a2-2b94-8c16560cd09d --- src/common_audio/vad/vad_core.c | 28 +++++++++++++++++++ src/common_audio/vad/vad_core.h | 5 ++++ src/common_audio/vad/vad_core_unittest.cc | 6 +++++ src/common_audio/vad/vad_sp_unittest.cc | 17 ++++++------ src/common_audio/vad/vad_unittest.cc | 33 ++++++++++++++++------- src/common_audio/vad/vad_unittest.h | 6 ++--- src/common_audio/vad/webrtc_vad.c | 6 +++-- 7 files changed, 78 insertions(+), 23 deletions(-) diff --git a/src/common_audio/vad/vad_core.c b/src/common_audio/vad/vad_core.c index 1e9053fa8..6a36349ad 100644 --- a/src/common_audio/vad/vad_core.c +++ b/src/common_audio/vad/vad_core.c @@ -504,6 +504,9 @@ int WebRtcVad_InitCore(VadInstT* self) { memset(self->downsampling_filter_states, 0, sizeof(self->downsampling_filter_states)); + // Initialization of 48 to 8 kHz downsampling. + WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8); + // Read initial PDF parameters. for (i = 0; i < kTableSize; i++) { self->noise_means[i] = kNoiseDataMeans[i]; @@ -600,6 +603,31 @@ int WebRtcVad_set_mode_core(VadInstT* self, int mode) { // Calculate VAD decision by first extracting feature values and then calculate // probability for both speech and background noise. +int WebRtcVad_CalcVad48khz(VadInstT* inst, int16_t* speech_frame, + int frame_length) { + int vad; + int i; + int16_t speech_nb[240]; // 30 ms in 8 kHz. + // |tmp_mem| is a temporary memory used by resample function, length is + // frame length in 10 ms (480 samples) + 256 extra. 
+ int32_t tmp_mem[480 + 256] = { 0 }; + const int kFrameLen10ms48khz = 480; + const int kFrameLen10ms8khz = 80; + int num_10ms_frames = frame_length / kFrameLen10ms48khz; + + for (i = 0; i < num_10ms_frames; i++) { // One 10 ms block per pass. + WebRtcSpl_Resample48khzTo8khz(&speech_frame[i * kFrameLen10ms48khz], + &speech_nb[i * kFrameLen10ms8khz], + &inst->state_48_to_8, + tmp_mem); + } + + // Do VAD on an 8 kHz signal + vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6); + + return vad; +} + int WebRtcVad_CalcVad32khz(VadInstT* inst, int16_t* speech_frame, int frame_length) { diff --git a/src/common_audio/vad/vad_core.h b/src/common_audio/vad/vad_core.h index 00d39a4fb..b89d5dfd4 100644 --- a/src/common_audio/vad/vad_core.h +++ b/src/common_audio/vad/vad_core.h @@ -16,6 +16,7 @@ #ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_ #define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_ +#include "common_audio/signal_processing/include/signal_processing_library.h" #include "typedefs.h" enum { kNumChannels = 6 }; // Number of frequency bands (named channels). @@ -28,6 +29,7 @@ typedef struct VadInstT_ int vad; int32_t downsampling_filter_states[4]; + WebRtcSpl_State48khzTo8khz state_48_to_8; int16_t noise_means[kTableSize]; int16_t speech_means[kTableSize]; int16_t noise_stds[kTableSize]; @@ -82,6 +84,7 @@ int WebRtcVad_InitCore(VadInstT* self); int WebRtcVad_set_mode_core(VadInstT* self, int mode); /**************************************************************************** + * WebRtcVad_CalcVad48khz(...) * WebRtcVad_CalcVad32khz(...) * WebRtcVad_CalcVad16khz(...) * WebRtcVad_CalcVad8khz(...) 
@@ -100,6 +103,8 @@ int WebRtcVad_set_mode_core(VadInstT* self, int mode); * 0 - No active speech * 1-6 - Active speech */ +int WebRtcVad_CalcVad48khz(VadInstT* inst, int16_t* speech_frame, + int frame_length); int WebRtcVad_CalcVad32khz(VadInstT* inst, int16_t* speech_frame, int frame_length); int WebRtcVad_CalcVad16khz(VadInstT* inst, int16_t* speech_frame, diff --git a/src/common_audio/vad/vad_core_unittest.cc b/src/common_audio/vad/vad_core_unittest.cc index 141b7965c..0c5648fef 100644 --- a/src/common_audio/vad/vad_core_unittest.cc +++ b/src/common_audio/vad/vad_core_unittest.cc @@ -75,6 +75,9 @@ TEST_F(VadTest, CalcVad) { if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) { EXPECT_EQ(0, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j])); } + if (ValidRatesAndFrameLengths(48000, kFrameLengths[j])) { + EXPECT_EQ(0, WebRtcVad_CalcVad48khz(self, speech, kFrameLengths[j])); + } } // Construct a speech signal that will trigger the VAD in all modes. It is @@ -92,6 +95,9 @@ TEST_F(VadTest, CalcVad) { if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) { EXPECT_EQ(1, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j])); } + if (ValidRatesAndFrameLengths(48000, kFrameLengths[j])) { + EXPECT_EQ(1, WebRtcVad_CalcVad48khz(self, speech, kFrameLengths[j])); + } } free(self); diff --git a/src/common_audio/vad/vad_sp_unittest.cc b/src/common_audio/vad/vad_sp_unittest.cc index 2b253161b..632117ffc 100644 --- a/src/common_audio/vad/vad_sp_unittest.cc +++ b/src/common_audio/vad/vad_sp_unittest.cc @@ -23,10 +23,11 @@ namespace { TEST_F(VadTest, vad_sp) { VadInstT* self = reinterpret_cast(malloc(sizeof(VadInstT))); - int16_t zeros[kMaxFrameLength] = { 0 }; + const int kMaxFrameLenSp = 960; // Maximum frame length in this unittest. 
+ int16_t zeros[kMaxFrameLenSp] = { 0 }; int32_t state[2] = { 0 }; - int16_t data_in[kMaxFrameLength]; - int16_t data_out[kMaxFrameLength]; + int16_t data_in[kMaxFrameLenSp]; + int16_t data_out[kMaxFrameLenSp]; // We expect the first value to be 1600 as long as |frame_counter| is zero, // which is true for the first iteration. @@ -39,20 +40,18 @@ TEST_F(VadTest, vad_sp) { // Construct a speech signal that will trigger the VAD in all modes. It is // known that (i * i) will wrap around, but that doesn't matter in this case. - for (int16_t i = 0; i < kMaxFrameLength; ++i) { + for (int16_t i = 0; i < kMaxFrameLenSp; ++i) { data_in[i] = (i * i); } // Input values all zeros, expect all zeros out. - WebRtcVad_Downsampling(zeros, data_out, state, - static_cast(kMaxFrameLength)); + WebRtcVad_Downsampling(zeros, data_out, state, kMaxFrameLenSp); EXPECT_EQ(0, state[0]); EXPECT_EQ(0, state[1]); - for (int16_t i = 0; i < kMaxFrameLength / 2; ++i) { + for (int16_t i = 0; i < kMaxFrameLenSp / 2; ++i) { EXPECT_EQ(0, data_out[i]); } // Make a simple non-zero data test. 
- WebRtcVad_Downsampling(data_in, data_out, state, - static_cast(kMaxFrameLength)); + WebRtcVad_Downsampling(data_in, data_out, state, kMaxFrameLenSp); EXPECT_EQ(207, state[0]); EXPECT_EQ(2270, state[1]); diff --git a/src/common_audio/vad/vad_unittest.cc b/src/common_audio/vad/vad_unittest.cc index b31217c3f..3e6685319 100644 --- a/src/common_audio/vad/vad_unittest.cc +++ b/src/common_audio/vad/vad_unittest.cc @@ -36,12 +36,16 @@ bool VadTest::ValidRatesAndFrameLengths(int rate, int frame_length) { return true; } return false; - } - if (rate == 32000) { + } else if (rate == 32000) { if (frame_length == 320 || frame_length == 640 || frame_length == 960) { return true; } return false; + } else if (rate == 48000) { + if (frame_length == 480 || frame_length == 960 || frame_length == 1440) { + return true; + } + return false; } return false; @@ -122,15 +126,26 @@ TEST_F(VadTest, ApiTest) { TEST_F(VadTest, ValidRatesFrameLengths) { // This test verifies valid and invalid rate/frame_length combinations. We - // loop through sampling rates and frame lengths from negative values to + // loop through some sampling rates and frame lengths from negative values to // values larger than possible. 
- for (int16_t rate = -1; rate <= kRates[kRatesSize - 1] + 1; rate++) { - for (int16_t frame_length = -1; frame_length <= kMaxFrameLength + 1; - frame_length++) { - if (ValidRatesAndFrameLengths(rate, frame_length)) { - EXPECT_EQ(0, WebRtcVad_ValidRateAndFrameLength(rate, frame_length)); + const int kNumRates = 12; + const int kRates[kNumRates] = { + -8000, -4000, 0, 4000, 8000, 8001, 15999, 16000, 32000, 48000, 48001, 96000 + }; + + const int kNumFrameLengths = 13; + const int kFrameLengths[kNumFrameLengths] = { + -10, 0, 80, 81, 159, 160, 240, 320, 480, 640, 960, 1440, 2000 + }; + + for (int i = 0; i < kNumRates; i++) { + for (int j = 0; j < kNumFrameLengths; j++) { + if (ValidRatesAndFrameLengths(kRates[i], kFrameLengths[j])) { + EXPECT_EQ(0, WebRtcVad_ValidRateAndFrameLength(kRates[i], + kFrameLengths[j])); } else { - EXPECT_EQ(-1, WebRtcVad_ValidRateAndFrameLength(rate, frame_length)); + EXPECT_EQ(-1, WebRtcVad_ValidRateAndFrameLength(kRates[i], + kFrameLengths[j])); } } } diff --git a/src/common_audio/vad/vad_unittest.h b/src/common_audio/vad/vad_unittest.h index 306980128..a42e86f00 100644 --- a/src/common_audio/vad/vad_unittest.h +++ b/src/common_audio/vad/vad_unittest.h @@ -24,12 +24,12 @@ const int kModes[] = { 0, 1, 2, 3 }; const size_t kModesSize = sizeof(kModes) / sizeof(*kModes); // Rates we support. -const int kRates[] = { 8000, 12000, 16000, 24000, 32000 }; +const int kRates[] = { 8000, 12000, 16000, 24000, 32000, 48000 }; const size_t kRatesSize = sizeof(kRates) / sizeof(*kRates); // Frame lengths we support. 
-const int kMaxFrameLength = 960; -const int kFrameLengths[] = { 80, 120, 160, 240, 320, 480, 640, +const int kMaxFrameLength = 1440; +const int kFrameLengths[] = { 80, 120, 160, 240, 320, 480, 640, 960, kMaxFrameLength }; const size_t kFrameLengthsSize = sizeof(kFrameLengths) / sizeof(*kFrameLengths); diff --git a/src/common_audio/vad/webrtc_vad.c b/src/common_audio/vad/webrtc_vad.c index 69992110d..dad9d7389 100644 --- a/src/common_audio/vad/webrtc_vad.c +++ b/src/common_audio/vad/webrtc_vad.c @@ -18,7 +18,7 @@ #include "typedefs.h" static const int kInitCheck = 42; -static const int kValidRates[] = { 8000, 16000, 32000 }; +static const int kValidRates[] = { 8000, 16000, 32000, 48000 }; static const size_t kRatesSize = sizeof(kValidRates) / sizeof(*kValidRates); static const int kMaxFrameLengthMs = 30; @@ -93,7 +93,9 @@ int WebRtcVad_Process(VadInst* handle, int fs, int16_t* audio_frame, return -1; } - if (fs == 32000) { + if (fs == 48000) { + vad = WebRtcVad_CalcVad48khz(self, audio_frame, frame_length); + } else if (fs == 32000) { vad = WebRtcVad_CalcVad32khz(self, audio_frame, frame_length); } else if (fs == 16000) { vad = WebRtcVad_CalcVad16khz(self, audio_frame, frame_length);