Modify the _vadActivity member of the AudioFrame passed to AudioProcessing.

This saves the user from having to explicitly check stream_has_voice(). It also allows typing detection, which relies on this behaviour, to function.
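For illustration, a minimal caller-side sketch of the new behaviour; the apm/frame setup is assumed and not part of this change:

    // After this change ProcessStream() stamps the VAD decision onto the
    // frame itself, so polling stream_has_voice() is optional.
    apm->voice_detection()->Enable(true);
    if (apm->ProcessStream(&frame) == AudioProcessing::kNoError &&
        frame._vadActivity == AudioFrame::kVadActive) {
      // Voice was detected in this frame.
    }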
Review URL: http://webrtc-codereview.appspot.com/144004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@621 4adac7df-926f-26a2-2b94-8c16560cd09d
andrew@webrtc.org 2011-09-19 15:28:51 +00:00
parent 94c7413b0d
commit ed083d4079
5 changed files with 122 additions and 100 deletions

File: audio_processing.h

@@ -486,6 +486,7 @@ class HighPassFilter {
 };
 
 // An estimation component used to retrieve level metrics.
+// NOTE: currently unavailable. All methods return errors.
 class LevelEstimator {
  public:
   virtual int Enable(bool enable) = 0;
@@ -539,6 +540,10 @@ class NoiseSuppression {
 // The voice activity detection (VAD) component analyzes the stream to
 // determine if voice is present. A facility is also provided to pass in an
 // external VAD decision.
+//
+// In addition to |stream_has_voice()| the VAD decision is provided through the
+// |AudioFrame| passed to |ProcessStream()|. The |_vadActivity| member will be
+// modified to reflect the current decision.
 class VoiceDetection {
  public:
   virtual int Enable(bool enable) = 0;
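The comment above also mentions passing in an external VAD decision. A sketch of that path, assuming the interface's set_stream_has_voice() method (it is not shown in this hunk):

    // Sketch: supplying an external VAD decision instead of the built-in one.
    // MyExternalVad() is a hypothetical detector.
    bool voice = MyExternalVad(frame);
    apm->voice_detection()->set_stream_has_voice(voice);
    // stream_has_voice() now reflects the externally supplied decision.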

File: audio_buffer.cc

@@ -10,8 +10,6 @@
 #include "audio_buffer.h"
 
-#include "module_common_types.h"
-
 namespace webrtc {
 namespace {
@@ -64,21 +62,22 @@ struct SplitAudioChannel {
   WebRtc_Word32 synthesis_filter_state2[6];
 };
 
-// TODO(ajm): check range of input parameters?
-AudioBuffer::AudioBuffer(WebRtc_Word32 max_num_channels,
-                         WebRtc_Word32 samples_per_channel)
+// TODO(andrew): check range of input parameters?
+AudioBuffer::AudioBuffer(int max_num_channels,
+                         int samples_per_channel)
   : max_num_channels_(max_num_channels),
     num_channels_(0),
     num_mixed_channels_(0),
     num_mixed_low_pass_channels_(0),
     samples_per_channel_(samples_per_channel),
     samples_per_split_channel_(samples_per_channel),
     reference_copied_(false),
+    activity_(AudioFrame::kVadUnknown),
    data_(NULL),
     channels_(NULL),
     split_channels_(NULL),
     mixed_low_pass_channels_(NULL),
     low_pass_reference_channels_(NULL) {
   if (max_num_channels_ > 1) {
     channels_ = new AudioChannel[max_num_channels_];
     mixed_low_pass_channels_ = new AudioChannel[max_num_channels_];
@@ -109,7 +108,7 @@ AudioBuffer::~AudioBuffer() {
   }
 }
 
-WebRtc_Word16* AudioBuffer::data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::data(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   if (data_ != NULL) {
     return data_;
@@ -118,7 +117,7 @@ WebRtc_Word16* AudioBuffer::data(WebRtc_Word32 channel) const {
   return channels_[channel].data;
 }
 
-WebRtc_Word16* AudioBuffer::low_pass_split_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::low_pass_split_data(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   if (split_channels_ == NULL) {
     return data(channel);
@@ -127,7 +126,7 @@ WebRtc_Word16* AudioBuffer::low_pass_split_data(WebRtc_Word32 channel) const {
   return split_channels_[channel].low_pass_data;
 }
 
-WebRtc_Word16* AudioBuffer::high_pass_split_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::high_pass_split_data(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   if (split_channels_ == NULL) {
     return NULL;
@@ -136,13 +135,13 @@ WebRtc_Word16* AudioBuffer::high_pass_split_data(WebRtc_Word32 channel) const {
   return split_channels_[channel].high_pass_data;
 }
 
-WebRtc_Word16* AudioBuffer::mixed_low_pass_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::mixed_low_pass_data(int channel) const {
   assert(channel >= 0 && channel < num_mixed_low_pass_channels_);
 
   return mixed_low_pass_channels_[channel].data;
 }
 
-WebRtc_Word16* AudioBuffer::low_pass_reference(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::low_pass_reference(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   if (!reference_copied_) {
     return NULL;
@@ -151,58 +150,67 @@ WebRtc_Word16* AudioBuffer::low_pass_reference(WebRtc_Word32 channel) const {
   return low_pass_reference_channels_[channel].data;
 }
 
-WebRtc_Word32* AudioBuffer::analysis_filter_state1(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::analysis_filter_state1(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   return split_channels_[channel].analysis_filter_state1;
 }
 
-WebRtc_Word32* AudioBuffer::analysis_filter_state2(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::analysis_filter_state2(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   return split_channels_[channel].analysis_filter_state2;
 }
 
-WebRtc_Word32* AudioBuffer::synthesis_filter_state1(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::synthesis_filter_state1(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   return split_channels_[channel].synthesis_filter_state1;
 }
 
-WebRtc_Word32* AudioBuffer::synthesis_filter_state2(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::synthesis_filter_state2(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   return split_channels_[channel].synthesis_filter_state2;
 }
 
-WebRtc_Word32 AudioBuffer::num_channels() const {
+void AudioBuffer::set_activity(AudioFrame::VADActivity activity) {
+  activity_ = activity;
+}
+
+AudioFrame::VADActivity AudioBuffer::activity() {
+  return activity_;
+}
+
+int AudioBuffer::num_channels() const {
   return num_channels_;
 }
 
-WebRtc_Word32 AudioBuffer::samples_per_channel() const {
+int AudioBuffer::samples_per_channel() const {
   return samples_per_channel_;
 }
 
-WebRtc_Word32 AudioBuffer::samples_per_split_channel() const {
+int AudioBuffer::samples_per_split_channel() const {
   return samples_per_split_channel_;
 }
 
-// TODO(ajm): Do deinterleaving and mixing in one step?
-void AudioBuffer::DeinterleaveFrom(AudioFrame* audioFrame) {
-  assert(audioFrame->_audioChannel <= max_num_channels_);
-  assert(audioFrame->_payloadDataLengthInSamples == samples_per_channel_);
+// TODO(andrew): Do deinterleaving and mixing in one step?
+void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) {
+  assert(frame->_audioChannel <= max_num_channels_);
+  assert(frame->_payloadDataLengthInSamples == samples_per_channel_);
 
-  num_channels_ = audioFrame->_audioChannel;
+  num_channels_ = frame->_audioChannel;
   num_mixed_channels_ = 0;
   num_mixed_low_pass_channels_ = 0;
   reference_copied_ = false;
+  activity_ = frame->_vadActivity;
 
   if (num_channels_ == 1) {
     // We can get away with a pointer assignment in this case.
-    data_ = audioFrame->_payloadData;
+    data_ = frame->_payloadData;
     return;
   }
 
+  WebRtc_Word16* interleaved = frame->_payloadData;
   for (int i = 0; i < num_channels_; i++) {
     WebRtc_Word16* deinterleaved = channels_[i].data;
-    WebRtc_Word16* interleaved = audioFrame->_payloadData;
-    WebRtc_Word32 interleaved_idx = i;
+    int interleaved_idx = i;
     for (int j = 0; j < samples_per_channel_; j++) {
       deinterleaved[j] = interleaved[interleaved_idx];
       interleaved_idx += num_channels_;
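The strided indexing above (interleaved_idx += num_channels_) is the core of the deinterleave. A standalone illustration of the same arithmetic for a stereo buffer, using the WebRtc_Word16 type from typedefs.h:

    // Illustration only, not part of the diff: three stereo sample pairs.
    WebRtc_Word16 interleaved[] = {10, -10, 20, -20, 30, -30};  // L0,R0,L1,R1,L2,R2
    WebRtc_Word16 left[3];
    WebRtc_Word16 right[3];
    const int num_channels = 2;
    for (int j = 0; j < 3; j++) {
      left[j] = interleaved[j * num_channels];       // 10, 20, 30
      right[j] = interleaved[j * num_channels + 1];  // -10, -20, -30
    }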
@@ -210,27 +218,28 @@ void AudioBuffer::DeinterleaveFrom(AudioFrame* audioFrame) {
   }
 }
 
-void AudioBuffer::InterleaveTo(AudioFrame* audioFrame) const {
-  assert(audioFrame->_audioChannel == num_channels_);
-  assert(audioFrame->_payloadDataLengthInSamples == samples_per_channel_);
+void AudioBuffer::InterleaveTo(AudioFrame* frame) const {
+  assert(frame->_audioChannel == num_channels_);
+  assert(frame->_payloadDataLengthInSamples == samples_per_channel_);
+  frame->_vadActivity = activity_;
 
   if (num_channels_ == 1) {
     if (num_mixed_channels_ == 1) {
-      memcpy(audioFrame->_payloadData,
+      memcpy(frame->_payloadData,
              channels_[0].data,
              sizeof(WebRtc_Word16) * samples_per_channel_);
     } else {
       // These should point to the same buffer in this case.
-      assert(data_ == audioFrame->_payloadData);
+      assert(data_ == frame->_payloadData);
     }
 
     return;
   }
 
+  WebRtc_Word16* interleaved = frame->_payloadData;
   for (int i = 0; i < num_channels_; i++) {
     WebRtc_Word16* deinterleaved = channels_[i].data;
-    WebRtc_Word16* interleaved = audioFrame->_payloadData;
-    WebRtc_Word32 interleaved_idx = i;
+    int interleaved_idx = i;
     for (int j = 0; j < samples_per_channel_; j++) {
       interleaved[interleaved_idx] = deinterleaved[j];
       interleaved_idx += num_channels_;
@@ -238,9 +247,10 @@ void AudioBuffer::InterleaveTo(AudioFrame* audioFrame) const {
   }
 }
 
-// TODO(ajm): would be good to support the no-mix case with pointer assignment.
-// TODO(ajm): handle mixing to multiple channels?
-void AudioBuffer::Mix(WebRtc_Word32 num_mixed_channels) {
+// TODO(andrew): would be good to support the no-mix case with pointer
+// assignment.
+// TODO(andrew): handle mixing to multiple channels?
+void AudioBuffer::Mix(int num_mixed_channels) {
   // We currently only support the stereo to mono case.
   assert(num_channels_ == 2);
   assert(num_mixed_channels == 1);
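The body of Mix() falls outside this hunk. Purely as a sketch of the stereo-to-mono case the asserts enforce (the committed implementation may differ), a per-sample average looks like:

    // Sketch, not the committed code: mix two channels down to mono.
    // The 32-bit intermediate avoids overflow before the halving shift;
    // 'mono' stands in for whatever buffer receives the mix.
    for (int j = 0; j < samples_per_channel_; j++) {
      WebRtc_Word32 sum = channels_[0].data[j] + channels_[1].data[j];
      mono[j] = static_cast<WebRtc_Word16>(sum >> 1);
    }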
@@ -254,7 +264,7 @@ void AudioBuffer::Mix(WebRtc_Word32 num_mixed_channels) {
   num_mixed_channels_ = num_mixed_channels;
 }
 
-void AudioBuffer::CopyAndMixLowPass(WebRtc_Word32 num_mixed_channels) {
+void AudioBuffer::CopyAndMixLowPass(int num_mixed_channels) {
   // We currently only support the stereo to mono case.
   assert(num_channels_ == 2);
   assert(num_mixed_channels == 1);

File: audio_buffer.h

@@ -11,55 +11,58 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_SOURCE_AUDIO_BUFFER_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_SOURCE_AUDIO_BUFFER_H_
 
+#include "module_common_types.h"
 #include "typedefs.h"
 
 namespace webrtc {
 
 struct AudioChannel;
 struct SplitAudioChannel;
-class AudioFrame;
 
 class AudioBuffer {
  public:
-  AudioBuffer(WebRtc_Word32 max_num_channels, WebRtc_Word32 samples_per_channel);
+  AudioBuffer(int max_num_channels, int samples_per_channel);
   virtual ~AudioBuffer();
 
-  WebRtc_Word32 num_channels() const;
-  WebRtc_Word32 samples_per_channel() const;
-  WebRtc_Word32 samples_per_split_channel() const;
+  int num_channels() const;
+  int samples_per_channel() const;
+  int samples_per_split_channel() const;
 
-  WebRtc_Word16* data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* low_pass_split_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* high_pass_split_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* mixed_low_pass_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* low_pass_reference(WebRtc_Word32 channel) const;
+  WebRtc_Word16* data(int channel) const;
+  WebRtc_Word16* low_pass_split_data(int channel) const;
+  WebRtc_Word16* high_pass_split_data(int channel) const;
+  WebRtc_Word16* mixed_low_pass_data(int channel) const;
+  WebRtc_Word16* low_pass_reference(int channel) const;
 
-  WebRtc_Word32* analysis_filter_state1(WebRtc_Word32 channel) const;
-  WebRtc_Word32* analysis_filter_state2(WebRtc_Word32 channel) const;
-  WebRtc_Word32* synthesis_filter_state1(WebRtc_Word32 channel) const;
-  WebRtc_Word32* synthesis_filter_state2(WebRtc_Word32 channel) const;
+  WebRtc_Word32* analysis_filter_state1(int channel) const;
+  WebRtc_Word32* analysis_filter_state2(int channel) const;
+  WebRtc_Word32* synthesis_filter_state1(int channel) const;
+  WebRtc_Word32* synthesis_filter_state2(int channel) const;
+
+  void set_activity(AudioFrame::VADActivity activity);
+  AudioFrame::VADActivity activity();
 
   void DeinterleaveFrom(AudioFrame* audioFrame);
   void InterleaveTo(AudioFrame* audioFrame) const;
-  void Mix(WebRtc_Word32 num_mixed_channels);
-  void CopyAndMixLowPass(WebRtc_Word32 num_mixed_channels);
+  void Mix(int num_mixed_channels);
+  void CopyAndMixLowPass(int num_mixed_channels);
   void CopyLowPassToReference();
 
  private:
-  const WebRtc_Word32 max_num_channels_;
-  WebRtc_Word32 num_channels_;
-  WebRtc_Word32 num_mixed_channels_;
-  WebRtc_Word32 num_mixed_low_pass_channels_;
-  const WebRtc_Word32 samples_per_channel_;
-  WebRtc_Word32 samples_per_split_channel_;
+  const int max_num_channels_;
+  int num_channels_;
+  int num_mixed_channels_;
+  int num_mixed_low_pass_channels_;
+  const int samples_per_channel_;
+  int samples_per_split_channel_;
   bool reference_copied_;
+  AudioFrame::VADActivity activity_;
 
   WebRtc_Word16* data_;
-  // TODO(ajm): Prefer to make these vectors if permitted...
+  // TODO(andrew): use vectors here.
   AudioChannel* channels_;
   SplitAudioChannel* split_channels_;
-  // TODO(ajm): improve this, we don't need the full 32 kHz space here.
+  // TODO(andrew): improve this, we don't need the full 32 kHz space here.
   AudioChannel* mixed_low_pass_channels_;
   AudioChannel* low_pass_reference_channels_;
 };

File: voice_detection_impl.cc

@@ -74,16 +74,16 @@ int VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
   // TODO(ajm): concatenate data in frame buffer here.
 
-  int vad_ret_val;
-  vad_ret_val = WebRtcVad_Process(static_cast<Handle*>(handle(0)),
-                                  apm_->split_sample_rate_hz(),
-                                  mixed_data,
-                                  frame_size_samples_);
-  if (vad_ret_val == 0) {
+  int vad_ret = WebRtcVad_Process(static_cast<Handle*>(handle(0)),
+                                  apm_->split_sample_rate_hz(),
+                                  mixed_data,
+                                  frame_size_samples_);
+  if (vad_ret == 0) {
     stream_has_voice_ = false;
-  } else if (vad_ret_val == 1) {
+    audio->set_activity(AudioFrame::kVadPassive);
+  } else if (vad_ret == 1) {
     stream_has_voice_ = true;
+    audio->set_activity(AudioFrame::kVadActive);
   } else {
     return apm_->kUnspecifiedError;
   }
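For clarity, the return-value mapping implemented above, restated as a standalone helper. The helper name is hypothetical; the diff inlines this logic, treating 0 as no voice, 1 as voice, and anything else as an error:

    static int MapVadResult(int vad_ret, bool* has_voice,
                            AudioFrame::VADActivity* activity) {
      if (vad_ret == 0) {
        *has_voice = false;
        *activity = AudioFrame::kVadPassive;
      } else if (vad_ret == 1) {
        *has_voice = true;
        *activity = AudioFrame::kVadActive;
      } else {
        return -1;  // unexpected VAD return value
      }
      return 0;
    }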

File: audio_processing_unittest.cc

@@ -555,6 +555,7 @@ TEST_F(ApmTest, Process) {
              &temp_data[0],
              sizeof(WebRtc_Word16) * read_count);
     }
+    frame_->_vadActivity = AudioFrame::kVadUnknown;
 
     EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
@@ -571,6 +572,9 @@ TEST_F(ApmTest, Process) {
     }
     if (apm_->voice_detection()->stream_has_voice()) {
       has_voice_count++;
+      EXPECT_EQ(AudioFrame::kVadActive, frame_->_vadActivity);
+    } else {
+      EXPECT_EQ(AudioFrame::kVadPassive, frame_->_vadActivity);
     }
 
     frame_count++;
@@ -966,27 +970,27 @@ TEST_F(ApmTest, VoiceDetection) {
   EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
   EXPECT_FALSE(apm_->voice_detection()->is_enabled());
 
+  // Test that AudioFrame activity is maintained when VAD is disabled.
+  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
+  AudioFrame::VADActivity activity[] = {
+      AudioFrame::kVadActive,
+      AudioFrame::kVadPassive,
+      AudioFrame::kVadUnknown
+  };
+  for (size_t i = 0; i < sizeof(activity) / sizeof(*activity); i++) {
+    frame_->_vadActivity = activity[i];
+    EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
+    EXPECT_EQ(activity[i], frame_->_vadActivity);
+  }
+
+  // Test that AudioFrame activity is set when VAD is enabled.
+  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
+  frame_->_vadActivity = AudioFrame::kVadUnknown;
+  EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
+  EXPECT_NE(AudioFrame::kVadUnknown, frame_->_vadActivity);
+
   // TODO(bjornv): Add tests for streamed voice; stream_has_voice()
 }
 
-// Below are some ideas for tests from VPM.
-/*TEST_F(VideoProcessingModuleTest, GetVersionTest)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, HandleNullBuffer)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, HandleBadSize)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, IdenticalResultsAfterReset)
-{
-}
-*/
 }  // namespace
 
 int main(int argc, char** argv) {