Modify the _vadActivity member of the AudioFrame passed to AudioProcessing.

This saves the user from having to call stream_has_voice() explicitly to obtain the VAD decision. It also allows typing detection, which relies on _vadActivity being updated, to function. Review URL: http://webrtc-codereview.appspot.com/144004 git-svn-id: http://webrtc.googlecode.com/svn/trunk@621 4adac7df-926f-26a2-2b94-8c16560cd09d
2011-09-19 15:28:51 +00:00 · 2011-09-19 15:28:51 +00:00 · ed083d4079
commit ed083d4079
parent 94c7413b0d
5 changed files with 122 additions and 100 deletions
--- a/src/modules/audio_processing/main/interface/audio_processing.h
+++ b/src/modules/audio_processing/main/interface/audio_processing.h
@@ -486,6 +486,7 @@ class HighPassFilter {
 };

 // An estimation component used to retrieve level metrics.
+// NOTE: currently unavailable. All methods return errors.
 class LevelEstimator {
 public:
  virtual int Enable(bool enable) = 0;
@@ -539,6 +540,10 @@ class NoiseSuppression {
 // The voice activity detection (VAD) component analyzes the stream to
 // determine if voice is present. A facility is also provided to pass in an
 // external VAD decision.
+//
+// In addition to |stream_has_voice()| the VAD decision is provided through the
+// |AudioFrame| passed to |ProcessStream()|. The |_vadActivity| member will be
+// modified to reflect the current decision.
 class VoiceDetection {
 public:
  virtual int Enable(bool enable) = 0;
--- a/src/modules/audio_processing/main/source/audio_buffer.cc
+++ b/src/modules/audio_processing/main/source/audio_buffer.cc
@@ -10,8 +10,6 @@

 #include "audio_buffer.h"

-#include "module_common_types.h"
-
 namespace webrtc {
 namespace {

@@ -64,21 +62,22 @@ struct SplitAudioChannel {
  WebRtc_Word32 synthesis_filter_state2[6];
 };

-// TODO(am): check range of input parameters?
-AudioBuffer::AudioBuffer(WebRtc_Word32 max_num_channels,
-                         WebRtc_Word32 samples_per_channel)
-    : max_num_channels_(max_num_channels),
-      num_channels_(0),
-      num_mixed_channels_(0),
-      num_mixed_low_pass_channels_(0),
-      samples_per_channel_(samples_per_channel),
-      samples_per_split_channel_(samples_per_channel),
-      reference_copied_(false),
-      data_(NULL),
-      channels_(NULL),
-      split_channels_(NULL),
-      mixed_low_pass_channels_(NULL),
-      low_pass_reference_channels_(NULL) {
+// TODO(andrew): check range of input parameters?
+AudioBuffer::AudioBuffer(int max_num_channels,
+                         int samples_per_channel)
+  : max_num_channels_(max_num_channels),
+    num_channels_(0),
+    num_mixed_channels_(0),
+    num_mixed_low_pass_channels_(0),
+    samples_per_channel_(samples_per_channel),
+    samples_per_split_channel_(samples_per_channel),
+    reference_copied_(false),
+    activity_(AudioFrame::kVadUnknown),
+    data_(NULL),
+    channels_(NULL),
+    split_channels_(NULL),
+    mixed_low_pass_channels_(NULL),
+    low_pass_reference_channels_(NULL) {
  if (max_num_channels_ > 1) {
    channels_ = new AudioChannel[max_num_channels_];
    mixed_low_pass_channels_ = new AudioChannel[max_num_channels_];
@@ -109,7 +108,7 @@ AudioBuffer::~AudioBuffer() {
  }
 }

-WebRtc_Word16* AudioBuffer::data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::data(int channel) const {
  assert(channel >= 0 && channel < num_channels_);
  if (data_ != NULL) {
    return data_;
@@ -118,7 +117,7 @@ WebRtc_Word16* AudioBuffer::data(WebRtc_Word32 channel) const {
  return channels_[channel].data;
 }

-WebRtc_Word16* AudioBuffer::low_pass_split_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::low_pass_split_data(int channel) const {
  assert(channel >= 0 && channel < num_channels_);
  if (split_channels_ == NULL) {
    return data(channel);
@@ -127,7 +126,7 @@ WebRtc_Word16* AudioBuffer::low_pass_split_data(WebRtc_Word32 channel) const {
  return split_channels_[channel].low_pass_data;
 }

-WebRtc_Word16* AudioBuffer::high_pass_split_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::high_pass_split_data(int channel) const {
  assert(channel >= 0 && channel < num_channels_);
  if (split_channels_ == NULL) {
    return NULL;
@@ -136,13 +135,13 @@ WebRtc_Word16* AudioBuffer::high_pass_split_data(WebRtc_Word32 channel) const {
  return split_channels_[channel].high_pass_data;
 }

-WebRtc_Word16* AudioBuffer::mixed_low_pass_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::mixed_low_pass_data(int channel) const {
  assert(channel >= 0 && channel < num_mixed_low_pass_channels_);

  return mixed_low_pass_channels_[channel].data;
 }

-WebRtc_Word16* AudioBuffer::low_pass_reference(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::low_pass_reference(int channel) const {
  assert(channel >= 0 && channel < num_channels_);
  if (!reference_copied_) {
    return NULL;
@@ -151,58 +150,67 @@ WebRtc_Word16* AudioBuffer::low_pass_reference(WebRtc_Word32 channel) const {
  return low_pass_reference_channels_[channel].data;
 }

-WebRtc_Word32* AudioBuffer::analysis_filter_state1(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::analysis_filter_state1(int channel) const {
  assert(channel >= 0 && channel < num_channels_);
  return split_channels_[channel].analysis_filter_state1;
 }

-WebRtc_Word32* AudioBuffer::analysis_filter_state2(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::analysis_filter_state2(int channel) const {
  assert(channel >= 0 && channel < num_channels_);
  return split_channels_[channel].analysis_filter_state2;
 }

-WebRtc_Word32* AudioBuffer::synthesis_filter_state1(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::synthesis_filter_state1(int channel) const {
  assert(channel >= 0 && channel < num_channels_);
  return split_channels_[channel].synthesis_filter_state1;
 }

-WebRtc_Word32* AudioBuffer::synthesis_filter_state2(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::synthesis_filter_state2(int channel) const {
  assert(channel >= 0 && channel < num_channels_);
  return split_channels_[channel].synthesis_filter_state2;
 }

-WebRtc_Word32 AudioBuffer::num_channels() const {
+void AudioBuffer::set_activity(AudioFrame::VADActivity activity) {
+  activity_ = activity;
+}
+
+AudioFrame::VADActivity AudioBuffer::activity() {
+  return activity_;
+}
+
+int AudioBuffer::num_channels() const {
  return num_channels_;
 }

-WebRtc_Word32 AudioBuffer::samples_per_channel() const {
+int AudioBuffer::samples_per_channel() const {
  return samples_per_channel_;
 }

-WebRtc_Word32 AudioBuffer::samples_per_split_channel() const {
+int AudioBuffer::samples_per_split_channel() const {
  return samples_per_split_channel_;
 }

-// TODO(ajm): Do deinterleaving and mixing in one step?
-void AudioBuffer::DeinterleaveFrom(AudioFrame* audioFrame) {
-  assert(audioFrame->_audioChannel <= max_num_channels_);
-  assert(audioFrame->_payloadDataLengthInSamples ==  samples_per_channel_);
+// TODO(andrew): Do deinterleaving and mixing in one step?
+void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) {
+  assert(frame->_audioChannel <= max_num_channels_);
+  assert(frame->_payloadDataLengthInSamples ==  samples_per_channel_);

-  num_channels_ = audioFrame->_audioChannel;
+  num_channels_ = frame->_audioChannel;
  num_mixed_channels_ = 0;
  num_mixed_low_pass_channels_ = 0;
  reference_copied_ = false;
+  activity_ = frame->_vadActivity;

  if (num_channels_ == 1) {
    // We can get away with a pointer assignment in this case.
-    data_ = audioFrame->_payloadData;
+    data_ = frame->_payloadData;
    return;
  }

+  WebRtc_Word16* interleaved = frame->_payloadData;
  for (int i = 0; i < num_channels_; i++) {
    WebRtc_Word16* deinterleaved = channels_[i].data;
-    WebRtc_Word16* interleaved = audioFrame->_payloadData;
-    WebRtc_Word32 interleaved_idx = i;
+    int interleaved_idx = i;
    for (int j = 0; j < samples_per_channel_; j++) {
      deinterleaved[j] = interleaved[interleaved_idx];
      interleaved_idx += num_channels_;
@@ -210,27 +218,28 @@ void AudioBuffer::DeinterleaveFrom(AudioFrame* audioFrame) {
  }
 }

-void AudioBuffer::InterleaveTo(AudioFrame* audioFrame) const {
-  assert(audioFrame->_audioChannel == num_channels_);
-  assert(audioFrame->_payloadDataLengthInSamples == samples_per_channel_);
+void AudioBuffer::InterleaveTo(AudioFrame* frame) const {
+  assert(frame->_audioChannel == num_channels_);
+  assert(frame->_payloadDataLengthInSamples == samples_per_channel_);
+  frame->_vadActivity = activity_;

  if (num_channels_ == 1) {
    if (num_mixed_channels_ == 1) {
-      memcpy(audioFrame->_payloadData,
+      memcpy(frame->_payloadData,
             channels_[0].data,
             sizeof(WebRtc_Word16) * samples_per_channel_);
    } else {
      // These should point to the same buffer in this case.
-      assert(data_ == audioFrame->_payloadData);
+      assert(data_ == frame->_payloadData);
    }

    return;
  }

+  WebRtc_Word16* interleaved = frame->_payloadData;
  for (int i = 0; i < num_channels_; i++) {
    WebRtc_Word16* deinterleaved = channels_[i].data;
-    WebRtc_Word16* interleaved = audioFrame->_payloadData;
-    WebRtc_Word32 interleaved_idx = i;
+    int interleaved_idx = i;
    for (int j = 0; j < samples_per_channel_; j++) {
      interleaved[interleaved_idx] = deinterleaved[j];
      interleaved_idx += num_channels_;
@@ -238,9 +247,10 @@ void AudioBuffer::InterleaveTo(AudioFrame* audioFrame) const {
  }
 }

-// TODO(ajm): would be good to support the no-mix case with pointer assignment.
-// TODO(ajm): handle mixing to multiple channels?
-void AudioBuffer::Mix(WebRtc_Word32 num_mixed_channels) {
+// TODO(andrew): would be good to support the no-mix case with pointer
+// assignment.
+// TODO(andrew): handle mixing to multiple channels?
+void AudioBuffer::Mix(int num_mixed_channels) {
  // We currently only support the stereo to mono case.
  assert(num_channels_ == 2);
  assert(num_mixed_channels == 1);
@@ -254,7 +264,7 @@ void AudioBuffer::Mix(WebRtc_Word32 num_mixed_channels) {
  num_mixed_channels_ = num_mixed_channels;
 }

-void AudioBuffer::CopyAndMixLowPass(WebRtc_Word32 num_mixed_channels) {
+void AudioBuffer::CopyAndMixLowPass(int num_mixed_channels) {
  // We currently only support the stereo to mono case.
  assert(num_channels_ == 2);
  assert(num_mixed_channels == 1);
--- a/src/modules/audio_processing/main/source/audio_buffer.h
+++ b/src/modules/audio_processing/main/source/audio_buffer.h
@@ -11,55 +11,58 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_SOURCE_AUDIO_BUFFER_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_SOURCE_AUDIO_BUFFER_H_

+#include "module_common_types.h"
 #include "typedefs.h"

-
 namespace webrtc {

 struct AudioChannel;
 struct SplitAudioChannel;
-class AudioFrame;

 class AudioBuffer {
 public:
-  AudioBuffer(WebRtc_Word32 max_num_channels, WebRtc_Word32 samples_per_channel);
+  AudioBuffer(int max_num_channels, int samples_per_channel);
  virtual ~AudioBuffer();

-  WebRtc_Word32 num_channels() const;
-  WebRtc_Word32 samples_per_channel() const;
-  WebRtc_Word32 samples_per_split_channel() const;
+  int num_channels() const;
+  int samples_per_channel() const;
+  int samples_per_split_channel() const;

-  WebRtc_Word16* data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* low_pass_split_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* high_pass_split_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* mixed_low_pass_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* low_pass_reference(WebRtc_Word32 channel) const;
+  WebRtc_Word16* data(int channel) const;
+  WebRtc_Word16* low_pass_split_data(int channel) const;
+  WebRtc_Word16* high_pass_split_data(int channel) const;
+  WebRtc_Word16* mixed_low_pass_data(int channel) const;
+  WebRtc_Word16* low_pass_reference(int channel) const;

-  WebRtc_Word32* analysis_filter_state1(WebRtc_Word32 channel) const;
-  WebRtc_Word32* analysis_filter_state2(WebRtc_Word32 channel) const;
-  WebRtc_Word32* synthesis_filter_state1(WebRtc_Word32 channel) const;
-  WebRtc_Word32* synthesis_filter_state2(WebRtc_Word32 channel) const;
+  WebRtc_Word32* analysis_filter_state1(int channel) const;
+  WebRtc_Word32* analysis_filter_state2(int channel) const;
+  WebRtc_Word32* synthesis_filter_state1(int channel) const;
+  WebRtc_Word32* synthesis_filter_state2(int channel) const;
+
+  void set_activity(AudioFrame::VADActivity activity);
+  AudioFrame::VADActivity activity();

  void DeinterleaveFrom(AudioFrame* audioFrame);
  void InterleaveTo(AudioFrame* audioFrame) const;
-  void Mix(WebRtc_Word32 num_mixed_channels);
-  void CopyAndMixLowPass(WebRtc_Word32 num_mixed_channels);
+  void Mix(int num_mixed_channels);
+  void CopyAndMixLowPass(int num_mixed_channels);
  void CopyLowPassToReference();

 private:
-  const WebRtc_Word32 max_num_channels_;
-  WebRtc_Word32 num_channels_;
-  WebRtc_Word32 num_mixed_channels_;
-  WebRtc_Word32 num_mixed_low_pass_channels_;
-  const WebRtc_Word32 samples_per_channel_;
-  WebRtc_Word32 samples_per_split_channel_;
+  const int max_num_channels_;
+  int num_channels_;
+  int num_mixed_channels_;
+  int num_mixed_low_pass_channels_;
+  const int samples_per_channel_;
+  int samples_per_split_channel_;
  bool reference_copied_;
+  AudioFrame::VADActivity activity_;

  WebRtc_Word16* data_;
-  // TODO(ajm): Prefer to make these vectors if permitted...
+  // TODO(andrew): use vectors here.
  AudioChannel* channels_;
  SplitAudioChannel* split_channels_;
-  // TODO(ajm): improve this, we don't need the full 32 kHz space here.
+  // TODO(andrew): improve this, we don't need the full 32 kHz space here.
  AudioChannel* mixed_low_pass_channels_;
  AudioChannel* low_pass_reference_channels_;
 };
--- a/src/modules/audio_processing/main/source/voice_detection_impl.cc
+++ b/src/modules/audio_processing/main/source/voice_detection_impl.cc
@@ -74,16 +74,16 @@ int VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {

  // TODO(ajm): concatenate data in frame buffer here.

-  int vad_ret_val;
-  vad_ret_val = WebRtcVad_Process(static_cast<Handle*>(handle(0)),
-                      apm_->split_sample_rate_hz(),
-                      mixed_data,
-                      frame_size_samples_);
-
-  if (vad_ret_val == 0) {
+  int vad_ret = WebRtcVad_Process(static_cast<Handle*>(handle(0)),
+                                  apm_->split_sample_rate_hz(),
+                                  mixed_data,
+                                  frame_size_samples_);
+  if (vad_ret == 0) {
    stream_has_voice_ = false;
-  } else if (vad_ret_val == 1) {
+    audio->set_activity(AudioFrame::kVadPassive);
+  } else if (vad_ret == 1) {
    stream_has_voice_ = true;
+    audio->set_activity(AudioFrame::kVadActive);
  } else {
    return apm_->kUnspecifiedError;
  }
--- a/src/modules/audio_processing/main/test/unit_test/unit_test.cc
+++ b/src/modules/audio_processing/main/test/unit_test/unit_test.cc
@@ -555,6 +555,7 @@ TEST_F(ApmTest, Process) {
               &temp_data[0],
               sizeof(WebRtc_Word16) * read_count);
      }
+      frame_->_vadActivity = AudioFrame::kVadUnknown;

      EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));

@@ -571,6 +572,9 @@ TEST_F(ApmTest, Process) {
      }
      if (apm_->voice_detection()->stream_has_voice()) {
        has_voice_count++;
+        EXPECT_EQ(AudioFrame::kVadActive, frame_->_vadActivity);
+      } else {
+        EXPECT_EQ(AudioFrame::kVadPassive, frame_->_vadActivity);
      }

      frame_count++;
@@ -966,27 +970,27 @@ TEST_F(ApmTest, VoiceDetection) {
  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
  EXPECT_FALSE(apm_->voice_detection()->is_enabled());

+  // Test that AudioFrame activity is maintained when VAD is disabled.
+  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
+  AudioFrame::VADActivity activity[] = {
+      AudioFrame::kVadActive,
+      AudioFrame::kVadPassive,
+      AudioFrame::kVadUnknown
+  };
+  for (size_t i = 0; i < sizeof(activity)/sizeof(*activity); i++) {
+    frame_->_vadActivity = activity[i];
+    EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
+    EXPECT_EQ(activity[i], frame_->_vadActivity);
+  }
+
+  // Test that AudioFrame activity is set when VAD is enabled.
+  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
+  frame_->_vadActivity = AudioFrame::kVadUnknown;
+  EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
+  EXPECT_NE(AudioFrame::kVadUnknown, frame_->_vadActivity);
+
  // TODO(bjornv): Add tests for streamed voice; stream_has_voice()
 }
-
-// Below are some ideas for tests from VPM.
-
-/*TEST_F(VideoProcessingModuleTest, GetVersionTest)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, HandleNullBuffer)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, HandleBadSize)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, IdenticalResultsAfterReset)
-{
-}
-*/
 }  // namespace

 int main(int argc, char** argv) {