Add new fast mode for NetEq's Accelerate operation

This change introduces a mode where the Accelerate operation will be more aggressive. When enabled, it will allow acceleration at lower correlation levels, and possibly remove multiple pitch periods at once. The feature is enabled through NetEq::Config, and is off by default. This means that bit-exactness tests are currently not affected. A unit test was added for the Accelerate class, with and without fast mode enabled. BUG=4691 R=minyue@webrtc.org Review URL: https://webrtc-codereview.appspot.com/50039004 Cr-Commit-Position: refs/heads/master@{#9295}
2015-05-27 14:33:29 +02:00 · 2015-05-27 14:33:29 +02:00 · cf808d2366
commit cf808d2366
parent cbe408aa11
13 changed files with 172 additions and 58 deletions
--- a/webrtc/modules/audio_coding/neteq/accelerate.cc
+++ b/webrtc/modules/audio_coding/neteq/accelerate.cc
@ -14,11 +14,11 @@

 namespace webrtc {

-Accelerate::ReturnCodes Accelerate::Process(
-    const int16_t* input,
-    size_t input_length,
-    AudioMultiVector* output,
-    int16_t* length_change_samples) {
+Accelerate::ReturnCodes Accelerate::Process(const int16_t* input,
+                                            size_t input_length,
+                                            bool fast_accelerate,
+                                            AudioMultiVector* output,
+                                            int16_t* length_change_samples) {
  // Input length must be (almost) 30 ms.
  static const int k15ms = 120;  // 15 ms = 120 samples at 8 kHz sample rate.
  if (num_channels_ == 0 || static_cast<int>(input_length) / num_channels_ <
@ -28,7 +28,7 @@ Accelerate::ReturnCodes Accelerate::Process(
    output->PushBackInterleaved(input, input_length);
    return kError;
  }
-  return TimeStretch::Process(input, input_length, output,
+  return TimeStretch::Process(input, input_length, fast_accelerate, output,
                              length_change_samples);
 }

@ -41,17 +41,30 @@ void Accelerate::SetParametersForPassiveSpeech(size_t /*len*/,
 }

 Accelerate::ReturnCodes Accelerate::CheckCriteriaAndStretch(
-    const int16_t* input, size_t input_length, size_t peak_index,
-    int16_t best_correlation, bool active_speech,
+    const int16_t* input,
+    size_t input_length,
+    size_t peak_index,
+    int16_t best_correlation,
+    bool active_speech,
+    bool fast_mode,
    AudioMultiVector* output) const {
  // Check for strong correlation or passive speech.
-  if ((best_correlation > kCorrelationThreshold) || !active_speech) {
+  // Use 8192 (0.5 in Q14) in fast mode.
+  const int correlation_threshold = fast_mode ? 8192 : kCorrelationThreshold;
+  if ((best_correlation > correlation_threshold) || !active_speech) {
    // Do accelerate operation by overlap add.

    // Pre-calculate common multiplication with |fs_mult_|.
    // 120 corresponds to 15 ms.
    size_t fs_mult_120 = fs_mult_ * 120;

+    if (fast_mode) {
+      // Fit as many multiples of |peak_index| as possible in fs_mult_120.
+      // TODO(henrik.lundin) Consider finding multiple correlation peaks and
+      // pick the one with the longest correlation lag in this case.
+      peak_index = (fs_mult_120 / peak_index) * peak_index;
+    }
+
    assert(fs_mult_120 >= peak_index);  // Should be handled in Process().
    // Copy first part; 0 to 15 ms.
    output->PushBackInterleaved(input, fs_mult_120 * num_channels_);
--- a/webrtc/modules/audio_coding/neteq/accelerate.h
+++ b/webrtc/modules/audio_coding/neteq/accelerate.h
@ -38,9 +38,12 @@ class Accelerate : public TimeStretch {
  // read from |input|, of length |input_length| elements, and are written to
  // |output|. The number of samples removed through time-stretching is
  // is provided in the output |length_change_samples|. The method returns
-  // the outcome of the operation as an enumerator value.
+  // the outcome of the operation as an enumerator value. If |fast_accelerate|
+  // is true, the algorithm will relax the requirements on finding strong
+  // correlations, and may remove multiple pitch periods if possible.
  ReturnCodes Process(const int16_t* input,
                      size_t input_length,
+                      bool fast_accelerate,
                      AudioMultiVector* output,
                      int16_t* length_change_samples);

@ -58,6 +61,7 @@ class Accelerate : public TimeStretch {
                                      size_t peak_index,
                                      int16_t best_correlation,
                                      bool active_speech,
+                                      bool fast_mode,
                                      AudioMultiVector* output) const override;

 private:
--- a/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc
+++ b/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc
@ -132,15 +132,13 @@ Operations DecisionLogicNormal::ExpectedPacketAvailable(Modes prev_mode,
    // Check criterion for time-stretching.
    int low_limit, high_limit;
    delay_manager_->BufferLimits(&low_limit, &high_limit);
-    if ((buffer_level_filter_->filtered_current_level() >= high_limit &&
-        TimescaleAllowed()) ||
-        buffer_level_filter_->filtered_current_level() >= high_limit << 2) {
-      // Buffer level higher than limit and time-scaling allowed,
-      // or buffer level really high.
-      return kAccelerate;
-    } else if ((buffer_level_filter_->filtered_current_level() < low_limit)
-        && TimescaleAllowed()) {
-      return kPreemptiveExpand;
+    if (buffer_level_filter_->filtered_current_level() >= high_limit << 2)
+      return kFastAccelerate;
+    if (TimescaleAllowed()) {
+      if (buffer_level_filter_->filtered_current_level() >= high_limit)
+        return kAccelerate;
+      if (buffer_level_filter_->filtered_current_level() < low_limit)
+        return kPreemptiveExpand;
    }
  }
  return kNormal;
--- a/webrtc/modules/audio_coding/neteq/defines.h
+++ b/webrtc/modules/audio_coding/neteq/defines.h
@ -18,6 +18,7 @@ enum Operations {
  kMerge,
  kExpand,
  kAccelerate,
+  kFastAccelerate,
  kPreemptiveExpand,
  kRfc3389Cng,
  kRfc3389CngNoPacket,
--- a/webrtc/modules/audio_coding/neteq/interface/neteq.h
+++ b/webrtc/modules/audio_coding/neteq/interface/neteq.h
@ -80,7 +80,8 @@ class NetEq {
          // |max_delay_ms| has the same effect as calling SetMaximumDelay().
          max_delay_ms(2000),
          background_noise_mode(kBgnOff),
-          playout_mode(kPlayoutOn) {}
+          playout_mode(kPlayoutOn),
+          enable_fast_accelerate(false) {}

    std::string ToString() const;

@ -90,6 +91,7 @@ class NetEq {
    int max_delay_ms;
    BackgroundNoiseMode background_noise_mode;
    NetEqPlayoutMode playout_mode;
+    bool enable_fast_accelerate;
  };

  enum ReturnCodes {
--- a/webrtc/modules/audio_coding/neteq/neteq.cc
+++ b/webrtc/modules/audio_coding/neteq/neteq.cc
@ -34,7 +34,8 @@ std::string NetEq::Config::ToString() const {
     << (enable_audio_classifier ? "true" : "false")
     << ", max_packets_in_buffer=" << max_packets_in_buffer
     << ", background_noise_mode=" << background_noise_mode
-     << ", playout_mode=" << playout_mode;
+     << ", playout_mode=" << playout_mode
+     << ", enable_fast_accelerate=" << enable_fast_accelerate;
  return ss.str();
 }

--- a/webrtc/modules/audio_coding/neteq/neteq_impl.cc
+++ b/webrtc/modules/audio_coding/neteq/neteq_impl.cc
@ -92,6 +92,7 @@ NetEqImpl::NetEqImpl(const NetEq::Config& config,
      decoder_error_code_(0),
      background_noise_mode_(config.background_noise_mode),
      playout_mode_(config.playout_mode),
+      enable_fast_accelerate_(config.enable_fast_accelerate),
      decoded_packet_sequence_number_(-1),
      decoded_packet_timestamp_(0) {
  LOG(LS_INFO) << "NetEq config: " << config.ToString();
@ -745,9 +746,12 @@ int NetEqImpl::GetAudioInternal(size_t max_length, int16_t* output,
      return_value = DoExpand(play_dtmf);
      break;
    }
-    case kAccelerate: {
+    case kAccelerate:
+    case kFastAccelerate: {
+      const bool fast_accelerate =
+          enable_fast_accelerate_ && (operation == kFastAccelerate);
      return_value = DoAccelerate(decoded_buffer_.get(), length, speech_type,
-                                  play_dtmf);
+                                  play_dtmf, fast_accelerate);
      break;
    }
    case kPreemptiveExpand: {
@ -956,9 +960,8 @@ int NetEqImpl::GetDecision(Operations* operation,
  // Check if we already have enough samples in the |sync_buffer_|. If so,
  // change decision to normal, unless the decision was merge, accelerate, or
  // preemptive expand.
-  if (samples_left >= output_size_samples_ &&
-      *operation != kMerge &&
-      *operation != kAccelerate &&
+  if (samples_left >= output_size_samples_ && *operation != kMerge &&
+      *operation != kAccelerate && *operation != kFastAccelerate &&
      *operation != kPreemptiveExpand) {
    *operation = kNormal;
    return 0;
@ -1034,8 +1037,9 @@ int NetEqImpl::GetDecision(Operations* operation,
      decision_logic_->set_generated_noise_samples(0);
      return 0;
    }
-    case kAccelerate: {
-      // In order to do a accelerate we need at least 30 ms of audio data.
+    case kAccelerate:
+    case kFastAccelerate: {
+      // In order to do an accelerate we need at least 30 ms of audio data.
      if (samples_left >= samples_30_ms) {
        // Already have enough data, so we do not need to extract any more.
        decision_logic_->set_sample_memory(samples_left);
@ -1124,13 +1128,13 @@ int NetEqImpl::GetDecision(Operations* operation,
    }
  }

-  if (*operation == kAccelerate ||
+  if (*operation == kAccelerate || *operation == kFastAccelerate ||
      *operation == kPreemptiveExpand) {
    decision_logic_->set_sample_memory(samples_left + extracted_samples);
    decision_logic_->set_prev_time_scale(true);
  }

-  if (*operation == kAccelerate) {
+  if (*operation == kAccelerate || *operation == kFastAccelerate) {
    // Check that we have enough data (30ms) to do accelerate.
    if (extracted_samples + samples_left < samples_30_ms) {
      // TODO(hlundin): Write test for this.
@ -1263,7 +1267,8 @@ int NetEqImpl::DecodeLoop(PacketList* packet_list, Operations* operation,
    assert(sync_buffer_->Channels() == decoder->Channels());
    assert(decoded_buffer_length_ >= kMaxFrameSize * decoder->Channels());
    assert(*operation == kNormal || *operation == kAccelerate ||
-           *operation == kMerge || *operation == kPreemptiveExpand);
+           *operation == kFastAccelerate || *operation == kMerge ||
+           *operation == kPreemptiveExpand);
    packet_list->pop_front();
    size_t payload_length = packet->payload_length;
    int16_t decode_length;
@ -1427,9 +1432,11 @@ int NetEqImpl::DoExpand(bool play_dtmf) {
  return 0;
 }

-int NetEqImpl::DoAccelerate(int16_t* decoded_buffer, size_t decoded_length,
+int NetEqImpl::DoAccelerate(int16_t* decoded_buffer,
+                            size_t decoded_length,
                            AudioDecoder::SpeechType speech_type,
-                            bool play_dtmf) {
+                            bool play_dtmf,
+                            bool fast_accelerate) {
  const size_t required_samples = 240 * fs_mult_;  // Must have 30 ms.
  size_t borrowed_samples_per_channel = 0;
  size_t num_channels = algorithm_buffer_->Channels();
@ -1447,9 +1454,9 @@ int NetEqImpl::DoAccelerate(int16_t* decoded_buffer, size_t decoded_length,
  }

  int16_t samples_removed;
-  Accelerate::ReturnCodes return_code = accelerate_->Process(
-      decoded_buffer, decoded_length, algorithm_buffer_.get(),
-      &samples_removed);
+  Accelerate::ReturnCodes return_code =
+      accelerate_->Process(decoded_buffer, decoded_length, fast_accelerate,
+                           algorithm_buffer_.get(), &samples_removed);
  stats_.AcceleratedSamples(samples_removed);
  switch (return_code) {
    case Accelerate::kSuccess:
--- a/webrtc/modules/audio_coding/neteq/neteq_impl.h
+++ b/webrtc/modules/audio_coding/neteq/neteq_impl.h
@ -278,7 +278,8 @@ class NetEqImpl : public webrtc::NetEq {
  int DoAccelerate(int16_t* decoded_buffer,
                   size_t decoded_length,
                   AudioDecoder::SpeechType speech_type,
-                   bool play_dtmf) EXCLUSIVE_LOCKS_REQUIRED(crit_sect_);
+                   bool play_dtmf,
+                   bool fast_accelerate) EXCLUSIVE_LOCKS_REQUIRED(crit_sect_);

  // Sub-method which calls the PreemptiveExpand class to perform the
  // preemtive expand operation.
@ -392,6 +393,7 @@ class NetEqImpl : public webrtc::NetEq {
  int decoder_error_code_ GUARDED_BY(crit_sect_);
  const BackgroundNoiseMode background_noise_mode_ GUARDED_BY(crit_sect_);
  NetEqPlayoutMode playout_mode_ GUARDED_BY(crit_sect_);
+  bool enable_fast_accelerate_ GUARDED_BY(crit_sect_);

  // These values are used by NACK module to estimate time-to-play of
  // a missing packet. Occasionally, NetEq might decide to decode more
--- a/webrtc/modules/audio_coding/neteq/preemptive_expand.cc
+++ b/webrtc/modules/audio_coding/neteq/preemptive_expand.cc
@ -34,7 +34,8 @@ PreemptiveExpand::ReturnCodes PreemptiveExpand::Process(
    output->PushBackInterleaved(input, input_length);
    return kError;
  }
-  return TimeStretch::Process(input, input_length, output,
+  const bool kFastMode = false;  // Fast mode is not available for PE Expand.
+  return TimeStretch::Process(input, input_length, kFastMode, output,
                              length_change_samples);
 }

@ -54,8 +55,12 @@ void PreemptiveExpand::SetParametersForPassiveSpeech(size_t len,
 }

 PreemptiveExpand::ReturnCodes PreemptiveExpand::CheckCriteriaAndStretch(
-    const int16_t *input, size_t input_length, size_t peak_index,
-    int16_t best_correlation, bool active_speech,
+    const int16_t* input,
+    size_t input_length,
+    size_t peak_index,
+    int16_t best_correlation,
+    bool active_speech,
+    bool /*fast_mode*/,
    AudioMultiVector* output) const {
  // Pre-calculate common multiplication with |fs_mult_|.
  // 120 corresponds to 15 ms.
--- a/webrtc/modules/audio_coding/neteq/preemptive_expand.h
+++ b/webrtc/modules/audio_coding/neteq/preemptive_expand.h
@ -58,11 +58,12 @@ class PreemptiveExpand : public TimeStretch {

  // Checks the criteria for performing the time-stretching operation and,
  // if possible, performs the time-stretching.
-  ReturnCodes CheckCriteriaAndStretch(const int16_t* pw16_decoded,
-                                      size_t len,
-                                      size_t w16_bestIndex,
-                                      int16_t w16_bestCorr,
-                                      bool w16_VAD,
+  ReturnCodes CheckCriteriaAndStretch(const int16_t* input,
+                                      size_t input_length,
+                                      size_t peak_index,
+                                      int16_t best_correlation,
+                                      bool active_speech,
+                                      bool /*fast_mode*/,
                                      AudioMultiVector* output) const override;

 private:
--- a/webrtc/modules/audio_coding/neteq/time_stretch.cc
+++ b/webrtc/modules/audio_coding/neteq/time_stretch.cc
@ -19,12 +19,11 @@

 namespace webrtc {

-TimeStretch::ReturnCodes TimeStretch::Process(
-    const int16_t* input,
-    size_t input_len,
-    AudioMultiVector* output,
-    int16_t* length_change_samples) {
-
+TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
+                                              size_t input_len,
+                                              bool fast_mode,
+                                              AudioMultiVector* output,
+                                              int16_t* length_change_samples) {
  // Pre-calculate common multiplication with |fs_mult_|.
  int fs_mult_120 = fs_mult_ * 120;  // Corresponds to 15 ms.

@ -140,8 +139,9 @@ TimeStretch::ReturnCodes TimeStretch::Process(


  // Check accelerate criteria and stretch the signal.
-  ReturnCodes return_value = CheckCriteriaAndStretch(
-      input, input_len, peak_index, best_correlation, active_speech, output);
+  ReturnCodes return_value =
+      CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
+                              active_speech, fast_mode, output);
  switch (return_value) {
    case kSuccess:
      *length_change_samples = peak_index;
--- a/webrtc/modules/audio_coding/neteq/time_stretch.h
+++ b/webrtc/modules/audio_coding/neteq/time_stretch.h
@ -58,6 +58,7 @@ class TimeStretch {
  // PreemptiveExpand.
  ReturnCodes Process(const int16_t* input,
                      size_t input_len,
+                      bool fast_mode,
                      AudioMultiVector* output,
                      int16_t* length_change_samples);

@ -73,8 +74,12 @@ class TimeStretch {
  // if possible, performs the time-stretching. This method must be implemented
  // by the sub-classes.
  virtual ReturnCodes CheckCriteriaAndStretch(
-      const int16_t* input, size_t input_length, size_t peak_index,
-      int16_t best_correlation, bool active_speech,
+      const int16_t* input,
+      size_t input_length,
+      size_t peak_index,
+      int16_t best_correlation,
+      bool active_speech,
+      bool fast_mode,
      AudioMultiVector* output) const = 0;

  static const int kCorrelationLen = 50;
--- a/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc
+++ b/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc
@ -13,14 +13,24 @@
 #include "webrtc/modules/audio_coding/neteq/accelerate.h"
 #include "webrtc/modules/audio_coding/neteq/preemptive_expand.h"

+#include <map>
+
 #include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/base/checks.h"
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
 #include "webrtc/modules/audio_coding/neteq/background_noise.h"
+#include "webrtc/modules/audio_coding/neteq/tools/input_audio_file.h"
+#include "webrtc/test/testsupport/fileutils.h"

 namespace webrtc {

+namespace {
+const size_t kNumChannels = 1;
+}
+
 TEST(TimeStretch, CreateAndDestroy) {
  const int kSampleRate = 8000;
-  const size_t kNumChannels = 1;
  const int kOverlapSamples = 5 * kSampleRate / 8000;
  BackgroundNoise bgn(kNumChannels);
  Accelerate accelerate(kSampleRate, kNumChannels, bgn);
@ -30,7 +40,6 @@ TEST(TimeStretch, CreateAndDestroy) {

 TEST(TimeStretch, CreateUsingFactory) {
  const int kSampleRate = 8000;
-  const size_t kNumChannels = 1;
  const int kOverlapSamples = 5 * kSampleRate / 8000;
  BackgroundNoise bgn(kNumChannels);

@ -47,6 +56,72 @@ TEST(TimeStretch, CreateUsingFactory) {
  delete preemptive_expand;
 }

-// TODO(hlundin): Write more tests.
+class TimeStretchTest : public ::testing::Test {
+ protected:
+  TimeStretchTest()
+      : input_file_(new test::InputAudioFile(
+            test::ResourcePath("audio_coding/testfile32kHz", "pcm"))),
+        sample_rate_hz_(32000),
+        block_size_(30 * sample_rate_hz_ / 1000),  // 30 ms
+        audio_(new int16_t[block_size_]),
+        background_noise_(kNumChannels) {
+    WebRtcSpl_Init();
+  }
+
+  const int16_t* Next30Ms() {
+    CHECK(input_file_->Read(block_size_, audio_.get()));
+    return audio_.get();
+  }
+
+  // Returns the total length change (in samples) that the accelerate operation
+  // resulted in during the run.
+  int TestAccelerate(int loops, bool fast_mode) {
+    Accelerate accelerate(sample_rate_hz_, kNumChannels, background_noise_);
+    int total_length_change = 0;
+    for (int i = 0; i < loops; ++i) {
+      AudioMultiVector output(kNumChannels);
+      int16_t length_change;
+      UpdateReturnStats(accelerate.Process(Next30Ms(), block_size_, fast_mode,
+                                           &output, &length_change));
+      total_length_change += length_change;
+    }
+    return total_length_change;
+  }
+
+  void UpdateReturnStats(TimeStretch::ReturnCodes ret) {
+    switch (ret) {
+      case TimeStretch::kSuccess:
+      case TimeStretch::kSuccessLowEnergy:
+      case TimeStretch::kNoStretch:
+        ++return_stats_[ret];
+        break;
+      case TimeStretch::kError:
+        FAIL() << "Process returned an error";
+    }
+  }
+
+  rtc::scoped_ptr<test::InputAudioFile> input_file_;
+  const int sample_rate_hz_;
+  const size_t block_size_;
+  rtc::scoped_ptr<int16_t[]> audio_;
+  std::map<TimeStretch::ReturnCodes, int> return_stats_;
+  BackgroundNoise background_noise_;
+};
+
+TEST_F(TimeStretchTest, Accelerate) {
+  // TestAccelerate returns the total length change in samples.
+  EXPECT_EQ(15268, TestAccelerate(100, false));
+  EXPECT_EQ(9, return_stats_[TimeStretch::kSuccess]);
+  EXPECT_EQ(58, return_stats_[TimeStretch::kSuccessLowEnergy]);
+  EXPECT_EQ(33, return_stats_[TimeStretch::kNoStretch]);
+}
+
+TEST_F(TimeStretchTest, AccelerateFastMode) {
+  // TestAccelerate returns the total length change in samples.
+  EXPECT_EQ(21400, TestAccelerate(100, true));
+  EXPECT_EQ(31, return_stats_[TimeStretch::kSuccess]);
+  EXPECT_EQ(58, return_stats_[TimeStretch::kSuccessLowEnergy]);
+  EXPECT_EQ(11, return_stats_[TimeStretch::kNoStretch]);
+}

 }  // namespace webrtc