AudioEncoderCng: Handle case where speech encoder is reset

Previously, AudioEncoderCng required the speech encoder to not change its mind regarding the number of 10 ms frames in the next packet between calls to AudioEncoderCng::EncodeInternal()---specifically, it could handle an upward but not a downward adjustment. With this patch, it can handle a downward adjustment too, by simply saving the overshoot data for the next call to EncodeInternal(). It will still not handle the case where the encoder's reported number of 10 ms frames in the next packet is inconsistent with the behavior of its Encode() function when called with no intervening changes to the encoder. R=henrik.lundin@webrtc.org Review URL: https://webrtc-codereview.appspot.com/53469005 Cr-Commit-Position: refs/heads/master@{#9261}
2015-05-22 15:13:41 +02:00 · 2015-05-22 15:13:41 +02:00 · 367c868c99
commit 367c868c99
parent f761d10393
3 changed files with 55 additions and 54 deletions
--- a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
+++ b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
@ -50,8 +50,6 @@ AudioEncoderCng::AudioEncoderCng(const Config& config)
    : speech_encoder_(config.speech_encoder),
      cng_payload_type_(config.payload_type),
      num_cng_coefficients_(config.num_cng_coefficients),
-      first_timestamp_in_buffer_(0),
-      frames_in_buffer_(0),
      last_frame_active_(true),
      vad_(new Vad(config.vad_mode)) {
  if (config.vad) {
@ -115,35 +113,31 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodeInternal(
    size_t max_encoded_bytes,
    uint8_t* encoded) {
  CHECK_GE(max_encoded_bytes, static_cast<size_t>(num_cng_coefficients_ + 1));
-  const int num_samples = SampleRateHz() / 100 * NumChannels();
-  if (speech_buffer_.empty()) {
-    CHECK_EQ(frames_in_buffer_, 0);
-    first_timestamp_in_buffer_ = rtp_timestamp;
-  }
-  for (int i = 0; i < num_samples; ++i) {
+  const size_t samples_per_10ms_frame = SamplesPer10msFrame();
+  CHECK_EQ(speech_buffer_.size(),
+           rtp_timestamps_.size() * samples_per_10ms_frame);
+  rtp_timestamps_.push_back(rtp_timestamp);
+  for (size_t i = 0; i < samples_per_10ms_frame; ++i) {
    speech_buffer_.push_back(audio[i]);
  }
-  ++frames_in_buffer_;
-  if (frames_in_buffer_ < speech_encoder_->Num10MsFramesInNextPacket()) {
+  const int frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket();
+  if (rtp_timestamps_.size() < static_cast<size_t>(frames_to_encode)) {
    return EncodedInfo();
  }
-  CHECK_LE(frames_in_buffer_ * 10, kMaxFrameSizeMs)
+  CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs)
      << "Frame size cannot be larger than " << kMaxFrameSizeMs
      << " ms when using VAD/CNG.";
-  const size_t samples_per_10ms_frame = 10 * SampleRateHz() / 1000;
-  CHECK_EQ(speech_buffer_.size(),
-           static_cast<size_t>(frames_in_buffer_) * samples_per_10ms_frame);

  // Group several 10 ms blocks per VAD call. Call VAD once or twice using the
  // following split sizes:
  // 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms;
  // 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms.
  int blocks_in_first_vad_call =
-      (frames_in_buffer_ > 3 ? 3 : frames_in_buffer_);
-  if (frames_in_buffer_ == 4)
+      (frames_to_encode > 3 ? 3 : frames_to_encode);
+  if (frames_to_encode == 4)
    blocks_in_first_vad_call = 2;
  const int blocks_in_second_vad_call =
-      frames_in_buffer_ - blocks_in_first_vad_call;
+      frames_to_encode - blocks_in_first_vad_call;
  CHECK_GE(blocks_in_second_vad_call, 0);

  // Check if all of the buffer is passive speech. Start with checking the first
@ -161,12 +155,12 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodeInternal(
  EncodedInfo info;
  switch (activity) {
    case Vad::kPassive: {
-      info = EncodePassive(max_encoded_bytes, encoded);
+      info = EncodePassive(frames_to_encode, max_encoded_bytes, encoded);
      last_frame_active_ = false;
      break;
    }
    case Vad::kActive: {
-      info = EncodeActive(max_encoded_bytes, encoded);
+      info = EncodeActive(frames_to_encode, max_encoded_bytes, encoded);
      last_frame_active_ = true;
      break;
    }
@ -176,20 +170,24 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodeInternal(
    }
  }

-  speech_buffer_.clear();
-  frames_in_buffer_ = 0;
+  speech_buffer_.erase(
+      speech_buffer_.begin(),
+      speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame);
+  rtp_timestamps_.erase(rtp_timestamps_.begin(),
+                        rtp_timestamps_.begin() + frames_to_encode);
  return info;
 }

 AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
+    int frames_to_encode,
    size_t max_encoded_bytes,
    uint8_t* encoded) {
  bool force_sid = last_frame_active_;
  bool output_produced = false;
  const size_t samples_per_10ms_frame = SamplesPer10msFrame();
-  CHECK_GE(max_encoded_bytes, frames_in_buffer_ * samples_per_10ms_frame);
+  CHECK_GE(max_encoded_bytes, frames_to_encode * samples_per_10ms_frame);
  AudioEncoder::EncodedInfo info;
-  for (int i = 0; i < frames_in_buffer_; ++i) {
+  for (int i = 0; i < frames_to_encode; ++i) {
    int16_t encoded_bytes_tmp = 0;
    CHECK_GE(WebRtcCng_Encode(cng_inst_.get(),
                              &speech_buffer_[i * samples_per_10ms_frame],
@ -202,7 +200,7 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
      force_sid = false;
    }
  }
-  info.encoded_timestamp = first_timestamp_in_buffer_;
+  info.encoded_timestamp = rtp_timestamps_.front();
  info.payload_type = cng_payload_type_;
  info.send_even_if_empty = true;
  info.speech = false;
@ -210,15 +208,18 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
 }

 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(
+    int frames_to_encode,
    size_t max_encoded_bytes,
    uint8_t* encoded) {
  const size_t samples_per_10ms_frame = SamplesPer10msFrame();
  AudioEncoder::EncodedInfo info;
-  for (int i = 0; i < frames_in_buffer_; ++i) {
+  for (int i = 0; i < frames_to_encode; ++i) {
    info = speech_encoder_->Encode(
-        first_timestamp_in_buffer_, &speech_buffer_[i * samples_per_10ms_frame],
+        rtp_timestamps_.front(), &speech_buffer_[i * samples_per_10ms_frame],
        samples_per_10ms_frame, max_encoded_bytes, encoded);
-    if (i < frames_in_buffer_ - 1) {
+    if (i == frames_to_encode - 1) {
+      CHECK_GT(info.encoded_bytes, 0u) << "Encoder didn't deliver data.";
+    } else {
      CHECK_EQ(info.encoded_bytes, 0u) << "Encoder delivered data too early.";
    }
  }
--- a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc
+++ b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc
@ -80,6 +80,21 @@ class AudioEncoderCngTest : public ::testing::Test {
    timestamp_ += num_audio_samples_10ms_;
  }

+  // Expect |num_calls| calls to the encoder, all successful. The last call
+  // claims to have encoded |kMockReturnEncodedBytes| bytes, and all the
+  // preceding ones 0 bytes.
+  void ExpectEncodeCalls(int num_calls) {
+    InSequence s;
+    AudioEncoder::EncodedInfo info;
+    for (int j = 0; j < num_calls - 1; ++j) {
+      EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
+          .WillOnce(Return(info));
+    }
+    info.encoded_bytes = kMockReturnEncodedBytes;
+    EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
+        .WillOnce(Return(info));
+  }
+
  // Verifies that the cng_ object waits until it has collected
  // |blocks_per_frame| blocks of audio, and then dispatches all of them to
  // the underlying codec (speech or cng).
@ -96,20 +111,8 @@ class AudioEncoderCngTest : public ::testing::Test {
      Encode();
      EXPECT_EQ(0u, encoded_info_.encoded_bytes);
    }
-    if (active_speech) {
-      // Now expect |blocks_per_frame| calls to the encoder in sequence.
-      // Let the speech codec mock return true and set the number of encoded
-      // bytes to |kMockReturnEncodedBytes|.
-      InSequence s;
-      AudioEncoder::EncodedInfo info;
-      for (int j = 0; j < blocks_per_frame - 1; ++j) {
-        EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-            .WillOnce(Return(info));
-      }
-      info.encoded_bytes = kMockReturnEncodedBytes;
-      EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-          .WillOnce(Return(info));
-    }
+    if (active_speech)
+      ExpectEncodeCalls(blocks_per_frame);
    Encode();
    if (active_speech) {
      EXPECT_EQ(kMockReturnEncodedBytes, encoded_info_.encoded_bytes);
@ -283,23 +286,17 @@ TEST_F(AudioEncoderCngTest, MixedActivePassive) {
  CreateCng();

  // All of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
  EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kActive));
  EXPECT_TRUE(encoded_info_.speech);

  // First half of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
  EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kPassive));
  EXPECT_TRUE(encoded_info_.speech);

  // Second half of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
  EXPECT_TRUE(CheckMixedActivePassive(Vad::kPassive, Vad::kActive));
  EXPECT_TRUE(encoded_info_.speech);

--- a/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h
+++ b/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h
@ -66,16 +66,19 @@ class AudioEncoderCng final : public AudioEncoder {
    inline void operator()(CNG_enc_inst* ptr) const { WebRtcCng_FreeEnc(ptr); }
  };

-  EncodedInfo EncodePassive(size_t max_encoded_bytes, uint8_t* encoded);
-  EncodedInfo EncodeActive(size_t max_encoded_bytes, uint8_t* encoded);
+  EncodedInfo EncodePassive(int frames_to_encode,
+                            size_t max_encoded_bytes,
+                            uint8_t* encoded);
+  EncodedInfo EncodeActive(int frames_to_encode,
+                           size_t max_encoded_bytes,
+                           uint8_t* encoded);
  size_t SamplesPer10msFrame() const;

  AudioEncoder* speech_encoder_;
  const int cng_payload_type_;
  const int num_cng_coefficients_;
  std::vector<int16_t> speech_buffer_;
-  uint32_t first_timestamp_in_buffer_;
-  int frames_in_buffer_;
+  std::vector<uint32_t> rtp_timestamps_;
  bool last_frame_active_;
  rtc::scoped_ptr<Vad> vad_;
  rtc::scoped_ptr<CNG_enc_inst, CngInstDeleter> cng_inst_;