Downstream latest Chromium SincResampler changes.

Replace the BlockSize() workaround we were using previously to support the push wrapper with the upstream request_frames interface. This requires a bit of a trick to ensure we don't add more delay than necessary. On the first pass we use a dummy Resample() call in order to prime the buffer such that all later calls only require a single input request through Run().

Notably, this brings in an optimized loop condition, improving performance by ~2% - 3% on tested platforms and avoids a 20% performance hit with clang. This addresses issue 2041.

Only negligible changes to the PushSincResamplerTest SNR thresholds, due to a fractional sample adjustment in output delay.

This still retains the per-instance CPU detection, as webrtc lacks a LazyInstance helper for static initialization.

Ideally, we would adopt SetRatio() in PushSincResampler's InitializeIfNeeded() for on-the-fly changes, but this will require a way to update request_frames.

The diff against Chromium upstream is available here: https://codereview.chromium.org/19470003

BUG=2041
TESTED=unit tests, voe_cmd_test in loopback running through all codecs with 44.1 kHz and 48 kHz device formats using a stereo mic.
R=dalecurtis@chromium.org

Review URL: https://webrtc-codereview.appspot.com/1838004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@4406 4adac7df-926f-26a2-2b94-8c16560cd09d
2013-07-25 22:04:30 +00:00
parent e691b4f952
commit b86fbaf1d4
9 changed files with 323 additions and 240 deletions
--- a/webrtc/common_audio/resampler/push_resampler.cc
+++ b/webrtc/common_audio/resampler/push_resampler.cc
@@ -38,15 +38,13 @@ int PushResampler::InitializeIfNeeded(int src_sample_rate_hz,
                                      int num_channels) {
  if (src_sample_rate_hz == src_sample_rate_hz_ &&
      dst_sample_rate_hz == dst_sample_rate_hz_ &&
-      num_channels == num_channels_) {
+      num_channels == num_channels_)
    // No-op if settings haven't changed.
    return 0;
-  }

  if (src_sample_rate_hz <= 0 || dst_sample_rate_hz <= 0 ||
-      num_channels <= 0 || num_channels > 2) {
+      num_channels <= 0 || num_channels > 2)
    return -1;
-  }

  src_sample_rate_hz_ = src_sample_rate_hz;
  dst_sample_rate_hz_ = dst_sample_rate_hz;
@@ -72,9 +70,8 @@ int PushResampler::Resample(const int16_t* src, int src_length,
                            int16_t* dst, int dst_capacity) {
  const int src_size_10ms = src_sample_rate_hz_ * num_channels_ / 100;
  const int dst_size_10ms = dst_sample_rate_hz_ * num_channels_ / 100;
-  if (src_length != src_size_10ms || dst_capacity < dst_size_10ms) {
+  if (src_length != src_size_10ms || dst_capacity < dst_size_10ms)
    return -1;
-  }

  if (src_sample_rate_hz_ == dst_sample_rate_hz_) {
    // The old resampler provides this memcpy facility in the case of matching
--- a/webrtc/common_audio/resampler/push_sinc_resampler.cc
+++ b/webrtc/common_audio/resampler/push_sinc_resampler.cc
@@ -11,20 +11,22 @@
 #include "webrtc/common_audio/resampler/push_sinc_resampler.h"

 #include <cmath>
-
+#include <cstring>
 #include <algorithm>

 namespace webrtc {

-PushSincResampler::PushSincResampler(int src_block_size,
-                                     int dst_block_size)
+PushSincResampler::PushSincResampler(int source_frames,
+                                     int destination_frames)
    : resampler_(NULL),
      float_buffer_(NULL),
      source_ptr_(NULL),
-      dst_size_(dst_block_size) {
-  resampler_.reset(new SincResampler(src_block_size * 1.0 / dst_block_size,
-                                     this, src_block_size));
-  float_buffer_.reset(new float[dst_block_size]);
+      destination_frames_(destination_frames),
+      first_pass_(true),
+      source_available_(0) {
+  resampler_.reset(new SincResampler(source_frames * 1.0 / destination_frames,
+                                     source_frames, this));
+  float_buffer_.reset(new float[destination_frames]);
 }

 PushSincResampler::~PushSincResampler() {
@@ -34,32 +36,53 @@ int PushSincResampler::Resample(const int16_t* source,
                                int source_length,
                                int16_t* destination,
                                int destination_capacity) {
-  assert(source_length == resampler_->BlockSize());
-  assert(destination_capacity >= dst_size_);
+  assert(source_length == resampler_->request_frames());
+  assert(destination_capacity >= destination_frames_);
  // Cache the source pointer. Calling Resample() will immediately trigger
  // the Run() callback whereupon we provide the cached value.
  source_ptr_ = source;
-  resampler_->Resample(float_buffer_.get(), dst_size_);
-  for (int i = 0; i < dst_size_; ++i) {
+  source_available_ = source_length;
+
+  // On the first pass, we call Resample() twice. During the first call, we
+  // provide dummy input and discard the output. This is done to prime the
+  // SincResampler buffer with the correct delay (half the kernel size), thereby
+  // ensuring that all later Resample() calls will only result in one input
+  // request through Run().
+  //
+  // If this wasn't done, SincResampler would call Run() twice on the first
+  // pass, and we'd have to introduce an entire |source_frames| of delay, rather
+  // than the minimum half kernel.
+  //
+  // It works out that ChunkSize() is exactly the amount of output we need to
+  // request in order to prime the buffer with a single Run() request for
+  // |source_frames|.
+  if (first_pass_)
+    resampler_->Resample(resampler_->ChunkSize(), float_buffer_.get());
+
+  resampler_->Resample(destination_frames_, float_buffer_.get());
+  for (int i = 0; i < destination_frames_; ++i) {
    float clipped = std::max(std::min(float_buffer_[i], 32767.0f), -32768.0f);
    destination[i] = static_cast<int16_t>(std::floor(clipped + 0.5));
  }
  source_ptr_ = NULL;
-  return dst_size_;
+  return destination_frames_;
 }

-void PushSincResampler::Run(float* destination, int frames) {
+void PushSincResampler::Run(int frames, float* destination) {
  assert(source_ptr_ != NULL);
-  assert(frames >= resampler_->BlockSize());
-  // We will have exactly |BlockSize| number of source samples available. If
-  // the resampler asks for more, zero pad the beginning. This will only happen
-  // on the first call while priming the buffer.
-  int i = 0;
-  for (; i < frames - resampler_->BlockSize(); ++i) {
-    destination[i] = 0;
-  }
-  for (int j = 0; i < frames; ++i, ++j) {
-    destination[i] = static_cast<float>(source_ptr_[j]);
+  // Ensure we are only asked for the available samples. This would fail if
+  // Run() was triggered more than once per Resample() call.
+  assert(source_available_ == frames);
+
+  if (first_pass_) {
+    // Provide dummy input on the first pass, the output of which will be
+    // discarded, as described in Resample().
+    memset(destination, 0, frames * sizeof(float));
+    first_pass_ = false;
+  } else {
+    for (int i = 0; i < frames; ++i)
+      destination[i] = static_cast<float>(source_ptr_[i]);
+    source_available_ -= frames;
  }
 }

--- a/webrtc/common_audio/resampler/push_sinc_resampler.h
+++ b/webrtc/common_audio/resampler/push_sinc_resampler.h
@@ -25,25 +25,33 @@ class PushSincResampler : public SincResamplerCallback {
  // Provide the size of the source and destination blocks in samples. These
  // must correspond to the same time duration (typically 10 ms) as the sample
  // ratio is inferred from them.
-  PushSincResampler(int src_block_size, int dst_block_size);
+  PushSincResampler(int source_frames, int destination_frames);
  virtual ~PushSincResampler();

-  // Perform the resampling. |source_length| must always equal the
-  // |src_block_size| provided at construction. |destination_capacity| must be
-  // at least as large as |dst_block_size|. Returns the number of samples
+  // Perform the resampling. The |source_frames| passed here must always equal
+  // the |source_frames| provided at construction. |destination_capacity| must
+  // be at least as large as |destination_frames|. Returns the number of samples
  // provided in destination (for convenience, since this will always be equal
-  // to |dst_block_size|).
-  int Resample(const int16_t* source, int source_length,
+  // to |destination_frames|).
+  int Resample(const int16_t* source, int source_frames,
               int16_t* destination, int destination_capacity);

  // Implements SincResamplerCallback.
-  virtual void Run(float* destination, int frames);
+  virtual void Run(int frames, float* destination);
+
+  SincResampler* get_resampler_for_testing() { return resampler_.get(); }

 private:
  scoped_ptr<SincResampler> resampler_;
  scoped_array<float> float_buffer_;
  const int16_t* source_ptr_;
-  const int dst_size_;
+  const int destination_frames_;
+
+  // True on the first call to Resample(), to prime the SincResampler buffer.
+  bool first_pass_;
+
+  // Used to assert we are only requested for as much data as is available.
+  int source_available_;

  DISALLOW_COPY_AND_ASSIGN(PushSincResampler);
 };
--- a/webrtc/common_audio/resampler/push_sinc_resampler_unittest.cc
+++ b/webrtc/common_audio/resampler/push_sinc_resampler_unittest.cc
@@ -67,10 +67,19 @@ TEST_P(PushSincResamplerTest, Resample) {
  scoped_array<int16_t> source_int(new int16_t[input_block_size]);
  scoped_array<int16_t> destination_int(new int16_t[output_block_size]);

+  // The sinc resampler has an implicit delay of approximately half the kernel
+  // size at the input sample rate. By moving to a push model, this delay
+  // becomes explicit and is managed by zero-stuffing in PushSincResampler. We
+  // deal with it in the test by delaying the "pure" source to match. The delay
+  // must be computed before the first call to Resample(), because ChunkSize()
+  // will change afterwards.
+  const int output_delay_samples = output_block_size -
+      resampler.get_resampler_for_testing()->ChunkSize();
+
  // Generate resampled signal.
  // With the PushSincResampler, we produce the signal block-by-10ms-block
  // rather than in a single pass, to exercise how it will be used in WebRTC.
-  resampler_source.Run(source.get(), input_samples);
+  resampler_source.Run(input_samples, source.get());
  for (int i = 0; i < kNumBlocks; ++i) {
    for (int j = 0; j < input_block_size; ++j) {
      source_int[j] = static_cast<int16_t>(std::floor(32767 *
@@ -86,17 +95,9 @@ TEST_P(PushSincResamplerTest, Resample) {
  }

  // Generate pure signal.
-  // The sinc resampler has an implicit delay of half the kernel size (32) at
-  // the input sample rate. By moving to a push model, this delay becomes
-  // explicit and is managed by zero-stuffing in PushSincResampler. This delay
-  // can be a fractional sample amount, so we deal with it in the test by
-  // delaying the "pure" source to match.
-  static const int kInputKernelDelaySamples = 16;
-  double output_delay_samples = static_cast<double>(output_rate_)
-      / input_rate_ * kInputKernelDelaySamples;
  SinusoidalLinearChirpSource pure_source(
      output_rate_, output_samples, input_nyquist_freq, output_delay_samples);
-  pure_source.Run(pure_destination.get(), output_samples);
+  pure_source.Run(output_samples, pure_destination.get());

  // Range of the Nyquist frequency (0.5 * min(input rate, output_rate)) which
  // we refer to as low and high.
@@ -216,17 +217,17 @@ INSTANTIATE_TEST_CASE_P(
        std::tr1::make_tuple(8000, 16000, kResamplingRMSError, -70.30),
        std::tr1::make_tuple(16000, 16000, kResamplingRMSError, -75.51),
        std::tr1::make_tuple(32000, 16000, -18.48, -28.59),
-        std::tr1::make_tuple(44100, 16000, -19.59, -19.77),
-        std::tr1::make_tuple(48000, 16000, -20.01, -18.11),
-        std::tr1::make_tuple(96000, 16000, -20.95, -10.99),
+        std::tr1::make_tuple(44100, 16000, -19.30, -19.67),
+        std::tr1::make_tuple(48000, 16000, -19.81, -18.11),
+        std::tr1::make_tuple(96000, 16000, -20.95, -10.96),

        // To 32 kHz
        std::tr1::make_tuple(8000, 32000, kResamplingRMSError, -70.30),
        std::tr1::make_tuple(16000, 32000, kResamplingRMSError, -75.51),
        std::tr1::make_tuple(32000, 32000, kResamplingRMSError, -75.56),
-        std::tr1::make_tuple(44100, 32000, -16.52, -51.10),
-        std::tr1::make_tuple(48000, 32000, -16.90, -44.17),
-        std::tr1::make_tuple(96000, 32000, -19.80, -18.05),
+        std::tr1::make_tuple(44100, 32000, -16.44, -51.10),
+        std::tr1::make_tuple(48000, 32000, -16.90, -44.03),
+        std::tr1::make_tuple(96000, 32000, -19.61, -18.04),
        std::tr1::make_tuple(192000, 32000, -21.02, -10.94)));

 }  // namespace webrtc
--- a/webrtc/common_audio/resampler/sinc_resampler.cc
+++ b/webrtc/common_audio/resampler/sinc_resampler.cc
@@ -11,31 +11,73 @@
 // Modified from the Chromium original:
 // src/media/base/sinc_resampler.cc

-// Input buffer layout, dividing the total buffer into regions (r0_ - r5_):
+// Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_
+// and r4_ will move after the first load):
 //
 // |----------------|-----------------------------------------|----------------|
 //
-//                                   kBlockSize + kKernelSize / 2
+//                                        request_frames_
 //                   <--------------------------------------------------------->
-//                                              r0_
+//                                    r0_ (during first load)
 //
 //  kKernelSize / 2   kKernelSize / 2         kKernelSize / 2   kKernelSize / 2
 // <---------------> <--------------->       <---------------> <--------------->
 //        r1_               r2_                     r3_               r4_
 //
-//                                                     kBlockSize
-//                                     <--------------------------------------->
-//                                                        r5_
+//                             block_size_ == r4_ - r2_
+//                   <--------------------------------------->
+//
+//                                                  request_frames_
+//                                    <------------------ ... ----------------->
+//                                               r0_ (during second load)
+//
+// On the second request r0_ slides to the right by kKernelSize / 2 and r3_, r4_
+// and block_size_ are reinitialized via step (3) in the algorithm below.
+//
+// These new regions remain constant until a Flush() occurs.  While complicated,
+// this allows us to reduce jitter by always requesting the same amount from the
+// provided callback.
 //
 // The algorithm:
 //
-// 1) Consume input frames into r0_ (r1_ is zero-initialized).
-// 2) Position kernel centered at start of r0_ (r2_) and generate output frames
-//    until kernel is centered at start of r4_ or we've finished generating all
-//    the output frames.
-// 3) Copy r3_ to r1_ and r4_ to r2_.
-// 4) Consume input frames into r5_ (zero-pad if we run out of input).
-// 5) Goto (2) until all of input is consumed.
+// 1) Allocate input_buffer of size: request_frames_ + kKernelSize; this ensures
+//    there's enough room to read request_frames_ from the callback into region
+//    r0_ (which will move between the first and subsequent passes).
+//
+// 2) Let r1_, r2_ each represent half the kernel centered around r0_:
+//
+//        r0_ = input_buffer_ + kKernelSize / 2
+//        r1_ = input_buffer_
+//        r2_ = r0_
+//
+//    r0_ is always request_frames_ in size.  r1_, r2_ are kKernelSize / 2 in
+//    size.  r1_ must be zero initialized to avoid convolution with garbage (see
+//    step (5) for why).
+//
+// 3) Let r3_, r4_ each represent half the kernel right aligned with the end of
+//    r0_ and choose block_size_ as the distance in frames between r4_ and r2_:
+//
+//        r3_ = r0_ + request_frames_ - kKernelSize
+//        r4_ = r0_ + request_frames_ - kKernelSize / 2
+//        block_size_ = r4_ - r2_ = request_frames_ - kKernelSize / 2
+//
+// 4) Consume request_frames_ frames into r0_.
+//
+// 5) Position kernel centered at start of r2_ and generate output frames until
+//    the kernel is centered at the start of r4_ or we've finished generating
+//    all the output frames.
+//
+// 6) Wrap left over data from r3_ to r1_ and r4_ to r2_.
+//
+// 7) If we're on the second load, in order to avoid overwriting the frames we
+//    just wrapped from r4_ we need to slide r0_ to the right by the size of
+//    r4_, which is kKernelSize / 2:
+//
+//        r0_ = r0_ + kKernelSize / 2 = input_buffer_ + kKernelSize
+//
+//    r3_, r4_, and block_size_ then need to be reinitialized, so goto (3).
+//
+// 8) Else, if we're not on the second load, goto (4).
 //
 // Note: we're glossing over how the sub-sample handling works with
 // |virtual_source_idx_|, etc.
@@ -70,49 +112,49 @@ static double SincScaleFactor(double io_ratio) {
  return sinc_scale_factor;
 }

-SincResampler::SincResampler(double io_sample_rate_ratio,
-                             SincResamplerCallback* read_cb,
-                             int block_size)
-    : io_sample_rate_ratio_(io_sample_rate_ratio),
-      virtual_source_idx_(0),
-      buffer_primed_(false),
-      read_cb_(read_cb),
-      block_size_(block_size),
-      buffer_size_(block_size_ + kKernelSize),
-      // Create input buffers with a 16-byte alignment for SSE optimizations.
-      kernel_storage_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
-      kernel_pre_sinc_storage_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
-      kernel_window_storage_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
-      input_buffer_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * buffer_size_, 16))),
-#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__)
-      convolve_proc_(WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C),
-#elif defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON)
-      convolve_proc_(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ?
-                     Convolve_NEON : Convolve_C),
-#endif
-      // Setup various region pointers in the buffer (see diagram above).
-      r0_(input_buffer_.get() + kKernelSize / 2),
-      r1_(input_buffer_.get()),
-      r2_(r0_),
-      r3_(r0_ + block_size_ - kKernelSize / 2),
-      r4_(r0_ + block_size_),
-      r5_(r0_ + kKernelSize / 2) {
-  Initialize();
-  InitializeKernel();
+// If we know the minimum architecture at compile time, avoid CPU detection.
+// iOS lies about its architecture, so we also need to exclude it here.
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WEBRTC_IOS)
+#if defined(__SSE__)
+#define CONVOLVE_FUNC Convolve_SSE
+void SincResampler::InitializeCPUSpecificFeatures() {}
+#else
+// X86 CPU detection required.  Function will be set by
+// InitializeCPUSpecificFeatures().
+// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.
+#define CONVOLVE_FUNC convolve_proc_
+
+void SincResampler::InitializeCPUSpecificFeatures() {
+  convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C;
 }
+#endif
+#elif defined(WEBRTC_ARCH_ARM_V7)
+#if defined(WEBRTC_ARCH_ARM_NEON)
+#define CONVOLVE_FUNC Convolve_NEON
+void SincResampler::InitializeCPUSpecificFeatures() {}
+#else
+// NEON CPU detection required.  Function will be set by
+// InitializeCPUSpecificFeatures().
+#define CONVOLVE_FUNC convolve_proc_
+
+void SincResampler::InitializeCPUSpecificFeatures() {
+  convolve_proc_ = WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ?
+      Convolve_NEON : Convolve_C;
+}
+#endif
+#else
+// Unknown architecture.
+#define CONVOLVE_FUNC Convolve_C
+void SincResampler::InitializeCPUSpecificFeatures() {}
+#endif

 SincResampler::SincResampler(double io_sample_rate_ratio,
+                             int request_frames,
                             SincResamplerCallback* read_cb)
    : io_sample_rate_ratio_(io_sample_rate_ratio),
-      virtual_source_idx_(0),
-      buffer_primed_(false),
      read_cb_(read_cb),
-      block_size_(kDefaultBlockSize),
-      buffer_size_(kDefaultBufferSize),
+      request_frames_(request_frames),
+      input_buffer_size_(request_frames_ + kKernelSize),
      // Create input buffers with a 16-byte alignment for SSE optimizations.
      kernel_storage_(static_cast<float*>(
          AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
@@ -121,45 +163,19 @@ SincResampler::SincResampler(double io_sample_rate_ratio,
      kernel_window_storage_(static_cast<float*>(
          AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
      input_buffer_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * buffer_size_, 16))),
-#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__)
-      convolve_proc_(WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C),
-#elif defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON)
-      convolve_proc_(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ?
-                     Convolve_NEON : Convolve_C),
+          AlignedMalloc(sizeof(float) * input_buffer_size_, 16))),
+#if defined(WEBRTC_RESAMPLER_CPU_DETECTION)
+      convolve_proc_(NULL),
 #endif
-      // Setup various region pointers in the buffer (see diagram above).
-      r0_(input_buffer_.get() + kKernelSize / 2),
      r1_(input_buffer_.get()),
-      r2_(r0_),
-      r3_(r0_ + block_size_ - kKernelSize / 2),
-      r4_(r0_ + block_size_),
-      r5_(r0_ + kKernelSize / 2) {
-  Initialize();
-  InitializeKernel();
-}
-
-SincResampler::~SincResampler() {}
-
-void SincResampler::Initialize() {
-  // Ensure kKernelSize is a multiple of 32 for easy SSE optimizations; causes
-  // r0_ and r5_ (used for input) to always be 16-byte aligned by virtue of
-  // input_buffer_ being 16-byte aligned.
-  COMPILE_ASSERT(kKernelSize % 32 == 0);
+      r2_(input_buffer_.get() + kKernelSize / 2) {
+#if defined(WEBRTC_RESAMPLER_CPU_DETECTION)
+  InitializeCPUSpecificFeatures();
+  assert(convolve_proc_);
+#endif
+  assert(request_frames_ > 0);
+  Flush();
  assert(block_size_ > kKernelSize);
-  // Basic sanity checks to ensure buffer regions are laid out correctly:
-  // r0_ and r2_ should always be the same position.
-  assert(r0_ == r2_);
-  // r1_ at the beginning of the buffer.
-  assert(r1_ == input_buffer_.get());
-  // r1_ left of r2_, r2_ left of r5_ and r1_, r2_ size correct.
-  assert(r2_ - r1_ == r5_ - r2_);
-  // r3_ left of r4_, r5_ left of r0_ and r3_ size correct.
-  assert(r4_ - r3_ == r5_ - r0_);
-  // r3_, r4_ size correct and r4_ at the end of the buffer.
-  assert(r4_ + (r4_ - r3_) == r1_ + buffer_size_);
-  // r5_ size correct and at the end of the buffer.
-  assert(r5_ + block_size_ == r1_ + buffer_size_);

  memset(kernel_storage_.get(), 0,
         sizeof(*kernel_storage_.get()) * kKernelStorageSize);
@@ -167,7 +183,26 @@ void SincResampler::Initialize() {
         sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize);
  memset(kernel_window_storage_.get(), 0,
         sizeof(*kernel_window_storage_.get()) * kKernelStorageSize);
-  memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * buffer_size_);
+
+  InitializeKernel();
+}
+
+SincResampler::~SincResampler() {}
+
+void SincResampler::UpdateRegions(bool second_load) {
+  // Setup various region pointers in the buffer (see diagram above).  If we're
+  // on the second load we need to slide r0_ to the right by kKernelSize / 2.
+  r0_ = input_buffer_.get() + (second_load ? kKernelSize : kKernelSize / 2);
+  r3_ = r0_ + request_frames_ - kKernelSize;
+  r4_ = r0_ + request_frames_ - kKernelSize / 2;
+  block_size_ = r4_ - r2_;
+
+  // r1_ at the beginning of the buffer.
+  assert(r1_ == input_buffer_.get());
+  // r1_ left of r2_, r3_ left of r4_ and size correct.
+  assert(r2_ - r1_ == r4_ - r3_);
+  // r2_ left of r3_.
+  assert(r2_ < r3_);
 }

 void SincResampler::InitializeKernel() {
@@ -234,67 +269,59 @@ void SincResampler::SetRatio(double io_sample_rate_ratio) {
  }
 }

-// If we know the minimum architecture avoid function hopping for CPU detection.
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-#if defined(__SSE__)
-#define CONVOLVE_FUNC Convolve_SSE
-#else
-// X86 CPU detection required.  |convolve_proc_| will be set upon construction.
-// TODO(dalecurtis): Once Chrome moves to a SSE baseline this can be removed.
-#define CONVOLVE_FUNC convolve_proc_
-#endif
-#elif defined(WEBRTC_ARCH_ARM_V7)
-#if defined(WEBRTC_ARCH_ARM_NEON)
-#define CONVOLVE_FUNC Convolve_NEON
-#else
-// NEON CPU detection required.  |convolve_proc_| will be set upon construction.
-#define CONVOLVE_FUNC convolve_proc_
-#endif
-#else
-// Unknown architecture.
-#define CONVOLVE_FUNC Convolve_C
-#endif
-
-void SincResampler::Resample(float* destination, int frames) {
+void SincResampler::Resample(int frames, float* destination) {
  int remaining_frames = frames;

  // Step (1) -- Prime the input buffer at the start of the input stream.
-  if (!buffer_primed_) {
-    read_cb_->Run(r0_, block_size_ + kKernelSize / 2);
+  if (!buffer_primed_ && remaining_frames) {
+    read_cb_->Run(request_frames_, r0_);
    buffer_primed_ = true;
  }

-  // Step (2) -- Resample!
+  // Step (2) -- Resample!  const what we can outside of the loop for speed.  It
+  // actually has an impact on ARM performance.  See inner loop comment below.
+  const double current_io_ratio = io_sample_rate_ratio_;
+  const float* const kernel_ptr = kernel_storage_.get();
  while (remaining_frames) {
-    while (virtual_source_idx_ < block_size_) {
+    // |i| may be negative if the last Resample() call ended on an iteration
+    // that put |virtual_source_idx_| over the limit.
+    //
+    // Note: The loop construct here can severely impact performance on ARM
+    // or when built with clang.  See https://codereview.chromium.org/18566009/
+    for (int i = ceil((block_size_ - virtual_source_idx_) / current_io_ratio);
+         i > 0; --i) {
+      assert(virtual_source_idx_ < block_size_);
+
      // |virtual_source_idx_| lies in between two kernel offsets so figure out
      // what they are.
-      int source_idx = static_cast<int>(virtual_source_idx_);
-      double subsample_remainder = virtual_source_idx_ - source_idx;
+      const int source_idx = virtual_source_idx_;
+      const double subsample_remainder = virtual_source_idx_ - source_idx;

-      double virtual_offset_idx = subsample_remainder * kKernelOffsetCount;
-      int offset_idx = static_cast<int>(virtual_offset_idx);
+      const double virtual_offset_idx =
+          subsample_remainder * kKernelOffsetCount;
+      const int offset_idx = virtual_offset_idx;

      // We'll compute "convolutions" for the two kernels which straddle
      // |virtual_source_idx_|.
-      float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
-      float* k2 = k1 + kKernelSize;
+      const float* const k1 = kernel_ptr + offset_idx * kKernelSize;
+      const float* const k2 = k1 + kKernelSize;

      // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage.  Should always be
      // true so long as kKernelSize is a multiple of 16.
-      assert((reinterpret_cast<uintptr_t>(k1) & 0x0F) == 0u);
-      assert((reinterpret_cast<uintptr_t>(k2) & 0x0F) == 0u);
+      assert(0u == (reinterpret_cast<uintptr_t>(k1) & 0x0F));
+      assert(0u == (reinterpret_cast<uintptr_t>(k2) & 0x0F));

      // Initialize input pointer based on quantized |virtual_source_idx_|.
-      float* input_ptr = r1_ + source_idx;
+      const float* const input_ptr = r1_ + source_idx;

      // Figure out how much to weight each kernel's "convolution".
-      double kernel_interpolation_factor = virtual_offset_idx - offset_idx;
+      const double kernel_interpolation_factor =
+          virtual_offset_idx - offset_idx;
      *destination++ = CONVOLVE_FUNC(
          input_ptr, k1, k2, kernel_interpolation_factor);

      // Advance the virtual index.
-      virtual_source_idx_ += io_sample_rate_ratio_;
+      virtual_source_idx_ += current_io_ratio;

      if (!--remaining_frames)
        return;
@@ -303,31 +330,31 @@ void SincResampler::Resample(float* destination, int frames) {
    // Wrap back around to the start.
    virtual_source_idx_ -= block_size_;

-    // Step (3) Copy r3_ to r1_ and r4_ to r2_.
+    // Step (3) -- Copy r3_, r4_ to r1_, r2_.
    // This wraps the last input frames back to the start of the buffer.
-    memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
-    memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
+    memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize);

-    // Step (4)
-    // Refresh the buffer with more input.
-    read_cb_->Run(r5_, block_size_);
+    // Step (4) -- Reinitialize regions if necessary.
+    if (r0_ == r2_)
+      UpdateRegions(true);
+
+    // Step (5) -- Refresh the buffer with more input.
+    read_cb_->Run(request_frames_, r0_);
  }
 }

 #undef CONVOLVE_FUNC

-int SincResampler::ChunkSize() {
+int SincResampler::ChunkSize() const {
  return block_size_ / io_sample_rate_ratio_;
 }

-int SincResampler::BlockSize() {
-  return block_size_;
-}
-
 void SincResampler::Flush() {
  virtual_source_idx_ = 0;
  buffer_primed_ = false;
-  memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * buffer_size_);
+  memset(input_buffer_.get(), 0,
+         sizeof(*input_buffer_.get()) * input_buffer_size_);
+  UpdateRegions(false);
 }

 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
--- a/webrtc/common_audio/resampler/sinc_resampler.h
+++ b/webrtc/common_audio/resampler/sinc_resampler.h
@@ -20,6 +20,13 @@
 #include "webrtc/test/testsupport/gtest_prod_util.h"
 #include "webrtc/typedefs.h"

+#if (defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WEBRTC_IOS) &&  \
+        !defined(__SSE__)) ||  \
+    (defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON))
+// Convenience define.
+#define WEBRTC_RESAMPLER_CPU_DETECTION
+#endif
+
 namespace webrtc {

 // Callback class for providing more data into the resampler.  Expects |frames|
@@ -28,7 +35,7 @@ namespace webrtc {
 class SincResamplerCallback {
 public:
  virtual ~SincResamplerCallback() {}
-  virtual void Run(float* destination, int frames) = 0;
+  virtual void Run(int frames, float* destination) = 0;
 };

 // SincResampler is a high-quality single-channel sample-rate converter.
@@ -40,43 +47,36 @@ class SincResampler {
    // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
    kKernelSize = 32,

-    // The number of destination frames generated per processing pass.  Affects
-    // how often and for how much SincResampler calls back for input.  Must be
-    // greater than kKernelSize.
-    kDefaultBlockSize = 512,
+    // Default request size.  Affects how often and for how much SincResampler
+    // calls back for input.  Must be greater than kKernelSize.
+    kDefaultRequestSize = 512,

    // The kernel offset count is used for interpolation and is the number of
    // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
    // at the expense of allocating more memory.
    kKernelOffsetCount = 32,
    kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
-
-    // The size (in samples) of the internal buffer used by the resampler.
-    kDefaultBufferSize = kDefaultBlockSize + kKernelSize,
  };

  // Constructs a SincResampler with the specified |read_cb|, which is used to
-  // acquire audio data for resampling.  |io_sample_rate_ratio| is the ratio of
-  // input / output sample rates.  If desired, the number of destination frames
-  // generated per processing pass can be specified through |block_size|.
+  // acquire audio data for resampling.  |io_sample_rate_ratio| is the ratio
+  // of input / output sample rates.  |request_frames| controls the size in
+  // frames of the buffer requested by each |read_cb| call.  The value must be
+  // greater than kKernelSize * 3 / 2.  Specify kDefaultRequestSize if there are
+  // no request size constraints.
  SincResampler(double io_sample_rate_ratio,
+                int request_frames,
                SincResamplerCallback* read_cb);
-  SincResampler(double io_sample_rate_ratio,
-                SincResamplerCallback* read_cb,
-                int block_size);
  virtual ~SincResampler();

  // Resample |frames| of data from |read_cb_| into |destination|.
-  void Resample(float* destination, int frames);
+  void Resample(int frames, float* destination);

  // The maximum size in frames that guarantees Resample() will only make a
  // single call to |read_cb_| for more data.
-  int ChunkSize();
+  int ChunkSize() const;

-  // The number of source frames requested per processing pass (and equal to
-  // |block_size| if provided at construction).  The first pass will request
-  // more to prime the buffer.
-  int BlockSize();
+  int request_frames() const { return request_frames_; }

  // Flush all buffered data and reset internal indices.  Not thread safe, do
  // not call while Resample() is in progress.
@@ -86,8 +86,8 @@ class SincResampler {
  // the kernels used for resampling.  Not thread safe, do not call while
  // Resample() is in progress.
  //
-  // TODO(ajm): use this in PushSincResampler rather than reconstructing
-  // SincResampler.
+  // TODO(ajm): Use this in PushSincResampler rather than reconstructing
+  // SincResampler.  We would also need a way to update |request_frames_|.
  void SetRatio(double io_sample_rate_ratio);

  float* get_kernel_for_testing() { return kernel_storage_.get(); }
@@ -96,8 +96,14 @@ class SincResampler {
  FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, Convolve);
  FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, ConvolveBenchmark);

-  void Initialize();
  void InitializeKernel();
+  void UpdateRegions(bool second_load);
+
+  // Selects runtime specific CPU features like SSE.  Must be called before
+  // using SincResampler.
+  // TODO(ajm): Currently managed by the class internally. See the note with
+  // |convolve_proc_| below.
+  void InitializeCPUSpecificFeatures();

  // Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are
  // linearly interpolated using |kernel_interpolation_factor|.  On x86, the
@@ -128,11 +134,14 @@ class SincResampler {
  // Source of data for resampling.
  SincResamplerCallback* read_cb_;

-  // See kDefaultBlockSize.
+  // The size (in samples) to request from each |read_cb_| execution.
+  const int request_frames_;
+
+  // The number of source frames processed per pass.
  int block_size_;

-  // See kDefaultBufferSize.
-  int buffer_size_;
+  // The size (in samples) of the internal buffer used by the resampler.
+  const int input_buffer_size_;

  // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
  // The kernel offsets are sub-sample shifts of a windowed sinc shifted from
@@ -145,21 +154,22 @@ class SincResampler {
  scoped_ptr_malloc<float, AlignedFree> input_buffer_;

  // Stores the runtime selection of which Convolve function to use.
-#if (defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__)) ||  \
-    (defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON))
+  // TODO(ajm): Move to using a global static which must only be initialized
+  // once by the user. We're not doing this initially, because we don't have
+  // e.g. a LazyInstance helper in webrtc.
+#if defined(WEBRTC_RESAMPLER_CPU_DETECTION)
  typedef float (*ConvolveProc)(const float*, const float*, const float*,
                                double);
-  const ConvolveProc convolve_proc_;
+  ConvolveProc convolve_proc_;
 #endif

  // Pointers to the various regions inside |input_buffer_|.  See the diagram at
  // the top of the .cc file for more information.
-  float* const r0_;
+  float* r0_;
  float* const r1_;
  float* const r2_;
-  float* const r3_;
-  float* const r4_;
-  float* const r5_;
+  float* r3_;
+  float* r4_;

  DISALLOW_COPY_AND_ASSIGN(SincResampler);
 };
--- a/webrtc/common_audio/resampler/sinc_resampler_unittest.cc
+++ b/webrtc/common_audio/resampler/sinc_resampler_unittest.cc
@@ -36,18 +36,18 @@ static const double kKernelInterpolationFactor = 0.5;
 // Helper class to ensure ChunkedResample() functions properly.
 class MockSource : public SincResamplerCallback {
 public:
-  MOCK_METHOD2(Run, void(float* destination, int frames));
+  MOCK_METHOD2(Run, void(int frames, float* destination));
 };

 ACTION(ClearBuffer) {
-  memset(arg0, 0, arg1 * sizeof(float));
+  memset(arg1, 0, arg0 * sizeof(float));
 }

 ACTION(FillBuffer) {
  // Value chosen arbitrarily such that SincResampler resamples it to something
  // easily representable on all platforms; e.g., using kSampleRateRatio this
  // becomes 1.81219.
-  memset(arg0, 64, arg1 * sizeof(float));
+  memset(arg1, 64, arg0 * sizeof(float));
 }

 // Test requesting multiples of ChunkSize() frames results in the proper number
@@ -57,7 +57,8 @@ TEST(SincResamplerTest, ChunkedResample) {

  // Choose a high ratio of input to output samples which will result in quick
  // exhaustion of SincResampler's internal buffers.
-  SincResampler resampler(kSampleRateRatio, &mock_source);
+  SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
+                          &mock_source);

  static const int kChunks = 2;
  int max_chunk_size = resampler.ChunkSize() * kChunks;
@@ -66,25 +67,26 @@ TEST(SincResamplerTest, ChunkedResample) {
  // Verify requesting ChunkSize() frames causes a single callback.
  EXPECT_CALL(mock_source, Run(_, _))
      .Times(1).WillOnce(ClearBuffer());
-  resampler.Resample(resampled_destination.get(), resampler.ChunkSize());
+  resampler.Resample(resampler.ChunkSize(), resampled_destination.get());

  // Verify requesting kChunks * ChunkSize() frames causes kChunks callbacks.
  testing::Mock::VerifyAndClear(&mock_source);
  EXPECT_CALL(mock_source, Run(_, _))
      .Times(kChunks).WillRepeatedly(ClearBuffer());
-  resampler.Resample(resampled_destination.get(), max_chunk_size);
+  resampler.Resample(max_chunk_size, resampled_destination.get());
 }

 // Test flush resets the internal state properly.
 TEST(SincResamplerTest, Flush) {
  MockSource mock_source;
-  SincResampler resampler(kSampleRateRatio, &mock_source);
+  SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
+                          &mock_source);
  scoped_array<float> resampled_destination(new float[resampler.ChunkSize()]);

  // Fill the resampler with junk data.
  EXPECT_CALL(mock_source, Run(_, _))
      .Times(1).WillOnce(FillBuffer());
-  resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2);
+  resampler.Resample(resampler.ChunkSize() / 2, resampled_destination.get());
  ASSERT_NE(resampled_destination[0], 0);

  // Flush and request more data, which should all be zeros now.
@@ -92,11 +94,25 @@ TEST(SincResamplerTest, Flush) {
  testing::Mock::VerifyAndClear(&mock_source);
  EXPECT_CALL(mock_source, Run(_, _))
      .Times(1).WillOnce(ClearBuffer());
-  resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2);
+  resampler.Resample(resampler.ChunkSize() / 2, resampled_destination.get());
  for (int i = 0; i < resampler.ChunkSize() / 2; ++i)
    ASSERT_FLOAT_EQ(resampled_destination[i], 0);
 }

+// Benchmark measuring the time taken by repeated SetRatio() calls.
+TEST(SincResamplerTest, DISABLED_SetRatioBench) {
+  MockSource mock_source;
+  SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
+                          &mock_source);
+
+  TickTime start = TickTime::Now();
+  for (int i = 1; i < 10000; ++i)
+    resampler.SetRatio(1.0 / i);
+  double total_time_c_us = (TickTime::Now() - start).Microseconds();
+  printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000);
+}
+
+
 // Define platform independent function name for Convolve* tests.
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 #define CONVOLVE_FUNC Convolve_SSE
@@ -117,7 +133,8 @@ TEST(SincResamplerTest, Convolve) {

  // Initialize a dummy resampler.
  MockSource mock_source;
-  SincResampler resampler(kSampleRateRatio, &mock_source);
+  SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
+                          &mock_source);

  // The optimized Convolve methods are slightly more precise than Convolve_C(),
  // so comparison must be done using an epsilon.
@@ -150,7 +167,8 @@ TEST(SincResamplerTest, Convolve) {
 TEST(SincResamplerTest, ConvolveBenchmark) {
  // Initialize a dummy resampler.
  MockSource mock_source;
-  SincResampler resampler(kSampleRateRatio, &mock_source);
+  SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
+                          &mock_source);

  // Retrieve benchmark iterations from command line.
  // TODO(ajm): Reintroduce this as a command line option.
@@ -243,9 +261,8 @@ TEST_P(SincResamplerTest, Resample) {
      input_rate_, input_samples, input_nyquist_freq, 0);

  const double io_ratio = input_rate_ / static_cast<double>(output_rate_);
-  SincResampler resampler(
-      io_ratio,
-      &resampler_source);
+  SincResampler resampler(io_ratio, SincResampler::kDefaultRequestSize,
+                          &resampler_source);

  // Force an update to the sample rate ratio to ensure dyanmic sample rate
  // changes are working correctly.
@@ -265,12 +282,12 @@ TEST_P(SincResamplerTest, Resample) {
  scoped_array<float> pure_destination(new float[output_samples]);

  // Generate resampled signal.
-  resampler.Resample(resampled_destination.get(), output_samples);
+  resampler.Resample(output_samples, resampled_destination.get());

  // Generate pure signal.
  SinusoidalLinearChirpSource pure_source(
      output_rate_, output_samples, input_nyquist_freq, 0);
-  pure_source.Run(pure_destination.get(), output_samples);
+  pure_source.Run(output_samples, pure_destination.get());

  // Range of the Nyquist frequency (0.5 * min(input rate, output_rate)) which
  // we refer to as low and high.
--- a/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.cc
+++ b/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.cc
@@ -29,7 +29,7 @@ SinusoidalLinearChirpSource::SinusoidalLinearChirpSource(int sample_rate,
  k_ = (max_frequency_ - kMinFrequency) / duration;
 }

-void SinusoidalLinearChirpSource::Run(float* destination, int frames) {
+void SinusoidalLinearChirpSource::Run(int frames, float* destination) {
  for (int i = 0; i < frames; ++i, ++current_index_) {
    // Filter out frequencies higher than Nyquist.
    if (Frequency(current_index_) > 0.5 * sample_rate_) {
--- a/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.h
+++ b/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.h
@@ -31,7 +31,7 @@ class SinusoidalLinearChirpSource : public SincResamplerCallback {

  virtual ~SinusoidalLinearChirpSource() {}

-  virtual void Run(float* destination, int frames);
+  virtual void Run(int frames, float* destination);

  double Frequency(int position);