From b86fbaf1d41db539205ec671ff399a3a3aa50734 Mon Sep 17 00:00:00 2001 From: "andrew@webrtc.org" Date: Thu, 25 Jul 2013 22:04:30 +0000 Subject: [PATCH] Downstream latest Chromium SincResampler changes. Replace the BlockSize() workaround we were using previously to support the push wrapper with the upstream request_frames interface. This requires a bit of a trick to ensure we don't add more delay than necessary. On the first pass we use a dummy Resample() call in order to prime the buffer such that all later calls only require a single input request through Run(). Notably, this brings in an optimized loop condition, improving performance by ~2% - 3% on tested platforms and avoids a 20% performance hit with clang. This addresses issue2041. Only negligible changes to the PushSincResamplerTest SNR thresholds, due to a fractional sample adjustment in output delay. This still retains the per-instance CPU detection, as webrtc lacks a LazyInstance helper for static initialization. Ideally, we would adopt SetRatio() in PushSincResampler's InitializeIfNeeded() for on-the-fly changes, but this will require a way to update request_frames. The diff against Chromium upstream is available here: https://codereview.chromium.org/19470003 BUG=2041 TESTED=unit tests, voe_cmd_test in loopback running through all codecs with 44.1 kHz and 48 kHz device formats using a stereo mic. 
R=dalecurtis@chromium.org Review URL: https://webrtc-codereview.appspot.com/1838004 git-svn-id: http://webrtc.googlecode.com/svn/trunk@4406 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../common_audio/resampler/push_resampler.cc | 9 +- .../resampler/push_sinc_resampler.cc | 69 ++-- .../resampler/push_sinc_resampler.h | 24 +- .../resampler/push_sinc_resampler_unittest.cc | 33 +- .../common_audio/resampler/sinc_resampler.cc | 299 ++++++++++-------- .../common_audio/resampler/sinc_resampler.h | 76 +++-- .../resampler/sinc_resampler_unittest.cc | 49 ++- .../sinusoidal_linear_chirp_source.cc | 2 +- .../sinusoidal_linear_chirp_source.h | 2 +- 9 files changed, 323 insertions(+), 240 deletions(-) diff --git a/webrtc/common_audio/resampler/push_resampler.cc b/webrtc/common_audio/resampler/push_resampler.cc index 8fdf6386c..05c6c617e 100644 --- a/webrtc/common_audio/resampler/push_resampler.cc +++ b/webrtc/common_audio/resampler/push_resampler.cc @@ -38,15 +38,13 @@ int PushResampler::InitializeIfNeeded(int src_sample_rate_hz, int num_channels) { if (src_sample_rate_hz == src_sample_rate_hz_ && dst_sample_rate_hz == dst_sample_rate_hz_ && - num_channels == num_channels_) { + num_channels == num_channels_) // No-op if settings haven't changed. 
return 0; - } if (src_sample_rate_hz <= 0 || dst_sample_rate_hz <= 0 || - num_channels <= 0 || num_channels > 2) { + num_channels <= 0 || num_channels > 2) return -1; - } src_sample_rate_hz_ = src_sample_rate_hz; dst_sample_rate_hz_ = dst_sample_rate_hz; @@ -72,9 +70,8 @@ int PushResampler::Resample(const int16_t* src, int src_length, int16_t* dst, int dst_capacity) { const int src_size_10ms = src_sample_rate_hz_ * num_channels_ / 100; const int dst_size_10ms = dst_sample_rate_hz_ * num_channels_ / 100; - if (src_length != src_size_10ms || dst_capacity < dst_size_10ms) { + if (src_length != src_size_10ms || dst_capacity < dst_size_10ms) return -1; - } if (src_sample_rate_hz_ == dst_sample_rate_hz_) { // The old resampler provides this memcpy facility in the case of matching diff --git a/webrtc/common_audio/resampler/push_sinc_resampler.cc b/webrtc/common_audio/resampler/push_sinc_resampler.cc index 50109a9f3..224d75781 100644 --- a/webrtc/common_audio/resampler/push_sinc_resampler.cc +++ b/webrtc/common_audio/resampler/push_sinc_resampler.cc @@ -11,20 +11,22 @@ #include "webrtc/common_audio/resampler/push_sinc_resampler.h" #include - +#include #include namespace webrtc { -PushSincResampler::PushSincResampler(int src_block_size, - int dst_block_size) +PushSincResampler::PushSincResampler(int source_frames, + int destination_frames) : resampler_(NULL), float_buffer_(NULL), source_ptr_(NULL), - dst_size_(dst_block_size) { - resampler_.reset(new SincResampler(src_block_size * 1.0 / dst_block_size, - this, src_block_size)); - float_buffer_.reset(new float[dst_block_size]); + destination_frames_(destination_frames), + first_pass_(true), + source_available_(0) { + resampler_.reset(new SincResampler(source_frames * 1.0 / destination_frames, + source_frames, this)); + float_buffer_.reset(new float[destination_frames]); } PushSincResampler::~PushSincResampler() { @@ -34,32 +36,53 @@ int PushSincResampler::Resample(const int16_t* source, int source_length, int16_t* 
destination, int destination_capacity) { - assert(source_length == resampler_->BlockSize()); - assert(destination_capacity >= dst_size_); + assert(source_length == resampler_->request_frames()); + assert(destination_capacity >= destination_frames_); // Cache the source pointer. Calling Resample() will immediately trigger // the Run() callback whereupon we provide the cached value. source_ptr_ = source; - resampler_->Resample(float_buffer_.get(), dst_size_); - for (int i = 0; i < dst_size_; ++i) { + source_available_ = source_length; + + // On the first pass, we call Resample() twice. During the first call, we + // provide dummy input and discard the output. This is done to prime the + // SincResampler buffer with the correct delay (half the kernel size), thereby + // ensuring that all later Resample() calls will only result in one input + // request through Run(). + // + // If this wasn't done, SincResampler would call Run() twice on the first + // pass, and we'd have to introduce an entire |source_frames| of delay, rather + // than the minimum half kernel. + // + // It works out that ChunkSize() is exactly the amount of output we need to + // request in order to prime the buffer with a single Run() request for + // |source_frames|. + if (first_pass_) + resampler_->Resample(resampler_->ChunkSize(), float_buffer_.get()); + + resampler_->Resample(destination_frames_, float_buffer_.get()); + for (int i = 0; i < destination_frames_; ++i) { float clipped = std::max(std::min(float_buffer_[i], 32767.0f), -32768.0f); destination[i] = static_cast(std::floor(clipped + 0.5)); } source_ptr_ = NULL; - return dst_size_; + return destination_frames_; } -void PushSincResampler::Run(float* destination, int frames) { +void PushSincResampler::Run(int frames, float* destination) { assert(source_ptr_ != NULL); - assert(frames >= resampler_->BlockSize()); - // We will have exactly |BlockSize| number of source samples available. 
If - // the resampler asks for more, zero pad the beginning. This will only happen - // on the first call while priming the buffer. - int i = 0; - for (; i < frames - resampler_->BlockSize(); ++i) { - destination[i] = 0; - } - for (int j = 0; i < frames; ++i, ++j) { - destination[i] = static_cast(source_ptr_[j]); + // Ensure we are only asked for the available samples. This would fail if + // Run() was triggered more than once per Resample() call. + assert(source_available_ == frames); + + if (first_pass_) { + // Provide dummy input on the first pass, the output of which will be + // discarded, as described in Resample(). + memset(destination, 0, frames * sizeof(float)); + first_pass_ = false; + } else { + for (int i = 0; i < frames; ++i) + destination[i] = static_cast(source_ptr_[i]); + source_available_ -= frames; } } diff --git a/webrtc/common_audio/resampler/push_sinc_resampler.h b/webrtc/common_audio/resampler/push_sinc_resampler.h index 05862ffb4..ae0f05865 100644 --- a/webrtc/common_audio/resampler/push_sinc_resampler.h +++ b/webrtc/common_audio/resampler/push_sinc_resampler.h @@ -25,25 +25,33 @@ class PushSincResampler : public SincResamplerCallback { // Provide the size of the source and destination blocks in samples. These // must correspond to the same time duration (typically 10 ms) as the sample // ratio is inferred from them. - PushSincResampler(int src_block_size, int dst_block_size); + PushSincResampler(int source_frames, int destination_frames); virtual ~PushSincResampler(); - // Perform the resampling. |source_length| must always equal the - // |src_block_size| provided at construction. |destination_capacity| must be - // at least as large as |dst_block_size|. Returns the number of samples + // Perform the resampling. |source_frames| must always equal the + // |source_frames| provided at construction. |destination_capacity| must be + // at least as large as |destination_frames|. 
Returns the number of samples // provided in destination (for convenience, since this will always be equal - // to |dst_block_size|). - int Resample(const int16_t* source, int source_length, + // to |destination_frames|). + int Resample(const int16_t* source, int source_frames, int16_t* destination, int destination_capacity); // Implements SincResamplerCallback. - virtual void Run(float* destination, int frames); + virtual void Run(int frames, float* destination); + + SincResampler* get_resampler_for_testing() { return resampler_.get(); } private: scoped_ptr resampler_; scoped_array float_buffer_; const int16_t* source_ptr_; - const int dst_size_; + const int destination_frames_; + + // True on the first call to Resample(), to prime the SincResampler buffer. + bool first_pass_; + + // Used to assert we are only requested for as much data as is available. + int source_available_; DISALLOW_COPY_AND_ASSIGN(PushSincResampler); }; diff --git a/webrtc/common_audio/resampler/push_sinc_resampler_unittest.cc b/webrtc/common_audio/resampler/push_sinc_resampler_unittest.cc index 0806107d3..25b7bee2c 100644 --- a/webrtc/common_audio/resampler/push_sinc_resampler_unittest.cc +++ b/webrtc/common_audio/resampler/push_sinc_resampler_unittest.cc @@ -67,10 +67,19 @@ TEST_P(PushSincResamplerTest, Resample) { scoped_array source_int(new int16_t[input_block_size]); scoped_array destination_int(new int16_t[output_block_size]); + // The sinc resampler has an implicit delay of approximately half the kernel + // size at the input sample rate. By moving to a push model, this delay + // becomes explicit and is managed by zero-stuffing in PushSincResampler. We + // deal with it in the test by delaying the "pure" source to match. It must be + // checked before the first call to Resample(), because ChunkSize() will + // change afterwards. + const int output_delay_samples = output_block_size - + resampler.get_resampler_for_testing()->ChunkSize(); + // Generate resampled signal. 
// With the PushSincResampler, we produce the signal block-by-10ms-block // rather than in a single pass, to exercise how it will be used in WebRTC. - resampler_source.Run(source.get(), input_samples); + resampler_source.Run(input_samples, source.get()); for (int i = 0; i < kNumBlocks; ++i) { for (int j = 0; j < input_block_size; ++j) { source_int[j] = static_cast(std::floor(32767 * @@ -86,17 +95,9 @@ TEST_P(PushSincResamplerTest, Resample) { } // Generate pure signal. - // The sinc resampler has an implicit delay of half the kernel size (32) at - // the input sample rate. By moving to a push model, this delay becomes - // explicit and is managed by zero-stuffing in PushSincResampler. This delay - // can be a fractional sample amount, so we deal with it in the test by - // delaying the "pure" source to match. - static const int kInputKernelDelaySamples = 16; - double output_delay_samples = static_cast(output_rate_) - / input_rate_ * kInputKernelDelaySamples; SinusoidalLinearChirpSource pure_source( output_rate_, output_samples, input_nyquist_freq, output_delay_samples); - pure_source.Run(pure_destination.get(), output_samples); + pure_source.Run(output_samples, pure_destination.get()); // Range of the Nyquist frequency (0.5 * min(input rate, output_rate)) which // we refer to as low and high. 
@@ -216,17 +217,17 @@ INSTANTIATE_TEST_CASE_P( std::tr1::make_tuple(8000, 16000, kResamplingRMSError, -70.30), std::tr1::make_tuple(16000, 16000, kResamplingRMSError, -75.51), std::tr1::make_tuple(32000, 16000, -18.48, -28.59), - std::tr1::make_tuple(44100, 16000, -19.59, -19.77), - std::tr1::make_tuple(48000, 16000, -20.01, -18.11), - std::tr1::make_tuple(96000, 16000, -20.95, -10.99), + std::tr1::make_tuple(44100, 16000, -19.30, -19.67), + std::tr1::make_tuple(48000, 16000, -19.81, -18.11), + std::tr1::make_tuple(96000, 16000, -20.95, -10.96), // To 32 kHz std::tr1::make_tuple(8000, 32000, kResamplingRMSError, -70.30), std::tr1::make_tuple(16000, 32000, kResamplingRMSError, -75.51), std::tr1::make_tuple(32000, 32000, kResamplingRMSError, -75.56), - std::tr1::make_tuple(44100, 32000, -16.52, -51.10), - std::tr1::make_tuple(48000, 32000, -16.90, -44.17), - std::tr1::make_tuple(96000, 32000, -19.80, -18.05), + std::tr1::make_tuple(44100, 32000, -16.44, -51.10), + std::tr1::make_tuple(48000, 32000, -16.90, -44.03), + std::tr1::make_tuple(96000, 32000, -19.61, -18.04), std::tr1::make_tuple(192000, 32000, -21.02, -10.94))); } // namespace webrtc diff --git a/webrtc/common_audio/resampler/sinc_resampler.cc b/webrtc/common_audio/resampler/sinc_resampler.cc index 2e2ac453a..b66ed345a 100644 --- a/webrtc/common_audio/resampler/sinc_resampler.cc +++ b/webrtc/common_audio/resampler/sinc_resampler.cc @@ -11,31 +11,73 @@ // Modified from the Chromium original: // src/media/base/sinc_resampler.cc -// Input buffer layout, dividing the total buffer into regions (r0_ - r5_): +// Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ +// and r4_ will move after the first load): // // |----------------|-----------------------------------------|----------------| // -// kBlockSize + kKernelSize / 2 +// request_frames_ // <---------------------------------------------------------> -// r0_ +// r0_ (during first load) // // kKernelSize / 2 kKernelSize / 2 
kKernelSize / 2 kKernelSize / 2 // <---------------> <---------------> <---------------> <---------------> // r1_ r2_ r3_ r4_ // -// kBlockSize -// <---------------------------------------> -// r5_ +// block_size_ == r4_ - r2_ +// <---------------------------------------> +// +// request_frames_ +// <------------------ ... -----------------> +// r0_ (during second load) +// +// On the second request r0_ slides to the right by kKernelSize / 2 and r3_, r4_ +// and block_size_ are reinitialized via step (3) in the algorithm below. +// +// These new regions remain constant until a Flush() occurs. While complicated, +// this allows us to reduce jitter by always requesting the same amount from the +// provided callback. // // The algorithm: // -// 1) Consume input frames into r0_ (r1_ is zero-initialized). -// 2) Position kernel centered at start of r0_ (r2_) and generate output frames -// until kernel is centered at start of r4_ or we've finished generating all -// the output frames. -// 3) Copy r3_ to r1_ and r4_ to r2_. -// 4) Consume input frames into r5_ (zero-pad if we run out of input). -// 5) Goto (2) until all of input is consumed. +// 1) Allocate input_buffer of size: request_frames_ + kKernelSize; this ensures +// there's enough room to read request_frames_ from the callback into region +// r0_ (which will move between the first and subsequent passes). +// +// 2) Let r1_, r2_ each represent half the kernel centered around r0_: +// +// r0_ = input_buffer_ + kKernelSize / 2 +// r1_ = input_buffer_ +// r2_ = r0_ +// +// r0_ is always request_frames_ in size. r1_, r2_ are kKernelSize / 2 in +// size. r1_ must be zero initialized to avoid convolution with garbage (see +// step (5) for why). 
+// +// 3) Let r3_, r4_ each represent half the kernel right aligned with the end of +// r0_ and choose block_size_ as the distance in frames between r4_ and r2_: +// +// r3_ = r0_ + request_frames_ - kKernelSize +// r4_ = r0_ + request_frames_ - kKernelSize / 2 +// block_size_ = r4_ - r2_ = request_frames_ - kKernelSize / 2 +// +// 4) Consume request_frames_ frames into r0_. +// +// 5) Position kernel centered at start of r2_ and generate output frames until +// the kernel is centered at the start of r4_ or we've finished generating +// all the output frames. +// +// 6) Wrap left over data from the r3_ to r1_ and r4_ to r2_. +// +// 7) If we're on the second load, in order to avoid overwriting the frames we +// just wrapped from r4_ we need to slide r0_ to the right by the size of +// r4_, which is kKernelSize / 2: +// +// r0_ = r0_ + kKernelSize / 2 = input_buffer_ + kKernelSize +// +// r3_, r4_, and block_size_ then need to be reinitialized, so goto (3). +// +// 8) Else, if we're not on the second load, goto (4). // // Note: we're glossing over how the sub-sample handling works with // |virtual_source_idx_|, etc. @@ -70,49 +112,49 @@ static double SincScaleFactor(double io_ratio) { return sinc_scale_factor; } -SincResampler::SincResampler(double io_sample_rate_ratio, - SincResamplerCallback* read_cb, - int block_size) - : io_sample_rate_ratio_(io_sample_rate_ratio), - virtual_source_idx_(0), - buffer_primed_(false), - read_cb_(read_cb), - block_size_(block_size), - buffer_size_(block_size_ + kKernelSize), - // Create input buffers with a 16-byte alignment for SSE optimizations. 
- kernel_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), - kernel_pre_sinc_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), - kernel_window_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), - input_buffer_(static_cast( - AlignedMalloc(sizeof(float) * buffer_size_, 16))), -#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__) - convolve_proc_(WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C), -#elif defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON) - convolve_proc_(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ? - Convolve_NEON : Convolve_C), -#endif - // Setup various region pointers in the buffer (see diagram above). - r0_(input_buffer_.get() + kKernelSize / 2), - r1_(input_buffer_.get()), - r2_(r0_), - r3_(r0_ + block_size_ - kKernelSize / 2), - r4_(r0_ + block_size_), - r5_(r0_ + kKernelSize / 2) { - Initialize(); - InitializeKernel(); +// If we know the minimum architecture at compile time, avoid CPU detection. +// iOS lies about its architecture, so we also need to exclude it here. +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WEBRTC_IOS) +#if defined(__SSE__) +#define CONVOLVE_FUNC Convolve_SSE +void SincResampler::InitializeCPUSpecificFeatures() {} +#else +// X86 CPU detection required. Function will be set by +// InitializeCPUSpecificFeatures(). +// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed. +#define CONVOLVE_FUNC convolve_proc_ + +void SincResampler::InitializeCPUSpecificFeatures() { + convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C; } +#endif +#elif defined(WEBRTC_ARCH_ARM_V7) +#if defined(WEBRTC_ARCH_ARM_NEON) +#define CONVOLVE_FUNC Convolve_NEON +void SincResampler::InitializeCPUSpecificFeatures() {} +#else +// NEON CPU detection required. Function will be set by +// InitializeCPUSpecificFeatures(). 
+#define CONVOLVE_FUNC convolve_proc_ + +void SincResampler::InitializeCPUSpecificFeatures() { + convolve_proc_ = WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ? + Convolve_NEON : Convolve_C; +} +#endif +#else +// Unknown architecture. +#define CONVOLVE_FUNC Convolve_C +void SincResampler::InitializeCPUSpecificFeatures() {} +#endif SincResampler::SincResampler(double io_sample_rate_ratio, + int request_frames, SincResamplerCallback* read_cb) : io_sample_rate_ratio_(io_sample_rate_ratio), - virtual_source_idx_(0), - buffer_primed_(false), read_cb_(read_cb), - block_size_(kDefaultBlockSize), - buffer_size_(kDefaultBufferSize), + request_frames_(request_frames), + input_buffer_size_(request_frames_ + kKernelSize), // Create input buffers with a 16-byte alignment for SSE optimizations. kernel_storage_(static_cast( AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), @@ -121,45 +163,19 @@ SincResampler::SincResampler(double io_sample_rate_ratio, kernel_window_storage_(static_cast( AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), input_buffer_(static_cast( - AlignedMalloc(sizeof(float) * buffer_size_, 16))), -#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__) - convolve_proc_(WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C), -#elif defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON) - convolve_proc_(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ? - Convolve_NEON : Convolve_C), + AlignedMalloc(sizeof(float) * input_buffer_size_, 16))), +#if defined(WEBRTC_RESAMPLER_CPU_DETECTION) + convolve_proc_(NULL), #endif - // Setup various region pointers in the buffer (see diagram above). 
- r0_(input_buffer_.get() + kKernelSize / 2), r1_(input_buffer_.get()), - r2_(r0_), - r3_(r0_ + block_size_ - kKernelSize / 2), - r4_(r0_ + block_size_), - r5_(r0_ + kKernelSize / 2) { - Initialize(); - InitializeKernel(); -} - -SincResampler::~SincResampler() {} - -void SincResampler::Initialize() { - // Ensure kKernelSize is a multiple of 32 for easy SSE optimizations; causes - // r0_ and r5_ (used for input) to always be 16-byte aligned by virtue of - // input_buffer_ being 16-byte aligned. - COMPILE_ASSERT(kKernelSize % 32 == 0); + r2_(input_buffer_.get() + kKernelSize / 2) { +#if defined(WEBRTC_RESAMPLER_CPU_DETECTION) + InitializeCPUSpecificFeatures(); + assert(convolve_proc_); +#endif + assert(request_frames_ > 0); + Flush(); assert(block_size_ > kKernelSize); - // Basic sanity checks to ensure buffer regions are laid out correctly: - // r0_ and r2_ should always be the same position. - assert(r0_ == r2_); - // r1_ at the beginning of the buffer. - assert(r1_ == input_buffer_.get()); - // r1_ left of r2_, r2_ left of r5_ and r1_, r2_ size correct. - assert(r2_ - r1_ == r5_ - r2_); - // r3_ left of r4_, r5_ left of r0_ and r3_ size correct. - assert(r4_ - r3_ == r5_ - r0_); - // r3_, r4_ size correct and r4_ at the end of the buffer. - assert(r4_ + (r4_ - r3_) == r1_ + buffer_size_); - // r5_ size correct and at the end of the buffer. - assert(r5_ + block_size_ == r1_ + buffer_size_); memset(kernel_storage_.get(), 0, sizeof(*kernel_storage_.get()) * kKernelStorageSize); @@ -167,7 +183,26 @@ void SincResampler::Initialize() { sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize); memset(kernel_window_storage_.get(), 0, sizeof(*kernel_window_storage_.get()) * kKernelStorageSize); - memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * buffer_size_); + + InitializeKernel(); +} + +SincResampler::~SincResampler() {} + +void SincResampler::UpdateRegions(bool second_load) { + // Setup various region pointers in the buffer (see diagram above). 
If we're + // on the second load we need to slide r0_ to the right by kKernelSize / 2. + r0_ = input_buffer_.get() + (second_load ? kKernelSize : kKernelSize / 2); + r3_ = r0_ + request_frames_ - kKernelSize; + r4_ = r0_ + request_frames_ - kKernelSize / 2; + block_size_ = r4_ - r2_; + + // r1_ at the beginning of the buffer. + assert(r1_ == input_buffer_.get()); + // r1_ left of r2_, r3_ left of r4_ and size correct. + assert(r2_ - r1_ == r4_ - r3_); + // r2_ left of r3_. + assert(r2_ < r3_); } void SincResampler::InitializeKernel() { @@ -234,67 +269,59 @@ void SincResampler::SetRatio(double io_sample_rate_ratio) { } } -// If we know the minimum architecture avoid function hopping for CPU detection. -#if defined(WEBRTC_ARCH_X86_FAMILY) -#if defined(__SSE__) -#define CONVOLVE_FUNC Convolve_SSE -#else -// X86 CPU detection required. |convolve_proc_| will be set upon construction. -// TODO(dalecurtis): Once Chrome moves to a SSE baseline this can be removed. -#define CONVOLVE_FUNC convolve_proc_ -#endif -#elif defined(WEBRTC_ARCH_ARM_V7) -#if defined(WEBRTC_ARCH_ARM_NEON) -#define CONVOLVE_FUNC Convolve_NEON -#else -// NEON CPU detection required. |convolve_proc_| will be set upon construction. -#define CONVOLVE_FUNC convolve_proc_ -#endif -#else -// Unknown architecture. -#define CONVOLVE_FUNC Convolve_C -#endif - -void SincResampler::Resample(float* destination, int frames) { +void SincResampler::Resample(int frames, float* destination) { int remaining_frames = frames; // Step (1) -- Prime the input buffer at the start of the input stream. - if (!buffer_primed_) { - read_cb_->Run(r0_, block_size_ + kKernelSize / 2); + if (!buffer_primed_ && remaining_frames) { + read_cb_->Run(request_frames_, r0_); buffer_primed_ = true; } - // Step (2) -- Resample! + // Step (2) -- Resample! const what we can outside of the loop for speed. It + // actually has an impact on ARM performance. See inner loop comment below. 
+ const double current_io_ratio = io_sample_rate_ratio_; + const float* const kernel_ptr = kernel_storage_.get(); while (remaining_frames) { - while (virtual_source_idx_ < block_size_) { + // |i| may be negative if the last Resample() call ended on an iteration + // that put |virtual_source_idx_| over the limit. + // + // Note: The loop construct here can severely impact performance on ARM + // or when built with clang. See https://codereview.chromium.org/18566009/ + for (int i = ceil((block_size_ - virtual_source_idx_) / current_io_ratio); + i > 0; --i) { + assert(virtual_source_idx_ < block_size_); + // |virtual_source_idx_| lies in between two kernel offsets so figure out // what they are. - int source_idx = static_cast(virtual_source_idx_); - double subsample_remainder = virtual_source_idx_ - source_idx; + const int source_idx = virtual_source_idx_; + const double subsample_remainder = virtual_source_idx_ - source_idx; - double virtual_offset_idx = subsample_remainder * kKernelOffsetCount; - int offset_idx = static_cast(virtual_offset_idx); + const double virtual_offset_idx = + subsample_remainder * kKernelOffsetCount; + const int offset_idx = virtual_offset_idx; // We'll compute "convolutions" for the two kernels which straddle // |virtual_source_idx_|. - float* k1 = kernel_storage_.get() + offset_idx * kKernelSize; - float* k2 = k1 + kKernelSize; + const float* const k1 = kernel_ptr + offset_idx * kKernelSize; + const float* const k2 = k1 + kKernelSize; // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be // true so long as kKernelSize is a multiple of 16. - assert((reinterpret_cast(k1) & 0x0F) == 0u); - assert((reinterpret_cast(k2) & 0x0F) == 0u); + assert(0u == (reinterpret_cast(k1) & 0x0F)); + assert(0u == (reinterpret_cast(k2) & 0x0F)); // Initialize input pointer based on quantized |virtual_source_idx_|. 
- float* input_ptr = r1_ + source_idx; + const float* const input_ptr = r1_ + source_idx; // Figure out how much to weight each kernel's "convolution". - double kernel_interpolation_factor = virtual_offset_idx - offset_idx; + const double kernel_interpolation_factor = + virtual_offset_idx - offset_idx; *destination++ = CONVOLVE_FUNC( input_ptr, k1, k2, kernel_interpolation_factor); // Advance the virtual index. - virtual_source_idx_ += io_sample_rate_ratio_; + virtual_source_idx_ += current_io_ratio; if (!--remaining_frames) return; @@ -303,31 +330,31 @@ void SincResampler::Resample(float* destination, int frames) { // Wrap back around to the start. virtual_source_idx_ -= block_size_; - // Step (3) Copy r3_ to r1_ and r4_ to r2_. + // Step (3) -- Copy r3_, r4_ to r1_, r2_. // This wraps the last input frames back to the start of the buffer. - memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); - memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); + memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize); - // Step (4) - // Refresh the buffer with more input. - read_cb_->Run(r5_, block_size_); + // Step (4) -- Reinitialize regions if necessary. + if (r0_ == r2_) + UpdateRegions(true); + + // Step (5) -- Refresh the buffer with more input. 
+ read_cb_->Run(request_frames_, r0_); } } #undef CONVOLVE_FUNC -int SincResampler::ChunkSize() { +int SincResampler::ChunkSize() const { return block_size_ / io_sample_rate_ratio_; } -int SincResampler::BlockSize() { - return block_size_; -} - void SincResampler::Flush() { virtual_source_idx_ = 0; buffer_primed_ = false; - memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * buffer_size_); + memset(input_buffer_.get(), 0, + sizeof(*input_buffer_.get()) * input_buffer_size_); + UpdateRegions(false); } float SincResampler::Convolve_C(const float* input_ptr, const float* k1, diff --git a/webrtc/common_audio/resampler/sinc_resampler.h b/webrtc/common_audio/resampler/sinc_resampler.h index 5a1534354..60abd6128 100644 --- a/webrtc/common_audio/resampler/sinc_resampler.h +++ b/webrtc/common_audio/resampler/sinc_resampler.h @@ -20,6 +20,13 @@ #include "webrtc/test/testsupport/gtest_prod_util.h" #include "webrtc/typedefs.h" +#if (defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WEBRTC_IOS) && \ + !defined(__SSE__)) || \ + (defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON)) +// Convenience define. +#define WEBRTC_RESAMPLER_CPU_DETECTION +#endif + namespace webrtc { // Callback class for providing more data into the resampler. Expects |frames| @@ -28,7 +35,7 @@ namespace webrtc { class SincResamplerCallback { public: virtual ~SincResamplerCallback() {} - virtual void Run(float* destination, int frames) = 0; + virtual void Run(int frames, float* destination) = 0; }; // SincResampler is a high-quality single-channel sample-rate converter. @@ -40,43 +47,36 @@ class SincResampler { // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. kKernelSize = 32, - // The number of destination frames generated per processing pass. Affects - // how often and for how much SincResampler calls back for input. Must be - // greater than kKernelSize. - kDefaultBlockSize = 512, + // Default request size. 
Affects how often and for how much SincResampler + // calls back for input. Must be greater than kKernelSize. + kDefaultRequestSize = 512, // The kernel offset count is used for interpolation and is the number of // sub-sample kernel shifts. Can be adjusted for quality (higher is better) // at the expense of allocating more memory. kKernelOffsetCount = 32, kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), - - // The size (in samples) of the internal buffer used by the resampler. - kDefaultBufferSize = kDefaultBlockSize + kKernelSize, }; // Constructs a SincResampler with the specified |read_cb|, which is used to - // acquire audio data for resampling. |io_sample_rate_ratio| is the ratio of - // input / output sample rates. If desired, the number of destination frames - // generated per processing pass can be specified through |block_size|. + // acquire audio data for resampling. |io_sample_rate_ratio| is the ratio + // of input / output sample rates. |request_frames| controls the size in + // frames of the buffer requested by each |read_cb| call. The value must be + // greater than kKernelSize. Specify kDefaultRequestSize if there are no + // request size constraints. SincResampler(double io_sample_rate_ratio, + int request_frames, SincResamplerCallback* read_cb); - SincResampler(double io_sample_rate_ratio, - SincResamplerCallback* read_cb, - int block_size); virtual ~SincResampler(); // Resample |frames| of data from |read_cb_| into |destination|. - void Resample(float* destination, int frames); + void Resample(int frames, float* destination); // The maximum size in frames that guarantees Resample() will only make a // single call to |read_cb_| for more data. - int ChunkSize(); + int ChunkSize() const; - // The number of source frames requested per processing pass (and equal to - // |block_size| if provided at construction). The first pass will request - // more to prime the buffer. 
- int BlockSize(); + int request_frames() const { return request_frames_; } // Flush all buffered data and reset internal indices. Not thread safe, do // not call while Resample() is in progress. @@ -86,8 +86,8 @@ class SincResampler { // the kernels used for resampling. Not thread safe, do not call while // Resample() is in progress. // - // TODO(ajm): use this in PushSincResampler rather than reconstructing - // SincResampler. + // TODO(ajm): Use this in PushSincResampler rather than reconstructing + // SincResampler. We would also need a way to update |request_frames_|. void SetRatio(double io_sample_rate_ratio); float* get_kernel_for_testing() { return kernel_storage_.get(); } @@ -96,8 +96,14 @@ class SincResampler { FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, Convolve); FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, ConvolveBenchmark); - void Initialize(); void InitializeKernel(); + void UpdateRegions(bool second_load); + + // Selects runtime specific CPU features like SSE. Must be called before + // using SincResampler. + // TODO(ajm): Currently managed by the class internally. See the note with + // |convolve_proc_| below. + void InitializeCPUSpecificFeatures(); // Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are // linearly interpolated using |kernel_interpolation_factor|. On x86, the @@ -128,11 +134,14 @@ class SincResampler { // Source of data for resampling. SincResamplerCallback* read_cb_; - // See kDefaultBlockSize. + // The size (in samples) to request from each |read_cb_| execution. + const int request_frames_; + + // The number of source frames processed per pass. int block_size_; - // See kDefaultBufferSize. - int buffer_size_; + // The size (in samples) of the internal buffer used by the resampler. + const int input_buffer_size_; // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize. 
// The kernel offsets are sub-sample shifts of a windowed sinc shifted from @@ -145,21 +154,22 @@ class SincResampler { scoped_ptr_malloc input_buffer_; // Stores the runtime selection of which Convolve function to use. -#if (defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__)) || \ - (defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON)) + // TODO(ajm): Move to using a global static which must only be initialized + // once by the user. We're not doing this initially, because we don't have + // e.g. a LazyInstance helper in webrtc. +#if defined(WEBRTC_RESAMPLER_CPU_DETECTION) typedef float (*ConvolveProc)(const float*, const float*, const float*, double); - const ConvolveProc convolve_proc_; + ConvolveProc convolve_proc_; #endif // Pointers to the various regions inside |input_buffer_|. See the diagram at // the top of the .cc file for more information. - float* const r0_; + float* r0_; float* const r1_; float* const r2_; - float* const r3_; - float* const r4_; - float* const r5_; + float* r3_; + float* r4_; DISALLOW_COPY_AND_ASSIGN(SincResampler); }; diff --git a/webrtc/common_audio/resampler/sinc_resampler_unittest.cc b/webrtc/common_audio/resampler/sinc_resampler_unittest.cc index 80d32214a..b228f3d41 100644 --- a/webrtc/common_audio/resampler/sinc_resampler_unittest.cc +++ b/webrtc/common_audio/resampler/sinc_resampler_unittest.cc @@ -36,18 +36,18 @@ static const double kKernelInterpolationFactor = 0.5; // Helper class to ensure ChunkedResample() functions properly. class MockSource : public SincResamplerCallback { public: - MOCK_METHOD2(Run, void(float* destination, int frames)); + MOCK_METHOD2(Run, void(int frames, float* destination)); }; ACTION(ClearBuffer) { - memset(arg0, 0, arg1 * sizeof(float)); + memset(arg1, 0, arg0 * sizeof(float)); } ACTION(FillBuffer) { // Value chosen arbitrarily such that SincResampler resamples it to something // easily representable on all platforms; e.g., using kSampleRateRatio this // becomes 1.81219. 
- memset(arg0, 64, arg1 * sizeof(float)); + memset(arg1, 64, arg0 * sizeof(float)); } // Test requesting multiples of ChunkSize() frames results in the proper number @@ -57,7 +57,8 @@ TEST(SincResamplerTest, ChunkedResample) { // Choose a high ratio of input to output samples which will result in quick // exhaustion of SincResampler's internal buffers. - SincResampler resampler(kSampleRateRatio, &mock_source); + SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize, + &mock_source); static const int kChunks = 2; int max_chunk_size = resampler.ChunkSize() * kChunks; @@ -66,25 +67,26 @@ TEST(SincResamplerTest, ChunkedResample) { // Verify requesting ChunkSize() frames causes a single callback. EXPECT_CALL(mock_source, Run(_, _)) .Times(1).WillOnce(ClearBuffer()); - resampler.Resample(resampled_destination.get(), resampler.ChunkSize()); + resampler.Resample(resampler.ChunkSize(), resampled_destination.get()); // Verify requesting kChunks * ChunkSize() frames causes kChunks callbacks. testing::Mock::VerifyAndClear(&mock_source); EXPECT_CALL(mock_source, Run(_, _)) .Times(kChunks).WillRepeatedly(ClearBuffer()); - resampler.Resample(resampled_destination.get(), max_chunk_size); + resampler.Resample(max_chunk_size, resampled_destination.get()); } // Test flush resets the internal state properly. TEST(SincResamplerTest, Flush) { MockSource mock_source; - SincResampler resampler(kSampleRateRatio, &mock_source); + SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize, + &mock_source); scoped_array resampled_destination(new float[resampler.ChunkSize()]); // Fill the resampler with junk data. EXPECT_CALL(mock_source, Run(_, _)) .Times(1).WillOnce(FillBuffer()); - resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2); + resampler.Resample(resampler.ChunkSize() / 2, resampled_destination.get()); ASSERT_NE(resampled_destination[0], 0); // Flush and request more data, which should all be zeros now. 
@@ -92,11 +94,25 @@ TEST(SincResamplerTest, Flush) { testing::Mock::VerifyAndClear(&mock_source); EXPECT_CALL(mock_source, Run(_, _)) .Times(1).WillOnce(ClearBuffer()); - resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2); + resampler.Resample(resampler.ChunkSize() / 2, resampled_destination.get()); for (int i = 0; i < resampler.ChunkSize() / 2; ++i) ASSERT_FLOAT_EQ(resampled_destination[i], 0); } +// Benchmarks SetRatio() by timing repeated calls; disabled by default. +TEST(SincResamplerTest, DISABLED_SetRatioBench) { + MockSource mock_source; + SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize, + &mock_source); + + TickTime start = TickTime::Now(); + for (int i = 1; i < 10000; ++i) + resampler.SetRatio(1.0 / i); + double total_time_c_us = (TickTime::Now() - start).Microseconds(); + printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000); +} + + // Define platform independent function name for Convolve* tests. #if defined(WEBRTC_ARCH_X86_FAMILY) #define CONVOLVE_FUNC Convolve_SSE @@ -117,7 +133,8 @@ TEST(SincResamplerTest, Convolve) { // Initialize a dummy resampler. MockSource mock_source; - SincResampler resampler(kSampleRateRatio, &mock_source); + SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize, + &mock_source); // The optimized Convolve methods are slightly more precise than Convolve_C(), // so comparison must be done using an epsilon. @@ -150,7 +167,8 @@ TEST(SincResamplerTest, Convolve) { TEST(SincResamplerTest, ConvolveBenchmark) { // Initialize a dummy resampler. MockSource mock_source; - SincResampler resampler(kSampleRateRatio, &mock_source); + SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize, + &mock_source); // Retrieve benchmark iterations from command line. // TODO(ajm): Reintroduce this as a command line option. 
@@ -243,9 +261,8 @@ TEST_P(SincResamplerTest, Resample) { input_rate_, input_samples, input_nyquist_freq, 0); const double io_ratio = input_rate_ / static_cast(output_rate_); - SincResampler resampler( - io_ratio, - &resampler_source); + SincResampler resampler(io_ratio, SincResampler::kDefaultRequestSize, + &resampler_source); // Force an update to the sample rate ratio to ensure dyanmic sample rate // changes are working correctly. @@ -265,12 +282,12 @@ TEST_P(SincResamplerTest, Resample) { scoped_array pure_destination(new float[output_samples]); // Generate resampled signal. - resampler.Resample(resampled_destination.get(), output_samples); + resampler.Resample(output_samples, resampled_destination.get()); // Generate pure signal. SinusoidalLinearChirpSource pure_source( output_rate_, output_samples, input_nyquist_freq, 0); - pure_source.Run(pure_destination.get(), output_samples); + pure_source.Run(output_samples, pure_destination.get()); // Range of the Nyquist frequency (0.5 * min(input rate, output_rate)) which // we refer to as low and high. diff --git a/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.cc b/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.cc index d7cb7f130..b7cacfa4d 100644 --- a/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.cc +++ b/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.cc @@ -29,7 +29,7 @@ SinusoidalLinearChirpSource::SinusoidalLinearChirpSource(int sample_rate, k_ = (max_frequency_ - kMinFrequency) / duration; } -void SinusoidalLinearChirpSource::Run(float* destination, int frames) { +void SinusoidalLinearChirpSource::Run(int frames, float* destination) { for (int i = 0; i < frames; ++i, ++current_index_) { // Filter out frequencies higher than Nyquist. 
if (Frequency(current_index_) > 0.5 * sample_rate_) { diff --git a/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.h b/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.h index bfca5d4c8..944634382 100644 --- a/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.h +++ b/webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.h @@ -31,7 +31,7 @@ class SinusoidalLinearChirpSource : public SincResamplerCallback { virtual ~SinusoidalLinearChirpSource() {} - virtual void Run(float* destination, int frames); + virtual void Run(int frames, float* destination); double Frequency(int position);