Downstream latest Chromium SincResampler changes.

Replace the BlockSize() workaround we were using previously to support
the push wrapper with the upstream request_frames interface. This
requires a bit of a trick to ensure we don't add more delay than
necessary. On the first pass we use a dummy Resample() call in order to
prime the buffer such that all later calls only require a single input
request through Run().

Notably, this brings in an optimized loop condition, improving
performance by ~2-3% on tested platforms and avoiding a 20% performance
hit with clang. This addresses issue 2041.

There are only negligible changes to the PushSincResamplerTest SNR
thresholds, due to a fractional sample adjustment in output delay.

This still retains the per-instance CPU detection, as webrtc lacks a
LazyInstance helper for static initialization.

Ideally, we would adopt SetRatio() in PushSincResampler's
InitializeIfNeeded() for on-the-fly changes, but this will require a way
to update request_frames.

The diff against Chromium upstream is available here:
https://codereview.chromium.org/19470003

BUG=2041
TESTED=unit tests, voe_cmd_test in loopback running through all codecs
with 44.1 kHz and 48 kHz device formats using a stereo mic.

R=dalecurtis@chromium.org

Review URL: https://webrtc-codereview.appspot.com/1838004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@4406 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org
2013-07-25 22:04:30 +00:00
parent e691b4f952
commit b86fbaf1d4
9 changed files with 323 additions and 240 deletions

View File

@@ -38,15 +38,13 @@ int PushResampler::InitializeIfNeeded(int src_sample_rate_hz,
int num_channels) {
if (src_sample_rate_hz == src_sample_rate_hz_ &&
dst_sample_rate_hz == dst_sample_rate_hz_ &&
num_channels == num_channels_) {
num_channels == num_channels_)
// No-op if settings haven't changed.
return 0;
}
if (src_sample_rate_hz <= 0 || dst_sample_rate_hz <= 0 ||
num_channels <= 0 || num_channels > 2) {
num_channels <= 0 || num_channels > 2)
return -1;
}
src_sample_rate_hz_ = src_sample_rate_hz;
dst_sample_rate_hz_ = dst_sample_rate_hz;
@@ -72,9 +70,8 @@ int PushResampler::Resample(const int16_t* src, int src_length,
int16_t* dst, int dst_capacity) {
const int src_size_10ms = src_sample_rate_hz_ * num_channels_ / 100;
const int dst_size_10ms = dst_sample_rate_hz_ * num_channels_ / 100;
if (src_length != src_size_10ms || dst_capacity < dst_size_10ms) {
if (src_length != src_size_10ms || dst_capacity < dst_size_10ms)
return -1;
}
if (src_sample_rate_hz_ == dst_sample_rate_hz_) {
// The old resampler provides this memcpy facility in the case of matching

View File

@@ -11,20 +11,22 @@
#include "webrtc/common_audio/resampler/push_sinc_resampler.h"
#include <cmath>
#include <cstring>
#include <algorithm>
namespace webrtc {
PushSincResampler::PushSincResampler(int src_block_size,
int dst_block_size)
PushSincResampler::PushSincResampler(int source_frames,
int destination_frames)
: resampler_(NULL),
float_buffer_(NULL),
source_ptr_(NULL),
dst_size_(dst_block_size) {
resampler_.reset(new SincResampler(src_block_size * 1.0 / dst_block_size,
this, src_block_size));
float_buffer_.reset(new float[dst_block_size]);
destination_frames_(destination_frames),
first_pass_(true),
source_available_(0) {
resampler_.reset(new SincResampler(source_frames * 1.0 / destination_frames,
source_frames, this));
float_buffer_.reset(new float[destination_frames]);
}
PushSincResampler::~PushSincResampler() {
@@ -34,32 +36,53 @@ int PushSincResampler::Resample(const int16_t* source,
int source_length,
int16_t* destination,
int destination_capacity) {
assert(source_length == resampler_->BlockSize());
assert(destination_capacity >= dst_size_);
assert(source_length == resampler_->request_frames());
assert(destination_capacity >= destination_frames_);
// Cache the source pointer. Calling Resample() will immediately trigger
// the Run() callback whereupon we provide the cached value.
source_ptr_ = source;
resampler_->Resample(float_buffer_.get(), dst_size_);
for (int i = 0; i < dst_size_; ++i) {
source_available_ = source_length;
// On the first pass, we call Resample() twice. During the first call, we
// provide dummy input and discard the output. This is done to prime the
// SincResampler buffer with the correct delay (half the kernel size), thereby
// ensuring that all later Resample() calls will only result in one input
// request through Run().
//
// If this wasn't done, SincResampler would call Run() twice on the first
// pass, and we'd have to introduce an entire |source_frames| of delay, rather
// than the minimum half kernel.
//
// It works out that ChunkSize() is exactly the amount of output we need to
// request in order to prime the buffer with a single Run() request for
// |source_frames|.
if (first_pass_)
resampler_->Resample(resampler_->ChunkSize(), float_buffer_.get());
resampler_->Resample(destination_frames_, float_buffer_.get());
for (int i = 0; i < destination_frames_; ++i) {
float clipped = std::max(std::min(float_buffer_[i], 32767.0f), -32768.0f);
destination[i] = static_cast<int16_t>(std::floor(clipped + 0.5));
}
source_ptr_ = NULL;
return dst_size_;
return destination_frames_;
}
void PushSincResampler::Run(float* destination, int frames) {
void PushSincResampler::Run(int frames, float* destination) {
assert(source_ptr_ != NULL);
assert(frames >= resampler_->BlockSize());
// We will have exactly |BlockSize| number of source samples available. If
// the resampler asks for more, zero pad the beginning. This will only happen
// on the first call while priming the buffer.
int i = 0;
for (; i < frames - resampler_->BlockSize(); ++i) {
destination[i] = 0;
}
for (int j = 0; i < frames; ++i, ++j) {
destination[i] = static_cast<float>(source_ptr_[j]);
// Ensure we are only asked for the available samples. This would fail if
// Run() was triggered more than once per Resample() call.
assert(source_available_ == frames);
if (first_pass_) {
// Provide dummy input on the first pass, the output of which will be
// discarded, as described in Resample().
memset(destination, 0, frames * sizeof(float));
first_pass_ = false;
} else {
for (int i = 0; i < frames; ++i)
destination[i] = static_cast<float>(source_ptr_[i]);
source_available_ -= frames;
}
}

View File

@@ -25,25 +25,33 @@ class PushSincResampler : public SincResamplerCallback {
// Provide the size of the source and destination blocks in samples. These
// must correspond to the same time duration (typically 10 ms) as the sample
// ratio is inferred from them.
PushSincResampler(int src_block_size, int dst_block_size);
PushSincResampler(int source_frames, int destination_frames);
virtual ~PushSincResampler();
// Perform the resampling. |source_length| must always equal the
// |src_block_size| provided at construction. |destination_capacity| must be
// at least as large as |dst_block_size|. Returns the number of samples
// Perform the resampling. |source_frames| must always equal the
// |source_frames| provided at construction. |destination_capacity| must be
// at least as large as |destination_frames|. Returns the number of samples
// provided in destination (for convenience, since this will always be equal
// to |dst_block_size|).
int Resample(const int16_t* source, int source_length,
// to |destination_frames|).
int Resample(const int16_t* source, int source_frames,
int16_t* destination, int destination_capacity);
// Implements SincResamplerCallback.
virtual void Run(float* destination, int frames);
virtual void Run(int frames, float* destination);
SincResampler* get_resampler_for_testing() { return resampler_.get(); }
private:
scoped_ptr<SincResampler> resampler_;
scoped_array<float> float_buffer_;
const int16_t* source_ptr_;
const int dst_size_;
const int destination_frames_;
// True on the first call to Resample(), to prime the SincResampler buffer.
bool first_pass_;
// Used to assert we are only requested for as much data as is available.
int source_available_;
DISALLOW_COPY_AND_ASSIGN(PushSincResampler);
};

View File

@@ -67,10 +67,19 @@ TEST_P(PushSincResamplerTest, Resample) {
scoped_array<int16_t> source_int(new int16_t[input_block_size]);
scoped_array<int16_t> destination_int(new int16_t[output_block_size]);
// The sinc resampler has an implicit delay of approximately half the kernel
// size at the input sample rate. By moving to a push model, this delay
// becomes explicit and is managed by zero-stuffing in PushSincResampler. We
// deal with it in the test by delaying the "pure" source to match. It must be
// checked before the first call to Resample(), because ChunkSize() will
// change afterwards.
const int output_delay_samples = output_block_size -
resampler.get_resampler_for_testing()->ChunkSize();
// Generate resampled signal.
// With the PushSincResampler, we produce the signal block-by-10ms-block
// rather than in a single pass, to exercise how it will be used in WebRTC.
resampler_source.Run(source.get(), input_samples);
resampler_source.Run(input_samples, source.get());
for (int i = 0; i < kNumBlocks; ++i) {
for (int j = 0; j < input_block_size; ++j) {
source_int[j] = static_cast<int16_t>(std::floor(32767 *
@@ -86,17 +95,9 @@ TEST_P(PushSincResamplerTest, Resample) {
}
// Generate pure signal.
// The sinc resampler has an implicit delay of half the kernel size (32) at
// the input sample rate. By moving to a push model, this delay becomes
// explicit and is managed by zero-stuffing in PushSincResampler. This delay
// can be a fractional sample amount, so we deal with it in the test by
// delaying the "pure" source to match.
static const int kInputKernelDelaySamples = 16;
double output_delay_samples = static_cast<double>(output_rate_)
/ input_rate_ * kInputKernelDelaySamples;
SinusoidalLinearChirpSource pure_source(
output_rate_, output_samples, input_nyquist_freq, output_delay_samples);
pure_source.Run(pure_destination.get(), output_samples);
pure_source.Run(output_samples, pure_destination.get());
// Range of the Nyquist frequency (0.5 * min(input rate, output_rate)) which
// we refer to as low and high.
@@ -216,17 +217,17 @@ INSTANTIATE_TEST_CASE_P(
std::tr1::make_tuple(8000, 16000, kResamplingRMSError, -70.30),
std::tr1::make_tuple(16000, 16000, kResamplingRMSError, -75.51),
std::tr1::make_tuple(32000, 16000, -18.48, -28.59),
std::tr1::make_tuple(44100, 16000, -19.59, -19.77),
std::tr1::make_tuple(48000, 16000, -20.01, -18.11),
std::tr1::make_tuple(96000, 16000, -20.95, -10.99),
std::tr1::make_tuple(44100, 16000, -19.30, -19.67),
std::tr1::make_tuple(48000, 16000, -19.81, -18.11),
std::tr1::make_tuple(96000, 16000, -20.95, -10.96),
// To 32 kHz
std::tr1::make_tuple(8000, 32000, kResamplingRMSError, -70.30),
std::tr1::make_tuple(16000, 32000, kResamplingRMSError, -75.51),
std::tr1::make_tuple(32000, 32000, kResamplingRMSError, -75.56),
std::tr1::make_tuple(44100, 32000, -16.52, -51.10),
std::tr1::make_tuple(48000, 32000, -16.90, -44.17),
std::tr1::make_tuple(96000, 32000, -19.80, -18.05),
std::tr1::make_tuple(44100, 32000, -16.44, -51.10),
std::tr1::make_tuple(48000, 32000, -16.90, -44.03),
std::tr1::make_tuple(96000, 32000, -19.61, -18.04),
std::tr1::make_tuple(192000, 32000, -21.02, -10.94)));
} // namespace webrtc

View File

@@ -11,31 +11,73 @@
// Modified from the Chromium original:
// src/media/base/sinc_resampler.cc
// Input buffer layout, dividing the total buffer into regions (r0_ - r5_):
// Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_
// and r4_ will move after the first load):
//
// |----------------|-----------------------------------------|----------------|
//
// kBlockSize + kKernelSize / 2
// request_frames_
// <--------------------------------------------------------->
// r0_
// r0_ (during first load)
//
// kKernelSize / 2 kKernelSize / 2 kKernelSize / 2 kKernelSize / 2
// <---------------> <---------------> <---------------> <--------------->
// r1_ r2_ r3_ r4_
//
// kBlockSize
// <--------------------------------------->
// r5_
// block_size_ == r4_ - r2_
// <--------------------------------------->
//
// request_frames_
// <------------------ ... ----------------->
// r0_ (during second load)
//
// On the second request r0_ slides to the right by kKernelSize / 2 and r3_, r4_
// and block_size_ are reinitialized via step (3) in the algorithm below.
//
// These new regions remain constant until a Flush() occurs. While complicated,
// this allows us to reduce jitter by always requesting the same amount from the
// provided callback.
//
// The algorithm:
//
// 1) Consume input frames into r0_ (r1_ is zero-initialized).
// 2) Position kernel centered at start of r0_ (r2_) and generate output frames
// until kernel is centered at start of r4_ or we've finished generating all
// the output frames.
// 3) Copy r3_ to r1_ and r4_ to r2_.
// 4) Consume input frames into r5_ (zero-pad if we run out of input).
// 5) Goto (2) until all of input is consumed.
// 1) Allocate input_buffer of size: request_frames_ + kKernelSize; this ensures
// there's enough room to read request_frames_ from the callback into region
// r0_ (which will move between the first and subsequent passes).
//
// 2) Let r1_, r2_ each represent half the kernel centered around r0_:
//
// r0_ = input_buffer_ + kKernelSize / 2
// r1_ = input_buffer_
// r2_ = r0_
//
// r0_ is always request_frames_ in size. r1_, r2_ are kKernelSize / 2 in
// size. r1_ must be zero initialized to avoid convolution with garbage (see
// step (5) for why).
//
// 3) Let r3_, r4_ each represent half the kernel right aligned with the end of
// r0_ and choose block_size_ as the distance in frames between r4_ and r2_:
//
// r3_ = r0_ + request_frames_ - kKernelSize
// r4_ = r0_ + request_frames_ - kKernelSize / 2
// block_size_ = r4_ - r2_ = request_frames_ - kKernelSize / 2
//
// 4) Consume request_frames_ frames into r0_.
//
// 5) Position kernel centered at start of r2_ and generate output frames until
// the kernel is centered at the start of r4_ or we've finished generating
// all the output frames.
//
// 6) Wrap leftover data from r3_ to r1_ and r4_ to r2_.
//
// 7) If we're on the second load, in order to avoid overwriting the frames we
// just wrapped from r4_ we need to slide r0_ to the right by the size of
// r4_, which is kKernelSize / 2:
//
// r0_ = r0_ + kKernelSize / 2 = input_buffer_ + kKernelSize
//
// r3_, r4_, and block_size_ then need to be reinitialized, so goto (3).
//
// 8) Else, if we're not on the second load, goto (4).
//
// Note: we're glossing over how the sub-sample handling works with
// |virtual_source_idx_|, etc.
@@ -70,49 +112,49 @@ static double SincScaleFactor(double io_ratio) {
return sinc_scale_factor;
}
SincResampler::SincResampler(double io_sample_rate_ratio,
SincResamplerCallback* read_cb,
int block_size)
: io_sample_rate_ratio_(io_sample_rate_ratio),
virtual_source_idx_(0),
buffer_primed_(false),
read_cb_(read_cb),
block_size_(block_size),
buffer_size_(block_size_ + kKernelSize),
// Create input buffers with a 16-byte alignment for SSE optimizations.
kernel_storage_(static_cast<float*>(
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
kernel_pre_sinc_storage_(static_cast<float*>(
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
kernel_window_storage_(static_cast<float*>(
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
input_buffer_(static_cast<float*>(
AlignedMalloc(sizeof(float) * buffer_size_, 16))),
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__)
convolve_proc_(WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C),
#elif defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON)
convolve_proc_(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ?
Convolve_NEON : Convolve_C),
#endif
// Setup various region pointers in the buffer (see diagram above).
r0_(input_buffer_.get() + kKernelSize / 2),
r1_(input_buffer_.get()),
r2_(r0_),
r3_(r0_ + block_size_ - kKernelSize / 2),
r4_(r0_ + block_size_),
r5_(r0_ + kKernelSize / 2) {
Initialize();
InitializeKernel();
// If we know the minimum architecture at compile time, avoid CPU detection.
// iOS lies about its architecture, so we also need to exclude it here.
//
// Each branch below defines CONVOLVE_FUNC, the convolution routine invoked
// from Resample(), together with a matching definition of
// InitializeCPUSpecificFeatures(). When the routine is fixed at compile time
// the initializer is empty; otherwise it stores the runtime choice in
// |convolve_proc_|.
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WEBRTC_IOS)
#if defined(__SSE__)
// SSE is part of the compile-time baseline: call Convolve_SSE directly.
#define CONVOLVE_FUNC Convolve_SSE
void SincResampler::InitializeCPUSpecificFeatures() {}
#else
// X86 CPU detection required. Function will be set by
// InitializeCPUSpecificFeatures().
// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.
#define CONVOLVE_FUNC convolve_proc_
void SincResampler::InitializeCPUSpecificFeatures() {
// Use the SSE routine only if WebRtc_GetCPUInfo() reports SSE2 support.
convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C;
}
#endif
#elif defined(WEBRTC_ARCH_ARM_V7)
#if defined(WEBRTC_ARCH_ARM_NEON)
// NEON is part of the compile-time baseline: call Convolve_NEON directly.
#define CONVOLVE_FUNC Convolve_NEON
void SincResampler::InitializeCPUSpecificFeatures() {}
#else
// NEON CPU detection required. Function will be set by
// InitializeCPUSpecificFeatures().
#define CONVOLVE_FUNC convolve_proc_
void SincResampler::InitializeCPUSpecificFeatures() {
// Use the NEON routine only if the kCPUFeatureNEON bit is reported.
convolve_proc_ = WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ?
Convolve_NEON : Convolve_C;
}
#endif
#else
// Unknown architecture.
#define CONVOLVE_FUNC Convolve_C
void SincResampler::InitializeCPUSpecificFeatures() {}
#endif
SincResampler::SincResampler(double io_sample_rate_ratio,
int request_frames,
SincResamplerCallback* read_cb)
: io_sample_rate_ratio_(io_sample_rate_ratio),
virtual_source_idx_(0),
buffer_primed_(false),
read_cb_(read_cb),
block_size_(kDefaultBlockSize),
buffer_size_(kDefaultBufferSize),
request_frames_(request_frames),
input_buffer_size_(request_frames_ + kKernelSize),
// Create input buffers with a 16-byte alignment for SSE optimizations.
kernel_storage_(static_cast<float*>(
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
@@ -121,45 +163,19 @@ SincResampler::SincResampler(double io_sample_rate_ratio,
kernel_window_storage_(static_cast<float*>(
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
input_buffer_(static_cast<float*>(
AlignedMalloc(sizeof(float) * buffer_size_, 16))),
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__)
convolve_proc_(WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C),
#elif defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON)
convolve_proc_(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ?
Convolve_NEON : Convolve_C),
AlignedMalloc(sizeof(float) * input_buffer_size_, 16))),
#if defined(WEBRTC_RESAMPLER_CPU_DETECTION)
convolve_proc_(NULL),
#endif
// Setup various region pointers in the buffer (see diagram above).
r0_(input_buffer_.get() + kKernelSize / 2),
r1_(input_buffer_.get()),
r2_(r0_),
r3_(r0_ + block_size_ - kKernelSize / 2),
r4_(r0_ + block_size_),
r5_(r0_ + kKernelSize / 2) {
Initialize();
InitializeKernel();
}
SincResampler::~SincResampler() {}
void SincResampler::Initialize() {
// Ensure kKernelSize is a multiple of 32 for easy SSE optimizations; causes
// r0_ and r5_ (used for input) to always be 16-byte aligned by virtue of
// input_buffer_ being 16-byte aligned.
COMPILE_ASSERT(kKernelSize % 32 == 0);
r2_(input_buffer_.get() + kKernelSize / 2) {
#if defined(WEBRTC_RESAMPLER_CPU_DETECTION)
InitializeCPUSpecificFeatures();
assert(convolve_proc_);
#endif
assert(request_frames_ > 0);
Flush();
assert(block_size_ > kKernelSize);
// Basic sanity checks to ensure buffer regions are laid out correctly:
// r0_ and r2_ should always be the same position.
assert(r0_ == r2_);
// r1_ at the beginning of the buffer.
assert(r1_ == input_buffer_.get());
// r1_ left of r2_, r2_ left of r5_ and r1_, r2_ size correct.
assert(r2_ - r1_ == r5_ - r2_);
// r3_ left of r4_, r5_ left of r0_ and r3_ size correct.
assert(r4_ - r3_ == r5_ - r0_);
// r3_, r4_ size correct and r4_ at the end of the buffer.
assert(r4_ + (r4_ - r3_) == r1_ + buffer_size_);
// r5_ size correct and at the end of the buffer.
assert(r5_ + block_size_ == r1_ + buffer_size_);
memset(kernel_storage_.get(), 0,
sizeof(*kernel_storage_.get()) * kKernelStorageSize);
@@ -167,7 +183,26 @@ void SincResampler::Initialize() {
sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize);
memset(kernel_window_storage_.get(), 0,
sizeof(*kernel_window_storage_.get()) * kKernelStorageSize);
memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * buffer_size_);
InitializeKernel();
}
SincResampler::~SincResampler() {}
// Recomputes r0_, r3_, r4_ and block_size_ from request_frames_.
// |second_load| selects where r0_ starts relative to the beginning of
// input_buffer_: kKernelSize when true, kKernelSize / 2 when false.
void SincResampler::UpdateRegions(bool second_load) {
// Set up various region pointers in the buffer (see diagram above). If we're
// on the second load we need to slide r0_ to the right by kKernelSize / 2.
r0_ = input_buffer_.get() + (second_load ? kKernelSize : kKernelSize / 2);
r3_ = r0_ + request_frames_ - kKernelSize;
r4_ = r0_ + request_frames_ - kKernelSize / 2;
// Number of source frames between the start of r2_ and the start of r4_.
block_size_ = r4_ - r2_;
// r1_ at the beginning of the buffer.
assert(r1_ == input_buffer_.get());
// r1_ left of r2_, r3_ left of r4_ and both gaps the same size.
assert(r2_ - r1_ == r4_ - r3_);
// r2_ left of r3_.
assert(r2_ < r3_);
}
void SincResampler::InitializeKernel() {
@@ -234,67 +269,59 @@ void SincResampler::SetRatio(double io_sample_rate_ratio) {
}
}
// If we know the minimum architecture avoid function hopping for CPU detection.
#if defined(WEBRTC_ARCH_X86_FAMILY)
#if defined(__SSE__)
#define CONVOLVE_FUNC Convolve_SSE
#else
// X86 CPU detection required. |convolve_proc_| will be set upon construction.
// TODO(dalecurtis): Once Chrome moves to a SSE baseline this can be removed.
#define CONVOLVE_FUNC convolve_proc_
#endif
#elif defined(WEBRTC_ARCH_ARM_V7)
#if defined(WEBRTC_ARCH_ARM_NEON)
#define CONVOLVE_FUNC Convolve_NEON
#else
// NEON CPU detection required. |convolve_proc_| will be set upon construction.
#define CONVOLVE_FUNC convolve_proc_
#endif
#else
// Unknown architecture.
#define CONVOLVE_FUNC Convolve_C
#endif
void SincResampler::Resample(float* destination, int frames) {
void SincResampler::Resample(int frames, float* destination) {
int remaining_frames = frames;
// Step (1) -- Prime the input buffer at the start of the input stream.
if (!buffer_primed_) {
read_cb_->Run(r0_, block_size_ + kKernelSize / 2);
if (!buffer_primed_ && remaining_frames) {
read_cb_->Run(request_frames_, r0_);
buffer_primed_ = true;
}
// Step (2) -- Resample!
// Step (2) -- Resample! const what we can outside of the loop for speed. It
// actually has an impact on ARM performance. See inner loop comment below.
const double current_io_ratio = io_sample_rate_ratio_;
const float* const kernel_ptr = kernel_storage_.get();
while (remaining_frames) {
while (virtual_source_idx_ < block_size_) {
// |i| may be negative if the last Resample() call ended on an iteration
// that put |virtual_source_idx_| over the limit.
//
// Note: The loop construct here can severely impact performance on ARM
// or when built with clang. See https://codereview.chromium.org/18566009/
for (int i = ceil((block_size_ - virtual_source_idx_) / current_io_ratio);
i > 0; --i) {
assert(virtual_source_idx_ < block_size_);
// |virtual_source_idx_| lies in between two kernel offsets so figure out
// what they are.
int source_idx = static_cast<int>(virtual_source_idx_);
double subsample_remainder = virtual_source_idx_ - source_idx;
const int source_idx = virtual_source_idx_;
const double subsample_remainder = virtual_source_idx_ - source_idx;
double virtual_offset_idx = subsample_remainder * kKernelOffsetCount;
int offset_idx = static_cast<int>(virtual_offset_idx);
const double virtual_offset_idx =
subsample_remainder * kKernelOffsetCount;
const int offset_idx = virtual_offset_idx;
// We'll compute "convolutions" for the two kernels which straddle
// |virtual_source_idx_|.
float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
float* k2 = k1 + kKernelSize;
const float* const k1 = kernel_ptr + offset_idx * kKernelSize;
const float* const k2 = k1 + kKernelSize;
// Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be
// true so long as kKernelSize is a multiple of 16.
assert((reinterpret_cast<uintptr_t>(k1) & 0x0F) == 0u);
assert((reinterpret_cast<uintptr_t>(k2) & 0x0F) == 0u);
assert(0u == (reinterpret_cast<uintptr_t>(k1) & 0x0F));
assert(0u == (reinterpret_cast<uintptr_t>(k2) & 0x0F));
// Initialize input pointer based on quantized |virtual_source_idx_|.
float* input_ptr = r1_ + source_idx;
const float* const input_ptr = r1_ + source_idx;
// Figure out how much to weight each kernel's "convolution".
double kernel_interpolation_factor = virtual_offset_idx - offset_idx;
const double kernel_interpolation_factor =
virtual_offset_idx - offset_idx;
*destination++ = CONVOLVE_FUNC(
input_ptr, k1, k2, kernel_interpolation_factor);
// Advance the virtual index.
virtual_source_idx_ += io_sample_rate_ratio_;
virtual_source_idx_ += current_io_ratio;
if (!--remaining_frames)
return;
@@ -303,31 +330,31 @@ void SincResampler::Resample(float* destination, int frames) {
// Wrap back around to the start.
virtual_source_idx_ -= block_size_;
// Step (3) Copy r3_ to r1_ and r4_ to r2_.
// Step (3) -- Copy r3_, r4_ to r1_, r2_.
// This wraps the last input frames back to the start of the buffer.
memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize);
// Step (4)
// Refresh the buffer with more input.
read_cb_->Run(r5_, block_size_);
// Step (4) -- Reinitialize regions if necessary.
if (r0_ == r2_)
UpdateRegions(true);
// Step (5) -- Refresh the buffer with more input.
read_cb_->Run(request_frames_, r0_);
}
}
#undef CONVOLVE_FUNC
int SincResampler::ChunkSize() {
int SincResampler::ChunkSize() const {
return block_size_ / io_sample_rate_ratio_;
}
int SincResampler::BlockSize() {
return block_size_;
}
void SincResampler::Flush() {
virtual_source_idx_ = 0;
buffer_primed_ = false;
memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * buffer_size_);
memset(input_buffer_.get(), 0,
sizeof(*input_buffer_.get()) * input_buffer_size_);
UpdateRegions(false);
}
float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

View File

@@ -20,6 +20,13 @@
#include "webrtc/test/testsupport/gtest_prod_util.h"
#include "webrtc/typedefs.h"
#if (defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WEBRTC_IOS) && \
!defined(__SSE__)) || \
(defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON))
// Convenience define.
#define WEBRTC_RESAMPLER_CPU_DETECTION
#endif
namespace webrtc {
// Callback class for providing more data into the resampler. Expects |frames|
@@ -28,7 +35,7 @@ namespace webrtc {
class SincResamplerCallback {
public:
virtual ~SincResamplerCallback() {}
virtual void Run(float* destination, int frames) = 0;
virtual void Run(int frames, float* destination) = 0;
};
// SincResampler is a high-quality single-channel sample-rate converter.
@@ -40,43 +47,36 @@ class SincResampler {
// TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
kKernelSize = 32,
// The number of destination frames generated per processing pass. Affects
// how often and for how much SincResampler calls back for input. Must be
// greater than kKernelSize.
kDefaultBlockSize = 512,
// Default request size. Affects how often and for how much SincResampler
// calls back for input. Must be greater than kKernelSize.
kDefaultRequestSize = 512,
// The kernel offset count is used for interpolation and is the number of
// sub-sample kernel shifts. Can be adjusted for quality (higher is better)
// at the expense of allocating more memory.
kKernelOffsetCount = 32,
kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
// The size (in samples) of the internal buffer used by the resampler.
kDefaultBufferSize = kDefaultBlockSize + kKernelSize,
};
// Constructs a SincResampler with the specified |read_cb|, which is used to
// acquire audio data for resampling. |io_sample_rate_ratio| is the ratio of
// input / output sample rates. If desired, the number of destination frames
// generated per processing pass can be specified through |block_size|.
// acquire audio data for resampling. |io_sample_rate_ratio| is the ratio
// of input / output sample rates. |request_frames| controls the size in
// frames of the buffer requested by each |read_cb| call. The value must be
// greater than kKernelSize. Specify kDefaultRequestSize if there are no
// request size constraints.
SincResampler(double io_sample_rate_ratio,
int request_frames,
SincResamplerCallback* read_cb);
SincResampler(double io_sample_rate_ratio,
SincResamplerCallback* read_cb,
int block_size);
virtual ~SincResampler();
// Resample |frames| of data from |read_cb_| into |destination|.
void Resample(float* destination, int frames);
void Resample(int frames, float* destination);
// The maximum size in frames that guarantees Resample() will only make a
// single call to |read_cb_| for more data.
int ChunkSize();
int ChunkSize() const;
// The number of source frames requested per processing pass (and equal to
// |block_size| if provided at construction). The first pass will request
// more to prime the buffer.
int BlockSize();
int request_frames() const { return request_frames_; }
// Flush all buffered data and reset internal indices. Not thread safe, do
// not call while Resample() is in progress.
@@ -86,8 +86,8 @@ class SincResampler {
// the kernels used for resampling. Not thread safe, do not call while
// Resample() is in progress.
//
// TODO(ajm): use this in PushSincResampler rather than reconstructing
// SincResampler.
// TODO(ajm): Use this in PushSincResampler rather than reconstructing
// SincResampler. We would also need a way to update |request_frames_|.
void SetRatio(double io_sample_rate_ratio);
float* get_kernel_for_testing() { return kernel_storage_.get(); }
@@ -96,8 +96,14 @@ class SincResampler {
FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, Convolve);
FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, ConvolveBenchmark);
void Initialize();
void InitializeKernel();
void UpdateRegions(bool second_load);
// Selects runtime specific CPU features like SSE. Must be called before
// using SincResampler.
// TODO(ajm): Currently managed by the class internally. See the note with
// |convolve_proc_| below.
void InitializeCPUSpecificFeatures();
// Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are
// linearly interpolated using |kernel_interpolation_factor|. On x86, the
@@ -128,11 +134,14 @@ class SincResampler {
// Source of data for resampling.
SincResamplerCallback* read_cb_;
// See kDefaultBlockSize.
// The size (in samples) to request from each |read_cb_| execution.
const int request_frames_;
// The number of source frames processed per pass.
int block_size_;
// See kDefaultBufferSize.
int buffer_size_;
// The size (in samples) of the internal buffer used by the resampler.
const int input_buffer_size_;
// Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
// The kernel offsets are sub-sample shifts of a windowed sinc shifted from
@@ -145,21 +154,22 @@ class SincResampler {
scoped_ptr_malloc<float, AlignedFree> input_buffer_;
// Stores the runtime selection of which Convolve function to use.
#if (defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__)) || \
(defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON))
// TODO(ajm): Move to using a global static which must only be initialized
// once by the user. We're not doing this initially, because we don't have
// e.g. a LazyInstance helper in webrtc.
#if defined(WEBRTC_RESAMPLER_CPU_DETECTION)
typedef float (*ConvolveProc)(const float*, const float*, const float*,
double);
const ConvolveProc convolve_proc_;
ConvolveProc convolve_proc_;
#endif
// Pointers to the various regions inside |input_buffer_|. See the diagram at
// the top of the .cc file for more information.
float* const r0_;
float* r0_;
float* const r1_;
float* const r2_;
float* const r3_;
float* const r4_;
float* const r5_;
float* r3_;
float* r4_;
DISALLOW_COPY_AND_ASSIGN(SincResampler);
};

View File

@@ -36,18 +36,18 @@ static const double kKernelInterpolationFactor = 0.5;
// Helper class to ensure ChunkedResample() functions properly. Run() is a
// gMock mock method, so tests can set expectations on how many times the
// resampler invokes the callback and choose what the callback does.
class MockSource : public SincResamplerCallback {
public:
MOCK_METHOD2(Run, void(float* destination, int frames));
MOCK_METHOD2(Run, void(int frames, float* destination));
};
// gMock action for MockSource::Run(): zero-fills the destination buffer, using
// the frame count argument to size the write.
ACTION(ClearBuffer) {
memset(arg0, 0, arg1 * sizeof(float));
memset(arg1, 0, arg0 * sizeof(float));
}
// gMock action for MockSource::Run(): sets every byte of the destination buffer
// to 64, using the frame count argument to size the write.
ACTION(FillBuffer) {
// Value chosen arbitrarily such that SincResampler resamples it to something
// easily representable on all platforms; e.g., using kSampleRateRatio this
// becomes 1.81219.
memset(arg0, 64, arg1 * sizeof(float));
memset(arg1, 64, arg0 * sizeof(float));
}
// Test requesting multiples of ChunkSize() frames results in the proper number
@@ -57,7 +57,8 @@ TEST(SincResamplerTest, ChunkedResample) {
// Choose a high ratio of input to output samples which will result in quick
// exhaustion of SincResampler's internal buffers.
SincResampler resampler(kSampleRateRatio, &mock_source);
SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
&mock_source);
static const int kChunks = 2;
int max_chunk_size = resampler.ChunkSize() * kChunks;
@@ -66,25 +67,26 @@ TEST(SincResamplerTest, ChunkedResample) {
// Verify requesting ChunkSize() frames causes a single callback.
EXPECT_CALL(mock_source, Run(_, _))
.Times(1).WillOnce(ClearBuffer());
resampler.Resample(resampled_destination.get(), resampler.ChunkSize());
resampler.Resample(resampler.ChunkSize(), resampled_destination.get());
// Verify requesting kChunks * ChunkSize() frames causes kChunks callbacks.
testing::Mock::VerifyAndClear(&mock_source);
EXPECT_CALL(mock_source, Run(_, _))
.Times(kChunks).WillRepeatedly(ClearBuffer());
resampler.Resample(resampled_destination.get(), max_chunk_size);
resampler.Resample(max_chunk_size, resampled_destination.get());
}
// Test flush resets the internal state properly.
TEST(SincResamplerTest, Flush) {
MockSource mock_source;
SincResampler resampler(kSampleRateRatio, &mock_source);
SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
&mock_source);
scoped_array<float> resampled_destination(new float[resampler.ChunkSize()]);
// Fill the resampler with junk data.
EXPECT_CALL(mock_source, Run(_, _))
.Times(1).WillOnce(FillBuffer());
resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2);
resampler.Resample(resampler.ChunkSize() / 2, resampled_destination.get());
ASSERT_NE(resampled_destination[0], 0);
// Flush and request more data, which should all be zeros now.
@@ -92,11 +94,25 @@ TEST(SincResamplerTest, Flush) {
testing::Mock::VerifyAndClear(&mock_source);
EXPECT_CALL(mock_source, Run(_, _))
.Times(1).WillOnce(ClearBuffer());
resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2);
resampler.Resample(resampler.ChunkSize() / 2, resampled_destination.get());
for (int i = 0; i < resampler.ChunkSize() / 2; ++i)
ASSERT_FLOAT_EQ(resampled_destination[i], 0);
}
// Benchmark for SetRatio(): times a run of repeated SetRatio() calls and prints
// the total. Disabled by default through the DISABLED_ test name prefix.
TEST(SincResamplerTest, DISABLED_SetRatioBench) {
MockSource mock_source;
SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
&mock_source);
TickTime start = TickTime::Now();
// Use a different ratio (1/1 down to 1/9999) on every call.
for (int i = 1; i < 10000; ++i)
resampler.SetRatio(1.0 / i);
double total_time_c_us = (TickTime::Now() - start).Microseconds();
// The elapsed time is in microseconds; divide by 1000 to print milliseconds.
printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000);
}
// Define platform independent function name for Convolve* tests.
#if defined(WEBRTC_ARCH_X86_FAMILY)
#define CONVOLVE_FUNC Convolve_SSE
@@ -117,7 +133,8 @@ TEST(SincResamplerTest, Convolve) {
// Initialize a dummy resampler.
MockSource mock_source;
SincResampler resampler(kSampleRateRatio, &mock_source);
SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
&mock_source);
// The optimized Convolve methods are slightly more precise than Convolve_C(),
// so comparison must be done using an epsilon.
@@ -150,7 +167,8 @@ TEST(SincResamplerTest, Convolve) {
TEST(SincResamplerTest, ConvolveBenchmark) {
// Initialize a dummy resampler.
MockSource mock_source;
SincResampler resampler(kSampleRateRatio, &mock_source);
SincResampler resampler(kSampleRateRatio, SincResampler::kDefaultRequestSize,
&mock_source);
// Retrieve benchmark iterations from command line.
// TODO(ajm): Reintroduce this as a command line option.
@@ -243,9 +261,8 @@ TEST_P(SincResamplerTest, Resample) {
input_rate_, input_samples, input_nyquist_freq, 0);
const double io_ratio = input_rate_ / static_cast<double>(output_rate_);
SincResampler resampler(
io_ratio,
&resampler_source);
SincResampler resampler(io_ratio, SincResampler::kDefaultRequestSize,
&resampler_source);
// Force an update to the sample rate ratio to ensure dynamic sample rate
// changes are working correctly.
@@ -265,12 +282,12 @@ TEST_P(SincResamplerTest, Resample) {
scoped_array<float> pure_destination(new float[output_samples]);
// Generate resampled signal.
resampler.Resample(resampled_destination.get(), output_samples);
resampler.Resample(output_samples, resampled_destination.get());
// Generate pure signal.
SinusoidalLinearChirpSource pure_source(
output_rate_, output_samples, input_nyquist_freq, 0);
pure_source.Run(pure_destination.get(), output_samples);
pure_source.Run(output_samples, pure_destination.get());
// Range of the Nyquist frequency (0.5 * min(input rate, output_rate)) which
// we refer to as low and high.

View File

@@ -29,7 +29,7 @@ SinusoidalLinearChirpSource::SinusoidalLinearChirpSource(int sample_rate,
k_ = (max_frequency_ - kMinFrequency) / duration;
}
void SinusoidalLinearChirpSource::Run(float* destination, int frames) {
void SinusoidalLinearChirpSource::Run(int frames, float* destination) {
for (int i = 0; i < frames; ++i, ++current_index_) {
// Filter out frequencies higher than Nyquist.
if (Frequency(current_index_) > 0.5 * sample_rate_) {

View File

@@ -31,7 +31,7 @@ class SinusoidalLinearChirpSource : public SincResamplerCallback {
virtual ~SinusoidalLinearChirpSource() {}
virtual void Run(float* destination, int frames);
virtual void Run(int frames, float* destination);
double Frequency(int position);