diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc new file mode 100644 index 000000000..932eff109 --- /dev/null +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h" + +#include +#include + +#include + +#include "webrtc/base/checks.h" +#include "webrtc/common_audio/vad/include/webrtc_vad.h" +#include "webrtc/common_audio/window_generator.h" + +using std::complex; +using std::max; +using std::min; + +namespace webrtc { + +const int IntelligibilityEnhancer::kErbResolution = 2; +const int IntelligibilityEnhancer::kWindowSizeMs = 2; +// The size of the chunk provided by APM, in milliseconds. +const int IntelligibilityEnhancer::kChunkSizeMs = 10; +const int IntelligibilityEnhancer::kAnalyzeRate = 800; +const int IntelligibilityEnhancer::kVarianceRate = 2; +const float IntelligibilityEnhancer::kClipFreq = 200.0f; +const float IntelligibilityEnhancer::kConfigRho = 0.02f; +const float IntelligibilityEnhancer::kKbdAlpha = 1.5f; +const float IntelligibilityEnhancer::kGainChangeLimit = 0.0125f; + +using VarianceType = intelligibility::VarianceArray::StepType; + +IntelligibilityEnhancer::TransformCallback::TransformCallback( + IntelligibilityEnhancer* parent, + IntelligibilityEnhancer::AudioSource source) + : parent_(parent), + source_(source) {} + +void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( + const complex* const* in_block, + int in_channels, int frames, int /* out_channels */, + complex* const* out_block) { + DCHECK_EQ(parent_->freqs_, frames); + for (int i = 0; i < in_channels; ++i) { + parent_->DispatchAudio(source_, in_block[i], out_block[i]); + } +} + +IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution, + int sample_rate_hz, + int channels, + int cv_type, float cv_alpha, + int cv_win, + int analysis_rate, + int variance_rate, + float gain_limit) + : freqs_(RealFourier::ComplexLength(RealFourier::FftOrder( + sample_rate_hz * kWindowSizeMs / 1000))), + window_size_(1 << RealFourier::FftOrder(freqs_)), + chunk_length_(sample_rate_hz * kChunkSizeMs / 1000), + bank_size_(GetBankSize(sample_rate_hz, erb_resolution)), + sample_rate_hz_(sample_rate_hz), + erb_resolution_(erb_resolution), + channels_(channels), + analysis_rate_(analysis_rate), + variance_rate_(variance_rate), + clear_variance_(freqs_, static_cast(cv_type), cv_win, + cv_alpha), + noise_variance_(freqs_, VarianceType::kStepInfinite, 475, 0.01f), + filtered_clear_var_(new float[bank_size_]), + filtered_noise_var_(new float[bank_size_]), + filter_bank_(nullptr), + center_freqs_(new float[bank_size_]), + rho_(new float[bank_size_]), + gains_eq_(new float[bank_size_]), + gain_applier_(freqs_, gain_limit), + temp_out_buffer_(nullptr), + input_audio_(new float*[channels]), + kbd_window_(new float[window_size_]), + render_callback_(this, AudioSource::kRenderStream), + capture_callback_(this, AudioSource::kCaptureStream), + block_count_(0), + analysis_step_(0), + vad_high_(nullptr), + vad_low_(nullptr), + vad_tmp_buffer_(new int16_t[chunk_length_]) { + DCHECK_LE(kConfigRho, 1.0f); + + CreateErbBank(); + + WebRtcVad_Create(&vad_high_); + WebRtcVad_Init(vad_high_); + WebRtcVad_set_mode(vad_high_, 0); // high likelihood of speech + WebRtcVad_Create(&vad_low_); + WebRtcVad_Init(vad_low_); + WebRtcVad_set_mode(vad_low_, 3); // low likelihood of speech + + temp_out_buffer_ = static_cast(malloc( + sizeof(*temp_out_buffer_) * channels_ + + sizeof(**temp_out_buffer_) * chunk_length_ * channels_)); + for (int i = 0; i < channels_; ++i) { + temp_out_buffer_[i] = reinterpret_cast(temp_out_buffer_ + channels_) + + chunk_length_ * i; + } + + for (int i = 0; i < bank_size_; ++i) { + rho_[i] = kConfigRho * kConfigRho; + } + + float freqs_khz = kClipFreq / 1000.0f; + int erb_index = static_cast(ceilf(11.17f * logf((freqs_khz + 0.312f) / + (freqs_khz + 14.6575f)) + + 43.0f)); + start_freq_ = max(1, erb_index * kErbResolution); + + WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_, + kbd_window_.get()); + render_mangler_.reset(new LappedTransform(channels_, channels_, + chunk_length_, + kbd_window_.get(), + window_size_, + window_size_ / 2, + &render_callback_)); + capture_mangler_.reset(new LappedTransform(channels_, channels_, + chunk_length_, + kbd_window_.get(), + window_size_, + window_size_ / 2, + &capture_callback_)); +} + +IntelligibilityEnhancer::~IntelligibilityEnhancer() { + WebRtcVad_Free(vad_low_); + WebRtcVad_Free(vad_high_); + free(filter_bank_); +} + +void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) { + for (int i = 0; i < chunk_length_; ++i) { + vad_tmp_buffer_[i] = (int16_t)audio[0][i]; + } + has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_, + vad_tmp_buffer_.get(), chunk_length_) == 1; + + render_mangler_->ProcessChunk(audio, temp_out_buffer_); + for (int i = 0; i < channels_; ++i) { + memcpy(audio[i], temp_out_buffer_[i], + chunk_length_ * sizeof(**temp_out_buffer_)); + } +} + +void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) { + for (int i = 0; i < chunk_length_; ++i) { + vad_tmp_buffer_[i] = (int16_t)audio[0][i]; + } + // TODO(bercic): the VAD was always detecting voice in the noise stream, + // no matter what the aggressiveness, so it was temporarily disabled here + + //if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(), + // chunk_length_) == 1) { + // printf("capture HAS speech\n"); + // return; + //} + //printf("capture NO speech\n"); + capture_mangler_->ProcessChunk(audio, temp_out_buffer_); +} + +void IntelligibilityEnhancer::DispatchAudio( + IntelligibilityEnhancer::AudioSource source, + const complex* in_block, complex* out_block) { + switch (source) { + case kRenderStream: + ProcessClearBlock(in_block, out_block); + break; + case kCaptureStream: + ProcessNoiseBlock(in_block, out_block); + break; + } +} + +void IntelligibilityEnhancer::ProcessClearBlock(const complex* in_block, + complex* out_block) { + float power_target; + + if (block_count_ < 2) { + memset(out_block, 0, freqs_ * sizeof(*out_block)); + ++block_count_; + return; + } + + if (has_voice_low_ || true) { + clear_variance_.Step(in_block, false); + power_target = std::accumulate(clear_variance_.variance(), + clear_variance_.variance() + freqs_, 0.0f); + + if (block_count_ % analysis_rate_ == analysis_rate_ - 1) { + AnalyzeClearBlock(power_target); + ++analysis_step_; + if (analysis_step_ == variance_rate_) { + analysis_step_ = 0; + clear_variance_.Clear(); + noise_variance_.Clear(); + } + } + ++block_count_; + } + + /* efidata(n,:) = sqrt(b(n)) * fidata(n,:) */ + gain_applier_.Apply(in_block, out_block); +} + +void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) { + FilterVariance(clear_variance_.variance(), filtered_clear_var_.get()); + FilterVariance(noise_variance_.variance(), filtered_noise_var_.get()); + + /* lambda binary search */ + + float lambda_bot = -1.0f, lambda_top = -10e-18f, lambda; + float power_bot, power_top, power; + SolveEquation14(lambda_top, start_freq_, gains_eq_.get()); + power_top = DotProduct(gains_eq_.get(), filtered_clear_var_.get(), + bank_size_); + SolveEquation14(lambda_bot, start_freq_, gains_eq_.get()); + power_bot = DotProduct(gains_eq_.get(), filtered_clear_var_.get(), + bank_size_); + DCHECK(power_target >= power_bot && power_target <= power_top); + + float power_ratio = 2.0f; + int iters = 0; + while (fabs(power_ratio - 1.0f) > 0.001f && iters <= 100) { + lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f; + SolveEquation14(lambda, start_freq_, gains_eq_.get()); + power = DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_); + if (power < power_target) { + lambda_bot = lambda; + } else { + lambda_top = lambda; + } + power_ratio = fabs(power / power_target); + ++iters; + } + + /* b = filterbank' * b */ + float* gains = gain_applier_.target(); + for (int i = 0; i < freqs_; ++i) { + gains[i] = 0.0f; + for (int j = 0; j < bank_size_; ++j) { + gains[i] = fmaf(filter_bank_[j][i], gains_eq_[j], gains[i]); + } + } +} + +void IntelligibilityEnhancer::ProcessNoiseBlock(const complex* in_block, + complex* /*out_block*/) { + noise_variance_.Step(in_block); +} + +int IntelligibilityEnhancer::GetBankSize(int sample_rate, int erb_resolution) { + float freq_limit = sample_rate / 2000.0f; + int erb_scale = ceilf(11.17f * logf((freq_limit + 0.312f) / + (freq_limit + 14.6575f)) + 43.0f); + return erb_scale * erb_resolution; +} + +void IntelligibilityEnhancer::CreateErbBank() { + int lf = 1, rf = 4; + + for (int i = 0; i < bank_size_; ++i) { + float abs_temp = fabsf((i + 1.0f) / static_cast(erb_resolution_)); + center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp)); + center_freqs_[i] -= 14678.49f; + } + float last_center_freq = center_freqs_[bank_size_ - 1]; + for (int i = 0; i < bank_size_; ++i) { + center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq; + } + + filter_bank_ = static_cast(malloc( + sizeof(*filter_bank_) * bank_size_ + + sizeof(**filter_bank_) * freqs_ * bank_size_)); + for (int i = 0; i < bank_size_; ++i) { + filter_bank_[i] = reinterpret_cast(filter_bank_ + bank_size_) + + freqs_ * i; + } + + for (int i = 1; i <= bank_size_; ++i) { + int lll, ll, rr, rrr; + lll = round(center_freqs_[max(1, i - lf) - 1] * freqs_ / + (0.5f * sample_rate_hz_)); + ll = round(center_freqs_[max(1, i ) - 1] * freqs_ / + (0.5f * sample_rate_hz_)); + lll = min(freqs_, max(lll, 1)) - 1; + ll = min(freqs_, max(ll, 1)) - 1; + + rrr = round(center_freqs_[min(bank_size_, i + rf) - 1] * freqs_ / + (0.5f * sample_rate_hz_)); + rr = round(center_freqs_[min(bank_size_, i + 1) - 1] * freqs_ / + (0.5f * sample_rate_hz_)); + rrr = min(freqs_, max(rrr, 1)) - 1; + rr = min(freqs_, max(rr, 1)) - 1; + + float step, element; + + step = 1.0f / (ll - lll); + element = 0.0f; + for (int j = lll; j <= ll; ++j) { + filter_bank_[i - 1][j] = element; + element += step; + } + step = 1.0f / (rrr - rr); + element = 1.0f; + for (int j = rr; j <= rrr; ++j) { + filter_bank_[i - 1][j] = element; + element -= step; + } + for (int j = ll; j <= rr; ++j) { + filter_bank_[i - 1][j] = 1.0f; + } + } + + float sum; + for (int i = 0; i < freqs_; ++i) { + sum = 0.0f; + for (int j = 0; j < bank_size_; ++j) { + sum += filter_bank_[j][i]; + } + for (int j = 0; j < bank_size_; ++j) { + filter_bank_[j][i] /= sum; + } + } +} + +void IntelligibilityEnhancer::SolveEquation14(float lambda, int start_freq, + float* sols) { + bool quadratic = (kConfigRho < 1.0f); + const float* var_x0 = filtered_clear_var_.get(); + const float* var_n0 = filtered_noise_var_.get(); + + for (int n = 0; n < start_freq; ++n) { + sols[n] = 1.0f; + } + for (int n = start_freq - 1; n < bank_size_; ++n) { + float alpha0, beta0, gamma0; + gamma0 = 0.5f * rho_[n] * var_x0[n] * var_n0[n] + + lambda * var_x0[n] * var_n0[n] * var_n0[n]; + beta0 = lambda * var_x0[n] * (2 - rho_[n]) * var_x0[n] * var_n0[n]; + if (quadratic) { + alpha0 = lambda * var_x0[n] * (1 - rho_[n]) * var_x0[n] * var_x0[n]; + sols[n] = (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) + / (2 * alpha0); + } else { + sols[n] = -gamma0 / beta0; + } + sols[n] = fmax(0, sols[n]); + } +} + +void IntelligibilityEnhancer::FilterVariance(const float* var, float* result) { + for (int i = 0; i < bank_size_; ++i) { + result[i] = DotProduct(filter_bank_[i], var, freqs_); + } +} + +float IntelligibilityEnhancer::DotProduct(const float* a, const float* b, + int length) { + float ret = 0.0f; + + for (int i = 0; i < length; ++i) { + ret = fmaf(a[i], b[i], ret); + } + return ret; +} + +} // namespace webrtc + diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h new file mode 100644 index 000000000..d0818f688 --- /dev/null +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ + +#include + +#include "webrtc/common_audio/lapped_transform.h" +#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" +#include "webrtc/system_wrappers/interface/scoped_ptr.h" + +struct WebRtcVadInst; +typedef struct WebRtcVadInst VadInst; + +namespace webrtc { + +// Speech intelligibility enhancement module. Reads render and capture +// audio streams and modifies the render stream with a set of gains per +// frequency bin to enhance speech against the noise background. +class IntelligibilityEnhancer { + public: + // Construct a new instance with the given filter bank resolution, + // sampling rate, number of channels and analysis rates. + // |analysis_rate| sets the number of input blocks (containing speech!) + // to elapse before a new gain computation is made. |variance_rate| specifies + // the number of gain recomputations after which the variances are reset. + // |cv_*| are parameters for the VarianceArray constructor for the + // lear speech stream. + // TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should + // probably go away once fine tuning is done. They override the internal + // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate). + IntelligibilityEnhancer(int erb_resolution, int sample_rate_hz, int channels, + int cv_type, float cv_alpha, int cv_win, + int analysis_rate, int variance_rate, + float gain_limit); + ~IntelligibilityEnhancer(); + + void ProcessRenderAudio(float* const* audio); + void ProcessCaptureAudio(float* const* audio); + + private: + enum AudioSource { + kRenderStream = 0, + kCaptureStream, + }; + + class TransformCallback : public LappedTransform::Callback { + public: + TransformCallback(IntelligibilityEnhancer* parent, AudioSource source); + virtual void ProcessAudioBlock(const std::complex* const* in_block, + int in_channels, int frames, + int out_channels, + std::complex* const* out_block); + + private: + IntelligibilityEnhancer* parent_; + AudioSource source_; + }; + friend class TransformCallback; + + void DispatchAudio(AudioSource source, const std::complex* in_block, + std::complex* out_block); + void ProcessClearBlock(const std::complex* in_block, + std::complex* out_block); + void AnalyzeClearBlock(float power_target); + void ProcessNoiseBlock(const std::complex* in_block, + std::complex* out_block); + + static int GetBankSize(int sample_rate, int erb_resolution); + void CreateErbBank(); + void SolveEquation14(float lambda, int start_freq, float* sols); + void FilterVariance(const float* var, float* result); + static float DotProduct(const float* a, const float* b, int length); + + static const int kErbResolution; + static const int kWindowSizeMs; + static const int kChunkSizeMs; + static const int kAnalyzeRate; + static const int kVarianceRate; + static const float kClipFreq; + static const float kConfigRho; + static const float kKbdAlpha; + static const float kGainChangeLimit; + + const int freqs_; + const int window_size_; // window size in samples; also the block size + const int chunk_length_; // chunk size in samples + const int bank_size_; + const int sample_rate_hz_; + const int erb_resolution_; + const int channels_; + const int analysis_rate_; + const int variance_rate_; + + intelligibility::VarianceArray clear_variance_; + intelligibility::VarianceArray noise_variance_; + scoped_ptr filtered_clear_var_; + scoped_ptr filtered_noise_var_; + float** filter_bank_; + scoped_ptr center_freqs_; + int start_freq_; + scoped_ptr rho_; + scoped_ptr gains_eq_; + intelligibility::GainApplier gain_applier_; + + // Destination buffer used to reassemble blocked chunks before overwriting + // the original input array with modifications. + float** temp_out_buffer_; + scoped_ptr input_audio_; + scoped_ptr kbd_window_; + TransformCallback render_callback_; + TransformCallback capture_callback_; + scoped_ptr render_mangler_; + scoped_ptr capture_mangler_; + int block_count_; + int analysis_step_; + + // TODO(bercic): Quick stopgap measure for voice detection in the clear + // and noise streams. + VadInst* vad_high_; + VadInst* vad_low_; + scoped_ptr vad_tmp_buffer_; + bool has_voice_low_; +}; + +} // namespace webrtc + +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ + diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_proc.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_proc.cc new file mode 100644 index 000000000..b0ea2dfee --- /dev/null +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_proc.cc @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "gflags/gflags.h" +#include "webrtc/base/checks.h" +#include "webrtc/common_audio/real_fourier.h" +#include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h" +#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" +#include "webrtc/system_wrappers/interface/critical_section_wrapper.h" +#include "webrtc/system_wrappers/interface/scoped_ptr.h" + +const int16_t* in_ipcm; +int16_t* out_ipcm; +const int16_t* noise_ipcm; + +float* in_fpcm; +float* out_fpcm; +float* noise_fpcm; +float* noise_cursor; +float* clear_cursor; + +int samples; +int fragment_size; + +using std::complex; +using webrtc::RealFourier; +using webrtc::IntelligibilityEnhancer; + +DEFINE_int32(clear_type, webrtc::intelligibility::VarianceArray::kStepInfinite, + "Variance algorithm for clear data."); +DEFINE_double(clear_alpha, 0.9, + "Variance decay factor for clear data."); +DEFINE_int32(clear_window, 475, + "Window size for windowed variance for clear data."); +DEFINE_int32(sample_rate, 16000, + "Audio sample rate used in the input and output files."); +DEFINE_int32(ana_rate, 800, + "Analysis rate; gains recalculated every N blocks."); +DEFINE_int32(var_rate, 2, + "Variance clear rate; history is forgotten every N gain recalculations."); +DEFINE_double(gain_limit, 1000.0, "Maximum gain change in one block."); + +DEFINE_bool(repeat, false, "Repeat input file ad nauseam."); + +DEFINE_string(clear_file, "speech.pcm", "Input file with clear speech."); +DEFINE_string(noise_file, "noise.pcm", "Input file with noise data."); +DEFINE_string(out_file, "proc_enhanced.pcm", "Enhanced output. Use '-' to " + "pipe through aplay internally."); + +// Write an Sun AU-formatted audio chunk into file descriptor |fd|. Can be used +// to pipe the audio stream directly into aplay. +void writeau(int fd) { + uint32_t thing; + + write(fd, ".snd", 4); + thing = htonl(24); + write(fd, &thing, sizeof(thing)); + thing = htonl(0xffffffff); + write(fd, &thing, sizeof(thing)); + thing = htonl(3); + write(fd, &thing, sizeof(thing)); + thing = htonl(FLAGS_sample_rate); + write(fd, &thing, sizeof(thing)); + thing = htonl(1); + write(fd, &thing, sizeof(thing)); + + for (int i = 0; i < samples; ++i) { + out_ipcm[i] = htons(out_ipcm[i]); + } + write(fd, out_ipcm, sizeof(*out_ipcm) * samples); +} + +int main(int argc, char* argv[]) { + google::SetUsageMessage("\n\nVariance algorithm types are:\n" + " 0 - infinite/normal,\n" + " 1 - exponentially decaying,\n" + " 2 - rolling window.\n" + "\nInput files must be little-endian 16-bit signed raw PCM.\n"); + google::ParseCommandLineFlags(&argc, &argv, true); + + const char* in_name = FLAGS_clear_file.c_str(); + const char* out_name = FLAGS_out_file.c_str(); + const char* noise_name = FLAGS_noise_file.c_str(); + struct stat in_stat, noise_stat; + int in_fd, out_fd, noise_fd; + FILE* aplay_file = nullptr; + + fragment_size = FLAGS_sample_rate / 100; + + stat(in_name, &in_stat); + stat(noise_name, &noise_stat); + samples = in_stat.st_size / sizeof(*in_ipcm); + + in_fd = open(in_name, O_RDONLY); + if (!strcmp(out_name, "-")) { + aplay_file = popen("aplay -t au", "w"); + out_fd = fileno(aplay_file); + } else { + out_fd = open(out_name, O_WRONLY | O_CREAT | O_TRUNC, + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); + } + noise_fd = open(noise_name, O_RDONLY); + + in_ipcm = static_cast(mmap(nullptr, in_stat.st_size, PROT_READ, + MAP_PRIVATE, in_fd, 0)); + noise_ipcm = static_cast(mmap(nullptr, noise_stat.st_size, + PROT_READ, MAP_PRIVATE, noise_fd, 0)); + out_ipcm = new int16_t[samples]; + out_fpcm = new float[samples]; + in_fpcm = new float[samples]; + noise_fpcm = new float[samples]; + + for (int i = 0; i < samples; ++i) { + noise_fpcm[i] = noise_ipcm[i % (noise_stat.st_size / sizeof(*noise_ipcm))]; + } + + //feenableexcept(FE_INVALID | FE_OVERFLOW); + IntelligibilityEnhancer enh(2, + FLAGS_sample_rate, 1, + FLAGS_clear_type, + static_cast(FLAGS_clear_alpha), + FLAGS_clear_window, + FLAGS_ana_rate, + FLAGS_var_rate, + FLAGS_gain_limit); + + // Slice the input into smaller chunks, as the APM would do, and feed them + // into the enhancer. Repeat indefinitely if FLAGS_repeat is set. + do { + noise_cursor = noise_fpcm; + clear_cursor = in_fpcm; + for (int i = 0; i < samples; ++i) { + in_fpcm[i] = in_ipcm[i]; + } + + for (int i = 0; i < samples; i += fragment_size) { + enh.ProcessCaptureAudio(&noise_cursor); + enh.ProcessRenderAudio(&clear_cursor); + clear_cursor += fragment_size; + noise_cursor += fragment_size; + } + + for (int i = 0; i < samples; ++i) { + out_ipcm[i] = static_cast(in_fpcm[i]); + } + if (!strcmp(out_name, "-")) { + writeau(out_fd); + } else { + write(out_fd, out_ipcm, samples * sizeof(*out_ipcm)); + } + } while (FLAGS_repeat); + + munmap(const_cast(noise_ipcm), noise_stat.st_size); + munmap(const_cast(in_ipcm), in_stat.st_size); + close(noise_fd); + if (aplay_file) { + pclose(aplay_file); + } else { + close(out_fd); + } + close(in_fd); + + return 0; +} + diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc new file mode 100644 index 000000000..e6fc3fa6a --- /dev/null +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" + +#include +#include +#include + +using std::complex; + +namespace { + +// Return |current| changed towards |target|, with the change being at most +// |limit|. +inline float UpdateFactor(float target, float current, float limit) { + float delta = fabsf(target - current); + float sign = copysign(1.0f, target - current); + return current + sign * fminf(delta, limit); +} + +// std::isfinite for complex numbers. +inline bool cplxfinite(complex c) { + return std::isfinite(c.real()) && std::isfinite(c.imag()); +} + +// std::isnormal for complex numbers. +inline bool cplxnormal(complex c) { + return std::isnormal(c.real()) && std::isnormal(c.imag()); +} + +// Apply a small fudge to degenerate complex values. The numbers in the array +// were chosen randomly, so that even a series of all zeroes has some small +// variability. +inline complex zerofudge(complex c) { + const static complex fudge[7] = { + {0.001f, 0.002f}, {0.008f, 0.001f}, {0.003f, 0.008f}, {0.0006f, 0.0009f}, + {0.001f, 0.004f}, {0.003f, 0.004f}, {0.002f, 0.009f} + }; + static int fudge_index = 0; + if (cplxfinite(c) && !cplxnormal(c)) { + fudge_index = (fudge_index + 1) % 7; + return c + fudge[fudge_index]; + } + return c; +} + +// Incremental mean computation. Return the mean of the series with the +// mean |mean| with added |data|. +inline complex NewMean(complex mean, complex data, + int count) { + return mean + (data - mean) / static_cast(count); +} + +inline void AddToMean(complex data, int count, complex* mean) { + (*mean) = NewMean(*mean, data, count); +} + +} // namespace + +using std::min; + +namespace webrtc { + +namespace intelligibility { + +static const int kWindowBlockSize = 10; + +VarianceArray::VarianceArray(int freqs, StepType type, int window_size, + float decay) + : running_mean_(new complex[freqs]()), + running_mean_sq_(new complex[freqs]()), + sub_running_mean_(new complex[freqs]()), + sub_running_mean_sq_(new complex[freqs]()), + variance_(new float[freqs]()), + conj_sum_(new float[freqs]()), + freqs_(freqs), + window_size_(window_size), + decay_(decay), + history_cursor_(0), + count_(0), + array_mean_(0.0f) { + history_.reset(new scoped_ptr[]>[freqs_]()); + for (int i = 0; i < freqs_; ++i) { + history_[i].reset(new complex[window_size_]()); + } + subhistory_.reset(new scoped_ptr[]>[freqs_]()); + for (int i = 0; i < freqs_; ++i) { + subhistory_[i].reset(new complex[window_size_]()); + } + subhistory_sq_.reset(new scoped_ptr[]>[freqs_]()); + for (int i = 0; i < freqs_; ++i) { + subhistory_sq_[i].reset(new complex[window_size_]()); + } + switch (type) { + case kStepInfinite: + step_func_ = &VarianceArray::InfiniteStep; + break; + case kStepDecaying: + step_func_ = &VarianceArray::DecayStep; + break; + case kStepWindowed: + step_func_ = &VarianceArray::WindowedStep; + break; + case kStepBlocked: + step_func_ = &VarianceArray::BlockedStep; + break; + } +} + +// Compute the variance with Welford's algorithm, adding some fudge to +// the input in case of all-zeroes. +void VarianceArray::InfiniteStep(const complex* data, bool skip_fudge) { + array_mean_ = 0.0f; + ++count_; + for (int i = 0; i < freqs_; ++i) { + complex sample = data[i]; + if (!skip_fudge) { + sample = zerofudge(sample); + } + if (count_ == 1) { + running_mean_[i] = sample; + variance_[i] = 0.0f; + } else { + float old_sum = conj_sum_[i]; + complex old_mean = running_mean_[i]; + running_mean_[i] = old_mean + (sample - old_mean) / + static_cast(count_); + conj_sum_[i] = (old_sum + std::conj(sample - old_mean) * + (sample - running_mean_[i])).real(); + variance_[i] = conj_sum_[i] / (count_ - 1); // + fudge[fudge_index].real(); + if (skip_fudge && false) { + //variance_[i] -= fudge[fudge_index].real(); + } + } + array_mean_ += (variance_[i] - array_mean_) / (i + 1); + } +} + +// Compute the variance from the beginning, with exponential decaying of the +// series data. +void VarianceArray::DecayStep(const complex* data, bool /*dummy*/) { + array_mean_ = 0.0f; + ++count_; + for (int i = 0; i < freqs_; ++i) { + complex sample = data[i]; + sample = zerofudge(sample); + + if (count_ == 1) { + running_mean_[i] = sample; + running_mean_sq_[i] = sample * std::conj(sample); + variance_[i] = 0.0f; + } else { + complex prev = running_mean_[i]; + complex prev2 = running_mean_sq_[i]; + running_mean_[i] = decay_ * prev + (1.0f - decay_) * sample; + running_mean_sq_[i] = decay_ * prev2 + + (1.0f - decay_) * sample * std::conj(sample); + //variance_[i] = decay_ * variance_[i] + (1.0f - decay_) * ( + // (sample - running_mean_[i]) * std::conj(sample - running_mean_[i])).real(); + variance_[i] = (running_mean_sq_[i] - running_mean_[i] * std::conj(running_mean_[i])).real(); + } + + array_mean_ += (variance_[i] - array_mean_) / (i + 1); + } +} + +// Windowed variance computation. On each step, the variances for the +// window are recomputed from scratch, using Welford's algorithm. +void VarianceArray::WindowedStep(const complex* data, bool /*dummy*/) { + int num = min(count_ + 1, window_size_); + array_mean_ = 0.0f; + for (int i = 0; i < freqs_; ++i) { + complex mean; + float conj_sum = 0.0f; + + history_[i][history_cursor_] = data[i]; + + mean = history_[i][history_cursor_]; + variance_[i] = 0.0f; + for (int j = 1; j < num; ++j) { + complex sample = zerofudge( + history_[i][(history_cursor_ + j) % window_size_]); + sample = history_[i][(history_cursor_ + j) % window_size_]; + float old_sum = conj_sum; + complex old_mean = mean; + + mean = old_mean + (sample - old_mean) / static_cast(j + 1); + conj_sum = (old_sum + std::conj(sample - old_mean) * + (sample - mean)).real(); + variance_[i] = conj_sum / (j); + } + array_mean_ += (variance_[i] - array_mean_) / (i + 1); + } + history_cursor_ = (history_cursor_ + 1) % window_size_; + ++count_; +} + +// Variance with a window of blocks. Within each block, the variances are +// recomputed from scratch at every stp, using |Var(X) = E(X^2) - E^2(X)|. +// Once a block is filled with kWindowBlockSize samples, it is added to the +// history window and a new block is started. The variances for the window +// are recomputed from scratch at each of these transitions. +void VarianceArray::BlockedStep(const complex* data, bool /*dummy*/) { + int blocks = min(window_size_, history_cursor_); + for (int i = 0; i < freqs_; ++i) { + AddToMean(data[i], count_ + 1, &sub_running_mean_[i]); + AddToMean(data[i] * std::conj(data[i]), count_ + 1, + &sub_running_mean_sq_[i]); + subhistory_[i][history_cursor_ % window_size_] = sub_running_mean_[i]; + subhistory_sq_[i][history_cursor_ % window_size_] = sub_running_mean_sq_[i]; + + variance_[i] = (NewMean(running_mean_sq_[i], sub_running_mean_sq_[i], + blocks) - + NewMean(running_mean_[i], sub_running_mean_[i], blocks) * + std::conj(NewMean(running_mean_[i], sub_running_mean_[i], + blocks))).real(); + if (count_ == kWindowBlockSize - 1) { + sub_running_mean_[i] = complex(0.0f, 0.0f); + sub_running_mean_sq_[i] = complex(0.0f, 0.0f); + running_mean_[i] = complex(0.0f, 0.0f); + running_mean_sq_[i] = complex(0.0f, 0.0f); + for (int j = 0; j < min(window_size_, history_cursor_); ++j) { + AddToMean(subhistory_[i][j], j, &running_mean_[i]); + AddToMean(subhistory_sq_[i][j], j, &running_mean_sq_[i]); + } + ++history_cursor_; + } + } + ++count_; + if (count_ == kWindowBlockSize) { + count_ = 0; + } +} + +void VarianceArray::Clear() { + memset(running_mean_.get(), 0, sizeof(*running_mean_.get()) * freqs_); + memset(running_mean_sq_.get(), 0, sizeof(*running_mean_sq_.get()) * freqs_); + memset(variance_.get(), 0, sizeof(*variance_.get()) * freqs_); + memset(conj_sum_.get(), 0, sizeof(*conj_sum_.get()) * freqs_); + history_cursor_ = 0; + count_ = 0; + array_mean_ = 0.0f; +} + +void VarianceArray::ApplyScale(float scale) { + array_mean_ = 0.0f; + for (int i = 0; i < freqs_; ++i) { + variance_[i] *= scale * scale; + array_mean_ += (variance_[i] - array_mean_) / (i + 1); + } +} + +GainApplier::GainApplier(int freqs, float change_limit) + : freqs_(freqs), + change_limit_(change_limit), + target_(new float[freqs]()), + current_(new float[freqs]()) { + for (int i = 0; i < freqs; ++i) { + target_[i] = 1.0f; + current_[i] = 1.0f; + } +} + +void GainApplier::Apply(const complex* in_block, + complex* out_block) { + for (int i = 0; i < freqs_; ++i) { + float factor = sqrtf(fabsf(current_[i])); + if (!std::isnormal(factor)) { + factor = 1.0f; + } + out_block[i] = factor * in_block[i]; + current_[i] = UpdateFactor(target_[i], current_[i], change_limit_); + } +} + +} // namespace intelligibility + +} // namespace webrtc + diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h new file mode 100644 index 000000000..550f293a7 --- /dev/null +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_ + +#include + +#include "webrtc/system_wrappers/interface/scoped_ptr.h" + +namespace webrtc { + +namespace intelligibility { + +// Internal helper for computing the variances of a stream of arrays. +// The result is an array of variances per position: the i-th variance +// is the variance of the stream of data on the i-th positions in the +// input arrays. +// There are four methods of computation: +// * kStepInfinite computes variances from the beginning onwards +// * kStepDecaying uses a recursive exponential decay formula with a +// settable forgetting factor +// * kStepWindowed computes variances within a moving window +// * kStepBlocked is similar to kStepWindowed, but history is kept +// as a rolling window of blocks: multiple input elements are used for +// one block and the history then consists of the variances of these blocks +// with the same effect as kStepWindowed, but less storage, so the window +// can be longer +class VarianceArray { + public: + enum StepType { + kStepInfinite = 0, + kStepDecaying, + kStepWindowed, + kStepBlocked + }; + + // Construct an instance for the given input array length (|freqs|) and + // computation algorithm (|type|), with the appropriate parameters. + // |window_size| is the number of samples for kStepWindowed and + // the number of blocks for kStepBlocked. |decay| is the forgetting factor + // for kStepDecaying. + VarianceArray(int freqs, StepType type, int window_size, float decay); + + // Add a new data point to the series and compute the new variances. + // TODO(bercic) |skip_fudge| is a flag for kStepWindowed and kStepDecaying, + // whether they should skip adding some small dummy values to the input + // to prevent problems with all-zero inputs. Can probably be removed. + void Step(const std::complex* data, bool skip_fudge = false) { + (this->*step_func_)(data, skip_fudge); + } + // Reset variances to zero and forget all history. + void Clear(); + // Scale the input data by |scale|. Effectively multiply variances + // by |scale^2|. + void ApplyScale(float scale); + + // The current set of variances. + const float* variance() const { + return variance_.get(); + } + + // The mean value of the current set of variances. + float array_mean() const { + return array_mean_; + } + + private: + void InfiniteStep(const std::complex* data, bool dummy); + void DecayStep(const std::complex* data, bool dummy); + void WindowedStep(const std::complex* data, bool dummy); + void BlockedStep(const std::complex* data, bool dummy); + + // The current average X and X^2. + scoped_ptr[]> running_mean_; + scoped_ptr[]> running_mean_sq_; + + // Average X and X^2 for the current block in kStepBlocked. + scoped_ptr[]> sub_running_mean_; + scoped_ptr[]> sub_running_mean_sq_; + + // Sample history for the rolling window in kStepWindowed and block-wise + // histories for kStepBlocked. + scoped_ptr[]>[]> history_; + scoped_ptr[]>[]> subhistory_; + scoped_ptr[]>[]> subhistory_sq_; + + // The current set of variances and sums for Welford's algorithm. + scoped_ptr variance_; + scoped_ptr conj_sum_; + + const int freqs_; + const int window_size_; + const float decay_; + int history_cursor_; + int count_; + float array_mean_; + void (VarianceArray::*step_func_)(const std::complex*, bool); +}; + +// Helper class for smoothing gain changes. On each applicatiion step, the +// currently used gains are changed towards a set of settable target gains, +// constrained by a limit on the magnitude of the changes. +class GainApplier { + public: + GainApplier(int freqs, float change_limit); + + // Copy |in_block| to |out_block|, multiplied by the current set of gains, + // and step the current set of gains towards the target set. + void Apply(const std::complex* in_block, + std::complex* out_block); + + // Return the current target gain set. Modify this array to set the targets. + float* target() const { + return target_.get(); + } + + private: + const int freqs_; + const float change_limit_; + scoped_ptr target_; + scoped_ptr current_; +}; + +} // namespace intelligibility + +} // namespace webrtc + +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_ +