Simplify mask calculation

There are only 2 things that prevent the output from being bit-exact:
* The zero initialization of the postfilter_mask_ and high_pass_postfilter_mask_, which only affects the first blocks.
* The re-tuning of the target presence estimation, since only the bins between low_average_start_bin_ and high_average_end_bin_ are of interest.
The latter was not taken into account before.

R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/35139004

Cr-Commit-Position: refs/heads/master@{#8368}
git-svn-id: http://webrtc.googlecode.com/svn/trunk@8368 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
aluebs@webrtc.org 2015-02-13 19:37:38 +00:00
parent 56cb0ea99c
commit 92a19bcbd7
2 changed files with 29 additions and 60 deletions

View File

@ -15,6 +15,7 @@
#include <algorithm>
#include <cmath>
#include "webrtc/base/arraysize.h"
#include "webrtc/common_audio/window_generator.h"
#include "webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.h"
@ -61,13 +62,16 @@ const float kCovUniformGapHalfWidth = 0.001f;
// Alpha coefficient for mask smoothing.
const float kMaskSmoothAlpha = 0.2f;
// The average mask is computed from masks in this mid-frequency range.
// The average mask is computed from masks in this mid-frequency range. If these
// ranges are changed |kMaskQuantile| might need to be adjusted.
const int kLowAverageStartHz = 200;
const int kLowAverageEndHz = 400;
const int kHighAverageStartHz = 6000;
const int kHighAverageEndHz = 6500;
// Quantile of mask values which is used to estimate target presence.
const float kMaskQuantile = 0.3f;
// Mask threshold over which the data is considered signal and not interference.
const float kMaskTargetThreshold = 0.3f;
// Time in seconds after which the data is considered interference if the mask
@ -141,12 +145,7 @@ float SumAbs(const ComplexMatrix<float>& mat) {
Beamformer::Beamformer(const std::vector<Point>& array_geometry)
: num_input_channels_(array_geometry.size()),
mic_spacing_(MicSpacingFromGeometry(array_geometry)) {
WindowGenerator::KaiserBesselDerived(kAlpha, kFftSize, window_);
for (int i = 0; i < kNumberSavedPostfilterMasks; ++i) {
postfilter_masks_[i].Resize(1, kNumFreqBins);
}
}
void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
@ -160,8 +159,7 @@ void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
Round(kHighAverageStartHz * kFftSize / sample_rate_hz_);
high_average_end_bin_ =
Round(kHighAverageEndHz * kFftSize / sample_rate_hz_);
current_block_ix_ = 0;
previous_block_ix_ = -1;
high_pass_postfilter_mask_ = 1.f;
is_target_present_ = false;
hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize;
interference_blocks_count_ = hold_target_blocks_;
@ -178,8 +176,8 @@ void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
kFftSize,
kFftSize / 2,
this));
for (int i = 0; i < kNumFreqBins; ++i) {
postfilter_mask_[i] = 1.f;
float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_;
wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds;
mask_thresholds_[i] = num_input_channels_ * num_input_channels_ *
@ -301,10 +299,6 @@ void Beamformer::ProcessChunk(const float* const* input,
// Apply delay and sum and post-filter in the time domain. WARNING: only works
// because delay-and-sum is not frequency dependent.
if (high_pass_split_input != NULL) {
if (previous_block_ix_ == -1) {
old_high_pass_mask = high_pass_postfilter_mask_;
}
// Ramp up/down for smoothing. 1 mask per 10ms results in audible
// discontinuities.
float ramp_inc =
@ -333,8 +327,6 @@ void Beamformer::ProcessAudioBlock(const complex_f* const* input,
CHECK_EQ(num_input_channels, num_input_channels_);
CHECK_EQ(num_output_channels, 1);
float* mask_data = postfilter_masks_[current_block_ix_].elements()[0];
// Calculating the post-filter masks. Note that we need two for each
// frequency bin to account for the positive and negative interferer
// angle.
@ -356,33 +348,25 @@ void Beamformer::ProcessAudioBlock(const complex_f* const* input,
rmw *= rmw;
float rmw_r = rmw.real();
mask_data[i] = CalculatePostfilterMask(interf_cov_mats_[i],
new_mask_[i] = CalculatePostfilterMask(interf_cov_mats_[i],
rpsiws_[i],
ratio_rxiw_rxim,
rmw_r,
mask_thresholds_[i]);
mask_data[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
new_mask_[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
reflected_rpsiws_[i],
ratio_rxiw_rxim,
rmw_r,
mask_thresholds_[i]);
}
EstimateTargetPresence(mask_data, kNumFreqBins);
// Can't access block_index - 1 on the first block.
if (previous_block_ix_ >= 0) {
ApplyMaskSmoothing();
}
ApplyMaskSmoothing();
ApplyLowFrequencyCorrection();
ApplyHighFrequencyCorrection();
ApplyMasks(input, output);
previous_block_ix_ = current_block_ix_;
current_block_ix_ = (current_block_ix_ + 1) % kNumberSavedPostfilterMasks;
EstimateTargetPresence();
}
float Beamformer::CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
@ -411,8 +395,6 @@ float Beamformer::CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
void Beamformer::ApplyMasks(const complex_f* const* input,
complex_f* const* output) {
complex_f* output_channel = output[0];
const float* postfilter_mask_els =
postfilter_masks_[current_block_ix_].elements()[0];
for (int f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {
output_channel[f_ix] = complex_f(0.f, 0.f);
@ -422,45 +404,40 @@ void Beamformer::ApplyMasks(const complex_f* const* input,
output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix];
}
output_channel[f_ix] *= postfilter_mask_els[f_ix];
output_channel[f_ix] *= postfilter_mask_[f_ix];
}
}
void Beamformer::ApplyMaskSmoothing() {
float* current_mask_els = postfilter_masks_[current_block_ix_].elements()[0];
const float* previous_block_els =
postfilter_masks_[previous_block_ix_].elements()[0];
for (int i = 0; i < kNumFreqBins; ++i) {
current_mask_els[i] = kMaskSmoothAlpha * current_mask_els[i] +
(1.f - kMaskSmoothAlpha) * previous_block_els[i];
postfilter_mask_[i] = kMaskSmoothAlpha * new_mask_[i] +
(1.f - kMaskSmoothAlpha) * postfilter_mask_[i];
}
}
void Beamformer::ApplyLowFrequencyCorrection() {
float low_frequency_mask = 0.f;
float* mask_els = postfilter_masks_[current_block_ix_].elements()[0];
for (int i = low_average_start_bin_; i < low_average_end_bin_; ++i) {
low_frequency_mask += mask_els[i];
low_frequency_mask += postfilter_mask_[i];
}
low_frequency_mask /= low_average_end_bin_ - low_average_start_bin_;
for (int i = 0; i < low_average_start_bin_; ++i) {
mask_els[i] = low_frequency_mask;
postfilter_mask_[i] = low_frequency_mask;
}
}
void Beamformer::ApplyHighFrequencyCorrection() {
high_pass_postfilter_mask_ = 0.f;
float* mask_els = postfilter_masks_[current_block_ix_].elements()[0];
for (int i = high_average_start_bin_; i < high_average_end_bin_; ++i) {
high_pass_postfilter_mask_ += mask_els[i];
high_pass_postfilter_mask_ += postfilter_mask_[i];
}
high_pass_postfilter_mask_ /= high_average_end_bin_ - high_average_start_bin_;
for (int i = high_average_end_bin_; i < kNumFreqBins; ++i) {
mask_els[i] = high_pass_postfilter_mask_;
postfilter_mask_[i] = high_pass_postfilter_mask_;
}
}
@ -478,13 +455,13 @@ float Beamformer::MicSpacingFromGeometry(const std::vector<Point>& geometry) {
return sqrt(mic_spacing);
}
void Beamformer::EstimateTargetPresence(float* mask, int length) {
memcpy(sorted_mask_, mask, kNumFreqBins * sizeof(*mask));
const int median_ix = (length + 1) / 2;
std::nth_element(sorted_mask_,
sorted_mask_ + median_ix,
sorted_mask_ + length);
if (sorted_mask_[median_ix] > kMaskTargetThreshold) {
void Beamformer::EstimateTargetPresence() {
const int quantile = (1.f - kMaskQuantile) * high_average_end_bin_ +
kMaskQuantile * low_average_start_bin_;
std::nth_element(new_mask_ + low_average_start_bin_,
new_mask_ + quantile,
new_mask_ + high_average_end_bin_);
if (new_mask_[quantile] > kMaskTargetThreshold) {
is_target_present_ = true;
interference_blocks_count_ = 0;
} else {

View File

@ -100,13 +100,10 @@ class Beamformer : public LappedTransform::Callback {
void ApplyMasks(const complex_f* const* input, complex_f* const* output);
float MicSpacingFromGeometry(const std::vector<Point>& array_geometry);
void EstimateTargetPresence(float* mask, int length);
void EstimateTargetPresence();
static const int kFftSize = 256;
static const int kNumFreqBins = kFftSize / 2 + 1;
// How many blocks of past masks (including the current block) we save. Saved
// masks are used for postprocessing such as removing musical noise.
static const int kNumberSavedPostfilterMasks = 2;
// Deals with the fft transform and blocking.
int chunk_length_;
@ -124,14 +121,9 @@ class Beamformer : public LappedTransform::Callback {
int high_average_start_bin_;
int high_average_end_bin_;
// Indices into |postfilter_masks_|.
int current_block_ix_;
int previous_block_ix_;
// Old masks are saved in this ring buffer for smoothing. Array of length
// |kNumberSavedMasks| matrix of size 1 x |kNumFreqBins|.
MatrixF postfilter_masks_[kNumberSavedPostfilterMasks];
float sorted_mask_[kNumFreqBins];
// Old masks are saved for smoothing. Matrix of size 1 x |kNumFreqBins|.
float postfilter_mask_[kNumFreqBins];
float new_mask_[kNumFreqBins];
// Array of length |kNumFreqBins|, Matrix of size |1| x |num_channels_|.
ComplexMatrixF delay_sum_masks_[kNumFreqBins];