Simplify mask calculation
There are only 2 things that prevent the output from being bit-exact: * The zero initialization of the postfilter_mask_ and high_pass_postfilter_mask_, which only affects the first blocks. * The re-tuning of the target presence estimation, since only the bins between low_average_start_bin_ and high_average_end_bin_ are of interest. The latter was not taken into account before. R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/35139004 Cr-Commit-Position: refs/heads/master@{#8368} git-svn-id: http://webrtc.googlecode.com/svn/trunk@8368 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
56cb0ea99c
commit
92a19bcbd7
@ -15,6 +15,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
|
#include "webrtc/base/arraysize.h"
|
||||||
#include "webrtc/common_audio/window_generator.h"
|
#include "webrtc/common_audio/window_generator.h"
|
||||||
#include "webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.h"
|
#include "webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.h"
|
||||||
|
|
||||||
@ -61,13 +62,16 @@ const float kCovUniformGapHalfWidth = 0.001f;
|
|||||||
// Alpha coefficient for mask smoothing.
|
// Alpha coefficient for mask smoothing.
|
||||||
const float kMaskSmoothAlpha = 0.2f;
|
const float kMaskSmoothAlpha = 0.2f;
|
||||||
|
|
||||||
// The average mask is computed from masks in this mid-frequency range.
|
// The average mask is computed from masks in this mid-frequency range. If these
|
||||||
|
// ranges are changed |kMaskQuantile| might need to be adjusted.
|
||||||
const int kLowAverageStartHz = 200;
|
const int kLowAverageStartHz = 200;
|
||||||
const int kLowAverageEndHz = 400;
|
const int kLowAverageEndHz = 400;
|
||||||
|
|
||||||
const int kHighAverageStartHz = 6000;
|
const int kHighAverageStartHz = 6000;
|
||||||
const int kHighAverageEndHz = 6500;
|
const int kHighAverageEndHz = 6500;
|
||||||
|
|
||||||
|
// Quantile of mask values which is used to estimate target presence.
|
||||||
|
const float kMaskQuantile = 0.3f;
|
||||||
// Mask threshold over which the data is considered signal and not interference.
|
// Mask threshold over which the data is considered signal and not interference.
|
||||||
const float kMaskTargetThreshold = 0.3f;
|
const float kMaskTargetThreshold = 0.3f;
|
||||||
// Time in seconds after which the data is considered interference if the mask
|
// Time in seconds after which the data is considered interference if the mask
|
||||||
@ -141,12 +145,7 @@ float SumAbs(const ComplexMatrix<float>& mat) {
|
|||||||
Beamformer::Beamformer(const std::vector<Point>& array_geometry)
|
Beamformer::Beamformer(const std::vector<Point>& array_geometry)
|
||||||
: num_input_channels_(array_geometry.size()),
|
: num_input_channels_(array_geometry.size()),
|
||||||
mic_spacing_(MicSpacingFromGeometry(array_geometry)) {
|
mic_spacing_(MicSpacingFromGeometry(array_geometry)) {
|
||||||
|
|
||||||
WindowGenerator::KaiserBesselDerived(kAlpha, kFftSize, window_);
|
WindowGenerator::KaiserBesselDerived(kAlpha, kFftSize, window_);
|
||||||
|
|
||||||
for (int i = 0; i < kNumberSavedPostfilterMasks; ++i) {
|
|
||||||
postfilter_masks_[i].Resize(1, kNumFreqBins);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
|
void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
|
||||||
@ -160,8 +159,7 @@ void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
|
|||||||
Round(kHighAverageStartHz * kFftSize / sample_rate_hz_);
|
Round(kHighAverageStartHz * kFftSize / sample_rate_hz_);
|
||||||
high_average_end_bin_ =
|
high_average_end_bin_ =
|
||||||
Round(kHighAverageEndHz * kFftSize / sample_rate_hz_);
|
Round(kHighAverageEndHz * kFftSize / sample_rate_hz_);
|
||||||
current_block_ix_ = 0;
|
high_pass_postfilter_mask_ = 1.f;
|
||||||
previous_block_ix_ = -1;
|
|
||||||
is_target_present_ = false;
|
is_target_present_ = false;
|
||||||
hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize;
|
hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize;
|
||||||
interference_blocks_count_ = hold_target_blocks_;
|
interference_blocks_count_ = hold_target_blocks_;
|
||||||
@ -178,8 +176,8 @@ void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
|
|||||||
kFftSize,
|
kFftSize,
|
||||||
kFftSize / 2,
|
kFftSize / 2,
|
||||||
this));
|
this));
|
||||||
|
|
||||||
for (int i = 0; i < kNumFreqBins; ++i) {
|
for (int i = 0; i < kNumFreqBins; ++i) {
|
||||||
|
postfilter_mask_[i] = 1.f;
|
||||||
float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_;
|
float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_;
|
||||||
wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds;
|
wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds;
|
||||||
mask_thresholds_[i] = num_input_channels_ * num_input_channels_ *
|
mask_thresholds_[i] = num_input_channels_ * num_input_channels_ *
|
||||||
@ -301,10 +299,6 @@ void Beamformer::ProcessChunk(const float* const* input,
|
|||||||
// Apply delay and sum and post-filter in the time domain. WARNING: only works
|
// Apply delay and sum and post-filter in the time domain. WARNING: only works
|
||||||
// because delay-and-sum is not frequency dependent.
|
// because delay-and-sum is not frequency dependent.
|
||||||
if (high_pass_split_input != NULL) {
|
if (high_pass_split_input != NULL) {
|
||||||
if (previous_block_ix_ == -1) {
|
|
||||||
old_high_pass_mask = high_pass_postfilter_mask_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ramp up/down for smoothing. 1 mask per 10ms results in audible
|
// Ramp up/down for smoothing. 1 mask per 10ms results in audible
|
||||||
// discontinuities.
|
// discontinuities.
|
||||||
float ramp_inc =
|
float ramp_inc =
|
||||||
@ -333,8 +327,6 @@ void Beamformer::ProcessAudioBlock(const complex_f* const* input,
|
|||||||
CHECK_EQ(num_input_channels, num_input_channels_);
|
CHECK_EQ(num_input_channels, num_input_channels_);
|
||||||
CHECK_EQ(num_output_channels, 1);
|
CHECK_EQ(num_output_channels, 1);
|
||||||
|
|
||||||
float* mask_data = postfilter_masks_[current_block_ix_].elements()[0];
|
|
||||||
|
|
||||||
// Calculating the post-filter masks. Note that we need two for each
|
// Calculating the post-filter masks. Note that we need two for each
|
||||||
// frequency bin to account for the positive and negative interferer
|
// frequency bin to account for the positive and negative interferer
|
||||||
// angle.
|
// angle.
|
||||||
@ -356,33 +348,25 @@ void Beamformer::ProcessAudioBlock(const complex_f* const* input,
|
|||||||
rmw *= rmw;
|
rmw *= rmw;
|
||||||
float rmw_r = rmw.real();
|
float rmw_r = rmw.real();
|
||||||
|
|
||||||
mask_data[i] = CalculatePostfilterMask(interf_cov_mats_[i],
|
new_mask_[i] = CalculatePostfilterMask(interf_cov_mats_[i],
|
||||||
rpsiws_[i],
|
rpsiws_[i],
|
||||||
ratio_rxiw_rxim,
|
ratio_rxiw_rxim,
|
||||||
rmw_r,
|
rmw_r,
|
||||||
mask_thresholds_[i]);
|
mask_thresholds_[i]);
|
||||||
|
|
||||||
mask_data[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
|
new_mask_[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
|
||||||
reflected_rpsiws_[i],
|
reflected_rpsiws_[i],
|
||||||
ratio_rxiw_rxim,
|
ratio_rxiw_rxim,
|
||||||
rmw_r,
|
rmw_r,
|
||||||
mask_thresholds_[i]);
|
mask_thresholds_[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
EstimateTargetPresence(mask_data, kNumFreqBins);
|
ApplyMaskSmoothing();
|
||||||
|
|
||||||
// Can't access block_index - 1 on the first block.
|
|
||||||
if (previous_block_ix_ >= 0) {
|
|
||||||
ApplyMaskSmoothing();
|
|
||||||
}
|
|
||||||
|
|
||||||
ApplyLowFrequencyCorrection();
|
ApplyLowFrequencyCorrection();
|
||||||
ApplyHighFrequencyCorrection();
|
ApplyHighFrequencyCorrection();
|
||||||
|
|
||||||
ApplyMasks(input, output);
|
ApplyMasks(input, output);
|
||||||
|
|
||||||
previous_block_ix_ = current_block_ix_;
|
EstimateTargetPresence();
|
||||||
current_block_ix_ = (current_block_ix_ + 1) % kNumberSavedPostfilterMasks;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
float Beamformer::CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
|
float Beamformer::CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
|
||||||
@ -411,8 +395,6 @@ float Beamformer::CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
|
|||||||
void Beamformer::ApplyMasks(const complex_f* const* input,
|
void Beamformer::ApplyMasks(const complex_f* const* input,
|
||||||
complex_f* const* output) {
|
complex_f* const* output) {
|
||||||
complex_f* output_channel = output[0];
|
complex_f* output_channel = output[0];
|
||||||
const float* postfilter_mask_els =
|
|
||||||
postfilter_masks_[current_block_ix_].elements()[0];
|
|
||||||
for (int f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {
|
for (int f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {
|
||||||
output_channel[f_ix] = complex_f(0.f, 0.f);
|
output_channel[f_ix] = complex_f(0.f, 0.f);
|
||||||
|
|
||||||
@ -422,45 +404,40 @@ void Beamformer::ApplyMasks(const complex_f* const* input,
|
|||||||
output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix];
|
output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix];
|
||||||
}
|
}
|
||||||
|
|
||||||
output_channel[f_ix] *= postfilter_mask_els[f_ix];
|
output_channel[f_ix] *= postfilter_mask_[f_ix];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Beamformer::ApplyMaskSmoothing() {
|
void Beamformer::ApplyMaskSmoothing() {
|
||||||
float* current_mask_els = postfilter_masks_[current_block_ix_].elements()[0];
|
|
||||||
const float* previous_block_els =
|
|
||||||
postfilter_masks_[previous_block_ix_].elements()[0];
|
|
||||||
for (int i = 0; i < kNumFreqBins; ++i) {
|
for (int i = 0; i < kNumFreqBins; ++i) {
|
||||||
current_mask_els[i] = kMaskSmoothAlpha * current_mask_els[i] +
|
postfilter_mask_[i] = kMaskSmoothAlpha * new_mask_[i] +
|
||||||
(1.f - kMaskSmoothAlpha) * previous_block_els[i];
|
(1.f - kMaskSmoothAlpha) * postfilter_mask_[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Beamformer::ApplyLowFrequencyCorrection() {
|
void Beamformer::ApplyLowFrequencyCorrection() {
|
||||||
float low_frequency_mask = 0.f;
|
float low_frequency_mask = 0.f;
|
||||||
float* mask_els = postfilter_masks_[current_block_ix_].elements()[0];
|
|
||||||
for (int i = low_average_start_bin_; i < low_average_end_bin_; ++i) {
|
for (int i = low_average_start_bin_; i < low_average_end_bin_; ++i) {
|
||||||
low_frequency_mask += mask_els[i];
|
low_frequency_mask += postfilter_mask_[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
low_frequency_mask /= low_average_end_bin_ - low_average_start_bin_;
|
low_frequency_mask /= low_average_end_bin_ - low_average_start_bin_;
|
||||||
|
|
||||||
for (int i = 0; i < low_average_start_bin_; ++i) {
|
for (int i = 0; i < low_average_start_bin_; ++i) {
|
||||||
mask_els[i] = low_frequency_mask;
|
postfilter_mask_[i] = low_frequency_mask;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Beamformer::ApplyHighFrequencyCorrection() {
|
void Beamformer::ApplyHighFrequencyCorrection() {
|
||||||
high_pass_postfilter_mask_ = 0.f;
|
high_pass_postfilter_mask_ = 0.f;
|
||||||
float* mask_els = postfilter_masks_[current_block_ix_].elements()[0];
|
|
||||||
for (int i = high_average_start_bin_; i < high_average_end_bin_; ++i) {
|
for (int i = high_average_start_bin_; i < high_average_end_bin_; ++i) {
|
||||||
high_pass_postfilter_mask_ += mask_els[i];
|
high_pass_postfilter_mask_ += postfilter_mask_[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
high_pass_postfilter_mask_ /= high_average_end_bin_ - high_average_start_bin_;
|
high_pass_postfilter_mask_ /= high_average_end_bin_ - high_average_start_bin_;
|
||||||
|
|
||||||
for (int i = high_average_end_bin_; i < kNumFreqBins; ++i) {
|
for (int i = high_average_end_bin_; i < kNumFreqBins; ++i) {
|
||||||
mask_els[i] = high_pass_postfilter_mask_;
|
postfilter_mask_[i] = high_pass_postfilter_mask_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -478,13 +455,13 @@ float Beamformer::MicSpacingFromGeometry(const std::vector<Point>& geometry) {
|
|||||||
return sqrt(mic_spacing);
|
return sqrt(mic_spacing);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Beamformer::EstimateTargetPresence(float* mask, int length) {
|
void Beamformer::EstimateTargetPresence() {
|
||||||
memcpy(sorted_mask_, mask, kNumFreqBins * sizeof(*mask));
|
const int quantile = (1.f - kMaskQuantile) * high_average_end_bin_ +
|
||||||
const int median_ix = (length + 1) / 2;
|
kMaskQuantile * low_average_start_bin_;
|
||||||
std::nth_element(sorted_mask_,
|
std::nth_element(new_mask_ + low_average_start_bin_,
|
||||||
sorted_mask_ + median_ix,
|
new_mask_ + quantile,
|
||||||
sorted_mask_ + length);
|
new_mask_ + high_average_end_bin_);
|
||||||
if (sorted_mask_[median_ix] > kMaskTargetThreshold) {
|
if (new_mask_[quantile] > kMaskTargetThreshold) {
|
||||||
is_target_present_ = true;
|
is_target_present_ = true;
|
||||||
interference_blocks_count_ = 0;
|
interference_blocks_count_ = 0;
|
||||||
} else {
|
} else {
|
||||||
|
@ -100,13 +100,10 @@ class Beamformer : public LappedTransform::Callback {
|
|||||||
void ApplyMasks(const complex_f* const* input, complex_f* const* output);
|
void ApplyMasks(const complex_f* const* input, complex_f* const* output);
|
||||||
|
|
||||||
float MicSpacingFromGeometry(const std::vector<Point>& array_geometry);
|
float MicSpacingFromGeometry(const std::vector<Point>& array_geometry);
|
||||||
void EstimateTargetPresence(float* mask, int length);
|
void EstimateTargetPresence();
|
||||||
|
|
||||||
static const int kFftSize = 256;
|
static const int kFftSize = 256;
|
||||||
static const int kNumFreqBins = kFftSize / 2 + 1;
|
static const int kNumFreqBins = kFftSize / 2 + 1;
|
||||||
// How many blocks of past masks (including the current block) we save. Saved
|
|
||||||
// masks are used for postprocessing such as removing musical noise.
|
|
||||||
static const int kNumberSavedPostfilterMasks = 2;
|
|
||||||
|
|
||||||
// Deals with the fft transform and blocking.
|
// Deals with the fft transform and blocking.
|
||||||
int chunk_length_;
|
int chunk_length_;
|
||||||
@ -124,14 +121,9 @@ class Beamformer : public LappedTransform::Callback {
|
|||||||
int high_average_start_bin_;
|
int high_average_start_bin_;
|
||||||
int high_average_end_bin_;
|
int high_average_end_bin_;
|
||||||
|
|
||||||
// Indices into |postfilter_masks_|.
|
// Old masks are saved for smoothing. Matrix of size 1 x |kNumFreqBins|.
|
||||||
int current_block_ix_;
|
float postfilter_mask_[kNumFreqBins];
|
||||||
int previous_block_ix_;
|
float new_mask_[kNumFreqBins];
|
||||||
|
|
||||||
// Old masks are saved in this ring buffer for smoothing. Array of length
|
|
||||||
// |kNumberSavedMasks| matrix of size 1 x |kNumFreqBins|.
|
|
||||||
MatrixF postfilter_masks_[kNumberSavedPostfilterMasks];
|
|
||||||
float sorted_mask_[kNumFreqBins];
|
|
||||||
|
|
||||||
// Array of length |kNumFreqBins|, Matrix of size |1| x |num_channels_|.
|
// Array of length |kNumFreqBins|, Matrix of size |1| x |num_channels_|.
|
||||||
ComplexMatrixF delay_sum_masks_[kNumFreqBins];
|
ComplexMatrixF delay_sum_masks_[kNumFreqBins];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user