Simplify mask calculation

There are only 2 things that prevent the output from being bit-exact: * The zero initialization of the postfilter_mask_ and high_pass_postfilter_mask_, which only affects the first blocks. * The re-tuning of the target presence estimation, since only the bins between low_average_start_bin_ and high_average_end_bin_ are of interest. The latter was not taken into account before. R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/35139004 Cr-Commit-Position: refs/heads/master@{#8368} git-svn-id: http://webrtc.googlecode.com/svn/trunk@8368 4adac7df-926f-26a2-2b94-8c16560cd09d
2015-02-13 19:37:38 +00:00 · 2015-02-13 19:37:38 +00:00 · 92a19bcbd7
commit 92a19bcbd7
parent 56cb0ea99c
2 changed files with 29 additions and 60 deletions
--- a/webrtc/modules/audio_processing/beamformer/beamformer.cc
+++ b/webrtc/modules/audio_processing/beamformer/beamformer.cc
@ -15,6 +15,7 @@
 #include <algorithm>
 #include <cmath>

+#include "webrtc/base/arraysize.h"
 #include "webrtc/common_audio/window_generator.h"
 #include "webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.h"

@ -61,13 +62,16 @@ const float kCovUniformGapHalfWidth = 0.001f;
 // Alpha coefficient for mask smoothing.
 const float kMaskSmoothAlpha = 0.2f;

-// The average mask is computed from masks in this mid-frequency range.
+// The average mask is computed from masks in this mid-frequency range. If these
+// ranges are changed |kMaskQuantile| might need to be adjusted.
 const int kLowAverageStartHz = 200;
 const int kLowAverageEndHz = 400;

 const int kHighAverageStartHz = 6000;
 const int kHighAverageEndHz = 6500;

+// Quantile of mask values which is used to estimate target presence.
+const float kMaskQuantile = 0.3f;
 // Mask threshold over which the data is considered signal and not interference.
 const float kMaskTargetThreshold = 0.3f;
 // Time in seconds after which the data is considered interference if the mask
@ -141,12 +145,7 @@ float SumAbs(const ComplexMatrix<float>& mat) {
 Beamformer::Beamformer(const std::vector<Point>& array_geometry)
    : num_input_channels_(array_geometry.size()),
      mic_spacing_(MicSpacingFromGeometry(array_geometry)) {
-
  WindowGenerator::KaiserBesselDerived(kAlpha, kFftSize, window_);
-
-  for (int i = 0; i < kNumberSavedPostfilterMasks; ++i) {
-    postfilter_masks_[i].Resize(1, kNumFreqBins);
-  }
 }

 void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
@ -160,8 +159,7 @@ void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
      Round(kHighAverageStartHz * kFftSize / sample_rate_hz_);
  high_average_end_bin_ =
      Round(kHighAverageEndHz * kFftSize / sample_rate_hz_);
-  current_block_ix_ = 0;
-  previous_block_ix_ = -1;
+  high_pass_postfilter_mask_ = 1.f;
  is_target_present_ = false;
  hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize;
  interference_blocks_count_ = hold_target_blocks_;
@ -178,8 +176,8 @@ void Beamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
                                              kFftSize,
                                              kFftSize / 2,
                                              this));
-
  for (int i = 0; i < kNumFreqBins; ++i) {
+    postfilter_mask_[i] = 1.f;
    float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_;
    wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds;
    mask_thresholds_[i] = num_input_channels_ * num_input_channels_ *
@ -301,10 +299,6 @@ void Beamformer::ProcessChunk(const float* const* input,
  // Apply delay and sum and post-filter in the time domain. WARNING: only works
  // because delay-and-sum is not frequency dependent.
  if (high_pass_split_input != NULL) {
-    if (previous_block_ix_ == -1) {
-      old_high_pass_mask = high_pass_postfilter_mask_;
-    }
-
    // Ramp up/down for smoothing. 1 mask per 10ms results in audible
    // discontinuities.
    float ramp_inc =
@ -333,8 +327,6 @@ void Beamformer::ProcessAudioBlock(const complex_f* const* input,
  CHECK_EQ(num_input_channels, num_input_channels_);
  CHECK_EQ(num_output_channels, 1);

-  float* mask_data = postfilter_masks_[current_block_ix_].elements()[0];
-
  // Calculating the post-filter masks. Note that we need two for each
  // frequency bin to account for the positive and negative interferer
  // angle.
@ -356,33 +348,25 @@ void Beamformer::ProcessAudioBlock(const complex_f* const* input,
    rmw *= rmw;
    float rmw_r = rmw.real();

-    mask_data[i] = CalculatePostfilterMask(interf_cov_mats_[i],
+    new_mask_[i] = CalculatePostfilterMask(interf_cov_mats_[i],
                                           rpsiws_[i],
                                           ratio_rxiw_rxim,
                                           rmw_r,
                                           mask_thresholds_[i]);

-    mask_data[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
+    new_mask_[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
                                            reflected_rpsiws_[i],
                                            ratio_rxiw_rxim,
                                            rmw_r,
                                            mask_thresholds_[i]);
  }

-  EstimateTargetPresence(mask_data, kNumFreqBins);
-
-  // Can't access block_index - 1 on the first block.
-  if (previous_block_ix_ >= 0) {
-    ApplyMaskSmoothing();
-  }
-
+  ApplyMaskSmoothing();
  ApplyLowFrequencyCorrection();
  ApplyHighFrequencyCorrection();
-
  ApplyMasks(input, output);

-  previous_block_ix_ = current_block_ix_;
-  current_block_ix_ = (current_block_ix_ + 1) % kNumberSavedPostfilterMasks;
+  EstimateTargetPresence();
 }

 float Beamformer::CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
@ -411,8 +395,6 @@ float Beamformer::CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
 void Beamformer::ApplyMasks(const complex_f* const* input,
                            complex_f* const* output) {
  complex_f* output_channel = output[0];
-  const float* postfilter_mask_els =
-      postfilter_masks_[current_block_ix_].elements()[0];
  for (int f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {
    output_channel[f_ix] = complex_f(0.f, 0.f);

@ -422,45 +404,40 @@ void Beamformer::ApplyMasks(const complex_f* const* input,
      output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix];
    }

-    output_channel[f_ix] *= postfilter_mask_els[f_ix];
+    output_channel[f_ix] *= postfilter_mask_[f_ix];
  }
 }

 void Beamformer::ApplyMaskSmoothing() {
-  float* current_mask_els = postfilter_masks_[current_block_ix_].elements()[0];
-  const float* previous_block_els =
-      postfilter_masks_[previous_block_ix_].elements()[0];
  for (int i = 0; i < kNumFreqBins; ++i) {
-    current_mask_els[i] = kMaskSmoothAlpha * current_mask_els[i] +
-                          (1.f - kMaskSmoothAlpha) * previous_block_els[i];
+    postfilter_mask_[i] = kMaskSmoothAlpha * new_mask_[i] +
+                          (1.f - kMaskSmoothAlpha) * postfilter_mask_[i];
  }
 }

 void Beamformer::ApplyLowFrequencyCorrection() {
  float low_frequency_mask = 0.f;
-  float* mask_els = postfilter_masks_[current_block_ix_].elements()[0];
  for (int i = low_average_start_bin_; i < low_average_end_bin_; ++i) {
-    low_frequency_mask += mask_els[i];
+    low_frequency_mask += postfilter_mask_[i];
  }

  low_frequency_mask /= low_average_end_bin_ - low_average_start_bin_;

  for (int i = 0; i < low_average_start_bin_; ++i) {
-    mask_els[i] = low_frequency_mask;
+    postfilter_mask_[i] = low_frequency_mask;
  }
 }

 void Beamformer::ApplyHighFrequencyCorrection() {
  high_pass_postfilter_mask_ = 0.f;
-  float* mask_els = postfilter_masks_[current_block_ix_].elements()[0];
  for (int i = high_average_start_bin_; i < high_average_end_bin_; ++i) {
-    high_pass_postfilter_mask_ += mask_els[i];
+    high_pass_postfilter_mask_ += postfilter_mask_[i];
  }

  high_pass_postfilter_mask_ /= high_average_end_bin_ - high_average_start_bin_;

  for (int i = high_average_end_bin_; i < kNumFreqBins; ++i) {
-    mask_els[i] = high_pass_postfilter_mask_;
+    postfilter_mask_[i] = high_pass_postfilter_mask_;
  }
 }

@ -478,13 +455,13 @@ float Beamformer::MicSpacingFromGeometry(const std::vector<Point>& geometry) {
  return sqrt(mic_spacing);
 }

-void Beamformer::EstimateTargetPresence(float* mask, int length) {
-  memcpy(sorted_mask_, mask, kNumFreqBins * sizeof(*mask));
-  const int median_ix = (length + 1) / 2;
-  std::nth_element(sorted_mask_,
-                   sorted_mask_ + median_ix,
-                   sorted_mask_ + length);
-  if (sorted_mask_[median_ix] > kMaskTargetThreshold) {
+void Beamformer::EstimateTargetPresence() {
+  const int quantile = (1.f - kMaskQuantile) * high_average_end_bin_ +
+                       kMaskQuantile * low_average_start_bin_;
+  std::nth_element(new_mask_ + low_average_start_bin_,
+                   new_mask_ + quantile,
+                   new_mask_ + high_average_end_bin_);
+  if (new_mask_[quantile] > kMaskTargetThreshold) {
    is_target_present_ = true;
    interference_blocks_count_ = 0;
  } else {
--- a/webrtc/modules/audio_processing/beamformer/beamformer.h
+++ b/webrtc/modules/audio_processing/beamformer/beamformer.h
@ -100,13 +100,10 @@ class Beamformer : public LappedTransform::Callback {
  void ApplyMasks(const complex_f* const* input, complex_f* const* output);

  float MicSpacingFromGeometry(const std::vector<Point>& array_geometry);
-  void EstimateTargetPresence(float* mask, int length);
+  void EstimateTargetPresence();

  static const int kFftSize = 256;
  static const int kNumFreqBins = kFftSize / 2 + 1;
-  // How many blocks of past masks (including the current block) we save. Saved
-  // masks are used for postprocessing such as removing musical noise.
-  static const int kNumberSavedPostfilterMasks = 2;

  // Deals with the fft transform and blocking.
  int chunk_length_;
@ -124,14 +121,9 @@ class Beamformer : public LappedTransform::Callback {
  int high_average_start_bin_;
  int high_average_end_bin_;

-  // Indices into |postfilter_masks_|.
-  int current_block_ix_;
-  int previous_block_ix_;
-
-  // Old masks are saved in this ring buffer for smoothing. Array of length
-  // |kNumberSavedMasks| matrix of size 1 x |kNumFreqBins|.
-  MatrixF postfilter_masks_[kNumberSavedPostfilterMasks];
-  float sorted_mask_[kNumFreqBins];
+  // Old masks are saved for smoothing. Arrays of length |kNumFreqBins|.
+  float postfilter_mask_[kNumFreqBins];
+  float new_mask_[kNumFreqBins];

  // Array of length |kNumFreqBins|, Matrix of size |1| x |num_channels_|.
  ComplexMatrixF delay_sum_masks_[kNumFreqBins];