Separate Analyze from Process in NS
Filled in the empty analyze API, separating the noise estimation from the process API. No formatting fixes or extra refactoring have been done, to make the review process easier. This patch has been tested for bit-exactness over the whole QA set at every aggressiveness setting.

BUG=webrtc:3811
R=bjornv@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/27549004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7243 4adac7df-926f-26a2-2b94-8c16560cd09d
commit fbf3bfe172 (parent 95705602bd)
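To illustrate the split for reviewers, a minimal per-frame call sequence might look like the sketch below. It is not part of the patch: the wrapper name, the 16 kHz / 160-sample frame size, and the "ns_core.h" include path are assumptions; only NSinst_t, WebRtcNs_AnalyzeCore, and WebRtcNs_ProcessCore (with their 0/-1 return convention) come from the diff itself.

#include <stddef.h>   /* NULL */
#include "ns_core.h"  /* assumed header path; declares NSinst_t and the *Core functions */

/* Hypothetical wrapper, not patch code: one 10 ms frame at 16 kHz (160 float
 * samples). Noise analysis runs first on the input frame, then suppression.
 * The high-band pointers are only required when inst->fs == 32000, so NULL is
 * passed here. */
static int SuppressFrame16k(NSinst_t* inst, float* frame, float* out) {
  if (WebRtcNs_AnalyzeCore(inst, frame) != 0) {
    return -1;  /* e.g. instance not initialized */
  }
  return WebRtcNs_ProcessCore(inst, frame, NULL, out, NULL);
}

Bit-exactness is preserved because the analysis pass buffers into the new analyzeBuf, while WebRtcNs_ProcessCore keeps feeding dataBuf exactly as before.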
Noise suppression core implementation:

@@ -113,6 +113,7 @@ int WebRtcNs_InitCore(NSinst_t* inst, uint32_t fs) {
   memset(inst->dataBuf, 0, sizeof(float) * ANAL_BLOCKL_MAX);
   WebRtc_rdft(inst->anaLen, 1, inst->dataBuf, inst->ip, inst->wfft);
 
+  memset(inst->analyzeBuf, 0, sizeof(float) * ANAL_BLOCKL_MAX);
   memset(inst->dataBuf, 0, sizeof(float) * ANAL_BLOCKL_MAX);
   memset(inst->syntBuf, 0, sizeof(float) * ANAL_BLOCKL_MAX);
 
@@ -147,7 +148,7 @@ int WebRtcNs_InitCore(NSinst_t* inst, uint32_t fs) {
     inst->noisePrev[i] = (float)0.0; //previous noise-spectrum
     inst->logLrtTimeAvg[i] = LRT_FEATURE_THR; //smooth LR ratio (same as threshold)
     inst->magnAvgPause[i] = (float)0.0; //conservative noise spectrum estimate
-    inst->speechProbHB[i] = (float)0.0; //for estimation of HB in second pass
+    inst->speechProb[i] = (float)0.0; //for estimation of HB in second pass
     inst->initMagnEst[i] = (float)0.0; //initial average mag spectrum
   }
 
@@ -714,34 +715,20 @@ void WebRtcNs_SpeechNoiseProb(NSinst_t* inst, float* probSpeechFinal, float* snr
   }
 }
 
-int WebRtcNs_AnalyzeCore(NSinst_t* inst, float* inFrame) {
-  return 0;
-}
-
-int WebRtcNs_ProcessCore(NSinst_t* inst,
-                         float* speechFrame,
-                         float* speechFrameHB,
-                         float* outFrame,
-                         float* outFrameHB) {
-  // main routine for noise reduction
-
-  int flagHB = 0;
+int WebRtcNs_AnalyzeCore(NSinst_t* inst, float* speechFrame) {
   int i;
   const int kStartBand = 5; // Skip first frequency bins during estimation.
   int updateParsFlag;
 
-  float energy1, energy2, gain, factor, factor1, factor2;
+  float energy;
   float signalEnergy, sumMagn;
   float snrPrior, currentEstimateStsa;
   float tmpFloat1, tmpFloat2, tmpFloat3, probSpeech, probNonSpeech;
   float gammaNoiseTmp, gammaNoiseOld;
   float noiseUpdateTmp, fTmp;
-  float fout[BLOCKL_MAX];
   float winData[ANAL_BLOCKL_MAX];
   float magn[HALF_ANAL_BLOCKL], noise[HALF_ANAL_BLOCKL];
-  float theFilter[HALF_ANAL_BLOCKL], theFilterTmp[HALF_ANAL_BLOCKL];
   float snrLocPost[HALF_ANAL_BLOCKL], snrLocPrior[HALF_ANAL_BLOCKL];
-  float probSpeechFinal[HALF_ANAL_BLOCKL] = { 0 };
   float previousEstimateStsa[HALF_ANAL_BLOCKL];
   float real[ANAL_BLOCKL_MAX], imag[HALF_ANAL_BLOCKL];
   // Variables during startup
@@ -753,56 +740,29 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
   float parametric_exp = 0.0;
   float parametric_num = 0.0;
 
-  // SWB variables
-  int deltaBweHB = 1;
-  int deltaGainHB = 1;
-  float decayBweHB = 1.0;
-  float gainMapParHB = 1.0;
-  float gainTimeDomainHB = 1.0;
-  float avgProbSpeechHB, avgProbSpeechHBTmp, avgFilterGainHB, gainModHB;
-
   // Check that initiation has been done
   if (inst->initFlag != 1) {
     return (-1);
   }
-  // Check for valid pointers based on sampling rate
-  if (inst->fs == 32000) {
-    if (speechFrameHB == NULL) {
-      return -1;
-    }
-    flagHB = 1;
-    // range for averaging low band quantities for H band gain
-    deltaBweHB = (int)inst->magnLen / 4;
-    deltaGainHB = deltaBweHB;
-  }
   //
   updateParsFlag = inst->modelUpdatePars[0];
   //
 
   // update analysis buffer for L band
-  memcpy(inst->dataBuf, inst->dataBuf + inst->blockLen10ms,
+  memcpy(inst->analyzeBuf, inst->analyzeBuf + inst->blockLen10ms,
          sizeof(float) * (inst->anaLen - inst->blockLen10ms));
-  memcpy(inst->dataBuf + inst->anaLen - inst->blockLen10ms, speechFrame,
+  memcpy(inst->analyzeBuf + inst->anaLen - inst->blockLen10ms, speechFrame,
          sizeof(float) * inst->blockLen10ms);
 
-  if (flagHB == 1) {
-    // update analysis buffer for H band
-    memcpy(inst->dataBufHB, inst->dataBufHB + inst->blockLen10ms,
-           sizeof(float) * (inst->anaLen - inst->blockLen10ms));
-    memcpy(inst->dataBufHB + inst->anaLen - inst->blockLen10ms, speechFrameHB,
-           sizeof(float) * inst->blockLen10ms);
-  }
-
   // check if processing needed
   if (inst->outLen == 0) {
     // windowing
-    energy1 = 0.0;
+    energy = 0.0;
     for (i = 0; i < inst->anaLen; i++) {
-      winData[i] = inst->window[i] * inst->dataBuf[i];
-      energy1 += winData[i] * winData[i];
+      winData[i] = inst->window[i] * inst->analyzeBuf[i];
+      energy += winData[i] * winData[i];
     }
-    if (energy1 == 0.0) {
-      // synthesize the special case of zero input
+    if (energy == 0.0) {
       // we want to avoid updating statistics in this case:
      // Updating feature statistics when we have zeros only will cause thresholds to
      // move towards zero signal situations. This in turn has the effect that once the
@@ -810,34 +770,6 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
      // and there is no noise suppression effect. Depending on the duration of the
      // inactive signal it takes a considerable amount of time for the system to learn
      // what is noise and what is speech.
-
-      // read out fully processed segment
-      for (i = inst->windShift; i < inst->blockLen + inst->windShift; i++) {
-        fout[i - inst->windShift] = inst->syntBuf[i];
-      }
-      // update synthesis buffer
-      memcpy(inst->syntBuf, inst->syntBuf + inst->blockLen,
-             sizeof(float) * (inst->anaLen - inst->blockLen));
-      memset(inst->syntBuf + inst->anaLen - inst->blockLen, 0,
-             sizeof(float) * inst->blockLen);
-
-      // out buffer
-      inst->outLen = inst->blockLen - inst->blockLen10ms;
-      if (inst->blockLen > inst->blockLen10ms) {
-        for (i = 0; i < inst->outLen; i++) {
-          inst->outBuf[i] = fout[i + inst->blockLen10ms];
-        }
-      }
-      for (i = 0; i < inst->blockLen10ms; ++i)
-        outFrame[i] = WEBRTC_SPL_SAT(
-            WEBRTC_SPL_WORD16_MAX, fout[i], WEBRTC_SPL_WORD16_MIN);
-
-      // for time-domain gain of HB
-      if (flagHB == 1)
-        for (i = 0; i < inst->blockLen10ms; ++i)
-          outFrameHB[i] = WEBRTC_SPL_SAT(
-              WEBRTC_SPL_WORD16_MAX, inst->dataBufHB[i], WEBRTC_SPL_WORD16_MIN);
-
      return 0;
    }
@@ -1011,11 +943,11 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
      }
    }
    // compute speech/noise probability
-    WebRtcNs_SpeechNoiseProb(inst, probSpeechFinal, snrLocPrior, snrLocPost);
+    WebRtcNs_SpeechNoiseProb(inst, inst->speechProb, snrLocPrior, snrLocPost);
    // time-avg parameter for noise update
    gammaNoiseTmp = NOISE_UPDATE;
    for (i = 0; i < inst->magnLen; i++) {
-      probSpeech = probSpeechFinal[i];
+      probSpeech = inst->speechProb[i];
      probNonSpeech = (float)1.0 - probSpeech;
      // temporary noise update:
      // use it for speech frames if update value is less than previous
@@ -1094,14 +1026,124 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
      }
      // smoothing
      inst->smooth[i] = theFilter[i];
      real[i] *= inst->smooth[i];
      imag[i] *= inst->smooth[i];
    }
    // keep track of noise and magn spectrum for next frame
    for (i = 0; i < inst->magnLen; i++) {
      inst->noisePrev[i] = noise[i];
      inst->magnPrev[i] = magn[i];
    }
+  } // end of if inst->outLen == 0
+
+  return 0;
+}
+
+int WebRtcNs_ProcessCore(NSinst_t* inst,
+                         float* speechFrame,
+                         float* speechFrameHB,
+                         float* outFrame,
+                         float* outFrameHB) {
+  // main routine for noise reduction
+  int flagHB = 0;
+  int i;
+
+  float energy1, energy2, gain, factor, factor1, factor2;
+  float fout[BLOCKL_MAX];
+  float winData[ANAL_BLOCKL_MAX];
+  float real[ANAL_BLOCKL_MAX], imag[HALF_ANAL_BLOCKL];
+
+  // SWB variables
+  int deltaBweHB = 1;
+  int deltaGainHB = 1;
+  float decayBweHB = 1.0;
+  float gainMapParHB = 1.0;
+  float gainTimeDomainHB = 1.0;
+  float avgProbSpeechHB, avgProbSpeechHBTmp, avgFilterGainHB, gainModHB;
+
+  // Check that initiation has been done
+  if (inst->initFlag != 1) {
+    return (-1);
+  }
+  // Check for valid pointers based on sampling rate
+  if (inst->fs == 32000) {
+    if (speechFrameHB == NULL) {
+      return -1;
+    }
+    flagHB = 1;
+    // range for averaging low band quantities for H band gain
+    deltaBweHB = (int)inst->magnLen / 4;
+    deltaGainHB = deltaBweHB;
+  }
+
+  // update analysis buffer for L band
+  memcpy(inst->dataBuf, inst->dataBuf + inst->blockLen10ms,
+         sizeof(float) * (inst->anaLen - inst->blockLen10ms));
+  memcpy(inst->dataBuf + inst->anaLen - inst->blockLen10ms, speechFrame,
+         sizeof(float) * inst->blockLen10ms);
+
+  if (flagHB == 1) {
+    // update analysis buffer for H band
+    memcpy(inst->dataBufHB, inst->dataBufHB + inst->blockLen10ms,
+           sizeof(float) * (inst->anaLen - inst->blockLen10ms));
+    memcpy(inst->dataBufHB + inst->anaLen - inst->blockLen10ms, speechFrameHB,
+           sizeof(float) * inst->blockLen10ms);
+  }
+
+  // check if processing needed
+  if (inst->outLen == 0) {
+    // windowing
+    energy1 = 0.0;
+    for (i = 0; i < inst->anaLen; i++) {
+      winData[i] = inst->window[i] * inst->dataBuf[i];
+      energy1 += winData[i] * winData[i];
+    }
+    if (energy1 == 0.0) {
+      // synthesize the special case of zero input
+      // read out fully processed segment
+      for (i = inst->windShift; i < inst->blockLen + inst->windShift; i++) {
+        fout[i - inst->windShift] = inst->syntBuf[i];
+      }
+      // update synthesis buffer
+      memcpy(inst->syntBuf, inst->syntBuf + inst->blockLen,
+             sizeof(float) * (inst->anaLen - inst->blockLen));
+      memset(inst->syntBuf + inst->anaLen - inst->blockLen, 0,
+             sizeof(float) * inst->blockLen);
+
+      // out buffer
+      inst->outLen = inst->blockLen - inst->blockLen10ms;
+      if (inst->blockLen > inst->blockLen10ms) {
+        for (i = 0; i < inst->outLen; i++) {
+          inst->outBuf[i] = fout[i + inst->blockLen10ms];
+        }
+      }
+      for (i = 0; i < inst->blockLen10ms; ++i)
+        outFrame[i] = WEBRTC_SPL_SAT(
+            WEBRTC_SPL_WORD16_MAX, fout[i], WEBRTC_SPL_WORD16_MIN);
+
+      // for time-domain gain of HB
+      if (flagHB == 1)
+        for (i = 0; i < inst->blockLen10ms; ++i)
+          outFrameHB[i] = WEBRTC_SPL_SAT(
+              WEBRTC_SPL_WORD16_MAX, inst->dataBufHB[i], WEBRTC_SPL_WORD16_MIN);
+
+      return 0;
+    }
+
+    // FFT
+    WebRtc_rdft(inst->anaLen, 1, winData, inst->ip, inst->wfft);
+
+    imag[0] = 0;
+    real[0] = winData[0];
+    imag[inst->magnLen - 1] = 0;
+    real[inst->magnLen - 1] = winData[1];
+    for (i = 1; i < inst->magnLen - 1; i++) {
+      real[i] = winData[2 * i];
+      imag[i] = winData[2 * i + 1];
+    }
+
+    for (i = 0; i < inst->magnLen; i++) {
+      real[i] *= inst->smooth[i];
+      imag[i] *= inst->smooth[i];
+    }
    // back to time domain
    winData[0] = real[0];
    winData[1] = real[inst->magnLen - 1];
@@ -1187,14 +1229,11 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
 
    // for time-domain gain of HB
    if (flagHB == 1) {
-      for (i = 0; i < inst->magnLen; i++) {
-        inst->speechProbHB[i] = probSpeechFinal[i];
-      }
      // average speech prob from low band
      // avg over second half (i.e., 4->8kHz) of freq. spectrum
      avgProbSpeechHB = 0.0;
      for (i = inst->magnLen - deltaBweHB - 1; i < inst->magnLen - 1; i++) {
-        avgProbSpeechHB += inst->speechProbHB[i];
+        avgProbSpeechHB += inst->speechProb[i];
      }
      avgProbSpeechHB = avgProbSpeechHB / ((float)deltaBweHB);
      // average filter gain from low band
Noise suppression core header (NSinst_t and function declarations):

@@ -59,6 +59,7 @@ typedef struct NSinst_t_ {
   int magnLen;
   int aggrMode;
   const float* window;
+  float analyzeBuf[ANAL_BLOCKL_MAX];
   float dataBuf[ANAL_BLOCKL_MAX];
   float syntBuf[ANAL_BLOCKL_MAX];
   float outBuf[3 * BLOCKL_MAX];
@@ -102,7 +103,7 @@ typedef struct NSinst_t_ {
   int histSpecFlat[HIST_PAR_EST];
   int histSpecDiff[HIST_PAR_EST];
   //quantities for high band estimate
-  float speechProbHB[HALF_ANAL_BLOCKL]; //final speech/noise prob: prior + LRT
+  float speechProb[HALF_ANAL_BLOCKL]; //final speech/noise prob: prior + LRT
   float dataBufHB[ANAL_BLOCKL_MAX]; //buffering data for HB
 
 } NSinst_t;
@@ -153,7 +154,7 @@ int WebRtcNs_set_policy_core(NSinst_t* inst, int mode);
 *
 * Input:
 *      - inst          : Instance that should be initialized
-*      - inFrame       : Input speech frame for lower band
+*      - speechFrame   : Input speech frame for lower band
 *
 * Output:
 *      - inst          : Updated instance
@@ -161,7 +162,7 @@ int WebRtcNs_set_policy_core(NSinst_t* inst, int mode);
 * Return value        : 0 - OK
 *                       -1 - Error
 */
-int WebRtcNs_AnalyzeCore(NSinst_t* inst, float* inFrame);
+int WebRtcNs_AnalyzeCore(NSinst_t* inst, float* speechFrame);
 
 /****************************************************************************
 * WebRtcNs_ProcessCore