Committing changes in CL 277002, due to file structure changes introduced during the review of the code.
Review URL: http://webrtc-codereview.appspot.com/246005

git-svn-id: http://webrtc.googlecode.com/svn/trunk@805 4adac7df-926f-26a2-2b94-8c16560cd09d
kma@webrtc.org 2011-10-24 21:36:33 +00:00
parent 0d0037c2fd
commit 913644b92d
4 changed files with 890 additions and 319 deletions

View File

@ -107,7 +107,8 @@ LOCAL_MODULE_TAGS := tests
LOCAL_CPP_EXTENSION := .cc
LOCAL_SRC_FILES:= \
$(call all-proto-files-under, test) \
test/unit_test.cc
test/unit_test.cc \
../../../test/testsupport/fileutils.cc
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \
@ -118,6 +119,7 @@ LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/interface \
$(LOCAL_PATH)/../interface \
$(LOCAL_PATH)/../.. \
$(LOCAL_PATH)/../../../test \
$(LOCAL_PATH)/../../system_wrappers/interface \
$(LOCAL_PATH)/../../common_audio/signal_processing_library/main/interface \
external/gtest/include \
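This build change pulls the test support file utilities into the unit test target. As a rough illustration of what it enables (not shown in this CL), the test can now include the helpers through the added include path; the header name testsupport/fileutils.h is an assumption based on the fileutils.cc source listed above:
// In test/unit_test.cc (hypothetical usage, assuming the header matches
// the fileutils.cc source added to LOCAL_SRC_FILES):
#include "testsupport/fileutils.h"  // resolved via $(LOCAL_PATH)/../../../test
The helpers declared there can then be used from unit_test.cc, e.g. to locate test resource files.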

View File

@ -426,46 +426,6 @@ static const WebRtc_Word16 kDeterminantEstMatrix[66] = {
355, 330
};
void WebRtcNsx_UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
WebRtc_Word32 tmp32no1 = 0;
WebRtc_Word32 tmp32no2 = 0;
WebRtc_Word16 tmp16no1 = 0;
WebRtc_Word16 tmp16no2 = 0;
const WebRtc_Word16 kExp2Const = 11819; // Q13
int i = 0;
tmp16no2 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
inst->magnLen);
// Guarantee a Q-domain as high as possible and still fit in int16
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
kExp2Const, tmp16no2, 21);
for (i = 0; i < inst->magnLen; i++) {
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
// in Q21
tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
inst->noiseEstLogQuantile[offset + i]);
tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
tmp16no1 = -(WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
tmp16no1 += 21;// shift 21 to get result in Q0
tmp16no1 -= (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
if (tmp16no1 > 0) {
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, tmp16no1);
} else {
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, -tmp16no1);
}
// TODO(bjornv): Replace with WebRtcSpl_SatW32ToW16(...) when available.
if (tmp32no1 > 32767) {
tmp32no1 = 32767;
} else if (tmp32no1 < -32768) {
tmp32no1 = -32768;
}
tmp16no1 = (WebRtc_Word16) tmp32no1;
inst->noiseEstQuantile[i] = tmp16no1;
}
}
void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst,
WebRtc_Word16 pink_noise_exp_avg,
WebRtc_Word32 pink_noise_num_avg,
@ -675,128 +635,6 @@ int WebRtcNsx_set_policy_core(NsxInst_t* inst, int mode) {
return 0;
}
#if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWord32* noise,
WebRtc_Word16* qNoise) {
WebRtc_Word32 numerator;
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv, countProd, delta, zeros, frac;
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
WebRtc_Word16 log2Const = 22713; // Q15
WebRtc_Word16 widthFactor = 21845;
int i, s, offset;
numerator = FACTOR_Q16;
tabind = inst->stages - inst->normData;
assert(tabind < 9);
assert(tabind > -9);
if (tabind < 0) {
logval = -WebRtcNsx_kLogTable[-tabind];
} else {
logval = WebRtcNsx_kLogTable[tabind];
}
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
// magn is in Q(-stages), and the real lmagn values are:
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
// lmagn in Q8
for (i = 0; i < inst->magnLen; i++) {
if (magn[i]) {
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros) & 0x7FFFFFFF) >> 23);
// log2(magn(i))
assert(frac < 256);
log2 = (WebRtc_Word16)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]);
// log2(magn(i))*log(2)
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2Const, 15);
// + log(2^stages)
lmagn[i] += logval;
} else {
lmagn[i] = logval;//0;
}
}
// loop over simultaneous estimates
for (s = 0; s < SIMULT; s++) {
offset = s * inst->magnLen;
// Get counter values from state
counter = inst->noiseEstCounter[s];
assert(counter < 201);
countDiv = WebRtcNsx_kCounterDiv[counter];
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
// quant_est(...)
for (i = 0; i < inst->magnLen; i++) {
// compute delta
if (inst->noiseEstDensity[offset + i] > 512) {
delta = WebRtcSpl_DivW32W16ResW16(numerator,
inst->noiseEstDensity[offset + i]);
} else {
delta = FACTOR_Q7;
if (inst->blockIndex < END_STARTUP_LONG) {
// Smaller step size during startup. This prevents from using
// unrealistic values causing overflow.
delta = FACTOR_Q7_STARTUP;
}
}
// update log quantile estimate
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
// CounterDiv=1/(inst->counter[s]+1) in Q15
tmp16 += 2;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
} else {
tmp16 += 1;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
if (inst->noiseEstLogQuantile[offset + i] < logval) {
// This is the smallest fixed point representation we can
// have, hence we limit the output.
inst->noiseEstLogQuantile[offset + i] = logval;
}
}
// update density estimate
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
< WIDTH_Q8) {
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->noiseEstDensity[offset + i], countProd, 15);
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(widthFactor,
countDiv, 15);
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
}
} // end loop over magnitude spectrum
if (counter >= END_STARTUP_LONG) {
inst->noiseEstCounter[s] = 0;
if (inst->blockIndex >= END_STARTUP_LONG) {
WebRtcNsx_UpdateNoiseEstimate(inst, offset);
}
}
inst->noiseEstCounter[s]++;
} // end loop over simultaneous estimates
// Sequentially update the noise during startup
if (inst->blockIndex < END_STARTUP_LONG) {
WebRtcNsx_UpdateNoiseEstimate(inst, offset);
}
for (i = 0; i < inst->magnLen; i++) {
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
}
(*qNoise) = (WebRtc_Word16)inst->qNoise;
}
#endif // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
// Extract thresholds for feature parameters
// histograms are computed over some window_size (given by window_pars)
// thresholds and weights are extracted every window
@ -1322,7 +1160,7 @@ void WebRtcNsx_SpeechNoiseProb(NsxInst_t* inst, WebRtc_UWord16* nonSpeechProbFin
tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
frac = (WebRtc_Word16)(tmpU32no1 & 0x00003fff); // Q14
tmp16no2 += (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
tmp16no1, frac, 14);
tmp16no1, frac, 14);
if (tmpIndFX) {
tmpIndFX = 8192 + tmp16no2;
} else {
@ -1343,7 +1181,7 @@ void WebRtcNsx_SpeechNoiseProb(NsxInst_t* inst, WebRtc_UWord16* nonSpeechProbFin
// inst->priorNonSpeechProb += PRIOR_UPDATE * (indPriorNonSpeech - inst->priorNonSpeechProb);
tmp16 = indPriorFX16 - inst->priorNonSpeechProb; // Q14
inst->priorNonSpeechProb += (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
PRIOR_UPDATE_Q14, tmp16, 14); // Q14
PRIOR_UPDATE_Q14, tmp16, 14); // Q14
//final speech probability: combine prior model with LR factor:
@ -1424,18 +1262,9 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
int right_shifts_in_magnU16 = 0;
int right_shifts_in_initMagnEst = 0;
// For lower band do all processing
// update analysis buffer for L band
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer, inst->analysisBuffer + inst->blockLen10ms,
inst->anaLen - inst->blockLen10ms);
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer + inst->anaLen - inst->blockLen10ms,
speechFrame, inst->blockLen10ms);
// Update analysis buffer for lower band, and window data before FFT.
WebRtcNsx_AnalysisUpdate(inst, winData, speechFrame);
// Window data before FFT
for (i = 0; i < inst->anaLen; i++) {
winData[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->window[i], inst->analysisBuffer[i], 14); // Q0
}
// Get input energy
inst->energyIn = WebRtcSpl_Energy(winData, (int)inst->anaLen, &(inst->scaleEnergyIn));
@ -1459,11 +1288,7 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
right_shifts_in_magnU16 = WEBRTC_SPL_MAX(right_shifts_in_magnU16, 0);
// create realImag as winData interleaved with zeros (= imag. part), normalize it
for (i = 0; i < inst->anaLen; i++) {
j = WEBRTC_SPL_LSHIFT_W16(i, 1);
realImag[j] = WEBRTC_SPL_LSHIFT_W16(winData[i], inst->normData); // Q(normData)
realImag[j + 1] = 0; // Insert zeros in imaginary part
}
WebRtcNsx_CreateComplexBuffer(inst, winData, realImag);
// bit-reverse position of elements in array and FFT the array
WebRtcSpl_ComplexBitReverse(realImag, inst->stages); // Q(normData-stages)
@ -1492,7 +1317,7 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j + 1], realImag[j + 1]);
inst->magnEnergy += tmpU32no1; // Q(2*(normData-stages))
magnU16[i] = (WebRtc_UWord16)WebRtcSpl_Sqrt(tmpU32no1); // Q(normData-stages)
magnU16[i] = (WebRtc_UWord16)WebRtcSpl_SqrtFloor(tmpU32no1); // Q(normData-stages)
inst->sumMagn += (WebRtc_UWord32)magnU16[i]; // Q(normData-stages)
}
} else {
@ -1541,7 +1366,7 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j + 1], realImag[j + 1]);
inst->magnEnergy += tmpU32no1; // Q(2*(normData-stages))
magnU16[i] = (WebRtc_UWord16)WebRtcSpl_Sqrt(tmpU32no1); // Q(normData-stages)
magnU16[i] = (WebRtc_UWord16)WebRtcSpl_SqrtFloor(tmpU32no1); // Q(normData-stages)
inst->sumMagn += (WebRtc_UWord32)magnU16[i]; // Q(normData-stages)
// Switch initMagnEst to Q(minNorm-stages)
@ -1607,8 +1432,8 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
tmp_1_w32 += WEBRTC_SPL_MUL_16_16_RSFT(kSumLogIndex[65], sum_log_i, 9);
tmp_1_w32 -= WEBRTC_SPL_MUL_16_16_RSFT(kSumLogIndex[65], kSumLogIndex[65], 10);
tmp_1_w32 -= WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)sum_log_i_square, 4);
tmp_1_w32 -= WEBRTC_SPL_MUL_16_16_RSFT(
(WebRtc_Word16)(inst->magnLen - kStartBand), kSumSquareLogIndex[65], 2);
tmp_1_w32 -= WEBRTC_SPL_MUL_16_16_RSFT((WebRtc_Word16)
(inst->magnLen - kStartBand), kSumSquareLogIndex[65], 2);
matrix_determinant = (WebRtc_Word16)tmp_1_w32;
sum_log_i -= kSumLogIndex[65]; // Q5
sum_log_i_square -= kSumSquareLogIndex[65]; // Q2
@ -1684,40 +1509,16 @@ void WebRtcNsx_DataSynthesis(NsxInst_t* inst, short* outFrame) {
inst->blockLen10ms);
return;
}
// Filter the data in the frequency domain
for (i = 0; i < inst->magnLen; i++) {
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
inst->real[i], (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
inst->imag[i], (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
}
// back to time domain
// Create spectrum
realImag[0] = inst->real[0];
realImag[1] = -inst->imag[0];
for (i = 1; i < inst->anaLen2; i++) {
j = WEBRTC_SPL_LSHIFT_W16(i, 1);
tmp16no1 = (inst->anaLen << 1) - j;
realImag[j] = inst->real[i];
realImag[j + 1] = -inst->imag[i];
realImag[tmp16no1] = inst->real[i];
realImag[tmp16no1 + 1] = inst->imag[i];
}
realImag[inst->anaLen] = inst->real[inst->anaLen2];
realImag[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
// Filter the data in the frequency domain, and create spectrum.
WebRtcNsx_PrepareSpectrum(inst, realImag);
// bit-reverse position of elements in array and IFFT it
WebRtcSpl_ComplexBitReverse(realImag, inst->stages);
outCIFFT = WebRtcSpl_ComplexIFFT(realImag, inst->stages, 1);
for (i = 0; i < inst->anaLen; i++) {
j = WEBRTC_SPL_LSHIFT_W16(i, 1);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)realImag[j],
outCIFFT - inst->normData);
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
tmp32no1,
WEBRTC_SPL_WORD16_MIN);
}
// Denormalize.
WebRtcNsx_Denormalize(inst, realImag, outCIFFT);
//scale factor: only do it after END_STARTUP_LONG time
gainFactor = 8192; // 8192 = Q13(1.0)
@ -1754,26 +1555,8 @@ void WebRtcNsx_DataSynthesis(NsxInst_t* inst, short* outFrame) {
gainFactor = tmp16no1 + tmp16no2; // Q13
} // out of flag_gain_map==1
// synthesis
for (i = 0; i < inst->anaLen; i++) {
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(inst->window[i],
inst->real[i], 14); // Q0, window in Q14
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16no1, gainFactor, 13); // Q0
// Down shift with rounding
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, tmp32no1,
WEBRTC_SPL_WORD16_MIN); // Q0
inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i], tmp16no2); // Q0
}
// read out fully processed segment
for (i = 0; i < inst->blockLen10ms; i++) {
outFrame[i] = inst->synthesisBuffer[i]; // Q0
}
// update synthesis buffer
WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer, inst->synthesisBuffer + inst->blockLen10ms,
inst->anaLen - inst->blockLen10ms);
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer + inst->anaLen - inst->blockLen10ms,
inst->blockLen10ms);
// Synthesis, read out fully processed segment, and update synthesis buffer.
WebRtcNsx_SynthesisUpdate(inst, outFrame, gainFactor);
}
int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFrameHB,
@ -1815,6 +1598,12 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram
int flag, sign;
int q_domain_to_use = 0;
// Code for ARMv7-Neon platform assumes the following:
assert(inst->anaLen % 16 == 0);
assert(inst->anaLen2 % 8 == 0);
assert(inst->blockLen10ms % 16 == 0);
assert(inst->magnLen == inst->anaLen2 + 1);
#ifdef NS_FILEDEBUG
fwrite(spframe, sizeof(short), inst->blockLen10ms, inst->infile);
#endif
@ -2080,8 +1869,8 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram
if (WebRtcSpl_NormU32(tmpU32no3) < norm32no1) {
inst->featureSpecDiff = 0x007FFFFF;
} else {
inst->featureSpecDiff = WEBRTC_SPL_MIN(
0x007FFFFF, WEBRTC_SPL_LSHIFT_U32(tmpU32no3, norm32no1));
inst->featureSpecDiff = WEBRTC_SPL_MIN(0x007FFFFF,
WEBRTC_SPL_LSHIFT_U32(tmpU32no3, norm32no1));
}
}
@ -2317,7 +2106,8 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram
}
avgProbSpeechHB = (WebRtc_Word16)(4096
- WEBRTC_SPL_RSHIFT_U16(tmpU16no1, inst->stages - 7)); // Q12
avgFilterGainHB = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, inst->stages - 3); // Q14
avgFilterGainHB = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_U32(
tmpU32no1, inst->stages - 3); // Q14
// // original FLOAT code
// // gain based on speech probability:
@ -2368,3 +2158,264 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram
return 0;
}
#if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
// Update the noise estimation information.
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
WebRtc_Word32 tmp32no1 = 0;
WebRtc_Word32 tmp32no2 = 0;
WebRtc_Word16 tmp16 = 0;
const WebRtc_Word16 kExp2Const = 11819; // Q13
int i = 0;
tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
inst->magnLen);
// Guarantee a Q-domain as high as possible and still fit in int16
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
kExp2Const, tmp16, 21);
for (i = 0; i < inst->magnLen; i++) {
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
// in Q21
tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
inst->noiseEstLogQuantile[offset + i]);
tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
tmp16 -= 21;// shift 21 to get result in Q0
tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
if (tmp16 < 0) {
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
} else {
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
}
inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
}
}
// Noise Estimation
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise) {
WebRtc_Word32 numerator = FACTOR_Q16;
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
WebRtc_Word16 countProd, delta, zeros, frac;
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
const int16_t log2_const = 22713; // Q15
const int16_t width_factor = 21845;
int i, s, offset;
tabind = inst->stages - inst->normData;
assert(tabind < 9);
assert(tabind > -9);
if (tabind < 0) {
logval = -WebRtcNsx_kLogTable[-tabind];
} else {
logval = WebRtcNsx_kLogTable[tabind];
}
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
// magn is in Q(-stages), and the real lmagn values are:
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
// lmagn in Q8
for (i = 0; i < inst->magnLen; i++) {
if (magn[i]) {
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
& 0x7FFFFFFF) >> 23);
// log2(magn(i))
assert(frac < 256);
log2 = (WebRtc_Word16)(((31 - zeros) << 8)
+ WebRtcNsx_kLogTableFrac[frac]);
// log2(magn(i))*log(2)
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
// + log(2^stages)
lmagn[i] += logval;
} else {
lmagn[i] = logval;//0;
}
}
// loop over simultaneous estimates
for (s = 0; s < SIMULT; s++) {
offset = s * inst->magnLen;
// Get counter values from state
counter = inst->noiseEstCounter[s];
assert(counter < 201);
countDiv = WebRtcNsx_kCounterDiv[counter];
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
// quant_est(...)
for (i = 0; i < inst->magnLen; i++) {
// compute delta
if (inst->noiseEstDensity[offset + i] > 512) {
delta = WebRtcSpl_DivW32W16ResW16(numerator,
inst->noiseEstDensity[offset + i]);
} else {
delta = FACTOR_Q7;
if (inst->blockIndex < END_STARTUP_LONG) {
// Smaller step size during startup. This prevents the use of
// unrealistic values that could cause overflow.
delta = FACTOR_Q7_STARTUP;
}
}
// update log quantile estimate
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
// CounterDiv=1/(inst->counter[s]+1) in Q15
tmp16 += 2;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
} else {
tmp16 += 1;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
if (inst->noiseEstLogQuantile[offset + i] < logval) {
// This is the smallest fixed point representation we can
// have, hence we limit the output.
inst->noiseEstLogQuantile[offset + i] = logval;
}
}
// update density estimate
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
< WIDTH_Q8) {
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->noiseEstDensity[offset + i], countProd, 15);
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
width_factor, countDiv, 15);
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
}
} // end loop over magnitude spectrum
if (counter >= END_STARTUP_LONG) {
inst->noiseEstCounter[s] = 0;
if (inst->blockIndex >= END_STARTUP_LONG) {
UpdateNoiseEstimate(inst, offset);
}
}
inst->noiseEstCounter[s]++;
} // end loop over simultaneous estimates
// Sequentially update the noise during startup
if (inst->blockIndex < END_STARTUP_LONG) {
UpdateNoiseEstimate(inst, offset);
}
for (i = 0; i < inst->magnLen; i++) {
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
}
(*q_noise) = (WebRtc_Word16)inst->qNoise;
}
// Filter the data in the frequency domain, and create spectrum.
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
int i = 0, j = 0;
int16_t tmp16 = 0;
for (i = 0; i < inst->magnLen; i++) {
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
}
freq_buf[0] = inst->real[0];
freq_buf[1] = -inst->imag[0];
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
tmp16 = (inst->anaLen << 1) - j;
freq_buf[j] = inst->real[i];
freq_buf[j + 1] = -inst->imag[i];
freq_buf[tmp16] = inst->real[i];
freq_buf[tmp16 + 1] = inst->imag[i];
}
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
}
// Denormalize the input buffer.
inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
int i = 0, j = 0;
int32_t tmp32 = 0;
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
factor - inst->normData);
inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
}
}
// For the noise suppression process, synthesis, read out fully processed
// segment, and update synthesis buffer.
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor) {
int i = 0;
int16_t tmp16a = 0;
int16_t tmp16b = 0;
int32_t tmp32 = 0;
// synthesis
for (i = 0; i < inst->anaLen; i++) {
tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->window[i], inst->real[i], 14); // Q0, window in Q14
tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
// Down shift with rounding
tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
tmp16b); // Q0
}
// read out fully processed segment
for (i = 0; i < inst->blockLen10ms; i++) {
out_frame[i] = inst->synthesisBuffer[i]; // Q0
}
// update synthesis buffer
WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
inst->synthesisBuffer + inst->blockLen10ms,
inst->anaLen - inst->blockLen10ms);
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
+ inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
}
// Update analysis buffer for lower band, and window data before FFT.
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech) {
int i = 0;
// For lower band update analysis buffer.
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
inst->analysisBuffer + inst->blockLen10ms,
inst->anaLen - inst->blockLen10ms);
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
+ inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
// Window data before FFT.
for (i = 0; i < inst->anaLen; i++) {
out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->window[i], inst->analysisBuffer[i], 14); // Q0
}
}
// Create a complex number buffer (out[]) as the input (in[]) interleaved with
// zeros, and normalize it.
inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
int16_t* in,
int16_t* out) {
int i = 0, j = 0;
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
out[j + 1] = 0; // Insert zeros in imaginary part
}
}
#endif // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))

View File

@ -129,14 +129,14 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs);
* This changes the aggressiveness of the noise suppression method.
*
* Input:
* - inst : Instance that should be initialized
* - mode : 0: Mild (6 dB), 1: Medium (10 dB), 2: Aggressive (15 dB)
* - inst : Instance that should be initialized
* - mode : 0: Mild (6 dB), 1: Medium (10 dB), 2: Aggressive (15 dB)
*
* Output:
* - NS_inst : Initialized instance
* - inst : Initialized instance
*
* Return value : 0 - Ok
* -1 - Error
* Return value : 0 - Ok
* -1 - Error
*/
int WebRtcNsx_set_policy_core(NsxInst_t* inst, int mode);
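A minimal usage sketch for the two setup calls declared above; this is not part of the CL. It assumes the caller has already allocated an NsxInst_t through the public NS API (allocation is not declared in this header), and the helper name ConfigureNsx is purely illustrative:
// Illustrative only: configure an already-allocated instance.
static int ConfigureNsx(NsxInst_t* inst, WebRtc_UWord32 fs, int mode) {
  if (WebRtcNsx_InitCore(inst, fs) != 0) {
    return -1;  // initialization failed
  }
  // mode: 0 = Mild (6 dB), 1 = Medium (10 dB), 2 = Aggressive (15 dB).
  return WebRtcNsx_set_policy_core(inst, mode);
}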
@ -158,16 +158,47 @@ int WebRtcNsx_set_policy_core(NsxInst_t* inst, int mode);
* Return value : 0 - OK
* -1 - Error
*/
int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* inFrameLow, short* inFrameHigh,
short* outFrameLow, short* outFrameHigh);
int WebRtcNsx_ProcessCore(NsxInst_t* inst,
short* inFrameLow,
short* inFrameHigh,
short* outFrameLow,
short* outFrameHigh);
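Similarly, a hedged sketch of driving the core one frame at a time; the 160-sample frame length is an assumption (10 ms at 16 kHz), DenoiseBuffer and its scratch buffers are illustrative names, and whether the high-band pointers are actually used depends on the sampling rate configured at init:
// Illustrative only: run the core over a buffer of consecutive 10 ms frames.
enum { kFrameLen = 160 };  // assumed 10 ms frame at 16 kHz
static int DenoiseBuffer(NsxInst_t* inst, short* in, short* out,
                         int num_frames) {
  short hb_in[kFrameLen] = {0};  // scratch high-band input (may be unused)
  short hb_out[kFrameLen];       // scratch high-band output
  int n;
  for (n = 0; n < num_frames; ++n) {
    if (WebRtcNsx_ProcessCore(inst, in + n * kFrameLen, hb_in,
                              out + n * kFrameLen, hb_out) != 0) {
      return -1;
    }
  }
  return 0;
}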
/****************************************************************************
* Internal functions and variable declarations shared with optimized code.
*/
void WebRtcNsx_UpdateNoiseEstimate(NsxInst_t* inst, int offset);
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWord32* noise,
WebRtc_Word16* qNoise);
// Noise Estimation.
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise);
// Filter the data in the frequency domain, and create spectrum.
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst,
int16_t* freq_buff);
// For the noise suppression process, synthesis, read out fully processed
// segment, and update synthesis buffer.
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor);
// Update analysis buffer for lower band, and window data before FFT.
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech);
// Denormalize the input buffer.
inline void WebRtcNsx_Denormalize(NsxInst_t* inst,
int16_t* in,
int factor);
// Create a complex number buffer, as the input interleaved with zeros,
// and normalize it.
inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
int16_t* in,
int16_t* out);
extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];

View File

@ -15,19 +15,98 @@
#include <arm_neon.h>
#include <assert.h>
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWord32* noise,
WebRtc_Word16* qNoise) {
WebRtc_Word32 numerator;
// Update the noise estimation information.
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
int i = 0;
const int16_t kExp2Const = 11819; // Q13
int16_t* ptr_noiseEstLogQuantile = NULL;
int16_t* ptr_noiseEstQuantile = NULL;
int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
int32x4_t twentyOne32x4 = vdupq_n_s32(21);
int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
int32x4_t constB32x4 = vdupq_n_s32(0x200000);
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv, countProd, delta, zeros, frac;
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
WebRtc_Word16 log2Const = 22713;
WebRtc_Word16 widthFactor = 21845;
int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
inst->magnLen);
// Guarantee a Q-domain as high as possible and still fit in int16
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
tmp16,
21);
int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);
for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {
// tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
// inst->noiseEstLogQuantile[offset + i]);
int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);
// tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
v32x4A = vorrq_s32(v32x4A, constB32x4);
// tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
v32x4B = vshrq_n_s32(v32x4B, 21);
// tmp16 -= 21;// shift 21 to get result in Q0
v32x4B = vsubq_s32(v32x4B, twentyOne32x4);
// tmp16 += (int16_t) inst->qNoise;
// shift to get result in Q(qNoise)
v32x4B = vaddq_s32(v32x4B, qNoise32x4);
// if (tmp16 < 0) {
// tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
// } else {
// tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
// }
v32x4B = vshlq_s32(v32x4A, v32x4B);
// tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
v16x4 = vqmovn_s32(v32x4B);
//inst->noiseEstQuantile[i] = tmp16;
vst1_s16(ptr_noiseEstQuantile, v16x4);
}
// Last iteration:
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
// in Q21
int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
*ptr_noiseEstLogQuantile);
int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
tmp16 -= 21;// shift 21 to get result in Q0
tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
if (tmp16 < 0) {
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
} else {
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
}
*ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1);
}
// Noise Estimation
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise) {
int32_t numerator = FACTOR_Q16;
int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
int16_t countProd, delta, zeros, frac;
int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
const int16_t log2_const = 22713;
const int16_t width_factor = 21845;
int i, s, offset;
numerator = FACTOR_Q16;
tabind = inst->stages - inst->normData;
assert(tabind < 9);
assert(tabind > -9);
@ -45,13 +124,15 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWo
// lmagn in Q8
for (i = 0; i < inst->magnLen; i++) {
if (magn[i]) {
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros) & 0x7FFFFFFF) >> 23);
zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
frac = (int16_t)((((uint32_t)magn[i] << zeros)
& 0x7FFFFFFF) >> 23);
assert(frac < 256);
// log2(magn(i))
log2 = (WebRtc_Word16)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]);
log2 = (int16_t)(((31 - zeros) << 8)
+ WebRtcNsx_kLogTableFrac[frac]);
// log2(magn(i))*log(2)
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2Const, 15);
lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
// + log(2^stages)
lmagn[i] += logval;
} else {
@ -61,9 +142,9 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWo
int16x4_t Q3_16x4 = vdup_n_s16(3);
int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(widthFactor);
int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor);
WebRtc_Word16 factor = FACTOR_Q7;
int16_t factor = FACTOR_Q7;
if (inst->blockIndex < END_STARTUP_LONG)
factor = FACTOR_Q7_STARTUP;
@ -75,10 +156,10 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWo
counter = inst->noiseEstCounter[s];
assert(counter < 201);
countDiv = WebRtcNsx_kCounterDiv[counter];
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
countProd = (int16_t)WEBRTC_SPL_MUL_16_16(counter, countDiv);
// quant_est(...)
WebRtc_Word16 deltaBuff[8];
int16_t deltaBuff[8];
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int16x4_t countDiv_16x4 = vdup_n_s16(countDiv);
@ -103,13 +184,13 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWo
for (j = 0; j < 8; j++) {
if (inst->noiseEstDensity[offset + i + j] > 512) {
deltaBuff[j] = WebRtcSpl_DivW32W16ResW16(
numerator, inst->noiseEstDensity[offset + i + j]);
numerator, inst->noiseEstDensity[offset + i + j]);
}
}
// Update log quantile estimate
// tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
// tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4);
tmp16x4_1 = vshrn_n_s32(tmp32x4, 14);
tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4);
@ -142,17 +223,19 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWo
tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep
tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0);
// logval is the smallest fixed point representation we can have. Values below
// that will correspond to values in the interval [0, 1], which can't possibly
// occur.
// logval is the smallest fixed point representation we can have. Values
// below that will correspond to values in the interval [0, 1], which
// can't possibly occur.
tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8);
// Do the if-else branches:
tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines
tmp16x8_5 = vsubq_s16(tmp16x8_3, tmp16x8_2);
__asm__("vcgt.s16 %q0, %q1, #0"::"w"(tmp16x8_4), "w"(tmp16x8_5));
__asm__("vbit %q0, %q1, %q2"::"w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4));
__asm__("vbif %q0, %q1, %q2"::"w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4));
__asm__("vbit %q0, %q1, %q2"::
"w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4));
__asm__("vbif %q0, %q1, %q2"::
"w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4));
vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2);
// Update density estimate
@ -165,61 +248,61 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWo
tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2);
tmp16x8_3 = vabsq_s16(tmp16x8_3);
tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3);
__asm__("vbit %q0, %q1, %q2"::"w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4));
__asm__("vbit %q0, %q1, %q2"::
"w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4));
vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1);
} // End loop over magnitude spectrum
for (; i < inst->magnLen; i++) {
// compute delta
if (inst->noiseEstDensity[offset + i] > 512) {
delta = WebRtcSpl_DivW32W16ResW16(numerator,
inst->noiseEstDensity[offset + i]);
} else {
delta = FACTOR_Q7;
if (inst->blockIndex < END_STARTUP_LONG) {
// Smaller step size during startup. This prevents from using
// unrealistic values causing overflow.
delta = FACTOR_Q7_STARTUP;
}
// Last iteration over magnitude spectrum:
// compute delta
if (inst->noiseEstDensity[offset + i] > 512) {
delta = WebRtcSpl_DivW32W16ResW16(numerator,
inst->noiseEstDensity[offset + i]);
} else {
delta = FACTOR_Q7;
if (inst->blockIndex < END_STARTUP_LONG) {
// Smaller step size during startup. This prevents the use of
// unrealistic values that could cause overflow.
delta = FACTOR_Q7_STARTUP;
}
}
// update log quantile estimate
tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
// CounterDiv=1/(inst->counter[s]+1) in Q15
tmp16 += 2;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
} else {
tmp16 += 1;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
if (inst->noiseEstLogQuantile[offset + i] < logval) {
// logval is the smallest fixed point representation we can have.
// Values below that will correspond to values in the interval
// [0, 1], which can't possibly occur.
inst->noiseEstLogQuantile[offset + i] = logval;
}
}
// update log quantile estimate
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
// CounterDiv=1/(inst->counter[s]+1) in Q15
tmp16 += 2;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
} else {
tmp16 += 1;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
if (inst->noiseEstLogQuantile[offset + i] < logval) {
// logval is the smallest fixed point representation we can have.
// Values below that will correspond to values in the interval
// [0, 1], which can't possibly occur.
inst->noiseEstLogQuantile[offset + i] = logval;
}
}
// update density estimate
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
< WIDTH_Q8) {
tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->noiseEstDensity[offset + i], countProd, 15);
tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
width_factor, countDiv, 15);
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
}
// update density estimate
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
< WIDTH_Q8) {
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->noiseEstDensity[offset + i], countProd, 15);
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
widthFactor, countDiv, 15);
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
}
} // end loop over magnitude spectrum
if (counter >= END_STARTUP_LONG) {
inst->noiseEstCounter[s] = 0;
if (inst->blockIndex >= END_STARTUP_LONG) {
WebRtcNsx_UpdateNoiseEstimate(inst, offset);
UpdateNoiseEstimate(inst, offset);
}
}
inst->noiseEstCounter[s]++;
@ -228,13 +311,417 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, WebRtc_UWord16* magn, WebRtc_UWo
// Sequentially update the noise during startup
if (inst->blockIndex < END_STARTUP_LONG) {
WebRtcNsx_UpdateNoiseEstimate(inst, offset);
UpdateNoiseEstimate(inst, offset);
}
for (i = 0; i < inst->magnLen; i++) {
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise)
}
(*qNoise) = (WebRtc_Word16)inst->qNoise;
(*q_noise) = (int16_t)inst->qNoise;
}
// Filter the data in the frequency domain, and create spectrum.
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
// (1) Filtering.
// Fixed point C code for the next block is as follows:
// for (i = 0; i < inst->magnLen; i++) {
// inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
// (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
// inst->imag[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
// (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
// }
int16_t* ptr_real = &inst->real[0];
int16_t* ptr_imag = &inst->imag[0];
uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
// Filter the rest in the frequency domain.
for (; ptr_real < &inst->real[inst->magnLen - 1]; ) {
// Loop unrolled once. Both pointers are incremented by 4 twice.
__asm__ __volatile__(
"vld1.16 d20, [%[ptr_real]]\n\t"
"vld1.16 d22, [%[ptr_imag]]\n\t"
"vld1.16 d23, [%[ptr_noiseSupFilter]]!\n\t"
"vmull.s16 q10, d20, d23\n\t"
"vmull.s16 q11, d22, d23\n\t"
"vshrn.s32 d20, q10, #14\n\t"
"vshrn.s32 d22, q11, #14\n\t"
"vst1.16 d20, [%[ptr_real]]!\n\t"
"vst1.16 d22, [%[ptr_imag]]!\n\t"
"vld1.16 d18, [%[ptr_real]]\n\t"
"vld1.16 d24, [%[ptr_imag]]\n\t"
"vld1.16 d25, [%[ptr_noiseSupFilter]]!\n\t"
"vmull.s16 q9, d18, d25\n\t"
"vmull.s16 q12, d24, d25\n\t"
"vshrn.s32 d18, q9, #14\n\t"
"vshrn.s32 d24, q12, #14\n\t"
"vst1.16 d18, [%[ptr_real]]!\n\t"
"vst1.16 d24, [%[ptr_imag]]!\n\t"
// Specify constraints.
:[ptr_imag]"+r"(ptr_imag),
[ptr_real]"+r"(ptr_real),
[ptr_noiseSupFilter]"+r"(ptr_noiseSupFilter)
:
:"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
"q9", "q10", "q11", "q12"
);
}
// Filter the last pair of elements in the frequency domain.
*ptr_real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_real,
(int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
*ptr_imag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_imag,
(int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
// (2) Create spectrum.
// Fixed point C code for the rest of the function is as follows:
// freq_buf[0] = inst->real[0];
// freq_buf[1] = -inst->imag[0];
// for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
// tmp16 = (inst->anaLen << 1) - j;
// freq_buf[j] = inst->real[i];
// freq_buf[j + 1] = -inst->imag[i];
// freq_buf[tmp16] = inst->real[i];
// freq_buf[tmp16 + 1] = inst->imag[i];
// }
// freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
// freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
freq_buf[0] = inst->real[0];
freq_buf[1] = -inst->imag[0];
int offset = -16;
int16_t* ptr_realImag1 = &freq_buf[2];
int16_t* ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
ptr_real = &inst->real[1];
ptr_imag = &inst->imag[1];
for (; ptr_real < &inst->real[inst->anaLen2 - 11]; ) {
// Loop unrolled once. All pointers are incremented twice.
__asm__ __volatile__(
"vld1.16 d22, [%[ptr_real]]!\n\t"
"vld1.16 d23, [%[ptr_imag]]!\n\t"
// Negate and interleave:
"vmov.s16 d20, d22\n\t"
"vneg.s16 d21, d23\n\t"
"vzip.16 d20, d21\n\t"
// Write 8 elements to &freq_buf[j]
"vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
// Interleave and reverse elements:
"vzip.16 d22, d23\n\t"
"vrev64.32 d18, d23\n\t"
"vrev64.32 d19, d22\n\t"
// Write 8 elements to &freq_buf[tmp16]
"vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
"vld1.16 d22, [%[ptr_real]]!\n\t"
"vld1.16 d23, [%[ptr_imag]]!\n\t"
// Negate and interleave:
"vmov.s16 d20, d22\n\t"
"vneg.s16 d21, d23\n\t"
"vzip.16 d20, d21\n\t"
// Write 8 elements to &freq_buf[j]
"vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
// Interleave and reverse elements:
"vzip.16 d22, d23\n\t"
"vrev64.32 d18, d23\n\t"
"vrev64.32 d19, d22\n\t"
// Write 8 elements to &freq_buf[tmp16]
"vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
// Specify constraints.
:[ptr_imag]"+r"(ptr_imag),
[ptr_real]"+r"(ptr_real),
[ptr_realImag1]"+r"(ptr_realImag1),
[ptr_realImag2]"+r"(ptr_realImag2)
:[offset]"r"(offset)
:"d18", "d19", "d20", "d21", "d22", "d23"
);
}
for (ptr_realImag2 += 6;
ptr_real <= &inst->real[inst->anaLen2];
ptr_real += 1, ptr_imag += 1, ptr_realImag1 += 2, ptr_realImag2 -= 2) {
*ptr_realImag1 = *ptr_real;
*(ptr_realImag1 + 1) = -(*ptr_imag);
*ptr_realImag2 = *ptr_real;
*(ptr_realImag2 + 1) = *ptr_imag;
}
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
}
// Denormalize the input buffer.
inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
int16_t* ptr_real = &inst->real[0];
int16_t* ptr_in = &in[0];
__asm__ __volatile__("vdup.32 q10, %0" ::
"r"((int32_t)(factor - inst->normData)) : "q10");
for (; ptr_real < &inst->real[inst->anaLen]; ) {
// Loop unrolled once. Both pointers are incremented.
__asm__ __volatile__(
// tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
// factor - inst->normData);
"vld2.16 {d24, d25}, [%[ptr_in]]!\n\t"
"vmovl.s16 q12, d24\n\t"
"vshl.s32 q12, q10\n\t"
// inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
"vqmovn.s32 d24, q12\n\t"
"vst1.16 d24, [%[ptr_real]]!\n\t"
// tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
// factor - inst->normData);
"vld2.16 {d22, d23}, [%[ptr_in]]!\n\t"
"vmovl.s16 q11, d22\n\t"
"vshl.s32 q11, q10\n\t"
// inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
"vqmovn.s32 d22, q11\n\t"
"vst1.16 d22, [%[ptr_real]]!\n\t"
// Specify constraints.
:[ptr_in]"+r"(ptr_in),
[ptr_real]"+r"(ptr_real)
:
:"d22", "d23", "d24", "d25"
);
}
}
// For the noise suppression process, synthesis, read out fully processed segment,
// and update synthesis buffer.
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor) {
int16_t* ptr_real = &inst->real[0];
int16_t* ptr_syn = &inst->synthesisBuffer[0];
int16_t* ptr_window = &inst->window[0];
// synthesis
__asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
// Loop unrolled once. All pointers are incremented in the assembly code.
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
__asm__ __volatile__(
// Load variables.
"vld1.16 d22, [%[ptr_real]]!\n\t"
"vld1.16 d23, [%[ptr_window]]!\n\t"
"vld1.16 d25, [%[ptr_syn]]\n\t"
// tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// inst->window[i], inst->real[i], 14); // Q0, window in Q14
"vmull.s16 q11, d22, d23\n\t"
"vrshrn.i32 d22, q11, #14\n\t"
// tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13);
"vmull.s16 q11, d24, d22\n\t"
// tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
"vqrshrn.s32 d22, q11, #13\n\t"
// inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(
// inst->synthesisBuffer[i], tmp16b); // Q0
"vqadd.s16 d25, d22\n\t"
"vst1.16 d25, [%[ptr_syn]]!\n\t"
// Load variables.
"vld1.16 d26, [%[ptr_real]]!\n\t"
"vld1.16 d27, [%[ptr_window]]!\n\t"
"vld1.16 d28, [%[ptr_syn]]\n\t"
// tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// inst->window[i], inst->real[i], 14); // Q0, window in Q14
"vmull.s16 q13, d26, d27\n\t"
"vrshrn.i32 d26, q13, #14\n\t"
// tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13);
"vmull.s16 q13, d24, d26\n\t"
// tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
"vqrshrn.s32 d26, q13, #13\n\t"
// inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(
// inst->synthesisBuffer[i], tmp16b); // Q0
"vqadd.s16 d28, d26\n\t"
"vst1.16 d28, [%[ptr_syn]]!\n\t"
// Specify constraints.
:[ptr_real]"+r"(ptr_real),
[ptr_window]"+r"(ptr_window),
[ptr_syn]"+r"(ptr_syn)
:
:"d22", "d23", "d24", "d25", "d26", "d27", "d28", "q11", "q12", "q13"
);
}
int16_t* ptr_out = &out_frame[0];
ptr_syn = &inst->synthesisBuffer[0];
// read out fully processed segment
for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms]; ) {
// Loop unrolled once. Both pointers are incremented in the assembly code.
__asm__ __volatile__(
// out_frame[i] = inst->synthesisBuffer[i]; // Q0
"vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
"vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t"
"vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
"vst1.16 {d24, d25}, [%[ptr_out]]!\n\t"
:[ptr_syn]"+r"(ptr_syn),
[ptr_out]"+r"(ptr_out)
:
:"d22", "d23", "d24", "d25"
);
}
// Update synthesis buffer.
// C code:
// WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
// inst->synthesisBuffer + inst->blockLen10ms,
// inst->anaLen - inst->blockLen10ms);
ptr_out = &inst->synthesisBuffer[0],
ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
// Loop unrolled once. Both pointers are incremented in the assembly code.
__asm__ __volatile__(
"vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
"vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t"
"vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
"vst1.16 {d24, d25}, [%[ptr_out]]!\n\t"
:[ptr_syn]"+r"(ptr_syn),
[ptr_out]"+r"(ptr_out)
:
:"d22", "d23", "d24", "d25"
);
}
// C code:
// WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
// + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
for (; ptr_out < &inst->synthesisBuffer[inst->anaLen]; ) {
// Loop unrolled once. Pointer is incremented in the assembly code.
__asm__ __volatile__(
"vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
"vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
:[ptr_out]"+r"(ptr_out)
:
:"d20", "d21"
);
}
}
// Update analysis buffer for lower band, and window data before FFT.
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech) {
int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
int16_t* ptr_out = &inst->analysisBuffer[0];
// For lower band update analysis buffer.
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
// inst->analysisBuffer + inst->blockLen10ms,
// inst->anaLen - inst->blockLen10ms);
for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms]; ) {
// Loop unrolled once, so both pointers are incremented by 8 twice.
__asm__ __volatile__(
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
"vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
"vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
"vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
:[ptr_ana]"+r"(ptr_ana),
[ptr_out]"+r"(ptr_out)
:
:"d20", "d21", "d22", "d23"
);
}
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
// + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen]; ) {
// Loop unrolled once, so both pointers are incremented by 8 twice.
__asm__ __volatile__(
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
"vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
"vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
"vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
:[ptr_ana]"+r"(ptr_ana),
[ptr_out]"+r"(ptr_out)
:
:"d20", "d21", "d22", "d23"
);
}
// Window data before FFT
int16_t* ptr_window = &inst->window[0];
ptr_out = &out[0];
ptr_ana = &inst->analysisBuffer[0];
for (; ptr_out < &out[inst->anaLen]; ) {
// Loop unrolled once, so all pointers are incremented by 4 twice.
__asm__ __volatile__(
"vld1.16 d20, [%[ptr_ana]]!\n\t"
"vld1.16 d21, [%[ptr_window]]!\n\t"
// out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// inst->window[i], inst->analysisBuffer[i], 14); // Q0
"vmull.s16 q10, d20, d21\n\t"
"vrshrn.i32 d20, q10, #14\n\t"
"vst1.16 d20, [%[ptr_out]]!\n\t"
"vld1.16 d22, [%[ptr_ana]]!\n\t"
"vld1.16 d23, [%[ptr_window]]!\n\t"
// out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// inst->window[i], inst->analysisBuffer[i], 14); // Q0
"vmull.s16 q11, d22, d23\n\t"
"vrshrn.i32 d22, q11, #14\n\t"
"vst1.16 d22, [%[ptr_out]]!\n\t"
// Specify constraints.
:[ptr_ana]"+r"(ptr_ana),
[ptr_window]"+r"(ptr_window),
[ptr_out]"+r"(ptr_out)
:
:"d20", "d21", "d22", "d23", "q10", "q11"
);
}
}
// Create a complex number buffer (out[]) as the input (in[]) interleaved with
// zeros, and normalize it.
inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
int16_t* in,
int16_t* out) {
int16_t* ptr_out = &out[0];
int16_t* ptr_in = &in[0];
__asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
for (; ptr_in < &in[inst->anaLen]; ) {
// Loop unrolled once, so ptr_in is incremented by 8 twice,
// and ptr_out is incremented by 8 four times.
__asm__ __volatile__(
// out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
"vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
"vshl.s16 q11, q10\n\t"
"vmov d24, d23\n\t"
// out[j + 1] = 0; // Insert zeros in imaginary part
"vmov d23, d25\n\t"
"vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
"vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
// out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
"vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
"vshl.s16 q11, q10\n\t"
"vmov d24, d23\n\t"
// out[j + 1] = 0; // Insert zeros in imaginary part
"vmov d23, d25\n\t"
"vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
"vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
// Specify constraints.
:[ptr_in]"+r"(ptr_in),
[ptr_out]"+r"(ptr_out)
:
:"d22", "d23", "d24", "d25", "q10", "q11"
);
}
}
#endif // defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)