Separate Analyze from Process in NS
Filled in the empty analyze API, separating the noise estimation from the process API. No formatting fixes or extra refactoring have been done, to make the review process easier. This patch has been tested for bit-exactness over the whole QA set at every aggressiveness setting.

BUG=webrtc:3811
R=bjornv@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/27549004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7243 4adac7df-926f-26a2-2b94-8c16560cd09d
commit fbf3bfe172 (parent 95705602bd)
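To illustrate the split for reviewers, a minimal per-frame call sequence might look like the sketch below. It is not part of the patch: the wrapper name, the 16 kHz / 160-sample frame size, and the "ns_core.h" include path are assumptions; only NSinst_t, WebRtcNs_AnalyzeCore, and WebRtcNs_ProcessCore (with their 0/-1 return convention) come from the diff itself.

#include <stddef.h>   /* NULL */
#include "ns_core.h"  /* assumed header path; declares NSinst_t and the *Core functions */

/* Hypothetical wrapper, not patch code: one 10 ms frame at 16 kHz (160 float
 * samples). Noise analysis runs first on the input frame, then suppression.
 * The high-band pointers are only required when inst->fs == 32000, so NULL is
 * passed here. */
static int SuppressFrame16k(NSinst_t* inst, float* frame, float* out) {
  if (WebRtcNs_AnalyzeCore(inst, frame) != 0) {
    return -1;  /* e.g. instance not initialized */
  }
  return WebRtcNs_ProcessCore(inst, frame, NULL, out, NULL);
}

Bit-exactness is preserved because the analysis pass buffers into the new analyzeBuf, while WebRtcNs_ProcessCore keeps feeding dataBuf exactly as before.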
Noise suppression core implementation:

@@ -113,6 +113,7 @@ int WebRtcNs_InitCore(NSinst_t* inst, uint32_t fs) {
   memset(inst->dataBuf, 0, sizeof(float) * ANAL_BLOCKL_MAX);
   WebRtc_rdft(inst->anaLen, 1, inst->dataBuf, inst->ip, inst->wfft);
 
+  memset(inst->analyzeBuf, 0, sizeof(float) * ANAL_BLOCKL_MAX);
   memset(inst->dataBuf, 0, sizeof(float) * ANAL_BLOCKL_MAX);
   memset(inst->syntBuf, 0, sizeof(float) * ANAL_BLOCKL_MAX);
 
@@ -147,7 +148,7 @@ int WebRtcNs_InitCore(NSinst_t* inst, uint32_t fs) {
     inst->noisePrev[i] = (float)0.0; //previous noise-spectrum
     inst->logLrtTimeAvg[i] = LRT_FEATURE_THR; //smooth LR ratio (same as threshold)
     inst->magnAvgPause[i] = (float)0.0; //conservative noise spectrum estimate
-    inst->speechProbHB[i] = (float)0.0; //for estimation of HB in second pass
+    inst->speechProb[i] = (float)0.0; //for estimation of HB in second pass
     inst->initMagnEst[i] = (float)0.0; //initial average mag spectrum
   }
 
@@ -714,34 +715,20 @@ void WebRtcNs_SpeechNoiseProb(NSinst_t* inst, float* probSpeechFinal, float* snr
   }
 }
 
-int WebRtcNs_AnalyzeCore(NSinst_t* inst, float* inFrame) {
-  return 0;
-}
-
-int WebRtcNs_ProcessCore(NSinst_t* inst,
-                         float* speechFrame,
-                         float* speechFrameHB,
-                         float* outFrame,
-                         float* outFrameHB) {
-  // main routine for noise reduction
-
-  int flagHB = 0;
+int WebRtcNs_AnalyzeCore(NSinst_t* inst, float* speechFrame) {
   int i;
   const int kStartBand = 5; // Skip first frequency bins during estimation.
   int updateParsFlag;
 
-  float energy1, energy2, gain, factor, factor1, factor2;
+  float energy;
   float signalEnergy, sumMagn;
   float snrPrior, currentEstimateStsa;
   float tmpFloat1, tmpFloat2, tmpFloat3, probSpeech, probNonSpeech;
   float gammaNoiseTmp, gammaNoiseOld;
   float noiseUpdateTmp, fTmp;
-  float fout[BLOCKL_MAX];
   float winData[ANAL_BLOCKL_MAX];
   float magn[HALF_ANAL_BLOCKL], noise[HALF_ANAL_BLOCKL];
-  float theFilter[HALF_ANAL_BLOCKL], theFilterTmp[HALF_ANAL_BLOCKL];
   float snrLocPost[HALF_ANAL_BLOCKL], snrLocPrior[HALF_ANAL_BLOCKL];
-  float probSpeechFinal[HALF_ANAL_BLOCKL] = { 0 };
   float previousEstimateStsa[HALF_ANAL_BLOCKL];
   float real[ANAL_BLOCKL_MAX], imag[HALF_ANAL_BLOCKL];
   // Variables during startup
@@ -753,56 +740,29 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
   float parametric_exp = 0.0;
   float parametric_num = 0.0;
 
-  // SWB variables
-  int deltaBweHB = 1;
-  int deltaGainHB = 1;
-  float decayBweHB = 1.0;
-  float gainMapParHB = 1.0;
-  float gainTimeDomainHB = 1.0;
-  float avgProbSpeechHB, avgProbSpeechHBTmp, avgFilterGainHB, gainModHB;
-
   // Check that initiation has been done
   if (inst->initFlag != 1) {
     return (-1);
   }
-  // Check for valid pointers based on sampling rate
-  if (inst->fs == 32000) {
-    if (speechFrameHB == NULL) {
-      return -1;
-    }
-    flagHB = 1;
-    // range for averaging low band quantities for H band gain
-    deltaBweHB = (int)inst->magnLen / 4;
-    deltaGainHB = deltaBweHB;
-  }
   //
   updateParsFlag = inst->modelUpdatePars[0];
   //
 
   // update analysis buffer for L band
-  memcpy(inst->dataBuf, inst->dataBuf + inst->blockLen10ms,
+  memcpy(inst->analyzeBuf, inst->analyzeBuf + inst->blockLen10ms,
          sizeof(float) * (inst->anaLen - inst->blockLen10ms));
-  memcpy(inst->dataBuf + inst->anaLen - inst->blockLen10ms, speechFrame,
+  memcpy(inst->analyzeBuf + inst->anaLen - inst->blockLen10ms, speechFrame,
          sizeof(float) * inst->blockLen10ms);
 
-  if (flagHB == 1) {
-    // update analysis buffer for H band
-    memcpy(inst->dataBufHB, inst->dataBufHB + inst->blockLen10ms,
-           sizeof(float) * (inst->anaLen - inst->blockLen10ms));
-    memcpy(inst->dataBufHB + inst->anaLen - inst->blockLen10ms, speechFrameHB,
-           sizeof(float) * inst->blockLen10ms);
-  }
-
   // check if processing needed
   if (inst->outLen == 0) {
     // windowing
-    energy1 = 0.0;
+    energy = 0.0;
     for (i = 0; i < inst->anaLen; i++) {
-      winData[i] = inst->window[i] * inst->dataBuf[i];
-      energy1 += winData[i] * winData[i];
+      winData[i] = inst->window[i] * inst->analyzeBuf[i];
+      energy += winData[i] * winData[i];
     }
-    if (energy1 == 0.0) {
-      // synthesize the special case of zero input
+    if (energy == 0.0) {
       // we want to avoid updating statistics in this case:
      // Updating feature statistics when we have zeros only will cause thresholds to
      // move towards zero signal situations. This in turn has the effect that once the
@@ -810,34 +770,6 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
      // and there is no noise suppression effect. Depending on the duration of the
      // inactive signal it takes a considerable amount of time for the system to learn
      // what is noise and what is speech.
-
-      // read out fully processed segment
-      for (i = inst->windShift; i < inst->blockLen + inst->windShift; i++) {
-        fout[i - inst->windShift] = inst->syntBuf[i];
-      }
-      // update synthesis buffer
-      memcpy(inst->syntBuf, inst->syntBuf + inst->blockLen,
-             sizeof(float) * (inst->anaLen - inst->blockLen));
-      memset(inst->syntBuf + inst->anaLen - inst->blockLen, 0,
-             sizeof(float) * inst->blockLen);
-
-      // out buffer
-      inst->outLen = inst->blockLen - inst->blockLen10ms;
-      if (inst->blockLen > inst->blockLen10ms) {
-        for (i = 0; i < inst->outLen; i++) {
-          inst->outBuf[i] = fout[i + inst->blockLen10ms];
-        }
-      }
-      for (i = 0; i < inst->blockLen10ms; ++i)
-        outFrame[i] = WEBRTC_SPL_SAT(
-            WEBRTC_SPL_WORD16_MAX, fout[i], WEBRTC_SPL_WORD16_MIN);
-
-      // for time-domain gain of HB
-      if (flagHB == 1)
-        for (i = 0; i < inst->blockLen10ms; ++i)
-          outFrameHB[i] = WEBRTC_SPL_SAT(
-              WEBRTC_SPL_WORD16_MAX, inst->dataBufHB[i], WEBRTC_SPL_WORD16_MIN);
-
      return 0;
    }
@@ -1011,11 +943,11 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
      }
    }
    // compute speech/noise probability
-    WebRtcNs_SpeechNoiseProb(inst, probSpeechFinal, snrLocPrior, snrLocPost);
+    WebRtcNs_SpeechNoiseProb(inst, inst->speechProb, snrLocPrior, snrLocPost);
    // time-avg parameter for noise update
    gammaNoiseTmp = NOISE_UPDATE;
    for (i = 0; i < inst->magnLen; i++) {
-      probSpeech = probSpeechFinal[i];
+      probSpeech = inst->speechProb[i];
      probNonSpeech = (float)1.0 - probSpeech;
      // temporary noise update:
      // use it for speech frames if update value is less than previous
@@ -1094,14 +1026,124 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
      }
      // smoothing
      inst->smooth[i] = theFilter[i];
      real[i] *= inst->smooth[i];
      imag[i] *= inst->smooth[i];
    }
    // keep track of noise and magn spectrum for next frame
    for (i = 0; i < inst->magnLen; i++) {
      inst->noisePrev[i] = noise[i];
      inst->magnPrev[i] = magn[i];
    }
+  } // end of if inst->outLen == 0
+
+  return 0;
+}
+
+int WebRtcNs_ProcessCore(NSinst_t* inst,
+                         float* speechFrame,
+                         float* speechFrameHB,
+                         float* outFrame,
+                         float* outFrameHB) {
+  // main routine for noise reduction
+  int flagHB = 0;
+  int i;
+
+  float energy1, energy2, gain, factor, factor1, factor2;
+  float fout[BLOCKL_MAX];
+  float winData[ANAL_BLOCKL_MAX];
+  float real[ANAL_BLOCKL_MAX], imag[HALF_ANAL_BLOCKL];
+
+  // SWB variables
+  int deltaBweHB = 1;
+  int deltaGainHB = 1;
+  float decayBweHB = 1.0;
+  float gainMapParHB = 1.0;
+  float gainTimeDomainHB = 1.0;
+  float avgProbSpeechHB, avgProbSpeechHBTmp, avgFilterGainHB, gainModHB;
+
+  // Check that initiation has been done
+  if (inst->initFlag != 1) {
+    return (-1);
+  }
+  // Check for valid pointers based on sampling rate
+  if (inst->fs == 32000) {
+    if (speechFrameHB == NULL) {
+      return -1;
+    }
+    flagHB = 1;
+    // range for averaging low band quantities for H band gain
+    deltaBweHB = (int)inst->magnLen / 4;
+    deltaGainHB = deltaBweHB;
+  }
+
+  // update analysis buffer for L band
+  memcpy(inst->dataBuf, inst->dataBuf + inst->blockLen10ms,
+         sizeof(float) * (inst->anaLen - inst->blockLen10ms));
+  memcpy(inst->dataBuf + inst->anaLen - inst->blockLen10ms, speechFrame,
+         sizeof(float) * inst->blockLen10ms);
+
+  if (flagHB == 1) {
+    // update analysis buffer for H band
+    memcpy(inst->dataBufHB, inst->dataBufHB + inst->blockLen10ms,
+           sizeof(float) * (inst->anaLen - inst->blockLen10ms));
+    memcpy(inst->dataBufHB + inst->anaLen - inst->blockLen10ms, speechFrameHB,
+           sizeof(float) * inst->blockLen10ms);
+  }
+
+  // check if processing needed
+  if (inst->outLen == 0) {
+    // windowing
+    energy1 = 0.0;
+    for (i = 0; i < inst->anaLen; i++) {
+      winData[i] = inst->window[i] * inst->dataBuf[i];
+      energy1 += winData[i] * winData[i];
+    }
+    if (energy1 == 0.0) {
+      // synthesize the special case of zero input
+      // read out fully processed segment
+      for (i = inst->windShift; i < inst->blockLen + inst->windShift; i++) {
+        fout[i - inst->windShift] = inst->syntBuf[i];
+      }
+      // update synthesis buffer
+      memcpy(inst->syntBuf, inst->syntBuf + inst->blockLen,
+             sizeof(float) * (inst->anaLen - inst->blockLen));
+      memset(inst->syntBuf + inst->anaLen - inst->blockLen, 0,
+             sizeof(float) * inst->blockLen);
+
+      // out buffer
+      inst->outLen = inst->blockLen - inst->blockLen10ms;
+      if (inst->blockLen > inst->blockLen10ms) {
+        for (i = 0; i < inst->outLen; i++) {
+          inst->outBuf[i] = fout[i + inst->blockLen10ms];
+        }
+      }
+      for (i = 0; i < inst->blockLen10ms; ++i)
+        outFrame[i] = WEBRTC_SPL_SAT(
+            WEBRTC_SPL_WORD16_MAX, fout[i], WEBRTC_SPL_WORD16_MIN);
+
+      // for time-domain gain of HB
+      if (flagHB == 1)
+        for (i = 0; i < inst->blockLen10ms; ++i)
+          outFrameHB[i] = WEBRTC_SPL_SAT(
+              WEBRTC_SPL_WORD16_MAX, inst->dataBufHB[i], WEBRTC_SPL_WORD16_MIN);
+
+      return 0;
+    }
+
+    // FFT
+    WebRtc_rdft(inst->anaLen, 1, winData, inst->ip, inst->wfft);
+
+    imag[0] = 0;
+    real[0] = winData[0];
+    imag[inst->magnLen - 1] = 0;
+    real[inst->magnLen - 1] = winData[1];
+    for (i = 1; i < inst->magnLen - 1; i++) {
+      real[i] = winData[2 * i];
+      imag[i] = winData[2 * i + 1];
+    }
+
+    for (i = 0; i < inst->magnLen; i++) {
+      real[i] *= inst->smooth[i];
+      imag[i] *= inst->smooth[i];
+    }
    // back to time domain
    winData[0] = real[0];
    winData[1] = real[inst->magnLen - 1];
@@ -1187,14 +1229,11 @@ int WebRtcNs_ProcessCore(NSinst_t* inst,
 
    // for time-domain gain of HB
    if (flagHB == 1) {
-      for (i = 0; i < inst->magnLen; i++) {
-        inst->speechProbHB[i] = probSpeechFinal[i];
-      }
      // average speech prob from low band
      // avg over second half (i.e., 4->8kHz) of freq. spectrum
      avgProbSpeechHB = 0.0;
      for (i = inst->magnLen - deltaBweHB - 1; i < inst->magnLen - 1; i++) {
-        avgProbSpeechHB += inst->speechProbHB[i];
+        avgProbSpeechHB += inst->speechProb[i];
      }
      avgProbSpeechHB = avgProbSpeechHB / ((float)deltaBweHB);
      // average filter gain from low band
Noise suppression core header (NSinst_t and function declarations):

@@ -59,6 +59,7 @@ typedef struct NSinst_t_ {
   int magnLen;
   int aggrMode;
   const float* window;
+  float analyzeBuf[ANAL_BLOCKL_MAX];
   float dataBuf[ANAL_BLOCKL_MAX];
   float syntBuf[ANAL_BLOCKL_MAX];
   float outBuf[3 * BLOCKL_MAX];
@@ -102,7 +103,7 @@ typedef struct NSinst_t_ {
   int histSpecFlat[HIST_PAR_EST];
   int histSpecDiff[HIST_PAR_EST];
   //quantities for high band estimate
-  float speechProbHB[HALF_ANAL_BLOCKL]; //final speech/noise prob: prior + LRT
+  float speechProb[HALF_ANAL_BLOCKL]; //final speech/noise prob: prior + LRT
   float dataBufHB[ANAL_BLOCKL_MAX]; //buffering data for HB
 
 } NSinst_t;
@@ -153,7 +154,7 @@ int WebRtcNs_set_policy_core(NSinst_t* inst, int mode);
 *
 * Input:
 *      - inst          : Instance that should be initialized
-*      - inFrame       : Input speech frame for lower band
+*      - speechFrame   : Input speech frame for lower band
 *
 * Output:
 *      - inst          : Updated instance
@@ -161,7 +162,7 @@ int WebRtcNs_set_policy_core(NSinst_t* inst, int mode);
 * Return value        : 0 - OK
 *                       -1 - Error
 */
-int WebRtcNs_AnalyzeCore(NSinst_t* inst, float* inFrame);
+int WebRtcNs_AnalyzeCore(NSinst_t* inst, float* speechFrame);
 
 /****************************************************************************
 * WebRtcNs_ProcessCore