webrtc/modules/audio_coding/NetEQ/main/source/preemptive_expand.c

526 lines
19 KiB
C

/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This file contains the Pre-emptive Expand algorithm that is used to increase
* the delay by repeating a part of the audio stream.
*/
#include "dsp.h"
#include "signal_processing_library.h"
#include "dsp_helpfunctions.h"
#include "neteq_error_codes.h"
/* Correlation length handed to WebRtcNetEQ_CrossCorr; also the number of
   elements in the correlation vectors pw32_corr and pw16_corr. */
#define PREEMPTIVE_CORR_LEN 50
/* Smallest and largest lag of the correlation search. The search is run on
   the signal after it has been down-sampled to 4 kHz. */
#define PREEMPTIVE_MIN_LAG 10
#define PREEMPTIVE_MAX_LAG 60
/* Number of down-sampled samples requested from the down-sampler
   (correlation length plus the largest lag). */
#define PREEMPTIVE_DOWNSAMPLED_LEN (PREEMPTIVE_CORR_LEN + PREEMPTIVE_MAX_LAG)
/* Scratch usage (size, startpos and endpos are counted in WebRtc_Word16
   elements, i.e. offsets from the WebRtc_Word16 scratch pointer):

   Type           Name                 size   startpos  endpos
   WebRtc_Word16  pw16_downSampSpeech  110    0         109
   WebRtc_Word32  pw32_corr            2*50   110       209
   WebRtc_Word16  pw16_corr            50     0         49

   Total: 110+2*50

   pw16_corr starts at the same position as pw16_downSampSpeech, so the two
   vectors share memory when scratch memory is used.
*/
/* Offsets (in WebRtc_Word16 elements) of the vectors listed above */
#define SCRATCH_PW16_DS_SPEECH 0
#define SCRATCH_PW32_CORR PREEMPTIVE_DOWNSAMPLED_LEN
#define SCRATCH_PW16_CORR 0
/****************************************************************************
* WebRtcNetEQ_PreEmptiveExpand(...)
*
* This function tries to extend the audio data by repeating one or several
* pitch periods. The operation is only carried out if the correlation is
* strong or if the signal energy is very low. The algorithm is the
* reciprocal of the Accelerate algorithm.
*
* Input:
* - inst : NetEQ DSP instance
* - scratchPtr : Pointer to scratch vector.
* - decoded : Pointer to newly decoded speech.
* - len : Length of decoded speech.
* - oldDataLen : Length of the part of decoded that has already been played out.
* - BGNonly : If non-zero, Pre-emptive Expand will only copy
* the first DEFAULT_TIME_ADJUST milliseconds of the
* input and append to the end. No signal matching is
* done.
*
* Output:
* - inst : Updated instance
* - outData : Pointer to a memory space where the output data
* should be stored. The vector must be at least
* min(len + 120*fs/8000, NETEQ_MAX_OUTPUT_SIZE)
* elements long.
* - pw16_len : Number of samples written to outData.
*
* Return value : 0 - Ok
* <0 - Error
*/
int WebRtcNetEQ_PreEmptiveExpand(DSPInst_t *inst,
#ifdef SCRATCH
WebRtc_Word16 *pw16_scratchPtr,
#endif
const WebRtc_Word16 *pw16_decoded, int len, int oldDataLen,
WebRtc_Word16 *pw16_outData, WebRtc_Word16 *pw16_len,
WebRtc_Word16 BGNonly)
{
/*
* Outline of the function body:
* 1. Validate the input lengths; on failure copy the input straight to the
* output and return NETEQ_OTHER_ERROR.
* 2. If BGNonly is set, append a fixed-length chunk taken from the start of
* the input and return.
* 3. Down-sample to 4 kHz, cross-correlate and pick the best lag
* (w16_bestIndex). With NETEQ_STEREO, a slave instance skips this and
* takes the lag from msInfo instead.
* 4. Compute the normalized correlation at that lag and a simple
* energy-based VAD decision (skipped for a slave instance).
* 5. If the criteria are met, insert w16_bestIndex samples by cross-fading;
* otherwise copy the input unchanged. Both outcomes of this step return 0.
*/
#ifdef SCRATCH
/* Use scratch memory for internal temporary vectors */
WebRtc_Word16 *pw16_downSampSpeech = pw16_scratchPtr + SCRATCH_PW16_DS_SPEECH;
WebRtc_Word32 *pw32_corr = (WebRtc_Word32*) (pw16_scratchPtr + SCRATCH_PW32_CORR);
/* Same offset as pw16_downSampSpeech: the two vectors overlap in scratch memory */
WebRtc_Word16 *pw16_corr = pw16_scratchPtr + SCRATCH_PW16_CORR;
#else
/* Allocate memory for temporary vectors */
WebRtc_Word16 pw16_downSampSpeech[PREEMPTIVE_DOWNSAMPLED_LEN];
WebRtc_Word32 pw32_corr[PREEMPTIVE_CORR_LEN];
WebRtc_Word16 pw16_corr[PREEMPTIVE_CORR_LEN];
#endif
/* Largest absolute sample value in the input; drives the overflow scaling */
WebRtc_Word16 w16_decodedMax = 0;
/* General-purpose temporaries; their meaning changes from step to step */
WebRtc_Word16 w16_tmp;
WebRtc_Word16 w16_tmp2;
WebRtc_Word32 w32_tmp;
WebRtc_Word32 w32_tmp2;
const WebRtc_Word16 w16_startLag = PREEMPTIVE_MIN_LAG;
const WebRtc_Word16 w16_endLag = PREEMPTIVE_MAX_LAG;
const WebRtc_Word16 w16_corrLen = PREEMPTIVE_CORR_LEN;
/* Source vectors for the energy/correlation calculation and for the cross-fade */
const WebRtc_Word16 *pw16_vec1, *pw16_vec2;
/* Write position in the output for the cross-faded segment */
WebRtc_Word16 *pw16_vectmp;
/* Cross-fade step and initial fading factor, both in Q14 */
WebRtc_Word16 w16_inc, w16_startfact;
/* Result of the peak detection; w16_bestIndex is the number of samples inserted */
WebRtc_Word16 w16_bestIndex, w16_bestVal;
/* Defaults to "active speech"; only changed by the VAD check further down */
WebRtc_Word16 w16_VAD = 1;
WebRtc_Word16 fsMult;
WebRtc_Word16 fsMult120;
/* Energies of the two segments and their cross-correlation */
WebRtc_Word32 w32_en1, w32_en2, w32_cc;
WebRtc_Word16 w16_en1, w16_en2;
WebRtc_Word16 w16_en1Scale, w16_en2Scale;
WebRtc_Word16 w16_sqrtEn1En2;
/* Normalized correlation in Q14; stays 0 unless computed in the active-speech branch */
WebRtc_Word16 w16_bestCorr = 0;
int ok;
#ifdef NETEQ_STEREO
MasterSlaveInfo *msInfo = inst->msInfo;
#endif
fsMult = WebRtcNetEQ_CalcFsMult(inst->fs); /* Calculate fs/8000 */
/* Pre-calculate common multiplication with fsMult */
fsMult120 = (WebRtc_Word16) WEBRTC_SPL_MUL_16_16(fsMult, 120); /* 15 ms */
inst->ExpandInst.w16_consecExp = 0; /* Last was not expand any more */
/*
* Sanity check for len variable; must be (almost) 30 ms (120*fsMult + max(bestIndex)).
* Also, the new part must be at least .625 ms (w16_overlap).
*/
if (len < (WebRtc_Word16) WEBRTC_SPL_MUL_16_16((120 + 119), fsMult) || oldDataLen >= len
- inst->ExpandInst.w16_overlap)
{
/* Length of decoded data too short */
inst->w16_mode = MODE_UNSUCCESS_PREEMPTIVE;
*pw16_len = len;
/* simply move all data from decoded to outData */
WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);
/* Output is still valid (unmodified input), but the call is reported as an error */
return NETEQ_OTHER_ERROR;
}
/***********************************/
/* Special operations for BGN only */
/***********************************/
/* Check if "background noise only" flag is set */
if (BGNonly)
{
/* special operation for BGN only; simply insert a chunk of data */
w16_bestIndex = DEFAULT_TIME_ADJUST * (fsMult << 3); /* X*fs/1000 */
/* Sanity check for bestIndex */
if (w16_bestIndex > len)
{ /* not good, do nothing instead */
inst->w16_mode = MODE_UNSUCCESS_PREEMPTIVE;
*pw16_len = len;
/* simply move all data from decoded to outData */
WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);
return NETEQ_OTHER_ERROR;
}
/* set length parameter */
*pw16_len = len + w16_bestIndex;
/* copy to output */
WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, len);
/*
* Append the first w16_bestIndex input samples after the copied data.
* The check above guarantees w16_bestIndex <= len, so source [0, bestIndex)
* and destination [len, len + bestIndex) do not overlap if pw16_outData
* and pw16_decoded point to the same buffer.
* NOTE(review): len + w16_bestIndex is not compared against
* NETEQ_MAX_OUTPUT_SIZE on this path - confirm the caller guarantees room.
*/
WEBRTC_SPL_MEMCPY_W16(&pw16_outData[len], pw16_decoded, w16_bestIndex);
/* set mode */
inst->w16_mode = MODE_LOWEN_PREEMPTIVE;
/* update statistics */
inst->statInst.preemptiveLength += w16_bestIndex;
return 0;
} /* end of special code for BGN mode */
#ifdef NETEQ_STEREO
/* Sanity for msInfo */
if (msInfo == NULL)
{
/* this should not happen here */
/* NOTE(review): returns without setting w16_mode, *pw16_len or outData */
return MASTER_SLAVE_ERROR;
}
if ((msInfo->msMode == NETEQ_MASTER) || (msInfo->msMode == NETEQ_MONO))
{
/* Find correlation lag only for non-slave instances */
#endif
/****************************************************************/
/* Find the strongest correlation lag by downsampling to 4 kHz, */
/* calculating correlation for downsampled signal and finding */
/* the strongest correlation peak. */
/****************************************************************/
/* find maximum absolute value */
w16_decodedMax = WebRtcSpl_MaxAbsValueW16(pw16_decoded, (WebRtc_Word16) len);
/* downsample the decoded speech to 4 kHz */
ok = WebRtcNetEQ_DownSampleTo4kHz(pw16_decoded, len, inst->fs, pw16_downSampSpeech,
PREEMPTIVE_DOWNSAMPLED_LEN, 1 /* compensate delay*/);
if (ok != 0)
{
/* error */
inst->w16_mode = MODE_UNSUCCESS_PREEMPTIVE;
*pw16_len = len;
/* simply move all data from decoded to outData */
WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);
return NETEQ_OTHER_ERROR;
}
/*
* Set scaling factor for cross correlation to protect against
* overflow (log2(50) => 6)
*/
w16_tmp = 6 - WebRtcSpl_NormW32(WEBRTC_SPL_MUL_16_16(w16_decodedMax, w16_decodedMax));
w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);
/* Perform correlation from lag 10 to lag 60 in 4 kHz domain */WebRtcNetEQ_CrossCorr(
pw32_corr, &pw16_downSampSpeech[w16_endLag],
&pw16_downSampSpeech[w16_endLag - w16_startLag], w16_corrLen,
(WebRtc_Word16) (w16_endLag - w16_startLag), w16_tmp, -1);
/* Normalize correlation to 14 bits and put in a WebRtc_Word16 vector */
w32_tmp = WebRtcSpl_MaxAbsValueW32(pw32_corr, w16_corrLen);
w16_tmp = 17 - WebRtcSpl_NormW32(w32_tmp);
w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);
/*
* With SCRATCH defined, pw16_corr overlays pw16_downSampSpeech, so this
* write overwrites the down-sampled speech. That vector is not read again
* below.
*/
WebRtcSpl_VectorBitShiftW32ToW16(pw16_corr, w16_corrLen, pw32_corr, w16_tmp);
/* Find limits for peak finding, in order to avoid overflowing the NetEQ algorithm buffer. */
/* Calculate difference between MAX_OUTPUT_SIZE and len in 4 kHz domain. */
/*
* From here on w16_tmp holds the number of correlation values handed to the
* peak detection.
* NOTE(review): if len is close to NETEQ_MAX_OUTPUT_SIZE this value can
* become zero or negative; how WebRtcNetEQ_PeakDetection treats that is not
* visible here - confirm.
*/
w16_tmp = WebRtcSpl_DivW32W16ResW16((WebRtc_Word32) (NETEQ_MAX_OUTPUT_SIZE - len),
(WebRtc_Word16) (fsMult << 1)) - w16_startLag;
w16_tmp = WEBRTC_SPL_MIN(w16_corrLen, w16_tmp); /* no more than corrLen = 50 */
#ifdef NETEQ_STEREO
} /* end if (msInfo->msMode != NETEQ_SLAVE) */
if ((msInfo->msMode == NETEQ_MASTER) || (msInfo->msMode == NETEQ_MONO))
{
/* Find the strongest correlation peak by using the parabolic fit method */
WebRtcNetEQ_PeakDetection(pw16_corr, w16_tmp, 1, fsMult, &w16_bestIndex, &w16_bestVal);
/* 0 <= bestIndex <= (2*w16_tmp - 1)*fsMult <= (2*corrLen - 1)*fsMult = 99*fsMult */
/* Compensate bestIndex for displaced starting position */
w16_bestIndex = w16_bestIndex + w16_startLag * WEBRTC_SPL_LSHIFT_W16(fsMult, 1);
/* 20*fsMult <= bestIndex <= 119*fsMult */
/*
* Publish the lag for slave instances.
* NOTE(review): this is stored before the low-energy branch below may
* reduce w16_bestIndex to len - oldDataLen, so the stored value can differ
* from the lag actually used here - confirm this is intended.
*/
msInfo->bestIndex = w16_bestIndex;
}
else if (msInfo->msMode == NETEQ_SLAVE)
{
if (msInfo->extraInfo == PE_EXP_FAIL)
{
/* Master has signaled an unsuccessful preemptive expand */
w16_bestIndex = 0;
}
else
{
/* Get best index from master */
w16_bestIndex = msInfo->bestIndex;
}
}
else
{
/* Invalid mode */
/* NOTE(review): returns without setting w16_mode, *pw16_len or outData */
return (MASTER_SLAVE_ERROR);
}
#else /* NETEQ_STEREO */
/* Find the strongest correlation peak by using the parabolic fit method */
WebRtcNetEQ_PeakDetection(pw16_corr, w16_tmp, 1, fsMult, &w16_bestIndex, &w16_bestVal);
/* 0 <= bestIndex <= (2*w16_tmp - 1)*fsMult <= (2*corrLen - 1)*fsMult = 99*fsMult */
/* Compensate bestIndex for displaced starting position */
w16_bestIndex = w16_bestIndex + w16_startLag * WEBRTC_SPL_LSHIFT_W16(fsMult, 1);
/* 20*fsMult <= bestIndex <= 119*fsMult */
#endif /* NETEQ_STEREO */
#ifdef NETEQ_STEREO
if ((msInfo->msMode == NETEQ_MASTER) || (msInfo->msMode == NETEQ_MONO))
{
/* Calculate correlation only for non-slave instances */
/* (a slave therefore keeps w16_VAD == 1 and w16_bestCorr == 0) */
#endif /* NETEQ_STEREO */
/*****************************************************/
/* Calculate correlation bestCorr for the found lag. */
/* Also do a simple VAD decision. */
/*****************************************************/
/*
* Calculate scaling to ensure that bestIndex samples can be square-summed
* without overflowing
*/
/* bits used by max^2, plus bits used by bestIndex, minus the 31 available */
w16_tmp = (31
- WebRtcSpl_NormW32(WEBRTC_SPL_MUL_16_16(w16_decodedMax, w16_decodedMax)));
w16_tmp += (31 - WebRtcSpl_NormW32(w16_bestIndex));
w16_tmp -= 31;
w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);
/* vec1 starts at 15 ms minus one pitch period */
pw16_vec1 = &pw16_decoded[fsMult120 - w16_bestIndex];
/* vec2 start at 15 ms */
pw16_vec2 = &pw16_decoded[fsMult120];
/* Calculate energies for vec1 and vec2 */
w32_en1 = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec1,
(WebRtc_Word16*) pw16_vec1, w16_bestIndex, w16_tmp);
w32_en2 = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec2,
(WebRtc_Word16*) pw16_vec2, w16_bestIndex, w16_tmp);
/* Calculate cross-correlation at the found lag */
w32_cc = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec1, (WebRtc_Word16*) pw16_vec2,
w16_bestIndex, w16_tmp);
/* Check VAD constraint
((en1+en2)/(2*bestIndex)) <= 8*inst->BGNInst.energy */
/*
* The division by bestIndex is avoided by cross-multiplying: the code
* compares (en1+en2)/16 (w32_tmp) against bestIndex*energy (w32_tmp2).
*/
w32_tmp = WEBRTC_SPL_RSHIFT_W32(w32_en1 + w32_en2, 4); /* (en1+en2)/(2*8) */
if (inst->BGNInst.w16_initialized == 1)
{
w32_tmp2 = inst->BGNInst.w32_energy;
}
else
{
/* if BGN parameters have not been estimated, use a fixed threshold */
w32_tmp2 = 75000;
}
/*
* Shift the threshold down so that it fits in a WebRtc_Word16 before it is
* multiplied by bestIndex, and apply the same shift to the energy side so
* the comparison stays balanced.
*/
w16_tmp2 = 16 - WebRtcSpl_NormW32(w32_tmp2);
w16_tmp2 = WEBRTC_SPL_MAX(0, w16_tmp2);
w32_tmp = WEBRTC_SPL_RSHIFT_W32(w32_tmp, w16_tmp2);
w16_tmp2 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_tmp2, w16_tmp2);
w32_tmp2 = WEBRTC_SPL_MUL_16_16(w16_bestIndex, w16_tmp2);
/* Scale w32_tmp properly before comparing with w32_tmp2 */
/* (w16_tmp is scaling before energy calculation, thus 2*w16_tmp) */
if (WebRtcSpl_NormW32(w32_tmp) < WEBRTC_SPL_LSHIFT_W32(w16_tmp,1))
{
/* Cannot scale only w32_tmp, must scale w32_tmp2 too */
/* Shift w32_tmp up by the headroom it has, and w32_tmp2 down by the rest */
WebRtc_Word16 tempshift = WebRtcSpl_NormW32(w32_tmp);
w32_tmp = WEBRTC_SPL_LSHIFT_W32(w32_tmp, tempshift);
w32_tmp2 = WEBRTC_SPL_RSHIFT_W32(w32_tmp2,
WEBRTC_SPL_LSHIFT_W32(w16_tmp,1) - tempshift);
}
else
{
/* Enough headroom: undo the full 2*w16_tmp scaling on w32_tmp alone */
w32_tmp = WEBRTC_SPL_LSHIFT_W32(w32_tmp,
WEBRTC_SPL_LSHIFT_W32(w16_tmp,1));
}
if (w32_tmp <= w32_tmp2) /*((en1+en2)/(2*bestIndex)) <= 8*inst->BGNInst.energy */
{
/* The signal seems to be passive speech */
w16_VAD = 0;
w16_bestCorr = 0; /* Correlation does not matter */
/* For low energy expansion, the new data can be less than 15 ms,
but we must ensure that bestIndex is not larger than the new data. */
w16_bestIndex = WEBRTC_SPL_MIN( w16_bestIndex, len - oldDataLen );
}
else
{
/* The signal is active speech */
w16_VAD = 1;
/* Calculate correlation (cc/sqrt(en1*en2)) */
/* Start with calculating scale values */
/* Right-shifts that bring each energy into WebRtc_Word16 range */
w16_en1Scale = 16 - WebRtcSpl_NormW32(w32_en1);
w16_en1Scale = WEBRTC_SPL_MAX(0, w16_en1Scale);
w16_en2Scale = 16 - WebRtcSpl_NormW32(w32_en2);
w16_en2Scale = WEBRTC_SPL_MAX(0, w16_en2Scale);
/* Make sure total scaling is even (to simplify scale factor after sqrt) */
if ((w16_en1Scale + w16_en2Scale) & 1)
{
w16_en1Scale += 1;
}
/* Convert energies to WebRtc_Word16 */
w16_en1 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_en1, w16_en1Scale);
w16_en2 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_en2, w16_en2Scale);
/* Calculate energy product */
w32_tmp = WEBRTC_SPL_MUL_16_16(w16_en1, w16_en2);
/* Calculate square-root of energy product */
w16_sqrtEn1En2 = (WebRtc_Word16) WebRtcSpl_Sqrt(w32_tmp);
/* Calculate cc/sqrt(en1*en2) in Q14 */
/* The sqrt halves the combined energy scaling, hence the >> 1 */
w16_tmp = 14 - ((w16_en1Scale + w16_en2Scale) >> 1);
w32_cc = WEBRTC_SPL_SHIFT_W32(w32_cc, w16_tmp);
w32_cc = WEBRTC_SPL_MAX(0, w32_cc); /* Don't divide with negative number */
/*
* NOTE(review): there is no explicit guard here against
* w16_sqrtEn1En2 == 0; the result then depends on how
* WebRtcSpl_DivW32W16 handles a zero divisor - confirm.
*/
w16_bestCorr = (WebRtc_Word16) WebRtcSpl_DivW32W16(w32_cc, w16_sqrtEn1En2);
w16_bestCorr = WEBRTC_SPL_MIN(16384, w16_bestCorr); /* set maximum to 1.0 */
}
#ifdef NETEQ_STEREO
} /* end if (msInfo->msMode != NETEQ_SLAVE) */
#endif /* NETEQ_STEREO */
/*******************************************************/
/* Check preemptive expand criteria and insert samples */
/*******************************************************/
/* Check for strong correlation (>0.9) and at least 15 ms new data,
or passive speech */
/* (14746 is 0.9 in Q14) */
#ifdef NETEQ_STEREO
/*
* A slave does not evaluate the signal criteria itself: it expands whenever
* the master has not flagged PE_EXP_FAIL.
*/
if (((((w16_bestCorr > 14746) && (oldDataLen <= fsMult120)) || (w16_VAD == 0))
&& (msInfo->msMode != NETEQ_SLAVE)) || ((msInfo->msMode == NETEQ_SLAVE)
&& (msInfo->extraInfo != PE_EXP_FAIL)))
#else
if (((w16_bestCorr > 14746) && (oldDataLen <= fsMult120))
|| (w16_VAD == 0))
#endif
{
/* Do expand operation by overlap add */
/* Set length of the first part, not to be modified */
WebRtc_Word16 w16_startIndex = WEBRTC_SPL_MAX(oldDataLen, fsMult120);
/*
* Calculate cross-fading slope so that the fading factor goes from
* 1 (16384 in Q14) to 0 in one pitch period (bestIndex).
*/
w16_inc = (WebRtc_Word16) WebRtcSpl_DivW32W16((WebRtc_Word32) 16384,
(WebRtc_Word16) (w16_bestIndex + 1)); /* in Q14 */
/* Initiate fading factor */
w16_startfact = 16384 - w16_inc;
/* vec1 starts at w16_startIndex (15 ms or oldDataLen, whichever is larger)
minus one pitch period */
pw16_vec1 = &pw16_decoded[w16_startIndex - w16_bestIndex];
/* vec2 starts at w16_startIndex */
pw16_vec2 = &pw16_decoded[w16_startIndex];
/* Copy unmodified part [0 to w16_startIndex] */
WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, w16_startIndex);
/* Generate interpolated part of length bestIndex (1 pitch period) */
pw16_vectmp = pw16_outData + w16_startIndex;
/* Reuse mixing function from Expand */
WebRtcNetEQ_MixVoiceUnvoice(pw16_vectmp, (WebRtc_Word16*) pw16_vec2,
(WebRtc_Word16*) pw16_vec1, &w16_startfact, w16_inc, w16_bestIndex);
/* Move the last part (also unmodified) */
/* Take from decoded at w16_startIndex; the samples at that position appear
in the output after the inserted segment */
pw16_vec2 = &pw16_decoded[w16_startIndex];
WEBRTC_SPL_MEMMOVE_W16(&pw16_outData[w16_startIndex + w16_bestIndex], pw16_vec2,
(WebRtc_Word16) (len - w16_startIndex));
/* Set the mode flag */
/* (a slave instance always has w16_VAD == 1 here and so reports SUCCESS) */
if (w16_VAD)
{
inst->w16_mode = MODE_SUCCESS_PREEMPTIVE;
}
else
{
inst->w16_mode = MODE_LOWEN_PREEMPTIVE;
}
/* Calculate resulting length = original length + pitch period */
*pw16_len = len + w16_bestIndex;
/* Update in-call statistics */
inst->statInst.preemptiveLength += w16_bestIndex;
return 0;
}
else
{
/* Preemptive Expand not allowed */
#ifdef NETEQ_STEREO
/* Signal to slave(s) that this was unsuccessful */
if (msInfo->msMode == NETEQ_MASTER)
{
msInfo->extraInfo = PE_EXP_FAIL;
}
#endif
/* Set mode flag to unsuccessful preemptive expand */
inst->w16_mode = MODE_UNSUCCESS_PREEMPTIVE;
/* Length is unmodified */
*pw16_len = len;
/* Simply move all data from decoded to outData */
WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);
/* Not an error: unlike the early sanity failures, this path returns 0 */
return 0;
}
}
/* Remove the scratch offset macros used by WebRtcNetEQ_PreEmptiveExpand */
#undef SCRATCH_PW16_DS_SPEECH
#undef SCRATCH_PW32_CORR
#undef SCRATCH_PW16_CORR