/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * This file contains the Pre-emptive Expand algorithm that is used to increase
 * the delay by repeating a part of the audio stream.
 */

#include "dsp.h"

#include "signal_processing_library.h"

#include "dsp_helpfunctions.h"
#include "neteq_error_codes.h"

#define PREEMPTIVE_CORR_LEN 50
#define PREEMPTIVE_MIN_LAG 10
#define PREEMPTIVE_MAX_LAG 60
#define PREEMPTIVE_DOWNSAMPLED_LEN (PREEMPTIVE_CORR_LEN + PREEMPTIVE_MAX_LAG)

/* Scratch usage:

 Type            Name                  size            startpos        endpos
 WebRtc_Word16   pw16_downSampSpeech   110             0               109
 WebRtc_Word32   pw32_corr             2*50            110             209
 WebRtc_Word16   pw16_corr             50              0               49

 Total: 110+2*50
 */

#define SCRATCH_PW16_DS_SPEECH 0
#define SCRATCH_PW32_CORR PREEMPTIVE_DOWNSAMPLED_LEN
#define SCRATCH_PW16_CORR 0

/****************************************************************************
 * WebRtcNetEQ_PreEmptiveExpand(...)
 *
 * This function tries to extend the audio data by repeating one or several
 * pitch periods. The operation is only carried out if the correlation is
 * strong or if the signal energy is very low. The algorithm is the
 * reciprocal of the Accelerate algorithm.
 *
 * Input:
 *      - inst          : NetEQ DSP instance
 *      - scratchPtr    : Pointer to scratch vector.
 *      - decoded       : Pointer to newly decoded speech.
 *      - len           : Length of decoded speech.
 *      - oldDataLen    : Length of the part of decoded that has already been played out.
 *      - BGNonly       : If non-zero, Pre-emptive Expand will only copy
 *                        the first DEFAULT_TIME_ADJUST milliseconds of the
 *                        input and append to the end. No signal matching is
 *                        done.
 *
 * Output:
 *      - inst          : Updated instance
 *      - outData       : Pointer to a memory space where the output data
 *                        should be stored. The vector must be at least
 *                        min(len + 120*fs/8000, NETEQ_MAX_OUTPUT_SIZE)
 *                        elements long.
 *      - pw16_len      : Number of samples written to outData.
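 *                        (len plus the number of inserted samples when the
 *                        expansion is performed; len unchanged otherwise.)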
 *
 * Return value          :  0 - Ok
 *                         <0 - Error
 */

int WebRtcNetEQ_PreEmptiveExpand(DSPInst_t *inst,
#ifdef SCRATCH
                                 WebRtc_Word16 *pw16_scratchPtr,
#endif
                                 const WebRtc_Word16 *pw16_decoded, int len, int oldDataLen,
                                 WebRtc_Word16 *pw16_outData, WebRtc_Word16 *pw16_len,
                                 WebRtc_Word16 BGNonly)
{

#ifdef SCRATCH
    /* Use scratch memory for internal temporary vectors */
    WebRtc_Word16 *pw16_downSampSpeech = pw16_scratchPtr + SCRATCH_PW16_DS_SPEECH;
    WebRtc_Word32 *pw32_corr = (WebRtc_Word32*) (pw16_scratchPtr + SCRATCH_PW32_CORR);
    WebRtc_Word16 *pw16_corr = pw16_scratchPtr + SCRATCH_PW16_CORR;
#else
    /* Allocate memory for temporary vectors */
    WebRtc_Word16 pw16_downSampSpeech[PREEMPTIVE_DOWNSAMPLED_LEN];
    WebRtc_Word32 pw32_corr[PREEMPTIVE_CORR_LEN];
    WebRtc_Word16 pw16_corr[PREEMPTIVE_CORR_LEN];
#endif
    WebRtc_Word16 w16_decodedMax = 0;
    WebRtc_Word16 w16_tmp;
    WebRtc_Word16 w16_tmp2;
    WebRtc_Word32 w32_tmp;
    WebRtc_Word32 w32_tmp2;

    const WebRtc_Word16 w16_startLag = PREEMPTIVE_MIN_LAG;
    const WebRtc_Word16 w16_endLag = PREEMPTIVE_MAX_LAG;
    const WebRtc_Word16 w16_corrLen = PREEMPTIVE_CORR_LEN;
    const WebRtc_Word16 *pw16_vec1, *pw16_vec2;
    WebRtc_Word16 *pw16_vectmp;
    WebRtc_Word16 w16_inc, w16_startfact;
    WebRtc_Word16 w16_bestIndex, w16_bestVal;
    WebRtc_Word16 w16_VAD = 1;
    WebRtc_Word16 fsMult;
    WebRtc_Word16 fsMult120;
    WebRtc_Word32 w32_en1, w32_en2, w32_cc;
    WebRtc_Word16 w16_en1, w16_en2;
    WebRtc_Word16 w16_en1Scale, w16_en2Scale;
    WebRtc_Word16 w16_sqrtEn1En2;
    WebRtc_Word16 w16_bestCorr = 0;
    int ok;

#ifdef NETEQ_STEREO
    MasterSlaveInfo *msInfo = inst->msInfo;
#endif

    fsMult = WebRtcNetEQ_CalcFsMult(inst->fs); /* Calculate fs/8000 */

    /* Pre-calculate common multiplication with fsMult */
    fsMult120 = (WebRtc_Word16) WEBRTC_SPL_MUL_16_16(fsMult, 120); /* 15 ms */

    inst->ExpandInst.w16_consecExp = 0; /* Reset consecutive Expand counter; last operation was not Expand */

    /*
     * Sanity check for len variable; must be (almost) 30 ms (120*fsMult + max(bestIndex)).
     * Also, the new part must be at least .625 ms (w16_overlap).
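     * That is, len must be at least (120 + 119)*fsMult samples, and oldDataLen
     * must leave more than w16_overlap samples of new data.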
     */
    if (len < (WebRtc_Word16) WEBRTC_SPL_MUL_16_16((120 + 119), fsMult)
        || oldDataLen >= len - inst->ExpandInst.w16_overlap)
    {
        /* Length of decoded data too short */
        inst->w16_mode = MODE_UNSUCCESS_PREEMPTIVE;
        *pw16_len = len;

        /* simply move all data from decoded to outData */
        WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);

        return NETEQ_OTHER_ERROR;
    }

    /***********************************/
    /* Special operations for BGN only */
    /***********************************/

    /* Check if "background noise only" flag is set */
    if (BGNonly)
    {
        /* special operation for BGN only; simply insert a chunk of data */
        w16_bestIndex = DEFAULT_TIME_ADJUST * (fsMult << 3); /* X*fs/1000 */

        /* Sanity check for bestIndex */
        if (w16_bestIndex > len)
        {
            /* not good, do nothing instead */
            inst->w16_mode = MODE_UNSUCCESS_PREEMPTIVE;
            *pw16_len = len;

            /* simply move all data from decoded to outData */
            WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);

            return NETEQ_OTHER_ERROR;
        }

        /* set length parameter */
        *pw16_len = len + w16_bestIndex;

        /* copy to output */
        WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, len);
        WEBRTC_SPL_MEMCPY_W16(&pw16_outData[len], pw16_decoded, w16_bestIndex);

        /* set mode */
        inst->w16_mode = MODE_LOWEN_PREEMPTIVE;

        /* update statistics */
        inst->statInst.preemptiveLength += w16_bestIndex;

        return 0;
    } /* end of special code for BGN mode */

#ifdef NETEQ_STEREO

    /* Sanity for msInfo */
    if (msInfo == NULL)
    {
        /* this should not happen here */
        return MASTER_SLAVE_ERROR;
    }

    if ((msInfo->msMode == NETEQ_MASTER) || (msInfo->msMode == NETEQ_MONO))
    {
        /* Find correlation lag only for non-slave instances */

#endif

        /****************************************************************/
        /* Find the strongest correlation lag by downsampling to 4 kHz, */
        /* calculating correlation for downsampled signal and finding   */
        /* the strongest correlation peak.                              */
        /****************************************************************/

        /* find maximum absolute value */
        w16_decodedMax = WebRtcSpl_MaxAbsValueW16(pw16_decoded, (WebRtc_Word16) len);

        /* downsample the decoded speech to 4 kHz */
        ok = WebRtcNetEQ_DownSampleTo4kHz(pw16_decoded, len, inst->fs, pw16_downSampSpeech,
            PREEMPTIVE_DOWNSAMPLED_LEN, 1 /* compensate delay*/);
        if (ok != 0)
        {
            /* error */
            inst->w16_mode = MODE_UNSUCCESS_PREEMPTIVE;
            *pw16_len = len;

            /* simply move all data from decoded to outData */
            WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);

            return NETEQ_OTHER_ERROR;
        }

        /*
         * Set scaling factor for cross correlation to protect against
         * overflow (log2(50) => 6)
         */
        w16_tmp = 6 - WebRtcSpl_NormW32(WEBRTC_SPL_MUL_16_16(w16_decodedMax, w16_decodedMax));
        w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);

        /* Perform correlation from lag 10 to lag 60 in 4 kHz domain */
        WebRtcNetEQ_CrossCorr(pw32_corr, &pw16_downSampSpeech[w16_endLag],
            &pw16_downSampSpeech[w16_endLag - w16_startLag], w16_corrLen,
            (WebRtc_Word16) (w16_endLag - w16_startLag), w16_tmp, -1);

        /* Normalize correlation to 14 bits and put in a WebRtc_Word16 vector */
        w32_tmp = WebRtcSpl_MaxAbsValueW32(pw32_corr, w16_corrLen);
        w16_tmp = 17 - WebRtcSpl_NormW32(w32_tmp);
        w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);

        WebRtcSpl_VectorBitShiftW32ToW16(pw16_corr, w16_corrLen, pw32_corr, w16_tmp);

        /* Find limits for peak finding, to avoid overfilling the NetEQ algorithm buffer. */
        /*
         * Calculate difference between MAX_OUTPUT_SIZE and len in 4 kHz domain.
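         * One sample in the 4 kHz domain corresponds to 2*fsMult samples at the
         * input sampling rate, hence the division by (fsMult << 1) below.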
         */
        w16_tmp = WebRtcSpl_DivW32W16ResW16((WebRtc_Word32) (NETEQ_MAX_OUTPUT_SIZE - len),
            (WebRtc_Word16) (fsMult << 1)) - w16_startLag;
        w16_tmp = WEBRTC_SPL_MIN(w16_corrLen, w16_tmp); /* no more than corrLen = 50 */

#ifdef NETEQ_STEREO
    } /* end if (msInfo->msMode != NETEQ_SLAVE) */

    if ((msInfo->msMode == NETEQ_MASTER) || (msInfo->msMode == NETEQ_MONO))
    {
        /* Find the strongest correlation peak by using the parabolic fit method */
        WebRtcNetEQ_PeakDetection(pw16_corr, w16_tmp, 1, fsMult, &w16_bestIndex, &w16_bestVal);
        /* 0 <= bestIndex <= (2*w16_tmp - 1)*fsMult <= (2*corrLen - 1)*fsMult = 99*fsMult */

        /* Compensate bestIndex for displaced starting position */
        w16_bestIndex = w16_bestIndex + w16_startLag * WEBRTC_SPL_LSHIFT_W16(fsMult, 1);
        /* 20*fsMult <= bestIndex <= 119*fsMult */

        msInfo->bestIndex = w16_bestIndex;
    }
    else if (msInfo->msMode == NETEQ_SLAVE)
    {
        if (msInfo->extraInfo == PE_EXP_FAIL)
        {
            /* Master has signaled an unsuccessful preemptive expand */
            w16_bestIndex = 0;
        }
        else
        {
            /* Get best index from master */
            w16_bestIndex = msInfo->bestIndex;
        }
    }
    else
    {
        /* Invalid mode */
        return (MASTER_SLAVE_ERROR);
    }

#else /* NETEQ_STEREO */

    /* Find the strongest correlation peak by using the parabolic fit method */
    WebRtcNetEQ_PeakDetection(pw16_corr, w16_tmp, 1, fsMult, &w16_bestIndex, &w16_bestVal);
    /* 0 <= bestIndex <= (2*w16_tmp - 1)*fsMult <= (2*corrLen - 1)*fsMult = 99*fsMult */

    /* Compensate bestIndex for displaced starting position */
    w16_bestIndex = w16_bestIndex + w16_startLag * WEBRTC_SPL_LSHIFT_W16(fsMult, 1);
    /* 20*fsMult <= bestIndex <= 119*fsMult */

#endif /* NETEQ_STEREO */

#ifdef NETEQ_STEREO

    if ((msInfo->msMode == NETEQ_MASTER) || (msInfo->msMode == NETEQ_MONO))
    {
        /* Calculate correlation only for non-slave instances */

#endif /* NETEQ_STEREO */

        /*****************************************************/
        /* Calculate correlation bestCorr for the found lag. */
        /* Also do a simple VAD decision.                    */
        /*****************************************************/
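
        /*
         * Scaling rationale for the energy calculations below: the sum of squares
         * over one pitch period is at most bestIndex * decodedMax^2, so the shift
         * w16_tmp is chosen as the number of bits by which that bound exceeds what
         * a signed 32-bit accumulator can hold.
         */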

        /*
         * Calculate scaling to ensure that bestIndex samples can be square-summed
         * without overflowing
         */
        w16_tmp = (31
            - WebRtcSpl_NormW32(WEBRTC_SPL_MUL_16_16(w16_decodedMax, w16_decodedMax)));
        w16_tmp += (31 - WebRtcSpl_NormW32(w16_bestIndex));
        w16_tmp -= 31;
        w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);

        /* vec1 starts at 15 ms minus one pitch period */
        pw16_vec1 = &pw16_decoded[fsMult120 - w16_bestIndex];
        /* vec2 starts at 15 ms */
        pw16_vec2 = &pw16_decoded[fsMult120];

        /* Calculate energies for vec1 and vec2 */
        w32_en1 = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec1,
            (WebRtc_Word16*) pw16_vec1, w16_bestIndex, w16_tmp);
        w32_en2 = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec2,
            (WebRtc_Word16*) pw16_vec2, w16_bestIndex, w16_tmp);

        /* Calculate cross-correlation at the found lag */
        w32_cc = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec1, (WebRtc_Word16*) pw16_vec2,
            w16_bestIndex, w16_tmp);

        /* Check VAD constraint ((en1+en2)/(2*bestIndex)) <= 8*inst->BGNInst.energy */
        w32_tmp = WEBRTC_SPL_RSHIFT_W32(w32_en1 + w32_en2, 4); /* (en1+en2)/(2*8) */
        if (inst->BGNInst.w16_initialized == 1)
        {
            w32_tmp2 = inst->BGNInst.w32_energy;
        }
        else
        {
            /* if BGN parameters have not been estimated, use a fixed threshold */
            w32_tmp2 = 75000;
        }
        w16_tmp2 = 16 - WebRtcSpl_NormW32(w32_tmp2);
        w16_tmp2 = WEBRTC_SPL_MAX(0, w16_tmp2);
        w32_tmp = WEBRTC_SPL_RSHIFT_W32(w32_tmp, w16_tmp2);
        w16_tmp2 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_tmp2, w16_tmp2);
        w32_tmp2 = WEBRTC_SPL_MUL_16_16(w16_bestIndex, w16_tmp2);

        /* Scale w32_tmp properly before comparing with w32_tmp2 */
        /* (w16_tmp is scaling before energy calculation, thus 2*w16_tmp) */
        if (WebRtcSpl_NormW32(w32_tmp) < WEBRTC_SPL_LSHIFT_W32(w16_tmp, 1))
        {
            /* Cannot scale only w32_tmp, must scale w32_tmp2 too */
            WebRtc_Word16 tempshift = WebRtcSpl_NormW32(w32_tmp);
            w32_tmp = WEBRTC_SPL_LSHIFT_W32(w32_tmp, tempshift);
            w32_tmp2 = WEBRTC_SPL_RSHIFT_W32(w32_tmp2,
                WEBRTC_SPL_LSHIFT_W32(w16_tmp, 1) - tempshift);
        }
        else
        {
            w32_tmp = WEBRTC_SPL_LSHIFT_W32(w32_tmp, WEBRTC_SPL_LSHIFT_W32(w16_tmp, 1));
        }

        if (w32_tmp <= w32_tmp2) /*((en1+en2)/(2*bestIndex)) <= 8*inst->BGNInst.energy */
        {
            /* The signal seems to be passive speech */
            w16_VAD = 0;
            w16_bestCorr = 0; /* Correlation does not matter */

            /*
             * For low energy expansion, the new data can be less than 15 ms,
             * but we must ensure that bestIndex is not larger than the new data.
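             * The new data is the last (len - oldDataLen) samples of pw16_decoded.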
             */
            w16_bestIndex = WEBRTC_SPL_MIN(w16_bestIndex, len - oldDataLen);
        }
        else
        {
            /* The signal is active speech */
            w16_VAD = 1;

            /* Calculate correlation (cc/sqrt(en1*en2)) */

            /* Start with calculating scale values */
            w16_en1Scale = 16 - WebRtcSpl_NormW32(w32_en1);
            w16_en1Scale = WEBRTC_SPL_MAX(0, w16_en1Scale);
            w16_en2Scale = 16 - WebRtcSpl_NormW32(w32_en2);
            w16_en2Scale = WEBRTC_SPL_MAX(0, w16_en2Scale);

            /* Make sure total scaling is even (to simplify scale factor after sqrt) */
            if ((w16_en1Scale + w16_en2Scale) & 1)
            {
                w16_en1Scale += 1;
            }

            /* Convert energies to WebRtc_Word16 */
            w16_en1 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_en1, w16_en1Scale);
            w16_en2 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_en2, w16_en2Scale);

            /* Calculate energy product */
            w32_tmp = WEBRTC_SPL_MUL_16_16(w16_en1, w16_en2);

            /* Calculate square-root of energy product */
            w16_sqrtEn1En2 = (WebRtc_Word16) WebRtcSpl_Sqrt(w32_tmp);

            /* Calculate cc/sqrt(en1*en2) in Q14 */
            w16_tmp = 14 - ((w16_en1Scale + w16_en2Scale) >> 1);
            w32_cc = WEBRTC_SPL_SHIFT_W32(w32_cc, w16_tmp);
            w32_cc = WEBRTC_SPL_MAX(0, w32_cc); /* Don't divide by a negative number */
            w16_bestCorr = (WebRtc_Word16) WebRtcSpl_DivW32W16(w32_cc, w16_sqrtEn1En2);
            w16_bestCorr = WEBRTC_SPL_MIN(16384, w16_bestCorr); /* set maximum to 1.0 */
        }

#ifdef NETEQ_STEREO

    } /* end if (msInfo->msMode != NETEQ_SLAVE) */

#endif /* NETEQ_STEREO */

    /*******************************************************/
    /* Check preemptive expand criteria and insert samples */
    /*******************************************************/

    /* Check for strong correlation (>0.9) and at least 15 ms new data, or passive speech */
#ifdef NETEQ_STEREO
    if (((((w16_bestCorr > 14746) && (oldDataLen <= fsMult120)) || (w16_VAD == 0))
        && (msInfo->msMode != NETEQ_SLAVE))
        || ((msInfo->msMode == NETEQ_SLAVE) && (msInfo->extraInfo != PE_EXP_FAIL)))
#else
    if (((w16_bestCorr > 14746) && (oldDataLen <= fsMult120)) || (w16_VAD == 0))
#endif
    {
        /* Do expand operation by overlap add */

        /* Set length of the first part, not to be modified */
        WebRtc_Word16 w16_startIndex = WEBRTC_SPL_MAX(oldDataLen, fsMult120);

        /*
         * Calculate cross-fading slope so that the fading factor goes from
         * 1 (16384 in Q14) to 0 in one pitch period (bestIndex).
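         * The mixing function below blends vec2 (the signal at 15 ms) with the
         * copy one pitch period earlier (vec1) over bestIndex samples, so the
         * inserted period joins smoothly with the unmodified parts on both sides.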
         */
        w16_inc = (WebRtc_Word16) WebRtcSpl_DivW32W16((WebRtc_Word32) 16384,
            (WebRtc_Word16) (w16_bestIndex + 1)); /* in Q14 */

        /* Initiate fading factor */
        w16_startfact = 16384 - w16_inc;

        /* vec1 starts at 15 ms minus one pitch period */
        pw16_vec1 = &pw16_decoded[w16_startIndex - w16_bestIndex];
        /* vec2 starts at 15 ms */
        pw16_vec2 = &pw16_decoded[w16_startIndex];

        /* Copy unmodified part [0 to 15 ms] */
        WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, w16_startIndex);

        /* Generate interpolated part of length bestIndex (1 pitch period) */
        pw16_vectmp = pw16_outData + w16_startIndex;
        /* Reuse mixing function from Expand */
        WebRtcNetEQ_MixVoiceUnvoice(pw16_vectmp, (WebRtc_Word16*) pw16_vec2,
            (WebRtc_Word16*) pw16_vec1, &w16_startfact, w16_inc, w16_bestIndex);

        /* Move the last part (also unmodified) */
        /* Take from decoded at 15 ms */
        pw16_vec2 = &pw16_decoded[w16_startIndex];
        WEBRTC_SPL_MEMMOVE_W16(&pw16_outData[w16_startIndex + w16_bestIndex], pw16_vec2,
            (WebRtc_Word16) (len - w16_startIndex));

        /* Set the mode flag */
        if (w16_VAD)
        {
            inst->w16_mode = MODE_SUCCESS_PREEMPTIVE;
        }
        else
        {
            inst->w16_mode = MODE_LOWEN_PREEMPTIVE;
        }

        /* Calculate resulting length = original length + pitch period */
        *pw16_len = len + w16_bestIndex;

        /* Update in-call statistics */
        inst->statInst.preemptiveLength += w16_bestIndex;

        return 0;
    }
    else
    {
        /* Preemptive Expand not allowed */

#ifdef NETEQ_STEREO
        /* Signal to slave(s) that this was unsuccessful */
        if (msInfo->msMode == NETEQ_MASTER)
        {
            msInfo->extraInfo = PE_EXP_FAIL;
        }
#endif

        /* Set mode flag to unsuccessful preemptive expand */
        inst->w16_mode = MODE_UNSUCCESS_PREEMPTIVE;

        /* Length is unmodified */
        *pw16_len = len;

        /* Simply move all data from decoded to outData */
        WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);

        return 0;
    }
}

#undef SCRATCH_PW16_DS_SPEECH
#undef SCRATCH_PW32_CORR
#undef SCRATCH_PW16_CORR
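
/*
 * Illustrative call sketch (not compiled; for orientation only). The DSP
 * instance, buffer names, and lengths below are placeholders; the real call
 * site is elsewhere in the NetEQ DSP code, and the scratch pointer argument
 * is present only when SCRATCH is defined.
 *
 *   WebRtc_Word16 out[NETEQ_MAX_OUTPUT_SIZE];
 *   WebRtc_Word16 outLen = 0;
 *   int ret = WebRtcNetEQ_PreEmptiveExpand(&dspInst,
 *                                          decodedSpeech,  (newly decoded samples)
 *                                          decodedLen,     (roughly 30 ms of data)
 *                                          playedOutLen,   (already played-out part)
 *                                          out, &outLen,
 *                                          0);             (BGNonly disabled)
 *
 *   On success (ret == 0), outLen equals decodedLen plus the inserted samples,
 *   and dspInst.w16_mode tells which branch was taken.
 */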