Neon optimization for an NS function.

Review URL: http://webrtc-codereview.appspot.com/89017

git-svn-id: http://webrtc.googlecode.com/svn/trunk@334 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@google.com 2011-08-09 04:30:06 +00:00
parent 14acdbc14d
commit 579ee4d3f1
5 changed files with 260 additions and 12 deletions

View File

@ -28,12 +28,14 @@ MY_WEBRTC_COMMON_DEFS += \
ifeq ($(ARCH_ARM_HAVE_NEON),true)
MY_WEBRTC_COMMON_DEFS += \
'-DWEBRTC_ARCH_ARM_NEON'
CFLAGS_NEON = -flax-vector-conversions
endif
ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
MY_WEBRTC_COMMON_DEFS += \
'-DWEBRTC_ARCH_ARM_V7A'
endif
else ifeq ($(TARGET_ARCH),x86)
MY_WEBRTC_COMMON_DEFS += \
'-DWEBRTC_USE_SSE2'

View File

@ -18,7 +18,7 @@ LOCAL_MODULE_TAGS := optional
LOCAL_GENERATED_SOURCES :=
LOCAL_SRC_FILES := \
noise_suppression_x.c \
nsx_core.c
nsx_core.c
# floating point
# noise_suppression.c ns_core.c
@ -27,6 +27,11 @@ LOCAL_SRC_FILES := \
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS)
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += nsx_core_neon.c
LOCAL_CFLAGS += $(CFLAGS_NEON)
endif
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/../interface \
$(LOCAL_PATH)/../../../utility \

View File

@ -25,9 +25,9 @@ static const WebRtc_Word16 kRoundTable[16] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 25
2048, 4096, 8192, 16384};
// Constants to compensate for shifting signal log(2^shifts).
static const WebRtc_Word16 kLogTable[9] = {0, 177, 355, 532, 710, 887, 1065, 1242, 1420};
const WebRtc_Word16 WebRtcNsx_kLogTable[9] = {0, 177, 355, 532, 710, 887, 1065, 1242, 1420};
static const WebRtc_Word16 kCounterDiv[201] = {32767, 16384, 10923, 8192, 6554, 5461, 4681,
const WebRtc_Word16 WebRtcNsx_kCounterDiv[201] = {32767, 16384, 10923, 8192, 6554, 5461, 4681,
4096, 3641, 3277, 2979, 2731, 2521, 2341, 2185, 2048, 1928, 1820, 1725, 1638, 1560,
1489, 1425, 1365, 1311, 1260, 1214, 1170, 1130, 1092, 1057, 1024, 993, 964, 936, 910,
886, 862, 840, 819, 799, 780, 762, 745, 728, 712, 697, 683, 669, 655, 643, 630, 618,
@ -41,7 +41,7 @@ static const WebRtc_Word16 kCounterDiv[201] = {32767, 16384, 10923, 8192, 6554,
189, 188, 187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174, 173,
172, 172, 171, 170, 169, 168, 167, 166, 165, 165, 164, 163};
static const WebRtc_Word16 kLogTableFrac[256] = {
const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256] = {
0, 1, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 21,
22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38, 40, 41, 42,
44, 45, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 59, 60, 61, 62,
@ -668,6 +668,7 @@ int WebRtcNsx_set_policy_core(NsxInst_t *inst, int mode)
return 0;
}
#if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
void WebRtcNsx_NoiseEstimation(NsxInst_t *inst, WebRtc_UWord16 *magn, WebRtc_UWord32 *noise,
WebRtc_Word16 *qNoise)
{
@ -685,10 +686,10 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t *inst, WebRtc_UWord16 *magn, WebRtc_UWo
tabind = inst->stages - inst->normData;
if (tabind < 0)
{
logval = -kLogTable[-tabind];
logval = -WebRtcNsx_kLogTable[-tabind];
} else
{
logval = kLogTable[tabind];
logval = WebRtcNsx_kLogTable[tabind];
}
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
@ -702,7 +703,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t *inst, WebRtc_UWord16 *magn, WebRtc_UWo
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros) & 0x7FFFFFFF) >> 23);
// log2(magn(i))
log2 = (WebRtc_Word16)(((31 - zeros) << 8) + kLogTableFrac[frac]);
log2 = (WebRtc_Word16)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]);
// log2(magn(i))*log(2)
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2Const, 15);
// + log(2^stages)
@ -720,7 +721,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t *inst, WebRtc_UWord16 *magn, WebRtc_UWo
// Get counter values from state
counter = inst->noiseEstCounter[s];
countDiv = kCounterDiv[counter];
countDiv = WebRtcNsx_kCounterDiv[counter];
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
// quant_est(...)
@ -790,6 +791,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t *inst, WebRtc_UWord16 *magn, WebRtc_UWo
}
(*qNoise) = (WebRtc_Word16)inst->qNoise;
}
#endif // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
// Extract thresholds for feature parameters
// histograms are computed over some window_size (given by window_pars)
@ -1045,7 +1047,8 @@ void WebRtcNsx_ComputeSpectralFlatness(NsxInst_t *inst, WebRtc_UWord16 *magn)
frac = (WebRtc_Word16)(((WebRtc_UWord32)((WebRtc_UWord32)(magn[i]) << zeros)
& 0x7FFFFFFF) >> 23);
// log2(magn(i))
tmpU32 = (WebRtc_UWord32)(((31 - zeros) << 8) + kLogTableFrac[frac]); // Q8
tmpU32 = (WebRtc_UWord32)(((31 - zeros) << 8)
+ WebRtcNsx_kLogTableFrac[frac]); // Q8
avgSpectralFlatnessNum += tmpU32; // Q8
} else
{
@ -1059,7 +1062,7 @@ void WebRtcNsx_ComputeSpectralFlatness(NsxInst_t *inst, WebRtc_UWord16 *magn)
zeros = WebRtcSpl_NormU32(avgSpectralFlatnessDen);
frac = (WebRtc_Word16)(((avgSpectralFlatnessDen << zeros) & 0x7FFFFFFF) >> 23);
// log2(avgSpectralFlatnessDen)
tmp32 = (WebRtc_Word32)(((31 - zeros) << 8) + kLogTableFrac[frac]); // Q8
tmp32 = (WebRtc_Word32)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]); // Q8
logCurSpectralFlatness = (WebRtc_Word32)avgSpectralFlatnessNum;
logCurSpectralFlatness += ((WebRtc_Word32)(inst->stages - 1) << (inst->stages + 7)); // Q(8+stages-1)
logCurSpectralFlatness -= (tmp32 << (inst->stages - 1));
@ -1551,7 +1554,7 @@ void WebRtcNsx_DataAnalysis(NsxInst_t *inst, short *speechFrame, WebRtc_UWord16
frac = (WebRtc_Word16)((((WebRtc_UWord32)magnU16[inst->anaLen2] << zeros) &
0x7FFFFFFF) >> 23); // Q8
// log2(magnU16(i)) in Q8
log2 = (WebRtc_Word16)(((31 - zeros) << 8) + kLogTableFrac[frac]);
log2 = (WebRtc_Word16)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]);
}
sum_log_magn = (WebRtc_Word32)log2; // Q8
@ -1594,7 +1597,8 @@ void WebRtcNsx_DataAnalysis(NsxInst_t *inst, short *speechFrame, WebRtc_UWord16
frac = (WebRtc_Word16)((((WebRtc_UWord32)magnU16[i] << zeros) &
0x7FFFFFFF) >> 23);
// log2(magnU16(i)) in Q8
log2 = (WebRtc_Word16)(((31 - zeros) << 8) + kLogTableFrac[frac]);
log2 = (WebRtc_Word16)(((31 - zeros) << 8)
+ WebRtcNsx_kLogTableFrac[frac]);
}
sum_log_magn += (WebRtc_Word32)log2; // Q8
// sum_log_i_log_magn in Q17

View File

@ -162,6 +162,18 @@ int WebRtcNsx_set_policy_core(NsxInst_t *inst, int mode);
int WebRtcNsx_ProcessCore(NsxInst_t *inst, short *inFrameLow, short *inFrameHigh,
short *outFrameLow, short *outFrameHigh);
/****************************************************************************
* Internal functions and variable declarations shared with optimized code.
*/
void WebRtcNsx_UpdateNoiseEstimate(NsxInst_t *inst, int offset);
void WebRtcNsx_NoiseEstimation(NsxInst_t *inst, WebRtc_UWord16 *magn, WebRtc_UWord32 *noise,
WebRtc_Word16 *qNoise);
extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];
extern const WebRtc_Word16 WebRtcNsx_kCounterDiv[201];
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,225 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#if defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
#include "nsx_core.h"
#include <arm_neon.h>
void WebRtcNsx_NoiseEstimation(NsxInst_t *inst, WebRtc_UWord16 *magn, WebRtc_UWord32 *noise,
WebRtc_Word16 *qNoise)
{
WebRtc_Word32 numerator;
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv, countProd, delta, zeros, frac;
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
WebRtc_Word16 log2Const = 22713;
WebRtc_Word16 widthFactor = 21845;
int i, s, offset;
numerator = FACTOR_Q16;
tabind = inst->stages - inst->normData;
if (tabind < 0)
{
logval = -WebRtcNsx_kLogTable[-tabind];
} else
{
logval = WebRtcNsx_kLogTable[tabind];
}
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
// magn is in Q(-stages), and the real lmagn values are:
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
// lmagn in Q8
for (i = 0; i < inst->magnLen; i++)
{
if (magn[i])
{
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros) & 0x7FFFFFFF) >> 23);
// log2(magn(i))
log2 = (WebRtc_Word16)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]);
// log2(magn(i))*log(2)
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2Const, 15);
// + log(2^stages)
lmagn[i] += logval;
} else
{
lmagn[i] = logval;
}
}
int16x4_t Q3_16x4 = vdup_n_s16(3);
int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(widthFactor);
// Loop over simultaneous estimates
for (s = 0; s < SIMULT; s++)
{
offset = s * inst->magnLen;
// Get counter values from state
counter = inst->noiseEstCounter[s];
countDiv = WebRtcNsx_kCounterDiv[counter];
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
// quant_est(...)
WebRtc_Word16 delta_[8];
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int16x4_t countDiv_16x4 = vdup_n_s16(countDiv);
int16x8_t countProd_16x8 = vdupq_n_s16(countProd);
int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv);
int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0);
int16x8_t tmp16x8_1;
int16x8_t tmp16x8_2;
int16x8_t tmp16x8_3;
int16x8_t tmp16x8_4;
int16x8_t tmp16x8_5;
int32x4_t tmp32x4;
for (i = 0; i < inst->magnLen - 7; i += 8) {
// compute delta
tmp16x8_0 = vdupq_n_s16(FACTOR_Q7);
vst1q_s16(delta_, tmp16x8_0);
int j;
for (j = 0; j < 8; j++) {
if (inst->noiseEstDensity[offset + i + j] > 512)
delta_[j] = WebRtcSpl_DivW32W16ResW16(numerator,
inst->noiseEstDensity[offset + i + j]);
}
// Update log quantile estimate
// tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
tmp32x4 = vmull_s16(vld1_s16(&delta_[0]), countDiv_16x4);
tmp16x4_1 = vshrn_n_s32(tmp32x4, 14);
tmp32x4 = vmull_s16(vld1_s16(&delta_[4]), countDiv_16x4);
tmp16x4_0 = vshrn_n_s32(tmp32x4, 14);
tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines.
// prepare for the "if" branch
// tmp16 += 2;
// tmp16_1 = (Word16)(tmp16>>2);
tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2);
// inst->noiseEstLogQuantile[offset+i] + tmp16_1;
tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep
tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines
// Prepare for the "else" branch
// tmp16 += 1;
// tmp16_1 = (Word16)(tmp16>>1);
tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1);
// tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4);
tmp16x4_1 = vshrn_n_s32(tmp32x4, 1);
// tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4);
tmp16x4_0 = vshrn_n_s32(tmp32x4, 1);
// inst->noiseEstLogQuantile[offset + i] - tmp16_2;
tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep
tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0);
// Do the if-else branches:
tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines
tmp16x8_5 = vsubq_s16(tmp16x8_3, tmp16x8_2);
__asm__("vcgt.s16 %q0, %q1, #0"::"w"(tmp16x8_4), "w"(tmp16x8_5));
__asm__("vbit %q0, %q1, %q2"::"w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4));
__asm__("vbif %q0, %q1, %q2"::"w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4));
vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2);
// Update density estimate
// tmp16_1 + tmp16_2
tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]);
tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8);
tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8);
// lmagn[i] - inst->noiseEstLogQuantile[offset + i]
tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2);
tmp16x8_3 = vabsq_s16(tmp16x8_3);
tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3);
__asm__("vbit %q0, %q1, %q2"::"w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4));
vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1);
} // End loop over magnitude spectrum
for (; i < inst->magnLen; i++)
{
// compute delta
if (inst->noiseEstDensity[offset + i] > 512)
{
delta = WebRtcSpl_DivW32W16ResW16(numerator,
inst->noiseEstDensity[offset + i]);
} else
{
delta = FACTOR_Q7;
}
// update log quantile estimate
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i])
{
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
// CounterDiv=1/inst->counter[s] in Q15
tmp16 += 2;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
} else
{
tmp16 += 1;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
}
// update density estimate
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
< WIDTH_Q8)
{
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->noiseEstDensity[offset + i], countProd, 15);
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(widthFactor,
countDiv, 15);
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
}
} // end loop over magnitude spectrum
if (counter >= END_STARTUP_LONG)
{
inst->noiseEstCounter[s] = 0;
if (inst->blockIndex >= END_STARTUP_LONG)
{
WebRtcNsx_UpdateNoiseEstimate(inst, offset);
}
}
inst->noiseEstCounter[s]++;
} // end loop over simultaneous estimates
// Sequentially update the noise during startup
if (inst->blockIndex < END_STARTUP_LONG)
{
WebRtcNsx_UpdateNoiseEstimate(inst, offset);
}
for (i = 0; i < inst->magnLen; i++)
{
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
}
(*qNoise) = (WebRtc_Word16)inst->qNoise;
}
#endif // defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)