For Android ARMv7 platforms, added run-time detection of Neon support;
when Neon is present, the code switches to Neon-optimized functions at run time.
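
In outline, the patch routes each hot routine through a function pointer that
defaults to the generic C implementation and is repointed when Neon is
detected. A condensed sketch of the pattern, using names from the patch itself:

// Condensed sketch of the run-time dispatch added by this patch (names
// are from the patch; surrounding details omitted).
WindowAndFFT WebRtcAecm_WindowAndFFT;         // function pointer, set at init

int WebRtcAecm_InitCore(AecmCore_t* const aecm, int samplingFreq)
{
    WebRtcAecm_WindowAndFFT = WindowAndFFTC;  // generic C default
#ifdef WEBRTC_DETECT_ARM_NEON
    // Built with run-time detection: query the CPU, then swap in Neon.
    if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0)
    {
        WebRtcAecm_InitNeon();                // repoints to the Neon versions
    }
#elif defined(WEBRTC_ARCH_ARM_NEON)
    WebRtcAecm_InitNeon();                    // Neon guaranteed at build time
#endif
    // ... remaining initialization ...
    return 0;
}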
Review URL: http://webrtc-codereview.appspot.com/268002

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1096 4adac7df-926f-26a2-2b94-8c16560cd09d
kma@webrtc.org 2011-12-03 18:34:50 +00:00
parent ae7017d588
commit b59c031660
13 changed files with 1274 additions and 796 deletions

View File

@@ -54,6 +54,7 @@ include $(MY_WEBRTC_ROOT_PATH)/libvpx.mk
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk
LOCAL_ARM_MODE := arm
LOCAL_MODULE := libwebrtc_audio_preprocessing
@@ -71,6 +72,17 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
libwebrtc_aecm \
libwebrtc_system_wrappers
# Add Neon libraries.
ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS)))
LOCAL_WHOLE_STATIC_LIBRARIES += \
libwebrtc_aecm_neon \
libwebrtc_ns_neon
else ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_WHOLE_STATIC_LIBRARIES += \
libwebrtc_aecm_neon \
libwebrtc_ns_neon
endif
LOCAL_STATIC_LIBRARIES := \
libprotobuf-cpp-2.3.0-lite

View File

@@ -21,8 +21,9 @@ MY_WEBRTC_COMMON_DEFS := \
# '-DWEBRTC_MODULE_UTILITY_VIDEO' [module media_file] [module utility]
ifeq ($(TARGET_ARCH),arm)
MY_WEBRTC_COMMON_DEFS += \
'-DWEBRTC_ARM_INLINE_CALLS' \
'-DWEBRTC_ARCH_ARM'
# '-DWEBRTC_DETECT_ARM_NEON' # only used in build configurations where Neon is not enabled at compile time
# TODO(kma): figure out if the above define could be moved to NDK build only.
# TODO(kma): test if the code under the next two macros works with generic GCC compilers
ifeq ($(ARCH_ARM_HAVE_NEON),true)

View File

@@ -6,6 +6,9 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#############################
# Build the non-Neon library.
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
@@ -21,21 +24,16 @@ LOCAL_SRC_FILES := \
aecm_core.c
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS)
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += \
aecm_core_neon.c
LOCAL_CFLAGS += \
$(MY_ARM_CFLAGS_NEON)
endif
LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS)
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/interface \
$(LOCAL_PATH)/../utility \
$(LOCAL_PATH)/../../.. \
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
$(LOCAL_PATH)/../../../common_audio/signal_processing/include \
$(LOCAL_PATH)/../../../system_wrappers/interface
LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers
LOCAL_SHARED_LIBRARIES := \
libcutils \
@@ -46,3 +44,31 @@ ifndef NDK_ROOT
include external/stlport/libstlport.mk
endif
include $(BUILD_STATIC_LIBRARY)
#########################
# Build the Neon library.
include $(CLEAR_VARS)
LOCAL_ARM_MODE := arm
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_MODULE := libwebrtc_aecm_neon
LOCAL_MODULE_TAGS := optional
LOCAL_SRC_FILES := aecm_core_neon.c
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS) \
-mfpu=neon \
-flax-vector-conversions
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/interface \
$(LOCAL_PATH)/../../.. \
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
ifndef NDK_ROOT
include external/stlport/libstlport.mk
endif
include $(BUILD_STATIC_LIBRARY)

View File

@@ -13,8 +13,9 @@
#include <assert.h>
#include <stdlib.h>
#include "echo_control_mobile.h"
#include "cpu_features_wrapper.h"
#include "delay_estimator_wrapper.h"
#include "echo_control_mobile.h"
#include "ring_buffer.h"
#include "typedefs.h"
@@ -263,6 +264,13 @@ static const uint16_t* AlignedFarend(AecmCore_t* self, int* far_q, int delay) {
HANDLE logFile = NULL;
#endif
// Declare function pointers.
CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
WindowAndFFT WebRtcAecm_WindowAndFFT;
InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
{
AecmCore_t *aecm = malloc(sizeof(AecmCore_t));
@@ -346,6 +354,194 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const WebRtc_Word16* echo_pat
aecm->mseChannelCount = 0;
}
static void WindowAndFFTC(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling)
{
int i, j;
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[i],
14);
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i + PART_LEN] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
// Take only the first PART_LEN2 samples
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
{
freq_signal[i].real = fft[j];
// The imaginary part has to switch sign
freq_signal[i].imag = - fft[j+1];
}
}
static void InverseFFTAndWindowC(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean)
{
int i, j, outCFFT;
WebRtc_Word32 tmp32no1;
// Synthesis
for (i = 1; i < PART_LEN; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[j] = efw[i].real;
// mirrored data, even
fft[PART_LEN4 - j] = efw[i].real;
fft[j + 1] = -efw[i].imag;
//mirrored data, odd
fft[PART_LEN4 - (j - 1)] = efw[i].imag;
}
fft[0] = efw[0].real;
fft[1] = -efw[0].imag;
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
// inverse FFT, result should be scaled with outCFFT
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
//take only the real values and scale with outCFFT
for (i = 0; i < PART_LEN2; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[i] = fft[j];
}
for (i = 0; i < PART_LEN; i++)
{
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
fft[i],
WebRtcAecm_kSqrtHanning[i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
outCFFT - aecm->dfaCleanQDomain);
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
tmp32no1 + aecm->outBuf[i],
WEBRTC_SPL_WORD16_MIN);
output[i] = fft[i];
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
fft[PART_LEN + i],
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
outCFFT - aecm->dfaCleanQDomain);
aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
WEBRTC_SPL_WORD16_MAX,
tmp32no1,
WEBRTC_SPL_WORD16_MIN);
}
#ifdef ARM_WINM_LOG_
// measure tick end
QueryPerformanceCounter((LARGE_INTEGER*)&end);
diff__ = ((end - start) * 1000) / (freq/1000);
milliseconds = (unsigned int)(diff__ & 0xffffffff);
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
#endif
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
if (nearendClean != NULL)
{
memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
}
}
static void CalcLinearEnergiesC(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
for (i = 0; i < PART_LEN1; i++)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
far_spectrum[i]);
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
}
}
static void StoreAdaptiveChannelC(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
// During startup we store the channel every block.
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
}
static void ResetAdaptiveChannelC(AecmCore_t* aecm)
{
int i;
// The stored channel has a significantly lower MSE than the adaptive one for
// two consecutive calculations. Reset the adaptive channel.
memcpy(aecm->channelAdapt16, aecm->channelStored,
sizeof(WebRtc_Word16) * PART_LEN1);
// Restore the W32 channel
for (i = 0; i < PART_LEN; i += 4)
{
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
}
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
}
// WebRtcAecm_InitCore(...)
//
// This function initializes the AECM instance created with WebRtcAecm_CreateCore(...)
@@ -463,6 +659,23 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
assert(PART_LEN % 16 == 0);
// Initialize function pointers.
WebRtcAecm_WindowAndFFT = WindowAndFFTC;
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC;
WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesC;
WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelC;
WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelC;
#ifdef WEBRTC_DETECT_ARM_NEON
uint64_t features = WebRtc_GetCPUFeaturesARM();
if ((features & kCPUFeatureNEON) != 0)
{
WebRtcAecm_InitNeon();
}
#elif defined(WEBRTC_ARCH_ARM_NEON)
WebRtcAecm_InitNeon();
#endif
return 0;
}
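
For context, a caller reaches the initialization above through the usual
create/init pair. A minimal sketch, assuming both calls return 0 on success
(as WebRtcAecm_InitCore does):

AecmCore_t* core = NULL;
if (WebRtcAecm_CreateCore(&core) == 0 &&
    WebRtcAecm_InitCore(core, 8000) == 0)
{
    // From here on, WebRtcAecm_WindowAndFFT and the other function pointers
    // refer to either the generic C or the Neon implementations, as chosen
    // by the detection logic above.
}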
@@ -1890,194 +2103,3 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far
aecm->farBufReadPos += readLen;
}
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling)
{
int i, j;
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[i],
14);
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i + PART_LEN] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
// Take only the first PART_LEN2 samples
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
{
freq_signal[i].real = fft[j];
// The imaginary part has to switch sign
freq_signal[i].imag = - fft[j+1];
}
}
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean)
{
int i, j, outCFFT;
WebRtc_Word32 tmp32no1;
// Synthesis
for (i = 1; i < PART_LEN; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[j] = efw[i].real;
// mirrored data, even
fft[PART_LEN4 - j] = efw[i].real;
fft[j + 1] = -efw[i].imag;
//mirrored data, odd
fft[PART_LEN4 - (j - 1)] = efw[i].imag;
}
fft[0] = efw[0].real;
fft[1] = -efw[0].imag;
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
// inverse FFT, result should be scaled with outCFFT
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
//take only the real values and scale with outCFFT
for (i = 0; i < PART_LEN2; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[i] = fft[j];
}
for (i = 0; i < PART_LEN; i++)
{
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
fft[i],
WebRtcAecm_kSqrtHanning[i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
outCFFT - aecm->dfaCleanQDomain);
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
tmp32no1 + aecm->outBuf[i],
WEBRTC_SPL_WORD16_MIN);
output[i] = fft[i];
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
fft[PART_LEN + i],
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
outCFFT - aecm->dfaCleanQDomain);
aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
WEBRTC_SPL_WORD16_MAX,
tmp32no1,
WEBRTC_SPL_WORD16_MIN);
}
#ifdef ARM_WINM_LOG_
// measure tick end
QueryPerformanceCounter((LARGE_INTEGER*)&end);
diff__ = ((end - start) * 1000) / (freq/1000);
milliseconds = (unsigned int)(diff__ & 0xffffffff);
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
#endif
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
if (nearendClean != NULL)
{
memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
}
}
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
for (i = 0; i < PART_LEN1; i++)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
far_spectrum[i]);
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
}
}
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
// During startup we store the channel every block.
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
// Recalculate echo estimate
for (i = 0; i < PART_LEN; i += 4)
{
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
far_spectrum[i + 1]);
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
far_spectrum[i + 2]);
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
far_spectrum[i + 3]);
}
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
far_spectrum[i]);
}
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
{
int i;
// The stored channel has a significantly lower MSE than the adaptive one for
// two consecutive calculations. Reset the adaptive channel.
memcpy(aecm->channelAdapt16, aecm->channelStored,
sizeof(WebRtc_Word16) * PART_LEN1);
// Restore the W32 channel
for (i = 0; i < PART_LEN; i += 4)
{
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
}
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
}
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))

View File

@@ -332,32 +332,44 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co
void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend,
const int farLen, const int knownDelay);
///////////////////////////////////////////////////////////////////////////////////////////////
// Some internal functions shared by ARM NEON and generic C code:
///////////////////////////////////////////////////////////////////////////////
// Function pointers for internal functions shared by ARM Neon and
// generic C code.
//
typedef void (*CalcLinearEnergies)(
AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echoEst,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored);
extern CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echoEst,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored);
typedef void (*StoreAdaptiveChannel)(
AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est);
extern StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est);
typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm);
extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
typedef void (*WindowAndFFT)(
WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling);
extern WindowAndFFT WebRtcAecm_WindowAndFFT;
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling);
typedef void (*InverseFFTAndWindow)(
AecmCore_t* aecm,
WebRtc_Word16* fft, complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean);
extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
// Initialization of the above function pointers for ARM Neon.
void WebRtcAecm_InitNeon(void);
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean);
#endif
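
The typedef-plus-extern pattern above means any translation unit can install
an implementation matching the signature. A hypothetical sketch (not part of
the patch) of wiring in another backend:

// Hypothetical example only: WebRtcAecm_InitDsp and the DSP routine are
// made-up names, shown to illustrate how the function pointers are swapped.
static void ResetAdaptiveChannelDsp(AecmCore_t* aecm)
{
    // A platform-specific implementation would go here.
}

void WebRtcAecm_InitDsp(void)
{
    WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelDsp;
}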

View File

@@ -7,7 +7,6 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
#include "aecm_core.h"
@@ -16,299 +15,289 @@
// Square root of Hanning window in Q14.
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {
16384, 16373, 16354, 16325,
16286, 16237, 16179, 16111,
16034, 15947, 15851, 15746,
15631, 15506, 15373, 15231,
15079, 14918, 14749, 14571,
14384, 14189, 13985, 13773,
13553, 13325, 13089, 12845,
12594, 12335, 12068, 11795,
11514, 11227, 10933, 10633,
10326, 10013, 9695, 9370,
9040, 8705, 8364, 8019,
7668, 7313, 6954, 6591,
6224, 5853, 5478, 5101,
4720, 4337, 3951, 3562,
3172, 2780, 2386, 1990,
1594, 1196, 798, 399
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = {
16384, 16373, 16354, 16325,
16286, 16237, 16179, 16111,
16034, 15947, 15851, 15746,
15631, 15506, 15373, 15231,
15079, 14918, 14749, 14571,
14384, 14189, 13985, 13773,
13553, 13325, 13089, 12845,
12594, 12335, 12068, 11795,
11514, 11227, 10933, 10633,
10326, 10013, 9695, 9370,
9040, 8705, 8364, 8019,
7668, 7313, 6954, 6591,
6224, 5853, 5478, 5101,
4720, 4337, 3951, 3562,
3172, 2780, 2386, 1990,
1594, 1196, 798, 399
};
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
static void WindowAndFFTNeon(WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling)
{
int i, j;
int time_signal_scaling) {
int i, j;
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
__asm__("vmov.i16 d21, #0" ::: "d21");
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
__asm__("vmov.i16 d21, #0" ::: "d21");
for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
{
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
/* Window near end */
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
/* Window near end */
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
// (time_signal[PART_LEN + i] << time_signal_scaling),
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
// (time_signal[PART_LEN + i] << time_signal_scaling),
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
}
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
}
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
// Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16)
{
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
__asm__("vneg.s16 d22, d22" : : : "q10");
__asm__("vneg.s16 d23, d23" : : : "q11");
__asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : :
// Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) {
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
__asm__("vneg.s16 d22, d22" : : : "q10");
__asm__("vneg.s16 d23, d23" : : : "q11");
__asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&freq_signal[i].real): "q10", "q11");
}
}
}
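
The mixed intrinsics and inline assembly above compute
fft[j] = ((time_signal[i] << scaling) * window[i]) >> 14 four lanes at a time.
The same core step written purely with Neon intrinsics would look roughly like
this (a readability sketch, not part of the patch):

#include <arm_neon.h>

// Sketch: Q14 windowing of four int16 samples, equivalent to the
// vshl.s16 + vmull.s16 + vshrn.i32 #14 sequence used above.
static inline int16x4_t WindowQ14(int16x4_t x, int16x4_t window,
                                  int16x4_t scaling)
{
    int32x4_t prod = vmull_s16(vshl_s16(x, scaling), window); // widen to 32 bits
    return vshrn_n_s32(prod, 14);                             // narrow with >> 14
}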
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean)
{
int i, j, outCFFT;
WebRtc_Word32 tmp32no1;
static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
WebRtc_Word16* output,
const WebRtc_Word16* nearendClean) {
int i, j, outCFFT;
WebRtc_Word32 tmp32no1;
// Synthesis
for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
{
// We overwrite two more elements in fft[], but it's ok.
__asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
__asm__("vmov q11, q10" : : : "q10", "q11");
// Synthesis
for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
// We overwrite two more elements in fft[], but it's ok.
__asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
__asm__("vmov q11, q10" : : : "q10", "q11");
__asm__("vneg.s16 d23, d23" : : : "q11");
__asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11");
__asm__("vneg.s16 d23, d23" : : : "q11");
__asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11");
__asm__("vrev64.16 q10, q10" : : : "q10");
__asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10");
}
__asm__("vrev64.16 q10, q10" : : : "q10");
__asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10");
}
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
// Inverse FFT, result should be scaled with outCFFT.
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
// Inverse FFT, result should be scaled with outCFFT.
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
// Take only the real values and scale with outCFFT.
for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16)
{
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
__asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
}
// Take only the real values and scale with outCFFT.
for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) {
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
__asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
}
int32x4_t tmp32x4_2;
__asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
(outCFFT - aecm->dfaCleanQDomain)));
for (i = 0; i < PART_LEN; i += 4)
{
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
int32x4_t tmp32x4_1;
int32x4_t tmp32x4_2;
__asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
(outCFFT - aecm->dfaCleanQDomain)));
for (i = 0; i < PART_LEN; i += 4) {
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
int32x4_t tmp32x4_1;
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// fft[i], WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i]));
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// fft[i], WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i]));
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
// outCFFT - aecm->dfaCleanQDomain);
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
// outCFFT - aecm->dfaCleanQDomain);
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
// tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
// output[i] = fft[i];
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
__asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
__asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i]));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
// tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
// output[i] = fft[i];
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
__asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
__asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i]));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
// tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
// fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i]));
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
// fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i]));
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
}
// tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
}
// Copy the current block to the old position (outBuf is shifted elsewhere).
for (i = 0; i < PART_LEN; i += 16)
{
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
// Copy the current block to the old position (outBuf is shifted elsewhere).
for (i = 0; i < PART_LEN; i += 16) {
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->xBuf[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
}
for (i = 0; i < PART_LEN; i += 16)
{
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
}
for (i = 0; i < PART_LEN; i += 16) {
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufNoisy[i]): "q10");
}
if (nearendClean != NULL) {
for (i = 0; i < PART_LEN; i += 16) {
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufClean[i]): "q10");
}
if (nearendClean != NULL) {
for (i = 0; i < PART_LEN; i += 16)
{
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufClean[i]): "q10");
}
}
}
}
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est,
WebRtc_UWord32* far_energy,
WebRtc_UWord32* echo_energy_adapt,
WebRtc_UWord32* echo_energy_stored)
{
int i;
WebRtc_UWord32* echo_energy_stored) {
int i;
register WebRtc_UWord32 far_energy_r;
register WebRtc_UWord32 echo_energy_stored_r;
register WebRtc_UWord32 echo_energy_adapt_r;
uint32x4_t tmp32x4_0;
register WebRtc_UWord32 far_energy_r;
register WebRtc_UWord32 echo_energy_stored_r;
register WebRtc_UWord32 echo_energy_adapt_r;
uint32x4_t tmp32x4_0;
__asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy
__asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
__asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
__asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy
__asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
__asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
for(i = 0; i < PART_LEN -7; i += 8)
{
// far_energy += (WebRtc_UWord32)(far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
// Get estimated echo energies for adaptive channel and stored channel.
// echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
"q10", "q11");
// echo_energy_stored += (WebRtc_UWord32)echoEst[i];
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
// echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
// aecm->channelAdapt16[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vadd.u32 q9, q10" : : : "q9", "q15");
__asm__("vadd.u32 q9, q11" : : : "q9", "q11");
}
__asm__("vadd.u32 d28, d29" : : : "q14");
__asm__("vpadd.u32 d28, d28" : : : "q14");
__asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
__asm__("vadd.u32 d18, d19" : : : "q9");
__asm__("vpadd.u32 d18, d18" : : : "q9");
__asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
__asm__("vadd.u32 d16, d17" : : : "q8");
__asm__("vpadd.u32 d16, d16" : : : "q8");
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
for (i = 0; i < PART_LEN - 7; i += 8) {
// far_energy += (WebRtc_UWord32)(far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
// Get estimated echo energies for adaptive channel and stored channel.
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i];
*far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
*echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
aecm->channelAdapt16[i], far_spectrum[i]);
// echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
"q10", "q11");
// echo_energy_stored += (WebRtc_UWord32)echoEst[i];
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
// echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
// aecm->channelAdapt16[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vadd.u32 q9, q10" : : : "q9", "q15");
__asm__("vadd.u32 q9, q11" : : : "q9", "q11");
}
__asm__("vadd.u32 d28, d29" : : : "q14");
__asm__("vpadd.u32 d28, d28" : : : "q14");
__asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
__asm__("vadd.u32 d18, d19" : : : "q9");
__asm__("vpadd.u32 d18, d18" : : : "q9");
__asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
__asm__("vadd.u32 d16, d17" : : : "q8");
__asm__("vpadd.u32 d16, d16" : : : "q8");
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
// Get estimated echo energies for adaptive channel and stored channel.
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i];
*far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
*echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
aecm->channelAdapt16[i], far_spectrum[i]);
}
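
The epilogue above folds each 128-bit accumulator into a scalar with the
vadd.u32/vpadd.u32/vmov.32 sequence; with intrinsics the same reduction would
read (sketch, not part of the patch):

// Sketch: horizontal sum of a uint32x4_t accumulator, matching the
// reduction sequence in the assembly above.
static inline uint32_t HorizontalSumU32(uint32x4_t acc)
{
    uint32x2_t halves = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
    return vget_lane_u32(vpadd_u32(halves, halves), 0);
}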
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
const WebRtc_UWord16* far_spectrum,
WebRtc_Word32* echo_est)
{
int i;
WebRtc_Word32* echo_est) {
int i;
// During startup we store the channel every block.
// Recalculate echo estimate.
for(i = 0; i < PART_LEN -7; i += 8)
{
// aecm->channelStored[i] = aecm->channelAdapt16[i];
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&echo_est[i]) : "q10", "q11");
}
aecm->channelStored[i] = aecm->channelAdapt16[i];
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
// During startup we store the channel every block.
// Recalculate echo estimate.
for (i = 0; i < PART_LEN - 7; i += 8) {
// aecm->channelStored[i] = aecm->channelAdapt16[i];
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&echo_est[i]) : "q10", "q11");
}
aecm->channelStored[i] = aecm->channelAdapt16[i];
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
}
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
{
int i;
static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
int i;
for(i = 0; i < PART_LEN -7; i += 8)
{
// aecm->channelAdapt16[i] = aecm->channelStored[i];
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
// aecm->channelStored[i], 16);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelStored[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
__asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->channelAdapt32[i]): "q10", "q11");
}
aecm->channelAdapt16[i] = aecm->channelStored[i];
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
for (i = 0; i < PART_LEN - 7; i += 8) {
// aecm->channelAdapt16[i] = aecm->channelStored[i];
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
// aecm->channelStored[i], 16);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelStored[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
__asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->channelAdapt32[i]): "q10", "q11");
}
aecm->channelAdapt16[i] = aecm->channelStored[i];
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
}
#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
void WebRtcAecm_InitNeon(void) {
WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
}

View File

@@ -6,6 +6,8 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#############################
# Build the non-Neon library.
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
@@ -20,25 +22,20 @@ LOCAL_SRC_FILES := \
noise_suppression_x.c \
nsx_core.c
# floating point
# Files for floating point.
# noise_suppression.c ns_core.c
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS)
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += \
nsx_core_neon.c
LOCAL_CFLAGS += \
$(MY_ARM_CFLAGS_NEON)
endif
LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS)
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/interface \
$(LOCAL_PATH)/../utility \
$(LOCAL_PATH)/../../.. \
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
$(LOCAL_PATH)/../../../common_audio/signal_processing/include \
$(LOCAL_PATH)/../../../system_wrappers/interface
LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers
LOCAL_SHARED_LIBRARIES := \
libcutils \
@@ -49,3 +46,31 @@ ifndef NDK_ROOT
include external/stlport/libstlport.mk
endif
include $(BUILD_STATIC_LIBRARY)
#############################
# Build the Neon library.
include $(CLEAR_VARS)
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_MODULE := libwebrtc_ns_neon
LOCAL_MODULE_TAGS := optional
LOCAL_GENERATED_SOURCES :=
LOCAL_SRC_FILES := nsx_core_neon.c
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS) \
-mfpu=neon \
-flax-vector-conversions
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/interface \
$(LOCAL_PATH)/../../.. \
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
ifndef NDK_ROOT
include external/stlport/libstlport.mk
endif
include $(BUILD_STATIC_LIBRARY)

View File

@@ -16,6 +16,7 @@
#include <stdlib.h>
#include <stdio.h>
#include "cpu_features_wrapper.h"
#include "nsx_core.h"
// Skip first frequency bins during estimation. (0 <= value < 64)
@@ -426,6 +427,271 @@ static const WebRtc_Word16 kDeterminantEstMatrix[66] = {
355, 330
};
// Declare function pointers.
NoiseEstimation WebRtcNsx_NoiseEstimation;
PrepareSpectrum WebRtcNsx_PrepareSpectrum;
SynthesisUpdate WebRtcNsx_SynthesisUpdate;
AnalysisUpdate WebRtcNsx_AnalysisUpdate;
Denormalize WebRtcNsx_Denormalize;
CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
// Update the noise estimation information.
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
WebRtc_Word32 tmp32no1 = 0;
WebRtc_Word32 tmp32no2 = 0;
WebRtc_Word16 tmp16 = 0;
const WebRtc_Word16 kExp2Const = 11819; // Q13
int i = 0;
tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
inst->magnLen);
// Guarantee a Q-domain as high as possible while still fitting in an int16
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
kExp2Const, tmp16, 21);
for (i = 0; i < inst->magnLen; i++) {
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
// in Q21
tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
inst->noiseEstLogQuantile[offset + i]);
tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
tmp16 -= 21;// shift 21 to get result in Q0
tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
if (tmp16 < 0) {
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
} else {
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
}
inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
}
}
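
The arithmetic above builds exp() from powers of two: kExp2Const = 11819 is
log2(e) in Q13, so exp(x) = 2^(x * log2(e)). The multiply yields the exponent
in Q21; its integer part selects a shift, and its fractional bits OR-ed onto
2^21 form a linear approximation of the mantissa. A worked illustration (not
in the patch):

// Worked illustration of the 2^x trick above, for a Q8 input x_q8.
int32_t exp_q21  = 11819 * x_q8;                        // x * log2(e), in Q21
int32_t mantissa = 0x00200000 | (exp_q21 & 0x001FFFFF); // ~2^frac, in Q21
int     shift    = (exp_q21 >> 21) - 21;                // power of two, result in Q0
// The function then adds qNoise to the shift so the result lands in Q(qNoise).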
// Noise Estimation
static void NoiseEstimationC(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise) {
WebRtc_Word32 numerator = FACTOR_Q16;
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
WebRtc_Word16 countProd, delta, zeros, frac;
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
const int16_t log2_const = 22713; // ln(2) in Q15
const int16_t width_factor = 21845;
int i, s, offset;
tabind = inst->stages - inst->normData;
assert(tabind < 9);
assert(tabind > -9);
if (tabind < 0) {
logval = -WebRtcNsx_kLogTable[-tabind];
} else {
logval = WebRtcNsx_kLogTable[tabind];
}
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
// magn is in Q(-stages), and the real lmagn values are:
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
// lmagn in Q8
for (i = 0; i < inst->magnLen; i++) {
if (magn[i]) {
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
& 0x7FFFFFFF) >> 23);
// log2(magn(i))
assert(frac < 256);
log2 = (WebRtc_Word16)(((31 - zeros) << 8)
+ WebRtcNsx_kLogTableFrac[frac]);
// log2(magn(i))*log(2)
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
// + log(2^stages)
lmagn[i] += logval;
} else {
lmagn[i] = logval;//0;
}
}
// loop over simultaneous estimates
for (s = 0; s < SIMULT; s++) {
offset = s * inst->magnLen;
// Get counter values from state
counter = inst->noiseEstCounter[s];
assert(counter < 201);
countDiv = WebRtcNsx_kCounterDiv[counter];
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
// quant_est(...)
for (i = 0; i < inst->magnLen; i++) {
// compute delta
if (inst->noiseEstDensity[offset + i] > 512) {
delta = WebRtcSpl_DivW32W16ResW16(numerator,
inst->noiseEstDensity[offset + i]);
} else {
delta = FACTOR_Q7;
if (inst->blockIndex < END_STARTUP_LONG) {
// Smaller step size during startup. This prevents the use of
// unrealistic values that could cause overflow.
delta = FACTOR_Q7_STARTUP;
}
}
// update log quantile estimate
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
// CounterDiv=1/(inst->counter[s]+1) in Q15
tmp16 += 2;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
} else {
tmp16 += 1;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
if (inst->noiseEstLogQuantile[offset + i] < logval) {
// This is the smallest fixed point representation we can
// have, hence we limit the output.
inst->noiseEstLogQuantile[offset + i] = logval;
}
}
// update density estimate
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
< WIDTH_Q8) {
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->noiseEstDensity[offset + i], countProd, 15);
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
width_factor, countDiv, 15);
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
}
} // end loop over magnitude spectrum
if (counter >= END_STARTUP_LONG) {
inst->noiseEstCounter[s] = 0;
if (inst->blockIndex >= END_STARTUP_LONG) {
UpdateNoiseEstimate(inst, offset);
}
}
inst->noiseEstCounter[s]++;
} // end loop over simultaneous estimates
// Sequentially update the noise during startup
if (inst->blockIndex < END_STARTUP_LONG) {
UpdateNoiseEstimate(inst, offset);
}
for (i = 0; i < inst->magnLen; i++) {
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
}
(*q_noise) = (WebRtc_Word16)inst->qNoise;
}
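
The inner update is a fixed-point stochastic-approximation step tracking the
0.25-quantile of the log spectrum. In floating point it would read (sketch;
QUANTILE and the 1/(counter+1) factor are taken from the comments above):

// Floating-point sketch of the quantile update implemented above.
const float QUANTILE = 0.25f;
float step = delta / (counter + 1);         // step size shrinks over time
if (lmagn > lquantile)
{
    lquantile += QUANTILE * step;           // push the estimate up
}
else
{
    lquantile -= (1.0f - QUANTILE) * step;  // push the estimate down
}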
// Filter the data in the frequency domain, and create spectrum.
static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) {
int i = 0, j = 0;
int16_t tmp16 = 0;
for (i = 0; i < inst->magnLen; i++) {
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
}
freq_buf[0] = inst->real[0];
freq_buf[1] = -inst->imag[0];
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
tmp16 = (inst->anaLen << 1) - j;
freq_buf[j] = inst->real[i];
freq_buf[j + 1] = -inst->imag[i];
freq_buf[tmp16] = inst->real[i];
freq_buf[tmp16 + 1] = inst->imag[i];
}
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
}
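
The buffer built above is conjugate symmetric, X[N - k] = conj(X[k]), so the
subsequent complex IFFT returns a purely real signal. With N = inst->anaLen and
complex index k stored at freq_buf[2k], freq_buf[2k + 1]:

// Illustration of the layout produced above:
//   k = 0            :  re[0],   -im[0]      (DC)
//   k = 1 .. N/2-1   :  re[k],   -im[k]
//   k = N/2          :  re[N/2], -im[N/2]    (Nyquist)
//   k = N-j          :  re[j],    im[j]      (mirror of k = j)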
// Denormalize the input buffer.
static __inline void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) {
int i = 0, j = 0;
int32_t tmp32 = 0;
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
factor - inst->normData);
inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
}
}
// Noise-suppression synthesis: window and overlap-add the processed block,
// read out the fully processed segment, and update the synthesis buffer.
static void SynthesisUpdateC(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor) {
int i = 0;
int16_t tmp16a = 0;
int16_t tmp16b = 0;
int32_t tmp32 = 0;
// synthesis
for (i = 0; i < inst->anaLen; i++) {
tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->window[i], inst->real[i], 14); // Q0, window in Q14
tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
// Down shift with rounding
tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
tmp16b); // Q0
}
// read out fully processed segment
for (i = 0; i < inst->blockLen10ms; i++) {
out_frame[i] = inst->synthesisBuffer[i]; // Q0
}
// update synthesis buffer
WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
inst->synthesisBuffer + inst->blockLen10ms,
inst->anaLen - inst->blockLen10ms);
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
+ inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
}
// Update analysis buffer for lower band, and window data before FFT.
static void AnalysisUpdateC(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech) {
int i = 0;
// For lower band update analysis buffer.
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
inst->analysisBuffer + inst->blockLen10ms,
inst->anaLen - inst->blockLen10ms);
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
+ inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
// Window data before FFT.
for (i = 0; i < inst->anaLen; i++) {
out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->window[i], inst->analysisBuffer[i], 14); // Q0
}
}
// Create a complex number buffer (out[]) as the input (in[]) interleaved with
// zeros, and normalize it.
static __inline void CreateComplexBufferC(NsxInst_t* inst,
int16_t* in,
int16_t* out) {
int i = 0, j = 0;
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
out[j + 1] = 0; // Insert zeros in imaginary part
}
}
void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst,
WebRtc_Word16 pink_noise_exp_avg,
WebRtc_Word32 pink_noise_num_avg,
@@ -600,6 +866,24 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs) {
inst->file5 = fopen("file5.pcm", "wb");
#endif
// Initialize function pointers.
WebRtcNsx_NoiseEstimation = NoiseEstimationC;
WebRtcNsx_PrepareSpectrum = PrepareSpectrumC;
WebRtcNsx_SynthesisUpdate = SynthesisUpdateC;
WebRtcNsx_AnalysisUpdate = AnalysisUpdateC;
WebRtcNsx_Denormalize = DenormalizeC;
WebRtcNsx_CreateComplexBuffer = CreateComplexBufferC;
#ifdef WEBRTC_DETECT_ARM_NEON
uint64_t features = WebRtc_GetCPUFeaturesARM();
if ((features & kCPUFeatureNEON) != 0)
{
WebRtcNsx_InitNeon();
}
#elif defined(WEBRTC_ARCH_ARM_NEON)
WebRtcNsx_InitNeon();
#endif
inst->initFlag = 1;
return 0;
@@ -2157,263 +2441,4 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram
return 0;
}
#if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
// Update the noise estimation information.
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
WebRtc_Word32 tmp32no1 = 0;
WebRtc_Word32 tmp32no2 = 0;
WebRtc_Word16 tmp16 = 0;
const WebRtc_Word16 kExp2Const = 11819; // Q13
int i = 0;
tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
inst->magnLen);
// Guarantee a Q-domain as high as possible while still fitting in an int16
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
kExp2Const, tmp16, 21);
for (i = 0; i < inst->magnLen; i++) {
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
// in Q21
tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
inst->noiseEstLogQuantile[offset + i]);
tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
tmp16 -= 21;// shift 21 to get result in Q0
tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
if (tmp16 < 0) {
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
} else {
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
}
inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
}
}
// Noise Estimation
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise) {
WebRtc_Word32 numerator = FACTOR_Q16;
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
WebRtc_Word16 countProd, delta, zeros, frac;
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
const int16_t log2_const = 22713; // ln(2) in Q15
const int16_t width_factor = 21845;
int i, s, offset;
tabind = inst->stages - inst->normData;
assert(tabind < 9);
assert(tabind > -9);
if (tabind < 0) {
logval = -WebRtcNsx_kLogTable[-tabind];
} else {
logval = WebRtcNsx_kLogTable[tabind];
}
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
// magn is in Q(-stages), and the real lmagn values are:
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
// lmagn in Q8
for (i = 0; i < inst->magnLen; i++) {
if (magn[i]) {
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
& 0x7FFFFFFF) >> 23);
// log2(magn(i))
assert(frac < 256);
log2 = (WebRtc_Word16)(((31 - zeros) << 8)
+ WebRtcNsx_kLogTableFrac[frac]);
// log2(magn(i))*log(2)
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
// + log(2^stages)
lmagn[i] += logval;
} else {
lmagn[i] = logval;//0;
}
}
// loop over simultaneous estimates
for (s = 0; s < SIMULT; s++) {
offset = s * inst->magnLen;
// Get counter values from state
counter = inst->noiseEstCounter[s];
assert(counter < 201);
countDiv = WebRtcNsx_kCounterDiv[counter];
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
// quant_est(...)
for (i = 0; i < inst->magnLen; i++) {
// compute delta
if (inst->noiseEstDensity[offset + i] > 512) {
delta = WebRtcSpl_DivW32W16ResW16(numerator,
inst->noiseEstDensity[offset + i]);
} else {
delta = FACTOR_Q7;
if (inst->blockIndex < END_STARTUP_LONG) {
// Smaller step size during startup. This prevents the use of
// unrealistic values that could cause overflow.
delta = FACTOR_Q7_STARTUP;
}
}
// update log quantile estimate
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
// CounterDiv=1/(inst->counter[s]+1) in Q15
tmp16 += 2;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
} else {
tmp16 += 1;
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
if (inst->noiseEstLogQuantile[offset + i] < logval) {
// This is the smallest fixed point representation we can
// have, hence we limit the output.
inst->noiseEstLogQuantile[offset + i] = logval;
}
}
// update density estimate
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
< WIDTH_Q8) {
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->noiseEstDensity[offset + i], countProd, 15);
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
width_factor, countDiv, 15);
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
}
} // end loop over magnitude spectrum
if (counter >= END_STARTUP_LONG) {
inst->noiseEstCounter[s] = 0;
if (inst->blockIndex >= END_STARTUP_LONG) {
UpdateNoiseEstimate(inst, offset);
}
}
inst->noiseEstCounter[s]++;
} // end loop over simultaneous estimates
// Sequentially update the noise during startup
if (inst->blockIndex < END_STARTUP_LONG) {
UpdateNoiseEstimate(inst, offset);
}
for (i = 0; i < inst->magnLen; i++) {
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
}
(*q_noise) = (WebRtc_Word16)inst->qNoise;
}
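// A minimal illustrative sketch (not referenced by the build) of the
// fixed-point logarithm used in WebRtcNsx_NoiseEstimation() above: normalize
// the input so its most significant bit reaches bit 31, take the integer part
// of log2 as (31 - zeros), look up an 8-bit fraction from the table, then
// scale by log(2) (22713 in Q15) to obtain a natural logarithm in Q8.
static int16_t NsxLog2Q8Sketch(uint32_t magnitude) {
  int16_t zeros, frac, log2;
  if (magnitude == 0) {
    return 0;  // The caller handles the zero case separately (see above).
  }
  zeros = WebRtcSpl_NormU32(magnitude);  // Number of leading zero bits.
  frac = (int16_t)(((magnitude << zeros) & 0x7FFFFFFF) >> 23);
  log2 = (int16_t)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]);
  // log(x) = log2(x) * log(2); the result stays in Q8.
  return (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, 22713, 15);
}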
// Filter the data in the frequency domain, and create spectrum.
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
int i = 0, j = 0;
int16_t tmp16 = 0;
for (i = 0; i < inst->magnLen; i++) {
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
}
freq_buf[0] = inst->real[0];
freq_buf[1] = -inst->imag[0];
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
tmp16 = (inst->anaLen << 1) - j;
freq_buf[j] = inst->real[i];
freq_buf[j + 1] = -inst->imag[i];
freq_buf[tmp16] = inst->real[i];
freq_buf[tmp16 + 1] = inst->imag[i];
}
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
}
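// Layout sketch, assuming anaLen = 8 (so anaLen2 = 4): the loop above packs
// freq_buf in conjugate-symmetric order, ready for the inverse FFT:
//   [Re0, -Im0, Re1, -Im1, Re2, -Im2, Re3, -Im3,
//    Re4, -Im4, Re3,  Im3, Re2,  Im2, Re1,  Im1]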
// Denormalize the input buffer.
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
int i = 0, j = 0;
int32_t tmp32 = 0;
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
factor - inst->normData);
inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
}
}
// For the noise suppression process: do synthesis, read out the fully
// processed segment, and update the synthesis buffer.
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor) {
int i = 0;
int16_t tmp16a = 0;
int16_t tmp16b = 0;
int32_t tmp32 = 0;
// synthesis
for (i = 0; i < inst->anaLen; i++) {
tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->window[i], inst->real[i], 14); // Q0, window in Q14
tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
// Down shift with rounding
tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
tmp16b); // Q0
}
// read out fully processed segment
for (i = 0; i < inst->blockLen10ms; i++) {
out_frame[i] = inst->synthesisBuffer[i]; // Q0
}
// update synthesis buffer
WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
inst->synthesisBuffer + inst->blockLen10ms,
inst->anaLen - inst->blockLen10ms);
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
+ inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
}
// Update analysis buffer for lower band, and window data before FFT.
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech) {
int i = 0;
// For lower band update analysis buffer.
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
inst->analysisBuffer + inst->blockLen10ms,
inst->anaLen - inst->blockLen10ms);
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
+ inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
// Window data before FFT.
for (i = 0; i < inst->anaLen; i++) {
out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
inst->window[i], inst->analysisBuffer[i], 14); // Q0
}
}
// Create a complex number buffer (out[]) as the input (in[]) interleaved
// with zeros, and normalize it.
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
int16_t* in,
int16_t* out) {
int i = 0, j = 0;
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
out[j + 1] = 0; // Insert zeros in imaginary part
}
}
#endif // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))

View File

@ -165,40 +165,51 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst,
short* outFrameHigh);
/****************************************************************************
* Internal functions and variable declarations shared with optimized code.
* Function pointers for internal functions shared by the ARM Neon and
* generic C code paths.
*/
// Noise Estimation.
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise);
typedef void (*NoiseEstimation)(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise);
extern NoiseEstimation WebRtcNsx_NoiseEstimation;
// Filter the data in the frequency domain, and create spectrum.
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst,
int16_t* freq_buff);
typedef void (*PrepareSpectrum)(NsxInst_t* inst,
int16_t* freq_buff);
extern PrepareSpectrum WebRtcNsx_PrepareSpectrum;
// For the noise suppression process: do synthesis, read out the fully
// processed segment, and update the synthesis buffer.
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor);
typedef void (*SynthesisUpdate)(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor);
extern SynthesisUpdate WebRtcNsx_SynthesisUpdate;
// Update analysis buffer for lower band, and window data before FFT.
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech);
typedef void (*AnalysisUpdate)(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech);
extern AnalysisUpdate WebRtcNsx_AnalysisUpdate;
// Denormalize the input buffer.
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst,
int16_t* in,
int factor);
typedef void (*Denormalize)(NsxInst_t* inst,
int16_t* in,
int factor);
extern Denormalize WebRtcNsx_Denormalize;
// Create a complex number buffer, as the input interleaved with zeros,
// and normalize it.
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
int16_t* in,
int16_t* out);
typedef void (*CreateComplexBuffer)(NsxInst_t* inst,
int16_t* in,
int16_t* out);
extern CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
/****************************************************************************
* Initialization of the above function pointers for ARM Neon.
*/
void WebRtcNsx_InitNeon(void);
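/* A hypothetical sketch (names other than WebRtcNsx_InitNeon are illustrative;
 * the real wiring lives in the .c files) of how these pointers can be bound
 * when the noise suppressor is initialized:
 *
 *   WebRtcNsx_NoiseEstimation = NoiseEstimationC;  // generic C defaults
 *   ...
 *   #if defined(WEBRTC_DETECT_ARM_NEON)
 *   if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
 *     WebRtcNsx_InitNeon();  // Swap in the Neon-optimized versions.
 *   }
 *   #elif defined(WEBRTC_ARCH_ARM_NEON)
 *   WebRtcNsx_InitNeon();
 *   #endif
 */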
extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];
@ -208,4 +219,4 @@ extern const WebRtc_Word16 WebRtcNsx_kCounterDiv[201];
}
#endif
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_

View File

@ -8,15 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#if defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
#include "nsx_core.h"
#include <arm_neon.h>
#include <assert.h>
// Update the noise estimation information.
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) {
int i = 0;
const int16_t kExp2Const = 11819; // Q13
int16_t* ptr_noiseEstLogQuantile = NULL;
@ -75,7 +73,7 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
}
// Last iteration:
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
// in Q21
int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
@ -94,10 +92,10 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
}
// Noise Estimation
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise) {
static void NoiseEstimationNeon(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise) {
int32_t numerator = FACTOR_Q16;
int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
int16_t countProd, delta, zeros, frac;
@ -126,11 +124,11 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
if (magn[i]) {
zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
frac = (int16_t)((((uint32_t)magn[i] << zeros)
& 0x7FFFFFFF) >> 23);
& 0x7FFFFFFF) >> 23);
assert(frac < 256);
// log2(magn(i))
log2 = (int16_t)(((31 - zeros) << 8)
+ WebRtcNsx_kLogTableFrac[frac]);
+ WebRtcNsx_kLogTableFrac[frac]);
// log2(magn(i))*log(2)
lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
// + log(2^stages)
@ -302,7 +300,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
if (counter >= END_STARTUP_LONG) {
inst->noiseEstCounter[s] = 0;
if (inst->blockIndex >= END_STARTUP_LONG) {
UpdateNoiseEstimate(inst, offset);
UpdateNoiseEstimateNeon(inst, offset);
}
}
inst->noiseEstCounter[s]++;
@ -311,7 +309,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
// Sequentially update the noise during startup
if (inst->blockIndex < END_STARTUP_LONG) {
UpdateNoiseEstimate(inst, offset);
UpdateNoiseEstimateNeon(inst, offset);
}
for (i = 0; i < inst->magnLen; i++) {
@ -321,7 +319,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
}
// Filter the data in the frequency domain, and create spectrum.
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
// (1) Filtering.
@ -338,7 +336,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
// Filter the rest in the frequency domain.
for (; ptr_real < &inst->real[inst->magnLen - 1]; ) {
for (; ptr_real < &inst->real[inst->magnLen - 1];) {
// Loop unrolled once. Both pointers are incremented by 4 twice.
__asm__ __volatile__(
"vld1.16 d20, [%[ptr_real]]\n\t"
@ -368,7 +366,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
:
:"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
"q9", "q10", "q11", "q12"
);
);
}
// Filter the last pair of elements in the frequency domain.
@ -400,7 +398,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
int16_t* ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
ptr_real = &inst->real[1];
ptr_imag = &inst->imag[1];
for (; ptr_real < &inst->real[inst->anaLen2 - 11]; ) {
for (; ptr_real < &inst->real[inst->anaLen2 - 11];) {
// Loop unrolled once. All pointers are incremented twice.
__asm__ __volatile__(
"vld1.16 d22, [%[ptr_real]]!\n\t"
@ -456,13 +454,13 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
}
// Denormalize the input buffer.
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) {
int16_t* ptr_real = &inst->real[0];
int16_t* ptr_in = &in[0];
__asm__ __volatile__("vdup.32 q10, %0" ::
"r"((int32_t)(factor - inst->normData)) : "q10");
for (; ptr_real < &inst->real[inst->anaLen]; ) {
for (; ptr_real < &inst->real[inst->anaLen];) {
// Loop unrolled once. Both pointers are incremented.
__asm__ __volatile__(
@ -495,9 +493,9 @@ __inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
// For the noise suppression process: do synthesis, read out the fully
// processed segment, and update the synthesis buffer.
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor) {
static void SynthesisUpdateNeon(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor) {
int16_t* ptr_real = &inst->real[0];
int16_t* ptr_syn = &inst->synthesisBuffer[0];
int16_t* ptr_window = &inst->window[0];
@ -505,7 +503,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
// synthesis
__asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
// Loop unrolled once. All pointers are incremented in the assembly code.
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
__asm__ __volatile__(
// Load variables.
"vld1.16 d22, [%[ptr_real]]!\n\t"
@ -553,7 +551,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
int16_t* ptr_out = &out_frame[0];
ptr_syn = &inst->synthesisBuffer[0];
// read out fully processed segment
for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms]; ) {
for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) {
// Loop unrolled once. Both pointers are incremented in the assembly code.
__asm__ __volatile__(
// out_frame[i] = inst->synthesisBuffer[i]; // Q0
@ -575,7 +573,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
// inst->anaLen - inst->blockLen10ms);
ptr_out = &inst->synthesisBuffer[0],
ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
// Loop unrolled once. Both pointers are incremented in the assembly code.
__asm__ __volatile__(
"vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
@ -593,7 +591,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
// WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
// + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
for (; ptr_out < &inst->synthesisBuffer[inst->anaLen]; ) {
for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) {
// Loop unrolled once. Pointer is incremented in the assembly code.
__asm__ __volatile__(
"vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
@ -606,9 +604,9 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
}
// Update analysis buffer for lower band, and window data before FFT.
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech) {
static void AnalysisUpdateNeon(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech) {
int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
int16_t* ptr_out = &inst->analysisBuffer[0];
@ -617,7 +615,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
// inst->analysisBuffer + inst->blockLen10ms,
// inst->anaLen - inst->blockLen10ms);
for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms]; ) {
for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
// Loop unrolled once, so both pointers are incremented by 8 twice.
__asm__ __volatile__(
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
@ -633,7 +631,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
// + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen]; ) {
for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
// Loop unrolled once, so both pointers are incremented by 8 twice.
__asm__ __volatile__(
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
@ -651,7 +649,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
int16_t* ptr_window = &inst->window[0];
ptr_out = &out[0];
ptr_ana = &inst->analysisBuffer[0];
for (; ptr_out < &out[inst->anaLen]; ) {
for (; ptr_out < &out[inst->anaLen];) {
// Loop unrolled once, so all pointers are incremented by 4 twice.
__asm__ __volatile__(
@ -683,17 +681,17 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
// Create a complex number buffer (out[]) as the input (in[]) interleaved
// with zeros, and normalize it.
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
int16_t* in,
int16_t* out) {
static __inline void CreateComplexBufferNeon(NsxInst_t* inst,
int16_t* in,
int16_t* out) {
int16_t* ptr_out = &out[0];
int16_t* ptr_in = &in[0];
__asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
for (; ptr_in < &in[inst->anaLen]; ) {
for (; ptr_in < &in[inst->anaLen];) {
// Loop unrolled once, so ptr_in is incremented by 8 twice,
// and ptr_out is incremented by 8 four times.
__asm__ __volatile__(
// out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
@ -724,4 +722,12 @@ __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
);
}
}
#endif // defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
void WebRtcNsx_InitNeon(void) {
WebRtcNsx_NoiseEstimation = NoiseEstimationNeon;
WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon;
WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon;
WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon;
WebRtcNsx_Denormalize = DenormalizeNeon;
WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon;
}

View File

@ -15,18 +15,33 @@
extern "C" {
#endif
// list of features.
#include <typedefs.h>
// List of features in x86.
typedef enum {
kSSE2,
kSSE3
} CPUFeature;
// List of features in ARM.
enum {
kCPUFeatureARMv7 = (1 << 0),
kCPUFeatureVFPv3 = (1 << 1),
kCPUFeatureNEON = (1 << 2),
kCPUFeatureLDREXSTREX = (1 << 3)
};
typedef int (*WebRtc_CPUInfo)(CPUFeature feature);
// returns true if the CPU supports the feature.
extern WebRtc_CPUInfo WebRtc_GetCPUInfo;
// No CPU feature is available => straight C path.
extern WebRtc_CPUInfo WebRtc_GetCPUInfoNoASM;
// Returns the features of an ARM hardware platform, detected at run time,
// as a bitmask of the kCPUFeature* values defined above.
extern uint64_t WebRtc_GetCPUFeaturesARM(void);
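// Usage sketch (hypothetical caller): test individual bits of the returned
// mask, e.g. before enabling a Neon code path:
//
//   if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
//     // This device can safely run Neon-optimized functions.
//   }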
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

View File

@ -25,6 +25,7 @@ LOCAL_SRC_FILES := \
condition_variable.cc \
cpu_dummy.cc \
cpu_features.cc \
cpu_features_arm.c \
cpu_info.cc \
critical_section.cc \
event.cc \

View File

@ -0,0 +1,333 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// This file is derived from Android's NDK package r7, located at
// <ndk>/sources/android/cpufeatures/ (downloadable from
// http://developer.android.com/sdk/ndk/index.html).
#include "cpu_features_wrapper.h"
#include <fcntl.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>  // memmem(), memchr(), strlen(), memcpy().
#include <unistd.h>  // read(), close().
// Define CPU family.
typedef enum {
CPU_FAMILY_UNKNOWN = 0,
CPU_FAMILY_ARM,
CPU_FAMILY_X86,
CPU_FAMILY_MAX // Do not remove.
} CpuFamily;
static pthread_once_t g_once = PTHREAD_ONCE_INIT;
static CpuFamily g_cpuFamily;
static uint64_t g_cpuFeatures;
static int g_cpuCount;
static const int cpufeatures_debug = 0;
#ifdef __arm__
# define DEFAULT_CPU_FAMILY CPU_FAMILY_ARM
#elif defined __i386__
# define DEFAULT_CPU_FAMILY CPU_FAMILY_X86
#else
# define DEFAULT_CPU_FAMILY CPU_FAMILY_UNKNOWN
#endif
#define D(...) \
do { \
if (cpufeatures_debug) { \
printf(__VA_ARGS__); fflush(stdout); \
} \
} while (0)
/* Read the content of /proc/cpuinfo into a user-provided buffer.
* Return the length of the data, or -1 on error. Does *not*
* zero-terminate the content. Will not read more
* than 'buffsize' bytes.
*/
static int read_file(const char* pathname, char* buffer, size_t buffsize) {
int fd, len;
fd = open(pathname, O_RDONLY);
if (fd < 0)
return -1;
do {
len = read(fd, buffer, buffsize);
} while (len < 0 && errno == EINTR);
close(fd);
return len;
}
/* Extract the content of the first occurrence of a given field in
* /proc/cpuinfo and return it as a heap-allocated string that must be
* freed by the caller.
*
* Return NULL if not found.
*/
static char* extract_cpuinfo_field(char* buffer, int buflen, const char* field) {
int fieldlen = strlen(field);
char* bufend = buffer + buflen;
char* result = NULL;
int len;
const char* p, *q;
/* Look for the first occurrence of the field, and ensure that it starts
* a line.
*/
p = buffer;
for (;;) {
p = memmem(p, bufend - p, field, fieldlen);
if (p == NULL)
goto EXIT;
if (p == buffer || p[-1] == '\n')
break;
p += fieldlen;
}
/* Skip to the first colon followed by a space */
p += fieldlen;
p = memchr(p, ':', bufend - p);
if (p == NULL || p[1] != ' ')
goto EXIT;
/* Find the end of the line */
p += 2;
q = memchr(p, '\n', bufend - p);
if (q == NULL)
q = bufend;
/* Copy the line into a heap-allocated buffer */
len = q - p;
result = malloc(len + 1);
if (result == NULL)
goto EXIT;
memcpy(result, p, len);
result[len] = '\0';
EXIT:
return result;
}
/* Count the number of occurrences of a given field prefix in /proc/cpuinfo.
*/
static int count_cpuinfo_field(char* buffer, int buflen, const char* field) {
int fieldlen = strlen(field);
const char* p = buffer;
const char* bufend = buffer + buflen;
int count = 0;
for (;;) {
const char* q;
p = memmem(p, bufend - p, field, fieldlen);
if (p == NULL)
break;
/* Ensure that the field is at the start of a line */
if (p > buffer && p[-1] != '\n') {
p += fieldlen;
continue;
}
/* skip any whitespace */
q = p + fieldlen;
while (q < bufend && (*q == ' ' || *q == '\t'))
q++;
/* we must have a colon now */
if (q < bufend && *q == ':') {
count += 1;
q++;
}
p = q;
}
return count;
}
/* Like strlen(), but for constant string literals */
#define STRLEN_CONST(x) (sizeof(x) - 1)
/* Checks that a space-separated list of items contains one given 'item'.
* Returns 1 if found, 0 otherwise.
*/
static int has_list_item(const char* list, const char* item) {
const char* p = list;
int itemlen = strlen(item);
if (list == NULL)
return 0;
while (*p) {
const char* q;
/* skip spaces */
while (*p == ' ' || *p == '\t')
p++;
/* find end of current list item */
q = p;
while (*q && *q != ' ' && *q != '\t')
q++;
if (itemlen == q - p && !memcmp(p, item, itemlen))
return 1;
/* skip to next item */
p = q;
}
return 0;
}
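/* Example (hypothetical 'Features' line): given
*   list = "swp half thumb fastmult vfp edsp neon vfpv3"
* has_list_item(list, "neon") returns 1, while has_list_item(list, "eon")
* returns 0, because items must match whole space- or tab-delimited tokens
* rather than substrings.
*/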
static void cpuInit(void) {
char cpuinfo[4096];
int cpuinfo_len;
g_cpuFamily = DEFAULT_CPU_FAMILY;
g_cpuFeatures = 0;
g_cpuCount = 1;
cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof cpuinfo);
D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
if (cpuinfo_len < 0) { /* should not happen */
return;
}
/* Count the CPU cores; the value may be 0 for single-core CPUs */
g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "processor");
if (g_cpuCount == 0) {
g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "Processor");
if (g_cpuCount == 0) {
g_cpuCount = 1;
}
}
D("found cpuCount = %d\n", g_cpuCount);
#ifdef __arm__
{
/* Extract architecture from the "CPU Architecture" field.
* The list is well-known, unlike the output of
* the 'Processor' field, which can vary greatly.
*
* See the definition of the 'proc_arch' array in
* $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
* same file.
*/
char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
"CPU architecture");
if (cpuArch != NULL) {
char* end;
long archNumber;
int hasARMv7 = 0;
D("found cpuArch = '%s'\n", cpuArch);
/* read the initial decimal number, ignore the rest */
archNumber = strtol(cpuArch, &end, 10);
/* Here we assume that ARMv8 will be upwards compatible with v7
* in the future. Unfortunately, there is no 'Features' field to
* indicate that Thumb-2 is supported.
*/
if (end > cpuArch && archNumber >= 7) {
hasARMv7 = 1;
}
/* Unfortunately, it seems that certain ARMv6-based CPUs
* report an incorrect architecture number of 7!
*
* We try to correct this by looking at the 'elf_format'
* suffix reported in the 'Processor' field, which is of the
* form "(v7l)" for an ARMv7-based CPU and "(v6l)" for
* an ARMv6 one.
*/
if (hasARMv7) {
char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
"Processor");
if (cpuProc != NULL) {
D("found cpuProc = '%s'\n", cpuProc);
if (has_list_item(cpuProc, "(v6l)")) {
D("CPU processor and architecture mismatch!!\n");
hasARMv7 = 0;
}
free(cpuProc);
}
}
if (hasARMv7) {
g_cpuFeatures |= kCPUFeatureARMv7;
}
/* The LDREX / STREX instructions are available from ARMv6 */
if (archNumber >= 6) {
g_cpuFeatures |= kCPUFeatureLDREXSTREX;
}
free(cpuArch);
}
/* Extract the list of CPU features from 'Features' field */
char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
"Features");
if (cpuFeatures != NULL) {
D("found cpuFeatures = '%s'\n", cpuFeatures);
if (has_list_item(cpuFeatures, "vfpv3"))
g_cpuFeatures |= kCPUFeatureVFPv3;
else if (has_list_item(cpuFeatures, "vfpv3d16"))
g_cpuFeatures |= kCPUFeatureVFPv3;
if (has_list_item(cpuFeatures, "neon")) {
/* Note: Certain kernels only report neon but not vfpv3
* in their features list. However, ARM mandates that if
* Neon is implemented, VFPv3 must be too, so always set
* the VFPv3 flag along with it.
*/
g_cpuFeatures |= kCPUFeatureNEON |
kCPUFeatureVFPv3;
}
free(cpuFeatures);
}
}
#endif // __arm__
#ifdef __i386__
g_cpuFamily = CPU_FAMILY_X86;
#endif
}
uint64_t WebRtc_GetCPUFeaturesARM(void) {
pthread_once(&g_once, cpuInit);
return g_cpuFeatures;
}
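// A minimal end-to-end sketch (hypothetical test harness, not part of this
// commit) of how the run-time detection is meant to be consumed on Android
// ARMv7 builds compiled with '-DWEBRTC_DETECT_ARM_NEON':
//
//   #include <stdio.h>
//   #include "cpu_features_wrapper.h"
//
//   int main(void) {
//     uint64_t features = WebRtc_GetCPUFeaturesARM();
//     printf("ARMv7: %d, NEON: %d\n",
//            (features & kCPUFeatureARMv7) != 0,
//            (features & kCPUFeatureNEON) != 0);
//     return 0;
//   }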