For Android ARMv7 platforms, added run-time detection of Neon support;
when Neon is present, the code switches to Neon-optimized functions at run time. Review URL: http://webrtc-codereview.appspot.com/268002 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1096 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
ae7017d588
commit
b59c031660
12
Android.mk
12
Android.mk
@ -54,6 +54,7 @@ include $(MY_WEBRTC_ROOT_PATH)/libvpx.mk
|
|||||||
LOCAL_PATH := $(call my-dir)
|
LOCAL_PATH := $(call my-dir)
|
||||||
|
|
||||||
include $(CLEAR_VARS)
|
include $(CLEAR_VARS)
|
||||||
|
include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk
|
||||||
|
|
||||||
LOCAL_ARM_MODE := arm
|
LOCAL_ARM_MODE := arm
|
||||||
LOCAL_MODULE := libwebrtc_audio_preprocessing
|
LOCAL_MODULE := libwebrtc_audio_preprocessing
|
||||||
@ -71,6 +72,17 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
|
|||||||
libwebrtc_aecm \
|
libwebrtc_aecm \
|
||||||
libwebrtc_system_wrappers
|
libwebrtc_system_wrappers
|
||||||
|
|
||||||
|
# Add Neon libraries.
# They are linked when Neon code is chosen at run time
# (WEBRTC_DETECT_ARM_NEON is among the common defines) or when the target
# is built with Neon enabled (ARCH_ARM_HAVE_NEON is true).
ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS)))
  my_webrtc_link_neon_libs := true
else
  my_webrtc_link_neon_libs := $(ARCH_ARM_HAVE_NEON)
endif

ifeq ($(my_webrtc_link_neon_libs),true)
LOCAL_WHOLE_STATIC_LIBRARIES += \
    libwebrtc_aecm_neon \
    libwebrtc_ns_neon
endif
|
||||||
|
|
||||||
LOCAL_STATIC_LIBRARIES := \
|
LOCAL_STATIC_LIBRARIES := \
|
||||||
libprotobuf-cpp-2.3.0-lite
|
libprotobuf-cpp-2.3.0-lite
|
||||||
|
|
||||||
|
@ -21,8 +21,9 @@ MY_WEBRTC_COMMON_DEFS := \
|
|||||||
# '-DWEBRTC_MODULE_UTILITY_VIDEO' [module media_file] [module utility]
|
# '-DWEBRTC_MODULE_UTILITY_VIDEO' [module media_file] [module utility]
|
||||||
ifeq ($(TARGET_ARCH),arm)
|
ifeq ($(TARGET_ARCH),arm)
|
||||||
MY_WEBRTC_COMMON_DEFS += \
|
MY_WEBRTC_COMMON_DEFS += \
|
||||||
'-DWEBRTC_ARM_INLINE_CALLS' \
|
|
||||||
'-DWEBRTC_ARCH_ARM'
|
'-DWEBRTC_ARCH_ARM'
|
||||||
|
# '-DWEBRTC_DETECT_ARM_NEON' # only used in a build configuration without Neon
|
||||||
|
# TODO(kma): figure out if the above define could be moved to NDK build only.
|
||||||
|
|
||||||
# TODO(kma): test if the code under next two macros works with generic GCC compilers
|
# TODO(kma): test if the code under next two macros works with generic GCC compilers
|
||||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||||
|
@ -6,6 +6,9 @@
|
|||||||
# in the file PATENTS. All contributing project authors may
|
# in the file PATENTS. All contributing project authors may
|
||||||
# be found in the AUTHORS file in the root of the source tree.
|
# be found in the AUTHORS file in the root of the source tree.
|
||||||
|
|
||||||
|
#############################
|
||||||
|
# Build the non-neon library.
|
||||||
|
|
||||||
LOCAL_PATH := $(call my-dir)
|
LOCAL_PATH := $(call my-dir)
|
||||||
|
|
||||||
include $(CLEAR_VARS)
|
include $(CLEAR_VARS)
|
||||||
@ -21,21 +24,16 @@ LOCAL_SRC_FILES := \
|
|||||||
aecm_core.c
|
aecm_core.c
|
||||||
|
|
||||||
# Flags passed to both C and C++ files.
|
# Flags passed to both C and C++ files.
|
||||||
LOCAL_CFLAGS := \
|
LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS)
|
||||||
$(MY_WEBRTC_COMMON_DEFS)
|
|
||||||
|
|
||||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
|
||||||
LOCAL_SRC_FILES += \
|
|
||||||
aecm_core_neon.c
|
|
||||||
LOCAL_CFLAGS += \
|
|
||||||
$(MY_ARM_CFLAGS_NEON)
|
|
||||||
endif
|
|
||||||
|
|
||||||
LOCAL_C_INCLUDES := \
|
LOCAL_C_INCLUDES := \
|
||||||
$(LOCAL_PATH)/interface \
|
$(LOCAL_PATH)/interface \
|
||||||
$(LOCAL_PATH)/../utility \
|
$(LOCAL_PATH)/../utility \
|
||||||
$(LOCAL_PATH)/../../.. \
|
$(LOCAL_PATH)/../../.. \
|
||||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
$(LOCAL_PATH)/../../../common_audio/signal_processing/include \
|
||||||
|
$(LOCAL_PATH)/../../../system_wrappers/interface
|
||||||
|
|
||||||
|
LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers
|
||||||
|
|
||||||
LOCAL_SHARED_LIBRARIES := \
|
LOCAL_SHARED_LIBRARIES := \
|
||||||
libcutils \
|
libcutils \
|
||||||
@ -46,3 +44,31 @@ ifndef NDK_ROOT
|
|||||||
include external/stlport/libstlport.mk
|
include external/stlport/libstlport.mk
|
||||||
endif
|
endif
|
||||||
include $(BUILD_STATIC_LIBRARY)
|
include $(BUILD_STATIC_LIBRARY)
|
||||||
|
|
||||||
|
#########################
|
||||||
|
# Build the neon library.
|
||||||
|
|
||||||
|
include $(CLEAR_VARS)
|
||||||
|
|
||||||
|
LOCAL_ARM_MODE := arm
|
||||||
|
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
|
||||||
|
LOCAL_MODULE := libwebrtc_aecm_neon
|
||||||
|
LOCAL_MODULE_TAGS := optional
|
||||||
|
|
||||||
|
LOCAL_SRC_FILES := aecm_core_neon.c
|
||||||
|
|
||||||
|
# Flags passed to both C and C++ files.
|
||||||
|
LOCAL_CFLAGS := \
|
||||||
|
$(MY_WEBRTC_COMMON_DEFS) \
|
||||||
|
-mfpu=neon \
|
||||||
|
-flax-vector-conversions
|
||||||
|
|
||||||
|
LOCAL_C_INCLUDES := \
|
||||||
|
$(LOCAL_PATH)/interface \
|
||||||
|
$(LOCAL_PATH)/../../.. \
|
||||||
|
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
||||||
|
|
||||||
|
ifndef NDK_ROOT
|
||||||
|
include external/stlport/libstlport.mk
|
||||||
|
endif
|
||||||
|
include $(BUILD_STATIC_LIBRARY)
|
||||||
|
@ -13,8 +13,9 @@
|
|||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
#include "echo_control_mobile.h"
|
#include "cpu_features_wrapper.h"
|
||||||
#include "delay_estimator_wrapper.h"
|
#include "delay_estimator_wrapper.h"
|
||||||
|
#include "echo_control_mobile.h"
|
||||||
#include "ring_buffer.h"
|
#include "ring_buffer.h"
|
||||||
#include "typedefs.h"
|
#include "typedefs.h"
|
||||||
|
|
||||||
@ -263,6 +264,13 @@ static const uint16_t* AlignedFarend(AecmCore_t* self, int* far_q, int delay) {
|
|||||||
HANDLE logFile = NULL;
|
HANDLE logFile = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Declare function pointers.
|
||||||
|
CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
|
||||||
|
StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
|
||||||
|
ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
|
||||||
|
WindowAndFFT WebRtcAecm_WindowAndFFT;
|
||||||
|
InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
|
||||||
|
|
||||||
int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
|
int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
|
||||||
{
|
{
|
||||||
AecmCore_t *aecm = malloc(sizeof(AecmCore_t));
|
AecmCore_t *aecm = malloc(sizeof(AecmCore_t));
|
||||||
@ -346,6 +354,194 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const WebRtc_Word16* echo_pat
|
|||||||
aecm->mseChannelCount = 0;
|
aecm->mseChannelCount = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void WindowAndFFTC(WebRtc_Word16* fft,
|
||||||
|
const WebRtc_Word16* time_signal,
|
||||||
|
complex16_t* freq_signal,
|
||||||
|
int time_signal_scaling)
|
||||||
|
{
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
|
||||||
|
// FFT of signal
|
||||||
|
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
|
||||||
|
{
|
||||||
|
// Window time domain signal and insert into real part of
|
||||||
|
// transformation array |fft|
|
||||||
|
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||||
|
(time_signal[i] << time_signal_scaling),
|
||||||
|
WebRtcAecm_kSqrtHanning[i],
|
||||||
|
14);
|
||||||
|
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||||
|
(time_signal[i + PART_LEN] << time_signal_scaling),
|
||||||
|
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
||||||
|
14);
|
||||||
|
// Inserting zeros in imaginary parts not necessary since we
|
||||||
|
// initialized the array with all zeros
|
||||||
|
}
|
||||||
|
|
||||||
|
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||||
|
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
|
||||||
|
|
||||||
|
// Take only the first PART_LEN2 samples
|
||||||
|
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
|
||||||
|
{
|
||||||
|
freq_signal[i].real = fft[j];
|
||||||
|
|
||||||
|
// The imaginary part has to switch sign
|
||||||
|
freq_signal[i].imag = - fft[j+1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void InverseFFTAndWindowC(AecmCore_t* aecm,
|
||||||
|
WebRtc_Word16* fft,
|
||||||
|
complex16_t* efw,
|
||||||
|
WebRtc_Word16* output,
|
||||||
|
const WebRtc_Word16* nearendClean)
|
||||||
|
{
|
||||||
|
int i, j, outCFFT;
|
||||||
|
WebRtc_Word32 tmp32no1;
|
||||||
|
|
||||||
|
// Synthesis
|
||||||
|
for (i = 1; i < PART_LEN; i++)
|
||||||
|
{
|
||||||
|
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
|
||||||
|
fft[j] = efw[i].real;
|
||||||
|
|
||||||
|
// mirrored data, even
|
||||||
|
fft[PART_LEN4 - j] = efw[i].real;
|
||||||
|
fft[j + 1] = -efw[i].imag;
|
||||||
|
|
||||||
|
//mirrored data, odd
|
||||||
|
fft[PART_LEN4 - (j - 1)] = efw[i].imag;
|
||||||
|
}
|
||||||
|
fft[0] = efw[0].real;
|
||||||
|
fft[1] = -efw[0].imag;
|
||||||
|
|
||||||
|
fft[PART_LEN2] = efw[PART_LEN].real;
|
||||||
|
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
|
||||||
|
|
||||||
|
// inverse FFT, result should be scaled with outCFFT
|
||||||
|
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||||
|
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
|
||||||
|
|
||||||
|
//take only the real values and scale with outCFFT
|
||||||
|
for (i = 0; i < PART_LEN2; i++)
|
||||||
|
{
|
||||||
|
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
|
||||||
|
fft[i] = fft[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < PART_LEN; i++)
|
||||||
|
{
|
||||||
|
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||||
|
fft[i],
|
||||||
|
WebRtcAecm_kSqrtHanning[i],
|
||||||
|
14);
|
||||||
|
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
|
||||||
|
outCFFT - aecm->dfaCleanQDomain);
|
||||||
|
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
|
||||||
|
tmp32no1 + aecm->outBuf[i],
|
||||||
|
WEBRTC_SPL_WORD16_MIN);
|
||||||
|
output[i] = fft[i];
|
||||||
|
|
||||||
|
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
|
||||||
|
fft[PART_LEN + i],
|
||||||
|
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
||||||
|
14);
|
||||||
|
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
|
||||||
|
outCFFT - aecm->dfaCleanQDomain);
|
||||||
|
aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
|
||||||
|
WEBRTC_SPL_WORD16_MAX,
|
||||||
|
tmp32no1,
|
||||||
|
WEBRTC_SPL_WORD16_MIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef ARM_WINM_LOG_
|
||||||
|
// measure tick end
|
||||||
|
QueryPerformanceCounter((LARGE_INTEGER*)&end);
|
||||||
|
diff__ = ((end - start) * 1000) / (freq/1000);
|
||||||
|
milliseconds = (unsigned int)(diff__ & 0xffffffff);
|
||||||
|
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
|
||||||
|
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||||
|
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||||
|
if (nearendClean != NULL)
|
||||||
|
{
|
||||||
|
memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void CalcLinearEnergiesC(AecmCore_t* aecm,
|
||||||
|
const WebRtc_UWord16* far_spectrum,
|
||||||
|
WebRtc_Word32* echo_est,
|
||||||
|
WebRtc_UWord32* far_energy,
|
||||||
|
WebRtc_UWord32* echo_energy_adapt,
|
||||||
|
WebRtc_UWord32* echo_energy_stored)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
// Get energy for the delayed far end signal and estimated
|
||||||
|
// echo using both stored and adapted channels.
|
||||||
|
for (i = 0; i < PART_LEN1; i++)
|
||||||
|
{
|
||||||
|
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||||
|
far_spectrum[i]);
|
||||||
|
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
|
||||||
|
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
|
||||||
|
far_spectrum[i]);
|
||||||
|
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void StoreAdaptiveChannelC(AecmCore_t* aecm,
|
||||||
|
const WebRtc_UWord16* far_spectrum,
|
||||||
|
WebRtc_Word32* echo_est)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
// During startup we store the channel every block.
|
||||||
|
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
|
||||||
|
// Recalculate echo estimate
|
||||||
|
for (i = 0; i < PART_LEN; i += 4)
|
||||||
|
{
|
||||||
|
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||||
|
far_spectrum[i]);
|
||||||
|
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
|
||||||
|
far_spectrum[i + 1]);
|
||||||
|
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
|
||||||
|
far_spectrum[i + 2]);
|
||||||
|
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
|
||||||
|
far_spectrum[i + 3]);
|
||||||
|
}
|
||||||
|
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||||
|
far_spectrum[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generic C implementation behind the WebRtcAecm_ResetAdaptiveChannel pointer.
//
// Overwrites the adaptive channel with the stored one: channelAdapt16 gets a
// copy of channelStored, and channelAdapt32 gets each stored value shifted
// left by 16 bits.
//
// Input/output:
//   - aecm : Instance; channelStored is read, channelAdapt16 and
//            channelAdapt32 are written.
static void ResetAdaptiveChannelC(AecmCore_t* aecm)
{
    int bin;

    // The stored channel has a significantly lower MSE than the adaptive one for
    // two consecutive calculations. Reset the adaptive channel.
    memcpy(aecm->channelAdapt16, aecm->channelStored,
           sizeof(WebRtc_Word16) * PART_LEN1);

    // Restore the W32 channel for bins 0 .. PART_LEN inclusive.
    // PART_LEN is a multiple of four (asserted in WebRtcAecm_InitCore), so
    // this visits exactly the bins a four-way unrolled loop plus one trailing
    // element would.
    for (bin = 0; bin <= PART_LEN; ++bin)
    {
        aecm->channelAdapt32[bin] = WEBRTC_SPL_LSHIFT_W32(
            (WebRtc_Word32)aecm->channelStored[bin], 16);
    }
}
|
||||||
|
|
||||||
// WebRtcAecm_InitCore(...)
|
// WebRtcAecm_InitCore(...)
|
||||||
//
|
//
|
||||||
// This function initializes the AECM instance created with WebRtcAecm_CreateCore(...)
|
// This function initializes the AECM instance created with WebRtcAecm_CreateCore(...)
|
||||||
@ -463,6 +659,23 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
|
|||||||
|
|
||||||
assert(PART_LEN % 16 == 0);
|
assert(PART_LEN % 16 == 0);
|
||||||
|
|
||||||
|
// Initialize function pointers.
|
||||||
|
WebRtcAecm_WindowAndFFT = WindowAndFFTC;
|
||||||
|
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC;
|
||||||
|
WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesC;
|
||||||
|
WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelC;
|
||||||
|
WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelC;
|
||||||
|
|
||||||
|
#ifdef WEBRTC_DETECT_ARM_NEON
|
||||||
|
uint64_t features = WebRtc_GetCPUFeaturesARM();
|
||||||
|
if ((features & kCPUFeatureNEON) != 0)
|
||||||
|
{
|
||||||
|
WebRtcAecm_InitNeon();
|
||||||
|
}
|
||||||
|
#elif defined(WEBRTC_ARCH_ARM_NEON)
|
||||||
|
WebRtcAecm_InitNeon();
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1890,194 +2103,3 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far
|
|||||||
aecm->farBufReadPos += readLen;
|
aecm->farBufReadPos += readLen;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
|
||||||
|
|
||||||
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
|
|
||||||
const WebRtc_Word16* time_signal,
|
|
||||||
complex16_t* freq_signal,
|
|
||||||
int time_signal_scaling)
|
|
||||||
{
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
|
|
||||||
// FFT of signal
|
|
||||||
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
|
|
||||||
{
|
|
||||||
// Window time domain signal and insert into real part of
|
|
||||||
// transformation array |fft|
|
|
||||||
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
|
||||||
(time_signal[i] << time_signal_scaling),
|
|
||||||
WebRtcAecm_kSqrtHanning[i],
|
|
||||||
14);
|
|
||||||
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
|
||||||
(time_signal[i + PART_LEN] << time_signal_scaling),
|
|
||||||
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
|
||||||
14);
|
|
||||||
// Inserting zeros in imaginary parts not necessary since we
|
|
||||||
// initialized the array with all zeros
|
|
||||||
}
|
|
||||||
|
|
||||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
|
||||||
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
|
|
||||||
|
|
||||||
// Take only the first PART_LEN2 samples
|
|
||||||
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
|
|
||||||
{
|
|
||||||
freq_signal[i].real = fft[j];
|
|
||||||
|
|
||||||
// The imaginary part has to switch sign
|
|
||||||
freq_signal[i].imag = - fft[j+1];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
|
||||||
WebRtc_Word16* fft,
|
|
||||||
complex16_t* efw,
|
|
||||||
WebRtc_Word16* output,
|
|
||||||
const WebRtc_Word16* nearendClean)
|
|
||||||
{
|
|
||||||
int i, j, outCFFT;
|
|
||||||
WebRtc_Word32 tmp32no1;
|
|
||||||
|
|
||||||
// Synthesis
|
|
||||||
for (i = 1; i < PART_LEN; i++)
|
|
||||||
{
|
|
||||||
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
|
|
||||||
fft[j] = efw[i].real;
|
|
||||||
|
|
||||||
// mirrored data, even
|
|
||||||
fft[PART_LEN4 - j] = efw[i].real;
|
|
||||||
fft[j + 1] = -efw[i].imag;
|
|
||||||
|
|
||||||
//mirrored data, odd
|
|
||||||
fft[PART_LEN4 - (j - 1)] = efw[i].imag;
|
|
||||||
}
|
|
||||||
fft[0] = efw[0].real;
|
|
||||||
fft[1] = -efw[0].imag;
|
|
||||||
|
|
||||||
fft[PART_LEN2] = efw[PART_LEN].real;
|
|
||||||
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
|
|
||||||
|
|
||||||
// inverse FFT, result should be scaled with outCFFT
|
|
||||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
|
||||||
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
|
|
||||||
|
|
||||||
//take only the real values and scale with outCFFT
|
|
||||||
for (i = 0; i < PART_LEN2; i++)
|
|
||||||
{
|
|
||||||
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
|
|
||||||
fft[i] = fft[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < PART_LEN; i++)
|
|
||||||
{
|
|
||||||
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
|
||||||
fft[i],
|
|
||||||
WebRtcAecm_kSqrtHanning[i],
|
|
||||||
14);
|
|
||||||
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
|
|
||||||
outCFFT - aecm->dfaCleanQDomain);
|
|
||||||
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
|
|
||||||
tmp32no1 + aecm->outBuf[i],
|
|
||||||
WEBRTC_SPL_WORD16_MIN);
|
|
||||||
output[i] = fft[i];
|
|
||||||
|
|
||||||
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
|
|
||||||
fft[PART_LEN + i],
|
|
||||||
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
|
||||||
14);
|
|
||||||
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
|
|
||||||
outCFFT - aecm->dfaCleanQDomain);
|
|
||||||
aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
|
|
||||||
WEBRTC_SPL_WORD16_MAX,
|
|
||||||
tmp32no1,
|
|
||||||
WEBRTC_SPL_WORD16_MIN);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef ARM_WINM_LOG_
|
|
||||||
// measure tick end
|
|
||||||
QueryPerformanceCounter((LARGE_INTEGER*)&end);
|
|
||||||
diff__ = ((end - start) * 1000) / (freq/1000);
|
|
||||||
milliseconds = (unsigned int)(diff__ & 0xffffffff);
|
|
||||||
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
|
|
||||||
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
|
||||||
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
|
||||||
if (nearendClean != NULL)
|
|
||||||
{
|
|
||||||
memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
|
|
||||||
const WebRtc_UWord16* far_spectrum,
|
|
||||||
WebRtc_Word32* echo_est,
|
|
||||||
WebRtc_UWord32* far_energy,
|
|
||||||
WebRtc_UWord32* echo_energy_adapt,
|
|
||||||
WebRtc_UWord32* echo_energy_stored)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
// Get energy for the delayed far end signal and estimated
|
|
||||||
// echo using both stored and adapted channels.
|
|
||||||
for (i = 0; i < PART_LEN1; i++)
|
|
||||||
{
|
|
||||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
|
||||||
far_spectrum[i]);
|
|
||||||
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
|
|
||||||
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
|
|
||||||
far_spectrum[i]);
|
|
||||||
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
|
||||||
const WebRtc_UWord16* far_spectrum,
|
|
||||||
WebRtc_Word32* echo_est)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
// During startup we store the channel every block.
|
|
||||||
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
|
|
||||||
// Recalculate echo estimate
|
|
||||||
for (i = 0; i < PART_LEN; i += 4)
|
|
||||||
{
|
|
||||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
|
||||||
far_spectrum[i]);
|
|
||||||
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
|
|
||||||
far_spectrum[i + 1]);
|
|
||||||
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
|
|
||||||
far_spectrum[i + 2]);
|
|
||||||
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
|
|
||||||
far_spectrum[i + 3]);
|
|
||||||
}
|
|
||||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
|
||||||
far_spectrum[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
// The stored channel has a significantly lower MSE than the adaptive one for
|
|
||||||
// two consecutive calculations. Reset the adaptive channel.
|
|
||||||
memcpy(aecm->channelAdapt16, aecm->channelStored,
|
|
||||||
sizeof(WebRtc_Word16) * PART_LEN1);
|
|
||||||
// Restore the W32 channel
|
|
||||||
for (i = 0; i < PART_LEN; i += 4)
|
|
||||||
{
|
|
||||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
|
|
||||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
|
||||||
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
|
|
||||||
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
|
|
||||||
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
|
|
||||||
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
|
|
||||||
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
|
|
||||||
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
|
|
||||||
}
|
|
||||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
|
||||||
|
@ -332,32 +332,44 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co
|
|||||||
void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend,
|
void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend,
|
||||||
const int farLen, const int knownDelay);
|
const int farLen, const int knownDelay);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
// Some internal functions shared by ARM NEON and generic C code:
|
// Some function pointers, for internal functions shared by ARM NEON and
|
||||||
|
// generic C code.
|
||||||
//
|
//
|
||||||
|
typedef void (*CalcLinearEnergies)(
|
||||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
|
AecmCore_t* aecm,
|
||||||
const WebRtc_UWord16* far_spectrum,
|
const WebRtc_UWord16* far_spectrum,
|
||||||
WebRtc_Word32* echoEst,
|
WebRtc_Word32* echoEst,
|
||||||
WebRtc_UWord32* far_energy,
|
WebRtc_UWord32* far_energy,
|
||||||
WebRtc_UWord32* echo_energy_adapt,
|
WebRtc_UWord32* echo_energy_adapt,
|
||||||
WebRtc_UWord32* echo_energy_stored);
|
WebRtc_UWord32* echo_energy_stored);
|
||||||
|
extern CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
|
||||||
|
|
||||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
typedef void (*StoreAdaptiveChannel)(
|
||||||
|
AecmCore_t* aecm,
|
||||||
const WebRtc_UWord16* far_spectrum,
|
const WebRtc_UWord16* far_spectrum,
|
||||||
WebRtc_Word32* echo_est);
|
WebRtc_Word32* echo_est);
|
||||||
|
extern StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
|
||||||
|
|
||||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
|
typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm);
|
||||||
|
extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
|
||||||
|
|
||||||
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
|
typedef void (*WindowAndFFT)(
|
||||||
|
WebRtc_Word16* fft,
|
||||||
const WebRtc_Word16* time_signal,
|
const WebRtc_Word16* time_signal,
|
||||||
complex16_t* freq_signal,
|
complex16_t* freq_signal,
|
||||||
int time_signal_scaling);
|
int time_signal_scaling);
|
||||||
|
extern WindowAndFFT WebRtcAecm_WindowAndFFT;
|
||||||
|
|
||||||
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
typedef void (*InverseFFTAndWindow)(
|
||||||
WebRtc_Word16* fft,
|
AecmCore_t* aecm,
|
||||||
complex16_t* efw,
|
WebRtc_Word16* fft, complex16_t* efw,
|
||||||
WebRtc_Word16* output,
|
WebRtc_Word16* output,
|
||||||
const WebRtc_Word16* nearendClean);
|
const WebRtc_Word16* nearendClean);
|
||||||
|
extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
|
||||||
|
|
||||||
|
// Initialization of the above function pointers for ARM Neon.
|
||||||
|
void WebRtcAecm_InitNeon(void);
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -7,7 +7,6 @@
|
|||||||
* in the file PATENTS. All contributing project authors may
|
* in the file PATENTS. All contributing project authors may
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
|
|
||||||
|
|
||||||
#include "aecm_core.h"
|
#include "aecm_core.h"
|
||||||
|
|
||||||
@ -16,7 +15,7 @@
|
|||||||
|
|
||||||
|
|
||||||
// Square root of Hanning window in Q14.
|
// Square root of Hanning window in Q14.
|
||||||
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {
|
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = {
|
||||||
16384, 16373, 16354, 16325,
|
16384, 16373, 16354, 16325,
|
||||||
16286, 16237, 16179, 16111,
|
16286, 16237, 16179, 16111,
|
||||||
16034, 15947, 15851, 15746,
|
16034, 15947, 15851, 15746,
|
||||||
@ -35,18 +34,16 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8)))
|
|||||||
1594, 1196, 798, 399
|
1594, 1196, 798, 399
|
||||||
};
|
};
|
||||||
|
|
||||||
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
|
static void WindowAndFFTNeon(WebRtc_Word16* fft,
|
||||||
const WebRtc_Word16* time_signal,
|
const WebRtc_Word16* time_signal,
|
||||||
complex16_t* freq_signal,
|
complex16_t* freq_signal,
|
||||||
int time_signal_scaling)
|
int time_signal_scaling) {
|
||||||
{
|
|
||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
|
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
|
||||||
__asm__("vmov.i16 d21, #0" ::: "d21");
|
__asm__("vmov.i16 d21, #0" ::: "d21");
|
||||||
|
|
||||||
for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
|
for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
|
||||||
{
|
|
||||||
int16x4_t tmp16x4_0;
|
int16x4_t tmp16x4_0;
|
||||||
int16x4_t tmp16x4_1;
|
int16x4_t tmp16x4_1;
|
||||||
int32x4_t tmp32x4_0;
|
int32x4_t tmp32x4_0;
|
||||||
@ -80,8 +77,7 @@ void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
|
|||||||
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
|
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
|
||||||
|
|
||||||
// Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
|
// Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
|
||||||
for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16)
|
for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) {
|
||||||
{
|
|
||||||
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
|
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
|
||||||
__asm__("vneg.s16 d22, d22" : : : "q10");
|
__asm__("vneg.s16 d22, d22" : : : "q10");
|
||||||
__asm__("vneg.s16 d23, d23" : : : "q11");
|
__asm__("vneg.s16 d23, d23" : : : "q11");
|
||||||
@ -90,18 +86,16 @@ void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||||
WebRtc_Word16* fft,
|
WebRtc_Word16* fft,
|
||||||
complex16_t* efw,
|
complex16_t* efw,
|
||||||
WebRtc_Word16* output,
|
WebRtc_Word16* output,
|
||||||
const WebRtc_Word16* nearendClean)
|
const WebRtc_Word16* nearendClean) {
|
||||||
{
|
|
||||||
int i, j, outCFFT;
|
int i, j, outCFFT;
|
||||||
WebRtc_Word32 tmp32no1;
|
WebRtc_Word32 tmp32no1;
|
||||||
|
|
||||||
// Synthesis
|
// Synthesis
|
||||||
for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
|
for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
|
||||||
{
|
|
||||||
// We overwrite two more elements in fft[], but it's ok.
|
// We overwrite two more elements in fft[], but it's ok.
|
||||||
__asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
|
__asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
|
||||||
__asm__("vmov q11, q10" : : : "q10", "q11");
|
__asm__("vmov q11, q10" : : : "q10", "q11");
|
||||||
@ -121,8 +115,7 @@ void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
|||||||
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
|
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
|
||||||
|
|
||||||
// Take only the real values and scale with outCFFT.
|
// Take only the real values and scale with outCFFT.
|
||||||
for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16)
|
for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) {
|
||||||
{
|
|
||||||
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
|
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
|
||||||
__asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
|
__asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
|
||||||
}
|
}
|
||||||
@ -130,8 +123,7 @@ void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
|||||||
int32x4_t tmp32x4_2;
|
int32x4_t tmp32x4_2;
|
||||||
__asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
|
__asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
|
||||||
(outCFFT - aecm->dfaCleanQDomain)));
|
(outCFFT - aecm->dfaCleanQDomain)));
|
||||||
for (i = 0; i < PART_LEN; i += 4)
|
for (i = 0; i < PART_LEN; i += 4) {
|
||||||
{
|
|
||||||
int16x4_t tmp16x4_0;
|
int16x4_t tmp16x4_0;
|
||||||
int16x4_t tmp16x4_1;
|
int16x4_t tmp16x4_1;
|
||||||
int32x4_t tmp32x4_0;
|
int32x4_t tmp32x4_0;
|
||||||
@ -174,22 +166,19 @@ void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Copy the current block to the old position (outBuf is shifted elsewhere).
|
// Copy the current block to the old position (outBuf is shifted elsewhere).
|
||||||
for (i = 0; i < PART_LEN; i += 16)
|
for (i = 0; i < PART_LEN; i += 16) {
|
||||||
{
|
|
||||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||||
"r"(&aecm->xBuf[i + PART_LEN]) : "q10");
|
"r"(&aecm->xBuf[i + PART_LEN]) : "q10");
|
||||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
|
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
|
||||||
}
|
}
|
||||||
for (i = 0; i < PART_LEN; i += 16)
|
for (i = 0; i < PART_LEN; i += 16) {
|
||||||
{
|
|
||||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||||
"r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
|
"r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
|
||||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||||
"r"(&aecm->dBufNoisy[i]): "q10");
|
"r"(&aecm->dBufNoisy[i]): "q10");
|
||||||
}
|
}
|
||||||
if (nearendClean != NULL) {
|
if (nearendClean != NULL) {
|
||||||
for (i = 0; i < PART_LEN; i += 16)
|
for (i = 0; i < PART_LEN; i += 16) {
|
||||||
{
|
|
||||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||||
"r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
|
"r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
|
||||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||||
@ -198,13 +187,12 @@ void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
|
static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
|
||||||
const WebRtc_UWord16* far_spectrum,
|
const WebRtc_UWord16* far_spectrum,
|
||||||
WebRtc_Word32* echo_est,
|
WebRtc_Word32* echo_est,
|
||||||
WebRtc_UWord32* far_energy,
|
WebRtc_UWord32* far_energy,
|
||||||
WebRtc_UWord32* echo_energy_adapt,
|
WebRtc_UWord32* echo_energy_adapt,
|
||||||
WebRtc_UWord32* echo_energy_stored)
|
WebRtc_UWord32* echo_energy_stored) {
|
||||||
{
|
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
register WebRtc_UWord32 far_energy_r;
|
register WebRtc_UWord32 far_energy_r;
|
||||||
@ -216,8 +204,7 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
|
|||||||
__asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
|
__asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
|
||||||
__asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
|
__asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
|
||||||
|
|
||||||
for(i = 0; i < PART_LEN -7; i += 8)
|
for (i = 0; i < PART_LEN - 7; i += 8) {
|
||||||
{
|
|
||||||
// far_energy += (WebRtc_UWord32)(far_spectrum[i]);
|
// far_energy += (WebRtc_UWord32)(far_spectrum[i]);
|
||||||
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
||||||
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
|
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
|
||||||
@ -264,16 +251,14 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
|
|||||||
aecm->channelAdapt16[i], far_spectrum[i]);
|
aecm->channelAdapt16[i], far_spectrum[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
|
||||||
const WebRtc_UWord16* far_spectrum,
|
const WebRtc_UWord16* far_spectrum,
|
||||||
WebRtc_Word32* echo_est)
|
WebRtc_Word32* echo_est) {
|
||||||
{
|
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
// During startup we store the channel every block.
|
// During startup we store the channel every block.
|
||||||
// Recalculate echo estimate.
|
// Recalculate echo estimate.
|
||||||
for(i = 0; i < PART_LEN -7; i += 8)
|
for (i = 0; i < PART_LEN - 7; i += 8) {
|
||||||
{
|
|
||||||
// aecm->channelStored[i] = aecm->channelAdapt16[i];
|
// aecm->channelStored[i] = aecm->channelAdapt16[i];
|
||||||
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||||
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
||||||
@ -288,12 +273,10 @@ void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
|||||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
|
static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
|
||||||
{
|
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for(i = 0; i < PART_LEN -7; i += 8)
|
for (i = 0; i < PART_LEN - 7; i += 8) {
|
||||||
{
|
|
||||||
// aecm->channelAdapt16[i] = aecm->channelStored[i];
|
// aecm->channelAdapt16[i] = aecm->channelStored[i];
|
||||||
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
|
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
|
||||||
// aecm->channelStored[i], 16);
|
// aecm->channelStored[i], 16);
|
||||||
@ -311,4 +294,10 @@ void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
|
|||||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
|
void WebRtcAecm_InitNeon(void) {
|
||||||
|
WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
|
||||||
|
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
|
||||||
|
WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
|
||||||
|
WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
|
||||||
|
WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
|
||||||
|
}
|
||||||
|
@ -6,6 +6,8 @@
|
|||||||
# in the file PATENTS. All contributing project authors may
|
# in the file PATENTS. All contributing project authors may
|
||||||
# be found in the AUTHORS file in the root of the source tree.
|
# be found in the AUTHORS file in the root of the source tree.
|
||||||
|
|
||||||
|
#############################
|
||||||
|
# Build the non-neon library.
|
||||||
LOCAL_PATH := $(call my-dir)
|
LOCAL_PATH := $(call my-dir)
|
||||||
|
|
||||||
include $(CLEAR_VARS)
|
include $(CLEAR_VARS)
|
||||||
@ -20,25 +22,20 @@ LOCAL_SRC_FILES := \
|
|||||||
noise_suppression_x.c \
|
noise_suppression_x.c \
|
||||||
nsx_core.c
|
nsx_core.c
|
||||||
|
|
||||||
# floating point
|
# Files for floating point.
|
||||||
# noise_suppression.c ns_core.c
|
# noise_suppression.c ns_core.c
|
||||||
|
|
||||||
# Flags passed to both C and C++ files.
|
# Flags passed to both C and C++ files.
|
||||||
LOCAL_CFLAGS := \
|
LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS)
|
||||||
$(MY_WEBRTC_COMMON_DEFS)
|
|
||||||
|
|
||||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
|
||||||
LOCAL_SRC_FILES += \
|
|
||||||
nsx_core_neon.c
|
|
||||||
LOCAL_CFLAGS += \
|
|
||||||
$(MY_ARM_CFLAGS_NEON)
|
|
||||||
endif
|
|
||||||
|
|
||||||
LOCAL_C_INCLUDES := \
|
LOCAL_C_INCLUDES := \
|
||||||
$(LOCAL_PATH)/interface \
|
$(LOCAL_PATH)/interface \
|
||||||
$(LOCAL_PATH)/../utility \
|
$(LOCAL_PATH)/../utility \
|
||||||
$(LOCAL_PATH)/../../.. \
|
$(LOCAL_PATH)/../../.. \
|
||||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
$(LOCAL_PATH)/../../../common_audio/signal_processing/include \
|
||||||
|
$(LOCAL_PATH)/../../../system_wrappers/interface
|
||||||
|
|
||||||
|
LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers
|
||||||
|
|
||||||
LOCAL_SHARED_LIBRARIES := \
|
LOCAL_SHARED_LIBRARIES := \
|
||||||
libcutils \
|
libcutils \
|
||||||
@ -49,3 +46,31 @@ ifndef NDK_ROOT
|
|||||||
include external/stlport/libstlport.mk
|
include external/stlport/libstlport.mk
|
||||||
endif
|
endif
|
||||||
include $(BUILD_STATIC_LIBRARY)
|
include $(BUILD_STATIC_LIBRARY)
|
||||||
|
|
||||||
|
#############################
|
||||||
|
# Build the neon library.
|
||||||
|
|
||||||
|
include $(CLEAR_VARS)
|
||||||
|
|
||||||
|
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
|
||||||
|
LOCAL_MODULE := libwebrtc_ns_neon
|
||||||
|
LOCAL_MODULE_TAGS := optional
|
||||||
|
LOCAL_GENERATED_SOURCES :=
|
||||||
|
|
||||||
|
LOCAL_SRC_FILES := nsx_core_neon.c
|
||||||
|
|
||||||
|
# Flags passed to both C and C++ files.
|
||||||
|
LOCAL_CFLAGS := \
|
||||||
|
$(MY_WEBRTC_COMMON_DEFS) \
|
||||||
|
-mfpu=neon \
|
||||||
|
-flax-vector-conversions
|
||||||
|
|
||||||
|
LOCAL_C_INCLUDES := \
|
||||||
|
$(LOCAL_PATH)/interface \
|
||||||
|
$(LOCAL_PATH)/../../.. \
|
||||||
|
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
||||||
|
|
||||||
|
ifndef NDK_ROOT
|
||||||
|
include external/stlport/libstlport.mk
|
||||||
|
endif
|
||||||
|
include $(BUILD_STATIC_LIBRARY)
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "cpu_features_wrapper.h"
|
||||||
#include "nsx_core.h"
|
#include "nsx_core.h"
|
||||||
|
|
||||||
// Skip first frequency bins during estimation. (0 <= value < 64)
|
// Skip first frequency bins during estimation. (0 <= value < 64)
|
||||||
@ -426,6 +427,271 @@ static const WebRtc_Word16 kDeterminantEstMatrix[66] = {
|
|||||||
355, 330
|
355, 330
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Declare function pointers.
|
||||||
|
NoiseEstimation WebRtcNsx_NoiseEstimation;
|
||||||
|
PrepareSpectrum WebRtcNsx_PrepareSpectrum;
|
||||||
|
SynthesisUpdate WebRtcNsx_SynthesisUpdate;
|
||||||
|
AnalysisUpdate WebRtcNsx_AnalysisUpdate;
|
||||||
|
Denormalize WebRtcNsx_Denormalize;
|
||||||
|
CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
|
||||||
|
|
||||||
|
// Update the noise estimation information.
|
||||||
|
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
||||||
|
WebRtc_Word32 tmp32no1 = 0;
|
||||||
|
WebRtc_Word32 tmp32no2 = 0;
|
||||||
|
WebRtc_Word16 tmp16 = 0;
|
||||||
|
const WebRtc_Word16 kExp2Const = 11819; // Q13
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
|
||||||
|
inst->magnLen);
|
||||||
|
// Guarantee a Q-domain as high as possible and still fit in int16
|
||||||
|
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||||
|
kExp2Const, tmp16, 21);
|
||||||
|
for (i = 0; i < inst->magnLen; i++) {
|
||||||
|
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
|
||||||
|
// in Q21
|
||||||
|
tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
|
||||||
|
inst->noiseEstLogQuantile[offset + i]);
|
||||||
|
tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
|
||||||
|
tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
|
||||||
|
tmp16 -= 21;// shift 21 to get result in Q0
|
||||||
|
tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
|
||||||
|
if (tmp16 < 0) {
|
||||||
|
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
|
||||||
|
} else {
|
||||||
|
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
|
||||||
|
}
|
||||||
|
inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Noise Estimation
|
||||||
|
static void NoiseEstimationC(NsxInst_t* inst,
|
||||||
|
uint16_t* magn,
|
||||||
|
uint32_t* noise,
|
||||||
|
int16_t* q_noise) {
|
||||||
|
WebRtc_Word32 numerator = FACTOR_Q16;
|
||||||
|
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
|
||||||
|
WebRtc_Word16 countProd, delta, zeros, frac;
|
||||||
|
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
|
||||||
|
const int16_t log2_const = 22713; // Q15
|
||||||
|
const int16_t width_factor = 21845;
|
||||||
|
|
||||||
|
int i, s, offset;
|
||||||
|
|
||||||
|
tabind = inst->stages - inst->normData;
|
||||||
|
assert(tabind < 9);
|
||||||
|
assert(tabind > -9);
|
||||||
|
if (tabind < 0) {
|
||||||
|
logval = -WebRtcNsx_kLogTable[-tabind];
|
||||||
|
} else {
|
||||||
|
logval = WebRtcNsx_kLogTable[tabind];
|
||||||
|
}
|
||||||
|
|
||||||
|
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
|
||||||
|
// magn is in Q(-stages), and the real lmagn values are:
|
||||||
|
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
|
||||||
|
// lmagn in Q8
|
||||||
|
for (i = 0; i < inst->magnLen; i++) {
|
||||||
|
if (magn[i]) {
|
||||||
|
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
|
||||||
|
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
|
||||||
|
& 0x7FFFFFFF) >> 23);
|
||||||
|
// log2(magn(i))
|
||||||
|
assert(frac < 256);
|
||||||
|
log2 = (WebRtc_Word16)(((31 - zeros) << 8)
|
||||||
|
+ WebRtcNsx_kLogTableFrac[frac]);
|
||||||
|
// log2(magn(i))*log(2)
|
||||||
|
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
|
||||||
|
// + log(2^stages)
|
||||||
|
lmagn[i] += logval;
|
||||||
|
} else {
|
||||||
|
lmagn[i] = logval;//0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// loop over simultaneous estimates
|
||||||
|
for (s = 0; s < SIMULT; s++) {
|
||||||
|
offset = s * inst->magnLen;
|
||||||
|
|
||||||
|
// Get counter values from state
|
||||||
|
counter = inst->noiseEstCounter[s];
|
||||||
|
assert(counter < 201);
|
||||||
|
countDiv = WebRtcNsx_kCounterDiv[counter];
|
||||||
|
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
|
||||||
|
|
||||||
|
// quant_est(...)
|
||||||
|
for (i = 0; i < inst->magnLen; i++) {
|
||||||
|
// compute delta
|
||||||
|
if (inst->noiseEstDensity[offset + i] > 512) {
|
||||||
|
delta = WebRtcSpl_DivW32W16ResW16(numerator,
|
||||||
|
inst->noiseEstDensity[offset + i]);
|
||||||
|
} else {
|
||||||
|
delta = FACTOR_Q7;
|
||||||
|
if (inst->blockIndex < END_STARTUP_LONG) {
|
||||||
|
// Smaller step size during startup. This prevents from using
|
||||||
|
// unrealistic values causing overflow.
|
||||||
|
delta = FACTOR_Q7_STARTUP;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update log quantile estimate
|
||||||
|
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
|
||||||
|
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
|
||||||
|
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
|
||||||
|
// CounterDiv=1/(inst->counter[s]+1) in Q15
|
||||||
|
tmp16 += 2;
|
||||||
|
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
|
||||||
|
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
|
||||||
|
} else {
|
||||||
|
tmp16 += 1;
|
||||||
|
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
|
||||||
|
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
|
||||||
|
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
|
||||||
|
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
|
||||||
|
if (inst->noiseEstLogQuantile[offset + i] < logval) {
|
||||||
|
// This is the smallest fixed point representation we can
|
||||||
|
// have, hence we limit the output.
|
||||||
|
inst->noiseEstLogQuantile[offset + i] = logval;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update density estimate
|
||||||
|
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
|
||||||
|
< WIDTH_Q8) {
|
||||||
|
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||||
|
inst->noiseEstDensity[offset + i], countProd, 15);
|
||||||
|
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||||
|
width_factor, countDiv, 15);
|
||||||
|
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
|
||||||
|
}
|
||||||
|
} // end loop over magnitude spectrum
|
||||||
|
|
||||||
|
if (counter >= END_STARTUP_LONG) {
|
||||||
|
inst->noiseEstCounter[s] = 0;
|
||||||
|
if (inst->blockIndex >= END_STARTUP_LONG) {
|
||||||
|
UpdateNoiseEstimate(inst, offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inst->noiseEstCounter[s]++;
|
||||||
|
|
||||||
|
} // end loop over simultaneous estimates
|
||||||
|
|
||||||
|
// Sequentially update the noise during startup
|
||||||
|
if (inst->blockIndex < END_STARTUP_LONG) {
|
||||||
|
UpdateNoiseEstimate(inst, offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < inst->magnLen; i++) {
|
||||||
|
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
|
||||||
|
}
|
||||||
|
(*q_noise) = (WebRtc_Word16)inst->qNoise;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter the data in the frequency domain, and create spectrum.
|
||||||
|
static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) {
|
||||||
|
int i = 0, j = 0;
|
||||||
|
int16_t tmp16 = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < inst->magnLen; i++) {
|
||||||
|
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
|
||||||
|
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
|
||||||
|
inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
|
||||||
|
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
|
||||||
|
}
|
||||||
|
|
||||||
|
freq_buf[0] = inst->real[0];
|
||||||
|
freq_buf[1] = -inst->imag[0];
|
||||||
|
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
|
||||||
|
tmp16 = (inst->anaLen << 1) - j;
|
||||||
|
freq_buf[j] = inst->real[i];
|
||||||
|
freq_buf[j + 1] = -inst->imag[i];
|
||||||
|
freq_buf[tmp16] = inst->real[i];
|
||||||
|
freq_buf[tmp16 + 1] = inst->imag[i];
|
||||||
|
}
|
||||||
|
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
|
||||||
|
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Denormalize the input buffer.
|
||||||
|
static __inline void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) {
|
||||||
|
int i = 0, j = 0;
|
||||||
|
int32_t tmp32 = 0;
|
||||||
|
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
|
||||||
|
tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
|
||||||
|
factor - inst->normData);
|
||||||
|
inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// For the noise supression process, synthesis, read out fully processed
|
||||||
|
// segment, and update synthesis buffer.
|
||||||
|
static void SynthesisUpdateC(NsxInst_t* inst,
|
||||||
|
int16_t* out_frame,
|
||||||
|
int16_t gain_factor) {
|
||||||
|
int i = 0;
|
||||||
|
int16_t tmp16a = 0;
|
||||||
|
int16_t tmp16b = 0;
|
||||||
|
int32_t tmp32 = 0;
|
||||||
|
|
||||||
|
// synthesis
|
||||||
|
for (i = 0; i < inst->anaLen; i++) {
|
||||||
|
tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||||
|
inst->window[i], inst->real[i], 14); // Q0, window in Q14
|
||||||
|
tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
|
||||||
|
// Down shift with rounding
|
||||||
|
tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
|
||||||
|
inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
|
||||||
|
tmp16b); // Q0
|
||||||
|
}
|
||||||
|
|
||||||
|
// read out fully processed segment
|
||||||
|
for (i = 0; i < inst->blockLen10ms; i++) {
|
||||||
|
out_frame[i] = inst->synthesisBuffer[i]; // Q0
|
||||||
|
}
|
||||||
|
|
||||||
|
// update synthesis buffer
|
||||||
|
WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
|
||||||
|
inst->synthesisBuffer + inst->blockLen10ms,
|
||||||
|
inst->anaLen - inst->blockLen10ms);
|
||||||
|
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
|
||||||
|
+ inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update analysis buffer for lower band, and window data before FFT.
|
||||||
|
static void AnalysisUpdateC(NsxInst_t* inst,
|
||||||
|
int16_t* out,
|
||||||
|
int16_t* new_speech) {
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
// For lower band update analysis buffer.
|
||||||
|
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
|
||||||
|
inst->analysisBuffer + inst->blockLen10ms,
|
||||||
|
inst->anaLen - inst->blockLen10ms);
|
||||||
|
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
|
||||||
|
+ inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
|
||||||
|
|
||||||
|
// Window data before FFT.
|
||||||
|
for (i = 0; i < inst->anaLen; i++) {
|
||||||
|
out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||||
|
inst->window[i], inst->analysisBuffer[i], 14); // Q0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a complex number buffer (out[]) as the intput (in[]) interleaved with
|
||||||
|
// zeros, and normalize it.
|
||||||
|
static __inline void CreateComplexBufferC(NsxInst_t* inst,
|
||||||
|
int16_t* in,
|
||||||
|
int16_t* out) {
|
||||||
|
int i = 0, j = 0;
|
||||||
|
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
|
||||||
|
out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
|
||||||
|
out[j + 1] = 0; // Insert zeros in imaginary part
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst,
|
void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst,
|
||||||
WebRtc_Word16 pink_noise_exp_avg,
|
WebRtc_Word16 pink_noise_exp_avg,
|
||||||
WebRtc_Word32 pink_noise_num_avg,
|
WebRtc_Word32 pink_noise_num_avg,
|
||||||
@ -600,6 +866,24 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs) {
|
|||||||
inst->file5 = fopen("file5.pcm", "wb");
|
inst->file5 = fopen("file5.pcm", "wb");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Initialize function pointers.
|
||||||
|
WebRtcNsx_NoiseEstimation = NoiseEstimationC;
|
||||||
|
WebRtcNsx_PrepareSpectrum = PrepareSpectrumC;
|
||||||
|
WebRtcNsx_SynthesisUpdate = SynthesisUpdateC;
|
||||||
|
WebRtcNsx_AnalysisUpdate = AnalysisUpdateC;
|
||||||
|
WebRtcNsx_Denormalize = DenormalizeC;
|
||||||
|
WebRtcNsx_CreateComplexBuffer = CreateComplexBufferC;
|
||||||
|
|
||||||
|
#ifdef WEBRTC_DETECT_ARM_NEON
|
||||||
|
uint64_t features = WebRtc_GetCPUFeaturesARM();
|
||||||
|
if ((features & kCPUFeatureNEON) != 0)
|
||||||
|
{
|
||||||
|
WebRtcNsx_InitNeon();
|
||||||
|
}
|
||||||
|
#elif defined(WEBRTC_ARCH_ARM_NEON)
|
||||||
|
WebRtcNsx_InitNeon();
|
||||||
|
#endif
|
||||||
|
|
||||||
inst->initFlag = 1;
|
inst->initFlag = 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
@ -2157,263 +2441,4 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
|
|
||||||
|
|
||||||
// Update the noise estimation information.
|
|
||||||
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
|
||||||
WebRtc_Word32 tmp32no1 = 0;
|
|
||||||
WebRtc_Word32 tmp32no2 = 0;
|
|
||||||
WebRtc_Word16 tmp16 = 0;
|
|
||||||
const WebRtc_Word16 kExp2Const = 11819; // Q13
|
|
||||||
|
|
||||||
int i = 0;
|
|
||||||
|
|
||||||
tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
|
|
||||||
inst->magnLen);
|
|
||||||
// Guarantee a Q-domain as high as possible and still fit in int16
|
|
||||||
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
|
||||||
kExp2Const, tmp16, 21);
|
|
||||||
for (i = 0; i < inst->magnLen; i++) {
|
|
||||||
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
|
|
||||||
// in Q21
|
|
||||||
tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
|
|
||||||
inst->noiseEstLogQuantile[offset + i]);
|
|
||||||
tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
|
|
||||||
tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
|
|
||||||
tmp16 -= 21;// shift 21 to get result in Q0
|
|
||||||
tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
|
|
||||||
if (tmp16 < 0) {
|
|
||||||
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
|
|
||||||
} else {
|
|
||||||
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
|
|
||||||
}
|
|
||||||
inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Noise Estimation
|
|
||||||
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
|
||||||
uint16_t* magn,
|
|
||||||
uint32_t* noise,
|
|
||||||
int16_t* q_noise) {
|
|
||||||
WebRtc_Word32 numerator = FACTOR_Q16;
|
|
||||||
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
|
|
||||||
WebRtc_Word16 countProd, delta, zeros, frac;
|
|
||||||
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
|
|
||||||
const int16_t log2_const = 22713; // Q15
|
|
||||||
const int16_t width_factor = 21845;
|
|
||||||
|
|
||||||
int i, s, offset;
|
|
||||||
|
|
||||||
tabind = inst->stages - inst->normData;
|
|
||||||
assert(tabind < 9);
|
|
||||||
assert(tabind > -9);
|
|
||||||
if (tabind < 0) {
|
|
||||||
logval = -WebRtcNsx_kLogTable[-tabind];
|
|
||||||
} else {
|
|
||||||
logval = WebRtcNsx_kLogTable[tabind];
|
|
||||||
}
|
|
||||||
|
|
||||||
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
|
|
||||||
// magn is in Q(-stages), and the real lmagn values are:
|
|
||||||
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
|
|
||||||
// lmagn in Q8
|
|
||||||
for (i = 0; i < inst->magnLen; i++) {
|
|
||||||
if (magn[i]) {
|
|
||||||
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
|
|
||||||
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
|
|
||||||
& 0x7FFFFFFF) >> 23);
|
|
||||||
// log2(magn(i))
|
|
||||||
assert(frac < 256);
|
|
||||||
log2 = (WebRtc_Word16)(((31 - zeros) << 8)
|
|
||||||
+ WebRtcNsx_kLogTableFrac[frac]);
|
|
||||||
// log2(magn(i))*log(2)
|
|
||||||
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
|
|
||||||
// + log(2^stages)
|
|
||||||
lmagn[i] += logval;
|
|
||||||
} else {
|
|
||||||
lmagn[i] = logval;//0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over simultaneous estimates
|
|
||||||
for (s = 0; s < SIMULT; s++) {
|
|
||||||
offset = s * inst->magnLen;
|
|
||||||
|
|
||||||
// Get counter values from state
|
|
||||||
counter = inst->noiseEstCounter[s];
|
|
||||||
assert(counter < 201);
|
|
||||||
countDiv = WebRtcNsx_kCounterDiv[counter];
|
|
||||||
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
|
|
||||||
|
|
||||||
// quant_est(...)
|
|
||||||
for (i = 0; i < inst->magnLen; i++) {
|
|
||||||
// compute delta
|
|
||||||
if (inst->noiseEstDensity[offset + i] > 512) {
|
|
||||||
delta = WebRtcSpl_DivW32W16ResW16(numerator,
|
|
||||||
inst->noiseEstDensity[offset + i]);
|
|
||||||
} else {
|
|
||||||
delta = FACTOR_Q7;
|
|
||||||
if (inst->blockIndex < END_STARTUP_LONG) {
|
|
||||||
// Smaller step size during startup. This prevents from using
|
|
||||||
// unrealistic values causing overflow.
|
|
||||||
delta = FACTOR_Q7_STARTUP;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// update log quantile estimate
|
|
||||||
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
|
|
||||||
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
|
|
||||||
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
|
|
||||||
// CounterDiv=1/(inst->counter[s]+1) in Q15
|
|
||||||
tmp16 += 2;
|
|
||||||
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
|
|
||||||
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
|
|
||||||
} else {
|
|
||||||
tmp16 += 1;
|
|
||||||
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
|
|
||||||
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
|
|
||||||
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
|
|
||||||
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
|
|
||||||
if (inst->noiseEstLogQuantile[offset + i] < logval) {
|
|
||||||
// This is the smallest fixed point representation we can
|
|
||||||
// have, hence we limit the output.
|
|
||||||
inst->noiseEstLogQuantile[offset + i] = logval;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// update density estimate
|
|
||||||
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
|
|
||||||
< WIDTH_Q8) {
|
|
||||||
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
|
||||||
inst->noiseEstDensity[offset + i], countProd, 15);
|
|
||||||
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
|
||||||
width_factor, countDiv, 15);
|
|
||||||
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
|
|
||||||
}
|
|
||||||
} // end loop over magnitude spectrum
|
|
||||||
|
|
||||||
if (counter >= END_STARTUP_LONG) {
|
|
||||||
inst->noiseEstCounter[s] = 0;
|
|
||||||
if (inst->blockIndex >= END_STARTUP_LONG) {
|
|
||||||
UpdateNoiseEstimate(inst, offset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
inst->noiseEstCounter[s]++;
|
|
||||||
|
|
||||||
} // end loop over simultaneous estimates
|
|
||||||
|
|
||||||
// Sequentially update the noise during startup
|
|
||||||
if (inst->blockIndex < END_STARTUP_LONG) {
|
|
||||||
UpdateNoiseEstimate(inst, offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < inst->magnLen; i++) {
|
|
||||||
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
|
|
||||||
}
|
|
||||||
(*q_noise) = (WebRtc_Word16)inst->qNoise;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Filter the data in the frequency domain, and create spectrum.
|
|
||||||
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
|
||||||
int i = 0, j = 0;
|
|
||||||
int16_t tmp16 = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < inst->magnLen; i++) {
|
|
||||||
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
|
|
||||||
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
|
|
||||||
inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
|
|
||||||
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
|
|
||||||
}
|
|
||||||
|
|
||||||
freq_buf[0] = inst->real[0];
|
|
||||||
freq_buf[1] = -inst->imag[0];
|
|
||||||
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
|
|
||||||
tmp16 = (inst->anaLen << 1) - j;
|
|
||||||
freq_buf[j] = inst->real[i];
|
|
||||||
freq_buf[j + 1] = -inst->imag[i];
|
|
||||||
freq_buf[tmp16] = inst->real[i];
|
|
||||||
freq_buf[tmp16 + 1] = inst->imag[i];
|
|
||||||
}
|
|
||||||
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
|
|
||||||
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Denormalize the input buffer.
|
|
||||||
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
|
|
||||||
int i = 0, j = 0;
|
|
||||||
int32_t tmp32 = 0;
|
|
||||||
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
|
|
||||||
tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
|
|
||||||
factor - inst->normData);
|
|
||||||
inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// For the noise supression process, synthesis, read out fully processed
|
|
||||||
// segment, and update synthesis buffer.
|
|
||||||
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
|
||||||
int16_t* out_frame,
|
|
||||||
int16_t gain_factor) {
|
|
||||||
int i = 0;
|
|
||||||
int16_t tmp16a = 0;
|
|
||||||
int16_t tmp16b = 0;
|
|
||||||
int32_t tmp32 = 0;
|
|
||||||
|
|
||||||
// synthesis
|
|
||||||
for (i = 0; i < inst->anaLen; i++) {
|
|
||||||
tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
|
||||||
inst->window[i], inst->real[i], 14); // Q0, window in Q14
|
|
||||||
tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
|
|
||||||
// Down shift with rounding
|
|
||||||
tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
|
|
||||||
inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
|
|
||||||
tmp16b); // Q0
|
|
||||||
}
|
|
||||||
|
|
||||||
// read out fully processed segment
|
|
||||||
for (i = 0; i < inst->blockLen10ms; i++) {
|
|
||||||
out_frame[i] = inst->synthesisBuffer[i]; // Q0
|
|
||||||
}
|
|
||||||
|
|
||||||
// update synthesis buffer
|
|
||||||
WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
|
|
||||||
inst->synthesisBuffer + inst->blockLen10ms,
|
|
||||||
inst->anaLen - inst->blockLen10ms);
|
|
||||||
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
|
|
||||||
+ inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update analysis buffer for lower band, and window data before FFT.
|
|
||||||
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
                              int16_t* out,
                              int16_t* new_speech) {
  int n;

  // Drop the oldest blockLen10ms samples by moving the rest to the front ...
  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
                        inst->analysisBuffer + inst->blockLen10ms,
                        inst->anaLen - inst->blockLen10ms);
  // ... then append the new lower-band samples at the end of the buffer.
  WEBRTC_SPL_MEMCPY_W16(
      inst->analysisBuffer + inst->anaLen - inst->blockLen10ms,
      new_speech,
      inst->blockLen10ms);

  // Apply the analysis window (Q14) ahead of the FFT; the result is Q0.
  for (n = 0; n < inst->anaLen; ++n) {
    out[n] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
        inst->window[n], inst->analysisBuffer[n], 14);
  }
}
|
|
||||||
|
|
||||||
// Create a complex number buffer (out[]) as the input (in[]) interleaved with
|
|
||||||
// zeros, and normalize it.
|
|
||||||
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
                                            int16_t* in,
                                            int16_t* out) {
  int16_t* dst = out;
  int k;

  // Each input sample becomes one (real, imaginary) pair in out[]: the real
  // part is the sample left-shifted by inst->normData, the imaginary part 0.
  for (k = 0; k < inst->anaLen; ++k) {
    *dst++ = WEBRTC_SPL_LSHIFT_W16(in[k], inst->normData);  // Q(normData)
    *dst++ = 0;
  }
}
|
|
||||||
|
|
||||||
#endif // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
|
|
||||||
|
@ -165,40 +165,51 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst,
|
|||||||
short* outFrameHigh);
|
short* outFrameHigh);
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
* Internal functions and variable declarations shared with optimized code.
|
* Some function pointers, for internal functions shared by ARM NEON and
|
||||||
|
* generic C code.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Noise Estimation.
|
// Noise Estimation.
|
||||||
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
typedef void (*NoiseEstimation)(NsxInst_t* inst,
|
||||||
uint16_t* magn,
|
uint16_t* magn,
|
||||||
uint32_t* noise,
|
uint32_t* noise,
|
||||||
int16_t* q_noise);
|
int16_t* q_noise);
|
||||||
|
extern NoiseEstimation WebRtcNsx_NoiseEstimation;
|
||||||
|
|
||||||
// Filter the data in the frequency domain, and create spectrum.
|
// Filter the data in the frequency domain, and create spectrum.
|
||||||
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst,
|
typedef void (*PrepareSpectrum)(NsxInst_t* inst,
|
||||||
int16_t* freq_buff);
|
int16_t* freq_buff);
|
||||||
|
extern PrepareSpectrum WebRtcNsx_PrepareSpectrum;
|
||||||
|
|
||||||
// For the noise supression process, synthesis, read out fully processed
|
// For the noise supression process, synthesis, read out fully processed
|
||||||
// segment, and update synthesis buffer.
|
// segment, and update synthesis buffer.
|
||||||
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
typedef void (*SynthesisUpdate)(NsxInst_t* inst,
|
||||||
int16_t* out_frame,
|
int16_t* out_frame,
|
||||||
int16_t gain_factor);
|
int16_t gain_factor);
|
||||||
|
extern SynthesisUpdate WebRtcNsx_SynthesisUpdate;
|
||||||
|
|
||||||
// Update analysis buffer for lower band, and window data before FFT.
|
// Update analysis buffer for lower band, and window data before FFT.
|
||||||
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
typedef void (*AnalysisUpdate)(NsxInst_t* inst,
|
||||||
int16_t* out,
|
int16_t* out,
|
||||||
int16_t* new_speech);
|
int16_t* new_speech);
|
||||||
|
extern AnalysisUpdate WebRtcNsx_AnalysisUpdate;
|
||||||
|
|
||||||
// Denormalize the input buffer.
|
// Denormalize the input buffer.
|
||||||
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst,
|
typedef void (*Denormalize)(NsxInst_t* inst,
|
||||||
int16_t* in,
|
int16_t* in,
|
||||||
int factor);
|
int factor);
|
||||||
|
extern Denormalize WebRtcNsx_Denormalize;
|
||||||
|
|
||||||
// Create a complex number buffer, as the intput interleaved with zeros,
|
// Create a complex number buffer, as the intput interleaved with zeros,
|
||||||
// and normalize it.
|
// and normalize it.
|
||||||
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
|
typedef void (*CreateComplexBuffer)(NsxInst_t* inst,
|
||||||
int16_t* in,
|
int16_t* in,
|
||||||
int16_t* out);
|
int16_t* out);
|
||||||
|
extern CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Initialization of the above function pointers for ARM Neon.
|
||||||
|
*/
|
||||||
|
void WebRtcNsx_InitNeon(void);
|
||||||
|
|
||||||
extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
|
extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
|
||||||
extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];
|
extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];
|
||||||
|
@ -8,15 +8,13 @@
|
|||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#if defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
|
|
||||||
|
|
||||||
#include "nsx_core.h"
|
#include "nsx_core.h"
|
||||||
|
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
// Update the noise estimation information.
|
// Update the noise estimation information.
|
||||||
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
const int16_t kExp2Const = 11819; // Q13
|
const int16_t kExp2Const = 11819; // Q13
|
||||||
int16_t* ptr_noiseEstLogQuantile = NULL;
|
int16_t* ptr_noiseEstLogQuantile = NULL;
|
||||||
@ -94,7 +92,7 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Noise Estimation
|
// Noise Estimation
|
||||||
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
static void NoiseEstimationNeon(NsxInst_t* inst,
|
||||||
uint16_t* magn,
|
uint16_t* magn,
|
||||||
uint32_t* noise,
|
uint32_t* noise,
|
||||||
int16_t* q_noise) {
|
int16_t* q_noise) {
|
||||||
@ -302,7 +300,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
|||||||
if (counter >= END_STARTUP_LONG) {
|
if (counter >= END_STARTUP_LONG) {
|
||||||
inst->noiseEstCounter[s] = 0;
|
inst->noiseEstCounter[s] = 0;
|
||||||
if (inst->blockIndex >= END_STARTUP_LONG) {
|
if (inst->blockIndex >= END_STARTUP_LONG) {
|
||||||
UpdateNoiseEstimate(inst, offset);
|
UpdateNoiseEstimateNeon(inst, offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
inst->noiseEstCounter[s]++;
|
inst->noiseEstCounter[s]++;
|
||||||
@ -311,7 +309,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
|||||||
|
|
||||||
// Sequentially update the noise during startup
|
// Sequentially update the noise during startup
|
||||||
if (inst->blockIndex < END_STARTUP_LONG) {
|
if (inst->blockIndex < END_STARTUP_LONG) {
|
||||||
UpdateNoiseEstimate(inst, offset);
|
UpdateNoiseEstimateNeon(inst, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < inst->magnLen; i++) {
|
for (i = 0; i < inst->magnLen; i++) {
|
||||||
@ -321,7 +319,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Filter the data in the frequency domain, and create spectrum.
|
// Filter the data in the frequency domain, and create spectrum.
|
||||||
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
|
||||||
|
|
||||||
// (1) Filtering.
|
// (1) Filtering.
|
||||||
|
|
||||||
@ -338,7 +336,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
|||||||
uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
|
uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
|
||||||
|
|
||||||
// Filter the rest in the frequency domain.
|
// Filter the rest in the frequency domain.
|
||||||
for (; ptr_real < &inst->real[inst->magnLen - 1]; ) {
|
for (; ptr_real < &inst->real[inst->magnLen - 1];) {
|
||||||
// Loop unrolled once. Both pointers are incremented by 4 twice.
|
// Loop unrolled once. Both pointers are incremented by 4 twice.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
"vld1.16 d20, [%[ptr_real]]\n\t"
|
"vld1.16 d20, [%[ptr_real]]\n\t"
|
||||||
@ -400,7 +398,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
|||||||
int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
|
int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
|
||||||
ptr_real = &inst->real[1];
|
ptr_real = &inst->real[1];
|
||||||
ptr_imag = &inst->imag[1];
|
ptr_imag = &inst->imag[1];
|
||||||
for (; ptr_real < &inst->real[inst->anaLen2 - 11]; ) {
|
for (; ptr_real < &inst->real[inst->anaLen2 - 11];) {
|
||||||
// Loop unrolled once. All pointers are incremented twice.
|
// Loop unrolled once. All pointers are incremented twice.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
"vld1.16 d22, [%[ptr_real]]!\n\t"
|
"vld1.16 d22, [%[ptr_real]]!\n\t"
|
||||||
@ -456,13 +454,13 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Denormalize the input buffer.
|
// Denormalize the input buffer.
|
||||||
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
|
static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) {
|
||||||
int16_t* ptr_real = &inst->real[0];
|
int16_t* ptr_real = &inst->real[0];
|
||||||
int16_t* ptr_in = &in[0];
|
int16_t* ptr_in = &in[0];
|
||||||
|
|
||||||
__asm__ __volatile__("vdup.32 q10, %0" ::
|
__asm__ __volatile__("vdup.32 q10, %0" ::
|
||||||
"r"((int32_t)(factor - inst->normData)) : "q10");
|
"r"((int32_t)(factor - inst->normData)) : "q10");
|
||||||
for (; ptr_real < &inst->real[inst->anaLen]; ) {
|
for (; ptr_real < &inst->real[inst->anaLen];) {
|
||||||
|
|
||||||
// Loop unrolled once. Both pointers are incremented.
|
// Loop unrolled once. Both pointers are incremented.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
@ -495,7 +493,7 @@ __inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
|
|||||||
|
|
||||||
// For the noise supress process, synthesis, read out fully processed segment,
|
// For the noise supress process, synthesis, read out fully processed segment,
|
||||||
// and update synthesis buffer.
|
// and update synthesis buffer.
|
||||||
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
static void SynthesisUpdateNeon(NsxInst_t* inst,
|
||||||
int16_t* out_frame,
|
int16_t* out_frame,
|
||||||
int16_t gain_factor) {
|
int16_t gain_factor) {
|
||||||
int16_t* ptr_real = &inst->real[0];
|
int16_t* ptr_real = &inst->real[0];
|
||||||
@ -505,7 +503,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
|||||||
// synthesis
|
// synthesis
|
||||||
__asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
|
__asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
|
||||||
// Loop unrolled once. All pointers are incremented in the assembly code.
|
// Loop unrolled once. All pointers are incremented in the assembly code.
|
||||||
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
|
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
// Load variables.
|
// Load variables.
|
||||||
"vld1.16 d22, [%[ptr_real]]!\n\t"
|
"vld1.16 d22, [%[ptr_real]]!\n\t"
|
||||||
@ -553,7 +551,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
|||||||
int16_t* ptr_out = &out_frame[0];
|
int16_t* ptr_out = &out_frame[0];
|
||||||
ptr_syn = &inst->synthesisBuffer[0];
|
ptr_syn = &inst->synthesisBuffer[0];
|
||||||
// read out fully processed segment
|
// read out fully processed segment
|
||||||
for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms]; ) {
|
for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) {
|
||||||
// Loop unrolled once. Both pointers are incremented in the assembly code.
|
// Loop unrolled once. Both pointers are incremented in the assembly code.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
// out_frame[i] = inst->synthesisBuffer[i]; // Q0
|
// out_frame[i] = inst->synthesisBuffer[i]; // Q0
|
||||||
@ -575,7 +573,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
|||||||
// inst->anaLen - inst->blockLen10ms);
|
// inst->anaLen - inst->blockLen10ms);
|
||||||
ptr_out = &inst->synthesisBuffer[0],
|
ptr_out = &inst->synthesisBuffer[0],
|
||||||
ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
|
ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
|
||||||
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
|
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
|
||||||
// Loop unrolled once. Both pointers are incremented in the assembly code.
|
// Loop unrolled once. Both pointers are incremented in the assembly code.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
"vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
|
"vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
|
||||||
@ -593,7 +591,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
|||||||
// WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
|
// WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
|
||||||
// + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
|
// + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
|
||||||
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
|
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
|
||||||
for (; ptr_out < &inst->synthesisBuffer[inst->anaLen]; ) {
|
for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) {
|
||||||
// Loop unrolled once. Pointer is incremented in the assembly code.
|
// Loop unrolled once. Pointer is incremented in the assembly code.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
"vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
|
"vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
|
||||||
@ -606,7 +604,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Update analysis buffer for lower band, and window data before FFT.
|
// Update analysis buffer for lower band, and window data before FFT.
|
||||||
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
static void AnalysisUpdateNeon(NsxInst_t* inst,
|
||||||
int16_t* out,
|
int16_t* out,
|
||||||
int16_t* new_speech) {
|
int16_t* new_speech) {
|
||||||
|
|
||||||
@ -617,7 +615,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
|||||||
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
|
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
|
||||||
// inst->analysisBuffer + inst->blockLen10ms,
|
// inst->analysisBuffer + inst->blockLen10ms,
|
||||||
// inst->anaLen - inst->blockLen10ms);
|
// inst->anaLen - inst->blockLen10ms);
|
||||||
for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms]; ) {
|
for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
|
||||||
// Loop unrolled once, so both pointers are incremented by 8 twice.
|
// Loop unrolled once, so both pointers are incremented by 8 twice.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
|
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
|
||||||
@ -633,7 +631,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
|||||||
|
|
||||||
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
|
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
|
||||||
// + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
|
// + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
|
||||||
for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen]; ) {
|
for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
|
||||||
// Loop unrolled once, so both pointers are incremented by 8 twice.
|
// Loop unrolled once, so both pointers are incremented by 8 twice.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
|
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
|
||||||
@ -651,7 +649,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
|||||||
int16_t* ptr_window = &inst->window[0];
|
int16_t* ptr_window = &inst->window[0];
|
||||||
ptr_out = &out[0];
|
ptr_out = &out[0];
|
||||||
ptr_ana = &inst->analysisBuffer[0];
|
ptr_ana = &inst->analysisBuffer[0];
|
||||||
for (; ptr_out < &out[inst->anaLen]; ) {
|
for (; ptr_out < &out[inst->anaLen];) {
|
||||||
|
|
||||||
// Loop unrolled once, so all pointers are incremented by 4 twice.
|
// Loop unrolled once, so all pointers are incremented by 4 twice.
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
@ -683,7 +681,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
|||||||
|
|
||||||
// Create a complex number buffer (out[]) as the intput (in[]) interleaved with
|
// Create a complex number buffer (out[]) as the intput (in[]) interleaved with
|
||||||
// zeros, and normalize it.
|
// zeros, and normalize it.
|
||||||
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
|
static __inline void CreateComplexBufferNeon(NsxInst_t* inst,
|
||||||
int16_t* in,
|
int16_t* in,
|
||||||
int16_t* out) {
|
int16_t* out) {
|
||||||
int16_t* ptr_out = &out[0];
|
int16_t* ptr_out = &out[0];
|
||||||
@ -691,7 +689,7 @@ __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
|
|||||||
|
|
||||||
__asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
|
__asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
|
||||||
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
|
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
|
||||||
for (; ptr_in < &in[inst->anaLen]; ) {
|
for (; ptr_in < &in[inst->anaLen];) {
|
||||||
|
|
||||||
// Loop unrolled once, so ptr_in is incremented by 8 twice,
|
// Loop unrolled once, so ptr_in is incremented by 8 twice,
|
||||||
// and ptr_out is incremented by 8 four times.
|
// and ptr_out is incremented by 8 four times.
|
||||||
@ -724,4 +722,12 @@ __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
|
|
||||||
|
void WebRtcNsx_InitNeon(void) {
|
||||||
|
WebRtcNsx_NoiseEstimation = NoiseEstimationNeon;
|
||||||
|
WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon;
|
||||||
|
WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon;
|
||||||
|
WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon;
|
||||||
|
WebRtcNsx_Denormalize = DenormalizeNeon;
|
||||||
|
WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon;
|
||||||
|
}
|
||||||
|
@ -15,18 +15,33 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// list of features.
|
#include <typedefs.h>
|
||||||
|
|
||||||
|
// List of features in x86.
|
||||||
typedef enum {
|
typedef enum {
|
||||||
kSSE2,
|
kSSE2,
|
||||||
kSSE3
|
kSSE3
|
||||||
} CPUFeature;
|
} CPUFeature;
|
||||||
|
|
||||||
|
// ARM CPU feature flags. Every constant occupies its own bit, so several
// features can be combined into (and tested against) a single bitmask.
enum {
  kCPUFeatureARMv7 = 0x1,
  kCPUFeatureVFPv3 = 0x2,
  kCPUFeatureNEON = 0x4,
  kCPUFeatureLDREXSTREX = 0x8
};
|
||||||
|
|
||||||
typedef int (*WebRtc_CPUInfo)(CPUFeature feature);
|
typedef int (*WebRtc_CPUInfo)(CPUFeature feature);
|
||||||
// returns true if the CPU supports the feature.
|
// returns true if the CPU supports the feature.
|
||||||
extern WebRtc_CPUInfo WebRtc_GetCPUInfo;
|
extern WebRtc_CPUInfo WebRtc_GetCPUInfo;
|
||||||
// No CPU feature is available => straight C path.
|
// No CPU feature is available => straight C path.
|
||||||
extern WebRtc_CPUInfo WebRtc_GetCPUInfoNoASM;
|
extern WebRtc_CPUInfo WebRtc_GetCPUInfoNoASM;
|
||||||
|
|
||||||
|
// Return the features in an ARM device.
|
||||||
|
// It detects the features in the hardware platform, and returns supported
|
||||||
|
// values in the above enum definition as a bitmask.
|
||||||
|
extern uint64_t WebRtc_GetCPUFeaturesARM(void);
|
||||||
|
|
||||||
#if defined(__cplusplus) || defined(c_plusplus)
|
#if defined(__cplusplus) || defined(c_plusplus)
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
@ -25,6 +25,7 @@ LOCAL_SRC_FILES := \
|
|||||||
condition_variable.cc \
|
condition_variable.cc \
|
||||||
cpu_dummy.cc \
|
cpu_dummy.cc \
|
||||||
cpu_features.cc \
|
cpu_features.cc \
|
||||||
|
cpu_features_arm.c \
|
||||||
cpu_info.cc \
|
cpu_info.cc \
|
||||||
critical_section.cc \
|
critical_section.cc \
|
||||||
event.cc \
|
event.cc \
|
||||||
|
333
src/system_wrappers/source/cpu_features_arm.c
Normal file
333
src/system_wrappers/source/cpu_features_arm.c
Normal file
@ -0,0 +1,333 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This file is derived from Android's NDK package r7, located at
|
||||||
|
// <ndk>/sources/android/cpufeatures/ (downloadable from
|
||||||
|
// http://developer.android.com/sdk/ndk/index.html).
|
||||||
|
|
||||||
|
#include "cpu_features_wrapper.h"

#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
|
||||||
|
|
||||||
|
// CPU families this file distinguishes.
typedef enum {
  CPU_FAMILY_UNKNOWN = 0,
  CPU_FAMILY_ARM,
  CPU_FAMILY_X86,
  CPU_FAMILY_MAX  // Do not remove.
} CpuFamily;

// One-time initialization guard for cpuInit(). POSIX leaves pthread_once()
// undefined unless the control object is initialized with PTHREAD_ONCE_INIT,
// so do not rely on implicit zero-initialization here.
static pthread_once_t g_once = PTHREAD_ONCE_INIT;

// Detection results, written by cpuInit().
static CpuFamily g_cpuFamily;
static uint64_t g_cpuFeatures;
static int g_cpuCount;

// Set to non-zero to make D() print its arguments to stdout.
static const int cpufeatures_debug = 0;

// CPU family assumed from the compiler's target-architecture macros.
#ifdef __arm__
#  define DEFAULT_CPU_FAMILY  CPU_FAMILY_ARM
#elif defined __i386__
#  define DEFAULT_CPU_FAMILY  CPU_FAMILY_X86
#else
#  define DEFAULT_CPU_FAMILY  CPU_FAMILY_UNKNOWN
#endif

// printf-style debug trace; does nothing unless cpufeatures_debug is set.
#define D(...)                              \
  do {                                      \
    if (cpufeatures_debug) {                \
      printf(__VA_ARGS__); fflush(stdout);  \
    }                                       \
  } while (0)
|
||||||
|
|
||||||
|
/* Copy at most 'buffsize' bytes from the start of the file 'pathname' into
 * 'buffer' using a single read() call, which is retried only when it is
 * interrupted (EINTR).
 *
 * Returns the number of bytes stored, or -1 if the file cannot be opened or
 * read. The data is *not* zero-terminated.
 */
static int read_file(const char* pathname, char* buffer, size_t buffsize) {
  int bytes_read;
  const int handle = open(pathname, O_RDONLY);

  if (handle < 0) {
    return -1;
  }

  for (;;) {
    bytes_read = read(handle, buffer, buffsize);
    if (bytes_read >= 0 || errno != EINTR) {
      break;  /* success, or a failure other than an interrupted call */
    }
  }

  close(handle);
  return bytes_read;
}
|
||||||
|
|
||||||
|
/* Find the first occurrence of the 'needle_len' bytes at 'needle' inside the
 * 'hay_len' bytes at 'hay'. Returns a pointer to the match, or NULL.
 * Standard-C replacement for the non-portable memmem().
 */
static const char* cpuinfo_find(const char* hay, size_t hay_len,
                                const char* needle, size_t needle_len) {
  if (needle_len == 0)
    return hay;

  while (hay_len >= needle_len) {
    /* Only look at start positions where the whole needle still fits. */
    const char* hit = memchr(hay, needle[0], hay_len - needle_len + 1);
    if (hit == NULL)
      return NULL;
    if (memcmp(hit, needle, needle_len) == 0)
      return hit;
    hay_len -= (size_t)(hit - hay) + 1;
    hay = hit + 1;
  }
  return NULL;
}

/* Extract the content of the first occurrence of a given field in
 * the content of /proc/cpuinfo and return it as a heap-allocated,
 * zero-terminated string that must be freed by the caller.
 *
 * 'buffer' holds 'buflen' bytes and need not be zero-terminated. The field
 * name must start a line; its value is the text after the following ": ",
 * up to the end of that line (or the end of the buffer).
 *
 * Return NULL if not found, if the ':' is not followed by a space, or if
 * the allocation fails.
 */
static char* extract_cpuinfo_field(char* buffer, int buflen, const char* field) {
  if (buffer == NULL || field == NULL || buflen <= 0)
    return NULL;

  const size_t fieldlen = strlen(field);
  const char* const bufend = buffer + buflen;
  const char* p = buffer;

  /* Look for the first field occurrence that starts a line. */
  for (;;) {
    p = cpuinfo_find(p, (size_t)(bufend - p), field, fieldlen);
    if (p == NULL)
      return NULL;

    if (p == buffer || p[-1] == '\n')
      break;

    p += fieldlen;
  }

  /* Skip to the first colon followed by a space. The bounds check keeps
   * p[1] inside the buffer when ':' is the very last byte. */
  p += fieldlen;
  p = memchr(p, ':', (size_t)(bufend - p));
  if (p == NULL || p + 1 >= bufend || p[1] != ' ')
    return NULL;

  /* Find the end of the line. */
  p += 2;
  const char* q = memchr(p, '\n', (size_t)(bufend - p));
  if (q == NULL)
    q = bufend;

  /* Copy the value into a heap-allocated, zero-terminated buffer. */
  const size_t len = (size_t)(q - p);
  char* result = malloc(len + 1);
  if (result == NULL)
    return NULL;

  memcpy(result, p, len);
  result[len] = '\0';
  return result;
}
|
||||||
|
|
||||||
|
/* Find the next occurrence of the 'needle_len' bytes at 'needle' inside the
 * 'hay_len' bytes at 'hay'. Returns a pointer to the match, or NULL.
 * Standard-C replacement for the non-portable memmem().
 */
static const char* cpuinfo_next_match(const char* hay, size_t hay_len,
                                      const char* needle, size_t needle_len) {
  if (needle_len == 0)
    return hay;

  while (hay_len >= needle_len) {
    /* Only look at start positions where the whole needle still fits. */
    const char* hit = memchr(hay, needle[0], hay_len - needle_len + 1);
    if (hit == NULL)
      return NULL;
    if (memcmp(hit, needle, needle_len) == 0)
      return hit;
    hay_len -= (size_t)(hit - hay) + 1;
    hay = hit + 1;
  }
  return NULL;
}

/* Count the number of occurrences of a given field prefix in /proc/cpuinfo.
 *
 * An occurrence is counted when 'field' starts a line and is followed by
 * optional spaces/tabs and then a ':'. 'buffer' holds 'buflen' bytes and
 * need not be zero-terminated. An empty field name counts as zero matches.
 */
static int count_cpuinfo_field(char* buffer, int buflen, const char* field) {
  if (buffer == NULL || field == NULL || buflen <= 0)
    return 0;

  const size_t fieldlen = strlen(field);
  if (fieldlen == 0)
    return 0;  /* an empty prefix would never advance the scan */

  const char* const bufend = buffer + buflen;
  const char* p = buffer;
  int count = 0;

  for (;;) {
    p = cpuinfo_next_match(p, (size_t)(bufend - p), field, fieldlen);
    if (p == NULL)
      break;

    /* Ensure that the field is at the start of a line */
    if (p > buffer && p[-1] != '\n') {
      p += fieldlen;
      continue;
    }

    /* skip any whitespace */
    const char* q = p + fieldlen;
    while (q < bufend && (*q == ' ' || *q == '\t'))
      q++;

    /* we must have a colon now */
    if (q < bufend && *q == ':') {
      count += 1;
      q++;
    }
    p = q;
  }

  return count;
}
|
||||||
|
|
||||||
|
/* Like strlen(), but for constant string literals: sizeof counts the
 * terminating NUL of the literal, hence the -1. */
#define STRLEN_CONST(x)  (sizeof(x) - 1)
|
||||||
|
|
||||||
|
|
||||||
|
/* Tell whether 'item' appears as a complete entry in 'list', a string of
 * entries separated by spaces and/or tabs.
 * Returns 1 if it does, 0 if it does not or if 'list' is NULL.
 */
static int has_list_item(const char* list, const char* item) {
  const int wanted = strlen(item);
  const char* tok;

  if (list == NULL)
    return 0;

  for (tok = list; *tok != '\0';) {
    const char* end;

    /* step over the separators in front of the next entry */
    while (*tok == ' ' || *tok == '\t')
      ++tok;

    /* 'end' stops on the first separator (or the terminator) after it */
    end = tok;
    while (*end != '\0' && *end != ' ' && *end != '\t')
      ++end;

    /* an entry matches only when both its length and its bytes agree */
    if (end - tok == wanted && memcmp(tok, item, wanted) == 0)
      return 1;

    tok = end;
  }
  return 0;
}
|
||||||
|
|
||||||
|
|
||||||
|
static void cpuInit(void) {
|
||||||
|
char cpuinfo[4096];
|
||||||
|
int cpuinfo_len;
|
||||||
|
|
||||||
|
g_cpuFamily = DEFAULT_CPU_FAMILY;
|
||||||
|
g_cpuFeatures = 0;
|
||||||
|
g_cpuCount = 1;
|
||||||
|
|
||||||
|
cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof cpuinfo);
|
||||||
|
D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
|
||||||
|
cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
|
||||||
|
|
||||||
|
if (cpuinfo_len < 0) { /* should not happen */
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Count the CPU cores, the value may be 0 for single-core CPUs */
|
||||||
|
g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "processor");
|
||||||
|
if (g_cpuCount == 0) {
|
||||||
|
g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "Processor");
|
||||||
|
if (g_cpuCount == 0) {
|
||||||
|
g_cpuCount = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
D("found cpuCount = %d\n", g_cpuCount);
|
||||||
|
|
||||||
|
#ifdef __arm__
|
||||||
|
{
|
||||||
|
char* features = NULL;
|
||||||
|
char* architecture = NULL;
|
||||||
|
|
||||||
|
/* Extract architecture from the "CPU Architecture" field.
|
||||||
|
* The list is well-known, unlike the the output of
|
||||||
|
* the 'Processor' field which can vary greatly.
|
||||||
|
*
|
||||||
|
* See the definition of the 'proc_arch' array in
|
||||||
|
* $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
|
||||||
|
* same file.
|
||||||
|
*/
|
||||||
|
char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
|
||||||
|
"CPU architecture");
|
||||||
|
|
||||||
|
if (cpuArch != NULL) {
|
||||||
|
char* end;
|
||||||
|
long archNumber;
|
||||||
|
int hasARMv7 = 0;
|
||||||
|
|
||||||
|
D("found cpuArch = '%s'\n", cpuArch);
|
||||||
|
|
||||||
|
/* read the initial decimal number, ignore the rest */
|
||||||
|
archNumber = strtol(cpuArch, &end, 10);
|
||||||
|
|
||||||
|
/* Here we assume that ARMv8 will be upwards compatible with v7
|
||||||
|
* in the future. Unfortunately, there is no 'Features' field to
|
||||||
|
* indicate that Thumb-2 is supported.
|
||||||
|
*/
|
||||||
|
if (end > cpuArch && archNumber >= 7) {
|
||||||
|
hasARMv7 = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Unfortunately, it seems that certain ARMv6-based CPUs
|
||||||
|
* report an incorrect architecture number of 7!
|
||||||
|
*
|
||||||
|
* We try to correct this by looking at the 'elf_format'
|
||||||
|
* field reported by the 'Processor' field, which is of the
|
||||||
|
* form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
|
||||||
|
* an ARMv6-one.
|
||||||
|
*/
|
||||||
|
if (hasARMv7) {
|
||||||
|
char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
|
||||||
|
"Processor");
|
||||||
|
if (cpuProc != NULL) {
|
||||||
|
D("found cpuProc = '%s'\n", cpuProc);
|
||||||
|
if (has_list_item(cpuProc, "(v6l)")) {
|
||||||
|
D("CPU processor and architecture mismatch!!\n");
|
||||||
|
hasARMv7 = 0;
|
||||||
|
}
|
||||||
|
free(cpuProc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasARMv7) {
|
||||||
|
g_cpuFeatures |= kCPUFeatureARMv7;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The LDREX / STREX instructions are available from ARMv6 */
|
||||||
|
if (archNumber >= 6) {
|
||||||
|
g_cpuFeatures |= kCPUFeatureLDREXSTREX;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(cpuArch);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Extract the list of CPU features from 'Features' field */
|
||||||
|
char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
|
||||||
|
"Features");
|
||||||
|
|
||||||
|
if (cpuFeatures != NULL) {
|
||||||
|
|
||||||
|
D("found cpuFeatures = '%s'\n", cpuFeatures);
|
||||||
|
|
||||||
|
if (has_list_item(cpuFeatures, "vfpv3"))
|
||||||
|
g_cpuFeatures |= kCPUFeatureVFPv3;
|
||||||
|
|
||||||
|
else if (has_list_item(cpuFeatures, "vfpv3d16"))
|
||||||
|
g_cpuFeatures |= kCPUFeatureVFPv3;
|
||||||
|
|
||||||
|
if (has_list_item(cpuFeatures, "neon")) {
|
||||||
|
/* Note: Certain kernels only report neon but not vfpv3
|
||||||
|
* in their features list. However, ARM mandates
|
||||||
|
* that if Neon is implemented, so must be VFPv3
|
||||||
|
* so always set the flag.
|
||||||
|
*/
|
||||||
|
g_cpuFeatures |= kCPUFeatureNEON |
|
||||||
|
kCPUFeatureVFPv3;
|
||||||
|
}
|
||||||
|
free(cpuFeatures);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // __arm__
|
||||||
|
|
||||||
|
#ifdef __i386__
|
||||||
|
g_cpuFamily = CPU_FAMILY_X86;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
uint64_t WebRtc_GetCPUFeaturesARM(void) {
|
||||||
|
pthread_once(&g_once, cpuInit);
|
||||||
|
return g_cpuFeatures;
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user