For Android ARMv7 platforms, added a feature of dynamically detecting the existence of Neon,
and when it's present, switch to some functions optimized for Neon at run time. Review URL: http://webrtc-codereview.appspot.com/268002 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1096 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
ae7017d588
commit
b59c031660
12
Android.mk
12
Android.mk
@ -54,6 +54,7 @@ include $(MY_WEBRTC_ROOT_PATH)/libvpx.mk
|
||||
LOCAL_PATH := $(call my-dir)
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk
|
||||
|
||||
LOCAL_ARM_MODE := arm
|
||||
LOCAL_MODULE := libwebrtc_audio_preprocessing
|
||||
@ -71,6 +72,17 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
|
||||
libwebrtc_aecm \
|
||||
libwebrtc_system_wrappers
|
||||
|
||||
# Add Neon libraries.
|
||||
ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS)))
|
||||
LOCAL_WHOLE_STATIC_LIBRARIES += \
|
||||
libwebrtc_aecm_neon \
|
||||
libwebrtc_ns_neon
|
||||
else ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||
LOCAL_WHOLE_STATIC_LIBRARIES += \
|
||||
libwebrtc_aecm_neon \
|
||||
libwebrtc_ns_neon
|
||||
endif
|
||||
|
||||
LOCAL_STATIC_LIBRARIES := \
|
||||
libprotobuf-cpp-2.3.0-lite
|
||||
|
||||
|
@ -21,8 +21,9 @@ MY_WEBRTC_COMMON_DEFS := \
|
||||
# '-DWEBRTC_MODULE_UTILITY_VIDEO' [module media_file] [module utility]
|
||||
ifeq ($(TARGET_ARCH),arm)
|
||||
MY_WEBRTC_COMMON_DEFS += \
|
||||
'-DWEBRTC_ARM_INLINE_CALLS' \
|
||||
'-DWEBRTC_ARCH_ARM'
|
||||
# '-DWEBRTC_DETECT_ARM_NEON' # only used in a build configuration without Neon
|
||||
# TODO(kma): figure out if the above define could be moved to NDK build only.
|
||||
|
||||
# TODO(kma): test if the code under next two macros works with generic GCC compilers
|
||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||
|
@ -6,6 +6,9 @@
|
||||
# in the file PATENTS. All contributing project authors may
|
||||
# be found in the AUTHORS file in the root of the source tree.
|
||||
|
||||
#############################
|
||||
# Build the non-neon library.
|
||||
|
||||
LOCAL_PATH := $(call my-dir)
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
@ -21,21 +24,16 @@ LOCAL_SRC_FILES := \
|
||||
aecm_core.c
|
||||
|
||||
# Flags passed to both C and C++ files.
|
||||
LOCAL_CFLAGS := \
|
||||
$(MY_WEBRTC_COMMON_DEFS)
|
||||
|
||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||
LOCAL_SRC_FILES += \
|
||||
aecm_core_neon.c
|
||||
LOCAL_CFLAGS += \
|
||||
$(MY_ARM_CFLAGS_NEON)
|
||||
endif
|
||||
LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS)
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(LOCAL_PATH)/interface \
|
||||
$(LOCAL_PATH)/../utility \
|
||||
$(LOCAL_PATH)/../../.. \
|
||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include \
|
||||
$(LOCAL_PATH)/../../../system_wrappers/interface
|
||||
|
||||
LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers
|
||||
|
||||
LOCAL_SHARED_LIBRARIES := \
|
||||
libcutils \
|
||||
@ -46,3 +44,31 @@ ifndef NDK_ROOT
|
||||
include external/stlport/libstlport.mk
|
||||
endif
|
||||
include $(BUILD_STATIC_LIBRARY)
|
||||
|
||||
#########################
|
||||
# Build the neon library.
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
|
||||
LOCAL_ARM_MODE := arm
|
||||
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
|
||||
LOCAL_MODULE := libwebrtc_aecm_neon
|
||||
LOCAL_MODULE_TAGS := optional
|
||||
|
||||
LOCAL_SRC_FILES := aecm_core_neon.c
|
||||
|
||||
# Flags passed to both C and C++ files.
|
||||
LOCAL_CFLAGS := \
|
||||
$(MY_WEBRTC_COMMON_DEFS) \
|
||||
-mfpu=neon \
|
||||
-flax-vector-conversions
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(LOCAL_PATH)/interface \
|
||||
$(LOCAL_PATH)/../../.. \
|
||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
||||
|
||||
ifndef NDK_ROOT
|
||||
include external/stlport/libstlport.mk
|
||||
endif
|
||||
include $(BUILD_STATIC_LIBRARY)
|
||||
|
@ -13,8 +13,9 @@
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "echo_control_mobile.h"
|
||||
#include "cpu_features_wrapper.h"
|
||||
#include "delay_estimator_wrapper.h"
|
||||
#include "echo_control_mobile.h"
|
||||
#include "ring_buffer.h"
|
||||
#include "typedefs.h"
|
||||
|
||||
@ -263,6 +264,13 @@ static const uint16_t* AlignedFarend(AecmCore_t* self, int* far_q, int delay) {
|
||||
HANDLE logFile = NULL;
|
||||
#endif
|
||||
|
||||
// Declare function pointers.
|
||||
CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
|
||||
StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
|
||||
ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
|
||||
WindowAndFFT WebRtcAecm_WindowAndFFT;
|
||||
InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
|
||||
|
||||
int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
|
||||
{
|
||||
AecmCore_t *aecm = malloc(sizeof(AecmCore_t));
|
||||
@ -346,6 +354,194 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const WebRtc_Word16* echo_pat
|
||||
aecm->mseChannelCount = 0;
|
||||
}
|
||||
|
||||
static void WindowAndFFTC(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
complex16_t* freq_signal,
|
||||
int time_signal_scaling)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
|
||||
// FFT of signal
|
||||
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
|
||||
{
|
||||
// Window time domain signal and insert into real part of
|
||||
// transformation array |fft|
|
||||
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
(time_signal[i] << time_signal_scaling),
|
||||
WebRtcAecm_kSqrtHanning[i],
|
||||
14);
|
||||
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
(time_signal[i + PART_LEN] << time_signal_scaling),
|
||||
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
||||
14);
|
||||
// Inserting zeros in imaginary parts not necessary since we
|
||||
// initialized the array with all zeros
|
||||
}
|
||||
|
||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
|
||||
|
||||
// Take only the first PART_LEN2 samples
|
||||
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
|
||||
{
|
||||
freq_signal[i].real = fft[j];
|
||||
|
||||
// The imaginary part has to switch sign
|
||||
freq_signal[i].imag = - fft[j+1];
|
||||
}
|
||||
}
|
||||
|
||||
static void InverseFFTAndWindowC(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean)
|
||||
{
|
||||
int i, j, outCFFT;
|
||||
WebRtc_Word32 tmp32no1;
|
||||
|
||||
// Synthesis
|
||||
for (i = 1; i < PART_LEN; i++)
|
||||
{
|
||||
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
|
||||
fft[j] = efw[i].real;
|
||||
|
||||
// mirrored data, even
|
||||
fft[PART_LEN4 - j] = efw[i].real;
|
||||
fft[j + 1] = -efw[i].imag;
|
||||
|
||||
//mirrored data, odd
|
||||
fft[PART_LEN4 - (j - 1)] = efw[i].imag;
|
||||
}
|
||||
fft[0] = efw[0].real;
|
||||
fft[1] = -efw[0].imag;
|
||||
|
||||
fft[PART_LEN2] = efw[PART_LEN].real;
|
||||
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
|
||||
|
||||
// inverse FFT, result should be scaled with outCFFT
|
||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
|
||||
|
||||
//take only the real values and scale with outCFFT
|
||||
for (i = 0; i < PART_LEN2; i++)
|
||||
{
|
||||
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
|
||||
fft[i] = fft[j];
|
||||
}
|
||||
|
||||
for (i = 0; i < PART_LEN; i++)
|
||||
{
|
||||
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
fft[i],
|
||||
WebRtcAecm_kSqrtHanning[i],
|
||||
14);
|
||||
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
|
||||
outCFFT - aecm->dfaCleanQDomain);
|
||||
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
|
||||
tmp32no1 + aecm->outBuf[i],
|
||||
WEBRTC_SPL_WORD16_MIN);
|
||||
output[i] = fft[i];
|
||||
|
||||
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
fft[PART_LEN + i],
|
||||
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
||||
14);
|
||||
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
|
||||
outCFFT - aecm->dfaCleanQDomain);
|
||||
aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
|
||||
WEBRTC_SPL_WORD16_MAX,
|
||||
tmp32no1,
|
||||
WEBRTC_SPL_WORD16_MIN);
|
||||
}
|
||||
|
||||
#ifdef ARM_WINM_LOG_
|
||||
// measure tick end
|
||||
QueryPerformanceCounter((LARGE_INTEGER*)&end);
|
||||
diff__ = ((end - start) * 1000) / (freq/1000);
|
||||
milliseconds = (unsigned int)(diff__ & 0xffffffff);
|
||||
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
|
||||
#endif
|
||||
|
||||
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
|
||||
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||
if (nearendClean != NULL)
|
||||
{
|
||||
memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||
}
|
||||
}
|
||||
|
||||
static void CalcLinearEnergiesC(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored)
|
||||
{
|
||||
int i;
|
||||
|
||||
// Get energy for the delayed far end signal and estimated
|
||||
// echo using both stored and adapted channels.
|
||||
for (i = 0; i < PART_LEN1; i++)
|
||||
{
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
|
||||
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
|
||||
far_spectrum[i]);
|
||||
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void StoreAdaptiveChannelC(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est)
|
||||
{
|
||||
int i;
|
||||
|
||||
// During startup we store the channel every block.
|
||||
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Recalculate echo estimate
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
|
||||
far_spectrum[i + 1]);
|
||||
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
|
||||
far_spectrum[i + 2]);
|
||||
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
|
||||
far_spectrum[i + 3]);
|
||||
}
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
}
|
||||
|
||||
static void ResetAdaptiveChannelC(AecmCore_t* aecm)
|
||||
{
|
||||
int i;
|
||||
|
||||
// The stored channel has a significantly lower MSE than the adaptive one for
|
||||
// two consecutive calculations. Reset the adaptive channel.
|
||||
memcpy(aecm->channelAdapt16, aecm->channelStored,
|
||||
sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Restore the W32 channel
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
|
||||
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
|
||||
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
|
||||
}
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
}
|
||||
|
||||
// WebRtcAecm_InitCore(...)
|
||||
//
|
||||
// This function initializes the AECM instant created with WebRtcAecm_CreateCore(...)
|
||||
@ -463,6 +659,23 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
|
||||
|
||||
assert(PART_LEN % 16 == 0);
|
||||
|
||||
// Initialize function pointers.
|
||||
WebRtcAecm_WindowAndFFT = WindowAndFFTC;
|
||||
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC;
|
||||
WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesC;
|
||||
WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelC;
|
||||
WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelC;
|
||||
|
||||
#ifdef WEBRTC_DETECT_ARM_NEON
|
||||
uint64_t features = WebRtc_GetCPUFeaturesARM();
|
||||
if ((features & kCPUFeatureNEON) != 0)
|
||||
{
|
||||
WebRtcAecm_InitNeon();
|
||||
}
|
||||
#elif defined(WEBRTC_ARCH_ARM_NEON)
|
||||
WebRtcAecm_InitNeon();
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1890,194 +2103,3 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far
|
||||
aecm->farBufReadPos += readLen;
|
||||
}
|
||||
|
||||
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
||||
|
||||
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
complex16_t* freq_signal,
|
||||
int time_signal_scaling)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
|
||||
// FFT of signal
|
||||
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
|
||||
{
|
||||
// Window time domain signal and insert into real part of
|
||||
// transformation array |fft|
|
||||
fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
(time_signal[i] << time_signal_scaling),
|
||||
WebRtcAecm_kSqrtHanning[i],
|
||||
14);
|
||||
fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
(time_signal[i + PART_LEN] << time_signal_scaling),
|
||||
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
||||
14);
|
||||
// Inserting zeros in imaginary parts not necessary since we
|
||||
// initialized the array with all zeros
|
||||
}
|
||||
|
||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
|
||||
|
||||
// Take only the first PART_LEN2 samples
|
||||
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
|
||||
{
|
||||
freq_signal[i].real = fft[j];
|
||||
|
||||
// The imaginary part has to switch sign
|
||||
freq_signal[i].imag = - fft[j+1];
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean)
|
||||
{
|
||||
int i, j, outCFFT;
|
||||
WebRtc_Word32 tmp32no1;
|
||||
|
||||
// Synthesis
|
||||
for (i = 1; i < PART_LEN; i++)
|
||||
{
|
||||
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
|
||||
fft[j] = efw[i].real;
|
||||
|
||||
// mirrored data, even
|
||||
fft[PART_LEN4 - j] = efw[i].real;
|
||||
fft[j + 1] = -efw[i].imag;
|
||||
|
||||
//mirrored data, odd
|
||||
fft[PART_LEN4 - (j - 1)] = efw[i].imag;
|
||||
}
|
||||
fft[0] = efw[0].real;
|
||||
fft[1] = -efw[0].imag;
|
||||
|
||||
fft[PART_LEN2] = efw[PART_LEN].real;
|
||||
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
|
||||
|
||||
// inverse FFT, result should be scaled with outCFFT
|
||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
|
||||
|
||||
//take only the real values and scale with outCFFT
|
||||
for (i = 0; i < PART_LEN2; i++)
|
||||
{
|
||||
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
|
||||
fft[i] = fft[j];
|
||||
}
|
||||
|
||||
for (i = 0; i < PART_LEN; i++)
|
||||
{
|
||||
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
fft[i],
|
||||
WebRtcAecm_kSqrtHanning[i],
|
||||
14);
|
||||
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
|
||||
outCFFT - aecm->dfaCleanQDomain);
|
||||
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
|
||||
tmp32no1 + aecm->outBuf[i],
|
||||
WEBRTC_SPL_WORD16_MIN);
|
||||
output[i] = fft[i];
|
||||
|
||||
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
fft[PART_LEN + i],
|
||||
WebRtcAecm_kSqrtHanning[PART_LEN - i],
|
||||
14);
|
||||
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
|
||||
outCFFT - aecm->dfaCleanQDomain);
|
||||
aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
|
||||
WEBRTC_SPL_WORD16_MAX,
|
||||
tmp32no1,
|
||||
WEBRTC_SPL_WORD16_MIN);
|
||||
}
|
||||
|
||||
#ifdef ARM_WINM_LOG_
|
||||
// measure tick end
|
||||
QueryPerformanceCounter((LARGE_INTEGER*)&end);
|
||||
diff__ = ((end - start) * 1000) / (freq/1000);
|
||||
milliseconds = (unsigned int)(diff__ & 0xffffffff);
|
||||
WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
|
||||
#endif
|
||||
|
||||
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
|
||||
memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||
memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||
if (nearendClean != NULL)
|
||||
{
|
||||
memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored)
|
||||
{
|
||||
int i;
|
||||
|
||||
// Get energy for the delayed far end signal and estimated
|
||||
// echo using both stored and adapted channels.
|
||||
for (i = 0; i < PART_LEN1; i++)
|
||||
{
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
(*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
|
||||
(*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
|
||||
far_spectrum[i]);
|
||||
(*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est)
|
||||
{
|
||||
int i;
|
||||
|
||||
// During startup we store the channel every block.
|
||||
memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Recalculate echo estimate
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
|
||||
far_spectrum[i + 1]);
|
||||
echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
|
||||
far_spectrum[i + 2]);
|
||||
echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
|
||||
far_spectrum[i + 3]);
|
||||
}
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
|
||||
far_spectrum[i]);
|
||||
}
|
||||
|
||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
|
||||
{
|
||||
int i;
|
||||
|
||||
// The stored channel has a significantly lower MSE than the adaptive one for
|
||||
// two consecutive calculations. Reset the adaptive channel.
|
||||
memcpy(aecm->channelAdapt16, aecm->channelStored,
|
||||
sizeof(WebRtc_Word16) * PART_LEN1);
|
||||
// Restore the W32 channel
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 1], 16);
|
||||
aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 2], 16);
|
||||
aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i + 3], 16);
|
||||
}
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
}
|
||||
|
||||
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
||||
|
@ -332,32 +332,44 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co
|
||||
void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend,
|
||||
const int farLen, const int knownDelay);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Some internal functions shared by ARM NEON and generic C code:
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Some function pointers, for internal functions shared by ARM NEON and
|
||||
// generic C code.
|
||||
//
|
||||
typedef void (*CalcLinearEnergies)(
|
||||
AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echoEst,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored);
|
||||
extern CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
|
||||
|
||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echoEst,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored);
|
||||
typedef void (*StoreAdaptiveChannel)(
|
||||
AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est);
|
||||
extern StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
|
||||
|
||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est);
|
||||
typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm);
|
||||
extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
|
||||
|
||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
|
||||
typedef void (*WindowAndFFT)(
|
||||
WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
complex16_t* freq_signal,
|
||||
int time_signal_scaling);
|
||||
extern WindowAndFFT WebRtcAecm_WindowAndFFT;
|
||||
|
||||
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
complex16_t* freq_signal,
|
||||
int time_signal_scaling);
|
||||
typedef void (*InverseFFTAndWindow)(
|
||||
AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft, complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean);
|
||||
extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
|
||||
|
||||
// Initialization of the above function pointers for ARM Neon.
|
||||
void WebRtcAecm_InitNeon(void);
|
||||
|
||||
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean);
|
||||
|
||||
#endif
|
||||
|
@ -7,7 +7,6 @@
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
|
||||
|
||||
#include "aecm_core.h"
|
||||
|
||||
@ -16,299 +15,289 @@
|
||||
|
||||
|
||||
// Square root of Hanning window in Q14.
|
||||
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {
|
||||
16384, 16373, 16354, 16325,
|
||||
16286, 16237, 16179, 16111,
|
||||
16034, 15947, 15851, 15746,
|
||||
15631, 15506, 15373, 15231,
|
||||
15079, 14918, 14749, 14571,
|
||||
14384, 14189, 13985, 13773,
|
||||
13553, 13325, 13089, 12845,
|
||||
12594, 12335, 12068, 11795,
|
||||
11514, 11227, 10933, 10633,
|
||||
10326, 10013, 9695, 9370,
|
||||
9040, 8705, 8364, 8019,
|
||||
7668, 7313, 6954, 6591,
|
||||
6224, 5853, 5478, 5101,
|
||||
4720, 4337, 3951, 3562,
|
||||
3172, 2780, 2386, 1990,
|
||||
1594, 1196, 798, 399
|
||||
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = {
|
||||
16384, 16373, 16354, 16325,
|
||||
16286, 16237, 16179, 16111,
|
||||
16034, 15947, 15851, 15746,
|
||||
15631, 15506, 15373, 15231,
|
||||
15079, 14918, 14749, 14571,
|
||||
14384, 14189, 13985, 13773,
|
||||
13553, 13325, 13089, 12845,
|
||||
12594, 12335, 12068, 11795,
|
||||
11514, 11227, 10933, 10633,
|
||||
10326, 10013, 9695, 9370,
|
||||
9040, 8705, 8364, 8019,
|
||||
7668, 7313, 6954, 6591,
|
||||
6224, 5853, 5478, 5101,
|
||||
4720, 4337, 3951, 3562,
|
||||
3172, 2780, 2386, 1990,
|
||||
1594, 1196, 798, 399
|
||||
};
|
||||
|
||||
void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
|
||||
static void WindowAndFFTNeon(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
complex16_t* freq_signal,
|
||||
int time_signal_scaling)
|
||||
{
|
||||
int i, j;
|
||||
int time_signal_scaling) {
|
||||
int i, j;
|
||||
|
||||
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
|
||||
__asm__("vmov.i16 d21, #0" ::: "d21");
|
||||
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
|
||||
__asm__("vmov.i16 d21, #0" ::: "d21");
|
||||
|
||||
for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
|
||||
{
|
||||
int16x4_t tmp16x4_0;
|
||||
int16x4_t tmp16x4_1;
|
||||
int32x4_t tmp32x4_0;
|
||||
for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
|
||||
int16x4_t tmp16x4_0;
|
||||
int16x4_t tmp16x4_1;
|
||||
int32x4_t tmp32x4_0;
|
||||
|
||||
/* Window near end */
|
||||
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
|
||||
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
|
||||
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
|
||||
/* Window near end */
|
||||
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
|
||||
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
|
||||
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
|
||||
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
|
||||
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
|
||||
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
|
||||
|
||||
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
|
||||
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
|
||||
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
|
||||
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
|
||||
|
||||
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
// (time_signal[PART_LEN + i] << time_signal_scaling),
|
||||
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN]));
|
||||
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
|
||||
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
// (time_signal[PART_LEN + i] << time_signal_scaling),
|
||||
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN]));
|
||||
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
|
||||
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
|
||||
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
|
||||
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
|
||||
|
||||
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
|
||||
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
|
||||
}
|
||||
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
|
||||
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
|
||||
}
|
||||
|
||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
|
||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
|
||||
|
||||
// Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
|
||||
for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16)
|
||||
{
|
||||
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
|
||||
__asm__("vneg.s16 d22, d22" : : : "q10");
|
||||
__asm__("vneg.s16 d23, d23" : : : "q11");
|
||||
__asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
// Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
|
||||
for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) {
|
||||
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
|
||||
__asm__("vneg.s16 d22, d22" : : : "q10");
|
||||
__asm__("vneg.s16 d23, d23" : : : "q11");
|
||||
__asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&freq_signal[i].real): "q10", "q11");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean)
|
||||
{
|
||||
int i, j, outCFFT;
|
||||
WebRtc_Word32 tmp32no1;
|
||||
static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean) {
|
||||
int i, j, outCFFT;
|
||||
WebRtc_Word32 tmp32no1;
|
||||
|
||||
// Synthesis
|
||||
for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
|
||||
{
|
||||
// We overwrite two more elements in fft[], but it's ok.
|
||||
__asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
|
||||
__asm__("vmov q11, q10" : : : "q10", "q11");
|
||||
// Synthesis
|
||||
for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
|
||||
// We overwrite two more elements in fft[], but it's ok.
|
||||
__asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
|
||||
__asm__("vmov q11, q10" : : : "q10", "q11");
|
||||
|
||||
__asm__("vneg.s16 d23, d23" : : : "q11");
|
||||
__asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11");
|
||||
__asm__("vneg.s16 d23, d23" : : : "q11");
|
||||
__asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11");
|
||||
|
||||
__asm__("vrev64.16 q10, q10" : : : "q10");
|
||||
__asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10");
|
||||
}
|
||||
__asm__("vrev64.16 q10, q10" : : : "q10");
|
||||
__asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10");
|
||||
}
|
||||
|
||||
fft[PART_LEN2] = efw[PART_LEN].real;
|
||||
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
|
||||
fft[PART_LEN2] = efw[PART_LEN].real;
|
||||
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
|
||||
|
||||
// Inverse FFT, result should be scaled with outCFFT.
|
||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
|
||||
// Inverse FFT, result should be scaled with outCFFT.
|
||||
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
|
||||
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
|
||||
|
||||
// Take only the real values and scale with outCFFT.
|
||||
for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16)
|
||||
{
|
||||
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
|
||||
__asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
|
||||
}
|
||||
// Take only the real values and scale with outCFFT.
|
||||
for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) {
|
||||
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
|
||||
__asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
|
||||
}
|
||||
|
||||
int32x4_t tmp32x4_2;
|
||||
__asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
|
||||
(outCFFT - aecm->dfaCleanQDomain)));
|
||||
for (i = 0; i < PART_LEN; i += 4)
|
||||
{
|
||||
int16x4_t tmp16x4_0;
|
||||
int16x4_t tmp16x4_1;
|
||||
int32x4_t tmp32x4_0;
|
||||
int32x4_t tmp32x4_1;
|
||||
int32x4_t tmp32x4_2;
|
||||
__asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
|
||||
(outCFFT - aecm->dfaCleanQDomain)));
|
||||
for (i = 0; i < PART_LEN; i += 4) {
|
||||
int16x4_t tmp16x4_0;
|
||||
int16x4_t tmp16x4_1;
|
||||
int32x4_t tmp32x4_0;
|
||||
int32x4_t tmp32x4_1;
|
||||
|
||||
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
// fft[i], WebRtcAecm_kSqrtHanning[i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i]));
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
|
||||
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
|
||||
__asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
|
||||
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
// fft[i], WebRtcAecm_kSqrtHanning[i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i]));
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
|
||||
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
|
||||
__asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
|
||||
|
||||
// tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
|
||||
// outCFFT - aecm->dfaCleanQDomain);
|
||||
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
|
||||
// tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
|
||||
// outCFFT - aecm->dfaCleanQDomain);
|
||||
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
|
||||
|
||||
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
|
||||
// tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
|
||||
// output[i] = fft[i];
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
|
||||
__asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
|
||||
__asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
|
||||
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
|
||||
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i]));
|
||||
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
|
||||
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
|
||||
// tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
|
||||
// output[i] = fft[i];
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
|
||||
__asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
|
||||
__asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
|
||||
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
|
||||
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i]));
|
||||
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
|
||||
|
||||
// tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
// fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i]));
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
|
||||
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
|
||||
__asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
|
||||
// tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
|
||||
// fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i]));
|
||||
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
|
||||
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
|
||||
__asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
|
||||
|
||||
// tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
|
||||
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
|
||||
// outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
|
||||
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
|
||||
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
|
||||
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
|
||||
}
|
||||
// tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
|
||||
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
|
||||
// outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
|
||||
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
|
||||
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
|
||||
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
|
||||
}
|
||||
|
||||
// Copy the current block to the old position (outBuf is shifted elsewhere).
|
||||
for (i = 0; i < PART_LEN; i += 16)
|
||||
{
|
||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
// Copy the current block to the old position (outBuf is shifted elsewhere).
|
||||
for (i = 0; i < PART_LEN; i += 16) {
|
||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->xBuf[i + PART_LEN]) : "q10");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
|
||||
}
|
||||
for (i = 0; i < PART_LEN; i += 16)
|
||||
{
|
||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
|
||||
}
|
||||
for (i = 0; i < PART_LEN; i += 16) {
|
||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->dBufNoisy[i]): "q10");
|
||||
}
|
||||
if (nearendClean != NULL) {
|
||||
for (i = 0; i < PART_LEN; i += 16) {
|
||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->dBufClean[i]): "q10");
|
||||
}
|
||||
if (nearendClean != NULL) {
|
||||
for (i = 0; i < PART_LEN; i += 16)
|
||||
{
|
||||
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->dBufClean[i]): "q10");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
|
||||
static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored)
|
||||
{
|
||||
int i;
|
||||
WebRtc_UWord32* echo_energy_stored) {
|
||||
int i;
|
||||
|
||||
register WebRtc_UWord32 far_energy_r;
|
||||
register WebRtc_UWord32 echo_energy_stored_r;
|
||||
register WebRtc_UWord32 echo_energy_adapt_r;
|
||||
uint32x4_t tmp32x4_0;
|
||||
register WebRtc_UWord32 far_energy_r;
|
||||
register WebRtc_UWord32 echo_energy_stored_r;
|
||||
register WebRtc_UWord32 echo_energy_adapt_r;
|
||||
uint32x4_t tmp32x4_0;
|
||||
|
||||
__asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy
|
||||
__asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
|
||||
__asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
|
||||
__asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy
|
||||
__asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
|
||||
__asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
|
||||
|
||||
for(i = 0; i < PART_LEN -7; i += 8)
|
||||
{
|
||||
// far_energy += (WebRtc_UWord32)(far_spectrum[i]);
|
||||
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
||||
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
|
||||
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
|
||||
|
||||
// Get estimated echo energies for adaptive channel and stored channel.
|
||||
// echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
|
||||
"q10", "q11");
|
||||
|
||||
// echo_energy_stored += (WebRtc_UWord32)echoEst[i];
|
||||
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
|
||||
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
|
||||
|
||||
// echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
|
||||
// aecm->channelAdapt16[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vadd.u32 q9, q10" : : : "q9", "q15");
|
||||
__asm__("vadd.u32 q9, q11" : : : "q9", "q11");
|
||||
}
|
||||
|
||||
__asm__("vadd.u32 d28, d29" : : : "q14");
|
||||
__asm__("vpadd.u32 d28, d28" : : : "q14");
|
||||
__asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
|
||||
|
||||
__asm__("vadd.u32 d18, d19" : : : "q9");
|
||||
__asm__("vpadd.u32 d18, d18" : : : "q9");
|
||||
__asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
|
||||
|
||||
__asm__("vadd.u32 d16, d17" : : : "q8");
|
||||
__asm__("vpadd.u32 d16, d16" : : : "q8");
|
||||
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
|
||||
for (i = 0; i < PART_LEN - 7; i += 8) {
|
||||
// far_energy += (WebRtc_UWord32)(far_spectrum[i]);
|
||||
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
||||
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
|
||||
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
|
||||
|
||||
// Get estimated echo energies for adaptive channel and stored channel.
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i];
|
||||
*far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
|
||||
*echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
|
||||
aecm->channelAdapt16[i], far_spectrum[i]);
|
||||
// echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
|
||||
"q10", "q11");
|
||||
|
||||
// echo_energy_stored += (WebRtc_UWord32)echoEst[i];
|
||||
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
|
||||
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
|
||||
|
||||
// echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
|
||||
// aecm->channelAdapt16[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vadd.u32 q9, q10" : : : "q9", "q15");
|
||||
__asm__("vadd.u32 q9, q11" : : : "q9", "q11");
|
||||
}
|
||||
|
||||
__asm__("vadd.u32 d28, d29" : : : "q14");
|
||||
__asm__("vpadd.u32 d28, d28" : : : "q14");
|
||||
__asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
|
||||
|
||||
__asm__("vadd.u32 d18, d19" : : : "q9");
|
||||
__asm__("vpadd.u32 d18, d18" : : : "q9");
|
||||
__asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
|
||||
|
||||
__asm__("vadd.u32 d16, d17" : : : "q8");
|
||||
__asm__("vpadd.u32 d16, d16" : : : "q8");
|
||||
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
|
||||
|
||||
// Get estimated echo energies for adaptive channel and stored channel.
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
*echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i];
|
||||
*far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
|
||||
*echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
|
||||
aecm->channelAdapt16[i], far_spectrum[i]);
|
||||
}
|
||||
|
||||
void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
|
||||
static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est)
|
||||
{
|
||||
int i;
|
||||
WebRtc_Word32* echo_est) {
|
||||
int i;
|
||||
|
||||
// During startup we store the channel every block.
|
||||
// Recalculate echo estimate.
|
||||
for(i = 0; i < PART_LEN -7; i += 8)
|
||||
{
|
||||
// aecm->channelStored[i] = acem->channelAdapt16[i];
|
||||
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&echo_est[i]) : "q10", "q11");
|
||||
}
|
||||
aecm->channelStored[i] = aecm->channelAdapt16[i];
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
// During startup we store the channel every block.
|
||||
// Recalculate echo estimate.
|
||||
for (i = 0; i < PART_LEN - 7; i += 8) {
|
||||
// aecm->channelStored[i] = acem->channelAdapt16[i];
|
||||
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
|
||||
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&echo_est[i]) : "q10", "q11");
|
||||
}
|
||||
aecm->channelStored[i] = aecm->channelAdapt16[i];
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
}
|
||||
|
||||
void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
|
||||
{
|
||||
int i;
|
||||
static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
|
||||
int i;
|
||||
|
||||
for(i = 0; i < PART_LEN -7; i += 8)
|
||||
{
|
||||
// aecm->channelAdapt16[i] = aecm->channelStored[i];
|
||||
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
|
||||
// aecm->channelStored[i], 16);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : :
|
||||
"r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vst1.16 {d24, d25}, [%0, :128]" : :
|
||||
"r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
|
||||
__asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->channelAdapt32[i]): "q10", "q11");
|
||||
}
|
||||
aecm->channelAdapt16[i] = aecm->channelStored[i];
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
for (i = 0; i < PART_LEN - 7; i += 8) {
|
||||
// aecm->channelAdapt16[i] = aecm->channelStored[i];
|
||||
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
|
||||
// aecm->channelStored[i], 16);
|
||||
__asm__("vld1.16 {d24, d25}, [%0, :128]" : :
|
||||
"r"(&aecm->channelStored[i]) : "q12");
|
||||
__asm__("vst1.16 {d24, d25}, [%0, :128]" : :
|
||||
"r"(&aecm->channelAdapt16[i]) : "q12");
|
||||
__asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
|
||||
__asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
|
||||
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
|
||||
"r"(&aecm->channelAdapt32[i]): "q10", "q11");
|
||||
}
|
||||
aecm->channelAdapt16[i] = aecm->channelStored[i];
|
||||
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
|
||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
}
|
||||
|
||||
#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
|
||||
void WebRtcAecm_InitNeon(void) {
|
||||
WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
|
||||
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
|
||||
WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
|
||||
WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
|
||||
WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
|
||||
}
|
||||
|
@ -6,6 +6,8 @@
|
||||
# in the file PATENTS. All contributing project authors may
|
||||
# be found in the AUTHORS file in the root of the source tree.
|
||||
|
||||
#############################
|
||||
# Build the non-neon library.
|
||||
LOCAL_PATH := $(call my-dir)
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
@ -20,25 +22,20 @@ LOCAL_SRC_FILES := \
|
||||
noise_suppression_x.c \
|
||||
nsx_core.c
|
||||
|
||||
# floating point
|
||||
# Files for floating point.
|
||||
# noise_suppression.c ns_core.c
|
||||
|
||||
# Flags passed to both C and C++ files.
|
||||
LOCAL_CFLAGS := \
|
||||
$(MY_WEBRTC_COMMON_DEFS)
|
||||
|
||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||
LOCAL_SRC_FILES += \
|
||||
nsx_core_neon.c
|
||||
LOCAL_CFLAGS += \
|
||||
$(MY_ARM_CFLAGS_NEON)
|
||||
endif
|
||||
LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS)
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(LOCAL_PATH)/interface \
|
||||
$(LOCAL_PATH)/../utility \
|
||||
$(LOCAL_PATH)/../../.. \
|
||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include \
|
||||
$(LOCAL_PATH)/../../../system_wrappers/interface
|
||||
|
||||
LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers
|
||||
|
||||
LOCAL_SHARED_LIBRARIES := \
|
||||
libcutils \
|
||||
@ -49,3 +46,31 @@ ifndef NDK_ROOT
|
||||
include external/stlport/libstlport.mk
|
||||
endif
|
||||
include $(BUILD_STATIC_LIBRARY)
|
||||
|
||||
#############################
|
||||
# Build the neon library.
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
|
||||
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
|
||||
LOCAL_MODULE := libwebrtc_ns_neon
|
||||
LOCAL_MODULE_TAGS := optional
|
||||
LOCAL_GENERATED_SOURCES :=
|
||||
|
||||
LOCAL_SRC_FILES := nsx_core_neon.c
|
||||
|
||||
# Flags passed to both C and C++ files.
|
||||
LOCAL_CFLAGS := \
|
||||
$(MY_WEBRTC_COMMON_DEFS) \
|
||||
-mfpu=neon \
|
||||
-flax-vector-conversions
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(LOCAL_PATH)/interface \
|
||||
$(LOCAL_PATH)/../../.. \
|
||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
||||
|
||||
ifndef NDK_ROOT
|
||||
include external/stlport/libstlport.mk
|
||||
endif
|
||||
include $(BUILD_STATIC_LIBRARY)
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "cpu_features_wrapper.h"
|
||||
#include "nsx_core.h"
|
||||
|
||||
// Skip first frequency bins during estimation. (0 <= value < 64)
|
||||
@ -426,6 +427,271 @@ static const WebRtc_Word16 kDeterminantEstMatrix[66] = {
|
||||
355, 330
|
||||
};
|
||||
|
||||
// Declare function pointers.
|
||||
NoiseEstimation WebRtcNsx_NoiseEstimation;
|
||||
PrepareSpectrum WebRtcNsx_PrepareSpectrum;
|
||||
SynthesisUpdate WebRtcNsx_SynthesisUpdate;
|
||||
AnalysisUpdate WebRtcNsx_AnalysisUpdate;
|
||||
Denormalize WebRtcNsx_Denormalize;
|
||||
CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
|
||||
|
||||
// Update the noise estimation information.
|
||||
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
||||
WebRtc_Word32 tmp32no1 = 0;
|
||||
WebRtc_Word32 tmp32no2 = 0;
|
||||
WebRtc_Word16 tmp16 = 0;
|
||||
const WebRtc_Word16 kExp2Const = 11819; // Q13
|
||||
|
||||
int i = 0;
|
||||
|
||||
tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
|
||||
inst->magnLen);
|
||||
// Guarantee a Q-domain as high as possible and still fit in int16
|
||||
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
kExp2Const, tmp16, 21);
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
|
||||
// in Q21
|
||||
tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
|
||||
inst->noiseEstLogQuantile[offset + i]);
|
||||
tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
|
||||
tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
|
||||
tmp16 -= 21;// shift 21 to get result in Q0
|
||||
tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
|
||||
if (tmp16 < 0) {
|
||||
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
|
||||
} else {
|
||||
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
|
||||
}
|
||||
inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
|
||||
}
|
||||
}
|
||||
|
||||
// Noise Estimation
|
||||
static void NoiseEstimationC(NsxInst_t* inst,
|
||||
uint16_t* magn,
|
||||
uint32_t* noise,
|
||||
int16_t* q_noise) {
|
||||
WebRtc_Word32 numerator = FACTOR_Q16;
|
||||
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
|
||||
WebRtc_Word16 countProd, delta, zeros, frac;
|
||||
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
|
||||
const int16_t log2_const = 22713; // Q15
|
||||
const int16_t width_factor = 21845;
|
||||
|
||||
int i, s, offset;
|
||||
|
||||
tabind = inst->stages - inst->normData;
|
||||
assert(tabind < 9);
|
||||
assert(tabind > -9);
|
||||
if (tabind < 0) {
|
||||
logval = -WebRtcNsx_kLogTable[-tabind];
|
||||
} else {
|
||||
logval = WebRtcNsx_kLogTable[tabind];
|
||||
}
|
||||
|
||||
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
|
||||
// magn is in Q(-stages), and the real lmagn values are:
|
||||
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
|
||||
// lmagn in Q8
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
if (magn[i]) {
|
||||
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
|
||||
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
|
||||
& 0x7FFFFFFF) >> 23);
|
||||
// log2(magn(i))
|
||||
assert(frac < 256);
|
||||
log2 = (WebRtc_Word16)(((31 - zeros) << 8)
|
||||
+ WebRtcNsx_kLogTableFrac[frac]);
|
||||
// log2(magn(i))*log(2)
|
||||
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
|
||||
// + log(2^stages)
|
||||
lmagn[i] += logval;
|
||||
} else {
|
||||
lmagn[i] = logval;//0;
|
||||
}
|
||||
}
|
||||
|
||||
// loop over simultaneous estimates
|
||||
for (s = 0; s < SIMULT; s++) {
|
||||
offset = s * inst->magnLen;
|
||||
|
||||
// Get counter values from state
|
||||
counter = inst->noiseEstCounter[s];
|
||||
assert(counter < 201);
|
||||
countDiv = WebRtcNsx_kCounterDiv[counter];
|
||||
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
|
||||
|
||||
// quant_est(...)
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
// compute delta
|
||||
if (inst->noiseEstDensity[offset + i] > 512) {
|
||||
delta = WebRtcSpl_DivW32W16ResW16(numerator,
|
||||
inst->noiseEstDensity[offset + i]);
|
||||
} else {
|
||||
delta = FACTOR_Q7;
|
||||
if (inst->blockIndex < END_STARTUP_LONG) {
|
||||
// Smaller step size during startup. This prevents from using
|
||||
// unrealistic values causing overflow.
|
||||
delta = FACTOR_Q7_STARTUP;
|
||||
}
|
||||
}
|
||||
|
||||
// update log quantile estimate
|
||||
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
|
||||
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
|
||||
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
|
||||
// CounterDiv=1/(inst->counter[s]+1) in Q15
|
||||
tmp16 += 2;
|
||||
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
|
||||
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
|
||||
} else {
|
||||
tmp16 += 1;
|
||||
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
|
||||
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
|
||||
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
|
||||
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
|
||||
if (inst->noiseEstLogQuantile[offset + i] < logval) {
|
||||
// This is the smallest fixed point representation we can
|
||||
// have, hence we limit the output.
|
||||
inst->noiseEstLogQuantile[offset + i] = logval;
|
||||
}
|
||||
}
|
||||
|
||||
// update density estimate
|
||||
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
|
||||
< WIDTH_Q8) {
|
||||
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
inst->noiseEstDensity[offset + i], countProd, 15);
|
||||
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
width_factor, countDiv, 15);
|
||||
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
|
||||
}
|
||||
} // end loop over magnitude spectrum
|
||||
|
||||
if (counter >= END_STARTUP_LONG) {
|
||||
inst->noiseEstCounter[s] = 0;
|
||||
if (inst->blockIndex >= END_STARTUP_LONG) {
|
||||
UpdateNoiseEstimate(inst, offset);
|
||||
}
|
||||
}
|
||||
inst->noiseEstCounter[s]++;
|
||||
|
||||
} // end loop over simultaneous estimates
|
||||
|
||||
// Sequentially update the noise during startup
|
||||
if (inst->blockIndex < END_STARTUP_LONG) {
|
||||
UpdateNoiseEstimate(inst, offset);
|
||||
}
|
||||
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
|
||||
}
|
||||
(*q_noise) = (WebRtc_Word16)inst->qNoise;
|
||||
}
|
||||
|
||||
// Filter the data in the frequency domain, and create spectrum.
|
||||
static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) {
|
||||
int i = 0, j = 0;
|
||||
int16_t tmp16 = 0;
|
||||
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
|
||||
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
|
||||
inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
|
||||
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
|
||||
}
|
||||
|
||||
freq_buf[0] = inst->real[0];
|
||||
freq_buf[1] = -inst->imag[0];
|
||||
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
|
||||
tmp16 = (inst->anaLen << 1) - j;
|
||||
freq_buf[j] = inst->real[i];
|
||||
freq_buf[j + 1] = -inst->imag[i];
|
||||
freq_buf[tmp16] = inst->real[i];
|
||||
freq_buf[tmp16 + 1] = inst->imag[i];
|
||||
}
|
||||
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
|
||||
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
|
||||
}
|
||||
|
||||
// Denormalize the input buffer.
|
||||
static __inline void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) {
|
||||
int i = 0, j = 0;
|
||||
int32_t tmp32 = 0;
|
||||
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
|
||||
tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
|
||||
factor - inst->normData);
|
||||
inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
|
||||
}
|
||||
}
|
||||
|
||||
// For the noise supression process, synthesis, read out fully processed
|
||||
// segment, and update synthesis buffer.
|
||||
static void SynthesisUpdateC(NsxInst_t* inst,
|
||||
int16_t* out_frame,
|
||||
int16_t gain_factor) {
|
||||
int i = 0;
|
||||
int16_t tmp16a = 0;
|
||||
int16_t tmp16b = 0;
|
||||
int32_t tmp32 = 0;
|
||||
|
||||
// synthesis
|
||||
for (i = 0; i < inst->anaLen; i++) {
|
||||
tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
inst->window[i], inst->real[i], 14); // Q0, window in Q14
|
||||
tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
|
||||
// Down shift with rounding
|
||||
tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
|
||||
inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
|
||||
tmp16b); // Q0
|
||||
}
|
||||
|
||||
// read out fully processed segment
|
||||
for (i = 0; i < inst->blockLen10ms; i++) {
|
||||
out_frame[i] = inst->synthesisBuffer[i]; // Q0
|
||||
}
|
||||
|
||||
// update synthesis buffer
|
||||
WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
|
||||
inst->synthesisBuffer + inst->blockLen10ms,
|
||||
inst->anaLen - inst->blockLen10ms);
|
||||
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
|
||||
+ inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
|
||||
}
|
||||
|
||||
// Update analysis buffer for lower band, and window data before FFT.
|
||||
static void AnalysisUpdateC(NsxInst_t* inst,
|
||||
int16_t* out,
|
||||
int16_t* new_speech) {
|
||||
int i = 0;
|
||||
|
||||
// For lower band update analysis buffer.
|
||||
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
|
||||
inst->analysisBuffer + inst->blockLen10ms,
|
||||
inst->anaLen - inst->blockLen10ms);
|
||||
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
|
||||
+ inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
|
||||
|
||||
// Window data before FFT.
|
||||
for (i = 0; i < inst->anaLen; i++) {
|
||||
out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
inst->window[i], inst->analysisBuffer[i], 14); // Q0
|
||||
}
|
||||
}
|
||||
|
||||
// Create a complex number buffer (out[]) as the intput (in[]) interleaved with
|
||||
// zeros, and normalize it.
|
||||
static __inline void CreateComplexBufferC(NsxInst_t* inst,
|
||||
int16_t* in,
|
||||
int16_t* out) {
|
||||
int i = 0, j = 0;
|
||||
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
|
||||
out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
|
||||
out[j + 1] = 0; // Insert zeros in imaginary part
|
||||
}
|
||||
}
|
||||
|
||||
void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst,
|
||||
WebRtc_Word16 pink_noise_exp_avg,
|
||||
WebRtc_Word32 pink_noise_num_avg,
|
||||
@ -600,6 +866,24 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs) {
|
||||
inst->file5 = fopen("file5.pcm", "wb");
|
||||
#endif
|
||||
|
||||
// Initialize function pointers.
|
||||
WebRtcNsx_NoiseEstimation = NoiseEstimationC;
|
||||
WebRtcNsx_PrepareSpectrum = PrepareSpectrumC;
|
||||
WebRtcNsx_SynthesisUpdate = SynthesisUpdateC;
|
||||
WebRtcNsx_AnalysisUpdate = AnalysisUpdateC;
|
||||
WebRtcNsx_Denormalize = DenormalizeC;
|
||||
WebRtcNsx_CreateComplexBuffer = CreateComplexBufferC;
|
||||
|
||||
#ifdef WEBRTC_DETECT_ARM_NEON
|
||||
uint64_t features = WebRtc_GetCPUFeaturesARM();
|
||||
if ((features & kCPUFeatureNEON) != 0)
|
||||
{
|
||||
WebRtcNsx_InitNeon();
|
||||
}
|
||||
#elif defined(WEBRTC_ARCH_ARM_NEON)
|
||||
WebRtcNsx_InitNeon();
|
||||
#endif
|
||||
|
||||
inst->initFlag = 1;
|
||||
|
||||
return 0;
|
||||
@ -2157,263 +2441,4 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
|
||||
|
||||
// Update the noise estimation information.
|
||||
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
||||
WebRtc_Word32 tmp32no1 = 0;
|
||||
WebRtc_Word32 tmp32no2 = 0;
|
||||
WebRtc_Word16 tmp16 = 0;
|
||||
const WebRtc_Word16 kExp2Const = 11819; // Q13
|
||||
|
||||
int i = 0;
|
||||
|
||||
tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
|
||||
inst->magnLen);
|
||||
// Guarantee a Q-domain as high as possible and still fit in int16
|
||||
inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
kExp2Const, tmp16, 21);
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
|
||||
// in Q21
|
||||
tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
|
||||
inst->noiseEstLogQuantile[offset + i]);
|
||||
tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
|
||||
tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
|
||||
tmp16 -= 21;// shift 21 to get result in Q0
|
||||
tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise)
|
||||
if (tmp16 < 0) {
|
||||
tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
|
||||
} else {
|
||||
tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
|
||||
}
|
||||
inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1);
|
||||
}
|
||||
}
|
||||
|
||||
// Noise Estimation
|
||||
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
||||
uint16_t* magn,
|
||||
uint32_t* noise,
|
||||
int16_t* q_noise) {
|
||||
WebRtc_Word32 numerator = FACTOR_Q16;
|
||||
WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
|
||||
WebRtc_Word16 countProd, delta, zeros, frac;
|
||||
WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
|
||||
const int16_t log2_const = 22713; // Q15
|
||||
const int16_t width_factor = 21845;
|
||||
|
||||
int i, s, offset;
|
||||
|
||||
tabind = inst->stages - inst->normData;
|
||||
assert(tabind < 9);
|
||||
assert(tabind > -9);
|
||||
if (tabind < 0) {
|
||||
logval = -WebRtcNsx_kLogTable[-tabind];
|
||||
} else {
|
||||
logval = WebRtcNsx_kLogTable[tabind];
|
||||
}
|
||||
|
||||
// lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
|
||||
// magn is in Q(-stages), and the real lmagn values are:
|
||||
// real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
|
||||
// lmagn in Q8
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
if (magn[i]) {
|
||||
zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]);
|
||||
frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros)
|
||||
& 0x7FFFFFFF) >> 23);
|
||||
// log2(magn(i))
|
||||
assert(frac < 256);
|
||||
log2 = (WebRtc_Word16)(((31 - zeros) << 8)
|
||||
+ WebRtcNsx_kLogTableFrac[frac]);
|
||||
// log2(magn(i))*log(2)
|
||||
lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
|
||||
// + log(2^stages)
|
||||
lmagn[i] += logval;
|
||||
} else {
|
||||
lmagn[i] = logval;//0;
|
||||
}
|
||||
}
|
||||
|
||||
// loop over simultaneous estimates
|
||||
for (s = 0; s < SIMULT; s++) {
|
||||
offset = s * inst->magnLen;
|
||||
|
||||
// Get counter values from state
|
||||
counter = inst->noiseEstCounter[s];
|
||||
assert(counter < 201);
|
||||
countDiv = WebRtcNsx_kCounterDiv[counter];
|
||||
countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv);
|
||||
|
||||
// quant_est(...)
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
// compute delta
|
||||
if (inst->noiseEstDensity[offset + i] > 512) {
|
||||
delta = WebRtcSpl_DivW32W16ResW16(numerator,
|
||||
inst->noiseEstDensity[offset + i]);
|
||||
} else {
|
||||
delta = FACTOR_Q7;
|
||||
if (inst->blockIndex < END_STARTUP_LONG) {
|
||||
// Smaller step size during startup. This prevents from using
|
||||
// unrealistic values causing overflow.
|
||||
delta = FACTOR_Q7_STARTUP;
|
||||
}
|
||||
}
|
||||
|
||||
// update log quantile estimate
|
||||
tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
|
||||
if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
|
||||
// +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
|
||||
// CounterDiv=1/(inst->counter[s]+1) in Q15
|
||||
tmp16 += 2;
|
||||
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
|
||||
inst->noiseEstLogQuantile[offset + i] += tmp16no1;
|
||||
} else {
|
||||
tmp16 += 1;
|
||||
tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
|
||||
// *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
|
||||
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
|
||||
inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
|
||||
if (inst->noiseEstLogQuantile[offset + i] < logval) {
|
||||
// This is the smallest fixed point representation we can
|
||||
// have, hence we limit the output.
|
||||
inst->noiseEstLogQuantile[offset + i] = logval;
|
||||
}
|
||||
}
|
||||
|
||||
// update density estimate
|
||||
if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
|
||||
< WIDTH_Q8) {
|
||||
tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
inst->noiseEstDensity[offset + i], countProd, 15);
|
||||
tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
width_factor, countDiv, 15);
|
||||
inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
|
||||
}
|
||||
} // end loop over magnitude spectrum
|
||||
|
||||
if (counter >= END_STARTUP_LONG) {
|
||||
inst->noiseEstCounter[s] = 0;
|
||||
if (inst->blockIndex >= END_STARTUP_LONG) {
|
||||
UpdateNoiseEstimate(inst, offset);
|
||||
}
|
||||
}
|
||||
inst->noiseEstCounter[s]++;
|
||||
|
||||
} // end loop over simultaneous estimates
|
||||
|
||||
// Sequentially update the noise during startup
|
||||
if (inst->blockIndex < END_STARTUP_LONG) {
|
||||
UpdateNoiseEstimate(inst, offset);
|
||||
}
|
||||
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise)
|
||||
}
|
||||
(*q_noise) = (WebRtc_Word16)inst->qNoise;
|
||||
}
|
||||
|
||||
// Filter the data in the frequency domain, and create spectrum.
|
||||
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
||||
int i = 0, j = 0;
|
||||
int16_t tmp16 = 0;
|
||||
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
|
||||
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
|
||||
inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
|
||||
(WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
|
||||
}
|
||||
|
||||
freq_buf[0] = inst->real[0];
|
||||
freq_buf[1] = -inst->imag[0];
|
||||
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
|
||||
tmp16 = (inst->anaLen << 1) - j;
|
||||
freq_buf[j] = inst->real[i];
|
||||
freq_buf[j + 1] = -inst->imag[i];
|
||||
freq_buf[tmp16] = inst->real[i];
|
||||
freq_buf[tmp16 + 1] = inst->imag[i];
|
||||
}
|
||||
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
|
||||
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
|
||||
}
|
||||
|
||||
// Denormalize the input buffer.
|
||||
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
|
||||
int i = 0, j = 0;
|
||||
int32_t tmp32 = 0;
|
||||
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
|
||||
tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j],
|
||||
factor - inst->normData);
|
||||
inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
|
||||
}
|
||||
}
|
||||
|
||||
// For the noise supression process, synthesis, read out fully processed
|
||||
// segment, and update synthesis buffer.
|
||||
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
||||
int16_t* out_frame,
|
||||
int16_t gain_factor) {
|
||||
int i = 0;
|
||||
int16_t tmp16a = 0;
|
||||
int16_t tmp16b = 0;
|
||||
int32_t tmp32 = 0;
|
||||
|
||||
// synthesis
|
||||
for (i = 0; i < inst->anaLen; i++) {
|
||||
tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
inst->window[i], inst->real[i], 14); // Q0, window in Q14
|
||||
tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
|
||||
// Down shift with rounding
|
||||
tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
|
||||
inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
|
||||
tmp16b); // Q0
|
||||
}
|
||||
|
||||
// read out fully processed segment
|
||||
for (i = 0; i < inst->blockLen10ms; i++) {
|
||||
out_frame[i] = inst->synthesisBuffer[i]; // Q0
|
||||
}
|
||||
|
||||
// update synthesis buffer
|
||||
WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
|
||||
inst->synthesisBuffer + inst->blockLen10ms,
|
||||
inst->anaLen - inst->blockLen10ms);
|
||||
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
|
||||
+ inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
|
||||
}
|
||||
|
||||
// Update analysis buffer for lower band, and window data before FFT.
|
||||
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
||||
int16_t* out,
|
||||
int16_t* new_speech) {
|
||||
int i = 0;
|
||||
|
||||
// For lower band update analysis buffer.
|
||||
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
|
||||
inst->analysisBuffer + inst->blockLen10ms,
|
||||
inst->anaLen - inst->blockLen10ms);
|
||||
WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
|
||||
+ inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
|
||||
|
||||
// Window data before FFT.
|
||||
for (i = 0; i < inst->anaLen; i++) {
|
||||
out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
|
||||
inst->window[i], inst->analysisBuffer[i], 14); // Q0
|
||||
}
|
||||
}
|
||||
|
||||
// Create a complex number buffer (out[]) as the intput (in[]) interleaved with
|
||||
// zeros, and normalize it.
|
||||
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
|
||||
int16_t* in,
|
||||
int16_t* out) {
|
||||
int i = 0, j = 0;
|
||||
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
|
||||
out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
|
||||
out[j + 1] = 0; // Insert zeros in imaginary part
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
|
||||
|
@ -165,40 +165,51 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst,
|
||||
short* outFrameHigh);
|
||||
|
||||
/****************************************************************************
|
||||
* Internal functions and variable declarations shared with optimized code.
|
||||
* Some function pointers, for internal functions shared by ARM NEON and
|
||||
* generic C code.
|
||||
*/
|
||||
|
||||
// Noise Estimation.
|
||||
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
||||
uint16_t* magn,
|
||||
uint32_t* noise,
|
||||
int16_t* q_noise);
|
||||
typedef void (*NoiseEstimation)(NsxInst_t* inst,
|
||||
uint16_t* magn,
|
||||
uint32_t* noise,
|
||||
int16_t* q_noise);
|
||||
extern NoiseEstimation WebRtcNsx_NoiseEstimation;
|
||||
|
||||
// Filter the data in the frequency domain, and create spectrum.
|
||||
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst,
|
||||
int16_t* freq_buff);
|
||||
typedef void (*PrepareSpectrum)(NsxInst_t* inst,
|
||||
int16_t* freq_buff);
|
||||
extern PrepareSpectrum WebRtcNsx_PrepareSpectrum;
|
||||
|
||||
// For the noise supression process, synthesis, read out fully processed
|
||||
// segment, and update synthesis buffer.
|
||||
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
||||
int16_t* out_frame,
|
||||
int16_t gain_factor);
|
||||
typedef void (*SynthesisUpdate)(NsxInst_t* inst,
|
||||
int16_t* out_frame,
|
||||
int16_t gain_factor);
|
||||
extern SynthesisUpdate WebRtcNsx_SynthesisUpdate;
|
||||
|
||||
// Update analysis buffer for lower band, and window data before FFT.
|
||||
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
||||
int16_t* out,
|
||||
int16_t* new_speech);
|
||||
typedef void (*AnalysisUpdate)(NsxInst_t* inst,
|
||||
int16_t* out,
|
||||
int16_t* new_speech);
|
||||
extern AnalysisUpdate WebRtcNsx_AnalysisUpdate;
|
||||
|
||||
// Denormalize the input buffer.
|
||||
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst,
|
||||
int16_t* in,
|
||||
int factor);
|
||||
typedef void (*Denormalize)(NsxInst_t* inst,
|
||||
int16_t* in,
|
||||
int factor);
|
||||
extern Denormalize WebRtcNsx_Denormalize;
|
||||
|
||||
// Create a complex number buffer, as the intput interleaved with zeros,
|
||||
// and normalize it.
|
||||
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
|
||||
int16_t* in,
|
||||
int16_t* out);
|
||||
typedef void (*CreateComplexBuffer)(NsxInst_t* inst,
|
||||
int16_t* in,
|
||||
int16_t* out);
|
||||
extern CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
|
||||
|
||||
/****************************************************************************
|
||||
* Initialization of the above function pointers for ARM Neon.
|
||||
*/
|
||||
void WebRtcNsx_InitNeon(void);
|
||||
|
||||
extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
|
||||
extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];
|
||||
@ -208,4 +219,4 @@ extern const WebRtc_Word16 WebRtcNsx_kCounterDiv[201];
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_
|
||||
|
@ -8,15 +8,13 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#if defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
|
||||
|
||||
#include "nsx_core.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
|
||||
// Update the noise estimation information.
|
||||
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
||||
static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) {
|
||||
int i = 0;
|
||||
const int16_t kExp2Const = 11819; // Q13
|
||||
int16_t* ptr_noiseEstLogQuantile = NULL;
|
||||
@ -75,7 +73,7 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
||||
}
|
||||
|
||||
// Last iteration:
|
||||
|
||||
|
||||
// inst->quantile[i]=exp(inst->lquantile[offset+i]);
|
||||
// in Q21
|
||||
int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
|
||||
@ -94,10 +92,10 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
|
||||
}
|
||||
|
||||
// Noise Estimation
|
||||
void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
||||
uint16_t* magn,
|
||||
uint32_t* noise,
|
||||
int16_t* q_noise) {
|
||||
static void NoiseEstimationNeon(NsxInst_t* inst,
|
||||
uint16_t* magn,
|
||||
uint32_t* noise,
|
||||
int16_t* q_noise) {
|
||||
int32_t numerator = FACTOR_Q16;
|
||||
int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
|
||||
int16_t countProd, delta, zeros, frac;
|
||||
@ -126,11 +124,11 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
||||
if (magn[i]) {
|
||||
zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
|
||||
frac = (int16_t)((((uint32_t)magn[i] << zeros)
|
||||
& 0x7FFFFFFF) >> 23);
|
||||
& 0x7FFFFFFF) >> 23);
|
||||
assert(frac < 256);
|
||||
// log2(magn(i))
|
||||
log2 = (int16_t)(((31 - zeros) << 8)
|
||||
+ WebRtcNsx_kLogTableFrac[frac]);
|
||||
+ WebRtcNsx_kLogTableFrac[frac]);
|
||||
// log2(magn(i))*log(2)
|
||||
lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
|
||||
// + log(2^stages)
|
||||
@ -302,7 +300,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
||||
if (counter >= END_STARTUP_LONG) {
|
||||
inst->noiseEstCounter[s] = 0;
|
||||
if (inst->blockIndex >= END_STARTUP_LONG) {
|
||||
UpdateNoiseEstimate(inst, offset);
|
||||
UpdateNoiseEstimateNeon(inst, offset);
|
||||
}
|
||||
}
|
||||
inst->noiseEstCounter[s]++;
|
||||
@ -311,7 +309,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
||||
|
||||
// Sequentially update the noise during startup
|
||||
if (inst->blockIndex < END_STARTUP_LONG) {
|
||||
UpdateNoiseEstimate(inst, offset);
|
||||
UpdateNoiseEstimateNeon(inst, offset);
|
||||
}
|
||||
|
||||
for (i = 0; i < inst->magnLen; i++) {
|
||||
@ -321,7 +319,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
|
||||
}
|
||||
|
||||
// Filter the data in the frequency domain, and create spectrum.
|
||||
void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
||||
static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
|
||||
|
||||
// (1) Filtering.
|
||||
|
||||
@ -338,7 +336,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
||||
uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
|
||||
|
||||
// Filter the rest in the frequency domain.
|
||||
for (; ptr_real < &inst->real[inst->magnLen - 1]; ) {
|
||||
for (; ptr_real < &inst->real[inst->magnLen - 1];) {
|
||||
// Loop unrolled once. Both pointers are incremented by 4 twice.
|
||||
__asm__ __volatile__(
|
||||
"vld1.16 d20, [%[ptr_real]]\n\t"
|
||||
@ -368,7 +366,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
||||
:
|
||||
:"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
|
||||
"q9", "q10", "q11", "q12"
|
||||
);
|
||||
);
|
||||
}
|
||||
|
||||
// Filter the last pair of elements in the frequency domain.
|
||||
@ -400,7 +398,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
||||
int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
|
||||
ptr_real = &inst->real[1];
|
||||
ptr_imag = &inst->imag[1];
|
||||
for (; ptr_real < &inst->real[inst->anaLen2 - 11]; ) {
|
||||
for (; ptr_real < &inst->real[inst->anaLen2 - 11];) {
|
||||
// Loop unrolled once. All pointers are incremented twice.
|
||||
__asm__ __volatile__(
|
||||
"vld1.16 d22, [%[ptr_real]]!\n\t"
|
||||
@ -456,13 +454,13 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) {
|
||||
}
|
||||
|
||||
// Denormalize the input buffer.
|
||||
__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
|
||||
static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) {
|
||||
int16_t* ptr_real = &inst->real[0];
|
||||
int16_t* ptr_in = &in[0];
|
||||
|
||||
__asm__ __volatile__("vdup.32 q10, %0" ::
|
||||
"r"((int32_t)(factor - inst->normData)) : "q10");
|
||||
for (; ptr_real < &inst->real[inst->anaLen]; ) {
|
||||
for (; ptr_real < &inst->real[inst->anaLen];) {
|
||||
|
||||
// Loop unrolled once. Both pointers are incremented.
|
||||
__asm__ __volatile__(
|
||||
@ -495,9 +493,9 @@ __inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
|
||||
|
||||
// For the noise supress process, synthesis, read out fully processed segment,
|
||||
// and update synthesis buffer.
|
||||
void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
||||
int16_t* out_frame,
|
||||
int16_t gain_factor) {
|
||||
static void SynthesisUpdateNeon(NsxInst_t* inst,
|
||||
int16_t* out_frame,
|
||||
int16_t gain_factor) {
|
||||
int16_t* ptr_real = &inst->real[0];
|
||||
int16_t* ptr_syn = &inst->synthesisBuffer[0];
|
||||
int16_t* ptr_window = &inst->window[0];
|
||||
@ -505,7 +503,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
||||
// synthesis
|
||||
__asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
|
||||
// Loop unrolled once. All pointers are incremented in the assembly code.
|
||||
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
|
||||
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
|
||||
__asm__ __volatile__(
|
||||
// Load variables.
|
||||
"vld1.16 d22, [%[ptr_real]]!\n\t"
|
||||
@ -553,7 +551,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
||||
int16_t* ptr_out = &out_frame[0];
|
||||
ptr_syn = &inst->synthesisBuffer[0];
|
||||
// read out fully processed segment
|
||||
for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms]; ) {
|
||||
for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) {
|
||||
// Loop unrolled once. Both pointers are incremented in the assembly code.
|
||||
__asm__ __volatile__(
|
||||
// out_frame[i] = inst->synthesisBuffer[i]; // Q0
|
||||
@ -575,7 +573,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
||||
// inst->anaLen - inst->blockLen10ms);
|
||||
ptr_out = &inst->synthesisBuffer[0],
|
||||
ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
|
||||
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) {
|
||||
for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
|
||||
// Loop unrolled once. Both pointers are incremented in the assembly code.
|
||||
__asm__ __volatile__(
|
||||
"vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
|
||||
@ -593,7 +591,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
||||
// WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
|
||||
// + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
|
||||
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
|
||||
for (; ptr_out < &inst->synthesisBuffer[inst->anaLen]; ) {
|
||||
for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) {
|
||||
// Loop unrolled once. Pointer is incremented in the assembly code.
|
||||
__asm__ __volatile__(
|
||||
"vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
|
||||
@ -606,9 +604,9 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
|
||||
}
|
||||
|
||||
// Update analysis buffer for lower band, and window data before FFT.
|
||||
void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
||||
int16_t* out,
|
||||
int16_t* new_speech) {
|
||||
static void AnalysisUpdateNeon(NsxInst_t* inst,
|
||||
int16_t* out,
|
||||
int16_t* new_speech) {
|
||||
|
||||
int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
|
||||
int16_t* ptr_out = &inst->analysisBuffer[0];
|
||||
@ -617,7 +615,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
||||
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
|
||||
// inst->analysisBuffer + inst->blockLen10ms,
|
||||
// inst->anaLen - inst->blockLen10ms);
|
||||
for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms]; ) {
|
||||
for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
|
||||
// Loop unrolled once, so both pointers are incremented by 8 twice.
|
||||
__asm__ __volatile__(
|
||||
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
|
||||
@ -633,7 +631,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
||||
|
||||
// WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
|
||||
// + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
|
||||
for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen]; ) {
|
||||
for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
|
||||
// Loop unrolled once, so both pointers are incremented by 8 twice.
|
||||
__asm__ __volatile__(
|
||||
"vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
|
||||
@ -651,7 +649,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
||||
int16_t* ptr_window = &inst->window[0];
|
||||
ptr_out = &out[0];
|
||||
ptr_ana = &inst->analysisBuffer[0];
|
||||
for (; ptr_out < &out[inst->anaLen]; ) {
|
||||
for (; ptr_out < &out[inst->anaLen];) {
|
||||
|
||||
// Loop unrolled once, so all pointers are incremented by 4 twice.
|
||||
__asm__ __volatile__(
|
||||
@ -683,17 +681,17 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
|
||||
|
||||
// Create a complex number buffer (out[]) as the intput (in[]) interleaved with
|
||||
// zeros, and normalize it.
|
||||
__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
|
||||
int16_t* in,
|
||||
int16_t* out) {
|
||||
static __inline void CreateComplexBufferNeon(NsxInst_t* inst,
|
||||
int16_t* in,
|
||||
int16_t* out) {
|
||||
int16_t* ptr_out = &out[0];
|
||||
int16_t* ptr_in = &in[0];
|
||||
|
||||
__asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
|
||||
__asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
|
||||
for (; ptr_in < &in[inst->anaLen]; ) {
|
||||
for (; ptr_in < &in[inst->anaLen];) {
|
||||
|
||||
// Loop unrolled once, so ptr_in is incremented by 8 twice,
|
||||
// Loop unrolled once, so ptr_in is incremented by 8 twice,
|
||||
// and ptr_out is incremented by 8 four times.
|
||||
__asm__ __volatile__(
|
||||
// out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
|
||||
@ -724,4 +722,12 @@ __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
|
||||
);
|
||||
}
|
||||
}
|
||||
#endif // defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
|
||||
|
||||
void WebRtcNsx_InitNeon(void) {
|
||||
WebRtcNsx_NoiseEstimation = NoiseEstimationNeon;
|
||||
WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon;
|
||||
WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon;
|
||||
WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon;
|
||||
WebRtcNsx_Denormalize = DenormalizeNeon;
|
||||
WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon;
|
||||
}
|
||||
|
@ -15,18 +15,33 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// list of features.
|
||||
#include <typedefs.h>
|
||||
|
||||
// List of features in x86.
|
||||
typedef enum {
|
||||
kSSE2,
|
||||
kSSE3
|
||||
} CPUFeature;
|
||||
|
||||
// List of features in ARM.
|
||||
enum {
|
||||
kCPUFeatureARMv7 = (1 << 0),
|
||||
kCPUFeatureVFPv3 = (1 << 1),
|
||||
kCPUFeatureNEON = (1 << 2),
|
||||
kCPUFeatureLDREXSTREX = (1 << 3)
|
||||
};
|
||||
|
||||
typedef int (*WebRtc_CPUInfo)(CPUFeature feature);
|
||||
// returns true if the CPU supports the feature.
|
||||
extern WebRtc_CPUInfo WebRtc_GetCPUInfo;
|
||||
// No CPU feature is available => straight C path.
|
||||
extern WebRtc_CPUInfo WebRtc_GetCPUInfoNoASM;
|
||||
|
||||
// Return the features in an ARM device.
|
||||
// It detects the features in the hardware platform, and returns supported
|
||||
// values in the above enum definition as a bitmask.
|
||||
extern uint64_t WebRtc_GetCPUFeaturesARM(void);
|
||||
|
||||
#if defined(__cplusplus) || defined(c_plusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
@ -25,6 +25,7 @@ LOCAL_SRC_FILES := \
|
||||
condition_variable.cc \
|
||||
cpu_dummy.cc \
|
||||
cpu_features.cc \
|
||||
cpu_features_arm.c \
|
||||
cpu_info.cc \
|
||||
critical_section.cc \
|
||||
event.cc \
|
||||
|
333
src/system_wrappers/source/cpu_features_arm.c
Normal file
333
src/system_wrappers/source/cpu_features_arm.c
Normal file
@ -0,0 +1,333 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
// This file is derived from Android's NDK package r7, located at
|
||||
// <ndk>/sources/android/cpufeatures/ (downloadable from
|
||||
// http://developer.android.com/sdk/ndk/index.html).
|
||||
|
||||
#include "cpu_features_wrapper.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Define CPU family.
|
||||
typedef enum {
|
||||
CPU_FAMILY_UNKNOWN = 0,
|
||||
CPU_FAMILY_ARM,
|
||||
CPU_FAMILY_X86,
|
||||
CPU_FAMILY_MAX // Do not remove.
|
||||
} CpuFamily;
|
||||
|
||||
static pthread_once_t g_once;
|
||||
static CpuFamily g_cpuFamily;
|
||||
static uint64_t g_cpuFeatures;
|
||||
static int g_cpuCount;
|
||||
|
||||
static const int cpufeatures_debug = 0;
|
||||
|
||||
#ifdef __arm__
|
||||
# define DEFAULT_CPU_FAMILY CPU_FAMILY_ARM
|
||||
#elif defined __i386__
|
||||
# define DEFAULT_CPU_FAMILY CPU_FAMILY_X86
|
||||
#else
|
||||
# define DEFAULT_CPU_FAMILY CPU_FAMILY_UNKNOWN
|
||||
#endif
|
||||
|
||||
#define D(...) \
|
||||
do { \
|
||||
if (cpufeatures_debug) { \
|
||||
printf(__VA_ARGS__); fflush(stdout); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/* Read the content of /proc/cpuinfo into a user-provided buffer.
|
||||
* Return the length of the data, or -1 on error. Does *not*
|
||||
* zero-terminate the content. Will not read more
|
||||
* than 'buffsize' bytes.
|
||||
*/
|
||||
static int read_file(const char* pathname, char* buffer, size_t buffsize) {
|
||||
int fd, len;
|
||||
|
||||
fd = open(pathname, O_RDONLY);
|
||||
if (fd < 0)
|
||||
return -1;
|
||||
|
||||
do {
|
||||
len = read(fd, buffer, buffsize);
|
||||
} while (len < 0 && errno == EINTR);
|
||||
|
||||
close(fd);
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/* Extract the content of a the first occurence of a given field in
|
||||
* the content of /proc/cpuinfo and return it as a heap-allocated
|
||||
* string that must be freed by the caller.
|
||||
*
|
||||
* Return NULL if not found
|
||||
*/
|
||||
static char* extract_cpuinfo_field(char* buffer, int buflen, const char* field) {
|
||||
int fieldlen = strlen(field);
|
||||
char* bufend = buffer + buflen;
|
||||
char* result = NULL;
|
||||
int len, ignore;
|
||||
const char* p, *q;
|
||||
|
||||
/* Look for first field occurence, and ensures it starts the line.
|
||||
*/
|
||||
p = buffer;
|
||||
bufend = buffer + buflen;
|
||||
for (;;) {
|
||||
p = memmem(p, bufend - p, field, fieldlen);
|
||||
if (p == NULL)
|
||||
goto EXIT;
|
||||
|
||||
if (p == buffer || p[-1] == '\n')
|
||||
break;
|
||||
|
||||
p += fieldlen;
|
||||
}
|
||||
|
||||
/* Skip to the first column followed by a space */
|
||||
p += fieldlen;
|
||||
p = memchr(p, ':', bufend - p);
|
||||
if (p == NULL || p[1] != ' ')
|
||||
goto EXIT;
|
||||
|
||||
/* Find the end of the line */
|
||||
p += 2;
|
||||
q = memchr(p, '\n', bufend - p);
|
||||
if (q == NULL)
|
||||
q = bufend;
|
||||
|
||||
/* Copy the line into a heap-allocated buffer */
|
||||
len = q - p;
|
||||
result = malloc(len + 1);
|
||||
if (result == NULL)
|
||||
goto EXIT;
|
||||
|
||||
memcpy(result, p, len);
|
||||
result[len] = '\0';
|
||||
|
||||
EXIT:
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Count the number of occurences of a given field prefix in /proc/cpuinfo.
|
||||
*/
|
||||
static int count_cpuinfo_field(char* buffer, int buflen, const char* field) {
|
||||
int fieldlen = strlen(field);
|
||||
const char* p = buffer;
|
||||
const char* bufend = buffer + buflen;
|
||||
const char* q;
|
||||
int count = 0;
|
||||
|
||||
for (;;) {
|
||||
const char* q;
|
||||
|
||||
p = memmem(p, bufend - p, field, fieldlen);
|
||||
if (p == NULL)
|
||||
break;
|
||||
|
||||
/* Ensure that the field is at the start of a line */
|
||||
if (p > buffer && p[-1] != '\n') {
|
||||
p += fieldlen;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
/* skip any whitespace */
|
||||
q = p + fieldlen;
|
||||
while (q < bufend && (*q == ' ' || *q == '\t'))
|
||||
q++;
|
||||
|
||||
/* we must have a colon now */
|
||||
if (q < bufend && *q == ':') {
|
||||
count += 1;
|
||||
q ++;
|
||||
}
|
||||
p = q;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/* Like strlen(), but for constant string literals */
|
||||
#define STRLEN_CONST(x) ((sizeof(x)-1)
|
||||
|
||||
|
||||
/* Checks that a space-separated list of items contains one given 'item'.
|
||||
* Returns 1 if found, 0 otherwise.
|
||||
*/
|
||||
static int has_list_item(const char* list, const char* item) {
|
||||
const char* p = list;
|
||||
int itemlen = strlen(item);
|
||||
|
||||
if (list == NULL)
|
||||
return 0;
|
||||
|
||||
while (*p) {
|
||||
const char* q;
|
||||
|
||||
/* skip spaces */
|
||||
while (*p == ' ' || *p == '\t')
|
||||
p++;
|
||||
|
||||
/* find end of current list item */
|
||||
q = p;
|
||||
while (*q && *q != ' ' && *q != '\t')
|
||||
q++;
|
||||
|
||||
if (itemlen == q - p && !memcmp(p, item, itemlen))
|
||||
return 1;
|
||||
|
||||
/* skip to next item */
|
||||
p = q;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static void cpuInit(void) {
|
||||
char cpuinfo[4096];
|
||||
int cpuinfo_len;
|
||||
|
||||
g_cpuFamily = DEFAULT_CPU_FAMILY;
|
||||
g_cpuFeatures = 0;
|
||||
g_cpuCount = 1;
|
||||
|
||||
cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof cpuinfo);
|
||||
D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
|
||||
cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
|
||||
|
||||
if (cpuinfo_len < 0) { /* should not happen */
|
||||
return;
|
||||
}
|
||||
|
||||
/* Count the CPU cores, the value may be 0 for single-core CPUs */
|
||||
g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "processor");
|
||||
if (g_cpuCount == 0) {
|
||||
g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "Processor");
|
||||
if (g_cpuCount == 0) {
|
||||
g_cpuCount = 1;
|
||||
}
|
||||
}
|
||||
|
||||
D("found cpuCount = %d\n", g_cpuCount);
|
||||
|
||||
#ifdef __arm__
|
||||
{
|
||||
char* features = NULL;
|
||||
char* architecture = NULL;
|
||||
|
||||
/* Extract architecture from the "CPU Architecture" field.
|
||||
* The list is well-known, unlike the the output of
|
||||
* the 'Processor' field which can vary greatly.
|
||||
*
|
||||
* See the definition of the 'proc_arch' array in
|
||||
* $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
|
||||
* same file.
|
||||
*/
|
||||
char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
|
||||
"CPU architecture");
|
||||
|
||||
if (cpuArch != NULL) {
|
||||
char* end;
|
||||
long archNumber;
|
||||
int hasARMv7 = 0;
|
||||
|
||||
D("found cpuArch = '%s'\n", cpuArch);
|
||||
|
||||
/* read the initial decimal number, ignore the rest */
|
||||
archNumber = strtol(cpuArch, &end, 10);
|
||||
|
||||
/* Here we assume that ARMv8 will be upwards compatible with v7
|
||||
* in the future. Unfortunately, there is no 'Features' field to
|
||||
* indicate that Thumb-2 is supported.
|
||||
*/
|
||||
if (end > cpuArch && archNumber >= 7) {
|
||||
hasARMv7 = 1;
|
||||
}
|
||||
|
||||
/* Unfortunately, it seems that certain ARMv6-based CPUs
|
||||
* report an incorrect architecture number of 7!
|
||||
*
|
||||
* We try to correct this by looking at the 'elf_format'
|
||||
* field reported by the 'Processor' field, which is of the
|
||||
* form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
|
||||
* an ARMv6-one.
|
||||
*/
|
||||
if (hasARMv7) {
|
||||
char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
|
||||
"Processor");
|
||||
if (cpuProc != NULL) {
|
||||
D("found cpuProc = '%s'\n", cpuProc);
|
||||
if (has_list_item(cpuProc, "(v6l)")) {
|
||||
D("CPU processor and architecture mismatch!!\n");
|
||||
hasARMv7 = 0;
|
||||
}
|
||||
free(cpuProc);
|
||||
}
|
||||
}
|
||||
|
||||
if (hasARMv7) {
|
||||
g_cpuFeatures |= kCPUFeatureARMv7;
|
||||
}
|
||||
|
||||
/* The LDREX / STREX instructions are available from ARMv6 */
|
||||
if (archNumber >= 6) {
|
||||
g_cpuFeatures |= kCPUFeatureLDREXSTREX;
|
||||
}
|
||||
|
||||
free(cpuArch);
|
||||
}
|
||||
|
||||
/* Extract the list of CPU features from 'Features' field */
|
||||
char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
|
||||
"Features");
|
||||
|
||||
if (cpuFeatures != NULL) {
|
||||
|
||||
D("found cpuFeatures = '%s'\n", cpuFeatures);
|
||||
|
||||
if (has_list_item(cpuFeatures, "vfpv3"))
|
||||
g_cpuFeatures |= kCPUFeatureVFPv3;
|
||||
|
||||
else if (has_list_item(cpuFeatures, "vfpv3d16"))
|
||||
g_cpuFeatures |= kCPUFeatureVFPv3;
|
||||
|
||||
if (has_list_item(cpuFeatures, "neon")) {
|
||||
/* Note: Certain kernels only report neon but not vfpv3
|
||||
* in their features list. However, ARM mandates
|
||||
* that if Neon is implemented, so must be VFPv3
|
||||
* so always set the flag.
|
||||
*/
|
||||
g_cpuFeatures |= kCPUFeatureNEON |
|
||||
kCPUFeatureVFPv3;
|
||||
}
|
||||
free(cpuFeatures);
|
||||
}
|
||||
}
|
||||
#endif // __arm__
|
||||
|
||||
#ifdef __i386__
|
||||
g_cpuFamily = CPU_FAMILY_X86;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
uint64_t WebRtc_GetCPUFeaturesARM(void) {
|
||||
pthread_once(&g_once, cpuInit);
|
||||
return g_cpuFeatures;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user