diff --git a/src/modules/audio_processing/aecm/Android.mk b/src/modules/audio_processing/aecm/Android.mk
index 2d64b85c5..191d5bfe0 100644
--- a/src/modules/audio_processing/aecm/Android.mk
+++ b/src/modules/audio_processing/aecm/Android.mk
@@ -56,7 +56,21 @@
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := libwebrtc_aecm_neon
 LOCAL_MODULE_TAGS := optional
-LOCAL_SRC_FILES := aecm_core_neon.c
+GEN := $(LOCAL_PATH)/aecm_core_neon_offsets.h
+
+# Generate a header file aecm_core_neon_offsets.h which will be included in
+# assembly file aecm_core_neon.S, from file aecm_core_neon_offsets.c.
+$(GEN): $(LOCAL_PATH)/../../../../src/build/generate_asm_header.py \
+    $(intermediates)/aecm_core_neon_offsets.S
+	@python $^ $@ offset_aecm_
+
+$(intermediates)/aecm_core_neon_offsets.S: \
+    $(LOCAL_PATH)/aecm_core_neon_offsets.c
+	@$(TARGET_CC) $(addprefix -I, $(LOCAL_INCLUDES)) $(addprefix -isystem ,\
+	    $(TARGET_C_INCLUDES)) -S -o $@ $^
+
+LOCAL_GENERATED_SOURCES := $(GEN)
+LOCAL_SRC_FILES := aecm_core_neon.S
 
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
@@ -70,6 +84,8 @@ LOCAL_C_INCLUDES := \
     $(LOCAL_PATH)/../../.. \
     $(LOCAL_PATH)/../../../common_audio/signal_processing/include
 
+LOCAL_INCLUDES := $(LOCAL_C_INCLUDES)
+
 ifndef NDK_ROOT
 include external/stlport/libstlport.mk
 endif
diff --git a/src/modules/audio_processing/aecm/aecm_core.c b/src/modules/audio_processing/aecm/aecm_core.c
index f57e434e7..059894870 100644
--- a/src/modules/audio_processing/aecm/aecm_core.c
+++ b/src/modules/audio_processing/aecm/aecm_core.c
@@ -11,6 +11,7 @@
 #include "aecm_core.h"
 
 #include <assert.h>
+#include <stddef.h>
 #include <stdlib.h>
 
 #include "cpu_features_wrapper.h"
@@ -197,6 +198,15 @@ static const WebRtc_Word16 kSinTable[] = {
 static const WebRtc_Word16 kNoiseEstQDomain = 15;
 static const WebRtc_Word16 kNoiseEstIncCount = 5;
 
+// TODO(andrew): put this into general WebRTC so other modules can use it.
+// Define a compile-time assertion.
+#define WEBRTC_STATIC_ASSERT(name, boolean_cond) \
+  static char const static_assert_##name[(boolean_cond) ? 1 : -1] = {'!'}
+
+// Assert a preprocessor definition at compile-time. It's an assumption
+// used in assembly code, so check the assembly files before any change.
+WEBRTC_STATIC_ASSERT(PART_LEN, PART_LEN % 16 == 0);
+
 static void ComfortNoise(AecmCore_t* aecm,
                          const WebRtc_UWord16* dfa,
                          complex16_t* out,
@@ -395,6 +405,18 @@ static void WindowAndFFTC(WebRtc_Word16* fft,
     }
 }
 
+// Initialize function pointers for ARM Neon platform.
+#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
+static void WebRtcAecm_InitNeon(void)
+{
+  WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon;
+  WebRtcAecm_InverseFFTAndWindow = WebRtcAecm_InverseFFTAndWindowNeon;
+  WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon;
+  WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon;
+  WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon;
+}
+#endif
+
 static void InverseFFTAndWindowC(AecmCore_t* aecm,
                                  WebRtc_Word16* fft,
                                  complex16_t* efw,
@@ -673,7 +695,7 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
     uint64_t features = WebRtc_GetCPUFeaturesARM();
     if ((features & kCPUFeatureNEON) != 0)
     {
-          WebRtcAecm_InitNeon();
+        WebRtcAecm_InitNeon();
     }
 #elif defined(WEBRTC_ARCH_ARM_NEON)
     WebRtcAecm_InitNeon();
@@ -1850,7 +1872,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
         {
             hnl[i] = 0;
         }
-        
+
         // Remove outliers
         if (numPosCoef < 3)
         {
diff --git a/src/modules/audio_processing/aecm/aecm_core.h b/src/modules/audio_processing/aecm/aecm_core.h
index 0ec62ec24..8161a8cbd 100644
--- a/src/modules/audio_processing/aecm/aecm_core.h
+++ b/src/modules/audio_processing/aecm/aecm_core.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -10,92 +10,13 @@
 
 // Performs echo control (suppression) with fft routines in fixed-point
 
-#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_MAIN_SOURCE_AECM_CORE_H_
-#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_MAIN_SOURCE_AECM_CORE_H_
-
-#define AECM_DYNAMIC_Q // turn on/off dynamic Q-domain
-//#define AECM_WITH_ABS_APPROX
-//#define AECM_SHORT // for 32 sample partition length (otherwise 64)
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_
 
 #include "typedefs.h"
 #include "signal_processing_library.h"
 
-// Algorithm parameters
-
-#define FRAME_LEN 80 // Total frame length, 10 ms
-#ifdef AECM_SHORT
-
-#define PART_LEN 32 // Length of partition
-#define PART_LEN_SHIFT 6 // Length of (PART_LEN * 2) in base 2
-
-#else
-
-#define PART_LEN 64 // Length of partition
-#define PART_LEN_SHIFT 7 // Length of (PART_LEN * 2) in base 2
-
-#endif
-
-#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
-#define PART_LEN2 (PART_LEN << 1) // Length of partition * 2
-#define PART_LEN4 (PART_LEN << 2) // Length of partition * 4
-#define FAR_BUF_LEN PART_LEN4 // Length of buffers
-#define MAX_DELAY 100
-
-// Counter parameters
-#ifdef AECM_SHORT
-
-#define CONV_LEN 1024 // Convergence length used at startup
-#else
-
-#define CONV_LEN 512 // Convergence length used at startup
-#endif
-
-#define CONV_LEN2 (CONV_LEN << 1) // Convergence length * 2 used at startup
-// Energy parameters
-#define MAX_BUF_LEN 64 // History length of energy signals
-
-#define FAR_ENERGY_MIN 1025 // Lowest Far energy level: At least 2 in energy
-#define FAR_ENERGY_DIFF 929 // Allowed difference between max and min
-
-#define ENERGY_DEV_OFFSET 0 // The energy error offset in Q8
-#define ENERGY_DEV_TOL 400 // The energy estimation tolerance in Q8
-#define FAR_ENERGY_VAD_REGION 230 // Far VAD tolerance region
-// Stepsize parameters
-#define MU_MIN 10 // Min stepsize 2^-MU_MIN (far end energy dependent)
-#define MU_MAX 1 // Max stepsize 2^-MU_MAX (far end energy dependent)
-#define MU_DIFF 9 // MU_MIN - MU_MAX
-// Channel parameters
-#define MIN_MSE_COUNT 20 // Min number of consecutive blocks with enough far end
-                         // energy to compare channel estimates
-#define MIN_MSE_DIFF 29 // The ratio between adapted and stored channel to
-                        // accept a new storage (0.8 in Q-MSE_RESOLUTION)
-#define MSE_RESOLUTION 5 // MSE parameter resolution
-#define RESOLUTION_CHANNEL16 12 // W16 Channel in Q-RESOLUTION_CHANNEL16
-#define RESOLUTION_CHANNEL32 28 // W32 Channel in Q-RESOLUTION_CHANNEL
-#define CHANNEL_VAD 16 // Minimum energy in frequency band to update channel
-// Suppression gain parameters: SUPGAIN_ parameters in Q-(RESOLUTION_SUPGAIN)
-#define RESOLUTION_SUPGAIN 8 // Channel in Q-(RESOLUTION_SUPGAIN)
-#define SUPGAIN_DEFAULT (1 << RESOLUTION_SUPGAIN) // Default suppression gain
-#define SUPGAIN_ERROR_PARAM_A 3072 // Estimation error parameter (Maximum gain) (8 in Q8)
-#define SUPGAIN_ERROR_PARAM_B 1536 // Estimation error parameter (Gain before going down)
-#define SUPGAIN_ERROR_PARAM_D SUPGAIN_DEFAULT // Estimation error parameter
-                                              // (Should be the same as Default) (1 in Q8)
-#define SUPGAIN_EPC_DT 200 // = SUPGAIN_ERROR_PARAM_C * ENERGY_DEV_TOL
-// Defines for "check delay estimation"
-#define CORR_WIDTH 31 // Number of samples to correlate over.
-#define CORR_MAX 16 // Maximum correlation offset
-#define CORR_MAX_BUF 63
-#define CORR_DEV 4
-#define CORR_MAX_LEVEL 20
-#define CORR_MAX_LOW 4
-#define CORR_BUF_LEN (CORR_MAX << 1) + 1
-// Note that CORR_WIDTH + 2*CORR_MAX <= MAX_BUF_LEN
-
-#define ONE_Q14 (1 << 14)
-
-// NLP defines
-#define NLP_COMP_LOW 3277 // 0.2 in Q14
-#define NLP_COMP_HIGH ONE_Q14 // 1 in Q14
+#include "aecm_defines.h"
 
 extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[];
 
@@ -368,8 +289,33 @@ typedef void (*InverseFFTAndWindow)(
     const WebRtc_Word16* nearendClean);
 extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
 
-// Initialization of the above function pointers for ARM Neon.
-void WebRtcAecm_InitNeon(void);
+// For the above function pointers, functions for generic platforms are declared
+// and defined as static in file aecm_core.c, while those for ARM Neon platforms
+// are declared below and defined in file aecm_core_neon.S.
+#if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON)
+void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
+                                 const WebRtc_Word16* time_signal,
+                                 complex16_t* freq_signal,
+                                 int time_signal_scaling);
+void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
+                                        WebRtc_Word16* fft,
+                                        complex16_t* efw,
+                                        WebRtc_Word16* output,
+                                        const WebRtc_Word16* nearendClean);
+
+void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
+                                       const WebRtc_UWord16* far_spectrum,
+                                       WebRtc_Word32* echo_est,
+                                       WebRtc_UWord32* far_energy,
+                                       WebRtc_UWord32* echo_energy_adapt,
+                                       WebRtc_UWord32* echo_energy_stored);
+
+void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
+                                         const WebRtc_UWord16* far_spectrum,
+                                         WebRtc_Word32* echo_est);
+
+void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
+#endif
 
 #endif
 
diff --git a/src/modules/audio_processing/aecm/aecm_core_neon.S b/src/modules/audio_processing/aecm/aecm_core_neon.S
new file mode 100644
index 000000000..0708c5fd9
--- /dev/null
+++ b/src/modules/audio_processing/aecm/aecm_core_neon.S
@@ -0,0 +1,361 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ aecm_core_neon.S
+@ This file contains some functions in AECM, optimized for ARM Neon
+@ platforms. Reference C code is in file aecm_core.c. Bit-exact.
+
+.arch armv7-a
+.fpu neon
+
+#include "aecm_defines.h"
+#include "aecm_core_neon_offsets.h"
+
+.extern WebRtcAecm_kSqrtHanning
+
+.global WebRtcAecm_WindowAndFFTNeon
+.global WebRtcAecm_InverseFFTAndWindowNeon
+.global WebRtcAecm_CalcLinearEnergiesNeon
+.global WebRtcAecm_StoreAdaptiveChannelNeon
+.global WebRtcAecm_ResetAdaptiveChannelNeon
+
+@ void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
+@                                  const WebRtc_Word16* time_signal,
+@                                  complex16_t* freq_signal,
+@                                  int time_signal_scaling);
+.align 2
+WebRtcAecm_WindowAndFFTNeon:
+.fnstart
+.save {r4, r5, lr}
+  push {r4, r5, lr}
+
+  vdup.16 d16, r3
+  mov r5, r2  @ Keep freq_signal in r5; r2 is reused below.
+
+  vmov.i16 d21, #0  @ For imaginary parts of |fft|.
+  vmov.i16 d27, #0  @ For imaginary parts of |fft|.
+  ldr r2, =WebRtcAecm_kSqrtHanning
+  adr lr, kSqrtHanningReversed
+  add r4, r0, #(PART_LEN2 * 2)  @ &fft[PART_LEN2]
+  add r12, r1, #(PART_LEN * 2)  @ &time_signal[PART_LEN]
+  mov r3, #(PART_LEN / 4)  @ Loop counter, unrolled by 4
+
+LOOP_PART_LEN:
+  vld1.16 d0, [r1, :64]!  @ time_signal[i]
+  vld1.16 d22, [r12, :64]!  @ time_signal[i + PART_LEN]
+  vld1.16 d17, [r2, :64]!  @ WebRtcAecm_kSqrtHanning[i]
+  vld1.16 d23, [lr, :64]!  @ kSqrtHanningReversed[i]
+  vshl.s16 d18, d0, d16
+  vshl.s16 d22, d22, d16
+  vmull.s16 q9, d18, d17
+  vmull.s16 q12, d22, d23
+  subs r3, #1
+  vshrn.i32 d20, q9, #14
+  vshrn.i32 d26, q12, #14
+  vst2.16 {d20, d21}, [r0, :128]!  @ fft[j]
+  vst2.16 {d26, d27}, [r4, :128]!  @ fft[PART_LEN2 + j]
+  bgt LOOP_PART_LEN
+
+  sub r4, r0, #(PART_LEN2 * 2)  @ r4 points to fft[0]
+  mov r0, r4
+  mov r1, #7  @ NOTE(review): presumably PART_LEN_SHIFT (7 unless AECM_SHORT); confirm.
+  bl WebRtcSpl_ComplexBitReverse
+
+  mov r0, r4
+  mov r1, #7
+  mov r2, #1
+  bl WebRtcSpl_ComplexFFT
+
+  mov r3, #(PART_LEN * 2 / 16)  @ Loop counter, unrolled by 16.
+
+LOOP_PART_LEN2:
+  @ freq_signal[i].real = fft[j];
+  @ freq_signal[i].imag = - fft[j+1];
+  vld2.16 {d20, d21, d22, d23}, [r4, :256]!
+  subs r3, #1
+  vneg.s16 d22, d22
+  vneg.s16 d23, d23
+  vst2.16 {d20, d21, d22, d23}, [r5, :256]!
+  bgt LOOP_PART_LEN2
+
+  pop {r4, r5, pc}
+.fnend
+
+@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
+@                                         WebRtc_Word16* fft,
+@                                         complex16_t* efw,
+@                                         WebRtc_Word16* output,
+@                                         const WebRtc_Word16* nearendClean);
+.align 2
+WebRtcAecm_InverseFFTAndWindowNeon:
+.fnstart
+.save {r4-r8, lr}
+  push {r4-r8, lr}
+
+  @ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT
+  @ and WebRtcSpl_ComplexBitReverse.
+  mov r4, r1
+  mov r5, r0
+  mov r7, r3
+
+  add r3, r1, #((PART_LEN4 - 6) * 2)  @ &fft[PART_LEN4 - 6]
+  mov r6, #(PART_LEN / 4)  @ Loop counter, unrolled by 4
+  add r12, r2, #(PART_LEN * 4)  @ &efw[PART_LEN]
+  mov r8, #-16
+
+LOOP_PRE_IFFT:
+  vld2.16 {q10}, [r2, :128]!
+  vmov q11, q10
+  vneg.s16 d23, d23
+  vst2.16 {d22, d23}, [r1, :128]!
+  vrev64.16 q10, q10
+  subs r6, #1
+  vst2.16 {q10}, [r3], r8
+  bgt LOOP_PRE_IFFT
+
+  @ fft[PART_LEN2] = efw[PART_LEN].real;
+  @ fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
+  ldr r8, [r12]
+  ssub16 r2, r6, r8
+  mov r1, #(PART_LEN2 * 2)
+  pkhbt r8, r8, r2
+  str r8, [r4, r1]
+
+  mov r0, r4
+  mov r1, #7
+  bl WebRtcSpl_ComplexBitReverse
+
+  mov r0, r4
+  mov r1, #7
+  mov r2, #1
+  bl WebRtcSpl_ComplexIFFT
+
+  mov r1, r4
+  mov r2, r4
+  mov r3, #(PART_LEN * 2 / 8)  @ Loop counter, unrolled by 8.
+
+LOOP_GET_REAL_VALUES:
+  vld2.16 {q10, q11}, [r2, :256]!
+  subs r3, #1
+  vst1.16 {q10}, [r1, :128]!
+  bgt LOOP_GET_REAL_VALUES
+
+  ldr r6, =offset_aecm_outBuf
+  ldr r12, =offset_aecm_dfaCleanQDomain
+  ldr r8, [r5, r6]  @ &aecm->outBuf[0]
+  ldrsh r2, [r5, r12]  @ aecm->dfaCleanQDomain (value, sign-extended)
+
+  adr r12, kSqrtHanningReversed
+  ldr r6, =WebRtcAecm_kSqrtHanning
+  rsb r0, r2, r0  @ outCFFT - aecm->dfaCleanQDomain
+  vdup.32 q9, r0
+  add r0, r4, #(PART_LEN * 2)  @ &fft[PART_LEN]
+  mov r3, #(PART_LEN / 4)  @ Loop counter, unrolled by 4.
+
+LOOP_POST_IFFT:
+  vld1.16 d16, [r4, :64]  @ fft[i];
+  vld1.16 d17, [r6, :64]!  @ WebRtcAecm_kSqrtHanning[i]
+  vld1.16 d20, [r8, :64]  @ aecm->outBuf[i]
+  vmull.s16 q8, d16, d17
+  vmovl.s16 q10, d20
+  vrshr.s32 q8, q8, #14
+  vld1.16 d0, [r0, :64]!  @ fft[PART_LEN + i]
+  vshl.s32 q8, q8, q9
+  vld1.16 d1, [r12, :64]!  @ kSqrtHanningReversed[i]
+  vadd.i32 q8, q10
+  vmull.s16 q0, d0, d1
+  vqshrn.s32 d16, q8, #0
+  vshr.s32 q0, q0, #14
+  vst1.16 d16, [r4, :64]!  @ fft[i];
+  vshl.s32 q0, q0, q9
+  vst1.16 d16, [r7, :64]!  @ output[i]
+  vqshrn.s32 d0, q0, #0
+  subs r3, #1
+  vst1.16 d0, [r8, :64]!  @ aecm->outBuf[i]
+  bgt LOOP_POST_IFFT
+
+  ldr r3, =offset_aecm_xBuf
+  ldr r12, =offset_aecm_dBufNoisy
+  ldr r3, [r5, r3]  @ &aecm->xBuf[0]
+  ldr r1, [r5, r12]  @ &aecm->dBufNoisy[0]
+  add r2, r3, #(PART_LEN * 2)  @ &aecm->xBuf[PART_LEN]
+  add r0, r1, #(PART_LEN * 2)  @ &aecm->dBufNoisy[PART_LEN]
+  mov r4, #(PART_LEN / 16)  @ Loop counter, unrolled by 16.
+
+LOOP_COPY:
+  vld1.16 {q10, q11}, [r2, :256]!
+  vld1.16 {q12, q13}, [r0, :256]!
+  subs r4, #1
+  vst1.16 {q10, q11}, [r3, :256]!
+  vst1.16 {q12, q13}, [r1, :256]!
+  bgt LOOP_COPY
+
+  ldr r2, [sp, #24]
+  cmp r2, #0  @ Check if (nearendClean != NULL).
+  beq END
+
+  ldr r4, =offset_aecm_dBufClean
+  ldr r1, [r5, r4]  @ &aecm->dBufClean[0]
+  add r0, r1, #(PART_LEN * 2)  @ &aecm->dBufClean[PART_LEN]
+
+  vld1.16 {q10, q11}, [r0, :256]!
+  vld1.16 {q12, q13}, [r0, :256]!
+  vst1.16 {q10, q11}, [r1, :256]!
+  vst1.16 {q12, q13}, [r1, :256]!
+  vld1.16 {q10, q11}, [r0, :256]!
+  vld1.16 {q12, q13}, [r0, :256]!
+  vst1.16 {q10, q11}, [r1, :256]!
+  vst1.16 {q12, q13}, [r1, :256]!
+
+END:
+  pop {r4-r8, pc}
+.fnend
+
+@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
+@                                        const WebRtc_UWord16* far_spectrum,
+@                                        WebRtc_Word32* echo_est,
+@                                        WebRtc_UWord32* far_energy,
+@                                        WebRtc_UWord32* echo_energy_adapt,
+@                                        WebRtc_UWord32* echo_energy_stored);
+.align 2
+WebRtcAecm_CalcLinearEnergiesNeon:
+.fnstart
+.save {r4-r7}
+  push {r4-r7}
+
+  vmov.i32 q14, #0
+  vmov.i32 q8, #0
+  vmov.i32 q9, #0
+
+  ldr r7, =offset_aecm_channelStored
+  ldr r5, =offset_aecm_channelAdapt16
+
+  mov r4, r2
+  mov r12, #(PART_LEN / 8)  @ Loop counter, unrolled by 8.
+  ldr r6, [r0, r7]
+  ldr r7, [r0, r5]
+
+LOOP_CALC_LINEAR_ENERGIES:
+  vld1.16 {d26, d27}, [r1]!  @ far_spectrum[i]
+  vld1.16 {d24, d25}, [r6, :128]!  @ &aecm->channelStored[i]
+  vld1.16 {d0, d1}, [r7, :128]!  @ &aecm->channelAdapt16[i]
+  vaddw.u16 q14, q14, d26
+  vmull.u16 q10, d26, d24
+  vmull.u16 q11, d27, d25
+  vaddw.u16 q14, q14, d27
+  vmull.u16 q1, d26, d0
+  vst1.32 {q10, q11}, [r4, :256]!  @ &echo_est[i]
+  vadd.u32 q8, q10
+  vmull.u16 q2, d27, d1
+  vadd.u32 q8, q11
+  vadd.u32 q9, q1
+  subs r12, #1
+  vadd.u32 q9, q2
+  bgt LOOP_CALC_LINEAR_ENERGIES
+
+  vadd.u32 d28, d29
+  vpadd.u32 d28, d28
+  vmov.32 r12, d28[0]
+  vadd.u32 d18, d19
+  vpadd.u32 d18, d18
+  vmov.32 r5, d18[0]  @ echo_energy_adapt_r
+  vadd.u32 d16, d17
+  vpadd.u32 d16, d16
+
+  ldrh r1, [r1]  @ far_spectrum[PART_LEN], the element left after the loop
+  add r12, r12, r1
+  str r12, [r3]  @ far_energy
+  vmov.32 r2, d16[0]
+
+  ldrsh r12, [r6]  @ aecm->channelStored[PART_LEN]
+  ldrh r6, [r7]  @ aecm->channelAdapt16[PART_LEN]
+  mul r0, r12, r1
+  mla r1, r6, r1, r5
+  add r2, r2, r0
+  str r0, [r4]  @ echo_est[PART_LEN]
+  ldr r4, [sp, #20]  @ &echo_energy_stored
+  str r2, [r4]
+  ldr r3, [sp, #16]  @ &echo_energy_adapt
+  str r1, [r3]
+
+  pop {r4-r7}
+  bx lr
+.fnend
+
+@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
+@                                          const uint16_t* far_spectrum,
+@                                          int32_t* echo_est);
+.align 2
+WebRtcAecm_StoreAdaptiveChannelNeon:
+.fnstart
+  ldr r3, =offset_aecm_channelAdapt16
+  ldr r12, =offset_aecm_channelStored
+  ldr r3, [r0, r3]
+  ldr r0, [r0, r12]
+  mov r12, #(PART_LEN / 8)  @ Loop counter, unrolled by 8.
+
+LOOP_STORE_ADAPTIVE_CHANNEL:
+  vld1.16 {d24, d25}, [r3, :128]!  @ &aecm->channelAdapt16[i]
+  vld1.16 {d26, d27}, [r1]!  @ &far_spectrum[i]
+  vst1.16 {d24, d25}, [r0, :128]!  @ &aecm->channelStored[i]
+  vmull.u16 q10, d26, d24
+  vmull.u16 q11, d27, d25
+  vst1.16 {q10, q11}, [r2, :256]!  @ echo_est[i]
+  subs r12, #1
+  bgt LOOP_STORE_ADAPTIVE_CHANNEL
+
+  @ Element PART_LEN, left over after the unrolled loop.
+  ldrsh r12, [r3]
+  strh r12, [r0]
+  ldrh r1, [r1]
+  mul r3, r1, r12
+  str r3, [r2]
+
+  bx lr
+.fnend
+
+@ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
+.align 2
+WebRtcAecm_ResetAdaptiveChannelNeon:
+.fnstart
+  ldr r1, =offset_aecm_channelAdapt16
+  ldr r2, =offset_aecm_channelAdapt32
+  movw r3, #offset_aecm_channelStored
+  ldr r1, [r0, r1]  @ &aecm->channelAdapt16[0]
+  ldr r2, [r0, r2]  @ &aecm->channelAdapt32[0]
+  ldr r0, [r0, r3]  @ &aecm->channelStored[0]
+  mov r3, #(PART_LEN / 8)  @ Loop counter, unrolled by 8.
+
+LOOP_RESET_ADAPTIVE_CHANNEL:
+  vld1.16 {d24, d25}, [r0, :128]!
+  subs r3, #1
+  vst1.16 {d24, d25}, [r1, :128]!
+  vshll.s16 q10, d24, #16
+  vshll.s16 q11, d25, #16
+  vst1.16 {q10, q11}, [r2, :256]!
+  bgt LOOP_RESET_ADAPTIVE_CHANNEL
+
+  @ Element PART_LEN, left over after the unrolled loop.
+  ldrh r0, [r0]
+  strh r0, [r1]
+  mov r0, r0, asl #16
+  str r0, [r2]
+
+  bx lr
+.fnend
+
+  @ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning,
+  @ the order was reversed and one useless element (0) was removed.
+.align 3
+kSqrtHanningReversed:
+  .hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
+  .hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
+  .hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
+  .hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
+  .hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
+  .hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399
diff --git a/src/modules/audio_processing/aecm/aecm_core_neon.c b/src/modules/audio_processing/aecm/aecm_core_neon.c
index 169201d9e..c06a678f5 100644
--- a/src/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/src/modules/audio_processing/aecm/aecm_core_neon.c
@@ -34,10 +34,10 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) =
   1594, 1196, 798, 399
 };
 
-static void WindowAndFFTNeon(WebRtc_Word16* fft,
-                             const WebRtc_Word16* time_signal,
-                             complex16_t* freq_signal,
-                             int time_signal_scaling) {
+void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
+                                 const WebRtc_Word16* time_signal,
+                                 complex16_t* freq_signal,
+                                 int time_signal_scaling) {
   int i, j;
 
   int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
@@ -86,11 +86,11 @@ static void WindowAndFFTNeon(WebRtc_Word16* fft,
   }
 }
 
-static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
-                                    WebRtc_Word16* fft,
-                                    complex16_t* efw,
-                                    WebRtc_Word16* output,
-                                    const WebRtc_Word16* nearendClean) {
+void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
+                                        WebRtc_Word16* fft,
+                                        complex16_t* efw,
+                                        WebRtc_Word16* output,
+                                        const WebRtc_Word16* nearendClean) {
   int i, j, outCFFT;
 
   // Synthesis
@@ -186,12 +186,12 @@ static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
   }
 }
 
-static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
-                                   const WebRtc_UWord16* far_spectrum,
-                                   WebRtc_Word32* echo_est,
-                                   WebRtc_UWord32* far_energy,
-                                   WebRtc_UWord32* echo_energy_adapt,
-                                   WebRtc_UWord32* echo_energy_stored) {
+void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
+                                       const WebRtc_UWord16* far_spectrum,
+                                       WebRtc_Word32* echo_est,
+                                       WebRtc_UWord32* far_energy,
+                                       WebRtc_UWord32* echo_energy_adapt,
+                                       WebRtc_UWord32* echo_energy_stored) {
   int i;
 
   register WebRtc_UWord32 far_energy_r;
@@ -249,9 +249,9 @@ static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
       aecm->channelAdapt16[i], far_spectrum[i]);
 }
 
-static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
-                                     const WebRtc_UWord16* far_spectrum,
-                                     WebRtc_Word32* echo_est) {
+void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
+                                         const WebRtc_UWord16* far_spectrum,
+                                         WebRtc_Word32* echo_est) {
   int i;
 
   // During startup we store the channel every block.
@@ -271,7 +271,7 @@ static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
 }
 
-static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
+void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
   int i;
 
   for (i = 0; i < PART_LEN - 7; i += 8) {
@@ -292,10 +292,3 @@ static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
       (WebRtc_Word32)aecm->channelStored[i], 16);
 }
 
-void WebRtcAecm_InitNeon(void) {
-  WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
-  WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
-  WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
-  WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
-  WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
-}
diff --git a/src/modules/audio_processing/aecm/aecm_core_neon_offsets.c b/src/modules/audio_processing/aecm/aecm_core_neon_offsets.c
new file mode 100644
index 000000000..b61497787
--- /dev/null
+++ b/src/modules/audio_processing/aecm/aecm_core_neon_offsets.c
@@ -0,0 +1,26 @@
+
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "aecm_core.h"
+
+#include <stddef.h>
+
+// Define offset variables that will be compiled and abstracted to constant
+// defines, which will then only be used in ARM assembly code.
+int offset_aecm_dfaCleanQDomain = offsetof(AecmCore_t, dfaCleanQDomain);
+int offset_aecm_outBuf = offsetof(AecmCore_t, outBuf);
+int offset_aecm_xBuf = offsetof(AecmCore_t, xBuf);
+int offset_aecm_dBufNoisy = offsetof(AecmCore_t, dBufNoisy);
+int offset_aecm_dBufClean = offsetof(AecmCore_t, dBufClean);
+int offset_aecm_channelStored = offsetof(AecmCore_t, channelStored);
+int offset_aecm_channelAdapt16 = offsetof(AecmCore_t, channelAdapt16);
+int offset_aecm_channelAdapt32 = offsetof(AecmCore_t, channelAdapt32);
+
diff --git a/src/modules/audio_processing/aecm/aecm_defines.h b/src/modules/audio_processing/aecm/aecm_defines.h
new file mode 100644
index 000000000..437cbf255
--- /dev/null
+++ b/src/modules/audio_processing/aecm/aecm_defines.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_DEFINES_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_DEFINES_H_
+
+#define AECM_DYNAMIC_Q                 /* Turn on/off dynamic Q-domain. */
+
+/* #define AECM_SHORT                For 32 sample partition length. */
+
+/* Algorithm parameters */
+#define FRAME_LEN       80             /* Total frame length, 10 ms. */
+
+#ifdef AECM_SHORT
+#define PART_LEN        32             /* Length of partition. */
+#define PART_LEN_SHIFT  6              /* Length of (PART_LEN * 2) in base 2. */
+#else
+#define PART_LEN        64             /* Length of partition. */
+#define PART_LEN_SHIFT  7              /* Length of (PART_LEN * 2) in base 2. */
+#endif
+
+#define PART_LEN1       (PART_LEN + 1)  /* Unique fft coefficients. */
+#define PART_LEN2       (PART_LEN << 1) /* Length of partition * 2. */
+#define PART_LEN4       (PART_LEN << 2) /* Length of partition * 4. */
+#define FAR_BUF_LEN     PART_LEN4       /* Length of buffers. */
+#define MAX_DELAY       100
+
+/* Counter parameters */
+#ifdef AECM_SHORT
+#define CONV_LEN        1024         /* Convergence length used at startup. */
+#else
+#define CONV_LEN        512          /* Convergence length used at startup. */
+#endif
+#define CONV_LEN2       (CONV_LEN << 1) /* Used at startup. */
+
+/* Energy parameters */
+#define MAX_BUF_LEN     64           /* History length of energy signals. */
+#define FAR_ENERGY_MIN  1025         /* Lowest Far energy level: At least 2 */
+                                     /* in energy. */
+#define FAR_ENERGY_DIFF 929          /* Allowed difference between max */
+                                     /* and min. */
+#define ENERGY_DEV_OFFSET       0    /* The energy error offset in Q8. */
+#define ENERGY_DEV_TOL  400          /* The energy estimation tolerance (Q8). */
+#define FAR_ENERGY_VAD_REGION   230  /* Far VAD tolerance region. */
+
+/* Stepsize parameters */
+#define MU_MIN          10          /* Min stepsize 2^-MU_MIN (far end energy */
+                                    /* dependent). */
+#define MU_MAX          1           /* Max stepsize 2^-MU_MAX (far end energy */
+                                    /* dependent). */
+#define MU_DIFF         9           /* MU_MIN - MU_MAX */
+
+/* Channel parameters */
+#define MIN_MSE_COUNT   20 /* Min number of consecutive blocks with enough */
+                           /* far end energy to compare channel estimates. */
+#define MIN_MSE_DIFF    29 /* The ratio between adapted and stored channel to */
+                           /* accept a new storage (0.8 in Q-MSE_RESOLUTION). */
+#define MSE_RESOLUTION  5           /* MSE parameter resolution. */
+#define RESOLUTION_CHANNEL16    12  /* W16 Channel in Q-RESOLUTION_CHANNEL16. */
+#define RESOLUTION_CHANNEL32    28  /* W32 Channel in Q-RESOLUTION_CHANNEL. */
+#define CHANNEL_VAD     16          /* Minimum energy in frequency band */
+                                    /* to update channel. */
+
+/* Suppression gain parameters: SUPGAIN parameters in Q-(RESOLUTION_SUPGAIN). */
+#define RESOLUTION_SUPGAIN      8     /* Channel in Q-(RESOLUTION_SUPGAIN). */
+#define SUPGAIN_DEFAULT (1 << RESOLUTION_SUPGAIN)  /* Default. */
+#define SUPGAIN_ERROR_PARAM_A   3072  /* Estimation error parameter */
+                                      /* (Maximum gain) (8 in Q8). */
+#define SUPGAIN_ERROR_PARAM_B   1536  /* Estimation error parameter */
+                                      /* (Gain before going down). */
+#define SUPGAIN_ERROR_PARAM_D   SUPGAIN_DEFAULT /* Estimation error parameter */
+                                /* (Should be the same as Default) (1 in Q8). */
+#define SUPGAIN_EPC_DT  200     /* SUPGAIN_ERROR_PARAM_C * ENERGY_DEV_TOL */
+
+/* Defines for "check delay estimation" */
+#define CORR_WIDTH      31      /* Number of samples to correlate over. */
+#define CORR_MAX        16      /* Maximum correlation offset. */
+#define CORR_MAX_BUF    63
+#define CORR_DEV        4
+#define CORR_MAX_LEVEL  20
+#define CORR_MAX_LOW    4
+#define CORR_BUF_LEN    ((CORR_MAX << 1) + 1)
+/* Note that CORR_WIDTH + 2*CORR_MAX <= MAX_BUF_LEN. */
+
+#define ONE_Q14         (1 << 14)
+
+/* NLP defines */
+#define NLP_COMP_LOW    3277    /* 0.2 in Q14 */
+#define NLP_COMP_HIGH   ONE_Q14 /* 1 in Q14 */
+
+#endif