Refactored Neon code for AECM module, by using pure assembly code.
Bit exact. Review URL: https://webrtc-codereview.appspot.com/447008 git-svn-id: http://webrtc.googlecode.com/svn/trunk@2382 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
38506ef4d3
commit
f85b35a2f4
@ -56,7 +56,21 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
|
||||
LOCAL_MODULE := libwebrtc_aecm_neon
|
||||
LOCAL_MODULE_TAGS := optional
|
||||
|
||||
LOCAL_SRC_FILES := aecm_core_neon.c
|
||||
GEN := $(LOCAL_PATH)/aecm_core_neon_offsets.h
|
||||
|
||||
# Generate a header file aecm_core_neon_offsets.h which will be included in
|
||||
# assembly file aecm_core_neon.S, from file aecm_core_neon_offsets.c.
|
||||
$(GEN): $(LOCAL_PATH)/../../../../src/build/generate_asm_header.py \
|
||||
$(intermediates)/aecm_core_neon_offsets.S
|
||||
@python $^ $@ offset_aecm_
|
||||
|
||||
$(intermediates)/aecm_core_neon_offsets.S: \
|
||||
$(LOCAL_PATH)/aecm_core_neon_offsets.c
|
||||
@$(TARGET_CC) $(addprefix -I, $(LOCAL_INCLUDES)) $(addprefix -isystem ,\
|
||||
$(TARGET_C_INCLUDES)) -S -o $@ $^
|
||||
|
||||
LOCAL_GENERATED_SOURCES := $(GEN)
|
||||
LOCAL_SRC_FILES := aecm_core_neon.S
|
||||
|
||||
# Flags passed to both C and C++ files.
|
||||
LOCAL_CFLAGS := \
|
||||
@ -70,6 +84,8 @@ LOCAL_C_INCLUDES := \
|
||||
$(LOCAL_PATH)/../../.. \
|
||||
$(LOCAL_PATH)/../../../common_audio/signal_processing/include
|
||||
|
||||
LOCAL_INCLUDES := $(LOCAL_C_INCLUDES)
|
||||
|
||||
ifndef NDK_ROOT
|
||||
include external/stlport/libstlport.mk
|
||||
endif
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "aecm_core.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "cpu_features_wrapper.h"
|
||||
@ -197,6 +198,15 @@ static const WebRtc_Word16 kSinTable[] = {
|
||||
static const WebRtc_Word16 kNoiseEstQDomain = 15;
|
||||
static const WebRtc_Word16 kNoiseEstIncCount = 5;
|
||||
|
||||
// TODO(andrew): put this into general WebRTC so other modules can use it.
|
||||
// Define a compile-time assertion.
|
||||
// Fails to compile (negative array size) when |boolean_cond| is false.
// |name| only makes the generated identifier unique. A typedef is used
// instead of a static array object so that nothing is emitted into the
// binary and no unused-variable warning is triggered.
#define WEBRTC_STATIC_ASSERT(name, boolean_cond) \
    typedef char static_assert_##name[(boolean_cond) ? 1 : -1]
|
||||
|
||||
// Assert a preprocessor definition at compile-time. It's an assumption
|
||||
// used in assembly code, so check the assembly files before any change.
|
||||
WEBRTC_STATIC_ASSERT(PART_LEN, PART_LEN % 16 == 0);
|
||||
|
||||
static void ComfortNoise(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* dfa,
|
||||
complex16_t* out,
|
||||
@ -395,6 +405,18 @@ static void WindowAndFFTC(WebRtc_Word16* fft,
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize function pointers for ARM Neon platform.
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
|
||||
static void WebRtcAecm_InitNeon(void)
|
||||
{
|
||||
WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon;
|
||||
WebRtcAecm_InverseFFTAndWindow = WebRtcAecm_InverseFFTAndWindowNeon;
|
||||
WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon;
|
||||
WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon;
|
||||
WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void InverseFFTAndWindowC(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
@ -673,7 +695,7 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
|
||||
uint64_t features = WebRtc_GetCPUFeaturesARM();
|
||||
if ((features & kCPUFeatureNEON) != 0)
|
||||
{
|
||||
WebRtcAecm_InitNeon();
|
||||
WebRtcAecm_InitNeon();
|
||||
}
|
||||
#elif defined(WEBRTC_ARCH_ARM_NEON)
|
||||
WebRtcAecm_InitNeon();
|
||||
@ -1850,7 +1872,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
|
||||
{
|
||||
hnl[i] = 0;
|
||||
}
|
||||
|
||||
|
||||
// Remove outliers
|
||||
if (numPosCoef < 3)
|
||||
{
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
@ -10,92 +10,13 @@
|
||||
|
||||
// Performs echo control (suppression) with fft routines in fixed-point
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_MAIN_SOURCE_AECM_CORE_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_MAIN_SOURCE_AECM_CORE_H_
|
||||
|
||||
#define AECM_DYNAMIC_Q // turn on/off dynamic Q-domain
|
||||
//#define AECM_WITH_ABS_APPROX
|
||||
//#define AECM_SHORT // for 32 sample partition length (otherwise 64)
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_
|
||||
|
||||
#include "typedefs.h"
|
||||
#include "signal_processing_library.h"
|
||||
|
||||
// Algorithm parameters
|
||||
|
||||
#define FRAME_LEN 80 // Total frame length, 10 ms
|
||||
#ifdef AECM_SHORT
|
||||
|
||||
#define PART_LEN 32 // Length of partition
|
||||
#define PART_LEN_SHIFT 6 // Length of (PART_LEN * 2) in base 2
|
||||
|
||||
#else
|
||||
|
||||
#define PART_LEN 64 // Length of partition
|
||||
#define PART_LEN_SHIFT 7 // Length of (PART_LEN * 2) in base 2
|
||||
|
||||
#endif
|
||||
|
||||
#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
|
||||
#define PART_LEN2 (PART_LEN << 1) // Length of partition * 2
|
||||
#define PART_LEN4 (PART_LEN << 2) // Length of partition * 4
|
||||
#define FAR_BUF_LEN PART_LEN4 // Length of buffers
|
||||
#define MAX_DELAY 100
|
||||
|
||||
// Counter parameters
|
||||
#ifdef AECM_SHORT
|
||||
|
||||
#define CONV_LEN 1024 // Convergence length used at startup
|
||||
#else
|
||||
|
||||
#define CONV_LEN 512 // Convergence length used at startup
|
||||
#endif
|
||||
|
||||
#define CONV_LEN2 (CONV_LEN << 1) // Convergence length * 2 used at startup
|
||||
// Energy parameters
|
||||
#define MAX_BUF_LEN 64 // History length of energy signals
|
||||
|
||||
#define FAR_ENERGY_MIN 1025 // Lowest Far energy level: At least 2 in energy
|
||||
#define FAR_ENERGY_DIFF 929 // Allowed difference between max and min
|
||||
|
||||
#define ENERGY_DEV_OFFSET 0 // The energy error offset in Q8
|
||||
#define ENERGY_DEV_TOL 400 // The energy estimation tolerance in Q8
|
||||
#define FAR_ENERGY_VAD_REGION 230 // Far VAD tolerance region
|
||||
// Stepsize parameters
|
||||
#define MU_MIN 10 // Min stepsize 2^-MU_MIN (far end energy dependent)
|
||||
#define MU_MAX 1 // Max stepsize 2^-MU_MAX (far end energy dependent)
|
||||
#define MU_DIFF 9 // MU_MIN - MU_MAX
|
||||
// Channel parameters
|
||||
#define MIN_MSE_COUNT 20 // Min number of consecutive blocks with enough far end
|
||||
// energy to compare channel estimates
|
||||
#define MIN_MSE_DIFF 29 // The ratio between adapted and stored channel to
|
||||
// accept a new storage (0.8 in Q-MSE_RESOLUTION)
|
||||
#define MSE_RESOLUTION 5 // MSE parameter resolution
|
||||
#define RESOLUTION_CHANNEL16 12 // W16 Channel in Q-RESOLUTION_CHANNEL16
|
||||
#define RESOLUTION_CHANNEL32 28 // W32 Channel in Q-RESOLUTION_CHANNEL
|
||||
#define CHANNEL_VAD 16 // Minimum energy in frequency band to update channel
|
||||
// Suppression gain parameters: SUPGAIN_ parameters in Q-(RESOLUTION_SUPGAIN)
|
||||
#define RESOLUTION_SUPGAIN 8 // Channel in Q-(RESOLUTION_SUPGAIN)
|
||||
#define SUPGAIN_DEFAULT (1 << RESOLUTION_SUPGAIN) // Default suppression gain
|
||||
#define SUPGAIN_ERROR_PARAM_A 3072 // Estimation error parameter (Maximum gain) (8 in Q8)
|
||||
#define SUPGAIN_ERROR_PARAM_B 1536 // Estimation error parameter (Gain before going down)
|
||||
#define SUPGAIN_ERROR_PARAM_D SUPGAIN_DEFAULT // Estimation error parameter
|
||||
// (Should be the same as Default) (1 in Q8)
|
||||
#define SUPGAIN_EPC_DT 200 // = SUPGAIN_ERROR_PARAM_C * ENERGY_DEV_TOL
|
||||
// Defines for "check delay estimation"
|
||||
#define CORR_WIDTH 31 // Number of samples to correlate over.
|
||||
#define CORR_MAX 16 // Maximum correlation offset
|
||||
#define CORR_MAX_BUF 63
|
||||
#define CORR_DEV 4
|
||||
#define CORR_MAX_LEVEL 20
|
||||
#define CORR_MAX_LOW 4
|
||||
#define CORR_BUF_LEN (CORR_MAX << 1) + 1
|
||||
// Note that CORR_WIDTH + 2*CORR_MAX <= MAX_BUF_LEN
|
||||
|
||||
#define ONE_Q14 (1 << 14)
|
||||
|
||||
// NLP defines
|
||||
#define NLP_COMP_LOW 3277 // 0.2 in Q14
|
||||
#define NLP_COMP_HIGH ONE_Q14 // 1 in Q14
|
||||
#include "aecm_defines.h"
|
||||
|
||||
extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[];
|
||||
|
||||
@ -368,8 +289,33 @@ typedef void (*InverseFFTAndWindow)(
|
||||
const WebRtc_Word16* nearendClean);
|
||||
extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
|
||||
|
||||
// Initialization of the above function pointers for ARM Neon.
|
||||
void WebRtcAecm_InitNeon(void);
|
||||
// For the above function pointers, functions for generic platforms are declared
|
||||
// and defined as static in file aecm_core.c, while those for ARM Neon platforms
|
||||
// are declared below and defined in file aecm_core_neon.S.
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON)
|
||||
void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
complex16_t* freq_signal,
|
||||
int time_signal_scaling);
|
||||
|
||||
void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean);
|
||||
|
||||
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored);
|
||||
|
||||
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est);
|
||||
|
||||
void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
361
src/modules/audio_processing/aecm/aecm_core_neon.S
Normal file
361
src/modules/audio_processing/aecm/aecm_core_neon.S
Normal file
@ -0,0 +1,361 @@
|
||||
@
|
||||
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
@
|
||||
@ Use of this source code is governed by a BSD-style license
|
||||
@ that can be found in the LICENSE file in the root of the source
|
||||
@ tree. An additional intellectual property rights grant can be found
|
||||
@ in the file PATENTS. All contributing project authors may
|
||||
@ be found in the AUTHORS file in the root of the source tree.
|
||||
@
|
||||
|
||||
@ aecm_core_neon.S
|
||||
@ This file contains some functions in AECM, optimized for ARM Neon
|
||||
@ platforms. Reference C code is in file aecm_core.c. Bit-exact.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
#include "aecm_defines.h"
|
||||
#include "aecm_core_neon_offsets.h"
|
||||
|
||||
.extern WebRtcAecm_kSqrtHanning
|
||||
|
||||
.global WebRtcAecm_WindowAndFFTNeon
|
||||
.global WebRtcAecm_InverseFFTAndWindowNeon
|
||||
.global WebRtcAecm_CalcLinearEnergiesNeon
|
||||
.global WebRtcAecm_StoreAdaptiveChannelNeon
|
||||
.global WebRtcAecm_ResetAdaptiveChannelNeon
|
||||
|
||||
@ void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
|
||||
@ const WebRtc_Word16* time_signal,
|
||||
@ complex16_t* freq_signal,
|
||||
@ int time_signal_scaling);
|
||||
.align 2
@ Register use on entry: r0 = fft, r1 = time_signal, r2 = freq_signal,
@ r3 = time_signal_scaling.
@ Shifts and windows |time_signal| into |fft| (imaginary parts set to zero),
@ calls WebRtcSpl_ComplexBitReverse and WebRtcSpl_ComplexFFT on |fft|, then
@ copies the result to |freq_signal| with the imaginary parts negated.
WebRtcAecm_WindowAndFFTNeon:
.fnstart
  @ An even number of registers is pushed (r6 is padding only) so that sp
  @ stays 8-byte aligned at the two calls below, as the AAPCS requires at
  @ public interfaces.
  .save {r4, r5, r6, lr}
  push {r4, r5, r6, lr}

  vdup.16 d16, r3
  mov r5, r2                          @ Keep freq_signal: r2 is reused below.

  vmov.i16 d21, #0                    @ For imaginary parts of |fft|.
  vmov.i16 d27, #0                    @ For imaginary parts of |fft|.
  ldr r2, =WebRtcAecm_kSqrtHanning
  adr lr, kSqrtHanningReversed
  add r4, r0, #(PART_LEN2 * 2)        @ &fft[PART_LEN2]
  add r12, r1, #(PART_LEN * 2)        @ &time_signal[PART_LEN]
  mov r3, #(PART_LEN / 4)             @ Loop counter, unrolled by 4

LOOP_PART_LEN:
  vld1.16 d0, [r1, :64]!              @ time_signal[i]
  vld1.16 d22, [r12, :64]!            @ time_signal[i + PART_LEN]
  vld1.16 d17, [r2, :64]!             @ WebRtcAecm_kSqrtHanning[i]
  vld1.16 d23, [lr, :64]!             @ kSqrtHanningReversed[i]
  vshl.s16 d18, d0, d16
  vshl.s16 d22, d22, d16
  vmull.s16 q9, d18, d17
  vmull.s16 q12, d22, d23
  subs r3, #1
  vshrn.i32 d20, q9, #14
  vshrn.i32 d26, q12, #14
  vst2.16 {d20, d21}, [r0, :128]!     @ fft[j]
  vst2.16 {d26, d27}, [r4, :128]!     @ fft[PART_LEN2 + j]
  bgt LOOP_PART_LEN

  sub r4, r0, #(PART_LEN2 * 2)        @ r4 points to fft[0]
  mov r0, r4
  mov r1, #7
  bl WebRtcSpl_ComplexBitReverse

  mov r0, r4
  mov r1, #7
  mov r2, #1
  bl WebRtcSpl_ComplexFFT

  mov r3, #(PART_LEN * 2 / 16)        @ Loop counter, unrolled by 16.

LOOP_PART_LEN2:
  @ freq_signal[i].real = fft[j];
  @ freq_signal[i].imag = - fft[j+1];
  vld2.16 {d20, d21, d22, d23}, [r4, :256]!
  subs r3, #1
  vneg.s16 d22, d22
  vneg.s16 d23, d23
  vst2.16 {d20, d21, d22, d23}, [r5, :256]!
  bgt LOOP_PART_LEN2

  pop {r4, r5, r6, pc}
.fnend
|
||||
|
||||
@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
@ WebRtc_Word16* fft,
|
||||
@ complex16_t* efw,
|
||||
@ WebRtc_Word16* output,
|
||||
@ const WebRtc_Word16* nearendClean);
|
||||
.align 2
|
||||
WebRtcAecm_InverseFFTAndWindowNeon:
|
||||
.fnstart
|
||||
.save {r4-r8, lr}
|
||||
push {r4-r8, lr}
|
||||
|
||||
@ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT
|
||||
@ and WebRtcSpl_ComplexBitReverse.
|
||||
mov r4, r1
|
||||
mov r5, r0
|
||||
mov r7, r3
|
||||
|
||||
add r3, r1, #((PART_LEN4 - 6) * 2) @ &fft[PART_LEN4 - 6]
|
||||
mov r6, #(PART_LEN / 4) @ Loop counter, unrolled by 4
|
||||
add r12, r2, #(PART_LEN * 4) @ &efw[PART_LEN]
|
||||
mov r8, #-16
|
||||
|
||||
LOOP_PRE_IFFT:
|
||||
vld2.16 {q10}, [r2, :128]!
|
||||
vmov q11, q10
|
||||
vneg.s16 d23, d23
|
||||
vst2.16 {d22, d23}, [r1, :128]!
|
||||
vrev64.16 q10, q10
|
||||
subs r6, #1
|
||||
vst2.16 {q10}, [r3], r8
|
||||
bgt LOOP_PRE_IFFT
|
||||
|
||||
@ fft[PART_LEN2] = efw[PART_LEN].real;
|
||||
@ fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
|
||||
ldr r8, [r12]
|
||||
ssub16 r2, r6, r8
|
||||
mov r1, #(PART_LEN2 * 2)
|
||||
pkhbt r8, r8, r2
|
||||
str r8, [r4, r1]
|
||||
|
||||
mov r0, r4
|
||||
mov r1, #7
|
||||
bl WebRtcSpl_ComplexBitReverse
|
||||
|
||||
mov r0, r4
|
||||
mov r1, #7
|
||||
mov r2, #1
|
||||
bl WebRtcSpl_ComplexIFFT
|
||||
|
||||
mov r1, r4
|
||||
mov r2, r4
|
||||
mov r3, #(PART_LEN * 2 / 8) @ Loop counter, unrolled by 8.
|
||||
|
||||
LOOP_GET_REAL_VALUES:
|
||||
vld2.16 {q10, q11}, [r2, :256]!
|
||||
subs r3, #1
|
||||
vst1.16 {q10}, [r1, :128]!
|
||||
bgt LOOP_GET_REAL_VALUES
|
||||
|
||||
ldr r6, =offset_aecm_outBuf
|
||||
ldr r12, =offset_aecm_dfaCleanQDomain
|
||||
ldr r8, [r5, r6] @ &aecm->outBuf[0]
|
||||
ldrsh r2, [r5, r12] @ aecm->dfaCleanQDomain (16-bit value, sign-extended)
|
||||
|
||||
adr r12, kSqrtHanningReversed
|
||||
ldr r6, =WebRtcAecm_kSqrtHanning
|
||||
rsb r0, r2, r0 @ outCFFT - aecm->dfaCleanQDomain
|
||||
vdup.32 q9, r0
|
||||
add r0, r4, #(PART_LEN * 2) @ &fft[PART_LEN]
|
||||
mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4.
|
||||
|
||||
LOOP_POST_IFFT:
|
||||
vld1.16 d16, [r4, :64] @ fft[i];
|
||||
vld1.16 d17, [r6, :64]! @ WebRtcAecm_kSqrtHanning[i]
|
||||
vld1.16 d20, [r8, :64] @ aecm->outBuf[i]
|
||||
vmull.s16 q8, d16, d17
|
||||
vmovl.s16 q10, d20
|
||||
vrshr.s32 q8, q8, #14
|
||||
vld1.16 d0, [r0, :64]! @ &fft[PART_LEN + i]
|
||||
vshl.s32 q8, q8, q9
|
||||
vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i]
|
||||
vadd.i32 q8, q10
|
||||
vmull.s16 q0, d0, d1
|
||||
vqshrn.s32 d16, q8, #0
|
||||
vshr.s32 q0, q0, #14
|
||||
vst1.16 d16, [r4, :64]! @ fft[i];
|
||||
vshl.s32 q0, q0, q9
|
||||
vst1.16 d16, [r7, :64]! @ output[i]
|
||||
vqshrn.s32 d0, q0, #0
|
||||
subs r3, #1
|
||||
vst1.16 d0, [r8, :64]! @ aecm->outBuf[i]
|
||||
bgt LOOP_POST_IFFT
|
||||
|
||||
ldr r3, =offset_aecm_xBuf
|
||||
ldr r12, =offset_aecm_dBufNoisy
|
||||
ldr r3, [r5, r3] @ &aecm->xBuf[0]
|
||||
ldr r1, [r5, r12] @ &aecm->dBufNoisy[0]
|
||||
add r2, r3, #(PART_LEN * 2) @ &aecm->xBuf[PART_LEN]
|
||||
add r0, r1, #(PART_LEN * 2) @ &aecm->dBufNoisy[PART_LEN]
|
||||
mov r4, #(PART_LEN / 16) @ Loop counter, unrolled by 16.
|
||||
|
||||
LOOP_COPY:
|
||||
vld1.16 {q10, q11}, [r2, :256]!
|
||||
vld1.16 {q12, q13}, [r0, :256]!
|
||||
subs r4, #1
|
||||
vst1.16 {q10, q11}, [r3, :256]!
|
||||
vst1.16 {q12, q13}, [r1, :256]!
|
||||
bgt LOOP_COPY
|
||||
|
||||
ldr r2, [sp, #24]
|
||||
cmp r2, #0 @ Check if (nearendClean != NULL).
|
||||
beq END
|
||||
|
||||
ldr r4, =offset_aecm_dBufClean
|
||||
ldr r1, [r5, r4] @ &aecm->dBufClean[0]
|
||||
add r0, r1, #(PART_LEN * 2) @ &aecm->dBufClean[PART_LEN]
|
||||
|
||||
vld1.16 {q10, q11}, [r0, :256]!
|
||||
vld1.16 {q12, q13}, [r0, :256]!
|
||||
vst1.16 {q10, q11}, [r1, :256]!
|
||||
vst1.16 {q12, q13}, [r1, :256]!
|
||||
vld1.16 {q10, q11}, [r0, :256]!
|
||||
vld1.16 {q12, q13}, [r0, :256]!
|
||||
vst1.16 {q10, q11}, [r1, :256]!
|
||||
vst1.16 {q12, q13}, [r1, :256]!
|
||||
|
||||
END:
|
||||
pop {r4-r8, pc}
|
||||
.fnend
|
||||
|
||||
@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
|
||||
@ const WebRtc_UWord16* far_spectrum,
|
||||
@ WebRtc_Word32* echo_est,
|
||||
@ WebRtc_UWord32* far_energy,
|
||||
@ WebRtc_UWord32* echo_energy_adapt,
|
||||
@ WebRtc_UWord32* echo_energy_stored);
|
||||
.align 2
|
||||
@ Register use on entry: r0 = aecm, r1 = far_spectrum, r2 = echo_est,
@ r3 = far_energy; echo_energy_adapt and echo_energy_stored are read from
@ the stack. The NEON loop handles PART_LEN elements, 8 per iteration, and
@ the scalar code after the loop handles the one remaining element.
@ NOTE(review): the loop multiplies with vmull.u16 (unsigned), while the
@ scalar tail loads channelStored with ldrsh (sign-extending). The two agree
@ only for non-negative channel values -- confirm against the C reference.
WebRtcAecm_CalcLinearEnergiesNeon:
|
||||
.fnstart
|
||||
.save {r4-r7}
|
||||
push {r4-r7}
|
||||
|
||||
vmov.i32 q14, #0
|
||||
vmov.i32 q8, #0
|
||||
vmov.i32 q9, #0
|
||||
|
||||
ldr r7, =offset_aecm_channelStored
|
||||
ldr r5, =offset_aecm_channelAdapt16
|
||||
|
||||
mov r4, r2
|
||||
mov r12, #(PART_LEN / 8) @ Loop counter, unrolled by 8.
|
||||
ldr r6, [r0, r7]
|
||||
ldr r7, [r0, r5]
|
||||
|
||||
LOOP_CALC_LINEAR_ENERGIES:
|
||||
vld1.16 {d26, d27}, [r1]! @ far_spectrum[i]
|
||||
vld1.16 {d24, d25}, [r6, :128]! @ &aecm->channelStored[i]
|
||||
vld1.16 {d0, d1}, [r7, :128]! @ &aecm->channelAdapt16[i]
|
||||
vaddw.u16 q14, q14, d26
|
||||
vmull.u16 q10, d26, d24
|
||||
vmull.u16 q11, d27, d25
|
||||
vaddw.u16 q14, q14, d27
|
||||
vmull.u16 q1, d26, d0
|
||||
vst1.32 {q10, q11}, [r4, :256]! @ &echo_est[i]
|
||||
vadd.u32 q8, q10
|
||||
vmull.u16 q2, d27, d1
|
||||
vadd.u32 q8, q11
|
||||
vadd.u32 q9, q1
|
||||
subs r12, #1
|
||||
vadd.u32 q9, q2
|
||||
bgt LOOP_CALC_LINEAR_ENERGIES
|
||||
|
||||
vadd.u32 d28, d29
|
||||
vpadd.u32 d28, d28
|
||||
vmov.32 r12, d28[0]
|
||||
vadd.u32 d18, d19
|
||||
vpadd.u32 d18, d18
|
||||
vmov.32 r5, d18[0] @ echo_energy_adapt_r
|
||||
vadd.u32 d16, d17
|
||||
vpadd.u32 d16, d16
|
||||
|
||||
ldrh r1, [r1] @ far_spectrum[i]
|
||||
add r12, r12, r1
|
||||
str r12, [r3] @ far_energy
|
||||
vmov.32 r2, d16[0]
|
||||
|
||||
ldrsh r12, [r6] @ aecm->channelStored[i]
|
||||
ldrh r6, [r7] @ aecm->channelAdapt16[i]
|
||||
mul r0, r12, r1
|
||||
mla r1, r6, r1, r5
|
||||
add r2, r2, r0
|
||||
str r0, [r4] @ echo_est[i]
|
||||
ldr r4, [sp, #20] @ &echo_energy_stored
|
||||
str r2, [r4]
|
||||
ldr r3, [sp, #16] @ &echo_energy_adapt
|
||||
str r1, [r3]
|
||||
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
|
||||
@ const uint16_t* far_spectrum,
|
||||
@ int32_t* echo_est);
|
||||
.align 2
|
||||
@ Register use on entry: r0 = aecm, r1 = far_spectrum, r2 = echo_est.
@ Copies channelAdapt16 to channelStored and writes
@ echo_est[i] = far_spectrum[i] * channelAdapt16[i]: PART_LEN elements in
@ the NEON loop (8 per iteration) plus one trailing element in scalar code.
@ NOTE(review): the loop multiplies with vmull.u16 (unsigned), while the
@ scalar tail loads channelAdapt16 with ldrsh (sign-extending). The two agree
@ only for non-negative channel values -- confirm against the C reference.
WebRtcAecm_StoreAdaptiveChannelNeon:
|
||||
.fnstart
|
||||
ldr r3, =offset_aecm_channelAdapt16
|
||||
ldr r12, =offset_aecm_channelStored
|
||||
ldr r3, [r0, r3]
|
||||
ldr r0, [r0, r12]
|
||||
mov r12, #(PART_LEN / 8) @ Loop counter, unrolled by 8.
|
||||
|
||||
LOOP_STORE_ADAPTIVE_CHANNEL:
|
||||
vld1.16 {d24, d25}, [r3, :128]! @ &aecm->channelAdapt16[i]
|
||||
vld1.16 {d26, d27}, [r1]! @ &far_spectrum[i]
|
||||
vst1.16 {d24, d25}, [r0, :128]! @ &aecm->channelStored[i]
|
||||
vmull.u16 q10, d26, d24
|
||||
vmull.u16 q11, d27, d25
|
||||
vst1.16 {q10, q11}, [r2, :256]! @ echo_est[i]
|
||||
subs r12, #1
|
||||
bgt LOOP_STORE_ADAPTIVE_CHANNEL
|
||||
|
||||
ldrsh r12, [r3]
|
||||
strh r12, [r0]
|
||||
ldrh r1, [r1]
|
||||
mul r3, r1, r12
|
||||
str r3, [r2]
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
|
||||
.align 2
|
||||
@ Register use on entry: r0 = aecm.
@ Copies channelStored to channelAdapt16 and writes
@ channelAdapt32[i] = channelStored[i] << 16: PART_LEN elements in the NEON
@ loop (8 per iteration) plus one trailing element in scalar code.
WebRtcAecm_ResetAdaptiveChannelNeon:
|
||||
.fnstart
|
||||
ldr r1, =offset_aecm_channelAdapt16
|
||||
ldr r2, =offset_aecm_channelAdapt32
|
||||
movw r3, #offset_aecm_channelStored
|
||||
ldr r1, [r0, r1] @ &aecm->channelAdapt16[0]
|
||||
ldr r2, [r0, r2] @ &aecm->channelAdapt32[0]
|
||||
ldr r0, [r0, r3] @ &aecm->channelStored[0]
|
||||
mov r3, #(PART_LEN / 8) @ Loop counter, unrolled by 8.
|
||||
|
||||
LOOP_RESET_ADAPTIVE_CHANNEL:
|
||||
vld1.16 {d24, d25}, [r0, :128]!
|
||||
subs r3, #1
|
||||
vst1.16 {d24, d25}, [r1, :128]!
|
||||
vshll.s16 q10, d24, #16
|
||||
vshll.s16 q11, d25, #16
|
||||
vst1.16 {q10, q11}, [r2, :256]!
|
||||
bgt LOOP_RESET_ADAPTIVE_CHANNEL
|
||||
|
||||
ldrh r0, [r0]
|
||||
strh r0, [r1]
|
||||
mov r0, r0, asl #16
|
||||
str r0, [r2]
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning,
|
||||
@ the order was reversed and one useless element (0) was removed.
|
||||
.align 3
|
||||
kSqrtHanningReversed:
|
||||
.hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
|
||||
.hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
|
||||
.hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
|
||||
.hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
|
||||
.hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
|
||||
.hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399
|
@ -34,10 +34,10 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) =
|
||||
1594, 1196, 798, 399
|
||||
};
|
||||
|
||||
static void WindowAndFFTNeon(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
complex16_t* freq_signal,
|
||||
int time_signal_scaling) {
|
||||
void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
|
||||
const WebRtc_Word16* time_signal,
|
||||
complex16_t* freq_signal,
|
||||
int time_signal_scaling) {
|
||||
int i, j;
|
||||
|
||||
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
|
||||
@ -86,11 +86,11 @@ static void WindowAndFFTNeon(WebRtc_Word16* fft,
|
||||
}
|
||||
}
|
||||
|
||||
static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean) {
|
||||
void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
WebRtc_Word16* fft,
|
||||
complex16_t* efw,
|
||||
WebRtc_Word16* output,
|
||||
const WebRtc_Word16* nearendClean) {
|
||||
int i, j, outCFFT;
|
||||
|
||||
// Synthesis
|
||||
@ -186,12 +186,12 @@ static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
}
|
||||
}
|
||||
|
||||
static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored) {
|
||||
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est,
|
||||
WebRtc_UWord32* far_energy,
|
||||
WebRtc_UWord32* echo_energy_adapt,
|
||||
WebRtc_UWord32* echo_energy_stored) {
|
||||
int i;
|
||||
|
||||
register WebRtc_UWord32 far_energy_r;
|
||||
@ -249,9 +249,9 @@ static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
|
||||
aecm->channelAdapt16[i], far_spectrum[i]);
|
||||
}
|
||||
|
||||
static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est) {
|
||||
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
|
||||
const WebRtc_UWord16* far_spectrum,
|
||||
WebRtc_Word32* echo_est) {
|
||||
int i;
|
||||
|
||||
// During startup we store the channel every block.
|
||||
@ -271,7 +271,7 @@ static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
|
||||
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
|
||||
}
|
||||
|
||||
static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
|
||||
void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < PART_LEN - 7; i += 8) {
|
||||
@ -292,10 +292,3 @@ static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
|
||||
(WebRtc_Word32)aecm->channelStored[i], 16);
|
||||
}
|
||||
|
||||
void WebRtcAecm_InitNeon(void) {
|
||||
WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
|
||||
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
|
||||
WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
|
||||
WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
|
||||
WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
|
||||
}
|
||||
|
26
src/modules/audio_processing/aecm/aecm_core_neon_offsets.c
Normal file
26
src/modules/audio_processing/aecm/aecm_core_neon_offsets.c
Normal file
@ -0,0 +1,26 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "aecm_core.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
// Define variables holding the byte offsets of AecmCore_t members. This
// file is compiled to assembly and the offsets are extracted from it into
|
||||
// constant defines, which are then only used in ARM assembly code.
|
||||
int offset_aecm_dfaCleanQDomain = offsetof(AecmCore_t, dfaCleanQDomain);
|
||||
int offset_aecm_outBuf = offsetof(AecmCore_t, outBuf);
|
||||
int offset_aecm_xBuf = offsetof(AecmCore_t, xBuf);
|
||||
int offset_aecm_dBufNoisy = offsetof(AecmCore_t, dBufNoisy);
|
||||
int offset_aecm_dBufClean = offsetof(AecmCore_t, dBufClean);
|
||||
int offset_aecm_channelStored = offsetof(AecmCore_t, channelStored);
|
||||
int offset_aecm_channelAdapt16 = offsetof(AecmCore_t, channelAdapt16);
|
||||
int offset_aecm_channelAdapt32 = offsetof(AecmCore_t, channelAdapt32);
|
||||
|
98
src/modules/audio_processing/aecm/aecm_defines.h
Normal file
98
src/modules/audio_processing/aecm/aecm_defines.h
Normal file
@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_DEFINES_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_DEFINES_H_
|
||||
|
||||
#define AECM_DYNAMIC_Q /* Turn on/off dynamic Q-domain. */
|
||||
|
||||
/* #define AECM_SHORT For 32 sample partition length. */
|
||||
|
||||
/* Algorithm parameters */
|
||||
#define FRAME_LEN 80 /* Total frame length, 10 ms. */
|
||||
|
||||
#ifdef AECM_SHORT
|
||||
#define PART_LEN 32 /* Length of partition. */
|
||||
#define PART_LEN_SHIFT 6 /* Length of (PART_LEN * 2) in base 2. */
|
||||
#else
|
||||
#define PART_LEN 64 /* Length of partition. */
|
||||
#define PART_LEN_SHIFT 7 /* Length of (PART_LEN * 2) in base 2. */
|
||||
#endif
|
||||
|
||||
#define PART_LEN1 (PART_LEN + 1) /* Unique fft coefficients. */
|
||||
#define PART_LEN2 (PART_LEN << 1) /* Length of partition * 2. */
|
||||
#define PART_LEN4 (PART_LEN << 2) /* Length of partition * 4. */
|
||||
#define FAR_BUF_LEN PART_LEN4 /* Length of buffers. */
|
||||
#define MAX_DELAY 100
|
||||
|
||||
/* Counter parameters */
|
||||
#ifdef AECM_SHORT
|
||||
#define CONV_LEN 1024 /* Convergence length used at startup. */
|
||||
#else
|
||||
#define CONV_LEN 512 /* Convergence length used at startup. */
|
||||
#endif
|
||||
#define CONV_LEN2 (CONV_LEN << 1) /* Used at startup. */
|
||||
|
||||
/* Energy parameters */
|
||||
#define MAX_BUF_LEN 64 /* History length of energy signals. */
|
||||
#define FAR_ENERGY_MIN 1025 /* Lowest Far energy level: At least 2 */
|
||||
/* in energy. */
|
||||
#define FAR_ENERGY_DIFF 929 /* Allowed difference between max */
|
||||
/* and min. */
|
||||
#define ENERGY_DEV_OFFSET 0 /* The energy error offset in Q8. */
|
||||
#define ENERGY_DEV_TOL 400 /* The energy estimation tolerance (Q8). */
|
||||
#define FAR_ENERGY_VAD_REGION 230 /* Far VAD tolerance region. */
|
||||
|
||||
/* Stepsize parameters */
|
||||
#define MU_MIN 10 /* Min stepsize 2^-MU_MIN (far end energy */
|
||||
/* dependent). */
|
||||
#define MU_MAX 1 /* Max stepsize 2^-MU_MAX (far end energy */
|
||||
/* dependent). */
|
||||
#define MU_DIFF 9 /* MU_MIN - MU_MAX */
|
||||
|
||||
/* Channel parameters */
|
||||
#define MIN_MSE_COUNT 20 /* Min number of consecutive blocks with enough */
|
||||
/* far end energy to compare channel estimates. */
|
||||
#define MIN_MSE_DIFF 29 /* The ratio between adapted and stored channel to */
|
||||
/* accept a new storage (0.8 in Q-MSE_RESOLUTION). */
|
||||
#define MSE_RESOLUTION 5 /* MSE parameter resolution. */
|
||||
#define RESOLUTION_CHANNEL16 12 /* W16 Channel in Q-RESOLUTION_CHANNEL16. */
|
||||
#define RESOLUTION_CHANNEL32 28 /* W32 Channel in Q-RESOLUTION_CHANNEL. */
|
||||
#define CHANNEL_VAD 16 /* Minimum energy in frequency band */
|
||||
/* to update channel. */
|
||||
|
||||
/* Suppression gain parameters: SUPGAIN parameters in Q-(RESOLUTION_SUPGAIN). */
|
||||
#define RESOLUTION_SUPGAIN 8 /* Channel in Q-(RESOLUTION_SUPGAIN). */
|
||||
#define SUPGAIN_DEFAULT (1 << RESOLUTION_SUPGAIN) /* Default. */
|
||||
#define SUPGAIN_ERROR_PARAM_A 3072 /* Estimation error parameter */
|
||||
/* (Maximum gain) (8 in Q8). */
|
||||
#define SUPGAIN_ERROR_PARAM_B 1536 /* Estimation error parameter */
|
||||
/* (Gain before going down). */
|
||||
#define SUPGAIN_ERROR_PARAM_D SUPGAIN_DEFAULT /* Estimation error parameter */
|
||||
/* (Should be the same as Default) (1 in Q8). */
|
||||
#define SUPGAIN_EPC_DT 200 /* SUPGAIN_ERROR_PARAM_C * ENERGY_DEV_TOL */
|
||||
|
||||
/* Defines for "check delay estimation" */
|
||||
#define CORR_WIDTH 31 /* Number of samples to correlate over. */
|
||||
#define CORR_MAX 16 /* Maximum correlation offset. */
|
||||
#define CORR_MAX_BUF 63
|
||||
#define CORR_DEV 4
|
||||
#define CORR_MAX_LEVEL 20
|
||||
#define CORR_MAX_LOW 4
|
||||
/* Fully parenthesized so the macro expands safely inside larger expressions. */
#define CORR_BUF_LEN ((CORR_MAX << 1) + 1)
|
||||
/* Note that CORR_WIDTH + 2*CORR_MAX <= MAX_BUF_LEN. */
|
||||
|
||||
#define ONE_Q14 (1 << 14)
|
||||
|
||||
/* NLP defines */
|
||||
#define NLP_COMP_LOW 3277 /* 0.2 in Q14 */
|
||||
#define NLP_COMP_HIGH ONE_Q14 /* 1 in Q14 */
|
||||
|
||||
#endif
|
Loading…
x
Reference in New Issue
Block a user