Refactored Neon code for AECM module, by using pure assembly code.
Bit exact. Review URL: https://webrtc-codereview.appspot.com/447008 git-svn-id: http://webrtc.googlecode.com/svn/trunk@2382 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
		| @@ -56,7 +56,21 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES | ||||
| LOCAL_MODULE := libwebrtc_aecm_neon | ||||
| LOCAL_MODULE_TAGS := optional | ||||
|  | ||||
| LOCAL_SRC_FILES := aecm_core_neon.c | ||||
| GEN := $(LOCAL_PATH)/aecm_core_neon_offsets.h | ||||
|  | ||||
| # Generate a header file aecm_core_neon_offsets.h which will be included in | ||||
| # assembly file aecm_core_neon.S, from file aecm_core_neon_offsets.c. | ||||
| $(GEN): $(LOCAL_PATH)/../../../../src/build/generate_asm_header.py \ | ||||
|             $(intermediates)/aecm_core_neon_offsets.S | ||||
| 	@python $^ $@ offset_aecm_ | ||||
|  | ||||
| $(intermediates)/aecm_core_neon_offsets.S: \ | ||||
| 	    $(LOCAL_PATH)/aecm_core_neon_offsets.c | ||||
| 	@$(TARGET_CC) $(addprefix -I, $(LOCAL_INCLUDES)) $(addprefix -isystem ,\ | ||||
|             $(TARGET_C_INCLUDES)) -S -o $@ $^ | ||||
|  | ||||
| LOCAL_GENERATED_SOURCES := $(GEN) | ||||
| LOCAL_SRC_FILES := aecm_core_neon.S | ||||
|  | ||||
| # Flags passed to both C and C++ files. | ||||
| LOCAL_CFLAGS := \ | ||||
| @@ -70,6 +84,8 @@ LOCAL_C_INCLUDES := \ | ||||
|     $(LOCAL_PATH)/../../.. \ | ||||
|     $(LOCAL_PATH)/../../../common_audio/signal_processing/include | ||||
|  | ||||
| LOCAL_INCLUDES := $(LOCAL_C_INCLUDES) | ||||
|  | ||||
| ifndef NDK_ROOT | ||||
| include external/stlport/libstlport.mk | ||||
| endif | ||||
|   | ||||
| @@ -11,6 +11,7 @@ | ||||
| #include "aecm_core.h" | ||||
|  | ||||
| #include <assert.h> | ||||
| #include <stddef.h> | ||||
| #include <stdlib.h> | ||||
|  | ||||
| #include "cpu_features_wrapper.h" | ||||
| @@ -197,6 +198,15 @@ static const WebRtc_Word16 kSinTable[] = { | ||||
| static const WebRtc_Word16 kNoiseEstQDomain = 15; | ||||
| static const WebRtc_Word16 kNoiseEstIncCount = 5; | ||||
|  | ||||
| // TODO(andrew): put this into general WebRTC so other modules can use it. | ||||
| // Define a compile-time assertion. When |boolean_cond| is false the array | ||||
| // below is declared with size -1, which is a compile error; when it is true | ||||
| // a one-element static array named static_assert_<name> is defined. | ||||
| #define WEBRTC_STATIC_ASSERT(name, boolean_cond) \ | ||||
|   static char const static_assert_##name[(boolean_cond) ? 1 : -1] = {'!'} | ||||
|  | ||||
| // Assert at compile time that PART_LEN is a multiple of 16. The assembly | ||||
| // code relies on this assumption, so check the assembly files before | ||||
| // changing PART_LEN. | ||||
| WEBRTC_STATIC_ASSERT(PART_LEN, PART_LEN % 16 == 0); | ||||
|  | ||||
| static void ComfortNoise(AecmCore_t* aecm, | ||||
|                          const WebRtc_UWord16* dfa, | ||||
|                          complex16_t* out, | ||||
| @@ -395,6 +405,18 @@ static void WindowAndFFTC(WebRtc_Word16* fft, | ||||
|     } | ||||
| } | ||||
|  | ||||
| // Initialize function pointers for ARM Neon platform. | ||||
| // Redirects the five AECM function pointers assigned below to their | ||||
| // *Neon counterparts. Only compiled when WEBRTC_DETECT_ARM_NEON or | ||||
| // WEBRTC_ARCH_ARM_NEON is defined. | ||||
| #if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON) | ||||
| static void WebRtcAecm_InitNeon(void) | ||||
| { | ||||
|   WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon; | ||||
|   WebRtcAecm_InverseFFTAndWindow = WebRtcAecm_InverseFFTAndWindowNeon; | ||||
|   WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon; | ||||
|   WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon; | ||||
|   WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon; | ||||
| } | ||||
| #endif | ||||
|  | ||||
| static void InverseFFTAndWindowC(AecmCore_t* aecm, | ||||
|                                  WebRtc_Word16* fft, | ||||
|                                  complex16_t* efw, | ||||
| @@ -673,7 +695,7 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq) | ||||
|     uint64_t features = WebRtc_GetCPUFeaturesARM(); | ||||
|     if ((features & kCPUFeatureNEON) != 0) | ||||
|     { | ||||
|         WebRtcAecm_InitNeon(); | ||||
|       WebRtcAecm_InitNeon(); | ||||
|     } | ||||
| #elif defined(WEBRTC_ARCH_ARM_NEON) | ||||
|     WebRtcAecm_InitNeon(); | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
| /* | ||||
|  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | ||||
|  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | ||||
|  * | ||||
|  *  Use of this source code is governed by a BSD-style license | ||||
|  *  that can be found in the LICENSE file in the root of the source | ||||
| @@ -10,92 +10,13 @@ | ||||
|  | ||||
| // Performs echo control (suppression) with fft routines in fixed-point | ||||
|  | ||||
| #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_MAIN_SOURCE_AECM_CORE_H_ | ||||
| #define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_MAIN_SOURCE_AECM_CORE_H_ | ||||
|  | ||||
| #define AECM_DYNAMIC_Q // turn on/off dynamic Q-domain | ||||
| //#define AECM_WITH_ABS_APPROX | ||||
| //#define AECM_SHORT                // for 32 sample partition length (otherwise 64) | ||||
| #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_ | ||||
| #define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_ | ||||
|  | ||||
| #include "typedefs.h" | ||||
| #include "signal_processing_library.h" | ||||
|  | ||||
| // Algorithm parameters | ||||
|  | ||||
| #define FRAME_LEN       80              // Total frame length, 10 ms | ||||
| #ifdef AECM_SHORT | ||||
|  | ||||
| #define PART_LEN        32              // Length of partition | ||||
| #define PART_LEN_SHIFT  6               // Length of (PART_LEN * 2) in base 2 | ||||
|  | ||||
| #else | ||||
|  | ||||
| #define PART_LEN        64              // Length of partition | ||||
| #define PART_LEN_SHIFT  7               // Length of (PART_LEN * 2) in base 2 | ||||
|  | ||||
| #endif | ||||
|  | ||||
| #define PART_LEN1       (PART_LEN + 1)  // Unique fft coefficients | ||||
| #define PART_LEN2       (PART_LEN << 1) // Length of partition * 2 | ||||
| #define PART_LEN4       (PART_LEN << 2) // Length of partition * 4 | ||||
| #define FAR_BUF_LEN     PART_LEN4       // Length of buffers | ||||
| #define MAX_DELAY 100 | ||||
|  | ||||
| // Counter parameters | ||||
| #ifdef AECM_SHORT | ||||
|  | ||||
| #define CONV_LEN        1024            // Convergence length used at startup | ||||
| #else | ||||
|  | ||||
| #define CONV_LEN        512             // Convergence length used at startup | ||||
| #endif | ||||
|  | ||||
| #define CONV_LEN2       (CONV_LEN << 1) // Convergence length * 2 used at startup | ||||
| // Energy parameters | ||||
| #define MAX_BUF_LEN     64              // History length of energy signals | ||||
|  | ||||
| #define FAR_ENERGY_MIN  1025            // Lowest Far energy level: At least 2 in energy | ||||
| #define FAR_ENERGY_DIFF 929             // Allowed difference between max and min | ||||
|  | ||||
| #define ENERGY_DEV_OFFSET       0       // The energy error offset in Q8 | ||||
| #define ENERGY_DEV_TOL  400             // The energy estimation tolerance in Q8 | ||||
| #define FAR_ENERGY_VAD_REGION   230     // Far VAD tolerance region | ||||
| // Stepsize parameters | ||||
| #define MU_MIN          10              // Min stepsize 2^-MU_MIN (far end energy dependent) | ||||
| #define MU_MAX          1               // Max stepsize 2^-MU_MAX (far end energy dependent) | ||||
| #define MU_DIFF         9               // MU_MIN - MU_MAX | ||||
| // Channel parameters | ||||
| #define MIN_MSE_COUNT   20              // Min number of consecutive blocks with enough far end | ||||
|                                         // energy to compare channel estimates | ||||
| #define MIN_MSE_DIFF    29              // The ratio between adapted and stored channel to | ||||
|                                         // accept a new storage (0.8 in Q-MSE_RESOLUTION) | ||||
| #define MSE_RESOLUTION  5               // MSE parameter resolution | ||||
| #define RESOLUTION_CHANNEL16    12      // W16 Channel in Q-RESOLUTION_CHANNEL16 | ||||
| #define RESOLUTION_CHANNEL32    28      // W32 Channel in Q-RESOLUTION_CHANNEL | ||||
| #define CHANNEL_VAD     16              // Minimum energy in frequency band to update channel | ||||
| // Suppression gain parameters: SUPGAIN_ parameters in Q-(RESOLUTION_SUPGAIN) | ||||
| #define RESOLUTION_SUPGAIN      8       // Channel in Q-(RESOLUTION_SUPGAIN) | ||||
| #define SUPGAIN_DEFAULT (1 << RESOLUTION_SUPGAIN)   // Default suppression gain | ||||
| #define SUPGAIN_ERROR_PARAM_A   3072    // Estimation error parameter (Maximum gain) (8 in Q8) | ||||
| #define SUPGAIN_ERROR_PARAM_B   1536    // Estimation error parameter (Gain before going down) | ||||
| #define SUPGAIN_ERROR_PARAM_D   SUPGAIN_DEFAULT // Estimation error parameter | ||||
|                                                 // (Should be the same as Default) (1 in Q8) | ||||
| #define SUPGAIN_EPC_DT  200             // = SUPGAIN_ERROR_PARAM_C * ENERGY_DEV_TOL | ||||
| // Defines for "check delay estimation" | ||||
| #define CORR_WIDTH      31              // Number of samples to correlate over. | ||||
| #define CORR_MAX        16              // Maximum correlation offset | ||||
| #define CORR_MAX_BUF    63 | ||||
| #define CORR_DEV        4 | ||||
| #define CORR_MAX_LEVEL  20 | ||||
| #define CORR_MAX_LOW    4 | ||||
| #define CORR_BUF_LEN    (CORR_MAX << 1) + 1 | ||||
| // Note that CORR_WIDTH + 2*CORR_MAX <= MAX_BUF_LEN | ||||
|  | ||||
| #define ONE_Q14         (1 << 14) | ||||
|  | ||||
| // NLP defines | ||||
| #define NLP_COMP_LOW    3277            // 0.2 in Q14 | ||||
| #define NLP_COMP_HIGH   ONE_Q14         // 1 in Q14 | ||||
| #include "aecm_defines.h" | ||||
|  | ||||
| extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[]; | ||||
|  | ||||
| @@ -368,8 +289,33 @@ typedef void (*InverseFFTAndWindow)( | ||||
|     const WebRtc_Word16* nearendClean); | ||||
| extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow; | ||||
|  | ||||
| // Initialization of the above function pointers for ARM Neon. | ||||
| void WebRtcAecm_InitNeon(void); | ||||
| // For the above function pointers, functions for generic platforms are declared | ||||
| // and defined as static in file aecm_core.c, while those for ARM Neon platforms | ||||
| // are declared below and defined in file aecm_core_neon.S. | ||||
| #if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON) | ||||
| void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft, | ||||
|                                  const WebRtc_Word16* time_signal, | ||||
|                                  complex16_t* freq_signal, | ||||
|                                  int time_signal_scaling); | ||||
|  | ||||
| void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, | ||||
|                                         WebRtc_Word16* fft, | ||||
|                                         complex16_t* efw, | ||||
|                                         WebRtc_Word16* output, | ||||
|                                         const WebRtc_Word16* nearendClean); | ||||
|  | ||||
| void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, | ||||
|                                        const WebRtc_UWord16* far_spectrum, | ||||
|                                        WebRtc_Word32* echo_est, | ||||
|                                        WebRtc_UWord32* far_energy, | ||||
|                                        WebRtc_UWord32* echo_energy_adapt, | ||||
|                                        WebRtc_UWord32* echo_energy_stored); | ||||
|  | ||||
| void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, | ||||
|                                          const WebRtc_UWord16* far_spectrum, | ||||
|                                          WebRtc_Word32* echo_est); | ||||
|  | ||||
| void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm); | ||||
| #endif | ||||
|  | ||||
| #endif | ||||
|   | ||||
							
								
								
									
										361
									
								
								src/modules/audio_processing/aecm/aecm_core_neon.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										361
									
								
								src/modules/audio_processing/aecm/aecm_core_neon.S
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,361 @@ | ||||
| @ | ||||
| @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | ||||
| @ | ||||
| @ Use of this source code is governed by a BSD-style license | ||||
| @ that can be found in the LICENSE file in the root of the source | ||||
| @ tree. An additional intellectual property rights grant can be found | ||||
| @ in the file PATENTS.  All contributing project authors may | ||||
| @ be found in the AUTHORS file in the root of the source tree. | ||||
| @ | ||||
|  | ||||
| @ aecm_core_neon.S | ||||
| @ This file contains some functions in AECM, optimized for ARM Neon | ||||
| @ platforms. Reference C code is in file aecm_core.c. Bit-exact. | ||||
|  | ||||
| .arch armv7-a | ||||
| .fpu neon | ||||
|  | ||||
| #include "aecm_defines.h" | ||||
| #include "aecm_core_neon_offsets.h" | ||||
|  | ||||
| .extern WebRtcAecm_kSqrtHanning | ||||
|  | ||||
| .global WebRtcAecm_WindowAndFFTNeon | ||||
| .global WebRtcAecm_InverseFFTAndWindowNeon | ||||
| .global WebRtcAecm_CalcLinearEnergiesNeon | ||||
| .global WebRtcAecm_StoreAdaptiveChannelNeon | ||||
| .global WebRtcAecm_ResetAdaptiveChannelNeon | ||||
|  | ||||
| @ void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft, | ||||
| @                                  const WebRtc_Word16* time_signal, | ||||
| @                                  complex16_t* freq_signal, | ||||
| @                                  int time_signal_scaling); | ||||
| @ | ||||
| @ On entry: r0 = fft, r1 = time_signal, r2 = freq_signal, | ||||
| @ r3 = time_signal_scaling. | ||||
| @ Shifts 2 * PART_LEN samples of |time_signal| by |time_signal_scaling|, | ||||
| @ multiplies them by the sqrt-Hanning window (Q14) and stores the results as | ||||
| @ the real parts of |fft| with zero imaginary parts. It then bit-reverses | ||||
| @ and transforms |fft| in place, and finally writes the first PART_LEN bins | ||||
| @ to |freq_signal| with the imaginary parts negated. | ||||
| .align  2 | ||||
| WebRtcAecm_WindowAndFFTNeon: | ||||
| .fnstart | ||||
| .save {r4, r5, lr} | ||||
|   push {r4, r5, lr} | ||||
|  | ||||
|   vdup.16 d16, r3 | ||||
|   mov r5, r2                                 @ Save freq_signal; r2 is reused below. | ||||
|  | ||||
|   vmov.i16 d21, #0                           @ For imaginary parts of |fft|. | ||||
|   vmov.i16 d27, #0                           @ For imaginary parts of |fft|. | ||||
|   ldr r2, =WebRtcAecm_kSqrtHanning | ||||
|   adr lr, kSqrtHanningReversed | ||||
|   add r4, r0, #(PART_LEN2 * 2)               @ &fft[PART_LEN2] | ||||
|   add r12, r1, #(PART_LEN * 2)               @ &time_signal[PART_LEN] | ||||
|   mov r3, #(PART_LEN / 4)                    @ Loop counter, unrolled by 4 | ||||
|  | ||||
| @ Each iteration windows 4 samples from each half of |time_signal|. | ||||
| @ vst2 interleaves the windowed values with the zero registers, so the | ||||
| @ results land in the real slots of |fft| and the imaginary slots are 0. | ||||
| LOOP_PART_LEN: | ||||
|   vld1.16 d0, [r1, :64]!                     @ time_signal[i] | ||||
|   vld1.16 d22, [r12, :64]!                   @ time_signal[i + PART_LEN] | ||||
|   vld1.16 d17, [r2, :64]!                    @ WebRtcAecm_kSqrtHanning[i] | ||||
|   vld1.16 d23, [lr, :64]!                    @ kSqrtHanningReversed[i] | ||||
|   vshl.s16  d18, d0, d16 | ||||
|   vshl.s16  d22, d22, d16 | ||||
|   vmull.s16 q9, d18, d17 | ||||
|   vmull.s16 q12, d22, d23 | ||||
|   subs r3, #1 | ||||
|   vshrn.i32 d20, q9, #14 | ||||
|   vshrn.i32 d26, q12, #14 | ||||
|   vst2.16 {d20, d21}, [r0, :128]!            @ fft[j] | ||||
|   vst2.16 {d26, d27}, [r4, :128]!            @ fft[PART_LEN2 + j] | ||||
|   bgt LOOP_PART_LEN | ||||
|  | ||||
| @ The order argument 7 passed in r1 below equals log2(2 * PART_LEN) only | ||||
| @ for PART_LEN == 64. | ||||
|   sub r4, r0, #(PART_LEN2 * 2)               @ r4 points to fft[0] | ||||
|   mov r0, r4 | ||||
|   mov r1, #7 | ||||
|   bl  WebRtcSpl_ComplexBitReverse | ||||
|  | ||||
|   mov r0, r4 | ||||
|   mov r1, #7 | ||||
|   mov r2, #1 | ||||
|   bl  WebRtcSpl_ComplexFFT | ||||
|  | ||||
|   mov r3, #(PART_LEN * 2 / 16)               @ Loop counter, unrolled by 16. | ||||
|  | ||||
| @ Copies PART_LEN complex values from |fft| (r4) to |freq_signal| (r5), | ||||
| @ negating the imaginary parts, 8 complex values per iteration. | ||||
| LOOP_PART_LEN2: | ||||
|   @ freq_signal[i].real = fft[j]; | ||||
|   @ freq_signal[i].imag = - fft[j+1]; | ||||
|   vld2.16 {d20, d21, d22, d23}, [r4, :256]! | ||||
|   subs r3, #1 | ||||
|   vneg.s16 d22, d22 | ||||
|   vneg.s16 d23, d23 | ||||
|   vst2.16 {d20, d21, d22, d23}, [r5, :256]! | ||||
|   bgt LOOP_PART_LEN2 | ||||
|  | ||||
|   pop {r4, r5, pc} | ||||
| .fnend | ||||
|  | ||||
| @ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, | ||||
| @                                         WebRtc_Word16* fft, | ||||
| @                                         complex16_t* efw, | ||||
| @                                         WebRtc_Word16* output, | ||||
| @                                         const WebRtc_Word16* nearendClean); | ||||
| @ | ||||
| @ On entry: r0 = aecm, r1 = fft, r2 = efw, r3 = output; nearendClean is | ||||
| @ passed on the stack. | ||||
| @ Steps: | ||||
| @  1. Fill |fft| from |efw|: conjugated values in the lower half, mirrored | ||||
| @     (non-conjugated) values in the upper half, plus the PART_LEN bin. | ||||
| @  2. Bit-reverse and inverse-transform |fft| in place. | ||||
| @  3. Compact the real parts to the start of |fft|. | ||||
| @  4. Window, rescale, add aecm->outBuf, and write the saturated result to | ||||
| @     |fft| and |output|; store the windowed second half in aecm->outBuf. | ||||
| @  5. Move the upper PART_LEN samples of xBuf, dBufNoisy and (only when | ||||
| @     nearendClean != NULL) dBufClean down to the start of each buffer. | ||||
| .align  2 | ||||
| WebRtcAecm_InverseFFTAndWindowNeon: | ||||
| .fnstart | ||||
| .save {r4-r8, lr} | ||||
|   push {r4-r8, lr} | ||||
|  | ||||
|   @ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT | ||||
|   @ and WebRtcSpl_ComplexBitReverse. | ||||
|   mov r4, r1 | ||||
|   mov r5, r0 | ||||
|   mov r7, r3 | ||||
|  | ||||
|   add r3, r1, #((PART_LEN4 - 6) * 2)         @ &fft[PART_LEN4 - 6] | ||||
|   mov r6, #(PART_LEN / 4)                    @ Loop counter, unrolled by 4 | ||||
|   add r12, r2, #(PART_LEN * 4)               @ &efw[PART_LEN] | ||||
|   mov r8, #-16 | ||||
|  | ||||
| @ Each iteration handles 4 complex values of |efw|: they are stored with | ||||
| @ negated imaginary parts at the ascending pointer r1, and in reversed | ||||
| @ order (imaginary parts unchanged) at the descending pointer r3, which | ||||
| @ steps back 16 bytes (r8) per iteration. | ||||
| LOOP_PRE_IFFT: | ||||
|   vld2.16 {q10}, [r2, :128]! | ||||
|   vmov q11, q10 | ||||
|   vneg.s16 d23, d23 | ||||
|   vst2.16 {d22, d23}, [r1, :128]! | ||||
|   vrev64.16 q10, q10 | ||||
|   subs r6, #1 | ||||
|   vst2.16 {q10}, [r3], r8 | ||||
|   bgt LOOP_PRE_IFFT | ||||
|  | ||||
|   @  fft[PART_LEN2] = efw[PART_LEN].real; | ||||
|   @  fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; | ||||
|   @ r6 is 0 after the loop above, so ssub16 produces the halfword-wise | ||||
|   @ negation of r8; pkhbt then keeps the original low halfword (real) and | ||||
|   @ takes the negated high halfword (imag). | ||||
|   ldr r8, [r12] | ||||
|   ssub16 r2, r6, r8 | ||||
|   mov r1, #(PART_LEN2 * 2) | ||||
|   pkhbt r8, r8, r2 | ||||
|   str r8, [r4, r1] | ||||
|  | ||||
|   @ The order argument 7 passed in r1 below equals log2(2 * PART_LEN) only | ||||
|   @ for PART_LEN == 64. | ||||
|   mov r0, r4 | ||||
|   mov r1, #7 | ||||
|   bl  WebRtcSpl_ComplexBitReverse | ||||
|  | ||||
|   mov r0, r4 | ||||
|   mov r1, #7 | ||||
|   mov r2, #1 | ||||
|   bl  WebRtcSpl_ComplexIFFT | ||||
|  | ||||
|   @ r0 now holds the return value of WebRtcSpl_ComplexIFFT (outCFFT); it is | ||||
|   @ left untouched until the rsb below. | ||||
|   mov r1, r4 | ||||
|   mov r2, r4 | ||||
|   mov r3, #(PART_LEN * 2 / 8)                @ Loop counter, unrolled by 8. | ||||
|  | ||||
| @ De-interleaves |fft| in place: only the real parts (q10) are written | ||||
| @ back, packed contiguously from fft[0]. | ||||
| LOOP_GET_REAL_VALUES: | ||||
|   vld2.16 {q10, q11}, [r2, :256]! | ||||
|   subs r3, #1 | ||||
|   vst1.16 {q10}, [r1, :128]! | ||||
|   bgt LOOP_GET_REAL_VALUES | ||||
|  | ||||
|   ldr r6, =offset_aecm_outBuf | ||||
|   ldr r12, =offset_aecm_dfaCleanQDomain | ||||
|   ldr r8, [r5, r6]                           @ &aecm->outBuf[0] | ||||
|   ldrsh r2, [r5, r12]                        @ aecm->dfaCleanQDomain (value) | ||||
|  | ||||
|   adr r12, kSqrtHanningReversed | ||||
|   ldr r6, =WebRtcAecm_kSqrtHanning | ||||
|   rsb r0, r2, r0                             @ outCFFT - aecm->dfaCleanQDomain | ||||
|   vdup.32 q9, r0 | ||||
|   add r0, r4, #(PART_LEN * 2)                @ &fft[PART_LEN] | ||||
|   mov r3, #(PART_LEN / 4)                    @ Loop counter, unrolled by 4. | ||||
|  | ||||
| @ First half: fft[i] * window, rounding shift by 14, shift by q9, add | ||||
| @ outBuf[i], saturate to 16 bits, store to fft[i] and output[i]. | ||||
| @ Second half: fft[PART_LEN + i] * reversed window, truncating shift by 14, | ||||
| @ shift by q9, saturate to 16 bits, store to outBuf[i]. | ||||
| LOOP_POST_IFFT: | ||||
|   vld1.16 d16, [r4, :64]                     @ fft[i]; | ||||
|   vld1.16 d17, [r6, :64]!                    @ WebRtcAecm_kSqrtHanning[i] | ||||
|   vld1.16 d20, [r8, :64]                     @ aecm->outBuf[i] | ||||
|   vmull.s16 q8, d16, d17 | ||||
|   vmovl.s16 q10, d20 | ||||
|   vrshr.s32 q8, q8, #14 | ||||
|   vld1.16 d0, [r0, :64]!                     @ &fft[PART_LEN + i] | ||||
|   vshl.s32 q8, q8, q9 | ||||
|   vld1.16 d1, [r12, :64]!                    @ kSqrtHanningReversed[i] | ||||
|   vadd.i32 q8, q10 | ||||
|   vmull.s16 q0, d0, d1 | ||||
|   vqshrn.s32 d16, q8, #0 | ||||
|   vshr.s32 q0, q0, #14 | ||||
|   vst1.16 d16, [r4, :64]!                    @ fft[i]; | ||||
|   vshl.s32 q0, q0, q9 | ||||
|   vst1.16 d16, [r7, :64]!                    @ output[i] | ||||
|   vqshrn.s32 d0, q0, #0 | ||||
|   subs r3, #1 | ||||
|   vst1.16 d0, [r8, :64]!                     @ aecm->outBuf[i] | ||||
|   bgt LOOP_POST_IFFT | ||||
|  | ||||
|   ldr r3, =offset_aecm_xBuf | ||||
|   ldr r12, =offset_aecm_dBufNoisy | ||||
|   ldr r3, [r5, r3]                           @ &aecm->xBuf[0] | ||||
|   ldr r1, [r5, r12]                          @ &aecm->dBufNoisy[0] | ||||
|   add r2, r3, #(PART_LEN * 2)                @ &aecm->xBuf[PART_LEN] | ||||
|   add r0, r1, #(PART_LEN * 2)                @ &aecm->dBufNoisy[PART_LEN] | ||||
|   mov r4, #(PART_LEN / 16)                   @ Loop counter, unrolled by 16. | ||||
|  | ||||
| @ Copies PART_LEN halfwords from the upper half of xBuf and dBufNoisy to | ||||
| @ the start of each buffer, 16 halfwords per buffer per iteration. | ||||
| LOOP_COPY: | ||||
|   vld1.16 {q10, q11}, [r2, :256]! | ||||
|   vld1.16 {q12, q13}, [r0, :256]! | ||||
|   subs r4, #1 | ||||
|   vst1.16 {q10, q11}, [r3, :256]! | ||||
|   vst1.16 {q12, q13}, [r1, :256]! | ||||
|   bgt LOOP_COPY | ||||
|  | ||||
|   @ nearendClean is the 5th argument: it sits on the stack just above the | ||||
|   @ six registers (24 bytes) pushed on entry. | ||||
|   ldr r2, [sp, #24] | ||||
|   cmp r2, #0                                  @ Check if (nearendClean != NULL). | ||||
|   beq END | ||||
|  | ||||
|   ldr r4, =offset_aecm_dBufClean | ||||
|   ldr r1, [r5, r4]                            @ &aecm->dBufClean[0] | ||||
|   add r0, r1, #(PART_LEN * 2)                 @ &aecm->dBufClean[PART_LEN] | ||||
|  | ||||
|   @ NOTE: this fully unrolled copy moves 4 x 32 bytes = 64 halfwords, i.e. | ||||
|   @ it is only a PART_LEN-sample copy when PART_LEN == 64. | ||||
|   vld1.16 {q10, q11}, [r0, :256]! | ||||
|   vld1.16 {q12, q13}, [r0, :256]! | ||||
|   vst1.16 {q10, q11}, [r1, :256]! | ||||
|   vst1.16 {q12, q13}, [r1, :256]! | ||||
|   vld1.16 {q10, q11}, [r0, :256]! | ||||
|   vld1.16 {q12, q13}, [r0, :256]! | ||||
|   vst1.16 {q10, q11}, [r1, :256]! | ||||
|   vst1.16 {q12, q13}, [r1, :256]! | ||||
|  | ||||
| END: | ||||
|   pop {r4-r8, pc} | ||||
| .fnend | ||||
|  | ||||
| @ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, | ||||
| @                                        const WebRtc_UWord16* far_spectrum, | ||||
| @                                        WebRtc_Word32* echo_est, | ||||
| @                                        WebRtc_UWord32* far_energy, | ||||
| @                                        WebRtc_UWord32* echo_energy_adapt, | ||||
| @                                        WebRtc_UWord32* echo_energy_stored); | ||||
| @ | ||||
| @ On entry: r0 = aecm, r1 = far_spectrum, r2 = echo_est, r3 = far_energy; | ||||
| @ echo_energy_adapt and echo_energy_stored are passed on the stack. | ||||
| @ For i in [0, PART_LEN]: | ||||
| @   echo_est[i]          = channelStored[i] * far_spectrum[i] | ||||
| @   *far_energy         += far_spectrum[i] | ||||
| @   *echo_energy_stored += echo_est[i] | ||||
| @   *echo_energy_adapt  += channelAdapt16[i] * far_spectrum[i] | ||||
| @ The first PART_LEN elements are processed 8 per loop iteration; the last | ||||
| @ element (i == PART_LEN) is handled by the scalar code after the loop. | ||||
| .align  2 | ||||
| WebRtcAecm_CalcLinearEnergiesNeon: | ||||
| .fnstart | ||||
| .save {r4-r7} | ||||
|   push {r4-r7} | ||||
|  | ||||
|   @ Accumulators: q14 = far energy, q8 = stored echo energy, | ||||
|   @ q9 = adaptive echo energy. | ||||
|   vmov.i32 q14, #0 | ||||
|   vmov.i32 q8,  #0 | ||||
|   vmov.i32 q9,  #0 | ||||
|  | ||||
|   ldr r7, =offset_aecm_channelStored | ||||
|   ldr r5, =offset_aecm_channelAdapt16 | ||||
|  | ||||
|   mov r4, r2 | ||||
|   mov r12, #(PART_LEN / 8)                   @  Loop counter, unrolled by 8. | ||||
|   ldr r6, [r0, r7] | ||||
|   ldr r7, [r0, r5] | ||||
|  | ||||
| LOOP_CALC_LINEAR_ENERGIES: | ||||
|   vld1.16 {d26, d27}, [r1]!                  @ far_spectrum[i] | ||||
|   vld1.16 {d24, d25}, [r6, :128]!            @ &aecm->channelStored[i] | ||||
|   vld1.16 {d0, d1}, [r7, :128]!              @ &aecm->channelAdapt16[i] | ||||
|   vaddw.u16 q14, q14, d26 | ||||
|   vmull.u16 q10, d26, d24 | ||||
|   vmull.u16 q11, d27, d25 | ||||
|   vaddw.u16 q14, q14, d27 | ||||
|   vmull.u16 q1, d26, d0 | ||||
|   vst1.32 {q10, q11}, [r4, :256]!            @ &echo_est[i] | ||||
|   vadd.u32 q8, q10 | ||||
|   vmull.u16 q2, d27, d1 | ||||
|   vadd.u32 q8, q11 | ||||
|   vadd.u32 q9, q1 | ||||
|   subs r12, #1 | ||||
|   vadd.u32 q9, q2 | ||||
|   bgt LOOP_CALC_LINEAR_ENERGIES | ||||
|  | ||||
|   @ Reduce each 4-lane accumulator to a single 32-bit sum. | ||||
|   vadd.u32 d28, d29 | ||||
|   vpadd.u32 d28, d28 | ||||
|   vmov.32 r12, d28[0] | ||||
|   vadd.u32 d18, d19 | ||||
|   vpadd.u32 d18, d18 | ||||
|   vmov.32 r5, d18[0]                         @ echo_energy_adapt_r | ||||
|   vadd.u32 d16, d17 | ||||
|   vpadd.u32 d16, d16 | ||||
|  | ||||
|   @ Scalar tail for the last element; the pointers r1, r4, r6 and r7 were | ||||
|   @ advanced past the first PART_LEN elements by the loop above. | ||||
|   ldrh  r1, [r1]                             @ far_spectrum[i] | ||||
|   add r12, r12, r1 | ||||
|   str r12, [r3]                              @ far_energy | ||||
|   vmov.32 r2, d16[0] | ||||
|  | ||||
|   ldrsh r12, [r6]                            @ aecm->channelStored[i] | ||||
|   ldrh  r6, [r7]                             @ aecm->channelAdapt16[i] | ||||
|   mul r0, r12, r1 | ||||
|   mla r1, r6, r1, r5 | ||||
|   add r2, r2, r0 | ||||
|   str r0, [r4]                               @ echo_est[i] | ||||
|   ldr r4, [sp, #20]                          @ &echo_energy_stored | ||||
|   str r2, [r4] | ||||
|   ldr r3, [sp, #16]                          @ &echo_energy_adapt | ||||
|   str r1, [r3] | ||||
|  | ||||
|   pop {r4-r7} | ||||
|   bx  lr | ||||
| .fnend | ||||
|  | ||||
| @ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, | ||||
| @                                          const uint16_t* far_spectrum, | ||||
| @                                          int32_t* echo_est); | ||||
| @ | ||||
| @ On entry: r0 = aecm, r1 = far_spectrum, r2 = echo_est. | ||||
| @ For i in [0, PART_LEN]: | ||||
| @   channelStored[i] = channelAdapt16[i] | ||||
| @   echo_est[i]      = far_spectrum[i] * channelAdapt16[i] | ||||
| @ The first PART_LEN elements are processed 8 per loop iteration; the last | ||||
| @ element is handled by the scalar code after the loop. | ||||
| .align  2 | ||||
| WebRtcAecm_StoreAdaptiveChannelNeon: | ||||
| .fnstart | ||||
|   ldr r3, =offset_aecm_channelAdapt16 | ||||
|   ldr r12, =offset_aecm_channelStored | ||||
|   ldr r3, [r0, r3] | ||||
|   ldr r0, [r0, r12] | ||||
|   mov r12, #(PART_LEN / 8)                   @ Loop counter, unrolled by 8. | ||||
|  | ||||
| LOOP_STORE_ADAPTIVE_CHANNEL: | ||||
|   vld1.16 {d24, d25}, [r3, :128]!            @ &aecm->channelAdapt16[i] | ||||
|   vld1.16 {d26, d27}, [r1]!                  @ &far_spectrum[i] | ||||
|   vst1.16 {d24, d25}, [r0, :128]!            @ &aecm->channelStored[i] | ||||
|   vmull.u16 q10, d26, d24 | ||||
|   vmull.u16 q11, d27, d25 | ||||
|   vst1.16 {q10, q11}, [r2, :256]!            @ echo_est[i] | ||||
|   subs r12, #1 | ||||
|   bgt LOOP_STORE_ADAPTIVE_CHANNEL | ||||
|  | ||||
|   @ Scalar tail for the last element (i == PART_LEN). | ||||
|   ldrsh  r12, [r3] | ||||
|   strh  r12, [r0] | ||||
|   ldrh  r1, [r1] | ||||
|   mul r3, r1, r12 | ||||
|   str r3, [r2] | ||||
|  | ||||
|   bx  lr | ||||
| .fnend | ||||
|  | ||||
| @ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm); | ||||
| @ | ||||
| @ On entry: r0 = aecm. | ||||
| @ For i in [0, PART_LEN]: | ||||
| @   channelAdapt16[i] = channelStored[i] | ||||
| @   channelAdapt32[i] = channelStored[i] << 16 | ||||
| @ The first PART_LEN elements are processed 8 per loop iteration; the last | ||||
| @ element is handled by the scalar code after the loop. | ||||
| .align  2 | ||||
| WebRtcAecm_ResetAdaptiveChannelNeon: | ||||
| .fnstart | ||||
|   ldr r1, =offset_aecm_channelAdapt16 | ||||
|   ldr r2, =offset_aecm_channelAdapt32 | ||||
|   movw r3, #offset_aecm_channelStored | ||||
|   ldr r1, [r0, r1]                           @ &aecm->channelAdapt16[0] | ||||
|   ldr r2, [r0, r2]                           @ &aecm->channelAdapt32[0] | ||||
|   ldr r0, [r0, r3]                           @ &aecm->channelStored[0] | ||||
|   mov r3, #(PART_LEN / 8)                    @ Loop counter, unrolled by 8. | ||||
|  | ||||
| LOOP_RESET_ADAPTIVE_CHANNEL: | ||||
|   vld1.16 {d24, d25}, [r0, :128]! | ||||
|   subs r3, #1 | ||||
|   vst1.16 {d24, d25}, [r1, :128]! | ||||
|   vshll.s16 q10, d24, #16 | ||||
|   vshll.s16 q11, d25, #16 | ||||
|   vst1.16 {q10, q11}, [r2, :256]! | ||||
|   bgt LOOP_RESET_ADAPTIVE_CHANNEL | ||||
|  | ||||
|   @ Scalar tail for the last element (i == PART_LEN). | ||||
|   ldrh  r0, [r0] | ||||
|   strh  r0, [r1] | ||||
|   mov r0, r0, asl #16 | ||||
|   str r0, [r2] | ||||
|  | ||||
|   bx  lr | ||||
| .fnend | ||||
|  | ||||
|   @ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning, | ||||
|   @ the order was reversed and one useless element (0) was removed. | ||||
| .align  3 | ||||
| kSqrtHanningReversed: | ||||
|   .hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947 | ||||
|   .hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571 | ||||
|   .hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335 | ||||
|   .hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370 | ||||
|   .hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101 | ||||
|   .hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399 | ||||
| @@ -34,10 +34,10 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = | ||||
|   1594,  1196,  798,   399 | ||||
| }; | ||||
|  | ||||
| static void WindowAndFFTNeon(WebRtc_Word16* fft, | ||||
|                              const WebRtc_Word16* time_signal, | ||||
|                              complex16_t* freq_signal, | ||||
|                              int time_signal_scaling) { | ||||
| void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft, | ||||
|                                  const WebRtc_Word16* time_signal, | ||||
|                                  complex16_t* freq_signal, | ||||
|                                  int time_signal_scaling) { | ||||
|   int i, j; | ||||
|  | ||||
|   int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); | ||||
| @@ -86,11 +86,11 @@ static void WindowAndFFTNeon(WebRtc_Word16* fft, | ||||
|   } | ||||
| } | ||||
|  | ||||
| static void InverseFFTAndWindowNeon(AecmCore_t* aecm, | ||||
|                                     WebRtc_Word16* fft, | ||||
|                                     complex16_t* efw, | ||||
|                                     WebRtc_Word16* output, | ||||
|                                     const WebRtc_Word16* nearendClean) { | ||||
| void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, | ||||
|                                         WebRtc_Word16* fft, | ||||
|                                         complex16_t* efw, | ||||
|                                         WebRtc_Word16* output, | ||||
|                                         const WebRtc_Word16* nearendClean) { | ||||
|   int i, j, outCFFT; | ||||
|  | ||||
|   // Synthesis | ||||
| @@ -186,12 +186,12 @@ static void InverseFFTAndWindowNeon(AecmCore_t* aecm, | ||||
|   } | ||||
| } | ||||
|  | ||||
| static void CalcLinearEnergiesNeon(AecmCore_t* aecm, | ||||
|                                    const WebRtc_UWord16* far_spectrum, | ||||
|                                    WebRtc_Word32* echo_est, | ||||
|                                    WebRtc_UWord32* far_energy, | ||||
|                                    WebRtc_UWord32* echo_energy_adapt, | ||||
|                                    WebRtc_UWord32* echo_energy_stored) { | ||||
| void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, | ||||
|                                        const WebRtc_UWord16* far_spectrum, | ||||
|                                        WebRtc_Word32* echo_est, | ||||
|                                        WebRtc_UWord32* far_energy, | ||||
|                                        WebRtc_UWord32* echo_energy_adapt, | ||||
|                                        WebRtc_UWord32* echo_energy_stored) { | ||||
|   int i; | ||||
|  | ||||
|   register WebRtc_UWord32 far_energy_r; | ||||
| @@ -249,9 +249,9 @@ static void CalcLinearEnergiesNeon(AecmCore_t* aecm, | ||||
|       aecm->channelAdapt16[i], far_spectrum[i]); | ||||
| } | ||||
|  | ||||
| static void StoreAdaptiveChannelNeon(AecmCore_t* aecm, | ||||
|                                      const WebRtc_UWord16* far_spectrum, | ||||
|                                      WebRtc_Word32* echo_est) { | ||||
| void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, | ||||
|                                          const WebRtc_UWord16* far_spectrum, | ||||
|                                          WebRtc_Word32* echo_est) { | ||||
|   int i; | ||||
|  | ||||
|   // During startup we store the channel every block. | ||||
| @@ -271,7 +271,7 @@ static void StoreAdaptiveChannelNeon(AecmCore_t* aecm, | ||||
|   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); | ||||
| } | ||||
|  | ||||
| static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) { | ||||
| void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) { | ||||
|   int i; | ||||
|  | ||||
|   for (i = 0; i < PART_LEN - 7; i += 8) { | ||||
| @@ -292,10 +292,3 @@ static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) { | ||||
|       (WebRtc_Word32)aecm->channelStored[i], 16); | ||||
| } | ||||
|  | ||||
| void WebRtcAecm_InitNeon(void) { | ||||
|   WebRtcAecm_WindowAndFFT = WindowAndFFTNeon; | ||||
|   WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon; | ||||
|   WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon; | ||||
|   WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon; | ||||
|   WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon; | ||||
| } | ||||
|   | ||||
							
								
								
									
										26
									
								
								src/modules/audio_processing/aecm/aecm_core_neon_offsets.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								src/modules/audio_processing/aecm/aecm_core_neon_offsets.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
|  | ||||
| /* | ||||
|  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | ||||
|  * | ||||
|  *  Use of this source code is governed by a BSD-style license | ||||
|  *  that can be found in the LICENSE file in the root of the source | ||||
|  *  tree. An additional intellectual property rights grant can be found | ||||
|  *  in the file PATENTS.  All contributing project authors may | ||||
|  *  be found in the AUTHORS file in the root of the source tree. | ||||
|  */ | ||||
|  | ||||
| #include "aecm_core.h" | ||||
|  | ||||
| #include <stddef.h> | ||||
|  | ||||
| // Define offset variables. The build compiles this file to assembly and | ||||
| // extracts their values into constant defines used only by ARM assembly code. | ||||
| int offset_aecm_dfaCleanQDomain = offsetof(AecmCore_t, dfaCleanQDomain); | ||||
| int offset_aecm_outBuf = offsetof(AecmCore_t, outBuf); | ||||
| int offset_aecm_xBuf = offsetof(AecmCore_t, xBuf); | ||||
| int offset_aecm_dBufNoisy = offsetof(AecmCore_t, dBufNoisy); | ||||
| int offset_aecm_dBufClean = offsetof(AecmCore_t, dBufClean); | ||||
| int offset_aecm_channelStored = offsetof(AecmCore_t, channelStored); | ||||
| int offset_aecm_channelAdapt16 = offsetof(AecmCore_t, channelAdapt16); | ||||
| int offset_aecm_channelAdapt32 = offsetof(AecmCore_t, channelAdapt32); | ||||
|  | ||||
							
								
								
									
										98
									
								
								src/modules/audio_processing/aecm/aecm_defines.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								src/modules/audio_processing/aecm/aecm_defines.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,98 @@ | ||||
| /* | ||||
|  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | ||||
|  * | ||||
|  *  Use of this source code is governed by a BSD-style license | ||||
|  *  that can be found in the LICENSE file in the root of the source | ||||
|  *  tree. An additional intellectual property rights grant can be found | ||||
|  *  in the file PATENTS.  All contributing project authors may | ||||
|  *  be found in the AUTHORS file in the root of the source tree. | ||||
|  */ | ||||
|  | ||||
| #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_DEFINES_H_ | ||||
| #define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_DEFINES_H_ | ||||
|  | ||||
| #define AECM_DYNAMIC_Q                 /* Turn on/off dynamic Q-domain. */ | ||||
|  | ||||
| /* #define AECM_SHORT                   For 32 sample partition length. */ | ||||
|  | ||||
| /* Algorithm parameters */ | ||||
| #define FRAME_LEN       80             /* Total frame length, 10 ms. */ | ||||
|  | ||||
| #ifdef AECM_SHORT | ||||
| #define PART_LEN        32             /* Length of partition. */ | ||||
| #define PART_LEN_SHIFT  6              /* log2(PART_LEN * 2): 2^6 == 64. */ | ||||
| #else | ||||
| #define PART_LEN        64             /* Length of partition. */ | ||||
| #define PART_LEN_SHIFT  7              /* log2(PART_LEN * 2): 2^7 == 128. */ | ||||
| #endif | ||||
|  | ||||
| #define PART_LEN1       (PART_LEN + 1)  /* Unique fft coefficients. */ | ||||
| #define PART_LEN2       (PART_LEN << 1) /* Length of partition * 2. */ | ||||
| #define PART_LEN4       (PART_LEN << 2) /* Length of partition * 4. */ | ||||
| #define FAR_BUF_LEN     PART_LEN4       /* Length of buffers. */ | ||||
| #define MAX_DELAY       100 | ||||
|  | ||||
| /* Counter parameters */ | ||||
| #ifdef AECM_SHORT | ||||
| #define CONV_LEN        1024         /* Convergence length used at startup. */ | ||||
| #else | ||||
| #define CONV_LEN        512          /* Convergence length used at startup. */ | ||||
| #endif | ||||
| #define CONV_LEN2       (CONV_LEN << 1) /* Used at startup. */ | ||||
|  | ||||
| /* Energy parameters */ | ||||
| #define MAX_BUF_LEN     64           /* History length of energy signals. */ | ||||
| #define FAR_ENERGY_MIN  1025         /* Lowest Far energy level: At least 2 */ | ||||
|                                      /* in energy. */ | ||||
| #define FAR_ENERGY_DIFF 929          /* Allowed difference between max */ | ||||
|                                      /* and min. */ | ||||
| #define ENERGY_DEV_OFFSET       0    /* The energy error offset in Q8. */ | ||||
| #define ENERGY_DEV_TOL  400          /* The energy estimation tolerance (Q8). */ | ||||
| #define FAR_ENERGY_VAD_REGION   230  /* Far VAD tolerance region. */ | ||||
|  | ||||
| /* Stepsize parameters */ | ||||
| #define MU_MIN          10          /* Min stepsize 2^-MU_MIN (far end energy */ | ||||
|                                     /* dependent). */ | ||||
| #define MU_MAX          1           /* Max stepsize 2^-MU_MAX (far end energy */ | ||||
|                                     /* dependent). */ | ||||
| #define MU_DIFF         9           /* MU_MIN - MU_MAX */ | ||||
|  | ||||
| /* Channel parameters */ | ||||
| #define MIN_MSE_COUNT   20 /* Min number of consecutive blocks with enough */ | ||||
|                            /* far end energy to compare channel estimates. */ | ||||
| #define MIN_MSE_DIFF    29 /* The ratio between adapted and stored channel to */ | ||||
|                            /* accept a new storage, in Q-MSE_RESOLUTION. NOTE(review): 29/32 is ~0.91, documented as 0.8 - confirm. */ | ||||
| #define MSE_RESOLUTION  5           /* MSE parameter resolution. */ | ||||
| #define RESOLUTION_CHANNEL16    12  /* W16 Channel in Q-RESOLUTION_CHANNEL16. */ | ||||
| #define RESOLUTION_CHANNEL32    28  /* W32 Channel in Q-RESOLUTION_CHANNEL32. */ | ||||
| #define CHANNEL_VAD     16          /* Minimum energy in frequency band */ | ||||
|                                     /* to update channel. */ | ||||
|  | ||||
| /* Suppression gain parameters: SUPGAIN parameters in Q-(RESOLUTION_SUPGAIN). */ | ||||
| #define RESOLUTION_SUPGAIN      8     /* Channel in Q-(RESOLUTION_SUPGAIN). */ | ||||
| #define SUPGAIN_DEFAULT (1 << RESOLUTION_SUPGAIN)  /* Default. */ | ||||
| #define SUPGAIN_ERROR_PARAM_A   3072  /* Estimation error parameter */ | ||||
|                                       /* (Maximum gain). NOTE(review): 3072 is 12 in Q8, documented as 8 - confirm. */ | ||||
| #define SUPGAIN_ERROR_PARAM_B   1536  /* Estimation error parameter */ | ||||
|                                       /* (Gain before going down). */ | ||||
| #define SUPGAIN_ERROR_PARAM_D   SUPGAIN_DEFAULT /* Estimation error parameter */ | ||||
|                                 /* (Should be the same as Default) (1 in Q8). */ | ||||
| #define SUPGAIN_EPC_DT  200     /* SUPGAIN_ERROR_PARAM_C * ENERGY_DEV_TOL */ | ||||
|  | ||||
| /* Defines for "check delay estimation" */ | ||||
| #define CORR_WIDTH      31      /* Number of samples to correlate over. */ | ||||
| #define CORR_MAX        16      /* Maximum correlation offset. */ | ||||
| #define CORR_MAX_BUF    63 | ||||
| #define CORR_DEV        4 | ||||
| #define CORR_MAX_LEVEL  20 | ||||
| #define CORR_MAX_LOW    4 | ||||
| #define CORR_BUF_LEN    ((CORR_MAX << 1) + 1)  /* 2 * CORR_MAX + 1; parenthesized so it is safe inside larger expressions. */ | ||||
| /* Note that CORR_WIDTH + 2*CORR_MAX <= MAX_BUF_LEN. */ | ||||
|  | ||||
| #define ONE_Q14         (1 << 14) | ||||
|  | ||||
| /* NLP defines */ | ||||
| #define NLP_COMP_LOW    3277    /* 0.2 in Q14 */ | ||||
| #define NLP_COMP_HIGH   ONE_Q14 /* 1 in Q14 */ | ||||
|  | ||||
| #endif | ||||
		Reference in New Issue
	
	Block a user
	 kma@webrtc.org
					kma@webrtc.org