Fixed and enabled ARM assembly code in AECM and NS.

Review URL: https://webrtc-codereview.appspot.com/860005

git-svn-id: http://webrtc.googlecode.com/svn/trunk@3060 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2012-11-07 22:34:31 +00:00
parent 31eae47444
commit 12454028bc
8 changed files with 113 additions and 99 deletions

View File

@@ -56,20 +56,18 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_MODULE := libwebrtc_aecm_neon LOCAL_MODULE := libwebrtc_aecm_neon
LOCAL_MODULE_TAGS := optional LOCAL_MODULE_TAGS := optional
GEN := $(LOCAL_PATH)/aecm_core_neon_offsets.h AECM_ASM_HEADER := $(intermediates)/aecm_core_neon_offsets.h
AECM_ASM_HEADER_DIR := $(intermediates)
# Generate a header file aecm_core_neon_offsets.h which will be included in # Generate a header file aecm_core_neon_offsets.h which will be included in
# assembly file aecm_core_neon.S, from file aecm_core_neon_offsets.c. # assembly file aecm_core_neon.S, from file aecm_core_neon_offsets.c.
$(GEN): $(LOCAL_PATH)/../../../build/generate_asm_header.py \ $(AECM_ASM_HEADER): $(LOCAL_PATH)/../../../build/generate_asm_header.py \
$(intermediates)/aecm_core_neon_offsets.S
@python $^ $@ offset_aecm_
$(intermediates)/aecm_core_neon_offsets.S: \
$(LOCAL_PATH)/aecm_core_neon_offsets.c $(LOCAL_PATH)/aecm_core_neon_offsets.c
@$(TARGET_CC) $(addprefix -I, $(LOCAL_INCLUDES)) $(addprefix -isystem ,\ @python $^ --compiler=$(TARGET_CC) --options="$(addprefix -I, \
$(TARGET_C_INCLUDES)) -S -o $@ $^ $(LOCAL_INCLUDES)) $(addprefix -isystem , $(TARGET_C_INCLUDES)) -S" \
--dir=$(AECM_ASM_HEADER_DIR)
LOCAL_GENERATED_SOURCES := $(GEN) LOCAL_GENERATED_SOURCES := $(AECM_ASM_HEADER)
LOCAL_SRC_FILES := aecm_core_neon.S LOCAL_SRC_FILES := aecm_core_neon.S
# Flags passed to both C and C++ files. # Flags passed to both C and C++ files.
@@ -80,6 +78,7 @@ LOCAL_CFLAGS := \
-flax-vector-conversions -flax-vector-conversions
LOCAL_C_INCLUDES := \ LOCAL_C_INCLUDES := \
$(AECM_ASM_HEADER_DIR) \
$(LOCAL_PATH)/include \ $(LOCAL_PATH)/include \
$(LOCAL_PATH)/../../.. \ $(LOCAL_PATH)/../../.. \
$(LOCAL_PATH)/../../../common_audio/signal_processing/include $(LOCAL_PATH)/../../../common_audio/signal_processing/include

View File

@@ -13,10 +13,9 @@
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_ #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_ #define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "modules/audio_processing/aecm/aecm_defines.h"
#include "typedefs.h" #include "typedefs.h"
#include "signal_processing_library.h"
#include "aecm_defines.h"
#ifdef _MSC_VER // visual c++ #ifdef _MSC_VER // visual c++
#define ALIGN8_BEG __declspec(align(8)) #define ALIGN8_BEG __declspec(align(8))

View File

@@ -26,66 +26,64 @@
.global WebRtcAecm_StoreAdaptiveChannelNeon .global WebRtcAecm_StoreAdaptiveChannelNeon
.global WebRtcAecm_ResetAdaptiveChannelNeon .global WebRtcAecm_ResetAdaptiveChannelNeon
@ void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft, @ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
@ WebRtc_Word16* fft,
@ const WebRtc_Word16* time_signal, @ const WebRtc_Word16* time_signal,
@ complex16_t* freq_signal, @ complex16_t* freq_signal,
@ int time_signal_scaling); @ int time_signal_scaling);
.align 2 .align 2
WebRtcAecm_WindowAndFFTNeon: WebRtcAecm_WindowAndFFTNeon:
.fnstart .fnstart
.save {r4, r5, lr} .save {r4, r5, r6, lr}
push {r4, r5, lr} push {r4, r5, r6, lr}
vdup.16 d16, r3 ldr r12, [sp, #16] @ time_signal_scaling
mov r5, r2 @ WebRtcSpl_ComplexIFFT changes r2. vdup.16 d16, r12
vmov.i16 d21, #0 @ For imaginary parts of |fft|. vmov.i16 d21, #0 @ For imaginary parts of |fft|.
vmov.i16 d27, #0 @ For imaginary parts of |fft|. vmov.i16 d27, #0 @ For imaginary parts of |fft|.
ldr r2, =WebRtcAecm_kSqrtHanning ldr r5, =WebRtcAecm_kSqrtHanning
adr lr, kSqrtHanningReversed adr lr, kSqrtHanningReversed
add r4, r0, #(PART_LEN2 * 2) @ &fft[PART_LEN2] add r4, r1, #(PART_LEN2 * 2) @ &fft[PART_LEN2]
add r12, r1, #(PART_LEN * 2) @ time_signal[PART_LEN] add r12, r2, #(PART_LEN * 2) @ time_signal[PART_LEN]
mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4 mov r6, #(PART_LEN / 4) @ Loop counter, unrolled by 4
LOOP_PART_LEN: LOOP_PART_LEN:
vld1.16 d0, [r1, :64]! @ time_signal[i] vld1.16 d0, [r2, :64]! @ time_signal[i]
vld1.16 d22, [r12, :64]! @ time_signal[i + PART_LEN] vld1.16 d22, [r12, :64]! @ time_signal[i + PART_LEN]
vld1.16 d17, [r2, :64]! @ WebRtcAecm_kSqrtHanning[i] vld1.16 d17, [r5, :64]! @ WebRtcAecm_kSqrtHanning[i]
vld1.16 d23, [lr, :64]! @ kSqrtHanningReversed[i] vld1.16 d23, [lr, :64]! @ kSqrtHanningReversed[i]
vshl.s16 d18, d0, d16 vshl.s16 d18, d0, d16
vshl.s16 d22, d22, d16 vshl.s16 d22, d22, d16
vmull.s16 q9, d18, d17 vmull.s16 q9, d18, d17
vmull.s16 q12, d22, d23 vmull.s16 q12, d22, d23
subs r3, #1 subs r6, #1
vshrn.i32 d20, q9, #14 vshrn.i32 d20, q9, #14
vshrn.i32 d26, q12, #14 vshrn.i32 d26, q12, #14
vst2.16 {d20, d21}, [r0, :128]! @ fft[j] vst2.16 {d20, d21}, [r1, :128]! @ fft[j]
vst2.16 {d26, d27}, [r4, :128]! @ fft[PART_LEN2 + j] vst2.16 {d26, d27}, [r4, :128]! @ fft[PART_LEN2 + j]
bgt LOOP_PART_LEN bgt LOOP_PART_LEN
sub r4, r0, #(PART_LEN2 * 2) @ r4 points to fft[0] @ WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal);
mov r0, r4 ldr r12, =offset_aecm_real_fft
mov r1, #7 sub r1, #(PART_LEN * 4) @ Get r1 back to &fft[0].
bl WebRtcSpl_ComplexBitReverse mov r2, r3 @ freq_signal
mov r4, r3
ldr r0, [r0, r12] @ aecm->real_fft
bl WebRtcSpl_RealForwardFFTNeon
mov r0, r4 mov r12, #(PART_LEN * 2 / 16) @ Loop counter, unrolled by 16.
mov r1, #7
mov r2, #1
bl WebRtcSpl_ComplexFFT
mov r3, #(PART_LEN * 2 / 16) @ Loop counter, unrolled by 16.
LOOP_PART_LEN2: LOOP_PART_LEN2:
@ freq_signal[i].real = fft[j]; @ freq_signal[i].imag = - freq_signal[i].imag;
@ freq_signal[i].imag = - fft[j+1]; vld2.16 {d20, d21, d22, d23}, [r4, :256]
vld2.16 {d20, d21, d22, d23}, [r4, :256]! subs r12, #1
subs r3, #1
vneg.s16 d22, d22 vneg.s16 d22, d22
vneg.s16 d23, d23 vneg.s16 d23, d23
vst2.16 {d20, d21, d22, d23}, [r5, :256]! vst2.16 {d20, d21, d22, d23}, [r4, :256]!
bgt LOOP_PART_LEN2 bgt LOOP_PART_LEN2
pop {r4, r5, pc} pop {r4, r5, r6, pc}
.fnend .fnend
@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, @ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
@@ -123,29 +121,18 @@ LOOP_PRE_IFFT:
@ fft[PART_LEN2] = efw[PART_LEN].real; @ fft[PART_LEN2] = efw[PART_LEN].real;
@ fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; @ fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
ldr r8, [r12] ldr r8, [r12]
ssub16 r2, r6, r8 ssub16 r12, r6, r8
mov r1, #(PART_LEN2 * 2) mov r3, #(PART_LEN2 * 2)
pkhbt r8, r8, r2 pkhbt r8, r8, r12
str r8, [r4, r1] str r8, [r4, r3]
mov r0, r4 @ outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw);
mov r1, #7 ldr r12, =offset_aecm_real_fft
bl WebRtcSpl_ComplexBitReverse sub r1, #(PART_LEN * 4) @ Get r1 back to &fft[0].
sub r2, #(PART_LEN * 4) @ Get r2 back to &efw[0].
mov r0, r4 mov r4, r2 @ Keep efw in r4.
mov r1, #7 ldr r0, [r0, r12] @ aecm->real_fft
mov r2, #1 bl WebRtcSpl_RealInverseFFTNeon
bl WebRtcSpl_ComplexIFFT
mov r1, r4
mov r2, r4
mov r3, #(PART_LEN * 2 / 8) @ Loop counter, unrolled by 8.
LOOP_GET_REAL_VALUES:
vld2.16 {q10, q11}, [r2, :256]!
subs r3, #1
vst1.16 {q10}, [r1, :128]!
bgt LOOP_GET_REAL_VALUES
ldr r6, =offset_aecm_outBuf ldr r6, =offset_aecm_outBuf
ldr r12, =offset_aecm_dfaCleanQDomain ldr r12, =offset_aecm_dfaCleanQDomain
@@ -156,24 +143,24 @@ LOOP_GET_REAL_VALUES:
ldr r6, =WebRtcAecm_kSqrtHanning ldr r6, =WebRtcAecm_kSqrtHanning
rsb r0, r2, r0 @ outCFFT - aecm->dfaCleanQDomain rsb r0, r2, r0 @ outCFFT - aecm->dfaCleanQDomain
vdup.32 q9, r0 vdup.32 q9, r0
add r0, r4, #(PART_LEN * 2) @ &fft[PART_LEN] add r0, r4, #(PART_LEN * 4) @ &efw[PART_LEN]
mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4. mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4
LOOP_POST_IFFT: LOOP_POST_IFFT:
vld1.16 d16, [r4, :64] @ fft[i]; vld2.16 {d4, d5}, [r4, :128] @ &efw[i];
vld1.16 d17, [r6, :64]! @ WebRtcAecm_kSqrtHanning[i] vld1.16 d17, [r6, :64]! @ WebRtcAecm_kSqrtHanning[i]
vld1.16 d20, [r8, :64] @ aecm->outBuf[i] vld1.16 d20, [r8, :64] @ aecm->outBuf[i]
vmull.s16 q8, d16, d17 vmull.s16 q8, d4, d17
vmovl.s16 q10, d20 vmovl.s16 q10, d20
vrshr.s32 q8, q8, #14 vrshr.s32 q8, q8, #14
vld1.16 d0, [r0, :64]! @ &fft[PART_LEN + i] vld1.16 d0, [r0, :64]! @ &efw[PART_LEN + i]
vshl.s32 q8, q8, q9 vshl.s32 q8, q8, q9
vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i] vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i]
vadd.i32 q8, q10 vadd.i32 q8, q10
vmull.s16 q0, d0, d1 vmull.s16 q0, d0, d1
vqshrn.s32 d16, q8, #0 vqshrn.s32 d4, q8, #0
vshr.s32 q0, q0, #14 vshr.s32 q0, q0, #14
vst1.16 d16, [r4, :64]! @ fft[i]; vst2.16 {d4, d5}, [r4, :128]! @ &efw[i];
vshl.s32 q0, q0, q9 vshl.s32 q0, q0, q9
vst1.16 d16, [r7, :64]! @ output[i] vst1.16 d16, [r7, :64]! @ output[i]
vqshrn.s32 d0, q0, #0 vqshrn.s32 d0, q0, #0
@@ -197,7 +184,7 @@ LOOP_COPY:
vst1.16 {q12, q13}, [r1, :256]! vst1.16 {q12, q13}, [r1, :256]!
bgt LOOP_COPY bgt LOOP_COPY
ldr r2, [sp, #24] ldr r2, [sp, #16]
cmp r2, #0 @ Check if (nearendClean != NULL). cmp r2, #0 @ Check if (nearendClean != NULL).
beq END beq END

View File

@@ -23,4 +23,4 @@ int offset_aecm_dBufClean = offsetof(AecmCore_t, dBufClean);
int offset_aecm_channelStored = offsetof(AecmCore_t, channelStored); int offset_aecm_channelStored = offsetof(AecmCore_t, channelStored);
int offset_aecm_channelAdapt16 = offsetof(AecmCore_t, channelAdapt16); int offset_aecm_channelAdapt16 = offsetof(AecmCore_t, channelAdapt16);
int offset_aecm_channelAdapt32 = offsetof(AecmCore_t, channelAdapt32); int offset_aecm_channelAdapt32 = offsetof(AecmCore_t, channelAdapt32);
int offset_aecm_real_fft = offsetof(AecmCore_t, real_fft);

View File

@@ -159,19 +159,50 @@
], ],
}], }],
['target_arch=="arm" and armv7==1', { ['target_arch=="arm" and armv7==1', {
'targets': [ 'targets': [{
{ 'target_name': 'audio_processing_neon',
'target_name': 'audio_processing_neon', 'type': 'static_library',
'type': 'static_library', 'includes': ['../../build/arm_neon.gypi',],
'includes': ['../../build/arm_neon.gypi',], 'dependencies': [
'dependencies': [ '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
'<(webrtc_root)/common_audio/common_audio.gyp:signal_processing', ],
], 'sources': [
'sources': [ 'aecm/aecm_core_neon.c',
'aecm/aecm_core_neon.c', 'ns/nsx_core_neon.c',
'ns/nsx_core_neon.c', ],
], 'conditions': [
}, ['OS=="android"', {
'dependencies': [
'audio_processing_offsets',
],
# TODO(kma): port this block from Android into other build systems.
'sources': [
'aecm/aecm_core_neon.S',
'ns/nsx_core_neon.S',
],
'sources!': [
'aecm/aecm_core_neon.c',
'ns/nsx_core_neon.c',
],
'includes!': ['../../build/arm_neon.gypi',],
}],
],
}],
'conditions': [
['OS=="android"', {
'targets': [{
'target_name': 'audio_processing_offsets',
'type': 'none',
'sources': [
'aecm/aecm_core_neon_offsets.c',
'ns/nsx_core_neon_offsets.c',
],
'variables': {
'asm_header_dir': 'asm_offsets',
},
'includes': ['../../build/generate_asm_header.gypi',],
}],
}],
], ],
}], }],
], ],

View File

@@ -57,19 +57,18 @@ LOCAL_ARM_MODE := arm
LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_MODULE := libwebrtc_ns_neon LOCAL_MODULE := libwebrtc_ns_neon
LOCAL_MODULE_TAGS := optional LOCAL_MODULE_TAGS := optional
GEN := $(LOCAL_PATH)/nsx_core_neon_offsets.h NS_ASM_HEADER := $(intermediates)/ns_core_neon_offsets.h
NS_ASM_HEADER_DIR := $(intermediates)
# Generate a header file nsx_core_neon_offsets.h which will be included in # Generate a header file nsx_core_neon_offsets.h which will be included in
# assembly file nsx_core_neon.S, from file nsx_core_neon_offsets.c. # assembly file nsx_core_neon.S, from file nsx_core_neon_offsets.c.
$(GEN): $(LOCAL_PATH)/../../../build/generate_asm_header.py \ $(NS_ASM_HEADER): $(LOCAL_PATH)/../../../build/generate_asm_header.py \
$(intermediates)/nsx_core_neon_offsets.S $(LOCAL_PATH)/nsx_core_neon_offsets.c
@python $^ $@ offset_nsx_ @python $^ --compiler=$(TARGET_CC) --options="$(addprefix -I, \
$(LOCAL_INCLUDES)) $(addprefix -isystem , $(TARGET_C_INCLUDES)) -S" \
--dir=$(NS_ASM_HEADER_DIR)
$(intermediates)/nsx_core_neon_offsets.S: $(LOCAL_PATH)/nsx_core_neon_offsets.c LOCAL_GENERATED_SOURCES := $(NS_ASM_HEADER)
@$(TARGET_CC) $(addprefix -I, $(LOCAL_INCLUDES)) $(addprefix -isystem ,\
$(TARGET_C_INCLUDES)) -S -o $@ $^
LOCAL_GENERATED_SOURCES := $(GEN)
LOCAL_SRC_FILES := nsx_core_neon.S LOCAL_SRC_FILES := nsx_core_neon.S
# Flags passed to both C and C++ files. # Flags passed to both C and C++ files.
@@ -80,6 +79,7 @@ LOCAL_CFLAGS := \
-flax-vector-conversions -flax-vector-conversions
LOCAL_C_INCLUDES := \ LOCAL_C_INCLUDES := \
$(NS_ASM_HEADER_DIR) \
$(LOCAL_PATH)/include \ $(LOCAL_PATH)/include \
$(LOCAL_PATH)/../../.. \ $(LOCAL_PATH)/../../.. \
$(LOCAL_PATH)/../../../common_audio/signal_processing/include $(LOCAL_PATH)/../../../common_audio/signal_processing/include

View File

@@ -11,10 +11,9 @@
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_ #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_ #define WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "modules/audio_processing/ns/nsx_defines.h"
#include "typedefs.h" #include "typedefs.h"
#include "signal_processing_library.h"
#include "nsx_defines.h"
#ifdef NS_FILEDEBUG #ifdef NS_FILEDEBUG
#include <stdio.h> #include <stdio.h>

View File

@@ -335,7 +335,7 @@ UpdateNoiseEstimateNeon:
mov r0, r4 mov r0, r4
mov r1, r6 mov r1, r6
bl WebRtcSpl_MaxValueW16 bl WebRtcSpl_MaxValueW16Neon
sub r12, r6, #1 @ Loop counter: inst->magnLen - 1. sub r12, r6, #1 @ Loop counter: inst->magnLen - 1.
@@ -351,7 +351,6 @@ UpdateNoiseEstimateNeon:
vdup.32 q13, r0 vdup.32 q13, r0
str r0, [r5, r1] str r0, [r5, r1]
LOOP_UPDATE: LOOP_UPDATE:
vld1.16 {d0, d1}, [r4]! @ &inst->noiseEstLogQuantile[offset + i] vld1.16 {d0, d1}, [r4]! @ &inst->noiseEstLogQuantile[offset + i]
vmull.s16 q1, d0, d16 vmull.s16 q1, d0, d16