Porting ARM optimizations from Android to iOS.

Tested APM and iSAC on Android. Bit-exact with original versions.
Changes include removing or changing some GCC directives (e.g. .fnstart, .hword), instruction syntax, etc.
Review URL: https://webrtc-codereview.appspot.com/934009

git-svn-id: http://webrtc.googlecode.com/svn/trunk@3124 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2012-11-17 00:22:46 +00:00
parent 2ec58dc4d1
commit 55cd78cfc2
22 changed files with 189 additions and 281 deletions

View File

@ -60,7 +60,7 @@ LOCAL_C_INCLUDES := \
ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
LOCAL_SRC_FILES += \
filter_ar_fast_q12_armv7.s
filter_ar_fast_q12_armv7.S
else
LOCAL_SRC_FILES += \
filter_ar_fast_q12.c
@ -68,8 +68,8 @@ endif
ifeq ($(TARGET_ARCH),arm)
LOCAL_SRC_FILES += \
complex_bit_reverse_arm.s \
spl_sqrt_floor_arm.s
complex_bit_reverse_arm.S \
spl_sqrt_floor_arm.S
else
LOCAL_SRC_FILES += \
complex_bit_reverse.c \
@ -102,10 +102,10 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_MODULE := libwebrtc_spl_neon
LOCAL_MODULE_TAGS := optional
LOCAL_SRC_FILES := \
cross_correlation_neon.s \
downsample_fast_neon.s \
min_max_operations_neon.s \
vector_scaling_operations_neon.s
cross_correlation_neon.S \
downsample_fast_neon.S \
min_max_operations_neon.S \
vector_scaling_operations_neon.S
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \

View File

@ -12,15 +12,11 @@
@ for ARMv5 platforms.
@ Reference C code is in file complex_bit_reverse.c. Bit-exact.
.arch armv5
.global WebRtcSpl_ComplexBitReverse
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_ComplexBitReverse
.align 2
WebRtcSpl_ComplexBitReverse:
.fnstart
DEFINE_FUNCTION WebRtcSpl_ComplexBitReverse
push {r4-r7}
cmp r1, #7
@ -88,39 +84,36 @@ END:
pop {r4-r7}
bx lr
.fnend
@ The index tables. Note the values are doubles of the actual indexes for 16-bit
@ elements, different from the generic C code. It actually provides byte offsets
@ for the indexes.
.align 2
index_7: @ Indexes for stages == 7.
.hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
.hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
.hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
.hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
.hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
.hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
.hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
.hword 468, 364, 436, 380, 500, 412, 460, 444, 492
.short 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
.short 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
.short 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
.short 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
.short 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
.short 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
.short 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
.short 468, 364, 436, 380, 500, 412, 460, 444, 492
index_8: @ Indexes for stages == 8.
.hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
.hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
.hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
.hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
.hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
.hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
.hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
.hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
.hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
.hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
.hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
.hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
.hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
.hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
.hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
.hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
.hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988
.short 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
.short 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
.short 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
.short 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
.short 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
.short 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
.short 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
.short 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
.short 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
.short 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
.short 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
.short 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
.short 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
.short 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
.short 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
.short 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
.short 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988

View File

@ -29,24 +29,18 @@
@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
@ r8, r9, r10, r11, r12: scratch
.arch armv7-a
.fpu neon
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
.align 2
.global WebRtcSpl_CrossCorrelationNeon
WebRtcSpl_CrossCorrelationNeon:
.fnstart
.save {r4-r11}
DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
push {r4-r11}
@ Put the shift value (-right_shifts) into a Neon register.
ldrsh r10, [sp, #36]
rsb r10, r10, #0
mov r8, r10, asr #31
vmov.32 d16, r10, r8
vmov d16, r10, r8
@ Initialize loop counters.
and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8;
@ -105,9 +99,6 @@ POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift.
pop {r4-r11}
bx lr
.fnend
@ TODO(kma): Place this piece of reference code into a C code file.
@ void WebRtcSpl_CrossCorrelationNeon(WebRtc_Word32* cross_correlation,
@ WebRtc_Word16* seq1,

View File

@ -14,17 +14,11 @@
@
@ The reference C code is in file downsample_fast.c. Bit-exact.
.arch armv7-a
.fpu neon
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
.align 2
.global WebRtcSpl_DownsampleFastNeon
WebRtcSpl_DownsampleFastNeon:
.fnstart
.save {r4-r11}
DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
push {r4-r11}
cmp r3, #0 @ data_out_length <= 0?
@ -168,14 +162,15 @@ LOOP_COEFF_LENGTH_FACTOR4:
vmlal.s16 q3, d18, d17
bge LOOP_COEFF_LENGTH_FACTOR4
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
add r9, r5, asl #3 @ Counter i = delay + factor * 8.
@ Shift, saturate, and store the result.
vqshrn.s32 d0, q2, #12
vqshrn.s32 d1, q3, #12
cmp r9, r3 @ i < endpos - factor * 7 ?
vst1.16 {d0, d1}, [r2]!
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
add r9, r5, asl #3 @ Counter i = delay + factor * 8.
cmp r9, r3 @ i < endpos - factor * 7 ?
blt LOOP_ENDPOS_FACTOR4
@
@ -218,5 +213,3 @@ LOOP2_COEFF_LENGTH:
END:
pop {r4-r11}
bx lr
.fnend

View File

@ -35,16 +35,11 @@
@ r11: Scratch
@ r12: &coefficients[j]
.arch armv7-a
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
.align 2
.global WebRtcSpl_FilterARFastQ12
WebRtcSpl_FilterARFastQ12:
.fnstart
.save {r4-r11}
DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
push {r4-r11}
ldrsh r12, [sp, #32] @ data_length
@ -155,9 +150,6 @@ END:
pop {r4-r11}
bx lr
.fnend
@Reference C code:
@
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,

View File

@ -166,7 +166,7 @@ static __inline int WebRtcSpl_NormW16(WebRtc_Word16 a) {
static __inline WebRtc_Word16 WebRtcSpl_SatW32ToW16(WebRtc_Word32 value32) {
WebRtc_Word16 out16 = 0;
__asm __volatile ("ssat %r0, #16, %r1" : "=r"(out16) : "r"(value32));
__asm __volatile ("ssat %0, #16, %1" : "=r"(out16) : "r"(value32));
return out16;
}

View File

@ -15,20 +15,18 @@
@ The reference C code is in file min_max_operations.c. Code here is basically
@ a loop unrolling by 8 with Neon instructions. Bit-exact.
.arch armv7-a
.fpu neon
.global WebRtcSpl_MaxAbsValueW16Neon
.global WebRtcSpl_MaxAbsValueW32Neon
.global WebRtcSpl_MaxValueW16Neon
.global WebRtcSpl_MaxValueW32Neon
.global WebRtcSpl_MinValueW16Neon
.global WebRtcSpl_MinValueW32Neon
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
.align 2
@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
WebRtcSpl_MaxAbsValueW16Neon:
.fnstart
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
mov r2, #-1 @ Initialize the return value.
cmp r0, #0
beq END_MAX_ABS_VALUE_W16
@ -50,8 +48,8 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
@ Find the maximum value in the Neon registers and move it to r2.
vmax.u16 d24, d25
vpmax.u16 d24, d24
vpmax.u16 d24, d24
vpmax.u16 d24, d24, d24
vpmax.u16 d24, d24, d24
adds r1, #8
vmov.u16 r2, d24[0]
beq END_MAX_ABS_VALUE_W16
@ -71,12 +69,10 @@ END_MAX_ABS_VALUE_W16:
mov r0, r2
bx lr
.fnend
@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
WebRtcSpl_MaxAbsValueW32Neon:
.fnstart
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
cmp r0, #0
moveq r0, #-1
beq EXIT @ Return -1 for a NULL pointer.
@ -103,7 +99,7 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
@ Find the maximum value in the Neon registers and move it to r2.
vmax.u32 q12, q11
vmax.u32 d24, d25
vpmax.u32 d24, d24
vpmax.u32 d24, d24, d24
adds r1, #8
vmov.u32 r2, d24[0]
beq END_MAX_ABS_VALUE_W32
@ -125,12 +121,8 @@ END_MAX_ABS_VALUE_W32:
EXIT:
bx lr
.fnend
@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
WebRtcSpl_MaxValueW16Neon:
.fnstart
DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
mov r2, #0x8000 @ Initialize the return value.
cmp r0, #0
beq END_MAX_VALUE_W16
@ -151,8 +143,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
@ Find the maximum value in the Neon registers and move it to r2.
vmax.s16 d24, d25
vpmax.s16 d24, d24
vpmax.s16 d24, d24
vpmax.s16 d24, d24, d24
vpmax.s16 d24, d24, d24
adds r1, #8
vmov.u16 r2, d24[0]
beq END_MAX_VALUE_W16
@ -168,12 +160,8 @@ END_MAX_VALUE_W16:
mov r0, r2
bx lr
.fnend
@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
WebRtcSpl_MaxValueW32Neon:
.fnstart
DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
mov r2, #0x80000000 @ Initialize the return value.
cmp r0, #0
beq END_MAX_VALUE_W32
@ -196,8 +184,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
@ Find the maximum value in the Neon registers and move it to r2.
vmax.s32 q12, q11
vpmax.s32 d24, d25
vpmax.s32 d24, d24
vpmax.s32 d24, d24, d25
vpmax.s32 d24, d24, d24
adds r1, #8
vmov.s32 r2, d24[0]
beq END_MAX_VALUE_W32
@ -213,12 +201,8 @@ END_MAX_VALUE_W32:
mov r0, r2
bx lr
.fnend
@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
WebRtcSpl_MinValueW16Neon:
.fnstart
DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
movw r2, #0x7FFF @ Initialize the return value.
cmp r0, #0
beq END_MIN_VALUE_W16
@ -239,8 +223,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
@ Find the minimum value in the Neon registers and move it to r2.
vmin.s16 d24, d25
vpmin.s16 d24, d24
vpmin.s16 d24, d24
vpmin.s16 d24, d24, d24
vpmin.s16 d24, d24, d24
adds r1, #8
vmov.s16 r2, d24[0]
sxth r2, r2
@ -257,12 +241,8 @@ END_MIN_VALUE_W16:
mov r0, r2
bx lr
.fnend
@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
WebRtcSpl_MinValueW32Neon:
.fnstart
DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
mov r2, #0x7FFFFFFF @ Initialize the return value.
cmp r0, #0
beq END_MIN_VALUE_W32
@ -285,8 +265,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
@ Find the minimum value in the Neon registers and move it to r2.
vmin.s32 q12, q11
vpmin.s32 d24, d25
vpmin.s32 d24, d24
vpmin.s32 d24, d24, d25
vpmin.s32 d24, d24, d24
adds r1, #8
vmov.s32 r2, d24[0]
beq END_MIN_VALUE_W32
@ -301,5 +281,3 @@ LOOP_MIN_VALUE_W32:
END_MIN_VALUE_W32:
mov r0, r2
bx lr
.fnend

View File

@ -31,7 +31,7 @@ static __inline WebRtc_Word32 MUL_ACCUM_1(WebRtc_Word32 tbl_value,
WebRtc_Word32 diff,
WebRtc_Word32 state) {
WebRtc_Word32 result;
__asm__("smlawb %r0, %r1, %r2, %r3": "=r"(result): "r"(diff),
__asm __volatile ("smlawb %0, %1, %2, %3": "=r"(result): "r"(diff),
"r"(tbl_value), "r"(state));
return result;
}
@ -47,7 +47,7 @@ static __inline WebRtc_Word32 MUL_ACCUM_2(WebRtc_Word32 tbl_value,
WebRtc_Word32 diff,
WebRtc_Word32 state) {
WebRtc_Word32 result;
__asm__("smmla %r0, %r1, %r2, %r3": "=r"(result): "r"(diff << 1),
__asm __volatile ("smmla %0, %1, %2, %3": "=r"(result): "r"(diff << 1),
"r"(tbl_value), "r"(state));
return result;
}

View File

@ -65,8 +65,8 @@
'conditions': [
['target_arch=="arm"', {
'sources': [
'complex_bit_reverse_arm.s',
'spl_sqrt_floor_arm.s',
'complex_bit_reverse_arm.S',
'spl_sqrt_floor_arm.S',
],
'sources!': [
'complex_bit_reverse.c',
@ -76,7 +76,7 @@
['armv7==1', {
'dependencies': ['signal_processing_neon',],
'sources': [
'filter_ar_fast_q12_armv7.s',
'filter_ar_fast_q12_armv7.S',
],
'sources!': [
'filter_ar_fast_q12.c',
@ -112,10 +112,10 @@
'type': '<(library)',
'includes': ['../../build/arm_neon.gypi',],
'sources': [
'cross_correlation_neon.s',
'downsample_fast_neon.s',
'min_max_operations_neon.s',
'vector_scaling_operations_neon.s',
'cross_correlation_neon.S',
'downsample_fast_neon.S',
'min_max_operations_neon.S',
'vector_scaling_operations_neon.S',
],
},
],

View File

@ -8,10 +8,11 @@
@ Output: r0 = INT (SQRT (r0)), precision is 16 bits
@ Registers touched: r1, r2
.global WebRtcSpl_SqrtFloor
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_SqrtFloor
.align 2
WebRtcSpl_SqrtFloor:
DEFINE_FUNCTION WebRtcSpl_SqrtFloor
mov r1, #3 << 30
mov r2, #1 << 30

View File

@ -13,15 +13,11 @@
@ optimized for ARM Neon platform. Output is bit-exact with the reference
@ C code in vector_scaling_operations.c.
.arch armv7-a
.fpu neon
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
.align 2
.global WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
WebRtcSpl_ScaleAndAddVectorsWithRoundNeon:
.fnstart
DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
push {r4-r9}
ldr r4, [sp, #32] @ length
@ -84,5 +80,3 @@ LOOP_NO_UNROLLING:
END:
pop {r4-r9}
bx lr
.fnend

View File

@ -13,9 +13,9 @@
@ WebRtcIsacfix_AllpassFilter2FixDec16Neon() in filterbanks.c. Prototype
@ C code is at end of this file.
.arch armv7-a
.fpu neon
.global WebRtcIsacfix_AllpassFilter2FixDec16Neon
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
.align 2
@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
@ -27,7 +27,7 @@
@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
@ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
WebRtcIsacfix_AllpassFilter2FixDec16Neon:
DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
push {r4 - r7}
ldr r5, [sp, #24] @ filter_state_ch2

View File

@ -9,9 +9,9 @@
@
@ Reference code in filters.c. Output is bit-exact.
#include "settings.h"
#include "webrtc/system_wrappers/interface/asm_defines.h"
.global WebRtcIsacfix_AutocorrNeon
GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
.align 2
@ int WebRtcIsacfix_AutocorrNeon(
@ -21,7 +21,7 @@
@ WebRtc_Word16 order,
@ WebRtc_Word16* __restrict scale);
WebRtcIsacfix_AutocorrNeon:
DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
push {r3 - r12}
@ Constant initializations

View File

@ -97,8 +97,8 @@
'<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
],
'sources': [
'filters_neon.S',
'filterbanks_neon.S',
'filters_neon.S',
'lattice_neon.S',
'lpc_masking_model_neon.S',
],

View File

@ -25,16 +25,12 @@
@ r12: constant #16384
@ r6, r7, r8, r10, r11: scratch
#include "webrtc/system_wrappers/interface/asm_defines.h"
#include "settings.h"
.arch armv7-a
.global WebRtcIsacfix_FilterArLoop
GLOBAL_FUNCTION WebRtcIsacfix_FilterArLoop
.align 2
WebRtcIsacfix_FilterArLoop:
.fnstart
.save {r4-r11}
DEFINE_FUNCTION WebRtcIsacfix_FilterArLoop
push {r4-r11}
add r1, #2 @ &ar_f_Q0[1]
@ -77,6 +73,3 @@ ORDER_COEF_LOOP: @ for(k = order_coef - 1 ; k >= 0; k--)
pop {r4-r11}
bx lr
.fnend

View File

@ -29,19 +29,12 @@
@ instructions, smulwb, and smull. Speech quality was not degraded by
@ testing speech and tone vectors.
.arch armv7-a
.fpu neon
#include "webrtc/system_wrappers/interface/asm_defines.h"
#include "settings.h"
.global WebRtcIsacfix_FilterMaLoopNeon
GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
.align 2
WebRtcIsacfix_FilterMaLoopNeon:
.fnstart
.save {r4-r8}
DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
push {r4-r8}
vdup.32 d28, r0 @ Initialize Neon register with input0
@ -151,5 +144,3 @@ LAST_SAMPLE:
END:
pop {r4-r8}
bx lr
.fnend

View File

@ -12,9 +12,9 @@
@ iSAC codec, optimized for ARM Neon platform. Reference code in
@ lpc_masking_model.c.
.arch armv7-a
.fpu neon
.global WebRtcIsacfix_CalculateResidualEnergyNeon
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
.align 2
@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
@ -23,10 +23,7 @@
@ int16_t* a_polynomial,
@ int32_t* corr_coeffs,
@ int* q_val_residual_energy);
WebRtcIsacfix_CalculateResidualEnergyNeon:
.fnstart
.save {r4-r11}
DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
push {r4-r11}
sub r13, r13, #16
@ -173,5 +170,4 @@ GET_SHIFT_NORM:
pop {r4-r11}
bx r14
.fnend

View File

@ -13,12 +13,11 @@
@
@ Output is bit-exact with the reference C code in pitch_filter.c.
#include "webrtc/system_wrappers/interface/asm_defines.h"
#include "settings.h"
.arch armv6
GLOBAL_FUNCTION WebRtcIsacfix_PitchFilterCore
.align 2
.global WebRtcIsacfix_PitchFilterCore
@ void WebRtcIsacfix_PitchFilterCore(int loopNumber,
@ WebRtc_Word16 gain,
@ -30,9 +29,7 @@
@ WebRtc_Word16* inputBuf,
@ WebRtc_Word16* outputBuf,
@ int* index2) {
WebRtcIsacfix_PitchFilterCore:
.fnstart
DEFINE_FUNCTION WebRtcIsacfix_PitchFilterCore
push {r4-r11}
sub sp, #8
@ -140,7 +137,6 @@ LOOP:
add sp, #8
pop {r4-r11}
bx lr
.fnend
.align 2
kDampFilter:

View File

@ -12,19 +12,17 @@
@ This file contains some functions in AECM, optimized for ARM Neon
@ platforms. Reference C code is in file aecm_core.c. Bit-exact.
.arch armv7-a
.fpu neon
#include "aecm_defines.h"
#include "aecm_core_neon_offsets.h"
#include "webrtc/system_wrappers/interface/asm_defines.h"
.extern WebRtcAecm_kSqrtHanning
.global WebRtcAecm_WindowAndFFTNeon
.global WebRtcAecm_InverseFFTAndWindowNeon
.global WebRtcAecm_CalcLinearEnergiesNeon
.global WebRtcAecm_StoreAdaptiveChannelNeon
.global WebRtcAecm_ResetAdaptiveChannelNeon
GLOBAL_FUNCTION WebRtcAecm_WindowAndFFTNeon
GLOBAL_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
GLOBAL_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
GLOBAL_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
GLOBAL_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
@ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
@ WebRtc_Word16* fft,
@ -32,9 +30,7 @@
@ complex16_t* freq_signal,
@ int time_signal_scaling);
.align 2
WebRtcAecm_WindowAndFFTNeon:
.fnstart
.save {r4, r5, r6, lr}
DEFINE_FUNCTION WebRtcAecm_WindowAndFFTNeon
push {r4, r5, r6, lr}
ldr r12, [sp, #16] @ time_signal_scaling
@ -84,7 +80,6 @@ LOOP_PART_LEN2:
bgt LOOP_PART_LEN2
pop {r4, r5, r6, pc}
.fnend
@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
@ WebRtc_Word16* fft,
@ -92,9 +87,7 @@ LOOP_PART_LEN2:
@ WebRtc_Word16* output,
@ const WebRtc_Word16* nearendClean);
.align 2
WebRtcAecm_InverseFFTAndWindowNeon:
.fnstart
.save {r4-r8, lr}
DEFINE_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
push {r4-r8, lr}
@ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT
@ -158,12 +151,12 @@ LOOP_POST_IFFT:
vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i]
vadd.i32 q8, q10
vmull.s16 q0, d0, d1
vqshrn.s32 d4, q8, #0
vqmovn.s32 d16, q8
vshr.s32 q0, q0, #14
vst2.16 {d4, d5}, [r4, :128]! @ &efw[i];
vshl.s32 q0, q0, q9
vst1.16 d16, [r7, :64]! @ output[i]
vqshrn.s32 d0, q0, #0
vqmovn.s32 d0, q0
subs r3, #1
vst1.16 d0, [r8, :64]! @ aecm->outBuf[i]
bgt LOOP_POST_IFFT
@ -203,7 +196,6 @@ LOOP_COPY:
END:
pop {r4-r8, pc}
.fnend
@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
@ const WebRtc_UWord16* far_spectrum,
@ -212,9 +204,7 @@ END:
@ WebRtc_UWord32* echo_energy_adapt,
@ WebRtc_UWord32* echo_energy_stored);
.align 2
WebRtcAecm_CalcLinearEnergiesNeon:
.fnstart
.save {r4-r7}
DEFINE_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
push {r4-r7}
vmov.i32 q14, #0
@ -274,14 +264,12 @@ LOOP_CALC_LINEAR_ENERGIES:
pop {r4-r7}
bx lr
.fnend
@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
@ const uint16_t* far_spectrum,
@ int32_t* echo_est);
.align 2
WebRtcAecm_StoreAdaptiveChannelNeon:
.fnstart
DEFINE_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
ldr r3, =offset_aecm_channelAdapt16
ldr r12, =offset_aecm_channelStored
ldr r3, [r0, r3]
@ -305,12 +293,10 @@ LOOP_STORE_ADAPTIVE_CHANNEL:
str r3, [r2]
bx lr
.fnend
@ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
.align 2
WebRtcAecm_ResetAdaptiveChannelNeon:
.fnstart
DEFINE_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
ldr r1, =offset_aecm_channelAdapt16
ldr r2, =offset_aecm_channelAdapt32
movw r3, #offset_aecm_channelStored
@ -334,15 +320,14 @@ LOOP_RESET_ADAPTIVE_CHANNEL:
str r0, [r2]
bx lr
.fnend
@ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning,
@ the order was reversed and one useless element (0) was removed.
.align 3
kSqrtHanningReversed:
.hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
.hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
.hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
.hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
.hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
.hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399
.short 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
.short 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
.short 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
.short 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
.short 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
.short 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399

View File

@ -139,7 +139,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
"vneg.s16 d23, d23\n\t"
"vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t"
"vrev64.16 q10, q10\n\t"
"vst2.16 {q10}, [%[p_fft_offset], %[offset]]\n\t"
"vst2.16 {q10}, [%[p_fft_offset]], %[offset]\n\t"
:[p_efw]"+r"(p_efw),
[p_fft]"+r"(p_fft),
[p_fft_offset]"+r"(p_fft_offset)
@ -181,7 +181,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
__asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
__asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
__asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
__asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
@ -196,7 +196,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
__asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
__asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
}

View File

@ -12,18 +12,16 @@
@ This file contains some functions in NS, optimized for ARM Neon
@ platforms. Reference C code is in file nsx_core.c. Bit-exact.
.arch armv7-a
.fpu neon
#include "webrtc/system_wrappers/interface/asm_defines.h"
#include "nsx_defines.h"
#include "nsx_core_neon_offsets.h"
.global WebRtcNsx_NoiseEstimationNeon
.global WebRtcNsx_PrepareSpectrumNeon
.global WebRtcNsx_SynthesisUpdateNeon
.global WebRtcNsx_AnalysisUpdateNeon
.global WebRtcNsx_DenormalizeNeon
.global WebRtcNsx_CreateComplexBufferNeon
GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon
GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon
GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon
GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon
GLOBAL_FUNCTION WebRtcNsx_DenormalizeNeon
GLOBAL_FUNCTION WebRtcNsx_CreateComplexBufferNeon
@ void NoiseEstimationNeon(NsxInst_t* inst,
@ uint16_t* magn,
@ -42,12 +40,7 @@
@ r11: countDiv
@ r12: i, the loop counter for LOOP_NOISEESTIMATION_MAGNLEN_INNER
WebRtcNsx_NoiseEstimationNeon:
.fnstart
.save {r4-r11, r14}
.vsave {d8-d15}
.pad #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
DEFINE_FUNCTION WebRtcNsx_NoiseEstimationNeon
push {r4-r11, r14}
vpush {d8-d15}
sub sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
@ -312,14 +305,10 @@ UPDATE_Q_NOISE:
add sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
vpop {d8-d15}
pop {r4-r11, pc}
.fnend
@ static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset);
@ Neon registers touched: q0-q3, q8-q13.
UpdateNoiseEstimateNeon:
.fnstart
.save {r4, r5, r6, r14}
DEFINE_FUNCTION UpdateNoiseEstimateNeon
push {r4, r5, r6, r14}
mov r5, r0
@ -385,13 +374,9 @@ POST_LOOP_MAGNLEN:
strh r3, [r2]
pop {r4, r5, r6, pc}
.fnend
@ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf);
WebRtcNsx_PrepareSpectrumNeon:
.fnstart
.save {r4-r8}
DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon
push {r4-r8}
movw r2, #offset_nsx_real
@ -478,11 +463,9 @@ LOOP_ANALEN2:
pop {r4-r8}
bx r14
.fnend
@ void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor);
WebRtcNsx_DenormalizeNeon:
.fnstart
DEFINE_FUNCTION WebRtcNsx_DenormalizeNeon
movw r12, #offset_nsx_normData
movw r3, #offset_nsx_real
ldr r12, [r0, r12] @ inst->normData
@ -508,14 +491,11 @@ LOOP_ANALEN:
blt LOOP_ANALEN
bx r14
.fnend
@ void SynthesisUpdateNeon(NsxInst_t* inst,
@ int16_t* out_frame,
@ int16_t gain_factor);
WebRtcNsx_SynthesisUpdateNeon:
.fnstart
.save {r4, r5}
DEFINE_FUNCTION WebRtcNsx_SynthesisUpdateNeon
push {r4, r5}
vdup.16 d31, r2
@ -586,12 +566,8 @@ EXIT_SYNTHESISUPDATE:
pop {r4, r5}
bx r14
.fnend
@ void AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech);
WebRtcNsx_AnalysisUpdateNeon:
.fnstart
.save {r4-r6}
DEFINE_FUNCTION WebRtcNsx_AnalysisUpdateNeon
push {r4-r6}
movw r3, #offset_nsx_analysisBuffer
@ -647,11 +623,9 @@ LOOP_WINDOW_DATA:
POST_LOOP_WINDOW_DATA:
pop {r4-r6}
bx r14
.fnend
@ void CreateComplexBufferNeon(NsxInst_t* inst, int16_t* in, int16_t* out);
WebRtcNsx_CreateComplexBufferNeon:
.fnstart
DEFINE_FUNCTION WebRtcNsx_CreateComplexBufferNeon
movw r3, #offset_nsx_anaLen
movw r12, #offset_nsx_normData
ldrsh r3, [r0, r3] @ inst->anaLen
@ -678,4 +652,3 @@ LOOP_CREATE_COMPLEX_BUFFER: @ Unrolled by 16.
blt LOOP_CREATE_COMPLEX_BUFFER
bx r14
.fnend

View File

@ -0,0 +1,32 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
#define WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
// Define the macros used in ARM assembly code, so that for Mac or iOS builds
// we add leading underscores for the function names.
//
// This header is meant to be pulled in with #include from preprocessed
// assembly sources (.S files). It provides two assembler macros:
//
//   GLOBAL_FUNCTION name   - exports the symbol for `name` via .global.
//   DEFINE_FUNCTION name   - emits the entry label for `name`.
//
// Typical usage in an assembly file:
//
//   GLOBAL_FUNCTION MyFunction
//   .align 2
//   DEFINE_FUNCTION MyFunction
//     ... instructions ...
//
// Both macros must be given the same plain (un-prefixed) name; the
// platform-specific prefix is added here so that call sites stay identical
// across platforms.
#ifdef __APPLE__
// Apple (Mac / iOS) variant: the symbol is `name` with a leading underscore.
.macro GLOBAL_FUNCTION name
.global _\name
.endm
.macro DEFINE_FUNCTION name
_\name:
.endm
#else
// All other platforms: the symbol is used exactly as written.
.macro GLOBAL_FUNCTION name
.global \name
.endm
.macro DEFINE_FUNCTION name
\name:
.endm
#endif  // __APPLE__
#endif  // WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_