Porting ARM optimizations from Android to iOS.

Tested APM and iSAC on Android. Bit-exact with the original versions.
Changes include removing or changing some GNU assembler directives (e.g. .fnstart, .hword), instruction syntax, etc.
Review URL: https://webrtc-codereview.appspot.com/934009

git-svn-id: http://webrtc.googlecode.com/svn/trunk@3124 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2012-11-17 00:22:46 +00:00
parent 2ec58dc4d1
commit 55cd78cfc2
22 changed files with 189 additions and 281 deletions

View File

@ -60,7 +60,7 @@ LOCAL_C_INCLUDES := \
ifeq ($(ARCH_ARM_HAVE_ARMV7A),true) ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
LOCAL_SRC_FILES += \ LOCAL_SRC_FILES += \
filter_ar_fast_q12_armv7.s filter_ar_fast_q12_armv7.S
else else
LOCAL_SRC_FILES += \ LOCAL_SRC_FILES += \
filter_ar_fast_q12.c filter_ar_fast_q12.c
@ -68,8 +68,8 @@ endif
ifeq ($(TARGET_ARCH),arm) ifeq ($(TARGET_ARCH),arm)
LOCAL_SRC_FILES += \ LOCAL_SRC_FILES += \
complex_bit_reverse_arm.s \ complex_bit_reverse_arm.S \
spl_sqrt_floor_arm.s spl_sqrt_floor_arm.S
else else
LOCAL_SRC_FILES += \ LOCAL_SRC_FILES += \
complex_bit_reverse.c \ complex_bit_reverse.c \
@ -102,10 +102,10 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_MODULE := libwebrtc_spl_neon LOCAL_MODULE := libwebrtc_spl_neon
LOCAL_MODULE_TAGS := optional LOCAL_MODULE_TAGS := optional
LOCAL_SRC_FILES := \ LOCAL_SRC_FILES := \
cross_correlation_neon.s \ cross_correlation_neon.S \
downsample_fast_neon.s \ downsample_fast_neon.S \
min_max_operations_neon.s \ min_max_operations_neon.S \
vector_scaling_operations_neon.s vector_scaling_operations_neon.S
# Flags passed to both C and C++ files. # Flags passed to both C and C++ files.
LOCAL_CFLAGS := \ LOCAL_CFLAGS := \

View File

@ -12,15 +12,11 @@
@ for ARMv5 platforms. @ for ARMv5 platforms.
@ Reference C code is in file complex_bit_reverse.c. Bit-exact. @ Reference C code is in file complex_bit_reverse.c. Bit-exact.
.arch armv5 #include "webrtc/system_wrappers/interface/asm_defines.h"
.global WebRtcSpl_ComplexBitReverse
GLOBAL_FUNCTION WebRtcSpl_ComplexBitReverse
.align 2 .align 2
DEFINE_FUNCTION WebRtcSpl_ComplexBitReverse
WebRtcSpl_ComplexBitReverse:
.fnstart
push {r4-r7} push {r4-r7}
cmp r1, #7 cmp r1, #7
@ -88,39 +84,36 @@ END:
pop {r4-r7} pop {r4-r7}
bx lr bx lr
.fnend
@ The index tables. Note the values are doubles of the actual indexes for 16-bit @ The index tables. Note the values are doubles of the actual indexes for 16-bit
@ elements, different from the generic C code. It actually provides byte offsets @ elements, different from the generic C code. It actually provides byte offsets
@ for the indexes. @ for the indexes.
.align 2 .align 2
index_7: @ Indexes for stages == 7. index_7: @ Indexes for stages == 7.
.hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288 .short 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
.hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144 .short 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
.hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116 .short 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
.hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156 .short 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
.hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204 .short 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
.hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268 .short 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
.hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348 .short 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
.hword 468, 364, 436, 380, 500, 412, 460, 444, 492 .short 468, 364, 436, 380, 500, 412, 460, 444, 492
index_8: @ Indexes for stages == 8. index_8: @ Indexes for stages == 8.
.hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64 .short 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
.hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544 .short 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
.hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104 .short 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
.hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136 .short 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
.hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172 .short 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
.hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204 .short 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
.hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244 .short 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
.hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284 .short 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
.hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324 .short 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
.hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372 .short 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
.hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420 .short 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
.hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468 .short 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
.hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532 .short 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
.hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596 .short 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
.hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684 .short 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
.hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796 .short 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
.hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988 .short 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988

View File

@ -29,24 +29,18 @@
@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL @ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
@ r8, r9, r10, r11, r12: scratch @ r8, r9, r10, r11, r12: scratch
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
.fpu neon
GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
.align 2 .align 2
.global WebRtcSpl_CrossCorrelationNeon DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
WebRtcSpl_CrossCorrelationNeon:
.fnstart
.save {r4-r11}
push {r4-r11} push {r4-r11}
@ Put the shift value (-right_shifts) into a Neon register. @ Put the shift value (-right_shifts) into a Neon register.
ldrsh r10, [sp, #36] ldrsh r10, [sp, #36]
rsb r10, r10, #0 rsb r10, r10, #0
mov r8, r10, asr #31 mov r8, r10, asr #31
vmov.32 d16, r10, r8 vmov d16, r10, r8
@ Initialize loop counters. @ Initialize loop counters.
and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8; and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8;
@ -105,9 +99,6 @@ POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift.
pop {r4-r11} pop {r4-r11}
bx lr bx lr
.fnend
@ TODO(kma): Place this piece of reference code into a C code file. @ TODO(kma): Place this piece of reference code into a C code file.
@ void WebRtcSpl_CrossCorrelationNeon(WebRtc_Word32* cross_correlation, @ void WebRtcSpl_CrossCorrelationNeon(WebRtc_Word32* cross_correlation,
@ WebRtc_Word16* seq1, @ WebRtc_Word16* seq1,

View File

@ -14,17 +14,11 @@
@ @
@ The reference C code is in file downsample_fast.c. Bit-exact. @ The reference C code is in file downsample_fast.c. Bit-exact.
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
.fpu neon
GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
.align 2 .align 2
.global WebRtcSpl_DownsampleFastNeon DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
WebRtcSpl_DownsampleFastNeon:
.fnstart
.save {r4-r11}
push {r4-r11} push {r4-r11}
cmp r3, #0 @ data_out_length <= 0? cmp r3, #0 @ data_out_length <= 0?
@ -168,14 +162,15 @@ LOOP_COEFF_LENGTH_FACTOR4:
vmlal.s16 q3, d18, d17 vmlal.s16 q3, d18, d17
bge LOOP_COEFF_LENGTH_FACTOR4 bge LOOP_COEFF_LENGTH_FACTOR4
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
add r9, r5, asl #3 @ Counter i = delay + factor * 8.
@ Shift, saturate, and store the result. @ Shift, saturate, and store the result.
vqshrn.s32 d0, q2, #12 vqshrn.s32 d0, q2, #12
vqshrn.s32 d1, q3, #12 vqshrn.s32 d1, q3, #12
cmp r9, r3 @ i < endpos - factor * 7 ?
vst1.16 {d0, d1}, [r2]! vst1.16 {d0, d1}, [r2]!
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
add r9, r5, asl #3 @ Counter i = delay + factor * 8.
cmp r9, r3 @ i < endpos - factor * 7 ?
blt LOOP_ENDPOS_FACTOR4 blt LOOP_ENDPOS_FACTOR4
@ @
@ -218,5 +213,3 @@ LOOP2_COEFF_LENGTH:
END: END:
pop {r4-r11} pop {r4-r11}
bx lr bx lr
.fnend

View File

@ -35,16 +35,11 @@
@ r11: Scratch @ r11: Scratch
@ r12: &coefficients[j] @ r12: &coefficients[j]
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
.align 2 .align 2
.global WebRtcSpl_FilterARFastQ12 DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
WebRtcSpl_FilterARFastQ12:
.fnstart
.save {r4-r11}
push {r4-r11} push {r4-r11}
ldrsh r12, [sp, #32] @ data_length ldrsh r12, [sp, #32] @ data_length
@ -155,9 +150,6 @@ END:
pop {r4-r11} pop {r4-r11}
bx lr bx lr
.fnend
@Reference C code: @Reference C code:
@ @
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in, @void WebRtcSpl_FilterARFastQ12(int16_t* data_in,

View File

@ -166,7 +166,7 @@ static __inline int WebRtcSpl_NormW16(WebRtc_Word16 a) {
static __inline WebRtc_Word16 WebRtcSpl_SatW32ToW16(WebRtc_Word32 value32) { static __inline WebRtc_Word16 WebRtcSpl_SatW32ToW16(WebRtc_Word32 value32) {
WebRtc_Word16 out16 = 0; WebRtc_Word16 out16 = 0;
__asm __volatile ("ssat %r0, #16, %r1" : "=r"(out16) : "r"(value32)); __asm __volatile ("ssat %0, #16, %1" : "=r"(out16) : "r"(value32));
return out16; return out16;
} }

View File

@ -15,20 +15,18 @@
@ The reference C code is in file min_max_operations.c. Code here is basically @ The reference C code is in file min_max_operations.c. Code here is basically
@ a loop unrolling by 8 with Neon instructions. Bit-exact. @ a loop unrolling by 8 with Neon instructions. Bit-exact.
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
.fpu neon
.global WebRtcSpl_MaxAbsValueW16Neon GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
.global WebRtcSpl_MaxAbsValueW32Neon GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
.global WebRtcSpl_MaxValueW16Neon GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
.global WebRtcSpl_MaxValueW32Neon GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
.global WebRtcSpl_MinValueW16Neon GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
.global WebRtcSpl_MinValueW32Neon GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
.align 2 .align 2
@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length); @ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
WebRtcSpl_MaxAbsValueW16Neon: DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
.fnstart
mov r2, #-1 @ Initialize the return value. mov r2, #-1 @ Initialize the return value.
cmp r0, #0 cmp r0, #0
beq END_MAX_ABS_VALUE_W16 beq END_MAX_ABS_VALUE_W16
@ -50,8 +48,8 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
@ Find the maximum value in the Neon registers and move it to r2. @ Find the maximum value in the Neon registers and move it to r2.
vmax.u16 d24, d25 vmax.u16 d24, d25
vpmax.u16 d24, d24 vpmax.u16 d24, d24, d24
vpmax.u16 d24, d24 vpmax.u16 d24, d24, d24
adds r1, #8 adds r1, #8
vmov.u16 r2, d24[0] vmov.u16 r2, d24[0]
beq END_MAX_ABS_VALUE_W16 beq END_MAX_ABS_VALUE_W16
@ -71,12 +69,10 @@ END_MAX_ABS_VALUE_W16:
mov r0, r2 mov r0, r2
bx lr bx lr
.fnend
@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length); @ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
WebRtcSpl_MaxAbsValueW32Neon: DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
.fnstart
cmp r0, #0 cmp r0, #0
moveq r0, #-1 moveq r0, #-1
beq EXIT @ Return -1 for a NULL pointer. beq EXIT @ Return -1 for a NULL pointer.
@ -103,7 +99,7 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
@ Find the maximum value in the Neon registers and move it to r2. @ Find the maximum value in the Neon registers and move it to r2.
vmax.u32 q12, q11 vmax.u32 q12, q11
vmax.u32 d24, d25 vmax.u32 d24, d25
vpmax.u32 d24, d24 vpmax.u32 d24, d24, d24
adds r1, #8 adds r1, #8
vmov.u32 r2, d24[0] vmov.u32 r2, d24[0]
beq END_MAX_ABS_VALUE_W32 beq END_MAX_ABS_VALUE_W32
@ -125,12 +121,8 @@ END_MAX_ABS_VALUE_W32:
EXIT: EXIT:
bx lr bx lr
.fnend
@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length); @ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
WebRtcSpl_MaxValueW16Neon: DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
.fnstart
mov r2, #0x8000 @ Initialize the return value. mov r2, #0x8000 @ Initialize the return value.
cmp r0, #0 cmp r0, #0
beq END_MAX_VALUE_W16 beq END_MAX_VALUE_W16
@ -151,8 +143,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
@ Find the maximum value in the Neon registers and move it to r2. @ Find the maximum value in the Neon registers and move it to r2.
vmax.s16 d24, d25 vmax.s16 d24, d25
vpmax.s16 d24, d24 vpmax.s16 d24, d24, d24
vpmax.s16 d24, d24 vpmax.s16 d24, d24, d24
adds r1, #8 adds r1, #8
vmov.u16 r2, d24[0] vmov.u16 r2, d24[0]
beq END_MAX_VALUE_W16 beq END_MAX_VALUE_W16
@ -168,12 +160,8 @@ END_MAX_VALUE_W16:
mov r0, r2 mov r0, r2
bx lr bx lr
.fnend
@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length); @ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
WebRtcSpl_MaxValueW32Neon: DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
.fnstart
mov r2, #0x80000000 @ Initialize the return value. mov r2, #0x80000000 @ Initialize the return value.
cmp r0, #0 cmp r0, #0
beq END_MAX_VALUE_W32 beq END_MAX_VALUE_W32
@ -196,8 +184,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
@ Find the maximum value in the Neon registers and move it to r2. @ Find the maximum value in the Neon registers and move it to r2.
vmax.s32 q12, q11 vmax.s32 q12, q11
vpmax.s32 d24, d25 vpmax.s32 d24, d24, d25
vpmax.s32 d24, d24 vpmax.s32 d24, d24, d24
adds r1, #8 adds r1, #8
vmov.s32 r2, d24[0] vmov.s32 r2, d24[0]
beq END_MAX_VALUE_W32 beq END_MAX_VALUE_W32
@ -213,12 +201,8 @@ END_MAX_VALUE_W32:
mov r0, r2 mov r0, r2
bx lr bx lr
.fnend
@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length); @ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
WebRtcSpl_MinValueW16Neon: DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
.fnstart
movw r2, #0x7FFF @ Initialize the return value. movw r2, #0x7FFF @ Initialize the return value.
cmp r0, #0 cmp r0, #0
beq END_MIN_VALUE_W16 beq END_MIN_VALUE_W16
@ -239,8 +223,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
@ Find the maximum value in the Neon registers and move it to r2. @ Find the maximum value in the Neon registers and move it to r2.
vmin.s16 d24, d25 vmin.s16 d24, d25
vpmin.s16 d24, d24 vpmin.s16 d24, d24, d24
vpmin.s16 d24, d24 vpmin.s16 d24, d24, d24
adds r1, #8 adds r1, #8
vmov.s16 r2, d24[0] vmov.s16 r2, d24[0]
sxth r2, r2 sxth r2, r2
@ -257,12 +241,8 @@ END_MIN_VALUE_W16:
mov r0, r2 mov r0, r2
bx lr bx lr
.fnend
@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length); @ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
WebRtcSpl_MinValueW32Neon: DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
.fnstart
mov r2, #0x7FFFFFFF @ Initialize the return value. mov r2, #0x7FFFFFFF @ Initialize the return value.
cmp r0, #0 cmp r0, #0
beq END_MIN_VALUE_W32 beq END_MIN_VALUE_W32
@ -285,8 +265,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
@ Find the maximum value in the Neon registers and move it to r2. @ Find the maximum value in the Neon registers and move it to r2.
vmin.s32 q12, q11 vmin.s32 q12, q11
vpmin.s32 d24, d25 vpmin.s32 d24, d24, d25
vpmin.s32 d24, d24 vpmin.s32 d24, d24, d24
adds r1, #8 adds r1, #8
vmov.s32 r2, d24[0] vmov.s32 r2, d24[0]
beq END_MIN_VALUE_W32 beq END_MIN_VALUE_W32
@ -301,5 +281,3 @@ LOOP_MIN_VALUE_W32:
END_MIN_VALUE_W32: END_MIN_VALUE_W32:
mov r0, r2 mov r0, r2
bx lr bx lr
.fnend

View File

@ -31,7 +31,7 @@ static __inline WebRtc_Word32 MUL_ACCUM_1(WebRtc_Word32 tbl_value,
WebRtc_Word32 diff, WebRtc_Word32 diff,
WebRtc_Word32 state) { WebRtc_Word32 state) {
WebRtc_Word32 result; WebRtc_Word32 result;
__asm__("smlawb %r0, %r1, %r2, %r3": "=r"(result): "r"(diff), __asm __volatile ("smlawb %0, %1, %2, %3": "=r"(result): "r"(diff),
"r"(tbl_value), "r"(state)); "r"(tbl_value), "r"(state));
return result; return result;
} }
@ -47,7 +47,7 @@ static __inline WebRtc_Word32 MUL_ACCUM_2(WebRtc_Word32 tbl_value,
WebRtc_Word32 diff, WebRtc_Word32 diff,
WebRtc_Word32 state) { WebRtc_Word32 state) {
WebRtc_Word32 result; WebRtc_Word32 result;
__asm__("smmla %r0, %r1, %r2, %r3": "=r"(result): "r"(diff << 1), __asm __volatile ("smmla %0, %1, %2, %3": "=r"(result): "r"(diff << 1),
"r"(tbl_value), "r"(state)); "r"(tbl_value), "r"(state));
return result; return result;
} }

View File

@ -65,8 +65,8 @@
'conditions': [ 'conditions': [
['target_arch=="arm"', { ['target_arch=="arm"', {
'sources': [ 'sources': [
'complex_bit_reverse_arm.s', 'complex_bit_reverse_arm.S',
'spl_sqrt_floor_arm.s', 'spl_sqrt_floor_arm.S',
], ],
'sources!': [ 'sources!': [
'complex_bit_reverse.c', 'complex_bit_reverse.c',
@ -76,7 +76,7 @@
['armv7==1', { ['armv7==1', {
'dependencies': ['signal_processing_neon',], 'dependencies': ['signal_processing_neon',],
'sources': [ 'sources': [
'filter_ar_fast_q12_armv7.s', 'filter_ar_fast_q12_armv7.S',
], ],
'sources!': [ 'sources!': [
'filter_ar_fast_q12.c', 'filter_ar_fast_q12.c',
@ -112,10 +112,10 @@
'type': '<(library)', 'type': '<(library)',
'includes': ['../../build/arm_neon.gypi',], 'includes': ['../../build/arm_neon.gypi',],
'sources': [ 'sources': [
'cross_correlation_neon.s', 'cross_correlation_neon.S',
'downsample_fast_neon.s', 'downsample_fast_neon.S',
'min_max_operations_neon.s', 'min_max_operations_neon.S',
'vector_scaling_operations_neon.s', 'vector_scaling_operations_neon.S',
], ],
}, },
], ],

View File

@ -8,10 +8,11 @@
@ Output: r0 = INT (SQRT (r0)), precision is 16 bits @ Output: r0 = INT (SQRT (r0)), precision is 16 bits
@ Registers touched: r1, r2 @ Registers touched: r1, r2
.global WebRtcSpl_SqrtFloor #include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcSpl_SqrtFloor
.align 2 .align 2
WebRtcSpl_SqrtFloor: DEFINE_FUNCTION WebRtcSpl_SqrtFloor
mov r1, #3 << 30 mov r1, #3 << 30
mov r2, #1 << 30 mov r2, #1 << 30

View File

@ -13,15 +13,11 @@
@ optimized for ARM Neon platform. Output is bit-exact with the reference @ optimized for ARM Neon platform. Output is bit-exact with the reference
@ C code in vector_scaling_operations.c. @ C code in vector_scaling_operations.c.
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
.fpu neon
GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
.align 2 .align 2
.global WebRtcSpl_ScaleAndAddVectorsWithRoundNeon DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
WebRtcSpl_ScaleAndAddVectorsWithRoundNeon:
.fnstart
push {r4-r9} push {r4-r9}
ldr r4, [sp, #32] @ length ldr r4, [sp, #32] @ length
@ -84,5 +80,3 @@ LOOP_NO_UNROLLING:
END: END:
pop {r4-r9} pop {r4-r9}
bx lr bx lr
.fnend

View File

@ -13,9 +13,9 @@
@ WebRtcIsacfix_AllpassFilter2FixDec16Neon() in filterbanks.c. Prototype @ WebRtcIsacfix_AllpassFilter2FixDec16Neon() in filterbanks.c. Prototype
@ C code is at end of this file. @ C code is at end of this file.
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
.fpu neon
.global WebRtcIsacfix_AllpassFilter2FixDec16Neon GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
.align 2 .align 2
@void WebRtcIsacfix_AllpassFilter2FixDec16Neon( @void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
@ -27,7 +27,7 @@
@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16 @ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
@ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16 @ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
WebRtcIsacfix_AllpassFilter2FixDec16Neon: DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
push {r4 - r7} push {r4 - r7}
ldr r5, [sp, #24] @ filter_state_ch2 ldr r5, [sp, #24] @ filter_state_ch2

View File

@ -9,9 +9,9 @@
@ @
@ Reference code in filters.c. Output is bit-exact. @ Reference code in filters.c. Output is bit-exact.
#include "settings.h" #include "webrtc/system_wrappers/interface/asm_defines.h"
.global WebRtcIsacfix_AutocorrNeon GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
.align 2 .align 2
@ int WebRtcIsacfix_AutocorrNeon( @ int WebRtcIsacfix_AutocorrNeon(
@ -21,7 +21,7 @@
@ WebRtc_Word16 order, @ WebRtc_Word16 order,
@ WebRtc_Word16* __restrict scale); @ WebRtc_Word16* __restrict scale);
WebRtcIsacfix_AutocorrNeon: DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
push {r3 - r12} push {r3 - r12}
@ Constant initializations @ Constant initializations

View File

@ -97,8 +97,8 @@
'<(webrtc_root)/common_audio/common_audio.gyp:signal_processing', '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
], ],
'sources': [ 'sources': [
'filters_neon.S',
'filterbanks_neon.S', 'filterbanks_neon.S',
'filters_neon.S',
'lattice_neon.S', 'lattice_neon.S',
'lpc_masking_model_neon.S', 'lpc_masking_model_neon.S',
], ],

View File

@ -25,16 +25,12 @@
@ r12: constant #16384 @ r12: constant #16384
@ r6, r7, r8, r10, r11: scratch @ r6, r7, r8, r10, r11: scratch
#include "webrtc/system_wrappers/interface/asm_defines.h"
#include "settings.h" #include "settings.h"
.arch armv7-a GLOBAL_FUNCTION WebRtcIsacfix_FilterArLoop
.global WebRtcIsacfix_FilterArLoop
.align 2 .align 2
DEFINE_FUNCTION WebRtcIsacfix_FilterArLoop
WebRtcIsacfix_FilterArLoop:
.fnstart
.save {r4-r11}
push {r4-r11} push {r4-r11}
add r1, #2 @ &ar_f_Q0[1] add r1, #2 @ &ar_f_Q0[1]
@ -77,6 +73,3 @@ ORDER_COEF_LOOP: @ for(k = order_coef - 1 ; k >= 0; k--)
pop {r4-r11} pop {r4-r11}
bx lr bx lr
.fnend

View File

@ -29,19 +29,12 @@
@ instructions, smulwb, and smull. Speech quality was not degraded by @ instructions, smulwb, and smull. Speech quality was not degraded by
@ testing speech and tone vectors. @ testing speech and tone vectors.
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
.fpu neon
#include "settings.h" #include "settings.h"
.global WebRtcIsacfix_FilterMaLoopNeon GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
.align 2 .align 2
DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
WebRtcIsacfix_FilterMaLoopNeon:
.fnstart
.save {r4-r8}
push {r4-r8} push {r4-r8}
vdup.32 d28, r0 @ Initialize Neon register with input0 vdup.32 d28, r0 @ Initialize Neon register with input0
@ -151,5 +144,3 @@ LAST_SAMPLE:
END: END:
pop {r4-r8} pop {r4-r8}
bx lr bx lr
.fnend

View File

@ -12,9 +12,9 @@
@ iSAC codec, optimized for ARM Neon platform. Reference code in @ iSAC codec, optimized for ARM Neon platform. Reference code in
@ lpc_masking_model.c. @ lpc_masking_model.c.
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
.fpu neon
.global WebRtcIsacfix_CalculateResidualEnergyNeon GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
.align 2 .align 2
@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order, @ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
@ -23,10 +23,7 @@
@ int16_t* a_polynomial, @ int16_t* a_polynomial,
@ int32_t* corr_coeffs, @ int32_t* corr_coeffs,
@ int* q_val_residual_energy); @ int* q_val_residual_energy);
DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
WebRtcIsacfix_CalculateResidualEnergyNeon:
.fnstart
.save {r4-r11}
push {r4-r11} push {r4-r11}
sub r13, r13, #16 sub r13, r13, #16
@ -173,5 +170,4 @@ GET_SHIFT_NORM:
pop {r4-r11} pop {r4-r11}
bx r14 bx r14
.fnend

View File

@ -13,12 +13,11 @@
@ @
@ Output is bit-exact with the reference C code in pitch_filter.c. @ Output is bit-exact with the reference C code in pitch_filter.c.
#include "webrtc/system_wrappers/interface/asm_defines.h"
#include "settings.h" #include "settings.h"
.arch armv6 GLOBAL_FUNCTION WebRtcIsacfix_PitchFilterCore
.align 2 .align 2
.global WebRtcIsacfix_PitchFilterCore
@ void WebRtcIsacfix_PitchFilterCore(int loopNumber, @ void WebRtcIsacfix_PitchFilterCore(int loopNumber,
@ WebRtc_Word16 gain, @ WebRtc_Word16 gain,
@ -30,9 +29,7 @@
@ WebRtc_Word16* inputBuf, @ WebRtc_Word16* inputBuf,
@ WebRtc_Word16* outputBuf, @ WebRtc_Word16* outputBuf,
@ int* index2) { @ int* index2) {
DEFINE_FUNCTION WebRtcIsacfix_PitchFilterCore
WebRtcIsacfix_PitchFilterCore:
.fnstart
push {r4-r11} push {r4-r11}
sub sp, #8 sub sp, #8
@ -140,7 +137,6 @@ LOOP:
add sp, #8 add sp, #8
pop {r4-r11} pop {r4-r11}
bx lr bx lr
.fnend
.align 2 .align 2
kDampFilter: kDampFilter:

View File

@ -12,19 +12,17 @@
@ This file contains some functions in AECM, optimized for ARM Neon @ This file contains some functions in AECM, optimized for ARM Neon
@ platforms. Reference C code is in file aecm_core.c. Bit-exact. @ platforms. Reference C code is in file aecm_core.c. Bit-exact.
.arch armv7-a
.fpu neon
#include "aecm_defines.h" #include "aecm_defines.h"
#include "aecm_core_neon_offsets.h" #include "aecm_core_neon_offsets.h"
#include "webrtc/system_wrappers/interface/asm_defines.h"
.extern WebRtcAecm_kSqrtHanning .extern WebRtcAecm_kSqrtHanning
.global WebRtcAecm_WindowAndFFTNeon GLOBAL_FUNCTION WebRtcAecm_WindowAndFFTNeon
.global WebRtcAecm_InverseFFTAndWindowNeon GLOBAL_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
.global WebRtcAecm_CalcLinearEnergiesNeon GLOBAL_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
.global WebRtcAecm_StoreAdaptiveChannelNeon GLOBAL_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
.global WebRtcAecm_ResetAdaptiveChannelNeon GLOBAL_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
@ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm, @ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
@ WebRtc_Word16* fft, @ WebRtc_Word16* fft,
@ -32,9 +30,7 @@
@ complex16_t* freq_signal, @ complex16_t* freq_signal,
@ int time_signal_scaling); @ int time_signal_scaling);
.align 2 .align 2
WebRtcAecm_WindowAndFFTNeon: DEFINE_FUNCTION WebRtcAecm_WindowAndFFTNeon
.fnstart
.save {r4, r5, r6, lr}
push {r4, r5, r6, lr} push {r4, r5, r6, lr}
ldr r12, [sp, #16] @ time_signal_scaling ldr r12, [sp, #16] @ time_signal_scaling
@ -84,7 +80,6 @@ LOOP_PART_LEN2:
bgt LOOP_PART_LEN2 bgt LOOP_PART_LEN2
pop {r4, r5, r6, pc} pop {r4, r5, r6, pc}
.fnend
@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, @ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
@ WebRtc_Word16* fft, @ WebRtc_Word16* fft,
@ -92,9 +87,7 @@ LOOP_PART_LEN2:
@ WebRtc_Word16* output, @ WebRtc_Word16* output,
@ const WebRtc_Word16* nearendClean); @ const WebRtc_Word16* nearendClean);
.align 2 .align 2
WebRtcAecm_InverseFFTAndWindowNeon: DEFINE_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
.fnstart
.save {r4-r8, lr}
push {r4-r8, lr} push {r4-r8, lr}
@ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT @ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT
@ -158,12 +151,12 @@ LOOP_POST_IFFT:
vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i] vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i]
vadd.i32 q8, q10 vadd.i32 q8, q10
vmull.s16 q0, d0, d1 vmull.s16 q0, d0, d1
vqshrn.s32 d4, q8, #0 vqmovn.s32 d16, q8
vshr.s32 q0, q0, #14 vshr.s32 q0, q0, #14
vst2.16 {d4, d5}, [r4, :128]! @ &efw[i]; vst2.16 {d4, d5}, [r4, :128]! @ &efw[i];
vshl.s32 q0, q0, q9 vshl.s32 q0, q0, q9
vst1.16 d16, [r7, :64]! @ output[i] vst1.16 d16, [r7, :64]! @ output[i]
vqshrn.s32 d0, q0, #0 vqmovn.s32 d0, q0
subs r3, #1 subs r3, #1
vst1.16 d0, [r8, :64]! @ aecm->outBuf[i] vst1.16 d0, [r8, :64]! @ aecm->outBuf[i]
bgt LOOP_POST_IFFT bgt LOOP_POST_IFFT
@ -203,7 +196,6 @@ LOOP_COPY:
END: END:
pop {r4-r8, pc} pop {r4-r8, pc}
.fnend
@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, @ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
@ const WebRtc_UWord16* far_spectrum, @ const WebRtc_UWord16* far_spectrum,
@ -212,9 +204,7 @@ END:
@ WebRtc_UWord32* echo_energy_adapt, @ WebRtc_UWord32* echo_energy_adapt,
@ WebRtc_UWord32* echo_energy_stored); @ WebRtc_UWord32* echo_energy_stored);
.align 2 .align 2
WebRtcAecm_CalcLinearEnergiesNeon: DEFINE_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
.fnstart
.save {r4-r7}
push {r4-r7} push {r4-r7}
vmov.i32 q14, #0 vmov.i32 q14, #0
@ -274,14 +264,12 @@ LOOP_CALC_LINEAR_ENERGIES:
pop {r4-r7} pop {r4-r7}
bx lr bx lr
.fnend
@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, @ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
@ const uint16_t* far_spectrum, @ const uint16_t* far_spectrum,
@ int32_t* echo_est); @ int32_t* echo_est);
.align 2 .align 2
WebRtcAecm_StoreAdaptiveChannelNeon: DEFINE_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
.fnstart
ldr r3, =offset_aecm_channelAdapt16 ldr r3, =offset_aecm_channelAdapt16
ldr r12, =offset_aecm_channelStored ldr r12, =offset_aecm_channelStored
ldr r3, [r0, r3] ldr r3, [r0, r3]
@ -305,12 +293,10 @@ LOOP_STORE_ADAPTIVE_CHANNEL:
str r3, [r2] str r3, [r2]
bx lr bx lr
.fnend
@ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm); @ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
.align 2 .align 2
WebRtcAecm_ResetAdaptiveChannelNeon: DEFINE_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
.fnstart
ldr r1, =offset_aecm_channelAdapt16 ldr r1, =offset_aecm_channelAdapt16
ldr r2, =offset_aecm_channelAdapt32 ldr r2, =offset_aecm_channelAdapt32
movw r3, #offset_aecm_channelStored movw r3, #offset_aecm_channelStored
@ -334,15 +320,14 @@ LOOP_RESET_ADAPTIVE_CHANNEL:
str r0, [r2] str r0, [r2]
bx lr bx lr
.fnend
@ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning, @ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning,
@ the order was reversed and one useless element (0) was removed. @ the order was reversed and one useless element (0) was removed.
.align 3 .align 3
kSqrtHanningReversed: kSqrtHanningReversed:
.hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947 .short 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
.hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571 .short 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
.hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335 .short 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
.hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370 .short 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
.hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101 .short 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
.hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399 .short 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399

View File

@ -139,7 +139,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
"vneg.s16 d23, d23\n\t" "vneg.s16 d23, d23\n\t"
"vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t" "vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t"
"vrev64.16 q10, q10\n\t" "vrev64.16 q10, q10\n\t"
"vst2.16 {q10}, [%[p_fft_offset], %[offset]]\n\t" "vst2.16 {q10}, [%[p_fft_offset]], %[offset]\n\t"
:[p_efw]"+r"(p_efw), :[p_efw]"+r"(p_efw),
[p_fft]"+r"(p_fft), [p_fft]"+r"(p_fft),
[p_fft_offset]"+r"(p_fft_offset) [p_fft_offset]"+r"(p_fft_offset)
@ -181,7 +181,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
__asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
__asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); __asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
__asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); __asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
__asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real)); __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
@ -196,7 +196,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
__asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( // aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
__asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
} }

View File

@ -12,18 +12,16 @@
@ This file contains some functions in NS, optimized for ARM Neon @ This file contains some functions in NS, optimized for ARM Neon
@ platforms. Reference C code is in file nsx_core.c. Bit-exact. @ platforms. Reference C code is in file nsx_core.c. Bit-exact.
.arch armv7-a #include "webrtc/system_wrappers/interface/asm_defines.h"
.fpu neon
#include "nsx_defines.h" #include "nsx_defines.h"
#include "nsx_core_neon_offsets.h" #include "nsx_core_neon_offsets.h"
.global WebRtcNsx_NoiseEstimationNeon GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon
.global WebRtcNsx_PrepareSpectrumNeon GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon
.global WebRtcNsx_SynthesisUpdateNeon GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon
.global WebRtcNsx_AnalysisUpdateNeon GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon
.global WebRtcNsx_DenormalizeNeon GLOBAL_FUNCTION WebRtcNsx_DenormalizeNeon
.global WebRtcNsx_CreateComplexBufferNeon GLOBAL_FUNCTION WebRtcNsx_CreateComplexBufferNeon
@ void NoiseEstimationNeon(NsxInst_t* inst, @ void NoiseEstimationNeon(NsxInst_t* inst,
@ uint16_t* magn, @ uint16_t* magn,
@ -42,12 +40,7 @@
@ r11: countDiv @ r11: countDiv
@ r12: i, the loop counter for LOOP_NOISEESTIMATION_MAGNLEN_INNER @ r12: i, the loop counter for LOOP_NOISEESTIMATION_MAGNLEN_INNER
WebRtcNsx_NoiseEstimationNeon: DEFINE_FUNCTION WebRtcNsx_NoiseEstimationNeon
.fnstart
.save {r4-r11, r14}
.vsave {d8-d15}
.pad #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
push {r4-r11, r14} push {r4-r11, r14}
vpush {d8-d15} vpush {d8-d15}
sub sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8) sub sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
@ -312,14 +305,10 @@ UPDATE_Q_NOISE:
add sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8) add sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
vpop {d8-d15} vpop {d8-d15}
pop {r4-r11, pc} pop {r4-r11, pc}
.fnend
@ static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset); @ static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset);
@ Neon registers touched: q0-q3, q8-q13. @ Neon registers touched: q0-q3, q8-q13.
UpdateNoiseEstimateNeon: DEFINE_FUNCTION UpdateNoiseEstimateNeon
.fnstart
.save {r4, r5, r6, r14}
push {r4, r5, r6, r14} push {r4, r5, r6, r14}
mov r5, r0 mov r5, r0
@ -385,13 +374,9 @@ POST_LOOP_MAGNLEN:
strh r3, [r2] strh r3, [r2]
pop {r4, r5, r6, pc} pop {r4, r5, r6, pc}
.fnend
@ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf); @ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf);
WebRtcNsx_PrepareSpectrumNeon: DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon
.fnstart
.save {r4-r8}
push {r4-r8} push {r4-r8}
movw r2, #offset_nsx_real movw r2, #offset_nsx_real
@ -478,11 +463,9 @@ LOOP_ANALEN2:
pop {r4-r8} pop {r4-r8}
bx r14 bx r14
.fnend
@ void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor); @ void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor);
WebRtcNsx_DenormalizeNeon: DEFINE_FUNCTION WebRtcNsx_DenormalizeNeon
.fnstart
movw r12, #offset_nsx_normData movw r12, #offset_nsx_normData
movw r3, #offset_nsx_real movw r3, #offset_nsx_real
ldr r12, [r0, r12] @ inst->normData ldr r12, [r0, r12] @ inst->normData
@ -508,14 +491,11 @@ LOOP_ANALEN:
blt LOOP_ANALEN blt LOOP_ANALEN
bx r14 bx r14
.fnend
@ void SynthesisUpdateNeon(NsxInst_t* inst, @ void SynthesisUpdateNeon(NsxInst_t* inst,
@ int16_t* out_frame, @ int16_t* out_frame,
@ int16_t gain_factor); @ int16_t gain_factor);
WebRtcNsx_SynthesisUpdateNeon: DEFINE_FUNCTION WebRtcNsx_SynthesisUpdateNeon
.fnstart
.save {r4, r5}
push {r4, r5} push {r4, r5}
vdup.16 d31, r2 vdup.16 d31, r2
@ -586,12 +566,8 @@ EXIT_SYNTHESISUPDATE:
pop {r4, r5} pop {r4, r5}
bx r14 bx r14
.fnend
@ void AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech); @ void AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech);
WebRtcNsx_AnalysisUpdateNeon: DEFINE_FUNCTION WebRtcNsx_AnalysisUpdateNeon
.fnstart
.save {r4-r6}
push {r4-r6} push {r4-r6}
movw r3, #offset_nsx_analysisBuffer movw r3, #offset_nsx_analysisBuffer
@ -647,11 +623,9 @@ LOOP_WINDOW_DATA:
POST_LOOP_WINDOW_DATA: POST_LOOP_WINDOW_DATA:
pop {r4-r6} pop {r4-r6}
bx r14 bx r14
.fnend
@ void CreateComplexBufferNeon(NsxInst_t* inst, int16_t* in, int16_t* out); @ void CreateComplexBufferNeon(NsxInst_t* inst, int16_t* in, int16_t* out);
WebRtcNsx_CreateComplexBufferNeon: DEFINE_FUNCTION WebRtcNsx_CreateComplexBufferNeon
.fnstart
movw r3, #offset_nsx_anaLen movw r3, #offset_nsx_anaLen
movw r12, #offset_nsx_normData movw r12, #offset_nsx_normData
ldrsh r3, [r0, r3] @ inst->anaLen ldrsh r3, [r0, r3] @ inst->anaLen
@ -678,4 +652,3 @@ LOOP_CREATE_COMPLEX_BUFFER: @ Unrolled by 16.
blt LOOP_CREATE_COMPLEX_BUFFER blt LOOP_CREATE_COMPLEX_BUFFER
bx r14 bx r14
.fnend

View File

@ -0,0 +1,32 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
#define WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
// Helper macros for declaring and defining function symbols in ARM assembly
// sources. For Mac or iOS builds (__APPLE__ defined) the symbol is emitted
// with a leading underscore; on every other platform it is emitted unchanged.
//
// This header uses preprocessor conditionals, so it must be pulled in with
// #include from an assembly file that is run through the C preprocessor.
//
// Usage:
//   GLOBAL_FUNCTION Foo   -- export the symbol for Foo (near the top of file)
//   DEFINE_FUNCTION Foo   -- place the entry label for Foo (before its body)
#ifdef __APPLE__
// Apple variant: export the underscore-prefixed symbol.
.macro GLOBAL_FUNCTION name
.global _\name
.endm
// Apple variant: emit the underscore-prefixed entry label.
.macro DEFINE_FUNCTION name
_\name:
.endm
#else
// Default variant: export the symbol exactly as written.
.macro GLOBAL_FUNCTION name
.global \name
.endm
// Default variant: emit the entry label exactly as written.
.macro DEFINE_FUNCTION name
\name:
.endm
#endif  // __APPLE__
#endif  // WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_