From 55cd78cfc25f135149b780dcf527d147d5621ba2 Mon Sep 17 00:00:00 2001 From: "kma@webrtc.org" Date: Sat, 17 Nov 2012 00:22:46 +0000 Subject: [PATCH] Porting ARM optimization from Android to ios. Tested APM and iSAC in Android. Bit-exact with original versions. Changes include removing or changing some GCC directives (e.g. .fnstart, .hword), instruction syntax, etc. Review URL: https://webrtc-codereview.appspot.com/934009 git-svn-id: http://webrtc.googlecode.com/svn/trunk@3124 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../common_audio/signal_processing/Android.mk | 14 ++-- ...everse_arm.s => complex_bit_reverse_arm.S} | 63 +++++++-------- ...lation_neon.s => cross_correlation_neon.S} | 29 +++---- ...ple_fast_neon.s => downsample_fast_neon.S} | 21 ++--- ...q12_armv7.s => filter_ar_fast_q12_armv7.S} | 14 +--- .../signal_processing/include/spl_inl_armv7.h | 2 +- ...tions_neon.s => min_max_operations_neon.S} | 76 +++++++------------ .../signal_processing/resample_by_2.c | 8 +- .../signal_processing/signal_processing.gypi | 14 ++-- ..._sqrt_floor_arm.s => spl_sqrt_floor_arm.S} | 5 +- ...eon.s => vector_scaling_operations_neon.S} | 12 +-- .../codecs/isac/fix/source/filterbanks_neon.S | 8 +- .../codecs/isac/fix/source/filters_neon.S | 6 +- .../codecs/isac/fix/source/isacfix.gypi | 2 +- .../codecs/isac/fix/source/lattice_armv7.S | 13 +--- .../codecs/isac/fix/source/lattice_neon.S | 15 +--- .../isac/fix/source/lpc_masking_model_neon.S | 12 +-- .../isac/fix/source/pitch_filter_armv6.S | 10 +-- .../audio_processing/aecm/aecm_core_neon.S | 53 +++++-------- .../audio_processing/aecm/aecm_core_neon.c | 6 +- .../audio_processing/ns/nsx_core_neon.S | 55 ++++---------- .../system_wrappers/interface/asm_defines.h | 32 ++++++++ 22 files changed, 189 insertions(+), 281 deletions(-) rename webrtc/common_audio/signal_processing/{complex_bit_reverse_arm.s => complex_bit_reverse_arm.S} (62%) rename webrtc/common_audio/signal_processing/{cross_correlation_neon.s => cross_correlation_neon.S} 
(95%) rename webrtc/common_audio/signal_processing/{downsample_fast_neon.s => downsample_fast_neon.S} (97%) rename webrtc/common_audio/signal_processing/{filter_ar_fast_q12_armv7.s => filter_ar_fast_q12_armv7.S} (98%) rename webrtc/common_audio/signal_processing/{min_max_operations_neon.s => min_max_operations_neon.S} (87%) rename webrtc/common_audio/signal_processing/{spl_sqrt_floor_arm.s => spl_sqrt_floor_arm.S} (93%) rename webrtc/common_audio/signal_processing/{vector_scaling_operations_neon.s => vector_scaling_operations_neon.S} (92%) create mode 100644 webrtc/system_wrappers/interface/asm_defines.h diff --git a/webrtc/common_audio/signal_processing/Android.mk b/webrtc/common_audio/signal_processing/Android.mk index ecbc5ddf4..aed7e73b5 100644 --- a/webrtc/common_audio/signal_processing/Android.mk +++ b/webrtc/common_audio/signal_processing/Android.mk @@ -60,7 +60,7 @@ LOCAL_C_INCLUDES := \ ifeq ($(ARCH_ARM_HAVE_ARMV7A),true) LOCAL_SRC_FILES += \ - filter_ar_fast_q12_armv7.s + filter_ar_fast_q12_armv7.S else LOCAL_SRC_FILES += \ filter_ar_fast_q12.c @@ -68,8 +68,8 @@ endif ifeq ($(TARGET_ARCH),arm) LOCAL_SRC_FILES += \ - complex_bit_reverse_arm.s \ - spl_sqrt_floor_arm.s + complex_bit_reverse_arm.S \ + spl_sqrt_floor_arm.S else LOCAL_SRC_FILES += \ complex_bit_reverse.c \ @@ -102,10 +102,10 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := libwebrtc_spl_neon LOCAL_MODULE_TAGS := optional LOCAL_SRC_FILES := \ - cross_correlation_neon.s \ - downsample_fast_neon.s \ - min_max_operations_neon.s \ - vector_scaling_operations_neon.s + cross_correlation_neon.S \ + downsample_fast_neon.S \ + min_max_operations_neon.S \ + vector_scaling_operations_neon.S # Flags passed to both C and C++ files. 
LOCAL_CFLAGS := \ diff --git a/webrtc/common_audio/signal_processing/complex_bit_reverse_arm.s b/webrtc/common_audio/signal_processing/complex_bit_reverse_arm.S similarity index 62% rename from webrtc/common_audio/signal_processing/complex_bit_reverse_arm.s rename to webrtc/common_audio/signal_processing/complex_bit_reverse_arm.S index 482807780..e7f8a819b 100644 --- a/webrtc/common_audio/signal_processing/complex_bit_reverse_arm.s +++ b/webrtc/common_audio/signal_processing/complex_bit_reverse_arm.S @@ -12,15 +12,11 @@ @ for ARMv5 platforms. @ Reference C code is in file complex_bit_reverse.c. Bit-exact. -.arch armv5 - -.global WebRtcSpl_ComplexBitReverse +#include "webrtc/system_wrappers/interface/asm_defines.h" +GLOBAL_FUNCTION WebRtcSpl_ComplexBitReverse .align 2 - -WebRtcSpl_ComplexBitReverse: -.fnstart - +DEFINE_FUNCTION WebRtcSpl_ComplexBitReverse push {r4-r7} cmp r1, #7 @@ -88,39 +84,36 @@ END: pop {r4-r7} bx lr -.fnend - - @ The index tables. Note the values are doubles of the actual indexes for 16-bit @ elements, different from the generic C code. It actually provides byte offsets @ for the indexes. .align 2 index_7: @ Indexes for stages == 7. 
- .hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288 - .hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144 - .hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116 - .hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156 - .hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204 - .hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268 - .hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348 - .hword 468, 364, 436, 380, 500, 412, 460, 444, 492 + .short 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288 + .short 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144 + .short 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116 + .short 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156 + .short 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204 + .short 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268 + .short 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348 + .short 468, 364, 436, 380, 500, 412, 460, 444, 492 index_8: @ Indexes for stages == 8. 
- .hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64 - .hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544 - .hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104 - .hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136 - .hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172 - .hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204 - .hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244 - .hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284 - .hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324 - .hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372 - .hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420 - .hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468 - .hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532 - .hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596 - .hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684 - .hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796 - .hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988 + .short 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64 + .short 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544 + .short 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104 + .short 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136 + .short 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172 + .short 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204 + .short 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244 + .short 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284 + .short 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 
456, 316, 968, 324 + .short 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372 + .short 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420 + .short 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468 + .short 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532 + .short 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596 + .short 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684 + .short 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796 + .short 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988 diff --git a/webrtc/common_audio/signal_processing/cross_correlation_neon.s b/webrtc/common_audio/signal_processing/cross_correlation_neon.S similarity index 95% rename from webrtc/common_audio/signal_processing/cross_correlation_neon.s rename to webrtc/common_audio/signal_processing/cross_correlation_neon.S index a18f67223..580440c81 100644 --- a/webrtc/common_audio/signal_processing/cross_correlation_neon.s +++ b/webrtc/common_audio/signal_processing/cross_correlation_neon.S @@ -29,24 +29,18 @@ @ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL @ r8, r9, r10, r11, r12: scratch -.arch armv7-a -.fpu neon +#include "webrtc/system_wrappers/interface/asm_defines.h" +GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon .align 2 -.global WebRtcSpl_CrossCorrelationNeon - -WebRtcSpl_CrossCorrelationNeon: - -.fnstart - -.save {r4-r11} +DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon push {r4-r11} @ Put the shift value (-right_shifts) into a Neon register. ldrsh r10, [sp, #36] rsb r10, r10, #0 mov r8, r10, asr #31 - vmov.32 d16, r10, r8 + vmov d16, r10, r8 @ Initialize loop counters. and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8; @@ -63,7 +57,7 @@ LOOP_DIM_CROSS_CORRELATION: LOOP_DIM_SEQ: vld1.16 {d20, d21}, [r6]! @ seq1_ptr - vld1.16 {d22, d23}, [r5]! @ seq2_ptr + vld1.16 {d22, d23}, [r5]! 
@ seq2_ptr subs r8, r8, #1 vmull.s16 q12, d20, d22 vmull.s16 q13, d21, d23 @@ -105,9 +99,6 @@ POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift. pop {r4-r11} bx lr -.fnend - - @ TODO(kma): Place this piece of reference code into a C code file. @ void WebRtcSpl_CrossCorrelationNeon(WebRtc_Word32* cross_correlation, @ WebRtc_Word16* seq1, @@ -120,15 +111,15 @@ POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift. @ int j = 0; @ int inner_loop_len1 = dim_seq >> 3; @ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3); -@ +@ @ assert(dim_cross_correlation > 0); @ assert(dim_seq > 0); -@ +@ @ for (i = 0; i < dim_cross_correlation; i++) { @ int16_t *seq1_ptr = seq1; @ int16_t *seq2_ptr = seq2 + (step_seq2 * i); @ int64_t sum = 0; -@ +@ @ for (j = inner_loop_len1; j > 0; j -= 1) { @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); @ seq1_ptr++; @@ -155,14 +146,14 @@ POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift. @ seq1_ptr++; @ seq2_ptr++; @ } -@ +@ @ // Calculate the rest of the samples. @ for (j = inner_loop_len2; j > 0; j -= 1) { @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); @ seq1_ptr++; @ seq2_ptr++; @ } -@ +@ @ *cross_correlation++ = (int32_t)(sum >> right_shifts); @ } @ } diff --git a/webrtc/common_audio/signal_processing/downsample_fast_neon.s b/webrtc/common_audio/signal_processing/downsample_fast_neon.S similarity index 97% rename from webrtc/common_audio/signal_processing/downsample_fast_neon.s rename to webrtc/common_audio/signal_processing/downsample_fast_neon.S index 13a825d79..4e348ec64 100644 --- a/webrtc/common_audio/signal_processing/downsample_fast_neon.s +++ b/webrtc/common_audio/signal_processing/downsample_fast_neon.S @@ -14,17 +14,11 @@ @ @ The reference C code is in file downsample_fast.c. Bit-exact. 
-.arch armv7-a -.fpu neon +#include "webrtc/system_wrappers/interface/asm_defines.h" +GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon .align 2 -.global WebRtcSpl_DownsampleFastNeon - -WebRtcSpl_DownsampleFastNeon: - -.fnstart - -.save {r4-r11} +DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon push {r4-r11} cmp r3, #0 @ data_out_length <= 0? @@ -168,14 +162,15 @@ LOOP_COEFF_LENGTH_FACTOR4: vmlal.s16 q3, d18, d17 bge LOOP_COEFF_LENGTH_FACTOR4 + add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8] + add r9, r5, asl #3 @ Counter i = delay + factor * 8. + @ Shift, saturate, and store the result. vqshrn.s32 d0, q2, #12 vqshrn.s32 d1, q3, #12 + cmp r9, r3 @ i < endpos - factor * 7 ? vst1.16 {d0, d1}, [r2]! - add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8] - add r9, r5, asl #3 @ Counter i = delay + factor * 8. - cmp r9, r3 @ i < endpos - factor * 7 ? blt LOOP_ENDPOS_FACTOR4 @ @@ -218,5 +213,3 @@ LOOP2_COEFF_LENGTH: END: pop {r4-r11} bx lr - -.fnend diff --git a/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.s b/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S similarity index 98% rename from webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.s rename to webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S index 5591bb83c..ff60cc619 100644 --- a/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.s +++ b/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S @@ -35,16 +35,11 @@ @ r11: Scratch @ r12: &coefficients[j] -.arch armv7-a +#include "webrtc/system_wrappers/interface/asm_defines.h" +GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12 .align 2 -.global WebRtcSpl_FilterARFastQ12 - -WebRtcSpl_FilterARFastQ12: - -.fnstart - -.save {r4-r11} +DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12 push {r4-r11} ldrsh r12, [sp, #32] @ data_length @@ -155,9 +150,6 @@ END: pop {r4-r11} bx lr -.fnend - - @Reference C code: @ @void WebRtcSpl_FilterARFastQ12(int16_t* data_in, diff --git 
a/webrtc/common_audio/signal_processing/include/spl_inl_armv7.h b/webrtc/common_audio/signal_processing/include/spl_inl_armv7.h index 8461474fb..c9bcc1c23 100644 --- a/webrtc/common_audio/signal_processing/include/spl_inl_armv7.h +++ b/webrtc/common_audio/signal_processing/include/spl_inl_armv7.h @@ -166,7 +166,7 @@ static __inline int WebRtcSpl_NormW16(WebRtc_Word16 a) { static __inline WebRtc_Word16 WebRtcSpl_SatW32ToW16(WebRtc_Word32 value32) { WebRtc_Word16 out16 = 0; - __asm __volatile ("ssat %r0, #16, %r1" : "=r"(out16) : "r"(value32)); + __asm __volatile ("ssat %0, #16, %1" : "=r"(out16) : "r"(value32)); return out16; } diff --git a/webrtc/common_audio/signal_processing/min_max_operations_neon.s b/webrtc/common_audio/signal_processing/min_max_operations_neon.S similarity index 87% rename from webrtc/common_audio/signal_processing/min_max_operations_neon.s rename to webrtc/common_audio/signal_processing/min_max_operations_neon.S index 85dd2fb9d..c84307f5e 100644 --- a/webrtc/common_audio/signal_processing/min_max_operations_neon.s +++ b/webrtc/common_audio/signal_processing/min_max_operations_neon.S @@ -15,20 +15,18 @@ @ The reference C code is in file min_max_operations.c. Code here is basically @ a loop unrolling by 8 with Neon instructions. Bit-exact. 
-.arch armv7-a -.fpu neon -.global WebRtcSpl_MaxAbsValueW16Neon -.global WebRtcSpl_MaxAbsValueW32Neon -.global WebRtcSpl_MaxValueW16Neon -.global WebRtcSpl_MaxValueW32Neon -.global WebRtcSpl_MinValueW16Neon -.global WebRtcSpl_MinValueW32Neon +#include "webrtc/system_wrappers/interface/asm_defines.h" + +GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon +GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon +GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon +GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon +GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon +GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon + .align 2 - @ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length); -WebRtcSpl_MaxAbsValueW16Neon: -.fnstart - +DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon mov r2, #-1 @ Initialize the return value. cmp r0, #0 beq END_MAX_ABS_VALUE_W16 @@ -50,8 +48,8 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16: @ Find the maximum value in the Neon registers and move it to r2. vmax.u16 d24, d25 - vpmax.u16 d24, d24 - vpmax.u16 d24, d24 + vpmax.u16 d24, d24, d24 + vpmax.u16 d24, d24, d24 adds r1, #8 vmov.u16 r2, d24[0] beq END_MAX_ABS_VALUE_W16 @@ -71,12 +69,10 @@ END_MAX_ABS_VALUE_W16: mov r0, r2 bx lr -.fnend + @ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length); -WebRtcSpl_MaxAbsValueW32Neon: -.fnstart - +DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon cmp r0, #0 moveq r0, #-1 beq EXIT @ Return -1 for a NULL pointer. @@ -103,7 +99,7 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32: @ Find the maximum value in the Neon registers and move it to r2. vmax.u32 q12, q11 vmax.u32 d24, d25 - vpmax.u32 d24, d24 + vpmax.u32 d24, d24, d24 adds r1, #8 vmov.u32 r2, d24[0] beq END_MAX_ABS_VALUE_W32 @@ -125,12 +121,8 @@ END_MAX_ABS_VALUE_W32: EXIT: bx lr -.fnend - @ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length); -WebRtcSpl_MaxValueW16Neon: -.fnstart - +DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon mov r2, #0x8000 @ Initialize the return value. 
cmp r0, #0 beq END_MAX_VALUE_W16 @@ -151,8 +143,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W16: @ Find the maximum value in the Neon registers and move it to r2. vmax.s16 d24, d25 - vpmax.s16 d24, d24 - vpmax.s16 d24, d24 + vpmax.s16 d24, d24, d24 + vpmax.s16 d24, d24, d24 adds r1, #8 vmov.u16 r2, d24[0] beq END_MAX_VALUE_W16 @@ -168,12 +160,8 @@ END_MAX_VALUE_W16: mov r0, r2 bx lr -.fnend - @ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length); -WebRtcSpl_MaxValueW32Neon: -.fnstart - +DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon mov r2, #0x80000000 @ Initialize the return value. cmp r0, #0 beq END_MAX_VALUE_W32 @@ -196,8 +184,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W32: @ Find the maximum value in the Neon registers and move it to r2. vmax.s32 q12, q11 - vpmax.s32 d24, d25 - vpmax.s32 d24, d24 + vpmax.s32 d24, d24, d25 + vpmax.s32 d24, d24, d24 adds r1, #8 vmov.s32 r2, d24[0] beq END_MAX_VALUE_W32 @@ -213,12 +201,8 @@ END_MAX_VALUE_W32: mov r0, r2 bx lr -.fnend - @ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length); -WebRtcSpl_MinValueW16Neon: -.fnstart - +DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon movw r2, #0x7FFF @ Initialize the return value. cmp r0, #0 beq END_MIN_VALUE_W16 @@ -239,8 +223,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W16: @ Find the maximum value in the Neon registers and move it to r2. vmin.s16 d24, d25 - vpmin.s16 d24, d24 - vpmin.s16 d24, d24 + vpmin.s16 d24, d24, d24 + vpmin.s16 d24, d24, d24 adds r1, #8 vmov.s16 r2, d24[0] sxth r2, r2 @@ -257,12 +241,8 @@ END_MIN_VALUE_W16: mov r0, r2 bx lr -.fnend - @ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length); -WebRtcSpl_MinValueW32Neon: -.fnstart - +DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon mov r2, #0x7FFFFFFF @ Initialize the return value. cmp r0, #0 beq END_MIN_VALUE_W32 @@ -285,8 +265,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W32: @ Find the maximum value in the Neon registers and move it to r2. 
vmin.s32 q12, q11 - vpmin.s32 d24, d25 - vpmin.s32 d24, d24 + vpmin.s32 d24, d24, d25 + vpmin.s32 d24, d24, d24 adds r1, #8 vmov.s32 r2, d24[0] beq END_MIN_VALUE_W32 @@ -301,5 +281,3 @@ LOOP_MIN_VALUE_W32: END_MIN_VALUE_W32: mov r0, r2 bx lr - -.fnend diff --git a/webrtc/common_audio/signal_processing/resample_by_2.c b/webrtc/common_audio/signal_processing/resample_by_2.c index c1d8b3784..e6692e8a7 100644 --- a/webrtc/common_audio/signal_processing/resample_by_2.c +++ b/webrtc/common_audio/signal_processing/resample_by_2.c @@ -31,8 +31,8 @@ static __inline WebRtc_Word32 MUL_ACCUM_1(WebRtc_Word32 tbl_value, WebRtc_Word32 diff, WebRtc_Word32 state) { WebRtc_Word32 result; - __asm__("smlawb %r0, %r1, %r2, %r3": "=r"(result): "r"(diff), - "r"(tbl_value), "r"(state)); + __asm __volatile ("smlawb %0, %1, %2, %3": "=r"(result): "r"(diff), + "r"(tbl_value), "r"(state)); return result; } @@ -47,8 +47,8 @@ static __inline WebRtc_Word32 MUL_ACCUM_2(WebRtc_Word32 tbl_value, WebRtc_Word32 diff, WebRtc_Word32 state) { WebRtc_Word32 result; - __asm__("smmla %r0, %r1, %r2, %r3": "=r"(result): "r"(diff << 1), - "r"(tbl_value), "r"(state)); + __asm __volatile ("smmla %0, %1, %2, %3": "=r"(result): "r"(diff << 1), + "r"(tbl_value), "r"(state)); return result; } diff --git a/webrtc/common_audio/signal_processing/signal_processing.gypi b/webrtc/common_audio/signal_processing/signal_processing.gypi index b09c767be..91592ea7c 100644 --- a/webrtc/common_audio/signal_processing/signal_processing.gypi +++ b/webrtc/common_audio/signal_processing/signal_processing.gypi @@ -65,8 +65,8 @@ 'conditions': [ ['target_arch=="arm"', { 'sources': [ - 'complex_bit_reverse_arm.s', - 'spl_sqrt_floor_arm.s', + 'complex_bit_reverse_arm.S', + 'spl_sqrt_floor_arm.S', ], 'sources!': [ 'complex_bit_reverse.c', @@ -76,7 +76,7 @@ ['armv7==1', { 'dependencies': ['signal_processing_neon',], 'sources': [ - 'filter_ar_fast_q12_armv7.s', + 'filter_ar_fast_q12_armv7.S', ], 'sources!': [ 'filter_ar_fast_q12.c', @@ 
-112,10 +112,10 @@ 'type': '<(library)', 'includes': ['../../build/arm_neon.gypi',], 'sources': [ - 'cross_correlation_neon.s', - 'downsample_fast_neon.s', - 'min_max_operations_neon.s', - 'vector_scaling_operations_neon.s', + 'cross_correlation_neon.S', + 'downsample_fast_neon.S', + 'min_max_operations_neon.S', + 'vector_scaling_operations_neon.S', ], }, ], diff --git a/webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.s b/webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.S similarity index 93% rename from webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.s rename to webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.S index a2c5b7d0d..c49ef1f3b 100644 --- a/webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.s +++ b/webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.S @@ -8,10 +8,11 @@ @ Output: r0 = INT (SQRT (r0)), precision is 16 bits @ Registers touched: r1, r2 -.global WebRtcSpl_SqrtFloor +#include "webrtc/system_wrappers/interface/asm_defines.h" +GLOBAL_FUNCTION WebRtcSpl_SqrtFloor .align 2 -WebRtcSpl_SqrtFloor: +DEFINE_FUNCTION WebRtcSpl_SqrtFloor mov r1, #3 << 30 mov r2, #1 << 30 diff --git a/webrtc/common_audio/signal_processing/vector_scaling_operations_neon.s b/webrtc/common_audio/signal_processing/vector_scaling_operations_neon.S similarity index 92% rename from webrtc/common_audio/signal_processing/vector_scaling_operations_neon.s rename to webrtc/common_audio/signal_processing/vector_scaling_operations_neon.S index 562425bf0..07db741b0 100644 --- a/webrtc/common_audio/signal_processing/vector_scaling_operations_neon.s +++ b/webrtc/common_audio/signal_processing/vector_scaling_operations_neon.S @@ -13,15 +13,11 @@ @ optimized for ARM Neon platform. Output is bit-exact with the reference @ C code in vector_scaling_operations.c. 
-.arch armv7-a -.fpu neon +#include "webrtc/system_wrappers/interface/asm_defines.h" +GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon .align 2 -.global WebRtcSpl_ScaleAndAddVectorsWithRoundNeon - -WebRtcSpl_ScaleAndAddVectorsWithRoundNeon: -.fnstart - +DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon push {r4-r9} ldr r4, [sp, #32] @ length @@ -84,5 +80,3 @@ LOOP_NO_UNROLLING: END: pop {r4-r9} bx lr - -.fnend diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S index e915fabaf..125a5d1fa 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S @@ -13,9 +13,9 @@ @ WebRtcIsacfix_AllpassFilter2FixDec16Neon() in filterbanks.c. Prototype @ C code is at end of this file. -.arch armv7-a -.fpu neon -.global WebRtcIsacfix_AllpassFilter2FixDec16Neon +#include "webrtc/system_wrappers/interface/asm_defines.h" + +GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon .align 2 @void WebRtcIsacfix_AllpassFilter2FixDec16Neon( @@ -27,7 +27,7 @@ @ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16 @ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16 -WebRtcIsacfix_AllpassFilter2FixDec16Neon: +DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon push {r4 - r7} ldr r5, [sp, #24] @ filter_state_ch2 diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S index feb93c93f..a970333d8 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S @@ -9,9 +9,9 @@ @ @ Reference code in filters.c. Output is bit-exact. 
-#include "settings.h" +#include "webrtc/system_wrappers/interface/asm_defines.h" -.global WebRtcIsacfix_AutocorrNeon +GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon .align 2 @ int WebRtcIsacfix_AutocorrNeon( @@ -21,7 +21,7 @@ @ WebRtc_Word16 order, @ WebRtc_Word16* __restrict scale); -WebRtcIsacfix_AutocorrNeon: +DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon push {r3 - r12} @ Constant initializations diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi index 866e8e621..8b4b51c5d 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi @@ -97,8 +97,8 @@ '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing', ], 'sources': [ - 'filters_neon.S', 'filterbanks_neon.S', + 'filters_neon.S', 'lattice_neon.S', 'lpc_masking_model_neon.S', ], diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_armv7.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_armv7.S index 1cd3a764f..35fd9ef74 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_armv7.S +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_armv7.S @@ -25,16 +25,12 @@ @ r12: constant #16384 @ r6, r7, r8, r10, r11: scratch +#include "webrtc/system_wrappers/interface/asm_defines.h" #include "settings.h" -.arch armv7-a -.global WebRtcIsacfix_FilterArLoop +GLOBAL_FUNCTION WebRtcIsacfix_FilterArLoop .align 2 - -WebRtcIsacfix_FilterArLoop: -.fnstart - -.save {r4-r11} +DEFINE_FUNCTION WebRtcIsacfix_FilterArLoop push {r4-r11} add r1, #2 @ &ar_f_Q0[1] @@ -77,6 +73,3 @@ ORDER_COEF_LOOP: @ for(k = order_coef - 1 ; k >= 0; k--) pop {r4-r11} bx lr - -.fnend - diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S index a59b6e37f..f31a32d9d 100644 --- 
a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S @@ -29,19 +29,12 @@ @ instructions, smulwb, and smull. Speech quality was not degraded by @ testing speech and tone vectors. -.arch armv7-a -.fpu neon - +#include "webrtc/system_wrappers/interface/asm_defines.h" #include "settings.h" -.global WebRtcIsacfix_FilterMaLoopNeon - +GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon .align 2 - -WebRtcIsacfix_FilterMaLoopNeon: -.fnstart - -.save {r4-r8} +DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon push {r4-r8} vdup.32 d28, r0 @ Initialize Neon register with input0 @@ -151,5 +144,3 @@ LAST_SAMPLE: END: pop {r4-r8} bx lr - -.fnend diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S index 20b60d0f4..a5955c27a 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S @@ -12,9 +12,9 @@ @ iSAC codec, optimized for ARM Neon platform. Reference code in @ lpc_masking_model.c. 
-.arch armv7-a -.fpu neon -.global WebRtcIsacfix_CalculateResidualEnergyNeon +#include "webrtc/system_wrappers/interface/asm_defines.h" + +GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon .align 2 @ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order, @@ -23,10 +23,7 @@ @ int16_t* a_polynomial, @ int32_t* corr_coeffs, @ int* q_val_residual_energy); - -WebRtcIsacfix_CalculateResidualEnergyNeon: -.fnstart -.save {r4-r11} +DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon push {r4-r11} sub r13, r13, #16 @@ -173,5 +170,4 @@ GET_SHIFT_NORM: pop {r4-r11} bx r14 -.fnend diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_armv6.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_armv6.S index 7ce3b6f26..ffd0e6338 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_armv6.S +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_armv6.S @@ -13,12 +13,11 @@ @ @ Output is bit-exact with the reference C code in pitch_filter.c. +#include "webrtc/system_wrappers/interface/asm_defines.h" #include "settings.h" -.arch armv6 +GLOBAL_FUNCTION WebRtcIsacfix_PitchFilterCore .align 2 -.global WebRtcIsacfix_PitchFilterCore - @ void WebRtcIsacfix_PitchFilterCore(int loopNumber, @ WebRtc_Word16 gain, @@ -30,9 +29,7 @@ @ WebRtc_Word16* inputBuf, @ WebRtc_Word16* outputBuf, @ int* index2) { - -WebRtcIsacfix_PitchFilterCore: -.fnstart +DEFINE_FUNCTION WebRtcIsacfix_PitchFilterCore push {r4-r11} sub sp, #8 @@ -140,7 +137,6 @@ LOOP: add sp, #8 pop {r4-r11} bx lr -.fnend .align 2 kDampFilter: diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.S b/webrtc/modules/audio_processing/aecm/aecm_core_neon.S index 412c173f6..833575ea0 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.S +++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.S @@ -12,19 +12,17 @@ @ This file contains some functions in AECM, optimized for ARM Neon @ platforms. 
Reference C code is in file aecm_core.c. Bit-exact. -.arch armv7-a -.fpu neon - #include "aecm_defines.h" #include "aecm_core_neon_offsets.h" +#include "webrtc/system_wrappers/interface/asm_defines.h" .extern WebRtcAecm_kSqrtHanning -.global WebRtcAecm_WindowAndFFTNeon -.global WebRtcAecm_InverseFFTAndWindowNeon -.global WebRtcAecm_CalcLinearEnergiesNeon -.global WebRtcAecm_StoreAdaptiveChannelNeon -.global WebRtcAecm_ResetAdaptiveChannelNeon +GLOBAL_FUNCTION WebRtcAecm_WindowAndFFTNeon +GLOBAL_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon +GLOBAL_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon +GLOBAL_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon +GLOBAL_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon @ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm, @ WebRtc_Word16* fft, @@ -32,9 +30,7 @@ @ complex16_t* freq_signal, @ int time_signal_scaling); .align 2 -WebRtcAecm_WindowAndFFTNeon: -.fnstart -.save {r4, r5, r6, lr} +DEFINE_FUNCTION WebRtcAecm_WindowAndFFTNeon push {r4, r5, r6, lr} ldr r12, [sp, #16] @ time_signal_scaling @@ -84,7 +80,6 @@ LOOP_PART_LEN2: bgt LOOP_PART_LEN2 pop {r4, r5, r6, pc} -.fnend @ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, @ WebRtc_Word16* fft, @@ -92,9 +87,7 @@ LOOP_PART_LEN2: @ WebRtc_Word16* output, @ const WebRtc_Word16* nearendClean); .align 2 -WebRtcAecm_InverseFFTAndWindowNeon: -.fnstart -.save {r4-r8, lr} +DEFINE_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon push {r4-r8, lr} @ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT @@ -158,12 +151,12 @@ LOOP_POST_IFFT: vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i] vadd.i32 q8, q10 vmull.s16 q0, d0, d1 - vqshrn.s32 d4, q8, #0 + vqmovn.s32 d16, q8 vshr.s32 q0, q0, #14 vst2.16 {d4, d5}, [r4, :128]! @ &efw[i]; vshl.s32 q0, q0, q9 vst1.16 d16, [r7, :64]! @ output[i] - vqshrn.s32 d0, q0, #0 + vqmovn.s32 d0, q0 subs r3, #1 vst1.16 d0, [r8, :64]! 
@ aecm->outBuf[i] bgt LOOP_POST_IFFT @@ -203,7 +196,6 @@ LOOP_COPY: END: pop {r4-r8, pc} -.fnend @ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, @ const WebRtc_UWord16* far_spectrum, @@ -212,9 +204,7 @@ END: @ WebRtc_UWord32* echo_energy_adapt, @ WebRtc_UWord32* echo_energy_stored); .align 2 -WebRtcAecm_CalcLinearEnergiesNeon: -.fnstart -.save {r4-r7} +DEFINE_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon push {r4-r7} vmov.i32 q14, #0 @@ -274,14 +264,12 @@ LOOP_CALC_LINEAR_ENERGIES: pop {r4-r7} bx lr -.fnend @ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, @ const uint16_t* far_spectrum, @ int32_t* echo_est); .align 2 -WebRtcAecm_StoreAdaptiveChannelNeon: -.fnstart +DEFINE_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon ldr r3, =offset_aecm_channelAdapt16 ldr r12, =offset_aecm_channelStored ldr r3, [r0, r3] @@ -305,12 +293,10 @@ LOOP_STORE_ADAPTIVE_CHANNEL: str r3, [r2] bx lr -.fnend @ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm); .align 2 -WebRtcAecm_ResetAdaptiveChannelNeon: -.fnstart +DEFINE_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon ldr r1, =offset_aecm_channelAdapt16 ldr r2, =offset_aecm_channelAdapt32 movw r3, #offset_aecm_channelStored @@ -334,15 +320,14 @@ LOOP_RESET_ADAPTIVE_CHANNEL: str r0, [r2] bx lr -.fnend @ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning, @ the order was reversed and one useless element (0) was removed. 
.align 3 kSqrtHanningReversed: - .hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947 - .hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571 - .hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335 - .hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370 - .hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101 - .hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399 + .short 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947 + .short 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571 + .short 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335 + .short 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370 + .short 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101 + .short 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399 diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c index 3bbd84b49..d8250ef08 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c +++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c @@ -139,7 +139,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, "vneg.s16 d23, d23\n\t" "vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t" "vrev64.16 q10, q10\n\t" - "vst2.16 {q10}, [%[p_fft_offset], %[offset]]\n\t" + "vst2.16 {q10}, [%[p_fft_offset]], %[offset]\n\t" :[p_efw]"+r"(p_efw), [p_fft]"+r"(p_fft), [p_fft_offset]"+r"(p_fft_offset) @@ -181,7 +181,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); __asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); __asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); - __asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm 
__volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real)); __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); @@ -196,7 +196,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); // aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); - __asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); } diff --git a/webrtc/modules/audio_processing/ns/nsx_core_neon.S b/webrtc/modules/audio_processing/ns/nsx_core_neon.S index 4e15959cd..cea75532a 100644 --- a/webrtc/modules/audio_processing/ns/nsx_core_neon.S +++ b/webrtc/modules/audio_processing/ns/nsx_core_neon.S @@ -12,18 +12,16 @@ @ This file contains some functions in NS, optimized for ARM Neon @ platforms. Reference C code is in file nsx_core.c. Bit-exact. 
-.arch armv7-a -.fpu neon - +#include "webrtc/system_wrappers/interface/asm_defines.h" #include "nsx_defines.h" #include "nsx_core_neon_offsets.h" -.global WebRtcNsx_NoiseEstimationNeon -.global WebRtcNsx_PrepareSpectrumNeon -.global WebRtcNsx_SynthesisUpdateNeon -.global WebRtcNsx_AnalysisUpdateNeon -.global WebRtcNsx_DenormalizeNeon -.global WebRtcNsx_CreateComplexBufferNeon +GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon +GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon +GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon +GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon +GLOBAL_FUNCTION WebRtcNsx_DenormalizeNeon +GLOBAL_FUNCTION WebRtcNsx_CreateComplexBufferNeon @ void NoiseEstimationNeon(NsxInst_t* inst, @ uint16_t* magn, @@ -42,12 +40,7 @@ @ r11: countDiv @ r12: i, the loop counter for LOOP_NOISEESTIMATION_MAGNLEN_INNER -WebRtcNsx_NoiseEstimationNeon: -.fnstart -.save {r4-r11, r14} -.vsave {d8-d15} -.pad #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8) - +DEFINE_FUNCTION WebRtcNsx_NoiseEstimationNeon push {r4-r11, r14} vpush {d8-d15} sub sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8) @@ -312,14 +305,10 @@ UPDATE_Q_NOISE: add sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8) vpop {d8-d15} pop {r4-r11, pc} -.fnend @ static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset); @ Neon registers touched: q0-q3, q8-q13. 
-UpdateNoiseEstimateNeon: -.fnstart -.save {r4, r5, r6, r14} - +DEFINE_FUNCTION UpdateNoiseEstimateNeon push {r4, r5, r6, r14} mov r5, r0 @@ -385,13 +374,9 @@ POST_LOOP_MAGNLEN: strh r3, [r2] pop {r4, r5, r6, pc} -.fnend @ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf); -WebRtcNsx_PrepareSpectrumNeon: -.fnstart -.save {r4-r8} - +DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon push {r4-r8} movw r2, #offset_nsx_real @@ -478,11 +463,9 @@ LOOP_ANALEN2: pop {r4-r8} bx r14 -.fnend @ void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor); -WebRtcNsx_DenormalizeNeon: -.fnstart +DEFINE_FUNCTION WebRtcNsx_DenormalizeNeon movw r12, #offset_nsx_normData movw r3, #offset_nsx_real ldr r12, [r0, r12] @ inst->normData @@ -508,14 +491,11 @@ LOOP_ANALEN: blt LOOP_ANALEN bx r14 -.fnend @ void SynthesisUpdateNeon(NsxInst_t* inst, @ int16_t* out_frame, @ int16_t gain_factor); -WebRtcNsx_SynthesisUpdateNeon: -.fnstart -.save {r4, r5} +DEFINE_FUNCTION WebRtcNsx_SynthesisUpdateNeon push {r4, r5} vdup.16 d31, r2 @@ -586,12 +566,8 @@ EXIT_SYNTHESISUPDATE: pop {r4, r5} bx r14 -.fnend - @ void AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech); -WebRtcNsx_AnalysisUpdateNeon: -.fnstart -.save {r4-r6} +DEFINE_FUNCTION WebRtcNsx_AnalysisUpdateNeon push {r4-r6} movw r3, #offset_nsx_analysisBuffer @@ -647,11 +623,9 @@ LOOP_WINDOW_DATA: POST_LOOP_WINDOW_DATA: pop {r4-r6} bx r14 -.fnend @ void CreateComplexBufferNeon(NsxInst_t* inst, int16_t* in, int16_t* out); -WebRtcNsx_CreateComplexBufferNeon: -.fnstart +DEFINE_FUNCTION WebRtcNsx_CreateComplexBufferNeon movw r3, #offset_nsx_anaLen movw r12, #offset_nsx_normData ldrsh r3, [r0, r3] @ inst->anaLen @@ -678,4 +652,3 @@ LOOP_CREATE_COMPLEX_BUFFER: @ Unrolled by 16. 
blt LOOP_CREATE_COMPLEX_BUFFER bx r14 -.fnend diff --git a/webrtc/system_wrappers/interface/asm_defines.h b/webrtc/system_wrappers/interface/asm_defines.h new file mode 100644 index 000000000..9ef6c8f3b --- /dev/null +++ b/webrtc/system_wrappers/interface/asm_defines.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_ +#define WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_ + +// Define the macros used in ARM assembly code, so that for Mac or iOS builds +// we add leading underscores for the function names. +#ifdef __APPLE__ +.macro GLOBAL_FUNCTION name +.global _\name +.endm +.macro DEFINE_FUNCTION name +_\name: +.endm +#else +.macro GLOBAL_FUNCTION name +.global \name +.endm +.macro DEFINE_FUNCTION name +\name: +.endm +#endif + +#endif // WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_