From 55cd78cfc25f135149b780dcf527d147d5621ba2 Mon Sep 17 00:00:00 2001
From: "kma@webrtc.org" <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>
Date: Sat, 17 Nov 2012 00:22:46 +0000
Subject: [PATCH] Porting ARM optimization from Android to iOS. Tested APM and
 iSAC in Android. Bit-exact with original versions. Changes include removing
 or changing some GCC directives (e.g. .fnstart, .hword), instruction syntax,
 etc. Review URL: https://webrtc-codereview.appspot.com/934009

git-svn-id: http://webrtc.googlecode.com/svn/trunk@3124 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 .../common_audio/signal_processing/Android.mk | 14 ++--
 ...everse_arm.s => complex_bit_reverse_arm.S} | 63 +++++++--------
 ...lation_neon.s => cross_correlation_neon.S} | 29 +++----
 ...ple_fast_neon.s => downsample_fast_neon.S} | 21 ++---
 ...q12_armv7.s => filter_ar_fast_q12_armv7.S} | 14 +---
 .../signal_processing/include/spl_inl_armv7.h |  2 +-
 ...tions_neon.s => min_max_operations_neon.S} | 76 +++++++------------
 .../signal_processing/resample_by_2.c         |  8 +-
 .../signal_processing/signal_processing.gypi  | 14 ++--
 ..._sqrt_floor_arm.s => spl_sqrt_floor_arm.S} |  5 +-
 ...eon.s => vector_scaling_operations_neon.S} | 12 +--
 .../codecs/isac/fix/source/filterbanks_neon.S |  8 +-
 .../codecs/isac/fix/source/filters_neon.S     |  6 +-
 .../codecs/isac/fix/source/isacfix.gypi       |  2 +-
 .../codecs/isac/fix/source/lattice_armv7.S    | 13 +---
 .../codecs/isac/fix/source/lattice_neon.S     | 15 +---
 .../isac/fix/source/lpc_masking_model_neon.S  | 12 +--
 .../isac/fix/source/pitch_filter_armv6.S      | 10 +--
 .../audio_processing/aecm/aecm_core_neon.S    | 53 +++++--------
 .../audio_processing/aecm/aecm_core_neon.c    |  6 +-
 .../audio_processing/ns/nsx_core_neon.S       | 55 ++++----------
 .../system_wrappers/interface/asm_defines.h   | 32 ++++++++
 22 files changed, 189 insertions(+), 281 deletions(-)
 rename webrtc/common_audio/signal_processing/{complex_bit_reverse_arm.s => complex_bit_reverse_arm.S} (62%)
 rename webrtc/common_audio/signal_processing/{cross_correlation_neon.s => cross_correlation_neon.S} (95%)
 rename webrtc/common_audio/signal_processing/{downsample_fast_neon.s => downsample_fast_neon.S} (97%)
 rename webrtc/common_audio/signal_processing/{filter_ar_fast_q12_armv7.s => filter_ar_fast_q12_armv7.S} (98%)
 rename webrtc/common_audio/signal_processing/{min_max_operations_neon.s => min_max_operations_neon.S} (87%)
 rename webrtc/common_audio/signal_processing/{spl_sqrt_floor_arm.s => spl_sqrt_floor_arm.S} (93%)
 rename webrtc/common_audio/signal_processing/{vector_scaling_operations_neon.s => vector_scaling_operations_neon.S} (92%)
 create mode 100644 webrtc/system_wrappers/interface/asm_defines.h

diff --git a/webrtc/common_audio/signal_processing/Android.mk b/webrtc/common_audio/signal_processing/Android.mk
index ecbc5ddf4..aed7e73b5 100644
--- a/webrtc/common_audio/signal_processing/Android.mk
+++ b/webrtc/common_audio/signal_processing/Android.mk
@@ -60,7 +60,7 @@ LOCAL_C_INCLUDES := \
 
 ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
 LOCAL_SRC_FILES += \
-    filter_ar_fast_q12_armv7.s
+    filter_ar_fast_q12_armv7.S
 else
 LOCAL_SRC_FILES += \
     filter_ar_fast_q12.c
@@ -68,8 +68,8 @@ endif
 
 ifeq ($(TARGET_ARCH),arm)
 LOCAL_SRC_FILES += \
-    complex_bit_reverse_arm.s \
-    spl_sqrt_floor_arm.s
+    complex_bit_reverse_arm.S \
+    spl_sqrt_floor_arm.S
 else
 LOCAL_SRC_FILES += \
     complex_bit_reverse.c \
@@ -102,10 +102,10 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_MODULE := libwebrtc_spl_neon
 LOCAL_MODULE_TAGS := optional
 LOCAL_SRC_FILES := \
-    cross_correlation_neon.s \
-    downsample_fast_neon.s \
-    min_max_operations_neon.s \
-    vector_scaling_operations_neon.s
+    cross_correlation_neon.S \
+    downsample_fast_neon.S \
+    min_max_operations_neon.S \
+    vector_scaling_operations_neon.S
 
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
diff --git a/webrtc/common_audio/signal_processing/complex_bit_reverse_arm.s b/webrtc/common_audio/signal_processing/complex_bit_reverse_arm.S
similarity index 62%
rename from webrtc/common_audio/signal_processing/complex_bit_reverse_arm.s
rename to webrtc/common_audio/signal_processing/complex_bit_reverse_arm.S
index 482807780..e7f8a819b 100644
--- a/webrtc/common_audio/signal_processing/complex_bit_reverse_arm.s
+++ b/webrtc/common_audio/signal_processing/complex_bit_reverse_arm.S
@@ -12,15 +12,11 @@
 @ for ARMv5 platforms.
 @ Reference C code is in file complex_bit_reverse.c. Bit-exact.
 
-.arch armv5
-
-.global WebRtcSpl_ComplexBitReverse
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 
+GLOBAL_FUNCTION WebRtcSpl_ComplexBitReverse
 .align  2
-
-WebRtcSpl_ComplexBitReverse:
-.fnstart
-
+DEFINE_FUNCTION WebRtcSpl_ComplexBitReverse
   push {r4-r7}
 
   cmp r1, #7
@@ -88,39 +84,36 @@ END:
   pop {r4-r7}
   bx lr
 
-.fnend
-
-
 @ The index tables. Note the values are doubles of the actual indexes for 16-bit
 @ elements, different from the generic C code. It actually provides byte offsets
 @ for the indexes.
 
 .align  2
 index_7:  @ Indexes for stages == 7.
-  .hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
-  .hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
-  .hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
-  .hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
-  .hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
-  .hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
-  .hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
-  .hword 468, 364, 436, 380, 500, 412, 460, 444, 492
+  .short 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
+  .short 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
+  .short 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
+  .short 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
+  .short 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
+  .short 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
+  .short 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
+  .short 468, 364, 436, 380, 500, 412, 460, 444, 492
 
 index_8:  @ Indexes for stages == 8.
-  .hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
-  .hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
-  .hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
-  .hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
-  .hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
-  .hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
-  .hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
-  .hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
-  .hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
-  .hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
-  .hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
-  .hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
-  .hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
-  .hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
-  .hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
-  .hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
-  .hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988
+  .short 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
+  .short 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
+  .short 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
+  .short 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
+  .short 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
+  .short 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
+  .short 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
+  .short 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
+  .short 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
+  .short 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
+  .short 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
+  .short 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
+  .short 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
+  .short 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
+  .short 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
+  .short 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
+  .short 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988
diff --git a/webrtc/common_audio/signal_processing/cross_correlation_neon.s b/webrtc/common_audio/signal_processing/cross_correlation_neon.S
similarity index 95%
rename from webrtc/common_audio/signal_processing/cross_correlation_neon.s
rename to webrtc/common_audio/signal_processing/cross_correlation_neon.S
index a18f67223..580440c81 100644
--- a/webrtc/common_audio/signal_processing/cross_correlation_neon.s
+++ b/webrtc/common_audio/signal_processing/cross_correlation_neon.S
@@ -29,24 +29,18 @@
 @ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
 @ r8, r9, r10, r11, r12: scratch
 
-.arch armv7-a
-.fpu neon
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 
+GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
 .align  2
-.global WebRtcSpl_CrossCorrelationNeon
-
-WebRtcSpl_CrossCorrelationNeon:
-
-.fnstart
-
-.save {r4-r11}
+DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
   push {r4-r11}
 
   @ Put the shift value (-right_shifts) into a Neon register.
   ldrsh r10, [sp, #36]
   rsb r10, r10, #0
   mov r8, r10, asr #31
-  vmov.32 d16, r10, r8
+  vmov d16, r10, r8
 
   @ Initialize loop counters.
   and r7, r3, #7              @ inner_loop_len2 = dim_seq % 8;
@@ -63,7 +57,7 @@ LOOP_DIM_CROSS_CORRELATION:
 
 LOOP_DIM_SEQ:
   vld1.16 {d20, d21}, [r6]!   @ seq1_ptr
-  vld1.16 {d22, d23}, [r5]!   @ seq2_ptr 
+  vld1.16 {d22, d23}, [r5]!   @ seq2_ptr
   subs r8, r8, #1
   vmull.s16 q12, d20, d22
   vmull.s16 q13, d21, d23
@@ -105,9 +99,6 @@ POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
   pop {r4-r11}
   bx  lr
 
-.fnend
-
-
 @ TODO(kma): Place this piece of reference code into a C code file.
 @ void WebRtcSpl_CrossCorrelationNeon(WebRtc_Word32* cross_correlation,
 @                                     WebRtc_Word16* seq1,
@@ -120,15 +111,15 @@ POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
 @   int j = 0;
 @   int inner_loop_len1 = dim_seq >> 3;
 @   int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
-@ 
+@
 @   assert(dim_cross_correlation > 0);
 @   assert(dim_seq > 0);
-@ 
+@
 @   for (i = 0; i < dim_cross_correlation; i++) {
 @     int16_t *seq1_ptr = seq1;
 @     int16_t *seq2_ptr = seq2 + (step_seq2 * i);
 @     int64_t sum = 0;
-@ 
+@
 @     for (j = inner_loop_len1; j > 0; j -= 1) {
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
@@ -155,14 +146,14 @@ POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
 @       seq1_ptr++;
 @       seq2_ptr++;
 @     }
-@ 
+@
 @     // Calculate the rest of the samples.
 @     for (j = inner_loop_len2; j > 0; j -= 1) {
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @     }
-@ 
+@
 @     *cross_correlation++ = (int32_t)(sum >> right_shifts);
 @   }
 @ }
diff --git a/webrtc/common_audio/signal_processing/downsample_fast_neon.s b/webrtc/common_audio/signal_processing/downsample_fast_neon.S
similarity index 97%
rename from webrtc/common_audio/signal_processing/downsample_fast_neon.s
rename to webrtc/common_audio/signal_processing/downsample_fast_neon.S
index 13a825d79..4e348ec64 100644
--- a/webrtc/common_audio/signal_processing/downsample_fast_neon.s
+++ b/webrtc/common_audio/signal_processing/downsample_fast_neon.S
@@ -14,17 +14,11 @@
 @
 @ The reference C code is in file downsample_fast.c. Bit-exact.
 
-.arch armv7-a
-.fpu neon
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 
+GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
 .align  2
-.global WebRtcSpl_DownsampleFastNeon
-
-WebRtcSpl_DownsampleFastNeon:
-
-.fnstart
-
-.save {r4-r11}
+DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
   push {r4-r11}
 
   cmp r3, #0                                @ data_out_length <= 0?
@@ -168,14 +162,15 @@ LOOP_COEFF_LENGTH_FACTOR4:
   vmlal.s16 q3, d18, d17
   bge LOOP_COEFF_LENGTH_FACTOR4
 
+  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
+  add r9, r5, asl #3                        @ Counter i = delay + factor * 8.
+
   @ Shift, saturate, and store the result.
   vqshrn.s32 d0, q2, #12
   vqshrn.s32 d1, q3, #12
+  cmp r9, r3                                @ i < endpos - factor * 7 ?
   vst1.16 {d0, d1}, [r2]!
 
-  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
-  add r9, r5, asl #3                        @ Counter i = delay + factor * 8.
-  cmp r9, r3                                @ i < endpos - factor * 7 ?
   blt LOOP_ENDPOS_FACTOR4
 
 @
@@ -218,5 +213,3 @@ LOOP2_COEFF_LENGTH:
 END:
   pop {r4-r11}
   bx  lr
-
-.fnend
diff --git a/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.s b/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S
similarity index 98%
rename from webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.s
rename to webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S
index 5591bb83c..ff60cc619 100644
--- a/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.s
+++ b/webrtc/common_audio/signal_processing/filter_ar_fast_q12_armv7.S
@@ -35,16 +35,11 @@
 @ r11: Scratch
 @ r12: &coefficients[j]
 
-.arch armv7-a
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 
+GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
 .align  2
-.global WebRtcSpl_FilterARFastQ12
-
-WebRtcSpl_FilterARFastQ12:
-
-.fnstart
-
-.save {r4-r11}
+DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
   push {r4-r11}
 
   ldrsh r12, [sp, #32]         @ data_length
@@ -155,9 +150,6 @@ END:
   pop {r4-r11}
   bx  lr
 
-.fnend
-
-
 @Reference C code:
 @
 @void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
diff --git a/webrtc/common_audio/signal_processing/include/spl_inl_armv7.h b/webrtc/common_audio/signal_processing/include/spl_inl_armv7.h
index 8461474fb..c9bcc1c23 100644
--- a/webrtc/common_audio/signal_processing/include/spl_inl_armv7.h
+++ b/webrtc/common_audio/signal_processing/include/spl_inl_armv7.h
@@ -166,7 +166,7 @@ static __inline int WebRtcSpl_NormW16(WebRtc_Word16 a) {
 static __inline WebRtc_Word16 WebRtcSpl_SatW32ToW16(WebRtc_Word32 value32) {
   WebRtc_Word16 out16 = 0;
 
-  __asm __volatile ("ssat %r0, #16, %r1" : "=r"(out16) : "r"(value32));
+  __asm __volatile ("ssat %0, #16, %1" : "=r"(out16) : "r"(value32));
 
   return out16;
 }
diff --git a/webrtc/common_audio/signal_processing/min_max_operations_neon.s b/webrtc/common_audio/signal_processing/min_max_operations_neon.S
similarity index 87%
rename from webrtc/common_audio/signal_processing/min_max_operations_neon.s
rename to webrtc/common_audio/signal_processing/min_max_operations_neon.S
index 85dd2fb9d..c84307f5e 100644
--- a/webrtc/common_audio/signal_processing/min_max_operations_neon.s
+++ b/webrtc/common_audio/signal_processing/min_max_operations_neon.S
@@ -15,20 +15,18 @@
 @ The reference C code is in file min_max_operations.c. Code here is basically
 @ a loop unrolling by 8 with Neon instructions. Bit-exact.
 
-.arch armv7-a
-.fpu neon
-.global WebRtcSpl_MaxAbsValueW16Neon
-.global WebRtcSpl_MaxAbsValueW32Neon
-.global WebRtcSpl_MaxValueW16Neon
-.global WebRtcSpl_MaxValueW32Neon
-.global WebRtcSpl_MinValueW16Neon
-.global WebRtcSpl_MinValueW32Neon
+#include "webrtc/system_wrappers/interface/asm_defines.h"
+
+GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
+GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
+GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
+GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
+GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
+GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
+
 .align  2
-
 @ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
-WebRtcSpl_MaxAbsValueW16Neon:
-.fnstart
-
+DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
   mov r2, #-1                 @ Initialize the return value.
   cmp r0, #0
   beq END_MAX_ABS_VALUE_W16
@@ -50,8 +48,8 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
 
   @ Find the maximum value in the Neon registers and move it to r2.
   vmax.u16 d24, d25
-  vpmax.u16 d24, d24
-  vpmax.u16 d24, d24
+  vpmax.u16 d24, d24, d24
+  vpmax.u16 d24, d24, d24
   adds r1, #8
   vmov.u16 r2, d24[0]
   beq END_MAX_ABS_VALUE_W16
@@ -71,12 +69,10 @@ END_MAX_ABS_VALUE_W16:
   mov r0, r2
   bx  lr
 
-.fnend
+
 
 @ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
-WebRtcSpl_MaxAbsValueW32Neon:
-.fnstart
-
+DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
   cmp r0, #0
   moveq r0, #-1
   beq EXIT                    @ Return -1 for a NULL pointer.
@@ -103,7 +99,7 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
   @ Find the maximum value in the Neon registers and move it to r2.
   vmax.u32 q12, q11
   vmax.u32 d24, d25
-  vpmax.u32 d24, d24
+  vpmax.u32 d24, d24, d24
   adds r1, #8
   vmov.u32 r2, d24[0]
   beq END_MAX_ABS_VALUE_W32
@@ -125,12 +121,8 @@ END_MAX_ABS_VALUE_W32:
 EXIT:
   bx  lr
 
-.fnend
-
 @ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
-WebRtcSpl_MaxValueW16Neon:
-.fnstart
-
+DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
   mov r2, #0x8000             @ Initialize the return value.
   cmp r0, #0
   beq END_MAX_VALUE_W16
@@ -151,8 +143,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
 
   @ Find the maximum value in the Neon registers and move it to r2.
   vmax.s16 d24, d25
-  vpmax.s16 d24, d24
-  vpmax.s16 d24, d24
+  vpmax.s16 d24, d24, d24
+  vpmax.s16 d24, d24, d24
   adds r1, #8
   vmov.u16 r2, d24[0]
   beq END_MAX_VALUE_W16
@@ -168,12 +160,8 @@ END_MAX_VALUE_W16:
   mov r0, r2
   bx  lr
 
-.fnend
-
 @ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
-WebRtcSpl_MaxValueW32Neon:
-.fnstart
-
+DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
   mov r2, #0x80000000         @ Initialize the return value.
   cmp r0, #0
   beq END_MAX_VALUE_W32
@@ -196,8 +184,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
 
   @ Find the maximum value in the Neon registers and move it to r2.
   vmax.s32 q12, q11
-  vpmax.s32 d24, d25
-  vpmax.s32 d24, d24
+  vpmax.s32 d24, d24, d25
+  vpmax.s32 d24, d24, d24
   adds r1, #8
   vmov.s32 r2, d24[0]
   beq END_MAX_VALUE_W32
@@ -213,12 +201,8 @@ END_MAX_VALUE_W32:
   mov r0, r2
   bx  lr
 
-.fnend
-
 @ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
-WebRtcSpl_MinValueW16Neon:
-.fnstart
-
+DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
   movw r2, #0x7FFF            @ Initialize the return value.
   cmp r0, #0
   beq END_MIN_VALUE_W16
@@ -239,8 +223,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
 
   @ Find the maximum value in the Neon registers and move it to r2.
   vmin.s16 d24, d25
-  vpmin.s16 d24, d24
-  vpmin.s16 d24, d24
+  vpmin.s16 d24, d24, d24
+  vpmin.s16 d24, d24, d24
   adds r1, #8
   vmov.s16 r2, d24[0]
   sxth  r2, r2
@@ -257,12 +241,8 @@ END_MIN_VALUE_W16:
   mov r0, r2
   bx  lr
 
-.fnend
-
 @ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
-WebRtcSpl_MinValueW32Neon:
-.fnstart
-
+DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
   mov r2, #0x7FFFFFFF         @ Initialize the return value.
   cmp r0, #0
   beq END_MIN_VALUE_W32
@@ -285,8 +265,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
 
   @ Find the maximum value in the Neon registers and move it to r2.
   vmin.s32 q12, q11
-  vpmin.s32 d24, d25
-  vpmin.s32 d24, d24
+  vpmin.s32 d24, d24, d25
+  vpmin.s32 d24, d24, d24
   adds r1, #8
   vmov.s32 r2, d24[0]
   beq END_MIN_VALUE_W32
@@ -301,5 +281,3 @@ LOOP_MIN_VALUE_W32:
 END_MIN_VALUE_W32:
   mov r0, r2
   bx  lr
-
-.fnend
diff --git a/webrtc/common_audio/signal_processing/resample_by_2.c b/webrtc/common_audio/signal_processing/resample_by_2.c
index c1d8b3784..e6692e8a7 100644
--- a/webrtc/common_audio/signal_processing/resample_by_2.c
+++ b/webrtc/common_audio/signal_processing/resample_by_2.c
@@ -31,8 +31,8 @@ static __inline WebRtc_Word32 MUL_ACCUM_1(WebRtc_Word32 tbl_value,
                                           WebRtc_Word32 diff,
                                           WebRtc_Word32 state) {
   WebRtc_Word32 result;
-  __asm__("smlawb %r0, %r1, %r2, %r3": "=r"(result): "r"(diff),
-                                       "r"(tbl_value), "r"(state));
+  __asm __volatile ("smlawb %0, %1, %2, %3": "=r"(result): "r"(diff),
+                                   "r"(tbl_value), "r"(state));
   return result;
 }
 
@@ -47,8 +47,8 @@ static __inline WebRtc_Word32 MUL_ACCUM_2(WebRtc_Word32 tbl_value,
                                           WebRtc_Word32 diff,
                                           WebRtc_Word32 state) {
   WebRtc_Word32 result;
-  __asm__("smmla %r0, %r1, %r2, %r3": "=r"(result): "r"(diff << 1),
-                                      "r"(tbl_value), "r"(state));
+  __asm __volatile ("smmla %0, %1, %2, %3": "=r"(result): "r"(diff << 1),
+                                  "r"(tbl_value), "r"(state));
   return result;
 }
 
diff --git a/webrtc/common_audio/signal_processing/signal_processing.gypi b/webrtc/common_audio/signal_processing/signal_processing.gypi
index b09c767be..91592ea7c 100644
--- a/webrtc/common_audio/signal_processing/signal_processing.gypi
+++ b/webrtc/common_audio/signal_processing/signal_processing.gypi
@@ -65,8 +65,8 @@
       'conditions': [
         ['target_arch=="arm"', {
           'sources': [
-            'complex_bit_reverse_arm.s',
-            'spl_sqrt_floor_arm.s',
+            'complex_bit_reverse_arm.S',
+            'spl_sqrt_floor_arm.S',
           ],
           'sources!': [
             'complex_bit_reverse.c',
@@ -76,7 +76,7 @@
             ['armv7==1', {
               'dependencies': ['signal_processing_neon',],
               'sources': [
-                'filter_ar_fast_q12_armv7.s',
+                'filter_ar_fast_q12_armv7.S',
               ],
               'sources!': [
                 'filter_ar_fast_q12.c',
@@ -112,10 +112,10 @@
           'type': '<(library)',
           'includes': ['../../build/arm_neon.gypi',],
           'sources': [
-            'cross_correlation_neon.s',
-            'downsample_fast_neon.s',
-            'min_max_operations_neon.s',
-            'vector_scaling_operations_neon.s',
+            'cross_correlation_neon.S',
+            'downsample_fast_neon.S',
+            'min_max_operations_neon.S',
+            'vector_scaling_operations_neon.S',
           ],
         },
       ],
diff --git a/webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.s b/webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.S
similarity index 93%
rename from webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.s
rename to webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.S
index a2c5b7d0d..c49ef1f3b 100644
--- a/webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.s
+++ b/webrtc/common_audio/signal_processing/spl_sqrt_floor_arm.S
@@ -8,10 +8,11 @@
 @ Output:             r0 = INT (SQRT (r0)), precision is 16 bits
 @ Registers touched:  r1, r2
 
-.global WebRtcSpl_SqrtFloor
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 
+GLOBAL_FUNCTION WebRtcSpl_SqrtFloor
 .align  2
-WebRtcSpl_SqrtFloor:
+DEFINE_FUNCTION WebRtcSpl_SqrtFloor
   mov    r1, #3 << 30
   mov    r2, #1 << 30
 
diff --git a/webrtc/common_audio/signal_processing/vector_scaling_operations_neon.s b/webrtc/common_audio/signal_processing/vector_scaling_operations_neon.S
similarity index 92%
rename from webrtc/common_audio/signal_processing/vector_scaling_operations_neon.s
rename to webrtc/common_audio/signal_processing/vector_scaling_operations_neon.S
index 562425bf0..07db741b0 100644
--- a/webrtc/common_audio/signal_processing/vector_scaling_operations_neon.s
+++ b/webrtc/common_audio/signal_processing/vector_scaling_operations_neon.S
@@ -13,15 +13,11 @@
 @ optimized for ARM Neon platform. Output is bit-exact with the reference
 @ C code in vector_scaling_operations.c.
 
-.arch armv7-a
-.fpu neon
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 
+GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
 .align  2
-.global WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
-
-WebRtcSpl_ScaleAndAddVectorsWithRoundNeon:
-.fnstart
-
+DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
   push {r4-r9}
 
   ldr r4, [sp, #32]           @ length
@@ -84,5 +80,3 @@ LOOP_NO_UNROLLING:
 END:
   pop {r4-r9}
   bx  lr
-
-.fnend
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
index e915fabaf..125a5d1fa 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
@@ -13,9 +13,9 @@
 @ WebRtcIsacfix_AllpassFilter2FixDec16Neon() in filterbanks.c. Prototype
 @ C code is at end of this file.
 
-.arch armv7-a
-.fpu neon
-.global WebRtcIsacfix_AllpassFilter2FixDec16Neon
+#include "webrtc/system_wrappers/interface/asm_defines.h"
+
+GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
 .align  2
 
 @void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
@@ -27,7 +27,7 @@
 @    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
 @    int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
 
-WebRtcIsacfix_AllpassFilter2FixDec16Neon:
+DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
   push {r4 - r7}
 
   ldr r5, [sp, #24]           @ filter_state_ch2
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
index feb93c93f..a970333d8 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
@@ -9,9 +9,9 @@
 @
 @ Reference code in filters.c. Output is bit-exact.
 
-#include "settings.h"
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 
-.global WebRtcIsacfix_AutocorrNeon
+GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
 .align  2
 
 @ int WebRtcIsacfix_AutocorrNeon(
@@ -21,7 +21,7 @@
 @     WebRtc_Word16 order,
 @     WebRtc_Word16* __restrict scale);
 
-WebRtcIsacfix_AutocorrNeon:
+DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
   push       {r3 - r12}
 
   @ Constant initializations
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
index 866e8e621..8b4b51c5d 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
@@ -97,8 +97,8 @@
             '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
           ],
           'sources': [
-            'filters_neon.S',
             'filterbanks_neon.S',
+            'filters_neon.S',
             'lattice_neon.S',
             'lpc_masking_model_neon.S',
           ],
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_armv7.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_armv7.S
index 1cd3a764f..35fd9ef74 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_armv7.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_armv7.S
@@ -25,16 +25,12 @@
 @ r12: constant #16384
 @ r6, r7, r8, r10, r11: scratch
 
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 #include "settings.h"
 
-.arch armv7-a
-.global WebRtcIsacfix_FilterArLoop
+GLOBAL_FUNCTION WebRtcIsacfix_FilterArLoop
 .align  2
-
-WebRtcIsacfix_FilterArLoop:
-.fnstart
-
-.save {r4-r11}
+DEFINE_FUNCTION WebRtcIsacfix_FilterArLoop
   push    {r4-r11}
 
   add     r1, #2                 @ &ar_f_Q0[1]
@@ -77,6 +73,3 @@ ORDER_COEF_LOOP:  @ for(k = order_coef - 1 ; k >= 0; k--)
 
   pop     {r4-r11}
   bx      lr
-
-.fnend
-
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S
index a59b6e37f..f31a32d9d 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S
@@ -29,19 +29,12 @@
 @ instructions, smulwb, and smull. Speech quality was not degraded by
 @ testing speech and tone vectors.
 
-.arch armv7-a
-.fpu neon
-
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 #include "settings.h"
 
-.global WebRtcIsacfix_FilterMaLoopNeon
-
+GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
 .align  2
-
-WebRtcIsacfix_FilterMaLoopNeon:
-.fnstart
-
-.save {r4-r8}
+DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
   push        {r4-r8}
 
   vdup.32     d28, r0             @ Initialize Neon register with input0
@@ -151,5 +144,3 @@ LAST_SAMPLE:
 END:
   pop         {r4-r8}
   bx          lr
-
-.fnend
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S
index 20b60d0f4..a5955c27a 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S
@@ -12,9 +12,9 @@
 @ iSAC codec, optimized for ARM Neon platform. Reference code in
 @ lpc_masking_model.c.
 
-.arch armv7-a
-.fpu neon
-.global WebRtcIsacfix_CalculateResidualEnergyNeon
+#include "webrtc/system_wrappers/interface/asm_defines.h"
+
+GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
 .align  2
 
 @ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
@@ -23,10 +23,7 @@
 @                                                   int16_t* a_polynomial,
 @                                                   int32_t* corr_coeffs,
 @                                                   int* q_val_residual_energy);
-
-WebRtcIsacfix_CalculateResidualEnergyNeon:
-.fnstart
-.save {r4-r11}
+DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
   push {r4-r11}
 
   sub r13, r13, #16
@@ -173,5 +170,4 @@ GET_SHIFT_NORM:
   pop {r4-r11}
   bx  r14
 
-.fnend
 
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_armv6.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_armv6.S
index 7ce3b6f26..ffd0e6338 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_armv6.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_armv6.S
@@ -13,12 +13,11 @@
 @
 @ Output is bit-exact with the reference C code in pitch_filter.c.
 
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 #include "settings.h"
 
-.arch armv6
+GLOBAL_FUNCTION WebRtcIsacfix_PitchFilterCore
 .align  2
-.global WebRtcIsacfix_PitchFilterCore
-
 
 @ void WebRtcIsacfix_PitchFilterCore(int loopNumber,
 @                                    WebRtc_Word16 gain,
@@ -30,9 +29,7 @@
 @                                    WebRtc_Word16* inputBuf,
 @                                    WebRtc_Word16* outputBuf,
 @                                    int* index2) {
-
-WebRtcIsacfix_PitchFilterCore:
-.fnstart
+DEFINE_FUNCTION WebRtcIsacfix_PitchFilterCore
   push {r4-r11}
   sub sp, #8
 
@@ -140,7 +137,6 @@ LOOP:
   add sp, #8
   pop {r4-r11}
   bx  lr
-.fnend
 
 .align  2
 kDampFilter:
diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.S b/webrtc/modules/audio_processing/aecm/aecm_core_neon.S
index 412c173f6..833575ea0 100644
--- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.S
+++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.S
@@ -12,19 +12,17 @@
 @ This file contains some functions in AECM, optimized for ARM Neon
 @ platforms. Reference C code is in file aecm_core.c. Bit-exact.
 
-.arch armv7-a
-.fpu neon
-
 #include "aecm_defines.h"
 #include "aecm_core_neon_offsets.h"
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 
 .extern WebRtcAecm_kSqrtHanning
 
-.global WebRtcAecm_WindowAndFFTNeon
-.global WebRtcAecm_InverseFFTAndWindowNeon
-.global WebRtcAecm_CalcLinearEnergiesNeon
-.global WebRtcAecm_StoreAdaptiveChannelNeon
-.global WebRtcAecm_ResetAdaptiveChannelNeon
+GLOBAL_FUNCTION WebRtcAecm_WindowAndFFTNeon
+GLOBAL_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
+GLOBAL_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
+GLOBAL_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
+GLOBAL_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
 
 @ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
 @                                  WebRtc_Word16* fft,
@@ -32,9 +30,7 @@
 @                                  complex16_t* freq_signal,
 @                                  int time_signal_scaling);
 .align  2
-WebRtcAecm_WindowAndFFTNeon:
-.fnstart
-.save {r4, r5, r6, lr}
+DEFINE_FUNCTION WebRtcAecm_WindowAndFFTNeon
   push {r4, r5, r6, lr}
 
   ldr r12, [sp, #16]                         @ time_signal_scaling
@@ -84,7 +80,6 @@ LOOP_PART_LEN2:
   bgt LOOP_PART_LEN2
 
   pop {r4, r5, r6, pc}
-.fnend
 
 @ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
 @                                         WebRtc_Word16* fft,
@@ -92,9 +87,7 @@ LOOP_PART_LEN2:
 @                                         WebRtc_Word16* output,
 @                                         const WebRtc_Word16* nearendClean);
 .align  2
-WebRtcAecm_InverseFFTAndWindowNeon:
-.fnstart
-.save {r4-r8, lr}
+DEFINE_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
   push {r4-r8, lr}
 
   @ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT
@@ -158,12 +151,12 @@ LOOP_POST_IFFT:
   vld1.16 d1, [r12, :64]!                    @ kSqrtHanningReversed[i]
   vadd.i32 q8, q10
   vmull.s16 q0, d0, d1
-  vqshrn.s32 d4, q8, #0
+  vqmovn.s32 d16, q8
   vshr.s32 q0, q0, #14
   vst2.16 {d4, d5}, [r4, :128]!              @ &efw[i];
   vshl.s32 q0, q0, q9
   vst1.16 d16, [r7, :64]!                    @ output[i]
-  vqshrn.s32 d0, q0, #0
+  vqmovn.s32 d0, q0
   subs r3, #1
   vst1.16 d0, [r8, :64]!                     @ aecm->outBuf[i]
   bgt LOOP_POST_IFFT
@@ -203,7 +196,6 @@ LOOP_COPY:
 
 END:
   pop {r4-r8, pc}
-.fnend
 
 @ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
 @                                        const WebRtc_UWord16* far_spectrum,
@@ -212,9 +204,7 @@ END:
 @                                        WebRtc_UWord32* echo_energy_adapt,
 @                                        WebRtc_UWord32* echo_energy_stored);
 .align  2
-WebRtcAecm_CalcLinearEnergiesNeon:
-.fnstart
-.save {r4-r7}
+DEFINE_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
   push {r4-r7}
 
   vmov.i32 q14, #0
@@ -274,14 +264,12 @@ LOOP_CALC_LINEAR_ENERGIES:
 
   pop {r4-r7}
   bx  lr
-.fnend
 
 @ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
 @                                          const uint16_t* far_spectrum,
 @                                          int32_t* echo_est);
 .align  2
-WebRtcAecm_StoreAdaptiveChannelNeon:
-.fnstart
+DEFINE_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
   ldr r3, =offset_aecm_channelAdapt16
   ldr r12, =offset_aecm_channelStored
   ldr r3, [r0, r3]
@@ -305,12 +293,10 @@ LOOP_STORE_ADAPTIVE_CHANNEL:
   str r3, [r2]
 
   bx  lr
-.fnend
 
 @ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
 .align  2
-WebRtcAecm_ResetAdaptiveChannelNeon:
-.fnstart
+DEFINE_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
   ldr r1, =offset_aecm_channelAdapt16
   ldr r2, =offset_aecm_channelAdapt32
   movw r3, #offset_aecm_channelStored
@@ -334,15 +320,14 @@ LOOP_RESET_ADAPTIVE_CHANNEL:
   str r0, [r2]
 
   bx  lr
-.fnend
 
   @ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning,
   @ the order was reversed and one useless element (0) was removed.
 .align  3
 kSqrtHanningReversed:
-  .hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
-  .hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
-  .hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
-  .hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
-  .hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
-  .hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399
+  .short 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
+  .short 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
+  .short 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
+  .short 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
+  .short 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
+  .short 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399
diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
index 3bbd84b49..d8250ef08 100644
--- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
@@ -139,7 +139,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
       "vneg.s16 d23, d23\n\t"
       "vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t"
       "vrev64.16 q10, q10\n\t"
-      "vst2.16 {q10}, [%[p_fft_offset], %[offset]]\n\t"
+      "vst2.16 {q10}, [%[p_fft_offset]], %[offset]\n\t"
       :[p_efw]"+r"(p_efw),
        [p_fft]"+r"(p_fft),
        [p_fft_offset]"+r"(p_fft_offset)
@@ -181,7 +181,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
     __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
     __asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
     __asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
-    __asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
+    __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
     __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real));
     __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
 
@@ -196,7 +196,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
     __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
     // aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
     //    WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
-    __asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
+    __asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
     __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
   }
 
diff --git a/webrtc/modules/audio_processing/ns/nsx_core_neon.S b/webrtc/modules/audio_processing/ns/nsx_core_neon.S
index 4e15959cd..cea75532a 100644
--- a/webrtc/modules/audio_processing/ns/nsx_core_neon.S
+++ b/webrtc/modules/audio_processing/ns/nsx_core_neon.S
@@ -12,18 +12,16 @@
 @ This file contains some functions in NS, optimized for ARM Neon
 @ platforms. Reference C code is in file nsx_core.c. Bit-exact.
 
-.arch armv7-a
-.fpu neon
-
+#include "webrtc/system_wrappers/interface/asm_defines.h"
 #include "nsx_defines.h"
 #include "nsx_core_neon_offsets.h"
 
-.global WebRtcNsx_NoiseEstimationNeon
-.global WebRtcNsx_PrepareSpectrumNeon
-.global WebRtcNsx_SynthesisUpdateNeon
-.global WebRtcNsx_AnalysisUpdateNeon
-.global WebRtcNsx_DenormalizeNeon
-.global WebRtcNsx_CreateComplexBufferNeon
+GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon
+GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon
+GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon
+GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon
+GLOBAL_FUNCTION WebRtcNsx_DenormalizeNeon
+GLOBAL_FUNCTION WebRtcNsx_CreateComplexBufferNeon
 
 @ void NoiseEstimationNeon(NsxInst_t* inst,
 @                          uint16_t* magn,
@@ -42,12 +40,7 @@
 @ r11: countDiv
 @ r12: i, the loop counter for LOOP_NOISEESTIMATION_MAGNLEN_INNER
 
-WebRtcNsx_NoiseEstimationNeon:
-.fnstart
-.save {r4-r11, r14}
-.vsave {d8-d15}
-.pad #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
-
+DEFINE_FUNCTION WebRtcNsx_NoiseEstimationNeon
   push {r4-r11, r14}
   vpush {d8-d15}
   sub sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
@@ -312,14 +305,10 @@ UPDATE_Q_NOISE:
   add sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
   vpop {d8-d15}
   pop {r4-r11, pc}
-.fnend
 
 @ static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset);
 @ Neon registers touched: q0-q3, q8-q13.
-UpdateNoiseEstimateNeon:
-.fnstart
-.save {r4, r5, r6, r14}
-
+DEFINE_FUNCTION UpdateNoiseEstimateNeon
   push {r4, r5, r6, r14}
   mov r5, r0
 
@@ -385,13 +374,9 @@ POST_LOOP_MAGNLEN:
   strh r3, [r2]
 
   pop {r4, r5, r6, pc}
-.fnend
 
 @ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf);
-WebRtcNsx_PrepareSpectrumNeon:
-.fnstart
-.save {r4-r8}
-
+DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon
   push {r4-r8}
 
   movw r2, #offset_nsx_real
@@ -478,11 +463,9 @@ LOOP_ANALEN2:
 
   pop {r4-r8}
   bx r14
-.fnend
 
 @ void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor);
-WebRtcNsx_DenormalizeNeon:
-.fnstart
+DEFINE_FUNCTION WebRtcNsx_DenormalizeNeon
   movw r12, #offset_nsx_normData
   movw r3, #offset_nsx_real
   ldr r12, [r0, r12]          @ inst->normData
@@ -508,14 +491,11 @@ LOOP_ANALEN:
   blt LOOP_ANALEN
 
   bx r14
-.fnend
 
 @ void SynthesisUpdateNeon(NsxInst_t* inst,
 @                          int16_t* out_frame,
 @                          int16_t gain_factor);
-WebRtcNsx_SynthesisUpdateNeon:
-.fnstart
-.save {r4, r5}
+DEFINE_FUNCTION WebRtcNsx_SynthesisUpdateNeon
   push {r4, r5}
 
   vdup.16 d31, r2
@@ -586,12 +566,8 @@ EXIT_SYNTHESISUPDATE:
   pop {r4, r5}
   bx r14
 
-.fnend
-
 @ void AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech);
-WebRtcNsx_AnalysisUpdateNeon:
-.fnstart
-.save {r4-r6}
+DEFINE_FUNCTION WebRtcNsx_AnalysisUpdateNeon
   push {r4-r6}
 
   movw r3, #offset_nsx_analysisBuffer
@@ -647,11 +623,9 @@ LOOP_WINDOW_DATA:
 POST_LOOP_WINDOW_DATA:
   pop {r4-r6}
   bx r14
-.fnend
 
 @ void CreateComplexBufferNeon(NsxInst_t* inst, int16_t* in, int16_t* out);
-WebRtcNsx_CreateComplexBufferNeon:
-.fnstart
+DEFINE_FUNCTION WebRtcNsx_CreateComplexBufferNeon
   movw r3, #offset_nsx_anaLen
   movw r12, #offset_nsx_normData
   ldrsh r3, [r0, r3]                  @ inst->anaLen
@@ -678,4 +652,3 @@ LOOP_CREATE_COMPLEX_BUFFER:           @ Unrolled by 16.
   blt LOOP_CREATE_COMPLEX_BUFFER
 
   bx r14
-.fnend
diff --git a/webrtc/system_wrappers/interface/asm_defines.h b/webrtc/system_wrappers/interface/asm_defines.h
new file mode 100644
index 000000000..9ef6c8f3b
--- /dev/null
+++ b/webrtc/system_wrappers/interface/asm_defines.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
+#define WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
+
+// Define the macros used in ARM assembly code, so that for Mac or iOS builds
+// we add leading underscores for the function names.
+#ifdef __APPLE__
+.macro GLOBAL_FUNCTION name
+.global _\name
+.endm
+.macro DEFINE_FUNCTION name
+_\name:
+.endm
+#else
+.macro GLOBAL_FUNCTION name
+.global \name
+.endm
+.macro DEFINE_FUNCTION name
+\name:
+.endm
+#endif
+
+#endif  // WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_