Enabling common_audio building with NEON on ARM64

Building common_audio_neon and common_audio_unittests passes on both Android ARMv7 and Android ARM64.
The common_audio_unittests tests pass on both Android ARMv7 and Android ARM64.

BUG=4002
R=andrew@webrtc.org, jridges@masque.com, kjellander@webrtc.org

Change-Id: I8e0722f356db8cca6fc8232f00ae1e898a086f5a

Review URL: https://webrtc-codereview.appspot.com/40629004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

Cr-Commit-Position: refs/heads/master@{#8620}
git-svn-id: http://webrtc.googlecode.com/svn/trunk@8620 4adac7df-926f-26a2-2b94-8c16560cd09d
2015-03-05 19:13:46 +00:00 · 2015-03-05 19:13:46 +00:00 · 0933d01d09
commit 0933d01d09
parent d7a212e8b9
8 changed files with 40 additions and 688 deletions
--- a/webrtc/common_audio/BUILD.gn
+++ b/webrtc/common_audio/BUILD.gn
@@ -124,6 +124,10 @@ source_set("common_audio") {
    }
  }

+  if (current_cpu == "arm64") {
+    deps += [ ":common_audio_neon" ]
+  }
+
  if (current_cpu == "mipsel") {
    sources += [
      "signal_processing/include/spl_inl_mips.h",
@@ -194,30 +198,23 @@ if (current_cpu == "x86" || current_cpu == "x64") {
  }
 }

-if (rtc_build_armv7_neon) {
+if (rtc_build_armv7_neon || current_cpu == "arm64") {
  source_set("common_audio_neon") {
    sources = [
      "fir_filter_neon.cc",
      "resampler/sinc_resampler_neon.cc",
-      "signal_processing/cross_correlation_neon.S",
-      "signal_processing/downsample_fast_neon.S",
-      "signal_processing/min_max_operations_neon.S",
+      "signal_processing/cross_correlation_neon.c",
+      "signal_processing/downsample_fast_neon.c",
+      "signal_processing/min_max_operations_neon.c",
    ]

    configs += [ "..:common_config" ]
    public_configs = [ "..:common_inherited_config" ]

-
-    # Enable compilation for the ARM v7 Neon instruction set. This is needed
-    # since //build/config/arm.gni only enables Neon for iOS, not Android.
-    # This provides the same functionality as webrtc/build/arm_neon.gypi.
-    # TODO(kjellander): Investigate if this can be moved into webrtc.gni or
-    # //build/config/arm.gni instead, to reduce code duplication.
-    # Remove the -mfpu=vfpv3-d16 cflag.
-    configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
-    cflags = [
-      "-mfpu=neon",
-    ]
+    if (!arm_use_neon) {
+      configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
+      cflags = [ "-mfpu=neon" ]
+    }

    # Disable LTO in audio_processing_neon target due to compiler bug.
    if (rtc_use_lto) {
--- a/webrtc/common_audio/common_audio.gyp
+++ b/webrtc/common_audio/common_audio.gyp
@@ -146,6 +146,9 @@
            }],
          ],  # conditions
        }],
+        ['target_arch=="arm64"', {
+          'dependencies': ['common_audio_neon',],
+        }],
        ['target_arch=="mipsel" and mips_arch_variant!="r6" and android_webview_build==0', {
          'sources': [
            'signal_processing/include/spl_inl_mips.h',
@@ -194,7 +197,7 @@
        },
      ],  # targets
    }],
-    ['target_arch=="arm" and arm_version>=7', {
+    ['target_arch=="arm" and arm_version>=7 or target_arch=="arm64"', {
      'targets': [
        {
          'target_name': 'common_audio_neon',
@@ -203,9 +206,9 @@
          'sources': [
            'fir_filter_neon.cc',
            'resampler/sinc_resampler_neon.cc',
-            'signal_processing/cross_correlation_neon.S',
-            'signal_processing/downsample_fast_neon.S',
-            'signal_processing/min_max_operations_neon.S',
+            'signal_processing/cross_correlation_neon.c',
+            'signal_processing/downsample_fast_neon.c',
+            'signal_processing/min_max_operations_neon.c',
          ],
          'conditions': [
            # Disable LTO in common_audio_neon target due to compiler bug
--- a/webrtc/common_audio/resampler/sinc_resampler.h
+++ b/webrtc/common_audio/resampler/sinc_resampler.h
@@ -107,7 +107,7 @@ class SincResampler {
  static float Convolve_SSE(const float* input_ptr, const float* k1,
                            const float* k2,
                            double kernel_interpolation_factor);
-#elif defined(WEBRTC_ARCH_ARM_V7)
+#elif defined(WEBRTC_ARCH_ARM_V7) || defined(WEBRTC_ARCH_ARM64_NEON)
  static float Convolve_NEON(const float* input_ptr, const float* k1,
                             const float* k2,
                             double kernel_interpolation_factor);
--- a/webrtc/common_audio/signal_processing/cross_correlation_neon.S
+++ b/webrtc/common_audio/signal_processing/cross_correlation_neon.S
@@ -1,159 +0,0 @@
-@
-@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ cross_correlation_neon.s
-@ This file contains the function WebRtcSpl_CrossCorrelationNeon(),
-@ optimized for ARM Neon platform.
-@
-@ Reference Ccode at end of this file.
-@ Output is bit-exact with the reference C code, but not with the generic
-@ C code in file cross_correlation.c, due to reduction of shift operations
-@ from using Neon registers.
-
-@ Register usage:
-@
-@ r0: *cross_correlation (function argument)
-@ r1: *seq1 (function argument)
-@ r2: *seq2 (function argument)
-@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
-@ r4: counter for LOOP_DIM_CROSS_CORRELATION
-@ r5: seq2_ptr
-@ r6: seq1_ptr
-@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
-@ r8, r9, r10, r11, r12: scratch
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
-.align  2
-DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
-  push {r4-r11}
-
-  @ Put the shift value (-right_shifts) into a Neon register.
-  ldrsh r10, [sp, #36]
-  rsb r10, r10, #0
-  mov r8, r10, asr #31
-  vmov d16, r10, r8
-
-  @ Initialize loop counters.
-  and r7, r3, #7              @ inner_loop_len2 = dim_seq % 8;
-  asr r3, r3, #3              @ inner_loop_len1 = dim_seq / 8;
-  ldrsh r4, [sp, #32]         @ dim_cross_correlation
-
-LOOP_DIM_CROSS_CORRELATION:
-  vmov.i32 q9, #0
-  vmov.i32 q14, #0
-  movs r8, r3                 @ inner_loop_len1
-  mov r6, r1                  @ seq1_ptr
-  mov r5, r2                  @ seq2_ptr
-  ble POST_LOOP_DIM_SEQ
-
-LOOP_DIM_SEQ:
-  vld1.16 {d20, d21}, [r6]!   @ seq1_ptr
-  vld1.16 {d22, d23}, [r5]!   @ seq2_ptr
-  subs r8, r8, #1
-  vmull.s16 q12, d20, d22
-  vmull.s16 q13, d21, d23
-  vpadal.s32 q9, q12
-  vpadal.s32 q14, q13
-  bgt LOOP_DIM_SEQ
-
-POST_LOOP_DIM_SEQ:
-  movs r10, r7                @ Loop counter
-  mov r12, #0
-  mov r8, #0
-  ble POST_LOOP_DIM_SEQ_RESIDUAL
-
-LOOP_DIM_SEQ_RESIDUAL:
-  ldrh r11, [r6], #2
-  ldrh r9, [r5], #2
-  smulbb r11, r11, r9
-  adds r8, r8, r11
-  adc r12, r12, r11, asr #31
-  subs r10, #1
-  bgt LOOP_DIM_SEQ_RESIDUAL
-
-POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
-  vadd.i64 d18, d19
-  vadd.i64 d28, d29
-  vadd.i64 d18, d28
-  vmov.32 d17[0], r8
-  vmov.32 d17[1], r12
-  vadd.i64 d17, d18
-  vshl.s64 d17, d16
-  vst1.32 d17[0], [r0]!       @ Store the output
-
-  ldr r8, [sp, #40]           @ step_seq2
-  add r2, r8, lsl #1          @ prepare for seq2_ptr(r5) in the next loop.
-
-  subs r4, #1
-  bgt LOOP_DIM_CROSS_CORRELATION
-
-  pop {r4-r11}
-  bx  lr
-
-@ TODO(kma): Place this piece of reference code into a C code file.
-@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
-@                                     int16_t* seq1,
-@                                     int16_t* seq2,
-@                                     int16_t dim_seq,
-@                                     int16_t dim_cross_correlation,
-@                                     int16_t right_shifts,
-@                                     int16_t step_seq2) {
-@   int i = 0;
-@   int j = 0;
-@   int inner_loop_len1 = dim_seq >> 3;
-@   int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
-@
-@   assert(dim_cross_correlation > 0);
-@   assert(dim_seq > 0);
-@
-@   for (i = 0; i < dim_cross_correlation; i++) {
-@     int16_t *seq1_ptr = seq1;
-@     int16_t *seq2_ptr = seq2 + (step_seq2 * i);
-@     int64_t sum = 0;
-@
-@     for (j = inner_loop_len1; j > 0; j -= 1) {
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@     }
-@
-@     // Calculate the rest of the samples.
-@     for (j = inner_loop_len2; j > 0; j -= 1) {
-@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@       seq1_ptr++;
-@       seq2_ptr++;
-@     }
-@
-@     *cross_correlation++ = (int32_t)(sum >> right_shifts);
-@   }
-@ }
--- a/webrtc/common_audio/signal_processing/downsample_fast_neon.S
+++ b/webrtc/common_audio/signal_processing/downsample_fast_neon.S
@@ -1,215 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for
-@ ARM Neon platform. The description header can be found in
-@ signal_processing_library.h
-@
-@ The reference C code is in file downsample_fast.c. Bit-exact.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
-.align  2
-DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
-  push {r4-r11}
-
-  cmp r3, #0                                @ data_out_length <= 0?
-  movle r0, #-1
-  ble END
-
-  ldrsh r12, [sp, #44]
-  ldr r5, [sp, #40]                         @ r5: factor
-  add r4, r12, #1                           @ r4: delay + 1
-  sub r3, r3, #1                            @ r3: data_out_length - 1
-  smulbb r3, r5, r3
-  ldr r8, [sp, #32]                         @ &coefficients[0]
-  mov r9, r12                               @ Iteration counter for outer loops.
-  add r3, r4                                @ delay + factor * (out_length-1) +1
-
-  cmp r3, r1                                @ data_in_length < endpos?
-  movgt r0, #-1
-  bgt END
-
-  @ Initializations.
-  sub r3, r5, asl #3
-  add r11, r0, r12, asl #1                  @ &data_in[delay]
-  ldr r0, [sp, #36]                         @ coefficients_length
-  add r3, r5                                @ endpos - factor * 7
-
-  cmp r0, #0                                @ coefficients_length <= 0 ?
-  movle r0, #-1
-  ble END
-
-  add r8, r0, asl #1                        @ &coeffieient[coefficients_length]
-  cmp r9, r3
-  bge POST_LOOP_ENDPOS                      @ branch when Iteration < 8 times.
-
-@
-@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
-@
-  mov r4, #-2
-
-  @ Direct program flow to the right channel.
-
-  @ r10 is an offset to &data_in[] in the loop. After an iteration, we need to
-  @ move the pointer back to original after advancing 16 bytes by a vld1, and
-  @ then move 2 bytes forward to increment one more sample.
-  cmp r5, #2
-  moveq r10, #-14
-  beq LOOP_ENDPOS_FACTOR2                   @ Branch when factor == 2
-
-  @ Similar here, for r10, we need to move the pointer back to original after
-  @ advancing 32 bytes, then move 2 bytes forward to increment one sample.
-  cmp r5, #4
-  moveq r10, #-30
-  beq LOOP_ENDPOS_FACTOR4                   @ Branch when factor == 4
-
-  @ For r10, we need to move the pointer back to original after advancing
-  @ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample.
-  mov r10, r5, asl #4
-  rsb r10, #2
-  add r10, r5, asl #1
-  lsl r5, #1                                @ r5 = factor * sizeof(data_in)
-
-@ The general case (factor != 2 && factor != 4)
-LOOP_ENDPOS_GENERAL:
-  @ Initializations.
-  vmov.i32 q2, #2048
-  vmov.i32 q3, #2048
-  sub r7, r8, #2
-  sub r12, r0, #1                           @ coefficients_length - 1
-  sub r1, r11, r12, asl #1                  @ &data_in[i - j]
-
-LOOP_COEFF_LENGTH_GENERAL:
-  vld1.16 {d2[], d3[]}, [r7], r4            @ coefficients[j]
-  vld1.16 d0[0], [r1], r5                   @ data_in[i - j]
-  vld1.16 d0[1], [r1], r5                   @ data_in[i + factor - j]
-  vld1.16 d0[2], [r1], r5                   @ data_in[i + factor * 2 - j]
-  vld1.16 d0[3], [r1], r5                   @ data_in[i + factor * 3 - j]
-  vld1.16 d1[0], [r1], r5                   @ data_in[i + factor * 4 - j]
-  vld1.16 d1[1], [r1], r5                   @ data_in[i + factor * 5 - j]
-  vld1.16 d1[2], [r1], r5                   @ data_in[i + factor * 6 - j]
-  vld1.16 d1[3], [r1], r10                  @ data_in[i + factor * 7 - j]
-  subs r12, #1
-  vmlal.s16 q2, d0, d2
-  vmlal.s16 q3, d1, d3
-  bge LOOP_COEFF_LENGTH_GENERAL
-
-  @ Shift, saturate, and store the result.
-  vqshrn.s32 d0, q2, #12
-  vqshrn.s32 d1, q3, #12
-  vst1.16 {d0, d1}, [r2]!
-
-  add r11, r5, asl #3                       @ r11 -> &data_in[i + factor * 8]
-  add r9, r5, asl #2                        @ Counter i = delay + factor * 8.
-  cmp r9, r3                                @ i < endpos - factor * 7 ?
-  blt LOOP_ENDPOS_GENERAL
-  asr r5, #1                                @ Restore r5 to the value of factor.
-  b POST_LOOP_ENDPOS
-
-@ The case for factor == 2.
-LOOP_ENDPOS_FACTOR2:
-  @ Initializations.
-  vmov.i32 q2, #2048
-  vmov.i32 q3, #2048
-  sub r7, r8, #2
-  sub r12, r0, #1                           @ coefficients_length - 1
-  sub r1, r11, r12, asl #1                  @ &data_in[i - j]
-
-LOOP_COEFF_LENGTH_FACTOR2:
-  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j]
-  vld2.16 {d0, d1}, [r1]!                   @ data_in[]
-  vld2.16 {d2, d3}, [r1], r10               @ data_in[]
-  subs r12, #1
-  vmlal.s16 q2, d0, d16
-  vmlal.s16 q3, d2, d17
-  bge LOOP_COEFF_LENGTH_FACTOR2
-
-  @ Shift, saturate, and store the result.
-  vqshrn.s32 d0, q2, #12
-  vqshrn.s32 d1, q3, #12
-  vst1.16 {d0, d1}, [r2]!
-
-  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
-  add r9, r5, asl #3                        @ Counter i = delay + factor * 8.
-  cmp r9, r3                                @ i < endpos - factor * 7 ?
-  blt LOOP_ENDPOS_FACTOR2
-  b POST_LOOP_ENDPOS
-
-@ The case for factor == 4.
-LOOP_ENDPOS_FACTOR4:
-  @ Initializations.
-  vmov.i32 q2, #2048
-  vmov.i32 q3, #2048
-  sub r7, r8, #2
-  sub r12, r0, #1                           @ coefficients_length - 1
-  sub r1, r11, r12, asl #1                  @ &data_in[i - j]
-
-LOOP_COEFF_LENGTH_FACTOR4:
-  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j]
-  vld4.16 {d0, d1, d2, d3}, [r1]!           @ data_in[]
-  vld4.16 {d18, d19, d20, d21}, [r1], r10   @ data_in[]
-  subs r12, #1
-  vmlal.s16 q2, d0, d16
-  vmlal.s16 q3, d18, d17
-  bge LOOP_COEFF_LENGTH_FACTOR4
-
-  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
-  add r9, r5, asl #3                        @ Counter i = delay + factor * 8.
-
-  @ Shift, saturate, and store the result.
-  vqshrn.s32 d0, q2, #12
-  vqshrn.s32 d1, q3, #12
-  cmp r9, r3                                @ i < endpos - factor * 7 ?
-  vst1.16 {d0, d1}, [r2]!
-
-  blt LOOP_ENDPOS_FACTOR4
-
-@
-@ Second part, do the rest iterations (if any).
-@
-
-POST_LOOP_ENDPOS:
-  add r3, r5, asl #3
-  sub r3, r5                                @ Restore r3 to endpos.
-  cmp r9, r3
-  movge r0, #0
-  bge END
-
-LOOP2_ENDPOS:
-  @ Initializations.
-  mov r7, r8
-  sub r12, r0, #1                           @ coefficients_length - 1
-  sub r6, r11, r12, asl #1                  @ &data_in[i - j]
-
-  mov r1, #2048
-
-LOOP2_COEFF_LENGTH:
-  ldrsh r4, [r7, #-2]!                      @ coefficients[j]
-  ldrsh r10, [r6], #2                       @ data_in[i - j]
-  smlabb r1, r4, r10, r1
-  subs r12, #1
-  bge LOOP2_COEFF_LENGTH
-
-  @ Shift, saturate, and store the result.
-  ssat r1, #16, r1, asr #12
-  strh r1, [r2], #2
-
-  add r11, r5, asl #1                       @ r11 -> &data_in[i + factor]
-  add r9, r5                                @ Counter i = delay + factor.
-  cmp r9, r3                                @ i < endpos?
-  blt LOOP2_ENDPOS
-
-  mov r0, #0
-
-END:
-  pop {r4-r11}
-  bx  lr
--- a/webrtc/common_audio/signal_processing/include/signal_processing_library.h
+++ b/webrtc/common_audio/signal_processing/include/signal_processing_library.h
@@ -154,7 +154,8 @@ void WebRtcSpl_ZerosArrayW32(int32_t* vector,
 typedef int16_t (*MaxAbsValueW16)(const int16_t* vector, int length);
 extern MaxAbsValueW16 WebRtcSpl_MaxAbsValueW16;
 int16_t WebRtcSpl_MaxAbsValueW16C(const int16_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
 #endif
 #if defined(MIPS32_LE)
@@ -172,7 +173,8 @@ int16_t WebRtcSpl_MaxAbsValueW16_mips(const int16_t* vector, int length);
 typedef int32_t (*MaxAbsValueW32)(const int32_t* vector, int length);
 extern MaxAbsValueW32 WebRtcSpl_MaxAbsValueW32;
 int32_t WebRtcSpl_MaxAbsValueW32C(const int32_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
 #endif
 #if defined(MIPS_DSP_R1_LE)
@@ -192,7 +194,8 @@ int32_t WebRtcSpl_MaxAbsValueW32_mips(const int32_t* vector, int length);
 typedef int16_t (*MaxValueW16)(const int16_t* vector, int length);
 extern MaxValueW16 WebRtcSpl_MaxValueW16;
 int16_t WebRtcSpl_MaxValueW16C(const int16_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
 #endif
 #if defined(MIPS32_LE)
@@ -212,7 +215,8 @@ int16_t WebRtcSpl_MaxValueW16_mips(const int16_t* vector, int length);
 typedef int32_t (*MaxValueW32)(const int32_t* vector, int length);
 extern MaxValueW32 WebRtcSpl_MaxValueW32;
 int32_t WebRtcSpl_MaxValueW32C(const int32_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
 #endif
 #if defined(MIPS32_LE)
@@ -232,7 +236,8 @@ int32_t WebRtcSpl_MaxValueW32_mips(const int32_t* vector, int length);
 typedef int16_t (*MinValueW16)(const int16_t* vector, int length);
 extern MinValueW16 WebRtcSpl_MinValueW16;
 int16_t WebRtcSpl_MinValueW16C(const int16_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
 #endif
 #if defined(MIPS32_LE)
@@ -252,7 +257,8 @@ int16_t WebRtcSpl_MinValueW16_mips(const int16_t* vector, int length);
 typedef int32_t (*MinValueW32)(const int32_t* vector, int length);
 extern MinValueW32 WebRtcSpl_MinValueW32;
 int32_t WebRtcSpl_MinValueW32C(const int32_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
 #endif
 #if defined(MIPS32_LE)
@@ -552,7 +558,8 @@ void WebRtcSpl_CrossCorrelationC(int32_t* cross_correlation,
                                 int16_t dim_cross_correlation,
                                 int16_t right_shifts,
                                 int16_t step_seq2);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
                                    const int16_t* seq1,
                                    const int16_t* seq2,
@@ -717,7 +724,8 @@ int WebRtcSpl_DownsampleFastC(const int16_t* data_in,
                              int coefficients_length,
                              int factor,
                              int delay);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
                                 int data_in_length,
                                 int16_t* data_out,
--- a/webrtc/common_audio/signal_processing/min_max_operations_neon.S
+++ b/webrtc/common_audio/signal_processing/min_max_operations_neon.S
@@ -1,283 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ This file contains some minimum and maximum functions, optimized for
-@ ARM Neon platform. The description header can be found in
-@ signal_processing_library.h
-@
-@ The reference C code is in file min_max_operations.c. Code here is basically
-@ a loop unrolling by 8 with Neon instructions. Bit-exact.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
-GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
-GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
-GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
-GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
-GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
-
-.align  2
-@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
-  mov r2, #-1                 @ Initialize the return value.
-  cmp r0, #0
-  beq END_MAX_ABS_VALUE_W16
-  cmp r1, #0
-  ble END_MAX_ABS_VALUE_W16
-
-  cmp r1, #8
-  blt LOOP_MAX_ABS_VALUE_W16
-
-  vmov.i16 q12, #0
-  sub r1, #8                  @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
-  vld1.16 {q13}, [r0]!
-  subs r1, #8
-  vabs.s16 q13, q13           @ Note vabs doesn't change the value of -32768.
-  vmax.u16 q12, q13           @ Use u16 so we don't lose the value -32768.
-  bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16
-
-  @ Find the maximum value in the Neon registers and move it to r2.
-  vmax.u16 d24, d25
-  vpmax.u16 d24, d24, d24
-  vpmax.u16 d24, d24, d24
-  adds r1, #8
-  vmov.u16 r2, d24[0]
-  beq END_MAX_ABS_VALUE_W16
-
-LOOP_MAX_ABS_VALUE_W16:
-  ldrsh r3, [r0], #2
-  eor r12, r3, r3, asr #31    @ eor and then sub, to get absolute value.
-  sub r12, r12, r3, asr #31
-  cmp r2, r12
-  movlt r2, r12
-  subs r1, #1
-  bne LOOP_MAX_ABS_VALUE_W16
-
-END_MAX_ABS_VALUE_W16:
-  cmp r2, #0x8000             @ Guard against the case for -32768.
-  subeq r2, #1
-  mov r0, r2
-  bx  lr
-
-
-
-@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
-  cmp r0, #0
-  moveq r0, #-1
-  beq EXIT                    @ Return -1 for a NULL pointer.
-  cmp r1, #0                  @ length
-  movle r0, #-1
-  ble EXIT                    @ Return -1 if length <= 0.
-
-  vmov.i32 q11, #0
-  vmov.i32 q12, #0
-  cmp r1, #8
-  blt LOOP_MAX_ABS_VALUE_W32
-
-  sub r1, #8                  @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
-  vld1.32 {q13, q14}, [r0]!
-  subs r1, #8                 @ Counter for loops
-  vabs.s32 q13, q13           @ vabs doesn't change the value of 0x80000000.
-  vabs.s32 q14, q14
-  vmax.u32 q11, q13           @ Use u32 so we don't lose the value 0x80000000.
-  vmax.u32 q12, q14
-  bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32
-
-  @ Find the maximum value in the Neon registers and move it to r2.
-  vmax.u32 q12, q11
-  vmax.u32 d24, d25
-  vpmax.u32 d24, d24, d24
-  adds r1, #8
-  vmov.u32 r2, d24[0]
-  beq END_MAX_ABS_VALUE_W32
-
-LOOP_MAX_ABS_VALUE_W32:
-  ldr r3, [r0], #4
-  eor r12, r3, r3, asr #31    @ eor and then sub, to get absolute value.
-  sub r12, r12, r3, asr #31
-  cmp r2, r12
-  movcc r2, r12
-  subs r1, #1
-  bne LOOP_MAX_ABS_VALUE_W32
-
-END_MAX_ABS_VALUE_W32:
-  mvn r0, #0x80000000         @ Guard against the case for 0x80000000.
-  cmp r2, r0
-  movcc r0, r2
-
-EXIT:
-  bx  lr
-
-@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
-  mov r2, #0x8000             @ Initialize the return value.
-  cmp r0, #0
-  beq END_MAX_VALUE_W16
-  cmp r1, #0
-  ble END_MAX_VALUE_W16
-
-  vmov.i16 q12, #0x8000
-  cmp r1, #8
-  blt LOOP_MAX_VALUE_W16
-
-  sub r1, #8                  @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
-  vld1.16 {q13}, [r0]!
-  subs r1, #8
-  vmax.s16 q12, q13
-  bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16
-
-  @ Find the maximum value in the Neon registers and move it to r2.
-  vmax.s16 d24, d25
-  vpmax.s16 d24, d24, d24
-  vpmax.s16 d24, d24, d24
-  adds r1, #8
-  vmov.u16 r2, d24[0]
-  beq END_MAX_VALUE_W16
-
-LOOP_MAX_VALUE_W16:
-  ldrsh r3, [r0], #2
-  cmp r2, r3
-  movlt r2, r3
-  subs r1, #1
-  bne LOOP_MAX_VALUE_W16
-
-END_MAX_VALUE_W16:
-  mov r0, r2
-  bx  lr
-
-@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
-  mov r2, #0x80000000         @ Initialize the return value.
-  cmp r0, #0
-  beq END_MAX_VALUE_W32
-  cmp r1, #0
-  ble END_MAX_VALUE_W32
-
-  vmov.i32 q11, #0x80000000
-  vmov.i32 q12, #0x80000000
-  cmp r1, #8
-  blt LOOP_MAX_VALUE_W32
-
-  sub r1, #8                  @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
-  vld1.32 {q13, q14}, [r0]!
-  subs r1, #8
-  vmax.s32 q11, q13
-  vmax.s32 q12, q14
-  bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32
-
-  @ Find the maximum value in the Neon registers and move it to r2.
-  vmax.s32 q12, q11
-  vpmax.s32 d24, d24, d25
-  vpmax.s32 d24, d24, d24
-  adds r1, #8
-  vmov.s32 r2, d24[0]
-  beq END_MAX_VALUE_W32
-
-LOOP_MAX_VALUE_W32:
-  ldr r3, [r0], #4
-  cmp r2, r3
-  movlt r2, r3
-  subs r1, #1
-  bne LOOP_MAX_VALUE_W32
-
-END_MAX_VALUE_W32:
-  mov r0, r2
-  bx  lr
-
-@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
-  movw r2, #0x7FFF            @ Initialize the return value.
-  cmp r0, #0
-  beq END_MIN_VALUE_W16
-  cmp r1, #0
-  ble END_MIN_VALUE_W16
-
-  vdup.16 q12, r2
-  cmp r1, #8
-  blt LOOP_MIN_VALUE_W16
-
-  sub r1, #8                  @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
-  vld1.16 {q13}, [r0]!
-  subs r1, #8
-  vmin.s16 q12, q13
-  bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16
-
-  @ Find the maximum value in the Neon registers and move it to r2.
-  vmin.s16 d24, d25
-  vpmin.s16 d24, d24, d24
-  vpmin.s16 d24, d24, d24
-  adds r1, #8
-  vmov.s16 r2, d24[0]
-  sxth  r2, r2
-  beq END_MIN_VALUE_W16
-
-LOOP_MIN_VALUE_W16:
-  ldrsh r3, [r0], #2
-  cmp r2, r3
-  movge r2, r3
-  subs r1, #1
-  bne LOOP_MIN_VALUE_W16
-
-END_MIN_VALUE_W16:
-  mov r0, r2
-  bx  lr
-
-@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
-  mov r2, #0x7FFFFFFF         @ Initialize the return value.
-  cmp r0, #0
-  beq END_MIN_VALUE_W32
-  cmp r1, #0
-  ble END_MIN_VALUE_W32
-
-  vdup.32 q11, r2
-  vdup.32 q12, r2
-  cmp r1, #8
-  blt LOOP_MIN_VALUE_W32
-
-  sub r1, #8                  @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
-  vld1.32 {q13, q14}, [r0]!
-  subs r1, #8
-  vmin.s32 q11, q13
-  vmin.s32 q12, q14
-  bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32
-
-  @ Find the maximum value in the Neon registers and move it to r2.
-  vmin.s32 q12, q11
-  vpmin.s32 d24, d24, d25
-  vpmin.s32 d24, d24, d24
-  adds r1, #8
-  vmov.s32 r2, d24[0]
-  beq END_MIN_VALUE_W32
-
-LOOP_MIN_VALUE_W32:
-  ldr r3, [r0], #4
-  cmp r2, r3
-  movge r2, r3
-  subs r1, #1
-  bne LOOP_MIN_VALUE_W32
-
-END_MIN_VALUE_W32:
-  mov r0, r2
-  bx  lr
--- a/webrtc/common_audio/signal_processing/spl_init.c
+++ b/webrtc/common_audio/signal_processing/spl_init.c
@@ -29,7 +29,7 @@ DownsampleFast WebRtcSpl_DownsampleFast;
 ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound;

 #if (defined(WEBRTC_DETECT_ARM_NEON) || !defined(WEBRTC_ARCH_ARM_NEON)) && \
-     !defined(MIPS32_LE)
+    !defined(MIPS32_LE) && !defined(WEBRTC_ARCH_ARM64_NEON)
 /* Initialize function pointers to the generic C version. */
 static void InitPointersToC() {
  WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16C;
@@ -45,7 +45,8 @@ static void InitPointersToC() {
 }
 #endif

-#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
+#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) || \
+  (defined WEBRTC_ARCH_ARM64_NEON)
 /* Initialize function pointers to the Neon version. */
 static void InitPointersToNeon() {
  WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16Neon;
@@ -92,7 +93,7 @@ static void InitFunctionPointers(void) {
  } else {
    InitPointersToC();
  }
-#elif defined(WEBRTC_ARCH_ARM_NEON)
+#elif defined(WEBRTC_ARCH_ARM_NEON) || defined(WEBRTC_ARCH_ARM64_NEON)
  InitPointersToNeon();
 #elif defined(MIPS32_LE)
  InitPointersToMIPS();