Replace asm NEON function by intrinsics implementation on ARMv7

Passed building isac_neon and modules_unittests on Android ARMv7. Passed modules_unittests with the following filters: --gtest_filter=FiltersTest* --gtest_filter=LpcMaskingModelTest* --gtest_filter=TransformTest* --gtest_filter=FilterBanksTest* WebRtcIsacfix_CalculateResidualEnergyNeon is removed; see Issue 4224 for details. The old review URL is: https://webrtc-codereview.appspot.com/37259004/ BUG=4002 R=andrew@webrtc.org, jridges@masque.com, kjellander@webrtc.org Review URL: https://webrtc-codereview.appspot.com/48319005 Patch from Zhongwei Yao <zhongwei.yao@arm.com>. Change-Id: I4c16e15930f1b3449d67b67bf023fac28121dff8 Cr-Commit-Position: refs/heads/master@{#9140}
2015-05-06 16:39:17 +08:00 · 2015-05-06 16:39:17 +08:00 · f242e665b4
commit f242e665b4
parent 507a550af8
11 changed files with 14 additions and 1443 deletions
--- a/webrtc/modules/audio_coding/BUILD.gn
+++ b/webrtc/modules/audio_coding/BUILD.gn
@ -591,17 +591,14 @@ source_set("isacfix") {

 if (rtc_build_armv7_neon || current_cpu == "arm64") {
  source_set("isac_neon") {
-    sources = [ "codecs/isac/fix/source/entropy_coding_neon.c" ]
+    sources = [
+      "codecs/isac/fix/source/entropy_coding_neon.c",
+      "codecs/isac/fix/source/filters_neon.c",
+      "codecs/isac/fix/source/lattice_neon.c",
+      "codecs/isac/fix/source/transform_neon.c",
+    ]

    if (rtc_build_armv7_neon) {
-      sources += [
-        "codecs/isac/fix/source/filterbanks_neon.S",
-        "codecs/isac/fix/source/filters_neon.S",
-        "codecs/isac/fix/source/lattice_neon.S",
-        "codecs/isac/fix/source/lpc_masking_model_neon.S",
-        "codecs/isac/fix/source/transform_neon.S",
-      ]
-
      # Enable compilation for the ARM v7 Neon instruction set. This is needed
      # since //build/config/arm.gni only enables Neon for iOS, not Android.
      # This provides the same functionality as webrtc/build/arm_neon.gypi.
@ -614,18 +611,11 @@ if (rtc_build_armv7_neon || current_cpu == "arm64") {
      ]
    }

-    if (current_cpu == "arm64") {
-      sources += [
-        "codecs/isac/fix/source/filters_neon.c",
-        "codecs/isac/fix/source/lattice_neon.c",
-        "codecs/isac/fix/source/transform_neon.c",
-      ]
+    if (current_cpu != "arm64" || !is_clang) {
      # Disable AllpassFilter2FixDec16Neon function due to a clang bug.
      # Refer more details at:
      # https://code.google.com/p/webrtc/issues/detail?id=4567
-      if (!is_clang) {
        sources += [ "codecs/isac/fix/source/filterbanks_neon.c", ]
-      }
    }

    # Disable LTO in audio_processing_neon target due to compiler bug.
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
@ -1,270 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ Contains a function for WebRtcIsacfix_AllpassFilter2FixDec16Neon()
-@ in iSAC codec, optimized for ARM Neon platform. Bit exact with the
-@ reference C implementation in filterbanks.c (presumably
-@ WebRtcIsacfix_AllpassFilter2FixDec16C()). Prototype C code is at end of this file.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
-.align  2
-
-@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
-@    int16_t *data_ch1,  // Input and output in channel 1, in Q0
-@    int16_t *data_ch2,  // Input and output in channel 2, in Q0
-@    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
-@    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
-@    const int length,           // Length of the data buffers
-@    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
-@    int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
-
-DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
-  push {r4 - r7}              @ 16 bytes saved; stack args start at [sp, #16]
-
-  ldr r5, [sp, #24]           @ filter_state_ch2
-  ldr r6, [sp, #20]           @ filter_state_ch1
-
-  @ Initialize the Neon registers.
-  vld1.16 d0[0], [r0]!        @ data_ch1[0]
-  vld1.16 d0[2], [r1]!        @ data_ch2[0]
-  vld1.32 d30[0], [r2]        @ factor_ch1[0], factor_ch1[1]
-  vld1.32 d30[1], [r3]        @ factor_ch2[0], factor_ch2[1]
-  vld1.32 d16[0], [r6]!       @ filter_state_ch1[0]
-  vld1.32 d17[0], [r5]!       @ filter_state_ch2[0]
-  vneg.s16 d31, d30           @ d31 = negated factors
-
-  ldr r3, [sp, #16]           @ length
-  mov r4, #4                  @ Post offset value for the loop
-  mov r2, #-2                 @ Post offset value for the loop
-  sub r3, #2                  @ Loop counter
-
-  @ Loop unrolling pre-processing.
-  vqdmull.s16 q1, d30, d0
-  vshll.s16 q0, d0, #16
-  vqadd.s32 q2, q1, q8
-  vshrn.i32 d6, q2, #16
-  vmull.s16 q1, d31, d6
-  vshl.s32 q1, #1
-  vqadd.s32 q8, q1, q0
-  vld1.32 d16[1], [r6]        @ filter_state_ch1[1]
-  vld1.32 d17[1], [r5]        @ filter_state_ch2[1]
-  sub r6, #4                  @ &filter_state_ch1[0]
-  sub r5, #4                  @ &filter_state_ch2[0]
-  vld1.16 d6[1], [r0], r2     @ data_ch1[1]
-  vld1.16 d6[3], [r1], r2     @ data_ch2[1]
-  vrev32.16 d0, d6
-
-FOR_LOOP:
-  vqdmull.s16 q1, d30, d0
-  vshll.s16 q0, d0, #16
-  vqadd.s32 q2, q1, q8
-  vshrn.i32 d4, q2, #16
-  vmull.s16 q1, d31, d4
-  vst1.16 d4[1], [r0], r4     @ Store data_ch1[n]
-  vst1.16 d4[3], [r1], r4     @ Store data_ch2[n]
-  vshl.s32 q1, #1
-  vld1.16 d4[1], [r0], r2     @ Load data_ch1[n + 2]
-  vld1.16 d4[3], [r1], r2     @ Load data_ch2[n + 2]
-  vqadd.s32 q8, q1, q0
-  vrev32.16 d0, d4
-  vqdmull.s16 q1, d30, d0
-  subs r3, #2                 @ Two samples per channel consumed per iteration
-  vqadd.s32 q2, q1, q8
-  vshrn.i32 d6, q2, #16
-  vmull.s16 q1, d31, d6
-  vshll.s16 q0, d0, #16
-  vst1.16 d6[1], [r0], r4     @ Store data_ch1[n + 1]
-  vst1.16 d6[3], [r1], r4     @ Store data_ch2[n + 1]
-  vshl.s32 q1, #1
-  vld1.16 d6[1], [r0], r2     @ Load data_ch1[n + 3]
-  vld1.16 d6[3], [r1], r2     @ Load data_ch2[n + 3]
-  vqadd.s32 q8, q1, q0
-  vrev32.16 d0, d6
-  bgt FOR_LOOP
-
-  @ Loop unrolling post-processing.
-  vqdmull.s16 q1, d30, d0
-  vshll.s16 q0, d0, #16
-  vqadd.s32 q2, q1, q8
-  vshrn.i32 d4, q2, #16
-  vmull.s16 q1, d31, d4
-  vst1.16 d4[1], [r0]!        @ Store data_ch1[n]
-  vst1.16 d4[3], [r1]!        @ Store data_ch2[n]
-  vshl.s32 q1, #1
-  vqadd.s32 q8, q1, q0
-  vrev32.16 d0, d4
-  vqdmull.s16 q1, d30, d0
-  vshll.s16 q0, d0, #16
-  vqadd.s32 q2, q1, q8
-  vshrn.i32 d6, q2, #16
-  vmull.s16 q1, d31, d6
-  vst1.16 d6[1], [r0]         @ Store data_ch1[n + 1]
-  vst1.16 d6[3], [r1]         @ Store data_ch2[n + 1]
-  vshl.s32 q1, #1
-  vst1.32 d16[0], [r6]!       @ Store filter_state_ch1[0]
-  vqadd.s32 q9, q1, q0
-  vst1.32 d17[0], [r5]!       @ Store filter_state_ch2[0]
-  vst1.32 d18[1], [r6]        @ Store filter_state_ch1[1]
-  vst1.32 d19[1], [r5]        @ Store filter_state_ch2[1]
-
-  pop {r4 - r7}
-  bx lr
-
-@void AllpassFilter2FixDec16BothChannels(
-@    int16_t *data_ch1,  // Input and output in channel 1, in Q0
-@    int16_t *data_ch2,  // Input and output in channel 2, in Q0
-@    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
-@    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
-@    const int length,  // Length of the data buffers
-@    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
-@    int32_t *filter_state_ch2) {  // Filter state for channel 2, in Q16
-@  int n = 0;
-@  int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
-@  int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
-@  int16_t sample0_ch1 = 0, sample0_ch2 = 0;
-@  int16_t sample1_ch1 = 0, sample1_ch2  = 0;
-@  int32_t a0_ch1 = 0, a0_ch2 = 0;
-@  int32_t b0_ch1 = 0, b0_ch2 = 0;
-@
-@  int32_t a1_ch1 = 0, a1_ch2 = 0;
-@  int32_t b1_ch1 = 0, b1_ch2 = 0;
-@  int32_t b2_ch1  = 0, b2_ch2 = 0;
-@
-@  // Loop unrolling preprocessing.
-@
-@  sample0_ch1 = data_ch1[n];
-@  sample0_ch2 = data_ch2[n];
-@
-@  a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
-@  a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
-@
-@  b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
-@  b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
-@
-@  a0_ch1 = -factor_ch1[0] * (int16_t)(b0_ch1 >> 16);
-@  a0_ch2 = -factor_ch2[0] * (int16_t)(b0_ch2 >> 16);
-@
-@  state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
-@  state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
-@
-@  sample1_ch1 = data_ch1[n + 1];
-@  sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
-@  sample1_ch2  = data_ch2[n + 1];
-@  sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
-@
-@
-@  for (n = 0; n < length - 2; n += 2) {
-@    a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
-@    a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
-@    a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
-@    a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
-@
-@    b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
-@    b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); //Q16+Q16=Q16
-@    b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); //Q16+Q16=Q16
-@    b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); //Q16+Q16=Q16
-@
-@    a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
-@    a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
-@    a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
-@    a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
-@
-@    state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
-@    state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
-@    state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
-@    state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
-@
-@    sample0_ch1 = data_ch1[n + 2];
-@    sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
-@    sample0_ch2 = data_ch2[n + 2];
-@    sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
-@
-@    a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
-@    a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
-@    a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
-@    a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
-@
-@    b2_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
-@    b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
-@    b2_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
-@    b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
-@
-@    a0_ch1 = -factor_ch1[0] * (int16_t)(b2_ch1 >> 16);
-@    a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
-@    a0_ch2 = -factor_ch2[0] * (int16_t)(b2_ch2 >> 16);
-@    a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
-@
-@    state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
-@    state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
-@    state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
-@    state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
-@
-@
-@    sample1_ch1 = data_ch1[n + 3];
-@    sample0_ch1 = (int16_t) (b2_ch1  >> 16); //Save as Q0
-@    sample1_ch2 = data_ch2[n + 3];
-@    sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
-@
-@    data_ch1[n]     = (int16_t) (b0_ch1 >> 16); //Save as Q0
-@    data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
-@    data_ch2[n]     = (int16_t) (b0_ch2 >> 16);
-@    data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
-@  }
-@
-@  // Loop unrolling post-processing.
-@
-@  a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
-@  a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
-@  a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
-@  a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
-@
-@  b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
-@  b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1);
-@  b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2);
-@  b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2);
-@
-@  a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
-@  a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
-@  a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
-@  a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
-@
-@  state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
-@  state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
-@  state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
-@  state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
-@
-@  data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
-@  data_ch2[n] = (int16_t) (b0_ch2 >> 16);
-@
-@  sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
-@  sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
-@
-@  a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
-@  a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
-@
-@  b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
-@  b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
-@
-@  a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
-@  a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
-@
-@  state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
-@  state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
-@
-@  data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
-@  data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
-@
-@  filter_state_ch1[0] = state0_ch1;
-@  filter_state_ch1[1] = state1_ch1;
-@  filter_state_ch2[0] = state0_ch2;
-@  filter_state_ch2[1] = state1_ch2;
-@}
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
@ -1,145 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-@ Reference code in filters.c. Output is bit-exact.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
-.align  2
-
-@ int WebRtcIsacfix_AutocorrNeon(
-@     int32_t* __restrict r,
-@     const int16_t* __restrict x,
-@     int16_t N,
-@     int16_t order,
-@     int16_t* __restrict scale);
-
-DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
-  push       {r3 - r12}            @ 10 registers = 40 bytes; 'scale' arg is at [sp, #40]
-
-  @ Constant initializations
-  mov        r4, #33
-  vmov.i32   d0, #0
-  vmov.i32   q8, #0
-  vmov.i32   d29, #0               @ Initialize (-scale).
-  vmov.u8    d30, #255             @ Initialize d30 as -1.
-  vmov.i32   d0[0], r4             @ d0: 33 (low word), 0 (high word)
-  vmov.i32   d25, #32
-
-  mov        r5, r1                @ x
-  mov        r6, r2                @ N
-
-@ Generate the first coefficient r0.
-LOOP_R0:                           @ 4 samples per pass. NOTE(review): presumably N is a positive multiple of 4 -- confirm
-  vld1.16    {d18}, [r5]!          @ x[]
-  subs       r6, r6, #4
-  vmull.s16  q9, d18, d18
-  vpadal.s32 q8, q9
-  bgt        LOOP_R0
-
-  vadd.i64   d16, d16, d17
-
-  @ Calculate scaling (the value of shifting).
-  vmov       d17, d16
-
-  @ Check overflow and determine the value for 'scale'.
-  @ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and
-  @ lower 32-bit words. Note that we don't care about the value of the upper
-  @ word in d17.
-
-  @ Check the case of 1 bit overflow. If it occurs store the results for
-  @ scale and r[0] in d17 and d29.
-
-  vshr.u64   d3, d16, #1
-  vclt.s32   d1, d16, #0           @ < 0 ?
-  vbit       d17, d3, d1           @ For r[0]
-  vbit       d29, d30, d1          @ -scale = -1
-
-  @ For the case of more than 1 bit overflow. If it occurs overwrite the
-  @ results for scale and r[0] in d17 and d29.
-  vclz.s32   d5, d16               @ Leading zeros of the two 32 bit words.
-  vshr.s64   d26, d5, #32          @ Keep only the upper 32 bits.
-  vsub.i64   d31, d26, d0          @ zeros - 33
-  vshl.i64   d27, d26, #32
-  vorr       d27, d26              @ Duplicate the high word with its low one.
-  vshl.u64   d2, d16, d31          @ Shift by (-scale).
-  vclt.s32   d1, d27, d25          @ < 32 ?
-  vbit       d17, d2, d1           @ For r[0]
-  vbit       d29, d31, d1          @ -scale
-
-  vst1.32    d17[0], [r0]!         @ r[0]
-  mov        r5, #1                @ outer loop counter
-
-@ Generate rest of the coefficients
-LOOP_R:
-  vmov.i32   q8, #0                @ Initialize the accumulation result.
-  vmov.i32   q9, #0                @ Initialize the accumulation result.
-  mov        r7, r1                @ &x[0]
-  add        r6, r7, r5, lsl #1    @ x[i]
-  sub        r12, r2, r5           @ N - i
-  lsr        r8, r12, #3           @ inner loop counter
-  sub        r12, r8, lsl #3       @ Leftover samples to be processed
-
-LOOP_8X_SAMPLES:                   @ Multiple of 8 samples. NOTE(review): body runs before r8 is tested, so presumably N - i >= 8 -- confirm
-  vld1.16    {d20, d21}, [r7]!     @ x[0, ...]
-  vld1.16    {d22, d23}, [r6]!     @ x[i, ...]
-  vmull.s16  q12, d20, d22
-  vmull.s16  q13, d21, d23
-  subs       r8, #1
-  vpadal.s32 q8, q12
-  vpadal.s32 q9, q13
-  bgt        LOOP_8X_SAMPLES
-
-  cmp r12, #4                      @ At least 4 leftover samples?
-  blt REST_SAMPLES
-
-Four_SAMPLES:                      @ One group of 4 leftover samples
-  vld1.16    d20, [r7]!            @ x[0, ...]
-  vld1.16    d22, [r6]!            @ x[i, ...]
-  vmull.s16  q12, d20, d22
-  vpadal.s32 q8, q12
-  sub r12, #4                      @ Fewer than 4 samples remain after this
-
-REST_SAMPLES:
-  mov        r8, #0                @ Initialize lower word of the accumulation.
-  mov        r4, #0                @ Initialize upper word of the accumulation.
-  cmp r12, #0
-  ble SUMUP
-
-LOOP_REST_SAMPLES:
-  ldrh       r9, [r7], #2          @ x[0, ...]
-  ldrh       r10, [r6], #2         @ x[i, ...]
-  smulbb     r11, r9, r10
-  adds       r8, r8, r11           @ lower word of the accumulation.
-  adc        r4, r4, r11, asr #31  @ upper word of the accumulation.
-  subs       r12, #1
-  bgt        LOOP_REST_SAMPLES
-
-@ Added the multiplication results together and do a shift.
-SUMUP:
-  vadd.i64   d16, d17
-  vadd.i64   d18, d19
-  vadd.i64   d18, d16
-  vmov       d17, r8, r4           @ Scalar partial sum as a 64-bit value.
-  vadd.i64   d18, d17
-  vshl.s64   d18, d29              @ Shift left by (-scale).
-  vst1.32    d18[0], [r0]!         @ r[i]
-
-  add        r5, #1
-  cmp        r5, r3                @ i <= order ?
-  ble        LOOP_R
-
-  vneg.s32   d29, d29              @ Get value for 'scale'.
-  ldr        r2, [sp, #40]         @ &scale
-  add        r0, r3, #1            @ return (order + 1)
-  vst1.s16   d29[0], [r2]          @ Store 'scale'
-
-  pop        {r3 - r12}
-  bx         lr
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c
@ -205,10 +205,6 @@ static void WebRtcIsacfix_InitNeon(void) {
  WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon;
  WebRtcIsacfix_Spec2Time = WebRtcIsacfix_Spec2TimeNeon;
  WebRtcIsacfix_Time2Spec = WebRtcIsacfix_Time2SpecNeon;
-#if !(defined WEBRTC_ARCH_ARM64_NEON)
-  WebRtcIsacfix_CalculateResidualEnergy =
-      WebRtcIsacfix_CalculateResidualEnergyNeon;
-#endif
 // Disable AllpassFilter2FixDec16Neon function due to a clang bug.
 // Refer more details at:
 // https://code.google.com/p/webrtc/issues/detail?id=4567
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S
@ -1,146 +0,0 @@
-@
-@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ lattice_neon.s
-@
-@ Contains a function for the core loop in the normalized lattice MA 
-@ filter routine for iSAC codec, optimized for ARM Neon platform.
-@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
-@                                     int16_t input1,
-@                                     int32_t input2,
-@                                     int32_t* ptr0,
-@                                     int32_t* ptr1,
-@                                     int32_t* __restrict ptr2);
-@ It calculates
-@   *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
-@   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
-@ in Q15 domain.
-@
-@ Reference code in lattice.c.
-@ Output is not bit-exact with the reference C code, due to the replacement
-@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
-@ instructions, smulwb, and smull. Speech quality was not degraded by
-@ testing speech and tone vectors.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-#include "settings.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
-.align  2
-DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
-  push        {r4-r8}             @ 20 bytes saved; stack args start at [sp, #20]
-
-  vdup.32     d28, r0             @ Initialize Neon register with input0
-  vdup.32     d29, r1             @ Initialize Neon register with input1
-  vdup.32     d30, r2             @ Initialize Neon register with input2
-  ldr         r4, [sp, #20]       @ ptr1
-  ldr         r12, [sp, #24]      @ ptr2
-
-  @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
-  @ Leftover samples after the loop, in r6:
-  @    r6 = (HALF_SUBFRAMELEN - 1) - (((HALF_SUBFRAMELEN - 1) >> 2) << 2)
-  mov         r6, #HALF_SUBFRAMELEN
-  sub         r6, #1
-  lsr         r5, r6, #2
-  sub         r6, r5, lsl #2
-
-  @ First r5 iterations in a loop.
-
-LOOP:                             @ 4 samples per iteration
-  vld1.32     {d0, d1}, [r3]!     @ *ptr0
-
-  vmull.s32   q10, d0, d28        @ tmp32a = input0 * (*ptr0)
-  vmull.s32   q11, d1, d28        @ tmp32a = input0 * (*ptr0)
-  vmull.s32   q12, d0, d29        @ input1 * (*ptr0)
-  vmull.s32   q13, d1, d29        @ input1 * (*ptr0)
-                                  
-  vrshrn.i64  d4, q10, #15        @ tmp32a: rounding >> 15, narrowed to 32 bits
-  vrshrn.i64  d5, q11, #15        @ tmp32a: rounding >> 15, narrowed to 32 bits
-                                  
-  vld1.32     {d2, d3}, [r12]     @ *ptr2
-  vadd.i32    q3, q2, q1          @ tmp32b = *ptr2 + tmp32a
-                                  
-  vrshrn.i64  d0, q12, #15        @ (input1 * (*ptr0)): rounding >> 15
-                                  
-  vmull.s32   q10, d6, d30        @ input2 * tmp32b
-  vmull.s32   q11, d7, d30        @ input2 * tmp32b
-
-  vrshrn.i64  d16, q10, #16       @ New *ptr2: rounding >> 16
-  vrshrn.i64  d17, q11, #16       @ New *ptr2: rounding >> 16
-
-  vmull.s32   q10, d16, d28       @ input0 * (*ptr2)
-  vmull.s32   q11, d17, d28       @ input0 * (*ptr2)
-
-  vrshrn.i64  d1, q13, #15        @ (input1 * (*ptr0)): rounding >> 15
-  vrshrn.i64  d18, q10, #15       @ (input0 * (*ptr2)): rounding >> 15
-  vrshrn.i64  d19, q11, #15       @ (input0 * (*ptr2)): rounding >> 15
-
-  vst1.32     {d16, d17}, [r12]!  @ *ptr2
-
-  vadd.i32    q9, q0, q9          @ input1 * (*ptr0) + input0 * (*ptr2)
-  subs        r5, #1
-  vst1.32     {d18, d19}, [r4]!   @ *ptr1
-
-  bgt         LOOP
-
-  @ Check how many samples still need to be processed.
-  subs        r6, #2
-  blt         LAST_SAMPLE
-
-  @ Process two more samples:
-  vld1.32     d0, [r3]!           @ *ptr0
-
-  vmull.s32   q11, d0, d28        @ tmp32a = input0 * (*ptr0)
-  vmull.s32   q13, d0, d29        @ input1 * (*ptr0)
-
-  vld1.32     d18, [r12]          @ *ptr2
-  vrshrn.i64  d4, q11, #15        @ tmp32a: rounding >> 15
-
-  vadd.i32    d7, d4, d18         @ tmp32b = *ptr2 + tmp32a
-  vmull.s32   q11, d7, d30        @ input2 * tmp32b
-  vrshrn.i64  d16, q11, #16       @ New *ptr2: rounding >> 16
-
-  vmull.s32   q11, d16, d28       @ input0 * (*ptr2)
-  vst1.32     d16, [r12]!         @ *ptr2
-
-  vrshrn.i64  d0, q13, #15        @ (input1 * (*ptr0)): rounding >> 15
-  vrshrn.i64  d19, q11, #15       @ (input0 * (*ptr2)): rounding >> 15
-  vadd.i32    d19, d0, d19        @ input1 * (*ptr0) + input0 * (*ptr2)
-
-  vst1.32     d19, [r4]!          @ *ptr1
-
-  @ If there's still one more sample, process it here.
-LAST_SAMPLE:
-  cmp         r6, #1
-  bne         END
-
-  @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
-  
-  ldr         r7, [r3]            @ *ptr0
-  ldr         r8, [r12]           @ *ptr2
-
-  smulwb      r5, r7, r0          @ tmp32a = *ptr0 * input0 >> 16
-  add         r8, r8, r5, lsl #1  @ tmp32b = *ptr2 + (tmp32a << 1)
-  smull       r5, r6, r8, r2      @ tmp32b * input2, in 64 bits
-  lsl         r6, #16
-  add         r6, r5, lsr #16     @ Only take the middle 32 bits
-  str         r6, [r12]           @ Output (*ptr2, as 32 bits)
-
-  @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
-
-  smulwb      r5, r7, r1          @ tmp32a = *ptr0 * input1 >> 16
-  smulwb      r6, r6, r0          @ tmp32b = *ptr2 * input0 >> 16
-  lsl         r5, r5, #1
-  add         r5, r6, lsl #1
-  str         r5, [r4]            @ Output (*ptr1)
-
-END:
-  pop         {r4-r8}
-  bx          lr
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h
@ -53,15 +53,6 @@ int32_t WebRtcIsacfix_CalculateResidualEnergyC(int lpc_order,
                                               int32_t* corr_coeffs,
                                               int* q_val_residual_energy);

-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
-int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
-                                                  int32_t q_val_corr,
-                                                  int q_val_polynomial,
-                                                  int16_t* a_polynomial,
-                                                  int32_t* corr_coeffs,
-                                                  int* q_val_residual_energy);
-#endif
-
 #if defined(MIPS_DSP_R2_LE)
 int32_t WebRtcIsacfix_CalculateResidualEnergyMIPS(int lpc_order,
                                                  int32_t q_val_corr,
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S
@ -1,173 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ Contains a function for WebRtcIsacfix_CalculateResidualEnergyNeon() in
-@ iSAC codec, optimized for ARM Neon platform. Reference code in
-@ lpc_masking_model.c.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
-.align  2
-
-@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
-@                                                   int32_t q_val_corr,
-@                                                   int q_val_polynomial,
-@                                                   int16_t* a_polynomial,
-@                                                   int32_t* corr_coeffs,
-@                                                   int* q_val_residual_energy);
-DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
-  push {r4-r11}               @ 32 bytes saved
-
-  sub r13, r13, #16           @ 16 bytes of locals; stack args now start at [r13, #48]
-  str r1, [r13, #8]           @ Save q_val_corr
-  str r2, [r13, #12]          @ Save q_val_polynomial
-
-  mov r4, #1
-  vmov.s64 q11, #0            @ Initialize shift_internal.
-  vmov.s64 q13, #0            @ Initialize sum64.
-  vmov.s64 q10, #0
-  vmov.u8 d20[0], r4          @ d20 = 1, d21 = 0 (as 64-bit lanes).
-
-  cmp r0, #0
-  blt POST_LOOP_I
-
-  add r9, r3, r0, asl #1      @ &a_polynomial[lpc_order]
-  mov r6, #0                  @ Loop counter i.
-  ldr r11, [r13, #48]         @ corr_coeffs
-  sub r10, r0, #1
-  mov r7, r3                  @ &a_polynomial[0]
-  str r9, [r13, #4]           @ Save &a_polynomial[lpc_order]
-
-LOOP_I:
-  ldr r2, [r11], #4            @ corr_coeffs[i]
-  vmov.s64 q15, #0            @ Initialize the sum64_tmp.
-  vdup.s32 d25, r2
-
-  cmp r0, r6                  @ Compare lpc_order to i.
-  movle r2, r6
-  ble POST_LOOP_J
-
-  mov r1, r6                  @ j = i;
-  mov r12, r7                  @ &a_polynomial[i]
-  mov r4, r3                  @ &a_polynomial[j - i]
-
-LOOP_J:
-  ldr r8, [r12], #4           @ Two 16-bit values: a_polynomial[j], a_polynomial[j + 1]
-  ldr r5, [r4], #4            @ Two 16-bit values: a_polynomial[j - i], a_polynomial[j - i + 1]
-  vmov.u32 d0[0], r8
-  vmov.u32 d1[0], r5
-  vmull.s16 q0, d0, d1
-  vmull.s32 q0, d0, d25
-  cmp r6, #0                  @ i == 0?
-  vshl.s64 q0, q11
-  beq SUM1
-  vshl.s64 q0, #1
-
-SUM1:
-  vqadd.s64 q14, q0, q15      @ Sum and test overflow.
-  add r1, r1, #2
-  bvc MOV1                    @ Skip the shift if there's no overflow. NOTE(review): bvc reads APSR.V, but vqadd reports saturation in FPSCR.QC, not APSR -- confirm this test works.
-  vshr.s64 q0, #1
-  vshr.s64 q15, #1
-  vadd.s64 q14, q0, q15
-  vsub.s64 q11, q10
-
-MOV1:
-  cmp r0, r1                  @ Compare lpc_order to j.
-  vmov.s64 q15, q14
-  bgt LOOP_J
-
-  bic r1, r10, #1
-  add r2, r6, #2
-  add r2, r1, r2
-
-POST_LOOP_J:
-  vqadd.s64 q0, q13, q15      @ Sum and test overflow.
-  bvc MOV2                    @ Skip the shift if there's no overflow. (Same bvc caveat as at SUM1.)
-  vshr.s64 q13, #1
-  vshr.s64 q15, #1
-  vadd.s64 q0, q13, q15
-  vsub.s64 q11, q10
-
-MOV2:
-  vmov.s64 q13, q0            @ update sum64.
-  cmp r2, r0
-  bne CHECK_LOOP_CONDITION
-
-  @ Last sample in the inner loop.
-  ldr r4, [r13, #4]           @ &a_polynomial[lpc_order]
-  ldrsh r8, [r4]              @ a_polynomial[lpc_order]
-  ldrsh r12, [r9]             @ a_polynomial[lpc_order - i]
-  mul r8, r8, r12
-  vmov.s32 d0[0], r8
-  vmull.s32 q0, d0, d25
-  cmp r6, #0                  @ i == 0?
-  vshl.s64 q0, q11
-  beq SUM2
-  vshl.s64 q0, #1
-
-SUM2:
-  vqadd.s64 d1, d0, d26       @ Sum and test overflow.
-  bvc MOV3                    @ Skip the shift if there's no overflow. (Same bvc caveat as at SUM1.)
-  vshr.s64 q13, #1
-  vshr.s64 d0, #1
-  vadd.s64 d1, d0, d26
-  vsub.s64 q11, q10
-
-MOV3:
-  vmov.s64 d26, d1            @ update sum64.
-
-CHECK_LOOP_CONDITION:
-  add r6, r6, #1
-  sub r9, r9, #2
-  cmp r0, r6                  @ Compare i to lpc_order.
-  sub r10, r10, #1
-  add r7, r7, #2
-  bge LOOP_I
-
-POST_LOOP_I:
-  mov r3, #0
-  vqadd.s64 d0, d26, d27      @ Sum and test overflow.
-  bvc GET_SHIFT_NORM          @ Skip the shift if there's no overflow. (Same bvc caveat as at SUM1.)
-  vshr.s64 q13, #1
-  vadd.s64 d0, d26, d27
-  vsub.s64 q11, q10
-
-GET_SHIFT_NORM:
-  vcls.s32 d1, d0             @ Count leading extra sign bits.
-  vmov.32 r2, d1[1]           @ Store # of sign bits of only the 32 MSBs.
-  vmovl.s32 q1, d1
-  vshl.s64 d0, d3             @ d3 contains # of sign bits of the 32 MSBs.
-
-  vcls.s32 d1, d0             @ Count again the leading extra sign bits.
-  vmov.s32 r1, d1[1]          @ Store # of sign bits of only the 32 MSBs.
-  vmovl.s32 q1, d1
-  vshl.s64 d0, d3             @ d3 contains # of sign bits of the 32 MSBs.
-
-  vmov.s32 r0, d0[1]          @ residual_energy
-  vmov.s32 r3, d22[0]         @ shift_internal
-
-  @ Calculate the value for q_val_residual_energy.
-  ldr r4, [r13, #8]            @ q_val_corr
-  ldr r5, [r13, #12]           @ q_val_polynomial
-  sub r12, r4, #32
-  add r12, r12, r5, asl #1
-  add r1, r12, r1              @ add 1st part of shift_internal.
-  add r12, r1, r2              @ add 2nd part of shift_internal.
-  ldr r2, [r13, #52]           @ &q_val_residual_energy
-  add r3, r12, r3              @ value for q_val_residual_energy.
-  str r3, [r2, #0]
-
-  add r13, r13, #16
-  pop {r4-r11}
-  bx  r14
-
-
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc
@ -58,11 +58,4 @@ class LpcMaskingModelTest : public testing::Test {

 TEST_F(LpcMaskingModelTest, CalculateResidualEnergyTest) {
  CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyC);
-#ifdef WEBRTC_DETECT_ARM_NEON
-  if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
-    CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyNeon);
-  }
-#elif defined(WEBRTC_ARCH_ARM_NEON)
-  CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyNeon);
-#endif
 }
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_neon.S
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_neon.S
@ -1,645 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-@ Reference code is in transform.c. The output is not bit-exact with it,
-@ because the C code and the ARM instructions round differently, but the
-@ quality of the assembly code's output is not worse.
-
-#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_Spec2TimeNeon
-GLOBAL_FUNCTION WebRtcIsacfix_Time2SpecNeon
-GLOBAL_LABEL WebRtcIsacfix_kSinTab1
-GLOBAL_LABEL WebRtcIsacfix_kCosTab1
-GLOBAL_LABEL WebRtcIsacfix_kSinTab2
-
-@ void WebRtcIsacfix_Time2SpecNeon(int16_t* inre1Q9,
-@                                  int16_t* inre2Q9,
-@                                  int16_t* outreQ7,
-@                                  int16_t* outimQ7);
-
-DEFINE_FUNCTION WebRtcIsacfix_Time2SpecNeon
-.align  2
-  push {r3-r11,lr}            @ need to push r4-r11, but push r3 too to keep
-                              @ stack 8-byte aligned
-  sub sp, sp, #(16 + FRAMESAMPLES * 4)  @ 4 saved args + tmpreQ16 + tmpimQ16
-
-  str r0, [sp]                @ inre1Q9
-  str r1, [sp, #4]            @ inre2Q9
-  str r2, [sp, #8]            @ outreQ7
-  str r3, [sp, #12]           @ outimQ7
-
-  mov r8, #(FRAMESAMPLES - 16)
-  add r12, r0, r8             @ &inre1Q9[FRAMESAMPLES / 2 - 8]; never read
-  add r11, r1, r8             @ &inre2Q9[FRAMESAMPLES / 2 - 8]; never read
-  add r4, sp, #16             @ tmpreQ16;
-  add r5, sp, #(16 + FRAMESAMPLES * 2)  @ tmpimQ16;
-
-  adr r9, WebRtcIsacfix_kCosTab1
-#if defined(__APPLE__)
-  mov r6, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
-#else
-  mov r6, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
-#endif
-  add r10, r9, r6             @ WebRtcIsacfix_kSinTab1
-
-  vmov.u32 q14, #0            @ Initialize the running max of |tmpreQ16[]|.
-  vmov.u32 q15, #0            @ Initialize the running max of |tmpimQ16[]|.
-  movw r6, #16921             @ 0.5 / sqrt(240) in Q19
-  lsl r6, #5                  @ Together with vqdmulh, net effect is ">> 26".
-  mov r8, #(FRAMESAMPLES / 2) @ loop counter
-  vdup.s32 q11, r6
-
-Time2Spec_TransformAndFindMax:
-@ Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
-
-  subs r8, #8
-
-  vld1.16 {q0}, [r9, :64]!    @ WebRtcIsacfix_kCosTab1[]
-  vld1.16 {q2}, [r0]!         @ inre1Q9[]
-  vmull.s16 q8, d0, d4        @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k]
-  vld1.16 {q1}, [r10, :64]!   @ WebRtcIsacfix_kSinTab1[]
-  vmull.s16 q9, d1, d5        @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k]
-  vld1.16 {q3}, [r1]!         @ inre2Q9[]
-  vmlal.s16 q8, d2, d6        @ WebRtcIsacfix_kSinTab1[k] * inre2Q9[k]
-  vmlal.s16 q9, d3, d7        @ WebRtcIsacfix_kSinTab1[k] * inre2Q9[k]
-  vmull.s16 q12, d0, d6       @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k]
-  vmull.s16 q13, d1, d7       @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k]
-  vmlsl.s16 q12, d2, d4       @ WebRtcIsacfix_kSinTab1[k] * inre1Q9[k]
-  vmlsl.s16 q13, d3, d5       @ WebRtcIsacfix_kSinTab1[k] * inre1Q9[k]
-
-  vqdmulh.s32 q0, q8, q11     @ xrQ16 * factQ19
-  vqdmulh.s32 q1, q9, q11     @ xrQ16 * factQ19
-  vqdmulh.s32 q2, q12, q11    @ xiQ16 * factQ19
-  vqdmulh.s32 q3, q13, q11    @ xiQ16 * factQ19
-
-  @ Find the absolute maximum in the vectors and store them.
-  vabs.s32 q8, q0
-  vabs.s32 q9, q1
-  vabs.s32 q12, q2
-  vst1.32  {q0, q1}, [r4]!    @ tmpreQ16[k]
-  vabs.s32 q13, q3
-  vmax.u32 q14, q8            @ Use u32 so we don't lose the value 0x80000000.
-  vmax.u32 q15, q12
-  vst1.32  {q2, q3}, [r5]!    @ tmpimQ16[k]
-  vmax.u32 q15, q13           @ Maximum for tmpimQ16[].
-  vmax.u32 q14, q9            @ Maximum for tmpreQ16[].
-
-  bgt Time2Spec_TransformAndFindMax
-
-  @ Find the maximum value in the Neon registers
-  vmax.u32 d28, d29
-  vmax.u32 d30, d31
-  vpmax.u32 d28, d28, d28     @ Both 32-bit words hold the max of |tmpreQ16[]|.
-  vpmax.u32 d30, d30, d30     @ Both 32-bit words hold the max of |tmpimQ16[]|.
-  vmax.s32 d30, d28, d30      @ if (yrQ16 > xrQ16) {xrQ16 = yrQ16};
-
-  ldr r4, [sp]                @ inre1Q9
-  vcls.s32  d31, d30          @ sh = WebRtcSpl_NormW32(tmpInRe);
-  ldr r5, [sp, #4]            @ inre2Q9
-  vmov.i32  d30, #24
-  add r6, sp, #16             @ tmpreQ16;
-  vsub.s32  d31, d31, d30     @ sh = sh - 24;
-  add r7, sp, #(16 + FRAMESAMPLES * 2)  @ tmpimQ16;
-  vdup.s32  q8, d31[0]                  @ sh
-
-  mov r8, #(FRAMESAMPLES / 2) @ loop counter
-
-Time2Spec_PreFftShift:
-  subs r8, #16
-
-  vld1.32 {q0, q1}, [r6]!     @ tmpreQ16[]
-  vrshl.s32 q0, q0, q8
-  vld1.32 {q2, q3}, [r6]!     @ tmpreQ16[]
-  vrshl.s32 q1, q1, q8
-  vld1.32 {q10, q11}, [r7]!   @ tmpimQ16[]
-  vrshl.s32 q2, q2, q8
-  vld1.32 {q12, q13}, [r7]!   @ tmpimQ16[]
-  vrshl.s32 q3, q3, q8
-  vrshl.s32 q10, q10, q8
-  vrshl.s32 q11, q11, q8
-  vrshl.s32 q12, q12, q8
-  vrshl.s32 q13, q13, q8
-
-  vmovn.s32 d0, q0
-  vmovn.s32 d1, q1
-  vmovn.s32 d2, q2
-  vmovn.s32 d3, q3
-  vmovn.s32 d4, q10
-  vmovn.s32 d5, q11
-  vmovn.s32 d6, q12
-  vmovn.s32 d7, q13
-
-  vst1.16 {q0, q1}, [r4]!     @ inre1Q9[]
-  vst1.16 {q2, q3}, [r5]!     @ inre2Q9[]
-
-  bgt Time2Spec_PreFftShift
-
-  vmov.s32 r10, d16[0]        @ Store value of sh.
-  ldr r0, [sp]                @ inre1Q9
-  ldr r1, [sp, #4]            @ inre2Q9
-  mov r2, #-1                 @ Third argument of the FFT call below.
-  CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest
-
-  vdup.s32 q8, r10            @ sh
-  mov r8, #(FRAMESAMPLES - 8)
-  ldr r2, [sp, #8]            @ outreQ7
-  ldr r3, [sp, #12]           @ outimQ7
-  add r11, r2, r8             @ &outreQ7[FRAMESAMPLES / 2 - 4]
-  add r12, r3, r8             @ &outimQ7[FRAMESAMPLES / 2 - 4]
-  ldr r6, [sp]                @ inre1Q9
-  ldr r7, [sp, #4]            @ inre2Q9
-  add r4, r6, r8              @ &inre1Q9[FRAMESAMPLES / 2 - 4]
-  add r5, r7, r8              @ &inre2Q9[FRAMESAMPLES / 2 - 4]
-  adr r10, WebRtcIsacfix_kSinTab2
-
-  add r9, r10, #(120*2 - 8)   @ &WebRtcIsacfix_kSinTab2[120 - 4]
-
-  vneg.s32 q15, q8            @ -sh
-  vmov.i32 q0, #23
-  vsub.s32 q15, q15, q0       @ -sh - 23
-
-  mov r8, #(FRAMESAMPLES / 4) @ loop counter
-
-  @ Pre-load variables.
-  vld1.16 {d2}, [r4]          @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
-  vld1.16 {d3}, [r5]          @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
-  vld1.16 {d0}, [r6]!         @ inre1Q9
-  vld1.16 {d1}, [r7]!         @ inre2Q9
-
-Time2Spec_PostFftTransform:
-@ By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
-@ ">> 14" and then ">> 9" as in the C code.
-
-  vld1.16 {d6}, [r9, :64]     @ kCosTab2[]
-  vneg.s16 d6, d6
-  vld1.16 {d7}, [r10, :64]!   @ WebRtcIsacfix_kSinTab2[]
-  vrev64.16 q1, q1            @ Reverse samples in 2nd half of xrQ16[].
-  vqadd.s16 d4, d0, d2        @ xrQ16
-  vqsub.s16 d5, d1, d3        @ xiQ16
-  vrev64.16 d6, d6
-
-  sub r9, #8                  @ Update pointers for kCosTab2[].
-  sub r4, #8                  @ Update pointers for inre1Q9[].
-  sub r5, #8                  @ Update pointers for inre2Q9[].
-  subs r8, #4                 @ Update loop counter.
-
-  vqadd.s16 d1, d1, d3        @ yrQ16
-  vqsub.s16 d0, d2, d0        @ yiQ16
-
-  vmull.s16 q12, d6, d4       @ kCosTab2[k] * xrQ16
-  vmlsl.s16 q12, d7, d5       @ WebRtcIsacfix_kSinTab2[k] * xiQ16
-  vmull.s16 q13, d7, d4       @ WebRtcIsacfix_kSinTab2[k] * xrQ16
-  vmlal.s16 q13, d6, d5       @ kCosTab2[k] * xiQ16
-  vmull.s16 q9, d7, d1        @ WebRtcIsacfix_kSinTab2[k] * yrQ16
-  vmlal.s16 q9, d6, d0        @ kCosTab2[k] * yiQ16
-  vmull.s16 q10, d7, d0       @ WebRtcIsacfix_kSinTab2[k] * yiQ16
-  vmlsl.s16 q10, d6, d1       @ kCosTab2[k] * yrQ16
-
-  vshl.s32 q12, q12, q15
-  vshl.s32 q13, q13, q15
-  vshl.s32 q9, q9, q15
-  vshl.s32 q10, q10, q15
-
-  vneg.s32 q8, q9
-  vld1.16 {d0}, [r6]!         @ inre1Q9
-  vmovn.s32 d24, q12
-  vld1.16 {d1}, [r7]!         @ inre2Q9
-  vmovn.s32 d25, q13
-  vld1.16 {d2}, [r4]          @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
-  vmovn.s32 d5, q10
-  vld1.16 {d3}, [r5]          @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
-  vmovn.s32 d4, q8
-  vst1.16  {d24}, [r2]!       @ outreQ7[k]
-  vrev64.16 q2, q2            @ Reverse the order of the samples.
-  vst1.16  {d25}, [r3]!       @ outimQ7[k]
-  vst1.16 {d4}, [r11]         @ outreQ7[FRAMESAMPLES / 2 - 1 - k]
-  vst1.16 {d5}, [r12]         @ outimQ7[FRAMESAMPLES / 2 - 1 - k]
-  sub r11, #8                 @ Update pointers for outreQ7[].
-  sub r12, #8                 @ Update pointers for outimQ7[].
-
-  bgt Time2Spec_PostFftTransform
-
-  add sp, sp, #(16 + FRAMESAMPLES * 4)
-  pop {r3-r11,pc}
-
-.align  8
-@ Cosine table 1 in Q14
-WebRtcIsacfix_kCosTab1:
-_WebRtcIsacfix_kCosTab1:  @ Label for iOS
-  .short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
-  .short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
-  .short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
-  .short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
-  .short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
-  .short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
-  .short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
-  .short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
-  .short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
-  .short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
-  .short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
-  .short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
-  .short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
-  .short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
-  .short 1713, 1499, 1285, 1072,  857,  643,  429,  214
-  .short 0, -214, -429, -643, -857, -1072, -1285, -1499
-  .short -1713, -1926, -2139, -2351, -2563, -2775, -2986, -3196
-  .short -3406, -3616, -3825, -4033, -4240, -4447, -4653, -4859
-  .short -5063, -5266, -5469, -5671, -5872, -6071, -6270, -6467
-  .short -6664, -6859, -7053, -7246, -7438, -7629, -7818, -8006
-  .short -8192, -8377, -8561, -8743, -8923, -9102, -9280, -9456
-  .short -9630, -9803, -9974, -10143, -10311, -10477, -10641, -10803
-  .short -10963, -11121, -11278, -11433, -11585, -11736, -11885, -12031
-  .short -12176, -12318, -12458, -12597, -12733, -12867, -12998, -13128
-  .short -13255, -13380, -13502, -13623, -13741, -13856, -13970, -14081
-  .short -14189, -14295, -14399, -14500, -14598, -14694, -14788, -14879
-  .short -14968, -15053, -15137, -15218, -15296, -15371, -15444, -15515
-  .short -15582, -15647, -15709, -15769, -15826, -15880, -15931, -15980
-  .short -16026, -16069, -16110, -16147, -16182, -16214, -16244, -16270
-  .short -16294, -16315, -16333, -16349, -16362, -16371, -16378, -16383
-
-.align  8
-@ Sine table 2 in Q14
-WebRtcIsacfix_kSinTab2:
-_WebRtcIsacfix_kSinTab2:  @ Label for iOS
-  .short 16384, -16381, 16375, -16367, 16356, -16342, 16325, -16305
-  .short 16283, -16257, 16229, -16199, 16165, -16129, 16090, -16048
-  .short 16003, -15956, 15906, -15853, 15798, -15739, 15679, -15615
-  .short 15549, -15480, 15408, -15334, 15257, -15178, 15095, -15011
-  .short 14924, -14834, 14741, -14647, 14549, -14449, 14347, -14242
-  .short 14135, -14025, 13913, -13799, 13682, -13563, 13441, -13318
-  .short 13192, -13063, 12933, -12800, 12665, -12528, 12389, -12247
-  .short 12104, -11958, 11810, -11661, 11509, -11356, 11200, -11042
-  .short 10883, -10722, 10559, -10394, 10227, -10059, 9889, -9717
-  .short 9543, -9368, 9191, -9013, 8833, -8652, 8469, -8285
-  .short 8099, -7912, 7723, -7534, 7342, -7150, 6957, -6762
-  .short 6566, -6369, 6171, -5971, 5771, -5570, 5368, -5165
-  .short 4961, -4756, 4550, -4344, 4137, -3929, 3720, -3511
-  .short 3301, -3091, 2880, -2669, 2457, -2245, 2032, -1819
-  .short 1606, -1392, 1179, -965, 750, -536, 322, -107
-
-@ Table kCosTab2 was removed since its data is redundant with kSinTab2.
-
-.align  8
-@ Sine table 1 in Q14
-WebRtcIsacfix_kSinTab1:
-_WebRtcIsacfix_kSinTab1:  @ Label for iOS
-  .short 0, 214, 429, 643, 857, 1072, 1285, 1499
-  .short 1713, 1926, 2139, 2351, 2563, 2775, 2986, 3196
-  .short 3406, 3616, 3825, 4033, 4240, 4447, 4653, 4859
-  .short 5063, 5266, 5469, 5671, 5872, 6071, 6270, 6467
-  .short 6664, 6859, 7053, 7246, 7438, 7629, 7818, 8006
-  .short 8192, 8377, 8561, 8743, 8923, 9102, 9280, 9456
-  .short 9630, 9803, 9974, 10143, 10311, 10477, 10641, 10803
-  .short 10963, 11121, 11278, 11433, 11585, 11736, 11885, 12031
-  .short 12176, 12318, 12458, 12597, 12733, 12867, 12998, 13128
-  .short 13255, 13380, 13502, 13623, 13741, 13856, 13970, 14081
-  .short 14189, 14295, 14399, 14500, 14598, 14694, 14788, 14879
-  .short 14968, 15053, 15137, 15218, 15296, 15371, 15444, 15515
-  .short 15582, 15647, 15709, 15769, 15826, 15880, 15931, 15980
-  .short 16026, 16069, 16110, 16147, 16182, 16214, 16244, 16270
-  .short 16294, 16315, 16333, 16349, 16362, 16371, 16378, 16383
-  .short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
-  .short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
-  .short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
-  .short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
-  .short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
-  .short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
-  .short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
-  .short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
-  .short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
-  .short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
-  .short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
-  .short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
-  .short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
-  .short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
-  .short 1713, 1499, 1285, 1072, 857, 643, 429, 214
-
-@ void WebRtcIsacfix_Spec2TimeNeon(int16_t *inreQ7,
-@                                  int16_t *inimQ7,
-@                                  int32_t *outre1Q16,
-@                                  int32_t *outre2Q16);
-
-DEFINE_FUNCTION WebRtcIsacfix_Spec2TimeNeon
-.align  2
-  push {r3-r11,lr}            @ need to push r4-r11, but push r3 too to keep
-                              @ stack 8-byte aligned
-
-  sub sp, sp, #16             @ Room for the four saved arguments.
-  str r0, [sp]                @ inreQ7
-  str r1, [sp, #4]            @ inimQ7
-  str r2, [sp, #8]            @ outre1Q16
-  str r3, [sp, #12]           @ outre2Q16
-
-  mov r8, #(FRAMESAMPLES - 16)
-  add r12, r0, r8             @ &inreQ7[FRAMESAMPLES / 2 - 8]
-  add r11, r1, r8             @ &inimQ7[FRAMESAMPLES / 2 - 8]
-  add r4, r2, r8, lsl #1      @ &outre1Q16[FRAMESAMPLES / 2 - 8]
-  add r6, r3, r8, lsl #1      @ &outre2Q16[FRAMESAMPLES / 2 - 8]
-
-  mov r8, #(FRAMESAMPLES / 2) @ loop counter
-  adr r10, WebRtcIsacfix_kSinTab2
-  add r9, r10, #(120*2 - 16)  @ &WebRtcIsacfix_kSinTab2[120 - 8]
-
-  vpush {q4-q7}
-
-  mov r5, #-32                @ Post-index step for the descending stores.
-  mov r7, #-16                @ Post-index step for the descending loads.
-  vmov.u32 q6, #0             @ Initialize the running max of |outre1Q16[]|.
-  vmov.u32 q7, #0             @ Initialize the running max of |outre2Q16[]|.
-
-TransformAndFindMax:
-@ Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
-@ Bit-exact.
-
-  subs r8, #16
-
-  vld1.16 {q0}, [r9, :64]     @ kCosTab2[]
-  sub r9, #16
-  vld1.16 {q2}, [r0]!         @ inreQ7[]
-  vneg.s16 q0, q0
-  vld1.16 {q3}, [r1]!         @ inimQ7[]
-  vrev64.16 d0, d0
-  vrev64.16 d1, d1
-  vld1.16 {q1}, [r10, :64]!   @ WebRtcIsacfix_kSinTab2[]
-  vswp d0, d1
-
-  vmull.s16 q8, d2, d6        @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
-  vmull.s16 q9, d3, d7        @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
-  vmlal.s16 q8, d0, d4        @ kCosTab2[k] * inreQ7[k]
-  vmlal.s16 q9, d1, d5        @ kCosTab2[k] * inreQ7[k]
-  vmull.s16 q12, d0, d6       @ kCosTab2[k] * inimQ7[k]
-  vmull.s16 q13, d1, d7       @ kCosTab2[k] * inimQ7[k]
-  vmlsl.s16 q12, d2, d4       @ WebRtcIsacfix_kSinTab2[k] * inreQ7[k]
-  vmlsl.s16 q13, d3, d5       @ WebRtcIsacfix_kSinTab2[k] * inreQ7[k]
-
-  vld1.16 {q2}, [r11], r7     @ inimQ7[FRAMESAMPLES / 2 - 8 - i]
-  vld1.16 {q3}, [r12], r7     @ inreQ7[FRAMESAMPLES / 2 - 8 - i]
-
-  vrev64.16 q2, q2            @ Reverse the order of the samples
-  vrev64.16 q3, q3            @ Reverse the order of the samples
-
-  vmull.s16 q14, d2, d5       @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
-  vmull.s16 q15, d3, d4       @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
-  vmlsl.s16 q14, d0, d7       @ q14 -= kCosTab2[k] * inreQ7[k]
-  vmlsl.s16 q15, d1, d6       @ q15 -= kCosTab2[k] * inreQ7[k]
-
-  vmull.s16 q10, d0, d5       @ kCosTab2[k] * inimQ7[]
-  vmull.s16 q11, d1, d4       @ kCosTab2[k] * inimQ7[]
-  vmlal.s16 q10, d2, d7       @ q10 += WebRtcIsacfix_kSinTab2[k] * inreQ7[]
-  vmlal.s16 q11, d3, d6       @ q11 += WebRtcIsacfix_kSinTab2[k] * inreQ7[]
-
-  vshr.s32 q8, q8, #5         @ xrQ16
-  vshr.s32 q9, q9, #5         @ xrQ16
-  vshr.s32 q12, q12, #5       @ xiQ16
-  vshr.s32 q13, q13, #5       @ xiQ16
-  vshr.s32 q14, q14, #5       @ yiQ16
-  vshr.s32 q15, q15, #5       @ yiQ16
-
-  vneg.s32 q10, q10
-  vneg.s32 q11, q11
-
-  @ xrQ16 - yiQ16
-  vsub.s32 q0, q8, q14
-  vsub.s32 q1, q9, q15
-
-  vshr.s32 q10, q10, #5       @ yrQ16
-  vshr.s32 q11, q11, #5       @ yrQ16
-
-  @ xrQ16 + yiQ16
-  vadd.s32 q3, q8, q14
-  vadd.s32 q2, q9, q15
-
-  @ yrQ16 + xiQ16
-  vadd.s32 q4, q10, q12
-  vadd.s32 q5, q11, q13
-
-  @ yrQ16 - xiQ16
-  vsub.s32 q8, q11, q13
-  vsub.s32 q9, q10, q12
-
-  @ Reverse the order of the samples
-  vrev64.32 q2, q2
-  vrev64.32 q3, q3
-  vrev64.32 q8, q8
-  vrev64.32 q9, q9
-  vswp d4, d5
-  vswp d6, d7
-
-  vst1.32  {q0, q1}, [r2]!    @ outre1Q16[k]
-  vswp d16, d17
-  vswp d18, d19
-  vst1.32  {q2, q3}, [r4], r5 @ outre1Q16[FRAMESAMPLES / 2 - 1 - k]
-
-  @ Find the absolute maximum in the vectors and store them in q6 and q7.
-  vabs.s32 q10, q0
-  vabs.s32 q14, q4
-  vabs.s32 q11, q1
-  vabs.s32 q15, q5
-  vabs.s32 q12, q2
-  vmax.u32 q6, q10            @ Use u32 so we don't lose the value 0x80000000.
-  vmax.u32 q7, q14            @ Maximum for outre2Q16[].
-  vabs.s32 q0, q8
-  vmax.u32 q6, q11            @ Maximum for outre1Q16[].
-  vmax.u32 q7, q15
-  vabs.s32 q13, q3
-  vmax.u32 q6, q12
-  vmax.u32 q7, q0
-  vabs.s32 q1, q9
-  vst1.32  {q4, q5}, [r3]!    @ outre2Q16[k]
-  vst1.32  {q8, q9}, [r6], r5 @ outre2Q16[FRAMESAMPLES / 2 - 1 - k]
-  vmax.u32 q6, q13
-  vmax.u32 q7, q1
-
-  bgt TransformAndFindMax
-
-  adr r10, WebRtcIsacfix_kSinTab1
-#if defined(__APPLE__)
-  mov r2, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
-#else
-  mov r2, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
-#endif
-
-  sub r11, r10, r2            @ WebRtcIsacfix_kCosTab1
-
-  @ Find the maximum value in the Neon registers
-  vmax.u32 d12, d13
-  vmax.u32 d14, d15
-  vpmax.u32 d12, d12, d12     @ Both 32-bit words hold the max of |outre1Q16[]|.
-  vpmax.u32 d14, d14, d14     @ Both 32-bit words hold the max of |outre2Q16[]|.
-  vmax.s32 d0, d12, d14       @ if (tmpInIm>tmpInRe) tmpInRe = tmpInIm;
-
-  vpop {q4-q7}
-
-  ldr r4, [sp]                @ inreQ7
-  vcls.s32  d1, d0            @ sh = WebRtcSpl_NormW32(tmpInRe);
-  ldr r5, [sp, #4]            @ inimQ7
-  vmov.i32  d0, #24           @ sh = sh-24;
-  ldr r6, [sp, #8]            @ outre1Q16
-  vsub.s32  d1, d1, d0
-  ldr r7, [sp, #12]           @ outre2Q16
-  vdup.s32  q8, d1[0]         @ sh
-
-  mov r8, #(FRAMESAMPLES / 2)
-
-PreFftShift:
-  subs r8, #16
-  vld1.32 {q0, q1}, [r6]!     @ outre1Q16[]
-  vld1.32 {q2, q3}, [r6]!     @ outre1Q16[]
-  vrshl.s32 q0, q0, q8
-  vrshl.s32 q1, q1, q8
-  vrshl.s32 q2, q2, q8
-  vrshl.s32 q3, q3, q8
-  vld1.32 {q10, q11}, [r7]!   @ outre2Q16[]
-  vld1.32 {q12, q13}, [r7]!   @ outre2Q16[]
-  vrshl.s32 q10, q10, q8
-  vrshl.s32 q11, q11, q8
-  vrshl.s32 q12, q12, q8
-  vrshl.s32 q13, q13, q8
-
-  vmovn.s32 d0, q0
-  vmovn.s32 d1, q1
-  vmovn.s32 d2, q2
-  vmovn.s32 d3, q3
-  vmovn.s32 d4, q10
-  vmovn.s32 d5, q11
-  vmovn.s32 d6, q12
-  vmovn.s32 d7, q13
-
-  vst1.16 {q0, q1}, [r4]!     @ inreQ7[]
-  vst1.16 {q2, q3}, [r5]!     @ inimQ7[]
-
-  bgt PreFftShift
-
-  vmov.s32 r8, d16[0]         @ Store value of sh.
-  ldr r0, [sp]                @ inreQ7
-  ldr r1, [sp, #4]            @ inimQ7
-  mov r2, #1                  @ Third argument of the FFT call below.
-  CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest
-
-  vdup.s32 q8, r8             @ sh
-  mov r9, r11                 @ WebRtcIsacfix_kCosTab1
-  ldr r4, [sp]                @ inreQ7
-  ldr r5, [sp, #4]            @ inimQ7
-  ldr r6, [sp, #8]            @ outre1Q16
-  ldr r7, [sp, #12]           @ outre2Q16
-  mov r8, #(FRAMESAMPLES / 2)
-  vneg.s32 q15, q8            @ -sh
-  movw r0, #273
-  lsl r0, #15                 @ Together with vqdmulh, net effect is ">> 16".
-  vdup.s32 q14, r0
-
-PostFftShiftDivide:
-  subs r8, #16
-
-  vld1.16 {q0, q1}, [r4]!     @ inreQ7
-  vmovl.s16 q10, d0
-  vmovl.s16 q11, d1
-  vld1.16 {q2, q3}, [r5]!     @ inimQ7
-  vmovl.s16 q8, d2
-  vmovl.s16 q9, d3
-
-  vshl.s32 q10, q10, q15
-  vshl.s32 q11, q11, q15
-  vshl.s32 q8, q8, q15
-  vshl.s32 q9, q9, q15
-
-  vqdmulh.s32 q10, q10, q14
-  vqdmulh.s32 q11, q11, q14
-  vqdmulh.s32 q8, q8, q14
-  vqdmulh.s32 q9, q9, q14
-
-  vmovl.s16 q0, d4
-  vmovl.s16 q1, d5
-  vmovl.s16 q2, d6
-  vmovl.s16 q3, d7
-
-  vshl.s32 q0, q0, q15
-  vshl.s32 q1, q1, q15
-  vshl.s32 q2, q2, q15
-  vshl.s32 q3, q3, q15
-
-  @ WEBRTC_SPL_MUL_16_32_RSFT16(273, outre2Q16[k])
-  vqdmulh.s32 q0, q0, q14
-  vqdmulh.s32 q1, q1, q14
-  vst1.32 {q10, q11}, [r6]!   @ outre1Q16[]
-  vqdmulh.s32 q2, q2, q14
-  vqdmulh.s32 q3, q3, q14
-  vst1.32 {q8, q9}, [r6]!     @ outre1Q16[]
-  vst1.32 {q0, q1}, [r7]!     @ outre2Q16[]
-  vst1.32 {q2, q3}, [r7]!     @ outre2Q16[]
-
-  bgt PostFftShiftDivide
-
-  mov r8, #(FRAMESAMPLES / 2)
-  ldr r2, [sp, #8]            @ outre1Q16
-  ldr r3, [sp, #12]           @ outre2Q16
-  movw r0, #31727
-  lsl r0, #16                 @ With vqdmulh and vrshrn, net effect is ">> 25".
-
-DemodulateAndSeparate:
-  subs r8, #8
-
-  vld1.16 {q0}, [r9, :64]!    @ WebRtcIsacfix_kCosTab1[]
-  vmovl.s16 q10, d0           @ WebRtcIsacfix_kCosTab1[]
-  vld1.16 {q1}, [r10, :64]!   @ WebRtcIsacfix_kSinTab1[]
-  vmovl.s16 q11, d1           @ WebRtcIsacfix_kCosTab1[]
-  vld1.32 {q2, q3}, [r2]      @ outre1Q16
-  vmovl.s16 q12, d2           @ WebRtcIsacfix_kSinTab1[]
-  vld1.32 {q14, q15}, [r3]    @ outre2Q16
-  vmovl.s16 q13, d3           @ WebRtcIsacfix_kSinTab1[]
-
-  vmull.s32 q0, d20, d4       @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
-  vmull.s32 q1, d21, d5       @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
-  vmull.s32 q8, d22, d6       @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
-  vmull.s32 q9, d23, d7       @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
-
-  vmlsl.s32 q0, d24, d28      @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
-  vmlsl.s32 q1, d25, d29      @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
-  vmlsl.s32 q8, d26, d30      @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
-  vmlsl.s32 q9, d27, d31      @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
-
-  vrshrn.s64 d0, q0, #10      @ xrQ16
-  vrshrn.s64 d1, q1, #10      @ xrQ16
-  vrshrn.s64 d2, q8, #10      @ xrQ16
-  vrshrn.s64 d3, q9, #10      @ xrQ16
-
-  vmull.s32 q8, d20, d28      @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
-  vmull.s32 q9, d21, d29      @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
-  vmull.s32 q14, d22, d30     @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
-  vmull.s32 q15, d23, d31     @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
-
-  vmlal.s32 q8, d24, d4       @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
-  vmlal.s32 q9, d25, d5       @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
-  vmlal.s32 q14, d26, d6      @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
-  vmlal.s32 q15, d27, d7      @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
-
-  vdup.s32 q11, r0            @ generic -> Neon doesn't cost extra cycles.
-
-  vrshrn.s64 d24, q8, #10     @ xiQ16
-  vrshrn.s64 d25, q9, #10     @ xiQ16
-  vqdmulh.s32 q0, q0, q11
-  vrshrn.s64 d26, q14, #10    @ xiQ16
-  vrshrn.s64 d27, q15, #10    @ xiQ16
-
-  @ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xrQ16)
-  @ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xiQ16)
-
-  vqdmulh.s32 q1, q1, q11
-  vqdmulh.s32 q2, q12, q11
-  vqdmulh.s32 q3, q13, q11
-
-  vst1.16 {q0, q1}, [r2]!     @ outre1Q16[]
-  vst1.16 {q2, q3}, [r3]!     @ outre2Q16[]
-
-  bgt DemodulateAndSeparate
-
-  add sp, sp, #16
-  pop {r3-r11,pc}
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c
@ -16,7 +16,6 @@
 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
 #include "webrtc/typedefs.h"

-#if !(defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
 /* Cosine table 1 in Q14. */
 const int16_t WebRtcIsacfix_kCosTab1[FRAMESAMPLES/2] = {
  16384,  16383,  16378,  16371,  16362,  16349,  16333,  16315,  16294,  16270,
@ -90,7 +89,6 @@ const int16_t WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4] = {
  4137,  -3929,  3720,  -3511,  3301,  -3091,  2880,  -2669,  2457,  -2245,
  2032,  -1819,  1606,  -1392,  1179,   -965,   750,   -536,   322,   -107
 };
-#endif

 #if defined(MIPS32_LE)
 /* Cosine table 2 in Q14. Used only on MIPS platforms. */
--- a/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi
+++ b/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi
@ -142,11 +142,9 @@
          ],
          'sources': [
            'fix/source/entropy_coding_neon.c',
-            'fix/source/filterbanks_neon.S',
-            'fix/source/filters_neon.S',
-            'fix/source/lattice_neon.S',
-            'fix/source/lpc_masking_model_neon.S',
-            'fix/source/transform_neon.S',
+            'fix/source/filters_neon.c',
+            'fix/source/lattice_neon.c',
+            'fix/source/transform_neon.c',
          ],
          'conditions': [
            # Disable LTO in isac_neon target due to compiler bug
@ -156,27 +154,11 @@
                '-ffat-lto-objects',
              ],
            }],
-            ['target_arch=="arm64"', {
-              'sources!': [
-                'fix/source/filterbanks_neon.S',
-                'fix/source/filters_neon.S',
-                'fix/source/lattice_neon.S',
-                'fix/source/lpc_masking_model_neon.S',
-                'fix/source/transform_neon.S',
-              ],
-              'sources': [
-                'fix/source/filters_neon.c',
-                'fix/source/lattice_neon.c',
-                'fix/source/transform_neon.c',
-              ],
-              'conditions': [
-                # Disable AllpassFilter2FixDec16Neon function due to a clang
-                # bug. Refer more details at:
-                # https://code.google.com/p/webrtc/issues/detail?id=4567
-                ['clang==0', {
+            # Disable AllpassFilter2FixDec16Neon function due to a clang
+            # bug. Refer more details at:
+            # https://code.google.com/p/webrtc/issues/detail?id=4567
+            ['target_arch!="arm64" or clang==0', {
                  'sources': ['fix/source/filterbanks_neon.c',],
-                }],
-              ],
            }]
          ],
        },