diff --git a/webrtc/modules/audio_coding/BUILD.gn b/webrtc/modules/audio_coding/BUILD.gn index 9a2bf79d3..df565d06f 100644 --- a/webrtc/modules/audio_coding/BUILD.gn +++ b/webrtc/modules/audio_coding/BUILD.gn @@ -591,17 +591,14 @@ source_set("isacfix") { if (rtc_build_armv7_neon || current_cpu == "arm64") { source_set("isac_neon") { - sources = [ "codecs/isac/fix/source/entropy_coding_neon.c" ] + sources = [ + "codecs/isac/fix/source/entropy_coding_neon.c", + "codecs/isac/fix/source/filters_neon.c", + "codecs/isac/fix/source/lattice_neon.c", + "codecs/isac/fix/source/transform_neon.c", + ] if (rtc_build_armv7_neon) { - sources += [ - "codecs/isac/fix/source/filterbanks_neon.S", - "codecs/isac/fix/source/filters_neon.S", - "codecs/isac/fix/source/lattice_neon.S", - "codecs/isac/fix/source/lpc_masking_model_neon.S", - "codecs/isac/fix/source/transform_neon.S", - ] - # Enable compilation for the ARM v7 Neon instruction set. This is needed # since //build/config/arm.gni only enables Neon for iOS, not Android. # This provides the same functionality as webrtc/build/arm_neon.gypi. @@ -614,18 +611,11 @@ if (rtc_build_armv7_neon || current_cpu == "arm64") { ] } - if (current_cpu == "arm64") { - sources += [ - "codecs/isac/fix/source/filters_neon.c", - "codecs/isac/fix/source/lattice_neon.c", - "codecs/isac/fix/source/transform_neon.c", - ] + if (current_cpu != "arm64" || !is_clang) { # Disable AllpassFilter2FixDec16Neon function due to a clang bug. # Refer more details at: # https://code.google.com/p/webrtc/issues/detail?id=4567 - if (!is_clang) { sources += [ "codecs/isac/fix/source/filterbanks_neon.c", ] - } } # Disable LTO in audio_processing_neon target due to compiler bug. diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S deleted file mode 100644 index 0a43551ad..000000000 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S +++ /dev/null @@ -1,270 +0,0 @@ -@ -@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ - -@ Contains a function for WebRtcIsacfix_AllpassFilter2FixDec16Neon() -@ in iSAC codec, optimized for ARM Neon platform. Bit exact with function -@ WebRtcIsacfix_AllpassFilter2FixDec16Neon() in filterbanks.c. Prototype -@ C code is at end of this file. - -#include "webrtc/system_wrappers/interface/asm_defines.h" - -GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon -.align 2 - -@void WebRtcIsacfix_AllpassFilter2FixDec16Neon( -@ int16_t *data_ch1, // Input and output in channel 1, in Q0 -@ int16_t *data_ch2, // Input and output in channel 2, in Q0 -@ const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15 -@ const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15 -@ const int length, // Length of the data buffers -@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16 -@ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16 - -DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon - push {r4 - r7} - - ldr r5, [sp, #24] @ filter_state_ch2 - ldr r6, [sp, #20] @ filter_state_ch1 - - @ Initialize the Neon registers. - vld1.16 d0[0], [r0]! @ data_ch1[0] - vld1.16 d0[2], [r1]! @ data_ch2[0] - vld1.32 d30[0], [r2] @ factor_ch1[0], factor_ch1[1] - vld1.32 d30[1], [r3] @ factor_ch2[0], factor_ch2[1] - vld1.32 d16[0], [r6]! @ filter_state_ch1[0] - vld1.32 d17[0], [r5]! @ filter_state_ch2[0] - vneg.s16 d31, d30 - - ldr r3, [sp, #16] @ length - mov r4, #4 @ Post offset value for the loop - mov r2, #-2 @ Post offset value for the loop - sub r3, #2 @ Loop counter - - @ Loop unrolling pre-processing. - vqdmull.s16 q1, d30, d0 - vshll.s16 q0, d0, #16 - vqadd.s32 q2, q1, q8 - vshrn.i32 d6, q2, #16 - vmull.s16 q1, d31, d6 - vshl.s32 q1, #1 - vqadd.s32 q8, q1, q0 - vld1.32 d16[1], [r6] @ filter_state_ch1[1] - vld1.32 d17[1], [r5] @ filter_state_ch2[1] - sub r6, #4 @ &filter_state_ch1[0] - sub r5, #4 @ &filter_state_ch2[0] - vld1.16 d6[1], [r0], r2 @ data_ch1[1] - vld1.16 d6[3], [r1], r2 @ data_ch2[1] - vrev32.16 d0, d6 - -FOR_LOOP: - vqdmull.s16 q1, d30, d0 - vshll.s16 q0, d0, #16 - vqadd.s32 q2, q1, q8 - vshrn.i32 d4, q2, #16 - vmull.s16 q1, d31, d4 - vst1.16 d4[1], [r0], r4 @ Store data_ch1[n] - vst1.16 d4[3], [r1], r4 @ Store data_ch2[n] - vshl.s32 q1, #1 - vld1.16 d4[1], [r0], r2 @ Load data_ch1[n + 2] - vld1.16 d4[3], [r1], r2 @ Load data_ch2[n + 2] - vqadd.s32 q8, q1, q0 - vrev32.16 d0, d4 - vqdmull.s16 q1, d30, d0 - subs r3, #2 - vqadd.s32 q2, q1, q8 - vshrn.i32 d6, q2, #16 - vmull.s16 q1, d31, d6 - vshll.s16 q0, d0, #16 - vst1.16 d6[1], [r0], r4 @ Store data_ch1[n + 1] - vst1.16 d6[3], [r1], r4 @ Store data_ch2[n + 1] - vshl.s32 q1, #1 - vld1.16 d6[1], [r0], r2 @ Load data_ch1[n + 3] - vld1.16 d6[3], [r1], r2 @ Load data_ch2[n + 3] - vqadd.s32 q8, q1, q0 - vrev32.16 d0, d6 - bgt FOR_LOOP - - @ Loop unrolling post-processing. - vqdmull.s16 q1, d30, d0 - vshll.s16 q0, d0, #16 - vqadd.s32 q2, q1, q8 - vshrn.i32 d4, q2, #16 - vmull.s16 q1, d31, d4 - vst1.16 d4[1], [r0]! @ Store data_ch1[n] - vst1.16 d4[3], [r1]! @ Store data_ch2[n] - vshl.s32 q1, #1 - vqadd.s32 q8, q1, q0 - vrev32.16 d0, d4 - vqdmull.s16 q1, d30, d0 - vshll.s16 q0, d0, #16 - vqadd.s32 q2, q1, q8 - vshrn.i32 d6, q2, #16 - vmull.s16 q1, d31, d6 - vst1.16 d6[1], [r0] @ Store data_ch1[n + 1] - vst1.16 d6[3], [r1] @ Store data_ch2[n + 1] - vshl.s32 q1, #1 - vst1.32 d16[0], [r6]! @ Store filter_state_ch1[0] - vqadd.s32 q9, q1, q0 - vst1.32 d17[0], [r5]! @ Store filter_state_ch1[1] - vst1.32 d18[1], [r6] @ Store filter_state_ch2[0] - vst1.32 d19[1], [r5] @ Store filter_state_ch2[1] - - pop {r4 - r7} - bx lr - -@void AllpassFilter2FixDec16BothChannels( -@ int16_t *data_ch1, // Input and output in channel 1, in Q0 -@ int16_t *data_ch2, // Input and output in channel 2, in Q0 -@ const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15 -@ const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15 -@ const int length, // Length of the data buffers -@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16 -@ int32_t *filter_state_ch2) { // Filter state for channel 2, in Q16 -@ int n = 0; -@ int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1]; -@ int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1]; -@ int16_t sample0_ch1 = 0, sample0_ch2 = 0; -@ int16_t sample1_ch1 = 0, sample1_ch2 = 0; -@ int32_t a0_ch1 = 0, a0_ch2 = 0; -@ int32_t b0_ch1 = 0, b0_ch2 = 0; -@ -@ int32_t a1_ch1 = 0, a1_ch2 = 0; -@ int32_t b1_ch1 = 0, b1_ch2 = 0; -@ int32_t b2_ch1 = 0, b2_ch2 = 0; -@ -@ // Loop unrolling preprocessing. -@ -@ sample0_ch1 = data_ch1[n]; -@ sample0_ch2 = data_ch2[n]; -@ -@ a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1; -@ a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1; -@ -@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1); -@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16 -@ -@ a0_ch1 = -factor_ch1[0] * (int16_t)(b0_ch1 >> 16); -@ a0_ch2 = -factor_ch2[0] * (int16_t)(b0_ch2 >> 16); -@ -@ state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16); -@ state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16); -@ -@ sample1_ch1 = data_ch1[n + 1]; -@ sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0 -@ sample1_ch2 = data_ch2[n + 1]; -@ sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0 -@ -@ -@ for (n = 0; n < length - 2; n += 2) { -@ a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1; -@ a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1; -@ a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1; -@ a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1; -@ -@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1); -@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); //Q16+Q16=Q16 -@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); //Q16+Q16=Q16 -@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); //Q16+Q16=Q16 -@ -@ a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16); -@ a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16); -@ a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16); -@ a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16); -@ -@ state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16); -@ state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16); -@ state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16); -@ state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16); -@ -@ sample0_ch1 = data_ch1[n + 2]; -@ sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0 -@ sample0_ch2 = data_ch2[n + 2]; -@ sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0 -@ -@ a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1; -@ a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1; -@ a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1; -@ a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1; -@ -@ b2_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1); -@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16 -@ b2_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16 -@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16 -@ -@ a0_ch1 = -factor_ch1[0] * (int16_t)(b2_ch1 >> 16); -@ a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16); -@ a0_ch2 = -factor_ch2[0] * (int16_t)(b2_ch2 >> 16); -@ a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16); -@ -@ state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1<<16); -@ state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16); -@ state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2<<16); -@ state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16); -@ -@ -@ sample1_ch1 = data_ch1[n + 3]; -@ sample0_ch1 = (int16_t) (b2_ch1 >> 16); //Save as Q0 -@ sample1_ch2 = data_ch2[n + 3]; -@ sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0 -@ -@ data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0 -@ data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0 -@ data_ch2[n] = (int16_t) (b0_ch2 >> 16); -@ data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16); -@ } -@ -@ // Loop unrolling post-processing. -@ -@ a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1; -@ a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1; -@ a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1; -@ a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1; -@ -@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1); -@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); -@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); -@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); -@ -@ a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16); -@ a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16); -@ a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16); -@ a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16); -@ -@ state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16); -@ state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16); -@ state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16); -@ state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16); -@ -@ data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0 -@ data_ch2[n] = (int16_t) (b0_ch2 >> 16); -@ -@ sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0 -@ sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0 -@ -@ a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1; -@ a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1; -@ -@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16 -@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16 -@ -@ a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16); -@ a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16); -@ -@ state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16); -@ state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16); -@ -@ data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0 -@ data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16); -@ -@ filter_state_ch1[0] = state0_ch1; -@ filter_state_ch1[1] = state1_ch1; -@ filter_state_ch2[0] = state0_ch2; -@ filter_state_ch2[1] = state1_ch2; -@} diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S deleted file mode 100644 index 3c5ac646c..000000000 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S +++ /dev/null @@ -1,145 +0,0 @@ -@ -@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ -@ Reference code in filters.c. Output is bit-exact. - -#include "webrtc/system_wrappers/interface/asm_defines.h" - -GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon -.align 2 - -@ int WebRtcIsacfix_AutocorrNeon( -@ int32_t* __restrict r, -@ const int16_t* __restrict x, -@ int16_t N, -@ int16_t order, -@ int16_t* __restrict scale); - -DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon - push {r3 - r12} - - @ Constant initializations - mov r4, #33 - vmov.i32 d0, #0 - vmov.i32 q8, #0 - vmov.i32 d29, #0 @ Initialize (-scale). - vmov.u8 d30, #255 @ Initialize d30 as -1. - vmov.i32 d0[0], r4 @ d0: 00000033 (low), 00000000 (high) - vmov.i32 d25, #32 - - mov r5, r1 @ x - mov r6, r2 @ N - -@ Generate the first coefficient r0. -LOOP_R0: - vld1.16 {d18}, [r5]! @ x[] - subs r6, r6, #4 - vmull.s16 q9, d18, d18 - vpadal.s32 q8, q9 - bgt LOOP_R0 - - vadd.i64 d16, d16, d17 - - @ Calculate scaling (the value of shifting). - vmov d17, d16 - - @ Check overflow and determine the value for 'scale'. - @ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and - @ lower 32-bit words. Note that we don't care about the value of the upper - @ word in d17. - - @ Check the case of 1 bit overflow. If it occurs store the results for - @ scale and r[0] in d17 and d29. - - vshr.u64 d3, d16, #1 - vclt.s32 d1, d16, #0 @ < 0 ? - vbit d17, d3, d1 @ For r[0] - vbit d29, d30, d1 @ -scale = -1 - - @ For the case of more than 1 bit overflow. If it occurs overwrite the - @ results for scale and r[0] in d17 and d29. - vclz.s32 d5, d16 @ Leading zeros of the two 32 bit words. - vshr.s64 d26, d5, #32 @ Keep only the upper 32 bits. - vsub.i64 d31, d26, d0 @ zeros - 33 - vshl.i64 d27, d26, #32 - vorr d27, d26 @ Duplicate the high word with its low one. - vshl.u64 d2, d16, d31 @ Shift by (-scale). - vclt.s32 d1, d27, d25 @ < 32 ? - vbit d17, d2, d1 @ For r[0] - vbit d29, d31, d1 @ -scale - - vst1.32 d17[0], [r0]! @ r[0] - mov r5, #1 @ outer loop counter - -@ Generate rest of the coefficients -LOOP_R: - vmov.i32 q8, #0 @ Initialize the accumulation result. - vmov.i32 q9, #0 @ Initialize the accumulation result. - mov r7, r1 @ &x[0] - add r6, r7, r5, lsl #1 @ x[i] - sub r12, r2, r5 @ N - i - lsr r8, r12, #3 @ inner loop counter - sub r12, r8, lsl #3 @ Leftover samples to be processed - -LOOP_8X_SAMPLES: @ Multiple of 8 samples - vld1.16 {d20, d21}, [r7]! @ x[0, ...] - vld1.16 {d22, d23}, [r6]! @ x[i, ...] - vmull.s16 q12, d20, d22 - vmull.s16 q13, d21, d23 - subs r8, #1 - vpadal.s32 q8, q12 - vpadal.s32 q9, q13 - bgt LOOP_8X_SAMPLES - - cmp r12, #4 - blt REST_SAMPLES - -Four_SAMPLES: - vld1.16 d20, [r7]! - vld1.16 d22, [r6]! - vmull.s16 q12, d20, d22 - vpadal.s32 q8, q12 - sub r12, #4 - -REST_SAMPLES: - mov r8, #0 @ Initialize lower word of the accumulation. - mov r4, #0 @ Initialize upper word of the accumulation. - cmp r12, #0 - ble SUMUP - -LOOP_REST_SAMPLES: - ldrh r9, [r7], #2 @ x[0, ...] - ldrh r10, [r6], #2 @ x[i, ...] - smulbb r11, r9, r10 - adds r8, r8, r11 @ lower word of the accumulation. - adc r4, r4, r11, asr #31 @ upper word of the accumulation. - subs r12, #1 - bgt LOOP_REST_SAMPLES - -@ Added the multiplication results together and do a shift. -SUMUP: - vadd.i64 d16, d17 - vadd.i64 d18, d19 - vadd.i64 d18, d16 - vmov d17, r8, r4 - vadd.i64 d18, d17 - vshl.s64 d18, d29 @ Shift left by (-scale). - vst1.32 d18[0], [r0]! @ r[i] - - add r5, #1 - cmp r5, r3 - ble LOOP_R - - vneg.s32 d29, d29 @ Get value for 'scale'. - ldr r2, [sp, #40] @ &scale - add r0, r3, #1 @ return (order + 1) - vst1.s16 d29[0], [r2] @ Store 'scale' - - pop {r3 - r12} - bx lr diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c index c21116237..2fba3e68d 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c @@ -205,10 +205,6 @@ static void WebRtcIsacfix_InitNeon(void) { WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon; WebRtcIsacfix_Spec2Time = WebRtcIsacfix_Spec2TimeNeon; WebRtcIsacfix_Time2Spec = WebRtcIsacfix_Time2SpecNeon; -#if !(defined WEBRTC_ARCH_ARM64_NEON) - WebRtcIsacfix_CalculateResidualEnergy = - WebRtcIsacfix_CalculateResidualEnergyNeon; -#endif // Disable AllpassFilter2FixDec16Neon function due to a clang bug. // Refer more details at: // https://code.google.com/p/webrtc/issues/detail?id=4567 diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S deleted file mode 100644 index f31a32d9d..000000000 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S +++ /dev/null @@ -1,146 +0,0 @@ -@ -@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ - -@ lattice_neon.s -@ -@ Contains a function for the core loop in the normalized lattice MA -@ filter routine for iSAC codec, optimized for ARM Neon platform. -@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, -@ int16_t input1, -@ int32_t input2, -@ int32_t* ptr0, -@ int32_t* ptr1, -@ int32_t* __restrict ptr2); -@ It calculates -@ *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); -@ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); -@ in Q15 domain. -@ -@ Reference code in lattice.c. -@ Output is not bit-exact with the reference C code, due to the replacement -@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon -@ instructions, smulwb, and smull. Speech quality was not degraded by -@ testing speech and tone vectors. - -#include "webrtc/system_wrappers/interface/asm_defines.h" -#include "settings.h" - -GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon -.align 2 -DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon - push {r4-r8} - - vdup.32 d28, r0 @ Initialize Neon register with input0 - vdup.32 d29, r1 @ Initialize Neon register with input1 - vdup.32 d30, r2 @ Initialize Neon register with input2 - ldr r4, [sp, #20] @ ptr1 - ldr r12, [sp, #24] @ ptr2 - - @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2 - @ Leftover samples after the loop, in r6: - @ r6 = (HALF_SUBFRAMELEN - 1) - (HALF_SUBFRAMELEN - 1) >> 2 << 2 - mov r6, #HALF_SUBFRAMELEN - sub r6, #1 - lsr r5, r6, #2 - sub r6, r5, lsl #2 - - @ First r5 iterations in a loop. - -LOOP: - vld1.32 {d0, d1}, [r3]! @ *ptr0 - - vmull.s32 q10, d0, d28 @ tmp32a = input0 * (*ptr0) - vmull.s32 q11, d1, d28 @ tmp32a = input0 * (*ptr0) - vmull.s32 q12, d0, d29 @ input1 * (*ptr0) - vmull.s32 q13, d1, d29 @ input1 * (*ptr0) - - vrshrn.i64 d4, q10, #15 - vrshrn.i64 d5, q11, #15 - - vld1.32 {d2, d3}, [r12] @ *ptr2 - vadd.i32 q3, q2, q1 @ tmp32b = *ptr2 + tmp32a - - vrshrn.i64 d0, q12, #15 - - vmull.s32 q10, d6, d30 @ input2 * (*ptr2 + tmp32b) - vmull.s32 q11, d7, d30 @ input2 * (*ptr2 + tmp32b) - - vrshrn.i64 d16, q10, #16 - vrshrn.i64 d17, q11, #16 - - vmull.s32 q10, d16, d28 @ input0 * (*ptr2) - vmull.s32 q11, d17, d28 @ input0 * (*ptr2) - - vrshrn.i64 d1, q13, #15 - vrshrn.i64 d18, q10, #15 - vrshrn.i64 d19, q11, #15 - - vst1.32 {d16, d17}, [r12]! @ *ptr2 - - vadd.i32 q9, q0, q9 - subs r5, #1 - vst1.32 {d18, d19}, [r4]! @ *ptr1 - - bgt LOOP - - @ Check how many samples still need to be processed. - subs r6, #2 - blt LAST_SAMPLE - - @ Process two more samples: - vld1.32 d0, [r3]! @ *ptr0 - - vmull.s32 q11, d0, d28 @ tmp32a = input0 * (*ptr0) - vmull.s32 q13, d0, d29 @ input1 * (*ptr0) - - vld1.32 d18, [r12] @ *ptr2 - vrshrn.i64 d4, q11, #15 - - vadd.i32 d7, d4, d18 @ tmp32b = *ptr2 + tmp32a - vmull.s32 q11, d7, d30 @ input2 * (*ptr2 + tmp32b) - vrshrn.i64 d16, q11, #16 - - vmull.s32 q11, d16, d28 @ input0 * (*ptr2) - vst1.32 d16, [r12]! @ *ptr2 - - vrshrn.i64 d0, q13, #15 - vrshrn.i64 d19, q11, #15 - vadd.i32 d19, d0, d19 - - vst1.32 d19, [r4]! @ *ptr1 - - @ If there's still one more sample, process it here. -LAST_SAMPLE: - cmp r6, #1 - bne END - - @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)); - - ldr r7, [r3] @ *ptr0 - ldr r8, [r12] @ *ptr2 - - smulwb r5, r7, r0 @ tmp32a = *ptr0 * input0 >> 16 - add r8, r8, r5, lsl #1 @ tmp32b = *ptr2 + (tmp32a << 1) - smull r5, r6, r8, r2 @ tmp32b * input2, in 64 bits - lsl r6, #16 - add r6, r5, lsr #16 @ Only take the middle 32 bits - str r6, [r12] @ Output (*ptr2, as 32 bits) - - @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); - - smulwb r5, r7, r1 @ tmp32a = *ptr0 * input1 >> 16 - smulwb r6, r6, r0 @ tmp32b = *ptr2 * input0 >> 16 - lsl r5, r5, #1 - add r5, r6, lsl #1 - str r5, [r4] @ Output (*ptr1) - -END: - pop {r4-r8} - bx lr diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h index 1270c1429..aac927586 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h @@ -53,15 +53,6 @@ int32_t WebRtcIsacfix_CalculateResidualEnergyC(int lpc_order, int32_t* corr_coeffs, int* q_val_residual_energy); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) -int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order, - int32_t q_val_corr, - int q_val_polynomial, - int16_t* a_polynomial, - int32_t* corr_coeffs, - int* q_val_residual_energy); -#endif - #if defined(MIPS_DSP_R2_LE) int32_t WebRtcIsacfix_CalculateResidualEnergyMIPS(int lpc_order, int32_t q_val_corr, diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S deleted file mode 100644 index a5955c27a..000000000 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S +++ /dev/null @@ -1,173 +0,0 @@ -@ -@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ - -@ Contains a function for WebRtcIsacfix_CalculateResidualEnergyNeon() in -@ iSAC codec, optimized for ARM Neon platform. Reference code in -@ lpc_masking_model.c. - -#include "webrtc/system_wrappers/interface/asm_defines.h" - -GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon -.align 2 - -@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order, -@ int32_t q_val_corr, -@ int q_val_polynomial, -@ int16_t* a_polynomial, -@ int32_t* corr_coeffs, -@ int* q_val_residual_energy); -DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon - push {r4-r11} - - sub r13, r13, #16 - str r1, [r13, #8] - str r2, [r13, #12] - - mov r4, #1 - vmov.s64 q11, #0 @ Initialize shift_internal. - vmov.s64 q13, #0 @ Initialize sum64. - vmov.s64 q10, #0 - vmov.u8 d20[0], r4 @ Set q10 to 1. - - cmp r0, #0 - blt POST_LOOP_I - - add r9, r3, r0, asl #1 @ &a_polynomial[lpc_order] - mov r6, #0 @ Loop counter i. - ldr r11, [r13, #48] - sub r10, r0, #1 - mov r7, r3 @ &a_polynomial[0] - str r9, [r13, #4] - -LOOP_I: - ldr r2, [r11], #4 @ corr_coeffs[i] - vmov.s64 q15, #0 @ Initialize the sum64_tmp. - vdup.s32 d25, r2 - - cmp r0, r6 @ Compare lpc_order to i. - movle r2, r6 - ble POST_LOOP_J - - mov r1, r6 @ j = i; - mov r12, r7 @ &a_polynomial[i] - mov r4, r3 @ &a_polynomial[j - i] - -LOOP_J: - ldr r8, [r12], #4 - ldr r5, [r4], #4 - vmov.u32 d0[0], r8 - vmov.u32 d1[0], r5 - vmull.s16 q0, d0, d1 - vmull.s32 q0, d0, d25 - cmp r6, #0 @ i == 0? - vshl.s64 q0, q11 - beq SUM1 - vshl.s64 q0, #1 - -SUM1: - vqadd.s64 q14, q0, q15 @ Sum and test overflow. - add r1, r1, #2 - bvc MOV1 @ Skip the shift if there's no overflow. - vshr.s64 q0, #1 - vshr.s64 q15, #1 - vadd.s64 q14, q0, q15 - vsub.s64 q11, q10 - -MOV1: - cmp r0, r1 @ Compare lpc_order to j. - vmov.s64 q15, q14 - bgt LOOP_J - - bic r1, r10, #1 - add r2, r6, #2 - add r2, r1, r2 - -POST_LOOP_J: - vqadd.s64 q0, q13, q15 @ Sum and test overflow. - bvc MOV2 @ Skip the shift if there's no overflow. - vshr.s64 q13, #1 - vshr.s64 q15, #1 - vadd.s64 q0, q13, q15 - vsub.s64 q11, q10 - -MOV2: - vmov.s64 q13, q0 @ update sum64. - cmp r2, r0 - bne CHECK_LOOP_CONDITION - - @ Last sample in the inner loop. - ldr r4, [r13, #4] - ldrsh r8, [r4] - ldrsh r12, [r9] - mul r8, r8, r12 - vmov.s32 d0[0], r8 - vmull.s32 q0, d0, d25 - cmp r6, #0 @ i == 0? - vshl.s64 q0, q11 - beq SUM2 - vshl.s64 q0, #1 - -SUM2: - vqadd.s64 d1, d0, d26 @ Sum and test overflow. - bvc MOV3 @ Skip the shift if there's no overflow. - vshr.s64 q13, #1 - vshr.s64 d0, #1 - vadd.s64 d1, d0, d26 - vsub.s64 q11, q10 - -MOV3: - vmov.s64 d26, d1 @ update sum64. - -CHECK_LOOP_CONDITION: - add r6, r6, #1 - sub r9, r9, #2 - cmp r0, r6 @ Compare i to lpc_order. - sub r10, r10, #1 - add r7, r7, #2 - bge LOOP_I - -POST_LOOP_I: - mov r3, #0 - vqadd.s64 d0, d26, d27 @ Sum and test overflow. - bvc GET_SHIFT_NORM @ Skip the shift if there's no overflow. - vshr.s64 q13, #1 - vadd.s64 d0, d26, d27 - vsub.s64 q11, q10 - -GET_SHIFT_NORM: - vcls.s32 d1, d0 @ Count leading extra sign bits. - vmov.32 r2, d1[1] @ Store # of sign bits of only the 32 MSBs. - vmovl.s32 q1, d1 - vshl.s64 d0, d3 @ d3 contains # of sign bits of the 32 MSBs. - - vcls.s32 d1, d0 @ Count again the leading extra sign bits. - vmov.s32 r1, d1[1] @ Store # of sign bits of only the 32 MSBs. - vmovl.s32 q1, d1 - vshl.s64 d0, d3 @ d3 contains # of sign bits of the 32 MSBs. - - vmov.s32 r0, d0[1] @ residual_energy - vmov.s32 r3, d22[0] @ shift_internal - - @ Calculate the value for q_val_residual_energy. - ldr r4, [r13, #8] @ q_val_corr - ldr r5, [r13, #12] @ q_val_polynomial - sub r12, r4, #32 - add r12, r12, r5, asl #1 - add r1, r12, r1 @ add 1st part of shift_internal. - add r12, r1, r2 @ add 2nd part of shift_internal. - ldr r2, [r13, #52] - add r3, r12, r3 @ value for q_val_residual_energy. - str r3, [r2, #0] - - add r13, r13, #16 - pop {r4-r11} - bx r14 - - diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc index aaeff2c5c..0d32ff8c5 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc @@ -58,11 +58,4 @@ class LpcMaskingModelTest : public testing::Test { TEST_F(LpcMaskingModelTest, CalculateResidualEnergyTest) { CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyC); -#ifdef WEBRTC_DETECT_ARM_NEON - if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) { - CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyNeon); - } -#elif defined(WEBRTC_ARCH_ARM_NEON) - CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyNeon); -#endif } diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_neon.S deleted file mode 100644 index 98ce3899a..000000000 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_neon.S +++ /dev/null @@ -1,645 +0,0 @@ -@ -@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ -@ Reference code in transform.c. Bit not exact due to how rounding is -@ done in C code and ARM instructions, but quality by assembly code is -@ not worse. - -#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h" -#include "webrtc/system_wrappers/interface/asm_defines.h" - -GLOBAL_FUNCTION WebRtcIsacfix_Spec2TimeNeon -GLOBAL_FUNCTION WebRtcIsacfix_Time2SpecNeon -GLOBAL_LABEL WebRtcIsacfix_kSinTab1 -GLOBAL_LABEL WebRtcIsacfix_kCosTab1 -GLOBAL_LABEL WebRtcIsacfix_kSinTab2 - -@ void WebRtcIsacfix_Time2SpecNeon(int16_t* inre1Q9, -@ int16_t* inre2Q9, -@ int16_t* outreQ7, -@ int16_t* outimQ7); - -DEFINE_FUNCTION WebRtcIsacfix_Time2SpecNeon -.align 2 - push {r3-r11,lr} @ need to push r4-r11, but push r3 too to keep - @ stack 8-byte aligned - sub sp, sp, #(16 + FRAMESAMPLES * 4) - - str r0, [sp] @ inre1Q9 - str r1, [sp, #4] @ inre2Q9 - str r2, [sp, #8] @ outreQ7 - str r3, [sp, #12] @ outimQ7 - - mov r8, #(FRAMESAMPLES - 16) - add r12, r0, r8 @ &inreQ7[FRAMESAMPLES / 2 - 4] - add r11, r1, r8 @ &inimQ7[FRAMESAMPLES / 2 - 4] - add r4, sp, #16 @ tmpreQ16; - add r5, sp, #(16 + FRAMESAMPLES * 2) @ tmpimQ16; - - adr r9, WebRtcIsacfix_kCosTab1 -#if defined(__APPLE__) - mov r6, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1) -#else - mov r6, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1) -#endif - add r10, r9, r6 @ WebRtcIsacfix_kSinTab1 - - vmov.u32 q14, #0 @ Initialize the maximum values for tmpInIm. - vmov.u32 q15, #0 @ Initialize the maximum values for tmpInRe. - movw r6, #16921 @ 0.5 / sqrt(240) in Q19 - lsl r6, #5 @ Together with vqdmulh, net effect is ">> 26". - mov r8, #(FRAMESAMPLES / 2) @ loop counter - vdup.s32 q11, r6 - -Time2Spec_TransformAndFindMax: -@ Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code. - - subs r8, #8 - - vld1.16 {q0}, [r9, :64]! @ WebRtcIsacfix_kCosTab1[] - vld1.16 {q2}, [r0]! @ inre1Q9[] - vmull.s16 q8, d0, d4 @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k] - vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab1[] - vmull.s16 q9, d1, d5 @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k] - vld1.16 {q3}, [r1]! @ inre2Q9[] - vmlal.s16 q8, d2, d6 @ WebRtcIsacfix_kSinTab1[k] * inre2Q9[k] - vmlal.s16 q9, d3, d7 @ WebRtcIsacfix_kSinTab1[k] * inre2Q9[k] - vmull.s16 q12, d0, d6 @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k] - vmull.s16 q13, d1, d7 @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k] - vmlsl.s16 q12, d2, d4 @ WebRtcIsacfix_kSinTab1[k] * inre1Q9[k] - vmlsl.s16 q13, d3, d5 @ WebRtcIsacfix_kSinTab1[k] * inre1Q9[k] - - vqdmulh.s32 q0, q8, q11 @ xrQ16 * factQ19 - vqdmulh.s32 q1, q9, q11 @ xrQ16 * factQ19 - vqdmulh.s32 q2, q12, q11 @ xrQ16 * factQ19 - vqdmulh.s32 q3, q13, q11 @ xrQ16 * factQ19 - - @ Find the absolute maximum in the vectors and store them. - vabs.s32 q8, q0 - vabs.s32 q9, q1 - vabs.s32 q12, q2 - vst1.32 {q0, q1}, [r4]! @ tmpreQ16[k] - vabs.s32 q13, q3 - vmax.u32 q14, q8 @ Use u32 so we don't lose the value 0x80000000. - vmax.u32 q15, q12 - vst1.32 {q2, q3}, [r5]! @ tmpimQ16[k] - vmax.u32 q15, q13 - vmax.u32 q14, q9 @ Maximum for outre1Q16[]. - - bgt Time2Spec_TransformAndFindMax - - @ Find the maximum value in the Neon registers - vmax.u32 d28, d29 - vmax.u32 d30, d31 - vpmax.u32 d28, d28, d28 @ Both 32 bits words hold the same value tmpInIm. - vpmax.u32 d30, d30, d30 @ Both 32 bits words hold the same value tmpInRe. - vmax.s32 d30, d28, d30 @ if (yrQ16 > xrQ16) {xrQ16 = yrQ16}; - - ldr r4, [sp] @ inre1Q9 - vcls.s32 d31, d30 @ sh = WebRtcSpl_NormW32(tmpInRe); - ldr r5, [sp, #4] @ inre2Q9 - vmov.i32 d30, #24 - add r6, sp, #16 @ tmpreQ16; - vsub.s32 d31, d31, d30 @ sh = sh - 24; - add r7, sp, #(16 + FRAMESAMPLES * 2) @ tmpimQ16; - vdup.s32 q8, d31[0] @ sh - - mov r8, #(FRAMESAMPLES / 2) @ loop counter - -Time2Spec_PreFftShift: - subs r8, #16 - - vld1.32 {q0, q1}, [r6]! @ tmpreQ16[] - vrshl.s32 q0, q0, q8 - vld1.32 {q2, q3}, [r6]! @ tmpreQ16[] - vrshl.s32 q1, q1, q8 - vld1.32 {q10, q11}, [r7]! @ tmpimQ16[] - vrshl.s32 q2, q2, q8 - vld1.32 {q12, q13}, [r7]! @ tmpimQ16[] - vrshl.s32 q3, q3, q8 - vrshl.s32 q10, q10, q8 - vrshl.s32 q11, q11, q8 - vrshl.s32 q12, q12, q8 - vrshl.s32 q13, q13, q8 - - vmovn.s32 d0, q0 - vmovn.s32 d1, q1 - vmovn.s32 d2, q2 - vmovn.s32 d3, q3 - vmovn.s32 d4, q10 - vmovn.s32 d5, q11 - vmovn.s32 d6, q12 - vmovn.s32 d7, q13 - - vst1.16 {q0, q1}, [r4]! @ inre1Q9[] - vst1.16 {q2, q3}, [r5]! @ inre2Q9[] - - bgt Time2Spec_PreFftShift - - vmov.s32 r10, d16[0] @ Store value of sh. - ldr r0, [sp] @ inre1Q9 - ldr r1, [sp, #4] @ inre2Q9 - mov r2, #-1 - CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest - - vdup.s32 q8, r10 @ sh - mov r8, #(FRAMESAMPLES - 8) - ldr r2, [sp, #8] @ outreQ7 - ldr r3, [sp, #12] @ outimQ7 - add r11, r2, r8 @ &outRe1Q16[FRAMESAMPLES / 2 - 4] - add r12, r3, r8 @ &outim2Q16[FRAMESAMPLES / 2 - 4] - ldr r6, [sp] @ inre1Q9 - ldr r7, [sp, #4] @ inre2Q9 - add r4, r6, r8 @ &inre1Q9[FRAMESAMPLES / 2 - 4] - add r5, r7, r8 @ &inre2Q9[FRAMESAMPLES / 2 - 4] - adr r10, WebRtcIsacfix_kSinTab2 - - add r9, r10, #(120*2 - 8) @ &WebRtcIsacfix_kSinTab2[119 - 4] - - vneg.s32 q15, q8 @ -sh - vmov.i32 q0, #23 - vsub.s32 q15, q15, q0 @ -sh - 23 - - mov r8, #(FRAMESAMPLES / 4) @ loop counter - - @ Pre-load variables. - vld1.16 {d2}, [r4] @ inre1Q9[FRAMESAMPLES / 2 - 4 - i] - vld1.16 {d3}, [r5] @ inre2Q9[FRAMESAMPLES / 2 - 4 - i] - vld1.16 {d0}, [r6]! @ inre1Q9 - vld1.16 {d1}, [r7]! @ inre2Q9 - -Time2Spec_PostFftTransform: -@ By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)", -@ ">> 14" and then ">> 9" as in the C code. - - vld1.16 {d6}, [r9, :64] @ kCosTab2[] - vneg.s16 d6, d6 - vld1.16 {d7}, [r10, :64]! @ WebRtcIsacfix_kSinTab2[] - vrev64.16 q1, q1 @ Reverse samples in 2nd half of xrQ16[]. - vqadd.s16 d4, d0, d2 @ xrQ16 - vqsub.s16 d5, d1, d3 @ xiQ16 - vrev64.16 d6, d6 - - sub r9, #8 @ Update pointers for kCosTab2[]. - sub r4, #8 @ Update pointers for inre1Q9[]. - sub r5, #8 @ Update pointers for inr22Q9[]. - subs r8, #4 @ Update loop counter. - - vqadd.s16 d1, d1, d3 @ yrQ16 - vqsub.s16 d0, d2, d0 @ yiQ16 - - vmull.s16 q12, d6, d4 @ kCosTab2[k] * xrQ16 - vmlsl.s16 q12, d7, d5 @ WebRtcIsacfix_kSinTab2[k] * xiQ16 - vmull.s16 q13, d7, d4 @ WebRtcIsacfix_kSinTab2[k] * xrQ16 - vmlal.s16 q13, d6, d5 @ kCosTab2[k] * xiQ16 - vmull.s16 q9, d7, d1 @ WebRtcIsacfix_kSinTab2[k] * yrQ16 - vmlal.s16 q9, d6, d0 @ kCosTab2[k] * yiQ16 - vmull.s16 q10, d7, d0 @ WebRtcIsacfix_kSinTab2[k] * yiQ16 - vmlsl.s16 q10, d6, d1 @ kCosTab2[k] * yrQ16 - - vshl.s32 q12, q12, q15 - vshl.s32 q13, q13, q15 - vshl.s32 q9, q9, q15 - vshl.s32 q10, q10, q15 - - vneg.s32 q8, q9 - vld1.16 {d0}, [r6]! @ inre1Q9 - vmovn.s32 d24, q12 - vld1.16 {d1}, [r7]! @ inre2Q9 - vmovn.s32 d25, q13 - vld1.16 {d2}, [r4] @ inre1Q9[FRAMESAMPLES / 2 - 4 - i] - vmovn.s32 d5, q10 - vld1.16 {d3}, [r5] @ inre2Q9[FRAMESAMPLES / 2 - 4 - i] - vmovn.s32 d4, q8 - vst1.16 {d24}, [r2]! @ outreQ7[k] - vrev64.16 q2, q2 @ Reverse the order of the samples. - vst1.16 {d25}, [r3]! @ outimQ7[k] - vst1.16 {d4}, [r11] @ outreQ7[FRAMESAMPLES / 2 - 1 - k] - vst1.16 {d5}, [r12] @ outimQ7[FRAMESAMPLES / 2 - 1 - k] - sub r11, #8 @ Update pointers for outreQ7[]. - sub r12, #8 @ Update pointers for outimQ7[]. - - bgt Time2Spec_PostFftTransform - - add sp, sp, #(16 + FRAMESAMPLES * 4) - pop {r3-r11,pc} - -.align 8 -@ Cosine table 1 in Q14 -WebRtcIsacfix_kCosTab1: -_WebRtcIsacfix_kCosTab1: @ Label for iOS - .short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315 - .short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069 - .short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647 - .short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053 - .short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295 - .short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380 - .short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318 - .short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121 - .short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803 - .short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377 - .short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859 - .short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266 - .short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616 - .short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926 - .short 1713, 1499, 1285, 1072, 857, 643, 429, 214 - .short 0, -214, -429, -643, -857, -1072, -1285, -1499 - .short -1713, -1926, -2139, -2351, -2563, -2775, -2986, -3196 - .short -3406, -3616, -3825, -4033, -4240, -4447, -4653, -4859 - .short -5063, -5266, -5469, -5671, -5872, -6071, -6270, -6467 - .short -6664, -6859, -7053, -7246, -7438, -7629, -7818, -8006 - .short -8192, -8377, -8561, -8743, -8923, -9102, -9280, -9456 - .short -9630, -9803, -9974, -10143, -10311, -10477, -10641, -10803 - .short -10963, -11121, -11278, -11433, -11585, -11736, -11885, -12031 - .short -12176, -12318, -12458, -12597, -12733, -12867, -12998, -13128 - .short -13255, -13380, -13502, -13623, -13741, -13856, -13970, -14081 - .short -14189, -14295, -14399, -14500, -14598, -14694, -14788, -14879 - .short -14968, -15053, -15137, -15218, -15296, -15371, -15444, -15515 - .short -15582, -15647, -15709, -15769, -15826, -15880, -15931, -15980 - .short -16026, -16069, -16110, -16147, -16182, -16214, -16244, -16270 - .short -16294, -16315, -16333, -16349, -16362, -16371, -16378, -16383 - -.align 8 -@ Sine table 2 in Q14 -WebRtcIsacfix_kSinTab2: -_WebRtcIsacfix_kSinTab2: @ Label for iOS - .short 16384, -16381, 16375, -16367, 16356, -16342, 16325, -16305 - .short 16283, -16257, 16229, -16199, 16165, -16129, 16090, -16048 - .short 16003, -15956, 15906, -15853, 15798, -15739, 15679, -15615 - .short 15549, -15480, 15408, -15334, 15257, -15178, 15095, -15011 - .short 14924, -14834, 14741, -14647, 14549, -14449, 14347, -14242 - .short 14135, -14025, 13913, -13799, 13682, -13563, 13441, -13318 - .short 13192, -13063, 12933, -12800, 12665, -12528, 12389, -12247 - .short 12104, -11958, 11810, -11661, 11509, -11356, 11200, -11042 - .short 10883, -10722, 10559, -10394, 10227, -10059, 9889, -9717 - .short 9543, -9368, 9191, -9013, 8833, -8652, 8469, -8285 - .short 8099, -7912, 7723, -7534, 7342, -7150, 6957, -6762 - .short 6566, -6369, 6171, -5971, 5771, -5570, 5368, -5165 - .short 4961, -4756, 4550, -4344, 4137, -3929, 3720, -3511 - .short 3301, -3091, 2880, -2669, 2457, -2245, 2032, -1819 - .short 1606, -1392, 1179, -965, 750, -536, 322, -107 - -@ Table kCosTab2 was removed since its data is redundant with kSinTab2. - -.align 8 -@ Sine table 1 in Q14 -WebRtcIsacfix_kSinTab1: -_WebRtcIsacfix_kSinTab1: @ Label for iOS - .short 0, 214, 429, 643, 857, 1072, 1285, 1499 - .short 1713, 1926, 2139, 2351, 2563, 2775, 2986, 3196 - .short 3406, 3616, 3825, 4033, 4240, 4447, 4653, 4859 - .short 5063, 5266, 5469, 5671, 5872, 6071, 6270, 6467 - .short 6664, 6859, 7053, 7246, 7438, 7629, 7818, 8006 - .short 8192, 8377, 8561, 8743, 8923, 9102, 9280, 9456 - .short 9630, 9803, 9974, 10143, 10311, 10477, 10641, 10803 - .short 10963, 11121, 11278, 11433, 11585, 11736, 11885, 12031 - .short 12176, 12318, 12458, 12597, 12733, 12867, 12998, 13128 - .short 13255, 13380, 13502, 13623, 13741, 13856, 13970, 14081 - .short 14189, 14295, 14399, 14500, 14598, 14694, 14788, 14879 - .short 14968, 15053, 15137, 15218, 15296, 15371, 15444, 15515 - .short 15582, 15647, 15709, 15769, 15826, 15880, 15931, 15980 - .short 16026, 16069, 16110, 16147, 16182, 16214, 16244, 16270 - .short 16294, 16315, 16333, 16349, 16362, 16371, 16378, 16383 - .short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315 - .short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069 - .short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647 - .short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053 - .short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295 - .short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380 - .short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318 - .short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121 - .short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803 - .short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377 - .short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859 - .short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266 - .short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616 - .short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926 - .short 1713, 1499, 1285, 1072, 857, 643, 429, 214 - -@ void WebRtcIsacfix_Spec2TimeNeon(int16_t *inreQ7, -@ int16_t *inimQ7, -@ int32_t *outre1Q16, -@ int32_t *outre2Q16); - -DEFINE_FUNCTION WebRtcIsacfix_Spec2TimeNeon -.align 2 - push {r3-r11,lr} @ need to push r4-r11, but push r3 too to keep - @ stack 8-byte aligned - - sub sp, sp, #16 - str r0, [sp] @ inreQ7 - str r1, [sp, #4] @ inimQ7 - str r2, [sp, #8] @ outre1Q16 - str r3, [sp, #12] @ outre2Q16 - - mov r8, #(FRAMESAMPLES - 16) - add r12, r0, r8 @ &inreQ7[FRAMESAMPLES / 2 - 8] - add r11, r1, r8 @ &inimQ7[FRAMESAMPLES / 2 - 8] - add r4, r2, r8, lsl #1 @ &outRe1Q16[FRAMESAMPLES / 2 - 8] - add r6, r3, r8, lsl #1 @ &outRe2Q16[FRAMESAMPLES / 2 - 8] - - mov r8, #(FRAMESAMPLES / 2) @ loop counter - adr r10, WebRtcIsacfix_kSinTab2 - add r9, r10, #(120*2 - 16) @ &WebRtcIsacfix_kSinTab2[119 - 8] - - vpush {q4-q7} - - mov r5, #-32 - mov r7, #-16 - vmov.u32 q6, #0 @ Initialize the maximum values for tmpInIm. - vmov.u32 q7, #0 @ Initialize the maximum values for tmpInRe. - -TransformAndFindMax: -@ Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code. -@ Bit-exact. - - subs r8, #16 - - vld1.16 {q0}, [r9, :64] @ kCosTab2[] - sub r9, #16 - vld1.16 {q2}, [r0]! @ inreQ7[] - vneg.s16 q0, q0 - vld1.16 {q3}, [r1]! @ inimQ7[] - vrev64.16 d0, d0 - vrev64.16 d1, d1 - vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab2[] - vswp d0, d1 - - vmull.s16 q8, d2, d6 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k] - vmull.s16 q9, d3, d7 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k] - vmlal.s16 q8, d0, d4 @ kCosTab2[k] * inreQ7[k] - vmlal.s16 q9, d1, d5 @ kCosTab2[k] * inreQ7[k] - vmull.s16 q12, d0, d6 @ kCosTab2[k] * inimQ7[k] - vmull.s16 q13, d1, d7 @ kCosTab2[k] * inimQ7[k] - vmlsl.s16 q12, d2, d4 @ WebRtcIsacfix_kSinTab2[k] * inreQ7[k] - vmlsl.s16 q13, d3, d5 @ WebRtcIsacfix_kSinTab2[k] * inreQ7[k] - - vld1.16 {q2}, [r11], r7 @ inimQ7[FRAMESAMPLES / 2 - 8 + i] - vld1.16 {q3}, [r12], r7 @ inreQ7[FRAMESAMPLES / 2 - 8 + i] - - vrev64.16 q2, q2 @ Reverse the order of the samples - vrev64.16 q3, q3 @ Reverse the order of the samples - - vmull.s16 q14, d2, d5 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k] - vmull.s16 q15, d3, d4 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k] - vmlsl.s16 q14, d0, d7 @ q14 -= kCosTab2[k] * inreQ7[k] - vmlsl.s16 q15, d1, d6 @ q15 -= kCosTab2[k] * inreQ7[k] - - vmull.s16 q10, d0, d5 @ kCosTab2[k] * inimQ7[] - vmull.s16 q11, d1, d4 @ kCosTab2[k] * inimQ7[] - vmlal.s16 q10, d2, d7 @ q10 += WebRtcIsacfix_kSinTab2[k] * inreQ7[] - vmlal.s16 q11, d3, d6 @ q11 += WebRtcIsacfix_kSinTab2[k] * inreQ7[] - - vshr.s32 q8, q8, #5 @ xrQ16 - vshr.s32 q9, q9, #5 @ xrQ16 - vshr.s32 q12, q12, #5 @ xiQ16 - vshr.s32 q13, q13, #5 @ xiQ16 - vshr.s32 q14, q14, #5 @ yiQ16 - vshr.s32 q15, q15, #5 @ yiQ16 - - vneg.s32 q10, q10 - vneg.s32 q11, q11 - - @ xrQ16 - yiQ16 - vsub.s32 q0, q8, q14 - vsub.s32 q1, q9, q15 - - vshr.s32 q10, q10, #5 @ yrQ16 - vshr.s32 q11, q11, #5 @ yrQ16 - - @ xrQ16 + yiQ16 - vadd.s32 q3, q8, q14 - vadd.s32 q2, q9, q15 - - @ yrQ16 + xiQ16 - vadd.s32 q4, q10, q12 - vadd.s32 q5, q11, q13 - - @ yrQ16 - xiQ16 - vsub.s32 q8, q11, q13 - vsub.s32 q9, q10, q12 - - @ Reverse the order of the samples - vrev64.32 q2, q2 - vrev64.32 q3, q3 - vrev64.32 q8, q8 - vrev64.32 q9, q9 - vswp d4, d5 - vswp d6, d7 - - vst1.32 {q0, q1}, [r2]! @ outre1Q16[k] - vswp d16, d17 - vswp d18, d19 - vst1.32 {q2, q3}, [r4], r5 @ outre1Q16[FRAMESAMPLES / 2 - 1 - k] - - @ Find the absolute maximum in the vectors and store them in q6 and q7. - vabs.s32 q10, q0 - vabs.s32 q14, q4 - vabs.s32 q11, q1 - vabs.s32 q15, q5 - vabs.s32 q12, q2 - vmax.u32 q6, q10 @ Use u32 so we don't lose the value 0x80000000. - vmax.u32 q7, q14 @ Maximum for outre2Q16[]. - vabs.s32 q0, q8 - vmax.u32 q6, q11 @ Maximum for outre1Q16[]. - vmax.u32 q7, q15 - vabs.s32 q13, q3 - vmax.u32 q6, q12 - vmax.u32 q7, q0 - vabs.s32 q1, q9 - vst1.32 {q4, q5}, [r3]! @ outre2Q16[k] - vst1.32 {q8, q9}, [r6], r5 @ outre2Q16[FRAMESAMPLES / 2 - 1 - k] - vmax.u32 q6, q13 - vmax.u32 q7, q1 - - bgt TransformAndFindMax - - adr r10, WebRtcIsacfix_kSinTab1 -#if defined(__APPLE__) - mov r2, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1) -#else - mov r2, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1) -#endif - - sub r11, r10, r2 @ WebRtcIsacfix_kCosTab1 - - @ Find the maximum value in the Neon registers - vmax.u32 d12, d13 - vmax.u32 d14, d15 - vpmax.u32 d12, d12, d12 @ Both 32 bits words hold the same value tmpInIm. - vpmax.u32 d14, d14, d14 @ Both 32 bits words hold the same value tmpInRe. - vmax.s32 d0, d12, d14 @ if (tmpInIm>tmpInRe) tmpInRe = tmpInIm; - - vpop {q4-q7} - - ldr r4, [sp] @ inreQ7 - vcls.s32 d1, d0 @ sh = WebRtcSpl_NormW32(tmpInRe); - ldr r5, [sp, #4] @ inimQ7 - vmov.i32 d0, #24 @ sh = sh-24; - ldr r6, [sp, #8] @ outre1Q16 - vsub.s32 d1, d1, d0 - ldr r7, [sp, #12] @ outre2Q16 - vdup.s32 q8, d1[0] @ sh - - mov r8, #(FRAMESAMPLES / 2) - -PreFftShift: - subs r8, #16 - vld1.32 {q0, q1}, [r6]! @ outre1Q16[] - vld1.32 {q2, q3}, [r6]! @ outre1Q16[] - vrshl.s32 q0, q0, q8 - vrshl.s32 q1, q1, q8 - vrshl.s32 q2, q2, q8 - vrshl.s32 q3, q3, q8 - vld1.32 {q10, q11}, [r7]! @ outre2Q16[] - vld1.32 {q12, q13}, [r7]! @ outre2Q16[] - vrshl.s32 q10, q10, q8 - vrshl.s32 q11, q11, q8 - vrshl.s32 q12, q12, q8 - vrshl.s32 q13, q13, q8 - - vmovn.s32 d0, q0 - vmovn.s32 d1, q1 - vmovn.s32 d2, q2 - vmovn.s32 d3, q3 - vmovn.s32 d4, q10 - vmovn.s32 d5, q11 - vmovn.s32 d6, q12 - vmovn.s32 d7, q13 - - vst1.16 {q0, q1}, [r4]! @ inreQ7[] - vst1.16 {q2, q3}, [r5]! @ inimQ7[] - - bgt PreFftShift - - vmov.s32 r8, d16[0] @ Store value of sh. - ldr r0, [sp] @ inreQ7 - ldr r1, [sp, #4] @ inimQ7 - mov r2, #1 - CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest - - vdup.s32 q8, r8 @ sh - mov r9, r11 @ WebRtcIsacfix_kCosTab1 - ldr r4, [sp] @ inreQ7 - ldr r5, [sp, #4] @ inimQ7 - ldr r6, [sp, #8] @ outre1Q16 - ldr r7, [sp, #12] @ outre2Q16 - mov r8, #(FRAMESAMPLES / 2) - vneg.s32 q15, q8 @ -sh - movw r0, #273 - lsl r0, #15 @ Together with vqdmulh, net effect is ">> 16". - vdup.s32 q14, r0 - -PostFftShiftDivide: - subs r8, #16 - - vld1.16 {q0, q1}, [r4]! @ inreQ7 - vmovl.s16 q10, d0 - vmovl.s16 q11, d1 - vld1.16 {q2, q3}, [r5]! @ inimQ7 - vmovl.s16 q8, d2 - vmovl.s16 q9, d3 - - vshl.s32 q10, q10, q15 - vshl.s32 q11, q11, q15 - vshl.s32 q8, q8, q15 - vshl.s32 q9, q9, q15 - - vqdmulh.s32 q10, q10, q14 - vqdmulh.s32 q11, q11, q14 - vqdmulh.s32 q8, q8, q14 - vqdmulh.s32 q9, q9, q14 - - vmovl.s16 q0, d4 - vmovl.s16 q1, d5 - vmovl.s16 q2, d6 - vmovl.s16 q3, d7 - - vshl.s32 q0, q0, q15 - vshl.s32 q1, q1, q15 - vshl.s32 q2, q2, q15 - vshl.s32 q3, q3, q15 - - @ WEBRTC_SPL_MUL_16_32_RSFT16(273, outre2Q16[k]) - vqdmulh.s32 q0, q0, q14 - vqdmulh.s32 q1, q1, q14 - vst1.32 {q10, q11}, [r6]! @ outre1Q16[] - vqdmulh.s32 q2, q2, q14 - vqdmulh.s32 q3, q3, q14 - vst1.32 {q8, q9}, [r6]! @ outre1Q16[] - vst1.32 {q0, q1}, [r7]! @ outre2Q16[] - vst1.32 {q2, q3}, [r7]! @ outre2Q16[] - - bgt PostFftShiftDivide - - mov r8, #(FRAMESAMPLES / 2) - ldr r2, [sp, #8] @ outre1Q16 - ldr r3, [sp, #12] @ outre2Q16 - movw r0, #31727 - lsl r0, #16 @ With vqdmulh and vrshrn, net effect is ">> 25". - -DemodulateAndSeparate: - subs r8, #8 - - vld1.16 {q0}, [r9, :64]! @ WebRtcIsacfix_kCosTab1[] - vmovl.s16 q10, d0 @ WebRtcIsacfix_kCosTab1[] - vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab1[] - vmovl.s16 q11, d1 @ WebRtcIsacfix_kCosTab1[] - vld1.32 {q2, q3}, [r2] @ outre1Q16 - vmovl.s16 q12, d2 @ WebRtcIsacfix_kSinTab1[] - vld1.32 {q14, q15}, [r3] @ outre2Q16 - vmovl.s16 q13, d3 @ WebRtcIsacfix_kSinTab1[] - - vmull.s32 q0, d20, d4 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k] - vmull.s32 q1, d21, d5 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k] - vmull.s32 q8, d22, d6 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k] - vmull.s32 q9, d23, d7 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k] - - vmlsl.s32 q0, d24, d28 @ += WebRtcIsacfix_kSinTab1[k] * outre2Q16[k] - vmlsl.s32 q1, d25, d29 @ += WebRtcIsacfix_kSinTab1[k] * outre2Q16[k] - vmlsl.s32 q8, d26, d30 @ += WebRtcIsacfix_kSinTab1[k] * outre2Q16[k] - vmlsl.s32 q9, d27, d31 @ += WebRtcIsacfix_kSinTab1[k] * outre2Q16[k] - - vrshrn.s64 d0, q0, #10 @ xrQ16 - vrshrn.s64 d1, q1, #10 @ xrQ16 - vrshrn.s64 d2, q8, #10 @ xrQ16 - vrshrn.s64 d3, q9, #10 @ xrQ16 - - vmull.s32 q8, d20, d28 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k] - vmull.s32 q9, d21, d29 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k] - vmull.s32 q14, d22, d30 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k] - vmull.s32 q15, d23, d31 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k] - - vmlal.s32 q8, d24, d4 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k] - vmlal.s32 q9, d25, d5 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k] - vmlal.s32 q14, d26, d6 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k] - vmlal.s32 q15, d27, d7 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k] - - vdup.s32 q11, r0 @ generic -> Neon doesn't cost extra cycles. - - vrshrn.s64 d24, q8, #10 @ xiQ16 - vrshrn.s64 d25, q9, #10 @ xiQ16 - vqdmulh.s32 q0, q0, q11 - vrshrn.s64 d26, q14, #10 @ xiQ16 - vrshrn.s64 d27, q15, #10 @ xiQ16 - - @ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xrQ16) - @ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xiQ16) - - vqdmulh.s32 q1, q1, q11 - vqdmulh.s32 q2, q12, q11 - vqdmulh.s32 q3, q13, q11 - - vst1.16 {q0, q1}, [r2]! @ outre1Q16[] - vst1.16 {q2, q3}, [r3]! @ outre2Q16[] - - bgt DemodulateAndSeparate - - add sp, sp, #16 - pop {r3-r11,pc} diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c index ee96b8e35..8f89fb8f8 100644 --- a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c @@ -16,7 +16,6 @@ #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h" #include "webrtc/typedefs.h" -#if !(defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON) /* Cosine table 1 in Q14. */ const int16_t WebRtcIsacfix_kCosTab1[FRAMESAMPLES/2] = { 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315, 16294, 16270, @@ -90,7 +89,6 @@ const int16_t WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4] = { 4137, -3929, 3720, -3511, 3301, -3091, 2880, -2669, 2457, -2245, 2032, -1819, 1606, -1392, 1179, -965, 750, -536, 322, -107 }; -#endif #if defined(MIPS32_LE) /* Cosine table 2 in Q14. Used only on MIPS platforms. */ diff --git a/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi b/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi index cc2af97f8..285583c6b 100644 --- a/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi +++ b/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi @@ -142,11 +142,9 @@ ], 'sources': [ 'fix/source/entropy_coding_neon.c', - 'fix/source/filterbanks_neon.S', - 'fix/source/filters_neon.S', - 'fix/source/lattice_neon.S', - 'fix/source/lpc_masking_model_neon.S', - 'fix/source/transform_neon.S', + 'fix/source/filters_neon.c', + 'fix/source/lattice_neon.c', + 'fix/source/transform_neon.c', ], 'conditions': [ # Disable LTO in isac_neon target due to compiler bug @@ -156,27 +154,11 @@ '-ffat-lto-objects', ], }], - ['target_arch=="arm64"', { - 'sources!': [ - 'fix/source/filterbanks_neon.S', - 'fix/source/filters_neon.S', - 'fix/source/lattice_neon.S', - 'fix/source/lpc_masking_model_neon.S', - 'fix/source/transform_neon.S', - ], - 'sources': [ - 'fix/source/filters_neon.c', - 'fix/source/lattice_neon.c', - 'fix/source/transform_neon.c', - ], - 'conditions': [ - # Disable AllpassFilter2FixDec16Neon function due to a clang - # bug. Refer more details at: - # https://code.google.com/p/webrtc/issues/detail?id=4567 - ['clang==0', { + # Disable AllpassFilter2FixDec16Neon function due to a clang + # bug. Refer more details at: + # https://code.google.com/p/webrtc/issues/detail?id=4567 + ['target_arch!="arm64" or clang==0', { 'sources': ['fix/source/filterbanks_neon.c',], - }], - ], }] ], },