Replace asm NEON function by intrinsics implementation on ARMv7

Passed building isac_neon and modules_unittests on Android ARMv7.
Passed modules_unittests with following filters:
  --gtest_filter=FiltersTest*
  --gtest_filter=LpcMaskingModelTest*
  --gtest_filter=TransformTest*
  --gtest_filter=FilterBanksTest*

WebRtcIsacfix_CalculateResidualEnergyNeon is removed; see Issue 4224 for
details.

The old review url is at: https://webrtc-codereview.appspot.com/37259004/

BUG=4002
R=andrew@webrtc.org, jridges@masque.com, kjellander@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/48319005

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

Change-Id: I4c16e15930f1b3449d67b67bf023fac28121dff8
Cr-Commit-Position: refs/heads/master@{#9140}
This commit is contained in:
Zhongwei Yao
2015-05-06 16:39:17 +08:00
committed by Zhongwei Yao
parent 507a550af8
commit f242e665b4
11 changed files with 14 additions and 1443 deletions

View File

@@ -591,17 +591,14 @@ source_set("isacfix") {
if (rtc_build_armv7_neon || current_cpu == "arm64") { if (rtc_build_armv7_neon || current_cpu == "arm64") {
source_set("isac_neon") { source_set("isac_neon") {
sources = [ "codecs/isac/fix/source/entropy_coding_neon.c" ] sources = [
"codecs/isac/fix/source/entropy_coding_neon.c",
if (rtc_build_armv7_neon) { "codecs/isac/fix/source/filters_neon.c",
sources += [ "codecs/isac/fix/source/lattice_neon.c",
"codecs/isac/fix/source/filterbanks_neon.S", "codecs/isac/fix/source/transform_neon.c",
"codecs/isac/fix/source/filters_neon.S",
"codecs/isac/fix/source/lattice_neon.S",
"codecs/isac/fix/source/lpc_masking_model_neon.S",
"codecs/isac/fix/source/transform_neon.S",
] ]
if (rtc_build_armv7_neon) {
# Enable compilation for the ARM v7 Neon instruction set. This is needed # Enable compilation for the ARM v7 Neon instruction set. This is needed
# since //build/config/arm.gni only enables Neon for iOS, not Android. # since //build/config/arm.gni only enables Neon for iOS, not Android.
# This provides the same functionality as webrtc/build/arm_neon.gypi. # This provides the same functionality as webrtc/build/arm_neon.gypi.
@@ -614,19 +611,12 @@ if (rtc_build_armv7_neon || current_cpu == "arm64") {
] ]
} }
if (current_cpu == "arm64") { if (current_cpu != "arm64" || !is_clang) {
sources += [
"codecs/isac/fix/source/filters_neon.c",
"codecs/isac/fix/source/lattice_neon.c",
"codecs/isac/fix/source/transform_neon.c",
]
# Disable AllpassFilter2FixDec16Neon function due to a clang bug. # Disable AllpassFilter2FixDec16Neon function due to a clang bug.
# Refer more details at: # Refer more details at:
# https://code.google.com/p/webrtc/issues/detail?id=4567 # https://code.google.com/p/webrtc/issues/detail?id=4567
if (!is_clang) {
sources += [ "codecs/isac/fix/source/filterbanks_neon.c", ] sources += [ "codecs/isac/fix/source/filterbanks_neon.c", ]
} }
}
# Disable LTO in audio_processing_neon target due to compiler bug. # Disable LTO in audio_processing_neon target due to compiler bug.
if (rtc_use_lto) { if (rtc_use_lto) {

View File

@@ -1,270 +0,0 @@
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Contains a function for WebRtcIsacfix_AllpassFilter2FixDec16Neon()
@ in iSAC codec, optimized for ARM Neon platform. Bit exact with function
@ WebRtcIsacfix_AllpassFilter2FixDec16C() in filterbanks.c. Prototype
@ C code is at end of this file.
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
.align 2
@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
@ int16_t *data_ch1, // Input and output in channel 1, in Q0
@ int16_t *data_ch2, // Input and output in channel 2, in Q0
@ const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15
@ const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15
@ const int length, // Length of the data buffers
@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
@ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
@ In-place all-pass filtering of two channels at once.
@ Arguments as used below: r0 = data_ch1, r1 = data_ch2, r2 = factor_ch1,
@ r3 = factor_ch2; length, filter_state_ch1 and filter_state_ch2 are read
@ from the stack.
@ Register roles: d30 = {factor_ch1[0], factor_ch1[1], factor_ch2[0],
@ factor_ch2[1]}, d31 = -d30, q8 = the four filter states
@ (d16 = channel 1, d17 = channel 2), r3 = loop counter.
@ NOTE(review): the main loop body always runs at least once and handles two
@ samples per pass, so this presumably expects an even length >= 4 -- confirm
@ with the callers.
DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
push {r4 - r7}
@ The push above moved sp down by 16 bytes, so the stacked arguments start
@ at [sp, #16].
ldr r5, [sp, #24] @ filter_state_ch2
ldr r6, [sp, #20] @ filter_state_ch1
@ Initialize the Neon registers.
vld1.16 d0[0], [r0]! @ data_ch1[0]
vld1.16 d0[2], [r1]! @ data_ch2[0]
vld1.32 d30[0], [r2] @ factor_ch1[0], factor_ch1[1]
vld1.32 d30[1], [r3] @ factor_ch2[0], factor_ch2[1]
vld1.32 d16[0], [r6]! @ filter_state_ch1[0]
vld1.32 d17[0], [r5]! @ filter_state_ch2[0]
vneg.s16 d31, d30
ldr r3, [sp, #16] @ length
mov r4, #4 @ Post-index step (+4 bytes) applied after each store in the loop
mov r2, #-2 @ Post-index step (-2 bytes) applied after each load in the loop
sub r3, #2 @ Loop counter
@ Loop unrolling pre-processing.
vqdmull.s16 q1, d30, d0
vshll.s16 q0, d0, #16
vqadd.s32 q2, q1, q8
vshrn.i32 d6, q2, #16
vmull.s16 q1, d31, d6
vshl.s32 q1, #1
vqadd.s32 q8, q1, q0
vld1.32 d16[1], [r6] @ filter_state_ch1[1]
vld1.32 d17[1], [r5] @ filter_state_ch2[1]
sub r6, #4 @ &filter_state_ch1[0]
sub r5, #4 @ &filter_state_ch2[0]
vld1.16 d6[1], [r0], r2 @ data_ch1[1]
vld1.16 d6[3], [r1], r2 @ data_ch2[1]
vrev32.16 d0, d6
FOR_LOOP:
vqdmull.s16 q1, d30, d0
vshll.s16 q0, d0, #16
vqadd.s32 q2, q1, q8
vshrn.i32 d4, q2, #16
vmull.s16 q1, d31, d4
vst1.16 d4[1], [r0], r4 @ Store data_ch1[n]
vst1.16 d4[3], [r1], r4 @ Store data_ch2[n]
vshl.s32 q1, #1
vld1.16 d4[1], [r0], r2 @ Load data_ch1[n + 2]
vld1.16 d4[3], [r1], r2 @ Load data_ch2[n + 2]
vqadd.s32 q8, q1, q0
vrev32.16 d0, d4
vqdmull.s16 q1, d30, d0
subs r3, #2
vqadd.s32 q2, q1, q8
vshrn.i32 d6, q2, #16
vmull.s16 q1, d31, d6
vshll.s16 q0, d0, #16
vst1.16 d6[1], [r0], r4 @ Store data_ch1[n + 1]
vst1.16 d6[3], [r1], r4 @ Store data_ch2[n + 1]
vshl.s32 q1, #1
vld1.16 d6[1], [r0], r2 @ Load data_ch1[n + 3]
vld1.16 d6[3], [r1], r2 @ Load data_ch2[n + 3]
vqadd.s32 q8, q1, q0
vrev32.16 d0, d6
bgt FOR_LOOP
@ Loop unrolling post-processing.
vqdmull.s16 q1, d30, d0
vshll.s16 q0, d0, #16
vqadd.s32 q2, q1, q8
vshrn.i32 d4, q2, #16
vmull.s16 q1, d31, d4
vst1.16 d4[1], [r0]! @ Store data_ch1[n]
vst1.16 d4[3], [r1]! @ Store data_ch2[n]
vshl.s32 q1, #1
vqadd.s32 q8, q1, q0
vrev32.16 d0, d4
vqdmull.s16 q1, d30, d0
vshll.s16 q0, d0, #16
vqadd.s32 q2, q1, q8
vshrn.i32 d6, q2, #16
vmull.s16 q1, d31, d6
vst1.16 d6[1], [r0] @ Store data_ch1[n + 1]
vst1.16 d6[3], [r1] @ Store data_ch2[n + 1]
vshl.s32 q1, #1
@ r6 walks filter_state_ch1 and r5 walks filter_state_ch2 in the stores below.
vst1.32 d16[0], [r6]! @ Store filter_state_ch1[0]
vqadd.s32 q9, q1, q0
vst1.32 d17[0], [r5]! @ Store filter_state_ch2[0]
vst1.32 d18[1], [r6] @ Store filter_state_ch1[1]
vst1.32 d19[1], [r5] @ Store filter_state_ch2[1]
pop {r4 - r7}
bx lr
@void AllpassFilter2FixDec16BothChannels(
@ int16_t *data_ch1, // Input and output in channel 1, in Q0
@ int16_t *data_ch2, // Input and output in channel 2, in Q0
@ const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15
@ const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15
@ const int length, // Length of the data buffers
@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
@ int32_t *filter_state_ch2) { // Filter state for channel 2, in Q16
@ int n = 0;
@ int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
@ int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
@ int16_t sample0_ch1 = 0, sample0_ch2 = 0;
@ int16_t sample1_ch1 = 0, sample1_ch2 = 0;
@ int32_t a0_ch1 = 0, a0_ch2 = 0;
@ int32_t b0_ch1 = 0, b0_ch2 = 0;
@
@ int32_t a1_ch1 = 0, a1_ch2 = 0;
@ int32_t b1_ch1 = 0, b1_ch2 = 0;
@ int32_t b2_ch1 = 0, b2_ch2 = 0;
@
@ // Loop unrolling preprocessing.
@
@ sample0_ch1 = data_ch1[n];
@ sample0_ch2 = data_ch2[n];
@
@ a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
@ a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
@
@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
@
@ a0_ch1 = -factor_ch1[0] * (int16_t)(b0_ch1 >> 16);
@ a0_ch2 = -factor_ch2[0] * (int16_t)(b0_ch2 >> 16);
@
@ state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
@ state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
@
@ sample1_ch1 = data_ch1[n + 1];
@ sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
@ sample1_ch2 = data_ch2[n + 1];
@ sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
@
@
@ for (n = 0; n < length - 2; n += 2) {
@ a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
@ a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
@ a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
@ a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
@
@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); //Q16+Q16=Q16
@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); //Q16+Q16=Q16
@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); //Q16+Q16=Q16
@
@ a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
@ a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
@ a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
@ a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
@
@ state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
@ state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
@ state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
@ state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
@
@ sample0_ch1 = data_ch1[n + 2];
@ sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
@ sample0_ch2 = data_ch2[n + 2];
@ sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
@
@ a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
@ a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
@ a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
@ a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
@
@ b2_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
@ b2_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
@
@ a0_ch1 = -factor_ch1[0] * (int16_t)(b2_ch1 >> 16);
@ a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
@ a0_ch2 = -factor_ch2[0] * (int16_t)(b2_ch2 >> 16);
@ a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
@
@ state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
@ state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
@ state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
@ state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
@
@
@ sample1_ch1 = data_ch1[n + 3];
@ sample0_ch1 = (int16_t) (b2_ch1 >> 16); //Save as Q0
@ sample1_ch2 = data_ch2[n + 3];
@ sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
@
@ data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
@ data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
@ data_ch2[n] = (int16_t) (b0_ch2 >> 16);
@ data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
@ }
@
@ // Loop unrolling post-processing.
@
@ a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
@ a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
@ a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
@ a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
@
@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1);
@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2);
@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2);
@
@ a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
@ a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
@ a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
@ a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
@
@ state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
@ state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
@ state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
@ state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
@
@ data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
@ data_ch2[n] = (int16_t) (b0_ch2 >> 16);
@
@ sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
@ sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
@
@ a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
@ a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
@
@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
@
@ a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
@ a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
@
@ state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
@ state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
@
@ data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
@ data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
@
@ filter_state_ch1[0] = state0_ch1;
@ filter_state_ch1[1] = state1_ch1;
@ filter_state_ch2[0] = state0_ch2;
@ filter_state_ch2[1] = state1_ch2;
@}

View File

@@ -1,145 +0,0 @@
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Reference code in filters.c. Output is bit-exact.
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
.align 2
@ int WebRtcIsacfix_AutocorrNeon(
@ int32_t* __restrict r,
@ const int16_t* __restrict x,
@ int16_t N,
@ int16_t order,
@ int16_t* __restrict scale);
@ Computes the autocorrelation coefficients r[0..order] of x[0..N-1].
@ r[0] is the sum of x[k] * x[k], scaled down so that it fits in 32 bits;
@ the same shift is then applied to every r[i] = sum of x[k] * x[k + i].
@ In:    r0 = r (output), r1 = x, r2 = N, r3 = order, [sp] = scale (output).
@ Out:   r0 = order + 1; *scale = number of bits the sums were shifted right.
@ Roles: d29 = -scale, r5 = lag i, r6/r7 = running pointers into x.
@ NOTE(review): the r[0] loop reads four samples per pass and runs at least
@ once, so N is presumably a positive multiple of 4 -- confirm with callers.
DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
        push    {r3 - r12}

        @ Constant initializations
        mov     r4, #33
        vmov.i32 d0, #0
        vmov.i32 q8, #0
        vmov.i32 d29, #0                @ Initialize (-scale).
        vmov.u8 d30, #255               @ Initialize d30 as -1.
        vmov.i32 d0[0], r4              @ d0: 33 (low word), 0 (high word)
        vmov.i32 d25, #32
        mov     r5, r1                  @ x
        mov     r6, r2                  @ N

        @ Generate the first coefficient r0.
LOOP_R0:
        vld1.16 {d18}, [r5]!            @ x[]
        subs    r6, r6, #4
        vmull.s16 q9, d18, d18
        vpadal.s32 q8, q9
        bgt     LOOP_R0
        vadd.i64 d16, d16, d17

        @ Calculate scaling (the value of shifting).
        vmov    d17, d16

        @ Check overflow and determine the value for 'scale'.
        @ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and
        @ lower 32-bit words. Note that we don't care about the value of the upper
        @ word in d17.

        @ Check the case of 1 bit overflow. If it occurs store the results for
        @ scale and r[0] in d17 and d29.
        vshr.u64 d3, d16, #1
        vclt.s32 d1, d16, #0            @ < 0 ?
        vbit    d17, d3, d1             @ For r[0]
        vbit    d29, d30, d1            @ -scale = -1

        @ For the case of more than 1 bit overflow. If it occurs overwrite the
        @ results for scale and r[0] in d17 and d29.
        vclz.s32 d5, d16                @ Leading zeros of the two 32 bit words.
        vshr.s64 d26, d5, #32           @ Keep only the upper 32 bits.
        vsub.i64 d31, d26, d0           @ zeros - 33
        vshl.i64 d27, d26, #32
        vorr    d27, d26                @ Duplicate the high word with its low one.
        vshl.u64 d2, d16, d31           @ Shift by (-scale).
        vclt.s32 d1, d27, d25           @ < 32 ?
        vbit    d17, d2, d1             @ For r[0]
        vbit    d29, d31, d1            @ -scale
        vst1.32 d17[0], [r0]!           @ r[0]
        mov     r5, #1                  @ outer loop counter

        @ Generate rest of the coefficients
LOOP_R:
        vmov.i32 q8, #0                 @ Initialize the accumulation result.
        vmov.i32 q9, #0                 @ Initialize the accumulation result.
        mov     r7, r1                  @ &x[0]
        add     r6, r7, r5, lsl #1      @ x[i]
        sub     r12, r2, r5             @ N - i
        lsr     r8, r12, #3             @ inner loop counter
        sub     r12, r8, lsl #3         @ Leftover samples to be processed

        @ The 8-sample loop below tests its counter only after the body, so
        @ skip it entirely when fewer than 8 products remain; otherwise it
        @ would consume 8 samples and read past the end of x[].
        cmp     r8, #0
        beq     CHECK_FOUR_SAMPLES
LOOP_8X_SAMPLES:                        @ Multiple of 8 samples
        vld1.16 {d20, d21}, [r7]!       @ x[0, ...]
        vld1.16 {d22, d23}, [r6]!       @ x[i, ...]
        vmull.s16 q12, d20, d22
        vmull.s16 q13, d21, d23
        subs    r8, #1
        vpadal.s32 q8, q12
        vpadal.s32 q9, q13
        bgt     LOOP_8X_SAMPLES
CHECK_FOUR_SAMPLES:
        cmp     r12, #4
        blt     REST_SAMPLES
Four_SAMPLES:
        vld1.16 d20, [r7]!
        vld1.16 d22, [r6]!
        vmull.s16 q12, d20, d22
        vpadal.s32 q8, q12
        sub     r12, #4
REST_SAMPLES:
        mov     r8, #0                  @ Initialize lower word of the accumulation.
        mov     r4, #0                  @ Initialize upper word of the accumulation.
        cmp     r12, #0
        ble     SUMUP
LOOP_REST_SAMPLES:
        ldrh    r9, [r7], #2            @ x[0, ...]
        ldrh    r10, [r6], #2           @ x[i, ...]
        smulbb  r11, r9, r10
        adds    r8, r8, r11             @ lower word of the accumulation.
        adc     r4, r4, r11, asr #31    @ upper word of the accumulation.
        subs    r12, #1
        bgt     LOOP_REST_SAMPLES

        @ Added the multiplication results together and do a shift.
SUMUP:
        vadd.i64 d16, d17
        vadd.i64 d18, d19
        vadd.i64 d18, d16
        vmov    d17, r8, r4
        vadd.i64 d18, d17
        vshl.s64 d18, d29               @ Shift left by (-scale).
        vst1.32 d18[0], [r0]!           @ r[i]
        add     r5, #1
        cmp     r5, r3
        ble     LOOP_R

        vneg.s32 d29, d29               @ Get value for 'scale'.
        ldr     r2, [sp, #40]           @ &scale (40 bytes of saved registers below it)
        add     r0, r3, #1              @ return (order + 1)
        vst1.s16 d29[0], [r2]           @ Store 'scale'
        pop     {r3 - r12}
        bx      lr

View File

@@ -205,10 +205,6 @@ static void WebRtcIsacfix_InitNeon(void) {
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon; WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon;
WebRtcIsacfix_Spec2Time = WebRtcIsacfix_Spec2TimeNeon; WebRtcIsacfix_Spec2Time = WebRtcIsacfix_Spec2TimeNeon;
WebRtcIsacfix_Time2Spec = WebRtcIsacfix_Time2SpecNeon; WebRtcIsacfix_Time2Spec = WebRtcIsacfix_Time2SpecNeon;
#if !(defined WEBRTC_ARCH_ARM64_NEON)
WebRtcIsacfix_CalculateResidualEnergy =
WebRtcIsacfix_CalculateResidualEnergyNeon;
#endif
// Disable AllpassFilter2FixDec16Neon function due to a clang bug. // Disable AllpassFilter2FixDec16Neon function due to a clang bug.
// Refer more details at: // Refer more details at:
// https://code.google.com/p/webrtc/issues/detail?id=4567 // https://code.google.com/p/webrtc/issues/detail?id=4567

View File

@@ -1,146 +0,0 @@
@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ lattice_neon.s
@
@ Contains a function for the core loop in the normalized lattice MA
@ filter routine for iSAC codec, optimized for ARM Neon platform.
@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
@ int16_t input1,
@ int32_t input2,
@ int32_t* ptr0,
@ int32_t* ptr1,
@ int32_t* __restrict ptr2);
@ It calculates
@ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
@ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
@ in Q15 domain.
@
@ Reference code in lattice.c.
@ Output is not bit-exact with the reference C code, due to the replacement
@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
@ instructions, smulwb, and smull. Speech quality was not degraded by
@ testing speech and tone vectors.
#include "webrtc/system_wrappers/interface/asm_defines.h"
#include "settings.h"
GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
.align 2
@ Core loop of the normalized lattice MA filter. For each of the
@ (HALF_SUBFRAMELEN - 1) samples it computes, in the Q15 domain,
@   *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
@   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
@ and advances the three pointers.
@ In:    r0 = input0, r1 = input1, r2 = input2, r3 = ptr0,
@        [sp] = ptr1, [sp, #4] = ptr2.
@ Roles: d28/d29/d30 = input0/input1/input2 broadcast, r4 = ptr1, r12 = ptr2,
@        r5 = number of 4-sample passes, r6 = leftover samples (0..3).
@ NOTE(review): the 4-sample loop runs at least once, so this presumably
@ relies on HALF_SUBFRAMELEN - 1 >= 4 -- confirm against settings.h.
DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
        push    {r4-r8}

        vdup.32 d28, r0                 @ Initialize Neon register with input0
        vdup.32 d29, r1                 @ Initialize Neon register with input1
        vdup.32 d30, r2                 @ Initialize Neon register with input2
        ldr     r4, [sp, #20]           @ ptr1
        ldr     r12, [sp, #24]          @ ptr2

        @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
        @ Leftover samples after the loop, in r6:
        @    r6 = (HALF_SUBFRAMELEN - 1) - (((HALF_SUBFRAMELEN - 1) >> 2) << 2)
        mov     r6, #HALF_SUBFRAMELEN
        sub     r6, #1
        lsr     r5, r6, #2
        sub     r6, r5, lsl #2

        @ First r5 iterations in a loop.
LOOP:
        vld1.32 {d0, d1}, [r3]!         @ *ptr0
        vmull.s32 q10, d0, d28          @ tmp32a = input0 * (*ptr0)
        vmull.s32 q11, d1, d28          @ tmp32a = input0 * (*ptr0)
        vmull.s32 q12, d0, d29          @ input1 * (*ptr0)
        vmull.s32 q13, d1, d29          @ input1 * (*ptr0)
        vrshrn.i64 d4, q10, #15
        vrshrn.i64 d5, q11, #15
        vld1.32 {d2, d3}, [r12]         @ *ptr2
        vadd.i32 q3, q2, q1             @ tmp32b = *ptr2 + tmp32a
        vrshrn.i64 d0, q12, #15
        vmull.s32 q10, d6, d30          @ input2 * tmp32b
        vmull.s32 q11, d7, d30          @ input2 * tmp32b
        vrshrn.i64 d16, q10, #16
        vrshrn.i64 d17, q11, #16
        vmull.s32 q10, d16, d28         @ input0 * (*ptr2)
        vmull.s32 q11, d17, d28         @ input0 * (*ptr2)
        vrshrn.i64 d1, q13, #15
        vrshrn.i64 d18, q10, #15
        vrshrn.i64 d19, q11, #15
        vst1.32 {d16, d17}, [r12]!      @ *ptr2
        vadd.i32 q9, q0, q9
        subs    r5, #1
        vst1.32 {d18, d19}, [r4]!       @ *ptr1
        bgt     LOOP

        @ Check how many samples still need to be processed. r6 is in 0..3:
        @ bit 1 selects the two-sample block and bit 0 the single-sample block,
        @ so a leftover count of exactly one is handled as well.
        tst     r6, #2
        beq     LAST_SAMPLE

        @ Process two more samples:
        vld1.32 d0, [r3]!               @ *ptr0
        vmull.s32 q11, d0, d28          @ tmp32a = input0 * (*ptr0)
        vmull.s32 q13, d0, d29          @ input1 * (*ptr0)
        vld1.32 d18, [r12]              @ *ptr2
        vrshrn.i64 d4, q11, #15
        vadd.i32 d7, d4, d18            @ tmp32b = *ptr2 + tmp32a
        vmull.s32 q11, d7, d30          @ input2 * tmp32b
        vrshrn.i64 d16, q11, #16
        vmull.s32 q11, d16, d28         @ input0 * (*ptr2)
        vst1.32 d16, [r12]!             @ *ptr2
        vrshrn.i64 d0, q13, #15
        vrshrn.i64 d19, q11, #15
        vadd.i32 d19, d0, d19
        vst1.32 d19, [r4]!              @ *ptr1

        @ If there's still one more sample, process it here.
LAST_SAMPLE:
        tst     r6, #1
        beq     END

        @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
        ldr     r7, [r3]                @ *ptr0
        ldr     r8, [r12]               @ *ptr2
        smulwb  r5, r7, r0              @ tmp32a = *ptr0 * input0 >> 16
        add     r8, r8, r5, lsl #1      @ tmp32b = *ptr2 + (tmp32a << 1)
        smull   r5, r6, r8, r2          @ tmp32b * input2, in 64 bits
        lsl     r6, #16
        add     r6, r5, lsr #16         @ Only take the middle 32 bits
        str     r6, [r12]               @ Output (*ptr2, as 32 bits)

        @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
        smulwb  r5, r7, r1              @ tmp32a = *ptr0 * input1 >> 16
        smulwb  r6, r6, r0              @ tmp32b = *ptr2 * input0 >> 16
        lsl     r5, r5, #1
        add     r5, r6, lsl #1
        str     r5, [r4]                @ Output (*ptr1)

END:
        pop     {r4-r8}
        bx      lr

View File

@@ -53,15 +53,6 @@ int32_t WebRtcIsacfix_CalculateResidualEnergyC(int lpc_order,
int32_t* corr_coeffs, int32_t* corr_coeffs,
int* q_val_residual_energy); int* q_val_residual_energy);
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
int32_t q_val_corr,
int q_val_polynomial,
int16_t* a_polynomial,
int32_t* corr_coeffs,
int* q_val_residual_energy);
#endif
#if defined(MIPS_DSP_R2_LE) #if defined(MIPS_DSP_R2_LE)
int32_t WebRtcIsacfix_CalculateResidualEnergyMIPS(int lpc_order, int32_t WebRtcIsacfix_CalculateResidualEnergyMIPS(int lpc_order,
int32_t q_val_corr, int32_t q_val_corr,

View File

@@ -1,173 +0,0 @@
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Contains a function for WebRtcIsacfix_CalculateResidualEnergyNeon() in
@ iSAC codec, optimized for ARM Neon platform. Reference code in
@ lpc_masking_model.c.
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
.align 2
@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
@ int32_t q_val_corr,
@ int q_val_polynomial,
@ int16_t* a_polynomial,
@ int32_t* corr_coeffs,
@ int* q_val_residual_energy);
@ Accumulates corr_coeffs[i] * a_polynomial[j] * a_polynomial[j - i] over
@ 0 <= i <= j <= lpc_order in 64 bits (terms with i != 0 are doubled), then
@ normalizes the sum to 32 bits.
@ In:    r0 = lpc_order, r1 = q_val_corr, r2 = q_val_polynomial,
@        r3 = a_polynomial, stack = corr_coeffs, q_val_residual_energy.
@ Out:   r0 = residual energy; *q_val_residual_energy receives its Q value.
@ Roles: q10 = constant 1, q11 = shift_internal, q13 = sum64, q15 = sum64_tmp,
@        d25 = corr_coeffs[i], r6 = i, r1 = j.
@ Stack locals: [r13, #4] = &a_polynomial[lpc_order], [r13, #8] = q_val_corr,
@        [r13, #12] = q_val_polynomial.
@ NOTE(review): every "vqadd ... / bvc" pair below is meant to detect a
@ saturated 64-bit add, but NEON saturating instructions record saturation in
@ FPSCR.QC, not in the APSR V flag that bvc tests. V here comes from the last
@ preceding cmp, so the rescale-on-overflow paths look unreachable -- confirm
@ (the removing commit refers to Issue 4224).
DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
push {r4-r11}
sub r13, r13, #16
str r1, [r13, #8]
str r2, [r13, #12]
mov r4, #1
vmov.s64 q11, #0 @ Initialize shift_internal.
vmov.s64 q13, #0 @ Initialize sum64.
vmov.s64 q10, #0
vmov.u8 d20[0], r4 @ Set q10 to 1.
cmp r0, #0
blt POST_LOOP_I
add r9, r3, r0, asl #1 @ &a_polynomial[lpc_order]
mov r6, #0 @ Loop counter i.
ldr r11, [r13, #48] @ corr_coeffs (32 bytes of saved registers + 16 bytes of locals below it)
sub r10, r0, #1
mov r7, r3 @ &a_polynomial[0]
str r9, [r13, #4]
LOOP_I:
ldr r2, [r11], #4 @ corr_coeffs[i]
vmov.s64 q15, #0 @ Initialize the sum64_tmp.
vdup.s32 d25, r2
cmp r0, r6 @ Compare lpc_order to i.
movle r2, r6
ble POST_LOOP_J
mov r1, r6 @ j = i;
mov r12, r7 @ &a_polynomial[i]
mov r4, r3 @ &a_polynomial[j - i]
LOOP_J:
@ Each pass handles two consecutive j values: the 32-bit loads fetch two
@ adjacent 16-bit coefficients at once.
ldr r8, [r12], #4
ldr r5, [r4], #4
vmov.u32 d0[0], r8
vmov.u32 d1[0], r5
vmull.s16 q0, d0, d1
vmull.s32 q0, d0, d25
cmp r6, #0 @ i == 0?
vshl.s64 q0, q11
beq SUM1
vshl.s64 q0, #1
SUM1:
vqadd.s64 q14, q0, q15 @ Sum and test overflow.
add r1, r1, #2
bvc MOV1 @ Skip the shift if there's no overflow.
vshr.s64 q0, #1
vshr.s64 q15, #1
vadd.s64 q14, q0, q15
vsub.s64 q11, q10
MOV1:
cmp r0, r1 @ Compare lpc_order to j.
vmov.s64 q15, q14
bgt LOOP_J
@ r2 = value of j after the pairwise loop: i + 2 + ((lpc_order - 1 - i) & ~1).
bic r1, r10, #1
add r2, r6, #2
add r2, r1, r2
POST_LOOP_J:
vqadd.s64 q0, q13, q15 @ Sum and test overflow.
bvc MOV2 @ Skip the shift if there's no overflow.
vshr.s64 q13, #1
vshr.s64 q15, #1
vadd.s64 q0, q13, q15
vsub.s64 q11, q10
MOV2:
vmov.s64 q13, q0 @ update sum64.
cmp r2, r0
bne CHECK_LOOP_CONDITION
@ Last sample in the inner loop.
ldr r4, [r13, #4] @ &a_polynomial[lpc_order]
ldrsh r8, [r4]
ldrsh r12, [r9] @ a_polynomial[lpc_order - i]; r9 steps back one element per i
mul r8, r8, r12
vmov.s32 d0[0], r8
vmull.s32 q0, d0, d25
cmp r6, #0 @ i == 0?
vshl.s64 q0, q11
beq SUM2
vshl.s64 q0, #1
SUM2:
vqadd.s64 d1, d0, d26 @ Sum and test overflow.
bvc MOV3 @ Skip the shift if there's no overflow.
vshr.s64 q13, #1
vshr.s64 d0, #1
vadd.s64 d1, d0, d26
vsub.s64 q11, q10
MOV3:
vmov.s64 d26, d1 @ update sum64.
CHECK_LOOP_CONDITION:
add r6, r6, #1
sub r9, r9, #2
cmp r0, r6 @ Compare i to lpc_order.
sub r10, r10, #1
add r7, r7, #2
bge LOOP_I
POST_LOOP_I:
mov r3, #0
@ Fold the two 64-bit halves of sum64 into one value.
vqadd.s64 d0, d26, d27 @ Sum and test overflow.
bvc GET_SHIFT_NORM @ Skip the shift if there's no overflow.
vshr.s64 q13, #1
vadd.s64 d0, d26, d27
vsub.s64 q11, q10
GET_SHIFT_NORM:
vcls.s32 d1, d0 @ Count leading extra sign bits.
vmov.32 r2, d1[1] @ Store # of sign bits of only the 32 MSBs.
vmovl.s32 q1, d1
vshl.s64 d0, d3 @ d3 contains # of sign bits of the 32 MSBs.
vcls.s32 d1, d0 @ Count again the leading extra sign bits.
vmov.s32 r1, d1[1] @ Store # of sign bits of only the 32 MSBs.
vmovl.s32 q1, d1
vshl.s64 d0, d3 @ d3 contains # of sign bits of the 32 MSBs.
vmov.s32 r0, d0[1] @ residual_energy
vmov.s32 r3, d22[0] @ shift_internal
@ Calculate the value for q_val_residual_energy.
ldr r4, [r13, #8] @ q_val_corr
ldr r5, [r13, #12] @ q_val_polynomial
sub r12, r4, #32
add r12, r12, r5, asl #1
add r1, r12, r1 @ add 1st part of shift_internal.
add r12, r1, r2 @ add 2nd part of shift_internal.
ldr r2, [r13, #52] @ q_val_residual_energy (output pointer)
add r3, r12, r3 @ value for q_val_residual_energy.
str r3, [r2, #0]
add r13, r13, #16
pop {r4-r11}
bx r14

View File

@@ -58,11 +58,4 @@ class LpcMaskingModelTest : public testing::Test {
TEST_F(LpcMaskingModelTest, CalculateResidualEnergyTest) { TEST_F(LpcMaskingModelTest, CalculateResidualEnergyTest) {
CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyC); CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyC);
#ifdef WEBRTC_DETECT_ARM_NEON
if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyNeon);
}
#elif defined(WEBRTC_ARCH_ARM_NEON)
CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyNeon);
#endif
} }

View File

@@ -1,645 +0,0 @@
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Reference code in transform.c. Not bit-exact with the C code because
@ rounding is done differently in the C code and in the ARM instructions,
@ but the quality of the assembly code output is not worse.
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcIsacfix_Spec2TimeNeon
GLOBAL_FUNCTION WebRtcIsacfix_Time2SpecNeon
GLOBAL_LABEL WebRtcIsacfix_kSinTab1
GLOBAL_LABEL WebRtcIsacfix_kCosTab1
GLOBAL_LABEL WebRtcIsacfix_kSinTab2
@ void WebRtcIsacfix_Time2SpecNeon(int16_t* inre1Q9,
@ int16_t* inre2Q9,
@ int16_t* outreQ7,
@ int16_t* outimQ7);
@ Time-to-spectrum transform, in three stages around a call to
@ WebRtcIsacfix_FftRadix16Fastest:
@   1. multiply the inputs by the cos/sin tables, scale, and track the
@      largest magnitude (results kept in two 32-bit stack buffers);
@   2. shift by sh (derived from that maximum) and narrow back to 16 bits
@      into inre1Q9[] / inre2Q9[], which are then passed to the FFT;
@   3. combine mirrored FFT outputs with the second sin/cos table and write
@      outreQ7[] / outimQ7[] from both ends towards the middle.
@ In:    r0 = inre1Q9, r1 = inre2Q9, r2 = outreQ7, r3 = outimQ7.
@ Stack: [sp] .. [sp, #12] hold the four arguments; tmpreQ16 starts at
@        sp + 16 and tmpimQ16 at sp + 16 + FRAMESAMPLES * 2.
@ NOTE(review): .align 2 sits after the entry label here, whereas the other
@ functions on this page place it before DEFINE_FUNCTION -- confirm that no
@ padding can end up between the label and the first instruction.
DEFINE_FUNCTION WebRtcIsacfix_Time2SpecNeon
.align 2
push {r3-r11,lr} @ need to push r4-r11, but push r3 too to keep
@ stack 8-byte aligned
sub sp, sp, #(16 + FRAMESAMPLES * 4)
str r0, [sp] @ inre1Q9
str r1, [sp, #4] @ inre2Q9
str r2, [sp, #8] @ outreQ7
str r3, [sp, #12] @ outimQ7
@ NOTE(review): r8, r11 and r12 set by the next three instructions appear to
@ be overwritten before they are read (r8 below, r11/r12 after the FFT
@ call) -- possibly leftover code; confirm.
mov r8, #(FRAMESAMPLES - 16)
add r12, r0, r8 @ &inreQ7[FRAMESAMPLES / 2 - 4]
add r11, r1, r8 @ &inimQ7[FRAMESAMPLES / 2 - 4]
add r4, sp, #16 @ tmpreQ16;
add r5, sp, #(16 + FRAMESAMPLES * 2) @ tmpimQ16;
adr r9, WebRtcIsacfix_kCosTab1
#if defined(__APPLE__)
mov r6, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
#else
mov r6, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
#endif
add r10, r9, r6 @ WebRtcIsacfix_kSinTab1
vmov.u32 q14, #0 @ Initialize the maximum values for tmpInIm.
vmov.u32 q15, #0 @ Initialize the maximum values for tmpInRe.
movw r6, #16921 @ 0.5 / sqrt(240) in Q19
lsl r6, #5 @ Together with vqdmulh, net effect is ">> 26".
mov r8, #(FRAMESAMPLES / 2) @ loop counter
vdup.s32 q11, r6
Time2Spec_TransformAndFindMax:
@ Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
subs r8, #8
vld1.16 {q0}, [r9, :64]! @ WebRtcIsacfix_kCosTab1[]
vld1.16 {q2}, [r0]! @ inre1Q9[]
vmull.s16 q8, d0, d4 @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k]
vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab1[]
vmull.s16 q9, d1, d5 @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k]
vld1.16 {q3}, [r1]! @ inre2Q9[]
vmlal.s16 q8, d2, d6 @ WebRtcIsacfix_kSinTab1[k] * inre2Q9[k]
vmlal.s16 q9, d3, d7 @ WebRtcIsacfix_kSinTab1[k] * inre2Q9[k]
vmull.s16 q12, d0, d6 @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k]
vmull.s16 q13, d1, d7 @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k]
vmlsl.s16 q12, d2, d4 @ WebRtcIsacfix_kSinTab1[k] * inre1Q9[k]
vmlsl.s16 q13, d3, d5 @ WebRtcIsacfix_kSinTab1[k] * inre1Q9[k]
vqdmulh.s32 q0, q8, q11 @ xrQ16 * factQ19
vqdmulh.s32 q1, q9, q11 @ xrQ16 * factQ19
vqdmulh.s32 q2, q12, q11 @ xrQ16 * factQ19
vqdmulh.s32 q3, q13, q11 @ xrQ16 * factQ19
@ Find the absolute maximum in the vectors and store them.
vabs.s32 q8, q0
vabs.s32 q9, q1
vabs.s32 q12, q2
vst1.32 {q0, q1}, [r4]! @ tmpreQ16[k]
vabs.s32 q13, q3
vmax.u32 q14, q8 @ Use u32 so we don't lose the value 0x80000000.
vmax.u32 q15, q12
vst1.32 {q2, q3}, [r5]! @ tmpimQ16[k]
vmax.u32 q15, q13
vmax.u32 q14, q9 @ Maximum for outre1Q16[].
bgt Time2Spec_TransformAndFindMax
@ Find the maximum value in the Neon registers
vmax.u32 d28, d29
vmax.u32 d30, d31
vpmax.u32 d28, d28, d28 @ Both 32 bits words hold the same value tmpInIm.
vpmax.u32 d30, d30, d30 @ Both 32 bits words hold the same value tmpInRe.
vmax.s32 d30, d28, d30 @ if (yrQ16 > xrQ16) {xrQ16 = yrQ16};
ldr r4, [sp] @ inre1Q9
vcls.s32 d31, d30 @ sh = WebRtcSpl_NormW32(tmpInRe);
ldr r5, [sp, #4] @ inre2Q9
vmov.i32 d30, #24
add r6, sp, #16 @ tmpreQ16;
vsub.s32 d31, d31, d30 @ sh = sh - 24;
add r7, sp, #(16 + FRAMESAMPLES * 2) @ tmpimQ16;
vdup.s32 q8, d31[0] @ sh
mov r8, #(FRAMESAMPLES / 2) @ loop counter
Time2Spec_PreFftShift:
@ Shift 16 real and 16 imaginary 32-bit values by sh per pass and narrow
@ them to 16 bits.
subs r8, #16
vld1.32 {q0, q1}, [r6]! @ tmpreQ16[]
vrshl.s32 q0, q0, q8
vld1.32 {q2, q3}, [r6]! @ tmpreQ16[]
vrshl.s32 q1, q1, q8
vld1.32 {q10, q11}, [r7]! @ tmpimQ16[]
vrshl.s32 q2, q2, q8
vld1.32 {q12, q13}, [r7]! @ tmpimQ16[]
vrshl.s32 q3, q3, q8
vrshl.s32 q10, q10, q8
vrshl.s32 q11, q11, q8
vrshl.s32 q12, q12, q8
vrshl.s32 q13, q13, q8
vmovn.s32 d0, q0
vmovn.s32 d1, q1
vmovn.s32 d2, q2
vmovn.s32 d3, q3
vmovn.s32 d4, q10
vmovn.s32 d5, q11
vmovn.s32 d6, q12
vmovn.s32 d7, q13
vst1.16 {q0, q1}, [r4]! @ inre1Q9[]
vst1.16 {q2, q3}, [r5]! @ inre2Q9[]
bgt Time2Spec_PreFftShift
vmov.s32 r10, d16[0] @ Keep sh in r10 across the call below; it is restored into q8 afterwards.
ldr r0, [sp] @ inre1Q9
ldr r1, [sp, #4] @ inre2Q9
mov r2, #-1
CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest
vdup.s32 q8, r10 @ sh
mov r8, #(FRAMESAMPLES - 8)
ldr r2, [sp, #8] @ outreQ7
ldr r3, [sp, #12] @ outimQ7
add r11, r2, r8 @ &outRe1Q16[FRAMESAMPLES / 2 - 4]
add r12, r3, r8 @ &outim2Q16[FRAMESAMPLES / 2 - 4]
ldr r6, [sp] @ inre1Q9
ldr r7, [sp, #4] @ inre2Q9
add r4, r6, r8 @ &inre1Q9[FRAMESAMPLES / 2 - 4]
add r5, r7, r8 @ &inre2Q9[FRAMESAMPLES / 2 - 4]
adr r10, WebRtcIsacfix_kSinTab2
add r9, r10, #(120*2 - 8) @ &WebRtcIsacfix_kSinTab2[119 - 4]
vneg.s32 q15, q8 @ -sh
vmov.i32 q0, #23
vsub.s32 q15, q15, q0 @ -sh - 23
mov r8, #(FRAMESAMPLES / 4) @ loop counter
@ Pre-load variables.
vld1.16 {d2}, [r4] @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
vld1.16 {d3}, [r5] @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
vld1.16 {d0}, [r6]! @ inre1Q9
vld1.16 {d1}, [r7]! @ inre2Q9
Time2Spec_PostFftTransform:
@ By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
@ ">> 14" and then ">> 9" as in the C code.
@ kCosTab2[] values are taken from the sine table read backwards (r9) and
@ negated.
vld1.16 {d6}, [r9, :64] @ kCosTab2[]
vneg.s16 d6, d6
vld1.16 {d7}, [r10, :64]! @ WebRtcIsacfix_kSinTab2[]
vrev64.16 q1, q1 @ Reverse samples in 2nd half of xrQ16[].
vqadd.s16 d4, d0, d2 @ xrQ16
vqsub.s16 d5, d1, d3 @ xiQ16
vrev64.16 d6, d6
sub r9, #8 @ Update pointers for kCosTab2[].
sub r4, #8 @ Update pointers for inre1Q9[].
sub r5, #8 @ Update pointers for inre2Q9[].
subs r8, #4 @ Update loop counter.
vqadd.s16 d1, d1, d3 @ yrQ16
vqsub.s16 d0, d2, d0 @ yiQ16
vmull.s16 q12, d6, d4 @ kCosTab2[k] * xrQ16
vmlsl.s16 q12, d7, d5 @ WebRtcIsacfix_kSinTab2[k] * xiQ16
vmull.s16 q13, d7, d4 @ WebRtcIsacfix_kSinTab2[k] * xrQ16
vmlal.s16 q13, d6, d5 @ kCosTab2[k] * xiQ16
vmull.s16 q9, d7, d1 @ WebRtcIsacfix_kSinTab2[k] * yrQ16
vmlal.s16 q9, d6, d0 @ kCosTab2[k] * yiQ16
vmull.s16 q10, d7, d0 @ WebRtcIsacfix_kSinTab2[k] * yiQ16
vmlsl.s16 q10, d6, d1 @ kCosTab2[k] * yrQ16
vshl.s32 q12, q12, q15
vshl.s32 q13, q13, q15
vshl.s32 q9, q9, q15
vshl.s32 q10, q10, q15
vneg.s32 q8, q9
vld1.16 {d0}, [r6]! @ inre1Q9
vmovn.s32 d24, q12
vld1.16 {d1}, [r7]! @ inre2Q9
vmovn.s32 d25, q13
vld1.16 {d2}, [r4] @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
vmovn.s32 d5, q10
vld1.16 {d3}, [r5] @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
vmovn.s32 d4, q8
vst1.16 {d24}, [r2]! @ outreQ7[k]
vrev64.16 q2, q2 @ Reverse the order of the samples.
vst1.16 {d25}, [r3]! @ outimQ7[k]
vst1.16 {d4}, [r11] @ outreQ7[FRAMESAMPLES / 2 - 1 - k]
vst1.16 {d5}, [r12] @ outimQ7[FRAMESAMPLES / 2 - 1 - k]
sub r11, #8 @ Update pointers for outreQ7[].
sub r12, #8 @ Update pointers for outimQ7[].
bgt Time2Spec_PostFftTransform
add sp, sp, #(16 + FRAMESAMPLES * 4)
pop {r3-r11,pc}
@ The tables below are read with ":64"-qualified vld1 loads, so each one must
@ start on at least an 8-byte boundary. In GNU as for ARM the .align operand
@ is a power of two, i.e. this asks for 2^8 = 256-byte alignment
@ (NOTE(review): confirm for the assembler in use).
.align 8
@ Cosine table 1 in Q14
@ 240 signed 16-bit entries (30 rows of 8); 16384 represents 1.0.
@ WebRtcIsacfix_Spec2TimeNeon does not take this label's address directly: it
@ computes it as &WebRtcIsacfix_kSinTab1 - (kSinTab1 - kCosTab1), so the
@ distance between the two labels has to stay a valid "mov" immediate.
WebRtcIsacfix_kCosTab1:
_WebRtcIsacfix_kCosTab1: @ Label for iOS
.short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
.short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
.short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
.short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
.short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
.short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
.short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
.short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
.short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
.short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
.short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
.short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
.short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
.short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
.short 1713, 1499, 1285, 1072, 857, 643, 429, 214
.short 0, -214, -429, -643, -857, -1072, -1285, -1499
.short -1713, -1926, -2139, -2351, -2563, -2775, -2986, -3196
.short -3406, -3616, -3825, -4033, -4240, -4447, -4653, -4859
.short -5063, -5266, -5469, -5671, -5872, -6071, -6270, -6467
.short -6664, -6859, -7053, -7246, -7438, -7629, -7818, -8006
.short -8192, -8377, -8561, -8743, -8923, -9102, -9280, -9456
.short -9630, -9803, -9974, -10143, -10311, -10477, -10641, -10803
.short -10963, -11121, -11278, -11433, -11585, -11736, -11885, -12031
.short -12176, -12318, -12458, -12597, -12733, -12867, -12998, -13128
.short -13255, -13380, -13502, -13623, -13741, -13856, -13970, -14081
.short -14189, -14295, -14399, -14500, -14598, -14694, -14788, -14879
.short -14968, -15053, -15137, -15218, -15296, -15371, -15444, -15515
.short -15582, -15647, -15709, -15769, -15826, -15880, -15931, -15980
.short -16026, -16069, -16110, -16147, -16182, -16214, -16244, -16270
.short -16294, -16315, -16333, -16349, -16362, -16371, -16378, -16383
@ Keep the table start aligned: it is read with ":64"-qualified vld1 loads,
@ both forwards from the label and backwards from its last 8-byte groups.
.align 8
@ Sine table 2 in Q14
@ 120 signed 16-bit entries (15 rows of 8) with alternating sign.
@ The transform routines address this table relative to the PC with "adr",
@ so it has to stay within adr range of the code that uses it.
WebRtcIsacfix_kSinTab2:
_WebRtcIsacfix_kSinTab2: @ Label for iOS
.short 16384, -16381, 16375, -16367, 16356, -16342, 16325, -16305
.short 16283, -16257, 16229, -16199, 16165, -16129, 16090, -16048
.short 16003, -15956, 15906, -15853, 15798, -15739, 15679, -15615
.short 15549, -15480, 15408, -15334, 15257, -15178, 15095, -15011
.short 14924, -14834, 14741, -14647, 14549, -14449, 14347, -14242
.short 14135, -14025, 13913, -13799, 13682, -13563, 13441, -13318
.short 13192, -13063, 12933, -12800, 12665, -12528, 12389, -12247
.short 12104, -11958, 11810, -11661, 11509, -11356, 11200, -11042
.short 10883, -10722, 10559, -10394, 10227, -10059, 9889, -9717
.short 9543, -9368, 9191, -9013, 8833, -8652, 8469, -8285
.short 8099, -7912, 7723, -7534, 7342, -7150, 6957, -6762
.short 6566, -6369, 6171, -5971, 5771, -5570, 5368, -5165
.short 4961, -4756, 4550, -4344, 4137, -3929, 3720, -3511
.short 3301, -3091, 2880, -2669, 2457, -2245, 2032, -1819
.short 1606, -1392, 1179, -965, 750, -536, 322, -107
@ Table kCosTab2 was removed since its data is redundant with kSinTab2.
@ The code rebuilds it on the fly: it loads kSinTab2 from the end towards the
@ start, negates the values and reverses the lane order, which amounts to
@ kCosTab2[k] = -WebRtcIsacfix_kSinTab2[119 - k].
@ Keep the table start aligned: it is read with ":64"-qualified vld1 loads.
.align 8
@ Sine table 1 in Q14
@ 240 signed 16-bit entries (30 rows of 8); 16384 represents 1.0. The values
@ rise from 0 to 16384 over the first half and fall back to 214 over the
@ second half.
@ WebRtcIsacfix_Spec2TimeNeon takes this label's address with "adr" and then
@ subtracts (kSinTab1 - kCosTab1) to reach WebRtcIsacfix_kCosTab1, so this
@ table must stay within adr range of that function and at a fixed distance
@ from kCosTab1.
WebRtcIsacfix_kSinTab1:
_WebRtcIsacfix_kSinTab1: @ Label for iOS
.short 0, 214, 429, 643, 857, 1072, 1285, 1499
.short 1713, 1926, 2139, 2351, 2563, 2775, 2986, 3196
.short 3406, 3616, 3825, 4033, 4240, 4447, 4653, 4859
.short 5063, 5266, 5469, 5671, 5872, 6071, 6270, 6467
.short 6664, 6859, 7053, 7246, 7438, 7629, 7818, 8006
.short 8192, 8377, 8561, 8743, 8923, 9102, 9280, 9456
.short 9630, 9803, 9974, 10143, 10311, 10477, 10641, 10803
.short 10963, 11121, 11278, 11433, 11585, 11736, 11885, 12031
.short 12176, 12318, 12458, 12597, 12733, 12867, 12998, 13128
.short 13255, 13380, 13502, 13623, 13741, 13856, 13970, 14081
.short 14189, 14295, 14399, 14500, 14598, 14694, 14788, 14879
.short 14968, 15053, 15137, 15218, 15296, 15371, 15444, 15515
.short 15582, 15647, 15709, 15769, 15826, 15880, 15931, 15980
.short 16026, 16069, 16110, 16147, 16182, 16214, 16244, 16270
.short 16294, 16315, 16333, 16349, 16362, 16371, 16378, 16383
.short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
.short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
.short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
.short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
.short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
.short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
.short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
.short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
.short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
.short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
.short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
.short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
.short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
.short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
.short 1713, 1499, 1285, 1072, 857, 643, 429, 214
@ void WebRtcIsacfix_Spec2TimeNeon(int16_t *inreQ7,
@ int16_t *inimQ7,
@ int32_t *outre1Q16,
@ int32_t *outre2Q16);
@
@ In: r0 = inreQ7, r1 = inimQ7, r2 = outre1Q16, r3 = outre2Q16.
@ The four pointers are spilled to [sp], [sp, #4], [sp, #8] and [sp, #12] and
@ reloaded at the start of every stage. inreQ7[] / inimQ7[] are overwritten:
@ they double as the 16-bit in/out buffers of the FFT.
@
@ Stages:
@ 1. TransformAndFindMax: rotate the input by kCosTab2/kSinTab2, fill
@ outre1Q16[] / outre2Q16[] from both ends towards the middle and track
@ the largest magnitude written to each buffer.
@ 2. PreFftShift: derive sh from that maximum, scale both buffers by sh and
@ narrow them to 16 bits into inreQ7[] / inimQ7[].
@ 3. Call WebRtcIsacfix_FftRadix16Fastest(inreQ7, inimQ7, 1).
@ 4. PostFftShiftDivide: widen back to 32 bits, undo the sh scaling and
@ multiply by 273 with a net ">> 16".
@ 5. DemodulateAndSeparate: rotate by kCosTab1/kSinTab1 and apply the final
@ scale factor (31727, net ">> 25" including the rounding narrow).
@
@ q4-q7 are saved with vpush/vpop around stage 1; r4-r11 are saved by the
@ prologue push.
DEFINE_FUNCTION WebRtcIsacfix_Spec2TimeNeon
.align 2
push {r3-r11,lr} @ need to push r4-r11, but push r3 too to keep
@ stack 8-byte aligned
sub sp, sp, #16
str r0, [sp] @ inreQ7
str r1, [sp, #4] @ inimQ7
str r2, [sp, #8] @ outre1Q16
str r3, [sp, #12] @ outre2Q16
mov r8, #(FRAMESAMPLES - 16) @ Byte offset of int16 element FRAMESAMPLES / 2 - 8.
add r12, r0, r8 @ &inreQ7[FRAMESAMPLES / 2 - 8]
add r11, r1, r8 @ &inimQ7[FRAMESAMPLES / 2 - 8]
add r4, r2, r8, lsl #1 @ &outRe1Q16[FRAMESAMPLES / 2 - 8]
add r6, r3, r8, lsl #1 @ &outRe2Q16[FRAMESAMPLES / 2 - 8]
mov r8, #(FRAMESAMPLES / 2) @ loop counter
adr r10, WebRtcIsacfix_kSinTab2
add r9, r10, #(120*2 - 16) @ &WebRtcIsacfix_kSinTab2[120 - 8], i.e. the last 8 entries.
vpush {q4-q7}
mov r5, #-32 @ Post-index step for the backward 32-bit stores (8 words).
mov r7, #-16 @ Post-index step for the backward 16-bit loads (8 halfwords).
vmov.u32 q6, #0 @ Running maximum of |outre1Q16[]|.
vmov.u32 q7, #0 @ Running maximum of |outre2Q16[]|.
TransformAndFindMax:
@ Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
@ Bit-exact.
@ Each pass handles 8 values of k from the front of the input and the
@ 8 mirrored values from the back; the counter drops by 16 per pass.
subs r8, #16
vld1.16 {q0}, [r9, :64] @ kSinTab2[] read from the end; becomes kCosTab2[] below.
sub r9, #16
vld1.16 {q2}, [r0]! @ inreQ7[]
vneg.s16 q0, q0
vld1.16 {q3}, [r1]! @ inimQ7[]
vrev64.16 d0, d0
vrev64.16 d1, d1
vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab2[]
vswp d0, d1 @ With the two vrev64 above: full 8-lane reversal -> kCosTab2[k].
vmull.s16 q8, d2, d6 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
vmull.s16 q9, d3, d7 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
vmlal.s16 q8, d0, d4 @ kCosTab2[k] * inreQ7[k]
vmlal.s16 q9, d1, d5 @ kCosTab2[k] * inreQ7[k]
vmull.s16 q12, d0, d6 @ kCosTab2[k] * inimQ7[k]
vmull.s16 q13, d1, d7 @ kCosTab2[k] * inimQ7[k]
vmlsl.s16 q12, d2, d4 @ -= WebRtcIsacfix_kSinTab2[k] * inreQ7[k]
vmlsl.s16 q13, d3, d5 @ -= WebRtcIsacfix_kSinTab2[k] * inreQ7[k]
vld1.16 {q2}, [r11], r7 @ inimQ7[FRAMESAMPLES / 2 - 8 - i], walking backwards
vld1.16 {q3}, [r12], r7 @ inreQ7[FRAMESAMPLES / 2 - 8 - i], walking backwards
vrev64.16 q2, q2 @ Reverse the order of the samples
vrev64.16 q3, q3 @ Reverse the order of the samples
@ After vrev64 only the lanes inside each d register are reversed, so the
@ high d register (d5 / d7) pairs with table lanes k..k+3 and the low one
@ (d4 / d6) with k+4..k+7.
vmull.s16 q14, d2, d5 @ WebRtcIsacfix_kSinTab2[k] * mirrored inimQ7[]
vmull.s16 q15, d3, d4 @ WebRtcIsacfix_kSinTab2[k] * mirrored inimQ7[]
vmlsl.s16 q14, d0, d7 @ q14 -= kCosTab2[k] * mirrored inreQ7[]
vmlsl.s16 q15, d1, d6 @ q15 -= kCosTab2[k] * mirrored inreQ7[]
vmull.s16 q10, d0, d5 @ kCosTab2[k] * mirrored inimQ7[]
vmull.s16 q11, d1, d4 @ kCosTab2[k] * mirrored inimQ7[]
vmlal.s16 q10, d2, d7 @ q10 += WebRtcIsacfix_kSinTab2[k] * mirrored inreQ7[]
vmlal.s16 q11, d3, d6 @ q11 += WebRtcIsacfix_kSinTab2[k] * mirrored inreQ7[]
vshr.s32 q8, q8, #5 @ xrQ16
vshr.s32 q9, q9, #5 @ xrQ16
vshr.s32 q12, q12, #5 @ xiQ16
vshr.s32 q13, q13, #5 @ xiQ16
vshr.s32 q14, q14, #5 @ yiQ16
vshr.s32 q15, q15, #5 @ yiQ16
vneg.s32 q10, q10 @ Negate before the shift below, so yrQ16 = (-sum) >> 5.
vneg.s32 q11, q11
@ xrQ16 - yiQ16
vsub.s32 q0, q8, q14
vsub.s32 q1, q9, q15
vshr.s32 q10, q10, #5 @ yrQ16
vshr.s32 q11, q11, #5 @ yrQ16
@ xrQ16 + yiQ16
@ (results land in swapped registers: lanes k..k+3 in q3, k+4..k+7 in q2)
vadd.s32 q3, q8, q14
vadd.s32 q2, q9, q15
@ yrQ16 + xiQ16
vadd.s32 q4, q10, q12
vadd.s32 q5, q11, q13
@ yrQ16 - xiQ16
@ (again swapped: lanes k+4..k+7 in q8, k..k+3 in q9)
vsub.s32 q8, q11, q13
vsub.s32 q9, q10, q12
@ Reverse the order of the samples
@ (vrev64.32 swaps the words inside each d register, vswp swaps the two d
@ registers; together with the register swap above this reverses all 8.)
vrev64.32 q2, q2
vrev64.32 q3, q3
vrev64.32 q8, q8
vrev64.32 q9, q9
vswp d4, d5
vswp d6, d7
vst1.32 {q0, q1}, [r2]! @ outre1Q16[k]
vswp d16, d17
vswp d18, d19
vst1.32 {q2, q3}, [r4], r5 @ outre1Q16[FRAMESAMPLES / 2 - 1 - k]
@ Find the absolute maximum in the vectors and store them in q6 and q7.
vabs.s32 q10, q0
vabs.s32 q14, q4
vabs.s32 q11, q1
vabs.s32 q15, q5
vabs.s32 q12, q2
vmax.u32 q6, q10 @ Use u32 so we don't lose the value 0x80000000.
vmax.u32 q7, q14 @ Maximum for outre2Q16[].
vabs.s32 q0, q8
vmax.u32 q6, q11 @ Maximum for outre1Q16[].
vmax.u32 q7, q15
vabs.s32 q13, q3
vmax.u32 q6, q12
vmax.u32 q7, q0
vabs.s32 q1, q9
vst1.32 {q4, q5}, [r3]! @ outre2Q16[k]
vst1.32 {q8, q9}, [r6], r5 @ outre2Q16[FRAMESAMPLES / 2 - 1 - k]
vmax.u32 q6, q13
vmax.u32 q7, q1
bgt TransformAndFindMax
@ Set up the table pointers for the last stage now; r10 and r11 are not
@ written again before DemodulateAndSeparate.
adr r10, WebRtcIsacfix_kSinTab1
#if defined(__APPLE__)
mov r2, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
#else
mov r2, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
#endif
sub r11, r10, r2 @ WebRtcIsacfix_kCosTab1
@ Find the maximum value in the Neon registers
vmax.u32 d12, d13
vmax.u32 d14, d15
vpmax.u32 d12, d12, d12 @ Both 32 bits words hold the same value: max |outre1Q16[]|.
vpmax.u32 d14, d14, d14 @ Both 32 bits words hold the same value: max |outre2Q16[]|.
@ NOTE(review): the maxima were accumulated as unsigned but are compared as
@ signed here, so 0x80000000 would rank lowest - confirm it cannot occur.
vmax.s32 d0, d12, d14 @ Larger of the two maxima (tmpInRe in the C code).
vpop {q4-q7}
ldr r4, [sp] @ inreQ7
vcls.s32 d1, d0 @ sh = WebRtcSpl_NormW32(tmpInRe);
ldr r5, [sp, #4] @ inimQ7
vmov.i32 d0, #24 @ sh = sh-24;
ldr r6, [sp, #8] @ outre1Q16
vsub.s32 d1, d1, d0
ldr r7, [sp, #12] @ outre2Q16
vdup.s32 q8, d1[0] @ sh
mov r8, #(FRAMESAMPLES / 2)
PreFftShift:
@ Scale 16 samples of each 32-bit buffer by sh per pass (vrshl shifts left
@ for positive sh and does a rounding right shift for negative sh), then
@ narrow to 16 bits into the FFT buffers.
subs r8, #16
vld1.32 {q0, q1}, [r6]! @ outre1Q16[]
vld1.32 {q2, q3}, [r6]! @ outre1Q16[]
vrshl.s32 q0, q0, q8
vrshl.s32 q1, q1, q8
vrshl.s32 q2, q2, q8
vrshl.s32 q3, q3, q8
vld1.32 {q10, q11}, [r7]! @ outre2Q16[]
vld1.32 {q12, q13}, [r7]! @ outre2Q16[]
vrshl.s32 q10, q10, q8
vrshl.s32 q11, q11, q8
vrshl.s32 q12, q12, q8
vrshl.s32 q13, q13, q8
vmovn.s32 d0, q0
vmovn.s32 d1, q1
vmovn.s32 d2, q2
vmovn.s32 d3, q3
vmovn.s32 d4, q10
vmovn.s32 d5, q11
vmovn.s32 d6, q12
vmovn.s32 d7, q13
vst1.16 {q0, q1}, [r4]! @ inreQ7[]
vst1.16 {q2, q3}, [r5]! @ inimQ7[]
bgt PreFftShift
vmov.s32 r8, d16[0] @ Store value of sh in a core register; q8 is rebuilt after the call.
ldr r0, [sp] @ inreQ7
ldr r1, [sp, #4] @ inimQ7
mov r2, #1 @ Third argument (presumably the FFT direction - confirm against the callee).
CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest
vdup.s32 q8, r8 @ sh
mov r9, r11 @ WebRtcIsacfix_kCosTab1
ldr r4, [sp] @ inreQ7
ldr r5, [sp, #4] @ inimQ7
ldr r6, [sp, #8] @ outre1Q16
ldr r7, [sp, #12] @ outre2Q16
mov r8, #(FRAMESAMPLES / 2)
vneg.s32 q15, q8 @ -sh
movw r0, #273
lsl r0, #15 @ Together with vqdmulh, net effect is ">> 16".
vdup.s32 q14, r0
PostFftShiftDivide:
@ Widen 16 samples of each FFT buffer per pass, shift by -sh to undo the
@ pre-FFT scaling, multiply by 273 (net ">> 16") and store as 32 bits.
subs r8, #16
vld1.16 {q0, q1}, [r4]! @ inreQ7
vmovl.s16 q10, d0
vmovl.s16 q11, d1
vld1.16 {q2, q3}, [r5]! @ inimQ7
vmovl.s16 q8, d2
vmovl.s16 q9, d3
vshl.s32 q10, q10, q15
vshl.s32 q11, q11, q15
vshl.s32 q8, q8, q15
vshl.s32 q9, q9, q15
vqdmulh.s32 q10, q10, q14
vqdmulh.s32 q11, q11, q14
vqdmulh.s32 q8, q8, q14
vqdmulh.s32 q9, q9, q14
vmovl.s16 q0, d4
vmovl.s16 q1, d5
vmovl.s16 q2, d6
vmovl.s16 q3, d7
vshl.s32 q0, q0, q15
vshl.s32 q1, q1, q15
vshl.s32 q2, q2, q15
vshl.s32 q3, q3, q15
@ WEBRTC_SPL_MUL_16_32_RSFT16(273, outre2Q16[k])
vqdmulh.s32 q0, q0, q14
vqdmulh.s32 q1, q1, q14
vst1.32 {q10, q11}, [r6]! @ outre1Q16[]
vqdmulh.s32 q2, q2, q14
vqdmulh.s32 q3, q3, q14
vst1.32 {q8, q9}, [r6]! @ outre1Q16[]
vst1.32 {q0, q1}, [r7]! @ outre2Q16[]
vst1.32 {q2, q3}, [r7]! @ outre2Q16[]
bgt PostFftShiftDivide
mov r8, #(FRAMESAMPLES / 2)
ldr r2, [sp, #8] @ outre1Q16
ldr r3, [sp, #12] @ outre2Q16
movw r0, #31727
lsl r0, #16 @ With vqdmulh and vrshrn, net effect is ">> 25".
DemodulateAndSeparate:
@ 8 samples per pass, updated in place: r2 / r3 are read without writeback
@ and advanced only by the stores at the bottom of the loop.
subs r8, #8
vld1.16 {q0}, [r9, :64]! @ WebRtcIsacfix_kCosTab1[]
vmovl.s16 q10, d0 @ WebRtcIsacfix_kCosTab1[]
vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab1[]
vmovl.s16 q11, d1 @ WebRtcIsacfix_kCosTab1[]
vld1.32 {q2, q3}, [r2] @ outre1Q16
vmovl.s16 q12, d2 @ WebRtcIsacfix_kSinTab1[]
vld1.32 {q14, q15}, [r3] @ outre2Q16
vmovl.s16 q13, d3 @ WebRtcIsacfix_kSinTab1[]
vmull.s32 q0, d20, d4 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
vmull.s32 q1, d21, d5 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
vmull.s32 q8, d22, d6 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
vmull.s32 q9, d23, d7 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
vmlsl.s32 q0, d24, d28 @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
vmlsl.s32 q1, d25, d29 @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
vmlsl.s32 q8, d26, d30 @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
vmlsl.s32 q9, d27, d31 @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
vrshrn.s64 d0, q0, #10 @ xrQ16
vrshrn.s64 d1, q1, #10 @ xrQ16
vrshrn.s64 d2, q8, #10 @ xrQ16
vrshrn.s64 d3, q9, #10 @ xrQ16
vmull.s32 q8, d20, d28 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
vmull.s32 q9, d21, d29 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
vmull.s32 q14, d22, d30 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
vmull.s32 q15, d23, d31 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
vmlal.s32 q8, d24, d4 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
vmlal.s32 q9, d25, d5 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
vmlal.s32 q14, d26, d6 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
vmlal.s32 q15, d27, d7 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
@ q11 (kCosTab1 high half) is no longer needed, so it is reused for the
@ scale factor 31727 << 16.
vdup.s32 q11, r0 @ generic -> Neon doesn't cost extra cycles.
vrshrn.s64 d24, q8, #10 @ xiQ16
vrshrn.s64 d25, q9, #10 @ xiQ16
vqdmulh.s32 q0, q0, q11
vrshrn.s64 d26, q14, #10 @ xiQ16
vrshrn.s64 d27, q15, #10 @ xiQ16
@ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xrQ16)
@ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xiQ16)
vqdmulh.s32 q1, q1, q11
vqdmulh.s32 q2, q12, q11
vqdmulh.s32 q3, q13, q11
@ NOTE(review): q0-q3 hold 32-bit lanes here, yet the stores use a .16
@ element size while the matching loads use .32 - confirm this is intended.
vst1.16 {q0, q1}, [r2]! @ outre1Q16[]
vst1.16 {q2, q3}, [r3]! @ outre2Q16[]
bgt DemodulateAndSeparate
add sp, sp, #16 @ Drop the four spilled argument slots.
pop {r3-r11,pc}

View File

@@ -16,7 +16,6 @@
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h" #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
#include "webrtc/typedefs.h" #include "webrtc/typedefs.h"
#if !(defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
/* Cosine table 1 in Q14. */ /* Cosine table 1 in Q14. */
const int16_t WebRtcIsacfix_kCosTab1[FRAMESAMPLES/2] = { const int16_t WebRtcIsacfix_kCosTab1[FRAMESAMPLES/2] = {
16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315, 16294, 16270, 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315, 16294, 16270,
@@ -90,7 +89,6 @@ const int16_t WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4] = {
4137, -3929, 3720, -3511, 3301, -3091, 2880, -2669, 2457, -2245, 4137, -3929, 3720, -3511, 3301, -3091, 2880, -2669, 2457, -2245,
2032, -1819, 1606, -1392, 1179, -965, 750, -536, 322, -107 2032, -1819, 1606, -1392, 1179, -965, 750, -536, 322, -107
}; };
#endif
#if defined(MIPS32_LE) #if defined(MIPS32_LE)
/* Cosine table 2 in Q14. Used only on MIPS platforms. */ /* Cosine table 2 in Q14. Used only on MIPS platforms. */

View File

@@ -142,11 +142,9 @@
], ],
'sources': [ 'sources': [
'fix/source/entropy_coding_neon.c', 'fix/source/entropy_coding_neon.c',
'fix/source/filterbanks_neon.S', 'fix/source/filters_neon.c',
'fix/source/filters_neon.S', 'fix/source/lattice_neon.c',
'fix/source/lattice_neon.S', 'fix/source/transform_neon.c',
'fix/source/lpc_masking_model_neon.S',
'fix/source/transform_neon.S',
], ],
'conditions': [ 'conditions': [
# Disable LTO in isac_neon target due to compiler bug # Disable LTO in isac_neon target due to compiler bug
@@ -156,27 +154,11 @@
'-ffat-lto-objects', '-ffat-lto-objects',
], ],
}], }],
['target_arch=="arm64"', {
'sources!': [
'fix/source/filterbanks_neon.S',
'fix/source/filters_neon.S',
'fix/source/lattice_neon.S',
'fix/source/lpc_masking_model_neon.S',
'fix/source/transform_neon.S',
],
'sources': [
'fix/source/filters_neon.c',
'fix/source/lattice_neon.c',
'fix/source/transform_neon.c',
],
'conditions': [
# Disable AllpassFilter2FixDec16Neon function due to a clang # Disable AllpassFilter2FixDec16Neon function due to a clang
# bug. Refer more details at: # bug. Refer more details at:
# https://code.google.com/p/webrtc/issues/detail?id=4567 # https://code.google.com/p/webrtc/issues/detail?id=4567
['clang==0', { ['target_arch!="arm64" or clang==0', {
'sources': ['fix/source/filterbanks_neon.c',], 'sources': ['fix/source/filterbanks_neon.c',],
}],
],
}] }]
], ],
}, },