Enabling common_audio building with NEON on ARM64
Passed building common_audio_neon and common_audio_unittests both on Android ARMv7 and Android ARM64. Pass common_audio_unittests tests both on Android ARMv7 and Android ARM64. BUG=4002 R=andrew@webrtc.org, jridges@masque.com, kjellander@webrtc.org Change-Id: I8e0722f356db8cca6fc8232f00ae1e898a086f5a Review URL: https://webrtc-codereview.appspot.com/40629004 Patch from Zhongwei Yao <zhongwei.yao@arm.com>. Cr-Commit-Position: refs/heads/master@{#8620} git-svn-id: http://webrtc.googlecode.com/svn/trunk@8620 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
d7a212e8b9
commit
0933d01d09
@ -124,6 +124,10 @@ source_set("common_audio") {
|
||||
}
|
||||
}
|
||||
|
||||
if (current_cpu == "arm64") {
|
||||
deps += [ ":common_audio_neon" ]
|
||||
}
|
||||
|
||||
if (current_cpu == "mipsel") {
|
||||
sources += [
|
||||
"signal_processing/include/spl_inl_mips.h",
|
||||
@ -194,30 +198,23 @@ if (current_cpu == "x86" || current_cpu == "x64") {
|
||||
}
|
||||
}
|
||||
|
||||
if (rtc_build_armv7_neon) {
|
||||
if (rtc_build_armv7_neon || current_cpu == "arm64") {
|
||||
source_set("common_audio_neon") {
|
||||
sources = [
|
||||
"fir_filter_neon.cc",
|
||||
"resampler/sinc_resampler_neon.cc",
|
||||
"signal_processing/cross_correlation_neon.S",
|
||||
"signal_processing/downsample_fast_neon.S",
|
||||
"signal_processing/min_max_operations_neon.S",
|
||||
"signal_processing/cross_correlation_neon.c",
|
||||
"signal_processing/downsample_fast_neon.c",
|
||||
"signal_processing/min_max_operations_neon.c",
|
||||
]
|
||||
|
||||
configs += [ "..:common_config" ]
|
||||
public_configs = [ "..:common_inherited_config" ]
|
||||
|
||||
|
||||
# Enable compilation for the ARM v7 Neon instruction set. This is needed
|
||||
# since //build/config/arm.gni only enables Neon for iOS, not Android.
|
||||
# This provides the same functionality as webrtc/build/arm_neon.gypi.
|
||||
# TODO(kjellander): Investigate if this can be moved into webrtc.gni or
|
||||
# //build/config/arm.gni instead, to reduce code duplication.
|
||||
# Remove the -mfpu=vfpv3-d16 cflag.
|
||||
configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
|
||||
cflags = [
|
||||
"-mfpu=neon",
|
||||
]
|
||||
if (!arm_use_neon) {
|
||||
configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
|
||||
cflags = [ "-mfpu=neon" ]
|
||||
}
|
||||
|
||||
# Disable LTO in common_audio_neon target due to compiler bug.
|
||||
if (rtc_use_lto) {
|
||||
|
@ -146,6 +146,9 @@
|
||||
}],
|
||||
], # conditions
|
||||
}],
|
||||
['target_arch=="arm64"', {
|
||||
'dependencies': ['common_audio_neon',],
|
||||
}],
|
||||
['target_arch=="mipsel" and mips_arch_variant!="r6" and android_webview_build==0', {
|
||||
'sources': [
|
||||
'signal_processing/include/spl_inl_mips.h',
|
||||
@ -194,7 +197,7 @@
|
||||
},
|
||||
], # targets
|
||||
}],
|
||||
['target_arch=="arm" and arm_version>=7', {
|
||||
['target_arch=="arm" and arm_version>=7 or target_arch=="arm64"', {
|
||||
'targets': [
|
||||
{
|
||||
'target_name': 'common_audio_neon',
|
||||
@ -203,9 +206,9 @@
|
||||
'sources': [
|
||||
'fir_filter_neon.cc',
|
||||
'resampler/sinc_resampler_neon.cc',
|
||||
'signal_processing/cross_correlation_neon.S',
|
||||
'signal_processing/downsample_fast_neon.S',
|
||||
'signal_processing/min_max_operations_neon.S',
|
||||
'signal_processing/cross_correlation_neon.c',
|
||||
'signal_processing/downsample_fast_neon.c',
|
||||
'signal_processing/min_max_operations_neon.c',
|
||||
],
|
||||
'conditions': [
|
||||
# Disable LTO in common_audio_neon target due to compiler bug
|
||||
|
@ -107,7 +107,7 @@ class SincResampler {
|
||||
static float Convolve_SSE(const float* input_ptr, const float* k1,
|
||||
const float* k2,
|
||||
double kernel_interpolation_factor);
|
||||
#elif defined(WEBRTC_ARCH_ARM_V7)
|
||||
#elif defined(WEBRTC_ARCH_ARM_V7) || defined(WEBRTC_ARCH_ARM64_NEON)
|
||||
static float Convolve_NEON(const float* input_ptr, const float* k1,
|
||||
const float* k2,
|
||||
double kernel_interpolation_factor);
|
||||
|
@ -1,159 +0,0 @@
|
||||
@
|
||||
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
@
|
||||
@ Use of this source code is governed by a BSD-style license
|
||||
@ that can be found in the LICENSE file in the root of the source
|
||||
@ tree. An additional intellectual property rights grant can be found
|
||||
@ in the file PATENTS. All contributing project authors may
|
||||
@ be found in the AUTHORS file in the root of the source tree.
|
||||
@
|
||||
|
||||
@ cross_correlation_neon.s
|
||||
@ This file contains the function WebRtcSpl_CrossCorrelationNeon(),
|
||||
@ optimized for ARM Neon platform.
|
||||
@
|
||||
@ Reference C code at end of this file.
|
||||
@ Output is bit-exact with the reference C code, but not with the generic
|
||||
@ C code in file cross_correlation.c, due to reduction of shift operations
|
||||
@ from using Neon registers.
|
||||
|
||||
@ Register usage:
|
||||
@
|
||||
@ r0: *cross_correlation (function argument)
|
||||
@ r1: *seq1 (function argument)
|
||||
@ r2: *seq2 (function argument)
|
||||
@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
|
||||
@ r4: counter for LOOP_DIM_CROSS_CORRELATION
|
||||
@ r5: seq2_ptr
|
||||
@ r6: seq1_ptr
|
||||
@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
|
||||
@ r8, r9, r10, r11, r12: scratch
|
||||
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
|
||||
.align 2
|
||||
DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
|
||||
push {r4-r11}
|
||||
|
||||
@ Put the shift value (-right_shifts) into a Neon register.
|
||||
ldrsh r10, [sp, #36]
|
||||
rsb r10, r10, #0
|
||||
mov r8, r10, asr #31
|
||||
vmov d16, r10, r8
|
||||
|
||||
@ Initialize loop counters.
|
||||
and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8;
|
||||
asr r3, r3, #3 @ inner_loop_len1 = dim_seq / 8;
|
||||
ldrsh r4, [sp, #32] @ dim_cross_correlation
|
||||
|
||||
LOOP_DIM_CROSS_CORRELATION:
|
||||
vmov.i32 q9, #0
|
||||
vmov.i32 q14, #0
|
||||
movs r8, r3 @ inner_loop_len1
|
||||
mov r6, r1 @ seq1_ptr
|
||||
mov r5, r2 @ seq2_ptr
|
||||
ble POST_LOOP_DIM_SEQ
|
||||
|
||||
LOOP_DIM_SEQ:
|
||||
vld1.16 {d20, d21}, [r6]! @ seq1_ptr
|
||||
vld1.16 {d22, d23}, [r5]! @ seq2_ptr
|
||||
subs r8, r8, #1
|
||||
vmull.s16 q12, d20, d22
|
||||
vmull.s16 q13, d21, d23
|
||||
vpadal.s32 q9, q12
|
||||
vpadal.s32 q14, q13
|
||||
bgt LOOP_DIM_SEQ
|
||||
|
||||
POST_LOOP_DIM_SEQ:
|
||||
movs r10, r7 @ Loop counter
|
||||
mov r12, #0
|
||||
mov r8, #0
|
||||
ble POST_LOOP_DIM_SEQ_RESIDUAL
|
||||
|
||||
LOOP_DIM_SEQ_RESIDUAL:
|
||||
ldrh r11, [r6], #2
|
||||
ldrh r9, [r5], #2
|
||||
smulbb r11, r11, r9
|
||||
adds r8, r8, r11
|
||||
adc r12, r12, r11, asr #31
|
||||
subs r10, #1
|
||||
bgt LOOP_DIM_SEQ_RESIDUAL
|
||||
|
||||
POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift.
|
||||
vadd.i64 d18, d19
|
||||
vadd.i64 d28, d29
|
||||
vadd.i64 d18, d28
|
||||
vmov.32 d17[0], r8
|
||||
vmov.32 d17[1], r12
|
||||
vadd.i64 d17, d18
|
||||
vshl.s64 d17, d16
|
||||
vst1.32 d17[0], [r0]! @ Store the output
|
||||
|
||||
ldr r8, [sp, #40] @ step_seq2
|
||||
add r2, r8, lsl #1 @ prepare for seq2_ptr(r5) in the next loop.
|
||||
|
||||
subs r4, #1
|
||||
bgt LOOP_DIM_CROSS_CORRELATION
|
||||
|
||||
pop {r4-r11}
|
||||
bx lr
|
||||
|
||||
@ TODO(kma): Place this piece of reference code into a C code file.
|
||||
@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
|
||||
@ int16_t* seq1,
|
||||
@ int16_t* seq2,
|
||||
@ int16_t dim_seq,
|
||||
@ int16_t dim_cross_correlation,
|
||||
@ int16_t right_shifts,
|
||||
@ int16_t step_seq2) {
|
||||
@ int i = 0;
|
||||
@ int j = 0;
|
||||
@ int inner_loop_len1 = dim_seq >> 3;
|
||||
@ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
|
||||
@
|
||||
@ assert(dim_cross_correlation > 0);
|
||||
@ assert(dim_seq > 0);
|
||||
@
|
||||
@ for (i = 0; i < dim_cross_correlation; i++) {
|
||||
@ int16_t *seq1_ptr = seq1;
|
||||
@ int16_t *seq2_ptr = seq2 + (step_seq2 * i);
|
||||
@ int64_t sum = 0;
|
||||
@
|
||||
@ for (j = inner_loop_len1; j > 0; j -= 1) {
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ }
|
||||
@
|
||||
@ // Calculate the rest of the samples.
|
||||
@ for (j = inner_loop_len2; j > 0; j -= 1) {
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ }
|
||||
@
|
||||
@ *cross_correlation++ = (int32_t)(sum >> right_shifts);
|
||||
@ }
|
||||
@ }
|
@ -1,215 +0,0 @@
|
||||
@
|
||||
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
@
|
||||
@ Use of this source code is governed by a BSD-style license
|
||||
@ that can be found in the LICENSE file in the root of the source
|
||||
@ tree. An additional intellectual property rights grant can be found
|
||||
@ in the file PATENTS. All contributing project authors may
|
||||
@ be found in the AUTHORS file in the root of the source tree.
|
||||
@
|
||||
|
||||
@ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for
|
||||
@ ARM Neon platform. The description header can be found in
|
||||
@ signal_processing_library.h
|
||||
@
|
||||
@ The reference C code is in file downsample_fast.c. Bit-exact.
|
||||
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
|
||||
.align 2
|
||||
DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
|
||||
push {r4-r11}
|
||||
|
||||
cmp r3, #0 @ data_out_length <= 0?
|
||||
movle r0, #-1
|
||||
ble END
|
||||
|
||||
ldrsh r12, [sp, #44]
|
||||
ldr r5, [sp, #40] @ r5: factor
|
||||
add r4, r12, #1 @ r4: delay + 1
|
||||
sub r3, r3, #1 @ r3: data_out_length - 1
|
||||
smulbb r3, r5, r3
|
||||
ldr r8, [sp, #32] @ &coefficients[0]
|
||||
mov r9, r12 @ Iteration counter for outer loops.
|
||||
add r3, r4 @ delay + factor * (out_length-1) +1
|
||||
|
||||
cmp r3, r1 @ data_in_length < endpos?
|
||||
movgt r0, #-1
|
||||
bgt END
|
||||
|
||||
@ Initializations.
|
||||
sub r3, r5, asl #3
|
||||
add r11, r0, r12, asl #1 @ &data_in[delay]
|
||||
ldr r0, [sp, #36] @ coefficients_length
|
||||
add r3, r5 @ endpos - factor * 7
|
||||
|
||||
cmp r0, #0 @ coefficients_length <= 0 ?
|
||||
movle r0, #-1
|
||||
ble END
|
||||
|
||||
add r8, r0, asl #1 @ &coefficients[coefficients_length]
|
||||
cmp r9, r3
|
||||
bge POST_LOOP_ENDPOS @ branch when Iteration < 8 times.
|
||||
|
||||
@
|
||||
@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
|
||||
@
|
||||
mov r4, #-2
|
||||
|
||||
@ Direct program flow to the right channel.
|
||||
|
||||
@ r10 is an offset to &data_in[] in the loop. After an iteration, we need to
|
||||
@ move the pointer back to original after advancing 16 bytes by a vld1, and
|
||||
@ then move 2 bytes forward to increment one more sample.
|
||||
cmp r5, #2
|
||||
moveq r10, #-14
|
||||
beq LOOP_ENDPOS_FACTOR2 @ Branch when factor == 2
|
||||
|
||||
@ Similar here, for r10, we need to move the pointer back to original after
|
||||
@ advancing 32 bytes, then move 2 bytes forward to increment one sample.
|
||||
cmp r5, #4
|
||||
moveq r10, #-30
|
||||
beq LOOP_ENDPOS_FACTOR4 @ Branch when factor == 4
|
||||
|
||||
@ For r10, we need to move the pointer back to original after advancing
|
||||
@ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample.
|
||||
mov r10, r5, asl #4
|
||||
rsb r10, #2
|
||||
add r10, r5, asl #1
|
||||
lsl r5, #1 @ r5 = factor * sizeof(data_in)
|
||||
|
||||
@ The general case (factor != 2 && factor != 4)
|
||||
LOOP_ENDPOS_GENERAL:
|
||||
@ Initializations.
|
||||
vmov.i32 q2, #2048
|
||||
vmov.i32 q3, #2048
|
||||
sub r7, r8, #2
|
||||
sub r12, r0, #1 @ coefficients_length - 1
|
||||
sub r1, r11, r12, asl #1 @ &data_in[i - j]
|
||||
|
||||
LOOP_COEFF_LENGTH_GENERAL:
|
||||
vld1.16 {d2[], d3[]}, [r7], r4 @ coefficients[j]
|
||||
vld1.16 d0[0], [r1], r5 @ data_in[i - j]
|
||||
vld1.16 d0[1], [r1], r5 @ data_in[i + factor - j]
|
||||
vld1.16 d0[2], [r1], r5 @ data_in[i + factor * 2 - j]
|
||||
vld1.16 d0[3], [r1], r5 @ data_in[i + factor * 3 - j]
|
||||
vld1.16 d1[0], [r1], r5 @ data_in[i + factor * 4 - j]
|
||||
vld1.16 d1[1], [r1], r5 @ data_in[i + factor * 5 - j]
|
||||
vld1.16 d1[2], [r1], r5 @ data_in[i + factor * 6 - j]
|
||||
vld1.16 d1[3], [r1], r10 @ data_in[i + factor * 7 - j]
|
||||
subs r12, #1
|
||||
vmlal.s16 q2, d0, d2
|
||||
vmlal.s16 q3, d1, d3
|
||||
bge LOOP_COEFF_LENGTH_GENERAL
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
vqshrn.s32 d0, q2, #12
|
||||
vqshrn.s32 d1, q3, #12
|
||||
vst1.16 {d0, d1}, [r2]!
|
||||
|
||||
add r11, r5, asl #3 @ r11 -> &data_in[i + factor * 8]
|
||||
add r9, r5, asl #2 @ Counter i = delay + factor * 8.
|
||||
cmp r9, r3 @ i < endpos - factor * 7 ?
|
||||
blt LOOP_ENDPOS_GENERAL
|
||||
asr r5, #1 @ Restore r5 to the value of factor.
|
||||
b POST_LOOP_ENDPOS
|
||||
|
||||
@ The case for factor == 2.
|
||||
LOOP_ENDPOS_FACTOR2:
|
||||
@ Initializations.
|
||||
vmov.i32 q2, #2048
|
||||
vmov.i32 q3, #2048
|
||||
sub r7, r8, #2
|
||||
sub r12, r0, #1 @ coefficients_length - 1
|
||||
sub r1, r11, r12, asl #1 @ &data_in[i - j]
|
||||
|
||||
LOOP_COEFF_LENGTH_FACTOR2:
|
||||
vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j]
|
||||
vld2.16 {d0, d1}, [r1]! @ data_in[]
|
||||
vld2.16 {d2, d3}, [r1], r10 @ data_in[]
|
||||
subs r12, #1
|
||||
vmlal.s16 q2, d0, d16
|
||||
vmlal.s16 q3, d2, d17
|
||||
bge LOOP_COEFF_LENGTH_FACTOR2
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
vqshrn.s32 d0, q2, #12
|
||||
vqshrn.s32 d1, q3, #12
|
||||
vst1.16 {d0, d1}, [r2]!
|
||||
|
||||
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
|
||||
add r9, r5, asl #3 @ Counter i = delay + factor * 8.
|
||||
cmp r9, r3 @ i < endpos - factor * 7 ?
|
||||
blt LOOP_ENDPOS_FACTOR2
|
||||
b POST_LOOP_ENDPOS
|
||||
|
||||
@ The case for factor == 4.
|
||||
LOOP_ENDPOS_FACTOR4:
|
||||
@ Initializations.
|
||||
vmov.i32 q2, #2048
|
||||
vmov.i32 q3, #2048
|
||||
sub r7, r8, #2
|
||||
sub r12, r0, #1 @ coefficients_length - 1
|
||||
sub r1, r11, r12, asl #1 @ &data_in[i - j]
|
||||
|
||||
LOOP_COEFF_LENGTH_FACTOR4:
|
||||
vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j]
|
||||
vld4.16 {d0, d1, d2, d3}, [r1]! @ data_in[]
|
||||
vld4.16 {d18, d19, d20, d21}, [r1], r10 @ data_in[]
|
||||
subs r12, #1
|
||||
vmlal.s16 q2, d0, d16
|
||||
vmlal.s16 q3, d18, d17
|
||||
bge LOOP_COEFF_LENGTH_FACTOR4
|
||||
|
||||
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
|
||||
add r9, r5, asl #3 @ Counter i = delay + factor * 8.
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
vqshrn.s32 d0, q2, #12
|
||||
vqshrn.s32 d1, q3, #12
|
||||
cmp r9, r3 @ i < endpos - factor * 7 ?
|
||||
vst1.16 {d0, d1}, [r2]!
|
||||
|
||||
blt LOOP_ENDPOS_FACTOR4
|
||||
|
||||
@
|
||||
@ Second part: do the remaining iterations (if any).
|
||||
@
|
||||
|
||||
POST_LOOP_ENDPOS:
|
||||
add r3, r5, asl #3
|
||||
sub r3, r5 @ Restore r3 to endpos.
|
||||
cmp r9, r3
|
||||
movge r0, #0
|
||||
bge END
|
||||
|
||||
LOOP2_ENDPOS:
|
||||
@ Initializations.
|
||||
mov r7, r8
|
||||
sub r12, r0, #1 @ coefficients_length - 1
|
||||
sub r6, r11, r12, asl #1 @ &data_in[i - j]
|
||||
|
||||
mov r1, #2048
|
||||
|
||||
LOOP2_COEFF_LENGTH:
|
||||
ldrsh r4, [r7, #-2]! @ coefficients[j]
|
||||
ldrsh r10, [r6], #2 @ data_in[i - j]
|
||||
smlabb r1, r4, r10, r1
|
||||
subs r12, #1
|
||||
bge LOOP2_COEFF_LENGTH
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
ssat r1, #16, r1, asr #12
|
||||
strh r1, [r2], #2
|
||||
|
||||
add r11, r5, asl #1 @ r11 -> &data_in[i + factor]
|
||||
add r9, r5 @ Counter i = delay + factor.
|
||||
cmp r9, r3 @ i < endpos?
|
||||
blt LOOP2_ENDPOS
|
||||
|
||||
mov r0, #0
|
||||
|
||||
END:
|
||||
pop {r4-r11}
|
||||
bx lr
|
@ -154,7 +154,8 @@ void WebRtcSpl_ZerosArrayW32(int32_t* vector,
|
||||
typedef int16_t (*MaxAbsValueW16)(const int16_t* vector, int length);
|
||||
extern MaxAbsValueW16 WebRtcSpl_MaxAbsValueW16;
|
||||
int16_t WebRtcSpl_MaxAbsValueW16C(const int16_t* vector, int length);
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
|
||||
#endif
|
||||
#if defined(MIPS32_LE)
|
||||
@ -172,7 +173,8 @@ int16_t WebRtcSpl_MaxAbsValueW16_mips(const int16_t* vector, int length);
|
||||
typedef int32_t (*MaxAbsValueW32)(const int32_t* vector, int length);
|
||||
extern MaxAbsValueW32 WebRtcSpl_MaxAbsValueW32;
|
||||
int32_t WebRtcSpl_MaxAbsValueW32C(const int32_t* vector, int length);
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
|
||||
#endif
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
@ -192,7 +194,8 @@ int32_t WebRtcSpl_MaxAbsValueW32_mips(const int32_t* vector, int length);
|
||||
typedef int16_t (*MaxValueW16)(const int16_t* vector, int length);
|
||||
extern MaxValueW16 WebRtcSpl_MaxValueW16;
|
||||
int16_t WebRtcSpl_MaxValueW16C(const int16_t* vector, int length);
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
|
||||
#endif
|
||||
#if defined(MIPS32_LE)
|
||||
@ -212,7 +215,8 @@ int16_t WebRtcSpl_MaxValueW16_mips(const int16_t* vector, int length);
|
||||
typedef int32_t (*MaxValueW32)(const int32_t* vector, int length);
|
||||
extern MaxValueW32 WebRtcSpl_MaxValueW32;
|
||||
int32_t WebRtcSpl_MaxValueW32C(const int32_t* vector, int length);
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
|
||||
#endif
|
||||
#if defined(MIPS32_LE)
|
||||
@ -232,7 +236,8 @@ int32_t WebRtcSpl_MaxValueW32_mips(const int32_t* vector, int length);
|
||||
typedef int16_t (*MinValueW16)(const int16_t* vector, int length);
|
||||
extern MinValueW16 WebRtcSpl_MinValueW16;
|
||||
int16_t WebRtcSpl_MinValueW16C(const int16_t* vector, int length);
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
|
||||
#endif
|
||||
#if defined(MIPS32_LE)
|
||||
@ -252,7 +257,8 @@ int16_t WebRtcSpl_MinValueW16_mips(const int16_t* vector, int length);
|
||||
typedef int32_t (*MinValueW32)(const int32_t* vector, int length);
|
||||
extern MinValueW32 WebRtcSpl_MinValueW32;
|
||||
int32_t WebRtcSpl_MinValueW32C(const int32_t* vector, int length);
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
|
||||
#endif
|
||||
#if defined(MIPS32_LE)
|
||||
@ -552,7 +558,8 @@ void WebRtcSpl_CrossCorrelationC(int32_t* cross_correlation,
|
||||
int16_t dim_cross_correlation,
|
||||
int16_t right_shifts,
|
||||
int16_t step_seq2);
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
|
||||
const int16_t* seq1,
|
||||
const int16_t* seq2,
|
||||
@ -717,7 +724,8 @@ int WebRtcSpl_DownsampleFastC(const int16_t* data_in,
|
||||
int coefficients_length,
|
||||
int factor,
|
||||
int delay);
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
|
||||
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
|
||||
int data_in_length,
|
||||
int16_t* data_out,
|
||||
|
@ -1,283 +0,0 @@
|
||||
@
|
||||
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
@
|
||||
@ Use of this source code is governed by a BSD-style license
|
||||
@ that can be found in the LICENSE file in the root of the source
|
||||
@ tree. An additional intellectual property rights grant can be found
|
||||
@ in the file PATENTS. All contributing project authors may
|
||||
@ be found in the AUTHORS file in the root of the source tree.
|
||||
@
|
||||
|
||||
@ This file contains some minimum and maximum functions, optimized for
|
||||
@ ARM Neon platform. The description header can be found in
|
||||
@ signal_processing_library.h
|
||||
@
|
||||
@ The reference C code is in file min_max_operations.c. Code here is basically
|
||||
@ a loop unrolling by 8 with Neon instructions. Bit-exact.
|
||||
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
|
||||
|
||||
.align 2
|
||||
@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
|
||||
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
|
||||
mov r2, #-1 @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MAX_ABS_VALUE_W16
|
||||
cmp r1, #0
|
||||
ble END_MAX_ABS_VALUE_W16
|
||||
|
||||
cmp r1, #8
|
||||
blt LOOP_MAX_ABS_VALUE_W16
|
||||
|
||||
vmov.i16 q12, #0
|
||||
sub r1, #8 @ Counter for loops
|
||||
|
||||
LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
|
||||
vld1.16 {q13}, [r0]!
|
||||
subs r1, #8
|
||||
vabs.s16 q13, q13 @ Note vabs doesn't change the value of -32768.
|
||||
vmax.u16 q12, q13 @ Use u16 so we don't lose the value -32768.
|
||||
bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16
|
||||
|
||||
@ Find the maximum value in the Neon registers and move it to r2.
|
||||
vmax.u16 d24, d25
|
||||
vpmax.u16 d24, d24, d24
|
||||
vpmax.u16 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.u16 r2, d24[0]
|
||||
beq END_MAX_ABS_VALUE_W16
|
||||
|
||||
LOOP_MAX_ABS_VALUE_W16:
|
||||
ldrsh r3, [r0], #2
|
||||
eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value.
|
||||
sub r12, r12, r3, asr #31
|
||||
cmp r2, r12
|
||||
movlt r2, r12
|
||||
subs r1, #1
|
||||
bne LOOP_MAX_ABS_VALUE_W16
|
||||
|
||||
END_MAX_ABS_VALUE_W16:
|
||||
cmp r2, #0x8000 @ Guard against the case for -32768.
|
||||
subeq r2, #1
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
|
||||
|
||||
@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
|
||||
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
|
||||
cmp r0, #0
|
||||
moveq r0, #-1
|
||||
beq EXIT @ Return -1 for a NULL pointer.
|
||||
cmp r1, #0 @ length
|
||||
movle r0, #-1
|
||||
ble EXIT @ Return -1 if length <= 0.
|
||||
|
||||
vmov.i32 q11, #0
|
||||
vmov.i32 q12, #0
|
||||
cmp r1, #8
|
||||
blt LOOP_MAX_ABS_VALUE_W32
|
||||
|
||||
sub r1, #8 @ Counter for loops
|
||||
|
||||
LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
|
||||
vld1.32 {q13, q14}, [r0]!
|
||||
subs r1, #8 @ Counter for loops
|
||||
vabs.s32 q13, q13 @ vabs doesn't change the value of 0x80000000.
|
||||
vabs.s32 q14, q14
|
||||
vmax.u32 q11, q13 @ Use u32 so we don't lose the value 0x80000000.
|
||||
vmax.u32 q12, q14
|
||||
bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32
|
||||
|
||||
@ Find the maximum value in the Neon registers and move it to r2.
|
||||
vmax.u32 q12, q11
|
||||
vmax.u32 d24, d25
|
||||
vpmax.u32 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.u32 r2, d24[0]
|
||||
beq END_MAX_ABS_VALUE_W32
|
||||
|
||||
LOOP_MAX_ABS_VALUE_W32:
|
||||
ldr r3, [r0], #4
|
||||
eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value.
|
||||
sub r12, r12, r3, asr #31
|
||||
cmp r2, r12
|
||||
movcc r2, r12
|
||||
subs r1, #1
|
||||
bne LOOP_MAX_ABS_VALUE_W32
|
||||
|
||||
END_MAX_ABS_VALUE_W32:
|
||||
mvn r0, #0x80000000 @ Guard against the case for 0x80000000.
|
||||
cmp r2, r0
|
||||
movcc r0, r2
|
||||
|
||||
EXIT:
|
||||
bx lr
|
||||
|
||||
@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
|
||||
DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
|
||||
mov r2, #0x8000 @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MAX_VALUE_W16
|
||||
cmp r1, #0
|
||||
ble END_MAX_VALUE_W16
|
||||
|
||||
vmov.i16 q12, #0x8000
|
||||
cmp r1, #8
|
||||
blt LOOP_MAX_VALUE_W16
|
||||
|
||||
sub r1, #8 @ Counter for loops
|
||||
|
||||
LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
|
||||
vld1.16 {q13}, [r0]!
|
||||
subs r1, #8
|
||||
vmax.s16 q12, q13
|
||||
bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16
|
||||
|
||||
@ Find the maximum value in the Neon registers and move it to r2.
|
||||
vmax.s16 d24, d25
|
||||
vpmax.s16 d24, d24, d24
|
||||
vpmax.s16 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.u16 r2, d24[0]
|
||||
beq END_MAX_VALUE_W16
|
||||
|
||||
LOOP_MAX_VALUE_W16:
|
||||
ldrsh r3, [r0], #2
|
||||
cmp r2, r3
|
||||
movlt r2, r3
|
||||
subs r1, #1
|
||||
bne LOOP_MAX_VALUE_W16
|
||||
|
||||
END_MAX_VALUE_W16:
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
|
||||
DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
|
||||
mov r2, #0x80000000 @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MAX_VALUE_W32
|
||||
cmp r1, #0
|
||||
ble END_MAX_VALUE_W32
|
||||
|
||||
vmov.i32 q11, #0x80000000
|
||||
vmov.i32 q12, #0x80000000
|
||||
cmp r1, #8
|
||||
blt LOOP_MAX_VALUE_W32
|
||||
|
||||
sub r1, #8 @ Counter for loops
|
||||
|
||||
LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
|
||||
vld1.32 {q13, q14}, [r0]!
|
||||
subs r1, #8
|
||||
vmax.s32 q11, q13
|
||||
vmax.s32 q12, q14
|
||||
bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32
|
||||
|
||||
@ Find the maximum value in the Neon registers and move it to r2.
|
||||
vmax.s32 q12, q11
|
||||
vpmax.s32 d24, d24, d25
|
||||
vpmax.s32 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.s32 r2, d24[0]
|
||||
beq END_MAX_VALUE_W32
|
||||
|
||||
LOOP_MAX_VALUE_W32:
|
||||
ldr r3, [r0], #4
|
||||
cmp r2, r3
|
||||
movlt r2, r3
|
||||
subs r1, #1
|
||||
bne LOOP_MAX_VALUE_W32
|
||||
|
||||
END_MAX_VALUE_W32:
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
|
||||
DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
|
||||
movw r2, #0x7FFF @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MIN_VALUE_W16
|
||||
cmp r1, #0
|
||||
ble END_MIN_VALUE_W16
|
||||
|
||||
vdup.16 q12, r2
|
||||
cmp r1, #8
|
||||
blt LOOP_MIN_VALUE_W16
|
||||
|
||||
sub r1, #8 @ Counter for loops
|
||||
|
||||
LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
|
||||
vld1.16 {q13}, [r0]!
|
||||
subs r1, #8
|
||||
vmin.s16 q12, q13
|
||||
bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16
|
||||
|
||||
@ Find the minimum value in the Neon registers and move it to r2.
|
||||
vmin.s16 d24, d25
|
||||
vpmin.s16 d24, d24, d24
|
||||
vpmin.s16 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.s16 r2, d24[0]
|
||||
sxth r2, r2
|
||||
beq END_MIN_VALUE_W16
|
||||
|
||||
LOOP_MIN_VALUE_W16:
|
||||
ldrsh r3, [r0], #2
|
||||
cmp r2, r3
|
||||
movge r2, r3
|
||||
subs r1, #1
|
||||
bne LOOP_MIN_VALUE_W16
|
||||
|
||||
END_MIN_VALUE_W16:
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
|
||||
DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
|
||||
mov r2, #0x7FFFFFFF @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MIN_VALUE_W32
|
||||
cmp r1, #0
|
||||
ble END_MIN_VALUE_W32
|
||||
|
||||
vdup.32 q11, r2
|
||||
vdup.32 q12, r2
|
||||
cmp r1, #8
|
||||
blt LOOP_MIN_VALUE_W32
|
||||
|
||||
sub r1, #8 @ Counter for loops
|
||||
|
||||
LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
|
||||
vld1.32 {q13, q14}, [r0]!
|
||||
subs r1, #8
|
||||
vmin.s32 q11, q13
|
||||
vmin.s32 q12, q14
|
||||
bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32
|
||||
|
||||
@ Find the minimum value in the Neon registers and move it to r2.
|
||||
vmin.s32 q12, q11
|
||||
vpmin.s32 d24, d24, d25
|
||||
vpmin.s32 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.s32 r2, d24[0]
|
||||
beq END_MIN_VALUE_W32
|
||||
|
||||
LOOP_MIN_VALUE_W32:
|
||||
ldr r3, [r0], #4
|
||||
cmp r2, r3
|
||||
movge r2, r3
|
||||
subs r1, #1
|
||||
bne LOOP_MIN_VALUE_W32
|
||||
|
||||
END_MIN_VALUE_W32:
|
||||
mov r0, r2
|
||||
bx lr
|
@ -29,7 +29,7 @@ DownsampleFast WebRtcSpl_DownsampleFast;
|
||||
ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound;
|
||||
|
||||
#if (defined(WEBRTC_DETECT_ARM_NEON) || !defined(WEBRTC_ARCH_ARM_NEON)) && \
|
||||
!defined(MIPS32_LE)
|
||||
!defined(MIPS32_LE) && !defined(WEBRTC_ARCH_ARM64_NEON)
|
||||
/* Initialize function pointers to the generic C version. */
|
||||
static void InitPointersToC() {
|
||||
WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16C;
|
||||
@ -45,7 +45,8 @@ static void InitPointersToC() {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
|
||||
#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) || \
|
||||
(defined WEBRTC_ARCH_ARM64_NEON)
|
||||
/* Initialize function pointers to the Neon version. */
|
||||
static void InitPointersToNeon() {
|
||||
WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16Neon;
|
||||
@ -92,7 +93,7 @@ static void InitFunctionPointers(void) {
|
||||
} else {
|
||||
InitPointersToC();
|
||||
}
|
||||
#elif defined(WEBRTC_ARCH_ARM_NEON)
|
||||
#elif defined(WEBRTC_ARCH_ARM_NEON) || defined(WEBRTC_ARCH_ARM64_NEON)
|
||||
InitPointersToNeon();
|
||||
#elif defined(MIPS32_LE)
|
||||
InitPointersToMIPS();
|
||||
|
Loading…
x
Reference in New Issue
Block a user