Optimized spl function WebRtcSpl_CrossCorrelation for ARM Neon platforms.
When used in NetEq, NetEq performance improved by 13% to 33%, depending on the test configuration. Output is not bit-exact with the generic C code in file cross_correlation.c, because using Neon registers reduces the number of shift operations; in theory the result is now more accurate than before. Review URL: http://webrtc-codereview.appspot.com/333013 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1350 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
bccac66885
commit
b0abbd353d
@ -22,7 +22,6 @@ LOCAL_SRC_FILES := \
|
|||||||
complex_fft.c \
|
complex_fft.c \
|
||||||
complex_bit_reverse.c \
|
complex_bit_reverse.c \
|
||||||
copy_set_operations.c \
|
copy_set_operations.c \
|
||||||
cross_correlation.c \
|
|
||||||
division_operations.c \
|
division_operations.c \
|
||||||
dot_product_with_scale.c \
|
dot_product_with_scale.c \
|
||||||
downsample_fast.c \
|
downsample_fast.c \
|
||||||
@ -60,9 +59,13 @@ LOCAL_C_INCLUDES := \
|
|||||||
|
|
||||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||||
LOCAL_SRC_FILES += \
|
LOCAL_SRC_FILES += \
|
||||||
min_max_operations_neon.c
|
min_max_operations_neon.c \
|
||||||
|
cross_correlation_neon.s
|
||||||
LOCAL_CFLAGS += \
|
LOCAL_CFLAGS += \
|
||||||
$(MY_ARM_CFLAGS_NEON)
|
$(MY_ARM_CFLAGS_NEON)
|
||||||
|
else
|
||||||
|
LOCAL_SRC_FILES += \
|
||||||
|
cross_correlation.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
LOCAL_SHARED_LIBRARIES := libstlport
|
LOCAL_SHARED_LIBRARIES := libstlport
|
||||||
|
@ -15,6 +15,10 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* TODO(kma): Clean up the code in this file, and break it up for
|
||||||
|
* various platforms (Xscale, ARM/Neon etc.).
|
||||||
|
*/
|
||||||
|
|
||||||
#include "signal_processing_library.h"
|
#include "signal_processing_library.h"
|
||||||
|
|
||||||
void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation, WebRtc_Word16* seq1,
|
void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation, WebRtc_Word16* seq1,
|
||||||
|
168
src/common_audio/signal_processing/cross_correlation_neon.s
Normal file
168
src/common_audio/signal_processing/cross_correlation_neon.s
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
@
|
||||||
|
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||||
|
@
|
||||||
|
@ Use of this source code is governed by a BSD-style license
|
||||||
|
@ that can be found in the LICENSE file in the root of the source
|
||||||
|
@ tree. An additional intellectual property rights grant can be found
|
||||||
|
@ in the file PATENTS. All contributing project authors may
|
||||||
|
@ be found in the AUTHORS file in the root of the source tree.
|
||||||
|
@
|
||||||
|
|
||||||
|
@ cross_correlation_neon.s
|
||||||
|
@ This file contains the function WebRtcSpl_CrossCorrelation(),
|
||||||
|
@ optimized for ARM Neon platform.
|
||||||
|
@
|
||||||
|
@ Reference C code at end of this file.
|
||||||
|
@ Output is bit-exact with the reference C code, but not with the generic
|
||||||
|
@ C code in file cross_correlation.c, due to reduction of shift operations
|
||||||
|
@ from using Neon registers.
|
||||||
|
|
||||||
|
@ Register usage:
|
||||||
|
@
|
||||||
|
@ r0: *cross_correlation (function argument)
|
||||||
|
@ r1: *seq1 (function argument)
|
||||||
|
@ r2: *seq2 (function argument)
|
||||||
|
@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
|
||||||
|
@ r4: counter for LOOP_DIM_CROSS_CORRELATION
|
||||||
|
@ r5: seq2_ptr
|
||||||
|
@ r6: seq1_ptr
|
||||||
|
@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
|
||||||
|
@ r8, r9, r10, r11, r12: scratch
|
||||||
|
|
||||||
|
.arch armv7-a
|
||||||
|
.fpu neon
|
||||||
|
|
||||||
|
.align 2
|
||||||
|
.global WebRtcSpl_CrossCorrelation
|
||||||
|
|
||||||
|
@-----------------------------------------------------------------------
@ void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation,
@                                 WebRtc_Word16* seq1,
@                                 WebRtc_Word16* seq2,
@                                 WebRtc_Word16 dim_seq,
@                                 WebRtc_Word16 dim_cross_correlation,
@                                 WebRtc_Word16 right_shifts,
@                                 WebRtc_Word16 step_seq2);
@
@ For every i in [0, dim_cross_correlation):
@   cross_correlation[i] =
@       (int32_t)(sum_j(seq1[j] * seq2[i * step_seq2 + j]) >> right_shifts)
@ where j runs over [0, dim_seq) and the sum is accumulated in 64 bits.
@
@ In:    r0 = cross_correlation, r1 = seq1, r2 = seq2, r3 = dim_seq.
@        After the push below, the remaining arguments are on the stack:
@        [sp, #32] = dim_cross_correlation, [sp, #36] = right_shifts,
@        [sp, #40] = step_seq2.
@ Out:   none; results are stored through r0.
@ Clobb: r0, r2, r3, r12, q8-q14, flags. r4-r11 are saved and restored.
@-----------------------------------------------------------------------

  @ Mark the symbol as a function so the linker can apply ARM/Thumb
  @ interworking to calls that target it.
  .type WebRtcSpl_CrossCorrelation, %function

WebRtcSpl_CrossCorrelation:

  .fnstart

  .save {r4-r11}
  push {r4-r11}

  @ The outer loop below tests its counter only after storing a result,
  @ so leave early when no outputs are requested; otherwise one word
  @ would be written past an empty output buffer.
  ldrsh r4, [sp, #32]               @ dim_cross_correlation
  cmp r4, #0
  ble END_CROSS_CORRELATION

  @ Put the shift value (-right_shifts) into a Neon register.
  @ d16 = sign-extended 64-bit value of -right_shifts.
  ldrsh r10, [sp, #36]
  rsb r10, r10, #0
  mov r8, r10, asr #31
  vmov.32 d16, r10, r8

  @ Initialize loop counters.
  and r7, r3, #7                    @ inner_loop_len2 = dim_seq % 8;
  asr r3, r3, #3                    @ inner_loop_len1 = dim_seq / 8;

LOOP_DIM_CROSS_CORRELATION:
  @ q9 and q14 each hold two 64-bit partial sums for this output.
  vmov.i32 q9, #0
  vmov.i32 q14, #0
  movs r8, r3                       @ inner_loop_len1
  mov r6, r1                        @ seq1_ptr
  mov r5, r2                        @ seq2_ptr
  ble POST_LOOP_DIM_SEQ             @ Fewer than 8 samples: residual only.

LOOP_DIM_SEQ:
  @ Eight samples per iteration: 16x16 -> 32-bit products, then pairwise
  @ add-accumulate of the products into the 64-bit lanes.
  vld1.16 {d20, d21}, [r6]!         @ seq1_ptr
  vld1.16 {d22, d23}, [r5]!         @ seq2_ptr
  subs r8, r8, #1
  vmull.s16 q12, d20, d22
  vmull.s16 q13, d21, d23
  vpadal.s32 q9, q12
  vpadal.s32 q14, q13
  bgt LOOP_DIM_SEQ

POST_LOOP_DIM_SEQ:
  movs r10, r7                      @ Loop counter
  mov r12, #0                       @ High word of the residual sum
  mov r8, #0                        @ Low word of the residual sum
  ble POST_LOOP_DIM_SEQ_RESIDUAL

LOOP_DIM_SEQ_RESIDUAL:
  @ Remaining (dim_seq % 8) samples, accumulated as 64 bits in r12:r8.
  @ smulbb treats the bottom halfwords as signed, so the zero-extending
  @ ldrh loads are sufficient.
  ldrh r11, [r6], #2
  ldrh r9, [r5], #2
  smulbb r11, r11, r9
  adds r8, r8, r11
  adc r12, r12, r11, asr #31        @ Add sign extension of product + carry.
  subs r10, #1
  bgt LOOP_DIM_SEQ_RESIDUAL

POST_LOOP_DIM_SEQ_RESIDUAL:         @ Sum the results up and do the shift.
  vadd.i64 d18, d19
  vadd.i64 d28, d29
  vadd.i64 d18, d28
  vmov.32 d17[0], r8
  vmov.32 d17[1], r12
  vadd.i64 d17, d18
  @ d16 holds -right_shifts, so this register-count shift moves the sum
  @ right by right_shifts.
  vshl.s64 d17, d16
  vst1.32 d17[0], [r0]!             @ Store the output (low 32 bits)

  ldr r8, [sp, #40]                 @ step_seq2
  add r2, r8, lsl #1                @ prepare for seq2_ptr(r5) in the next loop.

  subs r4, #1
  bgt LOOP_DIM_CROSS_CORRELATION

END_CROSS_CORRELATION:
  pop {r4-r11}
  bx lr

  .fnend
  .size WebRtcSpl_CrossCorrelation, .-WebRtcSpl_CrossCorrelation
|
||||||
|
|
||||||
|
|
||||||
|
@ TODO(kma): Place this piece of reference code into a C code file.
|
||||||
|
@ void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation,
|
||||||
|
@ WebRtc_Word16* seq1,
|
||||||
|
@ WebRtc_Word16* seq2,
|
||||||
|
@ WebRtc_Word16 dim_seq,
|
||||||
|
@ WebRtc_Word16 dim_cross_correlation,
|
||||||
|
@ WebRtc_Word16 right_shifts,
|
||||||
|
@ WebRtc_Word16 step_seq2) {
|
||||||
|
@ int i = 0;
|
||||||
|
@ int j = 0;
|
||||||
|
@ int inner_loop_len1 = dim_seq >> 3;
|
||||||
|
@ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
|
||||||
|
@
|
||||||
|
@ assert(dim_cross_correlation > 0);
|
||||||
|
@ assert(dim_seq > 0);
|
||||||
|
@
|
||||||
|
@ for (i = 0; i < dim_cross_correlation; i++) {
|
||||||
|
@ int16_t *seq1_ptr = seq1;
|
||||||
|
@ int16_t *seq2_ptr = seq2 + (step_seq2 * i);
|
||||||
|
@ int64_t sum = 0;
|
||||||
|
@
|
||||||
|
@ for (j = inner_loop_len1; j > 0; j -= 1) {
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ }
|
||||||
|
@
|
||||||
|
@ // Calculate the rest of the samples.
|
||||||
|
@ for (j = inner_loop_len2; j > 0; j -= 1) {
|
||||||
|
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||||
|
@ seq1_ptr++;
|
||||||
|
@ seq2_ptr++;
|
||||||
|
@ }
|
||||||
|
@
|
||||||
|
@ *cross_correlation++ = (int32_t)(sum >> right_shifts);
|
||||||
|
@ }
|
||||||
|
@ }
|
Loading…
x
Reference in New Issue
Block a user