Optimized spl function WebRtcSpl_CrossCorrelation for ARM Neon platforms.

When used in NetEq, this improved NetEq performance by 13% to 33%, depending on the test configuration. The output is not bit-exact with the generic C code in file cross_correlation.c, because using Neon registers reduces the number of shift operations; in theory, though, the result is now more accurate than before. Review URL: http://webrtc-codereview.appspot.com/333013 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1350 4adac7df-926f-26a2-2b94-8c16560cd09d
2012-01-06 19:50:20 +00:00 · 2012-01-06 19:50:20 +00:00 · b0abbd353d
commit b0abbd353d
parent bccac66885
3 changed files with 177 additions and 2 deletions
--- a/src/common_audio/signal_processing/Android.mk
+++ b/src/common_audio/signal_processing/Android.mk
@ -22,7 +22,6 @@ LOCAL_SRC_FILES := \
    complex_fft.c \
    complex_bit_reverse.c \
    copy_set_operations.c \
-    cross_correlation.c \
    division_operations.c \
    dot_product_with_scale.c \
    downsample_fast.c \
@ -60,9 +59,13 @@ LOCAL_C_INCLUDES := \

 ifeq ($(ARCH_ARM_HAVE_NEON),true)
 LOCAL_SRC_FILES += \
-    min_max_operations_neon.c
+    min_max_operations_neon.c \
+    cross_correlation_neon.s
 LOCAL_CFLAGS += \
    $(MY_ARM_CFLAGS_NEON)
+else
+LOCAL_SRC_FILES += \
+    cross_correlation.c
 endif

 LOCAL_SHARED_LIBRARIES := libstlport
--- a/src/common_audio/signal_processing/cross_correlation.c
+++ b/src/common_audio/signal_processing/cross_correlation.c
@ -15,6 +15,10 @@
 *
 */

+/* TODO(kma): Clean up the code in this file, and break it up for
+ * various platforms (Xscale, ARM/Neon etc.).
+ */
+
 #include "signal_processing_library.h"

 void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation, WebRtc_Word16* seq1,
--- a/src/common_audio/signal_processing/cross_correlation_neon.s
+++ b/src/common_audio/signal_processing/cross_correlation_neon.s
@ -0,0 +1,168 @@
+@
+@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ cross_correlation_neon.s
+@ This file contains the function WebRtcSpl_CrossCorrelation(),
+@ optimized for ARM Neon platform.
+@
+@ Reference C code is at the end of this file.
+@ Output is bit-exact with the reference C code, but not with the generic
+@ C code in file cross_correlation.c, due to reduction of shift operations
+@ from using Neon registers.
+
+@ Register usage:
+@
+@ r0: *cross_correlation (function argument)
+@ r1: *seq1 (function argument)
+@ r2: *seq2 (function argument)
+@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
+@ r4: counter for LOOP_DIM_CROSS_CORRELATION
+@ r5: seq2_ptr
+@ r6: seq1_ptr
+@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
+@ r8, r9, r10, r11, r12: scratch
+
+.arch armv7-a
+.fpu neon
+
+.align  2
+.global WebRtcSpl_CrossCorrelation
+
+@ Mark the symbol as a function so the linker can apply ARM/Thumb
+@ interworking when it is called from Thumb-compiled code.
+.type WebRtcSpl_CrossCorrelation, %function
+
+@ void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation,
+@                                 WebRtc_Word16* seq1,
+@                                 WebRtc_Word16* seq2,
+@                                 WebRtc_Word16 dim_seq,
+@                                 WebRtc_Word16 dim_cross_correlation,
+@                                 WebRtc_Word16 right_shifts,
+@                                 WebRtc_Word16 step_seq2)
+@
+@ For each of the dim_cross_correlation outputs: accumulate the 64-bit sum of
+@ seq1[j] * seq2[j] over dim_seq samples, shift it by -right_shifts (i.e. to
+@ the right for positive right_shifts) and store the low 32 bits. seq2 then
+@ advances by step_seq2 samples for the next output.
+@
+@ In:      r0 = cross_correlation, r1 = seq1, r2 = seq2, r3 = dim_seq;
+@          after the push below, dim_cross_correlation is at [sp, #32],
+@          right_shifts at [sp, #36] and step_seq2 at [sp, #40].
+@ Out:     dim_cross_correlation 32-bit words written through r0; nothing is
+@          read or written when dim_cross_correlation <= 0.
+@ Clobber: r0, r2, r3, r12, d16-d29, flags (r4-r11 are saved and restored).
+WebRtcSpl_CrossCorrelation:
+
+.fnstart
+
+.save {r4-r11}
+  push {r4-r11}               @ 8 registers = 32 bytes pushed
+
+  @ Put the shift value (-right_shifts) into a Neon register. vshl with a
+  @ negative per-element count performs a right shift.
+  ldrsh r10, [sp, #36]        @ right_shifts
+  rsb r10, r10, #0            @ r10 = -right_shifts
+  mov r8, r10, asr #31        @ r8 = sign extension (upper 32 bits)
+  vmov.32 d16, r10, r8        @ d16 = (int64_t)(-right_shifts)
+
+  @ Initialize loop counters.
+  and r7, r3, #7              @ inner_loop_len2 = dim_seq % 8;
+  asr r3, r3, #3              @ inner_loop_len1 = dim_seq / 8;
+  ldrsh r4, [sp, #32]         @ dim_cross_correlation
+
+  @ The outer loop tests its counter only at the bottom, so without this check
+  @ a count of zero (or less) would still read both inputs and store one
+  @ output. The reference C loop performs no iteration in that case.
+  cmp r4, #0
+  ble CROSS_CORRELATION_DONE
+
+LOOP_DIM_CROSS_CORRELATION:
+  vmov.i32 q9, #0             @ 64-bit accumulators for samples 0-3 of a group
+  vmov.i32 q14, #0            @ 64-bit accumulators for samples 4-7 of a group
+  movs r8, r3                 @ inner_loop_len1
+  mov r6, r1                  @ seq1_ptr
+  mov r5, r2                  @ seq2_ptr
+  ble POST_LOOP_DIM_SEQ       @ fewer than 8 samples: residual loop only
+
+LOOP_DIM_SEQ:                 @ 8 samples per iteration
+  vld1.16 {d20, d21}, [r6]!   @ seq1_ptr
+  vld1.16 {d22, d23}, [r5]!   @ seq2_ptr
+  subs r8, r8, #1
+  vmull.s16 q12, d20, d22     @ four 16x16 -> 32-bit signed products
+  vmull.s16 q13, d21, d23
+  vpadal.s32 q9, q12          @ add product pairs into the 64-bit lanes
+  vpadal.s32 q14, q13
+  bgt LOOP_DIM_SEQ
+
+POST_LOOP_DIM_SEQ:
+  movs r10, r7                @ Loop counter
+  mov r12, #0                 @ r12:r8 = 64-bit scalar sum of the residual
+  mov r8, #0
+  ble POST_LOOP_DIM_SEQ_RESIDUAL
+
+LOOP_DIM_SEQ_RESIDUAL:        @ one sample per iteration, at most 7 of them
+  ldrh r11, [r6], #2
+  ldrh r9, [r5], #2
+  smulbb r11, r11, r9         @ signed product of the two low halfwords
+  adds r8, r8, r11            @ low word of the sum; carry feeds the adc
+  adc r12, r12, r11, asr #31  @ high word += sign extension of the product
+  subs r10, #1
+  bgt LOOP_DIM_SEQ_RESIDUAL
+
+POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
+  vadd.i64 d18, d19           @ fold the four vector lanes into d18
+  vadd.i64 d28, d29
+  vadd.i64 d18, d28
+  vmov.32 d17[0], r8          @ d17 = scalar residual sum (r12:r8)
+  vmov.32 d17[1], r12
+  vadd.i64 d17, d18           @ d17 = total 64-bit sum
+  vshl.s64 d17, d16           @ shift by -right_shifts
+  vst1.32 d17[0], [r0]!       @ Store the output
+
+  @ NOTE(review): this full-word load assumes the caller widened the 16-bit
+  @ step_seq2 to 32 bits on the stack (as the AAPCS specifies) -- confirm.
+  ldr r8, [sp, #40]           @ step_seq2
+  add r2, r8, lsl #1          @ prepare for seq2_ptr(r5) in the next loop.
+
+  subs r4, #1
+  bgt LOOP_DIM_CROSS_CORRELATION
+
+CROSS_CORRELATION_DONE:
+  pop {r4-r11}
+  bx  lr
+
+.fnend
+.size WebRtcSpl_CrossCorrelation, .-WebRtcSpl_CrossCorrelation
+
+
+@ TODO(kma): Place this piece of reference code into a C code file.
+@ void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation,
+@                                 WebRtc_Word16* seq1,
+@                                 WebRtc_Word16* seq2,
+@                                 WebRtc_Word16 dim_seq,
+@                                 WebRtc_Word16 dim_cross_correlation,
+@                                 WebRtc_Word16 right_shifts,
+@                                 WebRtc_Word16 step_seq2) {
+@   int i = 0;
+@   int j = 0;
+@   int inner_loop_len1 = dim_seq >> 3;
+@   int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
+@ 
+@   assert(dim_cross_correlation > 0);
+@   assert(dim_seq > 0);
+@ 
+@   for (i = 0; i < dim_cross_correlation; i++) {
+@     int16_t *seq1_ptr = seq1;
+@     int16_t *seq2_ptr = seq2 + (step_seq2 * i);
+@     int64_t sum = 0;
+@ 
+@     for (j = inner_loop_len1; j > 0; j -= 1) {
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@     }
+@ 
+@     // Calculate the rest of the samples.
+@     for (j = inner_loop_len2; j > 0; j -= 1) {
+@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
+@       seq1_ptr++;
+@       seq2_ptr++;
+@     }
+@ 
+@     *cross_correlation++ = (int32_t)(sum >> right_shifts);
+@   }
+@ }