Optimized function WebRtcSpl_DownsampleFast for ARM-NEON platform.

Review URL: https://webrtc-codereview.appspot.com/371001

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1629 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2012-02-07 18:03:11 +00:00
parent 236e842bca
commit 551fcc04ec
5 changed files with 290 additions and 90 deletions

View File

@ -24,7 +24,6 @@ LOCAL_SRC_FILES := \
copy_set_operations.c \
division_operations.c \
dot_product_with_scale.c \
downsample_fast.c \
energy.c \
filter_ar.c \
filter_ma_fast_q12.c \
@ -58,12 +57,14 @@ LOCAL_C_INCLUDES := \
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += \
min_max_operations_neon.c \
cross_correlation_neon.s
cross_correlation_neon.s \
downsample_fast_neon.s
LOCAL_CFLAGS += \
$(MY_ARM_CFLAGS_NEON)
else
LOCAL_SRC_FILES += \
cross_correlation.c
cross_correlation.c \
downsample_fast.c
endif
ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -8,52 +8,40 @@
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This file contains the function WebRtcSpl_DownsampleFast().
* The description header can be found in signal_processing_library.h
*
*/
#include "signal_processing_library.h"
int WebRtcSpl_DownsampleFast(WebRtc_Word16 *in_ptr, WebRtc_Word16 in_length,
WebRtc_Word16 *out_ptr, WebRtc_Word16 out_length,
WebRtc_Word16 *B, WebRtc_Word16 B_length, WebRtc_Word16 factor,
WebRtc_Word16 delay)
{
WebRtc_Word32 o;
int i, j;
// TODO(Bjornv): Change the function parameter order to WebRTC code style.
int WebRtcSpl_DownsampleFast(const int16_t* data_in,
int data_in_length,
int16_t* data_out,
int data_out_length,
const int16_t* __restrict coefficients,
int coefficients_length,
int factor,
int delay) {
int i = 0;
int j = 0;
int32_t out_s32 = 0;
int endpos = delay + factor * (data_out_length - 1) + 1;
WebRtc_Word16 *downsampled_ptr = out_ptr;
WebRtc_Word16 *b_ptr;
WebRtc_Word16 *x_ptr;
WebRtc_Word16 endpos = delay
+ (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(factor, (out_length - 1)) + 1;
// Return an error if any of the running conditions is not met.
if (data_out_length <= 0 || coefficients_length <= 0
|| data_in_length < endpos) {
return -1;
}
if (in_length < endpos)
{
return -1;
for (i = delay; i < endpos; i += factor) {
out_s32 = 2048; // Round value, 0.5 in Q12.
for (j = 0; j < coefficients_length; j++) {
out_s32 += coefficients[j] * data_in[i - j]; // Q12.
}
for (i = delay; i < endpos; i += factor)
{
b_ptr = &B[0];
x_ptr = &in_ptr[i];
out_s32 >>= 12; // Q0.
o = (WebRtc_Word32)2048; // Round val
// Saturate and store the output.
*data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
}
for (j = 0; j < B_length; j++)
{
o += WEBRTC_SPL_MUL_16_16(*b_ptr++, *x_ptr--);
}
o = WEBRTC_SPL_RSHIFT_W32(o, 12);
// If output is higher than 32768, saturate it. Same with negative side
*downsampled_ptr++ = WebRtcSpl_SatW32ToW16(o);
}
return 0;
return 0;
}

View File

@ -0,0 +1,222 @@
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ This file contains the function WebRtcSpl_DownsampleFast(), optimized for
@ ARM Neon platform. The description header can be found in
@ signal_processing_library.h
@
@ The reference C code is in file downsample_fast.c. Bit-exact.
.arch armv7-a
.fpu neon
.align 2
.global WebRtcSpl_DownsampleFast
@ -----------------------------------------------------------------------------
@ int WebRtcSpl_DownsampleFast(const int16_t* data_in,        @ r0
@                              int data_in_length,            @ r1
@                              int16_t* data_out,             @ r2
@                              int data_out_length,           @ r3
@                              const int16_t* coefficients,   @ [sp, #0]  at entry
@                              int coefficients_length,       @ [sp, #4]  at entry
@                              int factor,                    @ [sp, #8]  at entry
@                              int delay);                    @ [sp, #12] at entry
@
@ Returns (r0): 0 on success; -1 if data_out_length <= 0,
@               coefficients_length <= 0 or data_in_length < endpos, where
@               endpos = delay + factor * (data_out_length - 1) + 1.
@
@ For every i = delay, delay + factor, ... < endpos one output is produced:
@   sat16((2048 + sum_j coefficients[j] * data_in[i - j]) >> 12)
@
@ Register roles after the argument checks:
@   r0  coefficients_length (becomes the return value on exit)
@   r1  scratch: data read pointer (NEON part) / accumulator (scalar part)
@   r2  output write pointer
@   r3  endpos - factor * 7 during the NEON part, endpos in the scalar part
@   r4  coefficient stride -2 (NEON part) / coefficient value (scalar part)
@   r5  factor (temporarily factor * 2 inside the general NEON case)
@   r6  scalar data read pointer
@   r7  coefficient read pointer, walks downwards
@   r8  &coefficients[coefficients_length]
@   r9  outer loop counter i
@   r10 pointer rewind offset (NEON part) / data sample (scalar part)
@   r11 &data_in[i]
@   r12 delay, later the inner loop counter
@ NEON registers used: q0-q3 and d16-d21.
@ -----------------------------------------------------------------------------
WebRtcSpl_DownsampleFast:
  .fnstart
  .save {r4-r11}
  push {r4-r11}                     @ 32 bytes: stack arguments now at sp + 32.

  cmp r3, #0                        @ data_out_length <= 0?
  movle r0, #-1
  ble END

  @ delay and factor are declared as int in the C prototype, so read the full
  @ word and use a 32-bit multiply; halfword forms (ldrsh/smulbb) would only
  @ match the C reference for arguments that fit in 16 bits.
  ldr r12, [sp, #44]                @ r12: delay
  ldr r5, [sp, #40]                 @ r5: factor
  add r4, r12, #1                   @ r4: delay + 1
  sub r3, r3, #1                    @ r3: data_out_length - 1
  mul r3, r5, r3                    @ r3: factor * (data_out_length - 1)
  ldr r8, [sp, #32]                 @ r8: &coefficients[0]
  mov r9, r12                       @ r9: outer loop counter i, starts at delay.
  add r3, r4                        @ r3: endpos
  cmp r3, r1                        @ data_in_length < endpos?
  movgt r0, #-1
  bgt END

  @ Initializations.
  sub r3, r5, asl #3
  add r11, r0, r12, asl #1          @ r11: &data_in[delay]
  ldr r0, [sp, #36]                 @ r0: coefficients_length
  add r3, r5                        @ r3: endpos - factor * 7
  cmp r0, #0                        @ coefficients_length <= 0?
  movle r0, #-1
  ble END
  add r8, r0, asl #1                @ r8: &coefficients[coefficients_length]
  cmp r9, r3
  bge POST_LOOP_ENDPOS              @ Fewer than 8 outputs: scalar loop only.

@
@ First part: 8 outputs per pass, with 3 subcases (factor == 2, 4, others).
@
  mov r4, #-2                       @ Coefficients are read from last to first.

  @ r10 is the post-increment applied to the data pointer by the last load of
  @ an inner iteration: it undoes the bytes advanced by the preceding load(s)
  @ and then steps 2 bytes forward, i.e. one sample towards data_in[i].

  @ factor == 2: the first vld2 advanced the pointer by 16 bytes.
  cmp r5, #2
  moveq r10, #-14
  beq LOOP_ENDPOS_FACTOR2

  @ factor == 4: the first vld4 advanced the pointer by 32 bytes.
  cmp r5, #4
  moveq r10, #-30
  beq LOOP_ENDPOS_FACTOR4

  @ General case: seven lane loads advanced the pointer by factor * 7 * 2
  @ bytes, so r10 = 2 - factor * 14.
  mov r10, r5, asl #4
  rsb r10, #2
  add r10, r5, asl #1
  lsl r5, #1                        @ r5 = factor * sizeof(int16_t)

@ The general case (factor != 2 && factor != 4).
LOOP_ENDPOS_GENERAL:
  @ Initializations.
  vmov.i32 q2, #2048                @ Rounding value, 0.5 in Q12.
  vmov.i32 q3, #2048
  sub r7, r8, #2                    @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                   @ coefficients_length - 1
  sub r1, r11, r12, asl #1          @ &data_in[i - j]

LOOP_COEFF_LENGTH_GENERAL:
  vld1.16 {d2[], d3[]}, [r7], r4    @ coefficients[j], copied to all lanes
  vld1.16 d0[0], [r1], r5           @ data_in[i - j]
  vld1.16 d0[1], [r1], r5           @ data_in[i + factor - j]
  vld1.16 d0[2], [r1], r5           @ data_in[i + factor * 2 - j]
  vld1.16 d0[3], [r1], r5           @ data_in[i + factor * 3 - j]
  vld1.16 d1[0], [r1], r5           @ data_in[i + factor * 4 - j]
  vld1.16 d1[1], [r1], r5           @ data_in[i + factor * 5 - j]
  vld1.16 d1[2], [r1], r5           @ data_in[i + factor * 6 - j]
  vld1.16 d1[3], [r1], r10          @ data_in[i + factor * 7 - j]
  subs r12, #1
  vmlal.s16 q2, d0, d2
  vmlal.s16 q3, d1, d3
  bge LOOP_COEFF_LENGTH_GENERAL     @ Uses the flags set by subs above.

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  @ r5 holds factor * 2 here, hence the smaller shift amounts.
  add r11, r5, asl #3               @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #2                @ i += factor * 8
  cmp r9, r3                        @ i < endpos - factor * 7 ?
  blt LOOP_ENDPOS_GENERAL
  asr r5, #1                        @ Restore r5 to the value of factor.
  b POST_LOOP_ENDPOS

@ The case for factor == 2.
LOOP_ENDPOS_FACTOR2:
  @ Initializations.
  vmov.i32 q2, #2048                @ Rounding value, 0.5 in Q12.
  vmov.i32 q3, #2048
  sub r7, r8, #2                    @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                   @ coefficients_length - 1
  sub r1, r11, r12, asl #1          @ &data_in[i - j]

LOOP_COEFF_LENGTH_FACTOR2:
  @ vld2 de-interleaves: d0 and d2 receive every second sample.
  @ NOTE(review): the second vld2 loads samples i+8-j .. i+15-j while only
  @ offsets up to i+14-j are used, so for j == 0 in the last unrolled pass it
  @ may read one sample beyond data_in[endpos - 1]. Confirm that callers
  @ keep that memory readable.
  vld1.16 {d16[], d17[]}, [r7], r4  @ coefficients[j], copied to all lanes
  vld2.16 {d0, d1}, [r1]!           @ data_in[]
  vld2.16 {d2, d3}, [r1], r10       @ data_in[]
  subs r12, #1
  vmlal.s16 q2, d0, d16
  vmlal.s16 q3, d2, d17
  bge LOOP_COEFF_LENGTH_FACTOR2     @ Uses the flags set by subs above.

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r5, asl #4               @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #3                @ i += factor * 8
  cmp r9, r3                        @ i < endpos - factor * 7 ?
  blt LOOP_ENDPOS_FACTOR2
  b POST_LOOP_ENDPOS

@ The case for factor == 4.
LOOP_ENDPOS_FACTOR4:
  @ Initializations.
  vmov.i32 q2, #2048                @ Rounding value, 0.5 in Q12.
  vmov.i32 q3, #2048
  sub r7, r8, #2                    @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                   @ coefficients_length - 1
  sub r1, r11, r12, asl #1          @ &data_in[i - j]

LOOP_COEFF_LENGTH_FACTOR4:
  @ vld4 de-interleaves: d0 and d18 receive every fourth sample.
  @ NOTE(review): the second vld4 loads samples i+16-j .. i+31-j while only
  @ offsets up to i+28-j are used, so for j == 0 in the last unrolled pass it
  @ may read up to three samples beyond data_in[endpos - 1]. Confirm that
  @ callers keep that memory readable.
  vld1.16 {d16[], d17[]}, [r7], r4  @ coefficients[j], copied to all lanes
  vld4.16 {d0, d1, d2, d3}, [r1]!   @ data_in[]
  vld4.16 {d18, d19, d20, d21}, [r1], r10  @ data_in[]
  subs r12, #1
  vmlal.s16 q2, d0, d16
  vmlal.s16 q3, d18, d17
  bge LOOP_COEFF_LENGTH_FACTOR4     @ Uses the flags set by subs above.

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r5, asl #4               @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #3                @ i += factor * 8
  cmp r9, r3                        @ i < endpos - factor * 7 ?
  blt LOOP_ENDPOS_FACTOR4
  @ Falls through into the scalar part.

@
@ Second part: produce the remaining outputs (if any) one at a time.
@
POST_LOOP_ENDPOS:
  add r3, r5, asl #3
  sub r3, r5                        @ Restore r3 to endpos.
  cmp r9, r3
  movge r0, #0                      @ Nothing left to do: return 0.
  bge END

LOOP2_ENDPOS:
  @ Initializations.
  mov r7, r8                        @ &coefficients[coefficients_length]
  sub r12, r0, #1                   @ coefficients_length - 1
  sub r6, r11, r12, asl #1          @ &data_in[i - j]
  mov r1, #2048                     @ Rounding value, 0.5 in Q12.

LOOP2_COEFF_LENGTH:
  ldrsh r4, [r7, #-2]!              @ coefficients[j]
  ldrsh r10, [r6], #2               @ data_in[i - j]
  smlabb r1, r4, r10, r1
  subs r12, #1
  bge LOOP2_COEFF_LENGTH

  @ Shift, saturate, and store the result.
  ssat r1, #16, r1, asr #12
  strh r1, [r2], #2

  add r11, r5, asl #1               @ r11 -> &data_in[i + factor]
  add r9, r5                        @ i += factor
  cmp r9, r3                        @ i < endpos?
  blt LOOP2_ENDPOS

  mov r0, #0

END:
  pop {r4-r11}
  bx  lr

  .fnend

View File

@ -13,9 +13,9 @@
// TODO(bjornv): Change the return type to report errors.
void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
void WebRtcSpl_FilterARFastQ12(const int16_t* data_in,
int16_t* data_out,
int16_t* __restrict coefficients,
const int16_t* __restrict coefficients,
int coefficients_length,
int data_length) {
int i = 0;

View File

@ -386,35 +386,46 @@ void WebRtcSpl_FilterMAFastQ12(WebRtc_Word16* in_vector,
WebRtc_Word16 ma_coef_length,
WebRtc_Word16 vector_length);
// WebRtcSpl_FilterARFastQ12(...)
//
// Performs a AR filtering on a vector in Q12
//
// Input:
// - data_in : Input samples
// - data_out : State information in positions
// data_out[-order] .. data_out[-1]
// - coefficients : Filter coefficients (in Q12)
// - coefficients_length : Number of coefficients (order+1)
// - data_length : Number of samples to be filtered
//
// - data_in : Input samples
// - data_out : State information in positions
// data_out[-order] .. data_out[-1]
// - coefficients : Filter coefficients (in Q12)
// - coefficients_length: Number of coefficients (order+1)
// - data_length : Number of samples to be filtered
// Output:
// - data_out : Filtered samples
void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
// - data_out : Filtered samples
void WebRtcSpl_FilterARFastQ12(const int16_t* data_in,
int16_t* data_out,
int16_t* __restrict coefficients,
const int16_t* __restrict coefficients,
int coefficients_length,
int data_length);
int WebRtcSpl_DownsampleFast(WebRtc_Word16* in_vector,
WebRtc_Word16 in_vector_length,
WebRtc_Word16* out_vector,
WebRtc_Word16 out_vector_length,
WebRtc_Word16* ma_coef,
WebRtc_Word16 ma_coef_length,
WebRtc_Word16 factor,
WebRtc_Word16 delay);
// Performs a MA down sampling filter on a vector
// Input:
// - data_in : Input samples (state in positions
// data_in[-order] .. data_in[-1])
// - data_in_length : Number of samples in |data_in| to be filtered.
// This must be at least
// |delay| + |factor| * (|data_out_length| - 1) + 1
// - data_out_length : Number of down sampled samples desired
// - coefficients : Filter coefficients (in Q12)
// - coefficients_length: Number of coefficients (order+1)
// - factor : Decimation factor
// - delay : Delay of filter (compensated for in |data_out|)
// Output:
// - data_out : Filtered samples
// Return value : 0 if OK, -1 if |data_in| is too short or if
// |data_out_length| or |coefficients_length| is not positive
int WebRtcSpl_DownsampleFast(const int16_t* data_in,
int data_in_length,
int16_t* data_out,
int data_out_length,
const int16_t* __restrict coefficients,
int coefficients_length,
int factor,
int delay);
// End: Filter operations.
// FFT operations
@ -1454,28 +1465,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
// - out_vector : Filtered samples
//
//
// WebRtcSpl_DownsampleFast(...)
//
// Performs a MA down sampling filter on a vector
//
// Input:
// - in_vector : Input samples (state in positions
// in_vector[-order] .. in_vector[-1])
// - in_vector_length : Number of samples in |in_vector| to be filtered.
// This must be at least
// |delay| + |factor|*(|out_vector_length|-1) + 1)
// - out_vector_length : Number of down sampled samples desired
// - ma_coef : Filter coefficients (in Q12)
// - ma_coef_length : Number of B coefficients (order+1)
// - factor : Decimation factor
// - delay : Delay of filter (compensated for in out_vector)
//
// Output:
// - out_vector : Filtered samples
//
// Return value : 0 if OK, -1 if |in_vector| is too short
//
//
// WebRtcSpl_DotProductWithScale(...)