Optimized function WebRtcSpl_DownsampleFast for ARM-NEON platform.
Review URL: https://webrtc-codereview.appspot.com/371001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1629 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
236e842bca
commit
551fcc04ec
@ -24,7 +24,6 @@ LOCAL_SRC_FILES := \
|
||||
copy_set_operations.c \
|
||||
division_operations.c \
|
||||
dot_product_with_scale.c \
|
||||
downsample_fast.c \
|
||||
energy.c \
|
||||
filter_ar.c \
|
||||
filter_ma_fast_q12.c \
|
||||
@ -58,12 +57,14 @@ LOCAL_C_INCLUDES := \
|
||||
ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||
LOCAL_SRC_FILES += \
|
||||
min_max_operations_neon.c \
|
||||
cross_correlation_neon.s
|
||||
cross_correlation_neon.s \
|
||||
downsample_fast_neon.s
|
||||
LOCAL_CFLAGS += \
|
||||
$(MY_ARM_CFLAGS_NEON)
|
||||
else
|
||||
LOCAL_SRC_FILES += \
|
||||
cross_correlation.c
|
||||
cross_correlation.c \
|
||||
downsample_fast.c
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
@ -8,52 +8,40 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* This file contains the function WebRtcSpl_DownsampleFast().
|
||||
* The description header can be found in signal_processing_library.h
|
||||
*
|
||||
*/
|
||||
|
||||
#include "signal_processing_library.h"
|
||||
|
||||
int WebRtcSpl_DownsampleFast(WebRtc_Word16 *in_ptr, WebRtc_Word16 in_length,
|
||||
WebRtc_Word16 *out_ptr, WebRtc_Word16 out_length,
|
||||
WebRtc_Word16 *B, WebRtc_Word16 B_length, WebRtc_Word16 factor,
|
||||
WebRtc_Word16 delay)
|
||||
{
|
||||
WebRtc_Word32 o;
|
||||
int i, j;
|
||||
// TODO(Bjornv): Change the function parameter order to WebRTC code style.
|
||||
int WebRtcSpl_DownsampleFast(const int16_t* data_in,
|
||||
int data_in_length,
|
||||
int16_t* data_out,
|
||||
int data_out_length,
|
||||
const int16_t* __restrict coefficients,
|
||||
int coefficients_length,
|
||||
int factor,
|
||||
int delay) {
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int32_t out_s32 = 0;
|
||||
int endpos = delay + factor * (data_out_length - 1) + 1;
|
||||
|
||||
WebRtc_Word16 *downsampled_ptr = out_ptr;
|
||||
WebRtc_Word16 *b_ptr;
|
||||
WebRtc_Word16 *x_ptr;
|
||||
WebRtc_Word16 endpos = delay
|
||||
+ (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(factor, (out_length - 1)) + 1;
|
||||
// Return an error if any of the preconditions is not met.
|
||||
if (data_out_length <= 0 || coefficients_length <= 0
|
||||
|| data_in_length < endpos) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (in_length < endpos)
|
||||
{
|
||||
return -1;
|
||||
for (i = delay; i < endpos; i += factor) {
|
||||
out_s32 = 2048; // Round value, 0.5 in Q12.
|
||||
|
||||
for (j = 0; j < coefficients_length; j++) {
|
||||
out_s32 += coefficients[j] * data_in[i - j]; // Q12.
|
||||
}
|
||||
|
||||
for (i = delay; i < endpos; i += factor)
|
||||
{
|
||||
b_ptr = &B[0];
|
||||
x_ptr = &in_ptr[i];
|
||||
out_s32 >>= 12; // Q0.
|
||||
|
||||
o = (WebRtc_Word32)2048; // Round val
|
||||
// Saturate and store the output.
|
||||
*data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
|
||||
}
|
||||
|
||||
for (j = 0; j < B_length; j++)
|
||||
{
|
||||
o += WEBRTC_SPL_MUL_16_16(*b_ptr++, *x_ptr--);
|
||||
}
|
||||
|
||||
o = WEBRTC_SPL_RSHIFT_W32(o, 12);
|
||||
|
||||
// If output is higher than 32768, saturate it. Same with negative side
|
||||
|
||||
*downsampled_ptr++ = WebRtcSpl_SatW32ToW16(o);
|
||||
}
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
222
src/common_audio/signal_processing/downsample_fast_neon.s
Normal file
222
src/common_audio/signal_processing/downsample_fast_neon.s
Normal file
@ -0,0 +1,222 @@
|
||||
@
|
||||
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
@
|
||||
@ Use of this source code is governed by a BSD-style license
|
||||
@ that can be found in the LICENSE file in the root of the source
|
||||
@ tree. An additional intellectual property rights grant can be found
|
||||
@ in the file PATENTS. All contributing project authors may
|
||||
@ be found in the AUTHORS file in the root of the source tree.
|
||||
@
|
||||
|
||||
@ This file contains the function WebRtcSpl_DownsampleFast(), optimized for
|
||||
@ ARM Neon platform. The description header can be found in
|
||||
@ signal_processing_library.h
|
||||
@
|
||||
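@ The reference C code is in file downsample_fast.c. Bit-exact as long as factor, delay and data_out_length - 1 fit in 16 bits (they are read with ldrsh / multiplied with smulbb below).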
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
.align 2
|
||||
.global WebRtcSpl_DownsampleFast
|
||||
|
||||
WebRtcSpl_DownsampleFast:
|
||||
|
||||
.fnstart
|
||||
|
||||
.save {r4-r11}
|
||||
push {r4-r11}
|
||||
|
||||
cmp r3, #0 @ data_out_length <= 0?
|
||||
movle r0, #-1
|
||||
ble END
|
||||
|
||||
ldrsh r12, [sp, #44]
|
||||
ldr r5, [sp, #40] @ r5: factor
|
||||
add r4, r12, #1 @ r4: delay + 1
|
||||
sub r3, r3, #1 @ r3: data_out_length - 1
|
||||
smulbb r3, r5, r3
|
||||
ldr r8, [sp, #32] @ &coefficients[0]
|
||||
mov r9, r12 @ Iteration counter for outer loops.
|
||||
add r3, r4 @ delay + factor * (out_length-1) +1
|
||||
|
||||
cmp r3, r1 @ data_in_length < endpos?
|
||||
movgt r0, #-1
|
||||
bgt END
|
||||
|
||||
@ Initializations.
|
||||
sub r3, r5, asl #3
|
||||
add r11, r0, r12, asl #1 @ &data_in[delay]
|
||||
ldr r0, [sp, #36] @ coefficients_length
|
||||
add r3, r5 @ endpos - factor * 7
|
||||
|
||||
cmp r0, #0 @ coefficients_length <= 0 ?
|
||||
movle r0, #-1
|
||||
ble END
|
||||
|
||||
add r8, r0, asl #1 @ &coefficients[coefficients_length]
|
||||
cmp r9, r3
|
||||
bge POST_LOOP_ENDPOS @ Branch when fewer than 8 iterations remain.
|
||||
|
||||
@
|
||||
@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
|
||||
@
|
||||
mov r4, #-2
|
||||
|
||||
@ Direct program flow to the loop variant that matches the value of factor.
|
||||
|
||||
@ r10 is an offset to &data_in[] in the loop. After an iteration, we need to
|
||||
@ move the pointer back to original after advancing 16 bytes by a vld2, and
|
||||
@ then move 2 bytes forward to increment one more sample.
|
||||
cmp r5, #2
|
||||
moveq r10, #-14
|
||||
beq LOOP_ENDPOS_FACTOR2 @ Branch when factor == 2
|
||||
|
||||
@ Similar here, for r10, we need to move the pointer back to original after
|
||||
@ advancing 32 bytes, then move 2 bytes forward to increment one sample.
|
||||
cmp r5, #4
|
||||
moveq r10, #-30
|
||||
beq LOOP_ENDPOS_FACTOR4 @ Branch when factor == 4
|
||||
|
||||
@ For r10, we need to move the pointer back to original after advancing
|
||||
@ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample.
|
||||
mov r10, r5, asl #4
|
||||
rsb r10, #2
|
||||
add r10, r5, asl #1
|
||||
lsl r5, #1 @ r5 = factor * sizeof(data_in)
|
||||
|
||||
@ The general case (factor != 2 && factor != 4)
|
||||
LOOP_ENDPOS_GENERAL:
|
||||
@ Initializations.
|
||||
vmov.i32 q2, #2048
|
||||
vmov.i32 q3, #2048
|
||||
sub r7, r8, #2
|
||||
sub r12, r0, #1 @ coefficients_length - 1
|
||||
sub r1, r11, r12, asl #1 @ &data_in[i - j]
|
||||
|
||||
LOOP_COEFF_LENGTH_GENERAL:
|
||||
vld1.16 {d2[], d3[]}, [r7], r4 @ coefficients[j]
|
||||
vld1.16 d0[0], [r1], r5 @ data_in[i - j]
|
||||
vld1.16 d0[1], [r1], r5 @ data_in[i + factor - j]
|
||||
vld1.16 d0[2], [r1], r5 @ data_in[i + factor * 2 - j]
|
||||
vld1.16 d0[3], [r1], r5 @ data_in[i + factor * 3 - j]
|
||||
vld1.16 d1[0], [r1], r5 @ data_in[i + factor * 4 - j]
|
||||
vld1.16 d1[1], [r1], r5 @ data_in[i + factor * 5 - j]
|
||||
vld1.16 d1[2], [r1], r5 @ data_in[i + factor * 6 - j]
|
||||
vld1.16 d1[3], [r1], r10 @ data_in[i + factor * 7 - j]
|
||||
subs r12, #1
|
||||
vmlal.s16 q2, d0, d2
|
||||
vmlal.s16 q3, d1, d3
|
||||
bge LOOP_COEFF_LENGTH_GENERAL
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
vqshrn.s32 d0, q2, #12
|
||||
vqshrn.s32 d1, q3, #12
|
||||
vst1.16 {d0, d1}, [r2]!
|
||||
|
||||
add r11, r5, asl #3 @ r11 -> &data_in[i + factor * 8]
|
||||
add r9, r5, asl #2 @ Counter i += factor * 8 (r5 holds factor * 2 here).
|
||||
cmp r9, r3 @ i < endpos - factor * 7 ?
|
||||
blt LOOP_ENDPOS_GENERAL
|
||||
asr r5, #1 @ Restore r5 to the value of factor.
|
||||
b POST_LOOP_ENDPOS
|
||||
|
||||
@ The case for factor == 2.
|
||||
LOOP_ENDPOS_FACTOR2:
|
||||
@ Initializations.
|
||||
vmov.i32 q2, #2048
|
||||
vmov.i32 q3, #2048
|
||||
sub r7, r8, #2
|
||||
sub r12, r0, #1 @ coefficients_length - 1
|
||||
sub r1, r11, r12, asl #1 @ &data_in[i - j]
|
||||
|
||||
LOOP_COEFF_LENGTH_FACTOR2:
|
||||
vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j]
|
||||
vld2.16 {d0, d1}, [r1]! @ data_in[]
|
||||
vld2.16 {d2, d3}, [r1], r10 @ data_in[]
|
||||
subs r12, #1
|
||||
vmlal.s16 q2, d0, d16
|
||||
vmlal.s16 q3, d2, d17
|
||||
bge LOOP_COEFF_LENGTH_FACTOR2
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
vqshrn.s32 d0, q2, #12
|
||||
vqshrn.s32 d1, q3, #12
|
||||
vst1.16 {d0, d1}, [r2]!
|
||||
|
||||
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
|
||||
add r9, r5, asl #3 @ Counter i += factor * 8.
|
||||
cmp r9, r3 @ i < endpos - factor * 7 ?
|
||||
blt LOOP_ENDPOS_FACTOR2
|
||||
b POST_LOOP_ENDPOS
|
||||
|
||||
@ The case for factor == 4.
|
||||
LOOP_ENDPOS_FACTOR4:
|
||||
@ Initializations.
|
||||
vmov.i32 q2, #2048
|
||||
vmov.i32 q3, #2048
|
||||
sub r7, r8, #2
|
||||
sub r12, r0, #1 @ coefficients_length - 1
|
||||
sub r1, r11, r12, asl #1 @ &data_in[i - j]
|
||||
|
||||
LOOP_COEFF_LENGTH_FACTOR4:
|
||||
vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j]
|
||||
vld4.16 {d0, d1, d2, d3}, [r1]! @ data_in[]
|
||||
vld4.16 {d18, d19, d20, d21}, [r1], r10 @ data_in[]
|
||||
subs r12, #1
|
||||
vmlal.s16 q2, d0, d16
|
||||
vmlal.s16 q3, d18, d17
|
||||
bge LOOP_COEFF_LENGTH_FACTOR4
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
vqshrn.s32 d0, q2, #12
|
||||
vqshrn.s32 d1, q3, #12
|
||||
vst1.16 {d0, d1}, [r2]!
|
||||
|
||||
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
|
||||
add r9, r5, asl #3 @ Counter i += factor * 8.
|
||||
cmp r9, r3 @ i < endpos - factor * 7 ?
|
||||
blt LOOP_ENDPOS_FACTOR4
|
||||
|
||||
@
|
||||
@ Second part, do the rest iterations (if any).
|
||||
@
|
||||
|
||||
POST_LOOP_ENDPOS:
|
||||
add r3, r5, asl #3
|
||||
sub r3, r5 @ Restore r3 to endpos.
|
||||
cmp r9, r3
|
||||
movge r0, #0
|
||||
bge END
|
||||
|
||||
LOOP2_ENDPOS:
|
||||
@ Initializations.
|
||||
mov r7, r8
|
||||
sub r12, r0, #1 @ coefficients_length - 1
|
||||
sub r6, r11, r12, asl #1 @ &data_in[i - j]
|
||||
|
||||
mov r1, #2048
|
||||
|
||||
LOOP2_COEFF_LENGTH:
|
||||
ldrsh r4, [r7, #-2]! @ coefficients[j]
|
||||
ldrsh r10, [r6], #2 @ data_in[i - j]
|
||||
smlabb r1, r4, r10, r1
|
||||
subs r12, #1
|
||||
bge LOOP2_COEFF_LENGTH
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
ssat r1, #16, r1, asr #12
|
||||
strh r1, [r2], #2
|
||||
|
||||
add r11, r5, asl #1 @ r11 -> &data_in[i + factor]
|
||||
add r9, r5 @ Counter i += factor.
|
||||
cmp r9, r3 @ i < endpos?
|
||||
blt LOOP2_ENDPOS
|
||||
|
||||
mov r0, #0
|
||||
|
||||
END:
|
||||
pop {r4-r11}
|
||||
bx lr
|
||||
|
||||
.fnend
|
@ -13,9 +13,9 @@
|
||||
|
||||
// TODO(bjornv): Change the return type to report errors.
|
||||
|
||||
void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
|
||||
void WebRtcSpl_FilterARFastQ12(const int16_t* data_in,
|
||||
int16_t* data_out,
|
||||
int16_t* __restrict coefficients,
|
||||
const int16_t* __restrict coefficients,
|
||||
int coefficients_length,
|
||||
int data_length) {
|
||||
int i = 0;
|
||||
|
@ -386,35 +386,46 @@ void WebRtcSpl_FilterMAFastQ12(WebRtc_Word16* in_vector,
|
||||
WebRtc_Word16 ma_coef_length,
|
||||
WebRtc_Word16 vector_length);
|
||||
|
||||
// WebRtcSpl_FilterARFastQ12(...)
|
||||
//
|
||||
// Performs AR filtering on a vector in Q12
|
||||
//
|
||||
// Input:
|
||||
// - data_in : Input samples
|
||||
// - data_out : State information in positions
|
||||
// data_out[-order] .. data_out[-1]
|
||||
// - coefficients : Filter coefficients (in Q12)
|
||||
// - coefficients_length : Number of coefficients (order+1)
|
||||
// - data_length : Number of samples to be filtered
|
||||
//
|
||||
// - data_in : Input samples
|
||||
// - data_out : State information in positions
|
||||
// data_out[-order] .. data_out[-1]
|
||||
// - coefficients : Filter coefficients (in Q12)
|
||||
// - coefficients_length: Number of coefficients (order+1)
|
||||
// - data_length : Number of samples to be filtered
|
||||
// Output:
|
||||
// - data_out : Filtered samples
|
||||
|
||||
void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
|
||||
// - data_out : Filtered samples
|
||||
void WebRtcSpl_FilterARFastQ12(const int16_t* data_in,
|
||||
int16_t* data_out,
|
||||
int16_t* __restrict coefficients,
|
||||
const int16_t* __restrict coefficients,
|
||||
int coefficients_length,
|
||||
int data_length);
|
||||
|
||||
int WebRtcSpl_DownsampleFast(WebRtc_Word16* in_vector,
|
||||
WebRtc_Word16 in_vector_length,
|
||||
WebRtc_Word16* out_vector,
|
||||
WebRtc_Word16 out_vector_length,
|
||||
WebRtc_Word16* ma_coef,
|
||||
WebRtc_Word16 ma_coef_length,
|
||||
WebRtc_Word16 factor,
|
||||
WebRtc_Word16 delay);
|
||||
// Performs a MA down sampling filter on a vector
|
||||
// Input:
|
||||
// - data_in : Input samples (state in positions
|
||||
// data_in[-order] .. data_in[-1])
|
||||
// - data_in_length : Number of samples in |data_in| to be filtered.
|
||||
// This must be at least
|
||||
// |delay| + |factor| * (|data_out_length| - 1) + 1
|
||||
// - data_out_length : Number of down sampled samples desired
|
||||
// - coefficients : Filter coefficients (in Q12)
|
||||
// - coefficients_length: Number of coefficients (order+1)
|
||||
// - factor : Decimation factor
|
||||
// - delay : Delay of filter (compensated for in |data_out|)
|
||||
// Output:
|
||||
// - data_out : Filtered samples
|
||||
// Return value : 0 if OK, -1 if |data_in| is too short or if |data_out_length| or |coefficients_length| is not positive
|
||||
int WebRtcSpl_DownsampleFast(const int16_t* data_in,
|
||||
int data_in_length,
|
||||
int16_t* data_out,
|
||||
int data_out_length,
|
||||
const int16_t* __restrict coefficients,
|
||||
int coefficients_length,
|
||||
int factor,
|
||||
int delay);
|
||||
|
||||
// End: Filter operations.
|
||||
|
||||
// FFT operations
|
||||
@ -1454,28 +1465,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
|
||||
// - out_vector : Filtered samples
|
||||
//
|
||||
|
||||
//
|
||||
// WebRtcSpl_DownsampleFast(...)
|
||||
//
|
||||
// Performs a MA down sampling filter on a vector
|
||||
//
|
||||
// Input:
|
||||
// - in_vector : Input samples (state in positions
|
||||
// in_vector[-order] .. in_vector[-1])
|
||||
// - in_vector_length : Number of samples in |in_vector| to be filtered.
|
||||
// This must be at least
|
||||
// |delay| + |factor|*(|out_vector_length|-1) + 1)
|
||||
// - out_vector_length : Number of down sampled samples desired
|
||||
// - ma_coef : Filter coefficients (in Q12)
|
||||
// - ma_coef_length : Number of B coefficients (order+1)
|
||||
// - factor : Decimation factor
|
||||
// - delay : Delay of filter (compensated for in out_vector)
|
||||
//
|
||||
// Output:
|
||||
// - out_vector : Filtered samples
|
||||
//
|
||||
// Return value : 0 if OK, -1 if |in_vector| is too short
|
||||
//
|
||||
|
||||
//
|
||||
// WebRtcSpl_DotProductWithScale(...)
|
||||
|
Loading…
x
Reference in New Issue
Block a user