Optimized WebRtcSpl_ComplexBitReverse() for general ARM platforms and generic C.

In ARMv5, the cycles were reduced by 88% (weight in VoE reduced from 3.554% to 0.432%). The tradeoff is a memory increase of 704 bytes.
Review URL: https://webrtc-codereview.appspot.com/388003

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1757 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2012-02-23 22:38:56 +00:00
parent 3cc03be51f
commit bfa7f96d1e
5 changed files with 241 additions and 61 deletions

View File

@ -20,7 +20,6 @@ LOCAL_SRC_FILES := \
auto_corr_to_refl_coef.c \
auto_correlation.c \
complex_fft.c \
complex_bit_reverse.c \
copy_set_operations.c \
division_operations.c \
dot_product_with_scale.c \
@ -77,9 +76,11 @@ endif
ifeq ($(TARGET_ARCH),arm)
LOCAL_SRC_FILES += \
complex_bit_reverse_arm.s \
spl_sqrt_floor.s
else
LOCAL_SRC_FILES += \
complex_bit_reverse.c \
spl_sqrt_floor.c
endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -8,44 +8,102 @@
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This file contains the function WebRtcSpl_ComplexBitReverse().
* The description header can be found in signal_processing_library.h
*
*/
#include "signal_processing_library.h"
void WebRtcSpl_ComplexBitReverse(WebRtc_Word16 frfi[], int stages)
{
int mr, nn, n, l, m;
WebRtc_Word16 tr, ti;
/* Tables for data buffer indexes that are bit reversed and thus need to be
* swapped. Note that, index_7[{0, 2, 4, ...}] are for the left side of the swap
* operations, while index_7[{1, 3, 5, ...}] are for the right side of the
* operation. Same for index_8.
*/
n = 1 << stages;
/* Indexes for the case of stages == 7. */
static const int16_t index_7[112] = {
1, 64, 2, 32, 3, 96, 4, 16, 5, 80, 6, 48, 7, 112, 9, 72, 10, 40, 11, 104,
12, 24, 13, 88, 14, 56, 15, 120, 17, 68, 18, 36, 19, 100, 21, 84, 22, 52,
23, 116, 25, 76, 26, 44, 27, 108, 29, 92, 30, 60, 31, 124, 33, 66, 35, 98,
37, 82, 38, 50, 39, 114, 41, 74, 43, 106, 45, 90, 46, 58, 47, 122, 49, 70,
51, 102, 53, 86, 55, 118, 57, 78, 59, 110, 61, 94, 63, 126, 67, 97, 69,
81, 71, 113, 75, 105, 77, 89, 79, 121, 83, 101, 87, 117, 91, 109, 95, 125,
103, 115, 111, 123
};
mr = 0;
nn = n - 1;
/* Indexes for the case of stages == 8. */
static const int16_t index_8[240] = {
1, 128, 2, 64, 3, 192, 4, 32, 5, 160, 6, 96, 7, 224, 8, 16, 9, 144, 10, 80,
11, 208, 12, 48, 13, 176, 14, 112, 15, 240, 17, 136, 18, 72, 19, 200, 20,
40, 21, 168, 22, 104, 23, 232, 25, 152, 26, 88, 27, 216, 28, 56, 29, 184,
30, 120, 31, 248, 33, 132, 34, 68, 35, 196, 37, 164, 38, 100, 39, 228, 41,
148, 42, 84, 43, 212, 44, 52, 45, 180, 46, 116, 47, 244, 49, 140, 50, 76,
51, 204, 53, 172, 54, 108, 55, 236, 57, 156, 58, 92, 59, 220, 61, 188, 62,
124, 63, 252, 65, 130, 67, 194, 69, 162, 70, 98, 71, 226, 73, 146, 74, 82,
75, 210, 77, 178, 78, 114, 79, 242, 81, 138, 83, 202, 85, 170, 86, 106, 87,
234, 89, 154, 91, 218, 93, 186, 94, 122, 95, 250, 97, 134, 99, 198, 101,
166, 103, 230, 105, 150, 107, 214, 109, 182, 110, 118, 111, 246, 113, 142,
115, 206, 117, 174, 119, 238, 121, 158, 123, 222, 125, 190, 127, 254, 131,
193, 133, 161, 135, 225, 137, 145, 139, 209, 141, 177, 143, 241, 147, 201,
149, 169, 151, 233, 155, 217, 157, 185, 159, 249, 163, 197, 167, 229, 171,
213, 173, 181, 175, 245, 179, 205, 183, 237, 187, 221, 191, 253, 199, 227,
203, 211, 207, 243, 215, 235, 223, 251, 239, 247
};
// decimation in time - re-order data
for (m = 1; m <= nn; ++m)
{
l = n;
do
{
l >>= 1;
} while (mr + l > nn);
mr = (mr & (l - 1)) + l;
void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages) {
/* For any specific value of stages, we know exactly the indexes that are
* bit reversed. Currently (Feb. 2012) in WebRTC the only possible values of
* stages are 7 and 8, so we use tables to save unnecessary iterations and
* calculations for these two cases.
*/
if (stages == 7 || stages == 8) {
int m = 0;
int length = 112;
const int16_t* index = index_7;
if (mr <= m)
continue;
tr = frfi[2 * m];
frfi[2 * m] = frfi[2 * mr];
frfi[2 * mr] = tr;
ti = frfi[2 * m + 1];
frfi[2 * m + 1] = frfi[2 * mr + 1];
frfi[2 * mr + 1] = ti;
if (stages == 8) {
length = 240;
index = index_8;
}
/* Decimation in time. Swap the elements with bit-reversed indexes. */
for (m = 0; m < length; m += 2) {
/* We declare an int32_t* type pointer, to load both the 16-bit real
* and imaginary elements from complex_data in one instruction, reducing
* complexity.
*/
int32_t* complex_data_ptr = (int32_t*)complex_data;
int32_t temp = 0;
temp = complex_data_ptr[index[m]]; /* Real and imaginary */
complex_data_ptr[index[m]] = complex_data_ptr[index[m + 1]];
complex_data_ptr[index[m + 1]] = temp;
}
}
else {
int m = 0, mr = 0, l = 0;
int n = 1 << stages;
int nn = n - 1;
/* Decimation in time - re-order data */
for (m = 1; m <= nn; ++m) {
int32_t* complex_data_ptr = (int32_t*)complex_data;
int32_t temp = 0;
/* Find out indexes that are bit-reversed. */
l = n;
do {
l >>= 1;
} while (l > nn - mr);
mr = (mr & (l - 1)) + l;
if (mr <= m) {
continue;
}
/* Swap the elements with bit-reversed indexes.
* This is similar to the loop in the stages == 7 or 8 cases.
*/
temp = complex_data_ptr[m]; /* Real and imaginary */
complex_data_ptr[m] = complex_data_ptr[mr];
complex_data_ptr[mr] = temp;
}
}
}

View File

@ -0,0 +1,126 @@
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ This file contains the function WebRtcSpl_ComplexBitReverse(), optimized
@ for ARMv5 platforms.
@ Reference C code is in file complex_bit_reverse.c. Bit-exact.
.arch armv5
.global WebRtcSpl_ComplexBitReverse
.align 2
@ void WebRtcSpl_ComplexBitReverse(int16_t* complex_data, int stages)
@
@ Swaps, in place, the 32-bit (real, imaginary) pairs of complex_data whose
@ indexes are bit-reverses of each other.
@
@ On entry: r0 = complex_data, r1 = stages.
@ For stages == 7 or 8 the swap pairs are read from the index_7 / index_8
@ tables of byte offsets; any other value of stages computes the bit-reversed
@ indexes on the fly (generic path).
@ r4-r7 are saved on entry and restored at END.
WebRtcSpl_ComplexBitReverse:
.fnstart
push {r4-r7}
@ adr and mov do not alter the flags, so the beq below still tests the cmp.
cmp r1, #7
adr r3, index_7 @ Table pointer.
mov r4, #112 @ Number of table entries (two per swap).
beq PRE_LOOP_STAGES_7_OR_8
cmp r1, #8
adr r3, index_8 @ Table pointer.
mov r4, #240 @ Number of table entries (two per swap).
beq PRE_LOOP_STAGES_7_OR_8
@ Generic path. Register use: r1 = n, r2 = l, r3 = m, r4 = mr,
@ r5 = &complex_data[2 * (m - 1)], r6 = nn, r12 = scratch.
mov r3, #1 @ Initialize m.
mov r1, r3, asl r1 @ n = 1 << stages;
subs r6, r1, #1 @ nn = n - 1;
ble END @ Nothing to swap when nn <= 0.
mov r5, r0 @ &complex_data
mov r4, #0 @ mr = 0;
LOOP_GENERIC:
rsb r12, r4, r6 @ r12 = nn - mr, the bound tested against l below.
mov r2, r1 @ l = n;
LOOP_SHIFT:
asr r2, #1 @ l >>= 1;
cmp r2, r12
bgt LOOP_SHIFT @ Repeat while (l > nn - mr).
sub r12, r2, #1
and r4, r12, r4
add r4, r2 @ mr = (mr & (l - 1)) + l;
cmp r4, r3 @ mr <= m ?
ble UPDATE_REGISTERS @ Skip the swap when mr <= m.
mov r12, r4, asl #2 @ Byte offset of the 32-bit pair at index mr.
ldr r7, [r5, #4] @ complex_data[2 * m, 2 * m + 1].
@ Offset 4 due to m incrementing from 1.
ldr r2, [r0, r12] @ complex_data[2 * mr, 2 * mr + 1].
str r7, [r0, r12]
str r2, [r5, #4]
UPDATE_REGISTERS:
add r3, r3, #1 @ ++m;
add r5, #4 @ Advance to the next 32-bit pair.
cmp r3, r1
bne LOOP_GENERIC @ Loop until m == n, i.e. while (m <= nn).
b END
PRE_LOOP_STAGES_7_OR_8:
add r4, r3, r4, asl #1 @ r4 = end of table (2 bytes per entry).
LOOP_STAGES_7_OR_8:
@ The table entries are byte offsets into complex_data; r3 is post-incremented
@ past each 16-bit entry as it is read.
ldrsh r2, [r3], #2 @ index[m]
ldrsh r5, [r3], #2 @ index[m + 1]
ldr r1, [r0, r2] @ complex_data[index[m], index[m] + 1]
ldr r12, [r0, r5] @ complex_data[index[m + 1], index[m + 1] + 1]
cmp r3, r4 @ Reached the end of the table? Tested by bne below.
str r1, [r0, r5]
str r12, [r0, r2]
bne LOOP_STAGES_7_OR_8
END:
pop {r4-r7}
bx lr
.fnend
@ The index tables. Unlike the tables in the generic C code, which hold indexes
@ of 32-bit (real, imaginary) pairs, the values here are byte offsets into
@ complex_data: four times the C table entries, i.e. twice the index of the
@ corresponding 16-bit element.
.align 2
index_7: @ Indexes for stages == 7.
.hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
.hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
.hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
.hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
.hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
.hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
.hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
.hword 468, 364, 436, 380, 500, 412, 460, 444, 492
index_8: @ Indexes for stages == 8.
.hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
.hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
.hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
.hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
.hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
.hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
.hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
.hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
.hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
.hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
.hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
.hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
.hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
.hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
.hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
.hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
.hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988

View File

@ -429,9 +429,26 @@ int WebRtcSpl_DownsampleFast(const int16_t* data_in,
// End: Filter operations.
// FFT operations
int WebRtcSpl_ComplexFFT(WebRtc_Word16 vector[], int stages, int mode);
int WebRtcSpl_ComplexIFFT(WebRtc_Word16 vector[], int stages, int mode);
void WebRtcSpl_ComplexBitReverse(WebRtc_Word16 vector[], int stages);
// Treat a 16-bit complex data buffer |complex_data| as an array of 32-bit
// values, and swap elements whose indexes are bit-reverses of each other.
//
// Input:
// - complex_data : Complex data buffer containing 2^|stages| real
// elements interleaved with 2^|stages| imaginary
// elements: [Re Im Re Im Re Im....]
// - stages : Number of FFT stages. Must be at least 3 and at most
// 10, since the table WebRtcSpl_kSinTable1024[] is 1024
// elements long.
//
// Output:
// - complex_data : The same buffer, reordered in place: elements whose
// indexes are bit-reverses of each other are swapped.
void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages);
// End: FFT operations
/************************************************************
@ -1573,31 +1590,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
// which returns a scale value of -1, indicating error.
//
//
// WebRtcSpl_ComplexBitReverse(...)
//
// Complex Bit Reverse
//
// This function bit-reverses the position of elements in the complex input
// vector into the output vector.
//
// If you bit-reverse a linear-order array, you obtain a bit-reversed order
// array. If you bit-reverse a bit-reversed order array, you obtain a
// linear-order array.
//
// Input:
// - vector : In pointer to complex vector containing 2^|stages| real
// elements interleaved with 2^|stages| imaginary elements.
// [ReImReImReIm....]
// - stages : Number of FFT stages. Must be at least 3 and at most 10,
// since the table WebRtcSpl_kSinTable1024[] is 1024
// elements long.
//
// Output:
// - vector : Out pointer to complex vector in bit-reversed order.
// The input vector is over written.
//
//
// WebRtcSpl_AnalysisQMF(...)
//

View File

@ -15,6 +15,9 @@
#ifndef WEBRTC_SPL_SPL_INL_ARMV7_H_
#define WEBRTC_SPL_SPL_INL_ARMV7_H_
// TODO(kma): Replace some assembly code with GCC intrinsics
// (e.g. __builtin_clz).
static __inline WebRtc_Word32 WEBRTC_SPL_MUL_16_32_RSFT16(WebRtc_Word16 a,
WebRtc_Word32 b) {
WebRtc_Word32 tmp;