Optimized WebRtcSpl_ComplexBitReverse() for general ARM platforms and generic C.
In ARMv5, the cycles were reduced by 88% (weight in VoE reduced from 3.554% to 0.432%). The tradeoff is a memory increase of 704 bytes. Review URL: https://webrtc-codereview.appspot.com/388003 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1757 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
3cc03be51f
commit
bfa7f96d1e
@ -20,7 +20,6 @@ LOCAL_SRC_FILES := \
|
||||
auto_corr_to_refl_coef.c \
|
||||
auto_correlation.c \
|
||||
complex_fft.c \
|
||||
complex_bit_reverse.c \
|
||||
copy_set_operations.c \
|
||||
division_operations.c \
|
||||
dot_product_with_scale.c \
|
||||
@ -77,9 +76,11 @@ endif
|
||||
|
||||
ifeq ($(TARGET_ARCH),arm)
|
||||
LOCAL_SRC_FILES += \
|
||||
complex_bit_reverse_arm.s \
|
||||
spl_sqrt_floor.s
|
||||
else
|
||||
LOCAL_SRC_FILES += \
|
||||
complex_bit_reverse.c \
|
||||
spl_sqrt_floor.c
|
||||
endif
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
@ -8,44 +8,102 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* This file contains the function WebRtcSpl_ComplexBitReverse().
|
||||
* The description header can be found in signal_processing_library.h
|
||||
*
|
||||
*/
|
||||
|
||||
#include "signal_processing_library.h"
|
||||
|
||||
void WebRtcSpl_ComplexBitReverse(WebRtc_Word16 frfi[], int stages)
|
||||
{
|
||||
int mr, nn, n, l, m;
|
||||
WebRtc_Word16 tr, ti;
|
||||
/* Tables for data buffer indexes that are bit reversed and thus need to be
|
||||
* swapped. Note that, index_7[{0, 2, 4, ...}] are for the left side of the swap
|
||||
* operations, while index_7[{1, 3, 5, ...}] are for the right side of the
|
||||
* operation. Same for index_8.
|
||||
*/
|
||||
|
||||
n = 1 << stages;
|
||||
/* Indexes for the case of stages == 7. */
|
||||
static const int16_t index_7[112] = {
|
||||
1, 64, 2, 32, 3, 96, 4, 16, 5, 80, 6, 48, 7, 112, 9, 72, 10, 40, 11, 104,
|
||||
12, 24, 13, 88, 14, 56, 15, 120, 17, 68, 18, 36, 19, 100, 21, 84, 22, 52,
|
||||
23, 116, 25, 76, 26, 44, 27, 108, 29, 92, 30, 60, 31, 124, 33, 66, 35, 98,
|
||||
37, 82, 38, 50, 39, 114, 41, 74, 43, 106, 45, 90, 46, 58, 47, 122, 49, 70,
|
||||
51, 102, 53, 86, 55, 118, 57, 78, 59, 110, 61, 94, 63, 126, 67, 97, 69,
|
||||
81, 71, 113, 75, 105, 77, 89, 79, 121, 83, 101, 87, 117, 91, 109, 95, 125,
|
||||
103, 115, 111, 123
|
||||
};
|
||||
|
||||
mr = 0;
|
||||
nn = n - 1;
|
||||
/* Indexes for the case of stages == 8. */
|
||||
static const int16_t index_8[240] = {
|
||||
1, 128, 2, 64, 3, 192, 4, 32, 5, 160, 6, 96, 7, 224, 8, 16, 9, 144, 10, 80,
|
||||
11, 208, 12, 48, 13, 176, 14, 112, 15, 240, 17, 136, 18, 72, 19, 200, 20,
|
||||
40, 21, 168, 22, 104, 23, 232, 25, 152, 26, 88, 27, 216, 28, 56, 29, 184,
|
||||
30, 120, 31, 248, 33, 132, 34, 68, 35, 196, 37, 164, 38, 100, 39, 228, 41,
|
||||
148, 42, 84, 43, 212, 44, 52, 45, 180, 46, 116, 47, 244, 49, 140, 50, 76,
|
||||
51, 204, 53, 172, 54, 108, 55, 236, 57, 156, 58, 92, 59, 220, 61, 188, 62,
|
||||
124, 63, 252, 65, 130, 67, 194, 69, 162, 70, 98, 71, 226, 73, 146, 74, 82,
|
||||
75, 210, 77, 178, 78, 114, 79, 242, 81, 138, 83, 202, 85, 170, 86, 106, 87,
|
||||
234, 89, 154, 91, 218, 93, 186, 94, 122, 95, 250, 97, 134, 99, 198, 101,
|
||||
166, 103, 230, 105, 150, 107, 214, 109, 182, 110, 118, 111, 246, 113, 142,
|
||||
115, 206, 117, 174, 119, 238, 121, 158, 123, 222, 125, 190, 127, 254, 131,
|
||||
193, 133, 161, 135, 225, 137, 145, 139, 209, 141, 177, 143, 241, 147, 201,
|
||||
149, 169, 151, 233, 155, 217, 157, 185, 159, 249, 163, 197, 167, 229, 171,
|
||||
213, 173, 181, 175, 245, 179, 205, 183, 237, 187, 221, 191, 253, 199, 227,
|
||||
203, 211, 207, 243, 215, 235, 223, 251, 239, 247
|
||||
};
|
||||
|
||||
// decimation in time - re-order data
|
||||
for (m = 1; m <= nn; ++m)
|
||||
{
|
||||
l = n;
|
||||
do
|
||||
{
|
||||
l >>= 1;
|
||||
} while (mr + l > nn);
|
||||
mr = (mr & (l - 1)) + l;
|
||||
void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages) {
|
||||
/* For any specific value of stages, we know exactly the indexes that are
|
||||
* bit reversed. Currently (Feb. 2012) in WebRTC the only possible values of
|
||||
* stages are 7 and 8, so we use tables to save unnecessary iterations and
|
||||
* calculations for these two cases.
|
||||
*/
|
||||
if (stages == 7 || stages == 8) {
|
||||
int m = 0;
|
||||
int length = 112;
|
||||
const int16_t* index = index_7;
|
||||
|
||||
if (mr <= m)
|
||||
continue;
|
||||
|
||||
tr = frfi[2 * m];
|
||||
frfi[2 * m] = frfi[2 * mr];
|
||||
frfi[2 * mr] = tr;
|
||||
|
||||
ti = frfi[2 * m + 1];
|
||||
frfi[2 * m + 1] = frfi[2 * mr + 1];
|
||||
frfi[2 * mr + 1] = ti;
|
||||
if (stages == 8) {
|
||||
length = 240;
|
||||
index = index_8;
|
||||
}
|
||||
|
||||
/* Decimation in time. Swap the elements with bit-reversed indexes. */
|
||||
for (m = 0; m < length; m += 2) {
|
||||
/* We declare an int32_t* type pointer, to load both the 16-bit real
|
||||
* and imaginary elements from complex_data in one instruction, reducing
|
||||
* complexity.
|
||||
*/
|
||||
int32_t* complex_data_ptr = (int32_t*)complex_data;
|
||||
int32_t temp = 0;
|
||||
|
||||
temp = complex_data_ptr[index[m]]; /* Real and imaginary */
|
||||
complex_data_ptr[index[m]] = complex_data_ptr[index[m + 1]];
|
||||
complex_data_ptr[index[m + 1]] = temp;
|
||||
}
|
||||
}
|
||||
else {
|
||||
int m = 0, mr = 0, l = 0;
|
||||
int n = 1 << stages;
|
||||
int nn = n - 1;
|
||||
|
||||
/* Decimation in time - re-order data */
|
||||
for (m = 1; m <= nn; ++m) {
|
||||
int32_t* complex_data_ptr = (int32_t*)complex_data;
|
||||
int32_t temp = 0;
|
||||
|
||||
/* Find out indexes that are bit-reversed. */
|
||||
l = n;
|
||||
do {
|
||||
l >>= 1;
|
||||
} while (l > nn - mr);
|
||||
mr = (mr & (l - 1)) + l;
|
||||
|
||||
if (mr <= m) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Swap the elements with bit-reversed indexes.
|
||||
* This is similar to the loop in the stages == 7 or 8 cases.
|
||||
*/
|
||||
temp = complex_data_ptr[m]; /* Real and imaginary */
|
||||
complex_data_ptr[m] = complex_data_ptr[mr];
|
||||
complex_data_ptr[mr] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
126
src/common_audio/signal_processing/complex_bit_reverse_arm.s
Normal file
126
src/common_audio/signal_processing/complex_bit_reverse_arm.s
Normal file
@ -0,0 +1,126 @@
|
||||
@
|
||||
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
@
|
||||
@ Use of this source code is governed by a BSD-style license
|
||||
@ that can be found in the LICENSE file in the root of the source
|
||||
@ tree. An additional intellectual property rights grant can be found
|
||||
@ in the file PATENTS. All contributing project authors may
|
||||
@ be found in the AUTHORS file in the root of the source tree.
|
||||
@
|
||||
|
||||
@ This file contains the function WebRtcSpl_ComplexBitReverse(), optimized
|
||||
@ for ARMv5 platforms.
|
||||
@ Reference C code is in file complex_bit_reverse.c. Bit-exact.
|
||||
|
||||
.arch armv5
|
||||
|
||||
.global WebRtcSpl_ComplexBitReverse
|
||||
|
||||
.align 2
|
||||
|
||||
WebRtcSpl_ComplexBitReverse:
|
||||
.fnstart
|
||||
|
||||
push {r4-r7}
|
||||
|
||||
cmp r1, #7
|
||||
adr r3, index_7 @ Table pointer.
|
||||
mov r4, #112 @ Number of 16-bit table entries.
|
||||
beq PRE_LOOP_STAGES_7_OR_8
|
||||
|
||||
cmp r1, #8
|
||||
adr r3, index_8 @ Table pointer.
|
||||
mov r4, #240 @ Number of 16-bit table entries.
|
||||
beq PRE_LOOP_STAGES_7_OR_8
|
||||
|
||||
mov r3, #1 @ Initialize m.
|
||||
mov r1, r3, asl r1 @ n = 1 << stages;
|
||||
subs r6, r1, #1 @ nn = n - 1;
|
||||
ble END
|
||||
|
||||
mov r5, r0 @ &complex_data
|
||||
mov r4, #0 @ mr
|
||||
|
||||
LOOP_GENERIC:
|
||||
rsb r12, r4, r6 @ l > nn - mr
|
||||
mov r2, r1 @ n
|
||||
|
||||
LOOP_SHIFT:
|
||||
asr r2, #1 @ l >>= 1;
|
||||
cmp r2, r12
|
||||
bgt LOOP_SHIFT
|
||||
|
||||
sub r12, r2, #1
|
||||
and r4, r12, r4
|
||||
add r4, r2 @ mr = (mr & (l - 1)) + l;
|
||||
cmp r4, r3 @ mr <= m ?
|
||||
ble UPDATE_REGISTERS
|
||||
|
||||
mov r12, r4, asl #2
|
||||
ldr r7, [r5, #4] @ complex_data[2 * m, 2 * m + 1].
|
||||
@ Offset 4 due to m incrementing from 1.
|
||||
ldr r2, [r0, r12] @ complex_data[2 * mr, 2 * mr + 1].
|
||||
str r7, [r0, r12]
|
||||
str r2, [r5, #4]
|
||||
|
||||
UPDATE_REGISTERS:
|
||||
add r3, r3, #1
|
||||
add r5, #4
|
||||
cmp r3, r1
|
||||
bne LOOP_GENERIC
|
||||
|
||||
b END
|
||||
|
||||
PRE_LOOP_STAGES_7_OR_8:
|
||||
add r4, r3, r4, asl #1
|
||||
|
||||
LOOP_STAGES_7_OR_8:
|
||||
ldrsh r2, [r3], #2 @ index[m]
|
||||
ldrsh r5, [r3], #2 @ index[m + 1]
|
||||
ldr r1, [r0, r2] @ complex_data[index[m], index[m] + 1]
|
||||
ldr r12, [r0, r5] @ complex_data[index[m + 1], index[m + 1] + 1]
|
||||
cmp r3, r4
|
||||
str r1, [r0, r5]
|
||||
str r12, [r0, r2]
|
||||
bne LOOP_STAGES_7_OR_8
|
||||
|
||||
END:
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
|
||||
@ The index tables. Note the values are doubles of the actual indexes for 16-bit
|
||||
@ elements, different from the generic C code. It actually provides byte offsets
|
||||
@ for the indexes.
|
||||
|
||||
.align 2
|
||||
index_7: @ Indexes for stages == 7.
|
||||
.hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
|
||||
.hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
|
||||
.hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
|
||||
.hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
|
||||
.hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
|
||||
.hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
|
||||
.hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
|
||||
.hword 468, 364, 436, 380, 500, 412, 460, 444, 492
|
||||
|
||||
index_8: @ Indexes for stages == 8.
|
||||
.hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
|
||||
.hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
|
||||
.hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
|
||||
.hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
|
||||
.hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
|
||||
.hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
|
||||
.hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
|
||||
.hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
|
||||
.hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
|
||||
.hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
|
||||
.hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
|
||||
.hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
|
||||
.hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
|
||||
.hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
|
||||
.hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
|
||||
.hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
|
||||
.hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988
|
@ -429,9 +429,26 @@ int WebRtcSpl_DownsampleFast(const int16_t* data_in,
|
||||
// End: Filter operations.
|
||||
|
||||
// FFT operations
|
||||
|
||||
int WebRtcSpl_ComplexFFT(WebRtc_Word16 vector[], int stages, int mode);
|
||||
int WebRtcSpl_ComplexIFFT(WebRtc_Word16 vector[], int stages, int mode);
|
||||
void WebRtcSpl_ComplexBitReverse(WebRtc_Word16 vector[], int stages);
|
||||
|
||||
// Treat a 16-bit complex data buffer |complex_data| as an array of 32-bit
|
||||
// values, and swap elements whose indexes are bit-reverses of each other.
|
||||
//
|
||||
// Input:
|
||||
// - complex_data : Complex data buffer containing 2^|stages| real
|
||||
// elements interleaved with 2^|stages| imaginary
|
||||
// elements: [Re Im Re Im Re Im....]
|
||||
// - stages : Number of FFT stages. Must be at least 3 and at most
|
||||
// 10, since the table WebRtcSpl_kSinTable1024[] is 1024
|
||||
// elements long.
|
||||
//
|
||||
// Output:
|
||||
// - complex_data : The complex data buffer.
|
||||
|
||||
void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages);
|
||||
|
||||
// End: FFT operations
|
||||
|
||||
/************************************************************
|
||||
@ -1573,31 +1590,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
|
||||
// which returns a scale value of -1, indicating error.
|
||||
//
|
||||
|
||||
//
|
||||
// WebRtcSpl_ComplexBitReverse(...)
|
||||
//
|
||||
// Complex Bit Reverse
|
||||
//
|
||||
// This function bit-reverses the position of elements in the complex input
|
||||
// vector into the output vector.
|
||||
//
|
||||
// If you bit-reverse a linear-order array, you obtain a bit-reversed order
|
||||
// array. If you bit-reverse a bit-reversed order array, you obtain a
|
||||
// linear-order array.
|
||||
//
|
||||
// Input:
|
||||
// - vector : In pointer to complex vector containing 2^|stages| real
|
||||
// elements interleaved with 2^|stages| imaginary elements.
|
||||
// [ReImReImReIm....]
|
||||
// - stages : Number of FFT stages. Must be at least 3 and at most 10,
|
||||
// since the table WebRtcSpl_kSinTable1024[] is 1024
|
||||
// elements long.
|
||||
//
|
||||
// Output:
|
||||
// - vector : Out pointer to complex vector in bit-reversed order.
|
||||
// The input vector is over written.
|
||||
//
|
||||
|
||||
//
|
||||
// WebRtcSpl_AnalysisQMF(...)
|
||||
//
|
||||
|
@ -15,6 +15,9 @@
|
||||
#ifndef WEBRTC_SPL_SPL_INL_ARMV7_H_
|
||||
#define WEBRTC_SPL_SPL_INL_ARMV7_H_
|
||||
|
||||
// TODO(kma): Replace some assembly code with GCC intrinsics
|
||||
// (e.g. __builtin_clz).
|
||||
|
||||
static __inline WebRtc_Word32 WEBRTC_SPL_MUL_16_32_RSFT16(WebRtc_Word16 a,
|
||||
WebRtc_Word32 b) {
|
||||
WebRtc_Word32 tmp;
|
||||
|
Loading…
x
Reference in New Issue
Block a user