Optimized WebRtcSpl_ComplexBitReverse() for general ARM platforms and generic C.
In ARMv5, the cycles were reduced by 88% (weight in VoE reduced from 3.554% to 0.432%). The tradeoff is a memory increase of 704 bytes. Review URL: https://webrtc-codereview.appspot.com/388003 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1757 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
3cc03be51f
commit
bfa7f96d1e
src/common_audio/signal_processing
@ -20,7 +20,6 @@ LOCAL_SRC_FILES := \
|
|||||||
auto_corr_to_refl_coef.c \
|
auto_corr_to_refl_coef.c \
|
||||||
auto_correlation.c \
|
auto_correlation.c \
|
||||||
complex_fft.c \
|
complex_fft.c \
|
||||||
complex_bit_reverse.c \
|
|
||||||
copy_set_operations.c \
|
copy_set_operations.c \
|
||||||
division_operations.c \
|
division_operations.c \
|
||||||
dot_product_with_scale.c \
|
dot_product_with_scale.c \
|
||||||
@ -77,9 +76,11 @@ endif
|
|||||||
|
|
||||||
ifeq ($(TARGET_ARCH),arm)
|
ifeq ($(TARGET_ARCH),arm)
|
||||||
LOCAL_SRC_FILES += \
|
LOCAL_SRC_FILES += \
|
||||||
|
complex_bit_reverse_arm.s \
|
||||||
spl_sqrt_floor.s
|
spl_sqrt_floor.s
|
||||||
else
|
else
|
||||||
LOCAL_SRC_FILES += \
|
LOCAL_SRC_FILES += \
|
||||||
|
complex_bit_reverse.c \
|
||||||
spl_sqrt_floor.c
|
spl_sqrt_floor.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||||
*
|
*
|
||||||
* Use of this source code is governed by a BSD-style license
|
* Use of this source code is governed by a BSD-style license
|
||||||
* that can be found in the LICENSE file in the root of the source
|
* that can be found in the LICENSE file in the root of the source
|
||||||
@ -8,44 +8,102 @@
|
|||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This file contains the function WebRtcSpl_ComplexBitReverse().
|
|
||||||
* The description header can be found in signal_processing_library.h
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "signal_processing_library.h"
|
#include "signal_processing_library.h"
|
||||||
|
|
||||||
void WebRtcSpl_ComplexBitReverse(WebRtc_Word16 frfi[], int stages)
|
/* Tables for data buffer indexes that are bit reversed and thus need to be
|
||||||
{
|
* swapped. Note that, index_7[{0, 2, 4, ...}] are for the left side of the swap
|
||||||
int mr, nn, n, l, m;
|
* operations, while index_7[{1, 3, 5, ...}] are for the right side of the
|
||||||
WebRtc_Word16 tr, ti;
|
* operation. Same for index_8.
|
||||||
|
*/
|
||||||
|
|
||||||
n = 1 << stages;
|
/* Indexes for the case of stages == 7. */
|
||||||
|
static const int16_t index_7[112] = {
|
||||||
|
1, 64, 2, 32, 3, 96, 4, 16, 5, 80, 6, 48, 7, 112, 9, 72, 10, 40, 11, 104,
|
||||||
|
12, 24, 13, 88, 14, 56, 15, 120, 17, 68, 18, 36, 19, 100, 21, 84, 22, 52,
|
||||||
|
23, 116, 25, 76, 26, 44, 27, 108, 29, 92, 30, 60, 31, 124, 33, 66, 35, 98,
|
||||||
|
37, 82, 38, 50, 39, 114, 41, 74, 43, 106, 45, 90, 46, 58, 47, 122, 49, 70,
|
||||||
|
51, 102, 53, 86, 55, 118, 57, 78, 59, 110, 61, 94, 63, 126, 67, 97, 69,
|
||||||
|
81, 71, 113, 75, 105, 77, 89, 79, 121, 83, 101, 87, 117, 91, 109, 95, 125,
|
||||||
|
103, 115, 111, 123
|
||||||
|
};
|
||||||
|
|
||||||
mr = 0;
|
/* Indexes for the case of stages == 8. */
|
||||||
nn = n - 1;
|
static const int16_t index_8[240] = {
|
||||||
|
1, 128, 2, 64, 3, 192, 4, 32, 5, 160, 6, 96, 7, 224, 8, 16, 9, 144, 10, 80,
|
||||||
|
11, 208, 12, 48, 13, 176, 14, 112, 15, 240, 17, 136, 18, 72, 19, 200, 20,
|
||||||
|
40, 21, 168, 22, 104, 23, 232, 25, 152, 26, 88, 27, 216, 28, 56, 29, 184,
|
||||||
|
30, 120, 31, 248, 33, 132, 34, 68, 35, 196, 37, 164, 38, 100, 39, 228, 41,
|
||||||
|
148, 42, 84, 43, 212, 44, 52, 45, 180, 46, 116, 47, 244, 49, 140, 50, 76,
|
||||||
|
51, 204, 53, 172, 54, 108, 55, 236, 57, 156, 58, 92, 59, 220, 61, 188, 62,
|
||||||
|
124, 63, 252, 65, 130, 67, 194, 69, 162, 70, 98, 71, 226, 73, 146, 74, 82,
|
||||||
|
75, 210, 77, 178, 78, 114, 79, 242, 81, 138, 83, 202, 85, 170, 86, 106, 87,
|
||||||
|
234, 89, 154, 91, 218, 93, 186, 94, 122, 95, 250, 97, 134, 99, 198, 101,
|
||||||
|
166, 103, 230, 105, 150, 107, 214, 109, 182, 110, 118, 111, 246, 113, 142,
|
||||||
|
115, 206, 117, 174, 119, 238, 121, 158, 123, 222, 125, 190, 127, 254, 131,
|
||||||
|
193, 133, 161, 135, 225, 137, 145, 139, 209, 141, 177, 143, 241, 147, 201,
|
||||||
|
149, 169, 151, 233, 155, 217, 157, 185, 159, 249, 163, 197, 167, 229, 171,
|
||||||
|
213, 173, 181, 175, 245, 179, 205, 183, 237, 187, 221, 191, 253, 199, 227,
|
||||||
|
203, 211, 207, 243, 215, 235, 223, 251, 239, 247
|
||||||
|
};
|
||||||
|
|
||||||
// decimation in time - re-order data
|
void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages) {
|
||||||
for (m = 1; m <= nn; ++m)
|
/* For any specific value of stages, we know exactly the indexes that are
|
||||||
{
|
* bit reversed. Currently (Feb. 2012) in WebRTC the only possible values of
|
||||||
l = n;
|
* stages are 7 and 8, so we use tables to save unnecessary iterations and
|
||||||
do
|
* calculations for these two cases.
|
||||||
{
|
*/
|
||||||
l >>= 1;
|
if (stages == 7 || stages == 8) {
|
||||||
} while (mr + l > nn);
|
int m = 0;
|
||||||
mr = (mr & (l - 1)) + l;
|
int length = 112;
|
||||||
|
const int16_t* index = index_7;
|
||||||
|
|
||||||
if (mr <= m)
|
if (stages == 8) {
|
||||||
continue;
|
length = 240;
|
||||||
|
index = index_8;
|
||||||
tr = frfi[2 * m];
|
|
||||||
frfi[2 * m] = frfi[2 * mr];
|
|
||||||
frfi[2 * mr] = tr;
|
|
||||||
|
|
||||||
ti = frfi[2 * m + 1];
|
|
||||||
frfi[2 * m + 1] = frfi[2 * mr + 1];
|
|
||||||
frfi[2 * mr + 1] = ti;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Decimation in time. Swap the elements with bit-reversed indexes. */
|
||||||
|
for (m = 0; m < length; m += 2) {
|
||||||
|
/* We declare an int32_t* type pointer, to load both the 16-bit real
|
||||||
|
* and imaginary elements from complex_data in one instruction, reducing
|
||||||
|
* complexity.
|
||||||
|
*/
|
||||||
|
int32_t* complex_data_ptr = (int32_t*)complex_data;
|
||||||
|
int32_t temp = 0;
|
||||||
|
|
||||||
|
temp = complex_data_ptr[index[m]]; /* Real and imaginary */
|
||||||
|
complex_data_ptr[index[m]] = complex_data_ptr[index[m + 1]];
|
||||||
|
complex_data_ptr[index[m + 1]] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int m = 0, mr = 0, l = 0;
|
||||||
|
int n = 1 << stages;
|
||||||
|
int nn = n - 1;
|
||||||
|
|
||||||
|
/* Decimation in time - re-order data */
|
||||||
|
for (m = 1; m <= nn; ++m) {
|
||||||
|
int32_t* complex_data_ptr = (int32_t*)complex_data;
|
||||||
|
int32_t temp = 0;
|
||||||
|
|
||||||
|
/* Find out indexes that are bit-reversed. */
|
||||||
|
l = n;
|
||||||
|
do {
|
||||||
|
l >>= 1;
|
||||||
|
} while (l > nn - mr);
|
||||||
|
mr = (mr & (l - 1)) + l;
|
||||||
|
|
||||||
|
if (mr <= m) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Swap the elements with bit-reversed indexes.
|
||||||
|
* This is similar to the loop in the stages == 7 or 8 cases.
|
||||||
|
*/
|
||||||
|
temp = complex_data_ptr[m]; /* Real and imaginary */
|
||||||
|
complex_data_ptr[m] = complex_data_ptr[mr];
|
||||||
|
complex_data_ptr[mr] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
126
src/common_audio/signal_processing/complex_bit_reverse_arm.s
Normal file
126
src/common_audio/signal_processing/complex_bit_reverse_arm.s
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
@
|
||||||
|
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||||
|
@
|
||||||
|
@ Use of this source code is governed by a BSD-style license
|
||||||
|
@ that can be found in the LICENSE file in the root of the source
|
||||||
|
@ tree. An additional intellectual property rights grant can be found
|
||||||
|
@ in the file PATENTS. All contributing project authors may
|
||||||
|
@ be found in the AUTHORS file in the root of the source tree.
|
||||||
|
@
|
||||||
|
|
||||||
|
@ This file contains the function WebRtcSpl_ComplexBitReverse(), optimized
|
||||||
|
@ for ARMv5 platforms.
|
||||||
|
@ Reference C code is in file complex_bit_reverse.c. Bit-exact.
|
||||||
|
|
||||||
|
.arch armv5
|
||||||
|
|
||||||
|
.global WebRtcSpl_ComplexBitReverse
|
||||||
|
|
||||||
|
.align 2
|
||||||
|
|
||||||
|
WebRtcSpl_ComplexBitReverse:
|
||||||
|
.fnstart
|
||||||
|
|
||||||
|
push {r4-r7}
|
||||||
|
|
||||||
|
cmp r1, #7
|
||||||
|
adr r3, index_7 @ Table pointer.
|
||||||
|
mov r4, #112 @ Number of iterations.
|
||||||
|
beq PRE_LOOP_STAGES_7_OR_8
|
||||||
|
|
||||||
|
cmp r1, #8
|
||||||
|
adr r3, index_8 @ Table pointer.
|
||||||
|
mov r4, #240 @ Number of iterations.
|
||||||
|
beq PRE_LOOP_STAGES_7_OR_8
|
||||||
|
|
||||||
|
mov r3, #1 @ Initialize m.
|
||||||
|
mov r1, r3, asl r1 @ n = 1 << stages;
|
||||||
|
subs r6, r1, #1 @ nn = n - 1;
|
||||||
|
ble END
|
||||||
|
|
||||||
|
mov r5, r0 @ &complex_data
|
||||||
|
mov r4, #0 @ mr
|
||||||
|
|
||||||
|
LOOP_GENERIC:
|
||||||
|
rsb r12, r4, r6 @ l > nn - mr
|
||||||
|
mov r2, r1 @ n
|
||||||
|
|
||||||
|
LOOP_SHIFT:
|
||||||
|
asr r2, #1 @ l >>= 1;
|
||||||
|
cmp r2, r12
|
||||||
|
bgt LOOP_SHIFT
|
||||||
|
|
||||||
|
sub r12, r2, #1
|
||||||
|
and r4, r12, r4
|
||||||
|
add r4, r2 @ mr = (mr & (l - 1)) + l;
|
||||||
|
cmp r4, r3 @ mr <= m ?
|
||||||
|
ble UPDATE_REGISTERS
|
||||||
|
|
||||||
|
mov r12, r4, asl #2
|
||||||
|
ldr r7, [r5, #4] @ complex_data[2 * m, 2 * m + 1].
|
||||||
|
@ Offset 4 due to m incrementing from 1.
|
||||||
|
ldr r2, [r0, r12] @ complex_data[2 * mr, 2 * mr + 1].
|
||||||
|
str r7, [r0, r12]
|
||||||
|
str r2, [r5, #4]
|
||||||
|
|
||||||
|
UPDATE_REGISTERS:
|
||||||
|
add r3, r3, #1
|
||||||
|
add r5, #4
|
||||||
|
cmp r3, r1
|
||||||
|
bne LOOP_GENERIC
|
||||||
|
|
||||||
|
b END
|
||||||
|
|
||||||
|
PRE_LOOP_STAGES_7_OR_8:
|
||||||
|
add r4, r3, r4, asl #1
|
||||||
|
|
||||||
|
LOOP_STAGES_7_OR_8:
|
||||||
|
ldrsh r2, [r3], #2 @ index[m]
|
||||||
|
ldrsh r5, [r3], #2 @ index[m + 1]
|
||||||
|
ldr r1, [r0, r2] @ complex_data[index[m], index[m] + 1]
|
||||||
|
ldr r12, [r0, r5] @ complex_data[index[m + 1], index[m + 1] + 1]
|
||||||
|
cmp r3, r4
|
||||||
|
str r1, [r0, r5]
|
||||||
|
str r12, [r0, r2]
|
||||||
|
bne LOOP_STAGES_7_OR_8
|
||||||
|
|
||||||
|
END:
|
||||||
|
pop {r4-r7}
|
||||||
|
bx lr
|
||||||
|
|
||||||
|
.fnend
|
||||||
|
|
||||||
|
|
||||||
|
@ The index tables. Note the values are doubles of the actual indexes for 16-bit
|
||||||
|
@ elements, different from the generic C code. It actually provides byte offsets
|
||||||
|
@ for the indexes.
|
||||||
|
|
||||||
|
.align 2
|
||||||
|
index_7: @ Indexes for stages == 7.
|
||||||
|
.hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
|
||||||
|
.hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
|
||||||
|
.hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
|
||||||
|
.hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
|
||||||
|
.hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
|
||||||
|
.hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
|
||||||
|
.hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
|
||||||
|
.hword 468, 364, 436, 380, 500, 412, 460, 444, 492
|
||||||
|
|
||||||
|
index_8: @ Indexes for stages == 8.
|
||||||
|
.hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
|
||||||
|
.hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
|
||||||
|
.hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
|
||||||
|
.hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
|
||||||
|
.hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
|
||||||
|
.hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
|
||||||
|
.hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
|
||||||
|
.hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
|
||||||
|
.hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
|
||||||
|
.hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
|
||||||
|
.hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
|
||||||
|
.hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
|
||||||
|
.hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
|
||||||
|
.hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
|
||||||
|
.hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
|
||||||
|
.hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
|
||||||
|
.hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988
|
@ -429,9 +429,26 @@ int WebRtcSpl_DownsampleFast(const int16_t* data_in,
|
|||||||
// End: Filter operations.
|
// End: Filter operations.
|
||||||
|
|
||||||
// FFT operations
|
// FFT operations
|
||||||
|
|
||||||
int WebRtcSpl_ComplexFFT(WebRtc_Word16 vector[], int stages, int mode);
|
int WebRtcSpl_ComplexFFT(WebRtc_Word16 vector[], int stages, int mode);
|
||||||
int WebRtcSpl_ComplexIFFT(WebRtc_Word16 vector[], int stages, int mode);
|
int WebRtcSpl_ComplexIFFT(WebRtc_Word16 vector[], int stages, int mode);
|
||||||
void WebRtcSpl_ComplexBitReverse(WebRtc_Word16 vector[], int stages);
|
|
||||||
|
// Treat a 16-bit complex data buffer |complex_data| as an array of 32-bit
|
||||||
|
// values, and swap elements whose indexes are bit-reverses of each other.
|
||||||
|
//
|
||||||
|
// Input:
|
||||||
|
// - complex_data : Complex data buffer containing 2^|stages| real
|
||||||
|
// elements interleaved with 2^|stages| imaginary
|
||||||
|
// elements: [Re Im Re Im Re Im....]
|
||||||
|
// - stages : Number of FFT stages. Must be at least 3 and at most
|
||||||
|
// 10, since the table WebRtcSpl_kSinTable1024[] is 1024
|
||||||
|
// elements long.
|
||||||
|
//
|
||||||
|
// Output:
|
||||||
|
// - complex_data : The complex data buffer.
|
||||||
|
|
||||||
|
void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages);
|
||||||
|
|
||||||
// End: FFT operations
|
// End: FFT operations
|
||||||
|
|
||||||
/************************************************************
|
/************************************************************
|
||||||
@ -1573,31 +1590,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
|
|||||||
// which returns a scale value of -1, indicating error.
|
// which returns a scale value of -1, indicating error.
|
||||||
//
|
//
|
||||||
|
|
||||||
//
|
|
||||||
// WebRtcSpl_ComplexBitReverse(...)
|
|
||||||
//
|
|
||||||
// Complex Bit Reverse
|
|
||||||
//
|
|
||||||
// This function bit-reverses the position of elements in the complex input
|
|
||||||
// vector into the output vector.
|
|
||||||
//
|
|
||||||
// If you bit-reverse a linear-order array, you obtain a bit-reversed order
|
|
||||||
// array. If you bit-reverse a bit-reversed order array, you obtain a
|
|
||||||
// linear-order array.
|
|
||||||
//
|
|
||||||
// Input:
|
|
||||||
// - vector : In pointer to complex vector containing 2^|stages| real
|
|
||||||
// elements interleaved with 2^|stages| imaginary elements.
|
|
||||||
// [ReImReImReIm....]
|
|
||||||
// - stages : Number of FFT stages. Must be at least 3 and at most 10,
|
|
||||||
// since the table WebRtcSpl_kSinTable1024[] is 1024
|
|
||||||
// elements long.
|
|
||||||
//
|
|
||||||
// Output:
|
|
||||||
// - vector : Out pointer to complex vector in bit-reversed order.
|
|
||||||
// The input vector is over written.
|
|
||||||
//
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// WebRtcSpl_AnalysisQMF(...)
|
// WebRtcSpl_AnalysisQMF(...)
|
||||||
//
|
//
|
||||||
|
@ -15,6 +15,9 @@
|
|||||||
#ifndef WEBRTC_SPL_SPL_INL_ARMV7_H_
|
#ifndef WEBRTC_SPL_SPL_INL_ARMV7_H_
|
||||||
#define WEBRTC_SPL_SPL_INL_ARMV7_H_
|
#define WEBRTC_SPL_SPL_INL_ARMV7_H_
|
||||||
|
|
||||||
|
// TODO(kma): Replace some assembly code with GCC intrinsics
|
||||||
|
// (e.g. __builtin_clz).
|
||||||
|
|
||||||
static __inline WebRtc_Word32 WEBRTC_SPL_MUL_16_32_RSFT16(WebRtc_Word16 a,
|
static __inline WebRtc_Word32 WEBRTC_SPL_MUL_16_32_RSFT16(WebRtc_Word16 a,
|
||||||
WebRtc_Word32 b) {
|
WebRtc_Word32 b) {
|
||||||
WebRtc_Word32 tmp;
|
WebRtc_Word32 tmp;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user