Pure Neon assembly coding for WebRtcIsacfix_AutocorrNeon() in iSAC-Fix.
Review URL: https://webrtc-codereview.appspot.com/939018 git-svn-id: http://webrtc.googlecode.com/svn/trunk@3098 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
9e9cc72b53
commit
1786436eb2
@ -87,7 +87,8 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
|
||||
LOCAL_MODULE := libwebrtc_isacfix_neon
|
||||
LOCAL_MODULE_TAGS := optional
|
||||
LOCAL_SRC_FILES := \
|
||||
filters_neon.c \
|
||||
filterbanks_neon.S \
|
||||
filters_neon.S \
|
||||
lattice_neon.S \
|
||||
lpc_masking_model_neon.S
|
||||
|
||||
|
@ -21,6 +21,9 @@
|
||||
|
||||
#include "structs.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int WebRtcIsacfix_EstimateBandwidth(BwEstimatorstr *bwest_str,
|
||||
Bitstr_dec *streamdata,
|
||||
@ -176,4 +179,8 @@ typedef void (*FilterMaLoopFix)(int16_t input0,
|
||||
int32_t* ptr2);
|
||||
extern FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */
|
||||
|
@ -8,19 +8,9 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/*
|
||||
* filters.c
|
||||
*
|
||||
* This file contains function WebRtcIsacfix_AutocorrC,
|
||||
* AllpassFilterForDec32, and WebRtcIsacfix_DecimateAllpass32
|
||||
*
|
||||
*/
|
||||
#include <assert.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "pitch_estimator.h"
|
||||
#include "lpc_masking_model.h"
|
||||
#include "codec.h"
|
||||
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
|
||||
|
||||
// Autocorrelation function in fixed point.
|
||||
// NOTE! Different from SPLIB-version in how it scales the signal.
|
||||
@ -36,6 +26,10 @@ int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
|
||||
uint32_t temp = 0;
|
||||
int64_t prod = 0;
|
||||
|
||||
// The ARM assembly code assumptoins.
|
||||
assert(N % 4 == 0);
|
||||
assert(N >= 8);
|
||||
|
||||
// Calculate r[0].
|
||||
for (i = 0; i < N; i++) {
|
||||
prod += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
|
||||
|
@ -0,0 +1,145 @@
|
||||
@
|
||||
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
@
|
||||
@ Use of this source code is governed by a BSD-style license
|
||||
@ that can be found in the LICENSE file in the root of the source
|
||||
@ tree. An additional intellectual property rights grant can be found
|
||||
@ in the file PATENTS. All contributing project authors may
|
||||
@ be found in the AUTHORS file in the root of the source tree.
|
||||
@
|
||||
@ Reference code in filters.c. Output is bit-exact.
|
||||
|
||||
#include "settings.h"
|
||||
|
||||
.global WebRtcIsacfix_AutocorrNeon
|
||||
.align 2
|
||||
|
||||
@ int WebRtcIsacfix_AutocorrNeon(
|
||||
@ WebRtc_Word32* __restrict r,
|
||||
@ const WebRtc_Word16* __restrict x,
|
||||
@ WebRtc_Word16 N,
|
||||
@ WebRtc_Word16 order,
|
||||
@ WebRtc_Word16* __restrict scale);
|
||||
|
||||
WebRtcIsacfix_AutocorrNeon:
|
||||
push {r3 - r12}
|
||||
|
||||
@ Constant initializations
|
||||
mov r4, #33
|
||||
vmov.i32 d0, #0
|
||||
vmov.i32 q8, #0
|
||||
vmov.i32 d29, #0 @ Initialize (-scale).
|
||||
vmov.u8 d30, #255 @ Initialize d30 as -1.
|
||||
vmov.i32 d0[0], r4 @ d0: 00000033 (low), 00000000 (high)
|
||||
vmov.i32 d25, #32
|
||||
|
||||
mov r5, r1 @ x
|
||||
mov r6, r2 @ N
|
||||
|
||||
@ Generate the first coefficient r0.
|
||||
LOOP_R0:
|
||||
vld1.16 {d18}, [r5]! @ x[]
|
||||
subs r6, r6, #4
|
||||
vmull.s16 q9, d18, d18
|
||||
vpadal.s32 q8, q9
|
||||
bgt LOOP_R0
|
||||
|
||||
vadd.i64 d16, d16, d17
|
||||
|
||||
@ Calculate scaling (the value of shifting).
|
||||
vmov d17, d16
|
||||
|
||||
@ Check overflow and determine the value for 'scale'.
|
||||
@ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and
|
||||
@ lower 32-bit words. Note that we don't care about the value of the upper
|
||||
@ word in d17.
|
||||
|
||||
@ Check the case of 1 bit overflow. If it occurs store the results for
|
||||
@ scale and r[0] in d17 and d29.
|
||||
|
||||
vshr.u64 d3, d16, #1
|
||||
vclt.s32 d1, d16, #0 @ < 0 ?
|
||||
vbit d17, d3, d1 @ For r[0]
|
||||
vbit d29, d30, d1 @ -scale = -1
|
||||
|
||||
@ For the case of more than 1 bit overflow. If it occurs overwrite the
|
||||
@ results for scale and r[0] in d17 and d29.
|
||||
vclz.s32 d5, d16 @ Leading zeros of the two 32 bit words.
|
||||
vshr.s64 d26, d5, #32 @ Keep only the upper 32 bits.
|
||||
vsub.i64 d31, d26, d0 @ zeros - 33
|
||||
vshl.i64 d27, d26, #32
|
||||
vorr d27, d26 @ Duplicate the high word with its low one.
|
||||
vshl.u64 d2, d16, d31 @ Shift by (-scale).
|
||||
vclt.s32 d1, d27, d25 @ < 32 ?
|
||||
vbit d17, d2, d1 @ For r[0]
|
||||
vbit d29, d31, d1 @ -scale
|
||||
|
||||
vst1.32 d17[0], [r0]! @ r[0]
|
||||
mov r5, #1 @ outer loop counter
|
||||
|
||||
@ Generate rest of the coefficients
|
||||
LOOP_R:
|
||||
vmov.i32 q8, #0 @ Initialize the accumulation result.
|
||||
vmov.i32 q9, #0 @ Initialize the accumulation result.
|
||||
mov r7, r1 @ &x[0]
|
||||
add r6, r7, r5, lsl #1 @ x[i]
|
||||
sub r12, r2, r5 @ N - i
|
||||
lsr r8, r12, #3 @ inner loop counter
|
||||
sub r12, r8, lsl #3 @ Leftover samples to be processed
|
||||
|
||||
LOOP_8X_SAMPLES: @ Multiple of 8 samples
|
||||
vld1.16 {d20, d21}, [r7]! @ x[0, ...]
|
||||
vld1.16 {d22, d23}, [r6]! @ x[i, ...]
|
||||
vmull.s16 q12, d20, d22
|
||||
vmull.s16 q13, d21, d23
|
||||
subs r8, #1
|
||||
vpadal.s32 q8, q12
|
||||
vpadal.s32 q9, q13
|
||||
bgt LOOP_8X_SAMPLES
|
||||
|
||||
cmp r12, #4
|
||||
blt REST_SAMPLES
|
||||
|
||||
Four_SAMPLES:
|
||||
vld1.16 d20, [r7]!
|
||||
vld1.16 d22, [r6]!
|
||||
vmull.s16 q12, d20, d22
|
||||
vpadal.s32 q8, q12
|
||||
sub r12, #4
|
||||
|
||||
REST_SAMPLES:
|
||||
mov r8, #0 @ Initialize lower word of the accumulation.
|
||||
mov r4, #0 @ Initialize upper word of the accumulation.
|
||||
cmp r12, #0
|
||||
ble SUMUP
|
||||
|
||||
LOOP_REST_SAMPLES:
|
||||
ldrh r9, [r7], #2 @ x[0, ...]
|
||||
ldrh r10, [r6], #2 @ x[i, ...]
|
||||
smulbb r11, r9, r10
|
||||
adds r8, r8, r11 @ lower word of the accumulation.
|
||||
adc r4, r4, r11, asr #31 @ upper word of the accumulation.
|
||||
subs r12, #1
|
||||
bgt LOOP_REST_SAMPLES
|
||||
|
||||
@ Added the multiplication results together and do a shift.
|
||||
SUMUP:
|
||||
vadd.i64 d16, d17
|
||||
vadd.i64 d18, d19
|
||||
vadd.i64 d18, d16
|
||||
vmov d17, r8, r4
|
||||
vadd.i64 d18, d17
|
||||
vshl.s64 d18, d29 @ Shift left by (-scale).
|
||||
vst1.32 d18[0], [r0]! @ r[i]
|
||||
|
||||
add r5, #1
|
||||
cmp r5, r3
|
||||
ble LOOP_R
|
||||
|
||||
vneg.s32 d29, d29 @ Get value for 'scale'.
|
||||
ldr r2, [sp, #40] @ &scale
|
||||
add r0, r3, #1 @ return (order + 1)
|
||||
vst1.s16 d29[0], [r2] @ Store 'scale'
|
||||
|
||||
pop {r3 - r12}
|
||||
bx lr
|
@ -1,167 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/*
|
||||
* filters_neon.c
|
||||
*
|
||||
* This file contains function WebRtcIsacfix_AutocorrNeon, optimized for
|
||||
* ARM Neon platform.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "codec.h"
|
||||
|
||||
// Autocorrelation function in fixed point.
|
||||
// NOTE! Different from SPLIB-version in how it scales the signal.
|
||||
int WebRtcIsacfix_AutocorrNeon(
|
||||
WebRtc_Word32* __restrict r,
|
||||
const WebRtc_Word16* __restrict x,
|
||||
WebRtc_Word16 N,
|
||||
WebRtc_Word16 order,
|
||||
WebRtc_Word16* __restrict scale) {
|
||||
|
||||
// The 1st for loop assumed N % 4 == 0.
|
||||
assert(N % 4 == 0);
|
||||
|
||||
int i = 0;
|
||||
int zeros_low = 0;
|
||||
int zeros_high = 0;
|
||||
int16_t scaling = 0;
|
||||
int32_t sum = 0;
|
||||
|
||||
// Step 1, calculate r[0] and how much scaling is needed.
|
||||
|
||||
int16x4_t reg16x4;
|
||||
int64x1_t reg64x1a;
|
||||
int64x1_t reg64x1b;
|
||||
int32x4_t reg32x4;
|
||||
int64x2_t reg64x2 = vdupq_n_s64(0); // zeros
|
||||
|
||||
// Loop over the samples and do:
|
||||
// sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
|
||||
for (i = 0; i < N; i += 4) {
|
||||
reg16x4 = vld1_s16(&x[i]);
|
||||
reg32x4 = vmull_s16(reg16x4, reg16x4);
|
||||
reg64x2 = vpadalq_s32(reg64x2, reg32x4);
|
||||
}
|
||||
reg64x1a = vget_low_s64(reg64x2);
|
||||
reg64x1b = vget_high_s64(reg64x2);
|
||||
reg64x1a = vadd_s64(reg64x1a, reg64x1b);
|
||||
|
||||
// Calculate the value of shifting (scaling).
|
||||
__asm__ __volatile__(
|
||||
"vmov %[z_l], %[z_h], %P[reg]\n\t"
|
||||
"clz %[z_l], %[z_l]\n\t"
|
||||
"clz %[z_h], %[z_h]\n\t"
|
||||
:[z_l]"+r"(zeros_low),
|
||||
[z_h]"+r"(zeros_high)
|
||||
:[reg]"w"(reg64x1a)
|
||||
);
|
||||
if (zeros_high != 32) {
|
||||
scaling = (32 - zeros_high + 1);
|
||||
} else if (zeros_low == 0) {
|
||||
scaling = 1;
|
||||
}
|
||||
reg64x1b = -scaling;
|
||||
reg64x1a = vshl_s64(reg64x1a, reg64x1b);
|
||||
|
||||
// Record the result.
|
||||
r[0] = (int32_t)vget_lane_s64(reg64x1a, 0);
|
||||
|
||||
|
||||
// Step 2, perform the actual correlation calculation.
|
||||
|
||||
/* Original C code (for the rest of the function):
|
||||
for (i = 1; i < order + 1; i++) {
|
||||
prod = 0;
|
||||
for (j = 0; j < N - i; j++) {
|
||||
prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
|
||||
}
|
||||
sum = (int32_t)(prod >> scaling);
|
||||
r[i] = sum;
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 1; i < order + 1; i++) {
|
||||
int32_t prod_lower = 0;
|
||||
int32_t prod_upper = 0;
|
||||
const int16_t* ptr0 = &x[0];
|
||||
const int16_t* ptr1 = &x[i];
|
||||
int32_t tmp = 0;
|
||||
|
||||
// Initialize the sum (q9) to zero.
|
||||
__asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9");
|
||||
|
||||
// Calculate the major block of the samples (a multiple of 8).
|
||||
for (; ptr0 < &x[N - i - 7];) {
|
||||
__asm__ __volatile__(
|
||||
"vld1.16 {d20, d21}, [%[ptr0]]!\n\t"
|
||||
"vld1.16 {d22, d23}, [%[ptr1]]!\n\t"
|
||||
"vmull.s16 q12, d20, d22\n\t"
|
||||
"vmull.s16 q13, d21, d23\n\t"
|
||||
"vpadal.s32 q9, q12\n\t"
|
||||
"vpadal.s32 q9, q13\n\t"
|
||||
|
||||
// Specify constraints.
|
||||
:[ptr0]"+r"(ptr0),
|
||||
[ptr1]"+r"(ptr1)
|
||||
:
|
||||
:"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27"
|
||||
);
|
||||
}
|
||||
|
||||
// Calculate the rest of the samples.
|
||||
for (; ptr0 < &x[N - i]; ptr0++, ptr1++) {
|
||||
__asm__ __volatile__(
|
||||
"smulbb %[tmp], %[ptr0], %[ptr1]\n\t"
|
||||
"adds %[prod_lower], %[prod_lower], %[tmp]\n\t"
|
||||
"adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t"
|
||||
|
||||
// Specify constraints.
|
||||
:[prod_lower]"+r"(prod_lower),
|
||||
[prod_upper]"+r"(prod_upper),
|
||||
[tmp]"+r"(tmp)
|
||||
:[ptr0]"r"(*ptr0),
|
||||
[ptr1]"r"(*ptr1)
|
||||
);
|
||||
}
|
||||
|
||||
// Sum the results up, and do shift.
|
||||
__asm__ __volatile__(
|
||||
"vadd.i64 d18, d19\n\t"
|
||||
"vmov.32 d17[0], %[prod_lower]\n\t"
|
||||
"vmov.32 d17[1], %[prod_upper]\n\t"
|
||||
"vadd.i64 d17, d18\n\t"
|
||||
"mov %[tmp], %[scaling], asr #31\n\t"
|
||||
"vmov.32 d16, %[scaling], %[tmp]\n\t"
|
||||
"vshl.s64 d17, d16\n\t"
|
||||
"vmov.32 %[sum], d17[0]\n\t"
|
||||
|
||||
// Specify constraints.
|
||||
:[sum]"=r"(sum),
|
||||
[tmp]"+r"(tmp)
|
||||
:[prod_upper]"r"(prod_upper),
|
||||
[prod_lower]"r"(prod_lower),
|
||||
[scaling]"r"(-scaling)
|
||||
:"d16", "d17", "d18", "d19"
|
||||
);
|
||||
|
||||
// Record the result.
|
||||
r[i] = sum;
|
||||
}
|
||||
|
||||
// Record the result.
|
||||
*scale = scaling;
|
||||
|
||||
return(order + 1);
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
#include "gtest/gtest.h"
|
||||
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
|
||||
#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
|
||||
#include "webrtc/typedefs.h"
|
||||
|
||||
class FiltersTest : public testing::Test {
|
||||
protected:
|
||||
// Pass a function pointer to the Tester function.
|
||||
void FiltersTester(AutocorrFix WebRtcIsacfix_AutocorrFixFunction) {
|
||||
const int kOrder = 12;
|
||||
const int kBuffer = 40;
|
||||
int16_t scale = 0;
|
||||
int32_t r_buffer[kOrder + 2] = {0};
|
||||
|
||||
// Test an overflow case.
|
||||
const int16_t x_buffer_0[kBuffer] = {0, 0, 3010, 22351, 21106, 16969, -2095,
|
||||
-664, 3513, -30980, 32767, -23839, 13335, 20289, -6831, 339, -17207,
|
||||
32767, 4959, 6177, 32767, 16599, -4747, 20504, 3513, -30980, 32767,
|
||||
-23839, 13335, 20289, 0, -16969, -2095, -664, 3513, 31981, 32767,
|
||||
-13839, 23336, 30281};
|
||||
const int32_t r_expected_0[kOrder + 2] = {1872498461, -224288754, 203789985,
|
||||
483400487, -208272635, 2436500, 137785322, 266600814, -208486262,
|
||||
329510080, 137949184, -161738972, -26894267, 237630192};
|
||||
|
||||
WebRtcIsacfix_AutocorrFixFunction(r_buffer, x_buffer_0,
|
||||
kBuffer, kOrder + 1, &scale);
|
||||
for (int i = 0; i < kOrder + 2; i++) {
|
||||
EXPECT_EQ(r_expected_0[i], r_buffer[i]);
|
||||
}
|
||||
EXPECT_EQ(3, scale);
|
||||
|
||||
// Test a no-overflow case.
|
||||
const int16_t x_buffer_1[kBuffer] = {0, 0, 300, 21, 206, 169, -295,
|
||||
-664, 3513, -300, 327, -29, 15, 289, -6831, 339, -107,
|
||||
37, 59, 6177, 327, 169, -4747, 204, 313, -980, 767,
|
||||
-9, 135, 289, 0, -6969, -2095, -664, 0, 1, 7,
|
||||
-39, 236, 281};
|
||||
const int32_t r_expected_1[kOrder + 2] = {176253864, 8126617, 1983287,
|
||||
-26196788, -3487363, -42839676, -24644043, 3469813, 30559879, 31905045,
|
||||
5101567, 29328896, -55787438, -13163978};
|
||||
|
||||
WebRtcIsacfix_AutocorrFixFunction(r_buffer, x_buffer_1,
|
||||
kBuffer, kOrder + 1, &scale);
|
||||
for (int i = 0; i < kOrder + 2; i++) {
|
||||
EXPECT_EQ(r_expected_1[i], r_buffer[i]);
|
||||
}
|
||||
EXPECT_EQ(0, scale);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(FiltersTest, AutocorrFixTest) {
|
||||
FiltersTester(WebRtcIsacfix_AutocorrC);
|
||||
#ifdef WEBRTC_DETECT_ARM_NEON
|
||||
if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
|
||||
FiltersTester(WebRtcIsacfix_AutocorrNeon);
|
||||
}
|
||||
#elif defined(WEBRTC_ARCH_ARM_NEON)
|
||||
FiltersTester(WebRtcIsacfix_AutocorrNeon);
|
||||
#endif
|
||||
}
|
@ -97,8 +97,8 @@
|
||||
'<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
|
||||
],
|
||||
'sources': [
|
||||
'filters_neon.S',
|
||||
'filterbanks_neon.S',
|
||||
'filters_neon.c',
|
||||
'lattice_neon.S',
|
||||
'lpc_masking_model_neon.S',
|
||||
],
|
||||
|
@ -32,8 +32,9 @@
|
||||
'<(webrtc_root)/test/test.gyp:test_support_main',
|
||||
],
|
||||
'sources': [
|
||||
'fix/source/lpc_masking_model_unittest.cc',
|
||||
'fix/source/filters_unittest.cc',
|
||||
'fix/source/filterbanks_unittest.cc',
|
||||
'fix/source/lpc_masking_model_unittest.cc',
|
||||
],
|
||||
},
|
||||
],
|
||||
|
Loading…
x
Reference in New Issue
Block a user