Optimized PCorr2Q32() in iSAC using intrinsics on the ARM NEON platform.

Review URL: https://webrtc-codereview.appspot.com/634004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@2497 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org
2012-07-09 23:27:02 +00:00
parent 1bc6d3238c
commit 72f8a6d77b
2 changed files with 43 additions and 12 deletions

View File

@@ -15,7 +15,9 @@
* *
*/ */
#include <string.h> #ifdef WEBRTC_ARCH_ARM_NEON
#include <arm_neon.h>
#endif
#include "signal_processing_library.h" #include "signal_processing_library.h"
#include "pitch_estimator.h" #include "pitch_estimator.h"
@@ -201,15 +203,44 @@ static void PCorr2Q32(const WebRtc_Word16 *in, WebRtc_Word32 *logcorQ8)
inptr = &in[k]; inptr = &in[k];
ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT( (WebRtc_Word16) in[k-1],(WebRtc_Word16) in[k-1], scaling); ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT( (WebRtc_Word16) in[k-1],(WebRtc_Word16) in[k-1], scaling);
ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (WebRtc_Word16) in[PITCH_CORR_LEN2 + k - 1],(WebRtc_Word16) in[PITCH_CORR_LEN2 + k - 1], scaling); ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (WebRtc_Word16) in[PITCH_CORR_LEN2 + k - 1],(WebRtc_Word16) in[PITCH_CORR_LEN2 + k - 1], scaling);
csum32 = 0;
prod32 = WEBRTC_SPL_MUL_16_16_RSFT( (WebRtc_Word16) x[0],(WebRtc_Word16) inptr[0], scaling);
for (n = 1; n < PITCH_CORR_LEN2; n++) { #ifdef WEBRTC_ARCH_ARM_NEON
csum32 += prod32; {
prod32 = WEBRTC_SPL_MUL_16_16_RSFT( (WebRtc_Word16) x[n],(WebRtc_Word16) inptr[n], scaling); int32_t vbuff[4];
int32x4_t int_32x4_sum = vmovq_n_s32(0);
// NEON cannot shift a register right by a non-constant amount, so shift
// left by the negated value instead.
int32x4_t int_32x4_scale = vdupq_n_s32(-scaling);
// Assert a condition used in loop unrolling at compile-time.
WEBRTC_STATIC_ASSERT(PITCH_CORR_LEN2, PITCH_CORR_LEN2 %4 == 0);
for (n = 0; n < PITCH_CORR_LEN2; n += 4) {
int16x4_t int_16x4_x = vld1_s16(&x[n]);
int16x4_t int_16x4_in = vld1_s16(&inptr[n]);
int32x4_t int_32x4 = vmull_s16(int_16x4_x, int_16x4_in);
int_32x4 = vshlq_s32(int_32x4, int_32x4_scale);
int_32x4_sum = vaddq_s32(int_32x4_sum, int_32x4);
}
// Use vector store to avoid long stall from data transferring
// from vector to general register.
vst1q_s32(vbuff, int_32x4_sum);
csum32 = vbuff[0] + vbuff[1];
csum32 += vbuff[2];
csum32 += vbuff[3];
} }
#else
csum32 = 0;
if(scaling == 0) {
for (n = 0; n < PITCH_CORR_LEN2; n++) {
csum32 += x[n] * inptr[n];
}
} else {
for (n = 0; n < PITCH_CORR_LEN2; n++) {
csum32 += (x[n] * inptr[n]) >> scaling;
}
}
#endif
csum32 += prod32;
logcorQ8--; logcorQ8--;
lys=Log2Q8((WebRtc_UWord32)ysum32); // Q8 lys=Log2Q8((WebRtc_UWord32)ysum32); // Q8

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
* *
* Use of this source code is governed by a BSD-style license * Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source * that can be found in the LICENSE file in the root of the source
@@ -20,7 +20,10 @@
#include "structs.h" #include "structs.h"
// TODO(andrew): put this into general WebRTC so other modules can use it.
// Define a compile-time assertion.
// Expands to a static const char array named static_assert_<name>. Its length
// is 1 when boolean_cond is true and -1 otherwise; a negative array length is
// rejected by the compiler, so a false condition fails the build.
// |name| is pasted into the identifier, so it must form a valid identifier
// and be unique within the scope where the macro is used.
// |boolean_cond| must be a compile-time constant expression.
// The expansion has no trailing semicolon; the caller supplies it.
#define WEBRTC_STATIC_ASSERT(name, boolean_cond) \
static char const static_assert_##name[(boolean_cond) ? 1 : -1] = {'!'}
void WebRtcIsacfix_PitchAnalysis(const WebRtc_Word16 *in, /* PITCH_FRAME_LEN samples */ void WebRtcIsacfix_PitchAnalysis(const WebRtc_Word16 *in, /* PITCH_FRAME_LEN samples */
WebRtc_Word16 *outQ0, /* PITCH_FRAME_LEN+QLOOKAHEAD samples */ WebRtc_Word16 *outQ0, /* PITCH_FRAME_LEN+QLOOKAHEAD samples */
@@ -28,7 +31,6 @@ void WebRtcIsacfix_PitchAnalysis(const WebRtc_Word16 *in, /* PITCH
WebRtc_Word16 *lagsQ7, WebRtc_Word16 *lagsQ7,
WebRtc_Word16 *PitchGains_Q12); WebRtc_Word16 *PitchGains_Q12);
void WebRtcIsacfix_InitialPitch(const WebRtc_Word16 *in, void WebRtcIsacfix_InitialPitch(const WebRtc_Word16 *in,
PitchAnalysisStruct *State, PitchAnalysisStruct *State,
WebRtc_Word16 *qlags); WebRtc_Word16 *qlags);
@@ -45,8 +47,6 @@ void WebRtcIsacfix_PitchFilterGains(const WebRtc_Word16 *indatQ0,
WebRtc_Word16 *lagsQ7, WebRtc_Word16 *lagsQ7,
WebRtc_Word16 *gainsQ12); WebRtc_Word16 *gainsQ12);
void WebRtcIsacfix_DecimateAllpass32(const WebRtc_Word16 *in, void WebRtcIsacfix_DecimateAllpass32(const WebRtc_Word16 *in,
WebRtc_Word32 *state_in, /* array of size: 2*ALLPASSSECTIONS+1 */ WebRtc_Word32 *state_in, /* array of size: 2*ALLPASSSECTIONS+1 */
WebRtc_Word16 N, /* number of input samples */ WebRtc_Word16 N, /* number of input samples */