Optimized WebRtcIsacfix_NormLatticeFilterMa() function for iSAC fix for ARM Neon

architecture with intrinsics and assembly code. The total iSAC codec speed improved by
about 3~5%.

Notes
(1) The Neon version after this optimization is not bit-exact with the generic
C version. The output quality, however, is not worse, as verified by test vector output,
and understandably so in theory (32bit x 32bit in Neon is more accurate than the approximation
C code in the generic version).
(2) In Android, an iSAC Neon library will be built. Along with some new function structures,
it is partly in preparation for introducing run-time detection of the Neon architecture soon.
Review URL: http://webrtc-codereview.appspot.com/268016

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1192 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2011-12-14 18:59:43 +00:00
parent 02afbeaca5
commit f0a964dc0a
8 changed files with 329 additions and 61 deletions

View File

@ -103,6 +103,7 @@ include $(BUILD_SHARED_LIBRARY)
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk
LOCAL_ARM_MODE := arm
LOCAL_MODULE := libwebrtc
@ -137,6 +138,15 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
libwebrtc_jpeg \
libwebrtc_vpx
# Add Neon libraries.
ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS)))
LOCAL_WHOLE_STATIC_LIBRARIES += \
libwebrtc_isacfix_neon
else ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_WHOLE_STATIC_LIBRARIES += \
libwebrtc_isacfix_neon
endif
LOCAL_SHARED_LIBRARIES := \
libcutils \
libdl \

View File

@ -6,6 +6,9 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#############################
# Build the non-neon library.
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
@ -68,8 +71,41 @@ include external/stlport/libstlport.mk
endif
include $(BUILD_STATIC_LIBRARY)
#########################
# Build the neon library.
include $(CLEAR_VARS)
LOCAL_ARM_MODE := arm
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_MODULE := libwebrtc_isacfix_neon
LOCAL_MODULE_TAGS := optional
LOCAL_SRC_FILES := \
filters_neon.c \
lattice_neon.S #.S extension is for including a header file in assembly.
# TODO(kma): Check with C compiler team and online community for any status
# in the file name (.s vs .S), for a better solution.
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS) \
-mfpu=neon \
-flax-vector-conversions
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/../interface \
$(LOCAL_PATH)/../../../../../.. \
$(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include
ifndef NDK_ROOT
include external/stlport/libstlport.mk
endif
include $(BUILD_STATIC_LIBRARY)
###########################
# isac test app
include $(CLEAR_VARS)
LOCAL_MODULE_TAGS := tests

View File

@ -122,7 +122,6 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
WebRtc_Word16 lo_hi,
WebRtc_Word16 *lat_outQ9);
void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
WebRtc_Word16 *stateGQ0,
WebRtc_Word32 *lat_inQ25,
@ -131,10 +130,54 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
WebRtc_Word16 lo_hi,
WebRtc_Word16 *lat_outQ0);
int WebRtcIsacfix_AutocorrFix(WebRtc_Word32* __restrict r,
int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
const WebRtc_Word16* __restrict x,
WebRtc_Word16 N,
WebRtc_Word16 order,
WebRtc_Word16* __restrict scale);
void WebRtcIsacfix_FilterMaLoopC(int16_t input0,
int16_t input1,
int32_t input2,
int32_t* ptr0,
int32_t* ptr1,
int32_t* ptr2);
// Functions for ARM-Neon platforms, in place of the above two generic C ones.
#if (defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
int WebRtcIsacfix_AutocorrNeon(WebRtc_Word32* __restrict r,
const WebRtc_Word16* __restrict x,
WebRtc_Word16 N,
WebRtc_Word16 order,
WebRtc_Word16* __restrict scale);
void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
int16_t input1,
int32_t input2,
int32_t* ptr0,
int32_t* ptr1,
int32_t* ptr2);
#endif
/**** Function pointers associated with
**** WebRtcIsacfix_AutocorrC() / WebRtcIsacfix_AutocorrNeon()
**** and WebRtcIsacfix_FilterMaLoopC() / WebRtcIsacfix_FilterMaLoopNeon().
****/
typedef int (*AutocorrFix)(WebRtc_Word32* __restrict__ r,
const WebRtc_Word16* __restrict__ x,
WebRtc_Word16 N,
WebRtc_Word16 order,
WebRtc_Word16* __restrict__ scale);
extern AutocorrFix WebRtcIsacfix_AutocorrFix;
typedef void (*FilterMaLoopFix)(int16_t input0,
int16_t input1,
int32_t input2,
int32_t* ptr0,
int32_t* ptr1,
int32_t* ptr2);
extern FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
#endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */

View File

@ -11,7 +11,7 @@
/*
* filters.c
*
* This file contains function WebRtcIsacfix_AutocorrFix,
* This file contains function WebRtcIsacfix_AutocorrC,
* AllpassFilterForDec32, and WebRtcIsacfix_DecimateAllpass32
*
*/
@ -22,16 +22,13 @@
#include "lpc_masking_model.h"
#include "codec.h"
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
// Autocorrelation function in fixed point.
// NOTE! Different from SPLIB-version in how it scales the signal.
int WebRtcIsacfix_AutocorrFix(
WebRtc_Word32* __restrict r,
int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
const WebRtc_Word16* __restrict x,
WebRtc_Word16 N,
WebRtc_Word16 order,
WebRtc_Word16* __restrict scale) {
int i = 0;
int j = 0;
int16_t scaling = 0;
@ -67,7 +64,6 @@ int WebRtcIsacfix_AutocorrFix(
return(order + 1);
}
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
static const WebRtc_Word32 kApUpperQ15[ALLPASSSECTIONS] = { 1137, 12537 };
static const WebRtc_Word32 kApLowerQ15[ALLPASSSECTIONS] = { 5059, 24379 };

View File

@ -11,7 +11,7 @@
/*
* filters_neon.c
*
* This file contains function WebRtcIsacfix_AutocorrFix, optimized for
* This file contains function WebRtcIsacfix_AutocorrNeon, optimized for
* ARM Neon platform.
*
*/
@ -23,7 +23,7 @@
// Autocorrelation function in fixed point.
// NOTE! Different from SPLIB-version in how it scales the signal.
int WebRtcIsacfix_AutocorrFix(
int WebRtcIsacfix_AutocorrNeon(
WebRtc_Word32* __restrict r,
const WebRtc_Word16* __restrict x,
WebRtc_Word16 N,

View File

@ -246,11 +246,18 @@ WebRtc_Word16 WebRtcIsacfix_EncoderInit(ISACFIX_MainStruct *ISAC_main_inst,
WebRtcIsacfix_InitPostFilterbank(&ISAC_inst->ISACenc_obj.interpolatorstr_obj);
#endif
// Initialize function pointers.
WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrC;
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopC;
#ifdef WEBRTC_ARCH_ARM_NEON
WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrNeon;
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon;
#endif
return statusInit;
}
/****************************************************************************
* WebRtcIsacfix_Encode(...)
*

View File

@ -18,6 +18,64 @@
#include "codec.h"
#include "settings.h"
#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32) \
((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
/* This macro is FORBIDDEN to use elsewhere than in a function in this file and
its corresponding neon version. It might give unpredictable results, since a
general WebRtc_Word32*WebRtc_Word32 multiplication results in a 64 bit value.
The result is then shifted just 16 steps to the right, giving need for 48
bits, i.e. in the general case, it will NOT fit in a WebRtc_Word32. In the
cases used in here, the WebRtc_Word32 will be enough, since (for a good
reason) the involved multiplicands aren't big enough to overflow a
WebRtc_Word32 after shifting right 16 bits. I have compared the result of a
multiplication between t32 and tmp32, done in two ways:
1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);
2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
By running 25 files, I haven't found any bigger diff than 64 - this was in the
case when method 1) gave 650235648 and 2) gave 650235712.
*/
/* Core loop of WebRtcIsacfix_NormLatticeFilterMa().
   For every n with 0 <= n < HALF_SUBFRAMELEN - 1 it evaluates
     ptr2[n] = input2 * (ptr2[n] + input0 * ptr0[n]);
     ptr1[n] = input1 * ptr0[n] + input0 * ptr2[n];
   where the second line already uses the freshly written ptr2[n].
*/
void WebRtcIsacfix_FilterMaLoopC(int16_t input0,   // Filter coefficient
                                 int16_t input1,   // Filter coefficient
                                 int32_t input2,   // Inverse coeff. (1/input1)
                                 int32_t* ptr0,    // Sample buffer
                                 int32_t* ptr1,    // Sample buffer
                                 int32_t* ptr2) {  // Sample buffer
  // LATTICE_MUL_32_32_RSFT16 takes its 32-bit factor as two 16-bit halves.
  // Whenever the low half is negative when read as a signed 16-bit value,
  // the high half is incremented by one.
  const int16_t inv_lo = (int16_t)input2;
  int16_t inv_hi = (int16_t)(input2 >> 16);
  int idx;

  if (inv_lo < 0) {
    inv_hi++;
  }

  // Filter the samples in ptr0, ptr1 and ptr2 with the coefficients input0,
  // input1 and input2.
  for (idx = 0; idx < HALF_SUBFRAMELEN - 1; idx++) {
    int32_t scaled = 0;
    int32_t sum = 0;
    int32_t cross = 0;

    // ptr2[idx] = input2 * (ptr2[idx] + input0 * ptr0[idx]);
    scaled = WEBRTC_SPL_MUL_16_32_RSFT15(input0, ptr0[idx]);  // Q15*Q15>>15 = Q15
    sum = ptr2[idx] + scaled;                                 // Q15 + Q15 = Q15
    ptr2[idx] = LATTICE_MUL_32_32_RSFT16(inv_hi, inv_lo, sum);

    // ptr1[idx] = input1 * ptr0[idx] + input0 * ptr2[idx];
    // ptr0[idx] is read again from memory, exactly as the buffers are laid
    // out by the caller.
    scaled = WEBRTC_SPL_MUL_16_32_RSFT15(input1, ptr0[idx]);  // Q15*Q15>>15 = Q15
    cross = WEBRTC_SPL_MUL_16_32_RSFT15(input0, ptr2[idx]);   // Q15*Q15>>15 = Q15
    ptr1[idx] = scaled + cross;                               // Q15 + Q15 = Q15
  }
}
// Declare a function pointer.
FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
/* filter the signal using normalized lattice filter */
/* MA filter */
void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
@ -47,30 +105,6 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
WebRtc_Word16 t16a;
WebRtc_Word16 t16b;
#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32) \
((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
/* This macro is FORBIDDEN to use elsewhere than in two places in this file
since it might give unpredictable results, since a general WebRtc_Word32*WebRtc_Word32
multiplication results in a 64 bit value. The result is then shifted just
16 steps to the right, giving need for 48 bits, i.e. in the generel case,
it will NOT fit in a WebRtc_Word32. In the cases used in here, the WebRtc_Word32 will be
enough, since (FOR SOME REASON!!!) the involved multiplicands aren't big
enough to overflow a WebRtc_Word32 after shifting right 16 bits. I have compared
the result of a multiplication between t32 and tmp32, done in two ways:
1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);
2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
By running 25 files, I haven't found any bigger diff than 64 - this was in the
case when method 1) gave 650235648 and 2) gave 650235712.
It might be good to investigate this further, in order to PROVE why it seems to
work without any problems. This might be done, by using the properties of
all reflection coefficients etc.
*/
for (u=0;u<SUBFRAMES;u++)
{
/* set the Direct Form coefficients */
@ -133,24 +167,11 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
/* save the states */
for(k=0;k<orderCoef;k++)
{
for(n=0;n<HALF_SUBFRAMELEN-1;n++)
{
// Calculate f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], gQ15[k][n]);//Q15*Q15>>15 = Q15
tmp32b= fQ15vec[n+1] + tmp32; //Q15+Q15=Q15
tmp32 = inv_cthQ16[k]; //Q16
t16a = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32, 16);
t16b = (WebRtc_Word16) (tmp32-WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)t16a), 16));
if (t16b<0) t16a++;
tmp32 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
fQ15vec[n+1] = tmp32; // Q15
// Calculate g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], gQ15[k][n]); //Q15*Q15>>15 = Q15
tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], fQ15vec[n+1]); //Q15*Q15>>15 = Q15
tmp32 = tmp32 + tmp32b;//Q15+Q15 = Q15
gQ15[k+1][n+1] = tmp32; // Q15
}
// for 0 <= n < HALF_SUBFRAMELEN - 1:
// f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
// g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
WebRtcIsacfix_FilterMaLoopFix(sthQ15[k], cthQ15[k], inv_cthQ16[k],
&gQ15[k][0], &gQ15[k+1][1], &fQ15vec[1]);
}
fQ15vec[0] = fQtmp;

View File

@ -0,0 +1,155 @@
@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ lattice_neon.s
@
@ Contains a function for the core loop in the normalized lattice MA
@ filter routine for iSAC codec, optimized for ARM Neon platform.
@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
@ int16_t input1,
@ int32_t input2,
@ int32_t* ptr0,
@ int32_t* ptr1,
@ int32_t* __restrict ptr2);
@ It calculates
@ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
@ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
@ in Q15 domain.
@
@ Reference code in lattice.c.
@ Output is not bit-exact with the reference C code, due to the replacement
@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
@ instructions, smulwb, and smull. Speech quality was not degraded, as verified
@ by testing with speech and tone vectors.
.arch armv7-a
.fpu neon
#include "settings.h"
.global WebRtcIsacfix_FilterMaLoopNeon
.align 2
WebRtcIsacfix_FilterMaLoopNeon:
@ Register use on entry:
@   r0 = input0, r1 = input1, r2 = input2, r3 = ptr0,
@   ptr1 and ptr2 are read from the stack.
@ For each of the (HALF_SUBFRAMELEN - 1) samples it computes
@   *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
@   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
@ processing four samples per iteration of the main loop, then an optional
@ pair of samples, then an optional single sample.
  .fnstart

  .save {r4-r8}
  push        {r4-r8}             @ 5 registers = 20 bytes pushed

  vdup.32     d28, r0             @ Initialize Neon register with input0
  vdup.32     d29, r1             @ Initialize Neon register with input1
  vdup.32     d30, r2             @ Initialize Neon register with input2
  ldr         r4, [sp, #20]       @ ptr1 (first stack argument, above the push)
  ldr         r12, [sp, #24]      @ ptr2

  @ Number of loop iterations after unrolling:
  @   r5 = (HALF_SUBFRAMELEN - 1) >> 2
  @ Leftover samples after the loop, in r6:
  @   r6 = (HALF_SUBFRAMELEN - 1) - (((HALF_SUBFRAMELEN - 1) >> 2) << 2)
  mov         r6, #HALF_SUBFRAMELEN
  sub         r6, #1
  lsr         r5, r6, #2
  sub         r6, r5, lsl #2

  @ The loop below tests its counter at the bottom, so skip it entirely when
  @ there are fewer than four samples to process.
  cmp         r5, #0
  beq         TAIL

@ First r5 iterations in a loop, four samples per iteration.
LOOP:
  vld1.32     {d0, d1}, [r3]!     @ *ptr0
  vmull.s32   q10, d0, d28        @ tmp32a = input0 * (*ptr0)
  vmull.s32   q11, d1, d28        @ tmp32a = input0 * (*ptr0)
  vmull.s32   q12, d0, d29        @ input1 * (*ptr0)
  vmull.s32   q13, d1, d29        @ input1 * (*ptr0)

  vrshrn.i64  d4, q10, #15
  vrshrn.i64  d5, q11, #15

  vld1.32     {d2, d3}, [r12]     @ *ptr2
  vadd.i32    q3, q2, q1          @ tmp32b = *ptr2 + tmp32a

  vrshrn.i64  d0, q12, #15

  vmull.s32   q10, d6, d30        @ input2 * tmp32b
  vmull.s32   q11, d7, d30        @ input2 * tmp32b

  vrshrn.i64  d16, q10, #16
  vrshrn.i64  d17, q11, #16

  vmull.s32   q10, d16, d28       @ input0 * (*ptr2)
  vmull.s32   q11, d17, d28       @ input0 * (*ptr2)

  vrshrn.i64  d1, q13, #15
  vrshrn.i64  d18, q10, #15
  vrshrn.i64  d19, q11, #15

  vst1.32     {d16, d17}, [r12]!  @ *ptr2

  vadd.i32    q9, q0, q9
  subs        r5, #1
  vst1.32     {d18, d19}, [r4]!   @ *ptr1 (vst1 leaves the flags from subs intact)

  bgt         LOOP

@ Check how many samples still need to be processed.
@ After this subtraction r6 = leftover - 2, i.e. -2, -1, 0 or 1.
TAIL:
  subs        r6, #2
  blt         LAST_SAMPLE

@ Process two more samples:
  vld1.32     d0, [r3]!           @ *ptr0
  vmull.s32   q11, d0, d28        @ tmp32a = input0 * (*ptr0)
  vmull.s32   q13, d0, d29        @ input1 * (*ptr0)
  vld1.32     d18, [r12]          @ *ptr2
  vrshrn.i64  d4, q11, #15

  vadd.i32    d7, d4, d18         @ tmp32b = *ptr2 + tmp32a
  vmull.s32   q11, d7, d30        @ input2 * tmp32b
  vrshrn.i64  d16, q11, #16

  vmull.s32   q11, d16, d28       @ input0 * (*ptr2)
  vst1.32     d16, [r12]!         @ *ptr2

  vrshrn.i64  d0, q13, #15
  vrshrn.i64  d19, q11, #15
  vadd.i32    d19, d0, d19

  vst1.32     d19, [r4]!          @ *ptr1

@ If there's still one more sample, process it here.
@ r6 holds (leftover - 2); one sample remains exactly when that value is odd
@ (-1 or 1). Testing bit 0 covers both cases, whereas comparing against 1
@ would miss the case of a single leftover sample (r6 == -1).
LAST_SAMPLE:
  tst         r6, #1
  beq         END

  @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));

  ldr         r7, [r3]            @ *ptr0
  ldr         r8, [r12]           @ *ptr2

  smulwb      r5, r7, r0          @ tmp32a = *ptr0 * input0 >> 16
  add         r8, r8, r5, lsl #1  @ tmp32b = *ptr2 + (tmp32a << 1)
  smull       r5, r6, r8, r2      @ tmp32b * input2, in 64 bits
  lsl         r6, #16
  add         r6, r5, lsr #16     @ Only take the middle 32 bits
  str         r6, [r12]           @ Output (*ptr2, as 32 bits)

  @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);

  smulwb      r5, r7, r1          @ tmp32a = *ptr0 * input1 >> 16
  smulwb      r6, r6, r0          @ tmp32b = *ptr2 * input0 >> 16
  lsl         r5, r5, #1
  add         r5, r6, lsl #1
  str         r5, [r4]            @ Output (*ptr1)

END:
  pop         {r4-r8}
  bx          lr

  .fnend