diff --git a/Android.mk b/Android.mk index e744a8948..2380e0bbd 100644 --- a/Android.mk +++ b/Android.mk @@ -103,6 +103,7 @@ include $(BUILD_SHARED_LIBRARY) LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) +include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk LOCAL_ARM_MODE := arm LOCAL_MODULE := libwebrtc @@ -137,6 +138,15 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libwebrtc_jpeg \ libwebrtc_vpx +# Add Neon libraries. +ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS))) +LOCAL_WHOLE_STATIC_LIBRARIES += \ + libwebrtc_isacfix_neon +else ifeq ($(ARCH_ARM_HAVE_NEON),true) +LOCAL_WHOLE_STATIC_LIBRARIES += \ + libwebrtc_isacfix_neon +endif + LOCAL_SHARED_LIBRARIES := \ libcutils \ libdl \ diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk b/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk index 786695b08..3bedfe996 100644 --- a/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk +++ b/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk @@ -6,6 +6,9 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +############################# +# Build the non-neon library. + LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) @@ -68,8 +71,41 @@ include external/stlport/libstlport.mk endif include $(BUILD_STATIC_LIBRARY) +######################### +# Build the neon library. +include $(CLEAR_VARS) + +LOCAL_ARM_MODE := arm +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +LOCAL_MODULE := libwebrtc_isacfix_neon +LOCAL_MODULE_TAGS := optional +LOCAL_SRC_FILES := \ + filters_neon.c \ + lattice_neon.S #.S extension is for including a header file in assembly. +# TODO(kma): Check with C compiler team and online community for any status +# in the file name (.s vs .S), for a better solution. + +# Flags passed to both C and C++ files. 
+LOCAL_CFLAGS := \ + $(MY_WEBRTC_COMMON_DEFS) \ + -mfpu=neon \ + -flax-vector-conversions + +LOCAL_C_INCLUDES := \ + $(LOCAL_PATH)/../interface \ + $(LOCAL_PATH)/../../../../../.. \ + $(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include + + +ifndef NDK_ROOT +include external/stlport/libstlport.mk +endif +include $(BUILD_STATIC_LIBRARY) + +########################### # isac test app + include $(CLEAR_VARS) LOCAL_MODULE_TAGS := tests diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h b/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h index 279c2c511..4a8d28181 100644 --- a/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h +++ b/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h @@ -122,7 +122,6 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef, WebRtc_Word16 lo_hi, WebRtc_Word16 *lat_outQ9); - void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef, WebRtc_Word16 *stateGQ0, WebRtc_Word32 *lat_inQ25, @@ -131,10 +130,54 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef, WebRtc_Word16 lo_hi, WebRtc_Word16 *lat_outQ0); -int WebRtcIsacfix_AutocorrFix(WebRtc_Word32* __restrict r, - const WebRtc_Word16* __restrict x, - WebRtc_Word16 N, - WebRtc_Word16 order, - WebRtc_Word16* __restrict scale); +int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r, + const WebRtc_Word16* __restrict x, + WebRtc_Word16 N, + WebRtc_Word16 order, + WebRtc_Word16* __restrict scale); + +void WebRtcIsacfix_FilterMaLoopC(int16_t input0, + int16_t input1, + int32_t input2, + int32_t* ptr0, + int32_t* ptr1, + int32_t* ptr2); + +// Functions for ARM-Neon platforms, in place of the above two generic C ones. 
+#if (defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) +int WebRtcIsacfix_AutocorrNeon(WebRtc_Word32* __restrict r, + const WebRtc_Word16* __restrict x, + WebRtc_Word16 N, + WebRtc_Word16 order, + WebRtc_Word16* __restrict scale); + +void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, + int16_t input1, + int32_t input2, + int32_t* ptr0, + int32_t* ptr1, + int32_t* ptr2); +#endif + +/**** Function pointers associated with + **** WebRtcIsacfix_AutocorrC() / WebRtcIsacfix_AutocorrNeon() + **** and WebRtcIsacfix_FilterMaLoopC() / WebRtcIsacfix_FilterMaLoopNeon(). + ****/ + +typedef int (*AutocorrFix)(WebRtc_Word32* __restrict r, + const WebRtc_Word16* __restrict x, + WebRtc_Word16 N, + WebRtc_Word16 order, + WebRtc_Word16* __restrict scale); +extern AutocorrFix WebRtcIsacfix_AutocorrFix; + +typedef void (*FilterMaLoopFix)(int16_t input0, + int16_t input1, + int32_t input2, + int32_t* ptr0, + int32_t* ptr1, + int32_t* ptr2); +extern FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix; + #endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */ diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c index 940bb5614..6ee047753 100644 --- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c +++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c @@ -11,7 +11,7 @@ /* * filters.c * - * This file contains function WebRtcIsacfix_AutocorrFix, + * This file contains function WebRtcIsacfix_AutocorrC, * AllpassFilterForDec32, and WebRtcIsacfix_DecimateAllpass32 * */ @@ -22,16 +22,13 @@ #include "lpc_masking_model.h" #include "codec.h" -#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) // Autocorrelation function in fixed point. // NOTE! Different from SPLIB-version in how it scales the signal. 
-int WebRtcIsacfix_AutocorrFix( - WebRtc_Word32* __restrict r, - const WebRtc_Word16* __restrict x, - WebRtc_Word16 N, - WebRtc_Word16 order, - WebRtc_Word16* __restrict scale) { - +int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r, + const WebRtc_Word16* __restrict x, + WebRtc_Word16 N, + WebRtc_Word16 order, + WebRtc_Word16* __restrict scale) { int i = 0; int j = 0; int16_t scaling = 0; @@ -67,7 +64,6 @@ int WebRtcIsacfix_AutocorrFix( return(order + 1); } -#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) static const WebRtc_Word32 kApUpperQ15[ALLPASSSECTIONS] = { 1137, 12537 }; static const WebRtc_Word32 kApLowerQ15[ALLPASSSECTIONS] = { 5059, 24379 }; diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c index 0b44886d3..8270359b1 100644 --- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c +++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c @@ -11,7 +11,7 @@ /* * filters_neon.c * - * This file contains function WebRtcIsacfix_AutocorrFix, optimized for + * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for * ARM Neon platform. * */ @@ -23,7 +23,7 @@ // Autocorrelation function in fixed point. // NOTE! Different from SPLIB-version in how it scales the signal. -int WebRtcIsacfix_AutocorrFix( +int WebRtcIsacfix_AutocorrNeon( WebRtc_Word32* __restrict r, const WebRtc_Word16* __restrict x, WebRtc_Word16 N, diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c b/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c index a8c55e1ae..3a377857a 100644 --- a/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c +++ b/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c @@ -246,11 +246,18 @@ WebRtc_Word16 WebRtcIsacfix_EncoderInit(ISACFIX_MainStruct *ISAC_main_inst, WebRtcIsacfix_InitPostFilterbank(&ISAC_inst->ISACenc_obj.interpolatorstr_obj); #endif + // Initialize function pointers. 
+ WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrC; + WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopC; + +#if (defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) // Same guard as the Neon prototypes in codec.h. + WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrNeon; + WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon; +#endif return statusInit; } - /**************************************************************************** * WebRtcIsacfix_Encode(...) * diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c index e2db729f4..0f80d5872 100644 --- a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c +++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c @@ -18,6 +18,64 @@ #include "codec.h" #include "settings.h" +#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32) \ + ((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32)))) +/* This macro is FORBIDDEN to use elsewhere than in a function in this file and + its corresponding neon version. It might give unpredictable results, since a + general WebRtc_Word32*WebRtc_Word32 multiplication results in a 64 bit value. + The result is then shifted just 16 steps to the right, giving need for 48 + bits, i.e. in the general case, it will NOT fit in a WebRtc_Word32. In the + cases used in here, the WebRtc_Word32 will be enough, since (for a good + reason) the involved multiplicands aren't big enough to overflow a + WebRtc_Word32 after shifting right 16 bits. I have compared the result of a + multiplication between t32 and tmp32, done in two ways: + 1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0); + 2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b); + By running 25 files, I haven't found any bigger diff than 64 - this was in the + case when method 1) gave 650235648 and 2) gave 650235712. +*/ + +/* Inner loop used for function WebRtcIsacfix_NormLatticeFilterMa(). 
+ It does: + for 0 <= n < HALF_SUBFRAMELEN - 1: + *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)); + *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); +*/ +void WebRtcIsacfix_FilterMaLoopC(int16_t input0, // Filter coefficient + int16_t input1, // Filter coefficient + int32_t input2, // Inverse coeff. (1/input1) + int32_t* ptr0, // Input samples (read only) + int32_t* ptr1, // Output samples (written only) + int32_t* ptr2) { // Samples filtered in place + int n = 0; + + // Separate the 32-bit variable input2 into two 16-bit integers (high 16 and + // low 16 bits), for using LATTICE_MUL_32_32_RSFT16 in the loop. + int16_t t16a = (int16_t)(input2 >> 16); + int16_t t16b = (int16_t)input2; + if (t16b < 0) t16a++; + + // The loop filtering the samples *ptr0, *ptr1, *ptr2 with filter coefficients + // input0, input1, and input2. + for(n = 0; n < HALF_SUBFRAMELEN - 1; n++, ptr0++, ptr1++, ptr2++) { + int32_t tmp32a = 0; + int32_t tmp32b = 0; + + // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)); + tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); // Q15 * Q15 >> 15 = Q15 + tmp32b = *ptr2 + tmp32a; // Q15 + Q15 = Q15 + *ptr2 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b); + + // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); + tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); // Q15*Q15>>15 = Q15 + tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); // Q15*Q15>>15 = Q15 + *ptr1 = tmp32a + tmp32b; // Q15 + Q15 = Q15 + } +} + +// Function pointer declared in codec.h; set in WebRtcIsacfix_EncoderInit(). 
+FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix; + /* filter the signal using normalized lattice filter */ /* MA filter */ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef, @@ -47,30 +105,6 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef, WebRtc_Word16 t16a; WebRtc_Word16 t16b; -#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32) \ - ((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32)))) - /* This macro is FORBIDDEN to use elsewhere than in two places in this file - since it might give unpredictable results, since a general WebRtc_Word32*WebRtc_Word32 - multiplication results in a 64 bit value. The result is then shifted just - 16 steps to the right, giving need for 48 bits, i.e. in the generel case, - it will NOT fit in a WebRtc_Word32. In the cases used in here, the WebRtc_Word32 will be - enough, since (FOR SOME REASON!!!) the involved multiplicands aren't big - enough to overflow a WebRtc_Word32 after shifting right 16 bits. I have compared - the result of a multiplication between t32 and tmp32, done in two ways: - - 1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0); - - 2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b); - - By running 25 files, I haven't found any bigger diff than 64 - this was in the - case when method 1) gave 650235648 and 2) gave 650235712. - - It might be good to investigate this further, in order to PROVE why it seems to - work without any problems. This might be done, by using the properties of - all reflection coefficients etc. 
- - */ - for (u=0;u>15 = Q15 - tmp32b= fQ15vec[n+1] + tmp32; //Q15+Q15=Q15 - tmp32 = inv_cthQ16[k]; //Q16 - t16a = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32, 16); - t16b = (WebRtc_Word16) (tmp32-WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)t16a), 16)); - if (t16b<0) t16a++; - tmp32 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b); - fQ15vec[n+1] = tmp32; // Q15 - - // Calculate g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1]; - tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], gQ15[k][n]); //Q15*Q15>>15 = Q15 - tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], fQ15vec[n+1]); //Q15*Q15>>15 = Q15 - tmp32 = tmp32 + tmp32b;//Q15+Q15 = Q15 - gQ15[k+1][n+1] = tmp32; // Q15 - } + // for 0 <= n < HALF_SUBFRAMELEN - 1: + // f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]); + // g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1]; + WebRtcIsacfix_FilterMaLoopFix(sthQ15[k], cthQ15[k], inv_cthQ16[k], + &gQ15[k][0], &gQ15[k+1][1], &fQ15vec[1]); } fQ15vec[0] = fQtmp; diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S new file mode 100644 index 000000000..a59b6e37f --- /dev/null +++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S @@ -0,0 +1,155 @@ +@ +@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + +@ lattice_neon.s +@ +@ Contains a function for the core loop in the normalized lattice MA +@ filter routine for iSAC codec, optimized for ARM Neon platform. 
+@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, +@ int16_t input1, +@ int32_t input2, +@ int32_t* ptr0, +@ int32_t* ptr1, +@ int32_t* __restrict ptr2); +@ It calculates +@ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)); +@ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); +@ in Q15 domain. +@ +@ Reference code in lattice.c. +@ Output is not bit-exact with the reference C code, due to the replacement +@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon +@ instructions, smulwb, and smull. Tests with speech and tone vectors +@ showed no degradation in speech quality. + +.arch armv7-a +.fpu neon + +#include "settings.h" + +.global WebRtcIsacfix_FilterMaLoopNeon + +.align 2 + +WebRtcIsacfix_FilterMaLoopNeon: +.fnstart + +.save {r4-r8} + push {r4-r8} + + vdup.32 d28, r0 @ Initialize Neon register with input0 + vdup.32 d29, r1 @ Initialize Neon register with input1 + vdup.32 d30, r2 @ Initialize Neon register with input2 + ldr r4, [sp, #20] @ ptr1: first stack argument, above the 20 bytes just pushed + ldr r12, [sp, #24] @ ptr2: second stack argument + + @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2 + @ Leftover samples after the loop, in r6: + @ r6 = (HALF_SUBFRAMELEN - 1) - (HALF_SUBFRAMELEN - 1) >> 2 << 2 + mov r6, #HALF_SUBFRAMELEN + sub r6, #1 + lsr r5, r6, #2 + sub r6, r5, lsl #2 + + @ First r5 iterations in a loop. + +LOOP: + vld1.32 {d0, d1}, [r3]! 
@ *ptr0 + + vmull.s32 q10, d0, d28 @ tmp32a = input0 * (*ptr0) + vmull.s32 q11, d1, d28 @ tmp32a = input0 * (*ptr0) + vmull.s32 q12, d0, d29 @ input1 * (*ptr0) + vmull.s32 q13, d1, d29 @ input1 * (*ptr0) + + vrshrn.i64 d4, q10, #15 @ tmp32a: rounding shift right by 15, narrowed to 32 bits + vrshrn.i64 d5, q11, #15 + + vld1.32 {d2, d3}, [r12] @ *ptr2 + vadd.i32 q3, q2, q1 @ tmp32b = *ptr2 + tmp32a + + vrshrn.i64 d0, q12, #15 @ (input1 * (*ptr0)) >> 15, low half + + vmull.s32 q10, d6, d30 @ input2 * (*ptr2 + tmp32a) + vmull.s32 q11, d7, d30 @ input2 * (*ptr2 + tmp32a) + + vrshrn.i64 d16, q10, #16 @ new *ptr2: rounding shift right by 16 + vrshrn.i64 d17, q11, #16 + + vmull.s32 q10, d16, d28 @ input0 * (*ptr2) + vmull.s32 q11, d17, d28 @ input0 * (*ptr2) + + vrshrn.i64 d1, q13, #15 @ (input1 * (*ptr0)) >> 15, high half + vrshrn.i64 d18, q10, #15 + vrshrn.i64 d19, q11, #15 + + vst1.32 {d16, d17}, [r12]! @ *ptr2 + + vadd.i32 q9, q0, q9 @ input1 * (*ptr0) + input0 * (*ptr2) + subs r5, #1 + vst1.32 {d18, d19}, [r4]! @ *ptr1 + + bgt LOOP + + @ Check how many samples still need to be processed. + subs r6, #2 + blt LAST_SAMPLE + + @ Process two more samples: + vld1.32 d0, [r3]! @ *ptr0 + + vmull.s32 q11, d0, d28 @ tmp32a = input0 * (*ptr0) + vmull.s32 q13, d0, d29 @ input1 * (*ptr0) + + vld1.32 d18, [r12] @ *ptr2 + vrshrn.i64 d4, q11, #15 + + vadd.i32 d7, d4, d18 @ tmp32b = *ptr2 + tmp32a + vmull.s32 q11, d7, d30 @ input2 * (*ptr2 + tmp32a) + vrshrn.i64 d16, q11, #16 + + vmull.s32 q11, d16, d28 @ input0 * (*ptr2) + vst1.32 d16, [r12]! @ *ptr2 + + vrshrn.i64 d0, q13, #15 + vrshrn.i64 d19, q11, #15 + vadd.i32 d19, d0, d19 @ input1 * (*ptr0) + input0 * (*ptr2) + + vst1.32 d19, [r4]! @ *ptr1 + + @ If there's still one more sample, process it here. 
+LAST_SAMPLE: + tst r6, #1 @ r6 = leftover - 2 here; bit 0 is set when leftover was 1 or 3 + beq END @ Even leftover (0 or 2): nothing more to process + + @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)); + + ldr r7, [r3] @ *ptr0 + ldr r8, [r12] @ *ptr2 + + smulwb r5, r7, r0 @ tmp32a = *ptr0 * input0 >> 16 + add r8, r8, r5, lsl #1 @ tmp32b = *ptr2 + (tmp32a << 1) + smull r5, r6, r8, r2 @ tmp32b * input2, in 64 bits + lsl r6, #16 + add r6, r5, lsr #16 @ Only take the middle 32 bits + str r6, [r12] @ Output (*ptr2, as 32 bits) + + @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); + + smulwb r5, r7, r1 @ tmp32a = *ptr0 * input1 >> 16 + smulwb r6, r6, r0 @ tmp32b = *ptr2 * input0 >> 16 + lsl r5, r5, #1 @ tmp32a << 1 restores the >> 15 scaling + add r5, r6, lsl #1 @ tmp32a + (tmp32b << 1) + str r5, [r4] @ Output (*ptr1) + +END: + pop {r4-r8} + bx lr + +.fnend