diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h b/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
index d86bd107e..279c2c511 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
@@ -131,10 +131,10 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
                                        WebRtc_Word16 lo_hi,
                                        WebRtc_Word16 *lat_outQ0);
 
-int WebRtcIsacfix_AutocorrFix(WebRtc_Word32 *r,
-                              const WebRtc_Word16 *x,
-                              WebRtc_Word16 N,
-                              WebRtc_Word16 order,
-                              WebRtc_Word16 *scale);
+int WebRtcIsacfix_AutocorrFix(WebRtc_Word32* __restrict r,
+                              const WebRtc_Word16* __restrict x,
+                              WebRtc_Word16 N,
+                              WebRtc_Word16 order,
+                              WebRtc_Word16* __restrict scale);
 
 #endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
index 8f138253d..940bb5614 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
@@ -22,50 +22,44 @@
 #include "lpc_masking_model.h"
 #include "codec.h"
 
-
-/* Autocorrelation function in fixed point. NOTE! Different from SPLIB-version in how it scales the signal. */
+#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
+// Autocorrelation function in fixed point.
+// NOTE! Different from SPLIB-version in how it scales the signal.
 int WebRtcIsacfix_AutocorrFix(
-    WebRtc_Word32 *r,
-    const WebRtc_Word16 *x,
-    WebRtc_Word16 N,
-    WebRtc_Word16 order,
-    WebRtc_Word16 *scale)
-{
-  int j, i;
-  WebRtc_Word16 scaling;
-  WebRtc_Word32 sum, prod, newsum;
-  G_CONST WebRtc_Word16 *xptr1;
-  G_CONST WebRtc_Word16 *xptr2;
+    WebRtc_Word32* __restrict r,
+    const WebRtc_Word16* __restrict x,
+    WebRtc_Word16 N,
+    WebRtc_Word16 order,
+    WebRtc_Word16* __restrict scale) {
 
-  sum=0;
-  scaling=0;
-  /* Calculate r[0] and how much scaling is needed */
-  for (i=0; i < N; i++) {
-    prod = WEBRTC_SPL_MUL_16_16_RSFT(x[i],x[i],scaling);
-    newsum = sum+prod;
-    /* If sum gets less than 0 we have overflow and need to scale the signal */
-    if(newsum<0) {
-      scaling++;
-      sum=WEBRTC_SPL_RSHIFT_W32(sum, 1);
-      prod=WEBRTC_SPL_RSHIFT_W32(prod, 1);
-    }
-    sum += prod;
+  int i = 0;
+  int j = 0;
+  int16_t scaling = 0;
+  int32_t sum = 0;
+  uint32_t temp = 0;
+  int64_t prod = 0;
+
+  // Calculate r[0].
+  for (i = 0; i < N; i++) {
+    prod += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
   }
-  r[0]=sum;
 
-  /* Perform the actual correlation calculation */
-  for (i = 1; i < order + 1; i++)
-  {
-    int loops=(N-i);
-    sum = 0;
-    xptr1=(G_CONST WebRtc_Word16 *)x;
-    xptr2=(G_CONST WebRtc_Word16 *)&x[i];
+  // Calculate scaling (the value of shifting).
+  temp = (uint32_t)(prod >> 31);
+  if(temp == 0) {
+    scaling = 0;
+  } else {
+    scaling = 32 - WebRtcSpl_NormU32(temp);
+  }
+  r[0] = (int32_t)(prod >> scaling);
 
-    for (j = loops;j > 0; j--)
-    {
-      sum += WEBRTC_SPL_MUL_16_16_RSFT(*xptr1++,*xptr2++,scaling);
+  // Perform the actual correlation calculation.
+  for (i = 1; i < order + 1; i++) {
+    prod = 0;
+    for (j = 0; j < N - i; j++) {
+      prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
     }
-
+    sum = (int32_t)(prod >> scaling);
     r[i] = sum;
   }
 
@@ -73,6 +67,7 @@ int WebRtcIsacfix_AutocorrFix(
 
   return(order + 1);
 }
+#endif  // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
 
 static const WebRtc_Word32 kApUpperQ15[ALLPASSSECTIONS] = { 1137, 12537 };
 static const WebRtc_Word32 kApLowerQ15[ALLPASSSECTIONS] = { 5059, 24379 };
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
new file mode 100644
index 000000000..0b44886d3
--- /dev/null
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
@@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * filters_neon.c
+ *
+ * This file contains function WebRtcIsacfix_AutocorrFix, optimized for
+ * ARM Neon platform.
+ *
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "codec.h"
+
+// Autocorrelation function in fixed point.
+// NOTE! Different from SPLIB-version in how it scales the signal.
+//
+// Input:
+//   - x     : Signal to correlate; N samples are read.
+//   - N     : Number of samples in x. Must be a multiple of 4.
+//   - order : Highest lag to compute.
+//
+// Output:
+//   - r     : Autocorrelation values r[0] .. r[order], each right-shifted
+//             by *scale bits.
+//   - scale : Number of right shifts applied so that r[0] fits in a
+//             non-negative 32-bit value.
+//
+// Return value: order + 1, the number of values written to r.
+int WebRtcIsacfix_AutocorrFix(
+    WebRtc_Word32* __restrict r,
+    const WebRtc_Word16* __restrict x,
+    WebRtc_Word16 N,
+    WebRtc_Word16 order,
+    WebRtc_Word16* __restrict scale) {
+
+  // The vectorized r[0] loop consumes four samples per iteration.
+  assert(N % 4 == 0);
+
+  int i = 0;
+  int j = 0;
+  int16_t scaling = 0;
+  uint32_t temp = 0;
+  int64_t prod = 0;
+  int64x2_t acc = vdupq_n_s64(0);
+
+  // Step 1, calculate r[0] and how much scaling is needed.
+  // Equivalent to: prod += WEBRTC_SPL_MUL_16_16(x[i], x[i]) over all samples.
+  for (i = 0; i < N; i += 4) {
+    const int16x4_t samples = vld1_s16(&x[i]);
+    acc = vpadalq_s32(acc, vmull_s16(samples, samples));
+  }
+  prod = vgetq_lane_s64(acc, 0) + vgetq_lane_s64(acc, 1);
+
+  // Calculate scaling (the value of shifting): the number of right shifts
+  // needed for the 64-bit energy to fit in 31 bits, zero if it already does.
+  temp = (uint32_t)(prod >> 31);
+  if (temp != 0) {
+    // temp is non-zero here, so __builtin_clz() is well defined.
+    scaling = (int16_t)(32 - __builtin_clz(temp));
+  }
+  r[0] = (int32_t)(prod >> scaling);
+
+  // Step 2, perform the actual correlation calculation.
+  // Equivalent to, for every lag i:
+  //   prod = sum over j in [0, N - i) of x[j] * x[i + j];
+  //   r[i] = (int32_t)(prod >> scaling);
+  // Intrinsics rather than inline assembly: the compiler does not guarantee
+  // that a NEON register keeps its value between separate asm statements,
+  // so an accumulator must live in a C variable.
+  for (i = 1; i < order + 1; i++) {
+    const int count = N - i;
+
+    // Calculate the major block of the samples (a multiple of 8).
+    acc = vdupq_n_s64(0);
+    for (j = 0; j + 8 <= count; j += 8) {
+      const int16x8_t a = vld1q_s16(&x[j]);
+      const int16x8_t b = vld1q_s16(&x[i + j]);
+      acc = vpadalq_s32(acc, vmull_s16(vget_low_s16(a), vget_low_s16(b)));
+      acc = vpadalq_s32(acc, vmull_s16(vget_high_s16(a), vget_high_s16(b)));
+    }
+    prod = vgetq_lane_s64(acc, 0) + vgetq_lane_s64(acc, 1);
+
+    // Calculate the rest of the samples.
+    for (; j < count; j++) {
+      prod += (int32_t)x[j] * x[i + j];
+    }
+
+    // Record the result.
+    r[i] = (int32_t)(prod >> scaling);
+  }
+
+  // Record the result.
+  *scale = scaling;
+
+  return(order + 1);
+}