From b413a300976324d27e1514b068c1e996b264622e Mon Sep 17 00:00:00 2001 From: "andrew@webrtc.org" Date: Mon, 15 Dec 2014 07:23:49 +0000 Subject: [PATCH] Add WebRtcIsacfix_FilterMaLoopNeon's intrinsics version. This intrinsics version gives bit-exact result as the current assembly neon code. And the performance is 38% better than current assembly neon version, 5.92 times faster than current C version. The test runs under Cortex-a53 aarch32 mode, other cpu should give similar performance result. BUG=4002 R=andrew@webrtc.org, jridges@masque.com Change-Id: I257e33ef6d634a519fd71adc4f52b06dd655bd9d Review URL: https://webrtc-codereview.appspot.com/32749004 Patch from Zhongwei Yao . git-svn-id: http://webrtc.googlecode.com/svn/trunk@7891 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../codecs/isac/fix/source/lattice_neon.c | 173 ++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c new file mode 100644 index 000000000..6bdaf1dfa --- /dev/null +++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h" +#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h" + +// Contains a function for the core loop in the normalized lattice MA +// filter routine for iSAC codec, optimized for ARM Neon platform. +// It does: +// for 0 <= n < HALF_SUBFRAMELEN - 1: +// *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); +// *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); +// Output is not bit-exact with the reference C code, due to the replacement +// of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon +// instructions. The difference should not be bigger than 1. +void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient + int16_t input1, // Filter coefficient + int32_t input2, // Inverse coefficient + int32_t* ptr0, // Sample buffer + int32_t* ptr1, // Sample buffer + int32_t* ptr2) // Sample buffer +{ + int n = 0; + int loop = (HALF_SUBFRAMELEN - 1) >> 3; + int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7; + + int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16); + int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16); + int32x4_t input2_v = vdupq_n_s32(input2); + int32x4_t tmp0a, tmp1a, tmp2a, tmp3a; + int32x4_t tmp0b, tmp1b, tmp2b, tmp3b; + int32x4_t ptr0va, ptr1va, ptr2va; + int32x4_t ptr0vb, ptr1vb, ptr2vb; + + // Unroll to process 8 samples at once. + for (n = 0; n < loop; n++) { + ptr0va = vld1q_s32(ptr0); + ptr0vb = vld1q_s32(ptr0 + 4); + ptr0 += 8; + + ptr2va = vld1q_s32(ptr2); + ptr2vb = vld1q_s32(ptr2 + 4); + + // Calculate tmp0 = (*ptr0) * input0. + tmp0a = vqrdmulhq_s32(ptr0va, input0_v); + tmp0b = vqrdmulhq_s32(ptr0vb, input0_v); + + // Calculate tmp1 = (*ptr0) * input1. + tmp1a = vqrdmulhq_s32(ptr0va, input1_v); + tmp1b = vqrdmulhq_s32(ptr0vb, input1_v); + + // Calculate tmp2 = tmp0 + *(ptr2). + tmp2a = vaddq_s32(tmp0a, ptr2va); + tmp2b = vaddq_s32(tmp0b, ptr2vb); + tmp2a = vshlq_n_s32(tmp2a, 15); + tmp2b = vshlq_n_s32(tmp2b, 15); + + // Calculate *ptr2 = input2 * tmp2. + ptr2va = vqrdmulhq_s32(tmp2a, input2_v); + ptr2vb = vqrdmulhq_s32(tmp2b, input2_v); + + vst1q_s32(ptr2, ptr2va); + vst1q_s32(ptr2 + 4, ptr2vb); + ptr2 += 8; + + // Calculate tmp3 = ptr2v * input0. + tmp3a = vqrdmulhq_s32(ptr2va, input0_v); + tmp3b = vqrdmulhq_s32(ptr2vb, input0_v); + + // Calculate *ptr1 = tmp1 + tmp3. + ptr1va = vaddq_s32(tmp1a, tmp3a); + ptr1vb = vaddq_s32(tmp1b, tmp3b); + + vst1q_s32(ptr1, ptr1va); + vst1q_s32(ptr1 + 4, ptr1vb); + ptr1 += 8; + } + + // Process four more samples. + if (loop_tail & 0x4) { + ptr0va = vld1q_s32(ptr0); + ptr2va = vld1q_s32(ptr2); + ptr0 += 4; + + // Calculate tmp0 = (*ptr0) * input0. + tmp0a = vqrdmulhq_s32(ptr0va, input0_v); + + // Calculate tmp1 = (*ptr0) * input1. + tmp1a = vqrdmulhq_s32(ptr0va, input1_v); + + // Calculate tmp2 = tmp0 + *(ptr2). + tmp2a = vaddq_s32(tmp0a, ptr2va); + tmp2a = vshlq_n_s32(tmp2a, 15); + + // Calculate *ptr2 = input2 * tmp2. + ptr2va = vqrdmulhq_s32(tmp2a, input2_v); + + vst1q_s32(ptr2, ptr2va); + ptr2 += 4; + + // Calculate tmp3 = *(ptr2) * input0. + tmp3a = vqrdmulhq_s32(ptr2va, input0_v); + + // Calculate *ptr1 = tmp1 + tmp3. + ptr1va = vaddq_s32(tmp1a, tmp3a); + + vst1q_s32(ptr1, ptr1va); + ptr1 += 4; + } + + // Process two more samples. + if (loop_tail & 0x2) { + int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail; + int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail; + ptr0v_tail = vld1_s32(ptr0); + ptr2v_tail = vld1_s32(ptr2); + ptr0 += 2; + + // Calculate tmp0 = (*ptr0) * input0. + tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v)); + + // Calculate tmp1 = (*ptr0) * input1. + tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v)); + + // Calculate tmp2 = tmp0 + *(ptr2). + tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail); + tmp2_tail = vshl_n_s32(tmp2_tail, 15); + + // Calculate *ptr2 = input2 * tmp2. + ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v)); + + vst1_s32(ptr2, ptr2v_tail); + ptr2 += 2; + + // Calculate tmp3 = *(ptr2) * input0. + tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v)); + + // Calculate *ptr1 = tmp1 + tmp3. + ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail); + + vst1_s32(ptr1, ptr1v_tail); + ptr1 += 2; + } + + // Process one more sample. + if (loop_tail & 0x1) { + int16_t t16a = (int16_t)(input2 >> 16); + int16_t t16b = (int16_t)input2; + if (t16b < 0) t16a++; + int32_t tmp32a; + int32_t tmp32b; + + // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)). + tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); + tmp32b = *ptr2 + tmp32a; + *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) + + (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b))); + + // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2). + tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); + tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); + *ptr1 = tmp32a + tmp32b; + } +}