replace inline assembly WebRtcNsx_PrepareSpectrumNeon by intrinsics.

The modification only uses the unique part of the spectrum (as is done for the C and asm code). It passes byte to byte conformance test, and the single function performance (if not specified, the code is compiled by GCC 4.6) on different platforms: | run 100k times | cortex-a7 | cortex-a9 | cortex-a15 | | use C as the base on each | (1.2Ghz) | (1.0Ghz) | (1.7Ghz) | | CPU target | | | | |----------------------------+-----------+-----------+------------| | C | 100% | 100% | 100% | | Neon asm | 18% | 14% | 19% | | Neon inline asm | 31% | 25% | 27% | | Neon intrinsic (GCC 4.6) | 33% | 27% | 42% | | Neon intrinscis (GCC 4.8) | 17% | 14% | 19% | | Neon intrinsics (LLVM 3.3) | 15% | 13% | 18% | BUG= R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/13739004 Patch from Joe Yu <joe.yu@arm.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk@6920 4adac7df-926f-26a2-2b94-8c16560cd09d
2014-08-18 17:46:45 +00:00 · 2014-08-18 17:46:45 +00:00 · d798095a37
commit d798095a37
parent f86b262588
2 changed files with 52 additions and 98 deletions
--- a/1
+++ b/1
@ -24,3 +24,4 @@ Mozilla Foundation
 Opera Software ASA
 Vonage Holdings Corp.
 Temasys Communications
+ARM Holdings
--- a/webrtc/modules/audio_processing/ns/nsx_core_neon.c
+++ b/webrtc/modules/audio_processing/ns/nsx_core_neon.c
@ -363,6 +363,8 @@ void WebRtcNsx_NoiseEstimationNeon(NsxInst_t* inst,

 // Filter the data in the frequency domain, and create spectrum.
 void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
+  assert(inst->magnLen % 8 == 1);
+  assert(inst->anaLen2 % 16 == 0);

  // (1) Filtering.

@ -374,49 +376,38 @@ void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
  //      (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
  // }

-  int16_t* ptr_real = &inst->real[0];
-  int16_t* ptr_imag = &inst->imag[0];
-  uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
+  int16_t* preal = &inst->real[0];
+  int16_t* pimag = &inst->imag[0];
+  int16_t* pns_filter = (int16_t*)&inst->noiseSupFilter[0];
+  int16_t* pimag_end = pimag + inst->magnLen - 4;

-  // Filter the rest in the frequency domain.
-  for (; ptr_real < &inst->real[inst->magnLen - 1];) {
-    // Loop unrolled once. Both pointers are incremented by 4 twice.
-    __asm__ __volatile__(
-      "vld1.16 d20, [%[ptr_real]]\n\t"
-      "vld1.16 d22, [%[ptr_imag]]\n\t"
-      "vld1.16 d23, [%[ptr_noiseSupFilter]]!\n\t"
-      "vmull.s16 q10, d20, d23\n\t"
-      "vmull.s16 q11, d22, d23\n\t"
-      "vshrn.s32 d20, q10, #14\n\t"
-      "vshrn.s32 d22, q11, #14\n\t"
-      "vst1.16 d20, [%[ptr_real]]!\n\t"
-      "vst1.16 d22, [%[ptr_imag]]!\n\t"
+  while (pimag < pimag_end) {
+    int16x8_t real = vld1q_s16(preal);
+    int16x8_t imag = vld1q_s16(pimag);
+    int16x8_t ns_filter = vld1q_s16(pns_filter);

-      "vld1.16 d18, [%[ptr_real]]\n\t"
-      "vld1.16 d24, [%[ptr_imag]]\n\t"
-      "vld1.16 d25, [%[ptr_noiseSupFilter]]!\n\t"
-      "vmull.s16 q9, d18, d25\n\t"
-      "vmull.s16 q12, d24, d25\n\t"
-      "vshrn.s32 d18, q9, #14\n\t"
-      "vshrn.s32 d24, q12, #14\n\t"
-      "vst1.16 d18, [%[ptr_real]]!\n\t"
-      "vst1.16 d24, [%[ptr_imag]]!\n\t"
+    int32x4_t tmp_r_0 = vmull_s16(vget_low_s16(real), vget_low_s16(ns_filter));
+    int32x4_t tmp_i_0 = vmull_s16(vget_low_s16(imag), vget_low_s16(ns_filter));
+    int32x4_t tmp_r_1 = vmull_s16(vget_high_s16(real),
+                                  vget_high_s16(ns_filter));
+    int32x4_t tmp_i_1 = vmull_s16(vget_high_s16(imag),
+                                  vget_high_s16(ns_filter));

-      // Specify constraints.
-      :[ptr_imag]"+r"(ptr_imag),
-       [ptr_real]"+r"(ptr_real),
-       [ptr_noiseSupFilter]"+r"(ptr_noiseSupFilter)
-      :
-      :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
-       "q9", "q10", "q11", "q12"
-    );
+    int16x4_t result_r_0 = vshrn_n_s32(tmp_r_0, 14);
+    int16x4_t result_i_0 = vshrn_n_s32(tmp_i_0, 14);
+    int16x4_t result_r_1 = vshrn_n_s32(tmp_r_1, 14);
+    int16x4_t result_i_1 = vshrn_n_s32(tmp_i_1, 14);
+
+    vst1q_s16(preal, vcombine_s16(result_r_0, result_r_1));
+    vst1q_s16(pimag, vcombine_s16(result_i_0, result_i_1));
+    preal += 8;
+    pimag += 8;
+    pns_filter += 8;
  }

-  // Filter the last pair of elements in the frequency domain.
-  *ptr_real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_real,
-      (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
-  *ptr_imag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_imag,
-      (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
+  // Filter the last element
+  *preal = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*preal, *pns_filter, 14);
+  *pimag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*pimag, *pns_filter, 14);

  // (2) Create spectrum.

@ -424,74 +415,36 @@ void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
  // freq_buf[0] = inst->real[0];
  // freq_buf[1] = -inst->imag[0];
  // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
-  //   tmp16 = (inst->anaLen << 1) - j;
  //   freq_buf[j] = inst->real[i];
  //   freq_buf[j + 1] = -inst->imag[i];
-  //   freq_buf[tmp16] = inst->real[i];
-  //   freq_buf[tmp16 + 1] = inst->imag[i];
  // }
  // freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
  // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];

-  freq_buf[0] = inst->real[0];
-  freq_buf[1] = -inst->imag[0];
+  preal = &inst->real[0];
+  pimag = &inst->imag[0];
+  pimag_end = pimag + inst->anaLen2;
+  int16_t * freq_buf_start = freq_buf;
+  while (pimag < pimag_end) {
+    // loop unroll
+    int16x8x2_t real_imag_0;
+    int16x8x2_t real_imag_1;
+    real_imag_0.val[1] = vld1q_s16(pimag);
+    real_imag_0.val[0] = vld1q_s16(preal);
+    preal += 8;
+    pimag += 8;
+    real_imag_1.val[1] = vld1q_s16(pimag);
+    real_imag_1.val[0] = vld1q_s16(preal);
+    preal += 8;
+    pimag += 8;

-  int offset = -16;
-  int16_t* ptr_realImag1 = &freq_buf[2];
-  int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
-  ptr_real = &inst->real[1];
-  ptr_imag = &inst->imag[1];
-  for (; ptr_real < &inst->real[inst->anaLen2 - 11];) {
-    // Loop unrolled once. All pointers are incremented twice.
-    __asm__ __volatile__(
-      "vld1.16 d22, [%[ptr_real]]!\n\t"
-      "vld1.16 d23, [%[ptr_imag]]!\n\t"
-      // Negate and interleave:
-      "vmov.s16 d20, d22\n\t"
-      "vneg.s16 d21, d23\n\t"
-      "vzip.16 d20, d21\n\t"
-      // Write 8 elements to &freq_buf[j]
-      "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
-      // Interleave and reverse elements:
-      "vzip.16 d22, d23\n\t"
-      "vrev64.32 d18, d23\n\t"
-      "vrev64.32 d19, d22\n\t"
-      // Write 8 elements to &freq_buf[tmp16]
-      "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
-
-      "vld1.16 d22, [%[ptr_real]]!\n\t"
-      "vld1.16 d23, [%[ptr_imag]]!\n\t"
-      // Negate and interleave:
-      "vmov.s16 d20, d22\n\t"
-      "vneg.s16 d21, d23\n\t"
-      "vzip.16 d20, d21\n\t"
-      // Write 8 elements to &freq_buf[j]
-      "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
-      // Interleave and reverse elements:
-      "vzip.16 d22, d23\n\t"
-      "vrev64.32 d18, d23\n\t"
-      "vrev64.32 d19, d22\n\t"
-      // Write 8 elements to &freq_buf[tmp16]
-      "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
-
-      // Specify constraints.
-      :[ptr_imag]"+r"(ptr_imag),
-       [ptr_real]"+r"(ptr_real),
-       [ptr_realImag1]"+r"(ptr_realImag1),
-       [ptr_realImag2]"+r"(ptr_realImag2)
-      :[offset]"r"(offset)
-      :"d18", "d19", "d20", "d21", "d22", "d23"
-    );
+    real_imag_0.val[1] = vnegq_s16(real_imag_0.val[1]);
+    real_imag_1.val[1] = vnegq_s16(real_imag_1.val[1]);
+    vst2q_s16(freq_buf_start, real_imag_0);
+    freq_buf_start += 16;
+    vst2q_s16(freq_buf_start, real_imag_1);
+    freq_buf_start += 16;
  }
-  for (ptr_realImag2 += 6;
-       ptr_real <= &inst->real[inst->anaLen2];
-       ptr_real += 1, ptr_imag += 1, ptr_realImag1 += 2, ptr_realImag2 -= 2) {
-    *ptr_realImag1 = *ptr_real;
-    *(ptr_realImag1 + 1) = -(*ptr_imag);
-    *ptr_realImag2 = *ptr_real;
-    *(ptr_realImag2 + 1) = *ptr_imag;
-  }
-
  freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
  freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
 }