replace inline assembly WebRtcAecm_CalcLinearEnergiesNeon by intrinsics.

The modification only uses the unique part of the CalcLinearEnergies function. Pass byte to byte conformance test both on ARMv7 and ARM64, and the single function performance is similar with original assembly version on different platforms. If not specified, the code is compiled by GCC 4.6. The result is the "X version / C version" ratio, and the less is better. | run 100k times | cortex-a7 | cortex-a9 | cortex-a15 | | use C as the base on each | (1.2Ghz) | (1.0Ghz) | (1.7Ghz) | | CPU target | | | | |----------------------------+-----------+-----------+------------| | Neon asm | 19.48% | 19.26% | 13.68% | | Neon inline | 27.90% | 28.87% | 17.79% | | Neon intrinsics (GCC 4.8) | 18.69% | 20.18% | 14.69% | | Neon intrinsics (LLVM 3.4) | 18.52% | 21.15% | 13.56% | BUG=3580 R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/23349004 Patch from Zhongwei Yao <zhongwei.yao@arm.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk@7686 4adac7df-926f-26a2-2b94-8c16560cd09d
2014-11-11 19:34:14 +00:00 · 2014-11-11 19:34:14 +00:00 · 0e37b898f0
commit 0e37b898f0
parent e497be3de1
1 changed files with 63 additions and 50 deletions
--- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
@ -227,72 +227,85 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
  }
 }

+static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
+#if defined(__aarch64__)
+  *(ptr) = vaddvq_u32(v);
+#else
+  uint32x2_t tmp_v;
+  tmp_v = vadd_u32(vget_low_u32(v), vget_high_u32(v));
+  tmp_v = vpadd_u32(tmp_v, tmp_v);
+  *(ptr) = vget_lane_u32(tmp_v, 0);
+#endif
+}
+
 void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
-  int i;
+  int16_t* start_stored_p = aecm->channelStored;
+  int16_t* start_adapt_p = aecm->channelAdapt16;
+  int32_t* echo_est_p = echo_est;
+  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
+  const uint16_t* far_spectrum_p = far_spectrum;
+  int16x8_t store_v, adapt_v, spectrum_v;
+  uint32x4_t echo_est_v_low, echo_est_v_high;
+  uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v;

-  register uint32_t far_energy_r;
-  register uint32_t echo_energy_stored_r;
-  register uint32_t echo_energy_adapt_r;
+  far_energy_v = vdupq_n_u32(0);
+  echo_energy_adapt_v = vdupq_n_u32(0);
+  echo_energy_stored_v = vdupq_n_u32(0);

-  assert((uintptr_t)echo_est % 32 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
-  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
+  // Get energy for the delayed far end signal and estimated
+  // echo using both stored and adapted channels.
+  // The C code:
+  //  for (i = 0; i < PART_LEN1; i++) {
+  //      echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
+  //                                         far_spectrum[i]);
+  //      (*far_energy) += (uint32_t)(far_spectrum[i]);
+  //      *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
+  //      (*echo_energy_stored) += (uint32_t)echo_est[i];
+  //  }
+  while (start_stored_p < end_stored_p) {
+    spectrum_v = vld1q_u16(far_spectrum_p);
+    adapt_v = vld1q_s16(start_adapt_p);
+    store_v = vld1q_s16(start_stored_p);

-  __asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy
-  __asm __volatile("vmov.i32 q8,  #0" : : : "q8"); // echo_energy_stored
-  __asm __volatile("vmov.i32 q9,  #0" : : : "q9"); // echo_energy_adapt
+    far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v));

-  for (i = 0; i < PART_LEN - 7; i += 8) {
-    // far_energy += (uint32_t)(far_spectrum[i]);
-    __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
-    __asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
-    __asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
+    echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v));
+    echo_est_v_high = vmull_u16(vget_high_s16(store_v),
+                                vget_high_s16(spectrum_v));
+    vst1q_s32(echo_est_p, echo_est_v_low);
+    vst1q_s32(echo_est_p + 4, echo_est_v_high);

-    // Get estimated echo energies for adaptive channel and stored channel.
-    // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
-    __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
-    __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
-    __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
-    __asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
-            "q10", "q11");
+    echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v);
+    echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v);

-    // echo_energy_stored += (uint32_t)echoEst[i];
-    __asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8");
-    __asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8");
+    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
+                                    vget_low_s16(adapt_v),
+                                    vget_low_s16(spectrum_v));
+    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
+                                    vget_high_s16(adapt_v),
+                                    vget_high_s16(spectrum_v));

-    // echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
-    __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
-    __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
-    __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
-    __asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15");
-    __asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11");
+    start_stored_p += 8;
+    start_adapt_p += 8;
+    far_spectrum_p += 8;
+    echo_est_p += 8;
  }

-  __asm __volatile("vadd.u32 d28, d29" : : : "q14");
-  __asm __volatile("vpadd.u32 d28, d28" : : : "q14");
-  __asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
+  AddLanes(far_energy, far_energy_v);
+  AddLanes(echo_energy_stored, echo_energy_stored_v);
+  AddLanes(echo_energy_adapt, echo_energy_adapt_v);

-  __asm __volatile("vadd.u32 d18, d19" : : : "q9");
-  __asm __volatile("vpadd.u32 d18, d18" : : : "q9");
-  __asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
-
-  __asm __volatile("vadd.u32 d16, d17" : : : "q8");
-  __asm __volatile("vpadd.u32 d16, d16" : : : "q8");
-  __asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
-
-  // Get estimated echo energies for adaptive channel and stored channel.
-  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
-  *echo_energy_stored = echo_energy_stored_r + (uint32_t)echo_est[i];
-  *far_energy = far_energy_r + (uint32_t)(far_spectrum[i]);
-  *echo_energy_adapt = echo_energy_adapt_r +
-      aecm->channelAdapt16[i] * far_spectrum[i];
+  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
+                                             far_spectrum[PART_LEN]);
+  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
+  *far_energy += (uint32_t)far_spectrum[PART_LEN];
+  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
 }

 void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,