Remove -flax-vector-conversions flag for ARM NEON building.

Pass compilation on both ARMv7 and ARM64. The generated binary (audioproc) is byte to byte (with symbol striped) same as before. The output of audioproc -aecm is also byte to byte same between C and NEON version on ARMv7 and ARM64. Change-Id: Ibdf40fe085f6bad1311f59bf9318bbcf37dd7ce5 BUG=3850 R=andrew@webrtc.org, jridges@masque.com Review URL: https://webrtc-codereview.appspot.com/30239004 Patch from Zhongwei Yao <zhongwei.yao@arm.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk@7783 4adac7df-926f-26a2-2b94-8c16560cd09d
2014-12-02 19:36:14 +00:00 · 2014-12-02 19:36:14 +00:00 · 1751ee7d32
commit 1751ee7d32
parent ac68ef9ad4
5 changed files with 31 additions and 37 deletions
--- a/webrtc/build/arm_neon.gypi
+++ b/webrtc/build/arm_neon.gypi
@ -23,9 +23,6 @@
  'cflags!': [
    '-mfpu=vfpv3-d16',
  ],
-  'cflags': [
-    '-flax-vector-conversions',
-  ],
  'conditions': [
    # "-mfpu=neon" is not requried for arm64 in GCC.
    ['target_arch!="arm64"', {
--- a/webrtc/common_audio/BUILD.gn
+++ b/webrtc/common_audio/BUILD.gn
@ -211,7 +211,6 @@ if (rtc_build_armv7_neon) {
    # Remove the -mfpu=vfpv3-d16 cflag.
    configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
    cflags = [
-      "-flax-vector-conversions",
      "-mfpu=neon",
    ]

--- a/webrtc/modules/audio_coding/BUILD.gn
+++ b/webrtc/modules/audio_coding/BUILD.gn
@ -501,7 +501,6 @@ source_set("isacfix") {
    # Remove the -mfpu=vfpv3-d16 cflag.
    configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
    cflags = [
-      "-flax-vector-conversions",
      "-mfpu=neon",
    ]

@ -572,7 +571,6 @@ if (rtc_build_armv7_neon) {
    # Remove the -mfpu=vfpv3-d16 cflag.
    configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
    cflags = [
-      "-flax-vector-conversions",
      "-mfpu=neon",
    ]

--- a/webrtc/modules/audio_processing/BUILD.gn
+++ b/webrtc/modules/audio_processing/BUILD.gn
@ -210,13 +210,10 @@ if (rtc_build_armv7_neon || cpu_arch == "arm64") {
    # //build/config/arm.gni instead, to reduce code duplication.
    # Remove the -mfpu=vfpv3-d16 cflag.
    configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
-    cflags = [
-      "-flax-vector-conversions",
-    ]

    # "-mfpu=neon" is not requried for arm64 in GCC.
    if (cpu_arch != "arm64") {
-       cflags += [ "-mfpu=neon" ]
+       cflags = [ "-mfpu=neon" ]
    }

    # Disable LTO in audio_processing_neon target due to compiler bug.
--- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
@ -53,13 +53,14 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
  int32_t* echo_est_p = echo_est;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  const uint16_t* far_spectrum_p = far_spectrum;
-  int16x8_t store_v, adapt_v, spectrum_v;
+  int16x8_t store_v, adapt_v;
+  uint16x8_t spectrum_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;
-  uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v;
+  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;

  far_energy_v = vdupq_n_u32(0);
-  echo_energy_adapt_v = vdupq_n_u32(0);
-  echo_energy_stored_v = vdupq_n_u32(0);
+  echo_adapt_v = vdupq_n_u32(0);
+  echo_stored_v = vdupq_n_u32(0);

  // Get energy for the delayed far end signal and estimated
  // echo using both stored and adapted channels.
@ -76,24 +77,25 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
    adapt_v = vld1q_s16(start_adapt_p);
    store_v = vld1q_s16(start_stored_p);

-    far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v));
-    far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));

-    echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v));
-    echo_est_v_high = vmull_u16(vget_high_s16(store_v),
-                                vget_high_s16(spectrum_v));
-    vst1q_s32(echo_est_p, echo_est_v_low);
-    vst1q_s32(echo_est_p + 4, echo_est_v_high);
+    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
+                               vget_low_u16(spectrum_v));
+    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
+                                vget_high_u16(spectrum_v));
+    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
+    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

-    echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v);
-    echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v);
+    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
+    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);

-    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
-                                    vget_low_s16(adapt_v),
-                                    vget_low_s16(spectrum_v));
-    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
-                                    vget_high_s16(adapt_v),
-                                    vget_high_s16(spectrum_v));
+    echo_adapt_v = vmlal_u16(echo_adapt_v,
+                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
+                             vget_low_u16(spectrum_v));
+    echo_adapt_v = vmlal_u16(echo_adapt_v,
+                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
+                             vget_high_u16(spectrum_v));

    start_stored_p += 8;
    start_adapt_p += 8;
@ -102,8 +104,8 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
  }

  AddLanes(far_energy, far_energy_v);
-  AddLanes(echo_energy_stored, echo_energy_stored_v);
-  AddLanes(echo_energy_adapt, echo_energy_adapt_v);
+  AddLanes(echo_energy_stored, echo_stored_v);
+  AddLanes(echo_energy_adapt, echo_adapt_v);

  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
@ -143,8 +145,9 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  int32_t* echo_est_p = echo_est;

-  int16x8_t far_spectrum_v, adapt_v;
-  int32x4_t echo_est_v_low, echo_est_v_high;
+  uint16x8_t far_spectrum_v;
+  int16x8_t adapt_v;
+  uint32x4_t echo_est_v_low, echo_est_v_high;

  while (start_stored_p < end_stored_p) {
    far_spectrum_v = vld1q_u16(far_spectrum_p);
@ -153,12 +156,12 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
    vst1q_s16(start_stored_p, adapt_v);

    echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v),
-                               vget_low_u16(adapt_v));
+                               vget_low_u16(vreinterpretq_u16_s16(adapt_v)));
    echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v),
-                                vget_high_u16(adapt_v));
+                                vget_high_u16(vreinterpretq_u16_s16(adapt_v)));

-    vst1q_s32(echo_est_p, echo_est_v_low);
-    vst1q_s32(echo_est_p + 4, echo_est_v_high);
+    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
+    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    far_spectrum_p += 8;
    start_adapt_p += 8;