Remove -flax-vector-conversions flag for ARM NEON building.

Pass compilation on both ARMv7 and ARM64. The generated
binary (audioproc) is byte to byte (with symbol striped) same as
before. The output of audioproc -aecm is also byte to byte same between
C and NEON version on ARMv7 and ARM64.

Change-Id: Ibdf40fe085f6bad1311f59bf9318bbcf37dd7ce5

BUG=3850
R=andrew@webrtc.org, jridges@masque.com

Review URL: https://webrtc-codereview.appspot.com/30239004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7783 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org 2014-12-02 19:36:14 +00:00
parent ac68ef9ad4
commit 1751ee7d32
5 changed files with 31 additions and 37 deletions

View File

@ -23,9 +23,6 @@
'cflags!': [
'-mfpu=vfpv3-d16',
],
'cflags': [
'-flax-vector-conversions',
],
'conditions': [
# "-mfpu=neon" is not requried for arm64 in GCC.
['target_arch!="arm64"', {

View File

@ -211,7 +211,6 @@ if (rtc_build_armv7_neon) {
# Remove the -mfpu=vfpv3-d16 cflag.
configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [
"-flax-vector-conversions",
"-mfpu=neon",
]

View File

@ -501,7 +501,6 @@ source_set("isacfix") {
# Remove the -mfpu=vfpv3-d16 cflag.
configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [
"-flax-vector-conversions",
"-mfpu=neon",
]
@ -572,7 +571,6 @@ if (rtc_build_armv7_neon) {
# Remove the -mfpu=vfpv3-d16 cflag.
configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [
"-flax-vector-conversions",
"-mfpu=neon",
]

View File

@ -210,13 +210,10 @@ if (rtc_build_armv7_neon || cpu_arch == "arm64") {
# //build/config/arm.gni instead, to reduce code duplication.
# Remove the -mfpu=vfpv3-d16 cflag.
configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [
"-flax-vector-conversions",
]
# "-mfpu=neon" is not requried for arm64 in GCC.
if (cpu_arch != "arm64") {
cflags += [ "-mfpu=neon" ]
cflags = [ "-mfpu=neon" ]
}
# Disable LTO in audio_processing_neon target due to compiler bug.

View File

@ -53,13 +53,14 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
int32_t* echo_est_p = echo_est;
const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
const uint16_t* far_spectrum_p = far_spectrum;
int16x8_t store_v, adapt_v, spectrum_v;
int16x8_t store_v, adapt_v;
uint16x8_t spectrum_v;
uint32x4_t echo_est_v_low, echo_est_v_high;
uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v;
uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;
far_energy_v = vdupq_n_u32(0);
echo_energy_adapt_v = vdupq_n_u32(0);
echo_energy_stored_v = vdupq_n_u32(0);
echo_adapt_v = vdupq_n_u32(0);
echo_stored_v = vdupq_n_u32(0);
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
@ -76,24 +77,25 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
adapt_v = vld1q_s16(start_adapt_p);
store_v = vld1q_s16(start_stored_p);
far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v));
far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v));
far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));
echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v));
echo_est_v_high = vmull_u16(vget_high_s16(store_v),
vget_high_s16(spectrum_v));
vst1q_s32(echo_est_p, echo_est_v_low);
vst1q_s32(echo_est_p + 4, echo_est_v_high);
echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
vget_low_u16(spectrum_v));
echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
vget_high_u16(spectrum_v));
vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));
echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v);
echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v);
echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);
echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
vget_low_s16(adapt_v),
vget_low_s16(spectrum_v));
echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
vget_high_s16(adapt_v),
vget_high_s16(spectrum_v));
echo_adapt_v = vmlal_u16(echo_adapt_v,
vreinterpret_u16_s16(vget_low_s16(adapt_v)),
vget_low_u16(spectrum_v));
echo_adapt_v = vmlal_u16(echo_adapt_v,
vreinterpret_u16_s16(vget_high_s16(adapt_v)),
vget_high_u16(spectrum_v));
start_stored_p += 8;
start_adapt_p += 8;
@ -102,8 +104,8 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
}
AddLanes(far_energy, far_energy_v);
AddLanes(echo_energy_stored, echo_energy_stored_v);
AddLanes(echo_energy_adapt, echo_energy_adapt_v);
AddLanes(echo_energy_stored, echo_stored_v);
AddLanes(echo_energy_adapt, echo_adapt_v);
echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
far_spectrum[PART_LEN]);
@ -143,8 +145,9 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
int32_t* echo_est_p = echo_est;
int16x8_t far_spectrum_v, adapt_v;
int32x4_t echo_est_v_low, echo_est_v_high;
uint16x8_t far_spectrum_v;
int16x8_t adapt_v;
uint32x4_t echo_est_v_low, echo_est_v_high;
while (start_stored_p < end_stored_p) {
far_spectrum_v = vld1q_u16(far_spectrum_p);
@ -153,12 +156,12 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
vst1q_s16(start_stored_p, adapt_v);
echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v),
vget_low_u16(adapt_v));
vget_low_u16(vreinterpretq_u16_s16(adapt_v)));
echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v),
vget_high_u16(adapt_v));
vget_high_u16(vreinterpretq_u16_s16(adapt_v)));
vst1q_s32(echo_est_p, echo_est_v_low);
vst1q_s32(echo_est_p + 4, echo_est_v_high);
vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));
far_spectrum_p += 8;
start_adapt_p += 8;