replace inline assembly WebRtcAecm_CalcLinearEnergiesNeon by intrinsics.

The modification only uses the unique part of the CalcLinearEnergies
 function. Pass byte to byte conformance test both on ARMv7 and ARM64,
 and the single function performance is similar with original assembly
 version on different platforms. If not specified, the code is compiled
 by GCC 4.6. The result is the "X version / C version" ratio, and the
 less is better.

| run 100k times             | cortex-a7 | cortex-a9 | cortex-a15 |
| use C as the base on each  |  (1.2Ghz) |  (1.0Ghz) |   (1.7Ghz) |
| CPU target                 |           |           |            |
|----------------------------+-----------+-----------+------------|
| Neon asm                   |    19.48% |    19.26% |     13.68% |
| Neon inline                |    27.90% |    28.87% |     17.79% |
| Neon intrinsics (GCC 4.8)  |    18.69% |    20.18% |     14.69% |
| Neon intrinsics (LLVM 3.4) |    18.52% |    21.15% |     13.56% |

BUG=3580
R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/23349004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7686 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org 2014-11-11 19:34:14 +00:00
parent e497be3de1
commit 0e37b898f0

View File

@ -227,72 +227,85 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
}
}
static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
#if defined(__aarch64__)
*(ptr) = vaddvq_u32(v);
#else
uint32x2_t tmp_v;
tmp_v = vadd_u32(vget_low_u32(v), vget_high_u32(v));
tmp_v = vpadd_u32(tmp_v, tmp_v);
*(ptr) = vget_lane_u32(tmp_v, 0);
#endif
}
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
const uint16_t* far_spectrum,
int32_t* echo_est,
uint32_t* far_energy,
uint32_t* echo_energy_adapt,
uint32_t* echo_energy_stored) {
int i;
int16_t* start_stored_p = aecm->channelStored;
int16_t* start_adapt_p = aecm->channelAdapt16;
int32_t* echo_est_p = echo_est;
const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
const uint16_t* far_spectrum_p = far_spectrum;
int16x8_t store_v, adapt_v, spectrum_v;
uint32x4_t echo_est_v_low, echo_est_v_high;
uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v;
register uint32_t far_energy_r;
register uint32_t echo_energy_stored_r;
register uint32_t echo_energy_adapt_r;
far_energy_v = vdupq_n_u32(0);
echo_energy_adapt_v = vdupq_n_u32(0);
echo_energy_stored_v = vdupq_n_u32(0);
assert((uintptr_t)echo_est % 32 == 0);
assert((uintptr_t)(aecm->channelStored) % 16 == 0);
assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
assert((uintptr_t)(aecm->channelStored) % 16 == 0);
assert((uintptr_t)(aecm->channelStored) % 16 == 0);
// Get energy for the delayed far end signal and estimated
// echo using both stored and adapted channels.
// The C code:
// for (i = 0; i < PART_LEN1; i++) {
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
// far_spectrum[i]);
// (*far_energy) += (uint32_t)(far_spectrum[i]);
// *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
// (*echo_energy_stored) += (uint32_t)echo_est[i];
// }
while (start_stored_p < end_stored_p) {
spectrum_v = vld1q_u16(far_spectrum_p);
adapt_v = vld1q_s16(start_adapt_p);
store_v = vld1q_s16(start_stored_p);
__asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy
__asm __volatile("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
__asm __volatile("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v));
far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v));
for (i = 0; i < PART_LEN - 7; i += 8) {
// far_energy += (uint32_t)(far_spectrum[i]);
__asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
__asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v));
echo_est_v_high = vmull_u16(vget_high_s16(store_v),
vget_high_s16(spectrum_v));
vst1q_s32(echo_est_p, echo_est_v_low);
vst1q_s32(echo_est_p + 4, echo_est_v_high);
// Get estimated echo energies for adaptive channel and stored channel.
// echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
"q10", "q11");
echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v);
echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v);
// echo_energy_stored += (uint32_t)echoEst[i];
__asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8");
__asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8");
echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
vget_low_s16(adapt_v),
vget_low_s16(spectrum_v));
echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
vget_high_s16(adapt_v),
vget_high_s16(spectrum_v));
// echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
__asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15");
__asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11");
start_stored_p += 8;
start_adapt_p += 8;
far_spectrum_p += 8;
echo_est_p += 8;
}
__asm __volatile("vadd.u32 d28, d29" : : : "q14");
__asm __volatile("vpadd.u32 d28, d28" : : : "q14");
__asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
AddLanes(far_energy, far_energy_v);
AddLanes(echo_energy_stored, echo_energy_stored_v);
AddLanes(echo_energy_adapt, echo_energy_adapt_v);
__asm __volatile("vadd.u32 d18, d19" : : : "q9");
__asm __volatile("vpadd.u32 d18, d18" : : : "q9");
__asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
__asm __volatile("vadd.u32 d16, d17" : : : "q8");
__asm __volatile("vpadd.u32 d16, d16" : : : "q8");
__asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
// Get estimated echo energies for adaptive channel and stored channel.
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
*echo_energy_stored = echo_energy_stored_r + (uint32_t)echo_est[i];
*far_energy = far_energy_r + (uint32_t)(far_spectrum[i]);
*echo_energy_adapt = echo_energy_adapt_r +
aecm->channelAdapt16[i] * far_spectrum[i];
echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
far_spectrum[PART_LEN]);
*echo_energy_stored += (uint32_t)echo_est[PART_LEN];
*far_energy += (uint32_t)far_spectrum[PART_LEN];
*echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,