From 40e4767f2bbe84be1b48346701a601e9d13010e3 Mon Sep 17 00:00:00 2001 From: "andrew@webrtc.org" Date: Mon, 15 Dec 2014 06:07:47 +0000 Subject: [PATCH] Add NEON intrinsics version for min_max_operations_neon.c WebRtcSpl_MinValueW32Neon, WebRtcSpl_MaxValueW32Neon, WebRtcSpl_MaxValueW16Neon and WebRtcSpl_MaxAbsValueW32Neon are added. SplTest in common_audio_unittests is passed on ARM32/ARM64 platforms. BUG=4002 R=andrew@webrtc.org, jridges@masque.com Change-Id: Id461d64c3313f56147edadd2231e8845574ead2a Review URL: https://webrtc-codereview.appspot.com/28259004 Patch from Yang Zhang . git-svn-id: http://webrtc.googlecode.com/svn/trunk@7889 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../min_max_operations_neon.c | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/webrtc/common_audio/signal_processing/min_max_operations_neon.c b/webrtc/common_audio/signal_processing/min_max_operations_neon.c index 704911e63..dec31ad31 100644 --- a/webrtc/common_audio/signal_processing/min_max_operations_neon.c +++ b/webrtc/common_audio/signal_processing/min_max_operations_neon.c @@ -67,6 +67,147 @@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length) { return (int16_t)maximum; } +// Maximum absolute value of word32 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length) { + // Use uint32_t for the local variables, to accommodate the return value + // of abs(0x80000000), which is 0x80000000. + + uint32_t absolute = 0, maximum = 0; + int i = 0; + int residual = length & 0x7; + + if (vector == NULL || length <= 0) { + return -1; + } + + const int32_t* p_start = vector; + uint32x4_t max32x4_0 = vdupq_n_u32(0); + uint32x4_t max32x4_1 = vdupq_n_u32(0); + + // First part, unroll the loop 8 times. + for (i = length - residual; i >0; i -= 8) { + int32x4_t in32x4_0 = vld1q_s32(p_start); + p_start += 4; + int32x4_t in32x4_1 = vld1q_s32(p_start); + p_start += 4; + in32x4_0 = vabsq_s32(in32x4_0); + in32x4_1 = vabsq_s32(in32x4_1); + // vabs doesn't change the value of 0x80000000. + // Use u32 so we don't lose the value 0x80000000. + max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0)); + max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1)); + } + + uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1); +#if defined(WEBRTC_ARCH_ARM64) + maximum = vmaxvq_u32(max32x4); +#else + uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4)); + max32x2 = vpmax_u32(max32x2, max32x2); + + maximum = vget_lane_u32(max32x2, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + absolute = abs((int)(*p_start)); + if (absolute > maximum) { + maximum = absolute; + } + p_start++; + } + + // Guard against the case for 0x80000000. + maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX); + + return (int32_t)maximum; +} + +// Maximum value of word16 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length) { + int16_t maximum = WEBRTC_SPL_WORD16_MIN; + int i = 0; + int residual = length & 0x7; + + if (vector == NULL || length <= 0) { + return maximum; + } + + const int16_t* p_start = vector; + int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN); + + // First part, unroll the loop 8 times. + for (i = length - residual; i >0; i -= 8) { + int16x8_t in16x8 = vld1q_s16(p_start); + max16x8 = vmaxq_s16(max16x8, in16x8); + p_start += 8; + } + +#if defined(WEBRTC_ARCH_ARM64) + maximum = vmaxvq_s16(max16x8); +#else + int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8)); + max16x4 = vpmax_s16(max16x4, max16x4); + max16x4 = vpmax_s16(max16x4, max16x4); + + maximum = vget_lane_s16(max16x4, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + if (*p_start > maximum) + maximum = *p_start; + p_start++; + } + return maximum; +} + +// Maximum value of word32 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length) { + int32_t maximum = WEBRTC_SPL_WORD32_MIN; + int i = 0; + int residual = length & 0x7; + + if (vector == NULL || length <= 0) { + return maximum; + } + + const int32_t* p_start = vector; + int32x4_t max32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN); + int32x4_t max32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN); + + // First part, unroll the loop 8 times. + for (i = length - residual; i >0; i -= 8) { + int32x4_t in32x4_0 = vld1q_s32(p_start); + p_start += 4; + int32x4_t in32x4_1 = vld1q_s32(p_start); + p_start += 4; + max32x4_0 = vmaxq_s32(max32x4_0, in32x4_0); + max32x4_1 = vmaxq_s32(max32x4_1, in32x4_1); + } + + int32x4_t max32x4 = vmaxq_s32(max32x4_0, max32x4_1); +#if defined(WEBRTC_ARCH_ARM64) + maximum = vmaxvq_s32(max32x4); +#else + int32x2_t max32x2 = vmax_s32(vget_low_s32(max32x4), vget_high_s32(max32x4)); + max32x2 = vpmax_s32(max32x2, max32x2); + + maximum = vget_lane_s32(max32x2, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + if (*p_start > maximum) + maximum = *p_start; + p_start++; + } + return maximum; +} + // Minimum value of word16 vector. NEON intrinsics version for // ARM 32-bit/64-bit platforms. int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length) { @@ -107,3 +248,47 @@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length) { return minimum; } +// Minimum value of word32 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length) { + int32_t minimum = WEBRTC_SPL_WORD32_MAX; + int i = 0; + int residual = length & 0x7; + + if (vector == NULL || length <= 0) { + return minimum; + } + + const int32_t* p_start = vector; + int32x4_t min32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX); + int32x4_t min32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX); + + // First part, unroll the loop 8 times. + for (i = length - residual; i >0; i -= 8) { + int32x4_t in32x4_0 = vld1q_s32(p_start); + p_start += 4; + int32x4_t in32x4_1 = vld1q_s32(p_start); + p_start += 4; + min32x4_0 = vminq_s32(min32x4_0, in32x4_0); + min32x4_1 = vminq_s32(min32x4_1, in32x4_1); + } + + int32x4_t min32x4 = vminq_s32(min32x4_0, min32x4_1); +#if defined(WEBRTC_ARCH_ARM64) + minimum = vminvq_s32(min32x4); +#else + int32x2_t min32x2 = vmin_s32(vget_low_s32(min32x4), vget_high_s32(min32x4)); + min32x2 = vpmin_s32(min32x2, min32x2); + + minimum = vget_lane_s32(min32x2, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + if (*p_start < minimum) + minimum = *p_start; + p_start++; + } + return minimum; +} +