Adding WebRtcSpl_MaxAbsValueW16 intrinsics version

The modification only uses the unique part of the WebRtcSpl_MaxAbsValue
 function. It passes the Spltest.MinMaxOperationTest conformance test on
 both ARMv7 and ARM64, and the single-function performance is similar to
 the original assembly version on different platforms. Unless otherwise
 specified, the code is compiled with GCC 4.6. The result is the
 "X version / C version" ratio; lower is better.

| run 100k times             | cortex-a7 | cortex-a15 |
| use C as the base on each  |  (1.2Ghz) |   (1.7Ghz) |
| CPU target                 |           |            |
|----------------------------+-----------+------------|
| Neon asm                   |       32% |        15% |
| Neon intrinsics (GCC 4.6)  |       36% |        37% |
| Neon intrinsics (GCC 4.8)  |       35% |        18% |

BUG=3580
R=andrew@webrtc.org, jridges@masque.com

Change-Id: Ia2f6822ec58774b401cc440b6751a97e540b5048

Review URL: https://webrtc-codereview.appspot.com/30109004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7803 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org 2014-12-03 21:59:02 +00:00
parent 3a52458237
commit fd4acf6d55

View File

@ -0,0 +1,69 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <stdlib.h>
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
// Maximum absolute value of word16 vector. C version for generic platforms.
int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length) {
int absolute = 0, maximum = 0;
if (vector == NULL || length <= 0) {
return -1;
}
const int16_t* p_start = vector;
int rest = length & 7;
const int16_t* p_end = vector + length - rest;
int16x8_t v;
uint16x8_t max_qv;
max_qv = vdupq_n_u16(0);
while (p_start < p_end) {
v = vld1q_s16(p_start);
// Note vabs doesn't change the value of -32768.
v = vabsq_s16(v);
// Use u16 so we don't lose the value -32768.
max_qv = vmaxq_u16(max_qv, vreinterpretq_u16_s16(v));
p_start += 8;
}
#ifdef WEBRTC_ARCH_ARM64
maximum = (int)vmaxvq_u16(max_qv);
#else
uint16x4_t max_dv;
max_dv = vmax_u16(vget_low_u16(max_qv), vget_high_u16(max_qv));
max_dv = vpmax_u16(max_dv, max_dv);
max_dv = vpmax_u16(max_dv, max_dv);
maximum = (int)vget_lane_u16(max_dv, 0);
#endif
p_end = vector + length;
while (p_start < p_end) {
absolute = abs((int)(*p_start));
if (absolute > maximum) {
maximum = absolute;
}
p_start++;
}
// Guard the case for abs(-32768).
if (maximum > WEBRTC_SPL_WORD16_MAX) {
maximum = WEBRTC_SPL_WORD16_MAX;
}
return (int16_t)maximum;
}