From 87610ac45ea7f2503189f74de5afb6f8fa624ff2 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 7 Jul 2017 10:37:30 -0700 Subject: [PATCH] neon: consolidate horizontal adds Change-Id: Iaf9e88ff636ccf8f0ef310869c6827f3f205cca8 --- vpx_dsp/arm/avg_neon.c | 18 +++++---------- vpx_dsp/arm/fdct_partial_neon.c | 20 ++++++---------- vpx_dsp/arm/sad_neon.c | 41 +++++++++++---------------------- vpx_dsp/arm/sum_neon.h | 38 ++++++++++++++++++++++++++++++ vpx_dsp/arm/variance_neon.c | 34 +++++++++++---------------- vpx_dsp/vpx_dsp.mk | 1 + 6 files changed, 78 insertions(+), 74 deletions(-) create mode 100644 vpx_dsp/arm/sum_neon.h diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 257e8ffee..5cdae82f9 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -17,14 +17,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/mem_neon.h" - -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} +#include "vpx_dsp/arm/sum_neon.h" unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) { uint16x8_t v_sum; @@ -35,7 +28,7 @@ unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) { v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); - return (horizontal_add_u16x8(v_sum) + 8) >> 4; + return (vget_lane_u32(horizontal_add_uint16x8(v_sum), 0) + 8) >> 4; } unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) { @@ -61,7 +54,7 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) { v_s0 = vld1_u8(s + 7 * p); v_sum = vaddw_u8(v_sum, v_s0); - return (horizontal_add_u16x8(v_sum) + 32) >> 6; + return 
(vget_lane_u32(horizontal_add_uint16x8(v_sum), 0) + 32) >> 6; } // coeff: 16 bits, dynamic range [-32640, 32640]. @@ -155,7 +148,8 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { ref += 16; } - return horizontal_add_u16x8(vec_sum); + return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)), + 0); } // ref, src = [0, 510] - max diff = 16-bits @@ -185,7 +179,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { { // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired + // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired // with the summation of 'sse' performed better on a Cortex-A15. const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 51e69d026..e73de41d7 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -13,13 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx_dsp/arm/mem_neon.h" - -static INLINE int32x2_t sum_int16x8(const int16x8_t a) { - const int32x4_t b = vpaddlq_s16(a); - const int64x2_t c = vpaddlq_s32(b); - return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), - vreinterpret_s32_s64(vget_high_s64(c))); -} +#include "vpx_dsp/arm/sum_neon.h" static INLINE tran_low_t get_lane(const int32x2_t a) { #if CONFIG_VP9_HIGHBITDEPTH @@ -48,7 +42,7 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { c = vaddq_s16(b0, b1); - d = sum_int16x8(c); + d = horizontal_add_int16x8(c); output[0] = get_lane(vshl_n_s32(d, 1)); output[1] = 0; @@ -63,7 +57,7 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { sum = vaddq_s16(sum, input_00); } - output[0] = get_lane(sum_int16x8(sum)); + output[0] = 
get_lane(horizontal_add_int16x8(sum)); output[1] = 0; } @@ -83,7 +77,7 @@ void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, right = vaddq_s16(right, b); } - sum = vadd_s32(sum_int16x8(left), sum_int16x8(right)); + sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right)); output[0] = get_lane(vshr_n_s32(sum, 1)); output[1] = 0; @@ -111,9 +105,9 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, a3 = vaddq_s16(a3, b3); } - sum = vadd_s32(sum_int16x8(a0), sum_int16x8(a1)); - sum = vadd_s32(sum, sum_int16x8(a2)); - sum = vadd_s32(sum, sum_int16x8(a3)); + sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1)); + sum = vadd_s32(sum, horizontal_add_int16x8(a2)); + sum = vadd_s32(sum, horizontal_add_int16x8(a3)); output[0] = get_lane(vshr_n_s32(sum, 3)); output[1] = 0; } diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index e651f7e66..9518a166b 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -14,15 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" - -// TODO(johannkoenig): combine with avg_neon.h version. 
-static INLINE uint32_t horizontal_add_16x8(const uint16x8_t vec_16x8) { - const uint32x4_t a = vpaddlq_u16(vec_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} +#include "vpx_dsp/arm/sum_neon.h" uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { @@ -30,7 +22,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - return horizontal_add_16x8(abs); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -42,7 +34,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - return horizontal_add_16x8(abs); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, @@ -58,7 +50,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); } - return horizontal_add_16x8(abs); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -78,7 +70,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); } - return horizontal_add_16x8(abs); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, @@ -119,7 +111,7 @@ static INLINE 
uint16x8_t sad8x_avg(const uint8_t *a, int a_stride, uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \ - return horizontal_add_16x8(abs); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } \ \ uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \ @@ -127,7 +119,7 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return horizontal_add_16x8(abs); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad8xN(4); @@ -175,7 +167,7 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \ - return horizontal_add_16x8(abs); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } \ \ uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \ @@ -183,7 +175,7 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return horizontal_add_16x8(abs); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad16xN(8); @@ -241,7 +233,7 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \ - return horizontal_add_16x8(abs); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } \ \ uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \ @@ -249,20 +241,13 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int 
a_stride, const uint8_t *second_pred) { \ const uint16x8_t abs = \ sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return horizontal_add_16x8(abs); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad32xN(16); sad32xN(32); sad32xN(64); -static INLINE uint32_t horizontal_add_32x4(const uint32x4_t a) { - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const int height) { @@ -344,7 +329,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \ - return horizontal_add_32x4(abs); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ } \ \ uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \ @@ -352,7 +337,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, const uint8_t *second_pred) { \ const uint32x4_t abs = \ sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return horizontal_add_32x4(abs); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ } sad64xN(32); diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h new file mode 100644 index 000000000..c09841223 --- /dev/null +++ b/vpx_dsp/arm/sum_neon.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_ARM_SUM_NEON_H_ +#define VPX_DSP_ARM_SUM_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) { + const int32x4_t b = vpaddlq_s16(a); + const int64x2_t c = vpaddlq_s32(b); + return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), + vreinterpret_s32_s64(vget_high_s64(c))); +} + +static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); +} + +static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { + const uint64x2_t b = vpaddlq_u32(a); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); +} +#endif // VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index a6b2c53b7..61c2c16a7 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -16,23 +16,9 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). 
To avoid overflow, // there can be no more than 32767 / 255 ~= 128 values accumulated in each @@ -79,8 +65,10 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, b += 4 * b_stride; } - *sum = horizontal_add_s16x8(sum_s16); - *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } // Process a block of any size where the width is divisible by 16. @@ -126,8 +114,10 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, b += b_stride; } - *sum = horizontal_add_s16x8(sum_s16); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } // Process a block of width 8 two rows at a time. @@ -165,8 +155,10 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, i += 2; } while (i < h); - *sum = horizontal_add_s16x8(sum_s16); - *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index c0b8d90c4..4dc1dbadd 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -368,6 +368,7 @@ endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC # Neon utilities DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h # PPC VSX utilities