From 46d8660ce37b4d379325d5a76dea0208c8fa47f9 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 27 Jun 2017 12:44:54 -0700 Subject: [PATCH 1/5] sad neon: rewrite 4x4 and add 4x8 The previous implementation loaded 8 values (discarding half) BUG=webm:1425 Change-Id: Icb72a94e2557a4ee2db7091266ab58fd92f72158 --- test/sad_test.cc | 1 + vpx_dsp/arm/sad_neon.c | 60 ++++++++++++++++++------------------ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 7bf6e1385..00a4bc084 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -649,6 +649,7 @@ const SadMxNParam neon_tests[] = { SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), SadMxNParam(8, 8, &vpx_sad8x8_neon), + SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index ff3228768..f5acb9149 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride) { @@ -45,32 +46,38 @@ unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, return vget_lane_u32(d5, 0); } -unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x2_t d1; - uint64x1_t d3; +// TODO(johannkoenig): combine with avg_neon.h version. +static INLINE uint32_t horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + return horizontal_add_16x8(abs); +} + +uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); } - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); - - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); + return horizontal_add_16x8(abs); } unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, @@ -119,13 +126,6 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, vreinterpret_u32_u64(vget_high_u64(b))); return vget_lane_u32(c, 0); } -static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { - const uint32x4_t a = vpaddlq_u16(vec_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 51aa9f637..96e2bb7e4 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -729,7 +729,7 @@ add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad8x4 msa sse2/; add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad4x8 msa sse2/; +specialize qw/vpx_sad4x8 neon msa sse2/; add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x4 neon msa sse2/; From e40e78be246b2521749ce1fa6bdcd02e1f729a37 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 27 Jun 2017 13:02:28 -0700 Subject: [PATCH 2/5] sad neon: rewrite 8x8 and 8x16 BUG=webm:1425 Change-Id: I068f06c67b841f09ea07c04ada0c2f1706102138 --- test/sad_test.cc | 1 + vpx_dsp/arm/sad_neon.c | 79 +++++++++++++++--------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 3 files changed, 35 insertions(+), 47 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 00a4bc084..045980642 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -649,6 +649,7 @@ const SadMxNParam neon_tests[] = { SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), SadMxNParam(8, 8, &vpx_sad8x8_neon), + SadMxNParam(8, 4, &vpx_sad8x4_neon), SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), }; diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index f5acb9149..cbc904feb 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -15,37 +15,6 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" -unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); -} - // TODO(johannkoenig): combine with avg_neon.h version. static INLINE uint32_t horizontal_add_16x8(const uint16x8_t vec_16x8) { const uint32x4_t a = vpaddlq_u16(vec_16x8); @@ -80,6 +49,39 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, return horizontal_add_16x8(abs); } +static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + const uint8x8_t b_u8 = vld1_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, a_u8, b_u8); + } + return abs; +} + +uint32_t vpx_sad8x4_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, 4); + return horizontal_add_16x8(abs); +} + +uint32_t vpx_sad8x8_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, 8); + return horizontal_add_16x8(abs); +} + +uint32_t vpx_sad8x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, 16); + return horizontal_add_16x8(abs); +} + unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride) { uint8x16_t q0, q4; @@ -206,18 +208,3 @@ unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, } return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); } - -unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum = vdupq_n_u16(0); - - for (i = 0; i < 8; ++i) { - const uint8x8_t vec_src = vld1_u8(src); - const uint8x8_t vec_ref = vld1_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); - } - return horizontal_add_16x8(vec_accum); -} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 96e2bb7e4..88497b796 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -726,7 +726,7 @@ add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad8x8 neon msa sse2/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x4 msa sse2/; +specialize qw/vpx_sad8x4 neon msa sse2/; add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x8 neon msa sse2/; From 469643757f3b3616acd0157b670a910fc5b78577 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 27 Jun 2017 14:15:58 -0700 Subject: [PATCH 3/5] sad neon: rewrite 16x8, 16x16, add 16x32 BUG=webm:1425 Change-Id: Ie126553e5fffcdfaf3d82a85b368ac10ce9ab082 --- test/sad_test.cc | 1 + vpx_dsp/arm/sad_neon.c | 74 ++++++++++++++---------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 3 files changed, 30 insertions(+), 47 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 045980642..50965adf4 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -645,6 +645,7 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); const SadMxNParam neon_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_neon), SadMxNParam(32, 32, &vpx_sad32x32_neon), + SadMxNParam(16, 32, &vpx_sad16x32_neon), SadMxNParam(16, 16, &vpx_sad16x16_neon), SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index cbc904feb..f3e04231f 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -82,38 +82,39 @@ uint32_t vpx_sad8x16_neon(const uint8_t *src, int src_stride, return horizontal_add_16x8(abs); } -unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; + uint16x8_t abs = vdupq_n_u16(0); - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); - - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); } + return abs; +} - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); +uint32_t vpx_sad16x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, 8); + return horizontal_add_16x8(abs); +} - return vget_lane_u32(d5, 0); +uint32_t vpx_sad16x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, 16); + return horizontal_add_16x8(abs); +} + +uint32_t vpx_sad16x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, 32); + return horizontal_add_16x8(abs); } static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, @@ -189,22 +190,3 @@ unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, } return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); } - -unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref = vld1q_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum_lo = - vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); - vec_accum_hi = - vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); -} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 88497b796..392bc258d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -711,7 +711,7 @@ add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride specialize qw/vpx_sad32x16 avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x32 msa sse2 vsx/; +specialize qw/vpx_sad16x32 neon msa sse2 vsx/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad16x16 neon msa sse2 vsx/; From 77a648885cf1a271217fd66e7290932147523572 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 27 Jun 2017 14:33:28 -0700 Subject: [PATCH 4/5] sad neon: rewrite 32x32, add 32x16 and 32x64 BUG=webm:1425 Change-Id: I966650df7e3face93e1e771634d1cc5458a35f85 --- vpx_dsp/arm/sad_neon.c | 64 ++++++++++++++++++++++-------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 ++-- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index f3e04231f..5c2c90f94 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -117,6 +117,45 @@ uint32_t vpx_sad16x32_neon(const uint8_t *src, int src_stride, return horizontal_add_16x8(abs); } +static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); + } + return abs; +} + +uint32_t vpx_sad32x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, 16); + return horizontal_add_16x8(abs); +} + +uint32_t vpx_sad32x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, 32); + return horizontal_add_16x8(abs); +} + +uint32_t vpx_sad32x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, 64); + return horizontal_add_16x8(abs); +} + static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { const uint32x4_t vec_l_lo = @@ -165,28 +204,3 @@ unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, } return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); } - -unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); -} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 392bc258d..1a9c6a393 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -702,13 +702,13 @@ add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride specialize qw/vpx_sad64x32 avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x64 avx2 msa sse2 vsx/; +specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 avx2 neon msa sse2 vsx/; +specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x16 avx2 msa sse2 vsx/; +specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad16x32 neon msa sse2 vsx/; From ad011aaab82fbc51f4fdf3d3c0939e9723bfc3d9 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 27 Jun 2017 15:17:31 -0700 Subject: [PATCH 5/5] sad neon: rewrite 64x64 and add 64x32 BUG=webm:1425 Change-Id: Ib454762d1c61b05a98324fe81ad58c9e09784717 --- test/sad_test.cc | 1 + vpx_dsp/arm/sad_neon.c | 82 +++++++++++++++++++----------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 3 files changed, 46 insertions(+), 41 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 50965adf4..8c7742717 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -644,6 +644,7 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); #if HAVE_NEON const SadMxNParam neon_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_neon), + SadMxNParam(64, 32, &vpx_sad64x32_neon), SadMxNParam(32, 32, &vpx_sad32x32_neon), SadMxNParam(16, 32, &vpx_sad16x32_neon), SadMxNParam(16, 16, &vpx_sad16x16_neon), diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 5c2c90f94..41c764c6f 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -156,51 +156,55 @@ uint32_t vpx_sad32x64_neon(const uint8_t *src, int src_stride, return horizontal_add_16x8(abs); } -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); +static INLINE uint32_t horizontal_add_32x4(const uint32x4_t a) { const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), vreinterpret_u32_u64(vget_high_u64(b))); return vget_lane_u32(c, 0); } -unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + a += a_stride; + b += b_stride; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); } - return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); +} + +uint32_t vpx_sad64x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, 32); + return horizontal_add_32x4(abs); +} + +uint32_t vpx_sad64x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, 64); + return horizontal_add_32x4(abs); } diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1a9c6a393..49d25fdd0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -696,10 +696,10 @@ specialize qw/vpx_subtract_block neon msa sse2/; # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 avx2 neon msa sse2 vsx/; +specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x32 avx2 msa sse2 vsx/; +specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx/;