Merge "Add Hadamard for Power8"

This commit is contained in:
Johann Koenig 2017-03-16 21:52:15 +00:00 committed by Gerrit Code Review
commit cd3d7cf4ac
7 changed files with 351 additions and 6 deletions

View File

@ -13,6 +13,7 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/vpx_timer.h"
#include "test/acm_random.h"
#include "test/register_state_check.h"
@ -99,8 +100,31 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
ACMRandom rnd_;
};
// Invokes `func` `times` times on the given input/output buffers and
// prints the total elapsed wall-clock time in microseconds.
void HadamardSpeedTest(const char *name, HadamardFunc const func,
                       const int16_t *input, int stride, tran_low_t *output,
                       int times) {
  int run;
  vpx_usec_timer timer;
  vpx_usec_timer_start(&timer);
  for (run = 0; run < times; ++run) {
    func(input, stride, output);
  }
  vpx_usec_timer_mark(&timer);
  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
  printf("%s[%12d runs]: %d us\n", name, times, elapsed_time);
}
class Hadamard8x8Test : public HadamardTestBase {};
// Benchmarks an 8x8 Hadamard implementation over `times` iterations on a
// constant-filled, 16-byte-aligned source block.
void HadamardSpeedTest8x8(HadamardFunc const func, int times) {
  DECLARE_ALIGNED(16, int16_t, src[64]);
  DECLARE_ALIGNED(16, tran_low_t, dst[64]);
  memset(src, 1, sizeof(src));
  HadamardSpeedTest("Hadamard8x8", func, src, 8, dst, times);
}
TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
DECLARE_ALIGNED(16, int16_t, a[64]);
DECLARE_ALIGNED(16, tran_low_t, b[64]);
@ -142,6 +166,12 @@ TEST_P(Hadamard8x8Test, VaryStride) {
}
}
// Speed test for the 8x8 Hadamard under test. Disabled by default; run with
// --gtest_also_run_disabled_tests. Three run counts expose both per-call
// overhead (short run) and steady-state throughput (long runs).
TEST_P(Hadamard8x8Test, DISABLED_Speed) {
  HadamardSpeedTest8x8(h_func_, 10);
  HadamardSpeedTest8x8(h_func_, 10000);
  HadamardSpeedTest8x8(h_func_, 10000000);
}
INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_c));
@ -169,8 +199,20 @@ INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
#endif // HAVE_MSA
#endif // !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_VSX
INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_vsx));
#endif // HAVE_VSX
class Hadamard16x16Test : public HadamardTestBase {};
// Benchmarks a 16x16 Hadamard implementation over `times` iterations on a
// constant-filled, 16-byte-aligned source block.
void HadamardSpeedTest16x16(HadamardFunc const func, int times) {
  DECLARE_ALIGNED(16, int16_t, src[256]);
  DECLARE_ALIGNED(16, tran_low_t, dst[256]);
  memset(src, 1, sizeof(src));
  HadamardSpeedTest("Hadamard16x16", func, src, 16, dst, times);
}
TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]);
@ -212,6 +254,12 @@ TEST_P(Hadamard16x16Test, VaryStride) {
}
}
// Speed test for the 16x16 Hadamard under test. Disabled by default; run
// with --gtest_also_run_disabled_tests. Three run counts expose both
// per-call overhead (short run) and steady-state throughput (long runs).
TEST_P(Hadamard16x16Test, DISABLED_Speed) {
  HadamardSpeedTest16x16(h_func_, 10);
  HadamardSpeedTest16x16(h_func_, 10000);
  HadamardSpeedTest16x16(h_func_, 10000000);
}
INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_c));
@ -220,6 +268,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_sse2));
#endif // HAVE_SSE2
#if HAVE_VSX
INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_vsx));
#endif // HAVE_VSX
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_neon));

View File

@ -0,0 +1,47 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
#define VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/ppc/types_vsx.h"
// Load 8 16-bit values from `s` at byte offset `c`. When tran_low_t is
// 32 bits (high bit depth builds), read two 4-lane vectors and pack them
// down to 16 bits with saturation; otherwise it is a plain 16-bit load.
static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t lo = vec_vsx_ld(c, s);
  const int32x4_t hi = vec_vsx_ld(c, s + 4);
  return vec_packs(lo, hi);
#else
  return vec_vsx_ld(c, s);
#endif
}
// Store 8 16-bit values to `s` at byte offset `c`. When tran_low_t is
// 32 bits (high bit depth builds), sign extend each lane by multiplying
// by 1 and re-interleave with merge so element order is preserved.
static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int16x8_t one = vec_splat_s16(1);
  // vec_mule/vec_mulo yield the even/odd lanes widened to 32 bits; the
  // merges restore the original lane order across the two result vectors.
  const int32x4_t prod_even = vec_mule(v, one);
  const int32x4_t prod_odd = vec_mulo(v, one);
  const int32x4_t front = vec_mergeh(prod_even, prod_odd);
  const int32x4_t back = vec_mergel(prod_even, prod_odd);
  vec_vsx_st(front, c, s);
  vec_vsx_st(back, c, s + 4);
#else
  vec_vsx_st(v, c, s);
#endif
}
#endif // VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_

119
vpx_dsp/ppc/hadamard_vsx.c Normal file
View File

@ -0,0 +1,119 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"
#include "vpx_dsp/ppc/transpose_vsx.h"
#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
/* One pass of an 8-point Hadamard butterfly applied in place to each lane
 * of the eight vectors in v. Note the permuted write-back order in the
 * final stage. */
static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) {
  /* Stage 1: pairwise sums and differences. */
  const int16x8_t s0 = vec_add(v[0], v[1]);
  const int16x8_t d0 = vec_sub(v[0], v[1]);
  const int16x8_t s1 = vec_add(v[2], v[3]);
  const int16x8_t d1 = vec_sub(v[2], v[3]);
  const int16x8_t s2 = vec_add(v[4], v[5]);
  const int16x8_t d2 = vec_sub(v[4], v[5]);
  const int16x8_t s3 = vec_add(v[6], v[7]);
  const int16x8_t d3 = vec_sub(v[6], v[7]);

  /* Stage 2: combine across the pairs. */
  const int16x8_t t0 = vec_add(s0, s1);
  const int16x8_t t1 = vec_add(d0, d1);
  const int16x8_t t2 = vec_sub(s0, s1);
  const int16x8_t t3 = vec_sub(d0, d1);
  const int16x8_t t4 = vec_add(s2, s3);
  const int16x8_t t5 = vec_add(d2, d3);
  const int16x8_t t6 = vec_sub(s2, s3);
  const int16x8_t t7 = vec_sub(d2, d3);

  /* Stage 3: final combine, written back in permuted order. */
  v[0] = vec_add(t0, t4);
  v[1] = vec_sub(t2, t6);
  v[2] = vec_sub(t0, t4);
  v[3] = vec_add(t2, t6);
  v[4] = vec_add(t3, t7);
  v[5] = vec_sub(t3, t7);
  v[6] = vec_sub(t1, t5);
  v[7] = vec_add(t1, t5);
}
/* 8x8 Hadamard transform of src_diff (row stride src_stride), writing 64
 * contiguous coefficients to coeff. Rows pass, transpose, columns pass. */
void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride,
                          tran_low_t *coeff) {
  int i;
  int16x8_t v[8];

  /* Gather the eight rows, dropping the stride. */
  for (i = 0; i < 8; ++i) {
    v[i] = vec_vsx_ld(0, src_diff + i * src_stride);
  }

  vpx_hadamard_s16_8x8_one_pass(v);
  vpx_transpose_s16_8x8(v);
  vpx_hadamard_s16_8x8_one_pass(v);

  /* Write out contiguously, 8 coefficients per row. */
  for (i = 0; i < 8; ++i) {
    store_tran_low(v[i], 0, coeff + i * 8);
  }
}
/* 16x16 Hadamard transform: four 8x8 transforms, one per quadrant, whose
 * outputs are then combined with a final butterfly stage. */
void vpx_hadamard_16x16_vsx(const int16_t *src_diff, int src_stride,
                            tran_low_t *coeff) {
  int n;
  const uint16x8_t shift1 = vec_splat_u16(1);

  /* Rearrange 16x16 to 8x32 and remove stride: each quadrant lands in its
   * own contiguous 64-coefficient region. */
  vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff); /* top left */
  vpx_hadamard_8x8_vsx(src_diff + 8, src_stride, coeff + 64); /* top right */
  vpx_hadamard_8x8_vsx(src_diff + 8 * src_stride, src_stride,
                       coeff + 128); /* bottom left */
  vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride,
                       coeff + 192); /* bottom right */

  /* Overlay the four 8x8 blocks and combine, 8 coefficients at a time. */
  for (n = 0; n < 64; n += 8, coeff += 8) {
    const int16x8_t q0 = load_tran_low(0, coeff);
    const int16x8_t q1 = load_tran_low(0, coeff + 64);
    const int16x8_t q2 = load_tran_low(0, coeff + 128);
    const int16x8_t q3 = load_tran_low(0, coeff + 192);

    /* Halve first so the butterfly below cannot escape int16_t. */
    const int16x8_t h0 = vec_sra(q0, shift1);
    const int16x8_t h1 = vec_sra(q1, shift1);
    const int16x8_t h2 = vec_sra(q2, shift1);
    const int16x8_t h3 = vec_sra(q3, shift1);

    const int16x8_t sum01 = vec_add(h0, h1);
    const int16x8_t sum23 = vec_add(h2, h3);
    const int16x8_t dif01 = vec_sub(h0, h1);
    const int16x8_t dif23 = vec_sub(h2, h3);

    store_tran_low(vec_add(sum01, sum23), 0, coeff);
    store_tran_low(vec_add(dif01, dif23), 0, coeff + 64);
    store_tran_low(vec_sub(sum01, sum23), 0, coeff + 128);
    store_tran_low(vec_sub(dif01, dif23), 0, coeff + 192);
  }
}

101
vpx_dsp/ppc/transpose_vsx.h Normal file
View File

@ -0,0 +1,101 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_PPC_TRANSPOSE_VSX_H_
#define VPX_DSP_PPC_TRANSPOSE_VSX_H_
#include "./vpx_config.h"
#include "vpx_dsp/ppc/types_vsx.h"
// In-place 8x8 transpose of eight int16x8_t vectors: on return, v[i] holds
// what was element i of each input vector. Implemented as three rounds of
// vec_mergeh/vec_mergel interleaves.
static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) {
  // d = vec_mergeh(a,b):
  // The even elements of the result are obtained left-to-right,
  // from the high elements of a.
  // The odd elements of the result are obtained left-to-right,
  // from the high elements of b.
  //
  // d = vec_mergel(a,b):
  // The even elements of the result are obtained left-to-right,
  // from the low elements of a.
  // The odd elements of the result are obtained left-to-right,
  // from the low elements of b.

  // Example, starting with:
  // v[0]: 00 01 02 03 04 05 06 07
  // v[1]: 10 11 12 13 14 15 16 17
  // v[2]: 20 21 22 23 24 25 26 27
  // v[3]: 30 31 32 33 34 35 36 37
  // v[4]: 40 41 42 43 44 45 46 47
  // v[5]: 50 51 52 53 54 55 56 57
  // v[6]: 60 61 62 63 64 65 66 67
  // v[7]: 70 71 72 73 74 75 76 77

  int16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;

  b0 = vec_mergeh(v[0], v[4]);
  b1 = vec_mergel(v[0], v[4]);
  b2 = vec_mergeh(v[1], v[5]);
  b3 = vec_mergel(v[1], v[5]);
  b4 = vec_mergeh(v[2], v[6]);
  b5 = vec_mergel(v[2], v[6]);
  b6 = vec_mergeh(v[3], v[7]);
  b7 = vec_mergel(v[3], v[7]);

  // After first merge operation
  // b0: 00 40 01 41 02 42 03 43
  // b1: 04 44 05 45 06 46 07 47
  // b2: 10 50 11 51 12 52 13 53
  // b3: 14 54 15 55 16 56 17 57
  // b4: 20 60 21 61 22 62 23 63
  // b5: 24 64 25 65 26 66 27 67
  // b6: 30 70 31 71 32 72 33 73
  // b7: 34 74 35 75 36 76 37 77

  c0 = vec_mergeh(b0, b4);
  c1 = vec_mergel(b0, b4);
  c2 = vec_mergeh(b1, b5);
  c3 = vec_mergel(b1, b5);
  c4 = vec_mergeh(b2, b6);
  c5 = vec_mergel(b2, b6);
  c6 = vec_mergeh(b3, b7);
  c7 = vec_mergel(b3, b7);

  // After second merge operation
  // c0: 00 20 40 60 01 21 41 61
  // c1: 02 22 42 62 03 23 43 63
  // c2: 04 24 44 64 05 25 45 65
  // c3: 06 26 46 66 07 27 47 67
  // c4: 10 30 50 70 11 31 51 71
  // c5: 12 32 52 72 13 33 53 73
  // c6: 14 34 54 74 15 35 55 75
  // c7: 16 36 56 76 17 37 57 77

  v[0] = vec_mergeh(c0, c4);
  v[1] = vec_mergel(c0, c4);
  v[2] = vec_mergeh(c1, c5);
  v[3] = vec_mergel(c1, c5);
  v[4] = vec_mergeh(c2, c6);
  v[5] = vec_mergel(c2, c6);
  v[6] = vec_mergeh(c3, c7);
  v[7] = vec_mergel(c3, c7);

  // After last merge operation
  // v[0]: 00 10 20 30 40 50 60 70
  // v[1]: 01 11 21 31 41 51 61 71
  // v[2]: 02 12 22 32 42 52 62 72
  // v[3]: 03 13 23 33 43 53 63 73
  // v[4]: 04 14 24 34 44 54 64 74
  // v[5]: 05 15 25 35 45 55 65 75
  // v[6]: 06 16 26 36 46 56 66 76
  // v[7]: 07 17 27 37 47 57 67 77
}
#endif // VPX_DSP_PPC_TRANSPOSE_VSX_H_

20
vpx_dsp/ppc/types_vsx.h Normal file
View File

@ -0,0 +1,20 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_PPC_TYPES_VSX_H_
#define VPX_DSP_PPC_TYPES_VSX_H_
#include <altivec.h>
typedef vector signed short int16x8_t;
typedef vector unsigned short uint16x8_t;
typedef vector signed int int32x4_t;
#endif // VPX_DSP_PPC_TYPES_VSX_H_

View File

@ -264,11 +264,12 @@ endif
DSP_SRCS-yes += avg.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c
endif # CONFIG_VP9_ENCODER
@ -337,6 +338,11 @@ endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
# Neon utilities
DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
# PPC VSX utilities
DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c

View File

@ -908,22 +908,21 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vpx_minmax_8x8 sse2 neon msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_16x16 sse2 neon/;
specialize qw/vpx_hadamard_16x16 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd sse2 neon/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd sse2 neon msa/;