Merge "Add Hadamard for Power8"

This commit is contained in:
Johann Koenig 2017-03-16 21:52:15 +00:00 committed by Gerrit Code Review
commit cd3d7cf4ac
7 changed files with 351 additions and 6 deletions

View File

@ -13,6 +13,7 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/vpx_timer.h"
#include "test/acm_random.h"
#include "test/register_state_check.h"
@ -99,8 +100,31 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
ACMRandom rnd_;
};
// Invokes `func` `times` times on the given input/output buffers and
// prints the total elapsed wall-clock time in microseconds.
void HadamardSpeedTest(const char *name, HadamardFunc const func,
                       const int16_t *input, int stride, tran_low_t *output,
                       int times) {
  int run;
  vpx_usec_timer timer;
  vpx_usec_timer_start(&timer);
  for (run = 0; run < times; ++run) {
    func(input, stride, output);
  }
  vpx_usec_timer_mark(&timer);
  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
  printf("%s[%12d runs]: %d us\n", name, times, elapsed_time);
}
class Hadamard8x8Test : public HadamardTestBase {};
// Benchmarks an 8x8 Hadamard implementation over `times` iterations on a
// constant-filled, 16-byte-aligned source block.
void HadamardSpeedTest8x8(HadamardFunc const func, int times) {
  DECLARE_ALIGNED(16, int16_t, src[64]);
  DECLARE_ALIGNED(16, tran_low_t, dst[64]);
  memset(src, 1, sizeof(src));
  HadamardSpeedTest("Hadamard8x8", func, src, 8, dst, times);
}
TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
DECLARE_ALIGNED(16, int16_t, a[64]);
DECLARE_ALIGNED(16, tran_low_t, b[64]);
@ -142,6 +166,12 @@ TEST_P(Hadamard8x8Test, VaryStride) {
}
}
// Speed test for the 8x8 Hadamard under test. Disabled by default; run with
// --gtest_also_run_disabled_tests. Three run counts expose both per-call
// overhead (short run) and steady-state throughput (long runs).
TEST_P(Hadamard8x8Test, DISABLED_Speed) {
  HadamardSpeedTest8x8(h_func_, 10);
  HadamardSpeedTest8x8(h_func_, 10000);
  HadamardSpeedTest8x8(h_func_, 10000000);
}
INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_c));
@ -169,8 +199,20 @@ INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
#endif // HAVE_MSA
#endif // !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_VSX
INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_vsx));
#endif // HAVE_VSX
class Hadamard16x16Test : public HadamardTestBase {};
// Benchmarks a 16x16 Hadamard implementation over `times` iterations on a
// constant-filled, 16-byte-aligned source block.
void HadamardSpeedTest16x16(HadamardFunc const func, int times) {
  DECLARE_ALIGNED(16, int16_t, src[256]);
  DECLARE_ALIGNED(16, tran_low_t, dst[256]);
  memset(src, 1, sizeof(src));
  HadamardSpeedTest("Hadamard16x16", func, src, 16, dst, times);
}
TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]);
@ -212,6 +254,12 @@ TEST_P(Hadamard16x16Test, VaryStride) {
}
}
// Speed test for the 16x16 Hadamard under test. Disabled by default; run
// with --gtest_also_run_disabled_tests. Three run counts expose both
// per-call overhead (short run) and steady-state throughput (long runs).
TEST_P(Hadamard16x16Test, DISABLED_Speed) {
  HadamardSpeedTest16x16(h_func_, 10);
  HadamardSpeedTest16x16(h_func_, 10000);
  HadamardSpeedTest16x16(h_func_, 10000000);
}
INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_c));
@ -220,6 +268,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_sse2));
#endif // HAVE_SSE2
#if HAVE_VSX
INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_vsx));
#endif // HAVE_VSX
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_neon));

View File

@ -0,0 +1,47 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
#define VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/ppc/types_vsx.h"
// Load 8 16-bit values from `s` at byte offset `c`. When tran_low_t is
// 32 bits (high bit depth builds), read two 4-lane vectors and pack them
// down to 16 bits with saturation; otherwise it is a plain 16-bit load.
static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t lo = vec_vsx_ld(c, s);
  const int32x4_t hi = vec_vsx_ld(c, s + 4);
  return vec_packs(lo, hi);
#else
  return vec_vsx_ld(c, s);
#endif
}
// Store 8 16-bit values to `s` at byte offset `c`. When tran_low_t is
// 32 bits (high bit depth builds), sign extend each lane by multiplying
// by 1 and re-interleave with merge so element order is preserved.
static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int16x8_t one = vec_splat_s16(1);
  // vec_mule/vec_mulo yield the even/odd lanes widened to 32 bits; the
  // merges restore the original lane order across the two result vectors.
  const int32x4_t prod_even = vec_mule(v, one);
  const int32x4_t prod_odd = vec_mulo(v, one);
  const int32x4_t front = vec_mergeh(prod_even, prod_odd);
  const int32x4_t back = vec_mergel(prod_even, prod_odd);
  vec_vsx_st(front, c, s);
  vec_vsx_st(back, c, s + 4);
#else
  vec_vsx_st(v, c, s);
#endif
}
#endif // VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_

119
vpx_dsp/ppc/hadamard_vsx.c Normal file
View File

@ -0,0 +1,119 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"
#include "vpx_dsp/ppc/transpose_vsx.h"
#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
/* One pass of an 8-point Hadamard butterfly applied in place to each lane
 * of the eight vectors in v. Note the permuted write-back order in the
 * final stage. */
static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) {
  /* Stage 1: pairwise sums and differences. */
  const int16x8_t s0 = vec_add(v[0], v[1]);
  const int16x8_t d0 = vec_sub(v[0], v[1]);
  const int16x8_t s1 = vec_add(v[2], v[3]);
  const int16x8_t d1 = vec_sub(v[2], v[3]);
  const int16x8_t s2 = vec_add(v[4], v[5]);
  const int16x8_t d2 = vec_sub(v[4], v[5]);
  const int16x8_t s3 = vec_add(v[6], v[7]);
  const int16x8_t d3 = vec_sub(v[6], v[7]);

  /* Stage 2: combine across the pairs. */
  const int16x8_t t0 = vec_add(s0, s1);
  const int16x8_t t1 = vec_add(d0, d1);
  const int16x8_t t2 = vec_sub(s0, s1);
  const int16x8_t t3 = vec_sub(d0, d1);
  const int16x8_t t4 = vec_add(s2, s3);
  const int16x8_t t5 = vec_add(d2, d3);
  const int16x8_t t6 = vec_sub(s2, s3);
  const int16x8_t t7 = vec_sub(d2, d3);

  /* Stage 3: final combine, written back in permuted order. */
  v[0] = vec_add(t0, t4);
  v[1] = vec_sub(t2, t6);
  v[2] = vec_sub(t0, t4);
  v[3] = vec_add(t2, t6);
  v[4] = vec_add(t3, t7);
  v[5] = vec_sub(t3, t7);
  v[6] = vec_sub(t1, t5);
  v[7] = vec_add(t1, t5);
}
/* 8x8 Hadamard transform of src_diff (row stride src_stride), writing 64
 * contiguous coefficients to coeff. Rows pass, transpose, columns pass. */
void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride,
                          tran_low_t *coeff) {
  int i;
  int16x8_t v[8];

  /* Gather the eight rows, dropping the stride. */
  for (i = 0; i < 8; ++i) {
    v[i] = vec_vsx_ld(0, src_diff + i * src_stride);
  }

  vpx_hadamard_s16_8x8_one_pass(v);
  vpx_transpose_s16_8x8(v);
  vpx_hadamard_s16_8x8_one_pass(v);

  /* Write out contiguously, 8 coefficients per row. */
  for (i = 0; i < 8; ++i) {
    store_tran_low(v[i], 0, coeff + i * 8);
  }
}
/* 16x16 Hadamard transform: four 8x8 transforms, one per quadrant, whose
 * outputs are then combined with a final butterfly stage. */
void vpx_hadamard_16x16_vsx(const int16_t *src_diff, int src_stride,
                            tran_low_t *coeff) {
  int n;
  const uint16x8_t shift1 = vec_splat_u16(1);

  /* Rearrange 16x16 to 8x32 and remove stride: each quadrant lands in its
   * own contiguous 64-coefficient region. */
  vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff); /* top left */
  vpx_hadamard_8x8_vsx(src_diff + 8, src_stride, coeff + 64); /* top right */
  vpx_hadamard_8x8_vsx(src_diff + 8 * src_stride, src_stride,
                       coeff + 128); /* bottom left */
  vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride,
                       coeff + 192); /* bottom right */

  /* Overlay the four 8x8 blocks and combine, 8 coefficients at a time. */
  for (n = 0; n < 64; n += 8, coeff += 8) {
    const int16x8_t q0 = load_tran_low(0, coeff);
    const int16x8_t q1 = load_tran_low(0, coeff + 64);
    const int16x8_t q2 = load_tran_low(0, coeff + 128);
    const int16x8_t q3 = load_tran_low(0, coeff + 192);

    /* Halve first so the butterfly below cannot escape int16_t. */
    const int16x8_t h0 = vec_sra(q0, shift1);
    const int16x8_t h1 = vec_sra(q1, shift1);
    const int16x8_t h2 = vec_sra(q2, shift1);
    const int16x8_t h3 = vec_sra(q3, shift1);

    const int16x8_t sum01 = vec_add(h0, h1);
    const int16x8_t sum23 = vec_add(h2, h3);
    const int16x8_t dif01 = vec_sub(h0, h1);
    const int16x8_t dif23 = vec_sub(h2, h3);

    store_tran_low(vec_add(sum01, sum23), 0, coeff);
    store_tran_low(vec_add(dif01, dif23), 0, coeff + 64);
    store_tran_low(vec_sub(sum01, sum23), 0, coeff + 128);
    store_tran_low(vec_sub(dif01, dif23), 0, coeff + 192);
  }
}

101
vpx_dsp/ppc/transpose_vsx.h Normal file
View File

@ -0,0 +1,101 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_PPC_TRANSPOSE_VSX_H_
#define VPX_DSP_PPC_TRANSPOSE_VSX_H_
#include "./vpx_config.h"
#include "vpx_dsp/ppc/types_vsx.h"
// In-place 8x8 transpose of eight int16x8_t vectors: on return, v[i] holds
// what was element i of each input vector. Implemented as three rounds of
// vec_mergeh/vec_mergel interleaves.
static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) {
  // d = vec_mergeh(a,b):
  // The even elements of the result are obtained left-to-right,
  // from the high elements of a.
  // The odd elements of the result are obtained left-to-right,
  // from the high elements of b.
  //
  // d = vec_mergel(a,b):
  // The even elements of the result are obtained left-to-right,
  // from the low elements of a.
  // The odd elements of the result are obtained left-to-right,
  // from the low elements of b.

  // Example, starting with:
  // v[0]: 00 01 02 03 04 05 06 07
  // v[1]: 10 11 12 13 14 15 16 17
  // v[2]: 20 21 22 23 24 25 26 27
  // v[3]: 30 31 32 33 34 35 36 37
  // v[4]: 40 41 42 43 44 45 46 47
  // v[5]: 50 51 52 53 54 55 56 57
  // v[6]: 60 61 62 63 64 65 66 67
  // v[7]: 70 71 72 73 74 75 76 77

  int16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;

  b0 = vec_mergeh(v[0], v[4]);
  b1 = vec_mergel(v[0], v[4]);
  b2 = vec_mergeh(v[1], v[5]);
  b3 = vec_mergel(v[1], v[5]);
  b4 = vec_mergeh(v[2], v[6]);
  b5 = vec_mergel(v[2], v[6]);
  b6 = vec_mergeh(v[3], v[7]);
  b7 = vec_mergel(v[3], v[7]);

  // After first merge operation
  // b0: 00 40 01 41 02 42 03 43
  // b1: 04 44 05 45 06 46 07 47
  // b2: 10 50 11 51 12 52 13 53
  // b3: 14 54 15 55 16 56 17 57
  // b4: 20 60 21 61 22 62 23 63
  // b5: 24 64 25 65 26 66 27 67
  // b6: 30 70 31 71 32 72 33 73
  // b7: 34 74 35 75 36 76 37 77

  c0 = vec_mergeh(b0, b4);
  c1 = vec_mergel(b0, b4);
  c2 = vec_mergeh(b1, b5);
  c3 = vec_mergel(b1, b5);
  c4 = vec_mergeh(b2, b6);
  c5 = vec_mergel(b2, b6);
  c6 = vec_mergeh(b3, b7);
  c7 = vec_mergel(b3, b7);

  // After second merge operation
  // c0: 00 20 40 60 01 21 41 61
  // c1: 02 22 42 62 03 23 43 63
  // c2: 04 24 44 64 05 25 45 65
  // c3: 06 26 46 66 07 27 47 67
  // c4: 10 30 50 70 11 31 51 71
  // c5: 12 32 52 72 13 33 53 73
  // c6: 14 34 54 74 15 35 55 75
  // c7: 16 36 56 76 17 37 57 77

  v[0] = vec_mergeh(c0, c4);
  v[1] = vec_mergel(c0, c4);
  v[2] = vec_mergeh(c1, c5);
  v[3] = vec_mergel(c1, c5);
  v[4] = vec_mergeh(c2, c6);
  v[5] = vec_mergel(c2, c6);
  v[6] = vec_mergeh(c3, c7);
  v[7] = vec_mergel(c3, c7);

  // After last merge operation
  // v[0]: 00 10 20 30 40 50 60 70
  // v[1]: 01 11 21 31 41 51 61 71
  // v[2]: 02 12 22 32 42 52 62 72
  // v[3]: 03 13 23 33 43 53 63 73
  // v[4]: 04 14 24 34 44 54 64 74
  // v[5]: 05 15 25 35 45 55 65 75
  // v[6]: 06 16 26 36 46 56 66 76
  // v[7]: 07 17 27 37 47 57 67 77
}
#endif // VPX_DSP_PPC_TRANSPOSE_VSX_H_

20
vpx_dsp/ppc/types_vsx.h Normal file
View File

@ -0,0 +1,20 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_PPC_TYPES_VSX_H_
#define VPX_DSP_PPC_TYPES_VSX_H_
#include <altivec.h>
typedef vector signed short int16x8_t;
typedef vector unsigned short uint16x8_t;
typedef vector signed int int32x4_t;
#endif // VPX_DSP_PPC_TYPES_VSX_H_

View File

@ -264,11 +264,12 @@ endif
DSP_SRCS-yes += avg.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c
endif # CONFIG_VP9_ENCODER
@ -337,6 +338,11 @@ endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
# Neon utilities
DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
# PPC VSX utilities
DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c

View File

@ -908,22 +908,21 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vpx_minmax_8x8 sse2 neon msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_16x16 sse2 neon/;
specialize qw/vpx_hadamard_16x16 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd sse2 neon/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd sse2 neon msa/;