diff --git a/build/make/Makefile b/build/make/Makefile
index 0d29609ff..90522e5f6 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -141,8 +141,8 @@ $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2

 # POWER
-$(BUILD_PFX)%_vsx.c.d: CFLAGS += -mvsx
-$(BUILD_PFX)%_vsx.c.o: CFLAGS += -mvsx
+$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
+$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx

 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
diff --git a/build/make/configure.sh b/build/make/configure.sh
index dcfdfe1d2..fbe8b1b45 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -674,7 +674,6 @@ check_xcode_minimum_version() {
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
-
     # detect tgt_isa
     case "$gcctarget" in
       aarch64*)
@@ -697,6 +696,9 @@ process_common_toolchain() {
       *sparc*)
         tgt_isa=sparc
         ;;
+      power*64*-*)
+        tgt_isa=ppc64
+        ;;
       power*)
         tgt_isa=ppc
         ;;
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 17dde1a52..d011beaf8 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -312,6 +312,19 @@ INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
                 vpx_tm_predictor_32x32_msa)
 #endif  // HAVE_MSA

+#if HAVE_VSX
+INTRA_PRED_TEST(VSX, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_vsx)
+
+INTRA_PRED_TEST(VSX, TestIntraPred16, NULL, NULL, NULL, NULL,
+                vpx_v_predictor_16x16_vsx, vpx_h_predictor_16x16_vsx, NULL,
+                NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_vsx)
+
+INTRA_PRED_TEST(VSX, TestIntraPred32, NULL, NULL, NULL, NULL,
+                vpx_v_predictor_32x32_vsx, vpx_h_predictor_32x32_vsx, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL)
+#endif  // HAVE_VSX
+
 // -----------------------------------------------------------------------------

 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/ppc/intrapred_vsx.c b/vpx_dsp/ppc/intrapred_vsx.c
new file mode 100644
index 000000000..cff6c7c2b
--- /dev/null
+++ b/vpx_dsp/ppc/intrapred_vsx.c
@@ -0,0 +1,297 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d = vec_vsx_ld(0, above);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 16; i++, dst += stride) {
+    vec_vsx_st(d, 0, dst);
+  }
+}
+
+void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vec_vsx_ld(0, above);
+  const uint8x16_t d1 = vec_vsx_ld(16, above);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 32; i++, dst += stride) {
+    vec_vsx_st(d0, 0, dst);
+    vec_vsx_st(d1, 16, dst);
+  }
+}
+
+void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d = vec_vsx_ld(0, left);
+  const uint8x16_t v0 = vec_splat(d, 0);
+  const uint8x16_t v1 = vec_splat(d, 1);
+  const uint8x16_t v2 = vec_splat(d, 2);
+  const uint8x16_t v3 = vec_splat(d, 3);
+
+  const uint8x16_t v4 = vec_splat(d, 4);
+  const uint8x16_t v5 = vec_splat(d, 5);
+  const uint8x16_t v6 = vec_splat(d, 6);
+  const uint8x16_t v7 = vec_splat(d, 7);
+
+  const uint8x16_t v8 = vec_splat(d, 8);
+  const uint8x16_t v9 = vec_splat(d, 9);
+  const uint8x16_t v10 = vec_splat(d, 10);
+  const uint8x16_t v11 = vec_splat(d, 11);
+
+  const uint8x16_t v12 = vec_splat(d, 12);
+  const uint8x16_t v13 = vec_splat(d, 13);
+  const uint8x16_t v14 = vec_splat(d, 14);
+  const uint8x16_t v15 = vec_splat(d, 15);
+
+  (void)above;
+
+  vec_vsx_st(v0, 0, dst);
+  dst += stride;
+  vec_vsx_st(v1, 0, dst);
+  dst += stride;
+  vec_vsx_st(v2, 0, dst);
+  dst += stride;
+  vec_vsx_st(v3, 0, dst);
+  dst += stride;
+  vec_vsx_st(v4, 0, dst);
+  dst += stride;
+  vec_vsx_st(v5, 0, dst);
+  dst += stride;
+  vec_vsx_st(v6, 0, dst);
+  dst += stride;
+  vec_vsx_st(v7, 0, dst);
+  dst += stride;
+  vec_vsx_st(v8, 0, dst);
+  dst += stride;
+  vec_vsx_st(v9, 0, dst);
+  dst += stride;
+  vec_vsx_st(v10, 0, dst);
+  dst += stride;
+  vec_vsx_st(v11, 0, dst);
+  dst += stride;
+  vec_vsx_st(v12, 0, dst);
+  dst += stride;
+  vec_vsx_st(v13, 0, dst);
+  dst += stride;
+  vec_vsx_st(v14, 0, dst);
+  dst += stride;
+  vec_vsx_st(v15, 0, dst);
+}
+
+#define H_PREDICTOR_32(v) \
+  vec_vsx_st(v, 0, dst);  \
+  vec_vsx_st(v, 16, dst); \
+  dst += stride
+
+void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vec_vsx_ld(0, left);
+  const uint8x16_t d1 = vec_vsx_ld(16, left);
+
+  const uint8x16_t v0_0 = vec_splat(d0, 0);
+  const uint8x16_t v1_0 = vec_splat(d0, 1);
+  const uint8x16_t v2_0 = vec_splat(d0, 2);
+  const uint8x16_t v3_0 = vec_splat(d0, 3);
+  const uint8x16_t v4_0 = vec_splat(d0, 4);
+  const uint8x16_t v5_0 = vec_splat(d0, 5);
+  const uint8x16_t v6_0 = vec_splat(d0, 6);
+  const uint8x16_t v7_0 = vec_splat(d0, 7);
+  const uint8x16_t v8_0 = vec_splat(d0, 8);
+  const uint8x16_t v9_0 = vec_splat(d0, 9);
+  const uint8x16_t v10_0 = vec_splat(d0, 10);
+  const uint8x16_t v11_0 = vec_splat(d0, 11);
+  const uint8x16_t v12_0 = vec_splat(d0, 12);
+  const uint8x16_t v13_0 = vec_splat(d0, 13);
+  const uint8x16_t v14_0 = vec_splat(d0, 14);
+  const uint8x16_t v15_0 = vec_splat(d0, 15);
+
+  const uint8x16_t v0_1 = vec_splat(d1, 0);
+  const uint8x16_t v1_1 = vec_splat(d1, 1);
+  const uint8x16_t v2_1 = vec_splat(d1, 2);
+  const uint8x16_t v3_1 = vec_splat(d1, 3);
+  const uint8x16_t v4_1 = vec_splat(d1, 4);
+  const uint8x16_t v5_1 = vec_splat(d1, 5);
+  const uint8x16_t v6_1 = vec_splat(d1, 6);
+  const uint8x16_t v7_1 = vec_splat(d1, 7);
+  const uint8x16_t v8_1 = vec_splat(d1, 8);
+  const uint8x16_t v9_1 = vec_splat(d1, 9);
+  const uint8x16_t v10_1 = vec_splat(d1, 10);
+  const uint8x16_t v11_1 = vec_splat(d1, 11);
+  const uint8x16_t v12_1 = vec_splat(d1, 12);
+  const uint8x16_t v13_1 = vec_splat(d1, 13);
+  const uint8x16_t v14_1 = vec_splat(d1, 14);
+  const uint8x16_t v15_1 = vec_splat(d1, 15);
+
+  (void)above;
+
+  H_PREDICTOR_32(v0_0);
+  H_PREDICTOR_32(v1_0);
+  H_PREDICTOR_32(v2_0);
+  H_PREDICTOR_32(v3_0);
+
+  H_PREDICTOR_32(v4_0);
+  H_PREDICTOR_32(v5_0);
+  H_PREDICTOR_32(v6_0);
+  H_PREDICTOR_32(v7_0);
+
+  H_PREDICTOR_32(v8_0);
+  H_PREDICTOR_32(v9_0);
+  H_PREDICTOR_32(v10_0);
+  H_PREDICTOR_32(v11_0);
+
+  H_PREDICTOR_32(v12_0);
+  H_PREDICTOR_32(v13_0);
+  H_PREDICTOR_32(v14_0);
+  H_PREDICTOR_32(v15_0);
+
+  H_PREDICTOR_32(v0_1);
+  H_PREDICTOR_32(v1_1);
+  H_PREDICTOR_32(v2_1);
+  H_PREDICTOR_32(v3_1);
+
+  H_PREDICTOR_32(v4_1);
+  H_PREDICTOR_32(v5_1);
+  H_PREDICTOR_32(v6_1);
+  H_PREDICTOR_32(v7_1);
+
+  H_PREDICTOR_32(v8_1);
+  H_PREDICTOR_32(v9_1);
+  H_PREDICTOR_32(v10_1);
+  H_PREDICTOR_32(v11_1);
+
+  H_PREDICTOR_32(v12_1);
+  H_PREDICTOR_32(v13_1);
+  H_PREDICTOR_32(v14_1);
+  H_PREDICTOR_32(v15_1);
+}
+
+void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
+  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
+  int16x8_t tmp, val;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+}
+
+static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
+                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
+  int16x8_t vh, vl, ls;
+
+  ls = vec_splat(l, 0);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 1);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 2);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 3);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 4);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 5);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 6);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 7);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+}
+
+void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+  const uint8x16_t l = vec_vsx_ld(0, left);
+  const int16x8_t lh = unpack_to_s16_h(l);
+  const int16x8_t ll = unpack_to_s16_l(l);
+  const uint8x16_t a = vec_vsx_ld(0, above);
+  const int16x8_t ah = unpack_to_s16_h(a);
+  const int16x8_t al = unpack_to_s16_l(a);
+
+  tm_predictor_16x8(dst, stride, lh, ah, al, tl);
+
+  dst += stride * 8;
+
+  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
+}
diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h
index 2f3aa2049..ff977c267 100644
--- a/vpx_dsp/ppc/types_vsx.h
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -13,8 +13,31 @@

 #include <altivec.h>

+typedef vector signed char int8x16_t;
+typedef vector unsigned char uint8x16_t;
 typedef vector signed short int16x8_t;
 typedef vector unsigned short uint16x8_t;
 typedef vector signed int int32x4_t;
+typedef vector unsigned int uint32x4_t;
+
+#ifdef WORDS_BIGENDIAN
+#define unpack_to_u16_h(v) \
+  (uint16x8_t)vec_mergeh(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_u16_l(v) \
+  (uint16x8_t)vec_mergel(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_s16_h(v) \
+  (int16x8_t)vec_mergeh(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_s16_l(v) \
+  (int16x8_t)vec_mergel(vec_splat_u8(0), (uint8x16_t)v)
+#else
+#define unpack_to_u16_h(v) \
+  (uint16x8_t)vec_mergeh((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_u16_l(v) \
+  (uint16x8_t)vec_mergel((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_s16_h(v) \
+  (int16x8_t)vec_mergeh((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_s16_l(v) \
+  (int16x8_t)vec_mergel((uint8x16_t)v, vec_splat_u8(0))
+#endif

 #endif  // VPX_DSP_PPC_TYPES_VSX_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index b959cdcda..64df59cfe 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -51,6 +51,7 @@ DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c

 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index e1181223d..5b009cfff 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -95,7 +95,7 @@ add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const
 specialize qw/vpx_v_predictor_8x8 neon msa sse2/;

 add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2/;
+specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/;

 add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/;
@@ -119,7 +119,7 @@ add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, c
 specialize qw/vpx_d63_predictor_16x16 ssse3/;

 add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2/;
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;

 add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

@@ -130,10 +130,10 @@ add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
 specialize qw/vpx_d153_predictor_16x16 ssse3/;

 add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_16x16 neon msa sse2/;
+specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;

 add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_16x16 neon msa sse2/;
+specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;

 add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2/;
@@ -157,7 +157,7 @@ add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c
 specialize qw/vpx_d63_predictor_32x32 ssse3/;

 add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_32x32 neon msa sse2/;
+specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;

 add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

@@ -168,7 +168,7 @@ add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride,
 specialize qw/vpx_d153_predictor_32x32 ssse3/;

 add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_32x32 neon msa sse2/;
+specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;

 add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_32x32 neon msa sse2/;
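
Reference notes, not part of the patch. The v and h predictors only replicate existing pixels: every row of a v-predicted block copies the row above, and row r of an h-predicted block repeats left[r], which is why a single vec_splat plus one or two 16-byte stores covers a whole row in the VSX code. A minimal scalar sketch of that mapping (the names v_predictor_ref and h_predictor_ref are illustrative, not libvpx API):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Row r of the H predictor is left[r] replicated bs times. */
static void h_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                            const uint8_t *left) {
  int r;
  for (r = 0; r < bs; r++, dst += stride) memset(dst, left[r], bs);
}

/* Every row of the V predictor is a copy of the row above the block. */
static void v_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                            const uint8_t *above) {
  int r;
  for (r = 0; r < bs; r++, dst += stride) memcpy(dst, above, bs);
}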
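
The tm (true-motion) kernels evaluate pred(r, c) = clip(left[r] + above[c] - above[-1]); the clamp to [0, 255] falls out of vec_packsu() when the 16-bit sums are packed back to bytes. A scalar sketch of the same formula, with hypothetical helper names tm_predictor_ref and clip_u8:

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* above[-1] is the pixel above and to the left of the block, the same value
 * the VSX code loads with vec_vsx_ld(-1, above) and splats into tl. */
static void tm_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                             const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < bs; r++, dst += stride) {
    for (c = 0; c < bs; c++) dst[c] = clip_u8(left[r] + above[c] - top_left);
  }
}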
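
The unpack_to_*16 macros added to types_vsx.h widen bytes to 16-bit lanes by interleaving with a zero vector; the WORDS_BIGENDIAN branch swaps the vec_mergeh/vec_mergel operand order so that byte i always lands in the low half of lane i regardless of endianness. A portable model of the intended lane mapping (the *_model names are illustrative):

#include <stdint.h>

/* unpack_to_s16_h(v) is meant to behave like: output lane i equals input
 * byte i, zero-extended to 16 bits. */
static void unpack_to_s16_h_model(const uint8_t src[16], int16_t out[8]) {
  int i;
  for (i = 0; i < 8; i++) out[i] = (int16_t)src[i];
}

/* unpack_to_s16_l(v) covers the remaining bytes 8..15. */
static void unpack_to_s16_l_model(const uint8_t src[16], int16_t out[8]) {
  int i;
  for (i = 0; i < 8; i++) out[i] = (int16_t)src[8 + i];
}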