From bc4bcca3fdcc839794daf6cc0aa9eacc3befb854 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Tue, 5 Sep 2017 07:52:36 -0700
Subject: [PATCH] vpxdsp: [x86] add highbd_dc_128_predictor functions

C vs SSE2 speed gains:
_4x4 : ~7.64x
_8x8 : ~16.60x
_16x16 : ~8.15x
_32x32 : ~5.05x

BUG=webm:1411

Change-Id: If165d419711cfda901bd428a05ca1560a009e62e
---
 test/test_intra_pred_speed.cc              | 12 ++++---
 test/vp9_intrapred_test.cc                 | 24 +++++++++++++
 vpx_dsp/vpx_dsp_rtcd_defs.pl               |  8 ++---
 vpx_dsp/x86/highbd_intrapred_intrin_sse2.c | 40 ++++++++++++++++++++++
 4 files changed, 76 insertions(+), 8 deletions(-)

Review notes (free text placed before the first file diff; patch tools skip
it when applying):

* highbd_intrapred_intrin_sse2.c gains four functions,
  vpx_highbd_dc_128_predictor_{4x4,8x8,16x16,32x32}_sse2. Each one builds the
  fill value 1 << (bd - 1), moves it into the low 32 bits of a vector with
  _mm_cvtsi32_si128, copies the lowest 16-bit lane into the four low lanes
  with _mm_shufflelo_epi16(dc, 0x0), and passes the result to the matching
  dc_store_NxN helper. The above and left arguments are not read; they are
  cast to void.
* _mm_shufflelo_epi16 does not touch the upper 64 bits, so the four upper
  16-bit lanes of dc_dup remain zero. The dc_store_* helpers are not part of
  this patch; presumably they broadcast the low lane to the full row width
  themselves. TODO(review): confirm against their definitions.
* For the bit depths exercised by the added tests (8, 10 and 12) the fill
  value fits in a single 16-bit lane.
* vpx_dsp_rtcd_defs.pl appends sse2 to the specialize list of the four
  vpx_highbd_dc_128_predictor_* prototypes, next to the existing neon entry.
* vp9_intrapred_test.cc pairs every new SSE2 function with its _c
  counterpart in the SSE2_TO_C_8, SSE2_TO_C_10 and SSE2_TO_C_12
  instantiations.
* test_intra_pred_speed.cc replaces the NULL argument that followed the
  dc_top predictor with the new dc_128 function for each block size.

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 8a50c7546..cbc1a8c43 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -483,7 +483,8 @@ HIGHBD_INTRA_PRED_TEST(
 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
                        vpx_highbd_dc_predictor_4x4_sse2,
                        vpx_highbd_dc_left_predictor_4x4_sse2,
-                       vpx_highbd_dc_top_predictor_4x4_sse2, NULL,
+                       vpx_highbd_dc_top_predictor_4x4_sse2,
+                       vpx_highbd_dc_128_predictor_4x4_sse2,
                        vpx_highbd_v_predictor_4x4_sse2,
                        vpx_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
                        NULL, NULL, vpx_highbd_tm_predictor_4x4_c)
@@ -491,7 +492,8 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
                        vpx_highbd_dc_predictor_8x8_sse2,
                        vpx_highbd_dc_left_predictor_8x8_sse2,
-                       vpx_highbd_dc_top_predictor_8x8_sse2, NULL,
+                       vpx_highbd_dc_top_predictor_8x8_sse2,
+                       vpx_highbd_dc_128_predictor_8x8_sse2,
                        vpx_highbd_v_predictor_8x8_sse2,
                        vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
                        NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
@@ -499,7 +501,8 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
                        vpx_highbd_dc_predictor_16x16_sse2,
                        vpx_highbd_dc_left_predictor_16x16_sse2,
-                       vpx_highbd_dc_top_predictor_16x16_sse2, NULL,
+                       vpx_highbd_dc_top_predictor_16x16_sse2,
+                       vpx_highbd_dc_128_predictor_16x16_sse2,
                        vpx_highbd_v_predictor_16x16_sse2,
                        vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
                        NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2)
@@ -507,7 +510,8 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
                        vpx_highbd_dc_predictor_32x32_sse2,
                        vpx_highbd_dc_left_predictor_32x32_sse2,
-                       vpx_highbd_dc_top_predictor_32x32_sse2, NULL,
+                       vpx_highbd_dc_top_predictor_32x32_sse2,
+                       vpx_highbd_dc_128_predictor_32x32_sse2,
                        vpx_highbd_v_predictor_32x32_sse2,
                        vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,
                        NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2)
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index d87215b43..96985bd14 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -471,6 +471,14 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
 INSTANTIATE_TEST_CASE_P(
     SSE2_TO_C_8, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
                              &vpx_highbd_dc_left_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
@@ -523,6 +531,14 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2_TO_C_10, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
                              &vpx_highbd_dc_left_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
@@ -575,6 +591,14 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2_TO_C_12, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
                              &vpx_highbd_dc_left_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 3826b1301..1b0a5c27e 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -220,7 +220,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/;
+  specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -255,7 +255,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/;
+  specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -290,7 +290,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/;
+  specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -325,7 +325,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/;
+  specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;
 }  # CONFIG_VP9_HIGHBITDEPTH
 
 #
diff --git a/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
index 668178fdd..83113a293 100644
--- a/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
@@ -199,6 +199,16 @@ void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
   dc_store_4x4(dst, stride, &dc);
 }
 
+void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+  (void)above;
+  (void)left;
+  dc_store_4x4(dst, stride, &dc_dup);
+}
+
 //------------------------------------------------------------------------------
 // DC 8x8
 
@@ -243,6 +253,16 @@ void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
   dc_store_8x8(dst, stride, &dc);
 }
 
+void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+  (void)above;
+  (void)left;
+  dc_store_8x8(dst, stride, &dc_dup);
+}
+
 //------------------------------------------------------------------------------
 // DC 16x16
 
@@ -285,6 +305,16 @@ void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
   dc_store_16x16(dst, stride, &dc);
 }
 
+void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+  (void)above;
+  (void)left;
+  dc_store_16x16(dst, stride, &dc_dup);
+}
+
 //------------------------------------------------------------------------------
 // DC 32x32
 
@@ -331,3 +361,13 @@ void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
   (void)bd;
   dc_store_32x32(dst, stride, &dc);
 }
+
+void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+  (void)above;
+  (void)left;
+  dc_store_32x32(dst, stride, &dc_dup);
+}