diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index dcc3e256f..9944bbad9 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -518,18 +518,19 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
 
 #if HAVE_SSSE3
 HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+                       NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL,
+                       NULL, NULL, NULL, NULL, NULL)
 HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL,
-                       vpx_highbd_d207_predictor_8x8_ssse3,
+                       NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3, NULL,
+                       NULL, NULL, vpx_highbd_d207_predictor_8x8_ssse3,
                        vpx_highbd_d63_predictor_8x8_ssse3, NULL)
 HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL,
-                       vpx_highbd_d207_predictor_16x16_ssse3,
+                       NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3, NULL,
+                       NULL, NULL, vpx_highbd_d207_predictor_16x16_ssse3,
                        vpx_highbd_d63_predictor_16x16_ssse3, NULL)
 HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL,
-                       vpx_highbd_d207_predictor_32x32_ssse3,
+                       NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3, NULL,
+                       NULL, NULL, vpx_highbd_d207_predictor_32x32_ssse3,
                        vpx_highbd_d63_predictor_32x32_ssse3, NULL)
 #endif  // HAVE_SSSE3
 
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index c3e4c7f47..01728f3b9 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -471,6 +471,14 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
 INSTANTIATE_TEST_CASE_P(
     SSSE3_TO_C_8, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
                              &vpx_highbd_d63_predictor_8x8_c, 8, 8),
         HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
@@ -487,6 +495,14 @@ INSTANTIATE_TEST_CASE_P(
     SSSE3_TO_C_10, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
                              &vpx_highbd_d63_predictor_8x8_c, 8, 10),
         HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
@@ -503,6 +519,14 @@ INSTANTIATE_TEST_CASE_P(
     SSSE3_TO_C_12, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
                              &vpx_highbd_d63_predictor_8x8_c, 8, 12),
         HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 1b7c29991..4b3141dbd 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -192,7 +192,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d207_predictor_4x4 sse2/;
 
   add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
+  specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;
@@ -229,7 +229,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/;
 
   add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
+  specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;
@@ -266,7 +266,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/;
 
   add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
+  specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;
@@ -303,7 +303,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/;
 
   add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
+  specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;
diff --git a/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
index c0c05a8fd..fde6a8d01 100644
--- a/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
+++ b/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
@@ -35,6 +35,141 @@ static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
   return _mm_avg_epu16(b, *y);
 }
 
+void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+  (void)left;
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, avg3);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+  dst[3] = above[7];  // aka H
+}
+
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+                               __m128i *row, const __m128i *ar) {
+  *row = _mm_alignr_epi8(*ar, *row, 2);
+  _mm_store_si128((__m128i *)*dst, *row);
+  *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+  (void)left;
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, avg3);
+  dst += stride;
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+}
+
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+                                __m128i *row_0, __m128i *row_1,
+                                const __m128i *ar) {
+  *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
+  *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
+  _mm_store_si128((__m128i *)*dst, *row_0);
+  _mm_store_si128((__m128i *)(*dst + 8), *row_1);
+  *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  (void)left;
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, avg3_0);
+  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+  dst += stride;
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+}
+
+void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  int i;
+  (void)left;
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, avg3_0);
+  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+  dst += stride;
+  for (i = 1; i < 32; ++i) {
+    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+    _mm_store_si128((__m128i *)dst, avg3_0);
+    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+    dst += stride;
+  }
+}
+
 static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b) {
   _mm_store_si128((__m128i *)*dst, *a);