diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index f23e946f0..a1adef297 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -476,38 +476,32 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, #endif // HAVE_SSE2 #if HAVE_NEON -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred4, - vpx_highbd_dc_predictor_4x4_neon, - vpx_highbd_dc_left_predictor_4x4_neon, - vpx_highbd_dc_top_predictor_4x4_neon, - vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL, - vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, - NULL, NULL) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred8, - vpx_highbd_dc_predictor_8x8_neon, - vpx_highbd_dc_left_predictor_8x8_neon, - vpx_highbd_dc_top_predictor_8x8_neon, - vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL, - vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, - NULL, NULL) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_neon, - vpx_highbd_dc_left_predictor_16x16_neon, - vpx_highbd_dc_top_predictor_16x16_neon, - vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL, - vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, - NULL, NULL) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_neon, - vpx_highbd_dc_left_predictor_32x32_neon, - vpx_highbd_dc_top_predictor_32x32_neon, - vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL, - vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, - NULL, NULL) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, + vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, + vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, + vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, + vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, + vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, + vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, + vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, + vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index d573baef6..74107be08 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -463,7 +463,23 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, &vpx_highbd_dc_top_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, - &vpx_highbd_dc_top_predictor_32x32_c, 32, 8))); + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); INSTANTIATE_TEST_CASE_P( NEON_TO_C_10, VP9HighbdIntraPredTest, @@ -515,7 +531,23 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, - &vpx_highbd_dc_top_predictor_32x32_c, 32, 10))); + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); INSTANTIATE_TEST_CASE_P( NEON_TO_C_12, VP9HighbdIntraPredTest, @@ -567,7 +599,23 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, - &vpx_highbd_dc_top_predictor_32x32_c, 32, 12))); + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 9177fb45d..ea959586e 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -693,3 +693,205 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, row_6 = row_7; } } + +//------------------------------------------------------------------------------ + +void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t row = vld1_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 4; i++, dst += stride) { + vst1_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t row = vld1q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 8; i++, dst += stride) { + vst1q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row = vld2q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 16; i++, dst += stride) { + vst2q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row0 = vld2q_u16(above); + const uint16x8x2_t row1 = vld2q_u16(above + 16); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 32; i++) { + vst2q_u16(dst, row0); + dst += 16; + vst2q_u16(dst, row1); + dst += stride - 16; + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t left_u16 = vld1_u16(left); + uint16x4_t row; + (void)above; + (void)bd; + + row = vdup_lane_u16(left_u16, 0); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 1); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 2); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 3); + vst1_u16(dst, row); +} + +void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16); + const uint16x4_t left_high = vget_high_u16(left_u16); + uint16x8_t row; + (void)above; + (void)bd; + + row = vdupq_lane_u16(left_low, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 3); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 3); + vst1q_u16(dst, row); +} + +static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 8; +} + +void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_16(&dst, stride, row); + } +} + +static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 24; +} + +void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_32(&dst, stride, row); + } +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 73da9ef37..10e632a84 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -220,6 +220,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -229,7 +230,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_4x4 sse2/; + specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_4x4 sse2/; @@ -260,6 +261,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -269,7 +271,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_8x8 sse2/; + specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_8x8 sse2/; @@ -300,6 +302,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -309,7 +312,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_16x16 sse2/; + specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_16x16 sse2/; @@ -340,6 +343,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -349,7 +353,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_32x32 sse2/; + specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_tm_predictor_32x32 sse2/;