Add high bitdepth intra prediction NEON optimization (h and v)

BUG=webm:1316

Change-Id: I47eeac698a98a31d1af5f72441052302e9fa4f46
This commit is contained in:
Linfeng Zhang 2016-10-28 09:42:11 -07:00 committed by James Zern
parent 186dc40e8e
commit a3128ad33a
4 changed files with 287 additions and 39 deletions

View File

@ -476,38 +476,32 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
#endif // HAVE_SSE2
#if HAVE_NEON
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred4,
vpx_highbd_dc_predictor_4x4_neon,
vpx_highbd_dc_left_predictor_4x4_neon,
vpx_highbd_dc_top_predictor_4x4_neon,
vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL,
vpx_highbd_d45_predictor_4x4_neon,
vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL,
NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred8,
vpx_highbd_dc_predictor_8x8_neon,
vpx_highbd_dc_left_predictor_8x8_neon,
vpx_highbd_dc_top_predictor_8x8_neon,
vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL,
vpx_highbd_d45_predictor_8x8_neon,
vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL,
NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
vpx_highbd_dc_predictor_16x16_neon,
vpx_highbd_dc_left_predictor_16x16_neon,
vpx_highbd_dc_top_predictor_16x16_neon,
vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL,
vpx_highbd_d45_predictor_16x16_neon,
vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
vpx_highbd_dc_predictor_32x32_neon,
vpx_highbd_dc_left_predictor_32x32_neon,
vpx_highbd_dc_top_predictor_32x32_neon,
vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL,
vpx_highbd_d45_predictor_32x32_neon,
vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
NULL, NULL)
HIGHBD_INTRA_PRED_TEST(
NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon,
vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon,
vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(
NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(
NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon,
vpx_highbd_dc_left_predictor_16x16_neon,
vpx_highbd_dc_top_predictor_16x16_neon,
vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon,
vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon,
vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(
NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon,
vpx_highbd_dc_left_predictor_32x32_neon,
vpx_highbd_dc_top_predictor_32x32_neon,
vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon,
vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon,
vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL)
#endif // HAVE_NEON
#endif // CONFIG_VP9_HIGHBITDEPTH

View File

@ -463,7 +463,23 @@ INSTANTIATE_TEST_CASE_P(
HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
&vpx_highbd_dc_top_predictor_16x16_c, 16, 8),
HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
&vpx_highbd_dc_top_predictor_32x32_c, 32, 8)));
&vpx_highbd_dc_top_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
&vpx_highbd_h_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
&vpx_highbd_h_predictor_8x8_c, 8, 8),
HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
&vpx_highbd_h_predictor_16x16_c, 16, 8),
HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
&vpx_highbd_h_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
&vpx_highbd_v_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
&vpx_highbd_v_predictor_8x8_c, 8, 8),
HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
&vpx_highbd_v_predictor_16x16_c, 16, 8),
HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
&vpx_highbd_v_predictor_32x32_c, 32, 8)));
INSTANTIATE_TEST_CASE_P(
NEON_TO_C_10, VP9HighbdIntraPredTest,
@ -515,7 +531,23 @@ INSTANTIATE_TEST_CASE_P(
HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
&vpx_highbd_dc_top_predictor_16x16_c, 16, 10),
HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
&vpx_highbd_dc_top_predictor_32x32_c, 32, 10)));
&vpx_highbd_dc_top_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
&vpx_highbd_h_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
&vpx_highbd_h_predictor_8x8_c, 8, 10),
HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
&vpx_highbd_h_predictor_16x16_c, 16, 10),
HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
&vpx_highbd_h_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
&vpx_highbd_v_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
&vpx_highbd_v_predictor_8x8_c, 8, 10),
HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
&vpx_highbd_v_predictor_16x16_c, 16, 10),
HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
&vpx_highbd_v_predictor_32x32_c, 32, 10)));
INSTANTIATE_TEST_CASE_P(
NEON_TO_C_12, VP9HighbdIntraPredTest,
@ -567,7 +599,23 @@ INSTANTIATE_TEST_CASE_P(
HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
&vpx_highbd_dc_top_predictor_16x16_c, 16, 12),
HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
&vpx_highbd_dc_top_predictor_32x32_c, 32, 12)));
&vpx_highbd_dc_top_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
&vpx_highbd_h_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
&vpx_highbd_h_predictor_8x8_c, 8, 12),
HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
&vpx_highbd_h_predictor_16x16_c, 16, 12),
HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
&vpx_highbd_h_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
&vpx_highbd_v_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
&vpx_highbd_v_predictor_8x8_c, 8, 12),
HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
&vpx_highbd_v_predictor_16x16_c, 16, 12),
HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
&vpx_highbd_v_predictor_32x32_c, 32, 12)));
#endif // HAVE_NEON
#endif // CONFIG_VP9_HIGHBITDEPTH

View File

@ -693,3 +693,205 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
row_6 = row_7;
}
}
//------------------------------------------------------------------------------
void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x4_t row = vld1_u16(above);
int i;
(void)left;
(void)bd;
for (i = 0; i < 4; i++, dst += stride) {
vst1_u16(dst, row);
}
}
void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t row = vld1q_u16(above);
int i;
(void)left;
(void)bd;
for (i = 0; i < 8; i++, dst += stride) {
vst1q_u16(dst, row);
}
}
void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8x2_t row = vld2q_u16(above);
int i;
(void)left;
(void)bd;
for (i = 0; i < 16; i++, dst += stride) {
vst2q_u16(dst, row);
}
}
void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8x2_t row0 = vld2q_u16(above);
const uint16x8x2_t row1 = vld2q_u16(above + 16);
int i;
(void)left;
(void)bd;
for (i = 0; i < 32; i++) {
vst2q_u16(dst, row0);
dst += 16;
vst2q_u16(dst, row1);
dst += stride - 16;
}
}
// -----------------------------------------------------------------------------
void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x4_t left_u16 = vld1_u16(left);
uint16x4_t row;
(void)above;
(void)bd;
row = vdup_lane_u16(left_u16, 0);
vst1_u16(dst, row);
dst += stride;
row = vdup_lane_u16(left_u16, 1);
vst1_u16(dst, row);
dst += stride;
row = vdup_lane_u16(left_u16, 2);
vst1_u16(dst, row);
dst += stride;
row = vdup_lane_u16(left_u16, 3);
vst1_u16(dst, row);
}
void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t left_u16 = vld1q_u16(left);
const uint16x4_t left_low = vget_low_u16(left_u16);
const uint16x4_t left_high = vget_high_u16(left_u16);
uint16x8_t row;
(void)above;
(void)bd;
row = vdupq_lane_u16(left_low, 0);
vst1q_u16(dst, row);
dst += stride;
row = vdupq_lane_u16(left_low, 1);
vst1q_u16(dst, row);
dst += stride;
row = vdupq_lane_u16(left_low, 2);
vst1q_u16(dst, row);
dst += stride;
row = vdupq_lane_u16(left_low, 3);
vst1q_u16(dst, row);
dst += stride;
row = vdupq_lane_u16(left_high, 0);
vst1q_u16(dst, row);
dst += stride;
row = vdupq_lane_u16(left_high, 1);
vst1q_u16(dst, row);
dst += stride;
row = vdupq_lane_u16(left_high, 2);
vst1q_u16(dst, row);
dst += stride;
row = vdupq_lane_u16(left_high, 3);
vst1q_u16(dst, row);
}
static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
const uint16x8_t row) {
// Note: vst1q is faster than vst2q
vst1q_u16(*dst, row);
*dst += 8;
vst1q_u16(*dst, row);
*dst += stride - 8;
}
void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
int i;
(void)above;
(void)bd;
for (i = 0; i < 2; i++, left += 8) {
const uint16x8_t left_u16q = vld1q_u16(left);
const uint16x4_t left_low = vget_low_u16(left_u16q);
const uint16x4_t left_high = vget_high_u16(left_u16q);
uint16x8_t row;
row = vdupq_lane_u16(left_low, 0);
h_store_16(&dst, stride, row);
row = vdupq_lane_u16(left_low, 1);
h_store_16(&dst, stride, row);
row = vdupq_lane_u16(left_low, 2);
h_store_16(&dst, stride, row);
row = vdupq_lane_u16(left_low, 3);
h_store_16(&dst, stride, row);
row = vdupq_lane_u16(left_high, 0);
h_store_16(&dst, stride, row);
row = vdupq_lane_u16(left_high, 1);
h_store_16(&dst, stride, row);
row = vdupq_lane_u16(left_high, 2);
h_store_16(&dst, stride, row);
row = vdupq_lane_u16(left_high, 3);
h_store_16(&dst, stride, row);
}
}
static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
const uint16x8_t row) {
// Note: vst1q is faster than vst2q
vst1q_u16(*dst, row);
*dst += 8;
vst1q_u16(*dst, row);
*dst += 8;
vst1q_u16(*dst, row);
*dst += 8;
vst1q_u16(*dst, row);
*dst += stride - 24;
}
void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
int i;
(void)above;
(void)bd;
for (i = 0; i < 4; i++, left += 8) {
const uint16x8_t left_u16q = vld1q_u16(left);
const uint16x4_t left_low = vget_low_u16(left_u16q);
const uint16x4_t left_high = vget_high_u16(left_u16q);
uint16x8_t row;
row = vdupq_lane_u16(left_low, 0);
h_store_32(&dst, stride, row);
row = vdupq_lane_u16(left_low, 1);
h_store_32(&dst, stride, row);
row = vdupq_lane_u16(left_low, 2);
h_store_32(&dst, stride, row);
row = vdupq_lane_u16(left_low, 3);
h_store_32(&dst, stride, row);
row = vdupq_lane_u16(left_high, 0);
h_store_32(&dst, stride, row);
row = vdupq_lane_u16(left_high, 1);
h_store_32(&dst, stride, row);
row = vdupq_lane_u16(left_high, 2);
h_store_32(&dst, stride, row);
row = vdupq_lane_u16(left_high, 3);
h_store_32(&dst, stride, row);
}
}

View File

@ -220,6 +220,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_h_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -229,7 +230,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_4x4 sse2/;
specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_4x4 sse2/;
@ -260,6 +261,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_h_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -269,7 +271,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_8x8 sse2/;
specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_8x8 sse2/;
@ -300,6 +302,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_h_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -309,7 +312,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_16x16 sse2/;
specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_16x16 sse2/;
@ -340,6 +343,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_h_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -349,7 +353,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_32x32 sse2/;
specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_32x32 sse2/;