Add high bitdepth intra prediction NEON optimization (mode d45 and d135)

BUG=webm:1316

Change-Id: I6a330874348df04df24a6d9efdc06f567e04bf8e
This commit is contained in:
Linfeng Zhang 2016-10-27 16:06:07 -07:00
parent f5141ea45f
commit 40ab0424d4
4 changed files with 482 additions and 8 deletions

View File

@ -480,26 +480,34 @@ HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred4,
vpx_highbd_dc_predictor_4x4_neon,
vpx_highbd_dc_left_predictor_4x4_neon,
vpx_highbd_dc_top_predictor_4x4_neon,
vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL,
vpx_highbd_d45_predictor_4x4_neon,
vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL,
NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred8,
vpx_highbd_dc_predictor_8x8_neon,
vpx_highbd_dc_left_predictor_8x8_neon,
vpx_highbd_dc_top_predictor_8x8_neon,
vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL,
vpx_highbd_d45_predictor_8x8_neon,
vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL,
NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
vpx_highbd_dc_predictor_16x16_neon,
vpx_highbd_dc_left_predictor_16x16_neon,
vpx_highbd_dc_top_predictor_16x16_neon,
vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL,
vpx_highbd_d45_predictor_16x16_neon,
vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
vpx_highbd_dc_predictor_32x32_neon,
vpx_highbd_dc_left_predictor_32x32_neon,
vpx_highbd_dc_top_predictor_32x32_neon,
vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL,
vpx_highbd_d45_predictor_32x32_neon,
vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
NULL, NULL)
#endif // HAVE_NEON
#endif // CONFIG_VP9_HIGHBITDEPTH

View File

@ -416,6 +416,22 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
NEON_TO_C_8, VP9HighbdIntraPredTest,
::testing::Values(
HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
&vpx_highbd_d45_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
&vpx_highbd_d45_predictor_8x8_c, 8, 8),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
&vpx_highbd_d45_predictor_16x16_c, 16, 8),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
&vpx_highbd_d45_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
&vpx_highbd_d135_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
&vpx_highbd_d135_predictor_8x8_c, 8, 8),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
&vpx_highbd_d135_predictor_16x16_c, 16, 8),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
&vpx_highbd_d135_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
&vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@ -452,6 +468,22 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
NEON_TO_C_10, VP9HighbdIntraPredTest,
::testing::Values(
HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
&vpx_highbd_d45_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
&vpx_highbd_d45_predictor_8x8_c, 8, 10),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
&vpx_highbd_d45_predictor_16x16_c, 16, 10),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
&vpx_highbd_d45_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
&vpx_highbd_d135_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
&vpx_highbd_d135_predictor_8x8_c, 8, 10),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
&vpx_highbd_d135_predictor_16x16_c, 16, 10),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
&vpx_highbd_d135_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
&vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@ -488,6 +520,22 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
NEON_TO_C_12, VP9HighbdIntraPredTest,
::testing::Values(
HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
&vpx_highbd_d45_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
&vpx_highbd_d45_predictor_8x8_c, 8, 12),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
&vpx_highbd_d45_predictor_16x16_c, 16, 12),
HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
&vpx_highbd_d45_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
&vpx_highbd_d135_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
&vpx_highbd_d135_predictor_8x8_c, 8, 12),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
&vpx_highbd_d135_predictor_16x16_c, 16, 12),
HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
&vpx_highbd_d135_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
&vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,

View File

@ -283,3 +283,413 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
(void)left;
dc_store_32x32(dst, stride, dc);
}
// -----------------------------------------------------------------------------
void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t ABCDEFGH = vld1q_u16(above);
const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);
const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);
const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);
const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);
const uint16x4_t avg2_low = vget_low_u16(avg2);
const uint16x4_t avg2_high = vget_high_u16(avg2);
const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);
const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);
const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);
(void)left;
(void)bd;
vst1_u16(dst, avg2_low);
dst += stride;
vst1_u16(dst, r1);
dst += stride;
vst1_u16(dst, r2);
dst += stride;
vst1_u16(dst, r3);
vst1q_lane_u16(dst + 3, ABCDEFGH, 7);
}
static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
const uint16x8_t above_right, uint16x8_t *row) {
*row = vextq_u16(*row, above_right, 1);
vst1q_u16(*dst, *row);
*dst += stride;
}
void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t A0 = vld1q_u16(above);
const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);
const uint16x8_t A1 = vld1q_u16(above + 1);
const uint16x8_t A2 = vld1q_u16(above + 2);
const uint16x8_t avg1 = vhaddq_u16(A0, A2);
uint16x8_t row = vrhaddq_u16(avg1, A1);
(void)left;
(void)bd;
vst1q_u16(dst, row);
dst += stride;
d45_store_8(&dst, stride, above_right, &row);
d45_store_8(&dst, stride, above_right, &row);
d45_store_8(&dst, stride, above_right, &row);
d45_store_8(&dst, stride, above_right, &row);
d45_store_8(&dst, stride, above_right, &row);
d45_store_8(&dst, stride, above_right, &row);
vst1q_u16(dst, above_right);
}
static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
const uint16x8_t above_right, uint16x8_t *row_0,
uint16x8_t *row_1) {
*row_0 = vextq_u16(*row_0, *row_1, 1);
*row_1 = vextq_u16(*row_1, above_right, 1);
vst1q_u16(*dst, *row_0);
*dst += 8;
vst1q_u16(*dst, *row_1);
*dst += stride - 8;
}
void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t A0_0 = vld1q_u16(above);
const uint16x8_t A0_1 = vld1q_u16(above + 8);
const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);
const uint16x8_t A1_0 = vld1q_u16(above + 1);
const uint16x8_t A1_1 = vld1q_u16(above + 9);
const uint16x8_t A2_0 = vld1q_u16(above + 2);
const uint16x8_t A2_1 = vld1q_u16(above + 10);
const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
(void)left;
(void)bd;
vst1q_u16(dst, row_0);
vst1q_u16(dst + 8, row_1);
dst += stride;
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
d45_store_16(&dst, stride, above_right, &row_0, &row_1);
vst1q_u16(dst, above_right);
vst1q_u16(dst + 8, above_right);
}
void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t A0_0 = vld1q_u16(above);
const uint16x8_t A0_1 = vld1q_u16(above + 8);
const uint16x8_t A0_2 = vld1q_u16(above + 16);
const uint16x8_t A0_3 = vld1q_u16(above + 24);
const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);
const uint16x8_t A1_0 = vld1q_u16(above + 1);
const uint16x8_t A1_1 = vld1q_u16(above + 9);
const uint16x8_t A1_2 = vld1q_u16(above + 17);
const uint16x8_t A1_3 = vld1q_u16(above + 25);
const uint16x8_t A2_0 = vld1q_u16(above + 2);
const uint16x8_t A2_1 = vld1q_u16(above + 10);
const uint16x8_t A2_2 = vld1q_u16(above + 18);
const uint16x8_t A2_3 = vld1q_u16(above + 26);
const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);
uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);
int i;
(void)left;
(void)bd;
vst1q_u16(dst, row_0);
dst += 8;
vst1q_u16(dst, row_1);
dst += 8;
vst1q_u16(dst, row_2);
dst += 8;
vst1q_u16(dst, row_3);
dst += stride - 24;
for (i = 0; i < 30; ++i) {
row_0 = vextq_u16(row_0, row_1, 1);
row_1 = vextq_u16(row_1, row_2, 1);
row_2 = vextq_u16(row_2, row_3, 1);
row_3 = vextq_u16(row_3, above_right, 1);
vst1q_u16(dst, row_0);
dst += 8;
vst1q_u16(dst, row_1);
dst += 8;
vst1q_u16(dst, row_2);
dst += 8;
vst1q_u16(dst, row_3);
dst += stride - 24;
}
vst1q_u16(dst, above_right);
dst += 8;
vst1q_u16(dst, above_right);
dst += 8;
vst1q_u16(dst, above_right);
dst += 8;
vst1q_u16(dst, above_right);
}
// -----------------------------------------------------------------------------
void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t XA0123___ = vld1q_u16(above - 1);
const uint16x4_t L0123 = vld1_u16(left);
const uint16x4_t L3210 = vrev64_u16(L0123);
const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
const uint16x4_t row_0 = vget_low_u16(avg2);
const uint16x4_t row_1 = vget_high_u16(avg2);
const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
(void)bd;
vst1_u16(dst, r0);
dst += stride;
vst1_u16(dst, r1);
dst += stride;
vst1_u16(dst, r2);
dst += stride;
vst1_u16(dst, row_0);
}
void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t XA0123456 = vld1q_u16(above - 1);
const uint16x8_t A01234567 = vld1q_u16(above);
const uint16x8_t A1234567_ = vld1q_u16(above + 1);
const uint16x8_t L01234567 = vld1q_u16(left);
const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
(void)bd;
vst1q_u16(dst, r0);
dst += stride;
vst1q_u16(dst, r1);
dst += stride;
vst1q_u16(dst, r2);
dst += stride;
vst1q_u16(dst, r3);
dst += stride;
vst1q_u16(dst, r4);
dst += stride;
vst1q_u16(dst, r5);
dst += stride;
vst1q_u16(dst, r6);
dst += stride;
vst1q_u16(dst, row_0);
}
static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
const uint16x8_t row_0,
const uint16x8_t row_1) {
vst1q_u16(*dst, row_0);
*dst += 8;
vst1q_u16(*dst, row_1);
*dst += stride - 8;
}
void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t L01234567 = vld1q_u16(left);
const uint16x8_t L89abcdef = vld1q_u16(left + 8);
const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);
const uint16x8_t XA0123456 = vld1q_u16(above - 1);
const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);
const uint16x8_t A01234567 = vld1q_u16(above);
const uint16x8_t A12345678 = vld1q_u16(above + 1);
const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);
const uint16x8_t A789abcde = vld1q_u16(above + 7);
const uint16x8_t A89abcdef = vld1q_u16(above + 8);
const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);
const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
(void)bd;
d135_store_16(&dst, stride, r0_0, r0_1);
d135_store_16(&dst, stride, r1_0, r1_1);
d135_store_16(&dst, stride, r2_0, r2_1);
d135_store_16(&dst, stride, r3_0, r3_1);
d135_store_16(&dst, stride, r4_0, r4_1);
d135_store_16(&dst, stride, r5_0, r5_1);
d135_store_16(&dst, stride, r6_0, r6_1);
d135_store_16(&dst, stride, row_1, row_2);
d135_store_16(&dst, stride, r8_0, r0_0);
d135_store_16(&dst, stride, r9_0, r1_0);
d135_store_16(&dst, stride, ra_0, r2_0);
d135_store_16(&dst, stride, rb_0, r3_0);
d135_store_16(&dst, stride, rc_0, r4_0);
d135_store_16(&dst, stride, rd_0, r5_0);
d135_store_16(&dst, stride, re_0, r6_0);
vst1q_u16(dst, row_0);
dst += 8;
vst1q_u16(dst, row_1);
}
void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const uint16x8_t LL01234567 = vld1q_u16(left + 16);
const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);
const uint16x8_t LU01234567 = vld1q_u16(left);
const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);
const uint16x8_t AL01234567 = vld1q_u16(above);
const uint16x8_t AL12345678 = vld1q_u16(above + 1);
const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);
const uint16x8_t AL789abcde = vld1q_u16(above + 7);
const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);
const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
const uint16x8_t AR01234567 = vld1q_u16(above + 16);
const uint16x8_t AR12345678 = vld1q_u16(above + 17);
const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);
const uint16x8_t AR789abcde = vld1q_u16(above + 23);
const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
int i, j;
(void)bd;
dst += 31 * stride;
for (i = 0; i < 4; ++i) {
for (j = 0; j < 8; ++j) {
vst1q_u16(dst, row_0);
dst += 8;
vst1q_u16(dst, row_1);
dst += 8;
vst1q_u16(dst, row_2);
dst += 8;
vst1q_u16(dst, row_3);
dst -= stride + 24;
row_0 = vextq_u16(row_0, row_1, 1);
row_1 = vextq_u16(row_1, row_2, 1);
row_2 = vextq_u16(row_2, row_3, 1);
row_3 = vextq_u16(row_3, row_4, 1);
row_4 = vextq_u16(row_4, row_4, 1);
}
row_4 = row_5;
row_5 = row_6;
row_6 = row_7;
}
}

View File

@ -211,6 +211,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -223,6 +224,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d135_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -249,6 +251,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -261,6 +264,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d135_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -287,6 +291,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -299,6 +304,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d135_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -325,6 +331,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@ -337,6 +344,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d135_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";