vpxdsp: [x86] add highbd_dc_left_predictor functions

C vs SSE2 speed gains:
_4x4 : ~6.49x
_8x8 : ~10.82x
_16x16 : ~7.61x
_32x32 : ~5.29x

BUG=webm:1411

Change-Id: Ibc30c50cb7139049bf05298010803499e6ef949b
This commit is contained in:
Scott LaVarnway 2017-08-30 09:13:14 -07:00
parent 2d0c11093e
commit c39a05ff61
4 changed files with 80 additions and 8 deletions

View File

@ -481,28 +481,32 @@ HIGHBD_INTRA_PRED_TEST(
#if HAVE_SSE2 #if HAVE_SSE2
HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4, HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
vpx_highbd_dc_predictor_4x4_sse2, NULL, vpx_highbd_dc_predictor_4x4_sse2,
vpx_highbd_dc_left_predictor_4x4_sse2,
vpx_highbd_dc_top_predictor_4x4_sse2, NULL, vpx_highbd_dc_top_predictor_4x4_sse2, NULL,
vpx_highbd_v_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2,
vpx_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, vpx_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
NULL, NULL, vpx_highbd_tm_predictor_4x4_c) NULL, NULL, vpx_highbd_tm_predictor_4x4_c)
HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
vpx_highbd_dc_predictor_8x8_sse2, NULL, vpx_highbd_dc_predictor_8x8_sse2,
vpx_highbd_dc_left_predictor_8x8_sse2,
vpx_highbd_dc_top_predictor_8x8_sse2, NULL, vpx_highbd_dc_top_predictor_8x8_sse2, NULL,
vpx_highbd_v_predictor_8x8_sse2, vpx_highbd_v_predictor_8x8_sse2,
vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
vpx_highbd_dc_predictor_16x16_sse2, NULL, vpx_highbd_dc_predictor_16x16_sse2,
vpx_highbd_dc_left_predictor_16x16_sse2,
vpx_highbd_dc_top_predictor_16x16_sse2, NULL, vpx_highbd_dc_top_predictor_16x16_sse2, NULL,
vpx_highbd_v_predictor_16x16_sse2, vpx_highbd_v_predictor_16x16_sse2,
vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2) NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2)
HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
vpx_highbd_dc_predictor_32x32_sse2, NULL, vpx_highbd_dc_predictor_32x32_sse2,
vpx_highbd_dc_left_predictor_32x32_sse2,
vpx_highbd_dc_top_predictor_32x32_sse2, NULL, vpx_highbd_dc_top_predictor_32x32_sse2, NULL,
vpx_highbd_v_predictor_32x32_sse2, vpx_highbd_v_predictor_32x32_sse2,
vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,

View File

@ -471,6 +471,14 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_8, VP9HighbdIntraPredTest, SSE2_TO_C_8, VP9HighbdIntraPredTest,
::testing::Values( ::testing::Values(
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
&vpx_highbd_dc_left_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
&vpx_highbd_dc_left_predictor_8x8_c, 8, 8),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
&vpx_highbd_dc_left_predictor_16x16_c, 16, 8),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
&vpx_highbd_dc_left_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 8), &vpx_highbd_dc_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@ -515,6 +523,14 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_10, VP9HighbdIntraPredTest, SSE2_TO_C_10, VP9HighbdIntraPredTest,
::testing::Values( ::testing::Values(
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
&vpx_highbd_dc_left_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
&vpx_highbd_dc_left_predictor_8x8_c, 8, 10),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
&vpx_highbd_dc_left_predictor_16x16_c, 16, 10),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
&vpx_highbd_dc_left_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 10), &vpx_highbd_dc_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@ -559,6 +575,14 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_12, VP9HighbdIntraPredTest, SSE2_TO_C_12, VP9HighbdIntraPredTest,
::testing::Values( ::testing::Values(
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
&vpx_highbd_dc_left_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
&vpx_highbd_dc_left_predictor_8x8_c, 8, 12),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
&vpx_highbd_dc_left_predictor_16x16_c, 16, 12),
HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
&vpx_highbd_dc_left_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 12), &vpx_highbd_dc_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,

View File

@ -217,7 +217,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/; specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/; specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/; specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/;
@ -252,7 +252,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/; specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/; specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/; specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/;
@ -287,7 +287,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/; specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/; specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/; specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/;
@ -322,7 +322,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/; specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/; specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/; specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/;

View File

@ -177,6 +177,17 @@ static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
} }
} }
void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const __m128i two = _mm_cvtsi32_si128(2);
const __m128i sum = dc_sum_4(left);
const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
(void)above;
(void)bd;
dc_store_4x4(dst, stride, &dc);
}
void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
@ -210,6 +221,17 @@ static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
} }
} }
void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const __m128i four = _mm_cvtsi32_si128(4);
const __m128i sum = dc_sum_8(left);
const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
(void)above;
(void)bd;
dc_store_8x8(dst, stride, &dc);
}
void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
@ -241,6 +263,17 @@ static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
} }
} }
void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const __m128i eight = _mm_cvtsi32_si128(8);
const __m128i sum = dc_sum_16(left);
const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
(void)above;
(void)bd;
dc_store_16x16(dst, stride, &dc);
}
void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
@ -277,6 +310,17 @@ static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
} }
} }
void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const __m128i sixteen = _mm_cvtsi32_si128(16);
const __m128i sum = dc_sum_32(left);
const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
(void)above;
(void)bd;
dc_store_32x32(dst, stride, &dc);
}
void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {