diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 0bf6b0c23..b16f14c8e 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -37,120 +37,23 @@ const int number_of_iterations = 10000; #if CONFIG_VP9_HIGHBITDEPTH typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd); + int bd); typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); #else typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count); + const uint8_t *limit, const uint8_t *thresh); typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); #endif // CONFIG_VP9_HIGHBITDEPTH -typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t; +typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t; typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t; -#if HAVE_SSE2 -#if CONFIG_VP9_HIGHBITDEPTH -void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { - vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd); -} - -void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { - vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd); -} - -void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { - vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd); -} - -void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { - vpx_highbd_lpf_vertical_16_dual_c(s, p,
blimit, limit, thresh, bd); -} -#else -void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh); -} -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_SSE2 - -#if HAVE_NEON_ASM -#if CONFIG_VP9_HIGHBITDEPTH -// No neon high bitdepth functions. -#else -void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh); -} -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_NEON_ASM - -#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) -void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t 
*thresh, - int count) { - vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh); -} -#endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) - class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { public: virtual ~Loop8Test6Param() {} @@ -158,7 +61,6 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); - count_ = GET_PARAM(3); mask_ = (1 << bit_depth_) - 1; } @@ -166,7 +68,6 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { protected: int bit_depth_; - int count_; int mask_; loop_op_t loopfilter_op_; loop_op_t ref_loopfilter_op_; @@ -253,13 +154,13 @@ TEST_P(Loop8Test6Param, OperationCheck) { ref_s[j] = s[j]; } #if CONFIG_VP9_HIGHBITDEPTH - ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd); + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd); ASM_REGISTER_STATE_CHECK( - loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd)); + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd)); #else - ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_); + ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh); ASM_REGISTER_STATE_CHECK( - loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_)); + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh)); #endif // CONFIG_VP9_HIGHBITDEPTH for (int j = 0; j < kNumCoeffs; ++j) { @@ -325,13 +226,13 @@ TEST_P(Loop8Test6Param, ValueCheck) { ref_s[j] = s[j]; } #if CONFIG_VP9_HIGHBITDEPTH - ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd); + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd); ASM_REGISTER_STATE_CHECK( - loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd)); + loopfilter_op_(s + 8
+ p * 8, p, blimit, limit, thresh, bd)); #else - ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_); + ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh); ASM_REGISTER_STATE_CHECK( - loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_)); + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh)); #endif // CONFIG_VP9_HIGHBITDEPTH for (int j = 0; j < kNumCoeffs; ++j) { err_count += ref_s[j] != s[j]; @@ -529,70 +430,85 @@ TEST_P(Loop8Test9Param, ValueCheck) { using std::tr1::make_tuple; +#if HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + MMX, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_mmx, + &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_vertical_4_mmx, + &vpx_lpf_vertical_4_c, 8))); +#endif // HAVE_MMX + #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( SSE2, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, - &vpx_highbd_lpf_horizontal_4_c, 8, 1), + &vpx_highbd_lpf_horizontal_4_c, 8), make_tuple(&vpx_highbd_lpf_vertical_4_sse2, - &vpx_highbd_lpf_vertical_4_c, 8, 1), + &vpx_highbd_lpf_vertical_4_c, 8), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, - &vpx_highbd_lpf_horizontal_8_c, 8, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 8, 2), + &vpx_highbd_lpf_horizontal_8_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, + &vpx_highbd_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, + &vpx_highbd_lpf_horizontal_edge_16_c, 8), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, - &vpx_highbd_lpf_vertical_8_c, 8, 1), - make_tuple(&wrapper_vertical_16_sse2, - &wrapper_vertical_16_c, 8, 1), + &vpx_highbd_lpf_vertical_8_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 8), make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, - 
&vpx_highbd_lpf_horizontal_4_c, 10, 1), + &vpx_highbd_lpf_horizontal_4_c, 10), make_tuple(&vpx_highbd_lpf_vertical_4_sse2, - &vpx_highbd_lpf_vertical_4_c, 10, 1), + &vpx_highbd_lpf_vertical_4_c, 10), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, - &vpx_highbd_lpf_horizontal_8_c, 10, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 10, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 10, 2), + &vpx_highbd_lpf_horizontal_8_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, + &vpx_highbd_lpf_horizontal_edge_8_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, + &vpx_highbd_lpf_horizontal_edge_16_c, 10), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, - &vpx_highbd_lpf_vertical_8_c, 10, 1), - make_tuple(&wrapper_vertical_16_sse2, - &wrapper_vertical_16_c, 10, 1), + &vpx_highbd_lpf_vertical_8_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 10), make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, - &vpx_highbd_lpf_horizontal_4_c, 12, 1), + &vpx_highbd_lpf_horizontal_4_c, 12), make_tuple(&vpx_highbd_lpf_vertical_4_sse2, - &vpx_highbd_lpf_vertical_4_c, 12, 1), + &vpx_highbd_lpf_vertical_4_c, 12), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, - &vpx_highbd_lpf_horizontal_8_c, 12, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 12, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 12, 2), + &vpx_highbd_lpf_horizontal_8_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, + &vpx_highbd_lpf_horizontal_edge_8_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, + &vpx_highbd_lpf_horizontal_edge_16_c, 12), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, - &vpx_highbd_lpf_vertical_8_c, 12, 1), - make_tuple(&wrapper_vertical_16_sse2, - &wrapper_vertical_16_c, 12, 1), - make_tuple(&wrapper_vertical_16_dual_sse2, - &wrapper_vertical_16_dual_c, 8, 1), - 
make_tuple(&wrapper_vertical_16_dual_sse2, - &wrapper_vertical_16_dual_c, 10, 1), - make_tuple(&wrapper_vertical_16_dual_sse2, - &wrapper_vertical_16_dual_c, 12, 1))); + &vpx_highbd_lpf_vertical_8_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 12))); #else INSTANTIATE_TEST_CASE_P( SSE2, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2), - make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1), - make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1), - make_tuple(&wrapper_vertical_16_dual_sse2, - &wrapper_vertical_16_dual_c, 8, 1))); + make_tuple(&vpx_lpf_horizontal_8_sse2, + &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_8_sse2, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_sse2, + &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_vertical_8_sse2, + &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_sse2, + &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_sse2, + &vpx_lpf_vertical_16_dual_c, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif @@ -600,9 +516,10 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( AVX2, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, - 2))); + make_tuple(&vpx_lpf_horizontal_edge_8_avx2, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_avx2, + 
&vpx_lpf_horizontal_edge_16_c, 8))); #endif #if HAVE_SSE2 @@ -659,23 +576,23 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_NEON_ASM // Using #if inside the macro is unsupported on MSVS but the tests are not // currently built for MSVS with ARM and NEON. - make_tuple(&vpx_lpf_horizontal_16_neon, - &vpx_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_neon, - &vpx_lpf_horizontal_16_c, 8, 2), - make_tuple(&wrapper_vertical_16_neon, - &wrapper_vertical_16_c, 8, 1), - make_tuple(&wrapper_vertical_16_dual_neon, - &wrapper_vertical_16_dual_c, 8, 1), + make_tuple(&vpx_lpf_horizontal_edge_8_neon, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_neon, + &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_neon, + &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_neon, + &vpx_lpf_vertical_16_dual_c, 8), #endif // HAVE_NEON_ASM make_tuple(&vpx_lpf_horizontal_8_neon, - &vpx_lpf_horizontal_8_c, 8, 1), + &vpx_lpf_horizontal_8_c, 8), make_tuple(&vpx_lpf_vertical_8_neon, - &vpx_lpf_vertical_8_c, 8, 1), + &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_horizontal_4_neon, - &vpx_lpf_horizontal_4_c, 8, 1), + &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_vertical_4_neon, - &vpx_lpf_vertical_4_c, 8, 1))); + &vpx_lpf_vertical_4_c, 8))); INSTANTIATE_TEST_CASE_P( NEON, Loop8Test9Param, ::testing::Values( @@ -692,15 +609,58 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + DSPR2, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_dspr2, + &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dspr2, + &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_8_dspr2, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_dspr2, + &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_vertical_4_dspr2, + &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_dspr2, +
&vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_dspr2, + &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_dspr2, + &vpx_lpf_vertical_16_dual_c, 8))); + +INSTANTIATE_TEST_CASE_P( + DSPR2, Loop8Test9Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_dual_dspr2, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_dspr2, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_dspr2, + &vpx_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_dspr2, + &vpx_lpf_vertical_8_dual_c, 8))); +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH + #if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) INSTANTIATE_TEST_CASE_P( MSA, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2), - make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1), - make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1))); + make_tuple(&vpx_lpf_horizontal_4_msa, + &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_msa, + &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_8_msa, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_msa, + &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_vertical_4_msa, + &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_msa, + &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_msa, + &vpx_lpf_vertical_16_c, 8))); INSTANTIATE_TEST_CASE_P( MSA, Loop8Test9Param, diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c index a1925de55..8f4fc8ccd 100644 --- a/vp10/common/loopfilter.c +++ b/vp10/common/loopfilter.c @@ -324,7 +324,6 @@ static void filter_selectively_vert_row2(int subsampling_factor, const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = 
lfi_n->lfthr + *(lfl + lfl_forward); - // TODO(yunqingwang): count in loopfilter functions should be removed. if (mask & 1) { if ((mask_16x16_0 | mask_16x16_1) & 1) { if ((mask_16x16_0 & mask_16x16_1) & 1) { @@ -345,11 +344,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); } else if (mask_8x8_0 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - 1); + vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } @@ -359,11 +357,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); } else if (mask_4x4_0 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - 1); + vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } @@ -374,10 +371,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr); } else if (mask_4x4_int_0 & 1) { vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1); + lfi0->hev_thr); } else { vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } } @@ -424,7 +421,6 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - // TODO(yunqingwang): count in loopfilter functions should be removed. 
if (mask & 1) { if ((mask_16x16_0 | mask_16x16_1) & 1) { if ((mask_16x16_0 & mask_16x16_1) & 1) { @@ -446,10 +442,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_8x8_0 & 1) { vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } @@ -460,10 +456,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_4x4_0 & 1) { vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } @@ -474,10 +470,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_4x4_int_0 & 1) { vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } } @@ -514,12 +510,12 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); count = 2; } else { - vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -537,18 +533,18 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else { if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, 
lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); else if (mask_4x4_int & 2) vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + lfin->lim, lfin->hev_thr); } count = 2; } else { - vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } else if (mask_4x4 & 1) { if ((mask_4x4 & 3) == 3) { @@ -565,22 +561,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else { if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); else if (mask_4x4_int & 2) vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + lfin->lim, lfin->hev_thr); } count = 2; } else { - vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } else if (mask_4x4_int & 1) { vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } s += 8 * count; @@ -611,12 +607,12 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2, bd); + vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); count = 2; } else { - vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -635,20 +631,20 @@ static void 
highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else { if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } else if (mask_4x4_int & 2) { vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1, bd); + lfin->lim, lfin->hev_thr, bd); } } count = 2; } else { vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } } } else if (mask_4x4 & 1) { @@ -667,25 +663,25 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else { if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } else if (mask_4x4_int & 2) { vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1, bd); + lfin->lim, lfin->hev_thr, bd); } } count = 2; } else { vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } } } else if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } } s += 8 * count; @@ -1127,13 +1123,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, if (mask_16x16 & 1) { vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } else if (mask_8x8 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } else if (mask_4x4 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); 
+ vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } } if (mask_4x4_int & 1) - vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); s += 8; lfl += 1; mask_16x16 >>= 1; @@ -1163,15 +1159,15 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch, lfi->hev_thr, bd); } else if (mask_8x8 & 1) { vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } else if (mask_4x4 & 1) { vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } } if (mask_4x4_int & 1) vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); s += 8; lfl += 1; mask_16x16 >>= 1; diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 79c3c4820..aca69bd0f 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -324,7 +324,6 @@ static void filter_selectively_vert_row2(int subsampling_factor, const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - // TODO(yunqingwang): count in loopfilter functions should be removed. 
if (mask & 1) { if ((mask_16x16_0 | mask_16x16_1) & 1) { if ((mask_16x16_0 & mask_16x16_1) & 1) { @@ -345,11 +344,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); } else if (mask_8x8_0 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - 1); + vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } @@ -359,11 +357,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); } else if (mask_4x4_0 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - 1); + vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } @@ -374,10 +371,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr); } else if (mask_4x4_int_0 & 1) { vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1); + lfi0->hev_thr); } else { vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } } @@ -424,7 +421,6 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - // TODO(yunqingwang): count in loopfilter functions should be removed. 
if (mask & 1) { if ((mask_16x16_0 | mask_16x16_1) & 1) { if ((mask_16x16_0 & mask_16x16_1) & 1) { @@ -446,10 +442,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_8x8_0 & 1) { vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } @@ -460,10 +456,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_4x4_0 & 1) { vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } @@ -474,10 +470,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_4x4_int_0 & 1) { vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } } @@ -514,12 +510,12 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); count = 2; } else { - vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -537,18 +533,18 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else { if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, 
lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); else if (mask_4x4_int & 2) vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + lfin->lim, lfin->hev_thr); } count = 2; } else { - vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } else if (mask_4x4 & 1) { if ((mask_4x4 & 3) == 3) { @@ -565,22 +561,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else { if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); else if (mask_4x4_int & 2) vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + lfin->lim, lfin->hev_thr); } count = 2; } else { - vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } else if (mask_4x4_int & 1) { vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } s += 8 * count; @@ -611,12 +607,12 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2, bd); + vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); count = 2; } else { - vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -635,20 +631,20 @@ static void 
highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else { if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } else if (mask_4x4_int & 2) { vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1, bd); + lfin->lim, lfin->hev_thr, bd); } } count = 2; } else { vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } } } else if (mask_4x4 & 1) { @@ -667,25 +663,25 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else { if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } else if (mask_4x4_int & 2) { vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1, bd); + lfin->lim, lfin->hev_thr, bd); } } count = 2; } else { vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } } } else if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } } s += 8 * count; @@ -1102,13 +1098,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, if (mask_16x16 & 1) { vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } else if (mask_8x8 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } else if (mask_4x4 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); 
+ vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } } if (mask_4x4_int & 1) - vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); s += 8; lfl += 1; mask_16x16 >>= 1; @@ -1138,15 +1134,15 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch, lfi->hev_thr, bd); } else if (mask_8x8 & 1) { vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } else if (mask_4x4 & 1) { vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } } if (mask_4x4_int & 1) vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); s += 8; lfl += 1; mask_16x16 >>= 1; diff --git a/vpx_dsp/arm/loopfilter_4_neon.asm b/vpx_dsp/arm/loopfilter_4_neon.asm index e45e34cd4..937115898 100644 --- a/vpx_dsp/arm/loopfilter_4_neon.asm +++ b/vpx_dsp/arm/loopfilter_4_neon.asm @@ -16,37 +16,28 @@ ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. 
; ; void vpx_lpf_horizontal_4_neon(uint8_t *s, ; int p /* pitch */, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; const uint8_t *thresh) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_horizontal_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #8] ; load count ldr r2, [sp, #4] ; load thresh add r1, r1, r1 ; double pitch - cmp r12, #0 - beq end_vpx_lf_h_edge - vld1.8 {d1[]}, [r3] ; duplicate *limit vld1.8 {d2[]}, [r2] ; duplicate *thresh -count_lf_h_loop sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines add r3, r2, r1, lsr #1 ; set to 3 lines down @@ -69,47 +60,34 @@ count_lf_h_loop vst1.u8 {d6}, [r2@64], r1 ; store oq0 vst1.u8 {d7}, [r3@64], r1 ; store oq1 - add r0, r0, #8 - subs r12, r12, #1 - bne count_lf_h_loop - -end_vpx_lf_h_edge pop {pc} ENDP ; |vpx_lpf_horizontal_4_neon| ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. 
; ; void vpx_lpf_vertical_4_neon(uint8_t *s, ; int p /* pitch */, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; const uint8_t *thresh) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_vertical_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #8] ; load count vld1.8 {d1[]}, [r3] ; duplicate *limit ldr r3, [sp, #4] ; load thresh sub r2, r0, #4 ; move s pointer down by 4 columns - cmp r12, #0 - beq end_vpx_lf_v_edge vld1.8 {d2[]}, [r3] ; duplicate *thresh -count_lf_v_loop vld1.u8 {d3}, [r2], r1 ; load s data vld1.u8 {d4}, [r2], r1 vld1.u8 {d5}, [r2], r1 @@ -149,12 +127,6 @@ count_lf_v_loop vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] - add r0, r0, r1, lsl #3 ; s += pitch * 8 - subs r12, r12, #1 - subne r2, r0, #4 ; move s pointer down by 4 columns - bne count_lf_v_loop - -end_vpx_lf_v_edge pop {pc} ENDP ; |vpx_lpf_vertical_4_neon| diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c index 7ad411aea..7f3ee70b9 100644 --- a/vpx_dsp/arm/loopfilter_4_neon.c +++ b/vpx_dsp/arm/loopfilter_4_neon.c @@ -115,22 +115,18 @@ void vpx_lpf_horizontal_4_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i; uint8_t *s, *psrc; uint8x8_t dblimit, dlimit, dthresh; uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - if (count == 0) // end_vpx_lf_h_edge - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { + for (i = 0; i < 1; i++) { s = psrc + i * 8; d3u8 = vld1_u8(s); @@ -170,8 +166,7 @@ void vpx_lpf_vertical_4_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i, pitch8; 
uint8_t *s; uint8x8_t dblimit, dlimit, dthresh; @@ -181,15 +176,12 @@ void vpx_lpf_vertical_4_neon( uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; uint8x8x4_t d4Result; - if (count == 0) // end_vpx_lf_h_edge - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); pitch8 = pitch * 8; - for (i = 0; i < count; i++, src += pitch8) { + for (i = 0; i < 1; i++, src += pitch8) { s = src - (i + 1) * 4; d3u8 = vld1_u8(s); diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm index e81734c04..a2f20e15f 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.asm +++ b/vpx_dsp/arm/loopfilter_8_neon.asm @@ -16,35 +16,26 @@ ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. ; ; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; const uint8_t *thresh) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_horizontal_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #16] ; load count ldr r2, [sp, #12] ; load thresh add r1, r1, r1 ; double pitch - cmp r12, #0 - beq end_vpx_mblf_h_edge - vld1.8 {d1[]}, [r3] ; duplicate *limit vld1.8 {d2[]}, [r2] ; duplicate *thresh -count_mblf_h_loop sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines add r2, r3, r1, lsr #1 ; set to 3 lines down @@ -69,11 +60,6 @@ count_mblf_h_loop vst1.u8 {d4}, [r2@64], r1 ; store oq1 vst1.u8 {d5}, [r3@64], r1 ; store oq2 - add r0, r0, #8 - subs r12, r12, #1 - bne count_mblf_h_loop - -end_vpx_mblf_h_edge pop {r4-r5, pc} ENDP ; |vpx_lpf_horizontal_8_neon| @@ -82,30 +68,24 @@ end_vpx_mblf_h_edge ; int pitch, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t 
*thresh, -; int count) +; const uint8_t *thresh) ; ; r0 uint8_t *s, ; r1 int pitch, ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_vertical_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #16] ; load count vld1.8 {d1[]}, [r3] ; duplicate *limit ldr r3, [sp, #12] ; load thresh sub r2, r0, #4 ; move s pointer down by 4 columns - cmp r12, #0 - beq end_vpx_mblf_v_edge vld1.8 {d2[]}, [r3] ; duplicate *thresh -count_mblf_v_loop vld1.u8 {d3}, [r2], r1 ; load s data vld1.u8 {d4}, [r2], r1 vld1.u8 {d5}, [r2], r1 @@ -156,12 +136,6 @@ count_mblf_v_loop vst2.8 {d4[6], d5[6]}, [r3], r1 vst2.8 {d4[7], d5[7]}, [r3] - add r0, r0, r1, lsl #3 ; s += pitch * 8 - subs r12, r12, #1 - subne r2, r0, #4 ; move s pointer down by 4 columns - bne count_mblf_v_loop - -end_vpx_mblf_v_edge pop {r4-r5, pc} ENDP ; |vpx_lpf_vertical_8_neon| diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c index a887e2ee5..ec3757380 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.c +++ b/vpx_dsp/arm/loopfilter_8_neon.c @@ -268,23 +268,19 @@ void vpx_lpf_horizontal_8_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i; uint8_t *s, *psrc; uint8x8_t dblimit, dlimit, dthresh; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; uint8x8_t d16u8, d17u8, d18u8; - if (count == 0) // end_vpx_mblf_h_edge - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { + for (i = 0; i < 1; i++) { s = psrc + i * 8; d3u8 = vld1_u8(s); @@ -328,8 +324,7 @@ void vpx_lpf_vertical_8_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i; uint8_t *s; uint8x8_t dblimit, dlimit, dthresh; @@ -341,14 +336,11 @@ void vpx_lpf_vertical_8_neon( uint8x8x4_t d4Result; 
uint8x8x2_t d2Result; - if (count == 0) - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); - for (i = 0; i < count; i++) { + for (i = 0; i < 1; i++) { s = src + (i * (pitch << 3)) - 4; d3u8 = vld1_u8(s); diff --git a/vpx_dsp/arm/loopfilter_mb_neon.asm b/vpx_dsp/arm/loopfilter_mb_neon.asm index 20d9cfb11..d5da7a840 100644 --- a/vpx_dsp/arm/loopfilter_mb_neon.asm +++ b/vpx_dsp/arm/loopfilter_mb_neon.asm @@ -8,27 +8,28 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vpx_lpf_horizontal_16_neon| + EXPORT |vpx_lpf_horizontal_edge_8_neon| + EXPORT |vpx_lpf_horizontal_edge_16_neon| EXPORT |vpx_lpf_vertical_16_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 -; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh -; int count) +; void mb_lpf_horizontal_edge(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -|vpx_lpf_horizontal_16_neon| PROC +; r12 int count +|mb_lpf_horizontal_edge| PROC push {r4-r8, lr} vpush {d8-d15} ldr r4, [sp, #88] ; load thresh - ldr r12, [sp, #92] ; load count h_count vld1.8 {d16[]}, [r2] ; load *blimit @@ -115,7 +116,35 @@ h_next vpop {d8-d15} pop {r4-r8, pc} - ENDP ; |vpx_lpf_horizontal_16_neon| + ENDP ; |mb_lpf_horizontal_edge| + +; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_edge_8_neon| PROC + mov r12, #1 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_edge_8_neon| + +; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; 
const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_edge_16_neon| PROC + mov r12, #2 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_edge_16_neon| ; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, ; const uint8_t *blimit, diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index eff87d29b..aa31f2935 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -21,8 +21,8 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); + vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); } #if HAVE_NEON_ASM @@ -33,8 +33,8 @@ void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, @@ -44,8 +44,8 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); + vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 66f4d9576..46ef64617 100644 --- a/vpx_dsp/loopfilter.c +++ 
b/vpx_dsp/loopfilter.c @@ -119,12 +119,12 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, int count) { + const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, @@ -138,18 +138,17 @@ void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
- for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, @@ -163,9 +162,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, 1); + vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, @@ -190,13 +188,12 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, } void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
- for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -213,16 +210,15 @@ void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { + const uint8_t *limit, const uint8_t *thresh) { int i; - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, @@ -238,9 +234,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, 1); + vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } static INLINE void filter16(int8_t mask, uint8_t thresh, @@ -294,9 +289,9 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, } } -void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { +static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int 
count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -320,6 +315,16 @@ void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, } } +void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); +} + static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, @@ -450,12 +455,12 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, int count, int bd) { + const uint8_t *thresh, int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4 * p]; const uint16_t p2 = s[-3 * p]; const uint16_t p1 = s[-2 * p]; @@ -479,18 +484,18 @@ void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd); - vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd); + vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
- for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = highbd_filter_mask(*limit, *blimit, @@ -508,9 +513,9 @@ void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd); + vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, 1, bd); + thresh1, bd); } static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, @@ -536,12 +541,12 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
- for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -564,16 +569,16 @@ void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd); - vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd); + vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { int i; - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = highbd_filter_mask(*limit, *blimit, @@ -596,9 +601,9 @@ void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd); + vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, 1, bd); + thresh1, bd); } static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, @@ -664,9 +669,11 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, } } -void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count, int bd) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -698,6 
+705,20 @@ void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, } } +void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +} + +void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); +} + static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, diff --git a/vpx_dsp/mips/loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c index b7c9f7bd0..a6c581d72 100644 --- a/vpx_dsp/mips/loopfilter_16_msa.c +++ b/vpx_dsp/mips/loopfilter_16_msa.c @@ -423,11 +423,11 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, } } -void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { +static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { if (1 == count) { uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; uint64_t dword0, dword1; @@ -648,6 +648,20 @@ void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, } } +void vpx_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); +} + +void vpx_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, uint8_t *output, 
int32_t out_pitch) { v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; diff --git a/vpx_dsp/mips/loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c index daf5f38bf..936347031 100644 --- a/vpx_dsp/mips/loopfilter_4_msa.c +++ b/vpx_dsp/mips/loopfilter_4_msa.c @@ -13,14 +13,11 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { uint64_t p1_d, p0_d, q0_d, q1_d; v16u8 mask, hev, flat, thresh, b_limit, limit; v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; - (void)count; - /* load vector elements */ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); @@ -74,14 +71,11 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { v16u8 mask, hev, flat, limit, thresh, b_limit; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v8i16 vec0, vec1, vec2, vec3; - (void)count; - LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); thresh = (v16u8)__msa_fill_b(*thresh_ptr); diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c index 00b6db550..5b22bd002 100644 --- a/vpx_dsp/mips/loopfilter_8_msa.c +++ b/vpx_dsp/mips/loopfilter_8_msa.c @@ -13,8 +13,7 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; v16u8 mask, hev, flat, thresh, b_limit, limit; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; @@ -23,8 +22,6 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; v16i8 zero = { 0 }; - (void)count; - /* load vector elements */ LD_UB8((src - 4 
* pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); @@ -161,8 +158,7 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch, void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p1_out, p0_out, q0_out, q1_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -171,8 +167,6 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, v16u8 zero = { 0 }; v8i16 vec0, vec1, vec2, vec3, vec4; - (void)count; - /* load vector elements */ LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c index 99a96d89b..8414b9ed5 100644 --- a/vpx_dsp/mips/loopfilter_filters_dspr2.c +++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c @@ -23,8 +23,7 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint8_t i; uint32_t mask; uint32_t hev; @@ -117,8 +116,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint8_t i; uint32_t mask, hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; @@ -313,8 +311,8 @@ void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, @@ -324,8 +322,8 @@ void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, const uint8_t 
*blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, @@ -335,8 +333,8 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1); + vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, @@ -346,9 +344,8 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, - 1); + vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c index 4138f5697..dd0545eed 100644 --- a/vpx_dsp/mips/loopfilter_mb_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c @@ -23,8 +23,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint32_t mask; uint32_t hev, flat; uint8_t i; @@ -322,8 +321,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint8_t i; uint32_t mask, hev, flat; uint8_t *s1, *s2, *s3, 
*s4; diff --git a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c index 8a4865073..85e167ca0 100644 --- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -19,12 +19,12 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -void vpx_lpf_horizontal_16_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +static void mb_lpf_horizontal_edge(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { uint32_t mask; uint32_t hev, flat, flat2; uint8_t i; @@ -791,4 +791,18 @@ void vpx_lpf_horizontal_16_dspr2(unsigned char *s, s = s + 4; } } + +void vpx_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); +} + +void vpx_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); +} #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 73726d217..d7835f4a7 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -535,31 +535,35 @@ add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8 specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; $vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon; -add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int 
pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon; -add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/; -add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/; -$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon; +add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/; +$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon; -add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/; +$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon; + +add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t 
*thresh"; specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon; -add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; @@ -572,28 +576,31 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/; - add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; + add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_vertical_8 sse2/; add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/; - add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t 
*limit, const uint8_t *thresh, int count, int bd"; + add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_vertical_4 sse2/; add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/; - add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_16 sse2/; + add_proto qw/void vpx_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/vpx_highbd_lpf_horizontal_edge_8 sse2/; - add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; + add_proto qw/void vpx_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/vpx_highbd_lpf_horizontal_edge_16 sse2/; + + add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_horizontal_8 sse2/; add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/; - add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; + add_proto 
qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_horizontal_4 sse2/; add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c index c4fd5e1a0..72e42adc9 100644 --- a/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -51,12 +51,10 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { // TODO(debargha, peter): Break up large functions into smaller ones // in this file. -static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, - int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, - int bd) { +void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); __m128i blimit, limit, thresh; @@ -496,34 +494,19 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, _mm_store_si128((__m128i *)(s - 0 * p), q0); } -static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s, - int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, - int bd) { - highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd); - highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh, - bd); -} - -// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. 
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, - int count, int bd) { - if (count == 1) - highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd); - else - highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd); +void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + vpx_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); + vpx_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); } void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, - int count, int bd) { + int bd) { DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); @@ -556,8 +539,6 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, __m128i work_a; __m128i filter1, filter2; - (void)count; - if (bd == 8) { blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); @@ -764,16 +745,15 @@ void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd); - vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, - 1, bd); + vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); + vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, - int count, int bd) { + int bd) { const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; @@ -813,8 +793,6 @@ 
void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i work_a; __m128i filter1, filter2; - (void)count; - if (bd == 8) { blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); @@ -944,9 +922,8 @@ void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd); - vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1, - bd); + vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); + vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } static INLINE void highbd_transpose(uint16_t *src[], int in_p, @@ -1058,11 +1035,10 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; - (void)count; // Transpose 8x8 src[0] = s - 4; @@ -1071,8 +1047,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, highbd_transpose(src, p, dst, 8, 1); // Loop filtering - vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1, - bd); + vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); src[0] = t_dst; dst[0] = s - 4; @@ -1112,11 +1087,10 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; - (void)count; // Transpose 8x8 src[0] = s - 4; @@ -1125,8 +1099,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, highbd_transpose(src, p, dst, 8, 1); // Loop filtering - vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1, - bd); + 
vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); src[0] = t_dst; dst[0] = s - 4; @@ -1181,8 +1154,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, highbd_transpose(src, p, dst, 8, 2); // Loop filtering - highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, - thresh, bd); + vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, + thresh, bd); src[0] = t_dst; src[1] = t_dst + 8 * 8; dst[0] = s - 8; @@ -1205,8 +1178,8 @@ void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); // Loop filtering - highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, - thresh, bd); + vpx_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, + thresh, bd); // Transpose back highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c index 23a97dd05..be1087c1e 100644 --- a/vpx_dsp/x86/loopfilter_avx2.c +++ b/vpx_dsp/x86/loopfilter_avx2.c @@ -13,9 +13,10 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, - const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -400,9 +401,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, - const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, + const unsigned char 
*_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -975,12 +977,3 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, _mm_storeu_si128((__m128i *) (s + 6 * p), q6); } } - -void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh, int count) { - if (count == 1) - mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh); - else - mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh); -} diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm index b9c18b680..15105e3ed 100644 --- a/vpx_dsp/x86/loopfilter_mmx.asm +++ b/vpx_dsp/x86/loopfilter_mmx.asm @@ -18,14 +18,13 @@ ; int src_pixel_step, ; const char *blimit, ; const char *limit, -; const char *thresh, -; int count +; const char *thresh ;) global sym(vpx_lpf_horizontal_4_mmx) PRIVATE sym(vpx_lpf_horizontal_4_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi @@ -39,8 +38,6 @@ sym(vpx_lpf_horizontal_4_mmx): mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- movsxd rcx, dword ptr arg(5) ;count -.next8_h: mov rdx, arg(3) ;limit movq mm7, [rdx] mov rdi, rsi ; rdi points to row +1 for indirect addressing @@ -208,11 +205,6 @@ sym(vpx_lpf_horizontal_4_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset movq [rdi], mm7 ; write back - add rsi,8 - neg rax - dec rcx - jnz .next8_h - add rsp, 32 pop rsp ; begin epilog @@ -230,14 +222,13 @@ sym(vpx_lpf_horizontal_4_mmx): ; int src_pixel_step, ; const char *blimit, ; const char *limit, -; const char *thresh, -; int count +; const char *thresh ;) global sym(vpx_lpf_vertical_4_mmx) PRIVATE sym(vpx_lpf_vertical_4_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi @@ -254,8 +245,6 @@ sym(vpx_lpf_vertical_4_mmx): lea rsi, [rsi + rax*4 - 4] - movsxd rcx, dword ptr arg(5) ;count -.next8_v: mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -579,10 +568,6 @@ sym(vpx_lpf_vertical_4_mmx): movd [rdi+rax*2+2], mm5 - lea rsi, [rsi+rax*8] - dec rcx - jnz .next8_v - add rsp, 64 pop rsp ; begin epilog diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index ed1012736..e03508a03 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -18,11 +18,10 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } -static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -383,11 +382,10 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), 
result); } -static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -716,21 +714,10 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, } } -// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. -void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, int count) { - if (count == 1) - mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); - else - mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); -} - void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh, int count) { + const unsigned char *_thresh) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -745,8 +732,6 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - (void)count; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), _mm_loadl_epi64((__m128i *)(s + 3 * p))); q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), @@ -1492,11 +1477,10 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, const unsigned char *blimit, const unsigned char *limit, - const unsigned char *thresh, int count) { + const unsigned char *thresh) { DECLARE_ALIGNED(8, unsigned 
char, t_dst[8 * 8]); unsigned char *src[1]; unsigned char *dst[1]; - (void)count; // Transpose 8x8 src[0] = s - 4; @@ -1505,7 +1489,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, transpose(src, p, dst, 8, 1); // Loop filtering - vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1); + vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); src[0] = t_dst; dst[0] = s - 4; @@ -1557,7 +1541,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, transpose(src, p, dst, 8, 2); // Loop filtering - mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh); + vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); src[0] = t_dst; src[1] = t_dst + 8 * 8; @@ -1578,8 +1562,7 @@ void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); // Loop filtering - mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, - thresh); + vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); // Transpose back transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);