diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 0bf6b0c23..b16f14c8e 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -37,120 +37,23 @@ const int number_of_iterations = 10000; #if CONFIG_VP9_HIGHBITDEPTH typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd); + int bd); typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); #else typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count); + const uint8_t *limit, const uint8_t *thresh); typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); #endif // CONFIG_VP9_HIGHBITDEPTH -typedef std::tr1::tuple loop8_param_t; +typedef std::tr1::tuple loop8_param_t; typedef std::tr1::tuple dualloop8_param_t; -#if HAVE_SSE2 -#if CONFIG_VP9_HIGHBITDEPTH -void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { - vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd); -} - -void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { - vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd); -} - -void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { - vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd); -} - -void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { - vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd); -} -#else -void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh); -} -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_SSE2 - -#if HAVE_NEON_ASM -#if CONFIG_VP9_HIGHBITDEPTH -// No neon high bitdepth functions. -#else -void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh); -} -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_NEON_ASM - -#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) -void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh); -} - -void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh); -} -#endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) - class Loop8Test6Param : public ::testing::TestWithParam { public: virtual ~Loop8Test6Param() {} @@ -158,7 +61,6 @@ class Loop8Test6Param : public ::testing::TestWithParam { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); - count_ = GET_PARAM(3); mask_ = (1 << bit_depth_) - 1; } @@ -166,7 +68,6 @@ class Loop8Test6Param : public ::testing::TestWithParam { protected: int bit_depth_; - int count_; int mask_; loop_op_t loopfilter_op_; loop_op_t ref_loopfilter_op_; @@ -253,13 +154,13 @@ TEST_P(Loop8Test6Param, OperationCheck) { ref_s[j] = s[j]; } #if CONFIG_VP9_HIGHBITDEPTH - ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd); + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd); ASM_REGISTER_STATE_CHECK( - loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd)); + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd)); #else - ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_); + ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh); ASM_REGISTER_STATE_CHECK( - loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_)); + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh)); #endif // CONFIG_VP9_HIGHBITDEPTH for (int j = 0; j < kNumCoeffs; ++j) { @@ -325,13 +226,13 @@ TEST_P(Loop8Test6Param, ValueCheck) { ref_s[j] = s[j]; } #if CONFIG_VP9_HIGHBITDEPTH - ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd); + ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd); ASM_REGISTER_STATE_CHECK( - loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd)); + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd)); #else - ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_); + ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh); ASM_REGISTER_STATE_CHECK( - loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_)); + loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh)); #endif // CONFIG_VP9_HIGHBITDEPTH for (int j = 0; j < kNumCoeffs; ++j) { err_count += ref_s[j] != s[j]; @@ -529,70 +430,85 @@ TEST_P(Loop8Test9Param, ValueCheck) { using std::tr1::make_tuple; +#if HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + MMX, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_mmx, + &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_vertical_4_mmx, + &vpx_lpf_vertical_4_c, 8))); +#endif // HAVE_MMX + #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( SSE2, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, - &vpx_highbd_lpf_horizontal_4_c, 8, 1), + &vpx_highbd_lpf_horizontal_4_c, 8), make_tuple(&vpx_highbd_lpf_vertical_4_sse2, - &vpx_highbd_lpf_vertical_4_c, 8, 1), + &vpx_highbd_lpf_vertical_4_c, 8), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, - &vpx_highbd_lpf_horizontal_8_c, 8, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 8, 2), + &vpx_highbd_lpf_horizontal_8_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, + &vpx_highbd_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, + &vpx_highbd_lpf_horizontal_edge_16_c, 8), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, - &vpx_highbd_lpf_vertical_8_c, 8, 1), - make_tuple(&wrapper_vertical_16_sse2, - &wrapper_vertical_16_c, 8, 1), + &vpx_highbd_lpf_vertical_8_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 8), make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, - &vpx_highbd_lpf_horizontal_4_c, 10, 1), + &vpx_highbd_lpf_horizontal_4_c, 10), make_tuple(&vpx_highbd_lpf_vertical_4_sse2, - &vpx_highbd_lpf_vertical_4_c, 10, 1), + &vpx_highbd_lpf_vertical_4_c, 10), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, - &vpx_highbd_lpf_horizontal_8_c, 10, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 10, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 10, 2), + &vpx_highbd_lpf_horizontal_8_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, + &vpx_highbd_lpf_horizontal_edge_8_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, + &vpx_highbd_lpf_horizontal_edge_16_c, 10), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, - &vpx_highbd_lpf_vertical_8_c, 10, 1), - make_tuple(&wrapper_vertical_16_sse2, - &wrapper_vertical_16_c, 10, 1), + &vpx_highbd_lpf_vertical_8_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 10), make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, - &vpx_highbd_lpf_horizontal_4_c, 12, 1), + &vpx_highbd_lpf_horizontal_4_c, 12), make_tuple(&vpx_highbd_lpf_vertical_4_sse2, - &vpx_highbd_lpf_vertical_4_c, 12, 1), + &vpx_highbd_lpf_vertical_4_c, 12), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, - &vpx_highbd_lpf_horizontal_8_c, 12, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 12, 1), - make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, - &vpx_highbd_lpf_horizontal_16_c, 12, 2), + &vpx_highbd_lpf_horizontal_8_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, + &vpx_highbd_lpf_horizontal_edge_8_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, + &vpx_highbd_lpf_horizontal_edge_16_c, 12), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, - &vpx_highbd_lpf_vertical_8_c, 12, 1), - make_tuple(&wrapper_vertical_16_sse2, - &wrapper_vertical_16_c, 12, 1), - make_tuple(&wrapper_vertical_16_dual_sse2, - &wrapper_vertical_16_dual_c, 8, 1), - make_tuple(&wrapper_vertical_16_dual_sse2, - &wrapper_vertical_16_dual_c, 10, 1), - make_tuple(&wrapper_vertical_16_dual_sse2, - &wrapper_vertical_16_dual_c, 12, 1))); + &vpx_highbd_lpf_vertical_8_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_sse2, + &vpx_highbd_lpf_vertical_16_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, + &vpx_highbd_lpf_vertical_16_dual_c, 12))); #else INSTANTIATE_TEST_CASE_P( SSE2, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2), - make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1), - make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1), - make_tuple(&wrapper_vertical_16_dual_sse2, - &wrapper_vertical_16_dual_c, 8, 1))); + make_tuple(&vpx_lpf_horizontal_8_sse2, + &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_8_sse2, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_sse2, + &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_vertical_8_sse2, + &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_sse2, + &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_sse2, + &vpx_lpf_vertical_16_dual_c, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif @@ -600,9 +516,10 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( AVX2, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, - 2))); + make_tuple(&vpx_lpf_horizontal_edge_8_avx2, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_avx2, + &vpx_lpf_horizontal_edge_16_c, 8))); #endif #if HAVE_SSE2 @@ -659,23 +576,23 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_NEON_ASM // Using #if inside the macro is unsupported on MSVS but the tests are not // currently built for MSVS with ARM and NEON. - make_tuple(&vpx_lpf_horizontal_16_neon, - &vpx_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_neon, - &vpx_lpf_horizontal_16_c, 8, 2), - make_tuple(&wrapper_vertical_16_neon, - &wrapper_vertical_16_c, 8, 1), - make_tuple(&wrapper_vertical_16_dual_neon, - &wrapper_vertical_16_dual_c, 8, 1), + make_tuple(&vpx_lpf_horizontal_edge_8_neon, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_neon, + &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_neon, + &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_neon, + &vpx_lpf_vertical_16_dual_c, 8), #endif // HAVE_NEON_ASM make_tuple(&vpx_lpf_horizontal_8_neon, - &vpx_lpf_horizontal_8_c, 8, 1), + &vpx_lpf_horizontal_8_c, 8), make_tuple(&vpx_lpf_vertical_8_neon, - &vpx_lpf_vertical_8_c, 8, 1), + &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_horizontal_4_neon, - &vpx_lpf_horizontal_4_c, 8, 1), + &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_vertical_4_neon, - &vpx_lpf_vertical_4_c, 8, 1))); + &vpx_lpf_vertical_4_c, 8))); INSTANTIATE_TEST_CASE_P( NEON, Loop8Test9Param, ::testing::Values( @@ -692,15 +609,58 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + DSPR2, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_dspr2, + &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dspr2, + &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_8, + &vpx_lpf_horizontal_edge_8, 8), + make_tuple(&vpx_lpf_horizontal_edge_16, + &vpx_lpf_horizontal_edge_16, 8), + make_tuple(&vpx_lpf_vertical_4_dspr2, + &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_dspr2, + &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_dspr2, + &vpx_lpf_vertical_16_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_dspr2, + &vpx_lpf_vertical_16_dual_c, 8))); + +INSTANTIATE_TEST_CASE_P( + DSPR2, Loop8Test9Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_dual_dspr2, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_dspr2, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_dspr2, + &vpx_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_dspr2, + &vpx_lpf_vertical_8_dual_c, 8))); +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH + #if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) INSTANTIATE_TEST_CASE_P( MSA, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1), - make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2), - make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1), - make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1))); + make_tuple(&vpx_lpf_horizontal_4_msa, + &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_msa, + &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_8_msa, + &vpx_lpf_horizontal_edge_8_c, 8), + make_tuple(&vpx_lpf_horizontal_edge_16_msa, + &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_vertical_4_msa, + &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_msa, + &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_msa, + &vpx_lpf_vertical_16_c, 8))); INSTANTIATE_TEST_CASE_P( MSA, Loop8Test9Param, diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c index 125b5bf0f..d9891bb6e 100644 --- a/vp10/common/loopfilter.c +++ b/vp10/common/loopfilter.c @@ -331,7 +331,6 @@ static void filter_selectively_vert_row2(int subsampling_factor, const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - // TODO(yunqingwang): count in loopfilter functions should be removed. if (mask & 1) { if ((mask_16x16_0 | mask_16x16_1) & 1) { if ((mask_16x16_0 & mask_16x16_1) & 1) { @@ -352,11 +351,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); } else if (mask_8x8_0 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - 1); + vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } @@ -366,11 +364,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); } else if (mask_4x4_0 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - 1); + vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } @@ -381,10 +378,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr); } else if (mask_4x4_int_0 & 1) { vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1); + lfi0->hev_thr); } else { vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } } @@ -431,7 +428,6 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - // TODO(yunqingwang): count in loopfilter functions should be removed. if (mask & 1) { if ((mask_16x16_0 | mask_16x16_1) & 1) { if ((mask_16x16_0 & mask_16x16_1) & 1) { @@ -453,10 +449,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_8x8_0 & 1) { vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } @@ -467,10 +463,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_4x4_0 & 1) { vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } @@ -481,10 +477,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_4x4_int_0 & 1) { vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } } @@ -521,12 +517,12 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); count = 2; } else { - vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -544,18 +540,18 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else { if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); else if (mask_4x4_int & 2) vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + lfin->lim, lfin->hev_thr); } count = 2; } else { - vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } else if (mask_4x4 & 1) { if ((mask_4x4 & 3) == 3) { @@ -572,22 +568,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else { if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); else if (mask_4x4_int & 2) vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + lfin->lim, lfin->hev_thr); } count = 2; } else { - vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } else if (mask_4x4_int & 1) { vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } s += 8 * count; @@ -618,12 +614,12 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2, bd); + vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); count = 2; } else { - vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -642,20 +638,20 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else { if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } else if (mask_4x4_int & 2) { vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1, bd); + lfin->lim, lfin->hev_thr, bd); } } count = 2; } else { vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } } } else if (mask_4x4 & 1) { @@ -674,25 +670,25 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else { if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } else if (mask_4x4_int & 2) { vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1, bd); + lfin->lim, lfin->hev_thr, bd); } } count = 2; } else { vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } } } else if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } } s += 8 * count; @@ -1152,13 +1148,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, if (mask_16x16 & 1) { vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } else if (mask_8x8 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } else if (mask_4x4 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } } if (mask_4x4_int & 1) - vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); s += 8; lfl += 1; mask_16x16 >>= 1; @@ -1188,15 +1184,15 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch, lfi->hev_thr, bd); } else if (mask_8x8 & 1) { vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } else if (mask_4x4 & 1) { vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } } if (mask_4x4_int & 1) vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); s += 8; lfl += 1; mask_16x16 >>= 1; diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c index 509e1b8ff..31a6b40ab 100644 --- a/vp10/decoder/decodeframe.c +++ b/vp10/decoder/decodeframe.c @@ -2496,8 +2496,9 @@ static void resize_mv_buffer(VP10_COMMON *cm) { vpx_free(cm->cur_frame->mvs); cm->cur_frame->mi_rows = cm->mi_rows; cm->cur_frame->mi_cols = cm->mi_cols; - cm->cur_frame->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*cm->cur_frame->mvs)); + CHECK_MEM_ERROR(cm, cm->cur_frame->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*cm->cur_frame->mvs))); } static void resize_context_buffers(VP10_COMMON *cm, int width, int height) { diff --git a/vp10/encoder/aq_cyclicrefresh.c b/vp10/encoder/aq_cyclicrefresh.c index 660670cce..4d7b7d950 100644 --- a/vp10/encoder/aq_cyclicrefresh.c +++ b/vp10/encoder/aq_cyclicrefresh.c @@ -64,13 +64,13 @@ CYCLIC_REFRESH *vp10_cyclic_refresh_alloc(int mi_rows, int mi_cols) { cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map)); if (cr->map == NULL) { - vpx_free(cr); + vp10_cyclic_refresh_free(cr); return NULL; } last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map); cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size); if (cr->last_coded_q_map == NULL) { - vpx_free(cr); + vp10_cyclic_refresh_free(cr); return NULL; } assert(MAXQ <= 255); diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c index 3f1dcf8de..5d0a0f783 100644 --- a/vp10/encoder/encoder.c +++ b/vp10/encoder/encoder.c @@ -1788,8 +1788,9 @@ VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf, } if (cpi->b_calculate_consistency) { - cpi->ssim_vars = vpx_malloc(sizeof(*cpi->ssim_vars) * - 4 * cpi->common.mi_rows * cpi->common.mi_cols); + CHECK_MEM_ERROR(cm, cpi->ssim_vars, + vpx_malloc(sizeof(*cpi->ssim_vars) * 4 * + cpi->common.mi_rows * cpi->common.mi_cols)); cpi->worst_consistency = 100.0; } #endif @@ -2611,16 +2612,16 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) { vpx_extend_frame_inner_borders(cm->frame_to_show); } -static INLINE void alloc_frame_mvs(const VP10_COMMON *cm, +static INLINE void alloc_frame_mvs(VP10_COMMON *const cm, int buffer_idx) { RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || new_fb_ptr->mi_cols < cm->mi_cols) { vpx_free(new_fb_ptr->mvs); - new_fb_ptr->mvs = - (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*new_fb_ptr->mvs)); + CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); new_fb_ptr->mi_rows = cm->mi_rows; new_fb_ptr->mi_cols = cm->mi_cols; } @@ -2667,12 +2668,13 @@ void vp10_scale_references(VP10_COMP *cpi) { if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || new_fb_ptr->buf.y_crop_height != cm->height) { - vpx_realloc_frame_buffer(&new_fb_ptr->buf, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL); + if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); @@ -2692,11 +2694,12 @@ void vp10_scale_references(VP10_COMP *cpi) { if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || new_fb_ptr->buf.y_crop_height != cm->height) { - vpx_realloc_frame_buffer(&new_fb_ptr->buf, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL); + if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); scale_and_extend_frame(ref, &new_fb_ptr->buf); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); @@ -2993,14 +2996,15 @@ static void set_frame_size(VP10_COMP *cpi) { alloc_frame_mvs(cm, cm->new_fb_idx); // Reset the frame pointers to the current frame size. - vpx_realloc_frame_buffer(get_frame_new_buffer(cm), - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, + if (vpx_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, + cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); @@ -3816,12 +3820,14 @@ static void setup_denoiser_buffer(VP10_COMP *cpi) { VP10_COMMON *const cm = &cpi->common; if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { - vp10_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, + if (vp10_denoiser_alloc(&cpi->denoiser, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, + cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS); + VP9_ENC_BORDER_IN_PIXELS)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); } } #endif @@ -3829,21 +3835,15 @@ static void setup_denoiser_buffer(VP10_COMP *cpi) { int vp10_receive_raw_frame(VP10_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { - VP10_COMMON *volatile const cm = &cpi->common; + VP10_COMMON *const cm = &cpi->common; struct vpx_usec_timer timer; - volatile int res = 0; + int res = 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; #if CONFIG_VP9_HIGHBITDEPTH const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; #endif - if (setjmp(cm->error.jmp)) { - cm->error.setjmp = 0; - return -1; - } - cm->error.setjmp = 1; - #if CONFIG_VP9_HIGHBITDEPTH check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); #else @@ -3877,7 +3877,6 @@ int vp10_receive_raw_frame(VP10_COMP *cpi, unsigned int frame_flags, res = -1; } - cm->error.setjmp = 0; return res; } diff --git a/vp10/encoder/resize.c b/vp10/encoder/resize.c index 5572c17ad..e339fa363 100644 --- a/vp10/encoder/resize.c +++ b/vp10/encoder/resize.c @@ -461,6 +461,7 @@ static void resize_multistep(const uint8_t *const input, int filteredlength = length; if (!tmpbuf) { tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length); + if (tmpbuf == NULL) return; otmp = tmpbuf; } else { otmp = buf; @@ -520,6 +521,7 @@ void vp10_resize_plane(const uint8_t *const input, uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width)); uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2)); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL) goto Error; assert(width > 0); assert(height > 0); assert(width2 > 0); @@ -532,6 +534,8 @@ void vp10_resize_plane(const uint8_t *const input, resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf); fill_arr_to_col(output + i, out_stride, height2, arrbuf + height); } + + Error: free(intbuf); free(tmpbuf); free(arrbuf); @@ -754,6 +758,7 @@ static void highbd_resize_multistep(const uint16_t *const input, int filteredlength = length; if (!tmpbuf) { tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length); + if (tmpbuf == NULL) return; otmp = tmpbuf; } else { otmp = buf; @@ -816,6 +821,7 @@ void vp10_highbd_resize_plane(const uint8_t *const input, uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * (width < height ? height : width)); uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2)); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL) goto Error; for (i = 0; i < height; ++i) { highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, intbuf + width2 * i, width2, tmpbuf, bd); @@ -827,6 +833,8 @@ void vp10_highbd_resize_plane(const uint8_t *const input, highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, arrbuf + height); } + + Error: free(intbuf); free(tmpbuf); free(arrbuf); diff --git a/vp10/vp10_cx_iface.c b/vp10/vp10_cx_iface.c index 65a216e36..a75ce158e 100644 --- a/vp10/vp10_cx_iface.c +++ b/vp10/vp10_cx_iface.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "vpx/vpx_encoder.h" #include "vpx_ports/vpx_once.h" +#include "vpx_ports/system_state.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" #include "vp10/encoder/encoder.h" @@ -873,18 +874,21 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, - vpx_enc_frame_flags_t flags, + vpx_enc_frame_flags_t enc_flags, unsigned long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_enc_frame_flags_t flags = enc_flags; VP10_COMP *const cpi = ctx->cpi; const vpx_rational_t *const timebase = &ctx->cfg.g_timebase; size_t data_sz; + if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + if (img != NULL) { res = validate_img(ctx, img); // TODO(jzern) the checks related to cpi's validity should be treated as a // failure condition, encoder setup is done fully in init() currently. - if (res == VPX_CODEC_OK && cpi != NULL) { + if (res == VPX_CODEC_OK) { // There's no codec control for multiple alt-refs so check the encoder // instance for its status to determine the compressed data size. data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 * @@ -912,6 +916,14 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } + if (setjmp(cpi->common.error.jmp)) { + cpi->common.error.setjmp = 0; + res = update_error_state(ctx, &cpi->common.error); + vpx_clear_system_state(); + return res; + } + cpi->common.error.setjmp = 1; + vp10_apply_encoding_flags(cpi, flags); // Handle fixed keyframe intervals @@ -923,8 +935,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } } - // Initialize the encoder instance on the first frame. - if (res == VPX_CODEC_OK && cpi != NULL) { + if (res == VPX_CODEC_OK) { unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); @@ -963,7 +974,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, * the buffer size anyway. */ if (cx_data_sz < ctx->cx_data_sz / 2) { - ctx->base.err_detail = "Compressed data buffer too small"; + vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR, + "Compressed data buffer too small"); return VPX_CODEC_ERROR; } } @@ -1040,6 +1052,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } } + cpi->common.error.setjmp = 0; return res; } diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index 2a21943fe..113865fe8 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -440,6 +440,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, denoiser->yv12_last_source.frame_size); denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1); + if (!denoiser->denoise_state) + { + vp8_denoiser_free(denoiser); + return 1; + } memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols)); vp8_denoiser_set_parameters(denoiser, mode); denoiser->nmse_source_diff = 0; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index edfd60c37..354bdfe47 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1318,9 +1318,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp8_denoiser_free(&cpi->denoiser); - vp8_denoiser_allocate(&cpi->denoiser, width, height, - cm->mb_rows, cm->mb_cols, - cpi->oxcf.noise_sensitivity); + if (vp8_denoiser_allocate(&cpi->denoiser, width, height, + cm->mb_rows, cm->mb_cols, + cpi->oxcf.noise_sensitivity)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); } #endif } @@ -1832,9 +1834,11 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { int width = (cpi->oxcf.Width + 15) & ~15; int height = (cpi->oxcf.Height + 15) & ~15; - vp8_denoiser_allocate(&cpi->denoiser, width, height, - cm->mb_rows, cm->mb_cols, - cpi->oxcf.noise_sensitivity); + if (vp8_denoiser_allocate(&cpi->denoiser, width, height, + cm->mb_rows, cm->mb_cols, + cpi->oxcf.noise_sensitivity)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); } } #endif diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index a12a2ad0e..9b58f8186 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -67,10 +67,11 @@ struct vpx_codec_alg_priv FRAGMENT_DATA fragments; }; -static void vp8_init_ctx(vpx_codec_ctx_t *ctx) +static int vp8_init_ctx(vpx_codec_ctx_t *ctx) { vpx_codec_alg_priv_t *priv = (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv)); + if (!priv) return 1; ctx->priv = (vpx_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; @@ -85,6 +86,8 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx) priv->cfg = *ctx->config.dec; ctx->config.dec = &priv->cfg; } + + return 0; } static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, @@ -103,7 +106,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, * information becomes known. */ if (!ctx->priv) { - vp8_init_ctx(ctx); + if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR; priv = (vpx_codec_alg_priv_t *)ctx->priv; /* initialize number of fragments to zero */ diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index fc022093c..7b490af34 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -728,10 +728,8 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = { }; static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) { - // TODO(aconverse): model[PIVOT_NODE] should never be zero. - // https://code.google.com/p/webm/issues/detail?id=1089 - memcpy(probs, vp9_pareto8_full[p == 0 ? 254 : p - 1], - MODEL_NODES * sizeof(vpx_prob)); + assert(p != 0); + memcpy(probs, vp9_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob)); } void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) { diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 79c3c4820..aca69bd0f 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -324,7 +324,6 @@ static void filter_selectively_vert_row2(int subsampling_factor, const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - // TODO(yunqingwang): count in loopfilter functions should be removed. if (mask & 1) { if ((mask_16x16_0 | mask_16x16_1) & 1) { if ((mask_16x16_0 & mask_16x16_1) & 1) { @@ -345,11 +344,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); } else if (mask_8x8_0 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - 1); + vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } @@ -359,11 +357,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); } else if (mask_4x4_0 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - 1); + vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); } else { vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } @@ -374,10 +371,10 @@ static void filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr); } else if (mask_4x4_int_0 & 1) { vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1); + lfi0->hev_thr); } else { vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, 1); + lfi1->hev_thr); } } } @@ -424,7 +421,6 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - // TODO(yunqingwang): count in loopfilter functions should be removed. if (mask & 1) { if ((mask_16x16_0 | mask_16x16_1) & 1) { if ((mask_16x16_0 & mask_16x16_1) & 1) { @@ -446,10 +442,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_8x8_0 & 1) { vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } @@ -460,10 +456,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_4x4_0 & 1) { vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } @@ -474,10 +470,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor, lfi1->hev_thr, bd); } else if (mask_4x4_int_0 & 1) { vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1, bd); + lfi0->hev_thr, bd); } else { vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1, bd); + lfi1->lim, lfi1->hev_thr, bd); } } } @@ -514,12 +510,12 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); count = 2; } else { - vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -537,18 +533,18 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else { if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); else if (mask_4x4_int & 2) vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + lfin->lim, lfin->hev_thr); } count = 2; } else { - vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } else if (mask_4x4 & 1) { if ((mask_4x4 & 3) == 3) { @@ -565,22 +561,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else { if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); else if (mask_4x4_int & 2) vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + lfin->lim, lfin->hev_thr); } count = 2; } else { - vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); if (mask_4x4_int & 1) vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } else if (mask_4x4_int & 1) { vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + lfi->hev_thr); } } s += 8 * count; @@ -611,12 +607,12 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2, bd); + vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); count = 2; } else { - vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -635,20 +631,20 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else { if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } else if (mask_4x4_int & 2) { vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1, bd); + lfin->lim, lfin->hev_thr, bd); } } count = 2; } else { vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } } } else if (mask_4x4 & 1) { @@ -667,25 +663,25 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else { if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } else if (mask_4x4_int & 2) { vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1, bd); + lfin->lim, lfin->hev_thr, bd); } } count = 2; } else { vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1, bd); + lfi->lim, lfi->hev_thr, bd); } } } else if (mask_4x4_int & 1) { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } } s += 8 * count; @@ -1102,13 +1098,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, if (mask_16x16 & 1) { vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } else if (mask_8x8 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } else if (mask_4x4 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } } if (mask_4x4_int & 1) - vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); s += 8; lfl += 1; mask_16x16 >>= 1; @@ -1138,15 +1134,15 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch, lfi->hev_thr, bd); } else if (mask_8x8 & 1) { vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } else if (mask_4x4 & 1) { vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); } } if (mask_4x4_int & 1) vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1, bd); + lfi->hev_thr, bd); s += 8; lfl += 1; mask_16x16 >>= 1; diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 8a492d562..9ce137d04 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1221,8 +1221,9 @@ static void resize_mv_buffer(VP9_COMMON *cm) { vpx_free(cm->cur_frame->mvs); cm->cur_frame->mi_rows = cm->mi_rows; cm->cur_frame->mi_cols = cm->mi_cols; - cm->cur_frame->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*cm->cur_frame->mvs)); + CHECK_MEM_ERROR(cm, cm->cur_frame->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*cm->cur_frame->mvs))); } static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index ebc850de7..ac834caa1 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -29,13 +29,13 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map)); if (cr->map == NULL) { - vpx_free(cr); + vp9_cyclic_refresh_free(cr); return NULL; } last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map); cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size); if (cr->last_coded_q_map == NULL) { - vpx_free(cr); + vp9_cyclic_refresh_free(cr); return NULL; } assert(MAXQ <= 255); @@ -44,7 +44,7 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { consec_zero_mv_size = mi_rows * mi_cols * sizeof(*cr->consec_zero_mv); cr->consec_zero_mv = vpx_malloc(consec_zero_mv_size); if (cr->consec_zero_mv == NULL) { - vpx_free(cr); + vp9_cyclic_refresh_free(cr); return NULL; } memset(cr->consec_zero_mv, 0, consec_zero_mv_size); diff --git a/vp9/encoder/vp9_cost.c b/vp9/encoder/vp9_cost.c index c85f76322..5d14742bc 100644 --- a/vp9/encoder/vp9_cost.c +++ b/vp9/encoder/vp9_cost.c @@ -12,9 +12,8 @@ #include "vp9/encoder/vp9_cost.h" /* round(-log2(i/256.) * (1 << VP9_PROB_COST_SHIFT)) - Begins and ends with a bogus entry to satisfy use of prob=0 in the firstpass. - https://code.google.com/p/webm/issues/detail?id=1089 */ -const uint16_t vp9_prob_cost[257] = { + Begins with a bogus entry for simpler addressing. */ +const uint16_t vp9_prob_cost[256] = { 4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260, 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718, 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, @@ -36,13 +35,14 @@ const uint16_t vp9_prob_cost[257] = { 125, 122, 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26, 23, 20, 18, 15, - 12, 9, 6, 3, 3}; + 12, 9, 6, 3}; static void cost(int *costs, vpx_tree tree, const vpx_prob *probs, int i, int c) { const vpx_prob prob = probs[i / 2]; int b; + assert(prob != 0); for (b = 0; b <= 1; ++b) { const int cc = c + vp9_cost_bit(prob, b); const vpx_tree_index ii = tree[i + b]; diff --git a/vp9/encoder/vp9_cost.h b/vp9/encoder/vp9_cost.h index 9831013b1..0c70b7826 100644 --- a/vp9/encoder/vp9_cost.h +++ b/vp9/encoder/vp9_cost.h @@ -18,7 +18,7 @@ extern "C" { #endif -extern const uint16_t vp9_prob_cost[257]; +extern const uint16_t vp9_prob_cost[256]; // The factor to scale from cost in bits to cost in vp9_prob_cost units. #define VP9_PROB_COST_SHIFT 9 diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index d407f5e53..05dc31b8d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1766,8 +1766,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, } if (cpi->b_calculate_consistency) { - cpi->ssim_vars = vpx_malloc(sizeof(*cpi->ssim_vars) * - 4 * cpi->common.mi_rows * cpi->common.mi_cols); + CHECK_MEM_ERROR(cm, cpi->ssim_vars, + vpx_malloc(sizeof(*cpi->ssim_vars) * 4 * + cpi->common.mi_rows * cpi->common.mi_cols)); cpi->worst_consistency = 100.0; } @@ -2631,16 +2632,16 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_extend_frame_inner_borders(cm->frame_to_show); } -static INLINE void alloc_frame_mvs(const VP9_COMMON *cm, +static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || new_fb_ptr->mi_cols < cm->mi_cols) { vpx_free(new_fb_ptr->mvs); - new_fb_ptr->mvs = - (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*new_fb_ptr->mvs)); + CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); new_fb_ptr->mi_rows = cm->mi_rows; new_fb_ptr->mi_cols = cm->mi_cols; } @@ -2678,12 +2679,13 @@ void vp9_scale_references(VP9_COMP *cpi) { if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || new_fb_ptr->buf.y_crop_height != cm->height) { - vpx_realloc_frame_buffer(&new_fb_ptr->buf, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - cm->use_highbitdepth, - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL); + if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); @@ -2703,11 +2705,12 @@ void vp9_scale_references(VP9_COMP *cpi) { if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || new_fb_ptr->buf.y_crop_height != cm->height) { - vpx_realloc_frame_buffer(&new_fb_ptr->buf, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL); + if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_ENC_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); @@ -2954,12 +2957,14 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { - vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, + if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, + cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS); + VP9_ENC_BORDER_IN_PIXELS)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); } } #endif @@ -3032,14 +3037,15 @@ static void set_frame_size(VP9_COMP *cpi) { alloc_frame_mvs(cm, cm->new_fb_idx); // Reset the frame pointers to the current frame size. - vpx_realloc_frame_buffer(get_frame_new_buffer(cm), - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, + if (vpx_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, + cm->use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); @@ -3889,21 +3895,15 @@ static void check_initial_width(VP9_COMP *cpi, int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { - VP9_COMMON *volatile const cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; struct vpx_usec_timer timer; - volatile int res = 0; + int res = 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; #if CONFIG_VP9_HIGHBITDEPTH const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; #endif - if (setjmp(cm->error.jmp)) { - cm->error.setjmp = 0; - return -1; - } - cm->error.setjmp = 1; - #if CONFIG_VP9_HIGHBITDEPTH check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); #else @@ -3937,7 +3937,6 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, res = -1; } - cm->error.setjmp = 0; return res; } diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 44b6ae71a..5ec7b25ee 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -286,29 +286,37 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { set_block_thresholds(cm, rd); set_partition_probs(cm, xd); - if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) - fill_token_costs(x->token_costs, cm->fc->coef_probs); + if (cpi->oxcf.pass == 1) { + if (!frame_is_intra_only(cm)) + vp9_build_nmv_cost_table( + x->nmvjointcost, + cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, + &cm->fc->nmvc, cm->allow_high_precision_mv); + } else { + if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) + fill_token_costs(x->token_costs, cm->fc->coef_probs); - if (cpi->sf.partition_search_type != VAR_BASED_PARTITION || - cm->frame_type == KEY_FRAME) { - for (i = 0; i < PARTITION_CONTEXTS; ++i) - vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i), - vp9_partition_tree); - } + if (cpi->sf.partition_search_type != VAR_BASED_PARTITION || + cm->frame_type == KEY_FRAME) { + for (i = 0; i < PARTITION_CONTEXTS; ++i) + vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i), + vp9_partition_tree); + } - if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 || - cm->frame_type == KEY_FRAME) { - fill_mode_costs(cpi); + if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 || + cm->frame_type == KEY_FRAME) { + fill_mode_costs(cpi); - if (!frame_is_intra_only(cm)) { - vp9_build_nmv_cost_table(x->nmvjointcost, - cm->allow_high_precision_mv ? x->nmvcost_hp - : x->nmvcost, - &cm->fc->nmvc, cm->allow_high_precision_mv); + if (!frame_is_intra_only(cm)) { + vp9_build_nmv_cost_table( + x->nmvjointcost, + cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, + &cm->fc->nmvc, cm->allow_high_precision_mv); - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - vp9_cost_tokens((int *)cpi->inter_mode_cost[i], - cm->fc->inter_mode_probs[i], vp9_inter_mode_tree); + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + vp9_cost_tokens((int *)cpi->inter_mode_cost[i], + cm->fc->inter_mode_probs[i], vp9_inter_mode_tree); + } } } } diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index f4d0db4d5..63f0ce213 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -462,6 +462,7 @@ static void resize_multistep(const uint8_t *const input, int filteredlength = length; if (!tmpbuf) { tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length); + if (tmpbuf == NULL) return; otmp = tmpbuf; } else { otmp = buf; @@ -521,6 +522,7 @@ void vp9_resize_plane(const uint8_t *const input, uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width)); uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2)); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL) goto Error; assert(width > 0); assert(height > 0); assert(width2 > 0); @@ -533,6 +535,8 @@ void vp9_resize_plane(const uint8_t *const input, resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf); fill_arr_to_col(output + i, out_stride, height2, arrbuf + height); } + + Error: free(intbuf); free(tmpbuf); free(arrbuf); @@ -755,6 +759,7 @@ static void highbd_resize_multistep(const uint16_t *const input, int filteredlength = length; if (!tmpbuf) { tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length); + if (tmpbuf == NULL) return; otmp = tmpbuf; } else { otmp = buf; @@ -817,6 +822,7 @@ void vp9_highbd_resize_plane(const uint8_t *const input, uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * (width < height ? height : width)); uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2)); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL) goto Error; for (i = 0; i < height; ++i) { highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, intbuf + width2 * i, width2, tmpbuf, bd); @@ -828,6 +834,8 @@ void vp9_highbd_resize_plane(const uint8_t *const input, highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, arrbuf + height); } + + Error: free(intbuf); free(tmpbuf); free(arrbuf); diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 9724df4cd..1d561545c 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -118,15 +118,20 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { tl == 0) { size_t last_coded_q_map_size; size_t consec_zero_mv_size; + VP9_COMMON *const cm = &cpi->common; lc->sb_index = 0; - lc->map = vpx_malloc(mi_rows * mi_cols * sizeof(signed char)); + CHECK_MEM_ERROR(cm, lc->map, + vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); memset(lc->map, 0, mi_rows * mi_cols); - last_coded_q_map_size = mi_rows * mi_cols * sizeof(uint8_t); - lc->last_coded_q_map = vpx_malloc(last_coded_q_map_size); + last_coded_q_map_size = mi_rows * mi_cols * + sizeof(*lc->last_coded_q_map); + CHECK_MEM_ERROR(cm, lc->last_coded_q_map, + vpx_malloc(last_coded_q_map_size)); assert(MAXQ <= 255); memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size); - consec_zero_mv_size = mi_rows * mi_cols * sizeof(uint8_t); - lc->consec_zero_mv = vpx_malloc(consec_zero_mv_size); + consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv); + CHECK_MEM_ERROR(cm, lc->consec_zero_mv, + vpx_malloc(consec_zero_mv_size)); memset(lc->consec_zero_mv, 0, consec_zero_mv_size); } } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index db7f537a6..11df1e4f6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "vpx/vpx_encoder.h" #include "vpx_ports/vpx_once.h" +#include "vpx_ports/system_state.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" #include "vp9/encoder/vp9_encoder.h" @@ -967,18 +968,19 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, - vpx_enc_frame_flags_t flags, + vpx_enc_frame_flags_t enc_flags, unsigned long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_enc_frame_flags_t flags = enc_flags; VP9_COMP *const cpi = ctx->cpi; const vpx_rational_t *const timebase = &ctx->cfg.g_timebase; size_t data_sz; + if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + if (img != NULL) { res = validate_img(ctx, img); - // TODO(jzern) the checks related to cpi's validity should be treated as a - // failure condition, encoder setup is done fully in init() currently. - if (res == VPX_CODEC_OK && cpi != NULL) { + if (res == VPX_CODEC_OK) { // There's no codec control for multiple alt-refs so check the encoder // instance for its status to determine the compressed data size. data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 * @@ -1006,6 +1008,14 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } + if (setjmp(cpi->common.error.jmp)) { + cpi->common.error.setjmp = 0; + res = update_error_state(ctx, &cpi->common.error); + vpx_clear_system_state(); + return res; + } + cpi->common.error.setjmp = 1; + vp9_apply_encoding_flags(cpi, flags); // Handle fixed keyframe intervals @@ -1017,8 +1027,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } } - // Initialize the encoder instance on the first frame. - if (res == VPX_CODEC_OK && cpi != NULL) { + if (res == VPX_CODEC_OK) { unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); @@ -1057,7 +1066,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, * the buffer size anyway. */ if (cx_data_sz < ctx->cx_data_sz / 2) { - ctx->base.err_detail = "Compressed data buffer too small"; + vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR, + "Compressed data buffer too small"); return VPX_CODEC_ERROR; } } @@ -1175,6 +1185,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } } + cpi->common.error.setjmp = 0; return res; } diff --git a/vpx_dsp/arm/loopfilter_4_neon.asm b/vpx_dsp/arm/loopfilter_4_neon.asm index e45e34cd4..937115898 100644 --- a/vpx_dsp/arm/loopfilter_4_neon.asm +++ b/vpx_dsp/arm/loopfilter_4_neon.asm @@ -16,37 +16,28 @@ ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. ; ; void vpx_lpf_horizontal_4_neon(uint8_t *s, ; int p /* pitch */, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; const uint8_t *thresh) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_horizontal_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #8] ; load count ldr r2, [sp, #4] ; load thresh add r1, r1, r1 ; double pitch - cmp r12, #0 - beq end_vpx_lf_h_edge - vld1.8 {d1[]}, [r3] ; duplicate *limit vld1.8 {d2[]}, [r2] ; duplicate *thresh -count_lf_h_loop sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines add r3, r2, r1, lsr #1 ; set to 3 lines down @@ -69,47 +60,34 @@ count_lf_h_loop vst1.u8 {d6}, [r2@64], r1 ; store oq0 vst1.u8 {d7}, [r3@64], r1 ; store oq1 - add r0, r0, #8 - subs r12, r12, #1 - bne count_lf_h_loop - -end_vpx_lf_h_edge pop {pc} ENDP ; |vpx_lpf_horizontal_4_neon| ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. ; ; void vpx_lpf_vertical_4_neon(uint8_t *s, ; int p /* pitch */, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; const uint8_t *thresh) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_vertical_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #8] ; load count vld1.8 {d1[]}, [r3] ; duplicate *limit ldr r3, [sp, #4] ; load thresh sub r2, r0, #4 ; move s pointer down by 4 columns - cmp r12, #0 - beq end_vpx_lf_v_edge vld1.8 {d2[]}, [r3] ; duplicate *thresh -count_lf_v_loop vld1.u8 {d3}, [r2], r1 ; load s data vld1.u8 {d4}, [r2], r1 vld1.u8 {d5}, [r2], r1 @@ -149,12 +127,6 @@ count_lf_v_loop vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] - add r0, r0, r1, lsl #3 ; s += pitch * 8 - subs r12, r12, #1 - subne r2, r0, #4 ; move s pointer down by 4 columns - bne count_lf_v_loop - -end_vpx_lf_v_edge pop {pc} ENDP ; |vpx_lpf_vertical_4_neon| diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c index 7ad411aea..7f3ee70b9 100644 --- a/vpx_dsp/arm/loopfilter_4_neon.c +++ b/vpx_dsp/arm/loopfilter_4_neon.c @@ -115,22 +115,18 @@ void vpx_lpf_horizontal_4_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i; uint8_t *s, *psrc; uint8x8_t dblimit, dlimit, dthresh; uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - if (count == 0) // end_vpx_lf_h_edge - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { + for (i = 0; i < 1; i++) { s = psrc + i * 8; d3u8 = vld1_u8(s); @@ -170,8 +166,7 @@ void vpx_lpf_vertical_4_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i, pitch8; uint8_t *s; uint8x8_t dblimit, dlimit, dthresh; @@ -181,15 +176,12 @@ void vpx_lpf_vertical_4_neon( uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; uint8x8x4_t d4Result; - if (count == 0) // end_vpx_lf_h_edge - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); pitch8 = pitch * 8; - for (i = 0; i < count; i++, src += pitch8) { + for (i = 0; i < 1; i++, src += pitch8) { s = src - (i + 1) * 4; d3u8 = vld1_u8(s); diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm index e81734c04..a2f20e15f 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.asm +++ b/vpx_dsp/arm/loopfilter_8_neon.asm @@ -16,35 +16,26 @@ ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. ; ; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; const uint8_t *thresh) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_horizontal_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #16] ; load count ldr r2, [sp, #12] ; load thresh add r1, r1, r1 ; double pitch - cmp r12, #0 - beq end_vpx_mblf_h_edge - vld1.8 {d1[]}, [r3] ; duplicate *limit vld1.8 {d2[]}, [r2] ; duplicate *thresh -count_mblf_h_loop sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines add r2, r3, r1, lsr #1 ; set to 3 lines down @@ -69,11 +60,6 @@ count_mblf_h_loop vst1.u8 {d4}, [r2@64], r1 ; store oq1 vst1.u8 {d5}, [r3@64], r1 ; store oq2 - add r0, r0, #8 - subs r12, r12, #1 - bne count_mblf_h_loop - -end_vpx_mblf_h_edge pop {r4-r5, pc} ENDP ; |vpx_lpf_horizontal_8_neon| @@ -82,30 +68,24 @@ end_vpx_mblf_h_edge ; int pitch, ; const uint8_t *blimit, ; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; const uint8_t *thresh) ; ; r0 uint8_t *s, ; r1 int pitch, ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -; sp+4 int count |vpx_lpf_vertical_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #16] ; load count vld1.8 {d1[]}, [r3] ; duplicate *limit ldr r3, [sp, #12] ; load thresh sub r2, r0, #4 ; move s pointer down by 4 columns - cmp r12, #0 - beq end_vpx_mblf_v_edge vld1.8 {d2[]}, [r3] ; duplicate *thresh -count_mblf_v_loop vld1.u8 {d3}, [r2], r1 ; load s data vld1.u8 {d4}, [r2], r1 vld1.u8 {d5}, [r2], r1 @@ -156,12 +136,6 @@ count_mblf_v_loop vst2.8 {d4[6], d5[6]}, [r3], r1 vst2.8 {d4[7], d5[7]}, [r3] - add r0, r0, r1, lsl #3 ; s += pitch * 8 - subs r12, r12, #1 - subne r2, r0, #4 ; move s pointer down by 4 columns - bne count_mblf_v_loop - -end_vpx_mblf_v_edge pop {r4-r5, pc} ENDP ; |vpx_lpf_vertical_8_neon| diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c index a887e2ee5..ec3757380 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.c +++ b/vpx_dsp/arm/loopfilter_8_neon.c @@ -268,23 +268,19 @@ void vpx_lpf_horizontal_8_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i; uint8_t *s, *psrc; uint8x8_t dblimit, dlimit, dthresh; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; uint8x8_t d16u8, d17u8, d18u8; - if (count == 0) // end_vpx_mblf_h_edge - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { + for (i = 0; i < 1; i++) { s = psrc + i * 8; d3u8 = vld1_u8(s); @@ -328,8 +324,7 @@ void vpx_lpf_vertical_8_neon( int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { int i; uint8_t *s; uint8x8_t dblimit, dlimit, dthresh; @@ -341,14 +336,11 @@ void vpx_lpf_vertical_8_neon( uint8x8x4_t d4Result; uint8x8x2_t d2Result; - if (count == 0) - return; - dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); - for (i = 0; i < count; i++) { + for (i = 0; i < 1; i++) { s = src + (i * (pitch << 3)) - 4; d3u8 = vld1_u8(s); diff --git a/vpx_dsp/arm/loopfilter_mb_neon.asm b/vpx_dsp/arm/loopfilter_mb_neon.asm index 20d9cfb11..d5da7a840 100644 --- a/vpx_dsp/arm/loopfilter_mb_neon.asm +++ b/vpx_dsp/arm/loopfilter_mb_neon.asm @@ -8,27 +8,28 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vpx_lpf_horizontal_16_neon| + EXPORT |vpx_lpf_horizontal_edge_8_neon| + EXPORT |vpx_lpf_horizontal_edge_16_neon| EXPORT |vpx_lpf_vertical_16_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 -; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh -; int count) +; void mb_lpf_horizontal_edge(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -|vpx_lpf_horizontal_16_neon| PROC +; r12 int count +|mb_lpf_horizontal_edge| PROC push {r4-r8, lr} vpush {d8-d15} ldr r4, [sp, #88] ; load thresh - ldr r12, [sp, #92] ; load count h_count vld1.8 {d16[]}, [r2] ; load *blimit @@ -115,7 +116,35 @@ h_next vpop {d8-d15} pop {r4-r8, pc} - ENDP ; |vpx_lpf_horizontal_16_neon| + ENDP ; |mb_lpf_horizontal_edge| + +; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_edge_8_neon| PROC + mov r12, #1 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_edge_8_neon| + +; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_edge_16_neon| PROC + mov r12, #2 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_edge_16_neon| ; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, ; const uint8_t *blimit, diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index eff87d29b..aa31f2935 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -21,8 +21,8 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); + vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); } #if HAVE_NEON_ASM @@ -33,8 +33,8 @@ void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, @@ -44,8 +44,8 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); + vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 66f4d9576..46ef64617 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -119,12 +119,12 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, int count) { + const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, @@ -138,18 +138,17 @@ void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, @@ -163,9 +162,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, 1); + vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, @@ -190,13 +188,12 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, } void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -213,16 +210,15 @@ void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { + const uint8_t *limit, const uint8_t *thresh) { int i; - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, @@ -238,9 +234,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, 1); + vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } static INLINE void filter16(int8_t mask, uint8_t thresh, @@ -294,9 +289,9 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, } } -void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { +static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -320,6 +315,16 @@ void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, } } +void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); +} + static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, @@ -450,12 +455,12 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, int count, int bd) { + const uint8_t *thresh, int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4 * p]; const uint16_t p2 = s[-3 * p]; const uint16_t p1 = s[-2 * p]; @@ -479,18 +484,18 @@ void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd); - vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd); + vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = highbd_filter_mask(*limit, *blimit, @@ -508,9 +513,9 @@ void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd); + vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, 1, bd); + thresh1, bd); } static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, @@ -536,12 +541,12 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -564,16 +569,16 @@ void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd); - vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd); + vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { int i; - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = highbd_filter_mask(*limit, *blimit, @@ -596,9 +601,9 @@ void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd); + vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, 1, bd); + thresh1, bd); } static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, @@ -664,9 +669,11 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, } } -void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count, int bd) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -698,6 +705,20 @@ void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, } } +void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +} + +void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); +} + static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, diff --git a/vpx_dsp/mips/loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c index b7c9f7bd0..a6c581d72 100644 --- a/vpx_dsp/mips/loopfilter_16_msa.c +++ b/vpx_dsp/mips/loopfilter_16_msa.c @@ -423,11 +423,11 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, } } -void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { +static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { if (1 == count) { uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; uint64_t dword0, dword1; @@ -648,6 +648,20 @@ void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, } } +void vpx_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); +} + +void vpx_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch) { v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; diff --git a/vpx_dsp/mips/loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c index daf5f38bf..936347031 100644 --- a/vpx_dsp/mips/loopfilter_4_msa.c +++ b/vpx_dsp/mips/loopfilter_4_msa.c @@ -13,14 +13,11 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { uint64_t p1_d, p0_d, q0_d, q1_d; v16u8 mask, hev, flat, thresh, b_limit, limit; v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; - (void)count; - /* load vector elements */ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); @@ -74,14 +71,11 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { v16u8 mask, hev, flat, limit, thresh, b_limit; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v8i16 vec0, vec1, vec2, vec3; - (void)count; - LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); thresh = (v16u8)__msa_fill_b(*thresh_ptr); diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c index 00b6db550..5b22bd002 100644 --- a/vpx_dsp/mips/loopfilter_8_msa.c +++ b/vpx_dsp/mips/loopfilter_8_msa.c @@ -13,8 +13,7 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; v16u8 mask, hev, flat, thresh, b_limit, limit; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; @@ -23,8 +22,6 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; v16i8 zero = { 0 }; - (void)count; - /* load vector elements */ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); @@ -161,8 +158,7 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch, void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p1_out, p0_out, q0_out, q1_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -171,8 +167,6 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, v16u8 zero = { 0 }; v8i16 vec0, vec1, vec2, vec3, vec4; - (void)count; - /* load vector elements */ LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c index 99a96d89b..8414b9ed5 100644 --- a/vpx_dsp/mips/loopfilter_filters_dspr2.c +++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c @@ -23,8 +23,7 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint8_t i; uint32_t mask; uint32_t hev; @@ -117,8 +116,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint8_t i; uint32_t mask, hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; @@ -313,8 +311,8 @@ void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, @@ -324,8 +322,8 @@ void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); + vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, @@ -335,8 +333,8 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1); + vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, @@ -346,9 +344,8 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1); - vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, - 1); + vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); } void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c index 4138f5697..dd0545eed 100644 --- a/vpx_dsp/mips/loopfilter_mb_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c @@ -23,8 +23,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint32_t mask; uint32_t hev, flat; uint8_t i; @@ -322,8 +321,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count) { + const uint8_t *thresh) { uint8_t i; uint32_t mask, hev, flat; uint8_t *s1, *s2, *s3, *s4; diff --git a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c index 8a4865073..85e167ca0 100644 --- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -19,12 +19,12 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -void vpx_lpf_horizontal_16_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +static void mb_lpf_horizontal_edge(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { uint32_t mask; uint32_t hev, flat, flat2; uint8_t i; @@ -791,4 +791,18 @@ void vpx_lpf_horizontal_16_dspr2(unsigned char *s, s = s + 4; } } + +void vpx_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); +} + +void vpx_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); +} #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8168b482a..5457d00bf 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -535,31 +535,35 @@ add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8 specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; $vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon; -add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon; -add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/; -add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/; -$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon; +add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/; +$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon; -add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/; +$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon; + +add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon; -add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; @@ -572,28 +576,31 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/; - add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; + add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_vertical_8 sse2/; add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/; - add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; + add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_vertical_4 sse2/; add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/; - add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_16 sse2/; + add_proto qw/void vpx_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/vpx_highbd_lpf_horizontal_edge_8 sse2/; - add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; + add_proto qw/void vpx_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/vpx_highbd_lpf_horizontal_edge_16 sse2/; + + add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_horizontal_8 sse2/; add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/; - add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd"; + add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/vpx_highbd_lpf_horizontal_4 sse2/; add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c index c4fd5e1a0..72e42adc9 100644 --- a/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -51,12 +51,10 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { // TODO(debargha, peter): Break up large functions into smaller ones // in this file. -static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, - int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, - int bd) { +void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); __m128i blimit, limit, thresh; @@ -496,34 +494,19 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, _mm_store_si128((__m128i *)(s - 0 * p), q0); } -static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s, - int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, - int bd) { - highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd); - highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh, - bd); -} - -// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. -void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, - int count, int bd) { - if (count == 1) - highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd); - else - highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd); +void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + vpx_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); + vpx_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); } void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, - int count, int bd) { + int bd) { DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); @@ -556,8 +539,6 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, __m128i work_a; __m128i filter1, filter2; - (void)count; - if (bd == 8) { blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); @@ -764,16 +745,15 @@ void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd); - vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, - 1, bd); + vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); + vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, - int count, int bd) { + int bd) { const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; @@ -813,8 +793,6 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i work_a; __m128i filter1, filter2; - (void)count; - if (bd == 8) { blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); @@ -944,9 +922,8 @@ void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd); - vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1, - bd); + vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); + vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } static INLINE void highbd_transpose(uint16_t *src[], int in_p, @@ -1058,11 +1035,10 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; - (void)count; // Transpose 8x8 src[0] = s - 4; @@ -1071,8 +1047,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, highbd_transpose(src, p, dst, 8, 1); // Loop filtering - vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1, - bd); + vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); src[0] = t_dst; dst[0] = s - 4; @@ -1112,11 +1087,10 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, - int count, int bd) { + int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; - (void)count; // Transpose 8x8 src[0] = s - 4; @@ -1125,8 +1099,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, highbd_transpose(src, p, dst, 8, 1); // Loop filtering - vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1, - bd); + vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); src[0] = t_dst; dst[0] = s - 4; @@ -1181,8 +1154,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, highbd_transpose(src, p, dst, 8, 2); // Loop filtering - highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, - thresh, bd); + vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, + thresh, bd); src[0] = t_dst; src[1] = t_dst + 8 * 8; dst[0] = s - 8; @@ -1205,8 +1178,8 @@ void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); // Loop filtering - highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, - thresh, bd); + vpx_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, + thresh, bd); // Transpose back highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c index 23a97dd05..be1087c1e 100644 --- a/vpx_dsp/x86/loopfilter_avx2.c +++ b/vpx_dsp/x86/loopfilter_avx2.c @@ -13,9 +13,10 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, - const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -400,9 +401,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, - const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -975,12 +977,3 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, _mm_storeu_si128((__m128i *) (s + 6 * p), q6); } } - -void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh, int count) { - if (count == 1) - mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh); - else - mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh); -} diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm index b9c18b680..15105e3ed 100644 --- a/vpx_dsp/x86/loopfilter_mmx.asm +++ b/vpx_dsp/x86/loopfilter_mmx.asm @@ -18,14 +18,13 @@ ; int src_pixel_step, ; const char *blimit, ; const char *limit, -; const char *thresh, -; int count +; const char *thresh ;) global sym(vpx_lpf_horizontal_4_mmx) PRIVATE sym(vpx_lpf_horizontal_4_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi @@ -39,8 +38,6 @@ sym(vpx_lpf_horizontal_4_mmx): mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - movsxd rcx, dword ptr arg(5) ;count -.next8_h: mov rdx, arg(3) ;limit movq mm7, [rdx] mov rdi, rsi ; rdi points to row +1 for indirect addressing @@ -208,11 +205,6 @@ sym(vpx_lpf_horizontal_4_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset movq [rdi], mm7 ; write back - add rsi,8 - neg rax - dec rcx - jnz .next8_h - add rsp, 32 pop rsp ; begin epilog @@ -230,14 +222,13 @@ sym(vpx_lpf_horizontal_4_mmx): ; int src_pixel_step, ; const char *blimit, ; const char *limit, -; const char *thresh, -; int count +; const char *thresh ;) global sym(vpx_lpf_vertical_4_mmx) PRIVATE sym(vpx_lpf_vertical_4_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi @@ -254,8 +245,6 @@ sym(vpx_lpf_vertical_4_mmx): lea rsi, [rsi + rax*4 - 4] - movsxd rcx, dword ptr arg(5) ;count -.next8_v: mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -579,10 +568,6 @@ sym(vpx_lpf_vertical_4_mmx): movd [rdi+rax*2+2], mm5 - lea rsi, [rsi+rax*8] - dec rcx - jnz .next8_v - add rsp, 64 pop rsp ; begin epilog diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index ed1012736..e03508a03 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -18,11 +18,10 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } -static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -383,11 +382,10 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -716,21 +714,10 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, } } -// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. -void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, int count) { - if (count == 1) - mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); - else - mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); -} - void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh, int count) { + const unsigned char *_thresh) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -745,8 +732,6 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - (void)count; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), _mm_loadl_epi64((__m128i *)(s + 3 * p))); q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), @@ -1492,11 +1477,10 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, const unsigned char *blimit, const unsigned char *limit, - const unsigned char *thresh, int count) { + const unsigned char *thresh) { DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); unsigned char *src[1]; unsigned char *dst[1]; - (void)count; // Transpose 8x8 src[0] = s - 4; @@ -1505,7 +1489,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, transpose(src, p, dst, 8, 1); // Loop filtering - vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1); + vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); src[0] = t_dst; dst[0] = s - 4; @@ -1557,7 +1541,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, transpose(src, p, dst, 8, 2); // Loop filtering - mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh); + vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); src[0] = t_dst; src[1] = t_dst + 8 * 8; @@ -1578,8 +1562,7 @@ void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); // Loop filtering - mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, - thresh); + vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); // Transpose back transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);