From 6ae58931d6902ca5ac7865e6abf155adf878662d Mon Sep 17 00:00:00 2001 From: Frank Galligan Date: Sun, 6 Apr 2014 20:07:14 -0700 Subject: [PATCH] Fix decoder resolution change with tiles There was a bug with the decoder that if you started the decoder with more threads than the first frame had tile columns. Afterwards tried to decode a frame with more tile columns than the first frame, the decoder would hang. E.g. run vpxdec --threads=4. The first frame had two tile columns, then the next key frame had 4 tile columns, the decoder would hang. If you started with 4 tiles and switched to 2 tiles the decoder would be fine. The issue is that the worker the thread loop is using is stale. I added a test vector "vp90-2-14-resize-848x480-1280x720.webm" that exhibited the bug. Change-Id: I7bdd47241a52ac0fe1c693a609bc779257e94229 --- test/test-data.sha1 | 4 ++++ test/test.mk | 4 ++++ test/test_vectors.cc | 4 +++- test/vp9_thread_test.cc | 20 ++++++++++++++++++++ vp9/decoder/vp9_decodeframe.c | 16 ++++++++++------ vp9/decoder/vp9_dthread.c | 19 +++++++++++++++---- 6 files changed, 56 insertions(+), 11 deletions(-) diff --git a/test/test-data.sha1 b/test/test-data.sha1 index b8f668a78..18f5b8859 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -591,3 +591,7 @@ c21e97e4ba486520118d78b01a5cb6e6dc33e190 vp90-2-12-droppable_3.ivf 601abc9e4176c70f82ac0381365e9b151fdd24cd vp90-2-12-droppable_3.ivf.md5 61c640dad23cd4f7ad811b867e7b7e3521f4e3ba vp90-2-13-largescaling.webm bca1b02eebdb088fa3f389fe0e7571e75a71f523 vp90-2-13-largescaling.webm.md5 +248e799b418cc4a20db149f0ca085583e772643c vp90-2-14-resize-1280x720-848x480.webm +80b82616efc12593d796b7b1d5cee06c48a0a11e vp90-2-14-resize-1280x720-848x480.webm.md5 +df93c49c951c460dfebd06af1e81ca11de7e3f51 vp90-2-14-resize-848x480-1280x720.webm +b1faac0b794a50efa811253d255940c4b1c3b885 vp90-2-14-resize-848x480-1280x720.webm.md5 diff --git a/test/test.mk b/test/test.mk index 4d96bc69d..0cf2b3655 100644 --- a/test/test.mk +++ b/test/test.mk @@ -698,6 +698,10 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-1280x720-848x480.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-1280x720-848x480.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-848x480-1280x720.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-848x480-1280x720.webm.md5 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) # BBB VP9 streams diff --git a/test/test_vectors.cc b/test/test_vectors.cc index 8c789ffe7..a718f3d41 100644 --- a/test/test_vectors.cc +++ b/test/test_vectors.cc @@ -164,7 +164,9 @@ const char *const kVP9TestVectors[] = { "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm", "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf", "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf", - "vp90-2-13-largescaling.webm", "vp91-2-04-yv444.webm" + "vp90-2-13-largescaling.webm", "vp91-2-04-yv444.webm", + "vp90-2-14-resize-1280x720-848x480.webm", + "vp90-2-14-resize-848x480-1280x720.webm" }; const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors); #endif // CONFIG_VP9_DECODER diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index a78cdea6b..731c236e3 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -153,6 +153,26 @@ TEST(VP9DecodeMTTest, MTDecode2) { } } +// Test tile quantity changes within one file. +TEST(VP9DecodeMTTest, MTDecode3) { + static const struct { + const char *name; + const char *expected_md5; + } files[] = { + { "vp90-2-14-resize-1280x720-848x480.webm", + "1fe742b0c5d9ed92314b7bc68bbea153" }, + { "vp90-2-14-resize-848x480-1280x720.webm", + "0f163f491617f0ebcb80f1033eeb5d40" }, + }; + + for (int i = 0; i < static_cast(sizeof(files) / sizeof(files[0])); ++i) { + for (int t = 2; t <= 8; ++t) { + EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str()) + << "threads = " << t; + } + } +} + INSTANTIATE_TEST_CASE_P(Synchronous, VP9WorkerThreadTest, ::testing::Bool()); } // namespace diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 5a2e6f881..08b8522cc 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -882,12 +882,16 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, assert(tile_rows == 1); (void)tile_rows; - if (num_workers > pbi->num_tile_workers) { + // TODO(jzen): See if we can remove the restriction of passing in max + // threads to the decoder. + if (pbi->num_tile_workers == 0) { + const int num_threads = pbi->oxcf.max_threads & ~1; int i; + // TODO(jzern): Allocate one less worker, as in the current code we only + // use num_threads - 1 workers. CHECK_MEM_ERROR(cm, pbi->tile_workers, - vpx_realloc(pbi->tile_workers, - num_workers * sizeof(*pbi->tile_workers))); - for (i = pbi->num_tile_workers; i < num_workers; ++i) { + vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); + for (i = 0; i < num_threads; ++i) { VP9Worker *const worker = &pbi->tile_workers[i]; ++pbi->num_tile_workers; @@ -895,7 +899,7 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, CHECK_MEM_ERROR(cm, worker->data1, vpx_memalign(32, sizeof(TileWorkerData))); CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); - if (i < num_workers - 1 && !vp9_worker_reset(worker)) { + if (i < num_threads - 1 && !vp9_worker_reset(worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Tile decoder thread creation failed"); } @@ -903,7 +907,7 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, } // Reset tile decoding hook - for (n = 0; n < pbi->num_tile_workers; ++n) { + for (n = 0; n < num_workers; ++n) { pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook; } diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index 163936021..acbe878b3 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -139,6 +139,8 @@ void vp9_loop_filter_frame_mt(VP9D_COMP *pbi, int y_only, int partial_frame) { // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); int i; // Allocate memory used in thread synchronization. @@ -168,7 +170,16 @@ void vp9_loop_filter_frame_mt(VP9D_COMP *pbi, sizeof(*pbi->lf_row_sync.cur_sb_col) * sb_rows); // Set up loopfilter thread data. - for (i = 0; i < pbi->num_tile_workers; ++i) { + // The decoder is using num_workers instead of pbi->num_tile_workers + // because it has been observed that using more threads on the + // loopfilter, than there are tile columns in the frame will hurt + // performance on Android. This is because the system will only + // schedule the tile decode workers on cores equal to the number + // of tile columns. Then if the decoder tries to use more threads for the + // loopfilter, it will hurt performance because of contention. If the + // multithreading code changes in the future then the number of workers + // used by the loopfilter should be revisited. + for (i = 0; i < num_workers; ++i) { VP9Worker *const worker = &pbi->tile_workers[i]; TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; LFWorkerData *const lf_data = &tile_data->lfdata; @@ -184,10 +195,10 @@ void vp9_loop_filter_frame_mt(VP9D_COMP *pbi, lf_data->y_only = y_only; // always do all planes in decoder lf_data->lf_sync = &pbi->lf_row_sync; - lf_data->num_lf_workers = pbi->num_tile_workers; + lf_data->num_lf_workers = num_workers; // Start loopfiltering - if (i == pbi->num_tile_workers - 1) { + if (i == num_workers - 1) { vp9_worker_execute(worker); } else { vp9_worker_launch(worker); @@ -195,7 +206,7 @@ void vp9_loop_filter_frame_mt(VP9D_COMP *pbi, } // Wait till all rows are finished - for (i = 0; i < pbi->num_tile_workers; ++i) { + for (i = 0; i < num_workers; ++i) { vp9_worker_sync(&pbi->tile_workers[i]); } }