From fb484524bdb7c8efc0262421fd1a9c6d17849483 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 26 Oct 2013 14:33:45 +0200 Subject: [PATCH] vp9: add multi-threaded tile decoder tiles are decoded in parallel within a single frame Change-Id: I7aca87cb1c239b74eceef72bdc9f672faebac373 --- test/vp9_thread_test.cc | 47 ++++++++++++-- vp9/decoder/vp9_decodframe.c | 116 ++++++++++++++++++++++++++++++++++- vp9/decoder/vp9_onyxd_if.c | 8 +++ vp9/decoder/vp9_onyxd_int.h | 3 + 4 files changed, 167 insertions(+), 7 deletions(-) diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 76fc9bbfb..a8ce6e48a 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -8,16 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/decoder/vp9_thread.h" +#include <string> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/decode_test_driver.h" #include "test/md5_helper.h" #include "test/webm_video_source.h" +#include "vp9/decoder/vp9_thread.h" namespace { +using std::string; + class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> { protected: virtual ~VP9WorkerThreadTest() {} @@ -91,19 +94,26 @@ TEST_P(VP9WorkerThreadTest, HookFailure) { EXPECT_FALSE(worker_.had_error); } -TEST(VP9DecodeMTTest, MTDecode) { - libvpx_test::WebMVideoSource video("vp90-2-03-size-226x226.webm"); +// ----------------------------------------------------------------------------- +// Multi-threaded decode tests + +// Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames. 
+string DecodeFile(const string& filename, int num_threads) { + libvpx_test::WebMVideoSource video(filename); video.Init(); vpx_codec_dec_cfg_t cfg = {0}; - cfg.threads = 2; + cfg.threads = num_threads; libvpx_test::VP9Decoder decoder(cfg, 0); libvpx_test::MD5 md5; for (video.Begin(); video.cxdata(); video.Next()) { const vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size()); - ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); + if (res != VPX_CODEC_OK) { + EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); + break; + } libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); const vpx_image_t *img = NULL; @@ -113,7 +123,32 @@ TEST(VP9DecodeMTTest, MTDecode) { md5.Add(img); } } - EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", md5.Get()); + return string(md5.Get()); +} + +TEST(VP9DecodeMTTest, MTDecode) { + // no tiles or frame parallel; this exercises loop filter threading. + EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", + DecodeFile("vp90-2-03-size-226x226.webm", 2).c_str()); +} + +TEST(VP9DecodeMTTest, MTDecode2) { + static const struct { + const char *name; + const char *expected_md5; + } files[] = { + { "vp90-2-08-tile_1x2_frame_parallel.webm", + "68ede6abd66bae0a2edf2eb9232241b6" }, + { "vp90-2-08-tile_1x4_frame_parallel.webm", + "368ebc6ebf3a5e478d85b2c3149b2848" }, + }; + + for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) { + for (int t = 2; t <= 4; ++t) { + EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str()) + << "threads = " << t; + } + } } INSTANTIATE_TEST_CASE_P(Synchronous, VP9WorkerThreadTest, ::testing::Bool()); diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index a13d49e37..f5e4592eb 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -37,6 +37,12 @@ #include "vp9/decoder/vp9_thread.h" #include "vp9/decoder/vp9_treereader.h" +typedef struct TileWorkerData { + VP9_COMMON *cm; + vp9_reader bit_reader; + 
DECLARE_ALIGNED(16, MACROBLOCKD, xd); +} TileWorkerData; + static int read_be32(const uint8_t *p) { return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; } @@ -917,6 +923,106 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { return vp9_reader_find_end(&residual_bc); } +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *const tile_data = (TileWorkerData*)arg1; + const TileInfo *const tile = (TileInfo*)arg2; + int mi_row, mi_col; + + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + mi_row += MI_BLOCK_SIZE) { + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) + decode_modes_sb(tile_data->cm, &tile_data->xd, tile, + mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64, 0); + } + return !tile_data->xd.corrupted; +} + +static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { + VP9_COMMON *const cm = &pbi->common; + const uint8_t *const data_end = pbi->source + pbi->source_sz; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); + int tile_col = 0; + + assert(tile_rows == 1); + (void)tile_rows; + + if (num_workers > pbi->num_tile_workers) { + int i; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + vpx_realloc(pbi->tile_workers, + num_workers * sizeof(*pbi->tile_workers))); + for (i = pbi->num_tile_workers; i < num_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + ++pbi->num_tile_workers; + + vp9_worker_init(worker); + worker->hook = (VP9WorkerHook)tile_worker_hook; + CHECK_MEM_ERROR(cm, worker->data1, vpx_malloc(sizeof(TileWorkerData))); + CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); + if (i < num_workers - 1 && !vp9_worker_reset(worker)) { + 
vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + } + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + vpx_memset(pbi->above_context[0], 0, + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols); + vpx_memset(pbi->above_seg_context, 0, + sizeof(*pbi->above_seg_context) * aligned_mi_cols); + + while (tile_col < tile_cols) { + int i; + for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; + TileInfo *const tile = (TileInfo*)worker->data2; + const size_t size = + get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data); + + tile_data->cm = cm; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + vp9_tile_init(tile, tile_data->cm, 0, tile_col); + + setup_token_decoder(data, data_end, size, &cm->error, + &tile_data->bit_reader); + setup_tile_context(pbi, &tile_data->xd, tile_col); + + worker->had_error = 0; + if (i == num_workers - 1 || tile_col == tile_cols - 1) { + vp9_worker_execute(worker); + } else { + vp9_worker_launch(worker); + } + + data += size; + ++tile_col; + } + + for (; i > 0; --i) { + VP9Worker *const worker = &pbi->tile_workers[i - 1]; + pbi->mb.corrupted |= !vp9_worker_sync(worker); + } + } + + { + const int final_worker = (tile_cols + num_workers - 1) % num_workers; + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[final_worker].data1; + return vp9_reader_find_end(&tile_data->bit_reader); + } +} + static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 || vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 || @@ -1157,6 +1263,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler }; const size_t 
first_partition_size = read_uncompressed_header(pbi, &rb); const int keyframe = cm->frame_type == KEY_FRAME; + const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); @@ -1208,7 +1315,14 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { xd->corrupted = 0; new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); - *p_data_end = decode_tiles(pbi, data + first_partition_size); + // TODO(jzern): remove frame_parallel_decoding_mode restriction for + // single-frame tile decoding. + if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 && + cm->frame_parallel_decoding_mode) { + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size); + } else { + *p_data_end = decode_tiles(pbi, data + first_partition_size); + } cm->last_width = cm->width; cm->last_height = cm->height; diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index ada73cc4c..5f970a3d5 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -147,6 +147,7 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { } void vp9_remove_decompressor(VP9D_PTR ptr) { + int i; VP9D_COMP *const pbi = (VP9D_COMP *)ptr; if (!pbi) @@ -155,6 +156,13 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { vp9_remove_common(&pbi->common); vp9_worker_end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); + for (i = 0; i < pbi->num_tile_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + vp9_worker_end(worker); + vpx_free(worker->data1); + vpx_free(worker->data2); + } + vpx_free(pbi->tile_workers); vpx_free(pbi->mi_streams); vpx_free(pbi->above_context[0]); vpx_free(pbi->above_seg_context); diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h index 77399529a..83ea96771 100644 --- a/vp9/decoder/vp9_onyxd_int.h +++ b/vp9/decoder/vp9_onyxd_int.h @@ -40,6 +40,9 @@ typedef struct VP9Decompressor { int 
do_loopfilter_inline; // apply loopfilter to available rows immediately VP9Worker lf_worker; + VP9Worker *tile_workers; + int num_tile_workers; + /* Each tile column has its own MODE_INFO stream. This array indexes them by tile column index. */ MODE_INFO **mi_streams;