From 9ce3a7d76c5ef702337b96b9aa2c944da1b31869 Mon Sep 17 00:00:00 2001 From: Hangyu Kuang Date: Wed, 30 Jul 2014 20:43:40 -0700 Subject: [PATCH] Implement frame parallel decode for VP9. Using 4 threads, frame parallel decode is ~3x faster than single thread decode and around 30% faster than tile parallel decode for frame parallel encoded video on both Android and desktop. Decode speed also scales with the number of threads, so decode could be even faster with more threads. Change-Id: Ia0a549aaa3e83b5a17b31d8299aa496ea4f21e3e --- vp9/common/vp9_alloccommon.c | 104 +++++++--- vp9/common/vp9_entropymode.c | 5 +- vp9/common/vp9_mvref_common.c | 31 +-- vp9/common/vp9_mvref_common.h | 4 +- vp9/common/vp9_onyxc_int.h | 38 +++- vp9/decoder/vp9_decodeframe.c | 172 ++++++++++++---- vp9/decoder/vp9_decodeframe.h | 3 +- vp9/decoder/vp9_decodemv.c | 23 ++- vp9/decoder/vp9_decodemv.h | 3 +- vp9/decoder/vp9_decoder.c | 93 +++++++-- vp9/decoder/vp9_decoder.h | 6 + vp9/decoder/vp9_dthread.c | 165 +++++++++++++++ vp9/decoder/vp9_dthread.h | 29 ++- vp9/encoder/vp9_pickmode.c | 2 +- vp9/encoder/vp9_rdopt.c | 3 +- vp9/vp9_dx_iface.c | 372 +++++++++++++++++++++++++--------- vpx/vpx_frame_buffer.h | 5 +- 17 files changed, 842 insertions(+), 216 deletions(-) diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 04081f107..4ea62398f 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -12,11 +12,37 @@ #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_systemdependent.h" +// TODO(hkuang): Don't need to lock the whole pool after implementing atomic +// frame reference count. 
+void lock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +void unlock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static INLINE void alloc_mi_array(VP9_COMMON *cm, int mi_size, int idx) { + CHECK_MEM_ERROR(cm, cm->mip_array[idx], + vpx_calloc(mi_size, sizeof(*cm->mip_array[0]))); + CHECK_MEM_ERROR(cm, cm->mi_grid_base_array[idx], + vpx_calloc(mi_size, sizeof(*cm->mi_grid_base_array[0]))); +} + static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) { int i; @@ -49,40 +75,47 @@ static void setup_mi(VP9_COMMON *cm) { vpx_memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); - clear_mi_border(cm, cm->prev_mip); + // Only clear mi border in non frame-parallel decode. In frame-parallel + // decode, prev_mip is managed by previous decoding thread. While in + // non frame-parallel decode, prev_mip and mip are both managed by + // current decoding thread. + if (!cm->frame_parallel_decode) + clear_mi_border(cm, cm->prev_mip); } static int alloc_mi(VP9_COMMON *cm, int mi_size) { int i; for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) { - cm->mip_array[i] = - (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip)); - if (cm->mip_array[i] == NULL) - return 1; - - cm->mi_grid_base_array[i] = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); - if (cm->mi_grid_base_array[i] == NULL) - return 1; + // Delay reallocation as another thread is accessing prev_mi. + if (cm->frame_parallel_decode && i == cm->prev_mi_idx) { + cm->update_prev_mi = 1; + continue; + } + alloc_mi_array(cm, mi_size, i); } - // Init the index. 
- cm->mi_idx = 0; - cm->prev_mi_idx = 1; - cm->mip = cm->mip_array[cm->mi_idx]; - cm->prev_mip = cm->mip_array[cm->prev_mi_idx]; cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx]; - cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx]; + + if (!cm->frame_parallel_decode) { + cm->mi_idx = 0; + cm->prev_mi_idx = 1; + // In frame-parallel decode, prev_mip comes from another thread, + // so current decoding thread should not touch it. + cm->prev_mip = cm->mip_array[cm->prev_mi_idx]; + cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx]; + } return 0; } -static void free_mi(VP9_COMMON *cm) { +static void free_mi(VP9_COMMON *cm, int decode_done) { int i; for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) { + if (cm->frame_parallel_decode && i == cm->prev_mi_idx && !decode_done) + continue; vpx_free(cm->mip_array[i]); cm->mip_array[i] = NULL; vpx_free(cm->mi_grid_base_array[i]); @@ -90,9 +123,12 @@ static void free_mi(VP9_COMMON *cm) { } cm->mip = NULL; - cm->prev_mip = NULL; cm->mi_grid_base = NULL; - cm->prev_mi_grid_base = NULL; + + if (!cm->frame_parallel_decode) { + cm->prev_mip = NULL; + cm->prev_mi_grid_base = NULL; + } } static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { @@ -109,7 +145,10 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { cm->prev_seg_map_idx = 1; cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + + if (!cm->frame_parallel_decode) { + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + } return 0; } @@ -123,7 +162,10 @@ static void free_seg_map(VP9_COMMON *cm) { } cm->current_frame_seg_map = NULL; - cm->last_frame_seg_map = NULL; + + if (!cm->frame_parallel_decode) { + cm->last_frame_seg_map = NULL; + } } void vp9_free_frame_buffers(VP9_COMMON *cm) { @@ -144,8 +186,7 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) { } void vp9_free_context_buffers(VP9_COMMON *cm) { - free_mi(cm); - + free_mi(cm, 1); 
free_seg_map(cm); vpx_free(cm->above_context); @@ -170,7 +211,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { set_mb_mi(cm, aligned_width, aligned_height); - free_mi(cm); + free_mi(cm, 0); if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail; @@ -288,7 +329,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { void vp9_remove_common(VP9_COMMON *cm) { vp9_free_frame_buffers(cm); vp9_free_context_buffers(cm); - vp9_free_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers); } void vp9_update_frame_size(VP9_COMMON *cm) { @@ -306,6 +346,20 @@ void vp9_update_frame_size(VP9_COMMON *cm) { void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { // Swap indices. const int tmp = cm->mi_idx; + + // Only used in frame parallel decode: Update the prev_mi buffer if + // needed. The worker that was accessing it must already finish decoding. + // So it can be resized safely now. + if (cm->update_prev_mi) { + const int mi_size = cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE); + vpx_free(cm->mip_array[cm->prev_mi_idx]); + vpx_free(cm->mi_grid_base_array[cm->prev_mi_idx]); + cm->mip_array[cm->prev_mi_idx] = NULL; + cm->mi_grid_base_array[cm->prev_mi_idx] = NULL; + alloc_mi_array(cm, mi_size, cm->prev_mi_idx); + cm->update_prev_mi = 0; + } + cm->mi_idx = cm->prev_mi_idx; cm->prev_mi_idx = tmp; diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 79c8b9bc5..4d98cf9cc 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -439,7 +439,8 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { int i; vp9_clearall_segfeatures(&cm->seg); cm->seg.abs_delta = SEGMENT_DELTADATA; - if (cm->last_frame_seg_map) + + if (cm->last_frame_seg_map && !cm->frame_parallel_decode) vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); if (cm->current_frame_seg_map) @@ -467,7 +468,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { cm->frame_contexts[cm->frame_context_idx] = 
cm->fc; } - if (frame_is_intra_only(cm)) + if (frame_is_intra_only(cm) && !cm->frame_parallel_decode) vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index 0fe58c5c8..5913d356f 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -17,14 +17,12 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, const TileInfo *const tile, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - int block, int mi_row, int mi_col) { + int block, int mi_row, int mi_col, + find_mv_refs_sync sync, void *const data) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; - const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi - ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] - : NULL; - const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; - + MODE_INFO *prev_mi = NULL; + MB_MODE_INFO *prev_mbmi = NULL; const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; @@ -71,6 +69,14 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } + // Synchronize here for frame parallel decode if sync function is provided. + if (sync != NULL) { + sync(data, mi_row); + } + prev_mi = cm->coding_use_prev_mi && cm->prev_mi ? + cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] : NULL; + prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; + // Check the last frame's mode and mv info. 
if (prev_mbmi) { if (prev_mbmi->ref_frame[0] == ref_frame) @@ -109,12 +115,13 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, } void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int mi_row, int mi_col) { + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col, + find_mv_refs_sync sync, void *const data) { find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1, - mi_row, mi_col); + mi_row, mi_col, sync, data); } static void lower_mv_precision(MV *mv, int allow_hp) { @@ -152,7 +159,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, assert(MAX_MV_REF_CANDIDATES == 2); find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block, - mi_row, mi_col); + mi_row, mi_col, NULL, NULL); near->as_int = 0; switch (block) { diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 7bce3fa37..14defed9c 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -204,10 +204,12 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } +typedef void (*find_mv_refs_sync)(void *const data, int mi_row); void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, const TileInfo *const tile, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, int mi_row, int mi_col); + int_mv *mv_ref_list, int mi_row, int mi_col, + find_mv_refs_sync sync, void *const data); // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 13c500147..7425abdd7 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -36,10 +36,13 @@ extern "C" { 
#define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 1 scratch frame for the new frame, 3 for scaled references on the encoder +// 4 scratch frames for the new frames to support a maximum of 4 cores decoding +// in parallel, 3 for scaled references on the encoder. +// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number +// of framebuffers. // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 4) +#define FRAME_BUFFERS (REF_FRAMES + 7) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -64,6 +67,18 @@ typedef struct { int ref_count; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; + + // The Following variables will only be used in frame parallel decode. + + // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means + // that no FrameWorker owns, or is decoding, this buffer. + VP9Worker *frame_worker_owner; + + // row and col indicate which position frame has been decoded to in real + // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX + // when the frame is fully decoded. + int row; + int col; } RefCntBuffer; typedef struct { @@ -114,6 +129,10 @@ typedef struct VP9Common { int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */ + // Prepare ref_frame_map for the next frame. + // Only used in frame parallel decode. + int next_ref_frame_map[REF_FRAMES]; + // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and // roll new_fb_idx into it. @@ -178,6 +197,9 @@ typedef struct VP9Common { MODE_INFO **prev_mi_grid_base; MODE_INFO **prev_mi_grid_visible; + // Used in frame parallel decode for delay resizing prev_mi. + int update_prev_mi; + // Persistent mb segment id map used in prediction. 
int seg_map_idx; int prev_seg_map_idx; @@ -197,6 +219,10 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; + // TODO(hkuang): Remove this as it is the same as frame_parallel_decode + // in pbi. + int frame_parallel_decode; // frame-based threading. + // Context probabilities for reference frame prediction int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; @@ -235,6 +261,11 @@ typedef struct VP9Common { ENTROPY_CONTEXT *above_context; } VP9_COMMON; +// TODO(hkuang): Don't need to lock the whole pool after implementing atomic +// frame reference count. +void lock_buffer_pool(BufferPool *const pool); +void unlock_buffer_pool(BufferPool *const pool); + static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf; } @@ -242,12 +273,15 @@ static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { static INLINE int get_free_fb(VP9_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; + + lock_buffer_pool(cm->buffer_pool); for (i = 0; i < FRAME_BUFFERS; ++i) if (frame_bufs[i].ref_count == 0) break; assert(i < FRAME_BUFFERS); frame_bufs[i].ref_count = 1; + unlock_buffer_pool(cm->buffer_pool); return i; } diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index fae4255da..da973c3c8 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -327,21 +327,24 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd, MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME]; xd->block_refs[idx] = ref_buffer; + if (!vp9_is_valid_scale(&ref_buffer->sf)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid scale factors"); vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col, &ref_buffer->sf); - xd->corrupted |= ref_buffer->buf->corrupted; + if (!cm->frame_parallel_decode) + xd->corrupted |= 
ref_buffer->buf->corrupted; } -static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, +static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r, BLOCK_SIZE bsize) { + VP9_COMMON *const cm = &pbi->common; const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col); - vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r); + vp9_read_mode_info(pbi, xd, tile, mi_row, mi_col, r); if (less8x8) bsize = BLOCK_8X8; @@ -365,7 +368,7 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, set_ref(cm, xd, 1, mi_row, mi_col); // Prediction - vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); + vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize); // Reconstruction if (!mbmi->skip) { @@ -404,10 +407,11 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs, return p; } -static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, +static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader* r, BLOCK_SIZE bsize) { + VP9_COMMON *const cm = &pbi->common; const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; PARTITION_TYPE partition; BLOCK_SIZE subsize, uv_subsize; @@ -422,27 +426,27 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Invalid block size."); if (subsize < BLOCK_8X8) { - decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); } else { switch (partition) { case PARTITION_NONE: - decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); break; case PARTITION_HORZ: - decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); 
if (mi_row + hbs < cm->mi_rows) - decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize); + decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize); break; case PARTITION_VERT: - decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); if (mi_col + hbs < cm->mi_cols) - decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize); + decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize); break; case PARTITION_SPLIT: - decode_partition(cm, xd, tile, mi_row, mi_col, r, subsize); - decode_partition(cm, xd, tile, mi_row, mi_col + hbs, r, subsize); - decode_partition(cm, xd, tile, mi_row + hbs, mi_col, r, subsize); - decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize); + decode_partition(pbi, xd, tile, mi_row, mi_col, r, subsize); + decode_partition(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize); + decode_partition(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize); + decode_partition(pbi, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize); break; default: assert(0 && "Invalid partition type"); @@ -638,6 +642,7 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) { vp9_update_frame_size(cm); } + lock_buffer_pool(pool); if (vp9_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS, @@ -646,6 +651,7 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } + unlock_buffer_pool(pool); } static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { @@ -778,7 +784,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const int tile_rows = 1 << cm->log2_tile_rows; TileBuffer tile_buffers[4][1 << 6]; int tile_row, tile_col; - int mi_row, mi_col; + int mi_row = 0, mi_col = 0; TileData *tile_data = NULL; if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) { @@ 
-798,7 +804,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, vp9_copy(lf_data->planes, pbi->mb.plane); lf_data->stop = 0; lf_data->y_only = 0; - vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } assert(tile_rows <= 4); @@ -856,7 +861,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col, + decode_partition(pbi, &tile_data->xd, &tile, mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64); } } @@ -880,6 +885,12 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, winterface->execute(&pbi->lf_worker); } } + // After loopfiltering, the last 7 row pixels in each superblock row may + // still be changed by the longest loopfilter of the next superblock + // row. + if (pbi->frame_parallel_decode) + vp9_frameworker_broadcast(pbi->cur_buf, + mi_row << MI_BLOCK_SIZE_LOG2); } } @@ -895,6 +906,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, // Get last tile data. 
tile_data = pbi->tile_data + tile_cols * tile_rows - 1; + if (pbi->frame_parallel_decode) + vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX); return vp9_reader_find_end(&tile_data->bit_reader); } @@ -909,7 +922,7 @@ static int tile_worker_hook(void *arg1, void *arg2) { vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(tile_data->cm, &tile_data->xd, tile, + decode_partition(tile_data->pbi, &tile_data->xd, tile, mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64); } } @@ -1015,10 +1028,10 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, TileInfo *const tile = (TileInfo*)worker->data2; TileBuffer *const buf = &tile_buffers[0][n]; - tile_data->cm = cm; + tile_data->pbi = pbi; tile_data->xd = pbi->mb; tile_data->xd.corrupted = 0; - vp9_tile_init(tile, tile_data->cm, 0, buf->col); + vp9_tile_init(tile, &pbi->common, 0, buf->col); setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); @@ -1078,8 +1091,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, struct vp9_read_bit_buffer *rb) { VP9_COMMON *const cm = &pbi->common; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + BufferPool *const pool = pbi->common.buffer_pool; + int i, mask, ref_index = 0; size_t sz; - int i; cm->last_frame_type = cm->frame_type; @@ -1096,16 +1110,22 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->show_existing_frame) { // Show an existing frame directly. 
const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; - + lock_buffer_pool(pool); if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Buffer %d does not contain a decoded frame", frame_to_show); ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); + unlock_buffer_pool(pool); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; cm->show_frame = 1; + + if (pbi->frame_parallel_decode) { + for (i = 0; i < REF_FRAMES; ++i) + cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; + } return 0; } @@ -1166,7 +1186,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, ref_frame->buf = &frame_bufs[idx].buf; cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb); } - setup_frame_size_with_refs(cm, rb); cm->allow_high_precision_mv = vp9_rb_read_bit(rb); @@ -1198,6 +1217,29 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, // below, forcing the use of context 0 for those frame types. cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); + // Generate next_ref_frame_map. + lock_buffer_pool(pool); + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; + ++frame_bufs[cm->new_fb_idx].ref_count; + } else { + cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; + } + // Current thread holds the reference frame. + if (cm->ref_frame_map[ref_index] >= 0) + ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; + ++ref_index; + } + + for (; ref_index < REF_FRAMES; ++ref_index) { + cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; + // Current thread holds the reference frame. 
+ if (cm->ref_frame_map[ref_index] >= 0) + ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; + } + unlock_buffer_pool(pool); + if (frame_is_intra_only(cm) || cm->error_resilient_mode) vp9_setup_past_independence(cm); @@ -1343,6 +1385,7 @@ void vp9_decode_frame(VP9Decoder *pbi, VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0}; + int context_updated = 0; uint8_t clear_data[MAX_VP9_HEADER_SIZE]; const size_t first_partition_size = read_uncompressed_header(pbi, @@ -1380,6 +1423,28 @@ void vp9_decode_frame(VP9Decoder *pbi, xd->corrupted = 0; new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); + if (cm->lf.filter_level) { + vp9_loop_filter_frame_init(cm, cm->lf.filter_level); + } + + // If encoded in frame parallel mode, frame context is ready after decoding + // the frame header. + if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { + VP9Worker *const worker = pbi->frame_worker_owner; + FrameWorkerData *const frame_worker_data = worker->data1; + if (cm->refresh_frame_context) { + context_updated = 1; + cm->frame_contexts[cm->frame_context_idx] = cm->fc; + } + vp9_frameworker_lock_stats(worker); + pbi->cur_buf->row = -1; + pbi->cur_buf->col = -1; + frame_worker_data->frame_context_ready = 1; + // Signal the main thread that context is ready. + vp9_frameworker_signal_stats(worker); + vp9_frameworker_unlock_stats(worker); + } + // TODO(jzern): remove frame_parallel_decoding_mode restriction for // single-frame tile decoding. if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 && @@ -1407,7 +1472,8 @@ void vp9_decode_frame(VP9Decoder *pbi, } } - if (cm->refresh_frame_context) + // Non frame parallel update frame context here. 
+ if (cm->refresh_frame_context && !context_updated) cm->frame_contexts[cm->frame_context_idx] = cm->fc; } @@ -1454,10 +1520,9 @@ static void build_mc_border(const uint8_t *src, int src_stride, } while (--b_h); } -void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, - int bw, int bh, - int x, int y, int w, int h, - int mi_x, int mi_y) { +void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd, + int plane, int block, int bw, int bh, int x, + int y, int w, int h, int mi_x, int mi_y) { struct macroblockd_plane *const pd = &xd->plane[plane]; const MODE_INFO *mi = xd->mi[0]; const int is_compound = has_second_ref(&mi->mbmi); @@ -1484,20 +1549,23 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, pd->subsampling_y); MV32 scaled_mv; - int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride, - subpel_x, subpel_y; + int xs, ys, x0, y0, x0_16, y0_16, y1, frame_width, frame_height, + buf_stride, subpel_x, subpel_y; uint8_t *ref_frame, *buf_ptr; - const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + const int idx = xd->block_refs[ref]->idx; + BufferPool *const pool = pbi->common.buffer_pool; + RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx]; // Get reference frame pointer, width and height. if (plane == 0) { - frame_width = ref_buf->y_crop_width; - frame_height = ref_buf->y_crop_height; - ref_frame = ref_buf->y_buffer; + frame_width = ref_frame_buf->buf.y_crop_width; + frame_height = ref_frame_buf->buf.y_crop_height; + ref_frame = ref_frame_buf->buf.y_buffer; } else { - frame_width = ref_buf->uv_crop_width; - frame_height = ref_buf->uv_crop_height; - ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer; + frame_width = ref_frame_buf->buf.uv_crop_width; + frame_height = ref_frame_buf->buf.uv_crop_height; + ref_frame = plane == 1 ? 
ref_frame_buf->buf.u_buffer + : ref_frame_buf->buf.v_buffer; } if (vp9_is_scaled(sf)) { @@ -1550,15 +1618,18 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, buf_ptr = ref_frame + y0 * pre_buf->stride + x0; buf_stride = pre_buf->stride; + // Get reference block bottom right vertical coordinate. + y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; + // Do border extension if there is motion or the // width/height is not a multiple of 8 pixels. if (scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || (frame_height & 0x7)) { - // Get reference block bottom right coordinate. - int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; - int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; int x_pad = 0, y_pad = 0; + // Get reference block bottom right horizontal coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) { x0 -= VP9_INTERP_EXTEND - 1; x1 += VP9_INTERP_EXTEND; @@ -1571,6 +1642,12 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, y_pad = 1; } + // Wait until reference block is ready. Pad 7 more pixels as last 7 + // pixels of each superblock row can be changed by next superblock row. + if (pbi->frame_parallel_decode) + vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, + (y1 + 7) << (plane == 0 ? 0 : 1)); + // Skip border extension if block is inside the frame. if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width || y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { @@ -1582,6 +1659,12 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, buf_stride = x1 - x0 + 1; buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; } + } else { + // Wait until reference block is ready. Pad 7 more pixels as last 7 + // pixels of each superblock row can be changed by next superblock row. 
+ if (pbi->frame_parallel_decode) + vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, + (y1 + 7) << (plane == 0 ? 0 : 1)); } inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, @@ -1589,7 +1672,8 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, } } -void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, +void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd, + int mi_row, int mi_col, BLOCK_SIZE bsize) { int plane; const int mi_x = mi_col * MI_SIZE; @@ -1607,10 +1691,10 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, assert(bsize == BLOCK_8X8); for (y = 0; y < num_4x4_h; ++y) for (x = 0; x < num_4x4_w; ++x) - dec_build_inter_predictors(xd, plane, i++, bw, bh, + dec_build_inter_predictors(pbi, xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4, mi_x, mi_y); } else { - dec_build_inter_predictors(xd, plane, 0, bw, bh, + dec_build_inter_predictors(pbi, xd, plane, 0, bw, bh, 0, 0, bw, bh, mi_x, mi_y); } } diff --git a/vp9/decoder/vp9_decodeframe.h b/vp9/decoder/vp9_decodeframe.h index 6fbd50c8b..901607ea1 100644 --- a/vp9/decoder/vp9_decodeframe.h +++ b/vp9/decoder/vp9_decodeframe.h @@ -25,7 +25,8 @@ void vp9_decode_frame(struct VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end); -void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, +void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi, + MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); #ifdef __cplusplus } // extern "C" diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 187ff1307..bd9046187 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -420,11 +420,18 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, } } -static void read_inter_block_mode_info(VP9_COMMON *const cm, +static void fpm_sync(void *const data, int mi_row) { + 
VP9Decoder *const pbi = (VP9Decoder *)data; + vp9_frameworker_wait(pbi->frame_worker_owner, pbi->prev_buf, + mi_row << MI_BLOCK_SIZE_LOG2); +} + +static void read_inter_block_mode_info(VP9Decoder *const pbi, MACROBLOCKD *const xd, const TileInfo *const tile, MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; MB_MODE_INFO *const mbmi = &mi->mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; @@ -438,7 +445,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame], - mi_row, mi_col); + mi_row, mi_col, fpm_sync, (void *)pbi); } inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; @@ -512,10 +519,13 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, } } -static void read_inter_frame_mode_info(VP9_COMMON *const cm, +// TODO(hkuang): Pass cm instead of pbi. This requires change in +// vp9_frameworker_wait. 
+static void read_inter_frame_mode_info(VP9Decoder *const pbi, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; int inter_block; @@ -529,16 +539,17 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm, !mbmi->skip || !inter_block, r); if (inter_block) - read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); + read_inter_block_mode_info(pbi, xd, tile, mi, mi_row, mi_col, r); else read_intra_block_mode_info(cm, mi, r); } -void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, +void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; if (frame_is_intra_only(cm)) read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r); else - read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r); + read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r); } diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h index 7394b62b4..dd97d8da0 100644 --- a/vp9/decoder/vp9_decodemv.h +++ b/vp9/decoder/vp9_decodemv.h @@ -11,6 +11,7 @@ #ifndef VP9_DECODER_VP9_DECODEMV_H_ #define VP9_DECODER_VP9_DECODEMV_H_ +#include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_reader.h" #ifdef __cplusplus @@ -19,7 +20,7 @@ extern "C" { struct TileInfo; -void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, +void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, const struct TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r); diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 07fe2899d..0a5ed0c3a 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -26,6 +26,7 @@ #endif #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_systemdependent.h" +#include "vp9/common/vp9_thread.h" #include "vp9/decoder/vp9_decodeframe.h" 
#include "vp9/decoder/vp9_decoder.h" @@ -63,6 +64,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { // Initialize the references to not point to any frame buffers. vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + vpx_memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map)); cm->current_video_frame = 0; pbi->ready_for_new_data = 1; @@ -195,29 +197,51 @@ int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) { return 0; } +static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, + BufferPool *const pool) { + if (idx >= 0) { + --frame_bufs[idx].ref_count; + if (frame_bufs[idx].ref_count == 0) { + pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + } + } +} + /* If any buffer updating is signaled it should be done here. */ static void swap_frame_buffers(VP9Decoder *pbi) { int ref_index = 0, mask; - VP9_COMMON * const cm = &pbi->common; - BufferPool * const pool = cm->buffer_pool; + VP9_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - if (mask & 1) { - const int old_idx = cm->ref_frame_map[ref_index]; - ref_cnt_fb(frame_bufs, &cm->ref_frame_map[ref_index], - cm->new_fb_idx); - if (old_idx >= 0 && frame_bufs[old_idx].ref_count == 0) - pool->release_fb_cb(pool->cb_priv, - &frame_bufs[old_idx].raw_frame_buffer); + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if ((mask & 1) && old_idx >= 0) { + decrease_ref_count(old_idx, frame_bufs, pool); } + cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; ++ref_index; } + // Current thread releases the holding of reference frame. 
+ for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + } + unlock_buffer_pool(pool); + cm->frame_to_show = get_frame_new_buffer(cm); if (!pbi->frame_parallel_decode || !cm->show_frame) { + lock_buffer_pool(pool); --frame_bufs[cm->new_fb_idx].ref_count; + unlock_buffer_pool(pool); } // Invalidate these references until the next frame starts. @@ -256,6 +280,20 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); cm->new_fb_idx = get_free_fb(cm); + + if (pbi->frame_parallel_decode) { + VP9Worker *const worker = pbi->frame_worker_owner; + vp9_frameworker_lock_stats(worker); + frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; + // Reset decoding progress. + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; + pbi->cur_buf->row = -1; + pbi->cur_buf->col = -1; + vp9_frameworker_unlock_stats(worker); + } else { + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; + } + if (setjmp(cm->error.jmp)) { cm->error.setjmp = 0; @@ -283,20 +321,39 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, vp9_clear_system_state(); - cm->last_width = cm->width; - cm->last_height = cm->height; - if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; - if (cm->show_frame) { - if (!cm->show_existing_frame) - vp9_swap_mi_and_prev_mi(cm); - cm->current_video_frame++; + // Update progress in frame parallel decode. + if (pbi->frame_parallel_decode) { + // Need to lock the mutex here as another thread may + // be accessing this buffer. 
+ VP9Worker *const worker = pbi->frame_worker_owner; + FrameWorkerData *const frame_worker_data = worker->data1; + vp9_frameworker_lock_stats(worker); + + if (cm->show_frame) { + if (!cm->show_existing_frame) + vp9_swap_mi_and_prev_mi(cm); + cm->current_video_frame++; + } + vp9_swap_current_and_last_seg_map(cm); + frame_worker_data->frame_decoded = 1; + frame_worker_data->frame_context_ready = 1; + vp9_frameworker_signal_stats(worker); + vp9_frameworker_unlock_stats(worker); + } else { + cm->last_width = cm->width; + cm->last_height = cm->height; + if (cm->show_frame) { + if (!cm->show_existing_frame) + vp9_swap_mi_and_prev_mi(cm); + cm->current_video_frame++; + } + + vp9_swap_current_and_last_seg_map(cm); } - vp9_swap_current_and_last_seg_map(cm); - pbi->ready_for_new_data = 0; cm->error.setjmp = 0; diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 758d49006..9844e2031 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -45,6 +45,12 @@ typedef struct VP9Decoder { int frame_parallel_decode; // frame-based threading. + // TODO(hkuang): Combine this with cur_buf in macroblockd as they are + // the same. + RefCntBuffer *cur_buf; // Current decoding frame buffer. + RefCntBuffer *prev_buf; // Previous decoding frame buffer. + + VP9Worker *frame_worker_owner; // frame_worker that owns this pbi. VP9Worker lf_worker; VP9Worker *tile_workers; int num_tile_workers; diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index 5dda49a0f..f599c2a8b 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -17,6 +17,8 @@ #include "vp9/decoder/vp9_dthread.h" #include "vp9/decoder/vp9_decoder.h" +// #define DEBUG_THREAD + #if CONFIG_MULTITHREAD static INLINE void mutex_lock(pthread_mutex_t *const mutex) { const int kMaxTryLocks = 4000; @@ -279,3 +281,166 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) { vp9_zero(*lf_sync); } } + +// TODO(hkuang): Clean up all the #ifdef in this file. 
+void vp9_frameworker_lock_stats(VP9Worker *const worker) { +#if CONFIG_MULTITHREAD + FrameWorkerData *const worker_data = worker->data1; + pthread_mutex_lock(&worker_data->stats_mutex); +#else + (void)worker; +#endif +} + +void vp9_frameworker_unlock_stats(VP9Worker *const worker) { +#if CONFIG_MULTITHREAD + FrameWorkerData *const worker_data = worker->data1; + pthread_mutex_unlock(&worker_data->stats_mutex); +#else + (void)worker; +#endif +} + +void vp9_frameworker_signal_stats(VP9Worker *const worker) { +#if CONFIG_MULTITHREAD + FrameWorkerData *const worker_data = worker->data1; + // TODO(hkuang): Investigate using broadcast or signal. + pthread_cond_signal(&worker_data->stats_cond); +#else + (void)worker; +#endif +} + +// TODO(hkuang): Remove worker parameter as it is only used in debug code. +void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, + int row) { +#if CONFIG_MULTITHREAD + if (!ref_buf) + return; + + // Enabling the following line of code will get harmless tsan error but + // will get best performance. + // if (ref_buf->row >= row) return; + + { + // Find the worker thread that owns the reference frame. If the reference + // frame has been fully decoded, it may not have owner. 
+ VP9Worker *const ref_worker = ref_buf->frame_worker_owner; + FrameWorkerData *const ref_worker_data = + (FrameWorkerData *)ref_worker->data1; + const VP9Decoder *const pbi = ref_worker_data->pbi; + +#ifdef DEBUG_THREAD + { + FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; + printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n", + worker_data->worker_id, worker, ref_worker_data->worker_id, + ref_buf->frame_worker_owner, row, ref_buf->row); + } +#endif + + vp9_frameworker_lock_stats(ref_worker); + while (ref_buf->row < row && pbi->cur_buf == ref_buf) { + pthread_cond_wait(&ref_worker_data->stats_cond, + &ref_worker_data->stats_mutex); + } + vp9_frameworker_unlock_stats(ref_worker); + } +#else + (void)ref_buf; + (void)row; + (void)ref_buf; +#endif // CONFIG_MULTITHREAD +} + +void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { +#if CONFIG_MULTITHREAD + VP9Worker *worker = buf->frame_worker_owner; + +#ifdef DEBUG_THREAD + printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id, + buf->frame_worker_owner, row); +#endif + + vp9_frameworker_lock_stats(worker); + buf->row = row; + vp9_frameworker_signal_stats(worker); + vp9_frameworker_unlock_stats(worker); +#else + (void)buf; + (void)row; +#endif // CONFIG_MULTITHREAD +} + +void vp9_frameworker_copy_context(VP9Worker *const dst_worker, + VP9Worker *const src_worker) { +#if CONFIG_MULTITHREAD + FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1; + FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1; + VP9_COMMON *const src_cm = &src_worker_data->pbi->common; + VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common; + int i; + + // Wait until source frame's context is ready. 
+ vp9_frameworker_lock_stats(src_worker); + while (!src_worker_data->frame_context_ready) { + pthread_cond_wait(&src_worker_data->stats_cond, + &src_worker_data->stats_mutex); + } + + // src worker may have already finished decoding a frame and swapped the mi. + // TODO(hkuang): Remove following code after implementing no ModeInfo decoding. + if (src_worker_data->frame_decoded) { + dst_cm->prev_mip = src_cm->prev_mip; + dst_cm->prev_mi = src_cm->prev_mi; + dst_cm->prev_mi_grid_base = src_cm->prev_mi_grid_base; + dst_cm->prev_mi_grid_visible = src_cm->prev_mi_grid_visible; + dst_cm->last_frame_seg_map = src_cm->last_frame_seg_map; + } else { + dst_cm->prev_mip = src_cm->mip; + dst_cm->prev_mi = src_cm->mi; + dst_cm->prev_mi_grid_base = src_cm->mi_grid_base; + dst_cm->prev_mi_grid_visible = src_cm->mi_grid_visible; + dst_cm->last_frame_seg_map = src_cm->current_frame_seg_map; + } + + vp9_frameworker_unlock_stats(src_worker); + + dst_worker_data->pbi->prev_buf = + src_worker_data->pbi->common.show_existing_frame ? + NULL : src_worker_data->pbi->cur_buf; + + dst_cm->last_width = !src_cm->show_existing_frame ? + src_cm->width : src_cm->last_width; + dst_cm->last_height = !src_cm->show_existing_frame ? + src_cm->height : src_cm->last_height; + dst_cm->display_width = src_cm->display_width; + dst_cm->display_height = src_cm->display_height; + dst_cm->subsampling_x = src_cm->subsampling_x; + dst_cm->subsampling_y = src_cm->subsampling_y; + dst_cm->last_show_frame = !src_cm->show_existing_frame ? 
+ src_cm->show_frame : src_cm->last_show_frame; + dst_cm->last_frame_type = src_cm->last_frame_type; + dst_cm->frame_type = src_cm->frame_type; + dst_cm->y_dc_delta_q = src_cm->y_dc_delta_q; + dst_cm->uv_dc_delta_q = src_cm->uv_dc_delta_q; + dst_cm->uv_ac_delta_q = src_cm->uv_ac_delta_q; + dst_cm->base_qindex = src_cm->base_qindex; + + for (i = 0; i < REF_FRAMES; ++i) + dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i]; + + memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr, + (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh)); + dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level; + dst_cm->lf.filter_level = src_cm->lf.filter_level; + memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS); + memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); + dst_cm->seg = src_cm->seg; + memcpy(dst_cm->frame_contexts, src_cm->frame_contexts, + FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0])); +#else + (void) dst_worker; + (void) src_worker; +#endif // CONFIG_MULTITHREAD +} diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h index 75b652518..52c3233e4 100644 --- a/vp9/decoder/vp9_dthread.h +++ b/vp9/decoder/vp9_dthread.h @@ -19,7 +19,7 @@ struct VP9Common; struct VP9Decoder; typedef struct TileWorkerData { - struct VP9Common *cm; + struct VP9Decoder *pbi; vp9_reader bit_reader; DECLARE_ALIGNED(16, struct macroblockd, xd); @@ -55,6 +55,14 @@ typedef struct FrameWorkerData { // It is used to make a copy of the compressed data. uint8_t *scratch_buffer; size_t scratch_buffer_size; + +#if CONFIG_MULTITHREAD + pthread_mutex_t stats_mutex; + pthread_cond_t stats_cond; +#endif + + int frame_context_ready; // Current frame's context is ready to read. + int frame_decoded; // Finished decoding current frame. } FrameWorkerData; // Allocate memory for loopfilter row synchronization. 
@@ -71,4 +79,23 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, int frame_filter_level, int y_only); +void vp9_frameworker_lock_stats(VP9Worker *const worker); +void vp9_frameworker_unlock_stats(VP9Worker *const worker); +void vp9_frameworker_signal_stats(VP9Worker *const worker); + +// Wait until ref_buf has been decoded to row in real pixel unit. +// Note: worker may already finish decoding ref_buf and release it in order to +// start decoding next frame. So need to check whether worker is still decoding +// ref_buf. +void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf, + int row); + +// FrameWorker broadcasts its decoding progress so other workers that are +// waiting on it can resume decoding. +void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row); + +// Copy necessary decoding context from src worker to dst worker. +void vp9_frameworker_copy_context(VP9Worker *const dst_worker, + VP9Worker *const src_worker); + #endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index d9edeae3e..c7ab2209c 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -484,7 +484,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cm->coding_use_prev_mi) vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame, - candidates, mi_row, mi_col); + candidates, mi_row, mi_col, NULL, NULL); else const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0], ref_frame, candidates, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e577017e6..b817fac98 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2253,7 +2253,8 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col); + vp9_find_mv_refs(cm, 
xd, tile, mi, ref_frame, candidates, mi_row, mi_col, + NULL, NULL); // Candidate refinement carried out at encoder and decoder vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 3bfdea6ad..d56ee0076 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -18,6 +18,7 @@ #include "vpx/vpx_decoder.h" #include "vp9/common/vp9_frame_buffers.h" +#include "vp9/common/vp9_thread.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_read_bit_buffer.h" @@ -28,6 +29,15 @@ typedef vpx_codec_stream_info_t vp9_stream_info_t; +// This limit is due to framebuffer numbers. +// TODO(hkuang): Remove this limit after implementing ondemand framebuffers. +#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames. + +typedef struct cache_frame { + int fb_idx; + vpx_image_t img; +} cache_frame; + struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; @@ -35,17 +45,24 @@ struct vpx_codec_alg_priv { int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; - void *decrypt_state; + void *decrypt_state; vpx_image_t img; int flushed; int invert_tile_order; - int frame_parallel_decode; // frame-based threading. int last_show_frame; // Index of last output frame. + // Frame parallel related. + int frame_parallel_decode; // frame-based threading. VP9Worker *frame_workers; int num_frame_workers; - int next_submit_thread_id; - int next_output_thread_id; + int next_submit_worker_id; + int last_submit_worker_id; + int next_output_worker_id; + int available_threads; + cache_frame frame_cache[FRAME_CACHE_SIZE]; + int frame_cache_write; + int frame_cache_read; + int num_cache_frames; // BufferPool that holds all reference frames. Shared by all the FrameWorkers. 
BufferPool *buffer_pool; @@ -77,11 +94,10 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); ctx->priv->init_flags = ctx->init_flags; ctx->priv->alg_priv->flushed = 0; + // Only do frame parallel decode when threads > 1. ctx->priv->alg_priv->frame_parallel_decode = - (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING); - - // Disable frame parallel decoding for now. - ctx->priv->alg_priv->frame_parallel_decode = 0; + ((ctx->config.dec->threads > 1) && + (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 1 : 0; if (ctx->config.dec) { // Update the reference to the config structure to an internal copy. @@ -98,10 +114,21 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { int i; for (i = 0; i < ctx->num_frame_workers; ++i) { VP9Worker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - vp9_decoder_remove(worker_data->pbi); - vpx_free(worker_data); + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + vp9_get_worker_interface()->end(worker); + vp9_decoder_remove(frame_worker_data->pbi); + vpx_free(frame_worker_data->scratch_buffer); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&frame_worker_data->stats_mutex); + pthread_cond_destroy(&frame_worker_data->stats_cond); +#endif + vpx_free(frame_worker_data); } +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); +#endif + vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); } vpx_free(ctx->frame_workers); @@ -222,8 +249,8 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { for (i = 0; i < ctx->num_frame_workers; ++i) { VP9Worker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - VP9_COMMON *const cm = &worker_data->pbi->common; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + VP9_COMMON 
*const cm = &frame_worker_data->pbi->common; BufferPool *const pool = cm->buffer_pool; cm->new_fb_idx = -1; @@ -260,14 +287,15 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, } static int frame_worker_hook(void *arg1, void *arg2) { - FrameWorkerData *const worker_data = (FrameWorkerData *)arg1; - const uint8_t *data = worker_data->data; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; + const uint8_t *data = frame_worker_data->data; (void)arg2; - worker_data->result = vp9_receive_compressed_data(worker_data->pbi, - worker_data->data_size, - &data); - worker_data->data_end = data; - return !worker_data->result; + frame_worker_data->result = + vp9_receive_compressed_data(frame_worker_data->pbi, + frame_worker_data->data_size, + &data); + frame_worker_data->data_end = data; + return !frame_worker_data->result; } static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { @@ -275,14 +303,28 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); ctx->last_show_frame = -1; - ctx->next_submit_thread_id = 0; - ctx->next_output_thread_id = 0; + ctx->next_submit_worker_id = 0; + ctx->last_submit_worker_id = 0; + ctx->next_output_worker_id = 0; + ctx->frame_cache_read = 0; + ctx->frame_cache_write = 0; + ctx->num_cache_frames = 0; ctx->num_frame_workers = (ctx->frame_parallel_decode == 1) ? 
ctx->cfg.threads: 1; + ctx->available_threads = ctx->num_frame_workers; + ctx->flushed = 0; + ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { + set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + return VPX_CODEC_MEM_ERROR; + } +#endif + ctx->frame_workers = (VP9Worker *) vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers)); if (ctx->frame_workers == NULL) { @@ -292,28 +334,51 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { for (i = 0; i < ctx->num_frame_workers; ++i) { VP9Worker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *worker_data = NULL; + FrameWorkerData *frame_worker_data = NULL; winterface->init(worker); worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData)); if (worker->data1 == NULL) { - set_error_detail(ctx, "Failed to allocate worker_data"); + set_error_detail(ctx, "Failed to allocate frame_worker_data"); return VPX_CODEC_MEM_ERROR; } - worker_data = (FrameWorkerData *)worker->data1; - worker_data->pbi = vp9_decoder_create(ctx->buffer_pool); - if (worker_data->pbi == NULL) { - set_error_detail(ctx, "Failed to allocate worker_data"); + frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool); + if (frame_worker_data->pbi == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker_data"); + return VPX_CODEC_MEM_ERROR; + } + frame_worker_data->pbi->frame_worker_owner = worker; + frame_worker_data->pbi->common.mi_idx = 0; + frame_worker_data->pbi->common.prev_mi_idx = 1; + frame_worker_data->worker_id = i; + frame_worker_data->scratch_buffer = NULL; + frame_worker_data->scratch_buffer_size = 0; + frame_worker_data->frame_context_ready = 0; +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) { + set_error_detail(ctx, "Failed to 
allocate frame_worker_data mutex"); return VPX_CODEC_MEM_ERROR; } + if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) { + set_error_detail(ctx, "Failed to allocate frame_worker_data cond"); + return VPX_CODEC_MEM_ERROR; + } +#endif // If decoding in serial mode, FrameWorker thread could create tile worker // thread or loopfilter thread. - worker_data->pbi->max_threads = + frame_worker_data->pbi->max_threads = (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0; - worker_data->pbi->inv_tile_order = ctx->invert_tile_order; - worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; + frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; + frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; + frame_worker_data->pbi->common.frame_parallel_decode = + ctx->frame_parallel_decode; worker->hook = (VP9WorkerHook)frame_worker_hook; + if (!winterface->reset(worker)) { + set_error_detail(ctx, "Frame Worker thread creation failed"); + return VPX_CODEC_MEM_ERROR; + } } // If postprocessing was enabled by the application and a @@ -348,36 +413,66 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_ERROR; } - // Initialize the decoder workers on the first frame - if (ctx->frame_workers == NULL) { - const vpx_codec_err_t res = init_decoder(ctx); - if (res != VPX_CODEC_OK) - return res; - } - if (!ctx->frame_parallel_decode) { VP9Worker *const worker = ctx->frame_workers; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - worker_data->data = *data; - worker_data->data_size = data_sz; - worker_data->user_priv = user_priv; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->data = *data; + frame_worker_data->data_size = data_sz; + frame_worker_data->user_priv = user_priv; // Set these even if already initialized. The caller may have changed the // decrypt config between frames. 
- worker_data->pbi->decrypt_cb = ctx->decrypt_cb; - worker_data->pbi->decrypt_state = ctx->decrypt_state; + frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb; + frame_worker_data->pbi->decrypt_state = ctx->decrypt_state; worker->had_error = 0; winterface->execute(worker); // Update data pointer after decode. - *data = worker_data->data_end; + *data = frame_worker_data->data_end; if (worker->had_error) - return update_error_state(ctx, &worker_data->pbi->common.error); + return update_error_state(ctx, &frame_worker_data->pbi->common.error); } else { - // TODO(hkuang): Implement frame parallel decode. - return VPX_CODEC_INCAPABLE; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + // Copy context from last worker thread to next worker thread. + if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) + vp9_frameworker_copy_context( + &ctx->frame_workers[ctx->next_submit_worker_id], + &ctx->frame_workers[ctx->last_submit_worker_id]); + + // Copy the compressed data into worker's internal buffer. + // TODO(hkuang): Will all the workers allocate the same size + // as the size of the first intra frame be better? This will + // avoid too many deallocate and allocate. 
+ if (frame_worker_data->scratch_buffer_size < data_sz) { + frame_worker_data->scratch_buffer = + (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz); + if (frame_worker_data->scratch_buffer == NULL) { + set_error_detail(ctx, "Failed to reallocate scratch buffer"); + return VPX_CODEC_MEM_ERROR; + } + frame_worker_data->scratch_buffer_size = data_sz; + } + frame_worker_data->data_size = data_sz; + vpx_memcpy(frame_worker_data->scratch_buffer, *data, data_sz); + + frame_worker_data->frame_decoded = 0; + frame_worker_data->frame_context_ready = 0; + frame_worker_data->data = frame_worker_data->scratch_buffer; + frame_worker_data->user_priv = user_priv; + + if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) + ctx->last_submit_worker_id = + (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers; + + ctx->next_submit_worker_id = + (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers; + + --ctx->available_threads; + winterface->launch(worker); } if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) @@ -461,6 +556,30 @@ static vpx_codec_err_t parse_superframe_index(const uint8_t *data, return VPX_CODEC_OK; } +static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = {0, 0, 0}; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + ctx->next_output_worker_id = + (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; + winterface->sync(worker); + ++ctx->available_threads; + if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &frame_worker_data->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx; + yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd, + 
frame_worker_data->user_priv); + ctx->frame_cache[ctx->frame_cache_write].img.fb_priv = + frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + ctx->frame_cache_write = + (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE; + ++ctx->num_cache_frames; + } +} + static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { @@ -478,6 +597,13 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Reset flushed when receiving a valid frame. ctx->flushed = 0; + // Initialize the decoder workers on the first frame. + if (ctx->frame_workers == NULL) { + const vpx_codec_err_t res = init_decoder(ctx); + if (res != VPX_CODEC_OK) + return res; + } + res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count, ctx->decrypt_cb, ctx->decrypt_state); if (res != VPX_CODEC_OK) @@ -494,30 +620,46 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, for (i = 0; i < frame_count; ++i) { const uint8_t *data_start_copy = data_start; const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; if (data_start < data || frame_size > (uint32_t) (data_end - data_start)) { set_error_detail(ctx, "Invalid frame size in index"); return VPX_CODEC_CORRUPT_FRAME; } + if (ctx->available_threads == 0) { + // No more threads for decoding. Wait until the next output worker + // finishes decoding. Then copy the decoded frame into cache. + if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { + wait_worker_and_cache_frame(ctx); + } else { + // TODO(hkuang): Add unit test to test this path. + set_error_detail(ctx, "Frame output cache is full."); + return VPX_CODEC_ERROR; + } + } + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; - data_start += frame_size; } } else { - res = decode_one(ctx, &data_start, data_sz, user_priv, deadline); + if (ctx->available_threads == 0) { + // No more threads for decoding. 
Wait until the next output worker + // finishes decoding. Then copy the decoded frame into cache. + if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { + wait_worker_and_cache_frame(ctx); + } else { + // TODO(hkuang): Add unit test to test this path. + set_error_detail(ctx, "Frame output cache is full."); + return VPX_CODEC_ERROR; + } + } + + res = decode_one(ctx, &data, data_sz, user_priv, deadline); if (res != VPX_CODEC_OK) return res; - - // Extra data detected after the frame. - if (data_start < data_end - 1) { - set_error_detail(ctx, "Fail to decode frame in parallel mode"); - return VPX_CODEC_INCAPABLE; - } } } else { // Decode in serial mode. @@ -561,41 +703,73 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, } } - return VPX_CODEC_OK; + return res; +} + +static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) { + RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs; + // Decrease reference count of last output frame in frame parallel mode. + if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) { + BufferPool *const pool = ctx->buffer_pool; + lock_buffer_pool(pool); + --frame_bufs[ctx->last_show_frame].ref_count; + if (frame_bufs[ctx->last_show_frame].ref_count == 0) { + pool->release_fb_cb(pool->cb_priv, + &frame_bufs[ctx->last_show_frame].raw_frame_buffer); + } + unlock_buffer_pool(pool); + } } static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; + // Only return frame when all the cpu are busy or + // application flushed the decoder in frame parallel decode. + if (ctx->frame_parallel_decode && ctx->available_threads > 0 && + !ctx->flushed) { + return img; + } + + // Output the frames in the cache first. 
+ if (ctx->num_cache_frames > 0) { + release_last_output_frame(ctx); + ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx; + img = &ctx->frame_cache[ctx->frame_cache_read].img; + ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE; + --ctx->num_cache_frames; + return img; + } + // iter acts as a flip flop, so an image is only returned on the first // call to get_frame. if (*iter == NULL && ctx->frame_workers != NULL) { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = {0, 0, 0}; - - VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_thread_id]; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - if (vp9_get_raw_frame(worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &worker_data->pbi->common; - BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - yuvconfig2image(&ctx->img, &sd, worker_data->user_priv); - ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - img = &ctx->img; - *iter = img; - // Decrease reference count of last output frame in frame parallel mode. - if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) { - --frame_bufs[ctx->last_show_frame].ref_count; - if (frame_bufs[ctx->last_show_frame].ref_count == 0) { - pool->release_fb_cb(pool->cb_priv, - &frame_bufs[ctx->last_show_frame].raw_frame_buffer); - } + do { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = {0, 0, 0}; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + VP9Worker *const worker = + &ctx->frame_workers[ctx->next_output_worker_id]; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + ctx->next_output_worker_id = + (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; + // Wait for the frame from worker thread. 
+ winterface->sync(worker); + ++ctx->available_threads; + if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &frame_worker_data->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + release_last_output_frame(ctx); + ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx; + yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv); + ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + img = &ctx->img; + return img; } - ctx->last_show_frame = worker_data->pbi->common.new_fb_idx; - } + } while (ctx->next_output_worker_id != ctx->next_submit_worker_id); } - return img; } @@ -631,9 +805,9 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; VP9Worker *const worker = ctx->frame_workers; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_set_reference_dec(&worker_data->pbi->common, + return vp9_set_reference_dec(&frame_worker_data->pbi->common, (VP9_REFFRAME)frame->frame_type, &sd); } else { return VPX_CODEC_INVALID_PARAM; @@ -654,9 +828,9 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data; YV12_BUFFER_CONFIG sd; VP9Worker *const worker = ctx->frame_workers; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_copy_reference_dec(worker_data->pbi, + return vp9_copy_reference_dec(frame_worker_data->pbi, (VP9_REFFRAME)frame->frame_type, &sd); } else { return VPX_CODEC_INVALID_PARAM; @@ -676,8 +850,8 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, if (data) { YV12_BUFFER_CONFIG* fb; VP9Worker 
*const worker = ctx->frame_workers; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - vp9_get_reference_dec(worker_data->pbi, data->idx, &fb); + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + vp9_get_reference_dec(frame_worker_data->pbi, data->idx, &fb); yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; } else { @@ -724,8 +898,9 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, if (update_info) { if (ctx->frame_workers) { VP9Worker *const worker = ctx->frame_workers; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - *update_info = worker_data->pbi->refresh_frame_flags; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + *update_info = frame_worker_data->pbi->refresh_frame_flags; } else { return VPX_CODEC_ERROR; } @@ -735,22 +910,18 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, } } - static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, va_list args) { int *corrupted = va_arg(args, int *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (corrupted) { if (ctx->frame_workers) { VP9Worker *const worker = ctx->frame_workers; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - *corrupted = worker_data->pbi->common.frame_to_show->corrupted; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + RefCntBuffer *const frame_bufs = + frame_worker_data->pbi->common.buffer_pool->frame_bufs; + *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; } else { return VPX_CODEC_ERROR; } @@ -773,8 +944,9 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, if (display_size) { if (ctx->frame_workers) { VP9Worker *const worker = ctx->frame_workers; - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &worker_data->pbi->common; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const VP9_COMMON *const cm = &frame_worker_data->pbi->common; display_size[0] = cm->display_width; display_size[1] = cm->display_height; } else { diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h index e69df4bc8..0741e6e71 100644 --- a/vpx/vpx_frame_buffer.h +++ b/vpx/vpx_frame_buffer.h @@ -22,8 +22,11 @@ extern "C" { #include "./vpx_integer.h" /*!\brief The maximum number of work buffers used by libvpx. + * Support maximum 4 threads to decode video in parallel. + * Each thread will use one work buffer. + * TODO(hkuang): Add support to set number of worker threads dynamically. */ -#define VPX_MAXIMUM_WORK_BUFFERS 1 +#define VPX_MAXIMUM_WORK_BUFFERS 4 /*!\brief The maximum number of reference buffers that a VP9 encoder may use. */