Implement frame parallel decode for VP9.

With 4 threads, frame parallel decode is ~3x faster than single-threaded
decode and around 30% faster than tile parallel decode for frame-parallel
encoded video, on both Android and desktop. Decode speed also scales with the
number of threads, so decoding could be even faster with more threads.

Change-Id: Ia0a549aaa3e83b5a17b31d8299aa496ea4f21e3e
This commit is contained in:
Hangyu Kuang 2014-07-30 20:43:40 -07:00 committed by hkuang
parent 4d0d78424b
commit 9ce3a7d76c
17 changed files with 842 additions and 216 deletions

View File

@ -12,11 +12,37 @@
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_systemdependent.h"
// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
// frame reference count.
//
// Acquires pool->pool_mutex when the build has CONFIG_MULTITHREAD enabled.
// In builds without it this function does nothing.
void lock_buffer_pool(BufferPool *const pool) {
  // Keeps the parameter referenced in builds where the mutex call below is
  // compiled out.
  (void)pool;
#if CONFIG_MULTITHREAD
  pthread_mutex_lock(&pool->pool_mutex);
#endif
}
// Releases pool->pool_mutex when the build has CONFIG_MULTITHREAD enabled.
// In builds without it this function does nothing.
void unlock_buffer_pool(BufferPool *const pool) {
  // Keeps the parameter referenced in builds where the mutex call below is
  // compiled out.
  (void)pool;
#if CONFIG_MULTITHREAD
  pthread_mutex_unlock(&pool->pool_mutex);
#endif
}
// Allocates slot `idx` of the ping-pong mode-info storage: one array of
// `mi_size` mode-info entries and one array of `mi_size` grid pointers, both
// obtained from vpx_calloc.
// NOTE(review): CHECK_MEM_ERROR is defined outside this view; presumably it
// reports an allocation failure through cm's error handler -- confirm.
static INLINE void alloc_mi_array(VP9_COMMON *cm, int mi_size, int idx) {
  const size_t mode_info_bytes = sizeof(*cm->mip_array[idx]);
  const size_t grid_entry_bytes = sizeof(*cm->mi_grid_base_array[idx]);

  CHECK_MEM_ERROR(cm, cm->mip_array[idx],
                  vpx_calloc(mi_size, mode_info_bytes));
  CHECK_MEM_ERROR(cm, cm->mi_grid_base_array[idx],
                  vpx_calloc(mi_size, grid_entry_bytes));
}
static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) {
int i;
@ -49,40 +75,47 @@ static void setup_mi(VP9_COMMON *cm) {
vpx_memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) *
sizeof(*cm->mi_grid_base));
clear_mi_border(cm, cm->prev_mip);
// Only clear mi border in non frame-parallel decode. In frame-parallel
// decode, prev_mip is managed by previous decoding thread. While in
// non frame-parallel decode, prev_mip and mip are both managed by
// current decoding thread.
if (!cm->frame_parallel_decode)
clear_mi_border(cm, cm->prev_mip);
}
static int alloc_mi(VP9_COMMON *cm, int mi_size) {
int i;
for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
cm->mip_array[i] =
(MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
if (cm->mip_array[i] == NULL)
return 1;
cm->mi_grid_base_array[i] =
(MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
if (cm->mi_grid_base_array[i] == NULL)
return 1;
// Delay reallocation as another thread is accessing prev_mi.
if (cm->frame_parallel_decode && i == cm->prev_mi_idx) {
cm->update_prev_mi = 1;
continue;
}
alloc_mi_array(cm, mi_size, i);
}
// Init the index.
cm->mi_idx = 0;
cm->prev_mi_idx = 1;
cm->mip = cm->mip_array[cm->mi_idx];
cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
if (!cm->frame_parallel_decode) {
cm->mi_idx = 0;
cm->prev_mi_idx = 1;
// In frame-parallel decode, prev_mip comes from another thread,
// so current decoding thread should not touch it.
cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
}
return 0;
}
static void free_mi(VP9_COMMON *cm) {
static void free_mi(VP9_COMMON *cm, int decode_done) {
int i;
for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
if (cm->frame_parallel_decode && i == cm->prev_mi_idx && !decode_done)
continue;
vpx_free(cm->mip_array[i]);
cm->mip_array[i] = NULL;
vpx_free(cm->mi_grid_base_array[i]);
@ -90,9 +123,12 @@ static void free_mi(VP9_COMMON *cm) {
}
cm->mip = NULL;
cm->prev_mip = NULL;
cm->mi_grid_base = NULL;
cm->prev_mi_grid_base = NULL;
if (!cm->frame_parallel_decode) {
cm->prev_mip = NULL;
cm->prev_mi_grid_base = NULL;
}
}
static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
@ -109,7 +145,10 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
cm->prev_seg_map_idx = 1;
cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
if (!cm->frame_parallel_decode) {
cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
}
return 0;
}
@ -123,7 +162,10 @@ static void free_seg_map(VP9_COMMON *cm) {
}
cm->current_frame_seg_map = NULL;
cm->last_frame_seg_map = NULL;
if (!cm->frame_parallel_decode) {
cm->last_frame_seg_map = NULL;
}
}
void vp9_free_frame_buffers(VP9_COMMON *cm) {
@ -144,8 +186,7 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
}
void vp9_free_context_buffers(VP9_COMMON *cm) {
free_mi(cm);
free_mi(cm, 1);
free_seg_map(cm);
vpx_free(cm->above_context);
@ -170,7 +211,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
set_mb_mi(cm, aligned_width, aligned_height);
free_mi(cm);
free_mi(cm, 0);
if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE)))
goto fail;
@ -288,7 +329,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
void vp9_remove_common(VP9_COMMON *cm) {
vp9_free_frame_buffers(cm);
vp9_free_context_buffers(cm);
vp9_free_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers);
}
void vp9_update_frame_size(VP9_COMMON *cm) {
@ -306,6 +346,20 @@ void vp9_update_frame_size(VP9_COMMON *cm) {
void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
// Swap indices.
const int tmp = cm->mi_idx;
// Only used in frame parallel decode: update the prev_mi buffer if
// needed. The worker that was accessing it must have already finished
// decoding, so it can be resized safely now.
if (cm->update_prev_mi) {
const int mi_size = cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE);
vpx_free(cm->mip_array[cm->prev_mi_idx]);
vpx_free(cm->mi_grid_base_array[cm->prev_mi_idx]);
cm->mip_array[cm->prev_mi_idx] = NULL;
cm->mi_grid_base_array[cm->prev_mi_idx] = NULL;
alloc_mi_array(cm, mi_size, cm->prev_mi_idx);
cm->update_prev_mi = 0;
}
cm->mi_idx = cm->prev_mi_idx;
cm->prev_mi_idx = tmp;

View File

@ -439,7 +439,8 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
int i;
vp9_clearall_segfeatures(&cm->seg);
cm->seg.abs_delta = SEGMENT_DELTADATA;
if (cm->last_frame_seg_map)
if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
if (cm->current_frame_seg_map)
@ -467,7 +468,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
cm->frame_contexts[cm->frame_context_idx] = cm->fc;
}
if (frame_is_intra_only(cm))
if (frame_is_intra_only(cm) && !cm->frame_parallel_decode)
vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
sizeof(*cm->prev_mip));

View File

@ -17,14 +17,12 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int block, int mi_row, int mi_col) {
int block, int mi_row, int mi_col,
find_mv_refs_sync sync, void *const data) {
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i, refmv_count = 0;
const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi
? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]
: NULL;
const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
MODE_INFO *prev_mi = NULL;
MB_MODE_INFO *prev_mbmi = NULL;
const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
@ -71,6 +69,14 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
}
}
// Synchronize here for frame parallel decode if sync function is provided.
if (sync != NULL) {
sync(data, mi_row);
}
prev_mi = cm->coding_use_prev_mi && cm->prev_mi ?
cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] : NULL;
prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
// Check the last frame's mode and mv info.
if (prev_mbmi) {
if (prev_mbmi->ref_frame[0] == ref_frame)
@ -109,12 +115,13 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
}
void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int mi_row, int mi_col) {
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int mi_row, int mi_col,
find_mv_refs_sync sync, void *const data) {
find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
mi_row, mi_col);
mi_row, mi_col, sync, data);
}
static void lower_mv_precision(MV *mv, int allow_hp) {
@ -152,7 +159,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
assert(MAX_MV_REF_CANDIDATES == 2);
find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
mi_row, mi_col);
mi_row, mi_col, NULL, NULL);
near->as_int = 0;
switch (block) {

View File

@ -204,10 +204,12 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
// Optional synchronization hook for vp9_find_mv_refs. When a non-NULL hook is
// supplied, it is called with the caller-provided data pointer and the
// current mi_row before the previous frame's mode info is read.
typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list, int mi_row, int mi_col);
int_mv *mv_ref_list, int mi_row, int mi_col,
find_mv_refs_sync sync, void *const data);
// check a list of motion vectors by sad score using a number rows of pixels
// above and a number cols of pixels in the left to select the one with best

View File

@ -36,10 +36,13 @@ extern "C" {
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
// 1 scratch frame for the new frame, 3 for scaled references on the encoder
// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
// in parallel, 3 for scaled references on the encoder.
// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
// of framebuffers.
// TODO(jkoleszar): These 3 extra references could probably come from the
// normal reference pool.
#define FRAME_BUFFERS (REF_FRAMES + 4)
#define FRAME_BUFFERS (REF_FRAMES + 7)
#define FRAME_CONTEXTS_LOG2 2
#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
@ -64,6 +67,18 @@ typedef struct {
int ref_count;
vpx_codec_frame_buffer_t raw_frame_buffer;
YV12_BUFFER_CONFIG buf;
// The following variables are only used in frame parallel decode.
// frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
// that no FrameWorker owns, or is decoding, this buffer.
VP9Worker *frame_worker_owner;
// row and col record how far this frame has been decoded, in pixel units.
// They are reset to -1 when decoding begins and set to INT_MAX when the
// frame is fully decoded.
int row;
int col;
} RefCntBuffer;
typedef struct {
@ -114,6 +129,10 @@ typedef struct VP9Common {
int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
// Prepare ref_frame_map for the next frame.
// Only used in frame parallel decode.
int next_ref_frame_map[REF_FRAMES];
// TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
// roll new_fb_idx into it.
@ -178,6 +197,9 @@ typedef struct VP9Common {
MODE_INFO **prev_mi_grid_base;
MODE_INFO **prev_mi_grid_visible;
// Used in frame parallel decode for delay resizing prev_mi.
int update_prev_mi;
// Persistent mb segment id map used in prediction.
int seg_map_idx;
int prev_seg_map_idx;
@ -197,6 +219,10 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
// TODO(hkuang): Remove this as it is the same as frame_parallel_decode
// in pbi.
int frame_parallel_decode; // frame-based threading.
// Context probabilities for reference frame prediction
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
@ -235,6 +261,11 @@ typedef struct VP9Common {
ENTROPY_CONTEXT *above_context;
} VP9_COMMON;
// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
// frame reference count.
void lock_buffer_pool(BufferPool *const pool);
void unlock_buffer_pool(BufferPool *const pool);
// Returns the YV12 image buffer of the pool entry selected by cm->new_fb_idx.
static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
  RefCntBuffer *const new_fb = &cm->buffer_pool->frame_bufs[cm->new_fb_idx];
  return &new_fb->buf;
}
@ -242,12 +273,15 @@ static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
// Claims the first pool entry whose ref_count is 0, sets its ref_count to 1
// and returns its index. The scan and the claim happen under the pool lock.
// NOTE(review): exhaustion is only caught by the assert. Assuming frame_bufs
// has FRAME_BUFFERS entries, a build with asserts compiled out would write
// one past the array when no entry is free -- confirm callers guarantee a
// free buffer.
static INLINE int get_free_fb(VP9_COMMON *cm) {
  BufferPool *const pool = cm->buffer_pool;
  RefCntBuffer *const bufs = pool->frame_bufs;
  int idx = 0;

  lock_buffer_pool(pool);
  while (idx < FRAME_BUFFERS && bufs[idx].ref_count != 0)
    ++idx;

  assert(idx < FRAME_BUFFERS);
  bufs[idx].ref_count = 1;
  unlock_buffer_pool(pool);
  return idx;
}

View File

@ -327,21 +327,24 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
xd->block_refs[idx] = ref_buffer;
if (!vp9_is_valid_scale(&ref_buffer->sf))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid scale factors");
vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
&ref_buffer->sf);
xd->corrupted |= ref_buffer->buf->corrupted;
if (!cm->frame_parallel_decode)
xd->corrupted |= ref_buffer->buf->corrupted;
}
static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &pbi->common;
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
vp9_read_mode_info(pbi, xd, tile, mi_row, mi_col, r);
if (less8x8)
bsize = BLOCK_8X8;
@ -365,7 +368,7 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
set_ref(cm, xd, 1, mi_row, mi_col);
// Prediction
vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
// Reconstruction
if (!mbmi->skip) {
@ -404,10 +407,11 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
return p;
}
static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &pbi->common;
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize, uv_subsize;
@ -422,27 +426,27 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Invalid block size.");
if (subsize < BLOCK_8X8) {
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
} else {
switch (partition) {
case PARTITION_NONE:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
break;
case PARTITION_HORZ:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
if (mi_row + hbs < cm->mi_rows)
decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
if (mi_col + hbs < cm->mi_cols)
decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
decode_partition(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_partition(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
decode_partition(pbi, xd, tile, mi_row, mi_col, r, subsize);
decode_partition(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_partition(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_partition(pbi, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
break;
default:
assert(0 && "Invalid partition type");
@ -638,6 +642,7 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
vp9_update_frame_size(cm);
}
lock_buffer_pool(pool);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
@ -646,6 +651,7 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
unlock_buffer_pool(pool);
}
static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@ -778,7 +784,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
const int tile_rows = 1 << cm->log2_tile_rows;
TileBuffer tile_buffers[4][1 << 6];
int tile_row, tile_col;
int mi_row, mi_col;
int mi_row = 0, mi_col = 0;
TileData *tile_data = NULL;
if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) {
@ -798,7 +804,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_copy(lf_data->planes, pbi->mb.plane);
lf_data->stop = 0;
lf_data->y_only = 0;
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
assert(tile_rows <= 4);
@ -856,7 +861,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
decode_partition(pbi, &tile_data->xd, &tile, mi_row, mi_col,
&tile_data->bit_reader, BLOCK_64X64);
}
}
@ -880,6 +885,12 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
winterface->execute(&pbi->lf_worker);
}
}
// After loopfiltering, the last 7 pixel rows of each superblock row may
// still be changed by the longest loopfilter of the next superblock
// row.
if (pbi->frame_parallel_decode)
vp9_frameworker_broadcast(pbi->cur_buf,
mi_row << MI_BLOCK_SIZE_LOG2);
}
}
@ -895,6 +906,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
// Get last tile data.
tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
if (pbi->frame_parallel_decode)
vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
return vp9_reader_find_end(&tile_data->bit_reader);
}
@ -909,7 +922,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, tile,
decode_partition(tile_data->pbi, &tile_data->xd, tile,
mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
}
}
@ -1015,10 +1028,10 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
TileInfo *const tile = (TileInfo*)worker->data2;
TileBuffer *const buf = &tile_buffers[0][n];
tile_data->cm = cm;
tile_data->pbi = pbi;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
vp9_tile_init(tile, tile_data->cm, 0, buf->col);
vp9_tile_init(tile, &pbi->common, 0, buf->col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
@ -1078,8 +1091,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
struct vp9_read_bit_buffer *rb) {
VP9_COMMON *const cm = &pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
BufferPool *const pool = pbi->common.buffer_pool;
int i, mask, ref_index = 0;
size_t sz;
int i;
cm->last_frame_type = cm->frame_type;
@ -1096,16 +1110,22 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
if (cm->show_existing_frame) {
// Show an existing frame directly.
const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
lock_buffer_pool(pool);
if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Buffer %d does not contain a decoded frame",
frame_to_show);
ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
unlock_buffer_pool(pool);
pbi->refresh_frame_flags = 0;
cm->lf.filter_level = 0;
cm->show_frame = 1;
if (pbi->frame_parallel_decode) {
for (i = 0; i < REF_FRAMES; ++i)
cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
}
return 0;
}
@ -1166,7 +1186,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
ref_frame->buf = &frame_bufs[idx].buf;
cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
}
setup_frame_size_with_refs(cm, rb);
cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
@ -1198,6 +1217,29 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
// below, forcing the use of context 0 for those frame types.
cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
// Generate next_ref_frame_map.
lock_buffer_pool(pool);
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
if (mask & 1) {
cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
++frame_bufs[cm->new_fb_idx].ref_count;
} else {
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
}
// Current thread holds the reference frame.
if (cm->ref_frame_map[ref_index] >= 0)
++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
++ref_index;
}
for (; ref_index < REF_FRAMES; ++ref_index) {
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
// Current thread holds the reference frame.
if (cm->ref_frame_map[ref_index] >= 0)
++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
}
unlock_buffer_pool(pool);
if (frame_is_intra_only(cm) || cm->error_resilient_mode)
vp9_setup_past_independence(cm);
@ -1343,6 +1385,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
int context_updated = 0;
uint8_t clear_data[MAX_VP9_HEADER_SIZE];
const size_t first_partition_size = read_uncompressed_header(pbi,
@ -1380,6 +1423,28 @@ void vp9_decode_frame(VP9Decoder *pbi,
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
if (cm->lf.filter_level) {
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
// If encoded in frame parallel mode, frame context is ready after decoding
// the frame header.
if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
VP9Worker *const worker = pbi->frame_worker_owner;
FrameWorkerData *const frame_worker_data = worker->data1;
if (cm->refresh_frame_context) {
context_updated = 1;
cm->frame_contexts[cm->frame_context_idx] = cm->fc;
}
vp9_frameworker_lock_stats(worker);
pbi->cur_buf->row = -1;
pbi->cur_buf->col = -1;
frame_worker_data->frame_context_ready = 1;
// Signal the main thread that context is ready.
vp9_frameworker_signal_stats(worker);
vp9_frameworker_unlock_stats(worker);
}
// TODO(jzern): remove frame_parallel_decoding_mode restriction for
// single-frame tile decoding.
if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
@ -1407,7 +1472,8 @@ void vp9_decode_frame(VP9Decoder *pbi,
}
}
if (cm->refresh_frame_context)
// Non frame parallel update frame context here.
if (cm->refresh_frame_context && !context_updated)
cm->frame_contexts[cm->frame_context_idx] = cm->fc;
}
@ -1454,10 +1520,9 @@ static void build_mc_border(const uint8_t *src, int src_stride,
} while (--b_h);
}
void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
int mi_x, int mi_y) {
void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
int plane, int block, int bw, int bh, int x,
int y, int w, int h, int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0];
const int is_compound = has_second_ref(&mi->mbmi);
@ -1484,20 +1549,23 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
pd->subsampling_y);
MV32 scaled_mv;
int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride,
subpel_x, subpel_y;
int xs, ys, x0, y0, x0_16, y0_16, y1, frame_width, frame_height,
buf_stride, subpel_x, subpel_y;
uint8_t *ref_frame, *buf_ptr;
const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
const int idx = xd->block_refs[ref]->idx;
BufferPool *const pool = pbi->common.buffer_pool;
RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
// Get reference frame pointer, width and height.
if (plane == 0) {
frame_width = ref_buf->y_crop_width;
frame_height = ref_buf->y_crop_height;
ref_frame = ref_buf->y_buffer;
frame_width = ref_frame_buf->buf.y_crop_width;
frame_height = ref_frame_buf->buf.y_crop_height;
ref_frame = ref_frame_buf->buf.y_buffer;
} else {
frame_width = ref_buf->uv_crop_width;
frame_height = ref_buf->uv_crop_height;
ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
frame_width = ref_frame_buf->buf.uv_crop_width;
frame_height = ref_frame_buf->buf.uv_crop_height;
ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
: ref_frame_buf->buf.v_buffer;
}
if (vp9_is_scaled(sf)) {
@ -1550,15 +1618,18 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
buf_stride = pre_buf->stride;
// Get reference block bottom right vertical coordinate.
y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
// Do border extension if there is motion or the
// width/height is not a multiple of 8 pixels.
if (scaled_mv.col || scaled_mv.row ||
(frame_width & 0x7) || (frame_height & 0x7)) {
// Get reference block bottom right coordinate.
int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
int x_pad = 0, y_pad = 0;
// Get reference block bottom right horizontal coordinate.
int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
x0 -= VP9_INTERP_EXTEND - 1;
x1 += VP9_INTERP_EXTEND;
@ -1571,6 +1642,12 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
y_pad = 1;
}
// Wait until reference block is ready. Pad 7 more pixels as last 7
// pixels of each superblock row can be changed by next superblock row.
if (pbi->frame_parallel_decode)
vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
(y1 + 7) << (plane == 0 ? 0 : 1));
// Skip border extension if block is inside the frame.
if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width ||
y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
@ -1582,6 +1659,12 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
buf_stride = x1 - x0 + 1;
buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
}
} else {
// Wait until reference block is ready. Pad 7 more pixels as last 7
// pixels of each superblock row can be changed by next superblock row.
if (pbi->frame_parallel_decode)
vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
(y1 + 7) << (plane == 0 ? 0 : 1));
}
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
@ -1589,7 +1672,8 @@ void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
}
}
void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col * MI_SIZE;
@ -1607,10 +1691,10 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(xd, plane, i++, bw, bh,
dec_build_inter_predictors(pbi, xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4, mi_x, mi_y);
} else {
dec_build_inter_predictors(xd, plane, 0, bw, bh,
dec_build_inter_predictors(pbi, xd, plane, 0, bw, bh,
0, 0, bw, bh, mi_x, mi_y);
}
}

View File

@ -25,7 +25,8 @@ void vp9_decode_frame(struct VP9Decoder *pbi,
const uint8_t *data, const uint8_t *data_end,
const uint8_t **p_data_end);
void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi,
MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
#ifdef __cplusplus
} // extern "C"

View File

@ -420,11 +420,18 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
}
static void read_inter_block_mode_info(VP9_COMMON *const cm,
static void fpm_sync(void *const data, int mi_row) {
VP9Decoder *const pbi = (VP9Decoder *)data;
vp9_frameworker_wait(pbi->frame_worker_owner, pbi->prev_buf,
mi_row << MI_BLOCK_SIZE_LOG2);
}
static void read_inter_block_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
const TileInfo *const tile,
MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
const int allow_hp = cm->allow_high_precision_mv;
@ -438,7 +445,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
for (ref = 0; ref < 1 + is_compound; ++ref) {
const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
mi_row, mi_col);
mi_row, mi_col, fpm_sync, (void *)pbi);
}
inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
@ -512,10 +519,13 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
}
}
static void read_inter_frame_mode_info(VP9_COMMON *const cm,
// TODO(hkuang): Pass cm instead of pbi. This requires change in
// vp9_frameworker_wait.
static void read_inter_frame_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
@ -529,16 +539,17 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
!mbmi->skip || !inter_block, r);
if (inter_block)
read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
read_inter_block_mode_info(pbi, xd, tile, mi, mi_row, mi_col, r);
else
read_intra_block_mode_info(cm, mi, r);
}
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
if (frame_is_intra_only(cm))
read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
else
read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r);
}

View File

@ -11,6 +11,7 @@
#ifndef VP9_DECODER_VP9_DECODEMV_H_
#define VP9_DECODER_VP9_DECODEMV_H_
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_reader.h"
#ifdef __cplusplus
@ -19,7 +20,7 @@ extern "C" {
struct TileInfo;
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
const struct TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r);

View File

@ -26,6 +26,7 @@
#endif
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_decoder.h"
@ -63,6 +64,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
// Initialize the references to not point to any frame buffers.
vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
vpx_memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
cm->current_video_frame = 0;
pbi->ready_for_new_data = 1;
@ -195,29 +197,51 @@ int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) {
return 0;
}
// Drops one reference on frame buffer `idx`. When the count reaches 0 the
// underlying raw frame buffer is handed back through the pool's release
// callback.
//
// idx:        index into frame_bufs; a negative value means "no buffer" and
//             is ignored.
// frame_bufs: the pool's reference-counted buffer array.
// pool:       supplies release_fb_cb and its cb_priv argument.
//
// A buffer whose count is already 0 (or below) is left untouched. Without
// this guard the count would go negative, and get_free_fb, which looks for
// ref_count == 0, could never hand that slot out again.
// NOTE(review): the call sites visible here run with the buffer pool locked;
// confirm that holds for every caller.
static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
                                      BufferPool *const pool) {
  if (idx < 0 || frame_bufs[idx].ref_count <= 0)
    return;

  if (--frame_bufs[idx].ref_count == 0)
    pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
}
/* If any buffer updating is signaled it should be done here. */
static void swap_frame_buffers(VP9Decoder *pbi) {
int ref_index = 0, mask;
VP9_COMMON * const cm = &pbi->common;
BufferPool * const pool = cm->buffer_pool;
VP9_COMMON *const cm = &pbi->common;
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
lock_buffer_pool(pool);
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
if (mask & 1) {
const int old_idx = cm->ref_frame_map[ref_index];
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[ref_index],
cm->new_fb_idx);
if (old_idx >= 0 && frame_bufs[old_idx].ref_count == 0)
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[old_idx].raw_frame_buffer);
const int old_idx = cm->ref_frame_map[ref_index];
// Current thread releases the holding of reference frame.
decrease_ref_count(old_idx, frame_bufs, pool);
// Release the reference frame in reference map.
if ((mask & 1) && old_idx >= 0) {
decrease_ref_count(old_idx, frame_bufs, pool);
}
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
++ref_index;
}
// Current thread releases the holding of reference frame.
for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
const int old_idx = cm->ref_frame_map[ref_index];
decrease_ref_count(old_idx, frame_bufs, pool);
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
}
unlock_buffer_pool(pool);
cm->frame_to_show = get_frame_new_buffer(cm);
if (!pbi->frame_parallel_decode || !cm->show_frame) {
lock_buffer_pool(pool);
--frame_bufs[cm->new_fb_idx].ref_count;
unlock_buffer_pool(pool);
}
// Invalidate these references until the next frame starts.
@ -256,6 +280,20 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
&frame_bufs[cm->new_fb_idx].raw_frame_buffer);
cm->new_fb_idx = get_free_fb(cm);
if (pbi->frame_parallel_decode) {
VP9Worker *const worker = pbi->frame_worker_owner;
vp9_frameworker_lock_stats(worker);
frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
// Reset decoding progress.
pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
pbi->cur_buf->row = -1;
pbi->cur_buf->col = -1;
vp9_frameworker_unlock_stats(worker);
} else {
pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
}
if (setjmp(cm->error.jmp)) {
cm->error.setjmp = 0;
@ -283,20 +321,39 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
vp9_clear_system_state();
cm->last_width = cm->width;
cm->last_height = cm->height;
if (!cm->show_existing_frame)
cm->last_show_frame = cm->show_frame;
if (cm->show_frame) {
if (!cm->show_existing_frame)
vp9_swap_mi_and_prev_mi(cm);
cm->current_video_frame++;
// Update progress in frame parallel decode.
if (pbi->frame_parallel_decode) {
// Need to lock the mutex here as another thread may
// be accessing this buffer.
VP9Worker *const worker = pbi->frame_worker_owner;
FrameWorkerData *const frame_worker_data = worker->data1;
vp9_frameworker_lock_stats(worker);
if (cm->show_frame) {
if (!cm->show_existing_frame)
vp9_swap_mi_and_prev_mi(cm);
cm->current_video_frame++;
}
vp9_swap_current_and_last_seg_map(cm);
frame_worker_data->frame_decoded = 1;
frame_worker_data->frame_context_ready = 1;
vp9_frameworker_signal_stats(worker);
vp9_frameworker_unlock_stats(worker);
} else {
cm->last_width = cm->width;
cm->last_height = cm->height;
if (cm->show_frame) {
if (!cm->show_existing_frame)
vp9_swap_mi_and_prev_mi(cm);
cm->current_video_frame++;
}
vp9_swap_current_and_last_seg_map(cm);
}
vp9_swap_current_and_last_seg_map(cm);
pbi->ready_for_new_data = 0;
cm->error.setjmp = 0;

View File

@ -45,6 +45,12 @@ typedef struct VP9Decoder {
int frame_parallel_decode; // frame-based threading.
// TODO(hkuang): Combine this with cur_buf in macroblockd as they are
// the same.
RefCntBuffer *cur_buf; // Current decoding frame buffer.
RefCntBuffer *prev_buf; // Previous decoding frame buffer.
VP9Worker *frame_worker_owner; // frame_worker that owns this pbi.
VP9Worker lf_worker;
VP9Worker *tile_workers;
int num_tile_workers;

View File

@ -17,6 +17,8 @@
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_decoder.h"
// #define DEBUG_THREAD
#if CONFIG_MULTITHREAD
static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
const int kMaxTryLocks = 4000;
@ -279,3 +281,166 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
vp9_zero(*lf_sync);
}
}
// TODO(hkuang): Clean up all the #ifdef in this file.
// Locks the stats mutex held in the FrameWorkerData attached to worker->data1.
// Does nothing when CONFIG_MULTITHREAD is not enabled.
void vp9_frameworker_lock_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
  pthread_mutex_lock(&((FrameWorkerData *)worker->data1)->stats_mutex);
#else
  (void)worker;
#endif
}
// Unlocks the stats mutex held in the FrameWorkerData attached to
// worker->data1. Does nothing when CONFIG_MULTITHREAD is not enabled.
void vp9_frameworker_unlock_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
  pthread_mutex_unlock(&((FrameWorkerData *)worker->data1)->stats_mutex);
#else
  (void)worker;
#endif
}
// Signals the stats condition variable held in the FrameWorkerData attached
// to worker->data1. Does nothing when CONFIG_MULTITHREAD is not enabled.
void vp9_frameworker_signal_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
  // TODO(hkuang): Investigate using broadcast or signal.
  pthread_cond_signal(&((FrameWorkerData *)worker->data1)->stats_cond);
#else
  (void)worker;
#endif
}
// TODO(hkuang): Remove worker parameter as it is only used in debug code.
// Blocks the caller until ref_buf->row has reached at least `row`, or until
// the worker that owns ref_buf is no longer decoding into it
// (pbi->cur_buf != ref_buf).
//
//   worker:  the waiting frame worker; only read by the DEBUG_THREAD trace.
//   ref_buf: reference frame buffer to wait on; NULL returns immediately.
//   row:     required decoding progress, compared against ref_buf->row.
//
// Does nothing when CONFIG_MULTITHREAD is not enabled.
void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
                          int row) {
#if CONFIG_MULTITHREAD
  if (!ref_buf)
    return;

  // Enabling the following line of code will get harmless tsan error but
  // will get best performance.
  // if (ref_buf->row >= row) return;

  {
    // Find the worker thread that owns the reference frame. If the reference
    // frame has been fully decoded, it may not have owner.
    // NOTE(review): ref_worker is dereferenced unconditionally below, so this
    // code relies on frame_worker_owner being non-NULL here -- confirm.
    VP9Worker *const ref_worker = ref_buf->frame_worker_owner;
    FrameWorkerData *const ref_worker_data =
        (FrameWorkerData *)ref_worker->data1;
    const VP9Decoder *const pbi = ref_worker_data->pbi;

#ifdef DEBUG_THREAD
    {
      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
      printf("%d %p worker is waiting for %d %p worker (%d)  ref %d \r\n",
             worker_data->worker_id, worker, ref_worker_data->worker_id,
             ref_buf->frame_worker_owner, row, ref_buf->row);
    }
#endif

    // The progress check and the wait both happen under the owner's stats
    // mutex; pthread_cond_wait releases it while sleeping and re-acquires it
    // before the loop condition is evaluated again.
    vp9_frameworker_lock_stats(ref_worker);
    while (ref_buf->row < row && pbi->cur_buf == ref_buf) {
      pthread_cond_wait(&ref_worker_data->stats_cond,
                        &ref_worker_data->stats_mutex);
    }
    vp9_frameworker_unlock_stats(ref_worker);
  }

  // worker is only read by the DEBUG_THREAD trace above.
  (void)worker;
#else
  (void)worker;
  (void)ref_buf;
  (void)row;
#endif  // CONFIG_MULTITHREAD
}
// Publishes decoding progress for buf: stores `row` into buf->row while
// holding the owning worker's stats mutex, then signals that worker's stats
// condition so a thread blocked on it can re-check its progress condition.
//
//   buf: frame buffer whose progress is being updated; its
//        frame_worker_owner is used to find the mutex/condition.
//   row: new progress value written to buf->row.
//
// Does nothing when CONFIG_MULTITHREAD is not enabled.
void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
#if CONFIG_MULTITHREAD
  VP9Worker *worker = buf->frame_worker_owner;

#ifdef DEBUG_THREAD
  {
    // worker_data is only needed for the trace, so it is scoped to it.
    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
    printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
           buf->frame_worker_owner, row);
  }
#endif

  vp9_frameworker_lock_stats(worker);
  buf->row = row;
  vp9_frameworker_signal_stats(worker);
  vp9_frameworker_unlock_stats(worker);
#else
  (void)buf;
  (void)row;
#endif  // CONFIG_MULTITHREAD
}
// Copies the decoding context that the next frame depends on from the decoder
// owned by src_worker into the decoder owned by dst_worker. Blocks until
// src_worker has flagged frame_context_ready.
// Does nothing when CONFIG_MULTITHREAD is not enabled.
void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
VP9Worker *const src_worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
int i;
// Wait until source frame's context is ready.
vp9_frameworker_lock_stats(src_worker);
while (!src_worker_data->frame_context_ready) {
pthread_cond_wait(&src_worker_data->stats_cond,
&src_worker_data->stats_mutex);
}
// src worker may have already finished decoding a frame and swapped the mi.
// If so, its "previous" pointers already refer to the frame just decoded;
// otherwise its "current" pointers do. Either way dst's prev_* pointers end
// up aliasing src's arrays (pointer copies, no data is duplicated).
// TODO(hkuang): Remove the following code after implementing no-ModeInfo
// decoding.
if (src_worker_data->frame_decoded) {
dst_cm->prev_mip = src_cm->prev_mip;
dst_cm->prev_mi = src_cm->prev_mi;
dst_cm->prev_mi_grid_base = src_cm->prev_mi_grid_base;
dst_cm->prev_mi_grid_visible = src_cm->prev_mi_grid_visible;
dst_cm->last_frame_seg_map = src_cm->last_frame_seg_map;
} else {
dst_cm->prev_mip = src_cm->mip;
dst_cm->prev_mi = src_cm->mi;
dst_cm->prev_mi_grid_base = src_cm->mi_grid_base;
dst_cm->prev_mi_grid_visible = src_cm->mi_grid_visible;
dst_cm->last_frame_seg_map = src_cm->current_frame_seg_map;
}
vp9_frameworker_unlock_stats(src_worker);
// Everything below reads src state after the stats mutex has been released.
// dst has no previous buffer when src only re-showed an existing frame.
dst_worker_data->pbi->prev_buf =
src_worker_data->pbi->common.show_existing_frame ?
NULL : src_worker_data->pbi->cur_buf;
// When src re-showed an existing frame, keep src's own "last" values instead
// of the values of the frame it showed.
dst_cm->last_width = !src_cm->show_existing_frame ?
src_cm->width : src_cm->last_width;
dst_cm->last_height = !src_cm->show_existing_frame ?
src_cm->height : src_cm->last_height;
dst_cm->display_width = src_cm->display_width;
dst_cm->display_height = src_cm->display_height;
dst_cm->subsampling_x = src_cm->subsampling_x;
dst_cm->subsampling_y = src_cm->subsampling_y;
dst_cm->last_show_frame = !src_cm->show_existing_frame ?
src_cm->show_frame : src_cm->last_show_frame;
dst_cm->last_frame_type = src_cm->last_frame_type;
dst_cm->frame_type = src_cm->frame_type;
dst_cm->y_dc_delta_q = src_cm->y_dc_delta_q;
dst_cm->uv_dc_delta_q = src_cm->uv_dc_delta_q;
dst_cm->uv_ac_delta_q = src_cm->uv_ac_delta_q;
dst_cm->base_qindex = src_cm->base_qindex;
// dst starts from the reference map src will have after its frame
// (next_ref_frame_map), not from src's current ref_frame_map.
for (i = 0; i < REF_FRAMES; ++i)
dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
(MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
// src's current sharpness level becomes dst's last_sharpness_level.
dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
dst_cm->lf.filter_level = src_cm->lf.filter_level;
// NOTE(review): the two copies below pass an element count as the byte
// count, which is only right if the delta elements are one byte wide --
// confirm against the loop filter struct declaration.
memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
dst_cm->seg = src_cm->seg;
memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
#else
(void) dst_worker;
(void) src_worker;
#endif // CONFIG_MULTITHREAD
}

View File

@ -19,7 +19,7 @@ struct VP9Common;
struct VP9Decoder;
typedef struct TileWorkerData {
struct VP9Common *cm;
struct VP9Decoder *pbi;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, struct macroblockd, xd);
@ -55,6 +55,14 @@ typedef struct FrameWorkerData {
// It is used to make a copy of the compressed data.
uint8_t *scratch_buffer;
size_t scratch_buffer_size;
#if CONFIG_MULTITHREAD
pthread_mutex_t stats_mutex;
pthread_cond_t stats_cond;
#endif
int frame_context_ready; // Current frame's context is ready to read.
int frame_decoded; // Finished decoding current frame.
} FrameWorkerData;
// Allocate memory for loopfilter row synchronization.
@ -71,4 +79,23 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
int frame_filter_level,
int y_only);
void vp9_frameworker_lock_stats(VP9Worker *const worker);
void vp9_frameworker_unlock_stats(VP9Worker *const worker);
void vp9_frameworker_signal_stats(VP9Worker *const worker);
// Wait until ref_buf has been decoded to row in real pixel unit.
// Note: worker may already finish decoding ref_buf and release it in order to
// start decoding next frame. So need to check whether worker is still decoding
// ref_buf.
void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
int row);
// FrameWorker broadcasts its decoding progress so other workers that are
// waiting on it can resume decoding.
void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
// Copy necessary decoding context from src worker to dst worker.
void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
VP9Worker *const src_worker);
#endif // VP9_DECODER_VP9_DTHREAD_H_

View File

@ -484,7 +484,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (cm->coding_use_prev_mi)
vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame,
candidates, mi_row, mi_col);
candidates, mi_row, mi_col, NULL, NULL);
else
const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0],
ref_frame, candidates,

View File

@ -2253,7 +2253,8 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
// Gets an initial list of candidate vectors from neighbours and orders them
vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col,
NULL, NULL);
// Candidate refinement carried out at encoder and decoder
vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,

View File

@ -18,6 +18,7 @@
#include "vpx/vpx_decoder.h"
#include "vp9/common/vp9_frame_buffers.h"
#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
@ -28,6 +29,15 @@
typedef vpx_codec_stream_info_t vp9_stream_info_t;
// This limit is due to framebuffer numbers.
// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames.
// One entry of the decoded-frame output cache (frame_cache[] in
// vpx_codec_alg_priv): a decoded frame waiting to be returned by
// decoder_get_frame().
typedef struct cache_frame {
// Frame buffer index of the cached frame (taken from cm->new_fb_idx when the
// entry is filled; becomes last_show_frame when the entry is output).
int fb_idx;
// Image descriptor for the cached frame, filled by yuvconfig2image().
vpx_image_t img;
} cache_frame;
struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_dec_cfg_t cfg;
@ -35,17 +45,24 @@ struct vpx_codec_alg_priv {
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
void *decrypt_state;
vpx_image_t img;
int flushed;
int invert_tile_order;
int frame_parallel_decode; // frame-based threading.
int last_show_frame; // Index of last output frame.
// Frame parallel related.
int frame_parallel_decode; // frame-based threading.
VP9Worker *frame_workers;
int num_frame_workers;
int next_submit_thread_id;
int next_output_thread_id;
int next_submit_worker_id;
int last_submit_worker_id;
int next_output_worker_id;
int available_threads;
cache_frame frame_cache[FRAME_CACHE_SIZE];
int frame_cache_write;
int frame_cache_read;
int num_cache_frames;
// BufferPool that holds all reference frames. Shared by all the FrameWorkers.
BufferPool *buffer_pool;
@ -77,11 +94,10 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->alg_priv->flushed = 0;
// Only do frame parallel decode when threads > 1.
ctx->priv->alg_priv->frame_parallel_decode =
(ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
// Disable frame parallel decoding for now.
ctx->priv->alg_priv->frame_parallel_decode = 0;
((ctx->config.dec->threads > 1) &&
(ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 1 : 0;
if (ctx->config.dec) {
// Update the reference to the config structure to an internal copy.
@ -98,10 +114,21 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
int i;
for (i = 0; i < ctx->num_frame_workers; ++i) {
VP9Worker *const worker = &ctx->frame_workers[i];
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
vp9_decoder_remove(worker_data->pbi);
vpx_free(worker_data);
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
vp9_get_worker_interface()->end(worker);
vp9_decoder_remove(frame_worker_data->pbi);
vpx_free(frame_worker_data->scratch_buffer);
#if CONFIG_MULTITHREAD
pthread_mutex_destroy(&frame_worker_data->stats_mutex);
pthread_cond_destroy(&frame_worker_data->stats_cond);
#endif
vpx_free(frame_worker_data);
}
#if CONFIG_MULTITHREAD
pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
#endif
vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
}
vpx_free(ctx->frame_workers);
@ -222,8 +249,8 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
for (i = 0; i < ctx->num_frame_workers; ++i) {
VP9Worker *const worker = &ctx->frame_workers[i];
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
VP9_COMMON *const cm = &worker_data->pbi->common;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
VP9_COMMON *const cm = &frame_worker_data->pbi->common;
BufferPool *const pool = cm->buffer_pool;
cm->new_fb_idx = -1;
@ -260,14 +287,15 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
}
static int frame_worker_hook(void *arg1, void *arg2) {
FrameWorkerData *const worker_data = (FrameWorkerData *)arg1;
const uint8_t *data = worker_data->data;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
const uint8_t *data = frame_worker_data->data;
(void)arg2;
worker_data->result = vp9_receive_compressed_data(worker_data->pbi,
worker_data->data_size,
&data);
worker_data->data_end = data;
return !worker_data->result;
frame_worker_data->result =
vp9_receive_compressed_data(frame_worker_data->pbi,
frame_worker_data->data_size,
&data);
frame_worker_data->data_end = data;
return !frame_worker_data->result;
}
static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
@ -275,14 +303,28 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
ctx->last_show_frame = -1;
ctx->next_submit_thread_id = 0;
ctx->next_output_thread_id = 0;
ctx->next_submit_worker_id = 0;
ctx->last_submit_worker_id = 0;
ctx->next_output_worker_id = 0;
ctx->frame_cache_read = 0;
ctx->frame_cache_write = 0;
ctx->num_cache_frames = 0;
ctx->num_frame_workers =
(ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
ctx->available_threads = ctx->num_frame_workers;
ctx->flushed = 0;
ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
if (ctx->buffer_pool == NULL)
return VPX_CODEC_MEM_ERROR;
#if CONFIG_MULTITHREAD
if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
set_error_detail(ctx, "Failed to allocate buffer pool mutex");
return VPX_CODEC_MEM_ERROR;
}
#endif
ctx->frame_workers = (VP9Worker *)
vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
if (ctx->frame_workers == NULL) {
@ -292,28 +334,51 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
for (i = 0; i < ctx->num_frame_workers; ++i) {
VP9Worker *const worker = &ctx->frame_workers[i];
FrameWorkerData *worker_data = NULL;
FrameWorkerData *frame_worker_data = NULL;
winterface->init(worker);
worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
if (worker->data1 == NULL) {
set_error_detail(ctx, "Failed to allocate worker_data");
set_error_detail(ctx, "Failed to allocate frame_worker_data");
return VPX_CODEC_MEM_ERROR;
}
worker_data = (FrameWorkerData *)worker->data1;
worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
if (worker_data->pbi == NULL) {
set_error_detail(ctx, "Failed to allocate worker_data");
frame_worker_data = (FrameWorkerData *)worker->data1;
frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
if (frame_worker_data->pbi == NULL) {
set_error_detail(ctx, "Failed to allocate frame_worker_data");
return VPX_CODEC_MEM_ERROR;
}
frame_worker_data->pbi->frame_worker_owner = worker;
frame_worker_data->pbi->common.mi_idx = 0;
frame_worker_data->pbi->common.prev_mi_idx = 1;
frame_worker_data->worker_id = i;
frame_worker_data->scratch_buffer = NULL;
frame_worker_data->scratch_buffer_size = 0;
frame_worker_data->frame_context_ready = 0;
#if CONFIG_MULTITHREAD
if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
return VPX_CODEC_MEM_ERROR;
}
if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
return VPX_CODEC_MEM_ERROR;
}
#endif
// If decoding in serial mode, FrameWorker thread could create tile worker
// thread or loopfilter thread.
worker_data->pbi->max_threads =
frame_worker_data->pbi->max_threads =
(ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
frame_worker_data->pbi->common.frame_parallel_decode =
ctx->frame_parallel_decode;
worker->hook = (VP9WorkerHook)frame_worker_hook;
if (!winterface->reset(worker)) {
set_error_detail(ctx, "Frame Worker thread creation failed");
return VPX_CODEC_MEM_ERROR;
}
}
// If postprocessing was enabled by the application and a
@ -348,36 +413,66 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_ERROR;
}
// Initialize the decoder workers on the first frame
if (ctx->frame_workers == NULL) {
const vpx_codec_err_t res = init_decoder(ctx);
if (res != VPX_CODEC_OK)
return res;
}
if (!ctx->frame_parallel_decode) {
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
worker_data->data = *data;
worker_data->data_size = data_sz;
worker_data->user_priv = user_priv;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
frame_worker_data->data = *data;
frame_worker_data->data_size = data_sz;
frame_worker_data->user_priv = user_priv;
// Set these even if already initialized. The caller may have changed the
// decrypt config between frames.
worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
worker_data->pbi->decrypt_state = ctx->decrypt_state;
frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
worker->had_error = 0;
winterface->execute(worker);
// Update data pointer after decode.
*data = worker_data->data_end;
*data = frame_worker_data->data_end;
if (worker->had_error)
return update_error_state(ctx, &worker_data->pbi->common.error);
return update_error_state(ctx, &frame_worker_data->pbi->common.error);
} else {
// TODO(hkuang): Implement frame parallel decode.
return VPX_CODEC_INCAPABLE;
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
// Copy context from last worker thread to next worker thread.
if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
vp9_frameworker_copy_context(
&ctx->frame_workers[ctx->next_submit_worker_id],
&ctx->frame_workers[ctx->last_submit_worker_id]);
// Copy the compressed data into worker's internal buffer.
// TODO(hkuang): Will all the workers allocate the same size
// as the size of the first intra frame be better? This will
// avoid too many deallocate and allocate.
if (frame_worker_data->scratch_buffer_size < data_sz) {
frame_worker_data->scratch_buffer =
(uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
if (frame_worker_data->scratch_buffer == NULL) {
set_error_detail(ctx, "Failed to reallocate scratch buffer");
return VPX_CODEC_MEM_ERROR;
}
frame_worker_data->scratch_buffer_size = data_sz;
}
frame_worker_data->data_size = data_sz;
vpx_memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
frame_worker_data->frame_decoded = 0;
frame_worker_data->frame_context_ready = 0;
frame_worker_data->data = frame_worker_data->scratch_buffer;
frame_worker_data->user_priv = user_priv;
if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
ctx->last_submit_worker_id =
(ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
ctx->next_submit_worker_id =
(ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
--ctx->available_threads;
winterface->launch(worker);
}
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
@ -461,6 +556,30 @@ static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
return VPX_CODEC_OK;
}
// Syncs with the next output worker, marks its thread as available again and,
// if that worker's decoder has a frame to show, appends the frame to the
// output cache ring (frame_cache). The caller is expected to have checked that
// the cache has a free slot.
static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  const int output_id = ctx->next_output_worker_id;
  VP9Worker *const worker = &ctx->frame_workers[output_id];
  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  vp9_ppflags_t flags = {0, 0, 0};
  YV12_BUFFER_CONFIG sd;

  // Advance the output cursor before blocking on the worker.
  ctx->next_output_worker_id = (output_id + 1) % ctx->num_frame_workers;
  winterface->sync(worker);
  ++ctx->available_threads;

  // A non-zero result means there is no frame to cache.
  if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) != 0)
    return;

  {
    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
    cache_frame *const slot = &ctx->frame_cache[ctx->frame_cache_write];

    slot->fb_idx = cm->new_fb_idx;
    yuvconfig2image(&slot->img, &sd, frame_worker_data->user_priv);
    slot->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;

    ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
    ++ctx->num_cache_frames;
  }
}
static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
@ -478,6 +597,13 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
// Reset flushed when receiving a valid frame.
ctx->flushed = 0;
// Initialize the decoder workers on the first frame.
if (ctx->frame_workers == NULL) {
const vpx_codec_err_t res = init_decoder(ctx);
if (res != VPX_CODEC_OK)
return res;
}
res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
ctx->decrypt_cb, ctx->decrypt_state);
if (res != VPX_CODEC_OK)
@ -494,30 +620,46 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
for (i = 0; i < frame_count; ++i) {
const uint8_t *data_start_copy = data_start;
const uint32_t frame_size = frame_sizes[i];
vpx_codec_err_t res;
if (data_start < data
|| frame_size > (uint32_t) (data_end - data_start)) {
set_error_detail(ctx, "Invalid frame size in index");
return VPX_CODEC_CORRUPT_FRAME;
}
if (ctx->available_threads == 0) {
// No more threads for decoding. Wait until the next output worker
// finishes decoding. Then copy the decoded frame into cache.
if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
wait_worker_and_cache_frame(ctx);
} else {
// TODO(hkuang): Add unit test to test this path.
set_error_detail(ctx, "Frame output cache is full.");
return VPX_CODEC_ERROR;
}
}
res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
deadline);
if (res != VPX_CODEC_OK)
return res;
data_start += frame_size;
}
} else {
res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
if (ctx->available_threads == 0) {
// No more threads for decoding. Wait until the next output worker
// finishes decoding. Then copy the decoded frame into cache.
if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
wait_worker_and_cache_frame(ctx);
} else {
// TODO(hkuang): Add unit test to test this path.
set_error_detail(ctx, "Frame output cache is full.");
return VPX_CODEC_ERROR;
}
}
res = decode_one(ctx, &data, data_sz, user_priv, deadline);
if (res != VPX_CODEC_OK)
return res;
// Extra data detected after the frame.
if (data_start < data_end - 1) {
set_error_detail(ctx, "Fail to decode frame in parallel mode");
return VPX_CODEC_INCAPABLE;
}
}
} else {
// Decode in serial mode.
@ -561,41 +703,73 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
}
}
return VPX_CODEC_OK;
return res;
}
// In frame parallel mode, drops the reference held on the most recently
// output frame (ctx->last_show_frame) and releases its raw frame buffer via
// the pool callback once the count reaches zero. Does nothing in serial mode
// or when no frame has been output yet.
static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
  BufferPool *pool;
  RefCntBuffer *last_shown;

  if (!ctx->frame_parallel_decode || ctx->last_show_frame < 0)
    return;

  pool = ctx->buffer_pool;
  last_shown = &pool->frame_bufs[ctx->last_show_frame];

  // The reference count is shared between workers, so update it under the
  // pool lock.
  lock_buffer_pool(pool);
  --last_shown->ref_count;
  if (last_shown->ref_count == 0)
    pool->release_fb_cb(pool->cb_priv, &last_shown->raw_frame_buffer);
  unlock_buffer_pool(pool);
}
static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
vpx_codec_iter_t *iter) {
vpx_image_t *img = NULL;
// Only return a frame when all the CPUs are busy or the
// application flushed the decoder in frame parallel decode.
if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
!ctx->flushed) {
return img;
}
// Output the frames in the cache first.
if (ctx->num_cache_frames > 0) {
release_last_output_frame(ctx);
ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
img = &ctx->frame_cache[ctx->frame_cache_read].img;
ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
--ctx->num_cache_frames;
return img;
}
// iter acts as a flip flop, so an image is only returned on the first
// call to get_frame.
if (*iter == NULL && ctx->frame_workers != NULL) {
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0, 0, 0};
VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_thread_id];
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
if (vp9_get_raw_frame(worker_data->pbi, &sd, &flags) == 0) {
VP9_COMMON *const cm = &worker_data->pbi->common;
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
yuvconfig2image(&ctx->img, &sd, worker_data->user_priv);
ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
img = &ctx->img;
*iter = img;
// Decrease reference count of last output frame in frame parallel mode.
if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
--frame_bufs[ctx->last_show_frame].ref_count;
if (frame_bufs[ctx->last_show_frame].ref_count == 0) {
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[ctx->last_show_frame].raw_frame_buffer);
}
do {
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0, 0, 0};
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
VP9Worker *const worker =
&ctx->frame_workers[ctx->next_output_worker_id];
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
ctx->next_output_worker_id =
(ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
// Wait for the frame from worker thread.
winterface->sync(worker);
++ctx->available_threads;
if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
VP9_COMMON *const cm = &frame_worker_data->pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
release_last_output_frame(ctx);
ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
img = &ctx->img;
return img;
}
ctx->last_show_frame = worker_data->pbi->common.new_fb_idx;
}
} while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
}
return img;
}
@ -631,9 +805,9 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
YV12_BUFFER_CONFIG sd;
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
return vp9_set_reference_dec(&worker_data->pbi->common,
return vp9_set_reference_dec(&frame_worker_data->pbi->common,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
@ -654,9 +828,9 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
YV12_BUFFER_CONFIG sd;
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
return vp9_copy_reference_dec(worker_data->pbi,
return vp9_copy_reference_dec(frame_worker_data->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
@ -676,8 +850,8 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
if (data) {
YV12_BUFFER_CONFIG* fb;
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
vp9_get_reference_dec(worker_data->pbi, data->idx, &fb);
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
vp9_get_reference_dec(frame_worker_data->pbi, data->idx, &fb);
yuvconfig2image(&data->img, fb, NULL);
return VPX_CODEC_OK;
} else {
@ -724,8 +898,9 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
if (update_info) {
if (ctx->frame_workers) {
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
*update_info = worker_data->pbi->refresh_frame_flags;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
*update_info = frame_worker_data->pbi->refresh_frame_flags;
} else {
return VPX_CODEC_ERROR;
}
@ -735,22 +910,18 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
}
}
static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *corrupted = va_arg(args, int *);
// Only support this function in serial decode.
if (ctx->frame_parallel_decode) {
set_error_detail(ctx, "Not supported in frame parallel decode");
return VPX_CODEC_INCAPABLE;
}
if (corrupted) {
if (ctx->frame_workers) {
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
*corrupted = worker_data->pbi->common.frame_to_show->corrupted;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
RefCntBuffer *const frame_bufs =
frame_worker_data->pbi->common.buffer_pool->frame_bufs;
*corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
} else {
return VPX_CODEC_ERROR;
}
@ -773,8 +944,9 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
if (display_size) {
if (ctx->frame_workers) {
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
const VP9_COMMON *const cm = &worker_data->pbi->common;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
display_size[0] = cm->display_width;
display_size[1] = cm->display_height;
} else {

View File

@ -22,8 +22,11 @@ extern "C" {
#include "./vpx_integer.h"
/*!\brief The maximum number of work buffers used by libvpx.
* Support maximum 4 threads to decode video in parallel.
* Each thread will use one work buffer.
* TODO(hkuang): Add support to set number of worker threads dynamically.
*/
#define VPX_MAXIMUM_WORK_BUFFERS 1
#define VPX_MAXIMUM_WORK_BUFFERS 4
/*!\brief The maximum number of reference buffers that a VP9 encoder may use.
*/