vp9_bitstream: Encode tiles in parallel
Re-use the tile worker threads to pack the bitstream in parallel on a per-tile basis. This is restricted to real-time mode for now; further testing is needed to ensure that it does not make 2-pass encoding worse in any case. BUG=webm:1309 Change-Id: Ia2c982da56697756e12f02643f589189b3271d98
This commit is contained in:
parent
4c3d539baa
commit
9e8efa5b18
@ -915,6 +915,120 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) {
|
||||||
|
MACROBLOCKD *const xd = &data->xd;
|
||||||
|
vpx_start_encode(&data->bit_writer, data->dest);
|
||||||
|
write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info,
|
||||||
|
&data->bit_writer, &data->tok, data->tok_end,
|
||||||
|
&data->max_mv_magnitude, data->interp_filter_selected);
|
||||||
|
assert(data->tok == data->tok_end);
|
||||||
|
vpx_stop_encode(&data->bit_writer);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Frees the per-worker buffers used for parallel bitstream packing and resets
// cpi->vp9_bitstream_worker_data to NULL. Does nothing when no worker data is
// currently allocated.
void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
  struct VP9BitstreamWorkerData *const worker_data =
      cpi->vp9_bitstream_worker_data;
  int worker_idx = 1;

  if (!worker_data) return;

  // Entry 0 is skipped: only entries 1..num_workers-1 have a dest buffer
  // released here.
  while (worker_idx < cpi->num_workers) {
    vpx_free(worker_data[worker_idx].dest);
    ++worker_idx;
  }

  vpx_free(worker_data);
  cpi->vp9_bitstream_worker_data = NULL;
}
|
||||||
|
|
||||||
|
// Allocates cpi->vp9_bitstream_worker_data (one zero-initialized entry per
// worker) and, for every worker except the first, a private output buffer of
// oxcf.width * oxcf.height bytes.
//
// Returns 0 on success and 1 on allocation failure. On failure nothing is left
// allocated and cpi->vp9_bitstream_worker_data is NULL, so that a later call
// to encode_tiles_mt() retries the allocation instead of running workers with
// a NULL destination buffer.
static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
  int i;

  cpi->vp9_bitstream_worker_data =
      vpx_calloc(cpi->num_workers, sizeof(*cpi->vp9_bitstream_worker_data));
  if (!cpi->vp9_bitstream_worker_data) return 1;

  // Entry 0 gets no buffer of its own; only entries 1..num_workers-1 do.
  for (i = 1; i < cpi->num_workers; ++i) {
    VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i];

    data->dest_size = cpi->oxcf.width * cpi->oxcf.height;
    data->dest = vpx_malloc(data->dest_size);
    if (!data->dest) {
      // Roll back everything allocated so far. The array came from
      // vpx_calloc(), so the entries not reached yet hold NULL dest pointers.
      // NOTE(review): relies on vpx_free(NULL) being a no-op -- confirm.
      vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
      return 1;
    }
  }
  return 0;
}
|
||||||
|
|
||||||
|
// Packs all tile columns of the frame into data_ptr, using the tile worker
// threads to pack several tiles in parallel.
//
// Tiles are processed in rounds of at most cpi->num_workers tiles. Within a
// round, worker 0 writes straight into the output buffer; every other worker
// writes into its private buffer, which is copied into place once the worker
// has finished. Every tile except the last one of the frame is preceded by a
// 32-bit size field written with mem_put_be32().
//
// Returns the total number of bytes written to data_ptr, or 0 if the worker
// buffers could not be allocated or a worker reported an error.
//
// NOTE(review): indexes vp9_bitstream_worker_data[1], so the caller must
// ensure cpi->num_workers > 1.
static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
  VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int num_workers = cpi->num_workers;
  size_t total_size = 0;
  int tile_col = 0;

  // (Re)allocate the worker buffers whenever they are missing or were sized
  // for a different frame area. Comparing with `!=` rather than `>` also
  // regrows the buffers when the configured frame area has increased;
  // otherwise workers would keep writing into buffers that are too small.
  if (!cpi->vp9_bitstream_worker_data ||
      cpi->vp9_bitstream_worker_data[1].dest_size !=
          (cpi->oxcf.width * cpi->oxcf.height)) {
    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
    if (encode_tiles_buffer_alloc(cpi)) return 0;
  }

  while (tile_col < tile_cols) {
    int i, j;
    for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
      VPxWorker *const worker = &cpi->workers[i];
      VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i];

      // Populate the worker data.
      data->xd = cpi->td.mb.e_mbd;
      data->tile_idx = tile_col;
      data->tok = cpi->tile_tok[0][tile_col];
      data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col];
      data->max_mv_magnitude = cpi->max_mv_magnitude;
      memset(data->interp_filter_selected, 0,
             sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE);

      // First thread can directly write into the output buffer.
      if (i == 0) {
        // Leave room for the 4-byte size prefix, except when this worker is
        // handling the last tile of the frame: that tile gets no prefix, so
        // its payload must start exactly at total_size.
        const int is_last_tile = (tile_col == tile_cols - 1);
        data->dest = data_ptr + total_size + (is_last_tile ? 0 : 4);
      }
      worker->data1 = cpi;
      worker->data2 = data;
      worker->hook = (VPxWorkerHook)encode_tile_worker;
      worker->had_error = 0;

      // The final worker slot is run through execute() instead of launch().
      if (i < num_workers - 1) {
        winterface->launch(worker);
      } else {
        winterface->execute(worker);
      }
      ++tile_col;
    }

    // Collect the results of this round in tile order. Here `i` is the number
    // of workers that were started in the round.
    for (j = 0; j < i; ++j) {
      VPxWorker *const worker = &cpi->workers[j];
      VP9BitstreamWorkerData *const data =
          (VP9BitstreamWorkerData *)worker->data2;
      uint32_t tile_size;
      int k;

      if (!winterface->sync(worker)) return 0;
      tile_size = data->bit_writer.pos;

      // Aggregate per-thread bitstream stats.
      cpi->max_mv_magnitude =
          VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude);
      for (k = 0; k < SWITCHABLE; ++k) {
        cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k];
      }

      // Prefix the size of the tile on all but the last.
      if (tile_col != tile_cols || j < i - 1) {
        mem_put_be32(data_ptr + total_size, tile_size);
        total_size += 4;
      }
      // Worker 0 already wrote in place; the others are copied from their
      // private buffers.
      if (j > 0) {
        memcpy(data_ptr + total_size, data->dest, tile_size);
      }
      total_size += tile_size;
    }
  }
  return total_size;
}
|
||||||
|
|
||||||
static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
|
static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
|
||||||
VP9_COMMON *const cm = &cpi->common;
|
VP9_COMMON *const cm = &cpi->common;
|
||||||
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
|
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
|
||||||
@ -928,6 +1042,14 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
|
|||||||
memset(cm->above_seg_context, 0,
|
memset(cm->above_seg_context, 0,
|
||||||
sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
|
sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
|
||||||
|
|
||||||
|
// Encoding tiles in parallel is done only for realtime mode now. In other
|
||||||
|
// modes the speed up is insignificant and requires further testing to ensure
|
||||||
|
// that it does not make the overall process worse in any case.
|
||||||
|
if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 &&
|
||||||
|
tile_cols > 1) {
|
||||||
|
return encode_tiles_mt(cpi, data_ptr);
|
||||||
|
}
|
||||||
|
|
||||||
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
|
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
|
||||||
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
|
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
|
||||||
int tile_idx = tile_row * tile_cols + tile_col;
|
int tile_idx = tile_row * tile_cols + tile_col;
|
||||||
@ -955,7 +1077,6 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
|
|||||||
total_size += residual_bc.pos;
|
total_size += residual_bc.pos;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return total_size;
|
return total_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,10 +17,28 @@ extern "C" {
|
|||||||
|
|
||||||
#include "vp9/encoder/vp9_encoder.h"
|
#include "vp9/encoder/vp9_encoder.h"
|
||||||
|
|
||||||
|
typedef struct VP9BitstreamWorkerData {
|
||||||
|
uint8_t *dest;
|
||||||
|
int dest_size;
|
||||||
|
TOKENEXTRA *tok;
|
||||||
|
TOKENEXTRA *tok_end;
|
||||||
|
vpx_writer bit_writer;
|
||||||
|
int tile_idx;
|
||||||
|
unsigned int max_mv_magnitude;
|
||||||
|
// The size of interp_filter_selected in VP9_COMP is actually
|
||||||
|
// MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do
|
||||||
|
// is increment the very first index (index 0) for the first dimension. Hence
|
||||||
|
// this is sufficient.
|
||||||
|
int interp_filter_selected[1][SWITCHABLE];
|
||||||
|
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
|
||||||
|
} VP9BitstreamWorkerData;
|
||||||
|
|
||||||
int vp9_get_refresh_mask(VP9_COMP *cpi);
|
int vp9_get_refresh_mask(VP9_COMP *cpi);
|
||||||
|
|
||||||
void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
|
void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
|
||||||
|
|
||||||
|
void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
|
||||||
|
|
||||||
static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
|
static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
|
||||||
return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
|
return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
|
||||||
cpi->rc.is_src_frame_alt_ref &&
|
cpi->rc.is_src_frame_alt_ref &&
|
||||||
|
@ -2030,7 +2030,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
|
|||||||
vpx_free(cpi->tile_thr_data);
|
vpx_free(cpi->tile_thr_data);
|
||||||
vpx_free(cpi->workers);
|
vpx_free(cpi->workers);
|
||||||
|
|
||||||
if (cpi->num_workers > 1) vp9_loop_filter_dealloc(&cpi->lf_row_sync);
|
if (cpi->num_workers > 1) {
|
||||||
|
vp9_loop_filter_dealloc(&cpi->lf_row_sync);
|
||||||
|
vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
|
||||||
|
}
|
||||||
|
|
||||||
vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
|
vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
|
||||||
|
|
||||||
|
@ -601,6 +601,7 @@ typedef struct VP9_COMP {
|
|||||||
VPxWorker *workers;
|
VPxWorker *workers;
|
||||||
struct EncWorkerData *tile_thr_data;
|
struct EncWorkerData *tile_thr_data;
|
||||||
VP9LfSync lf_row_sync;
|
VP9LfSync lf_row_sync;
|
||||||
|
struct VP9BitstreamWorkerData *vp9_bitstream_worker_data;
|
||||||
|
|
||||||
int keep_level_stats;
|
int keep_level_stats;
|
||||||
Vp9LevelInfo level_info;
|
Vp9LevelInfo level_info;
|
||||||
|
Loading…
Reference in New Issue
Block a user