vp9_bitstream: Encode tiles in parallel

Re-use the tile worker threads to pack the bitstream in parallel
on a per-tile basis. This is restricted to real-time mode for now
(further testing is needed to ensure that it does not make 2-pass
encoding worse in any case).

BUG=webm:1309

Change-Id: I8a80da7c5089b837d0df79a5c49d5e3022dfc8ec
This commit is contained in:
Vignesh Venkatasubramanian 2016-10-19 11:07:41 -07:00
parent 32e63efcfb
commit 5deffa1175
6 changed files with 165 additions and 6 deletions

View File

@ -65,6 +65,12 @@ class CodecTestWith3Params
: public ::testing::TestWithParam<
std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3> > {};
// Parameterized test base whose parameter is a tuple of a codec factory
// pointer followed by four test-specific values of types T1..T4.
template <class T1, class T2, class T3, class T4>
class CodecTestWith4Params
: public ::testing::TestWithParam<
std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3, T4> > {
};
/*
* VP8 Codec Definitions
*/

View File

@ -20,10 +20,12 @@
namespace {
class VPxEncoderThreadTest
: public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
public ::libvpx_test::CodecTestWith4Params<libvpx_test::TestMode, int,
int, int> {
protected:
VPxEncoderThreadTest()
: EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(2),
: EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
tiles_(GET_PARAM(3)), threads_(GET_PARAM(4)),
encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
init_flags_ = VPX_CODEC_USE_PSNR;
md5_.clear();
@ -91,6 +93,7 @@ class VPxEncoderThreadTest
bool encoder_initialized_;
int tiles_;
int threads_;
::libvpx_test::TestMode encoding_mode_;
int set_cpu_used_;
std::vector<std::string> md5_;
@ -111,7 +114,7 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
md5_.clear();
// Encode using multiple threads.
cfg_.g_threads = 4;
cfg_.g_threads = threads_;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
multi_thr_md5 = md5_;
md5_.clear();
@ -124,5 +127,7 @@ VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest,
::testing::Values(::libvpx_test::kTwoPassGood,
::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime),
::testing::Range(1, 9));
::testing::Range(1, 9), // cpu_used
::testing::Range(0, 3), // tile_columns
::testing::Range(2, 5)); // threads
} // namespace

View File

@ -915,6 +915,125 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) {
}
}
// Worker hook that packs a single tile.
//
// Starts the worker's private bit writer on data->dest, writes the modes and
// tokens of tile data->tile_idx through it, then finalizes the writer.
// Always returns 1.
static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) {
  vpx_start_encode(&data->bit_writer, data->dest);

  // Each worker uses its own MACROBLOCKD copy (data->xd) and accumulates its
  // own max_mv_magnitude / interp_filter_selected stats.
  write_modes(cpi, &data->xd, &cpi->tile_data[data->tile_idx].tile_info,
              &data->bit_writer, &data->tok, data->tok_end,
              &data->max_mv_magnitude, data->interp_filter_selected);

  // The token cursor is expected to sit exactly at the end of the tile's
  // token range once the tile has been written.
  assert(data->tok == data->tok_end);

  vpx_stop_encode(&data->bit_writer);
  return 1;
}
// Frees the per-worker bitstream packing state attached to cpi, if any, and
// resets cpi->vp9_bitstream_worker_data to NULL. Safe to call when nothing has
// been allocated.
void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
  int worker_idx;

  if (cpi->vp9_bitstream_worker_data == NULL) return;

  // Only entries 1..num_workers-1 have a dest buffer released here; entry 0 is
  // skipped.
  worker_idx = 1;
  while (worker_idx < cpi->num_workers) {
    vpx_free(cpi->vp9_bitstream_worker_data[worker_idx].dest);
    ++worker_idx;
  }

  vpx_free(cpi->vp9_bitstream_worker_data);
  cpi->vp9_bitstream_worker_data = NULL;
}
// Allocates the per-worker bitstream packing state for cpi->num_workers
// workers.
//
// The worker-data array is zero-initialized. Every worker except worker 0 gets
// a private output buffer of oxcf.width * oxcf.height bytes; worker 0 is left
// with dest == NULL / dest_size == 0 here.
//
// Returns 0 on success and 1 on allocation failure. On failure everything
// allocated so far is released and cpi->vp9_bitstream_worker_data is NULL, so
// a later call starts from a clean state.
static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
  int i;
  const size_t worker_data_size =
      cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);

  cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size);
  // The NULL check must come before the memset: clearing a failed allocation
  // would dereference a NULL pointer.
  if (!cpi->vp9_bitstream_worker_data) return 1;
  memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);

  for (i = 1; i < cpi->num_workers; ++i) {
    cpi->vp9_bitstream_worker_data[i].dest_size =
        cpi->oxcf.width * cpi->oxcf.height;
    cpi->vp9_bitstream_worker_data[i].dest =
        vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size);
    if (!cpi->vp9_bitstream_worker_data[i].dest) {
      // Do not leave a half-built array behind: its dest_size fields would
      // look valid to callers while some dest pointers are NULL. Entries not
      // yet reached still hold NULL from the memset above.
      // NOTE(review): relies on vpx_free() accepting NULL -- confirm.
      vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
      return 1;
    }
  }
  return 0;
}
// Packs all tile columns of the frame into data_ptr using the encoder's
// worker threads, one tile per worker, in batches of up to cpi->num_workers.
//
// Worker 0 writes straight into the output buffer; every other worker writes
// into its private scratch buffer, which is copied into place once the worker
// has finished. Each tile except the last one of the frame is prefixed with
// its size as a 4-byte big-endian value.
//
// Reads cpi->vp9_bitstream_worker_data[1], so cpi->num_workers must be > 1.
//
// Returns the total number of bytes written to data_ptr, or 0 if the scratch
// buffers could not be allocated or a worker reported an error.
static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
  VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int num_workers = cpi->num_workers;
  size_t total_size = 0;
  int tile_col = 0;

  // (Re)allocate the scratch buffers when they do not exist yet or were sized
  // for a different frame size. Comparing with '!=' (rather than '>') also
  // rebuilds them when the frame has grown, so a buffer sized for a smaller
  // frame is never reused.
  if (!cpi->vp9_bitstream_worker_data ||
      cpi->vp9_bitstream_worker_data[1].dest_size !=
          (cpi->oxcf.width * cpi->oxcf.height)) {
    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
    if (encode_tiles_buffer_alloc(cpi)) {
      // Drop any partially built state so the next call retries from scratch
      // instead of using a worker entry whose dest is NULL.
      vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
      return 0;
    }
  }

  while (tile_col < tile_cols) {
    int i, j;
    // Hand out one tile per worker until the batch is full or the tiles run
    // out.
    for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
      VPxWorker *const worker = &cpi->workers[i];
      VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i];

      // Populate the worker data.
      data->xd = cpi->td.mb.e_mbd;
      data->tile_idx = tile_col;
      data->tok = cpi->tile_tok[0][tile_col];
      data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col];
      data->max_mv_magnitude = cpi->max_mv_magnitude;
      memset(data->interp_filter_selected, 0,
             sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE);

      // First thread can directly write into the output buffer.
      if (i == 0) {
        // If this worker happens to be for the last tile, then do not offset it
        // by 4 for the tile size.
        data->dest =
            data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4);
      }
      worker->data1 = cpi;
      worker->data2 = data;
      worker->hook = (VPxWorkerHook)encode_tile_worker;
      worker->had_error = 0;

      // The last worker slot is run through execute() rather than launch().
      if (i < num_workers - 1) {
        winterface->launch(worker);
      } else {
        winterface->execute(worker);
      }
      ++tile_col;
    }

    // Collect the batch in tile order so the tiles end up in sequence in the
    // output buffer.
    for (j = 0; j < i; ++j) {
      VPxWorker *const worker = &cpi->workers[j];
      VP9BitstreamWorkerData *const data =
          (VP9BitstreamWorkerData *)worker->data2;
      uint32_t tile_size;
      int k;

      if (!winterface->sync(worker)) {
        // Wait for the rest of the batch before bailing out so that no worker
        // is still touching cpi after this function has returned.
        for (k = j + 1; k < i; ++k) winterface->sync(&cpi->workers[k]);
        return 0;
      }
      tile_size = data->bit_writer.pos;

      // Aggregate per-thread bitstream stats.
      cpi->max_mv_magnitude =
          VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude);
      for (k = 0; k < SWITCHABLE; ++k) {
        cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k];
      }

      // Prefix the size of the tile on all but the last.
      if (tile_col != tile_cols || j < i - 1) {
        mem_put_be32(data_ptr + total_size, tile_size);
        total_size += 4;
      }
      // Worker 0 already wrote in place; everyone else wrote to a scratch
      // buffer that has to be copied behind the preceding tile.
      if (j > 0) {
        memcpy(data_ptr + total_size, data->dest, tile_size);
      }
      total_size += tile_size;
    }
  }
  return total_size;
}
static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
@ -928,6 +1047,14 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
memset(cm->above_seg_context, 0,
sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
// Encoding tiles in parallel is done only for realtime mode now. In other
// modes the speed up is insignificant and requires further testing to ensure
// that it does not make the overall process worse in any case.
if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 &&
tile_cols > 1) {
return encode_tiles_mt(cpi, data_ptr);
}
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
int tile_idx = tile_row * tile_cols + tile_col;
@ -955,7 +1082,6 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
total_size += residual_bc.pos;
}
}
return total_size;
}

View File

@ -17,8 +17,26 @@ extern "C" {
#include "vp9/encoder/vp9_encoder.h"
// Per-worker state used when the tiles of a frame are packed into the
// bitstream in parallel; one entry exists per encoder worker.
typedef struct VP9BitstreamWorkerData {
// Buffer the worker's bit writer encodes its tile into.
uint8_t *dest;
// Size in bytes given to the allocation of dest (frame width * height).
int dest_size;
// Current position in, and end of, the token range of the tile being packed.
TOKENEXTRA *tok;
TOKENEXTRA *tok_end;
// Bit writer private to this worker.
vpx_writer bit_writer;
// Index of the tile this worker is packing.
int tile_idx;
// Per-worker copy of the statistic; folded back into the encoder's value with
// a max once the worker has finished.
unsigned int max_mv_magnitude;
// The size of interp_filter_selected in VP9_COMP is actually
// MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do
// is increment the very first index (index 0) for the first dimension. Hence
// this is sufficient.
int interp_filter_selected[1][SWITCHABLE];
// Worker-private copy of the encoder's MACROBLOCKD.
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
} VP9BitstreamWorkerData;
int vp9_get_refresh_mask(VP9_COMP *cpi);
void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {

View File

@ -2030,7 +2030,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
vpx_free(cpi->tile_thr_data);
vpx_free(cpi->workers);
if (cpi->num_workers > 1) vp9_loop_filter_dealloc(&cpi->lf_row_sync);
if (cpi->num_workers > 1) {
vp9_loop_filter_dealloc(&cpi->lf_row_sync);
vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
}
vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);

View File

@ -601,6 +601,7 @@ typedef struct VP9_COMP {
VPxWorker *workers;
struct EncWorkerData *tile_thr_data;
VP9LfSync lf_row_sync;
struct VP9BitstreamWorkerData *vp9_bitstream_worker_data;
int keep_level_stats;
Vp9LevelInfo level_info;