Multi-threading of first pass stats collection

(yunqingwang)
1. Rebased the patch. Incorporated recent first pass changes.
2. Turned on the first pass unit test.

Change-Id: Ia2f7ba8152d0b6dd6bf8efb9dfaf505ba7d8edee
This commit is contained in:
Ranjit Kumar Tulabandu
2016-12-27 18:45:43 +05:30
committed by Yunqing Wang
parent 91aa1fae2a
commit 8b0c11c358
16 changed files with 1535 additions and 554 deletions

View File

@@ -82,9 +82,8 @@ class VPxFirstPassEncoderThreadTest
encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
// For now, new_mt_mode only works for 2-pass encoding.
// Enable this once the fp mt patch is checked in.
// if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
// encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_);
if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_);
encoder_initialized_ = true;
}
@@ -131,7 +130,7 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats) {
for (j = 0; j < kDbl; ++j) {
EXPECT_LE(fabs(*frame_stats1 - *frame_stats2),
fabs(*frame_stats1) / 1000.0);
fabs(*frame_stats1) / 10000.0);
frame_stats1++;
frame_stats2++;
}
@@ -146,7 +145,7 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats) {
}
TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 50);
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
first_pass_only_ = 1;
cfg_.rc_target_bitrate = 1000;

View File

@@ -131,6 +131,10 @@ struct macroblock {
int use_lp32x32fdct;
int skip_encode;
// In first pass, intra prediction is done based on source pixels
// at tile boundaries
int fp_src_pred;
// use fast quantization process
int quant_fp;

View File

@@ -773,9 +773,10 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
}
}
vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride, dst,
dst_stride, col, row, plane);
vp9_predict_intra_block(
xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst,
(x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst,
dst_stride, col, row, plane);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

View File

@@ -50,6 +50,7 @@
#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_mbgraph.h"
#include "vp9/encoder/vp9_multi_thread.h"
#include "vp9/encoder/vp9_noise_estimate.h"
#include "vp9/encoder/vp9_picklpf.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -1563,6 +1564,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
#endif
// Enable multi-threading for first pass.
cpi->new_mt = 0;
if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) &&
cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
cpi->oxcf.new_mt)
cpi->new_mt = 1;
}
#ifndef M_LOG2_E
@@ -1719,6 +1727,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
}
#endif
#if ENABLE_MT_BIT_MATCH
CHECK_MEM_ERROR(
cm, cpi->twopass.fp_mb_float_stats,
vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
#endif
cpi->refresh_alt_ref_frame = 0;
cpi->multi_arf_last_grp_enabled = 0;
@@ -2076,6 +2090,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
vpx_free(cpi->tile_thr_data);
vpx_free(cpi->workers);
vp9_row_mt_mem_dealloc(cpi);
if (cpi->num_workers > 1) {
vp9_loop_filter_dealloc(&cpi->lf_row_sync);
@@ -2098,6 +2113,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
#endif
#if ENABLE_MT_BIT_MATCH
vpx_free(cpi->twopass.fp_mb_float_stats);
cpi->twopass.fp_mb_float_stats = NULL;
#endif
vp9_remove_common(cm);
vp9_free_ref_frame_buffers(cm->buffer_pool);
#if CONFIG_VP9_POSTPROC
@@ -4802,6 +4822,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
}
cpi->td.mb.fp_src_pred = 0;
if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
const int lossless = is_lossless_requested(oxcf);
#if CONFIG_VP9_HIGHBITDEPTH

View File

@@ -33,7 +33,9 @@
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_job_queue.h"
#include "vp9/encoder/vp9_lookahead.h"
#include "vp9/encoder/vp9_mbgraph.h"
#include "vp9/encoder/vp9_mcomp.h"
@@ -256,6 +258,8 @@ typedef struct VP9EncoderConfig {
int render_width;
int render_height;
VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
int new_mt;
} VP9EncoderConfig;
static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -269,8 +273,34 @@ typedef struct TileDataEnc {
int mode_map[BLOCK_SIZES][MAX_MODES];
int m_search_count;
int ex_search_count;
FIRSTPASS_DATA fp_data;
VP9RowMTSync row_mt_sync;
} TileDataEnc;
// Per-tile-column job queue state. MultiThreadHandle holds one of these for
// each tile column (row_mt_info[]).
typedef struct RowMTInfo {
// Handle to this tile column's job list: next pending job and the count of
// jobs already handed out.
JobQueueHandle job_queue_hdl;
#if CONFIG_MULTITHREAD
// Locked by vp9_enc_grp_get_next_job() and vp9_get_job_queue_status() while
// they access job_queue_hdl.
pthread_mutex_t job_mutex;
#endif
} RowMTInfo;
// Encoder-wide context for row based multi-threading (job queue plus the
// bookkeeping needed to decide when it must be reallocated).
typedef struct MultiThreadHandle {
// Dimensions recorded by vp9_row_mt_mem_alloc(); vp9_encode_fp_row_mt()
// compares them with the current frame to decide whether to reallocate.
int allocated_tile_rows;
int allocated_tile_cols;
int allocated_vert_unit_rows;
// Frame level params
// Number of vertical units in each tile row, computed in
// vp9_row_mt_mem_alloc() with a shift of MI_BLOCK_SIZE_LOG2.
int num_tile_vert_sbs[MAX_NUM_TILE_ROWS];
// Job Queue structure and handles
// Single buffer holding the job nodes of all tile columns.
JobQueue *job_queue;
// Jobs queued per tile column for the current frame; set by
// vp9_prepare_job_queue().
int jobs_per_tile_col;
// Indexed by tile column.
RowMTInfo row_mt_info[MAX_NUM_TILE_COLS];
int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles
} MultiThreadHandle;
typedef struct RD_COUNTS {
vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
int64_t comp_pred_diff[REFERENCE_MODES];
@@ -629,6 +659,10 @@ typedef struct VP9_COMP {
int keep_level_stats;
Vp9LevelInfo level_info;
MultiThreadHandle multi_thread_ctxt;
void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int);
void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int);
int new_mt;
// Previous Partition Info
BLOCK_SIZE *prev_partition;
@@ -808,6 +842,18 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
}
// Returns how many vertical units of (1 << shift) mi rows are needed to cover
// the tile's mi row range, rounding up.
static INLINE int get_num_vert_units(TileInfo tile, int shift) {
  const int mi_rows_in_tile = tile.mi_row_end - tile.mi_row_start;
  const int round_up = (1 << shift) - 1;
  return (mi_rows_in_tile + round_up) >> shift;
}
// Returns how many units of (1 << shift) mi columns are needed to cover the
// tile's mi column range, rounding up.
static INLINE int get_num_cols(TileInfo tile, int shift) {
  const int mi_cols_in_tile = tile.mi_col_end - tile.mi_col_start;
  const int round_up = (1 << shift) - 1;
  return (mi_cols_in_tile + round_up) >> shift;
}
static INLINE int get_level_index(VP9_LEVEL level) {
int i;
for (i = 0; i < VP9_LEVELS; ++i) {

View File

@@ -11,6 +11,8 @@
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_multi_thread.h"
#include "vpx_dsp/vpx_dsp_common.h"
static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
@@ -64,15 +66,11 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
return (1 << log2_tile_cols);
}
void vp9_encode_tiles_mt(VP9_COMP *cpi) {
static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
int i;
vp9_init_tile_data(cpi);
// Only run once to create threads and allocate thread data.
if (cpi->num_workers == 0) {
int allocated_workers = num_workers;
@@ -123,19 +121,57 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
thread_data->cpi = cpi;
thread_data->td = &cpi->td;
}
winterface->sync(worker);
}
}
}
// Configures the first num_workers workers with the given hook and data2
// argument, starts them, and waits for all of them to finish.
//   hook:  function each worker runs.
//   data2: second argument handed to the hook; data1 is always the worker's
//          own EncWorkerData entry in cpi->tile_thr_data.
static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2,
int num_workers) {
const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
int i;
// NOTE(review): in this loop the hook and data2 are each assigned twice and
// thread_data is set but never read. This looks like pre-change lines shown
// alongside their replacements; only the later assignments (hook, data2)
// take effect. Confirm against the actual source tree.
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
EncWorkerData *thread_data;
worker->hook = (VPxWorkerHook)enc_worker_hook;
worker->hook = (VPxWorkerHook)hook;
worker->data1 = &cpi->tile_thr_data[i];
worker->data2 = NULL;
thread_data = (EncWorkerData *)worker->data1;
worker->data2 = data2;
}
// Encode a frame
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
// Set the starting tile for each thread.
thread_data->start = i;
// The worker at index cpi->num_workers - 1 is run through execute(); every
// other worker is started through launch().
if (i == cpi->num_workers - 1)
winterface->execute(worker);
else
winterface->launch(worker);
}
// Encoding ends.
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
winterface->sync(worker);
}
}
void vp9_encode_tiles_mt(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
int i;
vp9_init_tile_data(cpi);
create_enc_workers(cpi, num_workers);
for (i = 0; i < num_workers; i++) {
EncWorkerData *thread_data;
thread_data = &cpi->tile_thr_data[i];
// Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
@@ -165,25 +201,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
}
}
// Encode a frame
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
// Set the starting tile for each thread.
thread_data->start = i;
if (i == cpi->num_workers - 1)
winterface->execute(worker);
else
winterface->launch(worker);
}
// Encoding ends.
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
winterface->sync(worker);
}
launch_enc_workers(cpi, (VPxWorkerHook)enc_worker_hook, NULL, num_workers);
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
@@ -196,3 +214,253 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
}
}
}
// Adds the first pass statistics of tile_data_t into tile_data. All counters
// and error sums are summed; image_data_start_row becomes the smaller of the
// two rows, except that INVALID_ROW on one side yields the other side's value.
static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
                                    TileDataEnc *tile_data_t) {
  FIRSTPASS_DATA *const acc = &tile_data->fp_data;
  const FIRSTPASS_DATA *const add = &tile_data_t->fp_data;
  const int lower_row =
      VPXMIN(acc->image_data_start_row, add->image_data_start_row);
  const int higher_row =
      VPXMAX(acc->image_data_start_row, add->image_data_start_row);

  acc->intra_factor += add->intra_factor;
  acc->brightness_factor += add->brightness_factor;
  acc->coded_error += add->coded_error;
  acc->sr_coded_error += add->sr_coded_error;
  acc->frame_noise_energy += add->frame_noise_energy;
  acc->intra_error += add->intra_error;
  acc->intercount += add->intercount;
  acc->second_ref_count += add->second_ref_count;
  acc->neutral_count += add->neutral_count;
  acc->intra_skip_count += add->intra_skip_count;
  acc->mvcount += add->mvcount;
  acc->sum_mvr += add->sum_mvr;
  acc->sum_mvr_abs += add->sum_mvr_abs;
  acc->sum_mvc += add->sum_mvc;
  acc->sum_mvc_abs += add->sum_mvc_abs;
  acc->sum_mvrs += add->sum_mvrs;
  acc->sum_mvcs += add->sum_mvcs;
  acc->sum_in_vectors += add->sum_in_vectors;
  acc->intra_smooth_count += add->intra_smooth_count;

  if (lower_row == INVALID_ROW) {
    acc->image_data_start_row = higher_row;
  } else {
    acc->image_data_start_row = lower_row;
  }
}
// Allocates the row synchronization state for `rows` rows: one mutex and one
// condition variable per row in multi-threaded builds, plus one column
// progress entry per row. Each allocation goes through CHECK_MEM_ERROR with
// cm. The sync range is set to 1.
void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
                               int rows) {
  row_mt_sync->rows = rows;

#if CONFIG_MULTITHREAD
  {
    int row;

    CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
                    vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows));
    for (row = 0; row_mt_sync->mutex_ != NULL && row < rows; ++row) {
      pthread_mutex_init(&row_mt_sync->mutex_[row], NULL);
    }

    CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
                    vpx_malloc(sizeof(*row_mt_sync->cond_) * rows));
    for (row = 0; row_mt_sync->cond_ != NULL && row < rows; ++row) {
      pthread_cond_init(&row_mt_sync->cond_[row], NULL);
    }
  }
#endif  // CONFIG_MULTITHREAD

  CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
                  vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));

  // Set up nsync.
  row_mt_sync->sync_range = 1;
}
// Releases everything vp9_row_mt_sync_mem_alloc() set up: destroys and frees
// the per-row mutexes and condition variables (multi-threaded builds), frees
// the column progress array, and zeroes the structure. A NULL row_mt_sync is
// ignored.
void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) {
  if (row_mt_sync == NULL) return;

#if CONFIG_MULTITHREAD
  {
    int row;

    if (row_mt_sync->mutex_ != NULL) {
      for (row = 0; row < row_mt_sync->rows; ++row) {
        pthread_mutex_destroy(&row_mt_sync->mutex_[row]);
      }
      vpx_free(row_mt_sync->mutex_);
    }

    if (row_mt_sync->cond_ != NULL) {
      for (row = 0; row < row_mt_sync->rows; ++row) {
        pthread_cond_destroy(&row_mt_sync->cond_[row]);
      }
      vpx_free(row_mt_sync->cond_);
    }
  }
#endif  // CONFIG_MULTITHREAD

  vpx_free(row_mt_sync->cur_col);

  // Zero the structure: this call may come from a dynamic change in tiles,
  // in which case it is followed by an _alloc() that may fail.
  vp9_zero(*row_mt_sync);
}
// Blocks until row r - 1 has published enough column progress for column c of
// row r to proceed. Row 0 never waits, and only columns that are a multiple
// of the sync range are checked. A no-op without CONFIG_MULTITHREAD.
void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) {
#if CONFIG_MULTITHREAD
  const int sync_step = row_mt_sync->sync_range;
  const int on_sync_col = (c & (sync_step - 1)) == 0;

  if (r != 0 && on_sync_col) {
    const int above = r - 1;
    pthread_mutex_t *const above_lock = &row_mt_sync->mutex_[above];

    pthread_mutex_lock(above_lock);
    // Wait while the row above is not yet sync_step columns ahead of c.
    while (row_mt_sync->cur_col[above] - sync_step < c) {
      pthread_cond_wait(&row_mt_sync->cond_[above], above_lock);
    }
    pthread_mutex_unlock(above_lock);
  }
#else
  (void)c;
  (void)r;
  (void)row_mt_sync;
#endif  // CONFIG_MULTITHREAD
}
// No-op with the same signature as vp9_row_mt_sync_read(); all arguments are
// ignored.
void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) {
  (void)c;
  (void)r;
  (void)row_mt_sync;
}
// Publishes the progress of row r after column c (of cols columns) is done and
// signals the row's condition variable. Before the last column, progress is
// only published on columns that are a multiple of the sync range; on the
// last column the published value is cols + sync range. A no-op without
// CONFIG_MULTITHREAD.
void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
                           const int cols) {
#if CONFIG_MULTITHREAD
  const int sync_step = row_mt_sync->sync_range;
  const int is_last_col = c >= cols - 1;
  // Only signal when there are enough finished blocks for the next row to run.
  const int publish = is_last_col || (c % sync_step) == 0;
  const int progress = is_last_col ? cols + sync_step : c;

  if (publish) {
    pthread_mutex_t *const row_lock = &row_mt_sync->mutex_[r];

    pthread_mutex_lock(row_lock);
    row_mt_sync->cur_col[r] = progress;
    pthread_cond_signal(&row_mt_sync->cond_[r]);
    pthread_mutex_unlock(row_lock);
  }
#else
  (void)cols;
  (void)c;
  (void)r;
  (void)row_mt_sync;
#endif  // CONFIG_MULTITHREAD
}
// No-op with the same signature as vp9_row_mt_sync_write(); all arguments are
// ignored.
void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
                                 const int cols) {
  (void)cols;
  (void)c;
  (void)r;
  (void)row_mt_sync;
}
// Worker entry point for the multi-threaded first pass. Repeatedly takes a
// job (one MB row of one tile) from the queue of the thread's current tile and
// runs first pass encoding on it. When that queue is empty, the thread is
// moved to another tile that still has jobs; the loop ends once
// vp9_get_tiles_proc_status() reports that no tile has work left.
// Always returns 0.
static int first_pass_worker_hook(EncWorkerData *const thread_data,
                                  MultiThreadHandle *multi_thread_ctxt) {
  VP9_COMP *const cpi = thread_data->cpi;
  const VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const MV zero_mv = { 0, 0 };
  int cur_tile_id =
      multi_thread_ctxt->thread_id_to_tile_id[thread_data->thread_id];
  int frame_done = 0;

  while (!frame_done) {
    // Get the next job in the queue
    JobNode *const job =
        (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);

    if (job == NULL) {
      // Query for the status of other tiles
      frame_done = vp9_get_tiles_proc_status(
          multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
          tile_cols);
      continue;
    }

    {
      TileDataEnc *const this_tile =
          &cpi->tile_data[job->tile_row_id * tile_cols + job->tile_col_id];
      FIRSTPASS_DATA fp_acc_data;
      MV best_ref_mv = zero_mv;

      vp9_zero(fp_acc_data);
      fp_acc_data.image_data_start_row = INVALID_ROW;
      vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data,
                                        this_tile, &best_ref_mv,
                                        job->vert_unit_row_num);
    }
  }
  return 0;
}
// Runs the first pass of one frame with row based multi-threading:
// (re)allocates the row-MT buffers when the frame needs more tile columns,
// tile rows or MB rows than are currently allocated, sets up workers and the
// job queue, runs first_pass_worker_hook on every worker, and finally sums
// the stats of tile columns 1..tile_cols-1 into tile 0.
void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  MultiThreadHandle *const mt_ctxt = &cpi->multi_thread_ctxt;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int tile_rows = 1 << cm->log2_tile_rows;
  const int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
  const int must_realloc = mt_ctxt->allocated_tile_cols < tile_cols ||
                           mt_ctxt->allocated_tile_rows < tile_rows ||
                           mt_ctxt->allocated_vert_unit_rows < cm->mb_rows;
  int idx;

  // Tile data is initialized between releasing and re-creating the row-MT
  // buffers.
  if (must_realloc) vp9_row_mt_mem_dealloc(cpi);
  vp9_init_tile_data(cpi);
  if (must_realloc) vp9_row_mt_mem_alloc(cpi);

  create_enc_workers(cpi, num_workers);

  vp9_assign_tile_to_thread(mt_ctxt, tile_cols, cpi->num_workers);
  vp9_prepare_job_queue(cpi, FIRST_PASS_JOB);
  vp9_multi_thread_tile_init(cpi);

  // Before encoding a frame, copy the thread data from cpi.
  for (idx = 0; idx < num_workers; ++idx) {
    ThreadData *const worker_td = cpi->tile_thr_data[idx].td;
    if (worker_td != &cpi->td) worker_td->mb = cpi->td.mb;
  }

  launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook, mt_ctxt,
                     num_workers);

  // Fold the stats of the remaining tile columns into the first one.
  for (idx = 1; idx < tile_cols; ++idx) {
    accumulate_fp_tile_stat(&cpi->tile_data[0], &cpi->tile_data[idx]);
  }
}

View File

@@ -15,6 +15,10 @@
extern "C" {
#endif
#define MAX_NUM_TILE_COLS (1 << 6)
#define MAX_NUM_TILE_ROWS 4
#define MAX_NUM_THREADS 80
struct VP9_COMP;
struct ThreadData;
@@ -22,10 +26,41 @@ typedef struct EncWorkerData {
struct VP9_COMP *cpi;
struct ThreadData *td;
int start;
int thread_id;
int tile_completion_status[MAX_NUM_TILE_COLS];
} EncWorkerData;
// Encoder row synchronization
// State shared between workers so that a row can wait for the row above it
// (see vp9_row_mt_sync_read() / vp9_row_mt_sync_write()).
typedef struct VP9RowMTSyncData {
#if CONFIG_MULTITHREAD
// Arrays with one entry per row, allocated by vp9_row_mt_sync_mem_alloc().
// mutex_[r] is held while cur_col[r] is read or written; cond_[r] is
// signalled when cur_col[r] is updated.
pthread_mutex_t *mutex_;
pthread_cond_t *cond_;
#endif
// Allocate memory to store the sb/mb block index in each row.
int *cur_col;
// Column granularity at which progress is published and checked; set to 1 by
// vp9_row_mt_sync_mem_alloc().
int sync_range;
// Number of rows the arrays above were allocated for.
int rows;
} VP9RowMTSync;
void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
void vp9_encode_fp_row_mt(struct VP9_COMP *cpi);
void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c);
void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
const int cols);
void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c);
void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
const int cols);
// Allocate memory for row based multi-threading synchronization.
void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm,
int rows);
// Deallocate row based multi-threading synchronization related mutex and data.
void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync);
#ifdef __cplusplus
} // extern "C"
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -39,6 +39,40 @@ typedef struct {
} FIRSTPASS_MB_STATS;
#endif
#define INVALID_ROW -1
#define ENABLE_MT_BIT_MATCH 0
#if ENABLE_MT_BIT_MATCH
// Floating point first pass stats kept per macroblock when
// ENABLE_MT_BIT_MATCH is set; vp9_create_compressor() allocates cm->MBs
// entries of this type for twopass.fp_mb_float_stats.
typedef struct {
double frame_mb_intra_factor;
double frame_mb_brightness_factor;
double frame_mb_neutral_count;
} FP_MB_FLOAT_STATS;
#endif
// First pass statistics accumulator. One instance lives in each TileDataEnc
// (fp_data); the per-tile instances are combined field by field in
// accumulate_fp_tile_stat().
typedef struct {
double intra_factor;
double brightness_factor;
int64_t coded_error;
int64_t sr_coded_error;
int64_t frame_noise_energy;
int64_t intra_error;
int intercount;
int second_ref_count;
double neutral_count;
int intra_skip_count;
// INVALID_ROW until a row has been recorded; when tiles are combined the
// smaller valid row is kept.
int image_data_start_row;
int mvcount;
int sum_mvr;
int sum_mvr_abs;
int sum_mvc;
int sum_mvc_abs;
int64_t sum_mvrs;
int64_t sum_mvcs;
int sum_in_vectors;
int intra_smooth_count;
} FIRSTPASS_DATA;
typedef struct {
double frame;
double weight;
@@ -114,6 +148,11 @@ typedef struct {
uint8_t *this_frame_mb_stats;
FIRSTPASS_MB_STATS firstpass_mb_stats;
#endif
#if ENABLE_MT_BIT_MATCH
FP_MB_FLOAT_STATS *fp_mb_float_stats;
#endif
// An indication of the content type of the current frame
FRAME_CONTENT_TYPE fr_content_type;
@@ -141,12 +180,20 @@ typedef struct {
} TWO_PASS;
struct VP9_COMP;
struct ThreadData;
struct TileDataEnc;
void vp9_init_first_pass(struct VP9_COMP *cpi);
void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
void vp9_end_first_pass(struct VP9_COMP *cpi);
void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi,
struct ThreadData *td,
FIRSTPASS_DATA *fp_acc_data,
struct TileDataEnc *tile_data,
MV *best_ref_mv, int mb_row);
void vp9_init_second_pass(struct VP9_COMP *cpi);
void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
void vp9_twopass_postencode_update(struct VP9_COMP *cpi);

View File

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_
#define VP9_ENCODER_VP9_JOB_QUEUE_H_
// Kind of work the job queue is prepared for (see vp9_prepare_job_queue()).
// ENCODE_JOB queues one job per superblock row and advances the tile row as
// jobs are laid out; every other type queues one job per macroblock row.
typedef enum {
FIRST_PASS_JOB,
ENCODE_JOB,
ARNR_JOB,
NUM_JOB_TYPES,
} JOB_TYPE;
// Encode job parameters
typedef struct {
int vert_unit_row_num; // Index of the vertical unit row
int tile_col_id; // Index of the tile column this job belongs to
int tile_row_id; // Index of the tile row this job belongs to
} JobNode;
// Job queue element parameters
// One node of a singly linked job list; the nodes of all tile columns are
// stored in one array (MultiThreadHandle::job_queue).
typedef struct {
// Pointer to the next link in the job queue
// (NULL on the last job of a tile column).
void *next;
// Job information context of the module
JobNode job_info;
} JobQueue;
// Job queue handle
// Head of one tile column's job list.
typedef struct {
// Pointer to the next link in the job queue
// (the next job to hand out, or NULL when the list is exhausted).
void *next;
// Counter to store the number of jobs picked up for processing
int num_jobs_acquired;
} JobQueueHandle;
#endif // VP9_ENCODER_VP9_JOB_QUEUE_H_

View File

@@ -0,0 +1,282 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_multi_thread.h"
// Removes the next pending job from the queue of tile column tile_id and
// returns a pointer to its JobNode, or NULL when that queue is empty. The
// queue is accessed under the tile's job mutex in multi-threaded builds.
void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
                               int tile_id) {
  RowMTInfo *const tile_info = &multi_thread_ctxt->row_mt_info[tile_id];
  JobQueueHandle *const queue = &tile_info->job_queue_hdl;
  JobNode *claimed = NULL;
  JobQueue *head;

  // lock the mutex for queue access
#if CONFIG_MULTITHREAD
  pthread_mutex_lock(&tile_info->job_mutex);
#endif

  head = (JobQueue *)queue->next;
  if (head != NULL) {
    claimed = &head->job_info;
    // Update the next job in the queue
    queue->next = head->next;
    queue->num_jobs_acquired++;
  }

#if CONFIG_MULTITHREAD
  pthread_mutex_unlock(&tile_info->job_mutex);
#endif

  return claimed;
}
// Allocates the buffers used for row based multi-threading, sized for the
// current frame configuration:
//  - one job queue buffer with max(mb_rows, sb_rows) nodes per tile column,
//  - one job mutex per tile column (multi-threaded builds),
//  - row synchronization state for every tile column of tile row 0, which the
//    tiles of the other tile rows then share by copy.
// Also records the allocated dimensions and the number of vertical units in
// each tile row in cpi->multi_thread_ctxt.
void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
  struct VP9Common *cm = &cpi->common;
  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
  int tile_row, tile_col;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int tile_rows = 1 << cm->log2_tile_rows;
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
  int jobs_per_tile_col, total_jobs;

  // Large enough for either job granularity (MB rows or SB rows).
  jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows);
  // Calculate the total number of jobs
  total_jobs = jobs_per_tile_col * tile_cols;

  multi_thread_ctxt->allocated_tile_cols = tile_cols;
  multi_thread_ctxt->allocated_tile_rows = tile_rows;
  multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;

  // Check the allocation the same way as the other row-MT buffers, instead of
  // leaving a NULL job queue for vp9_prepare_job_queue() to write through.
  CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue,
                  (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));

#if CONFIG_MULTITHREAD
  // Create mutex for each tile
  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
    pthread_mutex_init(&row_mt_info->job_mutex, NULL);
  }
#endif

  // Allocate memory for row based multi-threading
  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
    vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col);
  }

  // Assign the sync pointer of tile row zero for every tile row > 0
  for (tile_row = 1; tile_row < tile_rows; tile_row++) {
    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
      TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
      TileDataEnc *this_col_tile = &cpi->tile_data[tile_col];
      this_tile->row_mt_sync = this_col_tile->row_mt_sync;
    }
  }

  // Calculate the number of vertical units in the given tile row
  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
    TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols];
    TileInfo *tile_info = &this_tile->tile_info;
    multi_thread_ctxt->num_tile_vert_sbs[tile_row] =
        get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
  }
}
// Releases what vp9_row_mt_mem_alloc() set up: the job queue buffer, the
// per-tile-column job mutexes (multi-threaded builds) and the row
// synchronization state of every allocated tile column. Afterwards the
// context records that nothing is allocated, so calling this again (it is
// called both on reallocation and from vp9_remove_compressor()) does not free
// the queue or destroy the mutexes a second time.
void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
  int tile_col;

  // Deallocate memory for job queue
  if (multi_thread_ctxt->job_queue) {
    vpx_free(multi_thread_ctxt->job_queue);
    multi_thread_ctxt->job_queue = NULL;
  }

#if CONFIG_MULTITHREAD
  // Destroy mutex for each tile
  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
       tile_col++) {
    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
    pthread_mutex_destroy(&row_mt_info->job_mutex);
  }
#endif

  // Free row based multi-threading sync memory
  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
       tile_col++) {
    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
    vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
  }

  // Nothing is allocated any more; this also forces vp9_encode_fp_row_mt() to
  // allocate again before the next use.
  multi_thread_ctxt->allocated_tile_cols = 0;
  multi_thread_ctxt->allocated_tile_rows = 0;
  multi_thread_ctxt->allocated_vert_unit_rows = 0;
}
// Resets the per-frame row-MT state of every tile column in tile row 0: the
// column progress of each row is set to -1 and the first pass accumulator is
// cleared with image_data_start_row marked INVALID_ROW. In pass 1 the number
// of rows reset is cm->mb_rows, otherwise the number of superblock rows.
void vp9_multi_thread_tile_init(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
  const int rows_to_reset = (cpi->oxcf.pass == 1) ? cm->mb_rows : sb_rows;
  int tile_col;

  for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
    TileDataEnc *const tile = &cpi->tile_data[tile_col];

    // Initialize cur_col to -1 for all rows.
    memset(tile->row_mt_sync.cur_col, -1,
           sizeof(*tile->row_mt_sync.cur_col) * rows_to_reset);

    vp9_zero(tile->fp_data);
    tile->fp_data.image_data_start_row = INVALID_ROW;
  }
}
// Fills thread_id_to_tile_id for the first num_workers threads, handing out
// tile columns 0, 1, ... in order and starting over at 0 after tile_cols - 1.
void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
                               int tile_cols, int num_workers) {
  int worker;
  int next_tile = 0;

  // Allocating the threads for the tiles
  for (worker = 0; worker < num_workers; ++worker) {
    multi_thread_ctxt->thread_id_to_tile_id[worker] = next_tile;
    next_tile = (next_tile + 1 == tile_cols) ? 0 : next_tile + 1;
  }
}
// Returns how many jobs of tile column cur_tile_id have not been handed out
// yet (jobs_per_tile_col minus the jobs already acquired). The counter is
// read under the tile's job mutex in multi-threaded builds.
int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
                             int cur_tile_id) {
  RowMTInfo *const tile_info = &multi_thread_ctxt->row_mt_info[cur_tile_id];
  const JobQueueHandle *const queue = &tile_info->job_queue_hdl;
  int jobs_left;

#if CONFIG_MULTITHREAD
  pthread_mutex_lock(&tile_info->job_mutex);
#endif
  jobs_left = multi_thread_ctxt->jobs_per_tile_col - queue->num_jobs_acquired;
#if CONFIG_MULTITHREAD
  pthread_mutex_unlock(&tile_info->job_mutex);
#endif

  return jobs_left;
}
// Builds the per-tile-column job lists for one frame inside the shared
// job_queue buffer and resets the per-thread bookkeeping.
//   job_type: ENCODE_JOB queues one job per superblock row; any other type
//             queues one job per macroblock row (cm->mb_rows).
// Each tile column gets a contiguous run of jobs_per_tile_col nodes linked in
// row order, with the last node's next pointer set to NULL.
void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) {
VP9_COMMON *const cm = &cpi->common;
MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
JobQueue *job_queue = multi_thread_ctxt->job_queue;
const int tile_cols = 1 << cm->log2_tile_cols;
int job_row_num, jobs_per_tile, jobs_per_tile_col, total_jobs;
const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
int tile_col, i;
jobs_per_tile_col = (job_type != ENCODE_JOB) ? cm->mb_rows : sb_rows;
total_jobs = jobs_per_tile_col * tile_cols;
multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col;
// memset the entire job queue buffer to zero
memset(job_queue, 0, total_jobs * sizeof(JobQueue));
// Job queue preparation
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col];
JobQueue *job_queue_curr, *job_queue_temp;
int tile_row = 0;
// The handle starts at the first node of this tile column's run, with no
// jobs handed out yet.
tile_ctxt->job_queue_hdl.next = (void *)job_queue;
tile_ctxt->job_queue_hdl.num_jobs_acquired = 0;
job_queue_curr = job_queue;
job_queue_temp = job_queue;
// loop over all the vertical rows
for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col;
job_row_num++, jobs_per_tile++) {
job_queue_curr->job_info.vert_unit_row_num = job_row_num;
job_queue_curr->job_info.tile_col_id = tile_col;
job_queue_curr->job_info.tile_row_id = tile_row;
// Link to the following node in the buffer, then step to it.
job_queue_curr->next = (void *)(job_queue_temp + 1);
job_queue_curr = ++job_queue_temp;
if (ENCODE_JOB == job_type) {
// Once the jobs of the current tile row are all queued, move on to the
// next tile row. jobs_per_tile is set to -1 so that the loop increment
// brings it back to 0 for the new tile row.
if (jobs_per_tile >=
multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) {
tile_row++;
jobs_per_tile = -1;
}
}
}
// Set the last pointer to NULL
job_queue_curr += -1;
job_queue_curr->next = (void *)NULL;
// Move to the next tile
job_queue += jobs_per_tile_col;
}
// Give every worker its thread id and clear its view of which tile columns
// are finished.
for (i = 0; i < cpi->num_workers; i++) {
EncWorkerData *thread_data;
thread_data = &cpi->tile_thr_data[i];
thread_data->thread_id = i;
for (tile_col = 0; tile_col < tile_cols; tile_col++)
thread_data->tile_completion_status[tile_col] = 0;
}
}
// Called by a worker whose current tile column has run out of jobs. Marks
// that column complete in tile_completion_status, then looks through the
// columns not yet marked complete for the one with the most jobs remaining.
// Returns 1 when no column has jobs left (the frame is done for this worker);
// otherwise stores the chosen column in *cur_tile_id and returns 0.
int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
                              int *tile_completion_status, int *cur_tile_id,
                              int tile_cols) {
  int busiest_tile = -1;  // Tile column with the most jobs still pending
  int most_jobs_left = 0;
  int tile_col;

  // Mark the completion to avoid check in the loop
  tile_completion_status[*cur_tile_id] = 1;

  // Check for the status of all the tiles
  for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
    int jobs_left;

    if (tile_completion_status[tile_col] != 0) continue;

    jobs_left = vp9_get_job_queue_status(multi_thread_ctxt, tile_col);
    // Mark the completion to avoid checks during future switches across tiles
    if (jobs_left == 0) tile_completion_status[tile_col] = 1;
    if (jobs_left > most_jobs_left) {
      most_jobs_left = jobs_left;
      busiest_tile = tile_col;
    }
  }

  if (busiest_tile == -1) return 1;

  // Update the cur ID to the next tile ID that will be processed,
  // which will be the least processed tile
  *cur_tile_id = busiest_tile;
  return 0;
}

View File

@@ -0,0 +1,38 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H
#define VP9_ENCODER_VP9_MULTI_THREAD_H
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_job_queue.h"
// Takes the next pending job of tile column tile_id; returns a pointer to its
// JobNode or NULL when the column's queue is empty.
void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
int tile_id);
// Builds the per-tile-column job lists for the given job type and resets the
// per-thread ids and completion flags.
void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type);
// Returns the number of jobs of tile column cur_tile_id not yet handed out.
int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
int cur_tile_id);
// Assigns a starting tile column to each of the first num_workers threads,
// cycling through the tile columns in order.
void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
int tile_cols, int num_workers);
// Resets the per-frame row sync progress and first pass accumulators of the
// tile columns.
void vp9_multi_thread_tile_init(VP9_COMP *cpi);
// Allocate / release the job queue, job mutexes and row sync state.
void vp9_row_mt_mem_alloc(VP9_COMP *cpi);
void vp9_row_mt_mem_dealloc(VP9_COMP *cpi);
// Marks *cur_tile_id complete and picks the tile column with the most jobs
// remaining. Returns 1 when no column has jobs left, otherwise updates
// *cur_tile_id and returns 0.
int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
int *tile_completion_status, int *cur_tile_id,
int tile_cols);
#endif // VP9_ENCODER_VP9_MULTI_THREAD_H

View File

@@ -51,6 +51,7 @@ struct vp9_extracfg {
vpx_color_range_t color_range;
int render_width;
int render_height;
unsigned int new_mt;
};
static struct vp9_extracfg default_extra_cfg = {
@@ -82,6 +83,7 @@ static struct vp9_extracfg default_extra_cfg = {
0, // color range
0, // render width
0, // render height
1, // new_mt
};
struct vpx_codec_alg_priv {
@@ -245,6 +247,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
"kf_min_dist not supported in auto mode, use 0 "
"or kf_max_dist instead.");
RANGE_CHECK(extra_cfg, new_mt, 0, 1);
RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
@@ -554,6 +557,8 @@ static vpx_codec_err_t set_encoder_config(
oxcf->target_level = extra_cfg->target_level;
oxcf->new_mt = extra_cfg->new_mt;
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
#if CONFIG_SPATIAL_SVC
oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
@@ -842,6 +847,13 @@ static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
// Control handler for VP9E_SET_NEW_MT: stores the requested value in a copy
// of the extra configuration and applies it through update_extra_cfg(),
// whose result is returned.
static vpx_codec_err_t ctrl_set_new_mt(vpx_codec_alg_priv_t *ctx,
                                       va_list args) {
  struct vp9_extracfg updated_cfg = ctx->extra_cfg;

  updated_cfg.new_mt = CAST(VP9E_SET_NEW_MT, args);
  return update_extra_cfg(ctx, &updated_cfg);
}
static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) {
int *const arg = va_arg(args, int *);
if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
@@ -1594,6 +1606,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
{ VP9E_SET_RENDER_SIZE, ctrl_set_render_size },
{ VP9E_SET_TARGET_LEVEL, ctrl_set_target_level },
{ VP9E_SET_NEW_MT, ctrl_set_new_mt },
// Getters
{ VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },

View File

@@ -39,9 +39,12 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
VP9_CX_SRCS-yes += encoder/vp9_extend.h
VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
VP9_CX_SRCS-yes += encoder/vp9_job_queue.h
VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c
VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h
VP9_CX_SRCS-yes += encoder/vp9_encoder.h
VP9_CX_SRCS-yes += encoder/vp9_quantize.h
VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h

View File

@@ -547,6 +547,14 @@ enum vp8e_enc_control_id {
*/
VP9E_SET_TARGET_LEVEL,
/*!\brief Codec control function to set row level multi-threading.
*
* 0 : off, 1 : on
*
* Supported in codecs: VP9
*/
VP9E_SET_NEW_MT,
/*!\brief Codec control function to get bitstream level.
*
* Supported in codecs: VP9
@@ -838,6 +846,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
#define VPX_CTRL_VP9E_SET_TARGET_LEVEL
VPX_CTRL_USE_TYPE(VP9E_SET_NEW_MT, unsigned int)
#define VPX_CTRL_VP9E_SET_NEW_MT
VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
#define VPX_CTRL_VP9E_GET_LEVEL

View File

@@ -470,6 +470,9 @@ static const arg_def_t target_level = ARG_DEF(
NULL, "target-level", 1,
"Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
" 11: level 1.1; ... 62: level 6.2)");
// --new-mt=<arg>: command line switch for row based multi-threading.
// NOTE(review): it appears to be paired with VP9E_SET_NEW_MT through the
// matching positions in vp9_args / vp9_arg_ctrl_map; confirm the two tables
// stay in the same order.
static const arg_def_t new_mt =
ARG_DEF(NULL, "new-mt", 1, "Enable row based multi-threading in VP9");
#endif
#if CONFIG_VP9_ENCODER
@@ -498,6 +501,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
&min_gf_interval,
&max_gf_interval,
&target_level,
&new_mt,
#if CONFIG_VP9_HIGHBITDEPTH
&bitdeptharg,
&inbitdeptharg,
@@ -528,6 +532,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
VP9E_SET_MIN_GF_INTERVAL,
VP9E_SET_MAX_GF_INTERVAL,
VP9E_SET_TARGET_LEVEL,
VP9E_SET_NEW_MT,
0 };
#endif