Merge "Speed up of wedge search" into nextgen

Debargha Mukherjee, 2015-10-06 00:33:50 +00:00; committed by Gerrit Code Review
commit 9fc51184b7
3 changed files with 404 additions and 2 deletions
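In outline: the wedge search previously rebuilt the full compound predictor, two motion-compensated interpolations plus a blend, for every candidate wedge. This change builds the two single-reference predictions once into scratch buffers and redoes only the cheap per-pixel blend per candidate. A toy sketch of the restructuring, with illustrative names only (not libvpx code) and 0..64 blend weights mirroring the wedge soft-mask convention:

/* Toy sketch, not libvpx code: hoist the two expensive single-reference
 * predictions out of the per-wedge loop; only the mask blend stays inside. */
#include <stdint.h>
#include <string.h>

#define W 64
#define H 64
#define WEDGE_TYPES 16

/* Stand-in for an expensive motion-compensated interpolation. */
static void build_single_pred(uint8_t *dst, int ref) {
  memset(dst, 128 + 16 * ref, W * H);
}

/* Stand-in soft mask: blend weight in 0..64 for reference 0. */
static int wedge_weight(int wedge, int r, int c) {
  (void) r;
  return (c * WEDGE_TYPES < wedge * W) ? 64 : 0;
}

static void wedge_search_sketch(void) {
  uint8_t p0[W * H], p1[W * H], dst[W * H];
  int wedge, r, c;
  build_single_pred(p0, 0);  /* built once ... */
  build_single_pred(p1, 1);  /* ... not once per candidate wedge */
  for (wedge = 0; wedge < WEDGE_TYPES; ++wedge) {
    for (r = 0; r < H; ++r)
      for (c = 0; c < W; ++c) {
        const int m = wedge_weight(wedge, r, c);
        dst[r * W + c] = (uint8_t)((m * p0[r * W + c] +
                                    (64 - m) * p1[r * W + c] + 32) >> 6);
      }
    /* ...the rate-distortion cost of dst is measured here... */
  }
}

The three hunks below add the buffer-based builders to vp9_reconinter.c, declare them in vp9_reconinter.h, and switch the two wedge-search loops in vp9_rdopt.c over to them.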

vp9/common/vp9_reconinter.c

@@ -1805,3 +1805,375 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
    }
  }
}
#if CONFIG_WEDGE_PARTITION
// Builds the inter-predictor for the single ref case
// for use in the encoder to search the wedges efficiently.
static void build_inter_predictors_single_buf(MACROBLOCKD *xd,
                                              int plane, int block,
                                              int bw, int bh,
                                              int x, int y, int w, int h,
                                              int mi_x, int mi_y, int ref,
                                              uint8_t *const ext_dst,
                                              int ext_dst_stride) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const MODE_INFO *mi = xd->mi[0].src_mi;
#if CONFIG_INTRABC
  const int is_intrabc = is_intrabc_mode(mi->mbmi.mode);
#endif  // CONFIG_INTRABC
  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
#if CONFIG_GLOBAL_MOTION
  Global_Motion_Params *gm;
  int is_global;
  gm = &xd->global_motion[mi->mbmi.ref_frame[ref]][0];
#endif  // CONFIG_GLOBAL_MOTION
#if CONFIG_INTRABC
  assert(!is_intrabc || mi->mbmi.interp_filter == BILINEAR);
#endif  // CONFIG_INTRABC
  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
  struct buf_2d *const dst_buf = &pd->dst;
  struct buf_2d *const pre_buf =
#if CONFIG_INTRABC
      is_intrabc ? dst_buf :
#endif  // CONFIG_INTRABC
      &pd->pre[ref];
#if CONFIG_VP9_HIGHBITDEPTH
  uint8_t *const dst =
      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ?
       CONVERT_TO_BYTEPTR(ext_dst) : ext_dst) + ext_dst_stride * y + x;
#else
  uint8_t *const dst = ext_dst + ext_dst_stride * y + x;
#endif
  const MV mv = mi->mbmi.sb_type < BLOCK_8X8
      ? average_split_mvs(pd, mi, ref, block)
      : mi->mbmi.mv[ref].as_mv;
  // TODO(jkoleszar): This clamping is done in the incorrect place for the
  // scaling case. It needs to be done on the scaled MV, not the pre-scaling
  // MV. Note, however, that it performs the subsampling-aware scaling so
  // that the result is always q4.
  // The MV precision here is MV_PRECISION_Q4.
  const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
                                             pd->subsampling_x,
                                             pd->subsampling_y);
  uint8_t *pre;
  MV32 scaled_mv;
  int xs, ys, subpel_x, subpel_y;
  const int is_scaled = vp9_is_scaled(sf);
  (void) dst_buf;
#if CONFIG_GLOBAL_MOTION
  is_global = (get_y_mode(mi, block) == ZEROMV &&
#if CONFIG_INTRABC
               !is_intrabc &&
#endif
               get_gmtype(gm) == GLOBAL_ROTZOOM);
#endif  // CONFIG_GLOBAL_MOTION
  if (is_scaled) {
#if CONFIG_INTRABC
    assert(!is_intrabc);
#endif  // CONFIG_INTRABC
    pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
    scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
    xs = sf->x_step_q4;
    ys = sf->y_step_q4;
  } else {
    pre = pre_buf->buf + (y * pre_buf->stride + x);
    scaled_mv.row = mv_q4.row;
    scaled_mv.col = mv_q4.col;
    xs = ys = 16;
  }
  subpel_x = scaled_mv.col & SUBPEL_MASK;
  subpel_y = scaled_mv.row & SUBPEL_MASK;
  pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
         + (scaled_mv.col >> SUBPEL_BITS);
#if CONFIG_GLOBAL_MOTION
  if (is_global) {
    vp9_warp_plane(gm, pre_buf->buf0,
                   pre_buf->width, pre_buf->height, pre_buf->stride, dst,
                   (mi_x >> pd->subsampling_x) + x,
                   (mi_y >> pd->subsampling_y) + y, w, h, ext_dst_stride,
                   pd->subsampling_x, pd->subsampling_y, xs, ys);
  } else {
#endif  // CONFIG_GLOBAL_MOTION
#if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      highbd_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
                             subpel_x, subpel_y, sf, w, h, 0, kernel,
                             xs, ys, xd->bd);
    } else {
      inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
                      subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys);
    }
#else
    inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
                    subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys);
#endif  // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_GLOBAL_MOTION
  }
#endif  // CONFIG_GLOBAL_MOTION
}

void vp9_build_inter_predictors_for_planes_single_buf(
    MACROBLOCKD *xd, BLOCK_SIZE bsize,
    int mi_row, int mi_col, int ref,
    uint8_t *ext_dst[3], int ext_dst_stride[3]) {
  const int plane_from = 0;
  const int plane_to = 2;
  int plane;
  const int mi_x = mi_col * MI_SIZE;
  const int mi_y = mi_row * MI_SIZE;
  for (plane = plane_from; plane <= plane_to; ++plane) {
    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
                                                        &xd->plane[plane]);
    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
    const int bw = 4 * num_4x4_w;
    const int bh = 4 * num_4x4_h;
    if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
      int i = 0, x, y;
      assert(bsize == BLOCK_8X8);
      for (y = 0; y < num_4x4_h; ++y)
        for (x = 0; x < num_4x4_w; ++x)
          build_inter_predictors_single_buf(xd, plane, i++, bw, bh,
                                            4 * x, 4 * y, 4, 4,
                                            mi_x, mi_y, ref,
                                            ext_dst[plane],
                                            ext_dst_stride[plane]);
    } else {
      build_inter_predictors_single_buf(xd, plane, 0, bw, bh,
                                        0, 0, bw, bh,
                                        mi_x, mi_y, ref,
                                        ext_dst[plane],
                                        ext_dst_stride[plane]);
    }
  }
}

static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
                                                 int block, int bw, int bh,
                                                 int x, int y, int w, int h,
#if CONFIG_SUPERTX
                                                 int wedge_offset_x,
                                                 int wedge_offset_y,
#endif  // CONFIG_SUPERTX
                                                 int mi_x, int mi_y,
                                                 uint8_t *ext_dst0,
                                                 int ext_dst_stride0,
                                                 uint8_t *ext_dst1,
                                                 int ext_dst_stride1) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const MODE_INFO *mi = xd->mi[0].src_mi;
  const int is_compound = has_second_ref(&mi->mbmi);
#if CONFIG_INTRABC
  const int is_intrabc = is_intrabc_mode(mi->mbmi.mode);
#endif  // CONFIG_INTRABC
  int ref;
#if CONFIG_GLOBAL_MOTION
  Global_Motion_Params *gm[2];
  gm[0] = &xd->global_motion[mi->mbmi.ref_frame[0]][0];
  if (is_compound)
    gm[1] = &xd->global_motion[mi->mbmi.ref_frame[1]][0];
#endif  // CONFIG_GLOBAL_MOTION
#if CONFIG_INTRABC
  assert(!is_intrabc || mi->mbmi.interp_filter == BILINEAR);
#endif  // CONFIG_INTRABC
  (void) block;
  (void) bw;
  (void) bh;
  (void) mi_x;
  (void) mi_y;
  for (ref = 0; ref < 1 + is_compound; ++ref) {
    struct buf_2d *const dst_buf = &pd->dst;
    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
#if CONFIG_GLOBAL_MOTION
    const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
    const int is_scaled = vp9_is_scaled(sf);
    int xs, ys;
    struct buf_2d *const pre_buf =
#if CONFIG_INTRABC
        is_intrabc ? dst_buf :
#endif  // CONFIG_INTRABC
        &pd->pre[ref];
    int is_global = (get_y_mode(mi, block) == ZEROMV &&
#if CONFIG_INTRABC
                     !is_intrabc &&
#endif
                     get_gmtype(gm[ref]) == GLOBAL_ROTZOOM);
    if (is_scaled) {
#if CONFIG_INTRABC
      assert(!is_intrabc);
#endif  // CONFIG_INTRABC
      xs = sf->x_step_q4;
      ys = sf->y_step_q4;
    } else {
      xs = ys = 16;
    }
#endif  // CONFIG_GLOBAL_MOTION
    if (ref && get_wedge_bits(mi->mbmi.sb_type)
        && mi->mbmi.use_wedge_interinter) {
#if CONFIG_VP9_HIGHBITDEPTH
      uint8_t tmp_dst_[8192];
      uint8_t *tmp_dst =
          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
          CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_;
#else
      uint8_t tmp_dst[4096];
#endif
#if CONFIG_GLOBAL_MOTION
      if (is_global) {
        vp9_warp_plane(gm[ref], pre_buf->buf0,
                       pre_buf->width, pre_buf->height, pre_buf->stride,
                       tmp_dst, (mi_x >> pd->subsampling_x) + x,
                       (mi_y >> pd->subsampling_y) + y, w, h, 64,
                       pd->subsampling_x, pd->subsampling_y, xs, ys);
      } else {
#endif  // CONFIG_GLOBAL_MOTION
#if CONFIG_VP9_HIGHBITDEPTH
        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
          int k;
          for (k = 0; k < h; ++k)
            vpx_memcpy(tmp_dst_ + 128 * k, ext_dst1 + ext_dst_stride1 * 2 * k,
                       w * 2);
        } else {
          int k;
          for (k = 0; k < h; ++k)
            vpx_memcpy(tmp_dst_ + 64 * k, ext_dst1 + ext_dst_stride1 * k, w);
        }
#else
        {
          int k;
          for (k = 0; k < h; ++k)
            vpx_memcpy(tmp_dst + 64 * k, ext_dst1 + ext_dst_stride1 * k, w);
        }
#endif  // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_GLOBAL_MOTION
      }
#endif  // CONFIG_GLOBAL_MOTION
#if CONFIG_SUPERTX
#if CONFIG_VP9_HIGHBITDEPTH
      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
        build_masked_compound_extend_highbd(
            dst, dst_buf->stride, tmp_dst, 64, plane,
            mi->mbmi.interinter_wedge_index,
            mi->mbmi.sb_type,
            wedge_offset_x, wedge_offset_y, h, w);
      } else {
        build_masked_compound_extend(
            dst, dst_buf->stride, tmp_dst, 64, plane,
            mi->mbmi.interinter_wedge_index,
            mi->mbmi.sb_type,
            wedge_offset_x, wedge_offset_y, h, w);
      }
#else
      build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane,
                                   mi->mbmi.interinter_wedge_index,
                                   mi->mbmi.sb_type,
                                   wedge_offset_x, wedge_offset_y, h, w);
#endif  // CONFIG_VP9_HIGHBITDEPTH
#else   // CONFIG_SUPERTX
#if CONFIG_VP9_HIGHBITDEPTH
      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
        build_masked_compound_highbd(dst, dst_buf->stride, tmp_dst, 64,
                                     mi->mbmi.interinter_wedge_index,
                                     mi->mbmi.sb_type, h, w);
      else
#endif  // CONFIG_VP9_HIGHBITDEPTH
        build_masked_compound(dst, dst_buf->stride, tmp_dst, 64,
                              mi->mbmi.interinter_wedge_index,
                              mi->mbmi.sb_type, h, w);
#endif  // CONFIG_SUPERTX
    } else {
#if CONFIG_GLOBAL_MOTION
      if (is_global) {
        vp9_warp_plane(gm[ref], pre_buf->buf0,
                       pre_buf->width, pre_buf->height, pre_buf->stride, dst,
                       (mi_x >> pd->subsampling_x) + x,
                       (mi_y >> pd->subsampling_y) + y, w, h, dst_buf->stride,
                       pd->subsampling_x, pd->subsampling_y, xs, ys);
      } else {
#endif  // CONFIG_GLOBAL_MOTION
#if CONFIG_VP9_HIGHBITDEPTH
        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
          int k;
          for (k = 0; k < h; ++k)
            vpx_memcpy(CONVERT_TO_SHORTPTR(dst + dst_buf->stride * k),
                       ext_dst0 + ext_dst_stride0 * 2 * k, w * 2);
        } else {
          int k;
          for (k = 0; k < h; ++k)
            vpx_memcpy(dst + dst_buf->stride * k,
                       ext_dst0 + ext_dst_stride0 * k, w);
        }
#else
        {
          int k;
          for (k = 0; k < h; ++k)
            vpx_memcpy(dst + dst_buf->stride * k,
                       ext_dst0 + ext_dst_stride0 * k, w);
        }
#endif  // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_GLOBAL_MOTION
      }
#endif  // CONFIG_GLOBAL_MOTION
    }
  }
}

void vp9_build_wedge_inter_predictor_from_buf(
    MACROBLOCKD *xd, BLOCK_SIZE bsize,
    int mi_row, int mi_col,
    uint8_t *ext_dst0[3], int ext_dst_stride0[3],
    uint8_t *ext_dst1[3], int ext_dst_stride1[3]) {
  const int plane_from = 0;
  const int plane_to = 2;
  int plane;
  const int mi_x = mi_col * MI_SIZE;
  const int mi_y = mi_row * MI_SIZE;
  for (plane = plane_from; plane <= plane_to; ++plane) {
    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
                                                        &xd->plane[plane]);
    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
    const int bw = 4 * num_4x4_w;
    const int bh = 4 * num_4x4_h;
    if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
      int i = 0, x, y;
      assert(bsize == BLOCK_8X8);
      for (y = 0; y < num_4x4_h; ++y)
        for (x = 0; x < num_4x4_w; ++x)
          build_wedge_inter_predictor_from_buf(xd, plane, i++, bw, bh,
                                               4 * x, 4 * y, 4, 4,
#if CONFIG_SUPERTX
                                               0, 0,
#endif
                                               mi_x, mi_y,
                                               ext_dst0[plane],
                                               ext_dst_stride0[plane],
                                               ext_dst1[plane],
                                               ext_dst_stride1[plane]);
    } else {
      build_wedge_inter_predictor_from_buf(xd, plane, 0, bw, bh,
                                           0, 0, bw, bh,
#if CONFIG_SUPERTX
                                           0, 0,
#endif
                                           mi_x, mi_y,
                                           ext_dst0[plane],
                                           ext_dst_stride0[plane],
                                           ext_dst1[plane],
                                           ext_dst_stride1[plane]);
    }
  }
}
#endif  // CONFIG_WEDGE_PARTITION
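
A note on the blend step above: build_masked_compound combines the in-place reference-0 prediction (dst) with the reference-1 prediction staged in tmp_dst under a per-pixel wedge weight. A minimal sketch of that blend, assuming the 0..64 soft-mask convention of the wedge experiment; the real implementations (including the highbd and SUPERTX "extend" variants) live elsewhere in this file:

/* Sketch of the masked blend, assuming 0..64 weights; illustrative only. */
static void masked_blend_sketch(uint8_t *dst, int dst_stride,
                                const uint8_t *src2, int src2_stride,
                                const uint8_t *mask, int mask_stride,
                                int h, int w) {
  int r, c;
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c) {
      const int m = mask[r * mask_stride + c];  /* weight for dst */
      dst[r * dst_stride + c] =
          (uint8_t)((m * dst[r * dst_stride + c] +
                     (64 - m) * src2[r * src2_stride + c] + 32) >> 6);
    }
}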

vp9/common/vp9_reconinter.h

@@ -94,6 +94,15 @@ void vp9_generate_masked_weight(int wedge_index, BLOCK_SIZE sb_type,
                                int h, int w, uint8_t *mask, int stride);
void vp9_generate_hard_mask(int wedge_index, BLOCK_SIZE sb_type,
                            int h, int w, uint8_t *mask, int stride);
void vp9_build_inter_predictors_for_planes_single_buf(
    MACROBLOCKD *xd, BLOCK_SIZE bsize,
    int mi_row, int mi_col, int ref,
    uint8_t *ext_dst[3], int ext_dst_stride[3]);
void vp9_build_wedge_inter_predictor_from_buf(
    MACROBLOCKD *xd, BLOCK_SIZE bsize,
    int mi_row, int mi_col,
    uint8_t *ext_dst0[3], int ext_dst_stride0[3],
    uint8_t *ext_dst1[3], int ext_dst_stride1[3]);
#endif  // CONFIG_WEDGE_PARTITION
#if CONFIG_SUPERTX
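
The buffer contract implied by these prototypes, as read from the vp9_rdopt.c callers below rather than stated in the header: each ext_dst pointer addresses a 64x64 prediction at stride 64, stored as 8-bit samples, or as 16-bit samples under CONFIG_VP9_HIGHBITDEPTH, hence 8192 (= 64 * 64 * 2) bytes per plane. A condensed usage sketch:

/* Condensed from the encoder-side callers below; xd, bsize, mi_row, mi_col,
 * mbmi and wedge_types come from the surrounding encoder context. */
uint8_t pred0[8192 * 3], pred1[8192 * 3];
uint8_t *preds0[3] = {pred0, pred0 + 8192, pred0 + 16384};
uint8_t *preds1[3] = {pred1, pred1 + 8192, pred1 + 16384};
int strides[3] = {64, 64, 64};
int wedge_index;

/* Build each single-reference prediction exactly once. */
vp9_build_inter_predictors_for_planes_single_buf(
    xd, bsize, mi_row, mi_col, 0, preds0, strides);
vp9_build_inter_predictors_for_planes_single_buf(
    xd, bsize, mi_row, mi_col, 1, preds1, strides);

/* Only the wedge blend is recomputed per candidate. */
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
  mbmi->interinter_wedge_index = wedge_index;
  vp9_build_wedge_inter_predictor_from_buf(xd, bsize, mi_row, mi_col,
                                           preds0, strides,
                                           preds1, strides);
  /* ...measure the rate-distortion cost of the blended prediction... */
}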

vp9/encoder/vp9_rdopt.c

@@ -5706,10 +5706,21 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
      if (have_newmv_in_inter_mode(this_mode)) {
        int_mv tmp_mv[2];
        int rate_mvs[2], tmp_rate_mv = 0;
        uint8_t pred0[8192 * 3], pred1[8192 * 3];
        uint8_t *preds0[3] = {pred0, pred0 + 8192, pred0 + 16384};
        uint8_t *preds1[3] = {pred1, pred1 + 8192, pred1 + 16384};
        int strides[3] = {64, 64, 64};
        vp9_build_inter_predictors_for_planes_single_buf(
            xd, bsize, mi_row, mi_col, 0, preds0, strides);
        vp9_build_inter_predictors_for_planes_single_buf(
            xd, bsize, mi_row, mi_col, 1, preds1, strides);
        // TODO(spencere, debargha): Reimplement to make this run faster
        for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
          mbmi->interinter_wedge_index = wedge_index;
-         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
          vp9_build_wedge_inter_predictor_from_buf(xd, bsize, mi_row, mi_col,
                                                   preds0, strides,
                                                   preds1, strides);
          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL);
          rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv_tmp + rate_sum,
                      dist_sum);
          if (rd < best_rd_wedge) {
@@ -5769,9 +5780,19 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
          mbmi->mv[1].as_int = cur_mv[1].as_int;
        }
      } else {
        uint8_t pred0[8192 * 3], pred1[8192 * 3];
        uint8_t *preds0[3] = {pred0, pred0 + 8192, pred0 + 16384};
        uint8_t *preds1[3] = {pred1, pred1 + 8192, pred1 + 16384};
        int strides[3] = {64, 64, 64};
        vp9_build_inter_predictors_for_planes_single_buf(
            xd, bsize, mi_row, mi_col, 0, preds0, strides);
        vp9_build_inter_predictors_for_planes_single_buf(
            xd, bsize, mi_row, mi_col, 1, preds1, strides);
        for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
          mbmi->interinter_wedge_index = wedge_index;
-         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
          vp9_build_wedge_inter_predictor_from_buf(xd, bsize, mi_row, mi_col,
                                                   preds0, strides,
                                                   preds1, strides);
          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL);
          rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv_tmp + rate_sum,
                      dist_sum);
          if (rd < best_rd_wedge) {
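
For reference, RDCOST in the loops above is vp9's usual Lagrangian combination of rate and distortion; a sketch consistent with its definition in vp9/encoder/vp9_rd.h (worth verifying on the nextgen branch):

/* Rate is weighted by rdmult with rounding; distortion is scaled by rddiv.
 * The wedge index with the smallest combined cost wins the search. */
#define RDCOST(RM, DM, R, D) \
  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((D) << (DM)))

With the predictions hoisted out of the loop, the per-wedge work that remains is the blend plus model_rd_for_sb, which is presumably what the TODO above targets next.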