Rework the rate and distortion computation pipeline

This allows the encoder to use more precise rate and distortion
costs for mode decision.

Change-Id: I7cfd676a88531a194b9a509375feea8365e5ef12
Author: Jingning Han
Date:   2015-06-02 10:02:08 -07:00
parent 0207dcde4a
commit a96f2ca319

@@ -1132,35 +1132,75 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
-static void tx_block_rd_b(MACROBLOCK *x, TX_SIZE tx_size,
+static void tx_block_rd_b(VP9_COMP const *cpi, MACROBLOCK *x, TX_SIZE tx_size,
                           int blk_row, int blk_col, int plane, int block,
                           int plane_bsize, ENTROPY_CONTEXT *above_ctx,
                           ENTROPY_CONTEXT *left_ctx,
                           int *rate, int64_t *dist, int64_t *bsse, int *skip) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int ss_txfrm_size = tx_size << 1;
   const struct macroblock_plane *const p = &x->plane[plane];
-  int64_t this_sse;
-  int shift = tx_size == TX_32X32 ? 0 : 2;
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  unsigned int tmp_sse;
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   ENTROPY_CONTEXT *ta = above_ctx + blk_col;
   ENTROPY_CONTEXT *tl = left_ctx + blk_row;
   scan_order const *sc = get_scan(xd, tx_size, pd->plane_type, 0);
   int i;
+  BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
+  int bh = 4 * num_4x4_blocks_wide_lookup[txm_bsize];
+  int src_stride = p->src.stride;
+  uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col];
+  uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, rec_buffer, 32 * 32);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t this_sse;
+  const int ss_txfrm_size = tx_size << 1;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  (void)this_dist, tmp_sse, txm_bsize, bh, src_stride, src, dst, rec_buffer;
 
+  vp9_xform_quant_inter(x, plane, block, blk_row, blk_col,
+                        plane_bsize, tx_size);
+  *dist += vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                  &this_sse, xd->bd) >> shift;
+  *bsse += this_sse >> shift;
+#else
+  vp9_convolve_copy(dst, pd->dst.stride, rec_buffer, 32,
+                    NULL, 0, NULL, 0, bh, bh);
+
+  cpi->fn_ptr[txm_bsize].vf(src, src_stride,
+                            rec_buffer, 32, &tmp_sse);
+  *bsse += (int64_t)tmp_sse * 16;
+
   vp9_xform_quant_inter(x, plane, block, blk_row, blk_col,
                         plane_bsize, tx_size);
-#if CONFIG_VP9_HIGHBITDEPTH
-  *dist += vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                                  &this_sse, xd->bd) >> shift;
-#else
-  *dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                           &this_sse) >> shift;
+
+  if (p->eobs[block] > 0) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_16X16:
+        vp9_idct16x16_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_8X8:
+        vp9_idct8x8_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_4X4:
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        x->itxm_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        break;
+    }
+
+    cpi->fn_ptr[txm_bsize].vf(src, src_stride,
+                              rec_buffer, 32, &tmp_sse);
+  }
+  *dist += (int64_t)tmp_sse * 16;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  *bsse += this_sse >> shift;
 
   switch (tx_size) {
     case TX_4X4:
@@ -1231,7 +1271,7 @@ static void select_tx_block(const VP9_COMP *cpi, MACROBLOCK *x,
   mbmi->tx_size = tx_size;
 
   if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
-    tx_block_rd_b(x, tx_size, blk_row, blk_col, plane, block,
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                   plane_bsize, ta, tl, rate, dist, bsse, skip);
     if (tx_size > TX_4X4)
       *rate += 256;
@@ -1371,7 +1411,7 @@ static void tx_block_rd(const VP9_COMP *cpi, MACROBLOCK *x,
     return;
 
   if (tx_size == plane_tx_size) {
-    tx_block_rd_b(x, tx_size, blk_row, blk_col, plane, block,
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                   plane_bsize, above_ctx, left_ctx, rate, dist, bsse, skip);
   } else {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
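
For orientation, the new non-highbitdepth path in tx_block_rd_b() amounts to: copy the prediction into a scratch buffer, measure its SSE against the source (the skip cost accumulated in *bsse), apply the inverse transform of the dequantized coefficients on top of the prediction, and measure SSE against the source again for the actual distortion, with both values scaled by 16 so they stay on the same scale as the transform-domain error from vp9_block_error(). The following is a minimal standalone sketch of that flow, not code from the patch: block_sse() is a hypothetical stand-in for the cpi->fn_ptr[].vf variance functions, and the clamped add stands in for the vp9_idctNxN_add() calls.

#include <stdint.h>

/* Hypothetical stand-in for the encoder's variance/SSE function table
 * (cpi->fn_ptr[bsize].vf in libvpx): plain sum of squared differences. */
static uint64_t block_sse(const uint8_t *src, int src_stride,
                          const uint8_t *rec, int rec_stride,
                          int bw, int bh) {
  uint64_t sse = 0;
  for (int r = 0; r < bh; ++r)
    for (int c = 0; c < bw; ++c) {
      const int d = src[r * src_stride + c] - rec[r * rec_stride + c];
      sse += (uint64_t)(d * d);
    }
  return sse;
}

/* Mirrors the reworked tx_block_rd_b() flow for one transform block
 * (assumes bw, bh <= 32, matching the 32x32 scratch buffer):
 *   1. *bsse accumulates the SSE of prediction vs. source, i.e. the
 *      distortion if the block is coded with all-zero coefficients,
 *   2. *dist accumulates the SSE of the final reconstruction vs. source.
 * Both are multiplied by 16 to match the transform-domain error scale. */
static void pixel_domain_rd(const uint8_t *src, int src_stride,
                            const uint8_t *pred, int pred_stride,
                            const int16_t *residual,  /* idct output */
                            int bw, int bh, int eob,
                            int64_t *dist, int64_t *bsse) {
  uint8_t rec[32 * 32];  /* scratch reconstruction, stride 32 */
  for (int r = 0; r < bh; ++r)
    for (int c = 0; c < bw; ++c)
      rec[r * 32 + c] = pred[r * pred_stride + c];

  *bsse += (int64_t)block_sse(src, src_stride, rec, 32, bw, bh) * 16;

  if (eob > 0) {
    /* Add the inverse-transformed residual onto the prediction and
     * clamp to 8-bit range, as vp9_idctNxN_add() does in place. */
    for (int r = 0; r < bh; ++r)
      for (int c = 0; c < bw; ++c) {
        const int v = rec[r * 32 + c] + residual[r * bw + c];
        rec[r * 32 + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
  }
  /* If eob == 0 the reconstruction still equals the prediction, so the
   * distortion matches *bsse. */
  *dist += (int64_t)block_sse(src, src_stride, rec, 32, bw, bh) * 16;
}

This also mirrors the control flow in the patch, where the vf() call is only repeated when p->eobs[block] > 0 and the earlier tmp_sse is otherwise reused for *dist.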