From a517343ca33edebadd963485abdd1a2cacda7df6 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Fri, 20 Sep 2013 16:29:24 -0700 Subject: [PATCH] Enable per transformed block zero coeffs forcing This commit enables forcing all coefficients zero per transformed block, when its rate-distortion cost is lower than regular coeff quantization. The overall performance improvement (including its parent patch on calculating rd cost per transformed block) at speed 1: derf: 0.298% yt: 0.452% hd: 0.741% stdhd: 0.006% Change-Id: I66005fe0fd7af192c3eba32e02fd6d77952accb5 --- vp9/encoder/vp9_block.h | 2 ++ vp9/encoder/vp9_encodeframe.c | 4 +++- vp9/encoder/vp9_encodemb.c | 8 ++++++++ vp9/encoder/vp9_rdopt.c | 16 ++++++++++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 013047e35..5a0d746c8 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -34,6 +34,7 @@ typedef struct { typedef struct { MODE_INFO mic; PARTITION_INFO partition_info; + unsigned char zcoeff_blk[256]; int skip; int_mv best_ref_mv; int_mv second_best_ref_mv; @@ -136,6 +137,7 @@ struct macroblock { int mv_row_min; int mv_row_max; + unsigned char zcoeff_blk[TX_SIZES][256]; int skip; int encode_breakout; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index ee938bda9..267d5d936 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -390,6 +390,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } x->skip = ctx->skip; + vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk, + sizeof(ctx->zcoeff_blk)); + if (!output_enabled) return; @@ -2744,7 +2747,6 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, &xd->scale_factor[1]); - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 2c12477a7..30d01d793 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -482,6 +482,14 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, pd->dst.stride); + + // TODO(jingning): per transformed block zero forcing only enabled for + // luma component. will integrate chroma components as well. + if (x->zcoeff_blk[tx_size][block] && plane == 0) { + pd->eobs[block] = 0; + return; + } + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); if (x->optimize) diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index bd465dfbb..adaa1a29d 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -624,7 +624,12 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, rate_block(plane, block, plane_bsize, tx_size, args); rd1 = RDCOST(x->rdmult, x->rddiv, args->rate[block], args->dist[block]); rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse[block]); + + // TODO(jingning): temporarily enabled only for luma component rd = MIN(rd1, rd2); + if (plane == 0) + x->zcoeff_blk[tx_size][block] = rd1 > rd2; + args->this_rate += args->rate[block]; args->this_dist += args->dist[block]; args->this_sse += args->sse[block]; @@ -2234,6 +2239,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY]; ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION]; + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[xd->this_mi->mbmi.tx_size], + sizeof(ctx->zcoeff_blk)); + // FIXME(rbultje) does this memcpy the whole array? I believe sizeof() // doesn't actually work this way memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); @@ -3153,8 +3161,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; + unsigned char best_zcoeff_blk[256] = { 0 }; x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; + vpx_memset(x->zcoeff_blk, 0, sizeof(x->zcoeff_blk)); + vpx_memset(ctx->zcoeff_blk, 0, sizeof(ctx->zcoeff_blk)); for (i = 0; i < 4; i++) { int j; @@ -3826,6 +3837,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_mbmode = *mbmi; best_skip2 = this_skip2; best_partition = *x->partition_info; + vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + sizeof(best_zcoeff_blk)); if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV) for (i = 0; i < 4; i++) @@ -4021,6 +4034,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int; } + vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], best_zcoeff_blk, + sizeof(best_zcoeff_blk)); + for (i = 0; i < NB_PREDICTION_TYPES; ++i) { if (best_pred_rd[i] == INT64_MAX) best_pred_diff[i] = INT_MIN;