Rework the rate and distortion computation pipeline

This allows the encoder to use more precise rate and distortion
costs for mode decision.

Change-Id: I7cfd676a88531a194b9a509375feea8365e5ef12
Author: Jingning Han
Date:   2015-06-02 10:02:08 -07:00
parent 0207dcde4a
commit a96f2ca319

@@ -1132,35 +1132,75 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
-static void tx_block_rd_b(MACROBLOCK *x, TX_SIZE tx_size,
+static void tx_block_rd_b(VP9_COMP const *cpi, MACROBLOCK *x, TX_SIZE tx_size,
                           int blk_row, int blk_col, int plane, int block,
                           int plane_bsize, ENTROPY_CONTEXT *above_ctx,
                           ENTROPY_CONTEXT *left_ctx,
                           int *rate, int64_t *dist, int64_t *bsse, int *skip) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int ss_txfrm_size = tx_size << 1;
   const struct macroblock_plane *const p = &x->plane[plane];
-  int64_t this_sse;
-  int shift = tx_size == TX_32X32 ? 0 : 2;
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  unsigned int tmp_sse;
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   ENTROPY_CONTEXT *ta = above_ctx + blk_col;
   ENTROPY_CONTEXT *tl = left_ctx + blk_row;
   scan_order const *sc = get_scan(xd, tx_size, pd->plane_type, 0);
   int i;
+  BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
+  int bh = 4 * num_4x4_blocks_wide_lookup[txm_bsize];
+  int src_stride = p->src.stride;
+  uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col];
+  uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, rec_buffer, 32 * 32);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t this_sse;
+  const int ss_txfrm_size = tx_size << 1;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  (void)this_dist, tmp_sse, txm_bsize, bh, src_stride, src, dst, rec_buffer;
 
+  vp9_xform_quant_inter(x, plane, block, blk_row, blk_col,
+                        plane_bsize, tx_size);
+  *dist += vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                  &this_sse, xd->bd) >> shift;
+  *bsse += this_sse >> shift;
+#else
+  vp9_convolve_copy(dst, pd->dst.stride, rec_buffer, 32,
+                    NULL, 0, NULL, 0, bh, bh);
+
+  cpi->fn_ptr[txm_bsize].vf(src, src_stride,
+                            rec_buffer, 32, &tmp_sse);
+  *bsse += (int64_t)tmp_sse * 16;
+
   vp9_xform_quant_inter(x, plane, block, blk_row, blk_col,
                         plane_bsize, tx_size);
-#if CONFIG_VP9_HIGHBITDEPTH
-  *dist += vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                                  &this_sse, xd->bd) >> shift;
-#else
-  *dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                           &this_sse) >> shift;
+
+  if (p->eobs[block] > 0) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_16X16:
+        vp9_idct16x16_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_8X8:
+        vp9_idct8x8_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_4X4:
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        x->itxm_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        break;
+    }
+
+    cpi->fn_ptr[txm_bsize].vf(src, src_stride,
+                              rec_buffer, 32, &tmp_sse);
+  }
+  *dist += (int64_t)tmp_sse * 16;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  *bsse += this_sse >> shift;
 
   switch (tx_size) {
     case TX_4X4:
@@ -1231,7 +1271,7 @@ static void select_tx_block(const VP9_COMP *cpi, MACROBLOCK *x,
   mbmi->tx_size = tx_size;
 
   if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
-    tx_block_rd_b(x, tx_size, blk_row, blk_col, plane, block,
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                   plane_bsize, ta, tl, rate, dist, bsse, skip);
     if (tx_size > TX_4X4)
       *rate += 256;
@@ -1371,7 +1411,7 @@ static void tx_block_rd(const VP9_COMP *cpi, MACROBLOCK *x,
     return;
 
   if (tx_size == plane_tx_size) {
-    tx_block_rd_b(x, tx_size, blk_row, blk_col, plane, block,
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                   plane_bsize, above_ctx, left_ctx, rate, dist, bsse, skip);
   } else {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
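
For orientation, the new non-highbitdepth path in tx_block_rd_b() amounts to: copy the prediction into a scratch buffer, measure its SSE against the source (the skip cost accumulated in *bsse), apply the inverse transform of the dequantized coefficients on top of the prediction, and measure SSE against the source again for the actual distortion, with both values scaled by 16 so they stay on the same scale as the transform-domain error from vp9_block_error(). The following is a minimal standalone sketch of that flow, not code from the patch: block_sse() is a hypothetical stand-in for the cpi->fn_ptr[].vf variance functions, and the clamped add stands in for the vp9_idctNxN_add() calls.

#include <stdint.h>

/* Hypothetical stand-in for the encoder's variance/SSE function table
 * (cpi->fn_ptr[bsize].vf in libvpx): plain sum of squared differences. */
static uint64_t block_sse(const uint8_t *src, int src_stride,
                          const uint8_t *rec, int rec_stride,
                          int bw, int bh) {
  uint64_t sse = 0;
  for (int r = 0; r < bh; ++r)
    for (int c = 0; c < bw; ++c) {
      const int d = src[r * src_stride + c] - rec[r * rec_stride + c];
      sse += (uint64_t)(d * d);
    }
  return sse;
}

/* Mirrors the reworked tx_block_rd_b() flow for one transform block
 * (assumes bw, bh <= 32, matching the 32x32 scratch buffer):
 *   1. *bsse accumulates the SSE of prediction vs. source, i.e. the
 *      distortion if the block is coded with all-zero coefficients,
 *   2. *dist accumulates the SSE of the final reconstruction vs. source.
 * Both are multiplied by 16 to match the transform-domain error scale. */
static void pixel_domain_rd(const uint8_t *src, int src_stride,
                            const uint8_t *pred, int pred_stride,
                            const int16_t *residual,  /* idct output */
                            int bw, int bh, int eob,
                            int64_t *dist, int64_t *bsse) {
  uint8_t rec[32 * 32];  /* scratch reconstruction, stride 32 */
  for (int r = 0; r < bh; ++r)
    for (int c = 0; c < bw; ++c)
      rec[r * 32 + c] = pred[r * pred_stride + c];

  *bsse += (int64_t)block_sse(src, src_stride, rec, 32, bw, bh) * 16;

  if (eob > 0) {
    /* Add the inverse-transformed residual onto the prediction and
     * clamp to 8-bit range, as vp9_idctNxN_add() does in place. */
    for (int r = 0; r < bh; ++r)
      for (int c = 0; c < bw; ++c) {
        const int v = rec[r * 32 + c] + residual[r * bw + c];
        rec[r * 32 + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
  }
  /* If eob == 0 the reconstruction still equals the prediction, so the
   * distortion matches *bsse. */
  *dist += (int64_t)block_sse(src, src_stride, rec, 32, bw, bh) * 16;
}

This also mirrors the control flow in the patch, where the vf() call is only repeated when p->eobs[block] > 0 and the earlier tmp_sse is otherwise reused for *dist.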