Merge "Separate transform and quant from vp9_encode_sb" into experimental

2013-05-03 17:19:01 -07:00 · 2013-05-03 17:19:01 -07:00 · 6c622e2783
commit 6c622e2783
parent 4b8f7a67e5 4529c68b3b
11 changed files with 172 additions and 841 deletions
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@ -19,153 +19,3 @@ void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
  else
    xd->inv_txm4x4(dqcoeff, diff, pitch);
 }
 // Single-block 8x8 inverse transform: forwards its three arguments, in the
 // same order, to vp9_short_idct8x8.
 void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
                                  int16_t *output_coeff,
                                  int pitch) {
  vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch);
 }
 // Single-block 16x16 inverse transform: forwards its three arguments, in the
 // same order, to vp9_short_idct16x16.
 void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
                                    int16_t *output_coeff,
                                    int pitch) {
  vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch);
 }
 // Runs vp9_short_idct32x32 on every 32x32 unit of plane 0. Units are visited
 // row by row; unit n reads BLOCK_OFFSET(dqcoeff, n, 1024) and writes into
 // plane 0's diff buffer at the unit's (col, row) position.
 void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
  const int cols_log2 = b_width_log2(bsize) - 3;
  const int cols = 1 << cols_log2;
  const int rows = 1 << (b_height_log2(bsize) - 3);
  const int stride = 32 << cols_log2;
  int row, col;
  for (row = 0; row < rows; row++) {
    for (col = 0; col < cols; col++) {
      // cols is a power of two, so this is the same n the flat loop used.
      const int n = row * cols + col;
      const int offset = col * 32 + row * 32 * stride;
      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 1024),
                          xd->plane[0].diff + offset, stride * 2);
    }
  }
 }
 // Inverse-transforms plane 0 in 16x16 units, visited row by row. A unit whose
 // transform type is DCT_DCT goes through vp9_inverse_transform_b_16x16; any
 // other type goes through vp9_short_iht16x16.
 void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
  const int cols_log2 = b_width_log2(bsize) - 2;
  const int cols = 1 << cols_log2;
  const int rows = 1 << (b_height_log2(bsize) - 2);
  const int stride = 16 << cols_log2;
  const int bstride = 4 << cols_log2;
  int row, col;
  for (row = 0; row < rows; row++) {
    for (col = 0; col < cols; col++) {
      const int n = row * cols + col;
      const int offset = col * 16 + row * 16 * stride;
      const TX_TYPE tx_type = get_tx_type_16x16(xd,
                                                (row * bstride + col) * 4);
      if (tx_type != DCT_DCT) {
        // The hybrid transform is passed stride, the DCT path stride * 2.
        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256),
                           xd->plane[0].diff + offset, stride, tx_type);
      } else {
        vp9_inverse_transform_b_16x16(
            BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256),
            xd->plane[0].diff + offset, stride * 2);
      }
    }
  }
 }
 // Inverse-transforms plane 0 in 8x8 units, visited row by row. A unit whose
 // transform type is DCT_DCT goes through vp9_inverse_transform_b_8x8; any
 // other type goes through vp9_short_iht8x8.
 void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
  const int cols_log2 = b_width_log2(bsize) - 1;
  const int cols = 1 << cols_log2;
  const int rows = 1 << (b_height_log2(bsize) - 1);
  const int stride = 8 << cols_log2;
  const int bstride = 2 << cols_log2;
  int row, col;
  for (row = 0; row < rows; row++) {
    for (col = 0; col < cols; col++) {
      const int n = row * cols + col;
      const int offset = col * 8 + row * 8 * stride;
      const TX_TYPE tx_type = get_tx_type_8x8(xd, (row * bstride + col) * 2);
      if (tx_type != DCT_DCT) {
        // The hybrid transform is passed stride, the DCT path stride * 2.
        vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64),
                         xd->plane[0].diff + offset, stride, tx_type);
      } else {
        vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64),
                                    xd->plane[0].diff + offset, stride * 2);
      }
    }
  }
 }
 // Inverse-transforms plane 0 in 4x4 units, visited row by row. DCT_DCT units
 // go through vp9_inverse_transform_b_4x4, which is also given the unit's
 // eob; other transform types go through vp9_short_iht4x4.
 void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
  const int cols_log2 = b_width_log2(bsize);
  const int cols = 1 << cols_log2;
  const int rows = 1 << b_height_log2(bsize);
  const int stride = 4 << cols_log2;
  const int bstride = 1 << cols_log2;
  int row, col;
  for (row = 0; row < rows; row++) {
    for (col = 0; col < cols; col++) {
      const int n = row * cols + col;
      const int offset = col * 4 + row * 4 * stride;
      const TX_TYPE tx_type = get_tx_type_4x4(xd, row * bstride + col);
      if (tx_type != DCT_DCT) {
        // The hybrid transform is passed stride, the DCT path stride * 2.
        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16),
                         xd->plane[0].diff + offset, stride, tx_type);
      } else {
        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[n],
                                    BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16),
                                    xd->plane[0].diff + offset, stride * 2);
      }
    }
  }
 }
 // Runs one vp9_short_idct32x32 on plane 1 and then one on plane 2, each with
 // a pitch argument of 64. Asserts that bsize is BLOCK_SIZE_SB64X64.
 void vp9_inverse_transform_sbuv_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
  int plane;
  assert(bsize == BLOCK_SIZE_SB64X64);
  for (plane = 1; plane <= 2; plane++) {
    vp9_short_idct32x32(xd->plane[plane].dqcoeff, xd->plane[plane].diff, 64);
  }
 }
 // Inverse-transforms planes 1 and 2 in 16x16 units with
 // vp9_inverse_transform_b_16x16. The unit grid is one log2 step smaller in
 // each dimension than the plane 0 grid for the same bsize; for every unit,
 // plane 1 is processed before plane 2.
 void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 2;
  const int bhl = b_height_log2(bsize) - 2;
  const int cols_log2 = bwl - 1;
  const int cols = 1 << cols_log2;
  const int rows = 1 << (bhl - 1);
  const int stride = 16 << cols_log2;
  int row, col, plane;
  for (row = 0; row < rows; row++) {
    for (col = 0; col < cols; col++) {
      const int n = row * cols + col;
      const int off = col * 16 + row * stride * 16;
      for (plane = 1; plane <= 2; plane++) {
        vp9_inverse_transform_b_16x16(
            BLOCK_OFFSET(xd->plane[plane].dqcoeff, n, 256),
            xd->plane[plane].diff + off, stride * 2);
      }
    }
  }
 }
 // Inverse-transforms planes 1 and 2 in 8x8 units with
 // vp9_inverse_transform_b_8x8. The unit grid is one log2 step smaller in
 // each dimension than the plane 0 grid for the same bsize; for every unit,
 // plane 1 is processed before plane 2.
 void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 1;
  const int bhl = b_height_log2(bsize) - 1;
  const int cols_log2 = bwl - 1;
  const int cols = 1 << cols_log2;
  const int rows = 1 << (bhl - 1);
  const int stride = 8 << cols_log2;
  int row, col, plane;
  for (row = 0; row < rows; row++) {
    for (col = 0; col < cols; col++) {
      const int n = row * cols + col;
      const int off = col * 8 + row * stride * 8;
      for (plane = 1; plane <= 2; plane++) {
        vp9_inverse_transform_b_8x8(
            BLOCK_OFFSET(xd->plane[plane].dqcoeff, n, 64),
            xd->plane[plane].diff + off, stride * 2);
      }
    }
  }
 }
 // Inverse-transforms planes 1 and 2 in 4x4 units with
 // vp9_inverse_transform_b_4x4, passing each unit's eob from the matching
 // plane. For every unit, plane 1 is processed before plane 2.
 void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize);
  const int bhl = b_height_log2(bsize);
  const int cols_log2 = bwl - 1;
  const int cols = 1 << cols_log2;
  const int rows = 1 << (bhl - 1);
  const int stride = 4 << cols_log2;
  int row, col, plane;
  for (row = 0; row < rows; row++) {
    for (col = 0; col < cols; col++) {
      const int n = row * cols + col;
      const int off = col * 4 + row * stride * 4;
      for (plane = 1; plane <= 2; plane++) {
        vp9_inverse_transform_b_4x4(
            xd, xd->plane[plane].eobs[n],
            BLOCK_OFFSET(xd->plane[plane].dqcoeff, n, 16),
            xd->plane[plane].diff + off, stride * 2);
      }
    }
  }
 }
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@ -18,20 +18,4 @@
 // Single-block inverse transforms: dequantized coefficients in, residual
 // (diff) samples out, with an explicit pitch argument.
 void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
                                 int16_t *dqcoeff, int16_t *diff,
                                 int pitch);
 void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
                                 int16_t *output_coeff, int pitch);
 void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
                                   int16_t *output_coeff, int pitch);
 // Whole-block inverse transforms, one per transform size. The sby variants
 // operate on xd->plane[0]; the sbuv variants on xd->plane[1] and
 // xd->plane[2].
 void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 void vp9_inverse_transform_sbuv_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@ -164,12 +164,12 @@ struct macroblock {
  void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                         int y_blocks);
 #if !CONFIG_SB8X8
  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2,
                              int y_blocks);
  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                           int y_blocks);
  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                         int y_blocks);
 #endif
 };
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@ -2438,13 +2438,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
    vp9_encode_intra4x4mby(x, bsize);
    vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
-    vp9_subtract_sbuv(x, bsize);
+    vp9_encode_sbuv(cm, x, bsize);
    vp9_transform_sbuv_4x4(x, bsize);
    vp9_quantize_sbuv_4x4(x, bsize);
    if (x->optimize)
      vp9_optimize_sbuv(cm, x, bsize);
    vp9_inverse_transform_sbuv_4x4(xd, bsize);
    vp9_recon_sbuv(xd, bsize);
    if (output_enabled)
      sum_intra_stats(cpi, x);
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@ -104,63 +104,16 @@ void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
  MACROBLOCKD *xd = &x->e_mbd;
  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
  vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
+  vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
  switch (tx_size) {
    case TX_16X16:
      vp9_transform_sby_16x16(x, BLOCK_SIZE_MB16X16);
      vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
      if (x->optimize)
        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
      vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
      break;
    case TX_8X8:
      vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
      vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
      if (x->optimize)
        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
      vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
      break;
    default:
      vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
      vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
      if (x->optimize)
        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
      vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
      break;
  }
  vp9_recon_sby(xd, BLOCK_SIZE_MB16X16);
 }
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
  MACROBLOCKD *xd = &x->e_mbd;
  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
  vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
+  vp9_encode_sbuv(cm, x, BLOCK_SIZE_MB16X16);
  switch (tx_size) {
    case TX_4X4:
      vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
      vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
      if (x->optimize)
        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
      vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
      break;
    default:  // 16x16 or 8x8
      vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
      vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
      if (x->optimize)
        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
      vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
      break;
    }
  vp9_recon_sbuv(xd, BLOCK_SIZE_MB16X16);
 }
 #if !CONFIG_SB8X8
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@ -67,143 +67,6 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
 }
 // Forward 32x32 transform of plane 0: for each 32x32 unit n, calls
 // vp9_short_fdct32x32 on the unit's src_diff samples and stores the result
 // at coeff + n * 1024.
 void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 3;
  const int bw = 1 << bwl;
  const int bh = 1 << (b_height_log2(bsize) - 3);
  const int stride = 32 << bwl;
  const int count = bw * bh;
  int n;
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> bwl;
    const int src_off = row * stride * 32 + col * 32;
    vp9_short_fdct32x32(x->plane[0].src_diff + src_off,
                        x->plane[0].coeff + n * 1024, stride * 2);
  }
 }
 // Forward 16x16 transform of plane 0. A unit whose transform type is DCT_DCT
 // uses the x->fwd_txm16x16 function pointer; any other type uses
 // vp9_short_fht16x16. Output for unit n is written at coeff + n * 256.
 void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 2;
  const int bw = 1 << bwl;
  const int bh = 1 << (b_height_log2(bsize) - 2);
  const int stride = 16 << bwl;
  const int bstride = 4 << bwl;
  MACROBLOCKD *const xd = &x->e_mbd;
  const int count = bw * bh;
  int n;
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> bwl;
    const int src_off = row * stride * 16 + col * 16;
    const TX_TYPE tx_type = get_tx_type_16x16(xd, (row * bstride + col) * 4);
    if (tx_type == DCT_DCT) {
      // The DCT path is passed stride * 2, the hybrid path stride.
      x->fwd_txm16x16(x->plane[0].src_diff + src_off,
                      x->plane[0].coeff + n * 256, stride * 2);
    } else {
      vp9_short_fht16x16(x->plane[0].src_diff + src_off,
                         x->plane[0].coeff + n * 256, stride, tx_type);
    }
  }
 }
 // Forward 8x8 transform of plane 0. A unit whose transform type is DCT_DCT
 // uses the x->fwd_txm8x8 function pointer; any other type uses
 // vp9_short_fht8x8. Output for unit n is written at coeff + n * 64.
 void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 1;
  const int bw = 1 << bwl;
  const int bh = 1 << (b_height_log2(bsize) - 1);
  const int stride = 8 << bwl;
  const int bstride = 2 << bwl;
  MACROBLOCKD *const xd = &x->e_mbd;
  const int count = bw * bh;
  int n;
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> bwl;
    const int src_off = row * stride * 8 + col * 8;
    const TX_TYPE tx_type = get_tx_type_8x8(xd, (row * bstride + col) * 2);
    if (tx_type == DCT_DCT) {
      // The DCT path is passed stride * 2, the hybrid path stride.
      x->fwd_txm8x8(x->plane[0].src_diff + src_off,
                    x->plane[0].coeff + n * 64, stride * 2);
    } else {
      vp9_short_fht8x8(x->plane[0].src_diff + src_off,
                       x->plane[0].coeff + n * 64, stride, tx_type);
    }
  }
 }
 // Forward 4x4 transform of plane 0. The transform type is looked up with the
 // flat unit index n. DCT_DCT units use the x->fwd_txm4x4 function pointer;
 // other types use vp9_short_fht4x4. Output for unit n is at coeff + n * 16.
 void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize);
  const int bw = 1 << bwl;
  const int bh = 1 << b_height_log2(bsize);
  const int stride = 4 << bwl;
  MACROBLOCKD *const xd = &x->e_mbd;
  const int count = bw * bh;
  int n;
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> bwl;
    const int src_off = row * stride * 4 + col * 4;
    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
    if (tx_type == DCT_DCT) {
      // The DCT path is passed stride * 2, the hybrid path stride.
      x->fwd_txm4x4(x->plane[0].src_diff + src_off,
                    x->plane[0].coeff + n * 16, stride * 2);
    } else {
      vp9_short_fht4x4(x->plane[0].src_diff + src_off,
                       x->plane[0].coeff + n * 16, stride, tx_type);
    }
  }
 }
 // Forward 32x32 transform of planes 1 and 2 (in that order), one
 // vp9_short_fdct32x32 call per plane with a pitch argument of 64. Asserts
 // that bsize is BLOCK_SIZE_SB64X64 and calls vp9_clear_system_state() before
 // transforming.
 void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  int plane;
  assert(bsize == BLOCK_SIZE_SB64X64);
  vp9_clear_system_state();
  for (plane = 1; plane <= 2; plane++) {
    vp9_short_fdct32x32(x->plane[plane].src_diff, x->plane[plane].coeff, 64);
  }
 }
 // Forward 16x16 transform of planes 1 and 2 through the x->fwd_txm16x16
 // function pointer. vp9_clear_system_state() is called once up front; for
 // every unit, plane 1 is transformed before plane 2.
 void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 2;
  const int bhl = b_height_log2(bsize) - 2;
  const int bw = 1 << (bwl - 1);
  const int bh = 1 << (bhl - 1);
  const int stride = 16 << (bwl - 1);
  const int count = bw * bh;
  int n, plane;
  vp9_clear_system_state();
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> (bwl - 1);
    const int src_off = row * stride * 16 + col * 16;
    for (plane = 1; plane <= 2; plane++) {
      x->fwd_txm16x16(x->plane[plane].src_diff + src_off,
                      x->plane[plane].coeff + n * 256, stride * 2);
    }
  }
 }
 // Forward 8x8 transform of planes 1 and 2 through the x->fwd_txm8x8 function
 // pointer. vp9_clear_system_state() is called once up front; for every unit,
 // plane 1 is transformed before plane 2.
 void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 1;
  const int bhl = b_height_log2(bsize) - 1;
  const int bw = 1 << (bwl - 1);
  const int bh = 1 << (bhl - 1);
  const int stride = 8 << (bwl - 1);
  const int count = bw * bh;
  int n, plane;
  vp9_clear_system_state();
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> (bwl - 1);
    const int src_off = row * stride * 8 + col * 8;
    for (plane = 1; plane <= 2; plane++) {
      x->fwd_txm8x8(x->plane[plane].src_diff + src_off,
                    x->plane[plane].coeff + n * 64, stride * 2);
    }
  }
 }
 // Forward 4x4 transform of planes 1 and 2 through the x->fwd_txm4x4 function
 // pointer. vp9_clear_system_state() is called once up front; for every unit,
 // plane 1 is transformed before plane 2.
 void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize);
  const int bhl = b_height_log2(bsize);
  const int bw = 1 << (bwl - 1);
  const int bh = 1 << (bhl - 1);
  const int stride = 4 << (bwl - 1);
  const int count = bw * bh;
  int n, plane;
  vp9_clear_system_state();
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> (bwl - 1);
    const int src_off = row * stride * 4 + col * 4;
    for (plane = 1; plane <= 2; plane++) {
      x->fwd_txm4x4(x->plane[plane].src_diff + src_off,
                    x->plane[plane].coeff + n * 16, stride * 2);
    }
  }
 }
 // Both macros expand to the same expression, (128 + R * RM) & 0xFF.
 // The DM and D arguments are accepted but do not appear in the expansion.
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 // Lets the struct be referred to without the `struct` keyword.
 typedef struct vp9_token_state vp9_token_state;
@ -561,7 +424,7 @@ struct encode_b_args {
  struct optimize_ctx *ctx;
 };
-static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
                         int ss_txfrm_size, void *arg) {
  struct encode_b_args* const args = arg;
  MACROBLOCK* const x = args->x;
@ -572,9 +435,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
  int16_t* const src_diff = raster_block_offset_int16(xd, bsize, plane,
                                                      raster_block,
                                                      x->plane[plane].src_diff);
  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
                                                  raster_block,
                                                  xd->plane[plane].diff);
  TX_TYPE tx_type = DCT_DCT;
  switch (ss_txfrm_size / 2) {
@ -624,6 +484,23 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
  }
  vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
 }
 static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                         int ss_txfrm_size, void *arg) {
  struct encode_b_args* const args = arg;
  MACROBLOCK* const x = args->x;
  MACROBLOCKD* const xd = &x->e_mbd;
  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
                                                       block, ss_txfrm_size);
  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
                                                  raster_block,
                                                  xd->plane[plane].diff);
  TX_TYPE tx_type = DCT_DCT;
  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
  if (x->optimize)
    vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx);
@ -633,6 +510,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          diff, bw * 2);
      break;
    case TX_16X16:
      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
      if (tx_type == DCT_DCT) {
        vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                            diff, bw * 2);
@ -642,6 +520,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
      }
      break;
    case TX_8X8:
      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
      if (tx_type == DCT_DCT) {
        vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                          diff, bw * 2);
@ -651,6 +530,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
      }
      break;
    case TX_4X4:
      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
      if (tx_type == DCT_DCT) {
        // this is like vp9_short_idct4x4 but has a special case around eob<=1
        // which is significant (not just an optimization) for the lossless
@ -665,6 +545,60 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
  }
 }
 // Runs xform_quant on every transform block of plane 0 via
 // foreach_transformed_block_in_plane. The optimize context in the callback
 // arguments is NULL. When CONFIG_SB8X8 is off the iterator takes one extra
 // argument, passed as 0.
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                         BLOCK_SIZE_TYPE bsize) {
  struct encode_b_args arg = {cm, x, NULL};
  MACROBLOCKD* const xd = &x->e_mbd;

 #if !CONFIG_SB8X8
  foreach_transformed_block_in_plane(xd, bsize, 0, 0, xform_quant, &arg);
 #else
  foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
 #endif
 }
 // Runs xform_quant on every transform block visited by
 // foreach_transformed_block_uv. The optimize context in the callback
 // arguments is NULL.
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                           BLOCK_SIZE_TYPE bsize) {
  struct encode_b_args arg = {cm, x, NULL};
  MACROBLOCKD* const xd = &x->e_mbd;

  foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
 // Plane 0 encode pass, in order: vp9_subtract_sby, optional
 // vp9_optimize_init (only when x->optimize is set), encode_block on every
 // transform block of plane 0, then vp9_recon_sby. When CONFIG_SB8X8 is off
 // the block iterator takes one extra argument, passed as 0.
 void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                     BLOCK_SIZE_TYPE bsize) {
  struct optimize_ctx ctx;
  struct encode_b_args arg = {cm, x, &ctx};
  MACROBLOCKD* const xd = &x->e_mbd;

  vp9_subtract_sby(x, bsize);
  // ctx is only initialised when trellis optimisation is requested.
  if (x->optimize) {
    vp9_optimize_init(xd, bsize, &ctx);
  }

 #if !CONFIG_SB8X8
  foreach_transformed_block_in_plane(xd, bsize, 0, 0, encode_block, &arg);
 #else
  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
 #endif

  vp9_recon_sby(xd, bsize);
 }
 // UV encode pass, in order: vp9_subtract_sbuv, optional vp9_optimize_init
 // (only when x->optimize is set), encode_block on every block visited by
 // foreach_transformed_block_uv, then vp9_recon_sbuv.
 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                      BLOCK_SIZE_TYPE bsize) {
  struct optimize_ctx ctx;
  struct encode_b_args arg = {cm, x, &ctx};
  MACROBLOCKD* const xd = &x->e_mbd;

  vp9_subtract_sbuv(x, bsize);
  // ctx is only initialised when trellis optimisation is requested.
  if (x->optimize) {
    vp9_optimize_init(xd, bsize, &ctx);
  }

  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);

  vp9_recon_sbuv(xd, bsize);
 }
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                   BLOCK_SIZE_TYPE bsize) {
  MACROBLOCKD* const xd = &x->e_mbd;
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@ -22,18 +22,6 @@ typedef struct {
  MV_REFERENCE_FRAME second_ref_frame;
 } MODE_DEFINITION;
 #if !CONFIG_SB8X8
 #endif
 // Whole-block forward transforms, one per transform size. The sby variants
 // read x->plane[0].src_diff and write x->plane[0].coeff; the sbuv variants
 // do the same for planes 1 and 2.
 void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 struct optimize_ctx {
  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
@ -49,6 +37,14 @@ void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                       BLOCK_SIZE_TYPE bsize);
 // Encode entry points. The _sby variants work on plane 0 only and the _sbuv
 // variants iterate with foreach_transformed_block_uv.
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                      BLOCK_SIZE_TYPE bsize);
 // Transform + quantize only: these run the xform_quant callback per block
 // and do not subtract, optimize or reconstruct.
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                         BLOCK_SIZE_TYPE bsize);
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                           BLOCK_SIZE_TYPE bsize);
 void vp9_subtract_block(int rows, int cols,
                        int16_t *diff_ptr, int diff_stride,
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@ -867,9 +867,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
  }
  cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
 #if !CONFIG_SB8X8
  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
+#endif
  vp9_init_quantizer(cpi);
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@ -133,6 +133,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
           pt_scan, 1);
 }
 #if !CONFIG_SB8X8
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                int y_blocks) {
  MACROBLOCKD *const xd = &mb->e_mbd;
@ -154,131 +155,6 @@ void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
           pt_scan, 1);
 }
 // Quantizes one 16x16 transform block.
 //   mb       - encoder macroblock; supplies coeff and the quantizer tables.
 //   b_idx    - block index, mapped to a (plane, block) pair by
 //              plane_block_idx(y_blocks, b_idx).
 //   tx_type  - selects the scan order through get_scan_16x16().
 //   y_blocks - forwarded to plane_block_idx() together with b_idx.
 // quantize() is called with a coefficient count of 256 and a final argument
 // of 1; it receives pointers to the block's qcoeff, dqcoeff and eob slots.
 void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                   int y_blocks) {
  MACROBLOCKD *const xd = &mb->e_mbd;
  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
  const int *pt_scan = get_scan_16x16(tx_type);
  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
           256, mb->skip_block,
           mb->plane[pb_idx.plane].zbin,
           mb->plane[pb_idx.plane].round,
           mb->plane[pb_idx.plane].quant,
           mb->plane[pb_idx.plane].quant_shift,
           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
           xd->plane[pb_idx.plane].dequant,
           mb->plane[pb_idx.plane].zbin_extra,
           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
           pt_scan, 1);
 }
 // Quantizes one 32x32 transform block.
 //   mb       - encoder macroblock; supplies coeff and the quantizer tables.
 //   b_idx    - block index, mapped to a (plane, block) pair by
 //              plane_block_idx(y_blocks, b_idx).
 //   y_blocks - forwarded to plane_block_idx() together with b_idx.
 // There is no tx_type parameter: the scan is always
 // vp9_default_zig_zag1d_32x32. quantize() is called with a coefficient count
 // of 1024 and a final argument of 2 (the smaller-block quantizers visible
 // in this file pass 1).
 void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx, int y_blocks) {
  MACROBLOCKD *const xd = &mb->e_mbd;
  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
           1024, mb->skip_block,
           mb->plane[pb_idx.plane].zbin,
           mb->plane[pb_idx.plane].round,
           mb->plane[pb_idx.plane].quant,
           mb->plane[pb_idx.plane].quant_shift,
           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
           xd->plane[pb_idx.plane].dequant,
           mb->plane[pb_idx.plane].zbin_extra,
           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
           vp9_default_zig_zag1d_32x32, 2);
 }
 // Quantizes every 32x32 unit of the block with
 // vp9_regular_quantize_b_32x32. Unit n is passed block index n * 64, and the
 // y_blocks argument is 64 times the unit count.
 void vp9_quantize_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bw = 1 << (b_width_log2(bsize) - 3);
  const int bh = 1 << (b_height_log2(bsize) - 3);
  const int count = bw * bh;
  int n;
  for (n = 0; n < count; n++) {
    vp9_regular_quantize_b_32x32(x, n * 64, count * 64);
  }
 }
 // Quantizes every 16x16 unit of the block through the x->quantize_b_16x16
 // function pointer, looking up each unit's transform type with
 // get_tx_type_16x16. Unit n is passed block index n * 16.
 void vp9_quantize_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 2;
  const int bw = 1 << bwl;
  const int bh = 1 << (b_height_log2(bsize) - 2);
  const int bstride = 16 << bwl;
  const int count = bw * bh;
  int n;
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> bwl;
    const TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,
                                              4 * col + row * bstride);
    x->quantize_b_16x16(x, n * 16, tx_type, 16 * bw * bh);
  }
 }
 // Quantizes every 8x8 unit of the block through the x->quantize_b_8x8
 // function pointer, looking up each unit's transform type with
 // get_tx_type_8x8. Unit n is passed block index n * 4.
 void vp9_quantize_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 1;
  const int bw = 1 << bwl;
  const int bh = 1 << (b_height_log2(bsize) - 1);
  const int bstride = 4 << bwl;
  const int count = bw * bh;
  int n;
  for (n = 0; n < count; n++) {
    const int col = n & (bw - 1);
    const int row = n >> bwl;
    const TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,
                                            2 * col + row * bstride);
    x->quantize_b_8x8(x, n * 4, tx_type, 4 * bw * bh);
  }
 }
 // Quantizes every 4x4 unit of the block through the x->quantize_b_4x4
 // function pointer. The flat unit index n is used both for the transform
 // type lookup and as the block index.
 void vp9_quantize_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize);
  const int bw = 1 << bwl;
  const int bh = 1 << b_height_log2(bsize);
  MACROBLOCKD *const xd = &x->e_mbd;
  const int count = bw * bh;
  int n;
  for (n = 0; n < count; n++) {
    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
    x->quantize_b_4x4(x, n, tx_type, count);
  }
 }
 // Quantizes two 32x32 blocks with vp9_regular_quantize_b_32x32: block index
 // 256, then 320, each with y_blocks = 256. Asserts that bsize is
 // BLOCK_SIZE_SB64X64.
 void vp9_quantize_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  int b_idx;
  assert(bsize == BLOCK_SIZE_SB64X64);
  for (b_idx = 256; b_idx <= 320; b_idx += 64) {
    vp9_regular_quantize_b_32x32(x, b_idx, 256);
  }
 }
 // Quantizes 16x16 blocks with transform type DCT_DCT through the
 // x->quantize_b_16x16 function pointer. Block indices run from uoff up to
 // (but excluding) uoff * 3 / 2 in steps of 16, with uoff also passed as
 // y_blocks.
 void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 2;
  const int bhl = b_height_log2(bsize) - 2;
  const int uoff = 16 << (bhl + bwl);
  const int end = (uoff * 3) >> 1;
  int i;
  for (i = uoff; i < end; i += 16) {
    x->quantize_b_16x16(x, i, DCT_DCT, uoff);
  }
 }
 // Quantizes 8x8 blocks with transform type DCT_DCT through the
 // x->quantize_b_8x8 function pointer. Block indices run from uoff up to (but
 // excluding) uoff * 3 / 2 in steps of 4, with uoff also passed as y_blocks.
 void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize) - 1;
  const int bhl = b_height_log2(bsize) - 1;
  const int uoff = 4 << (bhl + bwl);
  const int end = (uoff * 3) >> 1;
  int i;
  for (i = uoff; i < end; i += 4) {
    x->quantize_b_8x8(x, i, DCT_DCT, uoff);
  }
 }
 // Quantizes 4x4 blocks with transform type DCT_DCT through the
 // x->quantize_b_4x4 function pointer. Block indices run from uoff up to (but
 // excluding) uoff * 3 / 2, with uoff also passed as y_blocks.
 void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
  const int bwl = b_width_log2(bsize);
  const int bhl = b_height_log2(bsize);
  const int uoff = 1 << (bhl + bwl);
  const int end = (uoff * 3) >> 1;
  int i;
  for (i = uoff; i < end; i++) {
    x->quantize_b_4x4(x, i, DCT_DCT, uoff);
  }
 }
 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
 * these two C functions if corresponding optimized routine is not available.
 * NEON optimized version implements currently the fast quantization for pair
@ -288,6 +164,7 @@ void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2,
  vp9_regular_quantize_b_4x4(x, b_idx1, DCT_DCT, y_blocks);
  vp9_regular_quantize_b_4x4(x, b_idx2, DCT_DCT, y_blocks);
 }
 #endif
 static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
  unsigned t;
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@ -31,20 +31,6 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                int y_blocks);
 // Single-block quantizers. b_idx and y_blocks together identify the plane
 // and block; the 32x32 variant takes no tx_type.
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks);
 void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                   int y_blocks);
 void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx,
                                   int y_blocks);
 // Whole-block quantizers, one per transform size, for the sby and sbuv
 // block groups.
 void vp9_quantize_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_quantize_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_quantize_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_quantize_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_quantize_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 struct VP9_COMP;
 extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@ -288,7 +288,7 @@ int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
 }
 static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
-                              int ib, PLANE_TYPE type,
+                              int plane, int block, PLANE_TYPE type,
                              ENTROPY_CONTEXT *A,
                              ENTROPY_CONTEXT *L,
                              TX_SIZE tx_size,
@ -299,10 +299,9 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
  int c = 0;
  int cost = 0, pad;
  const int *scan, *nb;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, ib);
+  const int eob = xd->plane[plane].eobs[block];
-  const int eob = xd->plane[pb_idx.plane].eobs[pb_idx.block];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
+                                           block, 16);
                                           pb_idx.block, 16);
  const int ref = mbmi->ref_frame != INTRA_FRAME;
  unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
      mb->token_costs[tx_size][type][ref];
@ -329,7 +328,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
 #endif
  // Check for consistency of tx_size with mode info
-  assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
+  assert((!type && !plane) || (type && plane));
  if (type == PLANE_TYPE_Y_WITH_DC) {
    assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
  } else {
@ -340,7 +339,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
  switch (tx_size) {
    case TX_4X4: {
      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-          get_tx_type_4x4(xd, ib) : DCT_DCT;
+          get_tx_type_4x4(xd, block) : DCT_DCT;
      above_ec = A[0] != 0;
      left_ec = L[0] != 0;
      coef_probs = cm->fc.coef_probs_4x4;
@ -354,7 +353,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
    case TX_8X8: {
      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
      const int sz = 1 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
      TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
      above_ec = (A[0] + A[1]) != 0;
@ -370,7 +369,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
    case TX_16X16: {
      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
      const int sz = 2 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
      TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
      scan = get_scan_16x16(tx_type);
@ -610,9 +609,10 @@ static int block_error(int16_t *coeff, int16_t *dqcoeff,
  return error > INT_MAX ? INT_MAX : (int)error;
 }
-static int block_error_sby(MACROBLOCK *x, int block_size, int shift) {
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
  return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                     block_size, shift);
+                     16 << (bwl + bhl), shift);
 }
 static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
@ -630,155 +630,54 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
  return sum > INT_MAX ? INT_MAX : (int)sum;
 }
-static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+static int rdcost_plane(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
+                        int plane, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
  const int bh = 1 << b_height_log2(bsize);
  int cost = 0, b;
  MACROBLOCKD *const xd = &x->e_mbd;
  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
  const int bw = 1 << bwl, bh = 1 << bhl;
  ENTROPY_CONTEXT t_above[16], t_left[16];
  int block, cost;
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
+  vpx_memcpy(&t_above, xd->plane[plane].above_context,
             sizeof(ENTROPY_CONTEXT) * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
+  vpx_memcpy(&t_left,  xd->plane[plane].left_context,
             sizeof(ENTROPY_CONTEXT) * bh);
-  for (b = 0; b < bw * bh; b++) {
+  cost = 0;
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
+  for (block = 0; block < bw * bh; block += 1 << (tx_size * 2)) {
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+    int x_idx, y_idx;
    txfrm_block_to_raster_xy(xd, bsize, plane, block, tx_size * 2,
                             &x_idx, &y_idx);
    cost += cost_coeffs(cm, x, plane, block, xd->plane[plane].plane_type,
                        t_above + x_idx, t_left + y_idx,
-                        TX_4X4, bw * bh);
+                        tx_size, bw * bh);
  }
  return cost;
 }
-static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable,
+                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
-                                BLOCK_SIZE_TYPE bsize) {
+  int cost = 0, plane;
  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
  MACROBLOCKD *const xd = &x->e_mbd;
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-  vp9_transform_sby_4x4(x, bsize);
+    cost += rdcost_plane(cm, x, plane, bsize, tx_size);
  vp9_quantize_sby_4x4(x, bsize);
  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
  *rate       = rdcost_sby_4x4(cm, x, bsize);
  *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
 // Luma (plane 0) rate for 8x8 transforms: returns the sum of cost_coeffs()
 // over every 8x8 transform block covered by bsize.
 static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
                           BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   // Transform-block grid; presumably b_width_log2() counts 4-pixel units, so
   // subtracting 1 yields 8x8 units -- TODO confirm.
   const int tx_cols_log2 = b_width_log2(bsize) - 1;
   const int tx_cols = 1 << tx_cols_log2;
   const int tx_rows = 1 << (b_height_log2(bsize) - 1);
   const int num_tx_blocks = tx_cols * tx_rows;
   ENTROPY_CONTEXT t_above[16], t_left[16];
   int total = 0;
   int n;
   // cost_coeffs() is handed scratch copies of the contexts rather than the
   // arrays stored in xd (two entries per transform block in each direction).
   vpx_memcpy(&t_above, xd->plane[0].above_context,
              sizeof(ENTROPY_CONTEXT) * 2 * tx_cols);
   vpx_memcpy(&t_left, xd->plane[0].left_context,
              sizeof(ENTROPY_CONTEXT) * 2 * tx_rows);
   for (n = 0; n < num_tx_blocks; n++) {
     // Raster position of transform block n inside the grid.
     const int col = n & (tx_cols - 1);
     const int row = n >> tx_cols_log2;
     total += cost_coeffs(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC,
                          t_above + col * 2, t_left + row * 2,
                          TX_8X8, 4 * num_tx_blocks);
   }
   return total;
 }
-static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable,
+                                     int *rate, int *distortion, int *skippable,
-                                BLOCK_SIZE_TYPE bsize) {
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
  MACROBLOCKD *const xd = &x->e_mbd;
  xd->mode_info_context->mbmi.txfm_size = tx_size;
  vp9_xform_quant_sby(cm, x, bsize);
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
-  vp9_transform_sby_8x8(x, bsize);
+  *rate       = rdcost_plane(cm, x, 0, bsize, tx_size);
  vp9_quantize_sby_8x8(x, bsize);
  *distortion = block_error_sby(x, 16 << (bhl + bwl), 2);
  *rate       = rdcost_sby_8x8(cm, x, bsize);
  *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
 // Luma (plane 0) rate for 16x16 transforms: returns the sum of cost_coeffs()
 // over every 16x16 transform block covered by bsize.
 static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   // Transform-block grid; presumably b_width_log2() counts 4-pixel units, so
   // subtracting 2 yields 16x16 units -- TODO confirm.
   const int tx_cols_log2 = b_width_log2(bsize) - 2;
   const int tx_cols = 1 << tx_cols_log2;
   const int tx_rows = 1 << (b_height_log2(bsize) - 2);
   const int num_tx_blocks = tx_cols * tx_rows;
   ENTROPY_CONTEXT t_above[16], t_left[16];
   int total = 0;
   int n;
   // cost_coeffs() is handed scratch copies of the contexts rather than the
   // arrays stored in xd (four entries per transform block in each direction).
   vpx_memcpy(&t_above, xd->plane[0].above_context,
              sizeof(ENTROPY_CONTEXT) * 4 * tx_cols);
   vpx_memcpy(&t_left, xd->plane[0].left_context,
              sizeof(ENTROPY_CONTEXT) * 4 * tx_rows);
   for (n = 0; n < num_tx_blocks; n++) {
     // Raster position of transform block n inside the grid.
     const int col = n & (tx_cols - 1);
     const int row = n >> tx_cols_log2;
     total += cost_coeffs(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC,
                          t_above + col * 4, t_left + row * 4,
                          TX_16X16, num_tx_blocks * 16);
   }
   return total;
 }
 // Evaluates the luma plane with the transform size set to TX_16X16.
 // Outputs:
 //   *distortion  block_error_sby() over 16 << (bwl + bhl) with shift 2
 //   *rate        rdcost_sby_16x16()
 //   *skippable   vp9_sby_is_skippable()
 static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
                                   int *rate, int *distortion, int *skippable,
                                   BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const int bwl = b_width_log2(bsize);
   const int bhl = b_height_log2(bsize);
   // Size argument for block_error_sby().
   const int block_size = 16 << (bwl + bhl);
   // The transform size is written to the mode info before the transform and
   // quantizer run; the call order below is kept exactly as in the original.
   xd->mode_info_context->mbmi.txfm_size = TX_16X16;
   vp9_transform_sby_16x16(x, bsize);
   vp9_quantize_sby_16x16(x, bsize);
   *distortion = block_error_sby(x, block_size, 2);
   *rate = rdcost_sby_16x16(cm, x, bsize);
   *skippable = vp9_sby_is_skippable(xd, bsize);
 }
 // Luma (plane 0) rate for 32x32 transforms: returns the sum of cost_coeffs()
 // over every 32x32 transform block covered by bsize.
 static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   // Transform-block grid; presumably b_width_log2() counts 4-pixel units, so
   // subtracting 3 yields 32x32 units -- TODO confirm.
   const int tx_cols_log2 = b_width_log2(bsize) - 3;
   const int tx_cols = 1 << tx_cols_log2;
   const int tx_rows = 1 << (b_height_log2(bsize) - 3);
   const int num_tx_blocks = tx_cols * tx_rows;
   ENTROPY_CONTEXT t_above[16], t_left[16];
   int total = 0;
   int n;
   // cost_coeffs() is handed scratch copies of the contexts rather than the
   // arrays stored in xd (eight entries per transform block in each
   // direction).
   vpx_memcpy(&t_above, xd->plane[0].above_context,
              sizeof(ENTROPY_CONTEXT) * 8 * tx_cols);
   vpx_memcpy(&t_left, xd->plane[0].left_context,
              sizeof(ENTROPY_CONTEXT) * 8 * tx_rows);
   for (n = 0; n < num_tx_blocks; n++) {
     // Raster position of transform block n inside the grid.
     const int col = n & (tx_cols - 1);
     const int row = n >> tx_cols_log2;
     total += cost_coeffs(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC,
                          t_above + col * 8, t_left + row * 8,
                          TX_32X32, num_tx_blocks * 64);
   }
   return total;
 }
 // Evaluates the luma plane with the transform size set to TX_32X32.
 // Outputs:
 //   *distortion  block_error_sby() over 16 << (bwl + bhl) with shift 0
 //                (the smaller transform sizes in this file pass 2)
 //   *rate        rdcost_sby_32x32()
 //   *skippable   vp9_sby_is_skippable()
 static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
                                   int *rate, int *distortion, int *skippable,
                                   BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const int bwl = b_width_log2(bsize);
   const int bhl = b_height_log2(bsize);
   // Size argument for block_error_sby().
   const int block_size = 16 << (bwl + bhl);
   // The transform size is written to the mode info before the transform and
   // quantizer run; the call order below is kept exactly as in the original.
   xd->mode_info_context->mbmi.txfm_size = TX_32X32;
   vp9_transform_sby_32x32(x, bsize);
   vp9_quantize_sby_32x32(x, bsize);
   *distortion = block_error_sby(x, block_size, 0);
   *rate = rdcost_sby_32x32(cm, x, bsize);
   *skippable = vp9_sby_is_skippable(xd, bsize);
 }
@ -792,13 +691,15 @@ static void super_block_yrd(VP9_COMP *cpi,
  vp9_subtract_sby(x, bs);
  if (bs >= BLOCK_SIZE_SB32X32)
-    super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
+    super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                          bs);
+                             bs, TX_32X32);
  if (bs >= BLOCK_SIZE_MB16X16)
-    super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
+    super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
-                          bs);
+                             bs, TX_16X16);
-  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8],   bs);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
-  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4],   bs);
+                           TX_8X8);
  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
                           TX_4X4);
  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
@ -915,7 +816,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
    tempa = ta;
    templ = tl;
-    ratey = cost_coeffs(cm, x, ib,
+    ratey = cost_coeffs(cm, x, 0, ib,
                        PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4, 16);
    rate += ratey;
    distortion = vp9_block_error(coeff,
@ -1142,7 +1043,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
      distortion = vp9_block_error_c(coeff,
          BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
+      rate_t = cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
                           ta_temp, tl_temp, TX_8X8, 16);
      rate += rate_t;
@ -1177,12 +1078,12 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
        distortion += vp9_block_error_c(coeff,
            BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[i], 16),
            16 << do_two);
-        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+        rate_t += cost_coeffs(cm, x, 0, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                              &ta_temp[i & 1], &tl_temp[i >> 1],
                              TX_4X4, 16);
        if (do_two) {
          i++;
-          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+          rate_t += cost_coeffs(cm, x, 0, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                                &ta_temp[i & 1], &tl_temp[i >> 1],
                                TX_4X4, 16);
        }
@ -1322,165 +1223,16 @@ static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
 }
 #endif  // !CONFIG_SB8X8
-static int rd_cost_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
+                                      int *rate, int *distortion,
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
+                                      int *skippable, BLOCK_SIZE_TYPE bsize,
-  const int bh = 1 << (b_height_log2(bsize) - 1);
+                                      TX_SIZE uv_tx_size) {
  int yoff = 4 * bw * bh;
  int p, b, cost = 0;
  MACROBLOCKD *const xd = &x->e_mbd;
  vp9_xform_quant_sbuv(cm, x, bsize);
-  for (p = 1; p < MAX_MB_PLANE; p++) {
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
-    ENTROPY_CONTEXT t_above[8], t_left[8];
+  *rate       = rdcost_uv(cm, x, bsize, uv_tx_size);
-
+  *skippable  = vp9_sbuv_is_skippable(xd, bsize);
    vpx_memcpy(t_above, xd->plane[p].above_context,
               sizeof(ENTROPY_CONTEXT) * 2 * bw >> xd->plane[p].subsampling_x);
    vpx_memcpy(t_left, xd->plane[p].left_context,
               sizeof(ENTROPY_CONTEXT) * 2 * bh >> xd->plane[p].subsampling_y);
    for (b = 0; b < bw * bh; b++) {
      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
      cost += cost_coeffs(cm, x, yoff + b, PLANE_TYPE_UV,
                          t_above + x_idx, t_left + y_idx,
                          TX_4X4, bw * bh * 4);
    }
    yoff = (yoff * 5) >> 2;  // u -> v
  }
  return cost;
 }
 // Evaluates the chroma planes using the 4x4 transform path.
 // Outputs:
 //   *rate        rd_cost_sbuv_4x4()
 //   *distortion  block_error_sbuv() with shift 2
 //   *skip        vp9_sbuv_is_skippable()
 static void super_block_uvrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
                                  int *rate, int *distortion, int *skip,
                                  BLOCK_SIZE_TYPE bsize) {
   // Call order kept as in the original: transform, quantize, then measure.
   vp9_transform_sbuv_4x4(x, bsize);
   vp9_quantize_sbuv_4x4(x, bsize);
   *rate = rd_cost_sbuv_4x4(cm, x, bsize);
   *distortion = block_error_sbuv(x, bsize, 2);
   *skip = vp9_sbuv_is_skippable(&x->e_mbd, bsize);
 }
 // Chroma rate for 8x8 transforms: returns the sum of cost_coeffs() over every
 // 8x8 transform block of each plane from 1 to MAX_MB_PLANE - 1.
 static int rd_cost_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   // Per-plane transform-block grid.
   const int tx_cols_log2 = b_width_log2(bsize) - 2;
   const int tx_cols = 1 << tx_cols_log2;
   const int tx_rows = 1 << (b_height_log2(bsize) - 2);
   const int tx_per_plane = tx_cols * tx_rows;
   // Block index at which the plane currently being costed starts.
   int plane_start = 16 * tx_cols * tx_rows;
   int total = 0;
   int plane, n;
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
     // Fresh scratch copies of this plane's contexts; cost_coeffs() is given
     // these rather than the arrays stored in xd.
     ENTROPY_CONTEXT t_above[8], t_left[8];
     // The byte count is the full product shifted right by the subsampling
     // (`*` binds tighter than `>>`).
     vpx_memcpy(t_above, xd->plane[plane].above_context,
                sizeof(ENTROPY_CONTEXT) * 4 * tx_cols >>
                    xd->plane[plane].subsampling_x);
     vpx_memcpy(t_left, xd->plane[plane].left_context,
                sizeof(ENTROPY_CONTEXT) * 4 * tx_rows >>
                    xd->plane[plane].subsampling_y);
     for (n = 0; n < tx_per_plane; n++) {
       // Raster position of transform block n inside the grid.
       const int col = n & (tx_cols - 1);
       const int row = n >> tx_cols_log2;
       total += cost_coeffs(cm, x, plane_start + n * 4, PLANE_TYPE_UV,
                            t_above + col * 2, t_left + row * 2,
                            TX_8X8, tx_per_plane * 16);
     }
     plane_start = (plane_start * 5) >> 2;  // u -> v
   }
   return total;
 }
 // Evaluates the chroma planes using the 8x8 transform path.
 // Outputs:
 //   *rate        rd_cost_sbuv_8x8()
 //   *distortion  block_error_sbuv() with shift 2
 //   *skip        vp9_sbuv_is_skippable()
 static void super_block_uvrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
                                  int *rate, int *distortion, int *skip,
                                  BLOCK_SIZE_TYPE bsize) {
   // Call order kept as in the original: transform, quantize, then measure.
   vp9_transform_sbuv_8x8(x, bsize);
   vp9_quantize_sbuv_8x8(x, bsize);
   *rate = rd_cost_sbuv_8x8(cm, x, bsize);
   *distortion = block_error_sbuv(x, bsize, 2);
   *skip = vp9_sbuv_is_skippable(&x->e_mbd, bsize);
 }
 // Chroma rate for 16x16 transforms: returns the sum of cost_coeffs() over
 // every 16x16 transform block of each plane from 1 to MAX_MB_PLANE - 1.
 static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
                               BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   // Per-plane transform-block grid.
   const int tx_cols_log2 = b_width_log2(bsize) - 3;
   const int tx_cols = 1 << tx_cols_log2;
   const int tx_rows = 1 << (b_height_log2(bsize) - 3);
   const int tx_per_plane = tx_cols * tx_rows;
   // Block index at which the plane currently being costed starts.
   int plane_start = 64 * tx_cols * tx_rows;
   int total = 0;
   int plane, n;
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
     // Fresh scratch copies of this plane's contexts; cost_coeffs() is given
     // these rather than the arrays stored in xd.
     ENTROPY_CONTEXT t_above[8], t_left[8];
     // The byte count is the full product shifted right by the subsampling
     // (`*` binds tighter than `>>`).
     vpx_memcpy(t_above, xd->plane[plane].above_context,
                sizeof(ENTROPY_CONTEXT) * 8 * tx_cols >>
                    xd->plane[plane].subsampling_x);
     vpx_memcpy(t_left, xd->plane[plane].left_context,
                sizeof(ENTROPY_CONTEXT) * 8 * tx_rows >>
                    xd->plane[plane].subsampling_y);
     for (n = 0; n < tx_per_plane; n++) {
       // Raster position of transform block n inside the grid.
       const int col = n & (tx_cols - 1);
       const int row = n >> tx_cols_log2;
       total += cost_coeffs(cm, x, plane_start + n * 16, PLANE_TYPE_UV,
                            t_above + col * 4, t_left + row * 4,
                            TX_16X16, tx_per_plane * 64);
     }
     plane_start = (plane_start * 5) >> 2;  // u -> v
   }
   return total;
 }
 // Evaluates the chroma planes using the 16x16 transform path.
 // Outputs:
 //   *rate        rd_cost_sbuv_16x16()
 //   *distortion  block_error_sbuv() with shift 2
 //   *skip        vp9_sbuv_is_skippable()
 static void super_block_uvrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
                                    int *rate, int *distortion, int *skip,
                                    BLOCK_SIZE_TYPE bsize) {
   // Call order kept as in the original: transform, quantize, then measure.
   vp9_transform_sbuv_16x16(x, bsize);
   vp9_quantize_sbuv_16x16(x, bsize);
   *rate = rd_cost_sbuv_16x16(cm, x, bsize);
   *distortion = block_error_sbuv(x, bsize, 2);
   *skip = vp9_sbuv_is_skippable(&x->e_mbd, bsize);
 }
 // Chroma rate for 32x32 transforms: returns the sum of cost_coeffs() over
 // every 32x32 transform block of each plane from 1 to MAX_MB_PLANE - 1.
 static int rd_cost_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
                               BLOCK_SIZE_TYPE bsize) {
   // Per-plane transform-block grid: bw columns by bh rows.
   const int bwl = b_width_log2(bsize) - 4, bw = 1 << bwl;
   const int bh = 1 << (b_height_log2(bsize) - 4);
   // Block index at which the plane currently being costed starts.
   int yoff = 256 * bh * bw;
   int p, b, cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
   for (p = 1; p < MAX_MB_PLANE; p++) {
     // Fresh scratch copies of this plane's contexts; cost_coeffs() is given
     // these rather than the arrays stored in xd.
     ENTROPY_CONTEXT t_above[8], t_left[8];
     // The byte count is the full product shifted right by the subsampling
     // (`*` binds tighter than `>>`).
     vpx_memcpy(t_above, xd->plane[p].above_context,
                sizeof(ENTROPY_CONTEXT) * 16 * bw >> xd->plane[p].subsampling_x);
     vpx_memcpy(t_left, xd->plane[p].left_context,
                sizeof(ENTROPY_CONTEXT) * 16 * bh >> xd->plane[p].subsampling_y);
     for (b = 0; b < bw * bh; b++) {
       // Column is b modulo bw. The mask must be a bitwise AND, as in the
       // sibling cost functions; a multiply gives the wrong column (and so
       // the wrong above-context pointer) whenever bw > 1.
       const int x_idx = b & (bw - 1), y_idx = b >> bwl;
       cost += cost_coeffs(cm, x, yoff + b * 64, PLANE_TYPE_UV,
                           t_above + x_idx * 8, t_left + y_idx * 8,
                           TX_32X32, 256 * bh * bw);
     }
     yoff = (yoff * 5) >> 2;  // u -> v
   }
   return cost;
 }
 #undef UVCTX
 // Evaluates the chroma planes using the 32x32 transform path.
 // Outputs:
 //   *rate        rd_cost_sbuv_32x32()
 //   *distortion  block_error_sbuv() with shift 0 (the smaller transform
 //                sizes in this file pass 2)
 //   *skip        vp9_sbuv_is_skippable()
 static void super_block_uvrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
                                    int *rate, int *distortion, int *skip,
                                    BLOCK_SIZE_TYPE bsize) {
   // Call order kept as in the original: transform, quantize, then measure.
   vp9_transform_sbuv_32x32(x, bsize);
   vp9_quantize_sbuv_32x32(x, bsize);
   *rate = rd_cost_sbuv_32x32(cm, x, bsize);
   *distortion = block_error_sbuv(x, bsize, 0);
   *skip = vp9_sbuv_is_skippable(&x->e_mbd, bsize);
 }
 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
@ -1492,13 +1244,17 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
  vp9_subtract_sbuv(x, bsize);
  if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
-    super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
                              TX_32X32);
  } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
-    super_block_uvrd_16x16(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
                              TX_16X16);
  } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
-    super_block_uvrd_8x8(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
                              TX_8X8);
  } else {
-    super_block_uvrd_4x4(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
                              TX_4X4);
  }
 }
@ -1735,7 +1491,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                       BLOCK_OFFSET(xd->plane[0].dqcoeff,
                                                    i, 16), 16);
      *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
+      *labelyrate += cost_coeffs(cm, x, 0, i, PLANE_TYPE_Y_WITH_DC,
                                 ta + (i & 1),
                                 tl + (i >> 1), TX_4X4, 16);
    }
@ -2245,7 +2001,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
      thisdistortion = vp9_block_error(coeff,
          BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
      *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
+      *labelyrate += cost_coeffs(cm, x, 0, i, PLANE_TYPE_Y_WITH_DC,
                                 ta + (i & 3),
                                 tl + (i >> 2), TX_4X4, 16);
    }
@ -2328,7 +2084,7 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
              BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
          otherdist += thisdistortion;
          xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
+          othercost += cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
                                   tac + (i & 1) * 2,
                                   tlc + (i & 2),
                                   TX_8X8, 16);
@ -2347,12 +2103,12 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
              BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
          *distortion += thisdistortion;
          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
+              cost_coeffs(cm, x, 0, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
                          ta + (i & 1) * 2,
                          tl + (i & 2) + ((j & 2) >> 1),
                          TX_4X4, 16);
          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j] + 1,
+              cost_coeffs(cm, x, 0, ib + iblock[j] + 1,
                          PLANE_TYPE_Y_WITH_DC,
                          ta + (i & 1) * 2 + 1,
                          tl + (i & 2) + ((j & 2) >> 1),
@ -2374,12 +2130,12 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
            otherdist += thisdistortion;
            xd->mode_info_context->mbmi.txfm_size = TX_4X4;
            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
+                cost_coeffs(cm, x, 0, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
                            tac + (i & 1) * 2,
                            tlc + (i & 2) + ((j & 2) >> 1),
                            TX_4X4, 16);
            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j] + 1,
+                cost_coeffs(cm, x, 0, ib + iblock[j] + 1,
                            PLANE_TYPE_Y_WITH_DC,
                            tac + (i & 1) * 2 + 1,
                            tlc + (i & 2) + ((j & 2) >> 1),
@ -2392,7 +2148,7 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
        thisdistortion = vp9_block_error_c(coeff,
            BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
        *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
+        *labelyrate += cost_coeffs(cm, x, 0, idx, PLANE_TYPE_Y_WITH_DC,
                                   ta + (i & 1) * 2,
                                   tl + (i & 2),
                                   TX_8X8, 16);
@ -4215,8 +3971,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
        vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
-        super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
+        super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                             &uv_skippable, BLOCK_SIZE_MB16X16);
+                                  &uv_skippable, BLOCK_SIZE_MB16X16, TX_4X4);
        rate2 += rate_uv;
        distortion2 += distortion_uv;
        skippable = skippable && uv_skippable;
@ -5235,8 +4991,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                      bsize);
      vp9_subtract_sbuv(x, bsize);
-      super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
+      super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                           &uv_skippable, bsize);
+                                &uv_skippable, bsize, TX_4X4);
      rate2 += rate_uv;
      distortion2 += distortion_uv;
      skippable = skippable && uv_skippable;