diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 124057634..bfe5e3285 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -300,7 +300,7 @@ void vp9_foreach_transformed_block(
     const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit, void *arg);
 
-static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
+static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                             TX_SIZE tx_size, int block,
                                             int *x, int *y) {
   const int bwl = b_width_log2_lookup[plane_bsize];
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 839f6239c..690ead568 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -338,7 +338,7 @@ struct inter_args {
   const int16_t *const uv_dequant;
 };
 
-static void decode_reconstruct_tx(int mi_row, int mi_col,
+static void decode_reconstruct_tx(int blk_row, int blk_col,
                                   int plane, int block,
                                   TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
                                   void *arg) {
@@ -351,35 +351,40 @@ static void decode_reconstruct_tx(int mi_row, int mi_col,
       get_uv_tx_size_impl(mbmi->tx_size, plane_bsize,
                           0, 0) : mbmi->tx_size;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
     return;
 
   if (tx_size == plane_tx_size) {
     const int16_t *const dequant = (plane == 0) ? args->y_dequant
                                                 : args->uv_dequant;
-    int x, y, eob;
-    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+    int eob;
     eob = vp9_decode_block_tokens(cm, xd, args->counts, plane, block,
-                                  plane_bsize, x, y, tx_size, args->r, dequant);
+                                  plane_bsize, blk_col, blk_row,
+                                  tx_size, args->r, dequant);
     inverse_transform_block(xd, plane, block, tx_size,
-                            &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
+                            &pd->dst.buf[4 * blk_row * pd->dst.stride +
+                                         4 * blk_col],
                             pd->dst.stride, eob);
     *args->eobtotal += eob;
   } else {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-    int bh = num_8x8_blocks_high_lookup[bsize];
-    int max_blocks_high = cm->mi_rows;
-    int max_blocks_wide = cm->mi_cols;
+    int bh = num_4x4_blocks_high_lookup[bsize];
     int step = 1 << (2 *(tx_size - 1));
     int i;
     for (i = 0; i < 4; ++i) {
       int offsetr = (i >> 1) * bh / 2;
       int offsetc = (i & 0x01) * bh / 2;
-      if ((mi_row + offsetr < max_blocks_high) &&
-          (mi_col + offsetc < max_blocks_wide))
-        decode_reconstruct_tx(mi_row + offsetr, mi_col + offsetc,
-                              plane, block + i * step, tx_size - 1,
-                              plane_bsize, arg);
+      decode_reconstruct_tx(blk_row + offsetr, blk_col + offsetc,
+                            plane, block + i * step, tx_size - 1,
+                            plane_bsize, arg);
     }
   }
 }
@@ -485,8 +490,8 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
 
         for (idy = 0; idy < mi_height; idy += bh) {
           for (idx = 0; idx < mi_width; idx += bh) {
-            decode_reconstruct_tx(mi_row + idy / 2, mi_col + idx / 2,
-                                  plane, block, max_txsize_lookup[plane_bsize],
+            decode_reconstruct_tx(idy, idx, plane, block,
+                                  max_txsize_lookup[plane_bsize],
                                   plane_bsize, &arg);
             block += step;
           }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 194aef1de..f1417e9e9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -4084,9 +4084,7 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
 
     vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
 
-    vp9_tokenize_sb_inter(cpi, td, t, !output_enabled, mi_row, mi_col,
-                          MAX(bsize, BLOCK_8X8));
-//    vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+    vp9_tokenize_sb_inter(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   }
 
   if (output_enabled) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 21909e248..6800c940c 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -530,6 +530,94 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
   }
 }
 
+void vp9_xform_quant_inter(MACROBLOCK *x, int plane, int block,
+                           int blk_row, int blk_col,
+                           BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+     switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift, qcoeff,
+                                    dqcoeff, pd->dequant, eob,
+                                    scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:
+        vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  switch (tx_size) {
+    case TX_32X32:
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff, dqcoeff,
+                           pd->dequant, eob, scan_order->scan,
+                           scan_order->iscan);
+      break;
+    case TX_16X16:
+      vp9_fdct16x16(src_diff, coeff, diff_stride);
+      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:
+      vp9_fdct8x8(src_diff, coeff, diff_stride);
+      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
+      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -619,8 +707,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
   }
 }
 
-static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
-                         TX_SIZE tx_size, void *arg) {
+static void encode_block_b(int blk_row, int blk_col, int plane,
+                           int block, BLOCK_SIZE plane_bsize,
+                           TX_SIZE tx_size, void *arg) {
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -628,55 +717,24 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  int i, j;
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
-  a = &ctx->ta[plane][i];
-  l = &ctx->tl[plane][j];
+  dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  a = &ctx->ta[plane][blk_col];
+  l = &ctx->tl[plane][blk_row];
 
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
-  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
-    p->eobs[block] = 0;
-    *a = *l = 0;
-    return;
-  }
+//  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+//    p->eobs[block] = 0;
+//    *a = *l = 0;
+//    return;
+//  }
 
-  if (!x->skip_recode) {
-    if (x->quant_fp) {
-      // Encoding process for rtc mode
-      if (x->skip_txfm[0] == 1 && plane == 0) {
-        // skip forward transform
-        p->eobs[block] = 0;
-        *a = *l = 0;
-        return;
-      } else {
-        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
-      }
-    } else {
-      if (max_txsize_lookup[plane_bsize] == tx_size) {
-        int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
-        if (x->skip_txfm[txfm_blk_index] == 0) {
-          // full forward transform and quantization
-          vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
-        } else if (x->skip_txfm[txfm_blk_index]== 2) {
-          // fast path forward transform and quantization
-          vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
-        } else {
-          // skip forward transform
-          p->eobs[block] = 0;
-          *a = *l = 0;
-          return;
-        }
-      } else {
-        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
-      }
-    }
-  }
+  vp9_xform_quant_inter(x, plane, block, blk_row, blk_col,
+                        plane_bsize, tx_size);
 
-  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+  if (x->optimize) {
     const int ctx = combine_entropy_contexts(*a, *l);
     *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
   } else {
@@ -739,6 +797,44 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
   }
 }
 
+static void encode_block_inter(int blk_row, int blk_col,
+                               int plane, int block, BLOCK_SIZE plane_bsize,
+                               TX_SIZE tx_size, void *arg) {
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->tx_size, plane_bsize,
+                          0, 0) : mbmi->tx_size;
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    encode_block_b(blk_row, blk_col, plane, block, plane_bsize, tx_size, arg);
+  } else {
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bh = num_4x4_blocks_high_lookup[bsize];
+    int step = 1 << (2 *(tx_size - 1));
+    int i;
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) * bh / 2;
+      int offsetc = (i & 0x01) * bh / 2;
+      encode_block_inter(blk_row + offsetr, blk_col + offsetc,
+                         plane, block + i * step, plane_bsize,
+                         tx_size - 1, arg);
+    }
+  }
+}
+
 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
   MACROBLOCK *const x = (MACROBLOCK *)arg;
@@ -783,18 +879,32 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
     return;
 
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    if (!x->skip_recode)
-      vp9_subtract_plane(x, bsize, plane);
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize_lookup[plane_bsize]];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_txsize_lookup[plane_bsize] * 2);
 
-    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+    vp9_subtract_plane(x, bsize, plane);
+
+    if (x->optimize) {
       const struct macroblockd_plane* const pd = &xd->plane[plane];
       const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
       vp9_get_entropy_contexts(bsize, tx_size, pd,
                                ctx.ta[plane], ctx.tl[plane]);
     }
 
-    vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
-                                           &arg);
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        encode_block_inter(idy, idx, plane, block, plane_bsize,
+                           max_txsize_lookup[plane_bsize], &arg);
+        block += step;
+      }
+    }
   }
 }
 
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 97df8a66b..1e5d444ac 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -29,6 +29,9 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant_inter(MACROBLOCK *x, int plane, int block,
+                           int blk_row, int blk_col,
+                           BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index a95f0f46d..4114b64bc 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -188,4 +188,5 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
     lf->filter_level = search_filter_level(sd, cpi,
                                            method == LPF_PICK_FROM_SUBIMAGE);
   }
+  lf->filter_level = 0;
 }
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index f0a882c8b..05e569c70 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -442,6 +442,20 @@ struct tokenize_b_args {
   TOKENEXTRA **tp;
 };
 
+static void set_entropy_context_b_inter(int plane, int block,
+                                        BLOCK_SIZE plane_bsize,
+                                        int blk_row, int blk_col,
+                                        TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args* const args = arg;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  vp9_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0,
+                   blk_col, blk_row);
+}
+
 static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
@@ -486,6 +500,85 @@ static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
   return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
+static void tokenize_b_inter(int plane, int block, BLOCK_SIZE plane_bsize,
+                             int blk_row, int blk_col,
+                             TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args* const args = arg;
+  VP9_COMP *cpi = args->cpi;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TOKENEXTRA **tp = args->tp;
+  uint8_t token_cache[32 * 32];
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+  int pt; /* near block/prev token context index */
+  int c;
+  TOKENEXTRA *t = *tp;        /* store tokens starting here */
+  int eob = p->eobs[block];
+  const PLANE_TYPE type = pd->plane_type;
+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const int segment_id = mbmi->segment_id;
+  const int16_t *scan, *nb;
+  const scan_order *so;
+  const int ref = is_inter_block(mbmi);
+  unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      td->rd_counts.coef_counts[tx_size][type][ref];
+  vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      cpi->common.fc->coef_probs[tx_size][type][ref];
+  unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
+      td->counts->eob_branch[tx_size][type][ref];
+  const uint8_t *const band = get_band_translate(tx_size);
+  const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+  int16_t token;
+  EXTRABIT extra;
+
+  pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+                           pd->left_context + blk_row);
+  so = get_scan(xd, tx_size, type, block);
+  scan = so->scan;
+  nb = so->neighbors;
+  c = 0;
+
+  while (c < eob) {
+    int v = 0;
+    int skip_eob = 0;
+    v = qcoeff[scan[c]];
+
+    while (!v) {
+      add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob,
+                         counts[band[c]][pt]);
+      eob_branch[band[c]][pt] += !skip_eob;
+
+      skip_eob = 1;
+      token_cache[scan[c]] = 0;
+      ++c;
+      pt = get_coef_context(nb, token_cache, c);
+      v = qcoeff[scan[c]];
+    }
+
+    vp9_get_token_extra(v, &token, &extra);
+
+    add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
+              (uint8_t)skip_eob, counts[band[c]][pt]);
+    eob_branch[band[c]][pt] += !skip_eob;
+
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
+    ++c;
+    pt = get_coef_context(nb, token_cache, c);
+  }
+  if (c < seg_eob) {
+    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
+                       counts[band[c]][pt]);
+    ++eob_branch[band[c]][pt];
+  }
+
+  *tp = t;
+
+  vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, blk_col, blk_row);
+}
+
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
@@ -609,47 +702,55 @@ int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
 
 void tokenize_tx(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                  int dry_run, TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
-                 int mi_row, int mi_col, int block, int plane,
+                 int blk_row, int blk_col, int block, int plane,
                  void *arg) {
-  VP9_COMMON *cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
   TX_SIZE plane_tx_size = plane ?
       get_uv_tx_size_impl(mbmi->tx_size, plane_bsize,
                           0, 0) : mbmi->tx_size;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
     return;
 
   if (tx_size == plane_tx_size) {
     if (!dry_run)
-      tokenize_b(plane, block, plane_bsize, tx_size, arg);
+      tokenize_b_inter(plane, block, plane_bsize,
+                       blk_row, blk_col, tx_size, arg);
     else
-      set_entropy_context_b(plane, block, plane_bsize, tx_size, arg);
+      set_entropy_context_b_inter(plane, block, plane_bsize,
+                                  blk_row, blk_col, tx_size, arg);
   } else {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-    int bh = num_8x8_blocks_high_lookup[bsize];
-    int max_blocks_high = cm->mi_rows;
-    int max_blocks_wide = cm->mi_cols;
+    int bh = num_4x4_blocks_wide_lookup[bsize];
     int i;
 
+    assert(num_4x4_blocks_high_lookup[bsize] ==
+           num_4x4_blocks_wide_lookup[bsize]);
+
     for (i = 0; i < 4; ++i) {
       int offsetr = (i >> 1) * bh / 2;
       int offsetc = (i & 0x01) * bh / 2;
       int step = 1 << (2 *(tx_size - 1));
-      if ((mi_row + offsetr < max_blocks_high) &&
-          (mi_col + offsetc < max_blocks_wide))
-        tokenize_tx(cpi, td, t, dry_run, tx_size - 1, plane_bsize,
-                    mi_row + offsetr, mi_col + offsetc,
-                    block + i * step, plane, arg);
+      tokenize_tx(cpi, td, t, dry_run, tx_size - 1, plane_bsize,
+                  blk_row + offsetr, blk_col + offsetc,
+                  block + i * step, plane, arg);
     }
   }
 }
 
 void vp9_tokenize_sb_inter(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                           int dry_run, int mi_row, int mi_col,
-                           BLOCK_SIZE bsize) {
+                           int dry_run, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -690,8 +791,7 @@ void vp9_tokenize_sb_inter(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bh) {
         tokenize_tx(cpi, td, t, dry_run, max_txsize_lookup[plane_bsize],
-                    plane_bsize, mi_row + idy / 2, mi_col + idx / 2,
-                    block, plane, &arg);
+                    plane_bsize, idy, idx, block, plane, &arg);
         block += step;
       }
     }
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 77aab09e5..56a06b946 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -52,8 +52,7 @@ struct VP9_COMP;
 struct ThreadData;
 
 void vp9_tokenize_sb_inter(struct VP9_COMP *cpi, struct ThreadData *td,
-                           TOKENEXTRA **t, int dry_run,
-                           int mi_row, int mi_col, BLOCK_SIZE bsize);
+                           TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
 
 void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
                      TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);