Implement transforms overlapping multiple prediction blocks

This removes the restriction that a transform block cannot exceed the
size of the prediction blocks it covers. Smoothing masks are applied
across internal prediction-block edges to reduce the discontinuities
that would otherwise defeat the efficiency of the large transform.
A 0.997%/0.895% bit-rate reduction is achieved on the derf/stdhd sets.
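
The core operation behind the smoothing is a crossfade between two
prediction signals so that the single large transform does not straddle a
hard edge. A minimal standalone sketch (not part of the patch) of the
blending arithmetic used throughout, (p0 * m + p1 * (64 - m) + 32) >> 6,
driven by the 8-tap mask_8 table the patch adds:

/* blend_sketch.c: crossfade two predictors with a 6-bit mask. */
#include <stdint.h>
#include <stdio.h>

static uint8_t blend(uint8_t p0, uint8_t p1, int m) {
  /* m in [0, 64]: 64 keeps p0, 0 keeps p1, in-between cross-fades. */
  return (uint8_t)((p0 * m + p1 * (64 - m) + 32) >> 6);
}

int main(void) {
  /* mask_8 from the patch: an 8-sample transition band. */
  static const uint8_t mask_8[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };
  int i;
  for (i = 0; i < 8; ++i)  /* fade from predictor A (200) to B (40) */
    printf("%d ", blend(200, 40, mask_8[i]));
  printf("\n");            /* prints: 200 200 195 170 70 45 40 40 */
  return 0;
}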

Change-Id: I8db241bab9fe74d864809e95f76b771ee59a2def
Author: Yue Chen
Date:   2014-08-11 16:39:23 -07:00
Parent: be17f1b338
Commit: a4dfcd9a2d
20 changed files with 3098 additions and 63 deletions

configure
View File

@@ -278,6 +278,7 @@ EXPERIMENT_LIST="
masked_interintra
filterintra
ext_tx
supertx
"
CONFIG_LIST="
external_build

View File

@@ -334,6 +334,13 @@ typedef struct macroblockd {
PARTITION_CONTEXT left_seg_context[8];
} MACROBLOCKD;
#if CONFIG_SUPERTX
static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
return mbmi->tx_size >
MIN(b_width_log2(mbmi->sb_type), b_height_log2(mbmi->sb_type));
}
#endif
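
Note that no explicit supertx flag is stored in the mode info; the state is
inferred whenever the coded tx_size exceeds the largest transform the
prediction block could normally carry, i.e. the smaller of its width and
height in log2-of-4-sample units. A minimal sketch, assuming the stock
enum values TX_4X4 = 0 .. TX_32X32 = 3:

#include <stdio.h>

enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };  /* assumed enum values */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* bwl/bhl play the role of b_width_log2()/b_height_log2(): the block
 * dimension as log2 of 4-sample units (8 -> 1, 16 -> 2, 32 -> 3). */
static int supertx_enabled_sketch(int tx_size, int bwl, int bhl) {
  return tx_size > MIN(bwl, bhl);
}

int main(void) {
  /* A BLOCK_16X8 (bwl = 2, bhl = 1) normally carries at most TX_8X8,
   * so seeing TX_16X16 on it can only come from a supertx ancestor. */
  printf("%d\n", supertx_enabled_sketch(TX_8X8, 2, 1));    /* 0 */
  printf("%d\n", supertx_enabled_sketch(TX_16X16, 2, 1));  /* 1 */
  return 0;
}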
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
PARTITION_TYPE partition) {
const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
@@ -399,7 +406,15 @@ static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) {
}
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
#if CONFIG_SUPERTX
if (!supertx_enabled(mbmi)) {
#endif
return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type);
#if CONFIG_SUPERTX
} else {
return uvsupertx_size_lookup[mbmi->tx_size];
}
#endif
}
static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,

View File

@@ -133,6 +133,15 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
{{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
};
#if CONFIG_SUPERTX
const TX_SIZE uvsupertx_size_lookup[TX_SIZES] = {
TX_4X4,
TX_4X4,
TX_8X8,
TX_16X16
};
#endif
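
With the 4:2:0 subsampling assumed here, the chroma planes cover half the
width and height of the supertx region, so the luma transform size simply
steps down to the next square size (clamped at TX_4X4). A one-line sketch
equivalent to the lookup above:

enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };

static int uv_supertx_size_sketch(int y_tx_size) {
  return y_tx_size > TX_4X4 ? y_tx_size - 1 : TX_4X4;  /* matches table */
}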
// Generates a 4-bit field in which each bit set to 1 represents a
// block-size partition: 1111 means we split the 64x64, 32x32, 16x16
// and 8x8; 1000 means we just split the 64x64 into 32x32.

View File

@@ -31,6 +31,9 @@ extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
#if CONFIG_SUPERTX
extern const TX_SIZE uvsupertx_size_lookup[TX_SIZES];
#endif
#ifdef __cplusplus
} // extern "C"

View File

@@ -31,7 +31,7 @@ static const vp9_prob default_masked_interintra_prob[BLOCK_SIZES] = {
#endif
#if CONFIG_FILTERINTRA
static const vp9_prob default_filterintra_prob[TX_SIZES][INTRA_MODES] = {
// DC V H D45 D135 D117 D153 D207 D63 TM
{153, 171, 147, 150, 129, 101, 100, 153, 132, 111},
{171, 173, 185, 131, 70, 53, 70, 148, 127, 114},
@@ -41,7 +41,17 @@ const vp9_prob default_filterintra_prob[TX_SIZES][INTRA_MODES] = {
#endif
#if CONFIG_EXT_TX
static const vp9_prob default_ext_tx_prob = 178;
#endif
#if CONFIG_SUPERTX
static const vp9_prob default_supertx_prob[TX_SIZES] = {
255, 160, 160, 160
};
static const vp9_prob default_supertxsplit_prob[TX_SIZES] = {
255, 200, 200, 200
};
#endif
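
A vp9_prob p encodes P(bit == 0) = p / 256, so these defaults start each
supertx flag out as fairly unlikely: 160 corresponds to reading a 1 about
37.5% of the time. The 255 in the TX_4X4 slot appears to be a placeholder,
since the flag is only coded for partition nodes of BLOCK_8X8 and larger
(the adaptation loops below likewise start at index 1). A quick check:

#include <stdio.h>

int main(void) {
  const int p = 160;  /* default_supertx_prob for TX_8X8..TX_32X32 */
  printf("P(supertx = 1) = %.4f\n", (256.0 - p) / 256.0);  /* 0.3750 */
  return 0;
}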
const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
@@ -372,6 +382,10 @@ void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
#if CONFIG_EXT_TX
fc->ext_tx_prob = default_ext_tx_prob;
#endif
#if CONFIG_SUPERTX
vp9_copy(fc->supertx_prob, default_supertx_prob);
vp9_copy(fc->supertxsplit_prob, default_supertxsplit_prob);
#endif
}
const vp9_tree_index vp9_switchable_interp_tree
@@ -504,6 +518,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
#if CONFIG_EXT_TX
fc->ext_tx_prob = adapt_prob(pre_fc->ext_tx_prob, counts->ext_tx);
#endif
#if CONFIG_SUPERTX
for (i = 1; i < TX_SIZES; ++i) {
fc->supertx_prob[i] = adapt_prob(pre_fc->supertx_prob[i],
counts->supertx[i]);
/* fprintf(stderr, "%d(%d %d) ", fc->supertx_prob[i],
counts->supertx[i][0], counts->supertx[i][1]);*/
}
for (i = 1; i < TX_SIZES; ++i) {
fc->supertxsplit_prob[i] = adapt_prob(pre_fc->supertxsplit_prob[i],
counts->supertxsplit[i]);
/* fprintf(stderr, "%d(%d %d) ", fc->supertxsplit_prob[i],
counts->supertxsplit[i][0], counts->supertxsplit[i][1]);*/
}
/* fprintf(stderr, "\n");*/
#endif
}
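
The adaptation follows the usual vp9 recipe: after each frame the prior
probability is pulled toward the frequency observed in the counts, and the
pull saturates as samples accumulate. A minimal sketch, assuming the stock
merge_probs() constants for mode probabilities (count saturation 20,
maximum update factor 128/256):

#include <stdio.h>

static unsigned char adapt_prob_sketch(unsigned char pre,
                                       unsigned int c0, unsigned int c1) {
  const unsigned int den = c0 + c1;
  unsigned int obs = den ? (c0 * 256 + den / 2) / den : 128;  /* P(0)*256 */
  const unsigned int count = den < 20 ? den : 20;             /* saturate */
  const unsigned int factor = 128 * count / 20;
  if (obs < 1) obs = 1;
  if (obs > 255) obs = 255;
  return (unsigned char)((pre * (256 - factor) + obs * factor + 128) >> 8);
}

int main(void) {
  /* 30 zeros vs 10 ones observed: 160 is pulled toward ~192. */
  printf("%u\n", adapt_prob_sketch(160, 30, 10));  /* prints 176 */
  return 0;
}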
static void set_default_lf_deltas(struct loopfilter *lf) {

View File

@@ -67,6 +67,10 @@ typedef struct frame_contexts {
#if CONFIG_EXT_TX
vp9_prob ext_tx_prob;
#endif
#if CONFIG_SUPERTX
vp9_prob supertx_prob[TX_SIZES];
vp9_prob supertxsplit_prob[TX_SIZES];
#endif
} FRAME_CONTEXT;
typedef struct {
@@ -101,6 +105,11 @@ typedef struct {
#if CONFIG_EXT_TX
unsigned int ext_tx[2];
#endif
#if CONFIG_SUPERTX
unsigned int supertx[TX_SIZES][2];
unsigned int supertxsplit[TX_SIZES][2];
unsigned int supertx_size[BLOCK_SIZES];
#endif
} FRAME_COUNTS;
extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];

View File

@@ -206,6 +206,13 @@ static const int mode_lf_lut[MB_MODE_COUNT] = {
1, 1, 0, 1 // INTER_MODES (ZEROMV == 0)
};
#if CONFIG_SUPERTX
static int supertx_enabled_lpf(const MB_MODE_INFO *mbmi) {
return mbmi->tx_size >
MIN(b_width_log2(mbmi->sb_type), b_height_log2(mbmi->sb_type));
}
#endif
static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
int lvl;
@@ -572,6 +579,85 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
}
#if CONFIG_SUPERTX
static void build_masks_supertx(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
const int shift_uv,
LOOP_FILTER_MASK *lfm) {
const MB_MODE_INFO *mbmi = &mi->mbmi;
const TX_SIZE tx_size_y = mbmi->tx_size;
const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
const BLOCK_SIZE block_size = 3 * (int)tx_size_y;
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
uint64_t *const int_4x4_y = &lfm->int_4x4_y;
uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
int i;
// If filter level is 0 we don't loop filter.
if (!filter_level) {
return;
} else {
const int w = num_8x8_blocks_wide_lookup[block_size];
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
vpx_memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
// These set 1 in the current block size for the block size edges.
// For instance if the block size is 32x16, we'll set :
// above = 1111
// 0000
// and
// left = 1000
// = 1000
// NOTE: in this example the low bit is the leftmost, so ( 1000 ) is
// stored as 1, not 8...
//
// U and V masks are set on a 16-bit scale.
//
*above_y |= above_prediction_mask[block_size] << shift_y;
*above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
*left_y |= left_prediction_mask[block_size] << shift_y;
*left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
if (mbmi->skip && is_inter_block(mbmi))
return;
// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
// mask to match the size of the block we are working on and then shift it
// into place.
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*above_uv |= (size_mask_uv[block_size] &
above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
*left_uv |= (size_mask_uv[block_size] &
left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
// Here we are trying to determine what to do with the internal 4x4 block
// boundaries. These differ from the 4x4 boundaries on the outside edge of
// an 8x8 in that the internal ones can be skipped and don't depend on
// the prediction block size.
if (tx_size_y == TX_4X4)
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
if (tx_size_uv == TX_4X4)
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
}
#endif
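
The only substantive difference from build_masks() is how block_size is
derived: the square sizes sit three steps apart in the BLOCK_SIZE enum, so
3 * tx_size_y converts the (square) supertx transform size into the
matching square block size, and the loop-filter masks then treat the whole
supertx region as a single block. Assuming the stock enum layout:

#include <assert.h>

enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };
enum { BLOCK_4X4 = 0, BLOCK_8X8 = 3, BLOCK_16X16 = 6, BLOCK_32X32 = 9 };

int main(void) {
  assert(3 * TX_4X4 == BLOCK_4X4);
  assert(3 * TX_8X8 == BLOCK_8X8);
  assert(3 * TX_16X16 == BLOCK_16X16);
  assert(3 * TX_32X32 == BLOCK_32X32);
  return 0;
}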
// This function does the same thing as the one above with the exception that
// it only affects the y masks. It exists because for blocks < 16x16 in size,
// we only update u and v masks on the first block.
@@ -615,6 +701,48 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
#if CONFIG_SUPERTX
static void build_y_mask_supertx(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
LOOP_FILTER_MASK *lfm) {
const MB_MODE_INFO *mbmi = &mi->mbmi;
const TX_SIZE tx_size_y = mbmi->tx_size;
const BLOCK_SIZE block_size = 3 * (int)tx_size_y;
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
uint64_t *const int_4x4_y = &lfm->int_4x4_y;
int i;
if (!filter_level) {
return;
} else {
const int w = num_8x8_blocks_wide_lookup[block_size];
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
vpx_memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
if (mbmi->skip && is_inter_block(mbmi))
return;
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
*left_y |= (size_mask[block_size] &
left_64x64_txform_mask[tx_size_y]) << shift_y;
if (tx_size_y == TX_4X4)
*int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
}
#endif
// This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col.
// TODO(JBB): This function only works for yv12.
@@ -650,6 +778,9 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
cm->mi_rows - mi_row : MI_BLOCK_SIZE);
const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
cm->mi_cols - mi_col : MI_BLOCK_SIZE);
#if CONFIG_SUPERTX
int supertx;
#endif
vp9_zero(*lfm);
@@ -687,20 +818,43 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_32X16:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_row_offset + 2 >= max_rows)
continue;
mip2 = mip + mode_info_stride * 2;
build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
case BLOCK_16X32:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_col_offset + 2 >= max_cols)
continue;
mip2 = mip + 2;
build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
default:
#if CONFIG_SUPERTX
if (mip[0]->mbmi.tx_size == TX_32X32) {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
} else {
#endif
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
@@ -717,24 +871,56 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_16X8:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_row_offset + 1 >= max_rows)
continue;
mip2 = mip + mode_info_stride;
build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
case BLOCK_8X16:
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_col_offset +1 >= max_cols)
continue;
mip2 = mip + 1;
build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
break;
default: {
#if CONFIG_SUPERTX
if (mip[0]->mbmi.tx_size == TX_16X16) {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
} else {
#endif
const int shift_y = shift_32_y[idx_32] +
shift_16_y[idx_16] +
shift_8_y[0];
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx) {
#endif
build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
#if CONFIG_SUPERTX
} else {
build_masks_supertx(lfi_n, mip[0], shift_y, shift_uv, lfm);
}
#endif
mip += offset[0];
for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
const int shift_y = shift_32_y[idx_32] +
@@ -748,12 +934,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
#if CONFIG_SUPERTX
supertx = supertx_enabled_lpf(&mip[0]->mbmi);
if (!supertx)
#endif
build_y_mask(lfi_n, mip[0], shift_y, lfm);
#if CONFIG_SUPERTX
else
build_y_mask_supertx(lfi_n, mip[0], shift_y, lfm);
#endif
}
#if CONFIG_SUPERTX
}
#endif
break;
}
}
}
#if CONFIG_SUPERTX
}
#endif
break;
}
}

View File

@@ -164,7 +164,7 @@ static int get_masked_weight(int m) {
}
static int get_hard_mask(int m) {
return 1 << MASK_WEIGHT_BITS * (m > 0);
}
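
The change above fixes the scale of the hard mask: get_hard_mask() used to
return a bare 0/1 flag, while build_masked_compound() expects weights on
the 0..(1 << MASK_WEIGHT_BITS) blend scale. Precedence matters here: '*'
binds tighter than '<<', so the expression is 1 << (MASK_WEIGHT_BITS *
(m > 0)), which yields 64 on the positive side of the boundary and 1 (not
0) on the other. A sketch, assuming MASK_WEIGHT_BITS == 6:

#include <stdio.h>

#define MASK_WEIGHT_BITS 6  /* assumed; gives the 0..64 blend scale */

static int get_hard_mask_sketch(int m) {
  return 1 << MASK_WEIGHT_BITS * (m > 0);  /* == 1 << (6 * (m > 0)) */
}

int main(void) {
  printf("%d %d\n", get_hard_mask_sketch(-3), get_hard_mask_sketch(5));
  /* prints: 1 64 */
  return 0;
}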
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4) = 0
@@ -426,18 +426,62 @@ static void build_masked_compound(uint8_t *dst, int dst_stride,
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * 64 + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << MASK_WEIGHT_BITS) - m) +
(1 << (MASK_WEIGHT_BITS - 1))) >>
MASK_WEIGHT_BITS;
}
}
#if CONFIG_SUPERTX
void generate_masked_weight_extend(int mask_index, int plane,
BLOCK_SIZE sb_type, int h, int w,
int mask_offset_x, int mask_offset_y,
uint8_t *mask, int stride) {
int i, j;
int subh = (plane ? 2 : 4) << b_height_log2(sb_type);
int subw = (plane ? 2 : 4) << b_width_log2(sb_type);
const int *a = get_mask_params(mask_index, sb_type, subh, subw);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * subw) / 4 - mask_offset_x);
int y = (i - (a[3] * subh) / 4 - mask_offset_y);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight(m);
}
}
static void build_masked_compound_extend(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int mask_index, BLOCK_SIZE sb_type,
int mask_offset_x, int mask_offset_y,
int h, int w) {
int i, j;
uint8_t mask[4096];
generate_masked_weight_extend(mask_index, plane, sb_type, h, w,
mask_offset_x, mask_offset_y, mask, 64);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * 64 + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << MASK_WEIGHT_BITS) - m) +
(1 << (MASK_WEIGHT_BITS - 1))) >>
MASK_WEIGHT_BITS;
}
}
#endif
#endif
static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
int mask_offset_x, int mask_offset_y,
#endif
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0];
@@ -495,8 +539,14 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
uint8_t tmp_dst[4096];
inter_predictor(pre, pre_buf->stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys);
#if !CONFIG_SUPERTX
build_masked_compound(dst, dst_buf->stride, tmp_dst, 64,
mi->mbmi.mask_index, mi->mbmi.sb_type, h, w);
#else
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane,
mi->mbmi.mask_index, mi->mbmi.sb_type,
mask_offset_x, mask_offset_y, h, w);
#endif
} else {
#endif
inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
@@ -527,10 +577,18 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors(xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
0, 0,
#endif
mi_x, mi_y);
} else {
build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
0, 0,
#endif
mi_x, mi_y);
}
}
}
@@ -558,6 +616,7 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
xd->plane[2].dst.stride, bsize);
#endif
}
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
@@ -573,11 +632,287 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
#endif
}
#if CONFIG_SUPERTX
static int get_masked_weight_supertx(int m) {
#define SMOOTHER_LEN 32
static const uint8_t smoothfn[2 * SMOOTHER_LEN + 1] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
};
if (m < -SMOOTHER_LEN)
return 0;
else if (m > SMOOTHER_LEN)
return 64;
else
return smoothfn[m + SMOOTHER_LEN];
}
static const uint8_t mask_8[8] = {
64, 64, 62, 52, 12, 2, 0, 0
};
static const uint8_t mask_16[16] = {
63, 62, 60, 58, 55, 50, 43, 36, 28, 21, 14, 9, 6, 4, 2, 1
};
static const uint8_t mask_32[32] = {
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 61, 57, 52, 45, 36,
28, 19, 12, 7, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
static void generate_1dmask(int length, uint8_t *mask) {
int i;
switch (length) {
case 8:
vpx_memcpy(mask, mask_8, length);
break;
case 16:
vpx_memcpy(mask, mask_16, length);
break;
case 32:
vpx_memcpy(mask, mask_32, length);
break;
default:
assert(0);
}
if (length > 16) {
for (i = 0; i < length; ++i)
mask[i] = get_masked_weight_supertx(-1 * (2 * i - length + 1));
}
}
void vp9_build_masked_inter_predictor_complex(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
int i, j;
uint8_t mask[32];
int top_w = 4 << b_width_log2(top_bsize),
top_h = 4 << b_height_log2(top_bsize);
int w = 4 << b_width_log2(bsize), h = 4 << b_height_log2(bsize);
int w_offset = (mi_col - mi_col_ori) << 3,
h_offset = (mi_row - mi_row_ori) << 3;
int m;
if (plane > 0) {
top_w = top_w >> 1; top_h = top_h >> 1;
w = w >> 1; h = h >> 1;
w_offset = w_offset >> 1; h_offset = h_offset >> 1;
}
switch (partition) {
case PARTITION_HORZ:
generate_1dmask(h, mask + h_offset);
vpx_memset(mask, 64, h_offset);
vpx_memset(mask + h_offset + h, 0, top_h - h_offset - h);
break;
case PARTITION_VERT:
generate_1dmask(w, mask + w_offset);
vpx_memset(mask, 64, w_offset);
vpx_memset(mask + w_offset + w, 0, top_w - w_offset - w);
break;
default:
assert(0);
}
for (i = 0; i < top_h; ++i)
for (j = 0; j < top_w; ++j) {
m = partition == PARTITION_HORZ ? mask[i] : mask[j];
if (m == 64)
continue;
if (m == 0)
dst[i * dst_stride + j] = dst2[i * dst2_stride + j];
else
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
(64 - m) + 32) >> 6;
}
}
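
Unlike the rectangular compound masks earlier in this file, this blend
assumes both predictors cover the entire top-level block: a 1-D mask is
applied per row for PARTITION_HORZ (per column for PARTITION_VERT), with
weight 64 keeping the first predictor, 0 switching to the second, and the
generated taps crossfading around the internal partition edge. (Note that
for lengths above 16, generate_1dmask() overwrites the copied table with
taps re-derived from the 65-entry smoother.) A reduced sketch of the
horizontal case:

#include <stdint.h>
#include <stdio.h>

/* Per-row crossfade of two same-sized predictors (PARTITION_HORZ). */
static void blend_horz_sketch(uint8_t *dst, int dst_stride,
                              const uint8_t *dst2, int dst2_stride,
                              const uint8_t *mask,  /* one weight per row */
                              int w, int h) {
  int i, j;
  for (i = 0; i < h; ++i) {
    const int m = mask[i];
    if (m == 64) continue;                     /* row kept from dst   */
    for (j = 0; j < w; ++j)
      dst[i * dst_stride + j] = (m == 0)
          ? dst2[i * dst2_stride + j]          /* row taken from dst2 */
          : (uint8_t)((dst[i * dst_stride + j] * m +
                       dst2[i * dst2_stride + j] * (64 - m) + 32) >> 6);
  }
}

int main(void) {
  uint8_t top[4 * 4], bot[4 * 4];
  const uint8_t mask[4] = { 64, 48, 16, 0 };   /* toy 4-row transition */
  int i;
  for (i = 0; i < 16; ++i) { top[i] = 200; bot[i] = 40; }
  blend_horz_sketch(top, 4, bot, 4, mask, 4, 4);
  for (i = 0; i < 4; ++i) printf("%d ", top[i * 4]);  /* 200 160 80 40 */
  printf("\n");
  return 0;
}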
#if CONFIG_MASKED_INTERINTER
void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
mask_offset_x, mask_offset_y, mi_x, mi_y);
} else {
build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
mask_offset_x, mask_offset_y, mi_x, mi_y);
}
}
}
#endif
void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_MASKED_INTERINTER
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
uint8_t *orig_dst;
int orig_dst_stride;
int bw = 4 << b_width_log2(top_bsize);
int bh = 4 << b_height_log2(top_bsize);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, 32 * 32);
orig_dst = xd->plane[0].dst.buf;
orig_dst_stride = xd->plane[0].dst.stride;
build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf;
xd->plane[0].dst.stride = 32;
switch (partition) {
case PARTITION_HORZ:
build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_VERT:
build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_SPLIT:
build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf1;
xd->plane[0].dst.stride = 32;
build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf2;
xd->plane[0].dst.stride = 32;
build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
default:
assert(0);
}
if (partition != PARTITION_SPLIT) {
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
partition);
xd->plane[0].dst.buf = orig_dst;
xd->plane[0].dst.stride = orig_dst_stride;
} else {
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(tmp_buf1, 32,
tmp_buf2, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf1, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_HORZ);
xd->plane[0].dst.buf = orig_dst;
xd->plane[0].dst.stride = orig_dst_stride;
}
}
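
For PARTITION_SPLIT the four sub-predictions are folded into one in a
fixed order: two vertical blends followed by one horizontal blend. A
sketch of the data flow implemented above (buffer names from the code;
blend_vert/blend_horz stand for vp9_build_masked_inter_predictor_complex()
called with PARTITION_VERT/PARTITION_HORZ):

/*
 *   dst      = blend_vert(dst,      tmp_buf)      top-left  | top-right
 *   tmp_buf1 = blend_vert(tmp_buf1, tmp_buf2)     bot-left  | bot-right
 *   dst      = blend_horz(dst,      tmp_buf1)     top half over bottom half
 */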
void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_MASKED_INTERINTER
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(top_bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
}
}
#endif
// TODO(jingning): This function serves as a placeholder for decoder prediction
// using on demand border extension. It should be moved to /decoder/ directory.
static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
int mask_offset_x, int mask_offset_y,
#endif
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0];
@@ -715,8 +1050,14 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
uint8_t tmp_dst[4096];
inter_predictor(buf_ptr, buf_stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys);
#if !CONFIG_SUPERTX
build_masked_compound(dst, dst_buf->stride, tmp_dst, 64,
mi->mbmi.mask_index, mi->mbmi.sb_type, h, w);
#else
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane,
mi->mbmi.mask_index, mi->mbmi.sb_type,
mask_offset_x, mask_offset_y, h, w);
#endif
} else {
#endif
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
@@ -746,10 +1087,18 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
0, 0,
#endif
mi_x, mi_y);
} else {
dec_build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_MASKED_INTERINTER
0, 0,
#endif
mi_x, mi_y);
}
}
#if CONFIG_INTERINTRA
@@ -763,6 +1112,174 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
#endif
}
#if CONFIG_SUPERTX
#if CONFIG_MASKED_INTERINTER
void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
mask_offset_x, mask_offset_y, mi_x, mi_y);
} else {
dec_build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
mask_offset_x, mask_offset_y, mi_x, mi_y);
}
}
}
#endif
void vp9_dec_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_MASKED_INTERINTER
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
uint8_t *orig_dst;
int orig_dst_stride;
int bw = 4 << b_width_log2(top_bsize);
int bh = 4 << b_height_log2(top_bsize);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, 32 * 32);
orig_dst = xd->plane[0].dst.buf;
orig_dst_stride = xd->plane[0].dst.stride;
dec_build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf;
xd->plane[0].dst.stride = 32;
switch (partition) {
case PARTITION_HORZ:
dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_VERT:
dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_SPLIT:
dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf1;
xd->plane[0].dst.stride = 32;
dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf2;
xd->plane[0].dst.stride = 32;
dec_build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
break;
default:
assert(0);
}
if (partition != PARTITION_SPLIT) {
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
partition);
xd->plane[0].dst.buf = orig_dst;
xd->plane[0].dst.stride = orig_dst_stride;
} else {
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(tmp_buf1, 32,
tmp_buf2, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(orig_dst, orig_dst_stride,
tmp_buf1, 32,
0, mi_row, mi_col,
mi_row_ori, mi_col_ori,
BLOCK_8X8, top_bsize,
PARTITION_HORZ);
xd->plane[0].dst.buf = orig_dst;
xd->plane[0].dst.stride = orig_dst_stride;
}
}
void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_MASKED_INTERINTER
const int mask_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int mask_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(top_bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
dec_build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_MASKED_INTERINTER
mask_offset_x, mask_offset_y,
#endif
mi_x, mi_y);
}
}
#endif
void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {

View File

@@ -72,6 +72,53 @@ void vp9_generate_hard_mask(int mask_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
#endif
#if CONFIG_SUPERTX
void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition);
void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize);
void vp9_build_masked_inter_predictor_complex(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition);
void vp9_dec_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE p);
void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_MASKED_INTERINTER
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize);
#if CONFIG_MASKED_INTERINTER
void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
#endif
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -335,6 +335,84 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return &xd->mi[0]->mbmi;
}
#if CONFIG_SUPERTX
static void set_offsets_extend(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE top_bsize,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori) {
const int bw = num_8x8_blocks_wide_lookup[top_bsize];
const int bh = num_8x8_blocks_high_lookup[top_bsize];
const int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row_ori, bh, mi_col_ori, bw,
cm->mi_rows, cm->mi_cols);
}
static void set_mb_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
xd->mi[0]->mbmi.sb_type = bsize;
for (y = 0; y < y_mis; ++y)
for (x = !y; x < x_mis; ++x)
xd->mi[y * cm->mi_stride + x] = xd->mi[0];
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
}
static void set_offsets_topblock(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
}
static void set_param_topblock(VP9_COMMON *const cm, MACROBLOCKD *const xd,
BLOCK_SIZE bsize, int mi_row, int mi_col,
#if CONFIG_EXT_TX
int txfm,
#endif
int skip) {
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int bh = num_8x8_blocks_high_lookup[bsize];
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
for (y = 0; y < y_mis; ++y)
for (x = 0; x < x_mis; ++x) {
xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip;
#if CONFIG_EXT_TX
xd->mi[y * cm->mi_stride + x]->mbmi.ext_txfrm = txfm;
#endif
}
}
#endif
static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int idx, int mi_row, int mi_col) {
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -348,14 +426,246 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
xd->corrupted |= ref_buffer->buf->corrupted;
}
#if CONFIG_SUPERTX
static void dec_predict_b_extend(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE top_bsize) {
set_offsets_extend(cm, xd, tile, top_bsize, mi_row, mi_col,
mi_row_ori, mi_col_ori);
set_ref(cm, xd, 0, mi_row_ori, mi_col_ori);
if (has_second_ref(&xd->mi[0]->mbmi))
set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
xd->mi[0]->mbmi.tx_size = b_width_log2(top_bsize);
#if !CONFIG_MASKED_INTERINTER
vp9_dec_build_inter_predictors_sb(xd, mi_row_ori, mi_col_ori, top_bsize);
#else
vp9_dec_build_inter_predictors_sb_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori, top_bsize);
#endif
}
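
The tx_size written here relies on one more enum coincidence:
b_width_log2() of a square block equals the TX_SIZE that covers it, so
b_width_log2(top_bsize) is exactly the supertx transform size. Assuming
the stock values:

#include <assert.h>

enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };

int main(void) {
  /* b_width_log2(): 8 -> 1, 16 -> 2, 32 -> 3 (log2 of 4-sample units) */
  assert(TX_8X8 == 1 && TX_16X16 == 2 && TX_32X32 == 3);
  return 0;
}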
static void dec_predict_b_sub8x8_extend(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition) {
set_offsets_extend(cm, xd, tile, top_bsize, mi_row, mi_col,
mi_row_ori, mi_col_ori);
set_ref(cm, xd, 0, mi_row_ori, mi_col_ori);
if (has_second_ref(&xd->mi[0]->mbmi))
set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
xd->mi[0]->mbmi.tx_size = b_width_log2(top_bsize);
vp9_dec_build_inter_predictors_sby_sub8x8_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(xd,
#if CONFIG_MASKED_INTERINTER
mi_row, mi_col,
#endif
mi_row_ori, mi_col_ori,
top_bsize);
}
static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
uint8_t *dst_buf[3], int dst_stride[3]) {
const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
MB_MODE_INFO *mbmi;
int i, offset = mi_row * cm->mi_stride + mi_col;
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, MAX_MB_PLANE * 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, MAX_MB_PLANE * 32 * 32);
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf3, MAX_MB_PLANE * 32 * 32);
uint8_t *dst_buf1[3] = {tmp_buf1, tmp_buf1 + 32 * 32, tmp_buf1 + 2 * 32 * 32};
uint8_t *dst_buf2[3] = {tmp_buf2, tmp_buf2 + 32 * 32, tmp_buf2 + 2 * 32 * 32};
uint8_t *dst_buf3[3] = {tmp_buf3, tmp_buf3 + 32 * 32, tmp_buf3 + 2 * 32 * 32};
int dst_stride1[3] = {32, 32, 32};
int dst_stride2[3] = {32, 32, 32};
int dst_stride3[3] = {32, 32, 32};
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
mbmi = &xd->mi[0]->mbmi;
partition = partition_lookup[bsl][mbmi->sb_type];
subsize = get_subsize(bsize, partition);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
}
switch (partition) {
case PARTITION_NONE:
assert(bsize < top_bsize);
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
top_bsize);
break;
case PARTITION_HORZ:
if (bsize > BLOCK_8X8) {
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
mi_col_ori, top_bsize);
} else {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
}
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = tmp_buf1 + i * 32 * 32;
xd->plane[i].dst.stride = 32;
}
dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
mi_row_ori, mi_col_ori, top_bsize);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i], dst_stride1[i],
i,
mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
}
break;
case PARTITION_VERT:
if (bsize > BLOCK_8X8) {
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
mi_col_ori, top_bsize);
} else {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
}
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = tmp_buf1 + i * 32 * 32;
xd->plane[i].dst.stride = 32;
}
dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs, mi_row_ori,
mi_col_ori, top_bsize);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
xd->plane[i].dst.stride = dst_stride[i];
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i], dst_stride1[i],
i,
mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
}
}
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
} else {
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf, dst_stride);
if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col + hbs,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf1, dst_stride1);
if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row + hbs, mi_col,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf2, dst_stride2);
if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
dec_predict_sb_complex(cm, xd, tile, mi_row + hbs, mi_col + hbs,
mi_row_ori, mi_col_ori, subsize, top_bsize,
dst_buf3, dst_stride3);
for (i = 0; i < MAX_MB_PLANE; i++) {
if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
vp9_build_masked_inter_predictor_complex(dst_buf[i], dst_stride[i],
dst_buf1[i],
dst_stride1[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
if (mi_row + hbs < cm->mi_rows) {
vp9_build_masked_inter_predictor_complex(dst_buf2[i],
dst_stride2[i],
dst_buf3[i],
dst_stride3[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_VERT);
vp9_build_masked_inter_predictor_complex(dst_buf[i],
dst_stride[i],
dst_buf2[i],
dst_stride2[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
} else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
vp9_build_masked_inter_predictor_complex(dst_buf[i],
dst_stride[i],
dst_buf2[i],
dst_stride2[i],
i, mi_row, mi_col,
mi_row_ori, mi_col_ori,
bsize, top_bsize,
PARTITION_HORZ);
}
}
}
break;
default:
assert(0);
}
}
#endif
static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize) {
const int less8x8 = bsize < BLOCK_8X8;
#if !CONFIG_SUPERTX
MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
#else
MB_MODE_INFO *mbmi;
if (!supertx_enabled) {
mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
} else {
set_mb_offsets(cm, xd, tile, bsize, mi_row, mi_col);
}
#endif
vp9_read_mode_info(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r);
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
if (less8x8)
bsize = BLOCK_8X8;
@@ -389,6 +699,9 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
mbmi->skip = 1; // skip loopfilter
}
}
#if CONFIG_SUPERTX
}
#endif
xd->corrupted |= vp9_reader_has_error(r);
}
@@ -419,45 +732,161 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int read_token, int supertx_enabled,
#endif
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
#if CONFIG_SUPERTX
int skip = 0;
#if CONFIG_EXT_TX
int txfm = 0;
#endif
#endif
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
#if CONFIG_SUPERTX
if (cm->frame_type != KEY_FRAME &&
partition != PARTITION_NONE &&
bsize <= BLOCK_32X32 &&
!supertx_enabled) {
TX_SIZE supertx_size = b_width_log2(bsize);
if (partition == PARTITION_SPLIT) {
supertx_enabled = vp9_read(r, cm->fc.supertxsplit_prob[supertx_size]);
cm->counts.supertxsplit[supertx_size][supertx_enabled]++;
} else {
supertx_enabled = vp9_read(r, cm->fc.supertx_prob[supertx_size]);
cm->counts.supertx[supertx_size][supertx_enabled]++;
}
}
if (supertx_enabled && read_token) {
int offset = mi_row * cm->mi_stride + mi_col;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[bsize],
mi_col, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
set_skip_context(xd, mi_row, mi_col);
// Here we assume mbmi->segment_id = 0
skip = read_skip(cm, xd, 0, r);
if (skip)
reset_skip_context(xd, bsize);
#if CONFIG_EXT_TX
if (bsize <= BLOCK_16X16 && !skip) {
txfm = vp9_read(r, cm->fc.ext_tx_prob);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.ext_tx[txfm];
}
#endif
}
#endif
if (subsize < BLOCK_8X8) {
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
} else {
switch (partition) {
case PARTITION_NONE:
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
break;
case PARTITION_HORZ:
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
if (mi_row + hbs < cm->mi_rows)
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
if (mi_col + hbs < cm->mi_cols)
decode_block(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + hbs, mi_col, r, subsize);
decode_partition(cm, xd, tile,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + hbs, mi_col + hbs, r, subsize);
break;
default:
assert(0 && "Invalid partition type");
}
}
#if CONFIG_SUPERTX
if (supertx_enabled && read_token) {
uint8_t *dst_buf[3];
int dst_stride[3], i;
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
for (i = 0; i < MAX_MB_PLANE; i++) {
dst_buf[i] = xd->plane[i].dst.buf;
dst_stride[i] = xd->plane[i].dst.stride;
}
dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col, mi_row, mi_col,
bsize, bsize, dst_buf, dst_stride);
if (!skip) {
int eobtotal = 0;
struct inter_args arg = { cm, xd, r, &eobtotal };
set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col);
#if CONFIG_EXT_TX
xd->mi[0]->mbmi.ext_txfrm = txfm;
#endif
vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
if (!(subsize < BLOCK_8X8) && eobtotal == 0)
skip = 1;
}
set_param_topblock(cm, xd, bsize, mi_row, mi_col,
#if CONFIG_EXT_TX
txfm,
#endif
skip);
}
#endif
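
Putting the decode side together: the supertx flag is read once per
qualifying partition node (inter frames only, partition != PARTITION_NONE,
bsize <= BLOCK_32X32, and not already inside a supertx block). At the
topmost supertx node the skip flag (and, with EXT_TX, the transform-type
bit) is read up front, the partition tree underneath is parsed for modes
only, and the whole block is then predicted and reconstructed with a
single transform. A condensed outline of the flow above (not a drop-in
implementation):

/*
 *   supertx_size = b_width_log2(bsize);
 *   supertx_enabled = vp9_read(r, partition == PARTITION_SPLIT
 *                                 ? fc.supertxsplit_prob[supertx_size]
 *                                 : fc.supertx_prob[supertx_size]);
 *   if (supertx_enabled && read_token) {      // topmost supertx node
 *     skip = read_skip(...);                  // segment_id assumed 0
 *     if (EXT_TX && bsize <= BLOCK_16X16 && !skip)
 *       txfm = vp9_read(r, fc.ext_tx_prob);
 *   }
 *   ...decode the partition tree, modes only, no residuals...
 *   dec_predict_sb_complex(...);              // predict whole block
 *   if (!skip) reconstruct with one supertx_size transform;
 *   set_param_topblock(...);                  // write back skip/txfm
 */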
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
@@ -855,7 +1284,11 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, &tile,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col,
&tile_data->bit_reader, BLOCK_64X64);
}
}
@@ -909,6 +1342,9 @@ static int tile_worker_hook(void *arg1, void *arg2) {
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, tile,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
}
}

View File

@@ -144,7 +144,11 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return segment_id;
}
#if !CONFIG_SUPERTX
static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
#else
int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
#endif
int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
return 1;
@@ -550,6 +554,9 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
MODE_INFO *const mi,
#if CONFIG_SUPERTX && CONFIG_EXT_TX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -564,6 +571,9 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
#if CONFIG_EXT_TX
if (mbmi->tx_size <= TX_16X16 &&
bsize >= BLOCK_8X8 &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
!mbmi->skip) {
mbmi->ext_txfrm = vp9_read(r, cm->fc.ext_tx_prob);
@@ -700,6 +710,9 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
static void read_inter_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -707,23 +720,46 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
mbmi->mv[0].as_int = 0;
mbmi->mv[1].as_int = 0;
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
!mbmi->skip || !inter_block, r);
#if CONFIG_SUPERTX
} else {
const int ctx = vp9_get_intra_inter_context(xd);
mbmi->segment_id = 0;
inter_block = 1;
if (!cm->frame_parallel_decoding_mode)
++cm->counts.intra_inter[ctx][1];
}
#endif
if (inter_block)
read_inter_block_mode_info(cm, xd, tile, mi,
#if CONFIG_SUPERTX && CONFIG_EXT_TX
supertx_enabled,
#endif
mi_row, mi_col, r);
else
read_intra_block_mode_info(cm, mi, r);
}
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r) {
if (frame_is_intra_only(cm))
read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
else
read_inter_frame_mode_info(cm, xd, tile,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col, r);
}

View File

@@ -21,8 +21,16 @@ struct TileInfo;
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
const struct TileInfo *const tile,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col, vp9_reader *r);
#if CONFIG_SUPERTX
int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
int segment_id, vp9_reader *r);
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -39,6 +39,18 @@ static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
static struct vp9_token partition_encodings[PARTITION_TYPES];
static struct vp9_token inter_mode_encodings[INTER_MODES];
#if CONFIG_SUPERTX
static int vp9_check_supertx(VP9_COMMON *cm, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
MODE_INFO **mi;
mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
return mi[0]->mbmi.tx_size == b_width_log2(bsize) &&
mi[0]->mbmi.sb_type < bsize;
}
#endif
void vp9_entropy_mode_init() {
vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
@@ -225,6 +237,9 @@ static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) {
}
static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
const nmv_context *nmvc = &cm->fc.nmvc;
@@ -252,12 +267,28 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
}
}
#if CONFIG_SUPERTX
if (!supertx_enabled)
#endif
skip = write_skip(cpi, segment_id, mi, w);
#if CONFIG_SUPERTX
else
skip = mbmi->skip;
#endif
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
#if CONFIG_SUPERTX
}
#endif
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!(is_inter &&
(skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
write_selected_tx_size(cpi, mbmi->tx_size, bsize, w);
@@ -305,6 +336,9 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
#if CONFIG_EXT_TX
if (mbmi->tx_size <= TX_16X16 &&
bsize >= BLOCK_8X8 &&
#if CONFIG_SUPERTX
!supertx_enabled &&
#endif
!mbmi->skip &&
!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
vp9_write(w, mbmi->ext_txfrm, cm->fc.ext_tx_prob);
@@ -451,6 +485,9 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
#if CONFIG_SUPERTX
int supertx_enabled,
#endif
int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -466,11 +503,21 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
if (frame_is_intra_only(cm)) {
write_mb_modes_kf(cpi, xd->mi, w);
} else {
#if CONFIG_SUPERTX
pack_inter_mode_mvs(cpi, m, supertx_enabled, w);
#else
pack_inter_mode_mvs(cpi, m, w);
#endif
}
#if CONFIG_SUPERTX
if (!supertx_enabled) {
#endif
assert(*tok < tok_end);
pack_mb_tokens(w, tok, tok_end);
#if CONFIG_SUPERTX
}
#endif
}
static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -497,6 +544,9 @@ static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
static void write_modes_sb(VP9_COMP *cpi,
const TileInfo *const tile,
vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
#if CONFIG_SUPERTX
int pack_token, int supertx_enabled,
#endif
int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -513,36 +563,105 @@ static void write_modes_sb(VP9_COMP *cpi,
partition = partition_lookup[bsl][m->mbmi.sb_type];
write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
subsize = get_subsize(bsize, partition);
#if CONFIG_SUPERTX
xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[bsize],
mi_col, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
if (!supertx_enabled && cm->frame_type != KEY_FRAME &&
partition != PARTITION_NONE && bsize <= BLOCK_32X32) {
TX_SIZE supertx_size = b_width_log2(bsize);
vp9_prob prob = partition == PARTITION_SPLIT ?
cm->fc.supertxsplit_prob[supertx_size] :
cm->fc.supertx_prob[supertx_size];
supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
vp9_write(w, supertx_enabled, prob);
if (supertx_enabled) {
vp9_write(w, xd->mi[0]->mbmi.skip, vp9_get_skip_prob(cm, xd));
#if CONFIG_EXT_TX
if (supertx_size <= TX_16X16 && !xd->mi[0]->mbmi.skip)
vp9_write(w, xd->mi[0]->mbmi.ext_txfrm, cm->fc.ext_tx_prob);
#endif
}
}
#endif
if (subsize < BLOCK_8X8) {
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
} else {
switch (partition) {
case PARTITION_NONE:
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
break;
case PARTITION_HORZ:
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
if (mi_row + bs < cm->mi_rows)
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row + bs, mi_col);
break;
case PARTITION_VERT:
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col);
if (mi_col + bs < cm->mi_cols)
write_modes_b(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
supertx_enabled,
#endif
mi_row, mi_col + bs);
break;
case PARTITION_SPLIT:
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col, subsize);
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row, mi_col + bs,
subsize);
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + bs, mi_col,
subsize);
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
!supertx_enabled, supertx_enabled,
#endif
mi_row + bs, mi_col + bs,
subsize);
break;
default:
assert(0);
}
}
#if CONFIG_SUPERTX
if (partition != PARTITION_NONE && supertx_enabled && pack_token) {
assert(*tok < tok_end);
pack_mb_tokens(w, tok, tok_end);
}
#endif
// update partition context
if (bsize >= BLOCK_8X8 &&
@@ -560,7 +679,11 @@ static void write_modes(VP9_COMP *cpi,
vp9_zero(cpi->mb.e_mbd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
write_modes_sb(cpi, tile, w, tok, tok_end,
#if CONFIG_SUPERTX
1, 0,
#endif
mi_row, mi_col,
BLOCK_64X64);
}
}
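
For orientation: in the write_modes_sb hunk above, the supertx flag is signaled once per partition root on inter frames, for roots of 32x32 or smaller, and only when no enclosing supertx is already active. The probability context depends on whether the partition is a split (supertxsplit_prob) or not (supertx_prob), indexed by the candidate transform size; when the flag is set, the skip flag and, under CONFIG_EXT_TX, the transform-type bit follow immediately, and per-block token packing in write_modes_b is suppressed. Below is a minimal standalone sketch of that gating logic; bool_write, SupertxProbs, and the probability values are illustrative stand-ins, not libvpx API or its trained defaults.

#include <stdio.h>

typedef unsigned char prob_t;

typedef struct {
  prob_t supertx[4];      /* indexed by candidate transform size */
  prob_t supertxsplit[4]; /* separate context when partition == SPLIT */
} SupertxProbs;

/* Hypothetical boolean writer: logs the decision instead of range coding. */
static void bool_write(int bit, prob_t p) {
  printf("bit=%d prob=%u\n", bit, (unsigned)p);
}

/* Mirrors the gate added to write_modes_sb: signal only at inter-frame
 * partition roots of 32x32 or smaller with no enclosing supertx. */
static void signal_supertx(int key_frame, int partition_none, int split,
                           int enclosing_supertx, int bsize_le_32x32,
                           int tx_size, int supertx_size,
                           const SupertxProbs *fc) {
  if (enclosing_supertx || key_frame || partition_none || !bsize_le_32x32)
    return;
  bool_write(tx_size == supertx_size,
             split ? fc->supertxsplit[supertx_size]
                   : fc->supertx[supertx_size]);
}

int main(void) {
  SupertxProbs fc = { {240, 128, 128, 128}, {240, 96, 96, 96} };  /* made up */
  /* A 32x32 split whose root tx_size matches the supertx size. */
  signal_supertx(0, 0, 1, 0, 1, 3, 3, &fc);
  return 0;
}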

File diff suppressed because it is too large

View File

@@ -578,6 +578,26 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
}
}
#if CONFIG_SUPERTX
void vp9_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct encode_b_args arg = {x, &ctx, &mbmi->skip};
int plane;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
BLOCK_SIZE plane_size = bsize - 3 * (plane > 0);
const struct macroblockd_plane* const pd = &xd->plane[plane];
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
vp9_subtract_plane(x, bsize, plane);
vp9_get_entropy_contexts(bsize, tx_size, pd,
ctx.ta[plane], ctx.tl[plane]);
encode_block(plane, 0, plane_size, b_width_log2(plane_size), &arg);
}
}
#endif
static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
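
In vp9_encode_sb_supertx above, each plane is coded with a single transform covering the whole block, and the chroma block size is derived as bsize - 3 * (plane > 0). That works because square sizes sit three steps apart in the BLOCK_SIZE enum, so subtracting 3 halves both dimensions, which is exactly the 4:2:0 chroma size. A standalone check; the enum ordering below is copied from VP9's vp9_enums.h.

#include <assert.h>

enum {
  BLOCK_4X4 = 0, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
  BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64,
  BLOCK_64X32, BLOCK_64X64
};

int main(void) {
  /* For square sizes, stepping down 3 enum slots halves width and height. */
  assert(BLOCK_64X64 - 3 == BLOCK_32X32);
  assert(BLOCK_32X32 - 3 == BLOCK_16X16);
  assert(BLOCK_16X16 - 3 == BLOCK_8X8);
  assert(BLOCK_8X8 - 3 == BLOCK_4X4);
  return 0;
}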

View File

@@ -21,6 +21,9 @@ extern "C" {
#endif
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
#if CONFIG_SUPERTX
void vp9_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize);
#endif
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);

View File

@@ -611,8 +611,10 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
int pt = combine_entropy_contexts(*A, *L);
int c, cost;
// Check for consistency of tx_size with mode info
#if !CONFIG_SUPERTX
assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
: get_uv_tx_size(mbmi) == tx_size);
#endif
if (eob == 0) {
// single eob token
@@ -777,7 +779,11 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
}
}
#if !CONFIG_SUPERTX
static void txfm_rd_in_plane(MACROBLOCK *x,
#else
void txfm_rd_in_plane(MACROBLOCK *x,
#endif
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
@@ -813,6 +819,41 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
}
}
#if CONFIG_SUPERTX
void txfm_rd_in_plane_supertx(MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size,
int use_fast_coef_casting) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
struct rdcost_block_args args;
vp9_zero(args);
args.x = x;
args.best_rd = ref_best_rd;
args.use_fast_coef_costing = use_fast_coef_casting;
vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
args.so = get_scan(xd, tx_size, pd->plane_type, 0);
block_rd_txfm(plane, 0, get_plane_block_size(bsize, pd), tx_size, &args);
if (args.skip) {
*rate = INT_MAX;
*distortion = INT64_MAX;
*sse = INT64_MAX;
*skippable = 0;
} else {
*distortion = args.this_dist;
*rate = args.this_rate;
*sse = args.this_sse;
*skippable = !x->plane[plane].eobs[0];
}
}
#endif
static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skip, int64_t *sse,
@@ -3687,6 +3728,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
const TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
#if CONFIG_SUPERTX
int *returnrate_nocoef,
#endif
int64_t *returndistortion,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
@@ -3768,6 +3812,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
x->pred_sse[i] = INT_MAX;
*returnrate = INT_MAX;
#if CONFIG_SUPERTX
*returnrate_nocoef = INT_MAX;
#endif
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
x->pred_mv_sad[ref_frame] = INT_MAX;
@@ -4042,6 +4089,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
TX_SIZE uv_tx;
#if CONFIG_FILTERINTRA
mbmi->filterbit = 0;
#endif
#if CONFIG_EXT_TX
mbmi->ext_txfrm = 0;
#endif
intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
bsize, tx_cache, best_rd);
@@ -4174,6 +4224,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Back out the coefficient coding costs
rate2 -= (rate_y + rate_uv);
// for best yrd calculation
#if CONFIG_SUPERTX
rate_y = 0;
#endif
rate_uv = 0;
// Cost the skip mb case
@@ -4253,6 +4306,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
*returnrate = rate2;
#if CONFIG_SUPERTX
*returnrate_nocoef = rate2 - rate_y - rate_uv;
if (!disable_skip) {
*returnrate_nocoef -= vp9_cost_bit(vp9_get_skip_prob(cm, xd),
skippable || this_skip2);
}
*returnrate_nocoef -= vp9_cost_bit(vp9_get_intra_inter_prob(cm, xd),
mbmi->ref_frame[0] != INTRA_FRAME);
#endif
*returndistortion = distortion2;
best_rd = this_rd;
best_mbmode = *mbmi;
@@ -4536,6 +4598,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
const TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
#if CONFIG_SUPERTX
int *returnrate_nocoef,
#endif
int64_t *returndistortion,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
@@ -4611,6 +4676,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
rate_uv_intra = INT_MAX;
*returnrate = INT_MAX;
#if CONFIG_SUPERTX
*returnrate_nocoef = INT_MAX;
#endif
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
@@ -4750,6 +4818,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (ref_frame == INTRA_FRAME) {
int rate;
#if CONFIG_EXT_TX
mbmi->ext_txfrm = 0;
#endif
if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
&distortion_y, best_rd) >= best_rd)
continue;
@@ -5004,6 +5075,15 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
*returnrate = rate2;
#if CONFIG_SUPERTX
*returnrate_nocoef = rate2 - rate_y - rate_uv;
if (!disable_skip)
*returnrate_nocoef -= vp9_cost_bit(vp9_get_skip_prob(cm, xd),
this_skip2);
*returnrate_nocoef -= vp9_cost_bit(vp9_get_intra_inter_prob(cm, xd),
mbmi->ref_frame[0] != INTRA_FRAME);
assert(*returnrate_nocoef > 0);
#endif
*returndistortion = distortion2;
best_rd = this_rd;
best_yrd = best_rd -
@@ -5109,6 +5189,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (best_rd == INT64_MAX) {
*returnrate = INT_MAX;
#if CONFIG_SUPERTX
*returnrate_nocoef = INT_MAX;
#endif
*returndistortion = INT64_MAX;
return best_rd;
}
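
The new returnrate_nocoef output in the hunks above backs the coefficient cost, the skip flag, and the intra/inter flag out of the total mode rate, so the supertx search can re-cost the coefficients under the larger transform. A small arithmetic sketch of that bookkeeping; cost_bit approximates vp9_cost_bit with -log2(p) in whole bits, whereas the codec uses table-driven costs in 1/256-bit units, and all numbers below are made up.

#include <math.h>
#include <stdio.h>

/* Approximate cost of coding `bit` against a probability-of-zero p0. */
static double cost_bit(double p0, int bit) {
  return -log2(bit ? 1.0 - p0 : p0);
}

int main(void) {
  double rate2 = 930.0, rate_y = 410.0, rate_uv = 120.0;  /* made-up rates */
  double skip_p0 = 0.7, intra_inter_p0 = 0.2;             /* made-up probs */
  int skipped = 0, is_inter = 1;
  /* Mirrors the update: total rate minus coefficient cost, minus the
   * skip bit, minus the intra/inter bit. */
  double nocoef = rate2 - rate_y - rate_uv
                  - cost_bit(skip_p0, skipped)
                  - cost_bit(intra_inter_p0, is_inter);
  printf("returnrate_nocoef ~ %.2f bits\n", nocoef);
  return 0;
}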

View File

@@ -171,6 +171,9 @@ int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
const struct TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
#if CONFIG_SUPERTX
int *returnrate_nocoef,
#endif
int64_t *returndistortion,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
@@ -191,6 +194,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
const struct TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
#if CONFIG_SUPERTX
int *returnrate_nocoef,
#endif
int64_t *returndistortion,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
@@ -222,6 +228,21 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
int mi_row, int mi_col,
const struct scale_factors *scale,
const struct scale_factors *scale_uv);
#if CONFIG_SUPERTX
void txfm_rd_in_plane_supertx(MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size,
int use_fast_coef_casting);
void txfm_rd_in_plane(MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size,
int use_fast_coef_casting);
#endif
#ifdef __cplusplus
} // extern "C"
#endif
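
The header now exports txfm_rd_in_plane and its supertx variant so the encoder's superblock search can call them directly. Per the rdopt.c hunk, the contract on early abort is to saturate all outputs so the candidate loses any best-rd comparison. A standalone sketch of that sentinel convention only; rd_for_plane and its numbers are hypothetical stand-ins, not the library's function.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the output handling in txfm_rd_in_plane_supertx. */
static void rd_for_plane(int aborted, int *rate, int64_t *dist,
                         int64_t *sse, int *skippable) {
  if (aborted) {
    *rate = INT_MAX;   /* saturate so best-rd comparisons reject it */
    *dist = INT64_MAX;
    *sse = INT64_MAX;
    *skippable = 0;
  } else {
    *rate = 250; *dist = 1000; *sse = 1200; *skippable = 0;  /* made up */
  }
}

int main(void) {
  int rate, skippable;
  int64_t dist, sse;
  rd_for_plane(1, &rate, &dist, &sse, &skippable);
  printf("aborted: rate=%d (INT_MAX, candidate rejected)\n", rate);
  return 0;
}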

View File

@@ -334,3 +334,41 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
*t = t_backup;
}
}
#if CONFIG_SUPERTX
void vp9_tokenize_sb_supertx(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
TOKENEXTRA *t_backup = *t;
const int ctx = vp9_get_skip_context(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
struct tokenize_b_args arg = {cpi, xd, t};
int plane;
if (mbmi->skip) {
if (!dry_run)
cm->counts.skip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
if (dry_run)
*t = t_backup;
return;
}
if (!dry_run) {
cm->counts.skip[ctx][0] += skip_inc;
for (plane = 0; plane < MAX_MB_PLANE; plane++) {
BLOCK_SIZE plane_size = plane ? (bsize - 3) : bsize;
tokenize_b(plane, 0, plane_size, b_width_log2(plane_size), &arg);
}
} else {
for (plane = 0; plane < MAX_MB_PLANE; plane++) {
BLOCK_SIZE plane_size = plane ? (bsize - 3) : bsize;
set_entropy_context_b(plane, 0, plane_size, b_width_log2(plane_size),
&arg);
}
*t = t_backup;
}
}
#endif
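
vp9_tokenize_sb_supertx above differs from vp9_tokenize_sb in that each plane is tokenized as a single transform block (block index 0 at the plane's block size) rather than by walking every transform block, and a skipped superblock only resets the entropy contexts. A schematic comparison of the two traversals; visit() and the block counts are illustrative.

#include <stdio.h>

static void visit(int plane, int block) {
  printf("tokenize plane %d, block %d\n", plane, block);
}

/* Regular path (schematic): every transform block in every plane. */
static void tokenize_sb(int nplanes, int nblocks) {
  int p, b;
  for (p = 0; p < nplanes; ++p)
    for (b = 0; b < nblocks; ++b)
      visit(p, b);
}

/* Supertx path: one transform spans the whole plane. */
static void tokenize_sb_supertx(int nplanes) {
  int p;
  for (p = 0; p < nplanes; ++p)
    visit(p, 0);
}

int main(void) {
  tokenize_sb(3, 4);       /* e.g. 3 planes, 4 transform blocks each */
  tokenize_sb_supertx(3);  /* 3 planes, a single block each */
  return 0;
}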

View File

@@ -46,6 +46,10 @@ struct VP9_COMP;
void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize);
#if CONFIG_SUPERTX
void vp9_tokenize_sb_supertx(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize);
#endif
extern const int16_t *vp9_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to