diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index dc6625760..2736417b8 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -432,8 +432,10 @@ typedef struct macroblockd { DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][CODING_UNIT_SIZE * CODING_UNIT_SIZE]); #if CONFIG_PALETTE - DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]); - DECLARE_ALIGNED(16, uint8_t, palette_map_buffer[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, color_index_map[2][CODING_UNIT_SIZE * + CODING_UNIT_SIZE]); + DECLARE_ALIGNED(16, uint8_t, palette_map_buffer[CODING_UNIT_SIZE * + CODING_UNIT_SIZE]); #endif // CONFIG_PALETTE ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index df15fbcda..e19b55413 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -16,7 +16,10 @@ #if CONFIG_WEDGE_PARTITION static const vp9_prob default_wedge_interinter_prob[BLOCK_SIZES] = { - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, +#if CONFIG_EXT_CODING_UNIT_SIZE + 192, 192, 192 +#endif }; #endif // CONFIG_WEDGE_PARTITION @@ -42,11 +45,17 @@ const vp9_tree_index vp9_sr_usfilter_tree[TREE_SIZE(SR_USFILTER_NUM)] = { #if CONFIG_INTERINTRA static const vp9_prob default_interintra_prob[BLOCK_SIZES] = { - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, +#if CONFIG_EXT_CODING_UNIT_SIZE + 192, 192, 192 +#endif }; #if CONFIG_WEDGE_PARTITION static const vp9_prob default_wedge_interintra_prob[BLOCK_SIZES] = { - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, +#if CONFIG_EXT_CODING_UNIT_SIZE + 192, 192, 192 +#endif }; #endif // CONFIG_WEDGE_PARTITION #endif // CONFIG_INTERINTRA diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 347ea8cf0..4a81a405a 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -579,11 +579,12 @@ static void build_masked_compound(uint8_t *dst, int dst_stride, int wedge_index, BLOCK_SIZE sb_type, int h, int w) { int i, j; - uint8_t mask[4096]; - vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask, 64); + uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; + vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask, + CODING_UNIT_SIZE); for (i = 0; i < h; ++i) for (j = 0; j < w; ++j) { - int m = mask[i * 64 + j]; + int m = mask[i * CODING_UNIT_SIZE + j]; dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m + dst2[i * dst2_stride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) + @@ -598,13 +599,14 @@ static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride, int wedge_index, BLOCK_SIZE sb_type, int h, int w) { int i, j; - uint8_t mask[4096]; + uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8); - vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask, 64); + vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask, + CODING_UNIT_SIZE); for (i = 0; i < h; ++i) for (j = 0; j < w; ++j) { - int m = mask[i * 64 + j]; + int m = mask[i * CODING_UNIT_SIZE + j]; dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m + dst2[i * dst2_stride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) + @@ -640,12 +642,13 @@ static void build_masked_compound_extend(uint8_t *dst, int dst_stride, int wedge_offset_x, int 
wedge_offset_y, int h, int w) { int i, j; - uint8_t mask[4096]; + uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; generate_masked_weight_extend(wedge_index, plane, sb_type, h, w, - wedge_offset_x, wedge_offset_y, mask, 64); + wedge_offset_x, wedge_offset_y, mask, + CODING_UNIT_SIZE); for (i = 0; i < h; ++i) for (j = 0; j < w; ++j) { - int m = mask[i * 64 + j]; + int m = mask[i * CODING_UNIT_SIZE + j]; dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m + dst2[i * dst2_stride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) + @@ -662,14 +665,15 @@ static void build_masked_compound_extend_highbd( int wedge_offset_x, int wedge_offset_y, int h, int w) { int i, j; - uint8_t mask[4096]; + uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8); generate_masked_weight_extend(wedge_index, plane, sb_type, h, w, - wedge_offset_x, wedge_offset_y, mask, 64); + wedge_offset_x, wedge_offset_y, mask, + CODING_UNIT_SIZE); for (i = 0; i < h; ++i) for (j = 0; j < w; ++j) { - int m = mask[i * 64 + j]; + int m = mask[i * CODING_UNIT_SIZE + j]; dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m + dst2[i * dst2_stride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) + @@ -765,33 +769,33 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, if (ref && get_wedge_bits(mi->mbmi.sb_type) && mi->mbmi.use_wedge_interinter) { #if CONFIG_VP9_HIGHBITDEPTH - uint8_t tmp_dst_[8192]; + uint8_t tmp_dst_[2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE]; uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_; #else - uint8_t tmp_dst[4096]; + uint8_t tmp_dst[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; #endif #if CONFIG_GLOBAL_MOTION if (is_global) { vp9_warp_plane(gm[ref], pre_buf->buf0, pre_buf->width, pre_buf->height, pre_buf->stride, tmp_dst, (mi_x >> pd->subsampling_x) + x, - (mi_y >> pd->subsampling_y) + y, w, h, 64, + (mi_y >> pd->subsampling_y) + y, w, h, CODING_UNIT_SIZE, pd->subsampling_x, pd->subsampling_y, xs, ys); } else { #endif // CONFIG_GLOBAL_MOTION #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(pre, pre_buf->stride, tmp_dst, 64, - subpel_x, subpel_y, sf, w, h, 0, kernel, - xs, ys, xd->bd); + highbd_inter_predictor(pre, pre_buf->stride, tmp_dst, + CODING_UNIT_SIZE, subpel_x, subpel_y, sf, w, h, + 0, kernel, xs, ys, xd->bd); } else { - inter_predictor(pre, pre_buf->stride, tmp_dst, 64, + inter_predictor(pre, pre_buf->stride, tmp_dst, CODING_UNIT_SIZE, subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys); } #else - inter_predictor(pre, pre_buf->stride, tmp_dst, 64, + inter_predictor(pre, pre_buf->stride, tmp_dst, CODING_UNIT_SIZE, subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_GLOBAL_MOTION @@ -801,19 +805,20 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { build_masked_compound_extend_highbd( - dst, dst_buf->stride, tmp_dst, 64, plane, + dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); } else { build_masked_compound_extend( - dst, dst_buf->stride, tmp_dst, 64, plane, + dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); } #else - build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 
64, plane, + build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, + CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); @@ -821,12 +826,13 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, #else // CONFIG_SUPERTX #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - build_masked_compound_highbd(dst, dst_buf->stride, tmp_dst, 64, + build_masked_compound_highbd(dst, dst_buf->stride, tmp_dst, + CODING_UNIT_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); else #endif // CONFIG_VP9_HIGHBITDEPTH - build_masked_compound(dst, dst_buf->stride, tmp_dst, 64, + build_masked_compound(dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); #endif // CONFIG_SUPERTX @@ -1514,33 +1520,33 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, if (ref && get_wedge_bits(mi->mbmi.sb_type) && mi->mbmi.use_wedge_interinter) { #if CONFIG_VP9_HIGHBITDEPTH - uint8_t tmp_dst_[8192]; + uint8_t tmp_dst_[2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE]; uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_; #else - uint8_t tmp_dst[4096]; + uint8_t tmp_dst[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; #endif #if CONFIG_GLOBAL_MOTION if (is_global) { vp9_warp_plane(gm[ref], pre_buf->buf0, pre_buf->width, pre_buf->height, pre_buf->stride, tmp_dst, (mi_x >> pd->subsampling_x) + x, - (mi_y >> pd->subsampling_y) + y, w, h, 64, + (mi_y >> pd->subsampling_y) + y, w, h, CODING_UNIT_SIZE, pd->subsampling_x, pd->subsampling_y, xs, ys); } else { #endif // CONFIG_GLOBAL_MOTION #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, buf_stride, tmp_dst, 64, + highbd_inter_predictor(buf_ptr, buf_stride, tmp_dst, CODING_UNIT_SIZE, subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys, xd->bd); } else { - inter_predictor(buf_ptr, buf_stride, tmp_dst, 64, + inter_predictor(buf_ptr, buf_stride, tmp_dst, CODING_UNIT_SIZE, subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys); } #else - inter_predictor(buf_ptr, buf_stride, tmp_dst, 64, + inter_predictor(buf_ptr, buf_stride, tmp_dst, CODING_UNIT_SIZE, subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_GLOBAL_MOTION @@ -1550,18 +1556,20 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { build_masked_compound_extend_highbd( - dst, dst_buf->stride, tmp_dst, 64, plane, + dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); } else { - build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane, + build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, + CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); } #else - build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane, + build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, + CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); @@ -1569,16 +1577,17 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, #else // CONFIG_SUPERTX #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - 
build_masked_compound_highbd(dst, dst_buf->stride, tmp_dst, 64, + build_masked_compound_highbd(dst, dst_buf->stride, tmp_dst, + CODING_UNIT_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); } else { - build_masked_compound(dst, dst_buf->stride, tmp_dst, 64, + build_masked_compound(dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); } #else - build_masked_compound(dst, dst_buf->stride, tmp_dst, 64, + build_masked_compound(dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -2060,19 +2069,19 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, if (ref && get_wedge_bits(mi->mbmi.sb_type) && mi->mbmi.use_wedge_interinter) { #if CONFIG_VP9_HIGHBITDEPTH - uint8_t tmp_dst_[8192]; + uint8_t tmp_dst_[2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE]; uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_; #else - uint8_t tmp_dst[4096]; + uint8_t tmp_dst[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_GLOBAL_MOTION if (is_global) { vp9_warp_plane(gm[ref], pre_buf->buf0, pre_buf->width, pre_buf->height, pre_buf->stride, tmp_dst, (mi_x >> pd->subsampling_x) + x, - (mi_y >> pd->subsampling_y) + y, w, h, 64, + (mi_y >> pd->subsampling_y) + y, w, h, CODING_UNIT_SIZE, pd->subsampling_x, pd->subsampling_y, xs, ys); } else { #endif // CONFIG_GLOBAL_MOTION @@ -2080,18 +2089,20 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int k; for (k = 0; k < h; ++k) - vpx_memcpy(tmp_dst_ + 128 * k, ext_dst1 + ext_dst_stride1 * 2 * k, - w * 2); + vpx_memcpy(tmp_dst_ + 2 * CODING_UNIT_SIZE * k, ext_dst1 + + ext_dst_stride1 * 2 * k, w * 2); } else { int k; for (k = 0; k < h; ++k) - vpx_memcpy(tmp_dst_ + 64 * k, ext_dst1 + ext_dst_stride1 * k, w); + vpx_memcpy(tmp_dst_ + CODING_UNIT_SIZE * k, ext_dst1 + + ext_dst_stride1 * k, w); } #else { int k; for (k = 0; k < h; ++k) - vpx_memcpy(tmp_dst + 64 * k, ext_dst1 + ext_dst_stride1 * k, w); + vpx_memcpy(tmp_dst + CODING_UNIT_SIZE * k, ext_dst1 + + ext_dst_stride1 * k, w); } #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_GLOBAL_MOTION @@ -2101,19 +2112,20 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { build_masked_compound_extend_highbd( - dst, dst_buf->stride, tmp_dst, 64, plane, + dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); } else { build_masked_compound_extend( - dst, dst_buf->stride, tmp_dst, 64, plane, + dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); } #else - build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane, + build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, + CODING_UNIT_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w); @@ -2121,12 +2133,13 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, #else // CONFIG_SUPERTX #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - build_masked_compound_highbd(dst, dst_buf->stride, tmp_dst, 64, + build_masked_compound_highbd(dst, dst_buf->stride, 
tmp_dst, + CODING_UNIT_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); else #endif // CONFIG_VP9_HIGHBITDEPTH - build_masked_compound(dst, dst_buf->stride, tmp_dst, 64, + build_masked_compound(dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); #endif // CONFIG_SUPERTX diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 638ff9664..d299c17d3 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -1443,6 +1443,7 @@ static INLINE TX_SIZE blocklen_to_txsize(int bs) { return TX_32X32; break; case 64: + case 128: default: #if CONFIG_TX64X64 return TX_64X64; @@ -1768,7 +1769,7 @@ static void combine_interintra(PREDICTION_MODE mode, #if CONFIG_WEDGE_PARTITION if (use_wedge_interintra && get_wedge_bits(bsize)) { - uint8_t mask[4096]; + uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; vp9_generate_masked_weight_interintra(wedge_index, bsize, bh, bw, mask, bw); for (i = 0; i < bh; ++i) { for (j = 0; j < bw; ++j) { @@ -1917,7 +1918,7 @@ static void combine_interintra_highbd(PREDICTION_MODE mode, #if CONFIG_WEDGE_PARTITION if (use_wedge_interintra && get_wedge_bits(bsize)) { - uint8_t mask[4096]; + uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; vp9_generate_masked_weight_interintra(wedge_index, bsize, bh, bw, mask, bw); for (i = 0; i < bh; ++i) { for (j = 0; j < bw; ++j) { @@ -2418,7 +2419,8 @@ void vp9_build_interintra_predictors_sby(MACROBLOCKD *xd, int bh = 4 << b_height_log2_lookup[bsize]; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED_ARRAY(16, uint16_t, intrapredictor, 4096); + DECLARE_ALIGNED_ARRAY(16, uint16_t, intrapredictor, CODING_UNIT_SIZE * + CODING_UNIT_SIZE); build_intra_predictors_for_interintra_highbd( xd, xd->plane[0].dst.buf, xd->plane[0].dst.stride, CONVERT_TO_BYTEPTR(intrapredictor), bw, @@ -2466,8 +2468,10 @@ void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd, int bh = 4 << b_height_log2_lookup[uvbsize]; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED_ARRAY(16, uint16_t, uintrapredictor, 4096); - DECLARE_ALIGNED_ARRAY(16, uint16_t, vintrapredictor, 4096); + DECLARE_ALIGNED_ARRAY(16, uint16_t, uintrapredictor, CODING_UNIT_SIZE * + CODING_UNIT_SIZE); + DECLARE_ALIGNED_ARRAY(16, uint16_t, vintrapredictor, CODING_UNIT_SIZE * + CODING_UNIT_SIZE); build_intra_predictors_for_interintra_highbd( xd, xd->plane[1].dst.buf, xd->plane[1].dst.stride, CONVERT_TO_BYTEPTR(uintrapredictor), bw, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index cdb24ed44..7caa9295f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1563,6 +1563,35 @@ if (vpx_config("CONFIG_WEDGE_PARTITION") eq "yes") { add_proto qw/unsigned int vp9_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; specialize qw/vp9_masked_sad4x4/; + + if (vpx_config("CONFIG_EXT_CODING_UNIT_SIZE") eq "yes") { + add_proto qw/unsigned int vp9_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_masked_variance128x128/; + + add_proto qw/unsigned int vp9_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize 
qw/vp9_masked_variance128x64/; + + add_proto qw/unsigned int vp9_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_masked_variance64x128/; + + add_proto qw/unsigned int vp9_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_masked_sub_pixel_variance128x128/; + + add_proto qw/unsigned int vp9_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_masked_sub_pixel_variance128x64/; + + add_proto qw/unsigned int vp9_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_masked_sub_pixel_variance64x128/; + + add_proto qw/unsigned int vp9_masked_sad128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; + specialize qw/vp9_masked_sad128x128/; + + add_proto qw/unsigned int vp9_masked_sad128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; + specialize qw/vp9_masked_sad128x64/; + + add_proto qw/unsigned int vp9_masked_sad64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; + specialize qw/vp9_masked_sad64x128/; + } } # ENCODEMB INVOKE @@ -2797,6 +2826,71 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vp9_highbd_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; specialize qw/vp9_highbd_masked_sad4x4/; + + if (vpx_config("CONFIG_EXT_CODING_UNIT_SIZE") eq "yes") { + add_proto qw/unsigned int vp9_highbd_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_masked_variance128x128/; + + add_proto qw/unsigned int vp9_highbd_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_masked_variance128x64/; + + add_proto qw/unsigned int vp9_highbd_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_masked_variance64x128/; + + add_proto qw/unsigned int vp9_highbd_10_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_variance128x128/; + + add_proto qw/unsigned int vp9_highbd_10_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_variance128x64/; + + add_proto qw/unsigned int vp9_highbd_10_masked_variance64x128/, 
"const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_variance64x128/; + + add_proto qw/unsigned int vp9_highbd_12_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_variance128x128/; + + add_proto qw/unsigned int vp9_highbd_12_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_variance128x64/; + + add_proto qw/unsigned int vp9_highbd_12_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_variance64x128/; + + add_proto qw/unsigned int vp9_highbd_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_masked_sub_pixel_variance128x128/; + + add_proto qw/unsigned int vp9_highbd_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_masked_sub_pixel_variance128x64/; + + add_proto qw/unsigned int vp9_highbd_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_masked_sub_pixel_variance64x128/; + + add_proto qw/unsigned int vp9_highbd_10_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_sub_pixel_variance128x128/; + + add_proto qw/unsigned int vp9_highbd_10_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_sub_pixel_variance128x64/; + + add_proto qw/unsigned int vp9_highbd_10_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_10_masked_sub_pixel_variance64x128/; + + add_proto qw/unsigned int vp9_highbd_12_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_12_masked_sub_pixel_variance128x128/; + + add_proto qw/unsigned int vp9_highbd_12_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_12_masked_sub_pixel_variance128x64/; + + add_proto qw/unsigned int vp9_highbd_12_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int 
yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; + specialize qw/vp9_highbd_12_masked_sub_pixel_variance64x128/; + + add_proto qw/unsigned int vp9_highbd_masked_sad128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; + specialize qw/vp9_highbd_masked_sad128x128/; + + add_proto qw/unsigned int vp9_highbd_masked_sad128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; + specialize qw/vp9_highbd_masked_sad128x64/; + + add_proto qw/unsigned int vp9_highbd_masked_sad64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; + specialize qw/vp9_highbd_masked_sad64x128/; + } } # ENCODEMB INVOKE diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index d982c41c1..b5d215f2b 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -2770,7 +2770,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, 0, #endif mi_row, mi_col, - &tile_data->bit_reader, BLOCK_64X64); + &tile_data->bit_reader, BLOCK_LARGEST); } pbi->mb.corrupted |= tile_data->xd.corrupted; } diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index db10fb6d4..fdea439a6 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -626,7 +626,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, if (!is_inter && bsize >= BLOCK_8X8 && cm->allow_palette_mode) { int n, i, j, k, rows, cols, palette_ctx, color_ctx; int color_new_idx = -1, color_order[PALETTE_MAX_SIZE]; - uint8_t buffer[4096]; + uint8_t buffer[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; const MODE_INFO *above_mi = xd->up_available ? xd->mi[-xd->mi_stride].src_mi : NULL; const MODE_INFO *left_mi = xd->left_available ? 
@@ -1039,7 +1039,7 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, ) { int n, m1, m2, i, j, k, rows, cols, palette_ctx, color_ctx; int color_new_idx = -1, color_order[PALETTE_MAX_SIZE]; - uint8_t buffer[4096]; + uint8_t buffer[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; palette_ctx = 0; if (above_mi) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 6f3e9ed31..3456112cf 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -157,8 +157,10 @@ struct macroblock { int eob, int bd); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_PALETTE - DECLARE_ALIGNED(16, double, kmeans_data_buffer[MAX_MB_PLANE * 64 * 64]); - DECLARE_ALIGNED(16, int, kmeans_indices_buffer[64 * 64]); + DECLARE_ALIGNED(16, double, kmeans_data_buffer[MAX_MB_PLANE * + CODING_UNIT_SIZE * CODING_UNIT_SIZE]); + DECLARE_ALIGNED(16, int, kmeans_indices_buffer[CODING_UNIT_SIZE * + CODING_UNIT_SIZE]); #endif // CONFIG_PALETTE }; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 9a22501f2..90891ffaa 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2995,7 +2995,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, set_offsets(cpi, tile, mi_row, mi_col, bsize); #if CONFIG_PALETTE - if (bsize == BLOCK_64X64) { + if (bsize == BLOCK_LARGEST) { c = &pc_tree->current; c->palette_buf_size = cm->current_palette_size; vpx_memcpy(c->palette_colors_buf, cm->current_palette_colors, diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 3210b3213..4370b72e0 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -876,6 +876,11 @@ static unsigned int fnname##_bits12(const uint8_t *src_ptr, \ m, m_stride) >> 4; \ } +#if CONFIG_EXT_CODING_UNIT_SIZE +MAKE_MBFP_SAD_WRAPPER(vp9_highbd_masked_sad128x128) +MAKE_MBFP_SAD_WRAPPER(vp9_highbd_masked_sad128x64) +MAKE_MBFP_SAD_WRAPPER(vp9_highbd_masked_sad64x128) +#endif MAKE_MBFP_SAD_WRAPPER(vp9_highbd_masked_sad64x64) MAKE_MBFP_SAD_WRAPPER(vp9_highbd_masked_sad64x32) MAKE_MBFP_SAD_WRAPPER(vp9_highbd_masked_sad32x64) @@ -1059,6 +1064,20 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { #endif #if CONFIG_WEDGE_PARTITION +#if CONFIG_EXT_CODING_UNIT_SIZE + HIGHBD_MBFP(BLOCK_128X128, + vp9_highbd_masked_sad128x128_bits8, + vp9_highbd_masked_variance128x128, + vp9_highbd_masked_sub_pixel_variance128x128) + HIGHBD_MBFP(BLOCK_128X64, + vp9_highbd_masked_sad128x64_bits8, + vp9_highbd_masked_variance128x64, + vp9_highbd_masked_sub_pixel_variance128x64) + HIGHBD_MBFP(BLOCK_64X128, + vp9_highbd_masked_sad64x128_bits8, + vp9_highbd_masked_variance64x128, + vp9_highbd_masked_sub_pixel_variance64x128) +#endif // CONFIG_EXT_CODING_UNIT_SIZE HIGHBD_MBFP(BLOCK_64X64, vp9_highbd_masked_sad64x64_bits8, vp9_highbd_masked_variance64x64, @@ -1278,6 +1297,20 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { #endif #if CONFIG_WEDGE_PARTITION +#if CONFIG_EXT_CODING_UNIT_SIZE + HIGHBD_MBFP(BLOCK_128X128, + vp9_highbd_masked_sad128x128_bits10, + vp9_highbd_10_masked_variance128x128, + vp9_highbd_10_masked_sub_pixel_variance128x128) + HIGHBD_MBFP(BLOCK_128X64, + vp9_highbd_masked_sad128x64_bits10, + vp9_highbd_10_masked_variance128x64, + vp9_highbd_10_masked_sub_pixel_variance128x64) + HIGHBD_MBFP(BLOCK_64X128, + vp9_highbd_masked_sad64x128_bits10, + vp9_highbd_10_masked_variance64x128, + vp9_highbd_10_masked_sub_pixel_variance64x128) +#endif // CONFIG_EXT_CODING_UNIT_SIZE HIGHBD_MBFP(BLOCK_64X64, vp9_highbd_masked_sad64x64_bits10, vp9_highbd_10_masked_variance64x64, @@ 
-1497,6 +1530,20 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { #endif #if CONFIG_WEDGE_PARTITION +#if CONFIG_EXT_CODING_UNIT_SIZE + HIGHBD_MBFP(BLOCK_128X128, + vp9_highbd_masked_sad128x128_bits12, + vp9_highbd_12_masked_variance128x128, + vp9_highbd_12_masked_sub_pixel_variance128x128) + HIGHBD_MBFP(BLOCK_128X64, + vp9_highbd_masked_sad128x64_bits12, + vp9_highbd_12_masked_variance128x64, + vp9_highbd_12_masked_sub_pixel_variance128x64) + HIGHBD_MBFP(BLOCK_64X128, + vp9_highbd_masked_sad64x128_bits12, + vp9_highbd_12_masked_variance64x128, + vp9_highbd_12_masked_sub_pixel_variance64x128) +#endif // CONFIG_EXT_CODING_UNIT_SIZE HIGHBD_MBFP(BLOCK_64X64, vp9_highbd_masked_sad64x64_bits12, vp9_highbd_12_masked_variance64x64, @@ -1995,6 +2042,14 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->fn_ptr[BT].mvf = MVF; \ cpi->fn_ptr[BT].msvf = MSVF; +#if CONFIG_EXT_CODING_UNIT_SIZE + MBFP(BLOCK_128X128, vp9_masked_sad128x128, vp9_masked_variance128x128, + vp9_masked_sub_pixel_variance128x128) + MBFP(BLOCK_128X64, vp9_masked_sad128x64, vp9_masked_variance128x64, + vp9_masked_sub_pixel_variance128x64) + MBFP(BLOCK_64X128, vp9_masked_sad64x128, vp9_masked_variance64x128, + vp9_masked_sub_pixel_variance64x128) +#endif MBFP(BLOCK_64X64, vp9_masked_sad64x64, vp9_masked_variance64x64, vp9_masked_sub_pixel_variance64x64) MBFP(BLOCK_64X32, vp9_masked_sad64x32, vp9_masked_variance64x32, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index c4cd87756..46e5865b7 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2481,9 +2481,11 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, #endif // CONFIG_VP9_HIGHBITDEPTH vpx_memset(x->kmeans_data_buffer, 0, - sizeof(x->kmeans_data_buffer[0] * 4096)); + sizeof(x->kmeans_data_buffer[0]) * CODING_UNIT_SIZE * + CODING_UNIT_SIZE); vpx_memset(x->kmeans_indices_buffer, 0, - sizeof(x->kmeans_indices_buffer[0] * 4096)); + sizeof(x->kmeans_indices_buffer[0]) * CODING_UNIT_SIZE * + CODING_UNIT_SIZE); mic->mbmi.palette_enabled[0] = 1; vp9_cost_tokens(palette_size_cost, cpi->common.fc.palette_size_prob[bsize - BLOCK_8X8], @@ -5114,8 +5116,8 @@ static void do_masked_motion_search_indexed(VP9_COMP *cpi, MACROBLOCK *x, int w = (4 << b_width_log2_lookup[sb_type]); int h = (4 << b_height_log2_lookup[sb_type]); int i, j; - uint8_t mask[4096]; - int mask_stride = 64; + uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; + int mask_stride = CODING_UNIT_SIZE; vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask, mask_stride); // vp9_generate_hard_mask(wedge_index, sb_type, h, w, mask, mask_stride); @@ -5628,10 +5630,15 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (have_newmv_in_inter_mode(this_mode)) { int_mv tmp_mv[2]; int rate_mvs[2], tmp_rate_mv = 0; - uint8_t pred0[8192 * 3], pred1[8192 * 3]; - uint8_t *preds0[3] = {pred0, pred0 + 8192, pred0 + 16384}; - uint8_t *preds1[3] = {pred1, pred1 + 8192, pred1 + 16384}; - int strides[3] = {64, 64, 64}; + uint8_t pred0[2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE * 3]; + uint8_t pred1[2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE * 3]; + uint8_t *preds0[3] = {pred0, + pred0 + 2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE, + pred0 + 4 * CODING_UNIT_SIZE * CODING_UNIT_SIZE}; + uint8_t *preds1[3] = {pred1, + pred1 + 2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE, + pred1 + 4 * CODING_UNIT_SIZE * CODING_UNIT_SIZE}; + int strides[3] = {CODING_UNIT_SIZE, CODING_UNIT_SIZE, CODING_UNIT_SIZE}; vp9_build_inter_predictors_for_planes_single_buf( xd, bsize, mi_row, 
mi_col, 0, preds0, strides); vp9_build_inter_predictors_for_planes_single_buf( @@ -5702,10 +5709,15 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[1].as_int = cur_mv[1].as_int; } } else { - uint8_t pred0[8192 * 3], pred1[8192 * 3]; - uint8_t *preds0[3] = {pred0, pred0 + 8192, pred0 + 16384}; - uint8_t *preds1[3] = {pred1, pred1 + 8192, pred1 + 16384}; - int strides[3] = {64, 64, 64}; + uint8_t pred0[2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE * 3]; + uint8_t pred1[2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE * 3]; + uint8_t *preds0[3] = {pred0, + pred0 + 2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE, + pred0 + 4 * CODING_UNIT_SIZE * CODING_UNIT_SIZE}; + uint8_t *preds1[3] = {pred1, + pred1 + 2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE, + pred1 + 4 * CODING_UNIT_SIZE * CODING_UNIT_SIZE}; + int strides[3] = {CODING_UNIT_SIZE, CODING_UNIT_SIZE, CODING_UNIT_SIZE}; vp9_build_inter_predictors_for_planes_single_buf( xd, bsize, mi_row, mi_col, 0, preds0, strides); vp9_build_inter_predictors_for_planes_single_buf( @@ -5761,7 +5773,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #ifdef WEDGE_INTERINTRA_REFINE_SEARCH int bw = 4 << b_width_log2_lookup[mbmi->sb_type], bh = 4 << b_height_log2_lookup[mbmi->sb_type]; - uint8_t mask[4096]; + uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; int_mv tmp_mv; int tmp_rate_mv = 0; #endif // WEDGE_INTERINTRA_REFINE_SEARCH @@ -5769,7 +5781,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[1] = NONE; for (j = 0; j < MAX_MB_PLANE; j++) { xd->plane[j].dst.buf = tmp_buf + j * tmp_buf_sz; - xd->plane[j].dst.stride = 64; + xd->plane[j].dst.stride = CODING_UNIT_SIZE; } vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); restore_dst_buf(xd, orig_dst, orig_dst_stride); @@ -5781,8 +5793,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interintra_uv_mode = interintra_mode; rmode = cpi->mbmode_cost[mbmi->interintra_mode]; vp9_build_interintra_predictors(xd, tmp_buf, tmp_buf + tmp_buf_sz, - tmp_buf + 2 * tmp_buf_sz, 64, 64, 64, - bsize); + tmp_buf + 2 * tmp_buf_sz, + CODING_UNIT_SIZE, CODING_UNIT_SIZE, + CODING_UNIT_SIZE, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &skip_txfm_sb, &skip_sse_sb); rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_sum, dist_sum); @@ -5799,8 +5812,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (wedge_bits) { mbmi->use_wedge_interintra = 0; vp9_build_interintra_predictors(xd, tmp_buf, tmp_buf + tmp_buf_sz, - tmp_buf + 2 * tmp_buf_sz, 64, 64, 64, - bsize); + tmp_buf + 2 * tmp_buf_sz, + CODING_UNIT_SIZE, CODING_UNIT_SIZE, + CODING_UNIT_SIZE, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL); rwedge = vp9_cost_bit(cm->fc.wedge_interintra_prob[bsize], 0); rd = RDCOST(x->rdmult, x->rddiv, @@ -5815,8 +5829,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interintra_wedge_index = wedge_index; mbmi->interintra_uv_wedge_index = wedge_index; vp9_build_interintra_predictors(xd, tmp_buf, tmp_buf + tmp_buf_sz, - tmp_buf + 2 * tmp_buf_sz, 64, 64, 64, - bsize); + tmp_buf + 2 * tmp_buf_sz, + CODING_UNIT_SIZE, CODING_UNIT_SIZE, + CODING_UNIT_SIZE, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL); rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv_tmp + rwedge + rate_sum, dist_sum); @@ -6186,7 +6201,8 @@ static void rd_pick_palette_444(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, const uint8_t *src_y = x->plane[0].src.buf; const uint8_t *src_u = 
x->plane[1].src.buf; const uint8_t *src_v = x->plane[2].src.buf; - uint8_t palette_color_map_copy[4096], best_palette_color_map[4096]; + uint8_t palette_color_map_copy[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; + uint8_t best_palette_color_map[CODING_UNIT_SIZE * CODING_UNIT_SIZE]; int rows = 4 * num_4x4_blocks_high_lookup[bsize]; int cols = 4 * num_4x4_blocks_wide_lookup[bsize]; int src_stride_y = x->plane[0].src.stride; @@ -6255,9 +6271,11 @@ static void rd_pick_palette_444(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, vpx_memcpy(palette_color_map_copy, xd->plane[0].color_index_map, rows * cols * sizeof(xd->plane[0].color_index_map[0])); vpx_memset(x->kmeans_data_buffer, 0, - sizeof(x->kmeans_data_buffer[0]) * 3 * 4096); + sizeof(x->kmeans_data_buffer[0]) * 3 * CODING_UNIT_SIZE * + CODING_UNIT_SIZE); vpx_memset(xd->palette_map_buffer, 0, - sizeof(xd->palette_map_buffer[0]) * 4096); + sizeof(xd->palette_map_buffer[0]) * CODING_UNIT_SIZE * + CODING_UNIT_SIZE); vpx_memset(centroids, 0, sizeof(centroids[0]) * 3 * PALETTE_MAX_SIZE); vp9_cost_tokens(palette_size_cost, cpi->common.fc.palette_size_prob[bsize - BLOCK_8X8], @@ -6738,11 +6756,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_HIGHBITDEPTH uint16_t best_palette[PALETTE_MAX_SIZE]; uint16_t palette_colors_uv[TX_SIZES][2 * PALETTE_MAX_SIZE]; - uint16_t palette_color_map_uv[TX_SIZES][4096]; + uint16_t palette_color_map_uv[TX_SIZES][CODING_UNIT_SIZE * CODING_UNIT_SIZE]; #else uint8_t best_palette[PALETTE_MAX_SIZE]; uint8_t palette_colors_uv[TX_SIZES][2 * PALETTE_MAX_SIZE]; - uint8_t palette_color_map_uv[TX_SIZES][4096]; + uint8_t palette_color_map_uv[TX_SIZES][CODING_UNIT_SIZE * CODING_UNIT_SIZE]; #endif // CONFIG_VP9_HIGHBITDEPTH const MODE_INFO *above_mi = xd->up_available ? 
xd->mi[-xd->mi_stride].src_mi : NULL; @@ -8056,9 +8074,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } #endif vpx_memset(x->kmeans_data_buffer, 0, - sizeof(x->kmeans_data_buffer[0] * 4096)); + sizeof(x->kmeans_data_buffer[0]) * CODING_UNIT_SIZE * + CODING_UNIT_SIZE); vpx_memset(x->kmeans_indices_buffer, 0, - sizeof(x->kmeans_indices_buffer[0] * 4096)); + sizeof(x->kmeans_indices_buffer[0]) * CODING_UNIT_SIZE * + CODING_UNIT_SIZE); mbmi->palette_enabled[0] = 1; vp9_cost_tokens(palette_size_cost, cpi->common.fc.palette_size_prob[bsize - BLOCK_8X8], diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c index 9081dd765..12c93d7bf 100644 --- a/vp9/encoder/vp9_sad.c +++ b/vp9/encoder/vp9_sad.c @@ -336,6 +336,11 @@ unsigned int vp9_masked_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, n); \ } +#if CONFIG_EXT_CODING_UNIT_SIZE +MASKSADMxN(128, 128) +MASKSADMxN(128, 64) +MASKSADMxN(64, 128) +#endif MASKSADMxN(64, 64) MASKSADMxN(64, 32) MASKSADMxN(32, 64) @@ -373,7 +378,7 @@ static INLINE unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, return sad; } -#define highbd_MASKSADMxN(m, n) \ +#define HIGHBD_MASKSADMXN(m, n) \ unsigned int vp9_highbd_masked_sad##m##x##n##_c(const uint8_t *src, \ int src_stride, \ const uint8_t *ref, \ @@ -384,18 +389,23 @@ unsigned int vp9_highbd_masked_sad##m##x##n##_c(const uint8_t *src, \ msk, msk_stride, m, n); \ } -highbd_MASKSADMxN(64, 64) -highbd_MASKSADMxN(64, 32) -highbd_MASKSADMxN(32, 64) -highbd_MASKSADMxN(32, 32) -highbd_MASKSADMxN(32, 16) -highbd_MASKSADMxN(16, 32) -highbd_MASKSADMxN(16, 16) -highbd_MASKSADMxN(16, 8) -highbd_MASKSADMxN(8, 16) -highbd_MASKSADMxN(8, 8) -highbd_MASKSADMxN(8, 4) -highbd_MASKSADMxN(4, 8) -highbd_MASKSADMxN(4, 4) +#if CONFIG_EXT_CODING_UNIT_SIZE +HIGHBD_MASKSADMXN(128, 128) +HIGHBD_MASKSADMXN(128, 64) +HIGHBD_MASKSADMXN(64, 128) +#endif +HIGHBD_MASKSADMXN(64, 64) +HIGHBD_MASKSADMXN(64, 32) +HIGHBD_MASKSADMXN(32, 64) +HIGHBD_MASKSADMXN(32, 32) +HIGHBD_MASKSADMXN(32, 16) +HIGHBD_MASKSADMXN(16, 32) +HIGHBD_MASKSADMXN(16, 16) +HIGHBD_MASKSADMXN(16, 8) +HIGHBD_MASKSADMXN(8, 16) +HIGHBD_MASKSADMXN(8, 8) +HIGHBD_MASKSADMXN(8, 4) +HIGHBD_MASKSADMXN(4, 8) +HIGHBD_MASKSADMXN(4, 4) #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_WEDGE_PARTITION diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c index ad0cc36ee..a89e02859 100644 --- a/vp9/encoder/vp9_variance.c +++ b/vp9/encoder/vp9_variance.c @@ -772,6 +772,17 @@ MASK_SUBPIX_VAR(64, 32) MASK_VAR(64, 64) MASK_SUBPIX_VAR(64, 64) +#if CONFIG_EXT_CODING_UNIT_SIZE +MASK_VAR(64, 128) +MASK_SUBPIX_VAR(64, 128) + +MASK_VAR(128, 64) +MASK_SUBPIX_VAR(128, 64) + +MASK_VAR(128, 128) +MASK_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_CODING_UNIT_SIZE + #if CONFIG_VP9_HIGHBITDEPTH void highbd_masked_variance64(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, @@ -971,5 +982,15 @@ HIGHBD_MASK_SUBPIX_VAR(64, 32) HIGHBD_MASK_VAR(64, 64) HIGHBD_MASK_SUBPIX_VAR(64, 64) +#if CONFIG_EXT_CODING_UNIT_SIZE +HIGHBD_MASK_VAR(64, 128) +HIGHBD_MASK_SUBPIX_VAR(64, 128) + +HIGHBD_MASK_VAR(128, 64) +HIGHBD_MASK_SUBPIX_VAR(128, 64) + +HIGHBD_MASK_VAR(128, 128) +HIGHBD_MASK_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_CODING_UNIT_SIZE #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_WEDGE_PARTITION
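
Reviewer note on the two vpx_memset calls in vp9_rdopt.c (rd_pick_intra_sby_mode and vp9_rd_pick_inter_mode_sb): the pre-existing code computed sizeof(x->kmeans_data_buffer[0] * 4096), which is the size of the multiplication expression, i.e. sizeof(double), so only 8 bytes were ever cleared. The byte count the call clearly intends is sizeof(x->kmeans_data_buffer[0]) * N, the form rd_pick_palette_444 already uses, and the hunks above are written that way. A minimal standalone sketch of the pitfall, using a hypothetical buffer rather than libvpx code:

    #include <assert.h>
    #include <string.h>

    #define N 4096  /* stands in for CODING_UNIT_SIZE * CODING_UNIT_SIZE */

    int main(void) {
      double buf[N];
      /* sizeof applied to an expression yields the size of its type:
       * buf[0] * N is a double, so this is 8 bytes -- the bug. */
      assert(sizeof(buf[0] * N) == sizeof(double));
      /* Parenthesizing only the element gives the intended byte count. */
      assert(sizeof(buf[0]) * N == N * sizeof(double));
      memset(buf, 0, sizeof(buf[0]) * N);  /* zeroes the full buffer */
      return 0;
    }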
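
Reviewer note on the constant mapping: the patch reads naturally if CODING_UNIT_SIZE is the largest coding-unit dimension, presumably 128 when CONFIG_EXT_CODING_UNIT_SIZE is set and 64 otherwise. That assumption explains every substitution: 4096 = 64 * 64 becomes CODING_UNIT_SIZE * CODING_UNIT_SIZE, 8192 = 2 * 64 * 64 (two bytes per sample in the high-bit-depth paths) becomes 2 * CODING_UNIT_SIZE * CODING_UNIT_SIZE, the uint16_t row offset 128 = 2 * 64 becomes 2 * CODING_UNIT_SIZE, and the three new block sizes (BLOCK_64X128, BLOCK_128X64, BLOCK_128X128) account for the three extra 192 entries appended to the BLOCK_SIZES-indexed tables in vp9_entropymode.c. A compile-time guard could pin the assumption down; this is a hedged sketch (C89-style, since the tree predates static_assert) and cu_size_check is a hypothetical name:

    /* Fails to compile (negative array size) if the assumed mapping between
     * CONFIG_EXT_CODING_UNIT_SIZE and CODING_UNIT_SIZE ever changes. */
    #if CONFIG_EXT_CODING_UNIT_SIZE
    typedef char cu_size_check[CODING_UNIT_SIZE == 128 ? 1 : -1];
    #else
    typedef char cu_size_check[CODING_UNIT_SIZE == 64 ? 1 : -1];
    #endif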