Adds wedge-partitions for compound prediction

Results with this experiment alone: +0.642% on derflr.
Combined with the other experiments: +4.733%.

Change-Id: Ieb2022f8e49ac38a7e7129e261a6bf69ae9666b9
Deb Mukherjee 2015-01-12 18:09:10 -08:00
parent 5f0093bb98
commit db5dd49996
18 changed files with 1731 additions and 27 deletions
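
The core idea: a compound-predicted block is split along an oblique ("wedge") boundary, and the two inter predictors are blended per pixel with a soft 6-bit mask instead of the usual uniform average. A minimal sketch of that blend for a single 8x8 block (the function name and fixed size are illustrative only; the actual implementation is build_masked_compound() in the diff below):

#include <stdint.h>

#define WEDGE_WEIGHT_BITS 6

/* Blend 8x8 predictions p0 and p1 with a wedge mask m of weights in [0, 64]. */
static void wedge_blend_8x8(uint8_t *dst, const uint8_t *p0,
                            const uint8_t *p1, const uint8_t *m) {
  int i;
  for (i = 0; i < 64; ++i)
    dst[i] = (p0[i] * m[i] + p1[i] * ((1 << WEDGE_WEIGHT_BITS) - m[i]) +
              (1 << (WEDGE_WEIGHT_BITS - 1))) >> WEDGE_WEIGHT_BITS;
}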

configure vendored
View File

@ -289,6 +289,7 @@ EXPERIMENT_LIST="
supertx
copy_mode
interintra
wedge_partition
"
CONFIG_LIST="
external_build

View File

@ -165,6 +165,10 @@ typedef struct {
PREDICTION_MODE interintra_mode;
PREDICTION_MODE interintra_uv_mode;
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
int use_wedge_interinter;
int wedge_index;
#endif // CONFIG_WEDGE_PARTITION
} MB_MODE_INFO;
typedef struct MODE_INFO {
@ -439,6 +443,24 @@ static INLINE int is_interintra_allowed(BLOCK_SIZE sb_type) {
}
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
#define WEDGE_BITS_SML 3
#define WEDGE_BITS_MED 4
#define WEDGE_BITS_BIG 5
#define WEDGE_NONE -1
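// Number of bits used to code the wedge index for a given block size; e.g.
// BLOCK_16X16 falls in the <= BLOCK_32X32 range and therefore signals one of
// 1 << WEDGE_BITS_MED = 16 candidate wedge orientations.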
static INLINE int get_wedge_bits(BLOCK_SIZE sb_type) {
if (sb_type < BLOCK_8X8)
return 0;
if (sb_type <= BLOCK_8X8)
return WEDGE_BITS_SML;
else if (sb_type <= BLOCK_32X32)
return WEDGE_BITS_MED;
else
return WEDGE_BITS_BIG;
}
#endif // CONFIG_WEDGE_PARTITION
#ifdef __cplusplus
} // extern "C"
#endif

View File

@ -13,6 +13,12 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_seg_common.h"
#if CONFIG_WEDGE_PARTITION
static const vp9_prob default_wedge_interinter_prob[BLOCK_SIZES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_INTERINTRA
static const vp9_prob default_interintra_prob[BLOCK_SIZES] = {
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
@ -179,7 +185,7 @@ static const vp9_prob default_filterintra_prob[TX_SIZES][INTRA_MODES] = {
{175, 203, 213, 86, 45, 71, 41, 150, 125, 154},
{235, 230, 154, 202, 154, 205, 37, 128, 0, 202}
};
#endif
#endif // CONFIG_FILTERINTRA
const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = {
@ -466,6 +472,9 @@ void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
#if CONFIG_INTERINTRA
vp9_copy(fc->interintra_prob, default_interintra_prob);
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
vp9_copy(fc->wedge_interinter_prob, default_wedge_interinter_prob);
#endif // CONFIG_WEDGE_PARTITION
}
const vp9_tree_index vp9_switchable_interp_tree
@ -615,6 +624,14 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
counts->interintra[i]);
}
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
for (i = 0; i < BLOCK_SIZES; ++i) {
if (get_wedge_bits(i))
fc->wedge_interinter_prob[i] =
adapt_prob(pre_fc->wedge_interinter_prob[i],
counts->wedge_interinter[i]);
}
#endif // CONFIG_WEDGE_PARTITION
}
static void set_default_lf_deltas(struct loopfilter *lf) {

View File

@ -77,6 +77,9 @@ typedef struct frame_contexts {
#if CONFIG_INTERINTRA
vp9_prob interintra_prob[BLOCK_SIZES];
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
vp9_prob wedge_interinter_prob[BLOCK_SIZES];
#endif // CONFIG_WEDGE_PARTITION
} FRAME_CONTEXT;
typedef struct {
@ -118,6 +121,9 @@ typedef struct {
#if CONFIG_INTERINTRA
unsigned int interintra[BLOCK_SIZES][2];
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
unsigned int wedge_interinter[BLOCK_SIZES][2];
#endif // CONFIG_WEDGE_PARTITION
} FRAME_COUNTS;
extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];

View File

@ -257,9 +257,349 @@ static MV average_split_mvs(const struct macroblockd_plane *pd,
return res;
}
#if CONFIG_WEDGE_PARTITION
#define WEDGE_WEIGHT_BITS 6
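// Maps the (scaled) signed distance m of a pixel from the wedge boundary to a
// blend weight in [0, 1 << WEDGE_WEIGHT_BITS]; |m| beyond SMOOTHER_LEN
// saturates at 0 or 64, so the mask ramps smoothly across the boundary
// instead of switching hard.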
static int get_masked_weight(int m) {
#define SMOOTHER_LEN 32
static const uint8_t smoothfn[2 * SMOOTHER_LEN + 1] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
};
if (m < -SMOOTHER_LEN)
return 0;
else if (m > SMOOTHER_LEN)
return (1 << WEDGE_WEIGHT_BITS);
else
return smoothfn[m + SMOOTHER_LEN];
}
static int get_hard_mask(int m) {
return (1 << WEDGE_WEIGHT_BITS) * (m > 0);
}
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4) = 0
// The soft mask is obtained by computing f(x, y) and then calling
// get_masked_weight(f(x, y)).
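// Worked example (illustrative only): for an 8x8 block and params
// {-1, 2, 2, 2}, the boundary is -(x - 4) + 2*(y - 4) = 0, i.e. y = x/2 + 2;
// pixels with f(x, y) > 0 blend toward weight 64 (the first predictor), the
// others toward 0 (the second predictor), with get_masked_weight() smoothing
// the transition across the edge.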
static const int wedge_params_sml[1 << WEDGE_BITS_SML][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
};
static const int wedge_params_med_hgtw[1 << WEDGE_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
};
static const int wedge_params_med_hltw[1 << WEDGE_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
};
static const int wedge_params_med_heqw[1 << WEDGE_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int wedge_params_big_hgtw[1 << WEDGE_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
};
static const int wedge_params_big_hltw[1 << WEDGE_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int wedge_params_big_heqw[1 << WEDGE_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int *get_wedge_params(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w) {
const int *a = NULL;
const int wedge_bits = get_wedge_bits(sb_type);
if (wedge_index == WEDGE_NONE)
return NULL;
if (wedge_bits == WEDGE_BITS_SML) {
a = wedge_params_sml[wedge_index];
} else if (wedge_bits == WEDGE_BITS_MED) {
if (h > w)
a = wedge_params_med_hgtw[wedge_index];
else if (h < w)
a = wedge_params_med_hltw[wedge_index];
else
a = wedge_params_med_heqw[wedge_index];
} else if (wedge_bits == WEDGE_BITS_BIG) {
if (h > w)
a = wedge_params_big_hgtw[wedge_index];
else if (h < w)
a = wedge_params_big_hltw[wedge_index];
else
a = wedge_params_big_heqw[wedge_index];
} else {
assert(0);
}
return a;
}
void vp9_generate_masked_weight(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride) {
int i, j;
const int *a = get_wedge_params(wedge_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight(m);
}
}
void vp9_generate_hard_mask(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride) {
int i, j;
const int *a = get_wedge_params(wedge_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_hard_mask(m);
}
}
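// Blends the two inter predictors with the 6-bit soft mask:
//   out = (dst * m + dst2 * (64 - m) + 32) >> 6.
// Worked example (illustrative): m = 48, dst = 100, dst2 = 20 gives
// (100*48 + 20*16 + 32) >> 6 = 5152 >> 6 = 80, i.e. 0.75*100 + 0.25*20.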
static void build_masked_compound(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int wedge_index, BLOCK_SIZE sb_type,
int h, int w) {
int i, j;
uint8_t mask[4096];
vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask, 64);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * 64 + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << WEDGE_WEIGHT_BITS) - m) +
(1 << (WEDGE_WEIGHT_BITS - 1))) >>
WEDGE_WEIGHT_BITS;
}
}
#if CONFIG_SUPERTX
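// For SUPERTX, the prediction can be built for a region larger than the
// original coding block; wedge_offset_x/y translate the mask coordinates from
// that extended region back to the original block's position, so the wedge
// geometry stays anchored to the block that signaled it.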
void generate_masked_weight_extend(int wedge_index, int plane,
BLOCK_SIZE sb_type, int h, int w,
int wedge_offset_x, int wedge_offset_y,
uint8_t *mask, int stride) {
int i, j;
int subh = (plane ? 2 : 4) << b_height_log2_lookup[sb_type];
int subw = (plane ? 2 : 4) << b_width_log2_lookup[sb_type];
const int *a = get_wedge_params(wedge_index, sb_type, subh, subw);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * subw) / 4 - wedge_offset_x);
int y = (i - (a[3] * subh) / 4 - wedge_offset_y);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight(m);
}
}
static void build_masked_compound_extend(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int wedge_index, BLOCK_SIZE sb_type,
int wedge_offset_x, int wedge_offset_y,
int h, int w) {
int i, j;
uint8_t mask[4096];
generate_masked_weight_extend(wedge_index, plane, sb_type, h, w,
wedge_offset_x, wedge_offset_y, mask, 64);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * 64 + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << WEDGE_WEIGHT_BITS) - m) +
(1 << (WEDGE_WEIGHT_BITS - 1))) >>
WEDGE_WEIGHT_BITS;
}
}
#endif // CONFIG_SUPERTX
#endif // CONFIG_WEDGE_PARTITION
static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
#if CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
int wedge_offset_x, int wedge_offset_y,
#endif // CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0].src_mi;
@ -306,6 +646,38 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
+ (scaled_mv.col >> SUBPEL_BITS);
#if CONFIG_WEDGE_PARTITION
if (ref && get_wedge_bits(mi->mbmi.sb_type) &&
mi->mbmi.use_wedge_interinter) {
uint8_t tmp_dst[4096];
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_inter_predictor(pre, pre_buf->stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
xd->bd);
} else {
inter_predictor(pre, pre_buf->stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
}
#else
inter_predictor(pre, pre_buf->stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys);
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_SUPERTX
// TODO(debargha): Need high bitdepth versions
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane,
mi->mbmi.wedge_index, mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
#else
build_masked_compound(dst, dst_buf->stride, tmp_dst, 64,
mi->mbmi.wedge_index, mi->mbmi.sb_type, h, w);
#endif // CONFIG_SUPERTX
} else {
inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
}
#else // CONFIG_WEDGE_PARTITION
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
@ -319,6 +691,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_WEDGE_PARTITION
}
}
@ -342,10 +715,18 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors(xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4, mi_x, mi_y);
4 * x, 4 * y, 4, 4,
#if CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
0, 0,
#endif
mi_x, mi_y);
} else {
build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh, mi_x, mi_y);
0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
0, 0,
#endif
mi_x, mi_y);
}
}
}
@ -483,6 +864,39 @@ void vp9_build_masked_inter_predictor_complex(
}
}
#if CONFIG_WEDGE_PARTITION
void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
wedge_offset_x, wedge_offset_y, mi_x, mi_y);
} else {
build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
wedge_offset_x, wedge_offset_y, mi_x, mi_y);
}
}
}
#endif // CONFIG_WEDGE_PARTITION
void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori,
@ -491,6 +905,10 @@ void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
PARTITION_TYPE partition) {
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_WEDGE_PARTITION
const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif // CONFIG_WEDGE_PARTITION
uint8_t *orig_dst;
int orig_dst_stride;
int bw = 4 << b_width_log2_lookup[top_bsize];
@ -502,6 +920,9 @@ void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
orig_dst = xd->plane[0].dst.buf;
orig_dst_stride = xd->plane[0].dst.stride;
build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf;
@ -509,22 +930,37 @@ void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
switch (partition) {
case PARTITION_HORZ:
build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_VERT:
build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_SPLIT:
build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf1;
xd->plane[0].dst.stride = MAXTXLEN;
build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf2;
xd->plane[0].dst.stride = MAXTXLEN;
build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
break;
default:
@ -563,12 +999,19 @@ void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
}
void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_WEDGE_PARTITION
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_WEDGE_PARTITION
const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(top_bsize,
&xd->plane[plane]);
@ -578,6 +1021,9 @@ void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
const int bh = 4 * num_4x4_h;
build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
}
}
@ -588,6 +1034,9 @@ void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
#if CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
int wedge_offset_x, int wedge_offset_y,
#endif
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0].src_mi;
@ -748,6 +1197,50 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
}
}
#if CONFIG_WEDGE_PARTITION
if (ref && get_wedge_bits(mi->mbmi.sb_type) &&
mi->mbmi.use_wedge_interinter) {
uint8_t tmp_dst[4096];
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_inter_predictor(buf_ptr, buf_stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, ref, kernel,
xs, ys, xd->bd);
} else {
inter_predictor(buf_ptr, buf_stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
}
#else
inter_predictor(buf_ptr, buf_stride, tmp_dst, 64,
subpel_x, subpel_y, sf, w, h, 0, kernel, xs, ys);
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_SUPERTX
// TODO(debargha): highbitdepth versions
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, 64, plane,
mi->mbmi.wedge_index, mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
#else
build_masked_compound(dst, dst_buf->stride, tmp_dst, 64,
mi->mbmi.wedge_index, mi->mbmi.sb_type, h, w);
#endif // CONFIG_SUPERTX
} else {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride,
subpel_x, subpel_y, sf, w, h, ref, kernel,
xs, ys, xd->bd);
} else {
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
}
#else
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
#else // CONFIG_WEDGE_PARTITION
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
@ -760,6 +1253,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_WEDGE_PARTITION
}
}
@ -782,10 +1276,18 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4, mi_x, mi_y);
4 * x, 4 * y, 4, 4,
#if CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
0, 0,
#endif
mi_x, mi_y);
} else {
dec_build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh, mi_x, mi_y);
0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
0, 0,
#endif
mi_x, mi_y);
}
}
#if CONFIG_INTERINTRA
@ -800,6 +1302,41 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
}
#if CONFIG_SUPERTX
#if CONFIG_WEDGE_PARTITION
void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
wedge_offset_x, wedge_offset_y,
mi_x, mi_y);
} else {
dec_build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
wedge_offset_x, wedge_offset_y,
mi_x, mi_y);
}
}
}
#endif // CONFIG_WEDGE_PARTITION
void vp9_dec_build_inter_predictors_sby_sub8x8_extend(
MACROBLOCKD *xd,
int mi_row, int mi_col,
@ -809,6 +1346,10 @@ void vp9_dec_build_inter_predictors_sby_sub8x8_extend(
PARTITION_TYPE partition) {
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_WEDGE_PARTITION
const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
uint8_t *orig_dst;
int orig_dst_stride;
int bw = 4 << b_width_log2_lookup[top_bsize];
@ -820,6 +1361,9 @@ void vp9_dec_build_inter_predictors_sby_sub8x8_extend(
orig_dst = xd->plane[0].dst.buf;
orig_dst_stride = xd->plane[0].dst.stride;
dec_build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf;
@ -827,22 +1371,37 @@ void vp9_dec_build_inter_predictors_sby_sub8x8_extend(
switch (partition) {
case PARTITION_HORZ:
dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_VERT:
dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
break;
case PARTITION_SPLIT:
dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf1;
xd->plane[0].dst.stride = MAXTXLEN;
dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
xd->plane[0].dst.buf = tmp_buf2;
xd->plane[0].dst.stride = MAXTXLEN;
dec_build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
break;
default:
@ -881,12 +1440,19 @@ void vp9_dec_build_inter_predictors_sby_sub8x8_extend(
}
void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_WEDGE_PARTITION
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize) {
int plane;
const int mi_x = mi_col_ori * MI_SIZE;
const int mi_y = mi_row_ori * MI_SIZE;
#if CONFIG_WEDGE_PARTITION
const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
#endif
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(top_bsize,
&xd->plane[plane]);
@ -896,6 +1462,9 @@ void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
const int bh = 4 * num_4x4_h;
dec_build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_WEDGE_PARTITION
wedge_offset_x, wedge_offset_y,
#endif
mi_x, mi_y);
}
}

View File

@ -76,6 +76,13 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const struct scale_factors *sf);
#if CONFIG_WEDGE_PARTITION
void vp9_generate_masked_weight(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
void vp9_generate_hard_mask(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_SUPERTX
struct macroblockd_plane;
void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
@ -85,6 +92,9 @@ void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
BLOCK_SIZE top_bsize,
PARTITION_TYPE partition);
void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_WEDGE_PARTITION
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize);
@ -100,9 +110,23 @@ void vp9_dec_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
BLOCK_SIZE top_bsize,
PARTITION_TYPE p);
void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_WEDGE_PARTITION
int mi_row, int mi_col,
#endif
int mi_row_ori,
int mi_col_ori,
BLOCK_SIZE top_bsize);
#if CONFIG_WEDGE_PARTITION
void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
#endif // CONFIG_WEDGE_PARTITION
#endif // CONFIG_SUPERTX
#ifdef __cplusplus

View File

@ -1220,6 +1220,124 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_avg_8x8/;
}
if (vpx_config("CONFIG_WEDGE_PARTITION") eq "yes") {
add_proto qw/unsigned int vp9_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x16/;
add_proto qw/unsigned int vp9_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x32/;
add_proto qw/unsigned int vp9_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance64x32/;
add_proto qw/unsigned int vp9_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x64/;
add_proto qw/unsigned int vp9_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance32x32/;
add_proto qw/unsigned int vp9_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance64x64/;
add_proto qw/unsigned int vp9_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x16/;
add_proto qw/unsigned int vp9_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance16x8/;
add_proto qw/unsigned int vp9_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x16/;
add_proto qw/unsigned int vp9_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x8/;
add_proto qw/unsigned int vp9_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance8x4/;
add_proto qw/unsigned int vp9_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance4x8/;
add_proto qw/unsigned int vp9_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_variance4x4/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance64x64/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x64/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance64x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance32x32/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x16/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance16x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance8x4/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
specialize qw/vp9_masked_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad64x64/;
add_proto qw/unsigned int vp9_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x64/;
add_proto qw/unsigned int vp9_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad64x32/;
add_proto qw/unsigned int vp9_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x16/;
add_proto qw/unsigned int vp9_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x32/;
add_proto qw/unsigned int vp9_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad32x32/;
add_proto qw/unsigned int vp9_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x16/;
add_proto qw/unsigned int vp9_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad16x8/;
add_proto qw/unsigned int vp9_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x16/;
add_proto qw/unsigned int vp9_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x8/;
add_proto qw/unsigned int vp9_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad8x4/;
add_proto qw/unsigned int vp9_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad4x8/;
add_proto qw/unsigned int vp9_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
specialize qw/vp9_masked_sad4x4/;
}
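
These prototypes declare masked SAD, variance, and sub-pixel variance kernels; the C reference versions are not visible in this view, but conceptually each one weights the per-pixel error by the 6-bit wedge mask. A sketch under that assumption (function name and normalization here are illustrative, not the actual vp9_masked_sad* code):

#include <stdint.h>
#include <stdlib.h>

unsigned int masked_sad_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               const uint8_t *msk, int msk_stride,
                               int w, int h) {
  int r, c;
  unsigned int sad = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c)
      sad += msk[c] * abs(src[c] - ref[c]);  /* mask weight in [0, 64] */
    src += src_stride;
    ref += ref_stride;
    msk += msk_stride;
  }
  return (sad + 31) >> 6;  /* undo the 6-bit mask scaling, with rounding */
}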
# ENCODEMB INVOKE
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";

View File

@ -786,7 +786,12 @@ static void dec_predict_b_extend(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (has_second_ref(&xd->mi[0].mbmi))
set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
mbmi->tx_size = b_width_log2_lookup[top_bsize];
#if CONFIG_WEDGE_PARTITION
vp9_dec_build_inter_predictors_sb_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori, top_bsize);
#else
vp9_dec_build_inter_predictors_sb(xd, mi_row_ori, mi_col_ori, top_bsize);
#endif // CONFIG_WEDGE_PARTITION
}
static void dec_predict_b_sub8x8_extend(VP9_COMMON *const cm,
@ -806,7 +811,11 @@ static void dec_predict_b_sub8x8_extend(VP9_COMMON *const cm,
vp9_dec_build_inter_predictors_sby_sub8x8_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori,
top_bsize, partition);
vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(xd, mi_row_ori, mi_col_ori,
vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(xd,
#if CONFIG_WEDGE_PARTITION
mi_row, mi_col,
#endif
mi_row_ori, mi_col_ori,
top_bsize);
}
@ -2253,6 +2262,14 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
}
}
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
if (cm->reference_mode != SINGLE_REFERENCE) {
for (i = 0; i < BLOCK_SIZES; i++) {
if (get_wedge_bits(i))
vp9_diff_update_prob(&r, &fc->wedge_interinter_prob[i]);
}
}
#endif // CONFIG_WEDGE_PARTITION
}
return vp9_reader_has_error(&r);

View File

@ -747,8 +747,22 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
nearestmv, nearmv, is_compound, allow_hp, r);
}
#if CONFIG_TX_SKIP
mbmi->uv_mode = mbmi->mode;
mbmi->uv_mode = mbmi->mode;
#endif
#if CONFIG_WEDGE_PARTITION
mbmi->use_wedge_interinter = 0;
if (cm->reference_mode != SINGLE_REFERENCE &&
is_inter_mode(mbmi->mode) &&
get_wedge_bits(bsize) &&
mbmi->ref_frame[1] > INTRA_FRAME) {
mbmi->use_wedge_interinter =
vp9_read(r, cm->fc.wedge_interinter_prob[bsize]);
cm->counts.wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
if (mbmi->use_wedge_interinter) {
mbmi->wedge_index = vp9_read_literal(r, get_wedge_bits(bsize));
}
}
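// Net syntax: one flag coded with wedge_interinter_prob[bsize], plus
// get_wedge_bits(bsize) raw bits for the wedge index when the flag is set;
// the encoder side in pack_inter_mode_mvs() mirrors this.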
#endif // CONFIG_WEDGE_PARTITION
}
static void read_inter_frame_mode_info(VP9_COMMON *const cm,
@ -792,23 +806,26 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
COPY_MODE copy_mode_backup = mbmi->copy_mode;
#if CONFIG_SUPERTX
TX_SIZE tx_size_backup = mbmi->tx_size;
#endif
#endif // CONFIG_SUPERTX
#if CONFIG_EXT_TX
EXT_TX_TYPE ext_txfrm_backup = mbmi->ext_txfrm;
#endif
#endif // CONFIG_EXT_TX
inter_block = 1;
*mbmi = *inter_ref_list[mbmi->copy_mode - REF0];
#if CONFIG_SUPERTX
mbmi->tx_size = tx_size_backup;
#endif // CONFIG_SUPERTX
#if CONFIG_EXT_TX
mbmi->ext_txfrm = ext_txfrm_backup;
#endif // CONFIG_EXT_TX
#if CONFIG_INTERINTRA
if (mbmi->ref_frame[1] == INTRA_FRAME)
mbmi->ref_frame[1] = NONE;
#endif // CONFIG_INTERINTRA
#if CONFIG_SUPERTX
mbmi->tx_size = tx_size_backup;
#endif
#if CONFIG_EXT_TX
mbmi->ext_txfrm = ext_txfrm_backup;
#endif
#if CONFIG_WEDGE_PARTITION
mbmi->use_wedge_interinter = 0;
#endif // CONFIG_WEDGE_PARTITION
mbmi->sb_type = bsize_backup;
mbmi->mode = NEARESTMV;
mbmi->skip = skip_backup;

View File

@ -567,6 +567,17 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
allow_hp);
}
}
#if CONFIG_WEDGE_PARTITION
if (cm->reference_mode != SINGLE_REFERENCE &&
is_inter_mode(mode) &&
get_wedge_bits(bsize) &&
mbmi->ref_frame[1] > INTRA_FRAME) {
vp9_write(w, mbmi->use_wedge_interinter,
cm->fc.wedge_interinter_prob[bsize]);
if (mbmi->use_wedge_interinter)
vp9_write_literal(w, mbmi->wedge_index, get_wedge_bits(bsize));
}
#endif // CONFIG_WEDGE_PARTITION
}
}
@ -1648,6 +1659,15 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
}
}
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
if (cm->reference_mode != SINGLE_REFERENCE) {
for (i = 0; i < BLOCK_SIZES; i++)
if (get_wedge_bits(i))
vp9_cond_prob_diff_update(&header_bc,
&fc->wedge_interinter_prob[i],
cm->counts.wedge_interinter[i]);
}
#endif // CONFIG_WEDGE_PARTITION
}
vp9_stop_encode(&header_bc);

View File

@ -62,7 +62,11 @@ static int check_intra_b(PICK_MODE_CONTEXT *ctx);
static int check_intra_sb(VP9_COMP *cpi, const TileInfo *const tile,
int mi_row, int mi_col, BLOCK_SIZE bsize,
PC_TREE *pc_tree);
static void predict_superblock(VP9_COMP *cpi, int mi_row_ori, int mi_col_ori,
static void predict_superblock(VP9_COMP *cpi,
#if CONFIG_WEDGE_PARTITION
int mi_row, int mi_col,
#endif // CONFIG_WEDGE_PARTITION
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize);
static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
PC_TREE *pc_tree);
@ -871,7 +875,13 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
++cm->counts.interintra[bsize][0];
}
}
#endif
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
if (cm->reference_mode != SINGLE_REFERENCE &&
get_wedge_bits(bsize) &&
mbmi->ref_frame[1] > INTRA_FRAME)
++cm->counts.wedge_interinter[bsize][mbmi->use_wedge_interinter];
#endif // CONFIG_WEDGE_PARTITION
}
rd_opt->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
@ -971,6 +981,12 @@ static void update_state_supertx(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
const int ctx = vp9_get_pred_context_switchable_interp(xd);
++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
}
#if CONFIG_WEDGE_PARTITION
if (cm->reference_mode != SINGLE_REFERENCE &&
get_wedge_bits(bsize) &&
mbmi->ref_frame[1] > INTRA_FRAME)
++cm->counts.wedge_interinter[bsize][mbmi->use_wedge_interinter];
#endif // CONFIG_WEDGE_PARTITION
}
rd_opt->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
@ -4914,6 +4930,9 @@ static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
}
static void predict_superblock(VP9_COMP *cpi,
#if CONFIG_WEDGE_PARTITION
int mi_row, int mi_col,
#endif // CONFIG_WEDGE_PARTITION
int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
@ -4937,7 +4956,12 @@ static void predict_superblock(VP9_COMP *cpi,
vp9_setup_pre_planes(xd, ref, cfg, mi_row_ori, mi_col_ori,
&xd->block_refs[ref]->sf);
}
#if CONFIG_WEDGE_PARTITION
vp9_build_inter_predictors_sb_extend(xd, mi_row, mi_col,
mi_row_ori, mi_col_ori, bsize);
#else
vp9_build_inter_predictors_sb(xd, mi_row_ori, mi_col_ori, bsize);
#endif // CONFIG_WEDGE_PARTITION
}
static void predict_superblock_sub8x8_extend(VP9_COMP *cpi,
@ -4970,6 +4994,9 @@ static void predict_superblock_sub8x8_extend(VP9_COMP *cpi,
mi_row_ori, mi_col_ori,
top_bsize, partition);
vp9_build_inter_predictors_sbuv_sub8x8_extend(xd,
#if CONFIG_WEDGE_PARTITION
mi_row, mi_col,
#endif
mi_row_ori, mi_col_ori,
top_bsize);
}
@ -4996,7 +5023,11 @@ static void predict_b_extend(VP9_COMP *cpi, const TileInfo *const tile,
BLOCK_SIZE bsize, BLOCK_SIZE top_bsize) {
set_offsets_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
bsize, top_bsize);
predict_superblock(cpi, mi_row_ori, mi_col_ori, top_bsize);
predict_superblock(cpi,
#if CONFIG_WEDGE_PARTITION
mi_row, mi_col,
#endif
mi_row_ori, mi_col_ori, top_bsize);
if (output_enabled)
update_stats(&cpi->common, &cpi->mb);

View File

@ -1655,6 +1655,40 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
vp9_sub_pixel_avg_variance4x4,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
#if CONFIG_WEDGE_PARTITION
#define MBFP(BT, MSDF, MVF, MSVF) \
cpi->fn_ptr[BT].msdf = MSDF; \
cpi->fn_ptr[BT].mvf = MVF; \
cpi->fn_ptr[BT].msvf = MSVF;
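// msdf/mvf/msvf: masked SAD, masked variance, and masked sub-pixel variance
// function pointers, consumed by the masked motion search in vp9_mcomp.c.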
MBFP(BLOCK_64X64, vp9_masked_sad64x64, vp9_masked_variance64x64,
vp9_masked_sub_pixel_variance64x64)
MBFP(BLOCK_64X32, vp9_masked_sad64x32, vp9_masked_variance64x32,
vp9_masked_sub_pixel_variance64x32)
MBFP(BLOCK_32X64, vp9_masked_sad32x64, vp9_masked_variance32x64,
vp9_masked_sub_pixel_variance32x64)
MBFP(BLOCK_32X32, vp9_masked_sad32x32, vp9_masked_variance32x32,
vp9_masked_sub_pixel_variance32x32)
MBFP(BLOCK_32X16, vp9_masked_sad32x16, vp9_masked_variance32x16,
vp9_masked_sub_pixel_variance32x16)
MBFP(BLOCK_16X32, vp9_masked_sad16x32, vp9_masked_variance16x32,
vp9_masked_sub_pixel_variance16x32)
MBFP(BLOCK_16X16, vp9_masked_sad16x16, vp9_masked_variance16x16,
vp9_masked_sub_pixel_variance16x16)
MBFP(BLOCK_16X8, vp9_masked_sad16x8, vp9_masked_variance16x8,
vp9_masked_sub_pixel_variance16x8)
MBFP(BLOCK_8X16, vp9_masked_sad8x16, vp9_masked_variance8x16,
vp9_masked_sub_pixel_variance8x16)
MBFP(BLOCK_8X8, vp9_masked_sad8x8, vp9_masked_variance8x8,
vp9_masked_sub_pixel_variance8x8)
MBFP(BLOCK_4X8, vp9_masked_sad4x8, vp9_masked_variance4x8,
vp9_masked_sub_pixel_variance4x8)
MBFP(BLOCK_8X4, vp9_masked_sad8x4, vp9_masked_variance8x4,
vp9_masked_sub_pixel_variance8x4)
MBFP(BLOCK_4X4, vp9_masked_sad4x4, vp9_masked_variance4x4,
vp9_masked_sub_pixel_variance4x4)
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
#endif

View File

@ -2037,3 +2037,354 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
return var;
}
#if CONFIG_WEDGE_PARTITION
/* returns subpixel variance error function */
#define DIST(r, c) \
vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
src_stride, mask, mask_stride, &sse)
/* checks if (r, c) has better score than previous best */
#define MVC(r, c) \
(mvcost ? \
((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
error_per_bit + 4096) >> 13 : 0)
#define CHECK_BETTER(v, r, c) \
if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
thismse = (DIST(r, c)); \
if ((v = MVC(r, c) + thismse) < besterr) { \
besterr = v; \
br = r; \
bc = c; \
*distortion = thismse; \
*sse1 = sse; \
} \
} else { \
v = INT_MAX; \
}
int vp9_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *bestmv, const MV *ref_mv,
int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1, int is_second) {
const uint8_t *const z = x->plane[0].src.buf;
const int src_stride = x->plane[0].src.stride;
const MACROBLOCKD *xd = &x->e_mbd;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int whichdir;
int thismse;
unsigned int halfiters = iters_per_step;
unsigned int quarteriters = iters_per_step;
unsigned int eighthiters = iters_per_step;
const int y_stride = xd->plane[0].pre[is_second].stride;
const int offset = bestmv->row * y_stride + bestmv->col;
const uint8_t *const y = xd->plane[0].pre[is_second].buf;
int rr = ref_mv->row;
int rc = ref_mv->col;
int br = bestmv->row * 8;
int bc = bestmv->col * 8;
int hstep = 4;
const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
int tr = br;
int tc = bc;
// central mv
bestmv->row *= 8;
bestmv->col *= 8;
// calculate central point error
besterr = vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride,
sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
// 1/2 pel
FIRST_LEVEL_CHECKS;
if (halfiters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
// Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
if (forced_stop != 2) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (quarteriters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
}
if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
}
// These lines ensure static analysis doesn't warn that
// tr and tc aren't used after the above point.
(void) tr;
(void) tc;
bestmv->row = br;
bestmv->col = bc;
if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
#undef DIST
#undef MVC
#undef CHECK_BETTER
int vp9_get_masked_mvpred_var(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const MV *best_mv, const MV *center_mv,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost, int is_second) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
const MV mv = {best_mv->row * 8, best_mv->col * 8};
unsigned int unused;
return vfp->mvf(what->buf, what->stride,
get_buf_from_mv(in_what, best_mv), in_what->stride,
mask, mask_stride, &unused) +
(use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost,
x->mvcost, x->errorperbit) : 0);
}
int vp9_masked_refining_search_sad_c(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *ref_mv, int error_per_bit,
int search_range,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, int is_second) {
const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
unsigned int best_sad = fn_ptr->msdf(what->buf, what->stride,
get_buf_from_mv(in_what, ref_mv),
in_what->stride, mask, mask_stride) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
int i, j;
for (i = 0; i < search_range; i++) {
int best_site = -1;
for (j = 0; j < 4; j++) {
const MV mv = {ref_mv->row + neighbors[j].row,
ref_mv->col + neighbors[j].col};
if (is_mv_in(x, &mv)) {
unsigned int sad = fn_ptr->msdf(what->buf, what->stride,
get_buf_from_mv(in_what, &mv), in_what->stride, mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_site = j;
}
}
}
}
if (best_site == -1) {
break;
} else {
ref_mv->row += neighbors[best_site].row;
ref_mv->col += neighbors[best_site].col;
}
}
return best_sad;
}
int vp9_masked_diamond_search_sad_c(const MACROBLOCK *x,
const search_site_config *cfg,
uint8_t *mask, int mask_stride,
MV *ref_mv, MV *best_mv,
int search_param,
int sad_per_bit, int *num00,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, int is_second) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
// search_param determines the length of the initial step and hence the
// number of iterations:
// 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel,
// 2 = (MAX_FIRST_STEP/4) pel, etc.
const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
const uint8_t *best_address, *in_what_ref;
int best_sad = INT_MAX;
int best_site = 0;
int last_site = 0;
int i, j, step;
clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
in_what_ref = get_buf_from_mv(in_what, ref_mv);
best_address = in_what_ref;
*num00 = 0;
*best_mv = *ref_mv;
// Check the starting position
best_sad = fn_ptr->msdf(what->buf, what->stride,
best_address, in_what->stride,
mask, mask_stride) +
mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
i = 1;
for (step = 0; step < tot_steps; step++) {
for (j = 0; j < cfg->searches_per_step; j++) {
const MV mv = {best_mv->row + ss[i].mv.row,
best_mv->col + ss[i].mv.col};
if (is_mv_in(x, &mv)) {
int sad = fn_ptr->msdf(what->buf, what->stride,
best_address + ss[i].offset, in_what->stride,
mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_site = i;
}
}
}
i++;
}
if (best_site != last_site) {
best_mv->row += ss[best_site].mv.row;
best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
last_site = best_site;
#if defined(NEW_DIAMOND_SEARCH)
while (1) {
const MV this_mv = {best_mv->row + ss[best_site].mv.row,
best_mv->col + ss[best_site].mv.col};
if (is_mv_in(x, &this_mv)) {
int sad = fn_ptr->msdf(what->buf, what->stride,
best_address + ss[best_site].offset,
in_what->stride, mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_mv->row += ss[best_site].mv.row;
best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
continue;
}
}
}
break;
}
#endif
} else if (best_address == in_what_ref) {
(*num00)++;
}
}
return best_sad;
}
int vp9_masked_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv,
int is_second) {
MV temp_mv;
int thissme, n, num00 = 0;
int bestsme = vp9_masked_diamond_search_sad_c(x, &cpi->ss_cfg,
mask, mask_stride,
mvp_full, &temp_mv,
step_param, sadpb, &n,
fn_ptr, ref_mv, is_second);
if (bestsme < INT_MAX)
bestsme = vp9_get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
fn_ptr, 1, is_second);
*dst_mv = temp_mv;
// If there won't be more n-step search, check to see if refining search is
// needed.
if (n > further_steps)
do_refine = 0;
while (n < further_steps) {
++n;
if (num00) {
num00--;
} else {
thissme = vp9_masked_diamond_search_sad_c(x, &cpi->ss_cfg,
mask, mask_stride,
mvp_full, &temp_mv,
step_param + n, sadpb, &num00,
fn_ptr, ref_mv, is_second);
if (thissme < INT_MAX)
thissme = vp9_get_masked_mvpred_var(x, mask, mask_stride,
&temp_mv, ref_mv, fn_ptr, 1,
is_second);
// check to see if refining search is needed.
if (num00 > further_steps - n)
do_refine = 0;
if (thissme < bestsme) {
bestsme = thissme;
*dst_mv = temp_mv;
}
}
}
// final 1-away diamond refining search
if (do_refine) {
const int search_range = 8;
MV best_mv = *dst_mv;
thissme = vp9_masked_refining_search_sad_c(x, mask, mask_stride,
&best_mv, sadpb, search_range,
fn_ptr, ref_mv, is_second);
if (thissme < INT_MAX)
thissme = vp9_get_masked_mvpred_var(x, mask, mask_stride,
&best_mv, ref_mv, fn_ptr, 1,
is_second);
if (thissme < bestsme) {
bestsme = thissme;
*dst_mv = best_mv;
}
}
return bestsme;
}
#endif // CONFIG_WEDGE_PARTITION

View File

@ -146,6 +146,26 @@ int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
const MV *ref_mv, MV *tmp_mv,
int var_max, int rd);
#if CONFIG_WEDGE_PARTITION
int vp9_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *bestmv, const MV *ref_mv,
int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1, int is_second);
int vp9_masked_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *mask, int mask_stride,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv,
int is_second);
#endif // CONFIG_WEDGE_PARTITION
#ifdef __cplusplus
} // extern "C"
#endif

View File

@ -305,8 +305,8 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
}
}
*skip_txfm_sb = skip_flag;
*skip_sse_sb = total_sse << 4;
if (skip_txfm_sb) *skip_txfm_sb = skip_flag;
if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
*out_rate_sum = (int)rate_sum;
*out_dist_sum = dist_sum << 4;
}
@ -2063,7 +2063,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
// max mv magnitude and the best ref mvs of the current block for
// the given reference.
step_param = (vp9_init_search_range(max_mv) +
cpi->mv_step_param) / 2;
cpi->mv_step_param) / 2;
} else {
step_param = cpi->mv_step_param;
}
@ -2484,7 +2484,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
// max mv magnitude and that based on the best ref mvs of the current
// block for the given reference.
step_param = (vp9_init_search_range(x->max_mv_context[ref]) +
cpi->mv_step_param) / 2;
cpi->mv_step_param) / 2;
} else {
step_param = cpi->mv_step_param;
}
@ -2753,6 +2753,169 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
}
}
#if CONFIG_WEDGE_PARTITION
static void do_masked_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *mask, int mask_stride,
BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv,
int is_second) {
MACROBLOCKD *xd = &x->e_mbd;
const VP9_COMMON *cm = &cpi->common;
MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
int bestsme = INT_MAX;
int step_param;
int sadpb = x->sadperbit16;
MV mvp_full;
int ref = mbmi->ref_frame[is_second];
MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
int tmp_row_min = x->mv_row_min;
int tmp_row_max = x->mv_row_max;
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
ref);
MV pred_mv[3];
pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
pred_mv[2] = x->pred_mv[ref];
if (scaled_ref_frame) {
int i;
// Swap out the reference frame for a version that's been scaled to
// match the resolution of the current frame, allowing the existing
// motion search code to be used without additional modifications.
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[is_second];
vp9_setup_pre_planes(xd, is_second, scaled_ref_frame, mi_row, mi_col, NULL);
}
vp9_set_mv_search_range(x, &ref_mv);
// Work out the size of the first step in the mv step search.
// 0 here is maximum length first step. 1 is MAX >> 1 etc.
if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
// Take wtd average of the step_params based on the last frame's
// max mv magnitude and that based on the best ref mvs of the current
// block for the given reference.
step_param = (vp9_init_search_range(x->max_mv_context[ref]) +
cpi->mv_step_param) / 2;
} else {
step_param = cpi->mv_step_param;
}
// TODO(debargha): is show_frame needed here?
if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
cm->show_frame) {
int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] -
MIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
step_param = MAX(step_param, boffset);
}
if (cpi->sf.adaptive_motion_search) {
int bwl = b_width_log2_lookup[bsize];
int bhl = b_height_log2_lookup[bsize];
int i;
int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
if (tlevel < 5)
step_param += 2;
for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
x->pred_mv[ref].row = 0;
x->pred_mv[ref].col = 0;
tmp_mv->as_int = INVALID_MV;
if (scaled_ref_frame) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
xd->plane[i].pre[is_second] = backup_yv12[i];
}
return;
}
}
}
mvp_full = pred_mv[x->mv_best_ref_index[ref]];
mvp_full.col >>= 3;
mvp_full.row >>= 3;
bestsme = vp9_masked_full_pixel_diamond(cpi, x, mask, mask_stride,
&mvp_full, step_param, sadpb,
MAX_MVSEARCH_STEPS - 1 - step_param,
1, &cpi->fn_ptr[bsize],
&ref_mv, &tmp_mv->as_mv, is_second);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
if (bestsme < INT_MAX) {
int dis; /* TODO: use dis in distortion calculation later. */
vp9_find_best_masked_sub_pixel_tree(x, mask, mask_stride,
&tmp_mv->as_mv, &ref_mv,
cm->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], is_second);
}
*rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
if (cpi->sf.adaptive_motion_search && cm->show_frame)
x->pred_mv[ref] = tmp_mv->as_mv;
if (scaled_ref_frame) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
xd->plane[i].pre[is_second] = backup_yv12[i];
}
}
static void do_masked_motion_search_indexed(VP9_COMP *cpi, MACROBLOCK *x,
int wedge_index,
BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
BLOCK_SIZE sb_type = mbmi->sb_type;
int w = (4 << b_width_log2_lookup[sb_type]);
int h = (4 << b_height_log2_lookup[sb_type]);
int i, j;
uint8_t mask[4096];
int mask_stride = 64;
vp9_generate_masked_weight(wedge_index, sb_type, h, w,
mask, mask_stride);
/*
vp9_generate_hard_mask(wedge_index, sb_type, h, w,
mask, mask_stride);
*/
do_masked_motion_search(cpi, x, mask, mask_stride, bsize,
mi_row, mi_col, &tmp_mv[0], &rate_mv[0], 0);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j)
mask[i * mask_stride + j] = 64 - mask[i * mask_stride + j];
do_masked_motion_search(cpi, x, mask, mask_stride, bsize,
mi_row, mi_col, &tmp_mv[1], &rate_mv[1], 1);
}
#endif // CONFIG_WEDGE_PARTITION
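The indexed search above runs the masked motion search twice with complementary weights: the first reference uses the generated mask m, the second uses 64 - m, so the per-pixel weights always sum to 64 (the mask is 6-bit). Below is a minimal, illustrative sketch of the soft blend those weights imply; pred0/pred1/dst and the shared stride are placeholder names, and the actual compound predictor construction is done by the inter-prediction code, which is not shown in this diff.
/* Illustrative sketch only: blend two predictors with a 6-bit soft mask
 * (weights 0..64), rounding to nearest. */
static void blend_wedge_sketch(const uint8_t *pred0, const uint8_t *pred1,
                               const uint8_t *mask, int stride,
                               uint8_t *dst, int w, int h) {
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int m = mask[i * stride + j];  /* 0..64 */
      dst[i * stride + j] = (uint8_t)((m * pred0[i * stride + j] +
                                       (64 - m) * pred1[i * stride + j] +
                                       32) >> 6);
    }
  }
}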
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize,
int64_t txfm_cache[],
@ -2768,6 +2931,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
#if CONFIG_INTERINTRA
int *compmode_interintra_cost,
int single_newmv_rate[MAX_REF_FRAMES],
#endif
#if CONFIG_WEDGE_PARTITION
int *compmode_wedge_cost,
#endif
int64_t *psse,
const int64_t ref_best_rd) {
@ -2796,8 +2962,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *orig_dst[MAX_MB_PLANE];
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
#if CONFIG_INTERINTRA
#if CONFIG_INTERINTRA || CONFIG_WEDGE_PARTITION
int rate_mv_tmp = 0;
#endif
#if CONFIG_INTERINTRA
const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
#endif
INTERP_FILTER best_filter = SWITCHABLE;
@ -2813,6 +2981,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int64_t skip_sse_sb = INT64_MAX;
int64_t distortion_y = 0, distortion_uv = 0;
#if CONFIG_WEDGE_PARTITION
mbmi->use_wedge_interinter = 0;
*compmode_wedge_cost = 0;
#endif // CONFIG_WEDGE_PARTITION
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
@ -2862,7 +3035,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&mbmi->ref_mvs[refs[1]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
#if !CONFIG_INTERINTRA
#if !(CONFIG_INTERINTRA || CONFIG_WEDGE_PARTITION)
*rate2 += rate_mv;
#endif
} else {
@ -2886,13 +3059,15 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&tmp_mv, &rate_mv);
if (tmp_mv.as_int == INVALID_MV)
return INT64_MAX;
#if !CONFIG_WEDGE_PARTITION
*rate2 += rate_mv;
#endif
frame_mv[refs[0]].as_int =
xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
single_newmv[refs[0]].as_int = tmp_mv.as_int;
#endif // CONFIG_INTERINTRA
}
#if CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION || CONFIG_INTERINTRA
rate_mv_tmp = rate_mv;
#endif
}
@ -3045,6 +3220,98 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
cm->interp_filter : best_filter;
rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;
#if CONFIG_WEDGE_PARTITION
if (is_comp_pred && get_wedge_bits(bsize)) {
int wedge_index, best_wedge_index = WEDGE_NONE, rs;
int rate_sum;
int64_t dist_sum;
int64_t best_rd_nowedge = INT64_MAX;
int64_t best_rd_wedge = INT64_MAX;
int wedge_types;
mbmi->use_wedge_interinter = 0;
rs = vp9_cost_bit(cm->fc.wedge_interinter_prob[bsize], 0);
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL);
rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv_tmp + rate_sum, dist_sum);
best_rd_nowedge = rd;
mbmi->use_wedge_interinter = 1;
rs = get_wedge_bits(bsize) * 256 +
vp9_cost_bit(cm->fc.wedge_interinter_prob[bsize], 1);
wedge_types = (1 << get_wedge_bits(bsize));
if (this_mode == NEWMV) {
int_mv tmp_mv[2];
int rate_mvs[2], tmp_rate_mv;
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mbmi->wedge_index = wedge_index;
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL);
rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv_tmp + rate_sum, dist_sum);
if (rd < best_rd_wedge) {
best_wedge_index = wedge_index;
best_rd_wedge = rd;
}
}
mbmi->wedge_index = best_wedge_index;
do_masked_motion_search_indexed(cpi, x, mbmi->wedge_index, bsize,
mi_row, mi_col,
tmp_mv, rate_mvs);
tmp_rate_mv = rate_mvs[0] + rate_mvs[1];
mbmi->mv[0].as_int = tmp_mv[0].as_int;
mbmi->mv[1].as_int = tmp_mv[1].as_int;
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL);
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum);
if (rd < best_rd_wedge) {
best_rd_wedge = rd;
} else {
mbmi->mv[0].as_int = cur_mv[0].as_int;
mbmi->mv[1].as_int = cur_mv[1].as_int;
tmp_rate_mv = rate_mv_tmp;
}
if (best_rd_wedge < best_rd_nowedge) {
mbmi->use_wedge_interinter = 1;
mbmi->wedge_index = best_wedge_index;
xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
xd->mi[0].src_mi->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
rate_mv_tmp = tmp_rate_mv;
} else {
mbmi->use_wedge_interinter = 0;
mbmi->mv[0].as_int = cur_mv[0].as_int;
mbmi->mv[1].as_int = cur_mv[1].as_int;
}
} else {
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mbmi->wedge_index = wedge_index;
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL);
rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv_tmp + rate_sum, dist_sum);
if (rd < best_rd_wedge) {
best_wedge_index = wedge_index;
best_rd_wedge = rd;
}
}
if (best_rd_wedge < best_rd_nowedge) {
mbmi->use_wedge_interinter = 1;
mbmi->wedge_index = best_wedge_index;
} else {
mbmi->use_wedge_interinter = 0;
}
}
if (ref_best_rd < INT64_MAX &&
MIN(best_rd_wedge, best_rd_nowedge) / 2 > ref_best_rd)
return INT64_MAX;
pred_exists = 0;
if (mbmi->use_wedge_interinter)
*compmode_wedge_cost = get_wedge_bits(bsize) * 256 +
vp9_cost_bit(cm->fc.wedge_interinter_prob[bsize], 1);
else
*compmode_wedge_cost =
vp9_cost_bit(cm->fc.wedge_interinter_prob[bsize], 0);
}
#endif // CONFIG_WEDGE_PARTITION
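Rate accounting here uses VP9's 1/256-bit units: choosing a wedge costs the raw index bits (get_wedge_bits(bsize) * 256) plus the entropy-coded use_wedge flag from vp9_cost_bit() under the per-block-size probability; disabling the wedge costs only the flag. A tiny illustrative helper showing that composition (flag_cost_256 stands in for the vp9_cost_bit() result):
/* Illustrative only: total wedge signaling rate in 1/256-bit units. */
static int wedge_signaling_rate(int wedge_bits, int flag_cost_256) {
  return wedge_bits * 256 + flag_cost_256;
}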
#if CONFIG_INTERINTRA
if ((!is_comp_pred) && is_comp_interintra_pred &&
is_interintra_allowed(mbmi->sb_type)) {
@ -3096,7 +3363,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
#endif // CONFIG_INTERINTRA
#if CONFIG_INTERINTRA
#if CONFIG_INTERINTRA || CONFIG_WEDGE_PARTITION
*rate2 += rate_mv_tmp;
#endif
@ -3588,6 +3855,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int compmode_cost = 0;
#if CONFIG_INTERINTRA
int compmode_interintra_cost = 0;
#endif
#if CONFIG_WEDGE_PARTITION
int compmode_wedge_cost = 0;
#endif
int rate2 = 0, rate_y = 0, rate_uv = 0;
int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
@ -3783,6 +4053,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->interintra_mode = (PREDICTION_MODE)(DC_PRED - 1);
mbmi->interintra_uv_mode = (PREDICTION_MODE)(DC_PRED - 1);
#endif
#if CONFIG_WEDGE_PARTITION
mbmi->use_wedge_interinter = 0;
#endif
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
@ -3921,6 +4194,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
#if CONFIG_INTERINTRA
&compmode_interintra_cost,
single_newmv_rate,
#endif
#if CONFIG_WEDGE_PARTITION
&compmode_wedge_cost,
#endif
&total_sse, best_rd);
if (this_rd == INT64_MAX)
@ -3935,6 +4211,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
#if CONFIG_INTERINTRA
rate2 += compmode_interintra_cost;
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
if ((cm->reference_mode == REFERENCE_MODE_SELECT ||
cm->reference_mode == COMPOUND_REFERENCE) && comp_pred)
rate2 += compmode_wedge_cost;
#endif
// Estimate the reference frame signaling cost and add it
// to the rolling cost variable.
@ -4259,7 +4540,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
#if CONFIG_INTERINTRA
if (mbmi->ref_frame[1] == INTRA_FRAME)
mbmi->ref_frame[1] = NONE;
#endif
#endif // CONFIG_INTERINTRA
#if CONFIG_WEDGE_PARTITION
mbmi->use_wedge_interinter = 0;
#endif // CONFIG_WEDGE_PARTITION
mbmi->sb_type = bsize;
mbmi->inter_ref_count = inter_ref_count;
mbmi->copy_mode = copy_mode;
@ -4576,6 +4860,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
b_mode_info best_bmodes[4];
int best_skip2 = 0;
int ref_frame_skip_mask[2] = { 0 };
#if CONFIG_EXT_TX
mbmi->ext_txfrm = NORM;
#endif
@ -4587,6 +4872,9 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
#if CONFIG_COPY_MODE
mbmi->copy_mode = NOREF;
#endif
#if CONFIG_WEDGE_PARTITION
mbmi->use_wedge_interinter = 0;
#endif
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);

@ -274,3 +274,47 @@ highbd_sadMxNxK(4, 4, 8)
highbd_sadMxNx4D(4, 4)
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_WEDGE_PARTITION
// TODO(debargha): Need highbd versions of these
static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += m[x] * abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
m += m_stride;
}
sad = (sad + 31) >> 6;
return sad;
}
#define MASKSADMxN(m, n) \
unsigned int vp9_masked_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, n); \
}
MASKSADMxN(64, 64)
MASKSADMxN(64, 32)
MASKSADMxN(32, 64)
MASKSADMxN(32, 32)
MASKSADMxN(32, 16)
MASKSADMxN(16, 32)
MASKSADMxN(16, 16)
MASKSADMxN(16, 8)
MASKSADMxN(8, 16)
MASKSADMxN(8, 8)
MASKSADMxN(8, 4)
MASKSADMxN(4, 8)
MASKSADMxN(4, 4)
#endif // CONFIG_WEDGE_PARTITION
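The mask entries are 6-bit weights in [0, 64], so the accumulated weighted SAD is rescaled by (sad + 31) >> 6, i.e. divided by 64 with rounding; with a uniform mask of 64 the result equals the ordinary SAD. A self-contained sketch of that sanity check (it re-implements the same arithmetic instead of linking against the kernels above):
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/* Same weighting and renormalization as masked_sad() above, on a flat array. */
static unsigned int masked_sad_sketch(const uint8_t *a, const uint8_t *b,
                                      const uint8_t *m, int n) {
  unsigned int sad = 0;
  int i;
  for (i = 0; i < n; ++i)
    sad += m[i] * (unsigned int)abs(a[i] - b[i]);
  return (sad + 31) >> 6;
}
int main(void) {
  uint8_t a[16] = {10, 20, 30, 40, 50, 60, 70, 80,
                   90, 100, 110, 120, 130, 140, 150, 160};
  uint8_t b[16] = {12, 18, 33, 37, 55, 60, 64, 90,
                   90, 101, 108, 125, 130, 139, 151, 158};
  uint8_t m[16];
  unsigned int plain = 0;
  int i;
  for (i = 0; i < 16; ++i) {
    m[i] = 64;  /* uniform full-weight mask */
    plain += (unsigned int)abs(a[i] - b[i]);
  }
  /* Both values print the same (43 for this data). */
  printf("plain=%u masked=%u\n", plain, masked_sad_sketch(a, b, m, 16));
  return 0;
}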

@ -649,3 +649,98 @@ void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_WEDGE_PARTITION
// TODO(debargha): Need highbd versions of these
void masked_variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = (a[j] - b[j]) * (m[j]);
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
m += m_stride;
}
*sum = (*sum >= 0) ? ((*sum + 31) >> 6) : -((-*sum + 31) >> 6);
*sse = (*sse + 2047) >> 12;
}
#define MASK_VAR(W, H) \
unsigned int vp9_masked_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
const uint8_t *m, int m_stride, \
unsigned int *sse) { \
int sum; \
masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define MASK_SUBPIX_VAR(W, H) \
unsigned int vp9_masked_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
const uint8_t *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
BILINEAR_FILTERS_2TAP(xoffset)); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, \
msk, msk_stride, sse); \
}
MASK_VAR(4, 4)
MASK_SUBPIX_VAR(4, 4)
MASK_VAR(4, 8)
MASK_SUBPIX_VAR(4, 8)
MASK_VAR(8, 4)
MASK_SUBPIX_VAR(8, 4)
MASK_VAR(8, 8)
MASK_SUBPIX_VAR(8, 8)
MASK_VAR(8, 16)
MASK_SUBPIX_VAR(8, 16)
MASK_VAR(16, 8)
MASK_SUBPIX_VAR(16, 8)
MASK_VAR(16, 16)
MASK_SUBPIX_VAR(16, 16)
MASK_VAR(16, 32)
MASK_SUBPIX_VAR(16, 32)
MASK_VAR(32, 16)
MASK_SUBPIX_VAR(32, 16)
MASK_VAR(32, 32)
MASK_SUBPIX_VAR(32, 32)
MASK_VAR(32, 64)
MASK_SUBPIX_VAR(32, 64)
MASK_VAR(64, 32)
MASK_SUBPIX_VAR(64, 32)
MASK_VAR(64, 64)
MASK_SUBPIX_VAR(64, 64)
#endif // CONFIG_WEDGE_PARTITION
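The same 6-bit convention drives the renormalization in masked_variance(): the weighted sum is scaled back by 64 (>> 6, with sign-aware rounding) and the weighted sum of squares by 64 * 64 = 4096 (>> 12). Writing d = a - b and w = m / 64, the value fed to the MASK_VAR wrappers is approximately sum((w*d)^2) - (sum(w*d))^2 / (W*H), which collapses to the ordinary variance when the mask is uniformly 64, up to rounding.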

@ -84,6 +84,31 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
unsigned int *sse,
const uint8_t *second_pred);
#if CONFIG_WEDGE_PARTITION
typedef unsigned int(*vp9_masked_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *msk_ptr,
int msk_stride);
typedef unsigned int (*vp9_masked_variance_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *msk_ptr,
int msk_stride,
unsigned int *sse);
typedef unsigned int (*vp9_masked_subpixvariance_fn_t)(const uint8_t *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *msk_ptr,
int msk_stride,
unsigned int *sse);
#endif // CONFIG_WEDGE_PARTITION
typedef struct vp9_variance_vtable {
vp9_sad_fn_t sdf;
vp9_sad_avg_fn_t sdaf;
@ -93,6 +118,11 @@ typedef struct vp9_variance_vtable {
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
#if CONFIG_WEDGE_PARTITION
vp9_masked_sad_fn_t msdf;
vp9_masked_variance_fn_t mvf;
vp9_masked_subpixvariance_fn_t msvf;
#endif // CONFIG_WEDGE_PARTITION
} vp9_variance_fn_ptr_t;
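The three masked entries are meant to be populated with the per-size kernels defined earlier in this patch (vp9_masked_sadMxN_c, vp9_masked_varianceMxN_c, vp9_masked_sub_pixel_varianceMxN_c). The actual wiring lives in the encoder's function-pointer setup, which is not part of this excerpt; an illustrative assignment for one block size would be:
/* Illustrative only: filling one vtable entry with the masked kernels. */
cpi->fn_ptr[BLOCK_16X16].msdf = vp9_masked_sad16x16_c;
cpi->fn_ptr[BLOCK_16X16].mvf  = vp9_masked_variance16x16_c;
cpi->fn_ptr[BLOCK_16X16].msvf = vp9_masked_sub_pixel_variance16x16_c;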
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,