Merge "Rework wedge experiment to improve speed." into nextgen

Debargha Mukherjee 2016-01-14 16:13:38 +00:00 committed by Gerrit Code Review
commit c20adb59b2
9 changed files with 260 additions and 433 deletions
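Before the per-file hunks, a minimal self-contained C sketch of the change this commit makes: the old code evaluated the wedge line equation pixel by pixel on every call to vp9_generate_masked_weight(); the rework fills master mask arrays once at init (vp9_init_wedge_masks) and turns per-block mask "generation" into pointer arithmetic (get_wedge_mask_inplace / vp9_get_soft_mask). Everything below is an illustrative stand-in, not the commit's code: masked_weight() replaces the diff's smoothfn[] table with a linear ramp, and mask_master / mask_inplace are hypothetical names covering a single orientation only.

/* Illustrative sketch -- not part of the diff below. */
#include <stdint.h>
#include <stdio.h>

#define MASK_MASTER_SIZE 128                 /* stands in for 2 * CODING_UNIT_SIZE */
#define MASK_MASTER_STRIDE MASK_MASTER_SIZE
#define WEDGE_WEIGHT_BITS 6

static uint8_t mask_master[MASK_MASTER_SIZE * MASK_MASTER_SIZE];

/* Linear stand-in for the diff's smoothfn[] ramp: 0..64 around m == 0. */
static int masked_weight(int m) {
  if (m < -32) return 0;
  if (m > 32) return 1 << WEDGE_WEIGHT_BITS;
  return m + 32;
}

/* One-time cost: evaluate the line equation over the whole master array,
 * shown for one oblique orientation, cf. a[] = {2, 1, 2, 2} in the diff. */
static void init_master_mask(void) {
  int i, j;
  for (i = 0; i < MASK_MASTER_SIZE; ++i)
    for (j = 0; j < MASK_MASTER_SIZE; ++j) {
      const int x = 2 * j + 1 - MASK_MASTER_SIZE;  /* 2*j + 1 - (a[2]*w)/2 */
      const int y = 2 * i + 1 - MASK_MASTER_SIZE;  /* 2*i + 1 - (a[3]*h)/2 */
      mask_master[i * MASK_MASTER_STRIDE + j] =
          (uint8_t)masked_weight((2 * x + y) / 2);
    }
}

/* Per-block lookup is now just an offset to the block's top-left corner
 * inside the master; rows are read MASK_MASTER_STRIDE apart. */
static const uint8_t *mask_inplace(int h, int w) {
  return mask_master +
         MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - h / 2) +
         (MASK_MASTER_SIZE / 2 - w / 2);
}

int main(void) {
  const uint8_t *m;
  init_master_mask();
  m = mask_inplace(32, 32);
  printf("corner=%d center=%d\n", m[0], m[16 * MASK_MASTER_STRIDE + 16]);
  return 0;
}

The real init pass additionally builds the transposed, reversed, and complement ((1 << WEDGE_WEIGHT_BITS) - weight) variants in the same loop, which is what the wedge_mask_obl / wedge_mask_str arrays below store.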

View File

@@ -303,15 +303,15 @@ static MV average_split_mvs(const struct macroblockd_plane *pd,
static int get_masked_weight(int m) {
#define SMOOTHER_LEN 32
static const uint8_t smoothfn[2 * SMOOTHER_LEN + 1] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
};
if (m < -SMOOTHER_LEN)
return 0;
@@ -321,8 +321,93 @@ static int get_masked_weight(int m) {
return smoothfn[m + SMOOTHER_LEN];
}
static int get_hard_mask(int m) {
return 1 << WEDGE_WEIGHT_BITS * (m > 0);
}
// [negative][transpose][reverse]
DECLARE_ALIGNED(16, static uint8_t,
wedge_mask_obl[2][2][2][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
// [negative][transpose]
DECLARE_ALIGNED(16, static uint8_t,
wedge_mask_str[2][2][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
void vp9_init_wedge_masks() {
int i, j;
const int w = MASK_MASTER_SIZE;
const int h = MASK_MASTER_SIZE;
const int stride = MASK_MASTER_STRIDE;
const int a[4] = {2, 1, 2, 2};
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (2 * j + 1 - (a[2] * w) / 2);
int y = (2 * i + 1 - (a[3] * h) / 2);
int m = (a[0] * x + a[1] * y) / 2;
wedge_mask_obl[0][0][0][i * stride + j] =
wedge_mask_obl[0][1][0][j * stride + i] =
wedge_mask_obl[0][0][1][i * stride + w - 1 - j] =
wedge_mask_obl[0][1][1][(w - 1 - j) * stride + i] =
get_masked_weight(m);
wedge_mask_obl[1][0][0][i * stride + j] =
wedge_mask_obl[1][1][0][j * stride + i] =
wedge_mask_obl[1][0][1][i * stride + w - 1 - j] =
wedge_mask_obl[1][1][1][(w - 1 - j) * stride + i] =
(1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m);
wedge_mask_str[0][0][i * stride + j] =
wedge_mask_str[0][1][j * stride + i] =
get_masked_weight(x);
wedge_mask_str[1][0][i * stride + j] =
wedge_mask_str[1][1][j * stride + i] =
(1 << WEDGE_WEIGHT_BITS) - get_masked_weight(x);
}
}
static void get_wedge_mask_from_array(const int *a,
int h, int w,
uint8_t *mask, int stride) {
const int woff = (a[2] * w) >> 2;
const int hoff = (a[3] * h) >> 2;
const int oblique = (abs(a[0]) + abs(a[1]) == 3);
const uint8_t *master;
int transpose, reverse, negative;
int i;
if (oblique) {
negative = (a[0] < 0);
transpose = (abs(a[0]) == 1);
reverse = (a[0] < 0) ^ (a[1] < 0);
} else {
negative = (a[0] < 0 || a[1] < 0);
transpose = (a[0] == 0);
reverse = 0;
}
master = (oblique ?
wedge_mask_obl[negative][transpose][reverse] :
wedge_mask_str[negative][transpose]) +
MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
MASK_MASTER_SIZE / 2 - woff;
for (i = 0; i < h; ++i)
vpx_memcpy(mask + i * stride, master + i * MASK_MASTER_STRIDE, w);
}
static const uint8_t *get_wedge_mask_inplace(const int *a,
int h, int w) {
const int woff = (a[2] * w) >> 2;
const int hoff = (a[3] * h) >> 2;
const int oblique = (abs(a[0]) + abs(a[1]) == 3);
const uint8_t *master;
int transpose, reverse, negative;
if (oblique) {
negative = (a[0] < 0);
transpose = (abs(a[0]) == 1);
reverse = (a[0] < 0) ^ (a[1] < 0);
} else {
negative = (a[0] < 0 || a[1] < 0);
transpose = (a[0] == 0);
reverse = 0;
}
master = (oblique ?
wedge_mask_obl[negative][transpose][reverse] :
wedge_mask_str[negative][transpose]) +
MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
MASK_MASTER_SIZE / 2 - woff;
return master;
}
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4) = 0
@@ -333,10 +418,10 @@ static const int wedge_params_sml[1 << WEDGE_BITS_SML][4] = {
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{ 2, 1, 2, 2},
{-1, -2, 2, 2},
{ 1, 2, 2, 2},
};
static const int wedge_params_med_hgtw[1 << WEDGE_BITS_MED][4] = {
@@ -344,19 +429,19 @@ static const int wedge_params_med_hgtw[1 << WEDGE_BITS_MED][4] = {
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{ 2, 1, 2, 2},
{-1, -2, 2, 2},
{ 1, 2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 3},
{ 1, 2, 2, 3},
};
static const int wedge_params_med_hltw[1 << WEDGE_BITS_MED][4] = {
@@ -364,19 +449,19 @@ static const int wedge_params_med_hltw[1 << WEDGE_BITS_MED][4] = {
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{ 2, 1, 2, 2},
{-1, -2, 2, 2},
{ 1, 2, 2, 2},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 3, 2},
{ 2, 1, 3, 2},
};
static const int wedge_params_med_heqw[1 << WEDGE_BITS_MED][4] = {
@@ -384,19 +469,19 @@ static const int wedge_params_med_heqw[1 << WEDGE_BITS_MED][4] = {
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{ 2, 1, 2, 2},
{-1, -2, 2, 2},
{ 1, 2, 2, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, 2, 0, 1},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{ 0, 2, 0, 3},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{ 2, 0, 1, 0},
{-2, 0, 3, 0},
{ 2, 0, 3, 0},
};
static const int wedge_params_big_hgtw[1 << WEDGE_BITS_BIG][4] = {
@@ -404,37 +489,37 @@ static const int wedge_params_big_hgtw[1 << WEDGE_BITS_BIG][4] = {
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{ 2, 1, 2, 2},
{-1, -2, 2, 2},
{ 1, 2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 3},
{ 1, 2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 3, 2},
{ 2, 1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 2},
{ 0, 2, 0, 3},
{ 0, 2, 0, 2},
{ 0, -2, 0, 3},
{ 2, 0, 2, 0},
{ 0, 2, 0, 3},
{-2, 0, 2, 0},
{ 2, 0, 2, 0},
};
static const int wedge_params_big_hltw[1 << WEDGE_BITS_BIG][4] = {
@@ -442,37 +527,37 @@ static const int wedge_params_big_hltw[1 << WEDGE_BITS_BIG][4] = {
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{ 2, 1, 2, 2},
{-1, -2, 2, 2},
{ 1, 2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 3},
{ 1, 2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 3, 2},
{ 2, 1, 3, 2},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 2, 0, 1, 0},
{ 0, 2, 0, 2},
{-2, 0, 1, 0},
{ 2, 0, 2, 0},
{ 2, 0, 1, 0},
{-2, 0, 2, 0},
{ 2, 0, 3, 0},
{ 2, 0, 2, 0},
{-2, 0, 3, 0},
{ 2, 0, 3, 0},
};
static const int wedge_params_big_heqw[1 << WEDGE_BITS_BIG][4] = {
@@ -480,37 +565,37 @@ static const int wedge_params_big_heqw[1 << WEDGE_BITS_BIG][4] = {
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{ 2, 1, 2, 2},
{-1, -2, 2, 2},
{ 1, 2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 3},
{ 1, 2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 3, 2},
{ 2, 1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, 2, 0, 1},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{ 0, 2, 0, 3},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{ 2, 0, 1, 0},
{-2, 0, 3, 0},
{ 2, 0, 3, 0},
};
static const int *get_wedge_params(int wedge_index,
@@ -544,34 +629,40 @@ static const int *get_wedge_params(int wedge_index,
return a;
}
void vp9_generate_masked_weight(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride) {
int i, j;
const uint8_t *vp9_get_soft_mask(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w) {
const int *a = get_wedge_params(wedge_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight(m);
}
if (a) {
return get_wedge_mask_inplace(a, h, w);
} else {
return NULL;
}
}
// To be deprecated
void vp9_generate_soft_mask(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride) {
const int *a = get_wedge_params(wedge_index, sb_type, h, w);
if (a) {
get_wedge_mask_from_array(a, h, w, mask, stride);
}
}
// To be deprecated
void vp9_generate_hard_mask(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride) {
int i, j;
const int *a = get_wedge_params(wedge_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_hard_mask(m);
}
if (a) {
get_wedge_mask_from_array(a, h, w, mask, stride);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
mask[i * stride + j] = mask[i * stride + j] > 0;
}
}
}
static void build_masked_compound(uint8_t *dst, int dst_stride,
@@ -579,12 +670,10 @@ static void build_masked_compound(uint8_t *dst, int dst_stride,
int wedge_index, BLOCK_SIZE sb_type,
int h, int w) {
int i, j;
uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE];
vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask,
CODING_UNIT_SIZE);
const uint8_t *mask = vp9_get_soft_mask(wedge_index, sb_type, h, w);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * CODING_UNIT_SIZE + j];
int m = mask[i * MASK_MASTER_STRIDE + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << WEDGE_WEIGHT_BITS) - m) +
@@ -599,14 +688,12 @@ static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride,
int wedge_index, BLOCK_SIZE sb_type,
int h, int w) {
int i, j;
uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE];
const uint8_t *mask = vp9_get_soft_mask(wedge_index, sb_type, h, w);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8);
vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask,
CODING_UNIT_SIZE);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * CODING_UNIT_SIZE + j];
int m = mask[i * MASK_MASTER_STRIDE + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << WEDGE_WEIGHT_BITS) - m) +
@@ -617,10 +704,31 @@ static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride,
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_SUPERTX
void generate_masked_weight_extend(int wedge_index, int plane,
BLOCK_SIZE sb_type, int h, int w,
int wedge_offset_x, int wedge_offset_y,
uint8_t *mask, int stride) {
const uint8_t *get_soft_mask_extend(int wedge_index, int plane,
BLOCK_SIZE sb_type,
int h, int w,
int wedge_offset_y,
int wedge_offset_x) {
int subh = (plane ? 2 : 4) << b_height_log2_lookup[sb_type];
int subw = (plane ? 2 : 4) << b_width_log2_lookup[sb_type];
const int *a = get_wedge_params(wedge_index, sb_type, subh, subw);
(void) h;
(void) w;
if (a) {
const uint8_t *mask = get_wedge_mask_inplace(a, subh, subw);
mask -= (wedge_offset_x + wedge_offset_y * MASK_MASTER_STRIDE);
return mask;
} else {
return NULL;
}
}
// To be deprecated
static void generate_soft_mask_extend(int wedge_index, int plane,
BLOCK_SIZE sb_type, int h, int w,
int wedge_offset_y,
int wedge_offset_x,
uint8_t *mask, int stride) {
int i, j;
int subh = (plane ? 2 : 4) << b_height_log2_lookup[sb_type];
int subw = (plane ? 2 : 4) << b_width_log2_lookup[sb_type];
@@ -628,9 +736,9 @@ void generate_masked_weight_extend(int wedge_index, int plane,
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * subw) / 4 - wedge_offset_x);
int y = (i - (a[3] * subh) / 4 - wedge_offset_y);
int m = a[0] * x + a[1] * y;
int x = (2 * j + 1 - (a[2] * subw) / 2 - 2 * wedge_offset_x);
int y = (2 * i + 1 - (a[3] * subh) / 2 - 2 * wedge_offset_y);
int m = (a[0] * x + a[1] * y) / 2;
mask[i * stride + j] = get_masked_weight(m);
}
}
@@ -639,16 +747,14 @@ static void build_masked_compound_extend(uint8_t *dst, int dst_stride,
uint8_t *dst2, int dst2_stride,
int plane,
int wedge_index, BLOCK_SIZE sb_type,
int wedge_offset_x, int wedge_offset_y,
int wedge_offset_y, int wedge_offset_x,
int h, int w) {
int i, j;
uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE];
generate_masked_weight_extend(wedge_index, plane, sb_type, h, w,
wedge_offset_x, wedge_offset_y, mask,
CODING_UNIT_SIZE);
const uint8_t *mask = get_soft_mask_extend(
wedge_index, plane, sb_type, h, w, wedge_offset_y, wedge_offset_x);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * CODING_UNIT_SIZE + j];
int m = mask[i * MASK_MASTER_STRIDE + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << WEDGE_WEIGHT_BITS) - m) +
@@ -662,18 +768,16 @@ static void build_masked_compound_extend_highbd(
uint8_t *dst_8, int dst_stride,
uint8_t *dst2_8, int dst2_stride, int plane,
int wedge_index, BLOCK_SIZE sb_type,
int wedge_offset_x, int wedge_offset_y,
int wedge_offset_y, int wedge_offset_x,
int h, int w) {
int i, j;
uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE];
const uint8_t *mask = get_soft_mask_extend(
wedge_index, plane, sb_type, h, w, wedge_offset_y, wedge_offset_x);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8);
generate_masked_weight_extend(wedge_index, plane, sb_type, h, w,
wedge_offset_x, wedge_offset_y, mask,
CODING_UNIT_SIZE);
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int m = mask[i * CODING_UNIT_SIZE + j];
int m = mask[i * MASK_MASTER_STRIDE + j];
dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
dst2[i * dst2_stride + j] *
((1 << WEDGE_WEIGHT_BITS) - m) +
@@ -808,20 +912,20 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
} else {
build_masked_compound_extend(
dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
}
#else
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst,
CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
#endif // CONFIG_VP9_HIGHBITDEPTH
#else // CONFIG_SUPERTX
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1559,20 +1663,20 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
} else {
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst,
CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
}
#else
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst,
CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
#endif // CONFIG_VP9_HIGHBITDEPTH
#else // CONFIG_SUPERTX
#if CONFIG_VP9_HIGHBITDEPTH
@@ -2115,20 +2219,20 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
} else {
build_masked_compound_extend(
dst, dst_buf->stride, tmp_dst, CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
}
#else
build_masked_compound_extend(dst, dst_buf->stride, tmp_dst,
CODING_UNIT_SIZE, plane,
mi->mbmi.interinter_wedge_index,
mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
wedge_offset_y, wedge_offset_x, h, w);
#endif // CONFIG_VP9_HIGHBITDEPTH
#else // CONFIG_SUPERTX
#if CONFIG_VP9_HIGHBITDEPTH

View File

@@ -90,8 +90,17 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
const struct scale_factors *sf);
#if CONFIG_WEDGE_PARTITION
void vp9_generate_masked_weight(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
#define MASK_MASTER_SIZE (2 * CODING_UNIT_SIZE)
#define MASK_MASTER_STRIDE (2 * CODING_UNIT_SIZE)
void vp9_init_wedge_masks();
const uint8_t *vp9_get_soft_mask(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w);
void vp9_generate_soft_mask(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
void vp9_generate_hard_mask(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride);
void vp9_build_inter_predictors_for_planes_single_buf(

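The header above is where MASK_MASTER_SIZE and MASK_MASTER_STRIDE enter the interface. A short sketch of the consumer side (blend_with_mask is a hypothetical helper, not in the diff; the rounding term mirrors the build_masked_compound loops in the first file): callers now blend through the borrowed mask pointer at the master stride instead of filling a local CODING_UNIT_SIZE-stride buffer first.

/* Sketch only: the pattern the diff installs at each former call site of
 * vp9_generate_masked_weight(). The mask pointer comes straight from
 * vp9_get_soft_mask(), so rows are MASK_MASTER_STRIDE apart. */
static void blend_with_mask(uint8_t *dst, int dst_stride,
                            const uint8_t *dst2, int dst2_stride,
                            const uint8_t *mask, int h, int w) {
  int i, j;
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) {
      const int m = mask[i * MASK_MASTER_STRIDE + j];  /* master stride, not w */
      dst[i * dst_stride + j] =
          (dst[i * dst_stride + j] * m +
           dst2[i * dst2_stride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) +
           (1 << (WEDGE_WEIGHT_BITS - 1))) >> WEDGE_WEIGHT_BITS;
    }
}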
View File

@@ -14,6 +14,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_onyxc_int.h"
const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
@@ -1454,283 +1455,6 @@ static INLINE TX_SIZE blocklen_to_txsize(int bs) {
}
}
#if CONFIG_WEDGE_PARTITION
static int get_masked_weight_interintra(int m) {
#define SMOOTHER_LEN_INTERINTRA 32
static const uint8_t smoothfn[2 * SMOOTHER_LEN_INTERINTRA + 1] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 2, 2, 3, 4, 5, 6,
8, 9, 12, 14, 17, 21, 24, 28,
32,
36, 40, 43, 47, 50, 52, 55, 56,
58, 59, 60, 61, 62, 62, 63, 63,
63, 63, 63, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64,
};
if (m < -SMOOTHER_LEN_INTERINTRA)
return 0;
else if (m > SMOOTHER_LEN_INTERINTRA)
return (1 << WEDGE_WEIGHT_BITS);
else
return smoothfn[m + SMOOTHER_LEN_INTERINTRA];
}
static int get_hard_mask_interintra(int m) {
return m > 0;
}
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/4) + a[1]*(y - a[3]*h/4) = 0
// The soft mask is obtained by computing f(x, y) and then calling
// get_masked_weight(f(x, y)).
static const int wedge_params_sml_interintra[1 << WEDGE_BITS_SML][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
};
static const int wedge_params_med_hgtw_interintra[1 << WEDGE_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
};
static const int wedge_params_med_hltw_interintra[1 << WEDGE_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
};
static const int wedge_params_med_heqw_interintra[1 << WEDGE_BITS_MED][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int wedge_params_big_hgtw_interintra[1 << WEDGE_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
};
static const int wedge_params_big_hltw_interintra[1 << WEDGE_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 2},
{ 0, -2, 0, 2},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 2, 0},
{-2, 0, 2, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int wedge_params_big_heqw_interintra[1 << WEDGE_BITS_BIG][4] = {
{-1, 2, 2, 2},
{ 1, -2, 2, 2},
{-2, 1, 2, 2},
{ 2, -1, 2, 2},
{ 2, 1, 2, 2},
{-2, -1, 2, 2},
{ 1, 2, 2, 2},
{-1, -2, 2, 2},
{-1, 2, 2, 1},
{ 1, -2, 2, 1},
{-1, 2, 2, 3},
{ 1, -2, 2, 3},
{ 1, 2, 2, 1},
{-1, -2, 2, 1},
{ 1, 2, 2, 3},
{-1, -2, 2, 3},
{-2, 1, 1, 2},
{ 2, -1, 1, 2},
{-2, 1, 3, 2},
{ 2, -1, 3, 2},
{ 2, 1, 1, 2},
{-2, -1, 1, 2},
{ 2, 1, 3, 2},
{-2, -1, 3, 2},
{ 0, 2, 0, 1},
{ 0, -2, 0, 1},
{ 0, 2, 0, 3},
{ 0, -2, 0, 3},
{ 2, 0, 1, 0},
{-2, 0, 1, 0},
{ 2, 0, 3, 0},
{-2, 0, 3, 0},
};
static const int *get_wedge_params_interintra(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w) {
const int *a = NULL;
const int wedge_bits = get_wedge_bits(sb_type);
if (wedge_index == WEDGE_NONE)
return NULL;
if (wedge_bits == WEDGE_BITS_SML) {
a = wedge_params_sml_interintra[wedge_index];
} else if (wedge_bits == WEDGE_BITS_MED) {
if (h > w)
a = wedge_params_med_hgtw_interintra[wedge_index];
else if (h < w)
a = wedge_params_med_hltw_interintra[wedge_index];
else
a = wedge_params_med_heqw_interintra[wedge_index];
} else if (wedge_bits == WEDGE_BITS_BIG) {
if (h > w)
a = wedge_params_big_hgtw_interintra[wedge_index];
else if (h < w)
a = wedge_params_big_hltw_interintra[wedge_index];
else
a = wedge_params_big_heqw_interintra[wedge_index];
} else {
assert(0);
}
return a;
}
void vp9_generate_masked_weight_interintra(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride) {
int i, j;
const int *a = get_wedge_params_interintra(wedge_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_masked_weight_interintra(m);
}
}
void vp9_generate_hard_mask_interintra(int wedge_index, BLOCK_SIZE sb_type,
int h, int w, uint8_t *mask, int stride) {
int i, j;
const int *a = get_wedge_params_interintra(wedge_index, sb_type, h, w);
if (!a) return;
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) {
int x = (j - (a[2] * w) / 4);
int y = (i - (a[3] * h) / 4);
int m = a[0] * x + a[1] * y;
mask[i * stride + j] = get_hard_mask_interintra(m);
}
}
#endif // CONFIG_WEDGE_PARTITION
static void combine_interintra(PREDICTION_MODE mode,
#if CONFIG_WEDGE_PARTITION
int use_wedge_interintra,
@@ -1769,11 +1493,10 @@ static void combine_interintra(PREDICTION_MODE mode,
#if CONFIG_WEDGE_PARTITION
if (use_wedge_interintra && get_wedge_bits(bsize)) {
uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE];
vp9_generate_masked_weight_interintra(wedge_index, bsize, bh, bw, mask, bw);
const uint8_t *mask = vp9_get_soft_mask(wedge_index, bsize, bh, bw);
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int m = mask[i * bw + j];
int m = mask[i * MASK_MASTER_STRIDE + j];
comppred[i * compstride + j] =
(intrapred[i * intrastride + j] * m +
interpred[i * interstride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) +
@@ -1918,11 +1641,10 @@ static void combine_interintra_highbd(PREDICTION_MODE mode,
#if CONFIG_WEDGE_PARTITION
if (use_wedge_interintra && get_wedge_bits(bsize)) {
uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE];
vp9_generate_masked_weight_interintra(wedge_index, bsize, bh, bw, mask, bw);
const uint8_t *mask = vp9_get_soft_mask(wedge_index, bsize, bh, bw);
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
int m = mask[i * bw + j];
int m = mask[i * MASK_MASTER_STRIDE + j];
comppred[i * compstride + j] =
(intrapred[i * intrastride + j] * m +
interpred[i * interstride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) +

View File

@@ -47,12 +47,6 @@ void vp9_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *vpred,
int ustride, int vstride,
BLOCK_SIZE bsize);
#if CONFIG_WEDGE_PARTITION
void vp9_generate_masked_weight_interintra(int wedge_index,
BLOCK_SIZE sb_type,
int h, int w,
uint8_t *mask, int stride);
#endif // CONFIG_WEDGE_PARTITION
#endif // CONFIG_INTERINTRA
#ifdef __cplusplus
} // extern "C"

View File

@@ -25,6 +25,7 @@
#include "vp9/common/vp9_postproc.h"
#endif
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
@@ -39,6 +40,9 @@ static void initialize_dec() {
if (!init_done) {
vp9_rtcd();
vp9_init_intra_predictors();
#if CONFIG_WEDGE_PARTITION
vp9_init_wedge_masks();
#endif // CONFIG_WEDGE_PARTITION
init_done = 1;
}
}

View File

@@ -149,6 +149,9 @@ void vp9_initialize_enc() {
vp9_entropy_mv_init();
vp9_entropy_mode_init();
vp9_temporal_filter_init();
#if CONFIG_WEDGE_PARTITION
vp9_init_wedge_masks();
#endif
init_done = 1;
}
}

View File

@@ -2077,7 +2077,7 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
}
int vp9_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const uint8_t *mask, int mask_stride,
MV *bestmv, const MV *ref_mv,
int allow_hp,
int error_per_bit,
@@ -2173,7 +2173,7 @@ int vp9_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
#undef CHECK_BETTER
int vp9_get_masked_mvpred_var(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const uint8_t *mask, int mask_stride,
const MV *best_mv, const MV *center_mv,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost, int is_second) {
@@ -2191,7 +2191,7 @@ int vp9_get_masked_mvpred_var(const MACROBLOCK *x,
}
int vp9_masked_refining_search_sad_c(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const uint8_t *mask, int mask_stride,
MV *ref_mv, int error_per_bit,
int search_range,
const vp9_variance_fn_ptr_t *fn_ptr,
@@ -2238,7 +2238,7 @@ int vp9_masked_refining_search_sad_c(const MACROBLOCK *x,
int vp9_masked_diamond_search_sad_c(const MACROBLOCK *x,
const search_site_config *cfg,
uint8_t *mask, int mask_stride,
const uint8_t *mask, int mask_stride,
MV *ref_mv, MV *best_mv,
int search_param,
int sad_per_bit, int *num00,
@@ -2329,7 +2329,7 @@ int vp9_masked_diamond_search_sad_c(const MACROBLOCK *x,
}
int vp9_masked_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const uint8_t *mask, int mask_stride,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
const vp9_variance_fn_ptr_t *fn_ptr,

View File

@@ -148,7 +148,7 @@ int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
#if CONFIG_WEDGE_PARTITION
int vp9_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const uint8_t *mask, int mask_stride,
MV *bestmv, const MV *ref_mv,
int allow_hp,
int error_per_bit,
@@ -159,7 +159,7 @@ int vp9_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
int *distortion,
unsigned int *sse1, int is_second);
int vp9_masked_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const uint8_t *mask, int mask_stride,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
const vp9_variance_fn_ptr_t *fn_ptr,

View File

@@ -4965,7 +4965,7 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
#if CONFIG_WEDGE_PARTITION
static void do_masked_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *mask, int mask_stride,
const uint8_t *mask, int mask_stride,
BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv,
@@ -5115,12 +5115,9 @@ static void do_masked_motion_search_indexed(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE sb_type = mbmi->sb_type;
int w = (4 << b_width_log2_lookup[sb_type]);
int h = (4 << b_height_log2_lookup[sb_type]);
int i, j;
uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE];
int mask_stride = CODING_UNIT_SIZE;
vp9_generate_masked_weight(wedge_index, sb_type, h, w, mask, mask_stride);
// vp9_generate_hard_mask(wedge_index, sb_type, h, w, mask, mask_stride);
const uint8_t *mask;
const int mask_stride = MASK_MASTER_STRIDE;
mask = vp9_get_soft_mask(wedge_index, sb_type, h, w);
if (which == 0 || which == 2)
do_masked_motion_search(cpi, x, mask, mask_stride, bsize,
@@ -5131,10 +5128,8 @@ static void do_masked_motion_search_indexed(VP9_COMP *cpi, MACROBLOCK *x,
0);
if (which == 1 || which == 2) {
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j)
mask[i * mask_stride + j] = 64 - mask[i * mask_stride + j];
// get the negative mask
mask = vp9_get_soft_mask(wedge_index ^ 1, sb_type, h, w);
do_masked_motion_search(cpi, x, mask, mask_stride, bsize,
mi_row, mi_col, &tmp_mv[1], &rate_mv[1],
#if CONFIG_NEW_INTER
@@ -5773,7 +5768,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
#ifdef WEDGE_INTERINTRA_REFINE_SEARCH
int bw = 4 << b_width_log2_lookup[mbmi->sb_type],
bh = 4 << b_height_log2_lookup[mbmi->sb_type];
uint8_t mask[CODING_UNIT_SIZE * CODING_UNIT_SIZE];
int_mv tmp_mv;
int tmp_rate_mv = 0;
#endif // WEDGE_INTERINTRA_REFINE_SEARCH
@@ -5830,8 +5824,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->interintra_uv_wedge_index = wedge_index;
vp9_build_interintra_predictors(xd, tmp_buf, tmp_buf + tmp_buf_sz,
tmp_buf + 2 * tmp_buf_sz,
CODING_UNIT_SIZE, CODING_UNIT_SIZE,
CODING_UNIT_SIZE, bsize);
CODING_UNIT_SIZE, CODING_UNIT_SIZE,
CODING_UNIT_SIZE, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, NULL, NULL);
rd = RDCOST(x->rdmult, x->rddiv,
rmode + rate_mv_tmp + rwedge + rate_sum, dist_sum);
@@ -5847,15 +5841,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
#else
if (this_mode == NEWMV) {
#endif // CONFIG_NEW_INTER
int j;
// get negative of mask
const uint8_t* mask = vp9_get_soft_mask(
best_wedge_index ^ 1, bsize, bh, bw);
mbmi->interintra_wedge_index = best_wedge_index;
mbmi->interintra_uv_wedge_index = best_wedge_index;
vp9_generate_masked_weight_interintra(best_wedge_index, bsize,
bh, bw, mask, bw);
for (i = 0; i < bh; ++i)
for (j = 0; j < bw; ++j)
mask[i * bw + j] = 64 - mask[i * bw + j];
do_masked_motion_search(cpi, x, mask, bw, bsize,
do_masked_motion_search(cpi, x, mask, MASK_MASTER_STRIDE, bsize,
mi_row, mi_col, &tmp_mv, &tmp_rate_mv,
#if CONFIG_NEW_INTER
&ref_mv[0],
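A closing note on the wedge_index ^ 1 idiom in the rdopt hunks above: the reordered wedge parameter tables pair each orientation with its sign-flipped complement at the adjacent index, and vp9_init_wedge_masks() stores the [negative] masters as exact complements ((1 << WEDGE_WEIGHT_BITS) - weight), so flipping the low bit of the index replaces the old per-pixel 64 - mask[...] pass. A small hypothetical self-check (check_complementary_masks is not in the diff) makes the invariant explicit:

#include <assert.h>

/* Assumes the wedge masks were initialized via vp9_init_wedge_masks(). */
static void check_complementary_masks(int wedge_index, BLOCK_SIZE bsize,
                                      int h, int w) {
  const uint8_t *m0 = vp9_get_soft_mask(wedge_index, bsize, h, w);
  const uint8_t *m1 = vp9_get_soft_mask(wedge_index ^ 1, bsize, h, w);
  int i, j;
  assert(m0 != NULL && m1 != NULL);
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j)
      assert(m0[i * MASK_MASTER_STRIDE + j] +
             m1[i * MASK_MASTER_STRIDE + j] == (1 << WEDGE_WEIGHT_BITS));
}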