Add joint motion search in comp_inter_inter mode (experiment)

In the current code, motion vectors obtained in single prediction mode
are used directly in compound prediction mode. These motion vectors may
not give an accurate prediction since they were searched independently
of each other. In this patch, following Pascal's suggestion, we do a
joint motion search in compound prediction mode to find better motion
vectors in this situation.
Test results:
Overall PSNR: 0.570% (derf), 0.918% (stdhd);
SSIM: 0.572% (derf), 1.009% (stdhd);

The encoder is a little slower. This can be improved later, since the
motion search still uses some C code.

Change-Id: Ib30c9240f6c56c9b070867b4ca89412a76d9f3c6
Yunqing Wang 2013-05-07 09:45:28 -07:00
parent 4305dd4778
commit 9f5811c2da
9 changed files with 837 additions and 33 deletions
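At a high level, the joint search added to handle_inter_mode() in vp9_rdopt.c alternates between the two references: it holds one reference's prediction fixed (built from that reference's current motion vector) and re-searches the other reference's motion vector against the compound (averaged) prediction. A minimal sketch of that loop, using placeholder helper names rather than the real entry points (the real ones are vp9_build_inter_predictor, vp9_refining_search_8p_c and vp9_find_best_sub_pixel_comp, all shown in the diff below):

  /* Sketch only; see handle_inter_mode() in vp9_rdopt.c for the real code. */
  for (ite = 0; ite < 2; ++ite) {
    int id = ite % 2;                    /* 0: refine ref0's mv, 1: refine ref1's mv */
    /* Build the fixed prediction from the other reference at its current mv. */
    build_prediction(second_pred, ref[!id], mv[!id]);
    /* Full-pel 8-neighbor refinement of mv[id] against the averaged prediction. */
    full_pel_refine(&mv[id], second_pred);
    /* Sub-pel (1/2, 1/4, 1/8 pel) refinement of mv[id], same criterion. */
    sub_pel_refine(&mv[id], second_pred);
  }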

configure

@@ -247,6 +247,7 @@ EXPERIMENT_LIST="
multiple_arf
non420
ab4x4
comp_inter_joint_search
"
CONFIG_LIST="
external_build
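As with the other EXPERIMENT_LIST entries, the new tool is compiled out by default; it should be enabled at build time with the usual experiment switches (assuming the standard convention of replacing underscores with dashes in the flag name):

  ./configure --enable-experimental --enable-comp-inter-joint-search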


@@ -36,6 +36,7 @@ typedef enum BLOCK_SIZE_TYPE {
BLOCK_SIZE_SB32X64,
BLOCK_SIZE_SB64X32,
BLOCK_SIZE_SB64X64,
BLOCK_SIZE_TYPES
} BLOCK_SIZE_TYPE;
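The new trailing enumerator simply counts the block-size types, so it can be used to size per-block-size lookup tables. The joint-search code in vp9_rdopt.c below relies on exactly that, e.g.:

  const int b_sz[BLOCK_SIZE_TYPES][2] = { {4, 4}, {8, 8}, /* ... */ {64, 64} };
  /* width = b_sz[bsize][0], height = b_sz[bsize][1] for a given BLOCK_SIZE_TYPE bsize */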
typedef enum PARTITION_TYPE {


@@ -337,41 +337,74 @@ vp9_variance4x4_mmx=vp9_variance4x4_mmx
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x64 sse2
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x64
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x32
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x16
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x32
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x16
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 mmx
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x16
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x8
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 mmx
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x8
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse2 mmx
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x4
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
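Each prototype/specialize pair in this file is expanded by the rtcd generator into a declaration plus a dispatch alias. Since none of the new _avg_ variants list a SIMD specialization yet, they all resolve to the C implementations; the generated header ends up containing roughly the following (a sketch of the generator output, not text from this patch):

  unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride,
                                                 int xoffset, int yoffset,
                                                 const uint8_t *ref_ptr, int ref_stride,
                                                 unsigned int *sse,
                                                 const uint8_t *second_pred);
  #define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c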


@@ -413,6 +413,201 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
return besterr;
}
#if CONFIG_COMP_INTER_JOINT_SEARCH
#undef DIST
/* returns subpixel variance error function */
#define DIST(r, c) \
vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
z, src_stride, &sse, second_pred)
int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1,
const uint8_t *second_pred, int w, int h) {
uint8_t *z = x->plane[0].src.buf;
int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
int rr, rc, br, bc, hstep;
int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int left, right, up, down, diag;
unsigned int sse;
unsigned int whichdir;
unsigned int halfiters = 4;
unsigned int quarteriters = 4;
unsigned int eighthiters = 4;
int thismse;
int maxc, minc, maxr, minr;
int y_stride;
int offset;
int usehp = xd->allow_high_precision_mv;
uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
uint8_t *y = xd->plane[0].pre[0].buf +
(bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
bestmv->as_mv.col;
y_stride = xd->plane[0].pre[0].stride;
rr = ref_mv->as_mv.row;
rc = ref_mv->as_mv.col;
br = bestmv->as_mv.row << 3;
bc = bestmv->as_mv.col << 3;
hstep = 4;
minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
((1 << MV_MAX_BITS) - 1));
maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
((1 << MV_MAX_BITS) - 1));
minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
((1 << MV_MAX_BITS) - 1));
maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
((1 << MV_MAX_BITS) - 1));
tr = br;
tc = bc;
offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
// central mv
bestmv->as_mv.row <<= 3;
bestmv->as_mv.col <<= 3;
// calculate central point error
// TODO(yunqingwang): the central point error was already calculated in the
// full-pixel search and could be passed into this function.
comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
error_per_bit, xd->allow_high_precision_mv);
// Each subsequent iteration checks at least one point in common with
// the last iteration (could be two if the diagonal was selected).
while (--halfiters) {
// 1/2 pel
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
// Each subsequent iteration checks at least one point in common with
// the last iteration (could be two if the diagonal was selected).
// 1/4 pel
hstep >>= 1;
while (--quarteriters) {
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
if (xd->allow_high_precision_mv) {
usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
} else {
usehp = 0;
}
if (usehp) {
hstep >>= 1;
while (--eighthiters) {
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
}
bestmv->as_mv.row = br;
bestmv->as_mv.col = bc;
vpx_free(comp_pred);
if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
#endif // CONFIG_COMP_INTER_JOINT_SEARCH
#undef MVC
#undef PRE
#undef DIST
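vp9_find_best_sub_pixel_comp() reuses the CHECK_BETTER/MVC/PRE/SP machinery already defined earlier in vp9_mcomp.c; only DIST is redefined above so that each candidate position is scored against the averaged (compound) prediction via vfp->svaf. For reference, CHECK_BETTER behaves roughly like this (a simplified sketch, not the exact macro in the file):

  /* Evaluate position (r, c) if it is inside the mv limits, and keep it if
   * distortion plus mv cost beats the best found so far. */
  #define CHECK_BETTER(v, r, c)                                        \
    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {            \
      thismse = DIST(r, c);           /* compound sub-pel variance */  \
      if ((v = MVC(r, c) + thismse) < besterr) {                       \
        besterr = v; br = r; bc = c;                                   \
        *distortion = thismse; *sse1 = sse;                            \
      }                                                                \
    } else {                                                           \
      v = INT_MAX;                                                     \
    }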
@@ -2132,7 +2327,109 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
return INT_MAX;
}
#if CONFIG_COMP_INTER_JOINT_SEARCH
/* This function is called when we do joint motion search in comp_inter_inter
* mode.
*/
int vp9_refining_search_8p_c(MACROBLOCK *x,
int_mv *ref_mv, int error_per_bit,
int search_range, vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2], int_mv *center_mv,
const uint8_t *second_pred, int w, int h) {
const MACROBLOCKD* const xd = &x->e_mbd;
MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
{-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
int i, j;
int this_row_offset, this_col_offset;
int what_stride = x->plane[0].src.stride;
int in_what_stride = xd->plane[0].pre[0].stride;
uint8_t *what = x->plane[0].src.buf;
uint8_t *best_address = xd->plane[0].pre[0].buf +
(ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
ref_mv->as_mv.col;
uint8_t *check_here;
unsigned int thissad;
int_mv this_mv;
unsigned int bestsad = INT_MAX;
int_mv fcenter_mv;
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
/* Compound pred buffer */
uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
/* Get compound pred by averaging two pred blocks. */
comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
for (j = 0; j < 8; j++) {
this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
if ((this_col_offset > x->mv_col_min) &&
(this_col_offset < x->mv_col_max) &&
(this_row_offset > x->mv_row_min) &&
(this_row_offset < x->mv_row_max)) {
check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
best_address;
/* Get compound block and use it to calculate SAD. */
comp_avg_pred(comp_pred, second_pred, w, h, check_here,
in_what_stride);
thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
mvsadcost, error_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
best_site = j;
}
}
}
}
if (best_site == -1) {
break;
} else {
ref_mv->as_mv.row += neighbors[best_site].row;
ref_mv->as_mv.col += neighbors[best_site].col;
best_address += (neighbors[best_site].row) * in_what_stride +
neighbors[best_site].col;
}
}
this_mv.as_mv.row = ref_mv->as_mv.row << 3;
this_mv.as_mv.col = ref_mv->as_mv.col << 3;
if (bestsad < INT_MAX) {
int besterr;
comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
(unsigned int *)(&thissad)) +
mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
xd->allow_high_precision_mv);
vpx_free(comp_pred);
return besterr;
} else {
vpx_free(comp_pred);
return INT_MAX;
}
}
#endif // CONFIG_COMP_INTER_JOINT_SEARCH
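In the refinement loop above, each of the eight neighboring full-pel positions is scored against the compound prediction rather than against a single reference. Informally, the quantity being minimized is (a restatement of the code above, not a new formula):

  /* cost(mv) = SAD(src, (pred_of_ref_being_searched(mv) + second_pred + 1) >> 1)
   *          + mvsad_err_cost(mv, predicted_mv) * error_per_bit
   * The per-pixel averaging is exactly what comp_avg_pred() computes, and
   * second_pred holds the other reference's prediction at its currently fixed mv. */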
#ifdef ENTROPY_STATS
void print_mode_context(VP9_COMMON *pc) {


@@ -79,5 +79,21 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
int *mvjcost, int *mvcost[2],
int_mv *center_mv);
#if CONFIG_COMP_INTER_JOINT_SEARCH
int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1,
const uint8_t *second_pred,
int w, int h);
int vp9_refining_search_8p_c(MACROBLOCK *x,
int_mv *ref_mv, int error_per_bit,
int search_range, vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
int_mv *center_mv, const uint8_t *second_pred,
int w, int h);
#endif // CONFIG_COMP_INTER_JOINT_SEARCH
#endif // VP9_ENCODER_VP9_MCOMP_H_


@@ -1516,10 +1516,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
for (i = 0; i < MAX_MODES; i++)
cpi->rd_thresh_mult[i] = 128;
#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
cpi->fn_ptr[BT].svaf = SVAF; \
cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \
cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \
cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1528,57 +1529,64 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
NULL, NULL,
vp9_sub_pixel_avg_variance32x16, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x16x4d)
BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
NULL, NULL,
vp9_sub_pixel_avg_variance16x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad16x32x4d)
BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
NULL, NULL,
vp9_sub_pixel_avg_variance64x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad64x32x4d)
BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
NULL, NULL,
vp9_sub_pixel_avg_variance32x64, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x64x4d)
BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
vp9_variance_halfpixvar32x32_v,
vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
vp9_sad32x32x4d)
BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
vp9_variance_halfpixvar64x64_v,
vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
vp9_sad64x64x4d)
BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
vp9_sad16x16x4d)
vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
vp9_variance_halfpixvar16x16_v,
vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
vp9_sad16x16x4d)
BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
NULL, NULL, NULL, NULL, NULL, NULL, NULL)
BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
NULL, NULL, NULL, NULL, NULL, NULL, NULL)
BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = vp9_diamond_search_sad;
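The extra SVAF argument slots the new sub-pixel avg-variance functions into the per-block-size function table. Expanding the BLOCK_16X16 invocation above by hand (shown only for illustration), it assigns:

  cpi->fn_ptr[BLOCK_16X16].sdf  = vp9_sad16x16;
  cpi->fn_ptr[BLOCK_16X16].vf   = vp9_variance16x16;
  cpi->fn_ptr[BLOCK_16X16].svf  = vp9_sub_pixel_variance16x16;
  cpi->fn_ptr[BLOCK_16X16].svaf = vp9_sub_pixel_avg_variance16x16;  /* new slot */
  cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vp9_variance_halfpixvar16x16_h;
  cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vp9_variance_halfpixvar16x16_v;
  cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp9_variance_halfpixvar16x16_hv;
  cpi->fn_ptr[BLOCK_16X16].sdx3f  = vp9_sad16x16x3;
  cpi->fn_ptr[BLOCK_16X16].sdx8f  = vp9_sad16x16x8;
  cpi->fn_ptr[BLOCK_16X16].sdx4df = vp9_sad16x16x4d;

The svaf entry is what vp9_find_best_sub_pixel_comp() ends up calling through its DIST macro (vfp->svaf).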


@@ -1807,8 +1807,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
INTERPOLATIONFILTERTYPE *best_filter,
int_mv frame_mv[MB_MODE_COUNT]
[MAX_REF_FRAMES],
YV12_BUFFER_CONFIG *scaled_ref_frame,
int mi_row, int mi_col) {
YV12_BUFFER_CONFIG **scaled_ref_frame,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES]) {
const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
VP9_COMMON *cm = &cpi->common;
@@ -1838,6 +1839,152 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
if (is_comp_pred) {
#if CONFIG_COMP_INTER_JOINT_SEARCH
const int b_sz[BLOCK_SIZE_TYPES][2] = {
{4, 4},
{8, 8},
{8, 16},
{16, 8},
{16, 16},
{16, 32},
{32, 16},
{32, 32},
{32, 64},
{64, 32},
{64, 64}
};
int ite;
// Prediction buffer from second frame.
uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] *
b_sz[bsize][1] * sizeof(uint8_t));
// Do joint motion search in compound mode to get more accurate mv.
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
struct buf_2d scaled_first_yv12;
if (scaled_ref_frame[0]) {
int i;
// Swap out the reference frame for a version that's been scaled to
// match the resolution of the current frame, allowing the existing
// motion search code to be used without additional modifications.
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[0];
setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
NULL, NULL);
}
if (scaled_ref_frame[1]) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
backup_second_yv12[i] = xd->plane[i].pre[1];
setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
NULL, NULL);
}
xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
mi_row, mi_col);
xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
mi_row, mi_col);
scaled_first_yv12 = xd->plane[0].pre[0];
// Initialize mv using single prediction mode result.
frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int;
frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int;
// Iteration: the joint search is done once for each ref frame.
// We tried running the search multiple times iteratively, breaking out
// when no better mv could be found, but tests didn't show a noticeable
// improvement.
for (ite = 0; ite < 2; ite++) {
struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0],
xd->plane[0].pre[1]};
int bestsme = INT_MAX;
int sadpb = x->sadperbit16;
int_mv tmp_mv;
int search_range = 3;
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
int tmp_row_min = x->mv_row_min;
int tmp_row_max = x->mv_row_max;
int id = ite % 2;
// Get pred block from second frame.
vp9_build_inter_predictor(ref_yv12[!id].buf,
ref_yv12[!id].stride,
second_pred, b_sz[bsize][0],
&frame_mv[NEWMV][refs[!id]],
&xd->scale_factor[!id],
b_sz[bsize][0], b_sz[bsize][1], 0,
&xd->subpix);
// Compound motion search on first ref frame.
if (id)
xd->plane[0].pre[0] = ref_yv12[id];
vp9_clamp_mv_min_max(x, &ref_mv[id]);
// Use mv result from single mode as mvp.
tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int;
tmp_mv.as_mv.col >>= 3;
tmp_mv.as_mv.row >>= 3;
// Small-range full-pixel motion search
bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
search_range,
&cpi->fn_ptr[block_size],
x->nmvjointcost, x->mvcost,
&ref_mv[id], second_pred,
b_sz[bsize][0], b_sz[bsize][1]);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
if (bestsme < INT_MAX) {
int dis; /* TODO: use dis in distortion calculation later. */
unsigned int sse;
vp9_find_best_sub_pixel_comp(x, &tmp_mv,
&ref_mv[id],
x->errorperbit,
&cpi->fn_ptr[block_size],
x->nmvjointcost, x->mvcost,
&dis, &sse, second_pred,
b_sz[bsize][0], b_sz[bsize][1]);
}
frame_mv[NEWMV][refs[id]].as_int =
xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
if (id)
xd->plane[0].pre[0] = scaled_first_yv12;
}
// restore the predictor
if (scaled_ref_frame[0]) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
xd->plane[i].pre[0] = backup_yv12[i];
}
if (scaled_ref_frame[1]) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
xd->plane[i].pre[1] = backup_second_yv12[i];
}
vpx_free(second_pred);
#endif // CONFIG_COMP_INTER_JOINT_SEARCH
if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
return INT64_MAX;
@@ -1862,7 +2009,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int tmp_row_min = x->mv_row_min;
int tmp_row_max = x->mv_row_max;
if (scaled_ref_frame) {
if (scaled_ref_frame[0]) {
int i;
// Swap out the reference frame for a version that's been scaled to
@@ -1871,7 +2018,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[0];
setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
NULL, NULL);
}
@@ -1914,6 +2061,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
frame_mv[NEWMV][refs[0]].as_int =
xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
single_newmv[refs[0]].as_int = tmp_mv.as_int;
// Add the new motion vector cost to our rolling cost variable
*rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -1921,7 +2069,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
96, xd->allow_high_precision_mv);
// restore the predictor, if required
if (scaled_ref_frame) {
if (scaled_ref_frame[0]) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
@@ -2205,6 +2353,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
int frame_mdcounts[4][4];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
int_mv single_newmv[MAX_REF_FRAMES];
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
int idx_list[4] = {0,
@@ -2251,6 +2400,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
xd->mode_info_context->mbmi.segment_id = segment_id;
estimate_ref_frame_costs(cpi, segment_id, ref_costs);
vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
vpx_memset(&single_newmv, 0, sizeof(single_newmv));
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = INT64_MAX;
@@ -2608,7 +2758,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
mbmi->mode = this_mode;
} else {
YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
int fb;
if (mbmi->ref_frame == LAST_FRAME) {
@@ -2620,7 +2770,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
if (comp_pred) {
if (mbmi->second_ref_frame == LAST_FRAME) {
fb = cpi->lst_fb_idx;
} else if (mbmi->second_ref_frame == GOLDEN_FRAME) {
fb = cpi->gld_fb_idx;
} else {
fb = cpi->alt_fb_idx;
}
if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
}
this_rd = handle_inter_mode(cpi, x, bsize,
mdcounts, txfm_cache,
@@ -2630,7 +2793,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&rate_uv, &distortion_uv,
&mode_excluded, &disable_skip,
mode_index, &tmp_best_filter, frame_mv,
scaled_ref_frame, mi_row, mi_col);
scaled_ref_frame, mi_row, mi_col,
single_newmv);
if (this_rd == INT64_MAX)
continue;
}
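The new single_newmv[] array exists only to seed the joint search: the single-prediction NEWMV search records its result per reference frame, and the compound path starts its alternating iterations from those cached vectors instead of re-deriving them. In outline (paraphrasing the hunks above):

  /* single-prediction NEWMV path, per reference: */
  single_newmv[refs[0]].as_int = tmp_mv.as_int;
  /* ... later, compound NEWMV path, before the joint iterations: */
  frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int;
  frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int;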


@@ -12,6 +12,7 @@
#define VP9_ENCODER_VP9_VARIANCE_H_
#include "vpx/vpx_integer.h"
// #include "./vpx_config.h"
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
@@ -50,6 +51,15 @@ typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
int Refstride,
unsigned int *sse);
typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const uint8_t *ref_ptr,
int Refstride,
unsigned int *sse,
const uint8_t *second_pred);
typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
int rp, unsigned long *sum_s,
unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,33 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
int ref_stride);
typedef struct vp9_variance_vtable {
vp9_sad_fn_t sdf;
vp9_variance_fn_t vf;
vp9_subpixvariance_fn_t svf;
vp9_variance_fn_t svf_halfpix_h;
vp9_variance_fn_t svf_halfpix_v;
vp9_variance_fn_t svf_halfpix_hv;
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi1_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
vp9_sad_fn_t sdf;
vp9_variance_fn_t vf;
vp9_subpixvariance_fn_t svf;
vp9_subp_avg_variance_fn_t svaf;
vp9_variance_fn_t svf_halfpix_h;
vp9_variance_fn_t svf_halfpix_v;
vp9_variance_fn_t svf_halfpix_hv;
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi1_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
// #if CONFIG_COMP_INTER_JOINT_SEARCH
static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight,
int height, uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < weight; j++) {
int tmp;
tmp = pred[j] + ref[j];
comp_pred[j] = (tmp + 1) >> 1;
}
comp_pred += weight;
pred += weight;
ref += ref_stride;
}
}
// #endif // CONFIG_COMP_INTER_JOINT_SEARCH
#endif // VP9_ENCODER_VP9_VARIANCE_H_
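comp_avg_pred() computes the usual rounded average (a + b + 1) >> 1, i.e. ties round up, matching how compound predictions are averaged elsewhere. Note that the 'weight' parameter is really the block width in pixels, and 'pred' is read with a stride equal to that width. A tiny worked example (values and buffer names chosen purely for illustration; second_pred, ref_buf and ref_stride stand for whatever the caller already has):

  /* pred[j] = 10, ref[j] = 13  ->  comp_pred[j] = (10 + 13 + 1) >> 1 = 12 */
  uint8_t comp[16 * 16];
  comp_avg_pred(comp, second_pred, 16, 16, ref_buf, ref_stride);  /* 16x16 block */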


@@ -13,6 +13,7 @@
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_subpelvar.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
@@ -58,6 +59,29 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 64, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -92,6 +116,29 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 65, 32, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -126,6 +173,29 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 32, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -160,6 +230,29 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 16, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -317,6 +410,31 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer
uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
// First filter 1d Horizontal
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 5, 4, hfilter);
// Now filter vertically
var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter);
comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
int src_pixels_per_line,
@@ -339,6 +457,29 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 9, 8, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -360,6 +501,30 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[17 * 16];
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 16, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -381,6 +546,29 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 65, 64, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -402,6 +590,29 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 32, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -543,6 +754,29 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 9, 16, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -564,3 +798,25 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}
unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer
const int16_t *hfilter, *vfilter;
hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 8, hfilter);
var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
}
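All the new _avg_ variants follow the same pattern as their existing counterparts, with one extra averaging step before the variance call. Taking vp9_sub_pixel_avg_variance16x16_c above as the example, the data flow is:

  /* src    --(horizontal 2-tap bilinear, 17 rows x 16 cols)--> fdata3
   * fdata3 --(vertical 2-tap bilinear, 16 x 16)-------------->  temp2
   * temp2 + second_pred --(comp_avg_pred)-------------------->  temp3
   * return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
   * The extra row produced by the first pass feeds the second tap of the
   * vertical filter. */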