From 5fc9688792735888731acaf5e2628cd1b3a76d1b Mon Sep 17 00:00:00 2001
From: Sergey Kolomenkin <kolomenkin@gmail.com>
Date: Mon, 7 Dec 2015 12:54:57 -0800
Subject: [PATCH 01/12] fix null pointer crash in Win32 because esp register is
 broken

https://bugs.chromium.org/p/webm/issues/detail?id=1105

Change-Id: I304ea85ea1f6474e26f074dc39dc0748b90d4d3d
---
 vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 4 ++--
 vpx_dsp/x86/subpel_variance_sse2.asm             | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 93df92a9e..5f9c963d4 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -124,7 +124,6 @@ SECTION .text
 
       ; Store bilin_filter and pw_8 location in stack
       GET_GOT eax
-      add esp, 4                ; restore esp
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -132,6 +131,7 @@ SECTION .text
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
+      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
@@ -141,7 +141,6 @@ SECTION .text
 
       ; Store bilin_filter and pw_8 location in stack
       GET_GOT eax
-      add esp, 4                ; restore esp
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -149,6 +148,7 @@ SECTION .text
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
+      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 05dcff75e..1176a2f4c 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -140,7 +140,6 @@ SECTION .text
 
       ;Store bilin_filter and pw_8 location in stack
       GET_GOT eax
-      add esp, 4                ; restore esp
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -148,6 +147,7 @@ SECTION .text
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
+      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
@@ -157,7 +157,6 @@ SECTION .text
 
       ;Store bilin_filter and pw_8 location in stack
       GET_GOT eax
-      add esp, 4                ; restore esp
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -165,6 +164,7 @@ SECTION .text
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
+      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else

From 4e692bbee29d1566e43ed4499249651d4c01f9c4 Mon Sep 17 00:00:00 2001
From: paulwilkins <paulwilkins@google.com>
Date: Tue, 8 Dec 2015 15:48:24 +0000
Subject: [PATCH 02/12] Changes to exhaustive motion search.

This change has been imported from VP9 and
alters the nature and use of exhaustive motion search.

Firstly any exhaustive search is preceded by a normal step search.
The exhaustive search is only carried out if the distortion resulting
from the step search is above a threshold value.

Secondly the simple +/- 64 exhaustive search is replaced by a
multi stage mesh based search where each stage has a range
and step/interval size. Subsequent stages use the best position from
the previous stage as the center of the search but use a reduced range
and interval size.

For example:
  stage 1: Range +/- 64 interval 4
  stage 2: Range +/- 32 interval 2
  stage 3: Range +/- 15 interval 1

This process, especially when it follows on from a normal step
search, has shown itself to be almost as effective as a full range
exhaustive search with step 1 but greatly lowers the computational
complexity such that it can be used in some cases for speeds 0-2.

This patch also removes a double exhaustive search for sub 8x8 blocks
which also contained  a bug (the two searches used different distortion
metrics).

For best quality in my test animation sequence this patch has almost
no impact on quality but improves encode speed by more than 5X.

Restricted use in good quality speeds 0-2 yields significant quality gains
on the animation test of 0.2 - 0.5 db with only a small impact on encode
speed. On most natural video clips, however, where the step search
is performing well, the quality gain and speed impact are small.

Change-Id: Iac24152ae239f42a246f39ee5f00fe62d193cb98
---
 vp10/encoder/block.h          |   2 +
 vp10/encoder/encodeframe.c    |   6 +
 vp10/encoder/encoder.c        |   4 +-
 vp10/encoder/encoder.h        |   2 +
 vp10/encoder/ethread.c        |   5 +
 vp10/encoder/mcomp.c          | 214 ++++++++++++++++++++++++++--------
 vp10/encoder/rdopt.c          |  23 +---
 vp10/encoder/speed_features.c |  51 +++++++-
 vp10/encoder/speed_features.h |  19 +++
 vp9/encoder/vp9_ratectrl.c    |   2 +-
 10 files changed, 253 insertions(+), 75 deletions(-)

diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index cb2a234c9..ab0252baa 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -70,6 +70,8 @@ struct macroblock {
   int rddiv;
   int rdmult;
   int mb_energy;
+  int * m_search_count_ptr;
+  int * ex_search_count_ptr;
 
   // These are set to their default values at the beginning, and then adjusted
   // further in the encoding process.
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index f050905e7..bcdcff3d3 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -2642,6 +2642,10 @@ void vp10_encode_tile(VP10_COMP *cpi, ThreadData *td,
   TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   int mi_row;
 
+  // Set up pointers to per thread motion search counters.
+  td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
+  td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
@@ -2695,6 +2699,8 @@ static void encode_frame_internal(VP10_COMP *cpi) {
   vp10_zero(rdc->coef_counts);
   vp10_zero(rdc->comp_pred_diff);
   vp10_zero(rdc->filter_diff);
+  rdc->m_search_count = 0;   // Count of motion search hits.
+  rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
   for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
 #if CONFIG_MISC_FIXES
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 630f35e78..6bba84845 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -2843,7 +2843,7 @@ static void output_frame_level_debug_stats(VP10_COMP *cpi) {
   recon_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
+    fprintf(f, "%10u %dx%d  %10d %10d %d %d %10d %10d %10d %10d"
        "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
        "%10"PRId64" %10"PRId64" %10d "
        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
@@ -2852,6 +2852,8 @@ static void output_frame_level_debug_stats(VP10_COMP *cpi) {
         "%10lf %8u %10"PRId64" %10d %10d %10d\n",
         cpi->common.current_video_frame,
         cm->width, cm->height,
+        cpi->td.rd_counts.m_search_count,
+        cpi->td.rd_counts.ex_search_count,
         cpi->rc.source_alt_ref_pending,
         cpi->rc.source_alt_ref_active,
         cpi->rc.this_frame_target,
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 8519c27a6..2a44e4744 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -250,6 +250,8 @@ typedef struct RD_COUNTS {
   vp10_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
   int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int m_search_count;
+  int ex_search_count;
 } RD_COUNTS;
 
 typedef struct ThreadData {
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index ea491058a..ad47ccf04 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -30,6 +30,11 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
             for (n = 0; n < ENTROPY_TOKENS; n++)
               td->rd_counts.coef_counts[i][j][k][l][m][n] +=
                   td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+
+
+  // Counts of all motion searches and exhuastive mesh searches.
+  td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
+  td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
 }
 
 static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index d6ab00fdc..04e1dafee 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -1523,69 +1523,83 @@ int vp10_fast_dia_search(const MACROBLOCK *x,
 
 #undef CHECK_BETTER
 
-int vp10_full_range_search_c(const MACROBLOCK *x,
-                            const search_site_config *cfg,
-                            MV *ref_mv, MV *best_mv,
-                            int search_param, int sad_per_bit, int *num00,
-                            const vp9_variance_fn_ptr_t *fn_ptr,
-                            const MV *center_mv) {
+// Exhuastive motion search around a given centre position with a given
+// step size.
+static int exhuastive_mesh_search(const MACROBLOCK *x,
+                                  MV *ref_mv, MV *best_mv,
+                                  int range, int step, int sad_per_bit,
+                                  const vp9_variance_fn_ptr_t *fn_ptr,
+                                  const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int range = 64;
-  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  MV fcenter_mv = {center_mv->row, center_mv->col};
   unsigned int best_sad = INT_MAX;
   int r, c, i;
   int start_col, end_col, start_row, end_row;
+  int col_step = (step > 1) ? step : 4;
 
-  // The cfg and search_param parameters are not used in this search variant
-  (void)cfg;
-  (void)search_param;
+  assert(step >= 1);
 
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  *best_mv = *ref_mv;
-  *num00 = 11;
+  clamp_mv(&fcenter_mv, x->mv_col_min, x->mv_col_max,
+           x->mv_row_min, x->mv_row_max);
+  *best_mv = fcenter_mv;
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, ref_mv), in_what->stride) +
-                 mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  start_row = VPXMAX(-range, x->mv_row_min - ref_mv->row);
-  start_col = VPXMAX(-range, x->mv_col_min - ref_mv->col);
-  end_row = VPXMIN(range, x->mv_row_max - ref_mv->row);
-  end_col = VPXMIN(range, x->mv_col_max - ref_mv->col);
+             get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+             mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+  start_row = VPXMAX(-range, x->mv_row_min - fcenter_mv.row);
+  start_col = VPXMAX(-range, x->mv_col_min - fcenter_mv.col);
+  end_row = VPXMIN(range, x->mv_row_max - fcenter_mv.row);
+  end_col = VPXMIN(range, x->mv_col_max - fcenter_mv.col);
 
-  for (r = start_row; r <= end_row; ++r) {
-    for (c = start_col; c <= end_col; c += 4) {
-      if (c + 3 <= end_col) {
-        unsigned int sads[4];
-        const uint8_t *addrs[4];
-        for (i = 0; i < 4; ++i) {
-          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-          addrs[i] = get_buf_from_mv(in_what, &mv);
-        }
-
-        fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
-
-        for (i = 0; i < 4; ++i) {
-          if (sads[i] < best_sad) {
-            const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-            const unsigned int sad = sads[i] +
-                mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c};
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                           get_buf_from_mv(in_what, &mv), in_what->stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            *best_mv = mv;
           }
         }
       } else {
-        for (i = 0; i < end_col - c; ++i) {
-          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-              get_buf_from_mv(in_what, &mv), in_what->stride);
-          if (sad < best_sad) {
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            addrs[i] = get_buf_from_mv(in_what, &mv);
+          }
+          fn_ptr->sdx4df(what->buf, what->stride, addrs,
+                         in_what->stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+              const unsigned int sad = sads[i] +
+                  mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                get_buf_from_mv(in_what, &mv), in_what->stride);
             if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
+              sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
             }
           }
         }
@@ -2014,6 +2028,70 @@ int vp10_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
   return bestsme;
 }
 
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs an limited range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(VP10_COMP *cpi, MACROBLOCK *x,
+                                 MV *centre_mv_full, int sadpb,  int *cost_list,
+                                 const vp9_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MV temp_mv = {centre_mv_full->row, centre_mv_full->col};
+  MV f_ref_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int bestsme;
+  int i;
+  int interval = sf->mesh_patterns[0].interval;
+  int range = sf->mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  // Keep track of number of exhaustive calls (this frame in this thread).
+  ++(*x->ex_search_count_ptr);
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < MIN_RANGE) || (range > MAX_RANGE) ||
+      (interval < MIN_INTERVAL) || (interval > range))
+    return INT_MAX;
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+  range = VPXMIN(range, MAX_RANGE);
+  interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range,
+                                  interval, sadpb, fn_ptr, &temp_mv);
+
+  if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv,
+                                       sf->mesh_patterns[i].range,
+                                       sf->mesh_patterns[i].interval,
+                                       sadpb, fn_ptr, &temp_mv);
+
+      if (sf->mesh_patterns[i].interval == 1)
+        break;
+    }
+  }
+
+  if (bestsme < INT_MAX)
+    bestsme = vp10_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = temp_mv;
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
+  return bestsme;
+}
+
 int vp10_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
@@ -2327,6 +2405,18 @@ int vp10_refining_search_8p_c(const MACROBLOCK *x,
   return best_sad;
 }
 
+#define MIN_EX_SEARCH_LIMIT 128
+static int is_exhaustive_allowed(VP10_COMP *cpi, MACROBLOCK *x) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
+      (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+  return sf->allow_exhaustive_searches &&
+      (sf->exhaustive_searches_thresh < INT_MAX) &&
+      (*x->ex_search_count_ptr <= max_ex) &&
+      !cpi->rc.is_src_frame_alt_ref;
+}
+
 int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
@@ -2345,6 +2435,9 @@ int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
     cost_list[4] = INT_MAX;
   }
 
+  // Keep track of number of searches (this frame in this thread).
+  ++(*x->m_search_count_ptr);
+
   switch (method) {
     case FAST_DIAMOND:
       var = vp10_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
@@ -2370,6 +2463,29 @@ int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
       var = vp10_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                    MAX_MVSEARCH_STEPS - 1 - step_param,
                                    1, cost_list, fn_ptr, ref_mv, tmp_mv);
+
+      // Should we allow a follow on exhaustive search?
+      if (is_exhaustive_allowed(cpi, x)) {
+        int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
+        exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] +
+                                b_height_log2_lookup[bsize]);
+
+        // Threshold variance for an exhaustive full search.
+        if (var > exhuastive_thr) {
+            int var_ex;
+          MV tmp_mv_ex;
+          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
+                                         error_per_bit, cost_list, fn_ptr,
+                                         ref_mv, &tmp_mv_ex);
+
+          if (var_ex < var) {
+            var = var_ex;
+            *tmp_mv = tmp_mv_ex;
+          }
+        }
+      }
+      break;
+
       break;
     default:
       assert(0 && "Invalid search method.");
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 974700e19..bba21716f 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1783,7 +1783,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
           int step_param = 0;
-          int thissme, bestsme = INT_MAX;
+          int bestsme = INT_MAX;
           int sadpb = x->sadperbit4;
           MV mvp_full;
           int max_mv;
@@ -1838,27 +1838,6 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
               &bsi->ref_mv[0]->as_mv, new_mv,
               INT_MAX, 1);
 
-          // Should we do a full search (best quality only)
-          if (cpi->oxcf.mode == BEST) {
-            int_mv *const best_mv = &mi->bmi[i].as_mv[0];
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-            thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, &cpi->fn_ptr[bsize],
-                                           &bsi->ref_mv[0]->as_mv,
-                                           &best_mv->as_mv);
-            cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              *new_mv = best_mv->as_mv;
-            } else {
-              // The full search result is actually worse so re-instate the
-              // previous best vector
-              best_mv->as_mv = *new_mv;
-            }
-          }
-
           if (bestsme < INT_MAX) {
             int distortion;
             cpi->find_fractional_mv_step(
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index d40383f0e..ce0aebeab 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -16,6 +16,23 @@
 
 #include "vpx_dsp/vpx_dsp_common.h"
 
+// Mesh search patters for various speed settings
+static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] =
+    {{64, 4}, {28, 2}, {15, 1}, {7, 1}};
+
+#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1]
+                                              [MAX_MESH_STEP] =
+    {{{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8},  {14, 2}, {7, 1},  {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+    };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] =
+    {50, 25, 15, 5, 1, 1};
+
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
 static int frame_is_boosted(const VP10_COMP *cpi) {
@@ -251,6 +268,8 @@ static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf,
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
+  sf->allow_exhaustive_searches = 0;
+  sf->exhaustive_searches_thresh = INT_MAX;
 
   if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
@@ -498,8 +517,36 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) {
     set_good_speed_feature(cpi, cm, sf, oxcf->speed);
 
   cpi->full_search_sad = vp10_full_search_sad;
-  cpi->diamond_search_sad = oxcf->mode == BEST ? vp10_full_range_search
-                                               : vp10_diamond_search_sad;
+  cpi->diamond_search_sad = vp10_diamond_search_sad;
+
+  sf->allow_exhaustive_searches = 1;
+  if (oxcf->mode == BEST) {
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 20);
+    else
+      sf->exhaustive_searches_thresh = (1 << 21);
+    sf->max_exaustive_pct = 100;
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
+      sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+    }
+  } else {
+    int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 22);
+    else
+      sf->exhaustive_searches_thresh = (1 << 23);
+    sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+    if (speed > 0)
+      sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range =
+          good_quality_mesh_patterns[speed][i].range;
+      sf->mesh_patterns[i].interval =
+          good_quality_mesh_patterns[speed][i].interval;
+    }
+  }
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index 3969a2ff8..3b9199929 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -195,6 +195,13 @@ typedef struct MV_SPEED_FEATURES {
   int fullpel_search_step_param;
 } MV_SPEED_FEATURES;
 
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+  int range;
+  int interval;
+} MESH_PATTERN;
+
 typedef struct SPEED_FEATURES {
   MV_SPEED_FEATURES mv;
 
@@ -290,6 +297,18 @@ typedef struct SPEED_FEATURES {
   // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
 
+  // Flag for allowing some use of exhaustive searches;
+  int allow_exhaustive_searches;
+
+  // Threshold for allowing exhaistive motion search.
+  int exhaustive_searches_thresh;
+
+  // Maximum number of exhaustive searches for a frame.
+  int max_exaustive_pct;
+
+  // Pattern to be used for any exhaustive mesh searches.
+  MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
   int schedule_mode_search;
 
   // Allows sub 8x8 modes to use the prediction filter that was determined
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 45445df8d..8ab51cd20 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1075,7 +1075,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
 
         // Modify best quality for second level arfs. For mode VPX_Q this
         // becomes the baseline frame q.

From ece4fd5d2247c9512b31a93dd593de567beaf928 Mon Sep 17 00:00:00 2001
From: Jingning Han <jingning@google.com>
Date: Wed, 9 Dec 2015 10:10:17 -0800
Subject: [PATCH 03/12] Backport temporal filter approach to VP9

This commit enables the new temporal filter system for VP9. For
speed 1, it improves the compression performance:
derf  0.54%
stdhd 1.62%

Change-Id: I041760044def943e464345223790d4efad70b91e
---
 vp9/encoder/vp9_temporal_filter.c | 46 +++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 16f9c8573..7e4c61fbc 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -135,15 +135,38 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
+
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
       modifier  += rounding;
       modifier >>= strength;
 
@@ -418,16 +441,17 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
           }
 #else
           // Apply the filter (YUV)
-          vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+          // TODO(jingning): Need SIMD optimization for this.
+          vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
                                     predictor, 16, 16,
                                     strength, filter_weight,
                                     accumulator, count);
-          vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+          vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
                                     predictor + 256,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 256,
                                     count + 256);
-          vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+          vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
                                     predictor + 512,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 512,

From acf580d2bb38fbe61aee283133fa101f8a33e4df Mon Sep 17 00:00:00 2001
From: Tom Finegan <tomfinegan@google.com>
Date: Wed, 9 Dec 2015 13:52:06 -0800
Subject: [PATCH 04/12] iosbuild.sh: Support macosx targets in Xcode 7.

Xcode 7 refuses to link to x86 and x86_64 code that's built for
iphone sim, so add an extra command line flag that forces iosbuild
to use darwin15 targets.

Change-Id: I2228d458f5cccf4d26866040380a974f88d9d360
---
 build/make/iosbuild.sh | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 6f7180d08..ae5ba182d 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -29,11 +29,14 @@ SCRIPT_DIR=$(dirname "$0")
 LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 ORIG_PWD="$(pwd)"
-TARGETS="arm64-darwin-gcc
-         armv7-darwin-gcc
-         armv7s-darwin-gcc
-         x86-iphonesimulator-gcc
-         x86_64-iphonesimulator-gcc"
+ARM_TARGETS="arm64-darwin-gcc
+             armv7-darwin-gcc
+             armv7s-darwin-gcc"
+SIM_TARGETS="x86-iphonesimulator-gcc
+             x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86-darwin15-gcc
+             x86_64-darwin15-gcc"
+TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
 
 # Configures for the target specified by $1, and invokes make with the dist
 # target using $DIST_DIR as the distribution output directory.
@@ -197,15 +200,27 @@ cleanup() {
   fi
 }
 
+print_list() {
+  local indent="$1"
+  shift
+  local list="$@"
+  for entry in ${list}; do
+    echo "${indent}${entry}"
+  done
+}
+
 iosbuild_usage() {
 cat << EOF
   Usage: ${0##*/} [arguments]
     --help: Display this message and exit.
     --extra-configure-args <args>: Extra args to pass when configuring libvpx.
+    --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+              and x86_64. Allows linking to framework when builds target MacOSX
+              instead of iOS.
     --preserve-build-output: Do not delete the build directory.
     --show-build-output: Show output from each library build.
     --targets <targets>: Override default target list. Defaults:
-         ${TARGETS}
+$(print_list "        " ${TARGETS})
     --test-link: Confirms all targets can be linked. Functionally identical to
                  passing --enable-examples via --extra-configure-args.
     --verbose: Output information about the environment and each stage of the
@@ -249,6 +264,9 @@ while [ -n "$1" ]; do
       TARGETS="$2"
       shift
       ;;
+    --macosx)
+      TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
+      ;;
     --verbose)
       VERBOSE=yes
       ;;
@@ -273,10 +291,12 @@ cat << EOF
   MAKEFLAGS=${MAKEFLAGS}
   ORIG_PWD=${ORIG_PWD}
   PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
-  TARGETS="${TARGETS}"
+  TARGETS="$(print_list "" ${TARGETS})"
+  OSX_TARGETS="${OSX_TARGETS}"
+  SIM_TARGETS="${SIM_TARGETS}"
 EOF
 fi
 
 build_framework "${TARGETS}"
 echo "Successfully built '${FRAMEWORK_DIR}' for:"
-echo "         ${TARGETS}"
+print_list "" ${TARGETS}

From d3c972403a6c30d360a26283d8a9a847d04afd09 Mon Sep 17 00:00:00 2001
From: Jingning Han <jingning@google.com>
Date: Wed, 9 Dec 2015 13:43:27 -0800
Subject: [PATCH 05/12] Sync high bit-depth temporal filter

Change-Id: Ifdcfb91416be8189569f703bee9be253d7b3d9b6
---
 vp9/encoder/vp9_temporal_filter.c | 117 ++++++++++++++++++------------
 1 file changed, 69 insertions(+), 48 deletions(-)

diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 7e4c61fbc..015dbc0ca 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -205,15 +205,34 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier *= modifier;
       modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
       modifier += rounding;
       modifier >>= strength;
 
@@ -406,56 +425,58 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
           if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
             int adj_strength = strength + 2 * (mbd->bd - 8);
             // Apply the filter (YUV)
-            vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
-                                             f->y_stride,
-                                             predictor, 16, 16, adj_strength,
-                                             filter_weight,
-                                             accumulator, count);
-            vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 256,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength,
-                                             filter_weight, accumulator + 256,
-                                             count + 256);
-            vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 512,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength, filter_weight,
-                                             accumulator + 512, count + 512);
+            vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset,
+                                               f->y_stride,
+                                               predictor, 16, 16, adj_strength,
+                                               filter_weight,
+                                               accumulator, count);
+            vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 256,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength,
+                                               filter_weight, accumulator + 256,
+                                               count + 256);
+            vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 512,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength, filter_weight,
+                                               accumulator + 512, count + 512);
           } else {
             // Apply the filter (YUV)
-            vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
-                                      predictor, 16, 16,
-                                      strength, filter_weight,
-                                      accumulator, count);
-            vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 256,
-                                      mb_uv_width, mb_uv_height, strength,
-                                      filter_weight, accumulator + 256,
-                                      count + 256);
-            vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 512,
-                                      mb_uv_width, mb_uv_height, strength,
-                                      filter_weight, accumulator + 512,
-                                      count + 512);
+            vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                        predictor, 16, 16,
+                                        strength, filter_weight,
+                                        accumulator, count);
+            vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 256,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 256,
+                                        count + 256);
+            vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 512,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 512,
+                                        count + 512);
           }
 #else
           // Apply the filter (YUV)
           // TODO(jingning): Need SIMD optimization for this.
           vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
-                                    predictor, 16, 16,
-                                    strength, filter_weight,
-                                    accumulator, count);
+                                      predictor, 16, 16,
+                                      strength, filter_weight,
+                                      accumulator, count);
           vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 256,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 256,
-                                    count + 256);
+                                      predictor + 256,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 256,
+                                      count + 256);
           vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 512,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 512,
-                                    count + 512);
+                                      predictor + 512,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 512,
+                                      count + 512);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }

From 221ed5e47bf2861dab989f3aa0387db7088477a9 Mon Sep 17 00:00:00 2001
From: Yaowu Xu <yaowu@google.com>
Date: Wed, 9 Dec 2015 18:22:44 -0800
Subject: [PATCH 06/12] Fix enc/dec mismatches for aq-mode 1 and 2

Change-Id: Iefad9d8d96a08dcc788a5efdca2df6a815d1205f
---
 vp10/decoder/decodeframe.c | 16 ++++++----------
 vp10/encoder/encodeframe.c | 20 ++++++++------------
 2 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 80f1778de..70d012b27 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -1152,16 +1152,12 @@ static void setup_quantization(VP10_COMMON *const cm, MACROBLOCKD *const xd,
   cm->uv_dc_delta_q = read_delta_q(rb);
   cm->uv_ac_delta_q = read_delta_q(rb);
   cm->dequant_bit_depth = cm->bit_depth;
-  for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
-    const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
-    xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
-                      qindex == 0 &&
-#else
-                      cm->base_qindex == 0 &&
-#endif
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+                       vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+                       cm->base_qindex;
+    xd->lossless[i] = qindex == 0 &&
+                      cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 &&
                       cm->uv_ac_delta_q == 0;
   }
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index f050905e7..f1814a929 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -1155,7 +1155,7 @@ static void rd_pick_sb_modes(VP10_COMP *cpi,
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     x->source_variance =
         vp10_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
-                                           bsize, xd->bd);
+                                            bsize, xd->bd);
   } else {
     x->source_variance =
       vp10_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
@@ -2579,7 +2579,7 @@ static MV_REFERENCE_FRAME get_frame_type(const VP10_COMP *cpi) {
 }
 
 static TX_MODE select_tx_mode(const VP10_COMP *cpi, MACROBLOCKD *const xd) {
-  if (!cpi->common.seg.enabled && xd->lossless[0])
+  if (xd->lossless[0])
     return ONLY_4X4;
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
     return ALLOW_32X32;
@@ -2696,16 +2696,12 @@ static void encode_frame_internal(VP10_COMP *cpi) {
   vp10_zero(rdc->comp_pred_diff);
   vp10_zero(rdc->filter_diff);
 
-  for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
-    const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
-    xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
-                      qindex == 0 &&
-#else
-                      cm->base_qindex == 0 &&
-#endif
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+                       vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+                       cm->base_qindex;
+    xd->lossless[i] = qindex == 0 &&
+                      cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 &&
                       cm->uv_ac_delta_q == 0;
   }

From 4ec5953080bc82f8772fe7c610f28b937287d824 Mon Sep 17 00:00:00 2001
From: Jian Zhou <zhoujian@google.com>
Date: Thu, 10 Dec 2015 11:59:58 -0800
Subject: [PATCH 07/12] Code clean of dc_left/top_predictor_16x16

Remove some redundant code.

Change-Id: Ida2e8c0ce28770f7a9545ca014fe792b04295260
---
 vpx_dsp/x86/intrapred_sse2.asm | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index edbf05e33..3a9fc6c0d 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -239,14 +239,11 @@ cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
@@ -271,14 +268,11 @@ cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [leftq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]

From 6786280807a6e7711f3e47f9d0588f1ddd4d6f6c Mon Sep 17 00:00:00 2001
From: Yaowu Xu <yaowu@google.com>
Date: Thu, 10 Dec 2015 12:41:27 -0800
Subject: [PATCH 08/12] Fix two msvc build issues

Change-Id: Iece22223773dd6d0f87f8f59827705acd2ebe2a4
---
 vp10/common/reconintra.c | 2 +-
 vp10/encoder/bitstream.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index b852a655f..e9e3949ad 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -262,7 +262,7 @@ static void vp10_init_intra_predictors_internal(void) {
 }
 
 #if CONFIG_MISC_FIXES
-static inline void memset16(uint16_t *dst, int val, int n) {
+static INLINE void memset16(uint16_t *dst, int val, int n) {
   while (n--)
     *dst++ = val;
 }
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 32275d4ae..3765fa879 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1473,7 +1473,7 @@ void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) {
     assert(n_log2_tiles > 0);
     vpx_wb_write_literal(&saved_wb, mag, 2);
     if (mag < 3)
-      data_sz = remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
+      data_sz = (int)remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
   } else {
     assert(n_log2_tiles == 0);
   }

From 322ea7ff5bce741e0657179f63ee1c1bb54da864 Mon Sep 17 00:00:00 2001
From: Yunqing Wang <yunqingwang@google.com>
Date: Thu, 10 Dec 2015 14:25:01 -0800
Subject: [PATCH 09/12] Fix the win32 crash when GET_GOT is not defined

This patch continues to fix the win32 crash issue:
https://bugs.chromium.org/p/webm/issues/detail?id=1105

Johann's patch is here:
https://chromium-review.googlesource.com/#/c/316446/2

Change-Id: I7fe191c717e40df8602e229371321efb0d689375
---
 third_party/x86inc/x86inc.asm                    |  4 ++++
 vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 12 ++++++++----
 vpx_dsp/x86/subpel_variance_sse2.asm             | 12 ++++++++----
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index be59de311..e4cda88ef 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -119,6 +119,7 @@
 %if ABI_IS_32BIT
     %if CONFIG_PIC=1
         %ifidn __OUTPUT_FORMAT__,elf32
+            %define GET_GOT_DEFINED 1
             %define GET_GOT_SAVE_ARG 1
             %define WRT_PLT wrt ..plt
             %macro GET_GOT 1
@@ -138,6 +139,7 @@
                 %define RESTORE_GOT pop %1
             %endmacro
         %elifidn __OUTPUT_FORMAT__,macho32
+            %define GET_GOT_DEFINED 1
             %define GET_GOT_SAVE_ARG 1
             %macro GET_GOT 1
                 push %1
@@ -149,6 +151,8 @@
                 %undef RESTORE_GOT
                 %define RESTORE_GOT pop %1
             %endmacro
+        %else
+            %define GET_GOT_DEFINED 0
         %endif
     %endif
 
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 5f9c963d4..22d52a2af 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -123,7 +123,10 @@ SECTION .text
       %define sec_str sec_stridemp
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -131,7 +134,6 @@ SECTION .text
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
@@ -140,7 +142,10 @@ SECTION .text
       %define block_height heightd
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -148,7 +153,6 @@ SECTION .text
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 1176a2f4c..c655e4b34 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -139,7 +139,10 @@ SECTION .text
       %define sec_str sec_stridemp
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -147,7 +150,6 @@ SECTION .text
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
@@ -156,7 +158,10 @@ SECTION .text
       %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -164,7 +169,6 @@ SECTION .text
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else

From cd08120d6202fe1bee2d715962e5942054558313 Mon Sep 17 00:00:00 2001
From: Yunqing Wang <yunqingwang@google.com>
Date: Thu, 10 Dec 2015 15:28:07 -0800
Subject: [PATCH 10/12] Minor cleanup

Removed unused GET_GOT_SAVE_ARG.

Change-Id: I0ae41c2d0dcd6d7d1c8dda05062fcdb737fd917d
---
 third_party/x86inc/x86inc.asm | 2 --
 vpx_ports/x86_abi_support.asm | 2 --
 2 files changed, 4 deletions(-)

diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index e4cda88ef..e7d3fa5eb 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -120,7 +120,6 @@
     %if CONFIG_PIC=1
         %ifidn __OUTPUT_FORMAT__,elf32
             %define GET_GOT_DEFINED 1
-            %define GET_GOT_SAVE_ARG 1
             %define WRT_PLT wrt ..plt
             %macro GET_GOT 1
                 extern _GLOBAL_OFFSET_TABLE_
@@ -140,7 +139,6 @@
             %endmacro
         %elifidn __OUTPUT_FORMAT__,macho32
             %define GET_GOT_DEFINED 1
-            %define GET_GOT_SAVE_ARG 1
             %macro GET_GOT 1
                 push %1
                 call %%get_got
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index c94b76a06..708fa101c 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -189,7 +189,6 @@
 %if ABI_IS_32BIT
   %if CONFIG_PIC=1
   %ifidn __OUTPUT_FORMAT__,elf32
-    %define GET_GOT_SAVE_ARG 1
     %define WRT_PLT wrt ..plt
     %macro GET_GOT 1
       extern _GLOBAL_OFFSET_TABLE_
@@ -208,7 +207,6 @@
       %define RESTORE_GOT pop %1
     %endmacro
   %elifidn __OUTPUT_FORMAT__,macho32
-    %define GET_GOT_SAVE_ARG 1
     %macro GET_GOT 1
       push %1
       call %%get_got

From 40ee78bc19d5c16f7f808cfac8ae2e0cd49039cf Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Wed, 9 Dec 2015 23:53:11 -0800
Subject: [PATCH 11/12] dc_left_pred[48]: fix pic builds

GET_GOT modifies the stack pointer so the offset for left's address will
be wrong if loaded afterword.

Change-Id: Iff9433aec45f5f6fe1a59ed8080c589bad429536
---
 vpx_dsp/x86/intrapred_sse2.asm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index edbf05e33..0acb023d8 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -47,9 +47,9 @@ cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
 
 INIT_XMM sse2
 cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
-  movifnidn          leftq, leftmp
   pxor                  m1, m1
   movd                  m0, [leftq]
   psadbw                m0, m1
@@ -143,9 +143,9 @@ cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
 
 INIT_XMM sse2
 cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
-  movifnidn          leftq, leftmp
   pxor                  m1, m1
   movq                  m0, [leftq]
   DEFINE_ARGS dst, stride, stride3

From 4d2cfeab361e013d225b7c6ab30d0420b2652e28 Mon Sep 17 00:00:00 2001
From: Yaowu Xu <yaowu@google.com>
Date: Thu, 10 Dec 2015 16:29:01 -0800
Subject: [PATCH 12/12] Proper fix of a msvc complier warning

Change-Id: I701ab4993be7cfb15b61a1adbbaf5565bd14ae27
---
 vp10/encoder/bitstream.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 3765fa879..5f6d9d342 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1473,7 +1473,7 @@ void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) {
     assert(n_log2_tiles > 0);
     vpx_wb_write_literal(&saved_wb, mag, 2);
     if (mag < 3)
-      data_sz = (int)remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
+      data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
   } else {
     assert(n_log2_tiles == 0);
   }