Compare commits

..

1 Commits

Author SHA1 Message Date
wangch
eea111f16a Test gerrit. 2017-12-05 18:07:21 -05:00
150 changed files with 3854 additions and 5687 deletions

View File

@@ -1,12 +1,12 @@
--- ---
Language: Cpp Language: Cpp
# BasedOnStyle: Google # BasedOnStyle: Google
# Generated with clang-format 5.0.0 # Generated with clang-format 4.0.1
AccessModifierOffset: -1 AccessModifierOffset: -1
AlignAfterOpenBracket: Align AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left AlignEscapedNewlinesLeft: true
AlignOperands: true AlignOperands: true
AlignTrailingComments: true AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true
@@ -33,20 +33,14 @@ BraceWrapping:
BeforeCatch: false BeforeCatch: false
BeforeElse: false BeforeElse: false
IndentBraces: false IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true BreakStringLiterals: true
ColumnLimit: 80 ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:' CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4 ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4 ContinuationIndentWidth: 4
@@ -54,11 +48,7 @@ Cpp11BracedListStyle: false
DerivePointerAlignment: false DerivePointerAlignment: false
DisableFormat: false DisableFormat: false
ExperimentalAutoDetectBinPacking: false ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeCategories: IncludeCategories:
- Regex: '^<.*\.h>' - Regex: '^<.*\.h>'
Priority: 1 Priority: 1
@@ -80,7 +70,6 @@ NamespaceIndentation: None
ObjCBlockIndentWidth: 2 ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false ObjCSpaceBeforeProtocolList: false
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300 PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120 PenaltyBreakFirstLessLess: 120
@@ -90,7 +79,6 @@ PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right PointerAlignment: Right
ReflowComments: true ReflowComments: true
SortIncludes: false SortIncludes: false
SortUsingDeclarations: true
SpaceAfterCStyleCast: false SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true SpaceBeforeAssignmentOperators: true

View File

@@ -3,7 +3,6 @@ Aex Converse <aconverse@google.com>
Aex Converse <aconverse@google.com> <alex.converse@gmail.com> Aex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com> Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org> Alpha Lam <hclam@google.com> <hclam@chromium.org>
Chris Cunningham <chcunningham@chromium.org>
Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com> Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
Deb Mukherjee <debargha@google.com> Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com> Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
@@ -22,21 +21,18 @@ Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org> Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com> Pascal Massimino <pascal.massimino@gmail.com>
Paul Wilkins <paulwilkins@google.com> Paul Wilkins <paulwilkins@google.com>
Peter Boström <pbos@chromium.org> <pbos@google.com>
Peter de Rivaz <peter.derivaz@gmail.com> Peter de Rivaz <peter.derivaz@gmail.com>
Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com> Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com> Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com> Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com> Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
Sami Pietilä <samipietila@google.com> Sami Pietilä <samipietila@google.com>
Shiyou Yin <yinshiyou-hf@loongson.cn>
Tamar Levy <tamar.levy@intel.com> Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com> Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com> Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com> Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com> Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org> Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Urvang Joshi <urvang@google.com> <urvang@chromium.org>
Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com> Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <Yaowu Xu> Yaowu Xu <yaowu@google.com> <Yaowu Xu>

16
AUTHORS
View File

@@ -3,13 +3,13 @@
Aaron Watry <awatry@gmail.com> Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com> Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com> Adrian Grange <agrange@google.com>
Aex Converse <aconverse@google.com> Aex Converse <aconverse@google.com>
Ahmad Sharif <asharif@google.com> Ahmad Sharif <asharif@google.com>
Aleksey Vasenev <margtu-fivt@ya.ru> Aleksey Vasenev <margtu-fivt@ya.ru>
Alexander Potapenko <glider@google.com> Alexander Potapenko <glider@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru> Alexander Voronov <avoronov@graphics.cs.msu.ru>
Alexandra Hájková <alexandra.khirnova@gmail.com>
Alexis Ballier <aballier@gentoo.org> Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com> Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com> Alpha Lam <hclam@google.com>
@@ -17,7 +17,6 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
Ami Fischman <fischman@chromium.org> Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com> Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com> Andres Mejia <mcitadel@gmail.com>
Andrew Lewis <andrewlewis@google.com>
Andrew Russell <anrussell@google.com> Andrew Russell <anrussell@google.com>
Angie Chiang <angiebird@google.com> Angie Chiang <angiebird@google.com>
Aron Rosenberg <arosenberg@logitech.com> Aron Rosenberg <arosenberg@logitech.com>
@@ -25,9 +24,7 @@ Attila Nagy <attilanagy@google.com>
Brion Vibber <bvibber@wikimedia.org> Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com> changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com> Charles 'Buck' Krasic <ckrasic@google.com>
Cheng Chen <chengchen@google.com>
chm <chm@rock-chips.com> chm <chm@rock-chips.com>
Chris Cunningham <chcunningham@chromium.org>
Christian Duvivier <cduvivier@google.com> Christian Duvivier <cduvivier@google.com>
Daniele Castagna <dcastagna@chromium.org> Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com> Daniel Kang <ddkang@google.com>
@@ -49,12 +46,10 @@ Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com> Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org> Giuseppe Scrivano <gscrivano@gnu.org>
Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com> Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Gregor Jasny <gjasny@gmail.com>
Guillaume Martres <gmartres@google.com> Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com> Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com> Hangyu Kuang <hkuang@google.com>
Hanno Böck <hanno@hboeck.de> Hanno Böck <hanno@hboeck.de>
Han Shen <shenhan@google.com>
Henrik Lundin <hlundin@google.com> Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com> Hui Su <huisu@google.com>
Ivan Krasin <krasin@chromium.org> Ivan Krasin <krasin@chromium.org>
@@ -88,7 +83,6 @@ Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com> Justin Lebar <justin.lebar@gmail.com>
Kaustubh Raste <kaustubh.raste@imgtec.com> Kaustubh Raste <kaustubh.raste@imgtec.com>
KO Myung-Hun <komh@chollian.net> KO Myung-Hun <komh@chollian.net>
Kyle Siefring <kylesiefring@gmail.com>
Lawrence Velázquez <larryv@macports.org> Lawrence Velázquez <larryv@macports.org>
Linfeng Zhang <linfengz@google.com> Linfeng Zhang <linfengz@google.com>
Lou Quillio <louquillio@google.com> Lou Quillio <louquillio@google.com>
@@ -107,7 +101,6 @@ Mikhal Shemer <mikhal@google.com>
Min Chen <chenm003@gmail.com> Min Chen <chenm003@gmail.com>
Minghai Shang <minghai@google.com> Minghai Shang <minghai@google.com>
Min Ye <yeemmi@google.com> Min Ye <yeemmi@google.com>
Moriyoshi Koizumi <mozo@mozo.jp>
Morton Jonuschat <yabawock@gmail.com> Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@mozilla.com> Nathan E. Egge <negge@mozilla.com>
Nico Weber <thakis@chromium.org> Nico Weber <thakis@chromium.org>
@@ -118,15 +111,12 @@ Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk> Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com> Paweł Hajdan <phajdan@google.com>
Pengchong Jin <pengchong@google.com> Pengchong Jin <pengchong@google.com>
Peter Boström <pbos@chromium.org> Peter Boström <pbos@google.com>
Peter Collingbourne <pcc@chromium.org>
Peter de Rivaz <peter.derivaz@gmail.com> Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com> Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org> Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com> Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org> Rafaël Carré <funman@videolan.org>
Rafael de Lucena Valle <rafaeldelucena@gmail.com>
Rahul Chaudhry <rahulchaudhry@google.com>
Ralph Giles <giles@xiph.org> Ralph Giles <giles@xiph.org>
Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com> Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
Rob Bradford <rob@linux.intel.com> Rob Bradford <rob@linux.intel.com>
@@ -145,7 +135,6 @@ Shiyou Yin <yinshiyou-hf@loongson.cn>
Shunyao Li <shunyaoli@google.com> Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com> Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com> Suman Sunkara <sunkaras@google.com>
Sylvestre Ledru <sylvestre@mozilla.com>
Taekhyun Kim <takim@nvidia.com> Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com> Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com> Tamar Levy <tamar.levy@intel.com>
@@ -158,7 +147,6 @@ Tom Finegan <tomfinegan@google.com>
Tristan Matthews <le.businessman@gmail.com> Tristan Matthews <le.businessman@gmail.com>
Urvang Joshi <urvang@google.com> Urvang Joshi <urvang@google.com>
Vignesh Venkatasubramanian <vigneshv@google.com> Vignesh Venkatasubramanian <vigneshv@google.com>
Vlad Tsyrklevich <vtsyrklevich@chromium.org>
Yaowu Xu <yaowu@google.com> Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com> Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com> Yongzhe Wang <yongzhe@google.com>

View File

@@ -1,28 +1,3 @@
2017-01-04 v1.7.0 "Mandarin Duck"
This release focused on high bit depth performance (10/12 bit) and vp9
encoding improvements.
- Upgrading:
This release is ABI incompatible due to new vp9 encoder features.
Frame parallel decoding for vp9 has been removed.
- Enhancements:
vp9 encoding supports additional threads with --row-mt. This can be greater
than the number of tiles.
Two new vp9 encoder options have been added:
--corpus-complexity
--tune-content=film
Additional tooling for respecting the vp9 "level" profiles has been added.
- Bug fixes:
A variety of fuzzing issues.
vp8 threading fix for ARM.
Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
Reject invalid multi resolution configurations.
2017-01-09 v1.6.1 "Long Tailed Duck" 2017-01-09 v1.6.1 "Long Tailed Duck"
This release improves upon the VP9 encoder and speeds up the encoding and This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes. decoding processes.

4
README
View File

@@ -1,4 +1,4 @@
README - 24 January 2018 README - 26 January 2017
Welcome to the WebM VP8/VP9 Codec SDK! Welcome to the WebM VP8/VP9 Codec SDK!
@@ -63,8 +63,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv8-linux-gcc armv8-linux-gcc
mips32-linux-gcc mips32-linux-gcc
mips64-linux-gcc mips64-linux-gcc
ppc64-linux-gcc
ppc64le-linux-gcc
sparc-solaris-gcc sparc-solaris-gcc
x86-android-gcc x86-android-gcc
x86-darwin8-gcc x86-darwin8-gcc

View File

@@ -1,13 +1,4 @@
#!/usr/bin/env perl #!/usr/bin/env perl
##
## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
no strict 'refs'; no strict 'refs';
use warnings; use warnings;
@@ -209,7 +200,6 @@ sub filter {
sub common_top() { sub common_top() {
my $include_guard = uc($opts{sym})."_H_"; my $include_guard = uc($opts{sym})."_H_";
print <<EOF; print <<EOF;
// This file is generated. Do not edit.
#ifndef ${include_guard} #ifndef ${include_guard}
#define ${include_guard} #define ${include_guard}

View File

@@ -60,7 +60,6 @@ if [ ${bare} ]; then
echo "${changelog_version}${git_version_id}" > $$.tmp echo "${changelog_version}${git_version_id}" > $$.tmp
else else
cat<<EOF>$$.tmp cat<<EOF>$$.tmp
// This file is generated. Do not edit.
#define VERSION_MAJOR $major_version #define VERSION_MAJOR $major_version
#define VERSION_MINOR $minor_version #define VERSION_MINOR $minor_version
#define VERSION_PATCH $patch_version #define VERSION_PATCH $patch_version

2
configure vendored
View File

@@ -665,7 +665,7 @@ process_toolchain() {
gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror" enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
all_targets="${all_targets} solution" all_targets="${all_targets} solution"
INLINE="__inline" INLINE="__forceinline"
;; ;;
esac esac

View File

@@ -429,8 +429,7 @@ static void set_rate_control_stats(struct RateControlStats *rc,
rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl]; rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl];
if (tl > 0) { if (tl > 0) {
rc->layer_pfb[layer] = rc->layer_pfb[layer] =
1000.0 * 1000.0 * (cfg->layer_target_bitrate[layer] -
(cfg->layer_target_bitrate[layer] -
cfg->layer_target_bitrate[layer - 1]) / cfg->layer_target_bitrate[layer - 1]) /
(rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
} else { } else {
@@ -574,8 +573,8 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
} else { } else {
if (is_key_frame) { if (is_key_frame) {
ref_frame_config->frame_flags[sl] = ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
} else { } else {
ref_frame_config->frame_flags[sl] = ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -589,24 +588,14 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
} else { } else {
ref_frame_config->frame_flags[sl] = ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
if (sl == num_spatial_layers - 1)
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
} }
} }
if (tl == 0) { if (tl == 0) {
ref_frame_config->lst_fb_idx[sl] = sl; ref_frame_config->lst_fb_idx[sl] = sl;
if (sl) { if (sl)
if (is_key_frame) {
ref_frame_config->lst_fb_idx[sl] = sl - 1;
ref_frame_config->gld_fb_idx[sl] = sl;
} else {
ref_frame_config->gld_fb_idx[sl] = sl - 1; ref_frame_config->gld_fb_idx[sl] = sl - 1;
} else
} else {
ref_frame_config->gld_fb_idx[sl] = 0; ref_frame_config->gld_fb_idx[sl] = 0;
}
ref_frame_config->alt_fb_idx[sl] = 0; ref_frame_config->alt_fb_idx[sl] = 0;
} else if (tl == 1) { } else if (tl == 1) {
ref_frame_config->lst_fb_idx[sl] = sl; ref_frame_config->lst_fb_idx[sl] = sl;
@@ -749,8 +738,6 @@ int main(int argc, const char **argv) {
// the encode for the whole superframe. The encoder will internally loop // the encode for the whole superframe. The encoder will internally loop
// over all the spatial layers for the current superframe. // over all the spatial layers for the current superframe.
vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
// TODO(jianj): Fix the parameter passing for "is_key_frame" in
// set_frame_flags_bypass_model() for case of periodic key frames.
set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id, set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id,
svc_ctx.spatial_layers, frame_cnt == 0, svc_ctx.spatial_layers, frame_cnt == 0,
&ref_frame_config); &ref_frame_config);

View File

@@ -26,29 +26,19 @@
#include "../tools_common.h" #include "../tools_common.h"
#include "../video_writer.h" #include "../video_writer.h"
#define ROI_MAP 0 #define VP8_ROI_MAP 0
#define zero(Dest) memset(&Dest, 0, sizeof(Dest));
static const char *exec_name; static const char *exec_name;
void usage_exit(void) { exit(EXIT_FAILURE); } void usage_exit(void) { exit(EXIT_FAILURE); }
// Denoiser states for vp8, for temporal denoising. // Denoiser states, for temporal denoising.
enum denoiserStateVp8 { enum denoiserState {
kVp8DenoiserOff, kDenoiserOff,
kVp8DenoiserOnYOnly, kDenoiserOnYOnly,
kVp8DenoiserOnYUV, kDenoiserOnYUV,
kVp8DenoiserOnYUVAggressive, kDenoiserOnYUVAggressive,
kVp8DenoiserOnAdaptive kDenoiserOnAdaptive
};
// Denoiser states for vp9, for temporal denoising.
enum denoiserStateVp9 {
kVp9DenoiserOff,
kVp9DenoiserOnYOnly,
// For SVC: denoise the top two spatial layers.
kVp9DenoiserOnYTwoSpatialLayers
}; };
static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 }; static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 };
@@ -101,9 +91,8 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc,
for (i = 0; i < cfg->ts_number_layers; ++i) { for (i = 0; i < cfg->ts_number_layers; ++i) {
if (i > 0) { if (i > 0) {
rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i]; rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
rc->layer_pfb[i] = rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] -
1000.0 * rc->layer_target_bitrate[i - 1]) /
(rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
(rc->layer_framerate[i] - rc->layer_framerate[i - 1]); (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
} }
rc->layer_input_frames[i] = 0; rc->layer_input_frames[i] = 0;
@@ -167,60 +156,38 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
die("Error: Number of input frames not equal to output! \n"); die("Error: Number of input frames not equal to output! \n");
} }
#if ROI_MAP #if VP8_ROI_MAP
static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) {
vpx_roi_map_t *roi) {
unsigned int i, j; unsigned int i, j;
int block_size = 0; memset(roi, 0, sizeof(*roi));
uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0;
uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0;
if (!is_vp8 && !is_vp9) {
die("unsupported codec.");
}
zero(*roi);
block_size = is_vp9 && !is_vp8 ? 8 : 16;
// ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for
// segment is 16x16 for vp8, 8x8 for vp9. // segment is 16x16 for vp8, 8x8 for vp9.
roi->rows = (cfg->g_h + block_size - 1) / block_size; roi->rows = (cfg->g_h + 15) / 16;
roi->cols = (cfg->g_w + block_size - 1) / block_size; roi->cols = (cfg->g_w + 15) / 16;
// Applies delta QP on the segment blocks, varies from -63 to 63. // Applies delta QP on the segment blocks, varies from -63 to 63.
// Setting to negative means lower QP (better quality). // Setting to negative means lower QP (better quality).
// Below we set delta_q to the extreme (-63) to show strong effect. // Below we set delta_q to the extreme (-63) to show strong effect.
// VP8 uses the first 4 segments. VP9 uses all 8 segments. roi->delta_q[0] = 0;
zero(roi->delta_q);
roi->delta_q[1] = -63; roi->delta_q[1] = -63;
roi->delta_q[2] = 0;
roi->delta_q[3] = 0;
// Applies delta loopfilter strength on the segment blocks, varies from -63 to // Applies delta loopfilter strength on the segment blocks, varies from -63 to
// 63. Setting to positive means stronger loopfilter. VP8 uses the first 4 // 63. Setting to positive means stronger loopfilter.
// segments. VP9 uses all 8 segments. roi->delta_lf[0] = 0;
zero(roi->delta_lf); roi->delta_lf[1] = 0;
roi->delta_lf[2] = 0;
roi->delta_lf[3] = 0;
if (is_vp8) {
// Applies skip encoding threshold on the segment blocks, varies from 0 to // Applies skip encoding threshold on the segment blocks, varies from 0 to
// UINT_MAX. Larger value means more skipping of encoding is possible. // UINT_MAX. Larger value means more skipping of encoding is possible.
// This skip threshold only applies on delta frames. // This skip threshold only applies on delta frames.
zero(roi->static_threshold); roi->static_threshold[0] = 0;
} roi->static_threshold[1] = 0;
roi->static_threshold[2] = 0;
if (is_vp9) { roi->static_threshold[3] = 0;
// Apply skip segment. Setting to 1 means this block will be copied from
// previous frame.
zero(roi->skip);
}
if (is_vp9) {
// Apply ref frame segment.
// -1 : Do not apply this segment.
// 0 : Froce using intra.
// 1 : Force using last.
// 2 : Force using golden.
// 3 : Force using alfref but not used in non-rd pickmode for 0 lag.
memset(roi->ref_frame, -1, sizeof(roi->ref_frame));
roi->ref_frame[1] = 1;
}
// Use 2 states: 1 is center square, 0 is the rest. // Use 2 states: 1 is center square, 0 is the rest.
roi->roi_map = roi->roi_map =
@@ -588,7 +555,7 @@ int main(int argc, char **argv) {
int layering_mode = 0; int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 };
int flag_periodicity = 1; int flag_periodicity = 1;
#if ROI_MAP #if VP8_ROI_MAP
vpx_roi_map_t roi; vpx_roi_map_t roi;
#endif #endif
vpx_svc_layer_id_t layer_id = { 0, 0 }; vpx_svc_layer_id_t layer_id = { 0, 0 };
@@ -788,11 +755,11 @@ int main(int argc, char **argv) {
if (strncmp(encoder->name, "vp8", 3) == 0) { if (strncmp(encoder->name, "vp8", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0);
#if ROI_MAP #if VP8_ROI_MAP
set_roi_map(encoder->name, &cfg, &roi); vp8_set_roi_map(&cfg, &roi);
if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
die_codec(&codec, "Failed to set ROI map"); die_codec(&codec, "Failed to set ROI map");
#endif #endif
@@ -805,16 +772,10 @@ int main(int argc, char **argv) {
vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0);
vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
#if ROI_MAP
set_roi_map(encoder->name, &cfg, &roi);
if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
die_codec(&codec, "Failed to set ROI map");
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0);
#endif
// TODO(marpan/jianj): There is an issue with row-mt for low resolutons at // TODO(marpan/jianj): There is an issue with row-mt for low resolutons at
// high speed settings, disable its use for those cases for now. // high speed settings, disable its use for those cases for now.
if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7)) if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7))
@@ -942,8 +903,5 @@ int main(int argc, char **argv) {
for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]); for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]);
vpx_img_free(&raw); vpx_img_free(&raw);
#if ROI_MAP
free(roi.roi_map);
#endif
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@@ -943,6 +943,18 @@ GENERATE_XML = NO
XML_OUTPUT = xml XML_OUTPUT = xml
# The XML_SCHEMA tag can be used to specify an XML schema,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
XML_SCHEMA =
# The XML_DTD tag can be used to specify an XML DTD,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
XML_DTD =
# If the XML_PROGRAMLISTING tag is set to YES Doxygen will # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
# dump the program listings (including syntax highlighting # dump the program listings (including syntax highlighting
# and cross-referencing information) to the XML output. Note that # and cross-referencing information) to the XML output. Note that

View File

@@ -233,8 +233,8 @@ OBJS-yes += $(LIBVPX_OBJS)
LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
SO_VERSION_MAJOR := 5 SO_VERSION_MAJOR := 4
SO_VERSION_MINOR := 0 SO_VERSION_MINOR := 1
SO_VERSION_PATCH := 0 SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib

View File

@@ -215,7 +215,7 @@ using std::tr1::make_tuple;
#if CONFIG_VP9_ENCODER #if CONFIG_VP9_ENCODER
const BlockinessParam c_vp9_tests[] = { const BlockinessParam c_vp9_tests[] = {
make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238) make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
}; };
INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests)); INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests));
#endif #endif

View File

@@ -205,7 +205,7 @@ using std::tr1::make_tuple;
#if CONFIG_VP9_ENCODER #if CONFIG_VP9_ENCODER
const ConsistencyParam c_vp9_tests[] = { const ConsistencyParam c_vp9_tests[] = {
make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238) make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
}; };
INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test, INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
::testing::ValuesIn(c_vp9_tests)); ::testing::ValuesIn(c_vp9_tests));

View File

@@ -539,7 +539,6 @@ class DatarateTestVP9Large
denoiser_offon_test_ = 0; denoiser_offon_test_ = 0;
denoiser_offon_period_ = -1; denoiser_offon_period_ = -1;
frame_parallel_decoding_mode_ = 1; frame_parallel_decoding_mode_ = 1;
use_roi_ = 0;
} }
// //
@@ -622,10 +621,6 @@ class DatarateTestVP9Large
encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
frame_parallel_decoding_mode_); frame_parallel_decoding_mode_);
if (use_roi_) {
encoder->Control(VP9E_SET_ROI_MAP, &roi_);
}
if (cfg_.ts_number_layers > 1) { if (cfg_.ts_number_layers > 1) {
if (video->frame() == 0) { if (video->frame() == 0) {
encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC, 1);
@@ -706,8 +701,6 @@ class DatarateTestVP9Large
int denoiser_offon_test_; int denoiser_offon_test_;
int denoiser_offon_period_; int denoiser_offon_period_;
int frame_parallel_decoding_mode_; int frame_parallel_decoding_mode_;
bool use_roi_;
vpx_roi_map_t roi_;
}; };
// Check basic rate targeting for VBR mode with 0 lag. // Check basic rate targeting for VBR mode with 0 lag.
@@ -1080,68 +1073,6 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
} }
} }
class DatarateTestVP9RealTime : public DatarateTestVP9Large {
public:
virtual ~DatarateTestVP9RealTime() {}
};
// Check VP9 region of interest feature.
TEST_P(DatarateTestVP9RealTime, RegionOfInterest) {
if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return;
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_dropframe_thresh = 0;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 300);
cfg_.rc_target_bitrate = 450;
cfg_.g_w = 352;
cfg_.g_h = 288;
ResetModel();
// Set ROI parameters
use_roi_ = true;
memset(&roi_, 0, sizeof(roi_));
roi_.rows = (cfg_.g_h + 7) / 8;
roi_.cols = (cfg_.g_w + 7) / 8;
roi_.delta_q[1] = -20;
roi_.delta_lf[1] = -20;
memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
roi_.ref_frame[1] = 1;
// Use 2 states: 1 is center square, 0 is the rest.
roi_.roi_map = reinterpret_cast<uint8_t *>(
calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)));
ASSERT_TRUE(roi_.roi_map != NULL);
for (unsigned int i = 0; i < roi_.rows; ++i) {
for (unsigned int j = 0; j < roi_.cols; ++j) {
if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
roi_.roi_map[i * roi_.cols + j] = 1;
}
}
}
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90)
<< " The datarate for the file exceeds the target!";
ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4)
<< " The datarate for the file missed the target!";
free(roi_.roi_map);
}
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large { class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large {
public: public:
@@ -1285,78 +1216,18 @@ class DatarateOnePassCbrSvc
} }
virtual void ResetModel() { virtual void ResetModel() {
last_pts_ = 0; last_pts_ = 0;
bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
frame_number_ = 0;
first_drop_ = 0;
bits_total_ = 0;
duration_ = 0.0; duration_ = 0.0;
mismatch_psnr_ = 0.0; mismatch_psnr_ = 0.0;
mismatch_nframes_ = 0; mismatch_nframes_ = 0;
denoiser_on_ = 0; denoiser_on_ = 0;
tune_content_ = 0; tune_content_ = 0;
base_speed_setting_ = 5; base_speed_setting_ = 5;
spatial_layer_id_ = 0;
temporal_layer_id_ = 0;
update_pattern_ = 0;
memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_));
memset(bits_total_, 0, sizeof(bits_total_));
memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_));
dynamic_drop_layer_ = false;
} }
virtual void BeginPassHook(unsigned int /*pass*/) {} virtual void BeginPassHook(unsigned int /*pass*/) {}
// Example pattern for spatial layers and 2 temporal layers used in the
// bypass/flexible mode. The pattern corresponds to the pattern
// VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
// non-flexible mode, except that we disable inter-layer prediction.
void set_frame_flags_bypass_mode(
int tl, int num_spatial_layers, int is_key_frame,
vpx_svc_ref_frame_config_t *ref_frame_config) {
for (int sl = 0; sl < num_spatial_layers; ++sl) {
if (!tl) {
if (!sl) {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
} else {
if (is_key_frame) {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF;
} else {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
}
}
} else if (tl == 1) {
if (!sl) {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
} else {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_REF_GF;
}
}
if (tl == 0) {
ref_frame_config->lst_fb_idx[sl] = sl;
if (sl) {
if (is_key_frame) {
ref_frame_config->lst_fb_idx[sl] = sl - 1;
ref_frame_config->gld_fb_idx[sl] = sl;
} else {
ref_frame_config->gld_fb_idx[sl] = sl - 1;
}
} else {
ref_frame_config->gld_fb_idx[sl] = 0;
}
ref_frame_config->alt_fb_idx[sl] = 0;
} else if (tl == 1) {
ref_frame_config->lst_fb_idx[sl] = sl;
ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
}
}
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) { ::libvpx_test::Encoder *encoder) {
if (video->frame() == 0) { if (video->frame() == 0) {
@@ -1381,137 +1252,36 @@ class DatarateOnePassCbrSvc
encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1); encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1);
encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
} }
if (update_pattern_ && video->frame() >= 100) {
vpx_svc_layer_id_t layer_id;
if (video->frame() == 100) {
cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
encoder->Config(&cfg_);
}
// Set layer id since the pattern changed.
layer_id.spatial_layer_id = 0;
layer_id.temporal_layer_id = (video->frame() % 2 != 0);
encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
number_spatial_layers_, 0, &ref_frame_config);
encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
}
if (dynamic_drop_layer_) {
if (video->frame() == 100) {
// Change layer bitrates to set top layer to 0. This will trigger skip
// encoding/dropping of top spatial layer.
cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[2];
cfg_.layer_target_bitrate[2] = 0;
encoder->Config(&cfg_);
} else if (video->frame() == 300) {
// Change layer bitrate on top layer to non-zero to start encoding it
// again.
cfg_.layer_target_bitrate[2] = 500;
cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[2];
encoder->Config(&cfg_);
}
}
const vpx_rational_t tb = video->timebase(); const vpx_rational_t tb = video->timebase();
timebase_ = static_cast<double>(tb.num) / tb.den; timebase_ = static_cast<double>(tb.num) / tb.den;
duration_ = 0; duration_ = 0;
} }
virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
vpx_svc_layer_id_t layer_id;
encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
spatial_layer_id_ = layer_id.spatial_layer_id;
temporal_layer_id_ = layer_id.temporal_layer_id;
// Update buffer with per-layer target frame bandwidth, this is done
// for every frame passed to the encoder (encoded or dropped).
// For temporal layers, update the cumulative buffer level.
for (int sl = 0; sl < number_spatial_layers_; ++sl) {
for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
const int layer = sl * number_temporal_layers_ + tl;
bits_in_buffer_model_[layer] +=
static_cast<int64_t>(layer_target_avg_bandwidth_[layer]);
}
}
}
vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
uint32_t sizes[8], int *count) {
uint8_t marker;
marker = *(data + data_sz - 1);
*count = 0;
if ((marker & 0xe0) == 0xc0) {
const uint32_t frames = (marker & 0x7) + 1;
const uint32_t mag = ((marker >> 3) & 0x3) + 1;
const size_t index_sz = 2 + mag * frames;
// This chunk is marked as having a superframe index but doesn't have
// enough data for it, thus it's an invalid superframe index.
if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
{
const uint8_t marker2 = *(data + data_sz - index_sz);
// This chunk is marked as having a superframe index but doesn't have
// the matching marker byte at the front of the index therefore it's an
// invalid chunk.
if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
}
{
uint32_t i, j;
const uint8_t *x = &data[data_sz - index_sz + 1];
for (i = 0; i < frames; ++i) {
uint32_t this_sz = 0;
for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
sizes[i] = this_sz;
}
*count = frames;
}
}
return VPX_CODEC_OK;
}
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
uint32_t sizes[8] = { 0 }; vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
int count = 0; if (last_pts_ == 0) duration = 1;
last_pts_ = pkt->data.frame.pts; bits_in_buffer_model_ += static_cast<int64_t>(
duration * timebase_ * cfg_.rc_target_bitrate * 1000);
const bool key_frame = const bool key_frame =
(pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf), if (!key_frame) {
pkt->data.frame.sz, sizes, &count); // TODO(marpan): This check currently fails for some of the SVC tests,
if (!dynamic_drop_layer_) ASSERT_EQ(count, number_spatial_layers_); // re-enable when issue (webm:1350) is resolved.
for (int sl = 0; sl < number_spatial_layers_; ++sl) { // ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
sizes[sl] = sizes[sl] << 3; // << pkt->data.frame.pts;
// Update the total encoded bits per layer.
// For temporal layers, update the cumulative encoded bits per layer.
for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
const int layer = sl * number_temporal_layers_ + tl;
bits_total_[layer] += static_cast<int64_t>(sizes[sl]);
// Update the per-layer buffer level with the encoded frame size.
bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]);
// There should be no buffer underrun, except on the base
// temporal layer, since there may be key frames there.
if (!key_frame && tl > 0) {
ASSERT_GE(bits_in_buffer_model_[layer], 0)
<< "Buffer Underrun at frame " << pkt->data.frame.pts;
} }
const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
bits_in_buffer_model_ -= static_cast<int64_t>(frame_size_in_bits);
bits_total_ += frame_size_in_bits;
if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
last_pts_ = pkt->data.frame.pts;
bits_in_last_frame_ = frame_size_in_bits;
++frame_number_;
} }
ASSERT_EQ(pkt->data.frame.width[sl],
top_sl_width_ * svc_params_.scaling_factor_num[sl] /
svc_params_.scaling_factor_den[sl]);
ASSERT_EQ(pkt->data.frame.height[sl],
top_sl_height_ * svc_params_.scaling_factor_num[sl] /
svc_params_.scaling_factor_den[sl]);
}
}
virtual void EndPassHook(void) { virtual void EndPassHook(void) {
for (int sl = 0; sl < number_spatial_layers_; ++sl) { if (bits_total_) {
for (int tl = 0; tl < number_temporal_layers_; ++tl) { const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit
const int layer = sl * number_temporal_layers_ + tl;
const double file_size_in_kb = bits_total_[layer] / 1000.;
duration_ = (last_pts_ + 1) * timebase_; duration_ = (last_pts_ + 1) * timebase_;
file_datarate_[layer] = file_size_in_kb / duration_; file_datarate_ = file_size_in_kb / duration_;
}
} }
} }
@@ -1524,11 +1294,13 @@ class DatarateOnePassCbrSvc
unsigned int GetMismatchFrames() { return mismatch_nframes_; } unsigned int GetMismatchFrames() { return mismatch_nframes_; }
vpx_codec_pts_t last_pts_; vpx_codec_pts_t last_pts_;
int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; int64_t bits_in_buffer_model_;
double timebase_; double timebase_;
int64_t bits_total_[VPX_MAX_LAYERS]; int frame_number_;
vpx_codec_pts_t first_drop_;
int64_t bits_total_;
double duration_; double duration_;
double file_datarate_[VPX_MAX_LAYERS]; double file_datarate_;
size_t bits_in_last_frame_; size_t bits_in_last_frame_;
vpx_svc_extra_cfg_t svc_params_; vpx_svc_extra_cfg_t svc_params_;
int speed_setting_; int speed_setting_;
@@ -1537,27 +1309,14 @@ class DatarateOnePassCbrSvc
int denoiser_on_; int denoiser_on_;
int tune_content_; int tune_content_;
int base_speed_setting_; int base_speed_setting_;
int spatial_layer_id_;
int temporal_layer_id_;
int number_spatial_layers_;
int number_temporal_layers_;
int layer_target_avg_bandwidth_[VPX_MAX_LAYERS];
bool dynamic_drop_layer_;
unsigned int top_sl_width_;
unsigned int top_sl_height_;
vpx_svc_ref_frame_config_t ref_frame_config;
int update_pattern_;
}; };
static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
const vpx_svc_extra_cfg_t *svc_params, const vpx_svc_extra_cfg_t *svc_params,
int spatial_layers, int temporal_layers, int spatial_layers, int temporal_layers,
int temporal_layering_mode, int temporal_layering_mode) {
int *layer_target_avg_bandwidth,
int64_t *bits_in_buffer_model) {
int sl, spatial_layer_target; int sl, spatial_layer_target;
float total = 0; float total = 0;
float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
float framerate = 30.0;
for (sl = 0; sl < spatial_layers; ++sl) { for (sl = 0; sl < spatial_layers; ++sl) {
if (svc_params->scaling_factor_den[sl] > 0) { if (svc_params->scaling_factor_den[sl] > 0) {
alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 / alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 /
@@ -1577,41 +1336,8 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
} else if (temporal_layering_mode == 2) { } else if (temporal_layering_mode == 2) {
enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3; enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target; enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target;
} else if (temporal_layering_mode <= 1) {
enc_cfg->layer_target_bitrate[index] = spatial_layer_target;
} }
} }
for (sl = 0; sl < spatial_layers; ++sl) {
for (int tl = 0; tl < temporal_layers; ++tl) {
const int layer = sl * temporal_layers + tl;
float layer_framerate = framerate;
if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2;
if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4;
if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2;
layer_target_avg_bandwidth[layer] = static_cast<int>(
enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate);
bits_in_buffer_model[layer] =
enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz;
}
}
}
static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg,
int number_spatial_layers,
int number_temporal_layers,
double *file_datarate,
double thresh_overshoot,
double thresh_undershoot) {
for (int sl = 0; sl < number_spatial_layers; ++sl)
for (int tl = 0; tl < number_temporal_layers; ++tl) {
const int layer = sl * number_temporal_layers + tl;
ASSERT_GE(cfg->layer_target_bitrate[layer],
file_datarate[layer] * thresh_overshoot)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg->layer_target_bitrate[layer],
file_datarate[layer] * thresh_undershoot)
<< " The datarate for the file is lower than the target by too much!";
}
} }
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
@@ -1637,21 +1363,14 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) {
svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 10; cfg_.rc_dropframe_thresh = 10;
cfg_.kf_max_dist = 9999; cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
top_sl_width_ = 1280;
top_sl_height_ = 720;
cfg_.rc_target_bitrate = 500; cfg_.rc_target_bitrate = 500;
ResetModel(); ResetModel();
tune_content_ = 1; tune_content_ = 1;
base_speed_setting_ = speed_setting_; base_speed_setting_ = speed_setting_;
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode, cfg_.ts_number_layers, cfg_.temporal_layering_mode);
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
} }
@@ -1679,30 +1398,26 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) {
svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0; cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999; cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers; ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
number_temporal_layers_ = cfg_.ts_number_layers; 30, 1, 0, 200);
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
// TODO(marpan): Check that effective_datarate for each layer hits the // TODO(marpan): Check that effective_datarate for each layer hits the
// layer target_bitrate. // layer target_bitrate.
for (int i = 200; i <= 800; i += 200) { for (int i = 200; i <= 800; i += 200) {
cfg_.rc_target_bitrate = i; cfg_.rc_target_bitrate = i;
ResetModel(); ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode, cfg_.ts_number_layers, cfg_.temporal_layering_mode);
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_, ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
number_temporal_layers_, file_datarate_, 0.78, << " The datarate for the file exceeds the target by too much!";
1.15); ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER #if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern // Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter. // will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 200 (half of the sequence) // Since frame dropper is off, we can expcet 100 (half of the sequence)
// mismatched frames. // mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); EXPECT_EQ(static_cast<unsigned int>(100), GetMismatchFrames());
#endif #endif
} }
} }
@@ -1731,43 +1446,33 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) {
svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0; cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999; cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
// TODO(marpan): Check that effective_datarate for each layer hits the // TODO(marpan): Check that effective_datarate for each layer hits the
// layer target_bitrate. // layer target_bitrate.
// For SVC, noise_sen = 1 means denoising only the top spatial layer
// noise_sen = 2 means denoising the two top spatial layers.
for (int noise_sen = 1; noise_sen <= 2; noise_sen++) {
for (int i = 600; i <= 1000; i += 200) { for (int i = 600; i <= 1000; i += 200) {
cfg_.rc_target_bitrate = i; cfg_.rc_target_bitrate = i;
ResetModel(); ResetModel();
denoiser_on_ = noise_sen; denoiser_on_ = 1;
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode, cfg_.ts_number_layers, cfg_.temporal_layering_mode);
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_, ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
number_temporal_layers_, file_datarate_, 0.78, << " The datarate for the file exceeds the target by too much!";
1.15); ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER #if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC // Number of temporal layers > 1, so half of the frames in this SVC pattern
// pattern
// will be non-reference frame and hence encoder will avoid loopfilter. // will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 200 (half of the sequence) // Since frame dropper is off, we can expcet 150 (half of the sequence)
// mismatched frames. // mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
#endif #endif
} }
} }
}
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. // temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) { TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) {
cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000; cfg_.rc_buf_sz = 1000;
@@ -1788,25 +1493,21 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) {
svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_num[1] = 288;
svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 10; cfg_.rc_dropframe_thresh = 10;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 200);
cfg_.rc_target_bitrate = 400; cfg_.rc_target_bitrate = 400;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
// For this 3 temporal layer case, pattern repeats every 4 frames, so choose // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
// 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
for (int j = 64; j <= 67; j++) { for (int j = 64; j <= 67; j++) {
cfg_.kf_max_dist = j; cfg_.kf_max_dist = j;
ResetModel(); ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode, cfg_.ts_number_layers, cfg_.temporal_layering_mode);
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_, ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80)
number_temporal_layers_, file_datarate_, 0.78, << " The datarate for the file exceeds the target by too much!";
1.15); ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
} }
} }
@@ -1834,25 +1535,22 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) {
svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0; cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999; cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
top_sl_width_ = 1280;
top_sl_height_ = 720;
cfg_.rc_target_bitrate = 800; cfg_.rc_target_bitrate = 800;
ResetModel(); ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode, cfg_.ts_number_layers, cfg_.temporal_layering_mode);
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_, ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
number_temporal_layers_, file_datarate_, 0.78, 1.15); << " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER #if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern // Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter. // will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 30 (half of the sequence) // Since frame dropper is off, we can expcet 150 (half of the sequence)
// mismatched frames. // mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames()); EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
#endif #endif
} }
@@ -1882,126 +1580,25 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) {
svc_params_.scaling_factor_den[2] = 288; svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0; cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999; cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
cfg_.rc_target_bitrate = 800; cfg_.rc_target_bitrate = 800;
ResetModel(); ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode, cfg_.ts_number_layers, cfg_.temporal_layering_mode);
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_, ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
number_temporal_layers_, file_datarate_, 0.78, 1.15); << " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER #if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern // Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter. // will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 200 (half of the sequence) // Since frame dropper is off, we can expcet 150 (half of the sequence)
// mismatched frames. // mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
#endif #endif
} }
// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one
// generate via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables
// inter-layer prediction.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL2TLDynamicPatternChange) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
cfg_.ss_number_layers = 3;
cfg_.ts_number_layers = 2;
cfg_.ts_rate_decimator[0] = 2;
cfg_.ts_rate_decimator[1] = 1;
cfg_.g_error_resilient = 1;
cfg_.g_threads = 1;
cfg_.temporal_layering_mode = 2;
svc_params_.scaling_factor_num[0] = 72;
svc_params_.scaling_factor_den[0] = 288;
svc_params_.scaling_factor_num[1] = 144;
svc_params_.scaling_factor_den[1] = 288;
svc_params_.scaling_factor_num[2] = 288;
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
// Change SVC pattern on the fly.
update_pattern_ = 1;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
cfg_.rc_target_bitrate = 800;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 200 (half of the sequence)
// mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
#endif
}
// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and on
// the fly switching to 2 spatial layers and then back to 3. This switch is done
// by setting top spatial layer bitrate to 0, and then back to non-zero, during
// the sequence.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL_to_2SL_dynamic) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
cfg_.ss_number_layers = 3;
cfg_.ts_number_layers = 1;
cfg_.ts_rate_decimator[0] = 1;
cfg_.g_error_resilient = 1;
cfg_.g_threads = 1;
cfg_.temporal_layering_mode = 0;
svc_params_.scaling_factor_num[0] = 72;
svc_params_.scaling_factor_den[0] = 288;
svc_params_.scaling_factor_num[1] = 144;
svc_params_.scaling_factor_den[1] = 288;
svc_params_.scaling_factor_num[2] = 288;
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
cfg_.rc_target_bitrate = 800;
ResetModel();
dynamic_drop_layer_ = true;
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
// Don't check rate targeting on top spatial layer since it will be skipped
// for part of the sequence.
CheckLayerRateTargeting(&cfg_, number_spatial_layers_ - 1,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
}
// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. // temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) { TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) {
@@ -2027,25 +1624,20 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) {
svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_num[2] = 288;
svc_params_.scaling_factor_den[2] = 288; svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 10; cfg_.rc_dropframe_thresh = 10;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
cfg_.rc_target_bitrate = 800; cfg_.rc_target_bitrate = 800;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
// For this 3 temporal layer case, pattern repeats every 4 frames, so choose // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
// 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
for (int j = 32; j <= 35; j++) { for (int j = 32; j <= 35; j++) {
cfg_.kf_max_dist = j; cfg_.kf_max_dist = j;
ResetModel(); ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode, cfg_.ts_number_layers, cfg_.temporal_layering_mode);
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_, ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80)
number_temporal_layers_, file_datarate_, 0.78, << " The datarate for the file exceeds the target by too much!";
1.15); ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30)
<< " The datarate for the file is lower than the target by too much!";
} }
} }
@@ -2075,25 +1667,22 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) {
svc_params_.scaling_factor_den[2] = 288; svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0; cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999; cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
top_sl_width_ = 1280;
top_sl_height_ = 720;
cfg_.rc_target_bitrate = 800; cfg_.rc_target_bitrate = 800;
ResetModel(); ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode, cfg_.ts_number_layers, cfg_.temporal_layering_mode);
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_, ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
number_temporal_layers_, file_datarate_, 0.78, 1.15); << " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER #if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern // Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter. // will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 30 (half of the sequence) // Since frame dropper is off, we can expcet 150 (half of the sequence)
// mismatched frames. // mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames()); EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
#endif #endif
} }
@@ -2125,21 +1714,9 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) {
cfg_.layer_target_bitrate[0] = 300; cfg_.layer_target_bitrate[0] = 300;
cfg_.layer_target_bitrate[1] = 1400; cfg_.layer_target_bitrate[1] = 1400;
cfg_.rc_target_bitrate = 1700; cfg_.rc_target_bitrate = 1700;
number_spatial_layers_ = cfg_.ss_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
number_temporal_layers_ = cfg_.ts_number_layers;
ResetModel(); ResetModel();
layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30;
bits_in_buffer_model_[0] =
cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz;
layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30;
bits_in_buffer_model_[1] =
cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
top_sl_width_ = 1280;
top_sl_height_ = 720;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
} }
@@ -2152,9 +1729,6 @@ VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
::testing::Values(::libvpx_test::kOnePassGood, ::testing::Values(::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime), ::libvpx_test::kRealTime),
::testing::Range(2, 9)); ::testing::Range(2, 9));
VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime,
::testing::Values(::libvpx_test::kRealTime),
::testing::Range(5, 9));
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser,
::testing::Values(::libvpx_test::kRealTime), ::testing::Values(::libvpx_test::kRealTime),

View File

@@ -28,8 +28,8 @@
using libvpx_test::ACMRandom; using libvpx_test::ACMRandom;
using libvpx_test::Buffer; using libvpx_test::Buffer;
using std::tr1::make_tuple;
using std::tr1::tuple; using std::tr1::tuple;
using std::tr1::make_tuple;
namespace { namespace {
typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride); typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride);

File diff suppressed because it is too large Load Diff

View File

@@ -106,90 +106,4 @@ TEST(EncodeAPI, ImageSizeSetting) {
} }
#endif #endif
// Set up 2 spatial streams with 2 temporal layers per stream, and generate
// invalid configuration by setting the temporal layer rate allocation
// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of
// CONFIG_MULTI_RES_ENCODING.
TEST(EncodeAPI, MultiResEncode) {
static const vpx_codec_iface_t *kCodecs[] = {
#if CONFIG_VP8_ENCODER
&vpx_codec_vp8_cx_algo,
#endif
#if CONFIG_VP9_ENCODER
&vpx_codec_vp9_cx_algo,
#endif
};
const int width = 1280;
const int height = 720;
const int width_down = width / 2;
const int height_down = height / 2;
const int target_bitrate = 1000;
const int framerate = 30;
for (int c = 0; c < NELEMENTS(kCodecs); ++c) {
const vpx_codec_iface_t *const iface = kCodecs[c];
vpx_codec_ctx_t enc[2];
vpx_codec_enc_cfg_t cfg[2];
vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
memset(enc, 0, sizeof(enc));
for (int i = 0; i < 2; i++) {
vpx_codec_enc_config_default(iface, &cfg[i], 0);
}
/* Highest-resolution encoder settings */
cfg[0].g_w = width;
cfg[0].g_h = height;
cfg[0].rc_dropframe_thresh = 0;
cfg[0].rc_end_usage = VPX_CBR;
cfg[0].rc_resize_allowed = 0;
cfg[0].rc_min_quantizer = 2;
cfg[0].rc_max_quantizer = 56;
cfg[0].rc_undershoot_pct = 100;
cfg[0].rc_overshoot_pct = 15;
cfg[0].rc_buf_initial_sz = 500;
cfg[0].rc_buf_optimal_sz = 600;
cfg[0].rc_buf_sz = 1000;
cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
cfg[0].g_lag_in_frames = 0;
cfg[0].kf_mode = VPX_KF_AUTO;
cfg[0].kf_min_dist = 3000;
cfg[0].kf_max_dist = 3000;
cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */
cfg[0].g_timebase.num = 1; /* Set fps */
cfg[0].g_timebase.den = framerate;
memcpy(&cfg[1], &cfg[0], sizeof(cfg[0]));
cfg[1].rc_target_bitrate = 500;
cfg[1].g_w = width_down;
cfg[1].g_h = height_down;
for (int i = 0; i < 2; i++) {
cfg[i].ts_number_layers = 2;
cfg[i].ts_periodicity = 2;
cfg[i].ts_rate_decimator[0] = 2;
cfg[i].ts_rate_decimator[1] = 1;
cfg[i].ts_layer_id[0] = 0;
cfg[i].ts_layer_id[1] = 1;
// Invalid parameters.
cfg[i].ts_target_bitrate[0] = 0;
cfg[i].ts_target_bitrate[1] = 0;
}
// VP9 should report incapable, VP8 invalid for all configurations.
const char kVP9Name[] = "WebM Project VP9";
const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface),
sizeof(kVP9Name) - 1) == 0;
EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
for (int i = 0; i < 2; i++) {
vpx_codec_destroy(&enc[i]);
}
}
}
} // namespace } // namespace

View File

@@ -201,8 +201,6 @@ void EncoderTest::RunLoop(VideoSource *video) {
PreEncodeFrameHook(video, encoder.get()); PreEncodeFrameHook(video, encoder.get());
encoder->EncodeFrame(video, frame_flags_); encoder->EncodeFrame(video, frame_flags_);
PostEncodeFrameHook(encoder.get());
CxDataIterator iter = encoder->GetCxData(); CxDataIterator iter = encoder->GetCxData();
bool has_cxdata = false; bool has_cxdata = false;

View File

@@ -128,11 +128,6 @@ class Encoder {
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
} }
void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
void Control(int ctrl_id, struct vpx_svc_parameters *arg) { void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@@ -142,12 +137,15 @@ class Encoder {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
} }
#endif
#if CONFIG_VP8_ENCODER
void Control(int ctrl_id, vpx_roi_map_t *arg) { void Control(int ctrl_id, vpx_roi_map_t *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
} }
#endif #endif
void Config(const vpx_codec_enc_cfg_t *cfg) { void Config(const vpx_codec_enc_cfg_t *cfg) {
const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@@ -221,8 +219,6 @@ class EncoderTest {
virtual void PreEncodeFrameHook(VideoSource * /*video*/, virtual void PreEncodeFrameHook(VideoSource * /*video*/,
Encoder * /*encoder*/) {} Encoder * /*encoder*/) {}
virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {}
// Hook to be called on every compressed data packet. // Hook to be called on every compressed data packet.
virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}

View File

@@ -675,9 +675,7 @@ INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
::testing::Values(make_tuple(&vpx_fdct8x8_neon, ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
&vpx_idct8x8_64_add_neon, &vpx_idct8x8_64_add_neon,
0, VPX_BITS_8))); 0, VPX_BITS_8)));
// TODO(linfengz): reenable these functions once test vector failures are #if !CONFIG_VP9_HIGHBITDEPTH
// addressed.
#if 0 // !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
NEON, FwdTrans8x8HT, NEON, FwdTrans8x8HT,
::testing::Values( ::testing::Values(

View File

@@ -174,4 +174,4 @@ INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
INSTANTIATE_TEST_CASE_P(MMI, IDCTTest, INSTANTIATE_TEST_CASE_P(MMI, IDCTTest,
::testing::Values(vp8_short_idct4x4llm_mmi)); ::testing::Values(vp8_short_idct4x4llm_mmi));
#endif // HAVE_MMI #endif // HAVE_MMI
} // namespace }

View File

@@ -123,7 +123,6 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
#if CONFIG_VP8_DECODER #if CONFIG_VP8_DECODER
const DecodeParam kVP8InvalidFileTests[] = { const DecodeParam kVP8InvalidFileTests[] = {
{ 1, "invalid-bug-1443.ivf" }, { 1, "invalid-bug-1443.ivf" },
{ 1, "invalid-token-partition.ivf" },
}; };
VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, VP8_INSTANTIATE_TEST_CASE(InvalidFileTest,

View File

@@ -114,18 +114,6 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
} }
} }
uint8_t GetOuterThresh(ACMRandom *rnd) {
return static_cast<uint8_t>(rnd->RandRange(3 * MAX_LOOP_FILTER + 5));
}
uint8_t GetInnerThresh(ACMRandom *rnd) {
return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1));
}
uint8_t GetHevThresh(ACMRandom *rnd) {
return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4);
}
class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
public: public:
virtual ~Loop8Test6Param() {} virtual ~Loop8Test6Param() {}
@@ -174,15 +162,15 @@ TEST_P(Loop8Test6Param, OperationCheck) {
int first_failure = -1; int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) { for (int i = 0; i < count_test_block; ++i) {
int err_count = 0; int err_count = 0;
uint8_t tmp = GetOuterThresh(&rnd); uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd); tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd); tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -233,15 +221,15 @@ TEST_P(Loop8Test6Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) { for (int i = 0; i < count_test_block; ++i) {
int err_count = 0; int err_count = 0;
uint8_t tmp = GetOuterThresh(&rnd); uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd); tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd); tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -283,27 +271,27 @@ TEST_P(Loop8Test9Param, OperationCheck) {
int first_failure = -1; int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) { for (int i = 0; i < count_test_block; ++i) {
int err_count = 0; int err_count = 0;
uint8_t tmp = GetOuterThresh(&rnd); uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd); tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd); tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetOuterThresh(&rnd); tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd); tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd); tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -346,27 +334,27 @@ TEST_P(Loop8Test9Param, ValueCheck) {
int first_failure = -1; int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) { for (int i = 0; i < count_test_block; ++i) {
int err_count = 0; int err_count = 0;
uint8_t tmp = GetOuterThresh(&rnd); uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd); tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd); tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetOuterThresh(&rnd); tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd); tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd); tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t, DECLARE_ALIGNED(16, const uint8_t,
thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };

View File

@@ -277,29 +277,12 @@ class ResizeTest
SetMode(GET_PARAM(1)); SetMode(GET_PARAM(1));
} }
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
encode_frame_width_.push_back(pkt->data.frame.width[0]);
encode_frame_height_.push_back(pkt->data.frame.height[0]);
}
unsigned int GetFrameWidth(size_t idx) const {
return encode_frame_width_[idx];
}
unsigned int GetFrameHeight(size_t idx) const {
return encode_frame_height_[idx];
}
virtual void DecompressedFrameHook(const vpx_image_t &img, virtual void DecompressedFrameHook(const vpx_image_t &img,
vpx_codec_pts_t pts) { vpx_codec_pts_t pts) {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
} }
std::vector<FrameInfo> frame_info_list_; std::vector<FrameInfo> frame_info_list_;
std::vector<unsigned int> encode_frame_width_;
std::vector<unsigned int> encode_frame_height_;
}; };
TEST_P(ResizeTest, TestExternalResizeWorks) { TEST_P(ResizeTest, TestExternalResizeWorks) {
@@ -313,9 +296,6 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
const unsigned int frame = static_cast<unsigned>(info->pts); const unsigned int frame = static_cast<unsigned>(info->pts);
unsigned int expected_w; unsigned int expected_w;
unsigned int expected_h; unsigned int expected_h;
const size_t idx = info - frame_info_list_.begin();
ASSERT_EQ(info->w, GetFrameWidth(idx));
ASSERT_EQ(info->h, GetFrameHeight(idx));
ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
&expected_h, 0); &expected_h, 0);
EXPECT_EQ(expected_w, info->w) EXPECT_EQ(expected_w, info->w)
@@ -484,23 +464,8 @@ class ResizeRealtimeTest
++mismatch_nframes_; ++mismatch_nframes_;
} }
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
encode_frame_width_.push_back(pkt->data.frame.width[0]);
encode_frame_height_.push_back(pkt->data.frame.height[0]);
}
unsigned int GetMismatchFrames() { return mismatch_nframes_; } unsigned int GetMismatchFrames() { return mismatch_nframes_; }
unsigned int GetFrameWidth(size_t idx) const {
return encode_frame_width_[idx];
}
unsigned int GetFrameHeight(size_t idx) const {
return encode_frame_height_[idx];
}
void DefaultConfig() { void DefaultConfig() {
cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600; cfg_.rc_buf_optimal_sz = 600;
@@ -528,8 +493,6 @@ class ResizeRealtimeTest
bool change_bitrate_; bool change_bitrate_;
double mismatch_psnr_; double mismatch_psnr_;
int mismatch_nframes_; int mismatch_nframes_;
std::vector<unsigned int> encode_frame_width_;
std::vector<unsigned int> encode_frame_height_;
}; };
TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
@@ -619,9 +582,6 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
int resize_count = 0; int resize_count = 0;
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin(); for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) { info != frame_info_list_.end(); ++info) {
const size_t idx = info - frame_info_list_.begin();
ASSERT_EQ(info->w, GetFrameWidth(idx));
ASSERT_EQ(info->h, GetFrameHeight(idx));
if (info->w != last_w || info->h != last_h) { if (info->w != last_w || info->h != last_h) {
resize_count++; resize_count++;
if (resize_count == 1) { if (resize_count == 1) {

View File

@@ -112,9 +112,8 @@ INSTANTIATE_TEST_CASE_P(
#endif // HAVE_SSE2 #endif // HAVE_SSE2
#if HAVE_MSA #if HAVE_MSA
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest, ::testing::Values(make_tuple(
MSA, SumSquaresTest, &vpx_sum_squares_2d_i16_c,
::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
&vpx_sum_squares_2d_i16_msa))); &vpx_sum_squares_2d_i16_msa)));
#endif // HAVE_MSA #endif // HAVE_MSA
} // namespace } // namespace

View File

@@ -734,8 +734,6 @@ endif # CONFIG_VP9_HIGHBITDEPTH
# Invalid files for testing libvpx error checking. # Invalid files for testing libvpx error checking.
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm

View File

@@ -852,7 +852,5 @@ e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm
d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res
fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf
fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res
1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf
90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res
17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm 17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm
e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5

View File

@@ -61,6 +61,7 @@ int main(int argc, char **argv) {
#if !CONFIG_SHARED #if !CONFIG_SHARED
// Shared library builds don't support whitebox tests // Shared library builds don't support whitebox tests
// that exercise internal symbols. // that exercise internal symbols.
#if CONFIG_VP8 #if CONFIG_VP8
vp8_rtcd(); vp8_rtcd();
#endif // CONFIG_VP8 #endif // CONFIG_VP8

View File

@@ -27,8 +27,8 @@
namespace { namespace {
using libvpx_test::ACMRandom;
using std::string; using std::string;
using libvpx_test::ACMRandom;
#if CONFIG_WEBM_IO #if CONFIG_WEBM_IO

View File

@@ -59,7 +59,7 @@ const TestVideoParam kTestVectors[] = {
// Encoding modes tested // Encoding modes tested
const libvpx_test::TestMode kEncodingModeVectors[] = { const libvpx_test::TestMode kEncodingModeVectors[] = {
::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime ::libvpx_test::kRealTime,
}; };
// Speed settings tested // Speed settings tested

View File

@@ -22,7 +22,7 @@ namespace {
// Encoding modes // Encoding modes
const libvpx_test::TestMode kEncodingModeVectors[] = { const libvpx_test::TestMode kEncodingModeVectors[] = {
::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime ::libvpx_test::kRealTime,
}; };
// Encoding speeds // Encoding speeds

View File

@@ -14,9 +14,9 @@
#include "third_party/googletest/src/include/gtest/gtest.h" #include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp9_rtcd.h"
#include "./vpx_config.h" #include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
#include "./vp9_rtcd.h"
#include "test/acm_random.h" #include "test/acm_random.h"
#include "test/buffer.h" #include "test/buffer.h"
#include "test/clear_system_state.h" #include "test/clear_system_state.h"
@@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
uint16_t *eob, const int16_t *scan, uint16_t *eob, const int16_t *scan,
const int16_t *iscan); const int16_t *iscan);
typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
int /*max_size*/, bool /*is_fp*/> int /*max_size*/>
QuantizeParam; QuantizeParam;
// Wrapper for FP version which does not use zbin or quant_shift. // Wrapper for FP version which does not use zbin or quant_shift.
@@ -69,15 +69,11 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
class VP9QuantizeBase { class VP9QuantizeBase {
public: public:
VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size)
: bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) { : bit_depth_(bit_depth), max_size_(max_size) {
max_value_ = (1 << bit_depth_) - 1; max_value_ = (1 << bit_depth_) - 1;
zbin_ptr_ = zbin_ptr_ =
reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
round_fp_ptr_ = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
quant_fp_ptr_ = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
round_ptr_ = round_ptr_ =
reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_))); reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
quant_ptr_ = quant_ptr_ =
@@ -90,15 +86,11 @@ class VP9QuantizeBase {
~VP9QuantizeBase() { ~VP9QuantizeBase() {
vpx_free(zbin_ptr_); vpx_free(zbin_ptr_);
vpx_free(round_fp_ptr_);
vpx_free(quant_fp_ptr_);
vpx_free(round_ptr_); vpx_free(round_ptr_);
vpx_free(quant_ptr_); vpx_free(quant_ptr_);
vpx_free(quant_shift_ptr_); vpx_free(quant_shift_ptr_);
vpx_free(dequant_ptr_); vpx_free(dequant_ptr_);
zbin_ptr_ = NULL; zbin_ptr_ = NULL;
round_fp_ptr_ = NULL;
quant_fp_ptr_ = NULL;
round_ptr_ = NULL; round_ptr_ = NULL;
quant_ptr_ = NULL; quant_ptr_ = NULL;
quant_shift_ptr_ = NULL; quant_shift_ptr_ = NULL;
@@ -108,8 +100,6 @@ class VP9QuantizeBase {
protected: protected:
int16_t *zbin_ptr_; int16_t *zbin_ptr_;
int16_t *round_fp_ptr_;
int16_t *quant_fp_ptr_;
int16_t *round_ptr_; int16_t *round_ptr_;
int16_t *quant_ptr_; int16_t *quant_ptr_;
int16_t *quant_shift_ptr_; int16_t *quant_shift_ptr_;
@@ -117,136 +107,29 @@ class VP9QuantizeBase {
const vpx_bit_depth_t bit_depth_; const vpx_bit_depth_t bit_depth_;
int max_value_; int max_value_;
const int max_size_; const int max_size_;
const bool is_fp_;
}; };
class VP9QuantizeTest : public VP9QuantizeBase, class VP9QuantizeTest : public VP9QuantizeBase,
public ::testing::TestWithParam<QuantizeParam> { public ::testing::TestWithParam<QuantizeParam> {
public: public:
VP9QuantizeTest() VP9QuantizeTest()
: VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)), : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3)), quantize_op_(GET_PARAM(0)),
quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} ref_quantize_op_(GET_PARAM(1)) {}
protected: protected:
const QuantizeFunc quantize_op_; const QuantizeFunc quantize_op_;
const QuantizeFunc ref_quantize_op_; const QuantizeFunc ref_quantize_op_;
}; };
// This quantizer compares the AC coefficients to the quantization step size to
// determine if further multiplication operations are needed.
// Based on vp9_quantize_fp_sse2().
inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block, const int16_t *round_ptr,
                        const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                        uint16_t *eob_ptr, const int16_t *scan,
                        const int16_t *iscan, int is_32x32) {
  int i, eob = -1;
  // Threshold below which an AC coefficient is likely to quantize to zero.
  // Halved again for 32x32 blocks to match the SIMD implementation.
  const int thr = dequant_ptr[1] >> (1 + is_32x32);
  (void)iscan;
  (void)skip_block;
  assert(!skip_block);

  // Quantization pass: process coefficients in groups of 16, mirroring the
  // row-at-a-time structure of the SSE2 code.
  for (i = 0; i < n_coeffs; i += 16) {
    int y;
    // Number of coefficients in this group with magnitude <= thr, i.e.
    // coefficients expected to quantize to zero.
    int nzflag_cnt = 0;
    int abs_coeff[16];
    int coeff_sign[16];
    // First pass over the group: record sign and magnitude, and count the
    // small coefficients.
    for (y = 0; y < 16; ++y) {
      const int rc = i + y;
      const int coeff = coeff_ptr[rc];
      coeff_sign[y] = (coeff >> 31);
      abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y];
      // The first 16 are skipped in the sse2 code. Do the same here to match.
      if (i >= 16 && (abs_coeff[y] <= thr)) {
        nzflag_cnt++;
      }
    }
    // Second pass: quantize the group, unless every coefficient in it has
    // magnitude at most the quantization step_size/2, in which case the
    // whole group is quantized to zero.
    for (y = 0; y < 16; ++y) {
      const int rc = i + y;
      if (nzflag_cnt < 16) {
        int tmp;
        int _round;
        if (is_32x32) {
          _round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
        } else {
          _round = round_ptr[rc != 0];
        }
        tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX);
        tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32);
        // Restore the sign that was stripped off above.
        qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y];
        // Dequantize. (The original code also performed an unconditional
        // assignment here that both branches immediately overwrote; that
        // dead store has been removed.)
        if (is_32x32) {
          dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
        } else {
          dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
        }
      } else {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
      }
    }
  }

  // Scan for eob: index (in scan order) of the last nonzero coefficient.
  for (i = 0; i < n_coeffs; i++) {
    // Use the scan order to find the correct eob.
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
    }
  }
  *eob_ptr = eob + 1;
}
// C reference implementation of the "fp" quantizer for block sizes up to
// 16x16: a thin wrapper forwarding to quant_fp_nz() with is_32x32 = 0.
void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                      int skip_block, const int16_t *round_ptr,
                      const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                      uint16_t *eob_ptr, const int16_t *scan,
                      const int16_t *iscan) {
  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
}
// C reference implementation of the "fp" quantizer for 32x32 blocks: a thin
// wrapper forwarding to quant_fp_nz() with is_32x32 = 1 (extra rounding /
// halved dequant handled inside quant_fp_nz).
void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                            tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                            uint16_t *eob_ptr, const int16_t *scan,
                            const int16_t *iscan) {
  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
}
void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
int16_t *quant, int16_t *quant_shift, int16_t *quant, int16_t *quant_shift,
int16_t *dequant, int16_t *round_fp, int16_t *dequant) {
int16_t *quant_fp) {
// Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
const int max_qrounding_factor_fp = 64;
for (int j = 0; j < 2; j++) { for (int j = 0; j < 2; j++) {
// The range is 4 to 1828 in the VP9 tables.
const int qlookup = rnd->RandRange(1825) + 4;
round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
quant_fp[j] = (1 << 16) / qlookup;
// Values determined by deconstructing vp9_init_quantizer(). // Values determined by deconstructing vp9_init_quantizer().
// zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
// values or U/V values of any bit depth. This is because y_delta is not // values or U/V values of any bit depth. This is because y_delta is not
// factored into the vp9_ac_quant() call. // factored into the vp9_ac_quant() call.
zbin[j] = rnd->RandRange(1200); zbin[j] = rnd->RandRange(1200);
// round may be up to 685 for Y values or 914 for U/V. // round may be up to 685 for Y values or 914 for U/V.
round[j] = rnd->RandRange(914); round[j] = rnd->RandRange(914);
// quant ranges from 1 to -32703 // quant ranges from 1 to -32703
@@ -258,8 +141,6 @@ void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
} }
for (int j = 2; j < 8; j++) { for (int j = 2; j < 8; j++) {
zbin[j] = zbin[1]; zbin[j] = zbin[1];
round_fp[j] = round_fp[1];
quant_fp[j] = quant_fp[1];
round[j] = round[1]; round[j] = round[1];
quant[j] = quant[1]; quant[j] = quant[1];
quant_shift[j] = quant_shift[1]; quant_shift[j] = quant_shift[1];
@@ -298,18 +179,18 @@ TEST_P(VP9QuantizeTest, OperationCheck) {
const int count = (4 << sz) * (4 << sz); const int count = (4 << sz) * (4 << sz);
coeff.Set(&rnd, -max_value_, max_value_); coeff.Set(&rnd, -max_value_, max_value_);
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_shift_ptr_, dequant_ptr_);
quant_fp_ptr_);
int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_( ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, round_ptr_, quant_ptr_, quant_shift_ptr_,
quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
dequant_ptr_, &ref_eob, scan_order->scan,
scan_order->iscan);
ASM_REGISTER_STATE_CHECK(
quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
round_ptr_, quant_ptr_, quant_shift_ptr_,
qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
@@ -360,18 +241,18 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
coeff.TopLeftPixel()[rnd(count)] = coeff.TopLeftPixel()[rnd(count)] =
static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_shift_ptr_, dequant_ptr_);
quant_fp_ptr_);
int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_( ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, round_ptr_, quant_ptr_, quant_shift_ptr_,
quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
dequant_ptr_, &ref_eob, scan_order->scan,
scan_order->iscan);
ASM_REGISTER_STATE_CHECK(
quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
round_ptr_, quant_ptr_, quant_shift_ptr_,
qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
@@ -418,10 +299,7 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) {
const int count = (4 << sz) * (4 << sz); const int count = (4 << sz) * (4 << sz);
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_shift_ptr_, dequant_ptr_);
quant_fp_ptr_);
int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
if (i == 0) { if (i == 0) {
// When |coeff values| are less than zbin the results are 0. // When |coeff values| are less than zbin the results are 0.
@@ -441,10 +319,10 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) {
vpx_usec_timer timer; vpx_usec_timer timer;
vpx_usec_timer_start(&timer); vpx_usec_timer_start(&timer);
for (int j = 0; j < 100000000 / count; ++j) { for (int j = 0; j < 100000000 / count; ++j) {
quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), round_ptr_, quant_ptr_, quant_shift_ptr_,
dqcoeff.TopLeftPixel(), dequant_ptr_, &eob, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
scan_order->scan, scan_order->iscan); dequant_ptr_, &eob, scan_order->scan, scan_order->iscan);
} }
vpx_usec_timer_mark(&timer); vpx_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
@@ -467,54 +345,50 @@ INSTANTIATE_TEST_CASE_P(
SSE2, VP9QuantizeTest, SSE2, VP9QuantizeTest,
::testing::Values( ::testing::Values(
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
VPX_BITS_8, 16, false), VPX_BITS_8, 16),
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
VPX_BITS_10, 16, false), VPX_BITS_10, 16),
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
VPX_BITS_12, 16, false), VPX_BITS_12, 16),
make_tuple(&vpx_highbd_quantize_b_32x32_sse2, make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32),
make_tuple(&vpx_highbd_quantize_b_32x32_sse2, make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32),
make_tuple(&vpx_highbd_quantize_b_32x32_sse2, make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32)));
#else #else
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(SSE2, VP9QuantizeTest,
SSE2, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_sse2,
::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, &vpx_quantize_b_c,
VPX_BITS_8, 16, false), VPX_BITS_8, 16)));
make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true)));
#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
DISABLED_SSE2, VP9QuantizeTest,
::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
16)));
#endif // HAVE_SSE2 #endif // HAVE_SSE2
#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
VPX_BITS_8, 16, false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
&QuantFPWrapper<quantize_fp_32x32_nz_c>,
VPX_BITS_8, 32, true)));
#else
INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest, INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_ssse3, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3,
&vpx_quantize_b_c, &vpx_quantize_b_c,
VPX_BITS_8, 16, false))); VPX_BITS_8, 16)));
#endif
#if ARCH_X86_64 #if ARCH_X86_64
// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test. // TODO(johannkoenig): SSSE3 optimizations do not yet pass this test.
INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest, INSTANTIATE_TEST_CASE_P(
::testing::Values(make_tuple( DISABLED_SSSE3, VP9QuantizeTest,
&vpx_quantize_b_32x32_ssse3, ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3,
&vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); &vpx_quantize_b_32x32_c, VPX_BITS_8, 32),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
16),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
VPX_BITS_8, 32)));
#endif // ARCH_X86_64 #endif // ARCH_X86_64
#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
@@ -524,54 +398,36 @@ INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest,
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
AVX, VP9QuantizeTest, AVX, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c,
VPX_BITS_8, 16, false), VPX_BITS_8, 16),
// Even though SSSE3 and AVX do not match the reference // Even though SSSE3 and AVX do not match the reference
// code, we can keep them in sync with each other. // code, we can keep them in sync with each other.
make_tuple(&vpx_quantize_b_32x32_avx, make_tuple(&vpx_quantize_b_32x32_avx,
&vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32, &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32)));
false)));
#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH
#if ARCH_X86_64 && HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
AVX2, VP9QuantizeTest,
::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true)));
#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds. // TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
NEON, VP9QuantizeTest, NEON, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, ::testing::Values(
VPX_BITS_8, 16, false), make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16),
make_tuple(&vpx_quantize_b_32x32_neon, make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
&vpx_quantize_b_32x32_c, VPX_BITS_8, 32, VPX_BITS_8, 32),
false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16),
16, true),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>, &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32)));
VPX_BITS_8, 32, true)));
#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
// Only useful to compare "Speed" test results. // Only useful to compare "Speed" test results.
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
DISABLED_C, VP9QuantizeTest, DISABLED_C, VP9QuantizeTest,
::testing::Values( ::testing::Values(
make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16),
make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
32, false), 32),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>, make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16),
make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
make_tuple(&QuantFPWrapper<quantize_fp_32x32_nz_c>,
&QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32,
true),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>, make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32)));
true)));
} // namespace } // namespace

View File

@@ -47,7 +47,7 @@ class ScaleTest : public VpxScaleBase,
scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); scale_fn_(&img_, &dst_img_, filter_type, phase_scaler));
} }
void RunTest(INTERP_FILTER filter_type) { void RunTest() {
static const int kNumSizesToTest = 20; static const int kNumSizesToTest = 20;
static const int kNumScaleFactorsToTest = 4; static const int kNumScaleFactorsToTest = 4;
static const int kSizesToTest[] = { static const int kSizesToTest[] = {
@@ -55,6 +55,7 @@ class ScaleTest : public VpxScaleBase,
22, 24, 26, 28, 30, 32, 34, 68, 128, 134 22, 24, 26, 28, 30, 32, 34, 68, 128, 134
}; };
static const int kScaleFactors[] = { 1, 2, 3, 4 }; static const int kScaleFactors[] = { 1, 2, 3, 4 };
for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
for (int h = 0; h < kNumSizesToTest; ++h) { for (int h = 0; h < kNumSizesToTest; ++h) {
const int src_height = kSizesToTest[h]; const int src_height = kSizesToTest[h];
@@ -81,8 +82,8 @@ class ScaleTest : public VpxScaleBase,
if (src_width > 4 * dst_width || src_height > 4 * dst_height) { if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
continue; continue;
} }
ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height, ASSERT_NO_FATAL_FAILURE(ResetScaleImages(
dst_width, dst_height)); src_width, src_height, dst_width, dst_height));
ReferenceScaleFrame(filter_type, phase_scaler); ReferenceScaleFrame(filter_type, phase_scaler);
ScaleFrame(filter_type, phase_scaler); ScaleFrame(filter_type, phase_scaler);
if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
@@ -91,8 +92,8 @@ class ScaleTest : public VpxScaleBase,
"filter_type = %d, phase_scaler = %d, src_width = %4d, " "filter_type = %d, phase_scaler = %d, src_width = %4d, "
"src_height = %4d, dst_width = %4d, dst_height = %4d, " "src_height = %4d, dst_width = %4d, dst_height = %4d, "
"scale factor = %d:%d\n", "scale factor = %d:%d\n",
filter_type, phase_scaler, src_width, src_height, dst_width, filter_type, phase_scaler, src_width, src_height,
dst_height, sf_down, sf_up); dst_width, dst_height, sf_down, sf_up);
PrintDiff(); PrintDiff();
} }
CompareImages(dst_img_); CompareImages(dst_img_);
@@ -103,6 +104,7 @@ class ScaleTest : public VpxScaleBase,
} }
} }
} }
}
void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt, void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt,
const int stride, const int width, const int height, const int stride, const int width, const int height,
@@ -143,10 +145,7 @@ class ScaleTest : public VpxScaleBase,
ScaleFrameFunc scale_fn_; ScaleFrameFunc scale_fn_;
}; };
TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); } TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); }
TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); }
TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); }
TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); }
TEST_P(ScaleTest, DISABLED_Speed) { TEST_P(ScaleTest, DISABLED_Speed) {
static const int kCountSpeedTestBlock = 100; static const int kCountSpeedTestBlock = 100;

View File

@@ -147,6 +147,7 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
// Multi-threaded decode tests // Multi-threaded decode tests
#if CONFIG_WEBM_IO #if CONFIG_WEBM_IO
struct FileList { struct FileList {
const char *name; const char *name;

72
tools/all_builds.py Executable file
View File

@@ -0,0 +1,72 @@
#!/usr/bin/python
import getopt
import subprocess
import sys
# Long options recognized before the "--" separator on the command line.
LONG_OPTIONS = ["shard=", "shards="]
# Configure flags common to every tested build configuration.
BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental"
def RunCommand(command):
  """Run |command| through the shell; abort the script on a nonzero status.

  Output is not captured and flows straight to this process's stdout/stderr.
  """
  run = subprocess.Popen(command, shell=True)
  run.communicate()  # Wait for the command to finish.
  if run.returncode:
    # print() with a single pre-formatted string works on Python 2 and 3;
    # the original "print x" statement was Python-2-only.
    print("Non-zero return code: " + str(run.returncode) + " => exiting!")
    sys.exit(1)
def list_of_experiments():
  """Parse ./configure and return the experiment names it declares.

  Reads the lines between 'EXPERIMENT_LIST="' and the closing '"', strips
  the 4-character indent, and filters out experiments known to be broken.

  Returns:
    A list of experiment name strings, in file order.
  """
  experiments = []
  currently_broken = ["csm"]  # known-broken experiments to skip
  list_start = False
  # Use a context manager so the file handle is closed deterministically
  # (the original implementation never closed it).
  with open("configure") as configure_file:
    for line in configure_file.read().split("\n"):
      if line == 'EXPERIMENT_LIST="':
        list_start = True
      elif line == '"':
        list_start = False
      elif list_start:
        experiment = line[4:]
        if experiment not in currently_broken:
          experiments.append(experiment)
  return experiments
def main(argv):
  """Entry point: build every sharded experiment configuration.

  Usage: all_builds.py [--shard=<n> --shards=<n>] -- [configure flag ...]
  Arguments after "--" are appended verbatim to every configure command.
  """
  # Parse arguments.
  options = {"--shard": 0, "--shards": 1}
  if "--" in argv:
    opt_end_index = argv.index("--")
  else:
    opt_end_index = len(argv)
  try:
    o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS)
  except getopt.GetoptError as err:
    # "except ... as err" is valid on Python 2.6+ and Python 3; the original
    # "except GetoptError, err" form is Python-2-only.
    print(str(err))
    print("Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"
          % argv[0])
    sys.exit(2)
  options.update(o)
  extra_args = argv[opt_end_index + 1:]

  # Shard the experiment list: config i belongs to shard (i % shards).
  shard = int(options["--shard"])
  shards = int(options["--shards"])
  experiments = list_of_experiments()
  base_command = " ".join([BASE_COMMAND] + extra_args)
  configs = [base_command]
  configs += ["%s --enable-%s" % (base_command, e) for e in experiments]
  my_configs = [c for i, c in enumerate(configs) if i % shards == shard]

  # Run configs for this shard.
  for config in my_configs:
    test_build(config)
def test_build(configure_command):
  """Configure and build the tree once, aborting on any failure.

  RunCommand exits the script on error, so reaching the end means success.
  """
  # Blue-on-white ANSI banner so each configuration stands out in the log.
  # print() with one pre-formatted string is valid on Python 2 and 3; the
  # original bare print statement was Python-2-only.
  print("\033[34m\033[47mTesting %s\033[0m" % (configure_command))
  RunCommand(configure_command)
  RunCommand("make clean")
  RunCommand("make")
# Allow use both as a standalone script and as an importable module.
if __name__ == "__main__":
  main(sys.argv)

15
tools/author_first_release.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
##
## List the release each author first contributed to.
##
## Usage: author_first_release.sh [TAGS]
##
## If the TAGS arguments are unspecified, all tags reported by `git tag`
## will be considered.
##
# Tags to inspect: the command-line arguments, or every tag if none given.
tags=${@:-$(git tag)}
for tag in $tags; do
  # For each tag: list its authors (shortlog -s gives "count<TAB>author"),
  # drop the count, and prefix each author with the release number
  # (the tag with its leading "v" stripped).
  git shortlog -n -e -s $tag |
  cut -f2- |
  awk "{print \"${tag#v}\t\"\$0}"
# Sort by author and collapse duplicates, keeping one line per author —
# the earliest release they appear in.
done | sort -k2 | uniq -f2

158
tools/ftfy.sh Executable file
View File

@@ -0,0 +1,158 @@
#!/bin/sh
# Path of this script as invoked; used for the usage text and log prefix.
self="$0"
# Directory containing this script; the companion helpers
# intersect-diffs.py and wrap-commit-msg.py are looked up here.
dirname_self=$(dirname "$self")
# Print usage help to stderr, remove the temp files, and abort.
# NOTE: no comments may be added inside the heredoc — its body is the
# program's output.
usage() {
  cat <<EOF >&2
Usage: $self [option]

This script applies a whitespace transformation to the commit at HEAD. If no
options are given, then the modified files are left in the working tree.

Options:
  -h, --help     Shows this message
  -n, --dry-run  Shows a diff of the changes to be made.
  --amend        Squashes the changes into the commit at HEAD
                     This option will also reformat the commit message.
  --commit       Creates a new commit containing only the whitespace changes
  --msg-only     Reformat the commit message only, ignore the patch itself.

EOF
  rm -f ${CLEAN_FILES}
  exit 1
}
# Print a message to stderr, prefixed with this script's basename.
log() {
  echo "${self##*/}: $@" >&2
}
# Reformat each C/C++ source file argument in place with clang-format,
# using the project's .clang-format (--style=file). Non-source arguments
# are silently skipped.
vpx_style() {
  for f; do
    case "$f" in
      *.h|*.c|*.cc)
        clang-format -i --style=file "$f"
        ;;
    esac
  done
}
# Apply diff file $1 to the working tree, but only when INTERSECT_RESULT is
# nonzero — i.e. when intersect-diffs.py reported output (presumably
# "changes found"; confirm against intersect-diffs.py's exit convention).
apply() {
  [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1"
}
# Create a new commit containing the whitespace changes, with a Change-Id
# derived deterministically from HEAD's Change-Id. Requires HEAD to carry a
# Change-Id trailer. NOTE: the heredoc is the commit message — do not add
# comments inside it.
commit() {
  LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}')
  if [ -z "$LAST_CHANGEID" ]; then
    log "HEAD doesn't have a Change-Id, unable to generate a new commit"
    exit 1
  fi

  # Build a deterministic Change-Id from the parent's
  NEW_CHANGEID=${LAST_CHANGEID}-styled
  NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin)

  # Commit, preserving authorship from the parent commit.
  git commit -a -C HEAD > /dev/null
  git commit --amend -F- << EOF
Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9}

Change-Id: ${NEW_CHANGEID}
EOF
}
# If diff_msg found the reformatted commit message to differ, print a
# unified diff of the change (tail -n +3 drops the ---/+++ header lines).
show_commit_msg_diff() {
  if [ $DIFF_MSG_RESULT -ne 0 ]; then
    log "Modified commit message:"
    diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3
  fi
}
# Show the reworded commit message, then amend HEAD when either the message
# (DIFF_MSG_RESULT) or the whitespace patch (INTERSECT_RESULT) changed.
amend() {
  show_commit_msg_diff
  if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then
    git commit -a --amend -F "$NEW_COMMIT_MSG"
  fi
}
# Rewrap HEAD's commit message through wrap-commit-msg.py and record in
# DIFF_MSG_RESULT whether it changed (cmp exit status: nonzero => modified).
diff_msg() {
  git log -1 --format=%B > "$ORIG_COMMIT_MSG"
  "${dirname_self}"/wrap-commit-msg.py \
      < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG"
  cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG"
  DIFF_MSG_RESULT=$?
}
# Temporary files ($$ = this shell's PID, keeping names unique per run)
ORIG_DIFF=orig.diff.$$
MODIFIED_DIFF=modified.diff.$$
FINAL_DIFF=final.diff.$$
ORIG_COMMIT_MSG=orig.commit-msg.$$
NEW_COMMIT_MSG=new.commit-msg.$$
CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}"
CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}"

# Preconditions: at most one option, clang-format available, clean tree.
[ $# -lt 2 ] || usage

if ! clang-format -version >/dev/null 2>&1; then
  log "clang-format not found"
  exit 1
fi

if ! git diff --quiet HEAD; then
  log "Working tree is dirty, commit your changes first"
  exit 1
fi

# Need to be in the root
cd "$(git rev-parse --show-toplevel)"

# Collect the original diff
git show > "${ORIG_DIFF}"

# Apply the style guide on new and modified files and collect its diff
# (third_party/ sources are intentionally left untouched).
for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do
  case "$f" in
    third_party/*) continue;;
  esac
  vpx_style "$f"
done
git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}"

# Intersect the two diffs, keeping only style fixes that touch lines the
# commit itself changed; then restore the pristine tree.
"${dirname_self}"/intersect-diffs.py \
    "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}"
INTERSECT_RESULT=$?
git reset --hard >/dev/null

# Fixup the commit message
diff_msg

# Handle options
if [ -n "$1" ]; then
  case "$1" in
    -h|--help) usage;;
    -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;;
    --commit) apply "${FINAL_DIFF}"; commit;;
    --amend) apply "${FINAL_DIFF}"; amend;;
    --msg-only) amend;;
    *) usage;;
  esac
else
  # Default: leave the whitespace fixes in the working tree for review.
  apply "${FINAL_DIFF}"
  if ! git diff --quiet; then
    log "Formatting changes applied, verify and commit."
    log "See also: http://www.webmproject.org/code/contribute/conventions/"
    git diff --stat
  fi
fi

rm -f ${CLEAN_FILES}

View File

@@ -37,9 +37,7 @@ extern "C" {
#define SEGMENT_DELTADATA 0 #define SEGMENT_DELTADATA 0
#define SEGMENT_ABSDATA 1 #define SEGMENT_ABSDATA 1
typedef struct { typedef struct { int r, c; } POS;
int r, c;
} POS;
#define PLANE_TYPE_Y_NO_DC 0 #define PLANE_TYPE_Y_NO_DC 0
#define PLANE_TYPE_Y2 1 #define PLANE_TYPE_Y2 1
@@ -182,9 +180,6 @@ typedef struct {
unsigned int low_res_ref_frames[MAX_REF_FRAMES]; unsigned int low_res_ref_frames[MAX_REF_FRAMES];
// The video frame counter value for the key frame, for lowest resolution. // The video frame counter value for the key frame, for lowest resolution.
unsigned int key_frame_counter_value; unsigned int key_frame_counter_value;
// Flags to signal skipped encoding of previous and base layer stream.
unsigned int skip_encoding_prev_stream;
unsigned int skip_encoding_base_stream;
LOWER_RES_MB_INFO *mb_info; LOWER_RES_MB_INFO *mb_info;
} LOWER_RES_FRAME_INFO; } LOWER_RES_FRAME_INFO;
#endif #endif

View File

@@ -20,7 +20,8 @@ static void copy_and_extend_plane(unsigned char *s, /* source */
int et, /* extend top border */ int et, /* extend top border */
int el, /* extend left border */ int el, /* extend left border */
int eb, /* extend bottom border */ int eb, /* extend bottom border */
int er) { /* extend right border */ int er /* extend right border */
) {
int i; int i;
unsigned char *src_ptr1, *src_ptr2; unsigned char *src_ptr1, *src_ptr2;
unsigned char *dest_ptr1, *dest_ptr2; unsigned char *dest_ptr1, *dest_ptr2;

View File

@@ -12,7 +12,7 @@
#include "vpx_mem/vpx_mem.h" #include "vpx_mem/vpx_mem.h"
void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
int stride, char *eobs) { int stride, int8_t *eobs) {
int i, j; int i, j;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
@@ -33,7 +33,8 @@ void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
} }
void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu,
uint8_t *dstv, int stride, char *eobs) { uint8_t *dstv, int stride,
int8_t *eobs) {
int i, j; int i, j;
for (i = 0; i < 2; i++) { for (i = 0; i < 2; i++) {

View File

@@ -461,87 +461,96 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
); );
} }
/* clang-format off */
#define VP8_MBLOOP_HPSRAB \ #define VP8_MBLOOP_HPSRAB \
"punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
"psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
"psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" \
"packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" \
"psrah %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
"packsshb %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
#define VP8_MBLOOP_HPSRAB_PMULHH(reg1, reg2) \
"pmulhh " #reg1 ", " #reg1 ", " #reg2 " \n\t"
#define VP8_MBLOOP_HPSRAB_ADD(reg) \ #define VP8_MBLOOP_HPSRAB_ADD(reg) \
"punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
"punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
"pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
"pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
"paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \ VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp3], reg) \
"paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp8], reg) \
"psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ "paddh %[ftmp3], %[ftmp3], %[ff_ph_003f] \n\t" \
"psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \
"packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" \
/* clang-format on */ "psrah %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
"packsshb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
void vp8_mbloop_filter_horizontal_edge_mmi( void vp8_mbloop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) { const unsigned char *limit, const unsigned char *thresh, int count) {
uint32_t tmp[1]; uint32_t tmp[1];
double ftmp[13]; mips_reg addr[2];
DECLARE_ALIGNED(8, const uint64_t, srct[1]);
double ftmp[10];
__asm__ volatile ( __asm__ volatile (
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
"1: \n\t" "1: \n\t"
"gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t"
/* ftmp1: p3 */
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
/* ftmp3: p2 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
/* ftmp4: p1 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
/* ftmp5: p0 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
/* ftmp6: q0 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
/* ftmp7: q1 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
/* ftmp8: q2 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
/* ftmp2: q3 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t"
"gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
"gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[addr1], %[addr0], %[tmp0])
"gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
"psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
/* ftmp4:p1 */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t"
"psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" /* ftmp5:p0 */
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp4], %[ftmp5] \n\t"
"sdc1 %[ftmp1], 0x00(%[srct]) \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
"psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" /* ftmp6:q0 */
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
/* ftmp7:q1 */
"gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
"pasubub %[ftmp1], %[ftmp7], %[ftmp6] \n\t"
"sdc1 %[ftmp1], 0x08(%[srct]) \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
MMI_ADDU(%[addr1], %[addr0], %[tmp0])
"gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
@@ -554,7 +563,9 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
"mtc1 %[tmp0], %[ftmp9] \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" "gsldlc1 %[ftmp9], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[blimit]) \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
/* ftmp0: mask */ /* ftmp0: mask */
@@ -562,8 +573,10 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
"gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t"
"psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" "ldc1 %[ftmp1], 0x00(%[srct]) \n\t"
"psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"ldc1 %[ftmp2], 0x08(%[srct]) \n\t"
"psubusb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
@@ -575,13 +588,14 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
"xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
"xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
"psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
"pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" "sdc1 %[ftmp2], 0x00(%[srct]) \n\t"
"and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
"li %[tmp0], 0x0b \n\t" "li %[tmp0], 0x0b \n\t"
@@ -592,66 +606,70 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
"paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t"
VP8_MBLOOP_HPSRAB VP8_MBLOOP_HPSRAB
"psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
"ldc1 %[ftmp2], 0x00(%[srct]) \n\t"
"pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t"
"li %[tmp0], 0x07 \n\t" "li %[tmp0], 0x07 \n\t"
"mtc1 %[tmp0], %[ftmp9] \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t"
"xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
"psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" "psubsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" "paddsb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
"xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
"paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" "paddsb %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
"psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" "psubsb %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
"xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
"xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
"xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" MMI_SUBU(%[addr1], %[addr0], %[tmp0])
"paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
"xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
"xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" "gsldlc1 %[ftmp7], 0x07(%[addr1]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) "gsldrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
"gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) "paddsb %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
"gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" "psubsb %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
"gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp7], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[addr1], %[addr0], %[tmp0])
"gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
"addiu %[count], %[count], -0x01 \n\t" "addiu %[count], %[count], -0x01 \n\t"
MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
"bnez %[count], 1b \n\t" "bnez %[count], 1b \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
[src_ptr]"+&r"(src_ptr), [count]"+&r"(count) [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
: [limit]"r"(limit), [blimit]"r"(blimit), : [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh), [srct]"r"(srct), [thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step), [src_pixel_step]"r"((mips_reg)src_pixel_step),
[ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
[ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03),
@@ -678,60 +696,64 @@ void vp8_mbloop_filter_vertical_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) { const unsigned char *limit, const unsigned char *thresh, int count) {
mips_reg tmp[1]; mips_reg tmp[1];
mips_reg addr[2];
DECLARE_ALIGNED(8, const uint64_t, srct[1]); DECLARE_ALIGNED(8, const uint64_t, srct[1]);
double ftmp[14]; double ftmp[13];
__asm__ volatile ( __asm__ volatile (
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
"1: \n\t" "1: \n\t"
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" MMI_ADDU(%[addr0], %[src_ptr], %[tmp0])
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
"punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t"
"punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t"
"punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
"punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t"
"punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t"
"punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t"
"punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
"punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
"punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
"punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
"punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
"punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
"gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
"punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
"punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
"punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t"
"punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t"
"gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t"
/* ftmp9:q0 ftmp10:q1 */ /* ftmp9:q0 ftmp10:q1 */
"punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
"punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
@@ -749,61 +771,60 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
"gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t"
/* abs (q3-q2) */ /* abs (q3-q2) */
"pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
"psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
/* abs (q2-q1) */ /* abs (q2-q1) */
"pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp3: abs(q1-q0) */ /* ftmp3: abs(q1-q0) */
"pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
"psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp4: abs(p1-p0) */ /* ftmp4: abs(p1-p0) */
"pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
"psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p2-p1) */ /* abs (p2-p1) */
"pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p3-p2) */ /* abs (p3-p2) */
"pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p0-q0) */
"gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t"
"gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t"
/* abs (p0-q0) * 2 */
"pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* abs (p1-q1) / 2 */ /* abs (p1-q1) */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
"li %[tmp0], 0x01 \n\t" "li %[tmp0], 0x01 \n\t"
"mtc1 %[tmp0], %[ftmp8] \n\t" "mtc1 %[tmp0], %[ftmp8] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
"psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
"gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t"
"psubusb %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp12] \n\t" "or %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
"xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
/* ftmp0: mask */
"pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
/* abs(p1-p0) - thresh */ "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t"
"psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t" "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t"
/* abs(q1-q0) - thresh */ /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */
"psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
"psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
"or %[ftmp3], %[ftmp4], %[ftmp3] \n\t" "or %[ftmp3], %[ftmp4], %[ftmp3] \n\t"
"pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* ftmp1: hev */
"xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
/* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
"xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" "xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t"
"xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
"xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
@@ -816,30 +837,30 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* filter_value &= mask */
"and %[ftmp0], %[ftmp0], %[ftmp3] \n\t" "and %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
/* Filter2 = filter_value & hev */
"and %[ftmp3], %[ftmp1], %[ftmp0] \n\t" "and %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
/* filter_value &= ~hev */
"pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
"paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
"li %[tmp0], 0x0b \n\t" "li %[tmp0], 0x0b \n\t"
"mtc1 %[tmp0], %[ftmp12] \n\t" "mtc1 %[tmp0], %[ftmp12] \n\t"
"xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
"xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
/* ftmp9: qs0 */
"psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t"
"xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
"xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
/* ftmp6: ps0 */
"paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"li %[tmp0], 0x07 \n\t" "li %[tmp0], 0x07 \n\t"
@@ -851,10 +872,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
VP8_MBLOOP_VPSRAB_ADDT VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t"
/* ftmp9: oq0 */
"xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" "xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t"
/* ftmp6: op0 */
"xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" "xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t"
VP8_MBLOOP_VPSRAB_ADDH VP8_MBLOOP_VPSRAB_ADDH
@@ -863,10 +882,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
VP8_MBLOOP_VPSRAB_ADDT VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t"
/* ftmp10: oq1 */
"xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" "xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
/* ftmp5: op1 */
"xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" "xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t"
VP8_MBLOOP_VPSRAB_ADDH VP8_MBLOOP_VPSRAB_ADDH
@@ -874,10 +891,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t"
VP8_MBLOOP_VPSRAB_ADDT VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t"
/* ftmp11: oq2 */
"xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" "xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t"
/* ftmp2: op2 */
"xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" "xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t"
"ldc1 %[ftmp12], 0x00(%[srct]) \n\t" "ldc1 %[ftmp12], 0x00(%[srct]) \n\t"
@@ -901,40 +916,41 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t" "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t"
"punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
"punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
"punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
"punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t"
"punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
"punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t"
"punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"addiu %[count], %[count], -0x01 \n\t" "addiu %[count], %[count], -0x01 \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
@@ -946,9 +962,9 @@ void vp8_mbloop_filter_vertical_edge_mmi(
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
[tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), [addr0]"=&r"(addr[0]),
[count]"+&r"(count) [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
: [limit]"r"(limit), [blimit]"r"(blimit), : [limit]"r"(limit), [blimit]"r"(blimit),
[srct]"r"(srct), [thresh]"r"(thresh), [srct]"r"(srct), [thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step), [src_pixel_step]"r"((mips_reg)src_pixel_step),

View File

@@ -86,7 +86,6 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
register double ftmp8 asm("$f18"); register double ftmp8 asm("$f18");
register double ftmp9 asm("$f20"); register double ftmp9 asm("$f20");
register double ftmp10 asm("$f22"); register double ftmp10 asm("$f22");
register double ftmp11 asm("$f24");
#else #else
register double fzero asm("$f0"); register double fzero asm("$f0");
register double ftmp0 asm("$f1"); register double ftmp0 asm("$f1");
@@ -100,7 +99,6 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
register double ftmp8 asm("$f9"); register double ftmp8 asm("$f9");
register double ftmp9 asm("$f10"); register double ftmp9 asm("$f10");
register double ftmp10 asm("$f11"); register double ftmp10 asm("$f11");
register double ftmp11 asm("$f12");
#endif // _MIPS_SIM == _ABIO32 #endif // _MIPS_SIM == _ABIO32
__asm__ volatile ( __asm__ volatile (
@@ -114,13 +112,11 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
"li %[tmp0], 0x07 \n\t" "li %[tmp0], 0x07 \n\t"
"mtc1 %[tmp0], %[ftmp7] \n\t" "mtc1 %[tmp0], %[ftmp7] \n\t"
"li %[tmp0], 0x08 \n\t" "li %[tmp0], 0x08 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t" "mtc1 %[tmp0], %[ftmp10] \n\t"
"1: \n\t" "1: \n\t"
"gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t"
"gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t"
"punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t"
@@ -129,21 +125,24 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
"pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "gsldlc1 %[ftmp9], 0x06(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp9], -0x01(%[src_ptr]) \n\t"
"punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" "dsrl %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
"punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" "dsrl %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
"punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
@@ -164,9 +163,8 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
[ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
[ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
[ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
[ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
[output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
[src_ptr]"+&r"(src_ptr)
: [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
[vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width),
[ff_ph_40]"f"(ff_ph_40) [ff_ph_40]"f"(ff_ph_40)
@@ -192,11 +190,6 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
register double ftmp6 asm("$f14"); register double ftmp6 asm("$f14");
register double ftmp7 asm("$f16"); register double ftmp7 asm("$f16");
register double ftmp8 asm("$f18"); register double ftmp8 asm("$f18");
register double ftmp9 asm("$f20");
register double ftmp10 asm("$f22");
register double ftmp11 asm("$f24");
register double ftmp12 asm("$f26");
register double ftmp13 asm("$f28");
#else #else
register double fzero asm("$f0"); register double fzero asm("$f0");
register double ftmp0 asm("$f1"); register double ftmp0 asm("$f1");
@@ -208,11 +201,6 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
register double ftmp6 asm("$f7"); register double ftmp6 asm("$f7");
register double ftmp7 asm("$f8"); register double ftmp7 asm("$f8");
register double ftmp8 asm("$f9"); register double ftmp8 asm("$f9");
register double ftmp9 asm("$f10");
register double ftmp10 asm("$f11");
register double ftmp11 asm("$f12");
register double ftmp12 asm("$f13");
register double ftmp13 asm("$f14");
#endif // _MIPS_SIM == _ABIO32 #endif // _MIPS_SIM == _ABIO32
__asm__ volatile ( __asm__ volatile (
@@ -222,56 +210,52 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
"ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t"
"ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
"ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[pixels_per_line_x2])
"xor %[fzero], %[fzero], %[fzero] \n\t" "xor %[fzero], %[fzero], %[fzero] \n\t"
"li %[tmp0], 0x07 \n\t" "li %[tmp0], 0x07 \n\t"
"mtc1 %[tmp0], %[ftmp13] \n\t" "mtc1 %[tmp0], %[ftmp7] \n\t"
/* In order to make full use of memory load delay slot,
* Operation of memory loading and calculating has been rearranged.
*/
"1: \n\t" "1: \n\t"
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
"pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line])
"gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
"gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
"gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
"gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
"gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
"pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" "gsswlc1 %[ftmp8], 0x03(%[output_ptr]) \n\t"
"gsswrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t"
"pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t"
"pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
"pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t"
"psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
"packushb %[ftmp12], %[ftmp12], %[fzero] \n\t"
"gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t"
"gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t"
MMI_ADDIU(%[output_height], %[output_height], -0x01) MMI_ADDIU(%[output_height], %[output_height], -0x01)
MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
@@ -281,11 +265,9 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
[ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4),
[ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
[ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
[ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]),
[ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), [src_ptr]"+&r"(src_ptr), [output_ptr]"+&r"(output_ptr),
[ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), [output_height]"+&r"(output_height)
[addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr),
[output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
: [pixels_per_line]"r"((mips_reg)pixels_per_line), : [pixels_per_line]"r"((mips_reg)pixels_per_line),
[pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
[pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
@@ -319,7 +301,6 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
"1: \n\t" "1: \n\t"
"gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
"punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t"
"gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t"
@@ -327,6 +308,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
"addiu %[output_height], %[output_height], -0x01 \n\t" "addiu %[output_height], %[output_height], -0x01 \n\t"
MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width])
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
"bnez %[output_height], 1b \n\t" "bnez %[output_height], 1b \n\t"
: [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
[ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr),
@@ -356,12 +338,12 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
"1: \n\t" "1: \n\t"
"gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
MMI_ADDIU(%[output_height], %[output_height], -0x01)
"packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t"
"gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t"
"gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
MMI_ADDIU(%[output_height], %[output_height], -0x01)
MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
"bnez %[output_height], 1b \n\t" "bnez %[output_height], 1b \n\t"
: [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
@@ -404,7 +386,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
} \ } \
} else { \ } else { \
for (i = 0; i < loop; ++i) { \ for (i = 0; i < loop; ++i) { \
vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ vp8_filter_block1dc_v6_mmi(FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, \
dst_pitch, n * 2, VFilter); \ dst_pitch, n * 2, VFilter); \
} \ } \
} \ } \

View File

@@ -11,16 +11,28 @@
#include "entropy.h" #include "entropy.h"
const int vp8_mode_contexts[6][4] = { const int vp8_mode_contexts[6][4] = {
{ /* 0 */ {
7, 1, 1, 143 }, /* 0 */
{ /* 1 */ 7, 1, 1, 143,
14, 18, 14, 107 }, },
{ /* 2 */ {
135, 64, 57, 68 }, /* 1 */
{ /* 3 */ 14, 18, 14, 107,
60, 56, 128, 65 }, },
{ /* 4 */ {
159, 134, 128, 34 }, /* 2 */
{ /* 5 */ 135, 64, 57, 68,
234, 188, 128, 28 }, },
{
/* 3 */
60, 56, 128, 65,
},
{
/* 4 */
159, 134, 128, 34,
},
{
/* 5 */
234, 188, 128, 28,
},
}; };

View File

@@ -1,13 +1,3 @@
##
## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
sub vp8_common_forward_decls() { sub vp8_common_forward_decls() {
print <<EOF print <<EOF
/* /*

View File

@@ -95,7 +95,9 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
int src_pixels_per_line, int xoffset, int src_pixels_per_line, int xoffset,
int yoffset, unsigned char *dst_ptr, int yoffset, unsigned char *dst_ptr,
int dst_pitch) { int dst_pitch
) {
DECLARE_ALIGNED(16, unsigned short, DECLARE_ALIGNED(16, unsigned short,
FData2[24 * 24]); /* Temp data bufffer used in filtering */ FData2[24 * 24]); /* Temp data bufffer used in filtering */
@@ -234,7 +236,9 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
int src_pixels_per_line, int xoffset, int src_pixels_per_line, int xoffset,
int yoffset, unsigned char *dst_ptr, int yoffset, unsigned char *dst_ptr,
int dst_pitch) { int dst_pitch
) {
DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
if (xoffset) { if (xoffset) {

View File

@@ -674,7 +674,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi,
static int read_is_valid(const unsigned char *start, size_t len, static int read_is_valid(const unsigned char *start, size_t len,
const unsigned char *end) { const unsigned char *end) {
return len != 0 && end > start && len <= (size_t)(end - start); return (start + len > start && start + len <= end);
} }
static unsigned int read_available_partition_size( static unsigned int read_available_partition_size(

View File

@@ -34,9 +34,7 @@ typedef struct {
/* Structure used to hold all the overlaps of a macroblock. The overlaps of a /* Structure used to hold all the overlaps of a macroblock. The overlaps of a
* macroblock is further divided into block overlaps. * macroblock is further divided into block overlaps.
*/ */
typedef struct { typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP;
B_OVERLAP overlaps[16];
} MB_OVERLAP;
/* Structure for keeping track of motion vectors and which reference frame they /* Structure for keeping track of motion vectors and which reference frame they
* refer to. Used for motion vector interpolation. * refer to. Used for motion vector interpolation.

View File

@@ -31,9 +31,7 @@ typedef struct {
void *ptr2; void *ptr2;
} DECODETHREAD_DATA; } DECODETHREAD_DATA;
typedef struct { typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC;
MACROBLOCKD mbd;
} MB_ROW_DEC;
typedef struct { typedef struct {
int enabled; int enabled;

View File

@@ -739,21 +739,24 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
/* Allocate memory for above_row buffers. */ /* Allocate memory for above_row buffers. */
CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i) for (i = 0; i < pc->mb_rows; ++i)
CHECK_MEM_ERROR(pbi->mt_yabove_row[i], CHECK_MEM_ERROR(
vpx_memalign(16, sizeof(unsigned char) * pbi->mt_yabove_row[i],
(width + (VP8BORDERINPIXELS << 1)))); vpx_memalign(
16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1))));
CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i) for (i = 0; i < pc->mb_rows; ++i)
CHECK_MEM_ERROR(pbi->mt_uabove_row[i], CHECK_MEM_ERROR(
vpx_memalign(16, sizeof(unsigned char) * pbi->mt_uabove_row[i],
(uv_width + VP8BORDERINPIXELS))); vpx_memalign(16,
sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i) for (i = 0; i < pc->mb_rows; ++i)
CHECK_MEM_ERROR(pbi->mt_vabove_row[i], CHECK_MEM_ERROR(
vpx_memalign(16, sizeof(unsigned char) * pbi->mt_vabove_row[i],
(uv_width + VP8BORDERINPIXELS))); vpx_memalign(16,
sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
/* Allocate memory for left_col buffers. */ /* Allocate memory for left_col buffers. */
CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);

View File

@@ -989,8 +989,8 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
bits_per_mb_at_this_q = bits_per_mb_at_this_q =
vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
bits_per_mb_at_this_q = bits_per_mb_at_this_q = (int)(.5 +
(int)(.5 + err_correction_factor * speed_correction * err_correction_factor * speed_correction *
cpi->twopass.est_max_qcorrection_factor * cpi->twopass.est_max_qcorrection_factor *
cpi->twopass.section_max_qfactor * cpi->twopass.section_max_qfactor *
(double)bits_per_mb_at_this_q); (double)bits_per_mb_at_this_q);
@@ -1086,7 +1086,8 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
bits_per_mb_at_this_q = bits_per_mb_at_this_q =
(int)(.5 + err_correction_factor * speed_correction * clip_iifactor * (int)(.5 +
err_correction_factor * speed_correction * clip_iifactor *
(double)bits_per_mb_at_this_q); (double)bits_per_mb_at_this_q);
/* Mode and motion overhead */ /* Mode and motion overhead */
@@ -1272,7 +1273,8 @@ void vp8_init_second_pass(VP8_COMP *cpi) {
* sum duration is not. Its calculated based on the actual durations of * sum duration is not. Its calculated based on the actual durations of
* all frames from the first pass. * all frames from the first pass.
*/ */
vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / vp8_new_framerate(cpi,
10000000.0 * cpi->twopass.total_stats.count /
cpi->twopass.total_stats.duration); cpi->twopass.total_stats.duration);
cpi->output_framerate = cpi->framerate; cpi->output_framerate = cpi->framerate;
@@ -1737,8 +1739,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
/* Dont break out very close to a key frame */ /* Dont break out very close to a key frame */
((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
(!flash_detected) && (!flash_detected) && ((mv_ratio_accumulator > 100.0) ||
((mv_ratio_accumulator > 100.0) ||
(abs_mv_in_out_accumulator > 3.0) || (abs_mv_in_out_accumulator > 3.0) ||
(mv_in_out_accumulator < -2.0) || (mv_in_out_accumulator < -2.0) ||
((boost_score - old_boost_score) < 2.0)))) { ((boost_score - old_boost_score) < 2.0)))) {
@@ -1814,8 +1815,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(next_frame.pcnt_inter > 0.75) && (next_frame.pcnt_inter > 0.75) &&
((mv_in_out_accumulator / (double)i > -0.2) || ((mv_in_out_accumulator / (double)i > -0.2) ||
(mv_in_out_accumulator > -2.0)) && (mv_in_out_accumulator > -2.0)) &&
(cpi->gfu_boost > 100) && (cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <=
(cpi->twopass.gf_decay_rate <=
(ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))))
#endif #endif
{ {

View File

@@ -2862,6 +2862,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
fclose(yframe); fclose(yframe);
} }
#endif #endif
/* return of 0 means drop frame */
#if !CONFIG_REALTIME_ONLY #if !CONFIG_REALTIME_ONLY
/* Function to test for conditions that indeicate we should loop /* Function to test for conditions that indeicate we should loop
@@ -3363,6 +3364,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
(LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info;
if (cpi->oxcf.mr_encoder_id) { if (cpi->oxcf.mr_encoder_id) {
// TODO(marpan): This constraint shouldn't be needed, as we would like
// to allow for key frame setting (forced or periodic) defined per
// spatial layer. For now, keep this in.
cm->frame_type = low_res_frame_info->frame_type;
// Check if lower resolution is available for motion vector reuse. // Check if lower resolution is available for motion vector reuse.
if (cm->frame_type != KEY_FRAME) { if (cm->frame_type != KEY_FRAME) {
cpi->mr_low_res_mv_avail = 1; cpi->mr_low_res_mv_avail = 1;
@@ -3387,16 +3393,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
== low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]); == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
*/ */
} }
// Disable motion vector reuse (i.e., disable any usage of the low_res)
// if the previous lower stream is skipped/disabled.
if (low_res_frame_info->skip_encoding_prev_stream) {
cpi->mr_low_res_mv_avail = 0;
} }
}
// This stream is not skipped (i.e., it's being encoded), so set this skip
// flag to 0. This is needed for the next stream (i.e., which is the next
// frame to be encoded).
low_res_frame_info->skip_encoding_prev_stream = 0;
// On a key frame: For the lowest resolution, keep track of the key frame // On a key frame: For the lowest resolution, keep track of the key frame
// counter value. For the higher resolutions, reset the current video // counter value. For the higher resolutions, reset the current video
@@ -4785,6 +4782,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
cpi->temporal_pattern_counter++; cpi->temporal_pattern_counter++;
} }
/* reset to normal state now that we are done. */
#if 0 #if 0
{ {
char filename[512]; char filename[512];
@@ -5000,13 +4999,10 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
// be received for that high layer, which will yield an incorrect // be received for that high layer, which will yield an incorrect
// frame rate (from time-stamp adjustment in above calculation). // frame rate (from time-stamp adjustment in above calculation).
if (cpi->oxcf.mr_encoder_id) { if (cpi->oxcf.mr_encoder_id) {
if (!low_res_frame_info->skip_encoding_base_stream)
cpi->ref_framerate = low_res_frame_info->low_res_framerate; cpi->ref_framerate = low_res_frame_info->low_res_framerate;
} else { } else {
// Keep track of frame rate for lowest resolution. // Keep track of frame rate for lowest resolution.
low_res_frame_info->low_res_framerate = cpi->ref_framerate; low_res_frame_info->low_res_framerate = cpi->ref_framerate;
// The base stream is being encoded so set skip flag to 0.
low_res_frame_info->skip_encoding_base_stream = 0;
} }
} }
#endif #endif

View File

@@ -1052,7 +1052,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
* overflow when values are large * overflow when values are large
*/ */
projected_size_based_on_q = projected_size_based_on_q =
(int)(((.5 + rate_correction_factor * (int)(((.5 +
rate_correction_factor *
vp8_bits_per_mb[cpi->common.frame_type][Q]) * vp8_bits_per_mb[cpi->common.frame_type][Q]) *
cpi->common.MBs) / cpi->common.MBs) /
(1 << BPER_MB_NORMBITS)); (1 << BPER_MB_NORMBITS));

View File

@@ -23,7 +23,6 @@
#include "modecosts.h" #include "modecosts.h"
#include "encodeintra.h" #include "encodeintra.h"
#include "pickinter.h" #include "pickinter.h"
#include "vp8/common/common.h"
#include "vp8/common/entropymode.h" #include "vp8/common/entropymode.h"
#include "vp8/common/reconinter.h" #include "vp8/common/reconinter.h"
#include "vp8/common/reconintra.h" #include "vp8/common/reconintra.h"
@@ -770,8 +769,8 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
vp8_quantize_mbuv(x); vp8_quantize_mbuv(x);
rate_to = rd_cost_mbuv(x); rate_to = rd_cost_mbuv(x);
this_rate = this_rate = rate_to +
rate_to + x->intra_uv_mode_cost[xd->frame_type] x->intra_uv_mode_cost[xd->frame_type]
[xd->mode_info_context->mbmi.uv_mode]; [xd->mode_info_context->mbmi.uv_mode];
this_distortion = vp8_mbuverror(x) / 4; this_distortion = vp8_mbuverror(x) / 4;
@@ -960,13 +959,19 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
vp8_variance_fn_ptr_t *v_fn_ptr; vp8_variance_fn_ptr_t *v_fn_ptr;
ENTROPY_CONTEXT_PLANES t_above, t_left; ENTROPY_CONTEXT_PLANES t_above, t_left;
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
ENTROPY_CONTEXT_PLANES t_above_b, t_left_b; ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
ENTROPY_CONTEXT *ta_b;
ENTROPY_CONTEXT *tl_b;
memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
vp8_zero(t_above_b); ta = (ENTROPY_CONTEXT *)&t_above;
vp8_zero(t_left_b); tl = (ENTROPY_CONTEXT *)&t_left;
ta_b = (ENTROPY_CONTEXT *)&t_above_b;
tl_b = (ENTROPY_CONTEXT *)&t_left_b;
br = 0; br = 0;
bd = 0; bd = 0;
@@ -1146,13 +1151,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
mode_selected = this_mode; mode_selected = this_mode;
best_label_rd = this_rd; best_label_rd = this_rd;
memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
} }
} /*for each 4x4 mode*/ } /*for each 4x4 mode*/
memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
bsi->ref_mv, x->mvcost); bsi->ref_mv, x->mvcost);

View File

@@ -56,7 +56,8 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2],
static void vp8_treed_write(vp8_writer *const w, vp8_tree t, static void vp8_treed_write(vp8_writer *const w, vp8_tree t,
const vp8_prob *const p, int v, const vp8_prob *const p, int v,
int n) { /* number of bits in v, assumed nonzero */ int n /* number of bits in v, assumed nonzero */
) {
vp8_tree_index i = 0; vp8_tree_index i = 0;
do { do {
@@ -72,7 +73,8 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t,
} }
static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v, static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v,
int n) { /* number of bits in v, assumed nonzero */ int n /* number of bits in v, assumed nonzero */
) {
int c = 0; int c = 0;
vp8_tree_index i = 0; vp8_tree_index i = 0;

View File

@@ -802,20 +802,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
unsigned long deadline) { unsigned long deadline) {
vpx_codec_err_t res = VPX_CODEC_OK; vpx_codec_err_t res = VPX_CODEC_OK;
if (!ctx->cfg.rc_target_bitrate) { if (!ctx->cfg.rc_target_bitrate) return res;
#if CONFIG_MULTI_RES_ENCODING
if (!ctx->cpi) return VPX_CODEC_ERROR;
if (ctx->cpi->oxcf.mr_total_resolutions > 1) {
LOWER_RES_FRAME_INFO *low_res_frame_info =
(LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info;
if (!low_res_frame_info) return VPX_CODEC_ERROR;
low_res_frame_info->skip_encoding_prev_stream = 1;
if (ctx->cpi->oxcf.mr_encoder_id == 0)
low_res_frame_info->skip_encoding_base_stream = 1;
}
#endif
return res;
}
if (img) res = validate_img(ctx, img); if (img) res = validate_img(ctx, img);
@@ -915,8 +902,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
(unsigned long)((delta * ctx->cfg.g_timebase.den + round) / (unsigned long)((delta * ctx->cfg.g_timebase.den + round) /
ctx->cfg.g_timebase.num / 10000000); ctx->cfg.g_timebase.num / 10000000);
pkt.data.frame.flags = lib_flags << 16; pkt.data.frame.flags = lib_flags << 16;
pkt.data.frame.width[0] = cpi->common.Width;
pkt.data.frame.height[0] = cpi->common.Height;
if (lib_flags & FRAMEFLAGS_KEY) { if (lib_flags & FRAMEFLAGS_KEY) {
pkt.data.frame.flags |= VPX_FRAME_IS_KEY; pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
@@ -1274,9 +1259,6 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = {
vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */
vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */
vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */
vp8e_set_config, vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem,
NULL,
vp8e_get_preview,
vp8e_mr_alloc_mem,
} /* encoder functions */ } /* encoder functions */
}; };

View File

@@ -1,160 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/arm/neon/vp9_iht_neon.h"
#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
static INLINE void highbd_iadst4(int32x4_t *const io) {
const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
const int32x4_t sinpi = vld1q_s32(sinpis);
int32x4_t s[8];
s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
s[7] = vsubq_s32(io[0], io[2]);
s[7] = vaddq_s32(s[7], io[3]);
s[0] = vaddq_s32(s[0], s[3]);
s[0] = vaddq_s32(s[0], s[5]);
s[1] = vsubq_s32(s[1], s[4]);
s[1] = vsubq_s32(s[1], s[6]);
s[3] = s[2];
s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
io[0] = vaddq_s32(s[0], s[3]);
io[1] = vaddq_s32(s[1], s[3]);
io[2] = s[2];
io[3] = vaddq_s32(s[0], s[1]);
io[3] = vsubq_s32(io[3], s[3]);
io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
}
void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
int16x8_t a[2];
int32x4_t c[4];
c[0] = vld1q_s32(input);
c[1] = vld1q_s32(input + 4);
c[2] = vld1q_s32(input + 8);
c[3] = vld1q_s32(input + 12);
if (bd == 8) {
a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1]));
a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3]));
transpose_s16_4x4q(&a[0], &a[1]);
switch (tx_type) {
case DCT_DCT:
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
transpose_s16_4x4q(&a[0], &a[1]);
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
case ADST_DCT:
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
transpose_s16_4x4q(&a[0], &a[1]);
iadst4(a);
break;
case DCT_ADST:
iadst4(a);
transpose_s16_4x4q(&a[0], &a[1]);
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
default:
assert(tx_type == ADST_ADST);
iadst4(a);
transpose_s16_4x4q(&a[0], &a[1]);
iadst4(a);
break;
}
a[0] = vrshrq_n_s16(a[0], 4);
a[1] = vrshrq_n_s16(a[1], 4);
} else {
switch (tx_type) {
case DCT_DCT: {
const int32x4_t cospis = vld1q_s32(kCospi32);
if (bd == 10) {
idct4x4_16_kernel_bd10(cospis, c);
idct4x4_16_kernel_bd10(cospis, c);
} else {
idct4x4_16_kernel_bd12(cospis, c);
idct4x4_16_kernel_bd12(cospis, c);
}
break;
}
case ADST_DCT: {
const int32x4_t cospis = vld1q_s32(kCospi32);
if (bd == 10) {
idct4x4_16_kernel_bd10(cospis, c);
} else {
idct4x4_16_kernel_bd12(cospis, c);
}
transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
highbd_iadst4(c);
break;
}
case DCT_ADST: {
const int32x4_t cospis = vld1q_s32(kCospi32);
transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
highbd_iadst4(c);
if (bd == 10) {
idct4x4_16_kernel_bd10(cospis, c);
} else {
idct4x4_16_kernel_bd12(cospis, c);
}
break;
}
default: {
assert(tx_type == ADST_ADST);
transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
highbd_iadst4(c);
transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
highbd_iadst4(c);
break;
}
}
a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4));
a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4));
}
highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max);
}

View File

@@ -14,63 +14,206 @@
#include "./vp9_rtcd.h" #include "./vp9_rtcd.h"
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_common.h"
#include "vp9/common/arm/neon/vp9_iht_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h" #include "vpx_dsp/txfm_common.h"
static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
int32x4_t q8s32, q9s32;
int16x4x2_t d0x2s16, d1x2s16;
int32x4x2_t q0x2s32;
d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
q0x2s32 = vtrnq_s32(q8s32, q9s32);
*q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
*q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
}
static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
int16x4_t *d2s16) {
*d0s16 = vdup_n_s16(cospi_8_64);
*d1s16 = vdup_n_s16(cospi_16_64);
*d2s16 = vdup_n_s16(cospi_24_64);
}
static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
int16x4_t *d5s16, int16x8_t *q3s16) {
*d3s16 = vdup_n_s16(sinpi_1_9);
*d4s16 = vdup_n_s16(sinpi_2_9);
*q3s16 = vdupq_n_s16(sinpi_3_9);
*d5s16 = vdup_n_s16(sinpi_4_9);
}
static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
int16x4_t *d2s16, int16x8_t *q8s16,
int16x8_t *q9s16) {
int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
int16x4_t d26s16, d27s16, d28s16, d29s16;
int32x4_t q10s32, q13s32, q14s32, q15s32;
int16x8_t q13s16, q14s16;
d16s16 = vget_low_s16(*q8s16);
d17s16 = vget_high_s16(*q8s16);
d18s16 = vget_low_s16(*q9s16);
d19s16 = vget_high_s16(*q9s16);
d23s16 = vadd_s16(d16s16, d18s16);
d24s16 = vsub_s16(d16s16, d18s16);
q15s32 = vmull_s16(d17s16, *d2s16);
q10s32 = vmull_s16(d17s16, *d0s16);
q13s32 = vmull_s16(d23s16, *d1s16);
q14s32 = vmull_s16(d24s16, *d1s16);
q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
d26s16 = vrshrn_n_s32(q13s32, 14);
d27s16 = vrshrn_n_s32(q14s32, 14);
d29s16 = vrshrn_n_s32(q15s32, 14);
d28s16 = vrshrn_n_s32(q10s32, 14);
q13s16 = vcombine_s16(d26s16, d27s16);
q14s16 = vcombine_s16(d28s16, d29s16);
*q8s16 = vaddq_s16(q13s16, q14s16);
*q9s16 = vsubq_s16(q13s16, q14s16);
*q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp
}
// One 1-D pass of the 4x4 inverse ADST over the two packed vectors
// *q8s16 / *q9s16, in place. d3/d4/d5 carry sinpi_1_9 / sinpi_2_9 /
// sinpi_4_9 and q3 carries sinpi_3_9, as produced by
// GENERATE_SINE_CONSTANTS. Below, a0..a3 name the four input lanes
// d16s16..d19s16. NOTE(review): statement order matters here — several
// 32-bit accumulators are reused with new meanings mid-function.
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d6s16 = vget_low_s16(*q3s16);  // sinpi_3_9 broadcast

  d16s16 = vget_low_s16(*q8s16);   // a0
  d17s16 = vget_high_s16(*q8s16);  // a1
  d18s16 = vget_low_s16(*q9s16);   // a2
  d19s16 = vget_high_s16(*q9s16);  // a3

  q10s32 = vmull_s16(*d3s16, d16s16);  // sinpi_1_9 * a0
  q11s32 = vmull_s16(*d4s16, d16s16);  // sinpi_2_9 * a0
  q12s32 = vmull_s16(d6s16, d17s16);   // sinpi_3_9 * a1
  q13s32 = vmull_s16(*d5s16, d18s16);  // sinpi_4_9 * a2
  q14s32 = vmull_s16(*d3s16, d18s16);  // sinpi_1_9 * a2
  // Widened sum a0 + a3 - a2, built across the next three statements.
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);   // sinpi_2_9 * a3
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);   // sinpi_4_9 * a3

  q10s32 = vaddq_s32(q10s32, q13s32);  // += sinpi_4_9 * a2
  q10s32 = vaddq_s32(q10s32, q8s32);   // += sinpi_2_9 * a3
  q11s32 = vsubq_s32(q11s32, q14s32);  // -= sinpi_1_9 * a2
  q8s32 = vdupq_n_s32(sinpi_3_9);      // q8s32 reused: now a 32-bit scale
  q11s32 = vsubq_s32(q11s32, q9s32);   // -= sinpi_4_9 * a3
  q15s32 = vmulq_s32(q15s32, q8s32);   // sinpi_3_9 * (a0 + a3 - a2)

  q13s32 = vaddq_s32(q10s32, q12s32);  // out lane 0
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);  // out lane 1
  q10s32 = vsubq_s32(q10s32, q12s32);  // out lane 3

  // Round-shift by DCT_CONST_BITS (14) and narrow back to 16 bits.
  d16s16 = vrshrn_n_s32(q13s32, 14);
  d17s16 = vrshrn_n_s32(q14s32, 14);
  d18s16 = vrshrn_n_s32(q15s32, 14);  // out lane 2
  d19s16 = vrshrn_n_s32(q10s32, 14);

  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
}
void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) { int tx_type) {
int16x8_t a[2]; uint8x8_t d26u8, d27u8;
uint8x8_t s[2], d[2]; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
uint16x8_t sum[2]; uint32x2_t d26u32, d27u32;
int16x8_t q3s16, q8s16, q9s16;
uint16x8_t q8u16, q9u16;
assert(!((intptr_t)dest % sizeof(uint32_t))); d26u32 = d27u32 = vdup_n_u32(0);
assert(!(stride % sizeof(uint32_t)));
a[0] = load_tran_low_to_s16q(input); q8s16 = vld1q_s16(input);
a[1] = load_tran_low_to_s16q(input + 8); q9s16 = vld1q_s16(input + 8);
transpose_s16_4x4q(&a[0], &a[1]);
TRANSPOSE4X4(&q8s16, &q9s16);
switch (tx_type) { switch (tx_type) {
case DCT_DCT: case 0: // idct_idct is not supported. Fall back to C
idct4x4_16_kernel_bd8(a); vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); return;
transpose_s16_4x4q(&a[0], &a[1]); case 1: // iadst_idct
idct4x4_16_kernel_bd8(a); // generate constants
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
break; GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
case ADST_DCT: // first transform rows
idct4x4_16_kernel_bd8(a); IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
transpose_s16_4x4q(&a[0], &a[1]);
iadst4(a);
break;
case DCT_ADST: // transpose the matrix
iadst4(a); TRANSPOSE4X4(&q8s16, &q9s16);
transpose_s16_4x4q(&a[0], &a[1]);
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
default: // then transform columns
assert(tx_type == ADST_ADST); IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
iadst4(a); break;
transpose_s16_4x4q(&a[0], &a[1]); case 2: // idct_iadst
iadst4(a); // generate constants
GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
// first transform rows
IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
// transpose the matrix
TRANSPOSE4X4(&q8s16, &q9s16);
// then transform columns
IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
break;
case 3: // iadst_iadst
// generate constants
GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
// first transform rows
IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
// transpose the matrix
TRANSPOSE4X4(&q8s16, &q9s16);
// then transform columns
IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
break;
default: // iadst_idct
assert(0);
break; break;
} }
a[0] = vrshrq_n_s16(a[0], 4); q8s16 = vrshrq_n_s16(q8s16, 4);
a[1] = vrshrq_n_s16(a[1], 4); q9s16 = vrshrq_n_s16(q9s16, 4);
s[0] = load_u8(dest, stride);
s[1] = load_u8(dest + 2 * stride, stride); d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]); dest += stride;
sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]); d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); dest += stride;
d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
store_u8(dest, stride, d[0]); dest += stride;
store_u8(dest + 2 * stride, stride, d[1]); d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
} }

View File

@@ -14,199 +14,527 @@
#include "./vp9_rtcd.h" #include "./vp9_rtcd.h"
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_common.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/transpose_neon.h"
static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, static int16_t cospi_2_64 = 16305;
const int16x4_t c) { static int16_t cospi_4_64 = 16069;
const int16x8_t sum = vaddq_s16(x[0], x[1]); static int16_t cospi_6_64 = 15679;
const int16x8_t sub = vsubq_s16(x[0], x[1]); static int16_t cospi_8_64 = 15137;
int32x4_t t0[2], t1[2]; static int16_t cospi_10_64 = 14449;
static int16_t cospi_12_64 = 13623;
static int16_t cospi_14_64 = 12665;
static int16_t cospi_16_64 = 11585;
static int16_t cospi_18_64 = 10394;
static int16_t cospi_20_64 = 9102;
static int16_t cospi_22_64 = 7723;
static int16_t cospi_24_64 = 6270;
static int16_t cospi_26_64 = 4756;
static int16_t cospi_28_64 = 3196;
static int16_t cospi_30_64 = 1606;
t0[0] = vmull_lane_s16(vget_low_s16(sum), c, 0); static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
t0[1] = vmull_lane_s16(vget_high_s16(sum), c, 0); int16x8_t *q10s16, int16x8_t *q11s16,
t1[0] = vmull_lane_s16(vget_low_s16(sub), c, 0); int16x8_t *q12s16, int16x8_t *q13s16,
t1[1] = vmull_lane_s16(vget_high_s16(sub), c, 0); int16x8_t *q14s16, int16x8_t *q15s16) {
x[0] = dct_const_round_shift_low_8(t0); int16x4_t d0s16, d1s16, d2s16, d3s16;
x[1] = dct_const_round_shift_low_8(t1); int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
d0s16 = vdup_n_s16(cospi_28_64);
d1s16 = vdup_n_s16(cospi_4_64);
d2s16 = vdup_n_s16(cospi_12_64);
d3s16 = vdup_n_s16(cospi_20_64);
d16s16 = vget_low_s16(*q8s16);
d17s16 = vget_high_s16(*q8s16);
d18s16 = vget_low_s16(*q9s16);
d19s16 = vget_high_s16(*q9s16);
d20s16 = vget_low_s16(*q10s16);
d21s16 = vget_high_s16(*q10s16);
d22s16 = vget_low_s16(*q11s16);
d23s16 = vget_high_s16(*q11s16);
d24s16 = vget_low_s16(*q12s16);
d25s16 = vget_high_s16(*q12s16);
d26s16 = vget_low_s16(*q13s16);
d27s16 = vget_high_s16(*q13s16);
d28s16 = vget_low_s16(*q14s16);
d29s16 = vget_high_s16(*q14s16);
d30s16 = vget_low_s16(*q15s16);
d31s16 = vget_high_s16(*q15s16);
q2s32 = vmull_s16(d18s16, d0s16);
q3s32 = vmull_s16(d19s16, d0s16);
q5s32 = vmull_s16(d26s16, d2s16);
q6s32 = vmull_s16(d27s16, d2s16);
q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
d8s16 = vrshrn_n_s32(q2s32, 14);
d9s16 = vrshrn_n_s32(q3s32, 14);
d10s16 = vrshrn_n_s32(q5s32, 14);
d11s16 = vrshrn_n_s32(q6s32, 14);
q4s16 = vcombine_s16(d8s16, d9s16);
q5s16 = vcombine_s16(d10s16, d11s16);
q2s32 = vmull_s16(d18s16, d1s16);
q3s32 = vmull_s16(d19s16, d1s16);
q9s32 = vmull_s16(d26s16, d3s16);
q13s32 = vmull_s16(d27s16, d3s16);
q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
d14s16 = vrshrn_n_s32(q2s32, 14);
d15s16 = vrshrn_n_s32(q3s32, 14);
d12s16 = vrshrn_n_s32(q9s32, 14);
d13s16 = vrshrn_n_s32(q13s32, 14);
q6s16 = vcombine_s16(d12s16, d13s16);
q7s16 = vcombine_s16(d14s16, d15s16);
d0s16 = vdup_n_s16(cospi_16_64);
q2s32 = vmull_s16(d16s16, d0s16);
q3s32 = vmull_s16(d17s16, d0s16);
q13s32 = vmull_s16(d16s16, d0s16);
q15s32 = vmull_s16(d17s16, d0s16);
q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
d0s16 = vdup_n_s16(cospi_24_64);
d1s16 = vdup_n_s16(cospi_8_64);
d18s16 = vrshrn_n_s32(q2s32, 14);
d19s16 = vrshrn_n_s32(q3s32, 14);
d22s16 = vrshrn_n_s32(q13s32, 14);
d23s16 = vrshrn_n_s32(q15s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
*q11s16 = vcombine_s16(d22s16, d23s16);
q2s32 = vmull_s16(d20s16, d0s16);
q3s32 = vmull_s16(d21s16, d0s16);
q8s32 = vmull_s16(d20s16, d1s16);
q12s32 = vmull_s16(d21s16, d1s16);
q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
d26s16 = vrshrn_n_s32(q2s32, 14);
d27s16 = vrshrn_n_s32(q3s32, 14);
d30s16 = vrshrn_n_s32(q8s32, 14);
d31s16 = vrshrn_n_s32(q12s32, 14);
*q13s16 = vcombine_s16(d26s16, d27s16);
*q15s16 = vcombine_s16(d30s16, d31s16);
q0s16 = vaddq_s16(*q9s16, *q15s16);
q1s16 = vaddq_s16(*q11s16, *q13s16);
q2s16 = vsubq_s16(*q11s16, *q13s16);
q3s16 = vsubq_s16(*q9s16, *q15s16);
*q13s16 = vsubq_s16(q4s16, q5s16);
q4s16 = vaddq_s16(q4s16, q5s16);
*q14s16 = vsubq_s16(q7s16, q6s16);
q7s16 = vaddq_s16(q7s16, q6s16);
d26s16 = vget_low_s16(*q13s16);
d27s16 = vget_high_s16(*q13s16);
d28s16 = vget_low_s16(*q14s16);
d29s16 = vget_high_s16(*q14s16);
d16s16 = vdup_n_s16(cospi_16_64);
q9s32 = vmull_s16(d28s16, d16s16);
q10s32 = vmull_s16(d29s16, d16s16);
q11s32 = vmull_s16(d28s16, d16s16);
q12s32 = vmull_s16(d29s16, d16s16);
q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
d10s16 = vrshrn_n_s32(q9s32, 14);
d11s16 = vrshrn_n_s32(q10s32, 14);
d12s16 = vrshrn_n_s32(q11s32, 14);
d13s16 = vrshrn_n_s32(q12s32, 14);
q5s16 = vcombine_s16(d10s16, d11s16);
q6s16 = vcombine_s16(d12s16, d13s16);
*q8s16 = vaddq_s16(q0s16, q7s16);
*q9s16 = vaddq_s16(q1s16, q6s16);
*q10s16 = vaddq_s16(q2s16, q5s16);
*q11s16 = vaddq_s16(q3s16, q4s16);
*q12s16 = vsubq_s16(q3s16, q4s16);
*q13s16 = vsubq_s16(q2s16, q5s16);
*q14s16 = vsubq_s16(q1s16, q6s16);
*q15s16 = vsubq_s16(q0s16, q7s16);
} }
static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0, static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
const int16x8_t in1, int16x8_t *q10s16, int16x8_t *q11s16,
const int16x4_t c, int16x8_t *q12s16, int16x8_t *q13s16,
int32x4_t *const s0, int16x8_t *q14s16, int16x8_t *q15s16) {
int32x4_t *const s1) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); int16x8_t q2s16, q4s16, q5s16, q6s16;
int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); d16s16 = vget_low_s16(*q8s16);
s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); d17s16 = vget_high_s16(*q8s16);
s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); d18s16 = vget_low_s16(*q9s16);
s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); d19s16 = vget_high_s16(*q9s16);
} d20s16 = vget_low_s16(*q10s16);
d21s16 = vget_high_s16(*q10s16);
d22s16 = vget_low_s16(*q11s16);
d23s16 = vget_high_s16(*q11s16);
d24s16 = vget_low_s16(*q12s16);
d25s16 = vget_high_s16(*q12s16);
d26s16 = vget_low_s16(*q13s16);
d27s16 = vget_high_s16(*q13s16);
d28s16 = vget_low_s16(*q14s16);
d29s16 = vget_high_s16(*q14s16);
d30s16 = vget_low_s16(*q15s16);
d31s16 = vget_high_s16(*q15s16);
static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0, d14s16 = vdup_n_s16(cospi_2_64);
const int16x8_t in1, d15s16 = vdup_n_s16(cospi_30_64);
const int16x4_t c,
int32x4_t *const s0,
int32x4_t *const s1) {
s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); q1s32 = vmull_s16(d30s16, d14s16);
s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); q2s32 = vmull_s16(d31s16, d14s16);
s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); q3s32 = vmull_s16(d30s16, d15s16);
s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); q4s32 = vmull_s16(d31s16, d15s16);
}
static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0, d30s16 = vdup_n_s16(cospi_18_64);
const int16x8_t in1, d31s16 = vdup_n_s16(cospi_14_64);
const int16x4_t c,
int32x4_t *const s0,
int32x4_t *const s1) {
s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
}
static INLINE int16x8_t add_dct_const_round_shift_low_8( q5s32 = vmull_s16(d22s16, d30s16);
const int32x4_t *const in0, const int32x4_t *const in1) { q6s32 = vmull_s16(d23s16, d30s16);
int32x4_t sum[2]; q7s32 = vmull_s16(d22s16, d31s16);
q8s32 = vmull_s16(d23s16, d31s16);
sum[0] = vaddq_s32(in0[0], in1[0]); q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
sum[1] = vaddq_s32(in0[1], in1[1]); q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
return dct_const_round_shift_low_8(sum); q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
} q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
static INLINE int16x8_t sub_dct_const_round_shift_low_8( q11s32 = vaddq_s32(q1s32, q5s32);
const int32x4_t *const in0, const int32x4_t *const in1) { q12s32 = vaddq_s32(q2s32, q6s32);
int32x4_t sum[2]; q1s32 = vsubq_s32(q1s32, q5s32);
q2s32 = vsubq_s32(q2s32, q6s32);
sum[0] = vsubq_s32(in0[0], in1[0]); d22s16 = vrshrn_n_s32(q11s32, 14);
sum[1] = vsubq_s32(in0[1], in1[1]); d23s16 = vrshrn_n_s32(q12s32, 14);
return dct_const_round_shift_low_8(sum); *q11s16 = vcombine_s16(d22s16, d23s16);
}
static INLINE void iadst8(int16x8_t *const io) { q12s32 = vaddq_s32(q3s32, q7s32);
const int16x4_t c0 = q15s32 = vaddq_s32(q4s32, q8s32);
create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); q3s32 = vsubq_s32(q3s32, q7s32);
const int16x4_t c1 = q4s32 = vsubq_s32(q4s32, q8s32);
create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
const int16x4_t c2 =
create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
int16x8_t x[8], t[4];
int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
x[0] = io[7]; d2s16 = vrshrn_n_s32(q1s32, 14);
x[1] = io[0]; d3s16 = vrshrn_n_s32(q2s32, 14);
x[2] = io[5]; d24s16 = vrshrn_n_s32(q12s32, 14);
x[3] = io[2]; d25s16 = vrshrn_n_s32(q15s32, 14);
x[4] = io[3]; d6s16 = vrshrn_n_s32(q3s32, 14);
x[5] = io[4]; d7s16 = vrshrn_n_s32(q4s32, 14);
x[6] = io[1]; *q12s16 = vcombine_s16(d24s16, d25s16);
x[7] = io[6];
// stage 1 d0s16 = vdup_n_s16(cospi_10_64);
iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1); d1s16 = vdup_n_s16(cospi_22_64);
iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3); q4s32 = vmull_s16(d26s16, d0s16);
iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5); q5s32 = vmull_s16(d27s16, d0s16);
iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7); q2s32 = vmull_s16(d26s16, d1s16);
q6s32 = vmull_s16(d27s16, d1s16);
x[0] = add_dct_const_round_shift_low_8(s0, s4); d30s16 = vdup_n_s16(cospi_26_64);
x[1] = add_dct_const_round_shift_low_8(s1, s5); d31s16 = vdup_n_s16(cospi_6_64);
x[2] = add_dct_const_round_shift_low_8(s2, s6);
x[3] = add_dct_const_round_shift_low_8(s3, s7);
x[4] = sub_dct_const_round_shift_low_8(s0, s4);
x[5] = sub_dct_const_round_shift_low_8(s1, s5);
x[6] = sub_dct_const_round_shift_low_8(s2, s6);
x[7] = sub_dct_const_round_shift_low_8(s3, s7);
// stage 2 q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
t[0] = x[0]; q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
t[1] = x[1]; q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
t[2] = x[2]; q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
t[3] = x[3];
iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5);
iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6);
x[0] = vaddq_s16(t[0], t[2]); q0s32 = vmull_s16(d18s16, d30s16);
x[1] = vaddq_s16(t[1], t[3]); q13s32 = vmull_s16(d19s16, d30s16);
x[2] = vsubq_s16(t[0], t[2]);
x[3] = vsubq_s16(t[1], t[3]);
x[4] = add_dct_const_round_shift_low_8(s4, s6);
x[5] = add_dct_const_round_shift_low_8(s5, s7);
x[6] = sub_dct_const_round_shift_low_8(s4, s6);
x[7] = sub_dct_const_round_shift_low_8(s5, s7);
// stage 3 q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
iadst_half_butterfly_neon(x + 2, c2); q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
iadst_half_butterfly_neon(x + 6, c2);
io[0] = x[0]; q10s32 = vmull_s16(d18s16, d31s16);
io[1] = vnegq_s16(x[4]); q9s32 = vmull_s16(d19s16, d31s16);
io[2] = x[6];
io[3] = vnegq_s16(x[2]); q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
io[4] = x[3]; q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
io[5] = vnegq_s16(x[7]);
io[6] = x[5]; q14s32 = vaddq_s32(q2s32, q10s32);
io[7] = vnegq_s16(x[1]); q15s32 = vaddq_s32(q6s32, q9s32);
q2s32 = vsubq_s32(q2s32, q10s32);
q6s32 = vsubq_s32(q6s32, q9s32);
d28s16 = vrshrn_n_s32(q14s32, 14);
d29s16 = vrshrn_n_s32(q15s32, 14);
d4s16 = vrshrn_n_s32(q2s32, 14);
d5s16 = vrshrn_n_s32(q6s32, 14);
*q14s16 = vcombine_s16(d28s16, d29s16);
q9s32 = vaddq_s32(q4s32, q0s32);
q10s32 = vaddq_s32(q5s32, q13s32);
q4s32 = vsubq_s32(q4s32, q0s32);
q5s32 = vsubq_s32(q5s32, q13s32);
d30s16 = vdup_n_s16(cospi_8_64);
d31s16 = vdup_n_s16(cospi_24_64);
d18s16 = vrshrn_n_s32(q9s32, 14);
d19s16 = vrshrn_n_s32(q10s32, 14);
d8s16 = vrshrn_n_s32(q4s32, 14);
d9s16 = vrshrn_n_s32(q5s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
q5s32 = vmull_s16(d2s16, d30s16);
q6s32 = vmull_s16(d3s16, d30s16);
q7s32 = vmull_s16(d2s16, d31s16);
q0s32 = vmull_s16(d3s16, d31s16);
q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
q1s32 = vmull_s16(d4s16, d30s16);
q3s32 = vmull_s16(d5s16, d30s16);
q10s32 = vmull_s16(d4s16, d31s16);
q2s32 = vmull_s16(d5s16, d31s16);
q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
*q8s16 = vaddq_s16(*q11s16, *q9s16);
*q11s16 = vsubq_s16(*q11s16, *q9s16);
q4s16 = vaddq_s16(*q12s16, *q14s16);
*q12s16 = vsubq_s16(*q12s16, *q14s16);
q14s32 = vaddq_s32(q5s32, q1s32);
q15s32 = vaddq_s32(q6s32, q3s32);
q5s32 = vsubq_s32(q5s32, q1s32);
q6s32 = vsubq_s32(q6s32, q3s32);
d18s16 = vrshrn_n_s32(q14s32, 14);
d19s16 = vrshrn_n_s32(q15s32, 14);
d10s16 = vrshrn_n_s32(q5s32, 14);
d11s16 = vrshrn_n_s32(q6s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
q1s32 = vaddq_s32(q7s32, q10s32);
q3s32 = vaddq_s32(q0s32, q2s32);
q7s32 = vsubq_s32(q7s32, q10s32);
q0s32 = vsubq_s32(q0s32, q2s32);
d28s16 = vrshrn_n_s32(q1s32, 14);
d29s16 = vrshrn_n_s32(q3s32, 14);
d14s16 = vrshrn_n_s32(q7s32, 14);
d15s16 = vrshrn_n_s32(q0s32, 14);
*q14s16 = vcombine_s16(d28s16, d29s16);
d30s16 = vdup_n_s16(cospi_16_64);
d22s16 = vget_low_s16(*q11s16);
d23s16 = vget_high_s16(*q11s16);
q2s32 = vmull_s16(d22s16, d30s16);
q3s32 = vmull_s16(d23s16, d30s16);
q13s32 = vmull_s16(d22s16, d30s16);
q1s32 = vmull_s16(d23s16, d30s16);
d24s16 = vget_low_s16(*q12s16);
d25s16 = vget_high_s16(*q12s16);
q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
d4s16 = vrshrn_n_s32(q2s32, 14);
d5s16 = vrshrn_n_s32(q3s32, 14);
d24s16 = vrshrn_n_s32(q13s32, 14);
d25s16 = vrshrn_n_s32(q1s32, 14);
q2s16 = vcombine_s16(d4s16, d5s16);
*q12s16 = vcombine_s16(d24s16, d25s16);
q13s32 = vmull_s16(d10s16, d30s16);
q1s32 = vmull_s16(d11s16, d30s16);
q11s32 = vmull_s16(d10s16, d30s16);
q0s32 = vmull_s16(d11s16, d30s16);
q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
d20s16 = vrshrn_n_s32(q13s32, 14);
d21s16 = vrshrn_n_s32(q1s32, 14);
d12s16 = vrshrn_n_s32(q11s32, 14);
d13s16 = vrshrn_n_s32(q0s32, 14);
*q10s16 = vcombine_s16(d20s16, d21s16);
q6s16 = vcombine_s16(d12s16, d13s16);
q5s16 = vdupq_n_s16(0);
*q9s16 = vsubq_s16(q5s16, *q9s16);
*q11s16 = vsubq_s16(q5s16, q2s16);
*q13s16 = vsubq_s16(q5s16, q6s16);
*q15s16 = vsubq_s16(q5s16, q4s16);
} }
void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) { int tx_type) {
const int16x8_t cospis = vld1q_s16(kCospi); int i;
const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 uint8_t *d1, *d2;
const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 uint8x8_t d0u8, d1u8, d2u8, d3u8;
int16x8_t a[8]; uint64x1_t d0u64, d1u64, d2u64, d3u64;
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
uint16x8_t q8u16, q9u16, q10u16, q11u16;
a[0] = load_tran_low_to_s16q(input + 0 * 8); q8s16 = vld1q_s16(input);
a[1] = load_tran_low_to_s16q(input + 1 * 8); q9s16 = vld1q_s16(input + 8);
a[2] = load_tran_low_to_s16q(input + 2 * 8); q10s16 = vld1q_s16(input + 8 * 2);
a[3] = load_tran_low_to_s16q(input + 3 * 8); q11s16 = vld1q_s16(input + 8 * 3);
a[4] = load_tran_low_to_s16q(input + 4 * 8); q12s16 = vld1q_s16(input + 8 * 4);
a[5] = load_tran_low_to_s16q(input + 5 * 8); q13s16 = vld1q_s16(input + 8 * 5);
a[6] = load_tran_low_to_s16q(input + 6 * 8); q14s16 = vld1q_s16(input + 8 * 6);
a[7] = load_tran_low_to_s16q(input + 7 * 8); q15s16 = vld1q_s16(input + 8 * 7);
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
switch (tx_type) { switch (tx_type) {
case DCT_DCT: case 0: // idct_idct is not supported. Fall back to C
idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); return;
idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); case 1: // iadst_idct
break; // generate IDCT constants
// GENERATE_IDCT_CONSTANTS
case ADST_DCT: // first transform rows
idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); &q15s16);
iadst8(a);
break;
case DCT_ADST: // transpose the matrix
iadst8(a); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); &q14s16, &q15s16);
idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
break;
default: // generate IADST constants
assert(tx_type == ADST_ADST); // GENERATE_IADST_CONSTANTS
iadst8(a);
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); // then transform columns
iadst8(a); IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
break;
case 2: // idct_iadst
// generate IADST constants
// GENERATE_IADST_CONSTANTS
// first transform rows
IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
// transpose the matrix
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
&q14s16, &q15s16);
// generate IDCT constants
// GENERATE_IDCT_CONSTANTS
// then transform columns
IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
break;
case 3: // iadst_iadst
// generate IADST constants
// GENERATE_IADST_CONSTANTS
// first transform rows
IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
// transpose the matrix
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
&q14s16, &q15s16);
// then transform columns
IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
break;
default: // iadst_idct
assert(0);
break; break;
} }
idct8x8_add8x8_neon(a, dest, stride); q8s16 = vrshrq_n_s16(q8s16, 5);
q9s16 = vrshrq_n_s16(q9s16, 5);
q10s16 = vrshrq_n_s16(q10s16, 5);
q11s16 = vrshrq_n_s16(q11s16, 5);
q12s16 = vrshrq_n_s16(q12s16, 5);
q13s16 = vrshrq_n_s16(q13s16, 5);
q14s16 = vrshrq_n_s16(q14s16, 5);
q15s16 = vrshrq_n_s16(q15s16, 5);
for (d1 = d2 = dest, i = 0; i < 2; i++) {
if (i != 0) {
q8s16 = q12s16;
q9s16 = q13s16;
q10s16 = q14s16;
q11s16 = q15s16;
}
d0u64 = vld1_u64((uint64_t *)d1);
d1 += stride;
d1u64 = vld1_u64((uint64_t *)d1);
d1 += stride;
d2u64 = vld1_u64((uint64_t *)d1);
d1 += stride;
d3u64 = vld1_u64((uint64_t *)d1);
d1 += stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
q10u16 =
vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
q11u16 =
vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
d2 += stride;
}
} }

View File

@@ -1,60 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
#define VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
// 4x4 inverse ADST on two packed vectors: io[0] holds lanes x0 (low) and
// x2 (high), io[1] holds x1 (low) and x3 (high). Results are written back
// into io[0]/io[1] via dct_const_round_shift_low_8_dual.
static INLINE void iadst4(int16x8_t *const io) {
  const int32x4_t c3 = vdupq_n_s32(sinpi_3_9);
  int16x4_t x[4];
  int32x4_t s[8], output[4];
  // c lanes: 0 = sinpi_1_9, 1 = sinpi_2_9, 2 = sinpi_3_9, 3 = sinpi_4_9.
  const int16x4_t c =
      create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9);

  x[0] = vget_low_s16(io[0]);
  x[1] = vget_low_s16(io[1]);
  x[2] = vget_high_s16(io[0]);
  x[3] = vget_high_s16(io[1]);

  s[0] = vmull_lane_s16(x[0], c, 0);  // sinpi_1_9 * x0
  s[1] = vmull_lane_s16(x[0], c, 1);  // sinpi_2_9 * x0
  s[2] = vmull_lane_s16(x[1], c, 2);  // sinpi_3_9 * x1
  s[3] = vmull_lane_s16(x[2], c, 3);  // sinpi_4_9 * x2
  s[4] = vmull_lane_s16(x[2], c, 0);  // sinpi_1_9 * x2
  s[5] = vmull_lane_s16(x[3], c, 1);  // sinpi_2_9 * x3
  s[6] = vmull_lane_s16(x[3], c, 3);  // sinpi_4_9 * x3
  // s7 = x0 + x3 - x2, widened to 32 bits.
  s[7] = vaddl_s16(x[0], x[3]);
  s[7] = vsubw_s16(s[7], x[2]);

  s[0] = vaddq_s32(s[0], s[3]);
  s[0] = vaddq_s32(s[0], s[5]);
  s[1] = vsubq_s32(s[1], s[4]);
  s[1] = vsubq_s32(s[1], s[6]);
  // Careful: s[3] takes over the old s[2] (sinpi_3_9 * x1) BEFORE s[2] is
  // overwritten with sinpi_3_9 * s7.
  s[3] = s[2];
  s[2] = vmulq_s32(c3, s[7]);

  output[0] = vaddq_s32(s[0], s[3]);
  output[1] = vaddq_s32(s[1], s[3]);
  output[2] = s[2];
  // out3 = s0 + s1 - s3.
  output[3] = vaddq_s32(s[0], s[1]);
  output[3] = vsubq_s32(output[3], s[3]);
  dct_const_round_shift_low_8_dual(output, &io[0], &io[1]);
}
#endif // VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_

View File

@@ -42,7 +42,6 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254,
177, 153, 140, 133, 130, 129 }; 177, 153, 140, 133, 130, 129 };
#endif #endif
/* clang-format off */
const uint8_t vp9_coefband_trans_8x8plus[1024] = { const uint8_t vp9_coefband_trans_8x8plus[1024] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
// beyond MAXBAND_INDEX+1 all values are filled as 5 // beyond MAXBAND_INDEX+1 all values are filled as 5
@@ -86,7 +85,6 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = {
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
}; };
/* clang-format on */
const uint8_t vp9_coefband_trans_4x4[16] = { const uint8_t vp9_coefband_trans_4x4[16] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,

View File

@@ -137,6 +137,7 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
// 128 lists of probabilities are stored for the following ONE node probs: // 128 lists of probabilities are stored for the following ONE node probs:
// 1, 3, 5, 7, ..., 253, 255 // 1, 3, 5, 7, ..., 253, 255
// In between probabilities are interpolated linearly // In between probabilities are interpolated linearly
#define COEFF_PROB_MODELS 255 #define COEFF_PROB_MODELS 255
#define UNCONSTRAINED_NODES 3 #define UNCONSTRAINED_NODES 3

View File

@@ -186,19 +186,16 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] =
{ 93, 24, 99 }, // a split, l not split { 93, 24, 99 }, // a split, l not split
{ 85, 119, 44 }, // l split, a not split { 85, 119, 44 }, // l split, a not split
{ 62, 59, 67 }, // a/l both split { 62, 59, 67 }, // a/l both split
// 16x16 -> 8x8 // 16x16 -> 8x8
{ 149, 53, 53 }, // a/l both not split { 149, 53, 53 }, // a/l both not split
{ 94, 20, 48 }, // a split, l not split { 94, 20, 48 }, // a split, l not split
{ 83, 53, 24 }, // l split, a not split { 83, 53, 24 }, // l split, a not split
{ 52, 18, 18 }, // a/l both split { 52, 18, 18 }, // a/l both split
// 32x32 -> 16x16 // 32x32 -> 16x16
{ 150, 40, 39 }, // a/l both not split { 150, 40, 39 }, // a/l both not split
{ 78, 12, 26 }, // a split, l not split { 78, 12, 26 }, // a split, l not split
{ 67, 33, 11 }, // l split, a not split { 67, 33, 11 }, // l split, a not split
{ 24, 7, 5 }, // a/l both split { 24, 7, 5 }, // a/l both split
// 64x64 -> 32x32 // 64x64 -> 32x32
{ 174, 35, 49 }, // a/l both not split { 174, 35, 49 }, // a/l both not split
{ 68, 11, 27 }, // a split, l not split { 68, 11, 27 }, // a split, l not split

View File

@@ -22,7 +22,9 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, 18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10,
}; };
const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 }; const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-0, -1,
};
const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1,
4, -2, -3 }; 4, -2, -3 };

View File

@@ -1174,7 +1174,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
} }
// Disable filtering on the leftmost column // Disable filtering on the leftmost column
border_mask = ~(mi_col == 0 ? 1 : 0); border_mask = ~(mi_col == 0);
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) { if (cm->use_highbitdepth) {
highbd_filter_selectively_vert( highbd_filter_selectively_vert(

View File

@@ -229,7 +229,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
else else
pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
} else { } else {
pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || pred_context = 1 +
2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
edge_mi->ref_frame[1] == GOLDEN_FRAME); edge_mi->ref_frame[1] == GOLDEN_FRAME);
} }
} else { // inter/inter } else { // inter/inter

View File

@@ -1,13 +1,3 @@
##
## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
sub vp9_common_forward_decls() { sub vp9_common_forward_decls() {
print <<EOF print <<EOF
/* /*
@@ -67,13 +57,13 @@ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *outp
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
# Note that there are more specializations appended when # Note that there are more specializations appended when
# CONFIG_VP9_HIGHBITDEPTH is off. # CONFIG_VP9_HIGHBITDEPTH is off.
specialize qw/vp9_iht4x4_16_add neon sse2/; specialize qw/vp9_iht4x4_16_add sse2/;
specialize qw/vp9_iht8x8_64_add sse2/; specialize qw/vp9_iht8x8_64_add sse2/;
specialize qw/vp9_iht16x16_256_add sse2/; specialize qw/vp9_iht16x16_256_add sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
# Note that these specializations are appended to the above ones. # Note that these specializations are appended to the above ones.
specialize qw/vp9_iht4x4_16_add dspr2 msa/; specialize qw/vp9_iht4x4_16_add neon dspr2 msa/;
specialize qw/vp9_iht8x8_64_add dspr2 msa/; specialize qw/vp9_iht8x8_64_add neon dspr2 msa/;
specialize qw/vp9_iht16x16_256_add dspr2 msa/; specialize qw/vp9_iht16x16_256_add dspr2 msa/;
} }
} }
@@ -101,12 +91,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
}
} }
# #
@@ -129,7 +113,7 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp neon sse2 avx2/, "$ssse3_x86_64"; specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64";

View File

@@ -1,419 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// Half butterfly: multiply the four 32-bit lanes of |in| by the fixed-point
// constant |c| (pre-scaled by 4, matching the project's pair_set_epi32
// convention) and leave the full 64-bit products in s[0]/s[1].
static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
                                                      const int c,
                                                      __m128i *const s) {
  const __m128i factor = pair_set_epi32(4 * c, 0);
  __m128i widened[2];

  extend_64bit(in, widened);
  s[1] = _mm_mul_epi32(factor, widened[1]);
  s[0] = _mm_mul_epi32(factor, widened[0]);
}
// Full butterfly with 64-bit intermediates:
//   s0 = in0 * c0 + in1 * c1
//   s1 = in0 * c1 - in1 * c0
// Each output is returned as a pair of registers holding the low/high 64-bit
// halves of the four products; constants are pre-scaled by 4 as required by
// pair_set_epi32.
static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
                                                 const __m128i in1,
                                                 const int c0, const int c1,
                                                 __m128i *const s0,
                                                 __m128i *const s1) {
  const __m128i f0 = pair_set_epi32(4 * c0, 0);
  const __m128i f1 = pair_set_epi32(4 * c1, 0);
  __m128i a[2], b[2];
  __m128i p00[2], p01[2], p10[2], p11[2];

  extend_64bit(in0, a);
  extend_64bit(in1, b);
  p00[0] = _mm_mul_epi32(f0, a[0]);
  p00[1] = _mm_mul_epi32(f0, a[1]);
  p01[0] = _mm_mul_epi32(f0, b[0]);
  p01[1] = _mm_mul_epi32(f0, b[1]);
  p10[0] = _mm_mul_epi32(f1, a[0]);
  p10[1] = _mm_mul_epi32(f1, a[1]);
  p11[0] = _mm_mul_epi32(f1, b[0]);
  p11[1] = _mm_mul_epi32(f1, b[1]);
  s0[0] = _mm_add_epi64(p00[0], p11[0]);
  s0[1] = _mm_add_epi64(p00[1], p11[1]);
  s1[0] = _mm_sub_epi64(p10[0], p01[0]);
  s1[1] = _mm_sub_epi64(p10[1], p01[1]);
}
// In-place 16-point inverse ADST over 4 columns: io[0..15] each hold one row
// of four 32-bit coefficients.  High-bitdepth coefficients would overflow
// 32-bit intermediates, so every butterfly keeps 64-bit products
// (x*[0]/x*[1] pairs) until dct_const_round_shift_64bit() rounds them and
// pack_4() repacks the two 64-bit halves into one 32-bit vector.
static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
      s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
      x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];

  // stage 1: eight butterflies pairing io[15-k] with io[k] using odd cospi
  // constants, then cross-add/subtract the s0..s7 and s8..s15 halves.
  highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
  highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
  highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
  highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
  highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
  highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
                                s11);
  highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
                                s13);
  highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
                                s15);
  x0[0] = _mm_add_epi64(s0[0], s8[0]);
  x0[1] = _mm_add_epi64(s0[1], s8[1]);
  x1[0] = _mm_add_epi64(s1[0], s9[0]);
  x1[1] = _mm_add_epi64(s1[1], s9[1]);
  x2[0] = _mm_add_epi64(s2[0], s10[0]);
  x2[1] = _mm_add_epi64(s2[1], s10[1]);
  x3[0] = _mm_add_epi64(s3[0], s11[0]);
  x3[1] = _mm_add_epi64(s3[1], s11[1]);
  x4[0] = _mm_add_epi64(s4[0], s12[0]);
  x4[1] = _mm_add_epi64(s4[1], s12[1]);
  x5[0] = _mm_add_epi64(s5[0], s13[0]);
  x5[1] = _mm_add_epi64(s5[1], s13[1]);
  x6[0] = _mm_add_epi64(s6[0], s14[0]);
  x6[1] = _mm_add_epi64(s6[1], s14[1]);
  x7[0] = _mm_add_epi64(s7[0], s15[0]);
  x7[1] = _mm_add_epi64(s7[1], s15[1]);
  x8[0] = _mm_sub_epi64(s0[0], s8[0]);
  x8[1] = _mm_sub_epi64(s0[1], s8[1]);
  x9[0] = _mm_sub_epi64(s1[0], s9[0]);
  x9[1] = _mm_sub_epi64(s1[1], s9[1]);
  x10[0] = _mm_sub_epi64(s2[0], s10[0]);
  x10[1] = _mm_sub_epi64(s2[1], s10[1]);
  x11[0] = _mm_sub_epi64(s3[0], s11[0]);
  x11[1] = _mm_sub_epi64(s3[1], s11[1]);
  x12[0] = _mm_sub_epi64(s4[0], s12[0]);
  x12[1] = _mm_sub_epi64(s4[1], s12[1]);
  x13[0] = _mm_sub_epi64(s5[0], s13[0]);
  x13[1] = _mm_sub_epi64(s5[1], s13[1]);
  x14[0] = _mm_sub_epi64(s6[0], s14[0]);
  x14[1] = _mm_sub_epi64(s6[1], s14[1]);
  x15[0] = _mm_sub_epi64(s7[0], s15[0]);
  x15[1] = _mm_sub_epi64(s7[1], s15[1]);
  // Round all 64-bit sums back toward 32-bit precision.
  x0[0] = dct_const_round_shift_64bit(x0[0]);
  x0[1] = dct_const_round_shift_64bit(x0[1]);
  x1[0] = dct_const_round_shift_64bit(x1[0]);
  x1[1] = dct_const_round_shift_64bit(x1[1]);
  x2[0] = dct_const_round_shift_64bit(x2[0]);
  x2[1] = dct_const_round_shift_64bit(x2[1]);
  x3[0] = dct_const_round_shift_64bit(x3[0]);
  x3[1] = dct_const_round_shift_64bit(x3[1]);
  x4[0] = dct_const_round_shift_64bit(x4[0]);
  x4[1] = dct_const_round_shift_64bit(x4[1]);
  x5[0] = dct_const_round_shift_64bit(x5[0]);
  x5[1] = dct_const_round_shift_64bit(x5[1]);
  x6[0] = dct_const_round_shift_64bit(x6[0]);
  x6[1] = dct_const_round_shift_64bit(x6[1]);
  x7[0] = dct_const_round_shift_64bit(x7[0]);
  x7[1] = dct_const_round_shift_64bit(x7[1]);
  x8[0] = dct_const_round_shift_64bit(x8[0]);
  x8[1] = dct_const_round_shift_64bit(x8[1]);
  x9[0] = dct_const_round_shift_64bit(x9[0]);
  x9[1] = dct_const_round_shift_64bit(x9[1]);
  x10[0] = dct_const_round_shift_64bit(x10[0]);
  x10[1] = dct_const_round_shift_64bit(x10[1]);
  x11[0] = dct_const_round_shift_64bit(x11[0]);
  x11[1] = dct_const_round_shift_64bit(x11[1]);
  x12[0] = dct_const_round_shift_64bit(x12[0]);
  x12[1] = dct_const_round_shift_64bit(x12[1]);
  x13[0] = dct_const_round_shift_64bit(x13[0]);
  x13[1] = dct_const_round_shift_64bit(x13[1]);
  x14[0] = dct_const_round_shift_64bit(x14[0]);
  x14[1] = dct_const_round_shift_64bit(x14[1]);
  x15[0] = dct_const_round_shift_64bit(x15[0]);
  x15[1] = dct_const_round_shift_64bit(x15[1]);
  // Repack each 64-bit pair into a single 32-bit lane vector.
  x0[0] = pack_4(x0[0], x0[1]);
  x1[0] = pack_4(x1[0], x1[1]);
  x2[0] = pack_4(x2[0], x2[1]);
  x3[0] = pack_4(x3[0], x3[1]);
  x4[0] = pack_4(x4[0], x4[1]);
  x5[0] = pack_4(x5[0], x5[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);
  x8[0] = pack_4(x8[0], x8[1]);
  x9[0] = pack_4(x9[0], x9[1]);
  x10[0] = pack_4(x10[0], x10[1]);
  x11[0] = pack_4(x11[0], x11[1]);
  x12[0] = pack_4(x12[0], x12[1]);
  x13[0] = pack_4(x13[0], x13[1]);
  x14[0] = pack_4(x14[0], x14[1]);
  x15[0] = pack_4(x15[0], x15[1]);

  // stage 2: first half passes through (32-bit add/sub only); second half
  // goes through another set of butterflies with 64-bit intermediates.
  s0[0] = x0[0];
  s1[0] = x1[0];
  s2[0] = x2[0];
  s3[0] = x3[0];
  s4[0] = x4[0];
  s5[0] = x5[0];
  s6[0] = x6[0];
  s7[0] = x7[0];
  x0[0] = _mm_add_epi32(s0[0], s4[0]);
  x1[0] = _mm_add_epi32(s1[0], s5[0]);
  x2[0] = _mm_add_epi32(s2[0], s6[0]);
  x3[0] = _mm_add_epi32(s3[0], s7[0]);
  x4[0] = _mm_sub_epi32(s0[0], s4[0]);
  x5[0] = _mm_sub_epi32(s1[0], s5[0]);
  x6[0] = _mm_sub_epi32(s2[0], s6[0]);
  x7[0] = _mm_sub_epi32(s3[0], s7[0]);
  highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
  highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
                                s11);
  highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
                                s12);
  highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
                                s14);
  x8[0] = _mm_add_epi64(s8[0], s12[0]);
  x8[1] = _mm_add_epi64(s8[1], s12[1]);
  x9[0] = _mm_add_epi64(s9[0], s13[0]);
  x9[1] = _mm_add_epi64(s9[1], s13[1]);
  x10[0] = _mm_add_epi64(s10[0], s14[0]);
  x10[1] = _mm_add_epi64(s10[1], s14[1]);
  x11[0] = _mm_add_epi64(s11[0], s15[0]);
  x11[1] = _mm_add_epi64(s11[1], s15[1]);
  x12[0] = _mm_sub_epi64(s8[0], s12[0]);
  x12[1] = _mm_sub_epi64(s8[1], s12[1]);
  x13[0] = _mm_sub_epi64(s9[0], s13[0]);
  x13[1] = _mm_sub_epi64(s9[1], s13[1]);
  x14[0] = _mm_sub_epi64(s10[0], s14[0]);
  x14[1] = _mm_sub_epi64(s10[1], s14[1]);
  x15[0] = _mm_sub_epi64(s11[0], s15[0]);
  x15[1] = _mm_sub_epi64(s11[1], s15[1]);
  x8[0] = dct_const_round_shift_64bit(x8[0]);
  x8[1] = dct_const_round_shift_64bit(x8[1]);
  x9[0] = dct_const_round_shift_64bit(x9[0]);
  x9[1] = dct_const_round_shift_64bit(x9[1]);
  x10[0] = dct_const_round_shift_64bit(x10[0]);
  x10[1] = dct_const_round_shift_64bit(x10[1]);
  x11[0] = dct_const_round_shift_64bit(x11[0]);
  x11[1] = dct_const_round_shift_64bit(x11[1]);
  x12[0] = dct_const_round_shift_64bit(x12[0]);
  x12[1] = dct_const_round_shift_64bit(x12[1]);
  x13[0] = dct_const_round_shift_64bit(x13[0]);
  x13[1] = dct_const_round_shift_64bit(x13[1]);
  x14[0] = dct_const_round_shift_64bit(x14[0]);
  x14[1] = dct_const_round_shift_64bit(x14[1]);
  x15[0] = dct_const_round_shift_64bit(x15[0]);
  x15[1] = dct_const_round_shift_64bit(x15[1]);
  x8[0] = pack_4(x8[0], x8[1]);
  x9[0] = pack_4(x9[0], x9[1]);
  x10[0] = pack_4(x10[0], x10[1]);
  x11[0] = pack_4(x11[0], x11[1]);
  x12[0] = pack_4(x12[0], x12[1]);
  x13[0] = pack_4(x13[0], x13[1]);
  x14[0] = pack_4(x14[0], x14[1]);
  x15[0] = pack_4(x15[0], x15[1]);

  // stage 3: pass-through for quarters 0 and 2; cospi_8/24 butterflies for
  // quarters 1 and 3.
  s0[0] = x0[0];
  s1[0] = x1[0];
  s2[0] = x2[0];
  s3[0] = x3[0];
  highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
  highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
  s8[0] = x8[0];
  s9[0] = x9[0];
  s10[0] = x10[0];
  s11[0] = x11[0];
  highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
                                s13);
  highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
                                s14);
  x0[0] = _mm_add_epi32(s0[0], s2[0]);
  x1[0] = _mm_add_epi32(s1[0], s3[0]);
  x2[0] = _mm_sub_epi32(s0[0], s2[0]);
  x3[0] = _mm_sub_epi32(s1[0], s3[0]);
  x4[0] = _mm_add_epi64(s4[0], s6[0]);
  x4[1] = _mm_add_epi64(s4[1], s6[1]);
  x5[0] = _mm_add_epi64(s5[0], s7[0]);
  x5[1] = _mm_add_epi64(s5[1], s7[1]);
  x6[0] = _mm_sub_epi64(s4[0], s6[0]);
  x6[1] = _mm_sub_epi64(s4[1], s6[1]);
  x7[0] = _mm_sub_epi64(s5[0], s7[0]);
  x7[1] = _mm_sub_epi64(s5[1], s7[1]);
  x4[0] = dct_const_round_shift_64bit(x4[0]);
  x4[1] = dct_const_round_shift_64bit(x4[1]);
  x5[0] = dct_const_round_shift_64bit(x5[0]);
  x5[1] = dct_const_round_shift_64bit(x5[1]);
  x6[0] = dct_const_round_shift_64bit(x6[0]);
  x6[1] = dct_const_round_shift_64bit(x6[1]);
  x7[0] = dct_const_round_shift_64bit(x7[0]);
  x7[1] = dct_const_round_shift_64bit(x7[1]);
  x4[0] = pack_4(x4[0], x4[1]);
  x5[0] = pack_4(x5[0], x5[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);
  x8[0] = _mm_add_epi32(s8[0], s10[0]);
  x9[0] = _mm_add_epi32(s9[0], s11[0]);
  x10[0] = _mm_sub_epi32(s8[0], s10[0]);
  x11[0] = _mm_sub_epi32(s9[0], s11[0]);
  x12[0] = _mm_add_epi64(s12[0], s14[0]);
  x12[1] = _mm_add_epi64(s12[1], s14[1]);
  x13[0] = _mm_add_epi64(s13[0], s15[0]);
  x13[1] = _mm_add_epi64(s13[1], s15[1]);
  x14[0] = _mm_sub_epi64(s12[0], s14[0]);
  x14[1] = _mm_sub_epi64(s12[1], s14[1]);
  x15[0] = _mm_sub_epi64(s13[0], s15[0]);
  x15[1] = _mm_sub_epi64(s13[1], s15[1]);
  x12[0] = dct_const_round_shift_64bit(x12[0]);
  x12[1] = dct_const_round_shift_64bit(x12[1]);
  x13[0] = dct_const_round_shift_64bit(x13[0]);
  x13[1] = dct_const_round_shift_64bit(x13[1]);
  x14[0] = dct_const_round_shift_64bit(x14[0]);
  x14[1] = dct_const_round_shift_64bit(x14[1]);
  x15[0] = dct_const_round_shift_64bit(x15[0]);
  x15[1] = dct_const_round_shift_64bit(x15[1]);
  x12[0] = pack_4(x12[0], x12[1]);
  x13[0] = pack_4(x13[0], x13[1]);
  x14[0] = pack_4(x14[0], x14[1]);
  x15[0] = pack_4(x15[0], x15[1]);

  // stage 4: half butterflies by +/-cospi_16_64 on the remaining pairs.
  s2[0] = _mm_add_epi32(x2[0], x3[0]);
  s3[0] = _mm_sub_epi32(x2[0], x3[0]);
  s6[0] = _mm_add_epi32(x7[0], x6[0]);
  s7[0] = _mm_sub_epi32(x7[0], x6[0]);
  s10[0] = _mm_add_epi32(x11[0], x10[0]);
  s11[0] = _mm_sub_epi32(x11[0], x10[0]);
  s14[0] = _mm_add_epi32(x14[0], x15[0]);
  s15[0] = _mm_sub_epi32(x14[0], x15[0]);
  highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
  highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
  highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
  highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
  highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
  highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
  highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
  highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
  x2[0] = dct_const_round_shift_64bit(s2[0]);
  x2[1] = dct_const_round_shift_64bit(s2[1]);
  x3[0] = dct_const_round_shift_64bit(s3[0]);
  x3[1] = dct_const_round_shift_64bit(s3[1]);
  x6[0] = dct_const_round_shift_64bit(s6[0]);
  x6[1] = dct_const_round_shift_64bit(s6[1]);
  x7[0] = dct_const_round_shift_64bit(s7[0]);
  x7[1] = dct_const_round_shift_64bit(s7[1]);
  x10[0] = dct_const_round_shift_64bit(s10[0]);
  x10[1] = dct_const_round_shift_64bit(s10[1]);
  x11[0] = dct_const_round_shift_64bit(s11[0]);
  x11[1] = dct_const_round_shift_64bit(s11[1]);
  x14[0] = dct_const_round_shift_64bit(s14[0]);
  x14[1] = dct_const_round_shift_64bit(s14[1]);
  x15[0] = dct_const_round_shift_64bit(s15[0]);
  x15[1] = dct_const_round_shift_64bit(s15[1]);
  x2[0] = pack_4(x2[0], x2[1]);
  x3[0] = pack_4(x3[0], x3[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);
  x10[0] = pack_4(x10[0], x10[1]);
  x11[0] = pack_4(x11[0], x11[1]);
  x14[0] = pack_4(x14[0], x14[1]);
  x15[0] = pack_4(x15[0], x15[1]);

  // Final output permutation; odd-position outputs 1, 3, 13, 15 are negated
  // (0 - x) per the iADST output sign pattern.
  io[0] = x0[0];
  io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
  io[2] = x12[0];
  io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
  io[4] = x6[0];
  io[5] = x14[0];
  io[6] = x10[0];
  io[7] = x2[0];
  io[8] = x3[0];
  io[9] = x11[0];
  io[10] = x15[0];
  io[11] = x7[0];
  io[12] = x5[0];
  io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
  io[14] = x9[0];
  io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
}
// 16x16 inverse hybrid transform (DCT/ADST per dimension, selected by
// |tx_type|) plus reconstruction: the result is added to |dest| and clipped
// to |bd| bits.  For bd == 8 the coefficients fit in 16 bits, so the faster
// SSE2 16-bit path is used; otherwise the 32-bit SSE4.1 path runs on four
// 4-column slices.  In both paths the first 1-D pass picks DCT when tx_type
// is DCT_DCT/ADST_DCT (ADST otherwise), and the second pass picks DCT when
// tx_type is DCT_DCT/DCT_ADST.
void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                        int stride, int tx_type, int bd) {
  int i;
  __m128i out[16], *in;

  if (bd == 8) {
    // 16-bit path: two 8-column halves (l = left, r = right).
    __m128i l[16], r[16];

    in = l;
    for (i = 0; i < 2; i++) {
      // Load 32-bit coefficients, pack to 16 bits and transpose for the
      // first 1-D transform pass.
      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
      if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
        idct16_8col(in, in);
      } else {
        vpx_iadst16_8col_sse2(in);
      }
      in = r;
      input += 128;
    }

    // Second 1-D pass over transposed data, then write out 8 columns at a
    // time.
    for (i = 0; i < 16; i += 8) {
      int j;
      transpose_16bit_8x8(l + i, out);
      transpose_16bit_8x8(r + i, out + 8);
      if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
        idct16_8col(out, out);
      } else {
        vpx_iadst16_8col_sse2(out);
      }

      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_8(dest + j * stride, out[j], bd);
      }
      dest += 8;
    }
  } else {
    // 32-bit path: process the 16x16 block as four 4-column slices.
    __m128i all[4][16];

    for (i = 0; i < 4; i++) {
      in = all[i];
      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
      if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
        vpx_highbd_idct16_4col_sse4_1(in);
      } else {
        highbd_iadst16_4col_sse4_1(in);
      }
      input += 4 * 16;
    }

    for (i = 0; i < 16; i += 4) {
      int j;
      transpose_32bit_4x4(all[0] + i, out + 0);
      transpose_32bit_4x4(all[1] + i, out + 4);
      transpose_32bit_4x4(all[2] + i, out + 8);
      transpose_32bit_4x4(all[3] + i, out + 12);
      if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
        vpx_highbd_idct16_4col_sse4_1(out);
      } else {
        highbd_iadst16_4col_sse4_1(out);
      }

      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_4(dest + j * stride, out[j], bd);
      }
      dest += 4;
    }
  }
}

View File

@@ -1,131 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// In-place 4-point inverse ADST on io[0..3] (four 32-bit coefficients per
// register) using the sinpi_{1..4}_9 constants.  All products are widened to
// 64 bits (_mm_mul_epi32 on extend_64bit halves) so high-bitdepth inputs
// cannot overflow before dct_const_round_shift_64bit()/pack_4() narrow the
// results back to 32 bits.
static INLINE void highbd_iadst4_sse4_1(__m128i *const io) {
  // Constants are pre-scaled by 4 per the pair_set_epi32 convention.
  const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0);
  const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0);
  const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0);
  const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0);
  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2];
  __m128i temp[2];

  transpose_32bit_4x4(io, io);

  // Per-input products: s0/s1 from io[0], s2 from io[1], s3/s4 from io[2],
  // s5/s6 from io[3].
  extend_64bit(io[0], temp);
  s0[0] = _mm_mul_epi32(pair_c1, temp[0]);
  s0[1] = _mm_mul_epi32(pair_c1, temp[1]);
  s1[0] = _mm_mul_epi32(pair_c2, temp[0]);
  s1[1] = _mm_mul_epi32(pair_c2, temp[1]);
  extend_64bit(io[1], temp);
  s2[0] = _mm_mul_epi32(pair_c3, temp[0]);
  s2[1] = _mm_mul_epi32(pair_c3, temp[1]);
  extend_64bit(io[2], temp);
  s3[0] = _mm_mul_epi32(pair_c4, temp[0]);
  s3[1] = _mm_mul_epi32(pair_c4, temp[1]);
  s4[0] = _mm_mul_epi32(pair_c1, temp[0]);
  s4[1] = _mm_mul_epi32(pair_c1, temp[1]);
  extend_64bit(io[3], temp);
  s5[0] = _mm_mul_epi32(pair_c2, temp[0]);
  s5[1] = _mm_mul_epi32(pair_c2, temp[1]);
  s6[0] = _mm_mul_epi32(pair_c4, temp[0]);
  s6[1] = _mm_mul_epi32(pair_c4, temp[1]);

  // t0 = s0 + s3 + s5, t1 = s1 - s4 - s6 (kept in 64-bit halves).
  t0[0] = _mm_add_epi64(s0[0], s3[0]);
  t0[1] = _mm_add_epi64(s0[1], s3[1]);
  t0[0] = _mm_add_epi64(t0[0], s5[0]);
  t0[1] = _mm_add_epi64(t0[1], s5[1]);
  t1[0] = _mm_sub_epi64(s1[0], s4[0]);
  t1[1] = _mm_sub_epi64(s1[1], s4[1]);
  t1[0] = _mm_sub_epi64(t1[0], s6[0]);
  t1[1] = _mm_sub_epi64(t1[1], s6[1]);
  // t2 = sinpi_3_9 * (io[0] - io[2] + io[3]).
  temp[0] = _mm_sub_epi32(io[0], io[2]);
  temp[0] = _mm_add_epi32(temp[0], io[3]);
  extend_64bit(temp[0], temp);
  t2[0] = _mm_mul_epi32(pair_c3, temp[0]);
  t2[1] = _mm_mul_epi32(pair_c3, temp[1]);

  // Outputs: s0 = t0 + s2, s1 = t1 + s2, s3 = t0 + t1 - s2.
  s0[0] = _mm_add_epi64(t0[0], s2[0]);
  s0[1] = _mm_add_epi64(t0[1], s2[1]);
  s1[0] = _mm_add_epi64(t1[0], s2[0]);
  s1[1] = _mm_add_epi64(t1[1], s2[1]);
  s3[0] = _mm_add_epi64(t0[0], t1[0]);
  s3[1] = _mm_add_epi64(t0[1], t1[1]);
  s3[0] = _mm_sub_epi64(s3[0], s2[0]);
  s3[1] = _mm_sub_epi64(s3[1], s2[1]);
  s0[0] = dct_const_round_shift_64bit(s0[0]);
  s0[1] = dct_const_round_shift_64bit(s0[1]);
  s1[0] = dct_const_round_shift_64bit(s1[0]);
  s1[1] = dct_const_round_shift_64bit(s1[1]);
  s2[0] = dct_const_round_shift_64bit(t2[0]);
  s2[1] = dct_const_round_shift_64bit(t2[1]);
  s3[0] = dct_const_round_shift_64bit(s3[0]);
  s3[1] = dct_const_round_shift_64bit(s3[1]);
  io[0] = pack_4(s0[0], s0[1]);
  io[1] = pack_4(s1[0], s1[1]);
  io[2] = pack_4(s2[0], s2[1]);
  io[3] = pack_4(s3[0], s3[1]);
}
// 4x4 inverse hybrid transform + reconstruction.  tx_type selects DCT or
// ADST independently per dimension: the first pass is DCT for
// DCT_DCT/ADST_DCT, the second pass is DCT for DCT_DCT/DCT_ADST; ADST
// otherwise.  For bd == 8 the packed 16-bit SSE2 transforms are used and the
// result is rounded by (x + 8) >> 4; the high-bitdepth path does the same
// rounding inside wraplow_16bit_shift4().
void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                     int stride, int tx_type, int bd) {
  __m128i io[4];

  io[0] = _mm_load_si128((const __m128i *)(input + 0));
  io[1] = _mm_load_si128((const __m128i *)(input + 4));
  io[2] = _mm_load_si128((const __m128i *)(input + 8));
  io[3] = _mm_load_si128((const __m128i *)(input + 12));

  if (bd == 8) {
    __m128i io_short[2];

    // Pack the 32-bit coefficients into 16 bits for the SSE2 transforms.
    io_short[0] = _mm_packs_epi32(io[0], io[1]);
    io_short[1] = _mm_packs_epi32(io[2], io[3]);
    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
      idct4_sse2(io_short);
    } else {
      iadst4_sse2(io_short);
    }
    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
      idct4_sse2(io_short);
    } else {
      iadst4_sse2(io_short);
    }
    // Final rounding: (x + 8) >> 4.
    io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
    io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
    io[0] = _mm_srai_epi16(io_short[0], 4);
    io[1] = _mm_srai_epi16(io_short[1], 4);
  } else {
    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
      highbd_idct4_sse4_1(io);
    } else {
      highbd_iadst4_sse4_1(io);
    }
    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
      highbd_idct4_sse4_1(io);
    } else {
      highbd_iadst4_sse4_1(io);
    }
    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
  }

  recon_and_store_4x4(io, dest, stride, bd);
}

View File

@@ -1,255 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// Scales every 32-bit lane of |in| by the fixed-point constant |c| (times 4,
// per pair_set_epi32's convention), keeping the full 64-bit products in
// s[0] and s[1].
static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
                                                      const int c,
                                                      __m128i *const s) {
  const __m128i scale = pair_set_epi32(4 * c, 0);
  __m128i halves[2];

  extend_64bit(in, halves);
  s[0] = _mm_mul_epi32(scale, halves[0]);
  s[1] = _mm_mul_epi32(scale, halves[1]);
}
// Butterfly with 64-bit intermediates:
//   s0 = in0 * c0 + in1 * c1
//   s1 = in0 * c1 - in1 * c0
// Outputs are 64-bit product pairs (low half in [0], high half in [1]);
// constants are pre-scaled by 4 as pair_set_epi32 expects.
static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
                                                 const __m128i in1,
                                                 const int c0, const int c1,
                                                 __m128i *const s0,
                                                 __m128i *const s1) {
  const __m128i scale0 = pair_set_epi32(4 * c0, 0);
  const __m128i scale1 = pair_set_epi32(4 * c1, 0);
  __m128i lhs[2], rhs[2];
  __m128i prod00[2], prod01[2], prod10[2], prod11[2];

  extend_64bit(in0, lhs);
  extend_64bit(in1, rhs);
  prod00[0] = _mm_mul_epi32(scale0, lhs[0]);
  prod00[1] = _mm_mul_epi32(scale0, lhs[1]);
  prod01[0] = _mm_mul_epi32(scale0, rhs[0]);
  prod01[1] = _mm_mul_epi32(scale0, rhs[1]);
  prod10[0] = _mm_mul_epi32(scale1, lhs[0]);
  prod10[1] = _mm_mul_epi32(scale1, lhs[1]);
  prod11[0] = _mm_mul_epi32(scale1, rhs[0]);
  prod11[1] = _mm_mul_epi32(scale1, rhs[1]);
  s0[0] = _mm_add_epi64(prod00[0], prod11[0]);
  s0[1] = _mm_add_epi64(prod00[1], prod11[1]);
  s1[0] = _mm_sub_epi64(prod10[0], prod01[0]);
  s1[1] = _mm_sub_epi64(prod10[1], prod01[1]);
}
// In-place 8-point inverse ADST on io[0..7] (four 32-bit coefficients per
// register).  Butterfly products are kept in 64-bit halves (x*[0]/x*[1])
// until dct_const_round_shift_64bit() rounds them and pack_4() repacks them
// into a single 32-bit vector, so high-bitdepth inputs cannot overflow.
static void highbd_iadst8_sse4_1(__m128i *const io) {
  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];

  transpose_32bit_4x4x2(io, io);

  // stage 1: butterflies pairing io[7-k] with io[k] using odd cospi
  // constants, then cross-add/subtract the halves.
  highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
  highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
  x0[0] = _mm_add_epi64(s0[0], s4[0]);
  x0[1] = _mm_add_epi64(s0[1], s4[1]);
  x1[0] = _mm_add_epi64(s1[0], s5[0]);
  x1[1] = _mm_add_epi64(s1[1], s5[1]);
  x4[0] = _mm_sub_epi64(s0[0], s4[0]);
  x4[1] = _mm_sub_epi64(s0[1], s4[1]);
  x5[0] = _mm_sub_epi64(s1[0], s5[0]);
  x5[1] = _mm_sub_epi64(s1[1], s5[1]);
  highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
  highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
  x2[0] = _mm_add_epi64(s2[0], s6[0]);
  x2[1] = _mm_add_epi64(s2[1], s6[1]);
  x3[0] = _mm_add_epi64(s3[0], s7[0]);
  x3[1] = _mm_add_epi64(s3[1], s7[1]);
  x6[0] = _mm_sub_epi64(s2[0], s6[0]);
  x6[1] = _mm_sub_epi64(s2[1], s6[1]);
  x7[0] = _mm_sub_epi64(s3[0], s7[0]);
  x7[1] = _mm_sub_epi64(s3[1], s7[1]);
  x0[0] = dct_const_round_shift_64bit(x0[0]);
  x0[1] = dct_const_round_shift_64bit(x0[1]);
  x1[0] = dct_const_round_shift_64bit(x1[0]);
  x1[1] = dct_const_round_shift_64bit(x1[1]);
  x2[0] = dct_const_round_shift_64bit(x2[0]);
  x2[1] = dct_const_round_shift_64bit(x2[1]);
  x3[0] = dct_const_round_shift_64bit(x3[0]);
  x3[1] = dct_const_round_shift_64bit(x3[1]);
  x4[0] = dct_const_round_shift_64bit(x4[0]);
  x4[1] = dct_const_round_shift_64bit(x4[1]);
  x5[0] = dct_const_round_shift_64bit(x5[0]);
  x5[1] = dct_const_round_shift_64bit(x5[1]);
  x6[0] = dct_const_round_shift_64bit(x6[0]);
  x6[1] = dct_const_round_shift_64bit(x6[1]);
  x7[0] = dct_const_round_shift_64bit(x7[0]);
  x7[1] = dct_const_round_shift_64bit(x7[1]);
  s0[0] = pack_4(x0[0], x0[1]);  // s0 = x0;
  s1[0] = pack_4(x1[0], x1[1]);  // s1 = x1;
  s2[0] = pack_4(x2[0], x2[1]);  // s2 = x2;
  s3[0] = pack_4(x3[0], x3[1]);  // s3 = x3;
  x4[0] = pack_4(x4[0], x4[1]);
  x5[0] = pack_4(x5[0], x5[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);

  // stage 2: first half is 32-bit add/sub only; second half goes through
  // cospi_8/24 butterflies with 64-bit intermediates.
  x0[0] = _mm_add_epi32(s0[0], s2[0]);
  x1[0] = _mm_add_epi32(s1[0], s3[0]);
  x2[0] = _mm_sub_epi32(s0[0], s2[0]);
  x3[0] = _mm_sub_epi32(s1[0], s3[0]);
  highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
  highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
  x4[0] = _mm_add_epi64(s4[0], s6[0]);
  x4[1] = _mm_add_epi64(s4[1], s6[1]);
  x5[0] = _mm_add_epi64(s5[0], s7[0]);
  x5[1] = _mm_add_epi64(s5[1], s7[1]);
  x6[0] = _mm_sub_epi64(s4[0], s6[0]);
  x6[1] = _mm_sub_epi64(s4[1], s6[1]);
  x7[0] = _mm_sub_epi64(s5[0], s7[0]);
  x7[1] = _mm_sub_epi64(s5[1], s7[1]);
  x4[0] = dct_const_round_shift_64bit(x4[0]);
  x4[1] = dct_const_round_shift_64bit(x4[1]);
  x5[0] = dct_const_round_shift_64bit(x5[0]);
  x5[1] = dct_const_round_shift_64bit(x5[1]);
  x6[0] = dct_const_round_shift_64bit(x6[0]);
  x6[1] = dct_const_round_shift_64bit(x6[1]);
  x7[0] = dct_const_round_shift_64bit(x7[0]);
  x7[1] = dct_const_round_shift_64bit(x7[1]);
  x4[0] = pack_4(x4[0], x4[1]);
  x5[0] = pack_4(x5[0], x5[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);

  // stage 3: half butterflies by cospi_16_64 on the remaining pairs.
  s2[0] = _mm_add_epi32(x2[0], x3[0]);
  s3[0] = _mm_sub_epi32(x2[0], x3[0]);
  s6[0] = _mm_add_epi32(x6[0], x7[0]);
  s7[0] = _mm_sub_epi32(x6[0], x7[0]);
  highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
  highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
  highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
  highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
  x2[0] = dct_const_round_shift_64bit(s2[0]);
  x2[1] = dct_const_round_shift_64bit(s2[1]);
  x3[0] = dct_const_round_shift_64bit(s3[0]);
  x3[1] = dct_const_round_shift_64bit(s3[1]);
  x6[0] = dct_const_round_shift_64bit(s6[0]);
  x6[1] = dct_const_round_shift_64bit(s6[1]);
  x7[0] = dct_const_round_shift_64bit(s7[0]);
  x7[1] = dct_const_round_shift_64bit(s7[1]);
  x2[0] = pack_4(x2[0], x2[1]);
  x3[0] = pack_4(x3[0], x3[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);

  // Final output permutation; outputs 1, 3, 5, 7 are negated (0 - x) per the
  // iADST output sign pattern.
  io[0] = x0[0];
  io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
  io[2] = x6[0];
  io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
  io[4] = x3[0];
  io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
  io[6] = x5[0];
  io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
}
// 8x8 inverse hybrid transform + reconstruction.  tx_type selects DCT or
// ADST per dimension (first pass DCT for DCT_DCT/ADST_DCT, second pass DCT
// for DCT_DCT/DCT_ADST; ADST otherwise).  For bd == 8 the coefficients are
// packed into 16 bits and the SSE2 transforms are used; otherwise the
// 32-bit SSE4.1 half-width (4-column) transforms process the block in two
// halves, juggling io[4..7]/io[8..11] through |temp| between passes.
void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                     int stride, int tx_type, int bd) {
  __m128i io[16];

  // Load the 8x8 block of 32-bit coefficients; each row occupies two
  // registers (left 4 lanes at io[r], right 4 lanes at io[r + 4] per half).
  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
  io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
  io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
  io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
  io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
  io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
  io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
  io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
  io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
  io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
  io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
  io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
  io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));

  if (bd == 8) {
    __m128i io_short[8];

    // Pack each row's two 32-bit registers into one 16-bit register.
    io_short[0] = _mm_packs_epi32(io[0], io[4]);
    io_short[1] = _mm_packs_epi32(io[1], io[5]);
    io_short[2] = _mm_packs_epi32(io[2], io[6]);
    io_short[3] = _mm_packs_epi32(io[3], io[7]);
    io_short[4] = _mm_packs_epi32(io[8], io[12]);
    io_short[5] = _mm_packs_epi32(io[9], io[13]);
    io_short[6] = _mm_packs_epi32(io[10], io[14]);
    io_short[7] = _mm_packs_epi32(io[11], io[15]);
    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
      vpx_idct8_sse2(io_short);
    } else {
      iadst8_sse2(io_short);
    }
    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
      vpx_idct8_sse2(io_short);
    } else {
      iadst8_sse2(io_short);
    }
    round_shift_8x8(io_short, io);
  } else {
    __m128i temp[4];

    // First 1-D pass on both halves of the block.
    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
      vpx_highbd_idct8x8_half1d_sse4_1(io);
      vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
    } else {
      highbd_iadst8_sse4_1(io);
      highbd_iadst8_sse4_1(&io[8]);
    }

    // Save io[4..7] and move io[8..11] into their place so the second pass
    // sees the right half; the saved registers are restored into io[8..11]
    // between the two half1d calls.
    temp[0] = io[4];
    temp[1] = io[5];
    temp[2] = io[6];
    temp[3] = io[7];
    io[4] = io[8];
    io[5] = io[9];
    io[6] = io[10];
    io[7] = io[11];

    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
      vpx_highbd_idct8x8_half1d_sse4_1(io);
      io[8] = temp[0];
      io[9] = temp[1];
      io[10] = temp[2];
      io[11] = temp[3];
      vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
    } else {
      highbd_iadst8_sse4_1(io);
      io[8] = temp[0];
      io[9] = temp[1];
      io[10] = temp[2];
      io[11] = temp[3];
      highbd_iadst8_sse4_1(&io[8]);
    }
    highbd_idct8x8_final_round(io);
  }
  recon_and_store_8x8(io, dest, stride, bd);
}

View File

@@ -10,6 +10,8 @@
#include "./vp9_rtcd.h" #include "./vp9_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) { int tx_type) {
@@ -20,23 +22,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[1] = load_input_data8(input + 8); in[1] = load_input_data8(input + 8);
switch (tx_type) { switch (tx_type) {
case DCT_DCT: case 0: // DCT_DCT
idct4_sse2(in); idct4_sse2(in);
idct4_sse2(in); idct4_sse2(in);
break; break;
case ADST_DCT: case 1: // ADST_DCT
idct4_sse2(in); idct4_sse2(in);
iadst4_sse2(in); iadst4_sse2(in);
break; break;
case DCT_ADST: case 2: // DCT_ADST
iadst4_sse2(in); iadst4_sse2(in);
idct4_sse2(in); idct4_sse2(in);
break; break;
default: case 3: // ADST_ADST
assert(tx_type == ADST_ADST);
iadst4_sse2(in); iadst4_sse2(in);
iadst4_sse2(in); iadst4_sse2(in);
break; break;
default: assert(0); break;
} }
// Final round and shift // Final round and shift
@@ -65,23 +67,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[7] = load_input_data8(input + 8 * 7); in[7] = load_input_data8(input + 8 * 7);
switch (tx_type) { switch (tx_type) {
case DCT_DCT: case 0: // DCT_DCT
vpx_idct8_sse2(in); idct8_sse2(in);
vpx_idct8_sse2(in); idct8_sse2(in);
break; break;
case ADST_DCT: case 1: // ADST_DCT
vpx_idct8_sse2(in); idct8_sse2(in);
iadst8_sse2(in); iadst8_sse2(in);
break; break;
case DCT_ADST: case 2: // DCT_ADST
iadst8_sse2(in); iadst8_sse2(in);
vpx_idct8_sse2(in); idct8_sse2(in);
break; break;
default: case 3: // ADST_ADST
assert(tx_type == ADST_ADST);
iadst8_sse2(in); iadst8_sse2(in);
iadst8_sse2(in); iadst8_sse2(in);
break; break;
default: assert(0); break;
} }
// Final rounding and shift // Final rounding and shift
@@ -199,23 +201,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
load_buffer_8x16(input, in1); load_buffer_8x16(input, in1);
switch (tx_type) { switch (tx_type) {
case DCT_DCT: case 0: // DCT_DCT
idct16_sse2(in0, in1); idct16_sse2(in0, in1);
idct16_sse2(in0, in1); idct16_sse2(in0, in1);
break; break;
case ADST_DCT: case 1: // ADST_DCT
idct16_sse2(in0, in1); idct16_sse2(in0, in1);
iadst16_sse2(in0, in1); iadst16_sse2(in0, in1);
break; break;
case DCT_ADST: case 2: // DCT_ADST
iadst16_sse2(in0, in1); iadst16_sse2(in0, in1);
idct16_sse2(in0, in1); idct16_sse2(in0, in1);
break; break;
default: case 3: // ADST_ADST
assert(tx_type == ADST_ADST);
iadst16_sse2(in0, in1); iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1); iadst16_sse2(in0, in1);
break; break;
default: assert(0); break;
} }
write_buffer_8x16(dest, in0, stride); write_buffer_8x16(dest, in0, stride);

View File

@@ -464,6 +464,10 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5);
} }
} }
if (cpi->svc.spatial_layer_id > 0) {
cr->motion_thresh = 4;
cr->rate_boost_fac = 12;
}
if (cpi->oxcf.rc_mode == VPX_VBR) { if (cpi->oxcf.rc_mode == VPX_VBR) {
// To be adjusted for VBR mode, e.g., based on gf period and boost. // To be adjusted for VBR mode, e.g., based on gf period and boost.
// For now use smaller qp-delta (than CBR), no second boosted seg, and // For now use smaller qp-delta (than CBR), no second boosted seg, and

View File

@@ -12,10 +12,7 @@
#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_encoder.h"
static const BLOCK_SIZE square[] = { static const BLOCK_SIZE square[] = {
BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
BLOCK_16X16,
BLOCK_32X32,
BLOCK_64X64,
}; };
static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,

View File

@@ -189,12 +189,11 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv, int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv,
int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx,
int use_svc, int spatial_layer) { int use_svc) {
const int sse_diff = (ctx->newmv_sse == UINT_MAX) const int sse_diff = (ctx->newmv_sse == UINT_MAX)
? 0 ? 0
: ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
int frame; int frame;
int denoise_layer_idx = 0;
MACROBLOCKD *filter_mbd = &mb->e_mbd; MACROBLOCKD *filter_mbd = &mb->e_mbd;
MODE_INFO *mi = filter_mbd->mi[0]; MODE_INFO *mi = filter_mbd->mi[0];
MODE_INFO saved_mi; MODE_INFO saved_mi;
@@ -255,10 +254,6 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
frame = lst_fb_idx + 1; frame = lst_fb_idx + 1;
else if (frame == GOLDEN_FRAME) else if (frame == GOLDEN_FRAME)
frame = gld_fb_idx + 1; frame = gld_fb_idx + 1;
// Shift for the second spatial layer.
if (num_spatial_layers - spatial_layer == 2)
frame = frame + denoiser->num_ref_frames;
denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
} }
if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
@@ -294,21 +289,18 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col); denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col);
filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride; filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
filter_mbd->plane[0].dst.buf = block_start( filter_mbd->plane[0].dst.buf =
denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer, block_start(denoiser->mc_running_avg_y.y_buffer,
denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col); denoiser->mc_running_avg_y.y_stride, mi_row, mi_col);
filter_mbd->plane[0].dst.stride = filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
denoiser->mc_running_avg_y[denoise_layer_idx].y_stride; filter_mbd->plane[1].dst.buf =
filter_mbd->plane[1].dst.buf = block_start( block_start(denoiser->mc_running_avg_y.u_buffer,
denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer, denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
filter_mbd->plane[1].dst.stride = filter_mbd->plane[2].dst.buf =
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; block_start(denoiser->mc_running_avg_y.v_buffer,
filter_mbd->plane[2].dst.buf = block_start( denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer, filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
filter_mbd->plane[2].dst.stride =
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
@@ -332,17 +324,9 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
int zeromv_filter = 0; int zeromv_filter = 0;
VP9_DENOISER *denoiser = &cpi->denoiser; VP9_DENOISER *denoiser = &cpi->denoiser;
VP9_DENOISER_DECISION decision = COPY_BLOCK; VP9_DENOISER_DECISION decision = COPY_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
const int shift = YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
? denoiser->num_ref_frames
: 0;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
const int denoise_layer_index =
cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
uint8_t *mc_avg_start = uint8_t *mc_avg_start =
block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
struct buf_2d src = mb->plane[0].src; struct buf_2d src = mb->plane[0].src;
@@ -397,7 +381,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
&cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx,
cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id); cpi->gld_fb_idx, cpi->use_svc);
if (decision == FILTER_BLOCK) { if (decision == FILTER_BLOCK) {
decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
@@ -448,8 +432,7 @@ void vp9_denoiser_update_frame_info(
VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
int svc_base_is_key, int second_spatial_layer) { int svc_base_is_key) {
const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
// Copy source into denoised reference buffers on KEY_FRAME or // Copy source into denoised reference buffers on KEY_FRAME or
// if the just encoded frame was resized. For SVC, copy source if the base // if the just encoded frame was resized. For SVC, copy source if the base
// spatial layer was key frame. // spatial layer was key frame.
@@ -458,8 +441,8 @@ void vp9_denoiser_update_frame_info(
int i; int i;
// Start at 1 so as not to overwrite the INTRA_FRAME // Start at 1 so as not to overwrite the INTRA_FRAME
for (i = 1; i < denoiser->num_ref_frames; ++i) { for (i = 1; i < denoiser->num_ref_frames; ++i) {
if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) if (denoiser->running_avg_y[i].buffer_alloc != NULL)
copy_frame(&denoiser->running_avg_y[i + shift], &src); copy_frame(&denoiser->running_avg_y[i], &src);
} }
denoiser->reset = 0; denoiser->reset = 0;
return; return;
@@ -468,29 +451,29 @@ void vp9_denoiser_update_frame_info(
// If more than one refresh occurs, must copy frame buffer. // If more than one refresh occurs, must copy frame buffer.
if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) {
if (refresh_alt_ref_frame) { if (refresh_alt_ref_frame) {
copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME + shift]); &denoiser->running_avg_y[INTRA_FRAME]);
} }
if (refresh_golden_frame) { if (refresh_golden_frame) {
copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME + shift]); &denoiser->running_avg_y[INTRA_FRAME]);
} }
if (refresh_last_frame) { if (refresh_last_frame) {
copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME + shift]); &denoiser->running_avg_y[INTRA_FRAME]);
} }
} else { } else {
if (refresh_alt_ref_frame) { if (refresh_alt_ref_frame) {
swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME + shift]); &denoiser->running_avg_y[INTRA_FRAME]);
} }
if (refresh_golden_frame) { if (refresh_golden_frame) {
swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME + shift]); &denoiser->running_avg_y[INTRA_FRAME]);
} }
if (refresh_last_frame) { if (refresh_last_frame) {
swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME + shift]); &denoiser->running_avg_y[INTRA_FRAME]);
} }
} }
} }
@@ -539,75 +522,44 @@ static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm,
} }
int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
int svc_buf_shift, int refresh_alt, int refresh_alt, int refresh_gld, int refresh_lst,
int refresh_gld, int refresh_lst, int alt_fb_idx, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) {
int gld_fb_idx, int lst_fb_idx) {
int fail = 0; int fail = 0;
if (refresh_alt) { if (refresh_alt) {
// Increase the frame buffer index by 1 to map it to the buffer index in the // Increase the frame buffer index by 1 to map it to the buffer index in the
// denoiser. // denoiser.
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, alt_fb_idx + 1);
alt_fb_idx + 1 + svc_buf_shift);
if (fail) return 1; if (fail) return 1;
} }
if (refresh_gld) { if (refresh_gld) {
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, gld_fb_idx + 1);
gld_fb_idx + 1 + svc_buf_shift);
if (fail) return 1; if (fail) return 1;
} }
if (refresh_lst) { if (refresh_lst) {
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, lst_fb_idx + 1);
lst_fb_idx + 1 + svc_buf_shift);
if (fail) return 1; if (fail) return 1;
} }
return 0; return 0;
} }
int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, int vp9_denoiser_alloc(VP9_COMMON *cm, int use_svc, VP9_DENOISER *denoiser,
int use_svc, int noise_sen, int width, int height, int width, int height, int ssx, int ssy,
int ssx, int ssy,
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth, int use_highbitdepth,
#endif #endif
int border) { int border) {
int i, layer, fail, init_num_ref_frames; int i, fail, init_num_ref_frames;
const int legacy_byte_alignment = 0; const int legacy_byte_alignment = 0;
int num_layers = 1;
int scaled_width = width;
int scaled_height = height;
if (use_svc) {
LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
svc->number_temporal_layers +
svc->temporal_layer_id];
get_layer_resolution(width, height, lc->scaling_factor_num,
lc->scaling_factor_den, &scaled_width, &scaled_height);
// For SVC: only denoise at most 2 spatial (highest) layers.
if (noise_sen >= 2)
// Denoise from one spatial layer below the top.
svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0);
else
// Only denoise the top spatial layer.
svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0);
num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
}
assert(denoiser != NULL); assert(denoiser != NULL);
denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
denoiser->num_layers = num_layers;
CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
vpx_calloc(denoiser->num_ref_frames * num_layers,
sizeof(denoiser->running_avg_y[0])));
CHECK_MEM_ERROR( CHECK_MEM_ERROR(
cm, denoiser->mc_running_avg_y, cm, denoiser->running_avg_y,
vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); vpx_calloc(denoiser->num_ref_frames, sizeof(denoiser->running_avg_y[0])));
for (layer = 0; layer < num_layers; ++layer) {
const int denoise_width = (layer == 0) ? width : scaled_width;
const int denoise_height = (layer == 0) ? height : scaled_height;
for (i = 0; i < init_num_ref_frames; ++i) { for (i = 0; i < init_num_ref_frames; ++i) {
fail = vpx_alloc_frame_buffer( fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
&denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], ssx, ssy,
denoise_width, denoise_height, ssx, ssy,
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth, use_highbitdepth,
#endif #endif
@@ -621,8 +573,8 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
#endif #endif
} }
fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer], fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx,
denoise_width, denoise_height, ssx, ssy, ssy,
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth, use_highbitdepth,
#endif #endif
@@ -631,10 +583,7 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
vp9_denoiser_free(denoiser); vp9_denoiser_free(denoiser);
return 1; return 1;
} }
}
// denoiser->last_source only used for noise_estimation, so only for top
// layer.
fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth, use_highbitdepth,
@@ -660,18 +609,12 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
return; return;
} }
denoiser->frame_buffer_initialized = 0; denoiser->frame_buffer_initialized = 0;
for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { for (i = 0; i < denoiser->num_ref_frames; ++i) {
vpx_free_frame_buffer(&denoiser->running_avg_y[i]); vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
} }
vpx_free(denoiser->running_avg_y); vpx_free(denoiser->running_avg_y);
denoiser->running_avg_y = NULL; denoiser->running_avg_y = NULL;
vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
for (i = 0; i < denoiser->num_layers; ++i) {
vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
}
vpx_free(denoiser->mc_running_avg_y);
denoiser->mc_running_avg_y = NULL;
vpx_free_frame_buffer(&denoiser->last_source); vpx_free_frame_buffer(&denoiser->last_source);
} }

View File

@@ -44,12 +44,11 @@ typedef enum vp9_denoiser_level {
typedef struct vp9_denoiser { typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG *running_avg_y; YV12_BUFFER_CONFIG *running_avg_y;
YV12_BUFFER_CONFIG *mc_running_avg_y; YV12_BUFFER_CONFIG mc_running_avg_y;
YV12_BUFFER_CONFIG last_source; YV12_BUFFER_CONFIG last_source;
int frame_buffer_initialized; int frame_buffer_initialized;
int reset; int reset;
int num_ref_frames; int num_ref_frames;
int num_layers;
VP9_DENOISER_LEVEL denoising_level; VP9_DENOISER_LEVEL denoising_level;
VP9_DENOISER_LEVEL prev_denoising_level; VP9_DENOISER_LEVEL prev_denoising_level;
} VP9_DENOISER; } VP9_DENOISER;
@@ -67,13 +66,12 @@ typedef struct {
} VP9_PICKMODE_CTX_DEN; } VP9_PICKMODE_CTX_DEN;
struct VP9_COMP; struct VP9_COMP;
struct SVC;
void vp9_denoiser_update_frame_info( void vp9_denoiser_update_frame_info(
VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
int svc_base_is_key, int second_spatial_layer); int svc_base_is_key);
void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
@@ -86,13 +84,11 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
PICK_MODE_CONTEXT *ctx); PICK_MODE_CONTEXT *ctx);
int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
int svc_buf_shift, int refresh_alt, int refresh_alt, int refresh_gld, int refresh_lst,
int refresh_gld, int refresh_lst, int alt_fb_idx, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx);
int gld_fb_idx, int lst_fb_idx);
int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, int vp9_denoiser_alloc(VP9_COMMON *cm, int use_svc, VP9_DENOISER *denoiser,
int use_svc, int noise_sen, int width, int height, int width, int height, int ssx, int ssy,
int ssx, int ssy,
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth, int use_highbitdepth,
#endif #endif

View File

@@ -1513,8 +1513,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
} }
} }
} }
if (is_key_frame || if (is_key_frame || (low_res &&
(low_res && vt.split[i].split[j].part_variances.none.variance > vt.split[i].split[j].part_variances.none.variance >
threshold_4x4avg)) { threshold_4x4avg)) {
force_split[split_index] = 0; force_split[split_index] = 0;
// Go down to 4x4 down-sampling for variance. // Go down to 4x4 down-sampling for variance.
@@ -3403,8 +3403,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
// Rate and distortion based partition search termination clause. // Rate and distortion based partition search termination clause.
if (!cpi->sf.ml_partition_search_early_termination && if (!cpi->sf.ml_partition_search_early_termination &&
!x->e_mbd.lossless && !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
((best_rdc.dist < (dist_breakout_thr >> 2)) ||
(best_rdc.dist < dist_breakout_thr && (best_rdc.dist < dist_breakout_thr &&
best_rdc.rate < rate_breakout_thr))) { best_rdc.rate < rate_breakout_thr))) {
do_rect = 0; do_rect = 0;
@@ -4621,9 +4620,8 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
CHECK_MEM_ERROR( CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows *
cm, cpi->tile_data, sizeof(*cpi->tile_data)));
vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
cpi->allocated_tiles = tile_cols * tile_rows; cpi->allocated_tiles = tile_cols * tile_rows;
for (tile_row = 0; tile_row < tile_rows; ++tile_row) for (tile_row = 0; tile_row < tile_rows; ++tile_row)

View File

@@ -50,8 +50,7 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
} }
static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
{ 10, 6 }, { 10, 6 }, { 8, 5 },
{ 8, 5 },
}; };
// 'num' can be negative, but 'shift' must be non-negative. // 'num' can be negative, but 'shift' must be non-negative.
@@ -201,8 +200,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
const int band_next = band_translate[i + 1]; const int band_next = band_translate[i + 1];
const int token_next = const int token_next =
(i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] unsigned int(
[ENTROPY_TOKENS] = *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
token_costs + band_next; token_costs + band_next;
token_cache[rc] = vp9_pt_energy_class[t0]; token_cache[rc] = vp9_pt_energy_class[t0];
ctx_next = get_coef_context(nb, token_cache, i + 1); ctx_next = get_coef_context(nb, token_cache, i + 1);

View File

@@ -65,12 +65,12 @@
#define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_INACTIVE 7
#define AM_SEGMENT_ID_ACTIVE 0 #define AM_SEGMENT_ID_ACTIVE 0
// Whether to use high precision mv for altref computation. #define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv
#define ALTREF_HIGH_PRECISION_MV 1 // for altref computation.
#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision
// Q threshold for high precision mv. Choose a very high value for now so that // mv. Choose a very high value for
// HIGH_PRECISION is always chosen. // now so that HIGH_PRECISION is always
#define HIGH_PRECISION_MV_QTHRESH 200 // chosen.
#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold #define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold
#define FRAME_RATE_FACTOR 8 #define FRAME_RATE_FACTOR 8
@@ -437,37 +437,34 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) {
/* clang-format off */ /* clang-format off */
const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
// sample rate size breadth bitrate cpb { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 },
{ LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 }, { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 },
{ LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 }, { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 },
{ LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 }, { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 },
{ LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 }, { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 },
{ LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 }, { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 },
{ LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 }, { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 },
{ LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 }, { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 },
{ LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 }, { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 },
{ LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 }, { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 },
{ LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 },
// TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
// they are finalized (currently tentative). // they are finalized (currently tentative).
{ LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 }, { LEVEL_5_2, 1176502272, 8912896, 180000, 90000, 8, 8, 10, 4 },
{ LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 }, { LEVEL_6, 1176502272, 35651584, 180000, 90000, 8, 16, 10, 4 },
{ LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 }, { LEVEL_6_1, 2353004544u, 35651584, 240000, 180000, 8, 16, 10, 4 },
{ LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 }, { LEVEL_6_2, 4706009088u, 35651584, 480000, 360000, 8, 16, 10, 4 },
}; };
/* clang-format on */ /* clang-format on */
static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] =
"The average bit-rate is too high.", { "The average bit-rate is too high.",
"The picture size is too large.", "The picture size is too large.",
"The picture width/height is too large.",
"The luma sample rate is too large.", "The luma sample rate is too large.",
"The CPB size is too large.", "The CPB size is too large.",
"The compression ratio is too small", "The compression ratio is too small",
"Too many column tiles are used.", "Too many column tiles are used.",
"The alt-ref distance is too small.", "The alt-ref distance is too small.",
"Too many reference buffers are used." "Too many reference buffers are used." };
};
static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
switch (mode) { switch (mode) {
@@ -547,74 +544,6 @@ static void apply_active_map(VP9_COMP *cpi) {
} }
} }
static void apply_roi_map(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
struct segmentation *const seg = &cm->seg;
vpx_roi_map_t *roi = &cpi->roi;
const int *delta_q = roi->delta_q;
const int *delta_lf = roi->delta_lf;
const int *skip = roi->skip;
int ref_frame[8];
int internal_delta_q[MAX_SEGMENTS];
int i;
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
// TODO(jianj): Investigate why ROI not working in speed < 5 or in non
// realtime mode.
if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return;
if (!roi->enabled) return;
memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame));
vp9_enable_segmentation(seg);
vp9_clearall_segfeatures(seg);
// Select delta coding method;
seg->abs_delta = SEGMENT_DELTADATA;
memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols));
for (i = 0; i < MAX_SEGMENTS; ++i) {
// Translate the external delta q values to internal values.
internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i]));
if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i];
vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q);
vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
if (internal_delta_q[i] != 0) {
vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]);
}
if (delta_lf[i] != 0) {
vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF);
vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]);
}
if (skip[i] != 0) {
vp9_enable_segfeature(seg, i, SEG_LVL_SKIP);
vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]);
}
if (ref_frame[i] >= 0) {
int valid_ref = 1;
// ALTREF is not used as reference for nonrd_pickmode with 0 lag.
if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode)
valid_ref = 0;
// If GOLDEN is selected, make sure it's set as reference.
if (ref_frame[i] == GOLDEN_FRAME &&
!(cpi->ref_frame_flags & flag_list[ref_frame[i]])) {
valid_ref = 0;
}
// GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are
// same reference.
if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0)
ref_frame[i] = LAST_FRAME;
if (valid_ref) {
vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME);
vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]);
}
}
}
roi->enabled = 1;
}
static void init_level_info(Vp9LevelInfo *level_info) { static void init_level_info(Vp9LevelInfo *level_info) {
Vp9LevelStats *const level_stats = &level_info->level_stats; Vp9LevelStats *const level_stats = &level_info->level_stats;
Vp9LevelSpec *const level_spec = &level_info->level_spec; Vp9LevelSpec *const level_spec = &level_info->level_spec;
@@ -625,13 +554,6 @@ static void init_level_info(Vp9LevelInfo *level_info) {
level_spec->min_altref_distance = INT_MAX; level_spec->min_altref_distance = INT_MAX;
} }
static int check_seg_range(int seg_data[8], int range) {
return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range ||
abs(seg_data[2]) > range || abs(seg_data[3]) > range ||
abs(seg_data[4]) > range || abs(seg_data[5]) > range ||
abs(seg_data[6]) > range || abs(seg_data[7]) > range);
}
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
int i; int i;
const Vp9LevelSpec *this_level; const Vp9LevelSpec *this_level;
@@ -644,8 +566,6 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
(double)this_level->max_luma_sample_rate * (double)this_level->max_luma_sample_rate *
(1 + SAMPLE_RATE_GRACE_P) || (1 + SAMPLE_RATE_GRACE_P) ||
level_spec->max_luma_picture_size > this_level->max_luma_picture_size || level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
level_spec->max_luma_picture_breadth >
this_level->max_luma_picture_breadth ||
level_spec->average_bitrate > this_level->average_bitrate || level_spec->average_bitrate > this_level->average_bitrate ||
level_spec->max_cpb_size > this_level->max_cpb_size || level_spec->max_cpb_size > this_level->max_cpb_size ||
level_spec->compression_ratio < this_level->compression_ratio || level_spec->compression_ratio < this_level->compression_ratio ||
@@ -658,61 +578,6 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
} }
int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
unsigned int cols, int delta_q[8], int delta_lf[8],
int skip[8], int ref_frame[8]) {
VP9_COMMON *cm = &cpi->common;
vpx_roi_map_t *roi = &cpi->roi;
const int range = 63;
const int ref_frame_range = 3; // Alt-ref
const int skip_range = 1;
const int frame_rows = cpi->common.mi_rows;
const int frame_cols = cpi->common.mi_cols;
// Check number of rows and columns match
if (frame_rows != (int)rows || frame_cols != (int)cols) {
return -1;
}
if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
!check_seg_range(ref_frame, ref_frame_range) ||
!check_seg_range(skip, skip_range))
return -1;
// Also disable segmentation if no deltas are specified.
if (!map ||
(!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] |
delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] |
delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] |
delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] |
skip[5] | skip[6] | skip[7]) &&
(ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 &&
ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 &&
ref_frame[6] == -1 && ref_frame[7] == -1))) {
vp9_disable_segmentation(&cm->seg);
cpi->roi.enabled = 0;
return 0;
}
if (roi->roi_map) {
vpx_free(roi->roi_map);
roi->roi_map = NULL;
}
CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols));
// Copy to ROI sturcture in the compressor.
memcpy(roi->roi_map, map, rows * cols);
memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0]));
memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0]));
memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0]));
memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0]));
roi->enabled = 1;
roi->rows = rows;
roi->cols = cols;
return 0;
}
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int cols) { int cols) {
if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
@@ -947,9 +812,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->active_map.map); vpx_free(cpi->active_map.map);
cpi->active_map.map = NULL; cpi->active_map.map = NULL;
vpx_free(cpi->roi.roi_map);
cpi->roi.roi_map = NULL;
vpx_free(cpi->consec_zero_mv); vpx_free(cpi->consec_zero_mv);
cpi->consec_zero_mv = NULL; cpi->consec_zero_mv = NULL;
@@ -1254,9 +1116,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
// For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
// buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
// target of 1/4x1/4. number_spatial_layers must be greater than 2. // target of 1/4x1/4.
if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) {
cpi->svc.number_spatial_layers > 2) {
cpi->svc.scaled_temp_is_alloc = 1; cpi->svc.scaled_temp_is_alloc = 1;
if (vpx_realloc_frame_buffer( if (vpx_realloc_frame_buffer(
&cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1, &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1,
@@ -1358,8 +1219,8 @@ static void set_tile_limits(VP9_COMP *cpi) {
} }
if (cpi->oxcf.target_level == LEVEL_AUTO) { if (cpi->oxcf.target_level == LEVEL_AUTO) {
const int level_tile_cols = const uint32_t pic_size = cpi->common.width * cpi->common.height;
log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); const int level_tile_cols = log_tile_cols_from_picsize_level(pic_size);
if (cm->log2_tile_cols > level_tile_cols) { if (cm->log2_tile_cols > level_tile_cols) {
cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
} }
@@ -1987,8 +1848,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
vp9_cyclic_refresh_reset_resize(cpi); vp9_cyclic_refresh_reset_resize(cpi);
rc->rc_1_frame = 0;
rc->rc_2_frame = 0;
} }
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
@@ -1999,24 +1858,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
(int)cpi->oxcf.target_bandwidth); (int)cpi->oxcf.target_bandwidth);
} }
// Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the
// configuration change has a large change in avg_frame_bandwidth.
// For SVC check for resetting based on spatial layer average bandwidth.
// Also reset buffer level to optimal level.
if (cm->current_video_frame > 0) {
if (cpi->use_svc) {
vp9_svc_check_reset_layer_rc_flag(cpi);
} else {
if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) ||
rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) {
rc->rc_1_frame = 0;
rc->rc_2_frame = 0;
rc->bits_off_target = rc->optimal_buffer_level;
rc->buffer_level = rc->optimal_buffer_level;
}
}
}
cpi->alt_ref_source = NULL; cpi->alt_ref_source = NULL;
rc->is_src_frame_alt_ref = 0; rc->is_src_frame_alt_ref = 0;
@@ -2151,9 +1992,8 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
realloc_segmentation_maps(cpi); realloc_segmentation_maps(cpi);
CHECK_MEM_ERROR( CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols,
cm, cpi->skin_map, sizeof(cpi->skin_map[0])));
vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
@@ -3016,26 +2856,18 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
cpi->denoiser.denoising_level > kDenLowLow) { cpi->denoiser.denoising_level > kDenLowLow) {
int svc_base_is_key = 0; int svc_base_is_key = 0;
int denoise_svc_second_layer = 0;
if (cpi->use_svc) { if (cpi->use_svc) {
int realloc_fail = 0; int realloc_fail = 0;
const int svc_buf_shift =
cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
? cpi->denoiser.num_ref_frames
: 0;
int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
cpi->svc.temporal_layer_id, cpi->svc.temporal_layer_id,
cpi->svc.number_temporal_layers); cpi->svc.number_temporal_layers);
LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
svc_base_is_key = lc->is_key_frame; svc_base_is_key = lc->is_key_frame;
denoise_svc_second_layer =
cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1 // Check if we need to allocate extra buffers in the denoiser for
: 0;
// Check if we need to allocate extra buffers in the denoiser
// for
// refreshed frames. // refreshed frames.
realloc_fail = vp9_denoiser_realloc_svc( realloc_fail = vp9_denoiser_realloc_svc(
cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame, cm, &cpi->denoiser, cpi->refresh_alt_ref_frame,
cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx,
cpi->gld_fb_idx, cpi->lst_fb_idx); cpi->gld_fb_idx, cpi->lst_fb_idx);
if (realloc_fail) if (realloc_fail)
@@ -3046,8 +2878,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
&cpi->denoiser, *cpi->Source, cpi->common.frame_type, &cpi->denoiser, *cpi->Source, cpi->common.frame_type,
cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame,
cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx,
cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key, cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key);
denoise_svc_second_layer);
} }
#endif #endif
if (is_one_pass_cbr_svc(cpi)) { if (is_one_pass_cbr_svc(cpi)) {
@@ -3482,9 +3313,8 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common; VP9_COMMON *const cm = &cpi->common;
if (cpi->oxcf.noise_sensitivity > 0 && if (cpi->oxcf.noise_sensitivity > 0 &&
!cpi->denoiser.frame_buffer_initialized) { !cpi->denoiser.frame_buffer_initialized) {
if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, if (vp9_denoiser_alloc(cm, cpi->use_svc, &cpi->denoiser, cm->width,
cpi->oxcf.noise_sensitivity, cm->width, cm->height, cm->height, cm->subsampling_x, cm->subsampling_y,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth, cm->use_highbitdepth,
#endif #endif
@@ -3765,8 +3595,6 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
// it may be pretty bad for rate-control, // it may be pretty bad for rate-control,
// and I should handle it somehow // and I should handle it somehow
vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
} else if (cpi->roi.enabled && cm->frame_type != KEY_FRAME) {
apply_roi_map(cpi);
} }
apply_active_map(cpi); apply_active_map(cpi);
@@ -4497,15 +4325,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
struct segmentation *const seg = &cm->seg; struct segmentation *const seg = &cm->seg;
TX_SIZE t; TX_SIZE t;
// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
!cpi->svc.rc_drop_superframe && cpi->oxcf.target_bandwidth == 0) {
cpi->svc.skip_enhancement_layer = 1;
vp9_rc_postencode_update_drop_frame(cpi);
cpi->ext_refresh_frame_flags_pending = 0;
return;
}
set_ext_overrides(cpi); set_ext_overrides(cpi);
vpx_clear_system_state(); vpx_clear_system_state();
@@ -4597,6 +4416,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
if (vp9_rc_drop_frame(cpi) || if (vp9_rc_drop_frame(cpi) ||
(is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) { (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) {
vp9_rc_postencode_update_drop_frame(cpi); vp9_rc_postencode_update_drop_frame(cpi);
++cm->current_video_frame;
cpi->ext_refresh_frame_flags_pending = 0; cpi->ext_refresh_frame_flags_pending = 0;
cpi->svc.rc_drop_superframe = 1; cpi->svc.rc_drop_superframe = 1;
cpi->last_frame_dropped = 1; cpi->last_frame_dropped = 1;
@@ -5009,7 +4829,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
int i, idx; int i, idx;
uint64_t luma_samples, dur_end; uint64_t luma_samples, dur_end;
const uint32_t luma_pic_size = cm->width * cm->height; const uint32_t luma_pic_size = cm->width * cm->height;
const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height);
LevelConstraint *const level_constraint = &cpi->level_constraint; LevelConstraint *const level_constraint = &cpi->level_constraint;
const int8_t level_index = level_constraint->level_index; const int8_t level_index = level_constraint->level_index;
double cpb_data_size; double cpb_data_size;
@@ -5113,11 +4932,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
level_spec->max_luma_picture_size = luma_pic_size; level_spec->max_luma_picture_size = luma_pic_size;
} }
// update max_luma_picture_breadth
if (luma_pic_breadth > level_spec->max_luma_picture_breadth) {
level_spec->max_luma_picture_breadth = luma_pic_breadth;
}
// update compression_ratio // update compression_ratio
level_spec->compression_ratio = (double)level_stats->total_uncompressed_size * level_spec->compression_ratio = (double)level_stats->total_uncompressed_size *
cm->bit_depth / cm->bit_depth /
@@ -5138,15 +4952,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]);
} }
if (level_spec->max_luma_picture_breadth >
vp9_level_defs[level_index].max_luma_picture_breadth) {
level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE);
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Failed to encode to the target level %d. %s",
vp9_level_defs[level_index].level,
level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]);
}
if ((double)level_spec->max_luma_sample_rate > if ((double)level_spec->max_luma_sample_rate >
(double)vp9_level_defs[level_index].max_luma_sample_rate * (double)vp9_level_defs[level_index].max_luma_sample_rate *
(1 + SAMPLE_RATE_GRACE_P)) { (1 + SAMPLE_RATE_GRACE_P)) {
@@ -5347,6 +5152,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cm->intra_only = 0; cm->intra_only = 0;
// if the flags indicate intra frame, but if the current picture is for // if the flags indicate intra frame, but if the current picture is for
// non-zero spatial layer, it should not be an intra picture. // non-zero spatial layer, it should not be an intra picture.
// TODO(Won Kap): this needs to change if per-layer intra frame is
// allowed.
if ((source->flags & VPX_EFLAG_FORCE_KF) && if ((source->flags & VPX_EFLAG_FORCE_KF) &&
cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
@@ -5479,6 +5286,21 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
} }
#endif // CONFIG_REALTIME_ONLY #endif // CONFIG_REALTIME_ONLY
#if 1
{
VP9_COMMON *const cm = &cpi->common;
TWO_PASS *const twopass = &cpi->twopass;
GF_GROUP *const gf_group = &twopass->gf_group;
printf("Frame=%d, gf_group_update_type[gf_group_index=%d]=%d, "
"show_frame=%d\n",
cm->current_video_frame, gf_group->index,
gf_group->update_type[gf_group->index],
cm->show_frame);
}
#endif // 0
if (cm->refresh_frame_context) if (cm->refresh_frame_context)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc; cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
@@ -5513,6 +5335,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
double samples = 0.0; double samples = 0.0;
cpi->bytes += (int)(*size); cpi->bytes += (int)(*size);
#if 1
{
printf("Frame %d: rate: %d\n",
cm->current_video_frame, (int)(*size));
}
#endif // 0
if (cm->show_frame) { if (cm->show_frame) {
uint32_t bit_depth = 8; uint32_t bit_depth = 8;
uint32_t in_bit_depth = 8; uint32_t in_bit_depth = 8;
@@ -5542,6 +5371,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->total_samples += psnr.samples[0]; cpi->total_samples += psnr.samples[0];
samples = psnr.samples[0]; samples = psnr.samples[0];
#if 1
{
const int rddiv = cpi->rd.RDDIV;
const int rdmult = cpi->rd.RDMULT;
const int64_t rdcost = RDCOST(
rdmult, rddiv, (int)(*size) * 8, psnr.sse[0]);
printf("Frame %d: distortion: %" PRIu64 " rdcost: %" PRId64 "\n",
cm->current_video_frame, psnr.sse[0], rdcost);
printf("%d %d\n", rddiv, rdmult);
}
#endif // 0
{ {
PSNR_STATS psnr2; PSNR_STATS psnr2;
double frame_ssim2 = 0, weight = 0; double frame_ssim2 = 0, weight = 0;

View File

@@ -383,7 +383,6 @@ typedef struct {
VP9_LEVEL level; VP9_LEVEL level;
uint64_t max_luma_sample_rate; uint64_t max_luma_sample_rate;
uint32_t max_luma_picture_size; uint32_t max_luma_picture_size;
uint32_t max_luma_picture_breadth;
double average_bitrate; // in kilobits per second double average_bitrate; // in kilobits per second
double max_cpb_size; // in kilobits double max_cpb_size; // in kilobits
double compression_ratio; double compression_ratio;
@@ -423,15 +422,14 @@ typedef struct {
typedef enum { typedef enum {
BITRATE_TOO_LARGE = 0, BITRATE_TOO_LARGE = 0,
LUMA_PIC_SIZE_TOO_LARGE, LUMA_PIC_SIZE_TOO_LARGE = 1,
LUMA_PIC_BREADTH_TOO_LARGE, LUMA_SAMPLE_RATE_TOO_LARGE = 2,
LUMA_SAMPLE_RATE_TOO_LARGE, CPB_TOO_LARGE = 3,
CPB_TOO_LARGE, COMPRESSION_RATIO_TOO_SMALL = 4,
COMPRESSION_RATIO_TOO_SMALL, TOO_MANY_COLUMN_TILE = 5,
TOO_MANY_COLUMN_TILE, ALTREF_DIST_TOO_SMALL = 6,
ALTREF_DIST_TOO_SMALL, TOO_MANY_REF_BUFFER = 7,
TOO_MANY_REF_BUFFER, TARGET_LEVEL_FAIL_IDS = 8
TARGET_LEVEL_FAIL_IDS
} TARGET_LEVEL_FAIL_ID; } TARGET_LEVEL_FAIL_ID;
typedef struct { typedef struct {
@@ -723,8 +721,6 @@ typedef struct VP9_COMP {
uint8_t *count_arf_frame_usage; uint8_t *count_arf_frame_usage;
uint8_t *count_lastgolden_frame_usage; uint8_t *count_lastgolden_frame_usage;
vpx_roi_map_t roi;
} VP9_COMP; } VP9_COMP;
void vp9_initialize_enc(void); void vp9_initialize_enc(void);
@@ -870,8 +866,9 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { static INLINE int denoise_svc(const struct VP9_COMP *const cpi) {
return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= return (!cpi->use_svc ||
cpi->svc.first_layer_denoise)); (cpi->use_svc &&
cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
} }
#endif #endif
@@ -923,14 +920,10 @@ static INLINE int get_level_index(VP9_LEVEL level) {
// Return the log2 value of max column tiles corresponding to the level that // Return the log2 value of max column tiles corresponding to the level that
// the picture size fits into. // the picture size fits into.
static INLINE int log_tile_cols_from_picsize_level(uint32_t width, static INLINE int log_tile_cols_from_picsize_level(uint32_t pic_size) {
uint32_t height) {
int i; int i;
const uint32_t pic_size = width * height;
const uint32_t pic_breadth = VPXMAX(width, height);
for (i = LEVEL_1; i < LEVEL_MAX; ++i) { for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
if (vp9_level_defs[i].max_luma_picture_size >= pic_size && if (vp9_level_defs[i].max_luma_picture_size > pic_size) {
vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
return get_msb(vp9_level_defs[i].max_col_tiles); return get_msb(vp9_level_defs[i].max_col_tiles);
} }
} }
@@ -939,10 +932,6 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
unsigned int cols, int delta_q[8], int delta_lf[8],
int skip[8], int ref_frame[8]);
void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_new_framerate(VP9_COMP *cpi, double framerate);
void vp9_set_row_mt(VP9_COMP *cpi); void vp9_set_row_mt(VP9_COMP *cpi);

View File

@@ -66,8 +66,8 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
log2_tile_cols = log2_tile_cols =
clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
if (cpi->oxcf.target_level == LEVEL_AUTO) { if (cpi->oxcf.target_level == LEVEL_AUTO) {
const int level_tile_cols = const uint32_t pic_size = cpi->common.width * cpi->common.height;
log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); const int level_tile_cols = log_tile_cols_from_picsize_level(pic_size);
if (log2_tile_cols > level_tile_cols) { if (log2_tile_cols > level_tile_cols) {
log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
} }
@@ -390,9 +390,8 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
} }
#if !CONFIG_REALTIME_ONLY #if !CONFIG_REALTIME_ONLY
static int first_pass_worker_hook(void *arg1, void *arg2) { static int first_pass_worker_hook(EncWorkerData *const thread_data,
EncWorkerData *const thread_data = (EncWorkerData *)arg1; MultiThreadHandle *multi_thread_ctxt) {
MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
VP9_COMP *const cpi = thread_data->cpi; VP9_COMP *const cpi = thread_data->cpi;
const VP9_COMMON *const cm = &cpi->common; const VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols; const int tile_cols = 1 << cm->log2_tile_cols;
@@ -471,8 +470,8 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
} }
} }
launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt, launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook,
num_workers); multi_thread_ctxt, num_workers);
first_tile_col = &cpi->tile_data[0]; first_tile_col = &cpi->tile_data[0];
for (i = 1; i < tile_cols; i++) { for (i = 1; i < tile_cols; i++) {
@@ -481,9 +480,8 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
} }
} }
static int temporal_filter_worker_hook(void *arg1, void *arg2) { static int temporal_filter_worker_hook(EncWorkerData *const thread_data,
EncWorkerData *const thread_data = (EncWorkerData *)arg1; MultiThreadHandle *multi_thread_ctxt) {
MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
VP9_COMP *const cpi = thread_data->cpi; VP9_COMP *const cpi = thread_data->cpi;
const VP9_COMMON *const cm = &cpi->common; const VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols; const int tile_cols = 1 << cm->log2_tile_cols;
@@ -555,14 +553,13 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
} }
} }
launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt, launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook,
num_workers); multi_thread_ctxt, num_workers);
} }
#endif // !CONFIG_REALTIME_ONLY #endif // !CONFIG_REALTIME_ONLY
static int enc_row_mt_worker_hook(void *arg1, void *arg2) { static int enc_row_mt_worker_hook(EncWorkerData *const thread_data,
EncWorkerData *const thread_data = (EncWorkerData *)arg1; MultiThreadHandle *multi_thread_ctxt) {
MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
VP9_COMP *const cpi = thread_data->cpi; VP9_COMP *const cpi = thread_data->cpi;
const VP9_COMMON *const cm = &cpi->common; const VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols; const int tile_cols = 1 << cm->log2_tile_cols;
@@ -651,8 +648,8 @@ void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
} }
} }
launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt, launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook,
num_workers); multi_thread_ctxt, num_workers);
for (i = 0; i < num_workers; i++) { for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i]; VPxWorker *const worker = &cpi->workers[i];

View File

@@ -44,6 +44,7 @@
#define COMPLEXITY_STATS_OUTPUT 0 #define COMPLEXITY_STATS_OUTPUT 0
#define FIRST_PASS_Q 10.0 #define FIRST_PASS_Q 10.0
#define GF_MAX_BOOST 96.0
#define INTRA_MODE_PENALTY 1024 #define INTRA_MODE_PENALTY 1024
#define MIN_ARF_GF_BOOST 240 #define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01 #define MIN_DECAY_FACTOR 0.01
@@ -731,7 +732,8 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
// Exclude any image dead zone // Exclude any image dead zone
if (fp_acc_data->image_data_start_row > 0) { if (fp_acc_data->image_data_start_row > 0) {
fp_acc_data->intra_skip_count = fp_acc_data->intra_skip_count =
VPXMAX(0, fp_acc_data->intra_skip_count - VPXMAX(0,
fp_acc_data->intra_skip_count -
(fp_acc_data->image_data_start_row * cm->mb_cols * 2)); (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
} }
@@ -1947,7 +1949,6 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
} }
#define BASELINE_ERR_PER_MB 12500.0 #define BASELINE_ERR_PER_MB 12500.0
#define GF_MAX_BOOST 96.0
static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
double this_frame_mv_in_out) { double this_frame_mv_in_out) {
double frame_boost; double frame_boost;
@@ -2237,6 +2238,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
} }
gf_group->arf_update_idx[0] = arf_buffer_indices[0]; gf_group->arf_update_idx[0] = arf_buffer_indices[0];
gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
// Step over the golden frame / overlay frame
if (EOF == input_stats(twopass, &frame_stats)) return;
} }
// Deduct the boost bits for arf (or gf if it is not a key frame) // Deduct the boost bits for arf (or gf if it is not a key frame)
@@ -2281,8 +2285,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
// Define middle frame // Define middle frame
mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
normal_frames = normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
if (normal_frames > 1) if (normal_frames > 1)
normal_frame_bits = (int)(total_group_bits / normal_frames); normal_frame_bits = (int)(total_group_bits / normal_frames);
else else
@@ -2380,8 +2383,6 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise,
// Analyse and define a gf/arf group. // Analyse and define a gf/arf group.
#define ARF_DECAY_BREAKOUT 0.10 #define ARF_DECAY_BREAKOUT 0.10
#define ARF_ABS_ZOOM_THRESH 4.0
static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
VP9_COMMON *const cm = &cpi->common; VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc; RATE_CONTROL *const rc = &cpi->rc;
@@ -2410,6 +2411,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mv_in_out_accumulator = 0.0; double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0;
double mv_ratio_accumulator_thresh; double mv_ratio_accumulator_thresh;
double mv_in_out_thresh;
double abs_mv_in_out_thresh; double abs_mv_in_out_thresh;
double sr_accumulator = 0.0; double sr_accumulator = 0.0;
const double av_err = get_distribution_av_err(cpi, twopass); const double av_err = get_distribution_av_err(cpi, twopass);
@@ -2455,7 +2457,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Motion breakout threshold for loop below depends on image size. // Motion breakout threshold for loop below depends on image size.
mv_ratio_accumulator_thresh = mv_ratio_accumulator_thresh =
(cpi->initial_height + cpi->initial_width) / 4.0; (cpi->initial_height + cpi->initial_width) / 4.0;
abs_mv_in_out_thresh = ARF_ABS_ZOOM_THRESH; mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 300.0;
abs_mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 200.0;
// Set a maximum and minimum interval for the GF group. // Set a maximum and minimum interval for the GF group.
// If the image appears almost completely static we can extend beyond this. // If the image appears almost completely static we can extend beyond this.
@@ -2540,17 +2543,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Update the accumulator for second ref error difference. // Update the accumulator for second ref error difference.
// This is intended to give an indication of how much the coded error is // This is intended to give an indication of how much the coded error is
// increasing over time. // increasing over time.
if (i == 1) {
sr_accumulator += next_frame.coded_error;
} else {
sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error); sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error);
} sr_accumulator = VPXMAX(0.0, sr_accumulator);
} }
// Break out conditions. // Break out conditions.
// Break at maximum of active_max_gf_interval unless almost totally static. if (
if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && // Break at active_max_gf_interval unless almost totally static.
(i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
( (
// Don't break out with a very short interval. // Don't break out with a very short interval.
(i >= active_min_gf_interval) && (i >= active_min_gf_interval) &&
@@ -2559,6 +2559,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(!flash_detected) && (!flash_detected) &&
((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
(abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) ||
(mv_in_out_accumulator < -mv_in_out_thresh) ||
(sr_accumulator > next_frame.intra_error)))) { (sr_accumulator > next_frame.intra_error)))) {
break; break;
} }
@@ -2570,8 +2571,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
// Should we use the alternate reference frame. // Should we use the alternate reference frame.
if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref && if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
(i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { (i >= rc->min_gf_interval)) {
const int forward_frames = (rc->frames_to_key - i >= i - 1) const int forward_frames = (rc->frames_to_key - i >= i - 1)
? i - 1 ? i - 1
: VPXMAX(0, rc->frames_to_key - i); : VPXMAX(0, rc->frames_to_key - i);
@@ -2599,10 +2600,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
#endif #endif
// Set the interval until the next gf. // Set the interval until the next gf.
rc->baseline_gf_interval = // rc->baseline_gf_interval = 8;
(twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
? (i - (is_key_frame || rc->source_alt_ref_pending))
: i;
// Only encode alt reference frame in temporal base layer. So // Only encode alt reference frame in temporal base layer. So
// baseline_gf_interval should be multiple of a temporal layer group // baseline_gf_interval should be multiple of a temporal layer group
@@ -2700,26 +2699,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
#endif #endif
} }
// Intra / Inter threshold very low
#define VERY_LOW_II 1.5
// Clean slide transitions we expect a sharp single frame spike in error.
#define ERROR_SPIKE 5.0
// Slide show transition detection.
// Tests for case where there is very low error either side of the current frame
// but much higher just for this frame. This can help detect key frames in
// slide shows even where the slides are pictures of different sizes.
// Also requires that intra and inter errors are very similar to help eliminate
// harmful false positives.
// It will not help if the transition is a fade or other multi-frame effect.
static int slide_transition(const FIRSTPASS_STATS *this_frame,
const FIRSTPASS_STATS *last_frame,
const FIRSTPASS_STATS *next_frame) {
return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
(this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
(this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
}
// Threshold for use of the lagging second reference frame. High second ref // Threshold for use of the lagging second reference frame. High second ref
// usage may point to a transient event like a flash or occlusion rather than // usage may point to a transient event like a flash or occlusion rather than
// a real scene cut. // a real scene cut.
@@ -2764,7 +2743,6 @@ static int test_candidate_kf(TWO_PASS *twopass,
if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
(next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
(slide_transition(this_frame, last_frame, next_frame)) ||
((pcnt_intra > MIN_INTRA_LEVEL) && ((pcnt_intra > MIN_INTRA_LEVEL) &&
(pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
((this_frame->intra_error / ((this_frame->intra_error /
@@ -2836,7 +2814,6 @@ static int test_candidate_kf(TWO_PASS *twopass,
#define FRAMES_TO_CHECK_DECAY 8 #define FRAMES_TO_CHECK_DECAY 8
#define MIN_KF_TOT_BOOST 300 #define MIN_KF_TOT_BOOST 300
#define KF_BOOST_SCAN_MAX_FRAMES 32 #define KF_BOOST_SCAN_MAX_FRAMES 32
#define KF_ABS_ZOOM_THRESH 6.0
#ifdef AGGRESSIVE_VBR #ifdef AGGRESSIVE_VBR
#define KF_MAX_FRAME_BOOST 80.0 #define KF_MAX_FRAME_BOOST 80.0
@@ -2864,7 +2841,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double kf_group_err = 0.0; double kf_group_err = 0.0;
double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
double sr_accumulator = 0.0; double sr_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
const double av_err = get_distribution_av_err(cpi, twopass); const double av_err = get_distribution_av_err(cpi, twopass);
vp9_zero(next_frame); vp9_zero(next_frame);
@@ -3029,14 +3005,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double zm_factor; double zm_factor;
// Monitor for static sections. // Monitor for static sections.
// First frame in kf group the second ref indicator is invalid.
if (i > 0) {
zero_motion_accumulator = VPXMIN( zero_motion_accumulator = VPXMIN(
zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
} else {
zero_motion_accumulator =
next_frame.pcnt_inter - next_frame.pcnt_motion;
}
// Factor 0.75-1.25 based on how much of frame is static. // Factor 0.75-1.25 based on how much of frame is static.
zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
@@ -3050,14 +3020,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
KF_MAX_FRAME_BOOST * zm_factor); KF_MAX_FRAME_BOOST * zm_factor);
boost_score += frame_boost; boost_score += frame_boost;
if (frame_boost < 25.00) break;
// Measure of zoom. Large zoom tends to indicate reduced boost.
abs_mv_in_out_accumulator +=
fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
if ((frame_boost < 25.00) ||
(abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH))
break;
} else { } else {
break; break;
} }
@@ -3072,16 +3035,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->section_intra_rating = calculate_section_intra_ratio( twopass->section_intra_rating = calculate_section_intra_ratio(
start_position, twopass->stats_in_end, rc->frames_to_key); start_position, twopass->stats_in_end, rc->frames_to_key);
// Special case for static / slide show content but dont apply
// if the kf group is very short.
if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
rc->kf_boost = VPXMAX((rc->frames_to_key * 100), MAX_KF_TOT_BOOST);
} else {
// Apply various clamps for min and max boost // Apply various clamps for min and max boost
rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
}
// Work out how many bits to allocate for the key frame itself. // Work out how many bits to allocate for the key frame itself.
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,

View File

@@ -120,12 +120,12 @@ typedef enum {
typedef struct { typedef struct {
unsigned char index; unsigned char index;
unsigned char first_inter_index; unsigned char first_inter_index;
RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1]; RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1]; FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1]; unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1]; int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
} GF_GROUP; } GF_GROUP;
typedef struct { typedef struct {

View File

@@ -25,9 +25,7 @@ typedef struct {
} ref[MAX_REF_FRAMES]; } ref[MAX_REF_FRAMES];
} MBGRAPH_MB_STATS; } MBGRAPH_MB_STATS;
typedef struct { typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS;
MBGRAPH_MB_STATS *mb_stats;
} MBGRAPH_FRAME_STATS;
struct VP9_COMP; struct VP9_COMP;

View File

@@ -1785,10 +1785,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
} }
static const MV search_pos[4] = { static const MV search_pos[4] = {
{ -1, 0 }, { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
{ 0, -1 },
{ 0, 1 },
{ 1, 0 },
}; };
unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
@@ -1879,10 +1876,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
{ {
const uint8_t *const pos[4] = { const uint8_t *const pos[4] = {
ref_buf - ref_stride, ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride,
ref_buf - 1,
ref_buf + 1,
ref_buf + ref_stride,
}; };
cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);

View File

@@ -21,15 +21,6 @@
#include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_noise_estimate.h"
#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_encoder.h"
#if CONFIG_VP9_TEMPORAL_DENOISING
// For SVC: only do noise estimation on top spatial layer.
static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) {
return (!cpi->use_svc ||
(cpi->use_svc &&
cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
}
#endif
void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
ne->enabled = 0; ne->enabled = 0;
ne->level = kLowLow; ne->level = kLowLow;
@@ -54,7 +45,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) {
#endif #endif
// Enable noise estimation if denoising is on. // Enable noise estimation if denoising is on.
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
cpi->common.width >= 320 && cpi->common.height >= 180) cpi->common.width >= 320 && cpi->common.height >= 180)
return 1; return 1;
#endif #endif
@@ -120,7 +111,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
// Estimate is between current source and last source. // Estimate is between current source and last source.
YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) {
last_source = &cpi->denoiser.last_source; last_source = &cpi->denoiser.last_source;
// Tune these thresholds for different resolutions when denoising is // Tune these thresholds for different resolutions when denoising is
// enabled. // enabled.
@@ -140,7 +131,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
(cpi->svc.number_spatial_layers == 1 && (cpi->svc.number_spatial_layers == 1 &&
(ne->last_w != cm->width || ne->last_h != cm->height))) { (ne->last_w != cm->width || ne->last_h != cm->height))) {
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
copy_frame(&cpi->denoiser.last_source, cpi->Source); copy_frame(&cpi->denoiser.last_source, cpi->Source);
#endif #endif
if (last_source != NULL) { if (last_source != NULL) {
@@ -155,7 +146,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
ne->count = 0; ne->count = 0;
ne->num_frames_estimate = 10; ne->num_frames_estimate = 10;
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
cpi->svc.current_superframe > 1) { cpi->svc.current_superframe > 1) {
vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
copy_frame(&cpi->denoiser.last_source, cpi->Source); copy_frame(&cpi->denoiser.last_source, cpi->Source);
@@ -258,7 +249,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
// Normalize. // Normalize.
avg_est = avg_est / num_samples; avg_est = avg_est / num_samples;
// Update noise estimate. // Update noise estimate.
ne->value = (int)((3 * ne->value + avg_est) >> 2); ne->value = (int)((15 * ne->value + avg_est) >> 4);
ne->count++; ne->count++;
if (ne->count == ne->num_frames_estimate) { if (ne->count == ne->num_frames_estimate) {
// Reset counter and check noise level condition. // Reset counter and check noise level condition.
@@ -266,14 +257,14 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
ne->count = 0; ne->count = 0;
ne->level = vp9_noise_estimate_extract_level(ne); ne->level = vp9_noise_estimate_extract_level(ne);
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
#endif #endif
} }
} }
} }
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
copy_frame(&cpi->denoiser.last_source, cpi->Source); copy_frame(&cpi->denoiser.last_source, cpi->Source);
#endif #endif
} }

View File

@@ -1488,6 +1488,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int skip_ref_find_pred[4] = { 0 }; int skip_ref_find_pred[4] = { 0 };
unsigned int sse_zeromv_normalized = UINT_MAX; unsigned int sse_zeromv_normalized = UINT_MAX;
unsigned int best_sse_sofar = UINT_MAX; unsigned int best_sse_sofar = UINT_MAX;
unsigned int thresh_svc_skip_golden = 500;
#if CONFIG_VP9_TEMPORAL_DENOISING #if CONFIG_VP9_TEMPORAL_DENOISING
VP9_PICKMODE_CTX_DEN ctx_den; VP9_PICKMODE_CTX_DEN ctx_den;
int64_t zero_last_cost_orig = INT64_MAX; int64_t zero_last_cost_orig = INT64_MAX;
@@ -1495,23 +1496,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
#endif #endif
INTERP_FILTER filter_gf_svc = EIGHTTAP; INTERP_FILTER filter_gf_svc = EIGHTTAP;
MV_REFERENCE_FRAME best_second_ref_frame = NONE; MV_REFERENCE_FRAME best_second_ref_frame = NONE;
const struct segmentation *const seg = &cm->seg;
int comp_modes = 0; int comp_modes = 0;
int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES;
int flag_svc_subpel = 0;
int svc_mv_col = 0;
int svc_mv_row = 0;
unsigned int thresh_svc_skip_golden = 500;
// Lower the skip threshold if lower spatial layer is better quality relative
// to current layer.
if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex > 150 &&
cm->base_qindex > cpi->svc.lower_layer_qindex + 15)
thresh_svc_skip_golden = 100;
// Increase skip threshold if lower spatial layer is lower quality relative
// to current layer.
else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex < 140 &&
cm->base_qindex < cpi->svc.lower_layer_qindex - 20)
thresh_svc_skip_golden = 1000;
init_ref_frame_cost(cm, xd, ref_frame_cost); init_ref_frame_cost(cm, xd, ref_frame_cost);
@@ -1649,16 +1635,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME)
comp_modes = 2; comp_modes = 2;
// If the segment reference frame feature is enabled and it's set to GOLDEN
// reference, then make sure we don't skip checking GOLDEN, this is to
// prevent possibility of not picking any mode.
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
usable_ref_frame = GOLDEN_FRAME;
skip_ref_find_pred[GOLDEN_FRAME] = 0;
thresh_svc_skip_golden = 0;
}
for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
if (!skip_ref_find_pred[ref_frame]) { if (!skip_ref_find_pred[ref_frame]) {
find_predictors(cpi, x, ref_frame, frame_mv, const_motion, find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
@@ -1671,18 +1647,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32) if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32)
x->sb_use_mv_part = 0; x->sb_use_mv_part = 0;
// Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used
// an averaging filter for downsampling (phase = 8). If so, we will test
// a nonzero motion mode on the spatial (goldeen) reference.
// The nonzero motion is half pixel shifted to left and top (-4, -4).
if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
svc_force_zero_mode[GOLDEN_FRAME - 1] &&
cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) {
svc_mv_col = -4;
svc_mv_row = -4;
flag_svc_subpel = 1;
}
for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) {
int rate_mv = 0; int rate_mv = 0;
int mode_rd_thresh; int mode_rd_thresh;
@@ -1696,7 +1660,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int inter_mv_mode = 0; int inter_mv_mode = 0;
int skip_this_mv = 0; int skip_this_mv = 0;
int comp_pred = 0; int comp_pred = 0;
int force_gf_mv = 0;
PREDICTION_MODE this_mode; PREDICTION_MODE this_mode;
second_ref_frame = NONE; second_ref_frame = NONE;
@@ -1717,29 +1680,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
comp_pred = 1; comp_pred = 1;
} }
if (ref_frame > usable_ref_frame) continue;
if (skip_ref_find_pred[ref_frame]) continue;
// If the segment reference frame feature is enabled then do nothing if the
// current ref frame is not allowed.
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
continue;
if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) {
force_gf_mv = 1;
// Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
// otherwise set NEWMV to (svc_mv_col, svc_mv_row).
if (this_mode == NEWMV) {
frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col;
frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row;
} else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col ||
frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) {
continue;
}
}
if (comp_pred) { if (comp_pred) {
const struct segmentation *const seg = &cm->seg;
if (!cpi->allow_comp_inter_inter) continue; if (!cpi->allow_comp_inter_inter) continue;
// Skip compound inter modes if ARF is not available. // Skip compound inter modes if ARF is not available.
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
@@ -1748,6 +1690,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue;
} }
if (ref_frame > usable_ref_frame) continue;
if (skip_ref_find_pred[ref_frame]) continue;
// For SVC, skip the golden (spatial) reference search if sse of zeromv_last // For SVC, skip the golden (spatial) reference search if sse of zeromv_last
// is below threshold. // is below threshold.
if (cpi->use_svc && ref_frame == GOLDEN_FRAME && if (cpi->use_svc && ref_frame == GOLDEN_FRAME &&
@@ -1792,7 +1737,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
// is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
// later. // later.
if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
frame_mv[this_mode][ref_frame].as_int != 0) { frame_mv[this_mode][ref_frame].as_int != 0) {
continue; continue;
} }
@@ -1806,15 +1751,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
} }
if (cpi->use_svc) { if (cpi->use_svc) {
if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && if (svc_force_zero_mode[ref_frame - 1] &&
frame_mv[this_mode][ref_frame].as_int != 0) frame_mv[this_mode][ref_frame].as_int != 0)
continue; continue;
} }
// Disable this drop out case if the ref frame segment level feature is
// enabled for this segment. This is to prevent the possibility that we end
// up unable to pick any mode.
if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
if (sf->reference_masking && if (sf->reference_masking &&
!(frame_mv[this_mode][ref_frame].as_int == 0 && !(frame_mv[this_mode][ref_frame].as_int == 0 &&
ref_frame == LAST_FRAME)) { ref_frame == LAST_FRAME)) {
@@ -1838,7 +1779,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
} }
} }
if (ref_frame_skip_mask & (1 << ref_frame)) continue; if (ref_frame_skip_mask & (1 << ref_frame)) continue;
}
// Select prediction reference frames. // Select prediction reference frames.
for (i = 0; i < MAX_MB_PLANE; i++) { for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -1868,7 +1808,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
&rd_thresh_freq_fact[mode_index]))) &rd_thresh_freq_fact[mode_index])))
continue; continue;
if (this_mode == NEWMV && !force_gf_mv) { if (this_mode == NEWMV) {
if (ref_frame > LAST_FRAME && !cpi->use_svc && if (ref_frame > LAST_FRAME && !cpi->use_svc &&
cpi->oxcf.rc_mode == VPX_CBR) { cpi->oxcf.rc_mode == VPX_CBR) {
int tmp_sad; int tmp_sad;
@@ -2009,7 +1949,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
pred_filter_search && pred_filter_search &&
(ref_frame == LAST_FRAME || (ref_frame == LAST_FRAME ||
(ref_frame == GOLDEN_FRAME && !force_gf_mv && (ref_frame == GOLDEN_FRAME &&
(cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
(((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
int pf_rate[3]; int pf_rate[3];
@@ -2233,11 +2173,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// For spatial enhancemanent layer: perform intra prediction only if base // For spatial enhancemanent layer: perform intra prediction only if base
// layer is chosen as the reference. Always perform intra prediction if // layer is chosen as the reference. Always perform intra prediction if
// LAST is the only reference, or is_key_frame is set, or on base // LAST is the only reference or is_key_frame is set.
// temporal layer.
if (cpi->svc.spatial_layer_id) { if (cpi->svc.spatial_layer_id) {
perform_intra_pred = perform_intra_pred =
cpi->svc.temporal_layer_id == 0 ||
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
!(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
(!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
@@ -2247,13 +2185,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
cpi->rc.is_src_frame_alt_ref) cpi->rc.is_src_frame_alt_ref)
perform_intra_pred = 0; perform_intra_pred = 0;
// If the segment reference frame feature is enabled and set then
// skip the intra prediction.
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0)
perform_intra_pred = 0;
// Perform intra prediction search, if the best SAD is above a certain // Perform intra prediction search, if the best SAD is above a certain
// threshold. // threshold.
if (best_rdc.rdcost == INT64_MAX || if (best_rdc.rdcost == INT64_MAX ||

View File

@@ -31,13 +31,10 @@
#include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_ratectrl.h"
// Max rate per frame for 1080P and below encodes if no level requirement given. // Max rate target for 1080P and below encodes under normal circumstances
// For larger formats limit to MAX_MB_RATE bits per MB // (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
// 4Mbits is derived from the level requirement for level 4 (1080P 30) which
// requires that HW can sustain a rate of 16Mbits over a 4 frame group.
// If a lower level requirement is specified then this may over ride this value.
#define MAX_MB_RATE 250 #define MAX_MB_RATE 250
#define MAXRATE_1080P 4000000 #define MAXRATE_1080P 2025000
#define DEFAULT_KF_BOOST 2000 #define DEFAULT_KF_BOOST 2000
#define DEFAULT_GF_BOOST 2000 #define DEFAULT_GF_BOOST 2000
@@ -1103,9 +1100,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
// Baseline value derived from cpi->active_worst_quality and kf boost. // Baseline value derived from cpi->active_worst_quality and kf boost.
active_best_quality = active_best_quality =
get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
active_best_quality /= 4;
}
// Allow somewhat lower kf minq with small image formats. // Allow somewhat lower kf minq with small image formats.
if ((cm->width * cm->height) <= (352 * 288)) { if ((cm->width * cm->height) <= (352 * 288)) {
@@ -1494,22 +1488,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref;
} }
if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0;
rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
if (cpi->use_svc &&
cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
cpi->svc.lower_layer_qindex = cm->base_qindex;
} }
void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
// Update buffer level with zero size, update frame counters, and return. // Update buffer level with zero size, update frame counters, and return.
update_buffer_level(cpi, 0); update_buffer_level(cpi, 0);
cpi->common.current_video_frame++;
cpi->rc.frames_since_key++; cpi->rc.frames_since_key++;
cpi->rc.frames_to_key--; cpi->rc.frames_to_key--;
cpi->rc.rc_2_frame = 0; cpi->rc.rc_2_frame = 0;
cpi->rc.rc_1_frame = 0; cpi->rc.rc_1_frame = 0;
cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
} }
static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
@@ -1593,7 +1580,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
// Adjust boost and af_ratio based on avg_frame_low_motion, which varies // Adjust boost and af_ratio based on avg_frame_low_motion, which varies
// between 0 and 100 (stationary, 100% zero/small motion). // between 0 and 100 (stationary, 100% zero/small motion).
rc->gfu_boost = rc->gfu_boost =
VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / VPXMAX(500,
DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
(rc->avg_frame_low_motion + 100)); (rc->avg_frame_low_motion + 100));
rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
} }
@@ -1869,8 +1857,13 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
cpi->framerate, rc->min_gf_interval); cpi->framerate, rc->min_gf_interval);
// Extended max interval for genuinely static scenes like slide shows. // Extended interval for genuinely static scenes
rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
if (is_altref_enabled(cpi)) {
if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
}
if (rc->max_gf_interval > rc->static_scene_max_gf_interval) if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval; rc->max_gf_interval = rc->static_scene_max_gf_interval;
@@ -1880,12 +1873,9 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
if (oxcf->target_level == LEVEL_AUTO) { if (oxcf->target_level == LEVEL_AUTO) {
const uint32_t pic_size = cpi->common.width * cpi->common.height; const uint32_t pic_size = cpi->common.width * cpi->common.height;
const uint32_t pic_breadth =
VPXMAX(cpi->common.width, cpi->common.height);
int i; int i;
for (i = LEVEL_1; i < LEVEL_MAX; ++i) { for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
if (vp9_level_defs[i].max_luma_picture_size >= pic_size && if (vp9_level_defs[i].max_luma_picture_size > pic_size) {
vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
if (rc->min_gf_interval <= if (rc->min_gf_interval <=
(int)vp9_level_defs[i].min_altref_distance) { (int)vp9_level_defs[i].min_altref_distance) {
rc->min_gf_interval = rc->min_gf_interval =
@@ -1914,12 +1904,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
// A maximum bitrate for a frame is defined. // A maximum bitrate for a frame is defined.
// However this limit is extended if a very high rate is given on the command // The baseline for this aligns with HW implementations that
// line or the the rate cannnot be acheived because of a user specificed max q // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
// (e.g. when the user specifies lossless encode). // per 16x16 MB (averaged over a frame). However this limit is extended if
// // a very high rate is given on the command line or the the rate cannnot
// If a level is specified that requires a lower maximum rate then the level // be acheived because of a user specificed max q (e.g. when the user
// value take precedence. // specifies lossless encode.
vbr_max_bits = vbr_max_bits =
(int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
100); 100);

View File

@@ -34,14 +34,6 @@ extern "C" {
#define FRAME_OVERHEAD_BITS 200 #define FRAME_OVERHEAD_BITS 200
// Threshold used to define a KF group as static (e.g. a slide show).
// Essentially this means that no frame in the group has more than 1% of MBs
// that are not marked as coded with 0,0 motion in the first pass.
#define STATIC_KF_GROUP_THRESH 99
// The maximum duration of a GF group that is static (for example a slide show).
#define MAX_STATIC_GF_GROUP_LENGTH 250
typedef enum { typedef enum {
INTER_NORMAL = 0, INTER_NORMAL = 0,
INTER_HIGH = 1, INTER_HIGH = 1,
@@ -160,8 +152,6 @@ typedef struct {
int rc_2_frame; int rc_2_frame;
int q_1_frame; int q_1_frame;
int q_2_frame; int q_2_frame;
// Keep track of the last target average frame bandwidth.
int last_avg_frame_bandwidth;
// Auto frame-scaling variables. // Auto frame-scaling variables.
FRAME_SCALE_LEVEL frame_size_selector; FRAME_SCALE_LEVEL frame_size_selector;

View File

@@ -59,9 +59,7 @@ typedef struct {
MV_REFERENCE_FRAME ref_frame[2]; MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION; } MODE_DEFINITION;
typedef struct { typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
MV_REFERENCE_FRAME ref_frame[2];
} REF_DEFINITION;
struct rdcost_block_args { struct rdcost_block_args {
const VP9_COMP *cpi; const VP9_COMP *cpi;

View File

@@ -37,16 +37,14 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
svc->scaled_one_half = 0; svc->scaled_one_half = 0;
svc->current_superframe = 0; svc->current_superframe = 0;
svc->non_reference_frame = 0; svc->non_reference_frame = 0;
svc->skip_enhancement_layer = 0;
for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
svc->ext_frame_flags[sl] = 0; svc->ext_frame_flags[sl] = 0;
svc->ext_lst_fb_idx[sl] = 0; svc->ext_lst_fb_idx[sl] = 0;
svc->ext_gld_fb_idx[sl] = 1; svc->ext_gld_fb_idx[sl] = 1;
svc->ext_alt_fb_idx[sl] = 2; svc->ext_alt_fb_idx[sl] = 2;
svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_type[sl] = EIGHTTAP;
svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter.
} }
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
@@ -155,8 +153,6 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
int sl, tl, layer = 0, spatial_layer_target; int sl, tl, layer = 0, spatial_layer_target;
float bitrate_alloc = 1.0; float bitrate_alloc = 1.0;
cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
@@ -393,7 +389,7 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
.is_key_frame; .is_key_frame;
} }
void get_layer_resolution(const int width_org, const int height_org, static void get_layer_resolution(const int width_org, const int height_org,
const int num, const int den, int *width_out, const int num, const int den, int *width_out,
int *height_out) { int *height_out) {
int w, h; int w, h;
@@ -549,8 +545,6 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
if (!spatial_id) { if (!spatial_id) {
cpi->ref_frame_flags = VP9_LAST_FLAG; cpi->ref_frame_flags = VP9_LAST_FLAG;
} else { } else {
if (spatial_id == cpi->svc.number_spatial_layers - 1)
cpi->ext_refresh_alt_ref_frame = 0;
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
} }
} }
@@ -610,7 +604,6 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
int width = 0, height = 0; int width = 0, height = 0;
LAYER_CONTEXT *lc = NULL; LAYER_CONTEXT *lc = NULL;
cpi->svc.skip_enhancement_layer = 0;
if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1;
cpi->svc.force_zero_mode_spatial_ref = 1; cpi->svc.force_zero_mode_spatial_ref = 1;
cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride; cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride;
@@ -663,14 +656,10 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
lc->scaling_factor_num, lc->scaling_factor_den, &width, lc->scaling_factor_num, lc->scaling_factor_den, &width,
&height); &height);
// For resolutions <= VGA: set phase of the filter = 8 (for symmetric // For resolutions <= QVGA: set phase of the filter = 8 (for symmetric
// averaging filter), use bilinear for now. // averaging filter), use bilinear for now.
if (width * height <= 640 * 480) { if (width * height <= 320 * 240) {
cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR;
// Use Eightap_smooth for low resolutions.
if (width * height <= 320 * 240)
cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] =
EIGHTTAP_SMOOTH;
cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8;
} }
@@ -872,28 +861,3 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
vp9_update_temporal_layer_framerate(cpi); vp9_update_temporal_layer_framerate(cpi);
vp9_restore_layer_context(cpi); vp9_restore_layer_context(cpi);
} }
void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
SVC *svc = &cpi->svc;
int sl, tl;
for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
// Check for reset based on avg_frame_bandwidth for spatial layer sl.
int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
svc->number_temporal_layers);
LAYER_CONTEXT *lc = &svc->layer_context[layer];
RATE_CONTROL *lrc = &lc->rc;
if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) ||
lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) {
// Reset for all temporal layers with spatial layer sl.
for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
LAYER_CONTEXT *lc = &svc->layer_context[layer];
RATE_CONTROL *lrc = &lc->rc;
lrc->rc_1_frame = 0;
lrc->rc_2_frame = 0;
lrc->bits_off_target = lrc->optimal_buffer_level;
lrc->buffer_level = lrc->optimal_buffer_level;
}
}
}
}

View File

@@ -49,7 +49,7 @@ typedef struct {
uint8_t speed; uint8_t speed;
} LAYER_CONTEXT; } LAYER_CONTEXT;
typedef struct SVC { typedef struct {
int spatial_layer_id; int spatial_layer_id;
int temporal_layer_id; int temporal_layer_id;
int number_spatial_layers; int number_spatial_layers;
@@ -99,12 +99,6 @@ typedef struct SVC {
BLOCK_SIZE *prev_partition_svc; BLOCK_SIZE *prev_partition_svc;
int mi_stride[VPX_MAX_LAYERS]; int mi_stride[VPX_MAX_LAYERS];
int first_layer_denoise;
int skip_enhancement_layer;
int lower_layer_qindex;
} SVC; } SVC;
struct VP9_COMP; struct VP9_COMP;
@@ -134,10 +128,6 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi);
// Initialize second pass rc for spatial svc. // Initialize second pass rc for spatial svc.
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
void get_layer_resolution(const int width_org, const int height_org,
const int num, const int den, int *width_out,
int *height_out);
// Increment number of video frames in layer // Increment number of video frames in layer
void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
@@ -158,8 +148,6 @@ void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
#endif #endif

View File

@@ -170,13 +170,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
fadst4_sse2(in); fadst4_sse2(in);
write_buffer_4x4(output, in); write_buffer_4x4(output, in);
break; break;
default: case ADST_ADST:
assert(tx_type == ADST_ADST);
load_buffer_4x4(input, in, stride); load_buffer_4x4(input, in, stride);
fadst4_sse2(in); fadst4_sse2(in);
fadst4_sse2(in); fadst4_sse2(in);
write_buffer_4x4(output, in); write_buffer_4x4(output, in);
break; break;
default: assert(0); break;
} }
} }
@@ -1097,14 +1097,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
right_shift_8x8(in, 1); right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8); write_buffer_8x8(output, in, 8);
break; break;
default: case ADST_ADST:
assert(tx_type == ADST_ADST);
load_buffer_8x8(input, in, stride); load_buffer_8x8(input, in, stride);
fadst8_sse2(in); fadst8_sse2(in);
fadst8_sse2(in); fadst8_sse2(in);
right_shift_8x8(in, 1); right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8); write_buffer_8x8(output, in, 8);
break; break;
default: assert(0); break;
} }
} }
@@ -1963,13 +1963,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
fadst16_sse2(in0, in1); fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16); write_buffer_16x16(output, in0, in1, 16);
break; break;
default: case ADST_ADST:
assert(tx_type == ADST_ADST);
load_buffer_16x16(input, in0, in1, stride); load_buffer_16x16(input, in0, in1, stride);
fadst16_sse2(in0, in1); fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1); right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1); fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16); write_buffer_16x16(output, in0, in1, 16);
break; break;
default: assert(0); break;
} }
} }

View File

@@ -1,7 +1,7 @@
/* /*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
* *
* Use of this source code is governed by a BSD-style license * Usee of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source * that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found * tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may * in the file PATENTS. All contributing project authors may

View File

@@ -1,140 +0,0 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <immintrin.h> // AVX2
#include "./vp9_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_dsp/x86/quantize_x86.h"
// Zero fill 8 positions in the output buffer.
static INLINE void store_zero_tran_low(tran_low_t *a) {
const __m256i zero = _mm256_setzero_si256();
#if CONFIG_VP9_HIGHBITDEPTH
_mm256_storeu_si256((__m256i *)(a), zero);
_mm256_storeu_si256((__m256i *)(a + 8), zero);
#else
_mm256_storeu_si256((__m256i *)(a), zero);
#endif
}
static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr,
__m256i *coeff256) {
const __m256i iscan = _mm256_loadu_si256(iscan_ptr);
const __m256i zero256 = _mm256_setzero_si256();
#if CONFIG_VP9_HIGHBITDEPTH
// The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as
// B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly.
const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8);
const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256);
#else
const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256);
#endif
const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256);
// Add one to convert from indices to counts
const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0);
return _mm256_and_si256(iscan_plus_one, nzero_coeff0);
}
void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan_ptr,
const int16_t *iscan_ptr) {
__m128i eob;
__m256i round256, quant256, dequant256;
__m256i eob256, thr256;
(void)scan_ptr;
(void)skip_block;
assert(!skip_block);
coeff_ptr += n_coeffs;
iscan_ptr += n_coeffs;
qcoeff_ptr += n_coeffs;
dqcoeff_ptr += n_coeffs;
n_coeffs = -n_coeffs;
{
__m256i coeff256;
// Setup global values
{
const __m128i round = _mm_load_si128((const __m128i *)round_ptr);
const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
round256 = _mm256_castsi128_si256(round);
round256 = _mm256_permute4x64_epi64(round256, 0x54);
quant256 = _mm256_castsi128_si256(quant);
quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
dequant256 = _mm256_castsi128_si256(dequant);
dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
}
{
__m256i qcoeff256;
__m256i qtmp256;
coeff256 = load_tran_low(coeff_ptr + n_coeffs);
qcoeff256 = _mm256_abs_epi16(coeff256);
qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
}
eob256 = scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256);
n_coeffs += 8 * 2;
}
// remove dc constants
dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
thr256 = _mm256_srai_epi16(dequant256, 1);
// AC only loop
while (n_coeffs < 0) {
__m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs);
__m256i qcoeff256 = _mm256_abs_epi16(coeff256);
int32_t nzflag =
_mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
if (nzflag) {
__m256i qtmp256;
qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
eob256 = _mm256_max_epi16(
eob256,
scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256));
} else {
store_zero_tran_low(qcoeff_ptr + n_coeffs);
store_zero_tran_low(dqcoeff_ptr + n_coeffs);
}
n_coeffs += 8 * 2;
}
eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
_mm256_extracti128_si256(eob256, 1));
*eob_ptr = accumulate_eob(eob);
}

View File

@@ -169,7 +169,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \
pshuflw m7, m8, 0x1 pshuflw m7, m8, 0x1
pmaxsw m8, m7 pmaxsw m8, m7
pextrw r6, m8, 0 pextrw r6, m8, 0
mov [r2], r6w mov [r2], r6
RET RET
%endmacro %endmacro

View File

@@ -63,17 +63,7 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h
ifeq ($(CONFIG_VP9_POSTPROC),yes) ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
endif endif
@@ -81,11 +71,22 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
else endif
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c # common (msa)
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
endif
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
endif endif
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))

View File

@@ -1067,7 +1067,8 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
vpx_codec_frame_flags_t flags = lib_flags << 16; vpx_codec_frame_flags_t flags = lib_flags << 16;
if (lib_flags & FRAMEFLAGS_KEY || if (lib_flags & FRAMEFLAGS_KEY ||
(cpi->use_svc && cpi->svc (cpi->use_svc &&
cpi->svc
.layer_context[cpi->svc.spatial_layer_id * .layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers + cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id] cpi->svc.temporal_layer_id]
@@ -1212,7 +1213,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
-1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data,
&dst_time_stamp, &dst_end_time_stamp, &dst_time_stamp, &dst_end_time_stamp,
!img)) { !img)) {
if (size || (cpi->use_svc && cpi->svc.skip_enhancement_layer)) { if (size) {
vpx_codec_cx_pkt_t pkt; vpx_codec_cx_pkt_t pkt;
#if CONFIG_SPATIAL_SVC #if CONFIG_SPATIAL_SVC
@@ -1233,8 +1234,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
ctx->pending_frame_magnitude |= size; ctx->pending_frame_magnitude |= size;
cx_data += size; cx_data += size;
cx_data_sz -= size; cx_data_sz -= size;
pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
if (ctx->output_cx_pkt_cb.output_cx_pkt) { if (ctx->output_cx_pkt_cb.output_cx_pkt) {
pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.kind = VPX_CODEC_CX_FRAME_PKT;
@@ -1261,11 +1260,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
timebase, dst_end_time_stamp - dst_time_stamp); timebase, dst_end_time_stamp - dst_time_stamp);
pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
if (ctx->pending_cx_data) { if (ctx->pending_cx_data) {
if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
ctx->pending_frame_magnitude |= size; ctx->pending_frame_magnitude |= size;
ctx->pending_cx_data_sz += size; ctx->pending_cx_data_sz += size;
// write the superframe only for the case when // write the superframe only for the case when
@@ -1415,22 +1412,12 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
va_list args) { va_list args) {
vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); (void)ctx;
(void)args;
if (data) { // TODO(yaowu): Need to re-implement and test for VP9.
vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
roi->delta_q, roi->delta_lf, roi->skip,
roi->ref_frame)) {
return VPX_CODEC_OK;
} else {
return VPX_CODEC_INVALID_PARAM; return VPX_CODEC_INVALID_PARAM;
} }
} else {
return VPX_CODEC_INVALID_PARAM;
}
}
static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx,
va_list args) { va_list args) {
@@ -1619,7 +1606,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
// Setters // Setters
{ VP8_SET_REFERENCE, ctrl_set_reference }, { VP8_SET_REFERENCE, ctrl_set_reference },
{ VP8_SET_POSTPROC, ctrl_set_previewpp }, { VP8_SET_POSTPROC, ctrl_set_previewpp },
{ VP9E_SET_ROI_MAP, ctrl_set_roi_map }, { VP8E_SET_ROI_MAP, ctrl_set_roi_map },
{ VP8E_SET_ACTIVEMAP, ctrl_set_active_map }, { VP8E_SET_ACTIVEMAP, ctrl_set_active_map },
{ VP8E_SET_SCALEMODE, ctrl_set_scale_mode }, { VP8E_SET_SCALEMODE, ctrl_set_scale_mode },
{ VP8E_SET_CPUUSED, ctrl_set_cpuused }, { VP8E_SET_CPUUSED, ctrl_set_cpuused },

View File

@@ -103,7 +103,6 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c

View File

@@ -12,11 +12,8 @@
* \brief Provides the high level interface to wrap encoder algorithms. * \brief Provides the high level interface to wrap encoder algorithms.
* *
*/ */
#include <assert.h>
#include <limits.h> #include <limits.h>
#include <stdlib.h>
#include <string.h> #include <string.h>
#include "vp8/common/blockd.h"
#include "vpx_config.h" #include "vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h" #include "vpx/internal/vpx_codec_internal.h"
@@ -84,8 +81,6 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
int i; int i;
void *mem_loc = NULL; void *mem_loc = NULL;
if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE;
if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) { if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) {
for (i = 0; i < num_enc; i++) { for (i = 0; i < num_enc; i++) {
vpx_codec_priv_enc_mr_cfg_t mr_cfg; vpx_codec_priv_enc_mr_cfg_t mr_cfg;
@@ -94,7 +89,9 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
dsf->den > dsf->num) { dsf->den > dsf->num) {
res = VPX_CODEC_INVALID_PARAM; res = VPX_CODEC_INVALID_PARAM;
} else { break;
}
mr_cfg.mr_low_res_mode_info = mem_loc; mr_cfg.mr_low_res_mode_info = mem_loc;
mr_cfg.mr_total_resolutions = num_enc; mr_cfg.mr_total_resolutions = num_enc;
mr_cfg.mr_encoder_id = num_enc - 1 - i; mr_cfg.mr_encoder_id = num_enc - 1 - i;
@@ -113,7 +110,6 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
ctx->init_flags = flags; ctx->init_flags = flags;
ctx->config.enc = cfg; ctx->config.enc = cfg;
res = ctx->iface->init(ctx, &mr_cfg); res = ctx->iface->init(ctx, &mr_cfg);
}
if (res) { if (res) {
const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL;
@@ -128,14 +124,10 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
vpx_codec_destroy(ctx); vpx_codec_destroy(ctx);
i--; i--;
} }
#if CONFIG_MULTI_RES_ENCODING
assert(mem_loc);
free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info);
free(mem_loc);
#endif
return SAVE_STATUS(ctx, res);
} }
if (res) break;
ctx++; ctx++;
cfg++; cfg++;
dsf++; dsf++;

View File

@@ -125,7 +125,7 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
enum vp8e_enc_control_id { enum vp8e_enc_control_id {
/*!\brief Codec control function to pass an ROI map to encoder. /*!\brief Codec control function to pass an ROI map to encoder.
* *
* Supported in codecs: VP8 * Supported in codecs: VP8, VP9
*/ */
VP8E_SET_ROI_MAP = 8, VP8E_SET_ROI_MAP = 8,
@@ -408,7 +408,7 @@ enum vp8e_enc_control_id {
/*!\brief Codec control function to set noise sensitivity. /*!\brief Codec control function to set noise sensitivity.
* *
* 0: off, 1: On(YOnly), 2: For SVC only, on top two spatial layers(YOnly) * 0: off, 1: On(YOnly)
* *
* Supported in codecs: VP9 * Supported in codecs: VP9
*/ */
@@ -423,12 +423,6 @@ enum vp8e_enc_control_id {
*/ */
VP9E_SET_SVC, VP9E_SET_SVC,
/*!\brief Codec control function to pass an ROI map to encoder.
*
* Supported in codecs: VP9
*/
VP9E_SET_ROI_MAP,
/*!\brief Codec control function to set parameters for SVC. /*!\brief Codec control function to set parameters for SVC.
* \note Parameters contain min_q, max_q, scaling factor for each of the * \note Parameters contain min_q, max_q, scaling factor for each of the
* SVC layers. * SVC layers.
@@ -649,20 +643,16 @@ typedef enum vp9e_temporal_layering_mode {
*/ */
typedef struct vpx_roi_map { typedef struct vpx_roi_map {
/*! If ROI is enabled. */ /*! An id between 0 and 3 for each 16x16 region within a frame. */
uint8_t enabled;
/*! An id between 0-3 (0-7 for vp9) for each 16x16 (8x8 for VP9)
* region within a frame. */
unsigned char *roi_map; unsigned char *roi_map;
unsigned int rows; /**< Number of rows. */ unsigned int rows; /**< Number of rows. */
unsigned int cols; /**< Number of columns. */ unsigned int cols; /**< Number of columns. */
/*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ // TODO(paulwilkins): broken for VP9 which has 8 segments
int delta_q[8]; /**< Quantizer deltas. */ // q and loop filter deltas for each segment
int delta_lf[8]; /**< Loop filter deltas. */ // (see MAX_MB_SEGMENTS)
/*! skip and ref frame segment is only used in VP9. */ int delta_q[4]; /**< Quantizer deltas. */
int skip[8]; /**< Skip this block. */ int delta_lf[4]; /**< Loop filter deltas. */
int ref_frame[8]; /**< Reference frame for this block. */ /*! Static breakout threshold for each segment. */
/*! Static breakout threshold for each segment. Only used in VP8. */
unsigned int static_threshold[4]; unsigned int static_threshold[4];
} vpx_roi_map_t; } vpx_roi_map_t;
@@ -759,8 +749,6 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID #define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
#define VPX_CTRL_VP8E_SET_ROI_MAP #define VPX_CTRL_VP8E_SET_ROI_MAP
VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *)
#define VPX_CTRL_VP9E_SET_ROI_MAP
VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
#define VPX_CTRL_VP8E_SET_ACTIVEMAP #define VPX_CTRL_VP8E_SET_ACTIVEMAP
VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)

Some files were not shown because too many files have changed in this diff Show More