Compare commits


1 Commit

Author  SHA1        Message       Date
wangch  eea111f16a  Test gerrit.  2017-12-05 18:07:21 -05:00

150 changed files with 3854 additions and 5687 deletions


@@ -1,12 +1,12 @@
---
Language: Cpp
# BasedOnStyle: Google
# Generated with clang-format 5.0.0
# Generated with clang-format 4.0.1
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
@@ -33,20 +33,14 @@ BraceWrapping:
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
@@ -54,11 +48,7 @@ Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
- Regex: '^<.*\.h>'
Priority: 1
@@ -80,7 +70,6 @@ NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
@@ -90,7 +79,6 @@ PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
ReflowComments: true
SortIncludes: false
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true


@@ -3,7 +3,6 @@ Aex Converse <aconverse@google.com>
Aex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Chris Cunningham <chcunningham@chromium.org>
Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
@@ -22,21 +21,18 @@ Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Peter Boström <pbos@chromium.org> <pbos@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
Sami Pietilä <samipietila@google.com>
Shiyou Yin <yinshiyou-hf@loongson.cn>
Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Urvang Joshi <urvang@google.com> <urvang@chromium.org>
Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <Yaowu Xu>

AUTHORS

@@ -3,13 +3,13 @@
Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Aex Converse <aconverse@google.com>
Ahmad Sharif <asharif@google.com>
Aleksey Vasenev <margtu-fivt@ya.ru>
Alexander Potapenko <glider@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
Alexandra Hájková <alexandra.khirnova@gmail.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
@@ -17,7 +17,6 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Andrew Lewis <andrewlewis@google.com>
Andrew Russell <anrussell@google.com>
Angie Chiang <angiebird@google.com>
Aron Rosenberg <arosenberg@logitech.com>
@@ -25,9 +24,7 @@ Attila Nagy <attilanagy@google.com>
Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
Cheng Chen <chengchen@google.com>
chm <chm@rock-chips.com>
Chris Cunningham <chcunningham@chromium.org>
Christian Duvivier <cduvivier@google.com>
Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com>
@@ -49,12 +46,10 @@ Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Gregor Jasny <gjasny@gmail.com>
Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com>
Hanno Böck <hanno@hboeck.de>
Han Shen <shenhan@google.com>
Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Krasin <krasin@chromium.org>
@@ -88,7 +83,6 @@ Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
Kaustubh Raste <kaustubh.raste@imgtec.com>
KO Myung-Hun <komh@chollian.net>
Kyle Siefring <kylesiefring@gmail.com>
Lawrence Velázquez <larryv@macports.org>
Linfeng Zhang <linfengz@google.com>
Lou Quillio <louquillio@google.com>
@@ -107,7 +101,6 @@ Mikhal Shemer <mikhal@google.com>
Min Chen <chenm003@gmail.com>
Minghai Shang <minghai@google.com>
Min Ye <yeemmi@google.com>
Moriyoshi Koizumi <mozo@mozo.jp>
Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@mozilla.com>
Nico Weber <thakis@chromium.org>
@@ -118,15 +111,12 @@ Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
Pengchong Jin <pengchong@google.com>
Peter Boström <pbos@chromium.org>
Peter Collingbourne <pcc@chromium.org>
Peter Boström <pbos@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org>
Rafael de Lucena Valle <rafaeldelucena@gmail.com>
Rahul Chaudhry <rahulchaudhry@google.com>
Ralph Giles <giles@xiph.org>
Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
Rob Bradford <rob@linux.intel.com>
@@ -145,7 +135,6 @@ Shiyou Yin <yinshiyou-hf@loongson.cn>
Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com>
Sylvestre Ledru <sylvestre@mozilla.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
@@ -158,7 +147,6 @@ Tom Finegan <tomfinegan@google.com>
Tristan Matthews <le.businessman@gmail.com>
Urvang Joshi <urvang@google.com>
Vignesh Venkatasubramanian <vigneshv@google.com>
Vlad Tsyrklevich <vtsyrklevich@chromium.org>
Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com>


@@ -1,28 +1,3 @@
2017-01-04 v1.7.0 "Mandarin Duck"
This release focused on high bit depth performance (10/12 bit) and vp9
encoding improvements.
- Upgrading:
This release is ABI incompatible due to new vp9 encoder features.
Frame parallel decoding for vp9 has been removed.
- Enhancements:
vp9 encoding supports additional threads with --row-mt. This can be greater
than the number of tiles.
Two new vp9 encoder options have been added:
--corpus-complexity
--tune-content=film
Additional tooling for respecting the vp9 "level" profiles has been added.
- Bug fixes:
A variety of fuzzing issues.
vp8 threading fix for ARM.
Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
Reject invalid multi resolution configurations.
2017-01-09 v1.6.1 "Long Tailed Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.

README

@@ -1,4 +1,4 @@
README - 24 January 2018
README - 26 January 2017
Welcome to the WebM VP8/VP9 Codec SDK!
@@ -63,8 +63,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv8-linux-gcc
mips32-linux-gcc
mips64-linux-gcc
ppc64-linux-gcc
ppc64le-linux-gcc
sparc-solaris-gcc
x86-android-gcc
x86-darwin8-gcc


@@ -1,13 +1,4 @@
#!/usr/bin/env perl
##
## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
no strict 'refs';
use warnings;
@@ -209,7 +200,6 @@ sub filter {
sub common_top() {
my $include_guard = uc($opts{sym})."_H_";
print <<EOF;
// This file is generated. Do not edit.
#ifndef ${include_guard}
#define ${include_guard}


@@ -60,7 +60,6 @@ if [ ${bare} ]; then
echo "${changelog_version}${git_version_id}" > $$.tmp
else
cat<<EOF>$$.tmp
// This file is generated. Do not edit.
#define VERSION_MAJOR $major_version
#define VERSION_MINOR $minor_version
#define VERSION_PATCH $patch_version

configure

@@ -665,7 +665,7 @@ process_toolchain() {
gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
all_targets="${all_targets} solution"
INLINE="__inline"
INLINE="__forceinline"
;;
esac


@@ -429,9 +429,8 @@ static void set_rate_control_stats(struct RateControlStats *rc,
rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl];
if (tl > 0) {
rc->layer_pfb[layer] =
1000.0 *
(cfg->layer_target_bitrate[layer] -
cfg->layer_target_bitrate[layer - 1]) /
1000.0 * (cfg->layer_target_bitrate[layer] -
cfg->layer_target_bitrate[layer - 1]) /
(rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
} else {
rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
@@ -574,8 +573,8 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
} else {
if (is_key_frame) {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF;
VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
} else {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -589,24 +588,14 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
} else {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
if (sl == num_spatial_layers - 1)
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
}
}
if (tl == 0) {
ref_frame_config->lst_fb_idx[sl] = sl;
if (sl) {
if (is_key_frame) {
ref_frame_config->lst_fb_idx[sl] = sl - 1;
ref_frame_config->gld_fb_idx[sl] = sl;
} else {
ref_frame_config->gld_fb_idx[sl] = sl - 1;
}
} else {
if (sl)
ref_frame_config->gld_fb_idx[sl] = sl - 1;
else
ref_frame_config->gld_fb_idx[sl] = 0;
}
ref_frame_config->alt_fb_idx[sl] = 0;
} else if (tl == 1) {
ref_frame_config->lst_fb_idx[sl] = sl;
@@ -749,8 +738,6 @@ int main(int argc, const char **argv) {
// the encode for the whole superframe. The encoder will internally loop
// over all the spatial layers for the current superframe.
vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
// TODO(jianj): Fix the parameter passing for "is_key_frame" in
// set_frame_flags_bypass_model() for case of periodic key frames.
set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id,
svc_ctx.spatial_layers, frame_cnt == 0,
&ref_frame_config);


@@ -26,29 +26,19 @@
#include "../tools_common.h"
#include "../video_writer.h"
#define ROI_MAP 0
#define zero(Dest) memset(&Dest, 0, sizeof(Dest));
#define VP8_ROI_MAP 0
static const char *exec_name;
void usage_exit(void) { exit(EXIT_FAILURE); }
// Denoiser states for vp8, for temporal denoising.
enum denoiserStateVp8 {
kVp8DenoiserOff,
kVp8DenoiserOnYOnly,
kVp8DenoiserOnYUV,
kVp8DenoiserOnYUVAggressive,
kVp8DenoiserOnAdaptive
};
// Denoiser states for vp9, for temporal denoising.
enum denoiserStateVp9 {
kVp9DenoiserOff,
kVp9DenoiserOnYOnly,
// For SVC: denoise the top two spatial layers.
kVp9DenoiserOnYTwoSpatialLayers
// Denoiser states, for temporal denoising.
enum denoiserState {
kDenoiserOff,
kDenoiserOnYOnly,
kDenoiserOnYUV,
kDenoiserOnYUVAggressive,
kDenoiserOnAdaptive
};
static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 };
@@ -101,10 +91,9 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc,
for (i = 0; i < cfg->ts_number_layers; ++i) {
if (i > 0) {
rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
rc->layer_pfb[i] =
1000.0 *
(rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
(rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] -
rc->layer_target_bitrate[i - 1]) /
(rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
}
rc->layer_input_frames[i] = 0;
rc->layer_enc_frames[i] = 0;
@@ -167,60 +156,38 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
die("Error: Number of input frames not equal to output! \n");
}
#if ROI_MAP
static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg,
vpx_roi_map_t *roi) {
#if VP8_ROI_MAP
static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) {
unsigned int i, j;
int block_size = 0;
uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0;
uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0;
if (!is_vp8 && !is_vp9) {
die("unsupported codec.");
}
zero(*roi);
block_size = is_vp9 && !is_vp8 ? 8 : 16;
memset(roi, 0, sizeof(*roi));
// ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for
// segment is 16x16 for vp8, 8x8 for vp9.
roi->rows = (cfg->g_h + block_size - 1) / block_size;
roi->cols = (cfg->g_w + block_size - 1) / block_size;
roi->rows = (cfg->g_h + 15) / 16;
roi->cols = (cfg->g_w + 15) / 16;
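// Worked example (illustrative note, not part of the patch): for a
// 352x288 CIF input, the vp8 16x16 grid gives (288 + 15) / 16 = 18 rows
// and (352 + 15) / 16 = 22 cols; the vp9 8x8 grid would give 36 rows
// and 44 cols.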
// Applies delta QP on the segment blocks, varies from -63 to 63.
// Setting to negative means lower QP (better quality).
// Below we set delta_q to the extreme (-63) to show strong effect.
// VP8 uses the first 4 segments. VP9 uses all 8 segments.
zero(roi->delta_q);
roi->delta_q[0] = 0;
roi->delta_q[1] = -63;
roi->delta_q[2] = 0;
roi->delta_q[3] = 0;
// Applies delta loopfilter strength on the segment blocks, varies from -63 to
// 63. Setting to positive means stronger loopfilter. VP8 uses the first 4
// segments. VP9 uses all 8 segments.
zero(roi->delta_lf);
// 63. Setting to positive means stronger loopfilter.
roi->delta_lf[0] = 0;
roi->delta_lf[1] = 0;
roi->delta_lf[2] = 0;
roi->delta_lf[3] = 0;
if (is_vp8) {
// Applies skip encoding threshold on the segment blocks, varies from 0 to
// UINT_MAX. Larger value means more skipping of encoding is possible.
// This skip threshold only applies on delta frames.
zero(roi->static_threshold);
}
if (is_vp9) {
// Apply skip segment. Setting to 1 means this block will be copied from
// previous frame.
zero(roi->skip);
}
if (is_vp9) {
// Apply ref frame segment.
// -1 : Do not apply this segment.
// 0 : Force using intra.
// 1 : Force using last.
// 2 : Force using golden.
// 3 : Force using altref but not used in non-rd pickmode for 0 lag.
memset(roi->ref_frame, -1, sizeof(roi->ref_frame));
roi->ref_frame[1] = 1;
}
// Applies skip encoding threshold on the segment blocks, varies from 0 to
// UINT_MAX. Larger value means more skipping of encoding is possible.
// This skip threshold only applies on delta frames.
roi->static_threshold[0] = 0;
roi->static_threshold[1] = 0;
roi->static_threshold[2] = 0;
roi->static_threshold[3] = 0;
// Use 2 states: 1 is center square, 0 is the rest.
roi->roi_map =
@@ -588,7 +555,7 @@ int main(int argc, char **argv) {
int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 };
int flag_periodicity = 1;
#if ROI_MAP
#if VP8_ROI_MAP
vpx_roi_map_t roi;
#endif
vpx_svc_layer_id_t layer_id = { 0, 0 };
@@ -788,11 +755,11 @@ int main(int argc, char **argv) {
if (strncmp(encoder->name, "vp8", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff);
vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0);
#if ROI_MAP
set_roi_map(encoder->name, &cfg, &roi);
#if VP8_ROI_MAP
vp8_set_roi_map(&cfg, &roi);
if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
die_codec(&codec, "Failed to set ROI map");
#endif
@@ -805,16 +772,10 @@ int main(int argc, char **argv) {
vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0);
vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff);
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
#if ROI_MAP
set_roi_map(encoder->name, &cfg, &roi);
if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
die_codec(&codec, "Failed to set ROI map");
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0);
#endif
// TODO(marpan/jianj): There is an issue with row-mt for low resolutions at
// high speed settings, disable its use for those cases for now.
if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7))
@@ -942,8 +903,5 @@ int main(int argc, char **argv) {
for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]);
vpx_img_free(&raw);
#if ROI_MAP
free(roi.roi_map);
#endif
return EXIT_SUCCESS;
}


@@ -943,6 +943,18 @@ GENERATE_XML = NO
XML_OUTPUT = xml
# The XML_SCHEMA tag can be used to specify an XML schema,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
XML_SCHEMA =
# The XML_DTD tag can be used to specify an XML DTD,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
XML_DTD =
# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
# dump the program listings (including syntax highlighting
# and cross-referencing information) to the XML output. Note that


@@ -233,8 +233,8 @@ OBJS-yes += $(LIBVPX_OBJS)
LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
SO_VERSION_MAJOR := 5
SO_VERSION_MINOR := 0
SO_VERSION_MAJOR := 4
SO_VERSION_MINOR := 1
SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib


@@ -215,7 +215,7 @@ using std::tr1::make_tuple;
#if CONFIG_VP9_ENCODER
const BlockinessParam c_vp9_tests[] = {
make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238)
make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
};
INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests));
#endif


@@ -205,7 +205,7 @@ using std::tr1::make_tuple;
#if CONFIG_VP9_ENCODER
const ConsistencyParam c_vp9_tests[] = {
make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238)
make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
};
INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
::testing::ValuesIn(c_vp9_tests));


@@ -539,7 +539,6 @@ class DatarateTestVP9Large
denoiser_offon_test_ = 0;
denoiser_offon_period_ = -1;
frame_parallel_decoding_mode_ = 1;
use_roi_ = 0;
}
//
@@ -622,10 +621,6 @@ class DatarateTestVP9Large
encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
frame_parallel_decoding_mode_);
if (use_roi_) {
encoder->Control(VP9E_SET_ROI_MAP, &roi_);
}
if (cfg_.ts_number_layers > 1) {
if (video->frame() == 0) {
encoder->Control(VP9E_SET_SVC, 1);
@@ -706,8 +701,6 @@ class DatarateTestVP9Large
int denoiser_offon_test_;
int denoiser_offon_period_;
int frame_parallel_decoding_mode_;
bool use_roi_;
vpx_roi_map_t roi_;
};
// Check basic rate targeting for VBR mode with 0 lag.
@@ -1080,68 +1073,6 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
}
}
class DatarateTestVP9RealTime : public DatarateTestVP9Large {
public:
virtual ~DatarateTestVP9RealTime() {}
};
// Check VP9 region of interest feature.
TEST_P(DatarateTestVP9RealTime, RegionOfInterest) {
if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return;
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_dropframe_thresh = 0;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 300);
cfg_.rc_target_bitrate = 450;
cfg_.g_w = 352;
cfg_.g_h = 288;
ResetModel();
// Set ROI parameters
use_roi_ = true;
memset(&roi_, 0, sizeof(roi_));
roi_.rows = (cfg_.g_h + 7) / 8;
roi_.cols = (cfg_.g_w + 7) / 8;
roi_.delta_q[1] = -20;
roi_.delta_lf[1] = -20;
memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
roi_.ref_frame[1] = 1;
// Use 2 states: 1 is center square, 0 is the rest.
roi_.roi_map = reinterpret_cast<uint8_t *>(
calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)));
ASSERT_TRUE(roi_.roi_map != NULL);
for (unsigned int i = 0; i < roi_.rows; ++i) {
for (unsigned int j = 0; j < roi_.cols; ++j) {
if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
roi_.roi_map[i * roi_.cols + j] = 1;
}
}
}
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90)
<< " The datarate for the file exceeds the target!";
ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4)
<< " The datarate for the file missed the target!";
free(roi_.roi_map);
}
#if CONFIG_VP9_TEMPORAL_DENOISING
class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large {
public:
@@ -1285,78 +1216,18 @@ class DatarateOnePassCbrSvc
}
virtual void ResetModel() {
last_pts_ = 0;
bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
frame_number_ = 0;
first_drop_ = 0;
bits_total_ = 0;
duration_ = 0.0;
mismatch_psnr_ = 0.0;
mismatch_nframes_ = 0;
denoiser_on_ = 0;
tune_content_ = 0;
base_speed_setting_ = 5;
spatial_layer_id_ = 0;
temporal_layer_id_ = 0;
update_pattern_ = 0;
memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_));
memset(bits_total_, 0, sizeof(bits_total_));
memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_));
dynamic_drop_layer_ = false;
}
virtual void BeginPassHook(unsigned int /*pass*/) {}
// Example pattern for spatial layers and 2 temporal layers used in the
// bypass/flexible mode. The pattern corresponds to the pattern
// VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
// non-flexible mode, except that we disable inter-layer prediction.
void set_frame_flags_bypass_mode(
int tl, int num_spatial_layers, int is_key_frame,
vpx_svc_ref_frame_config_t *ref_frame_config) {
for (int sl = 0; sl < num_spatial_layers; ++sl) {
if (!tl) {
if (!sl) {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
} else {
if (is_key_frame) {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF;
} else {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
}
}
} else if (tl == 1) {
if (!sl) {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
} else {
ref_frame_config->frame_flags[sl] =
VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_REF_GF;
}
}
if (tl == 0) {
ref_frame_config->lst_fb_idx[sl] = sl;
if (sl) {
if (is_key_frame) {
ref_frame_config->lst_fb_idx[sl] = sl - 1;
ref_frame_config->gld_fb_idx[sl] = sl;
} else {
ref_frame_config->gld_fb_idx[sl] = sl - 1;
}
} else {
ref_frame_config->gld_fb_idx[sl] = 0;
}
ref_frame_config->alt_fb_idx[sl] = 0;
} else if (tl == 1) {
ref_frame_config->lst_fb_idx[sl] = sl;
ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
}
}
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (video->frame() == 0) {
@@ -1381,137 +1252,36 @@ class DatarateOnePassCbrSvc
encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1);
encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
}
if (update_pattern_ && video->frame() >= 100) {
vpx_svc_layer_id_t layer_id;
if (video->frame() == 100) {
cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
encoder->Config(&cfg_);
}
// Set layer id since the pattern changed.
layer_id.spatial_layer_id = 0;
layer_id.temporal_layer_id = (video->frame() % 2 != 0);
encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
number_spatial_layers_, 0, &ref_frame_config);
encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
}
if (dynamic_drop_layer_) {
if (video->frame() == 100) {
// Change layer bitrates to set top layer to 0. This will trigger skip
// encoding/dropping of top spatial layer.
cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[2];
cfg_.layer_target_bitrate[2] = 0;
encoder->Config(&cfg_);
} else if (video->frame() == 300) {
// Change layer bitrate on top layer to non-zero to start encoding it
// again.
cfg_.layer_target_bitrate[2] = 500;
cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[2];
encoder->Config(&cfg_);
}
}
const vpx_rational_t tb = video->timebase();
timebase_ = static_cast<double>(tb.num) / tb.den;
duration_ = 0;
}
virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
vpx_svc_layer_id_t layer_id;
encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
spatial_layer_id_ = layer_id.spatial_layer_id;
temporal_layer_id_ = layer_id.temporal_layer_id;
// Update buffer with per-layer target frame bandwidth, this is done
// for every frame passed to the encoder (encoded or dropped).
// For temporal layers, update the cumulative buffer level.
for (int sl = 0; sl < number_spatial_layers_; ++sl) {
for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
const int layer = sl * number_temporal_layers_ + tl;
bits_in_buffer_model_[layer] +=
static_cast<int64_t>(layer_target_avg_bandwidth_[layer]);
}
}
}
vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
uint32_t sizes[8], int *count) {
uint8_t marker;
marker = *(data + data_sz - 1);
*count = 0;
if ((marker & 0xe0) == 0xc0) {
const uint32_t frames = (marker & 0x7) + 1;
const uint32_t mag = ((marker >> 3) & 0x3) + 1;
const size_t index_sz = 2 + mag * frames;
// This chunk is marked as having a superframe index but doesn't have
// enough data for it, thus it's an invalid superframe index.
if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
{
const uint8_t marker2 = *(data + data_sz - index_sz);
// This chunk is marked as having a superframe index but doesn't have
// the matching marker byte at the front of the index therefore it's an
// invalid chunk.
if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
}
{
uint32_t i, j;
const uint8_t *x = &data[data_sz - index_sz + 1];
for (i = 0; i < frames; ++i) {
uint32_t this_sz = 0;
for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
sizes[i] = this_sz;
}
*count = frames;
}
}
return VPX_CODEC_OK;
}
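// Worked example (illustrative note, not part of the patch): a trailing
// marker byte of 0xca (0b11001010) passes the (marker & 0xe0) == 0xc0
// check and encodes frames = (0xca & 0x7) + 1 = 3 and
// mag = ((0xca >> 3) & 0x3) + 1 = 2, so index_sz = 2 + 2 * 3 = 8 bytes:
// the marker, three 2-byte little-endian frame sizes, then the marker
// repeated at the end of the chunk.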
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
uint32_t sizes[8] = { 0 };
int count = 0;
last_pts_ = pkt->data.frame.pts;
vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
if (last_pts_ == 0) duration = 1;
bits_in_buffer_model_ += static_cast<int64_t>(
duration * timebase_ * cfg_.rc_target_bitrate * 1000);
const bool key_frame =
(pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf),
pkt->data.frame.sz, sizes, &count);
if (!dynamic_drop_layer_) ASSERT_EQ(count, number_spatial_layers_);
for (int sl = 0; sl < number_spatial_layers_; ++sl) {
sizes[sl] = sizes[sl] << 3;
// Update the total encoded bits per layer.
// For temporal layers, update the cumulative encoded bits per layer.
for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
const int layer = sl * number_temporal_layers_ + tl;
bits_total_[layer] += static_cast<int64_t>(sizes[sl]);
// Update the per-layer buffer level with the encoded frame size.
bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]);
// There should be no buffer underrun, except on the base
// temporal layer, since there may be key frames there.
if (!key_frame && tl > 0) {
ASSERT_GE(bits_in_buffer_model_[layer], 0)
<< "Buffer Underrun at frame " << pkt->data.frame.pts;
}
}
ASSERT_EQ(pkt->data.frame.width[sl],
top_sl_width_ * svc_params_.scaling_factor_num[sl] /
svc_params_.scaling_factor_den[sl]);
ASSERT_EQ(pkt->data.frame.height[sl],
top_sl_height_ * svc_params_.scaling_factor_num[sl] /
svc_params_.scaling_factor_den[sl]);
if (!key_frame) {
// TODO(marpan): This check currently fails for some of the SVC tests,
// re-enable when issue (webm:1350) is resolved.
// ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
// << pkt->data.frame.pts;
}
const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
bits_in_buffer_model_ -= static_cast<int64_t>(frame_size_in_bits);
bits_total_ += frame_size_in_bits;
if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
last_pts_ = pkt->data.frame.pts;
bits_in_last_frame_ = frame_size_in_bits;
++frame_number_;
}
virtual void EndPassHook(void) {
for (int sl = 0; sl < number_spatial_layers_; ++sl) {
for (int tl = 0; tl < number_temporal_layers_; ++tl) {
const int layer = sl * number_temporal_layers_ + tl;
const double file_size_in_kb = bits_total_[layer] / 1000.;
duration_ = (last_pts_ + 1) * timebase_;
file_datarate_[layer] = file_size_in_kb / duration_;
}
if (bits_total_) {
const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit
duration_ = (last_pts_ + 1) * timebase_;
file_datarate_ = file_size_in_kb / duration_;
}
}
@@ -1524,11 +1294,13 @@ class DatarateOnePassCbrSvc
unsigned int GetMismatchFrames() { return mismatch_nframes_; }
vpx_codec_pts_t last_pts_;
int64_t bits_in_buffer_model_[VPX_MAX_LAYERS];
int64_t bits_in_buffer_model_;
double timebase_;
int64_t bits_total_[VPX_MAX_LAYERS];
int frame_number_;
vpx_codec_pts_t first_drop_;
int64_t bits_total_;
double duration_;
double file_datarate_[VPX_MAX_LAYERS];
double file_datarate_;
size_t bits_in_last_frame_;
vpx_svc_extra_cfg_t svc_params_;
int speed_setting_;
@@ -1537,27 +1309,14 @@ class DatarateOnePassCbrSvc
int denoiser_on_;
int tune_content_;
int base_speed_setting_;
int spatial_layer_id_;
int temporal_layer_id_;
int number_spatial_layers_;
int number_temporal_layers_;
int layer_target_avg_bandwidth_[VPX_MAX_LAYERS];
bool dynamic_drop_layer_;
unsigned int top_sl_width_;
unsigned int top_sl_height_;
vpx_svc_ref_frame_config_t ref_frame_config;
int update_pattern_;
};
static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
const vpx_svc_extra_cfg_t *svc_params,
int spatial_layers, int temporal_layers,
int temporal_layering_mode,
int *layer_target_avg_bandwidth,
int64_t *bits_in_buffer_model) {
int temporal_layering_mode) {
int sl, spatial_layer_target;
float total = 0;
float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
float framerate = 30.0;
for (sl = 0; sl < spatial_layers; ++sl) {
if (svc_params->scaling_factor_den[sl] > 0) {
alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 /
@@ -1577,41 +1336,8 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
} else if (temporal_layering_mode == 2) {
enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target;
} else if (temporal_layering_mode <= 1) {
enc_cfg->layer_target_bitrate[index] = spatial_layer_target;
}
}
for (sl = 0; sl < spatial_layers; ++sl) {
for (int tl = 0; tl < temporal_layers; ++tl) {
const int layer = sl * temporal_layers + tl;
float layer_framerate = framerate;
if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2;
if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4;
if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2;
layer_target_avg_bandwidth[layer] = static_cast<int>(
enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate);
bits_in_buffer_model[layer] =
enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz;
}
}
}
static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg,
int number_spatial_layers,
int number_temporal_layers,
double *file_datarate,
double thresh_overshoot,
double thresh_undershoot) {
for (int sl = 0; sl < number_spatial_layers; ++sl)
for (int tl = 0; tl < number_temporal_layers; ++tl) {
const int layer = sl * number_temporal_layers + tl;
ASSERT_GE(cfg->layer_target_bitrate[layer],
file_datarate[layer] * thresh_overshoot)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg->layer_target_bitrate[layer],
file_datarate[layer] * thresh_undershoot)
<< " The datarate for the file is lower than the target by too much!";
}
}
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
@@ -1637,21 +1363,14 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 10;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
top_sl_width_ = 1280;
top_sl_height_ = 720;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
cfg_.rc_target_bitrate = 500;
ResetModel();
tune_content_ = 1;
base_speed_setting_ = speed_setting_;
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
@@ -1679,30 +1398,26 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 200);
// TODO(marpan): Check that effective_datarate for each layer hits the
// layer target_bitrate.
for (int i = 200; i <= 800; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78,
1.15);
ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 200 (half of the sequence)
// Since frame dropper is off, we can expect 100 (half of the sequence)
// mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
EXPECT_EQ(static_cast<unsigned int>(100), GetMismatchFrames());
#endif
}
}
@@ -1731,43 +1446,33 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
// TODO(marpan): Check that effective_datarate for each layer hits the
// layer target_bitrate.
// For SVC, noise_sen = 1 means denoising only the top spatial layer
// noise_sen = 2 means denoising the two top spatial layers.
for (int noise_sen = 1; noise_sen <= 2; noise_sen++) {
for (int i = 600; i <= 1000; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
denoiser_on_ = noise_sen;
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78,
1.15);
for (int i = 600; i <= 1000; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
denoiser_on_ = 1;
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC
// pattern
// will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 200 (half of the sequence)
// mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 150 (half of the sequence)
// mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
#endif
}
}
}
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) {
TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
@@ -1788,25 +1493,21 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) {
svc_params_.scaling_factor_num[1] = 288;
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 10;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 200);
cfg_.rc_target_bitrate = 400;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
// For this 3 temporal layer case, pattern repeats every 4 frames, so choose
// 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
for (int j = 64; j <= 67; j++) {
cfg_.kf_max_dist = j;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78,
1.15);
ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
}
}
@@ -1834,25 +1535,22 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
top_sl_width_ = 1280;
top_sl_height_ = 720;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
cfg_.rc_target_bitrate = 800;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 30 (half of the sequence)
// Since frame dropper is off, we can expect 150 (half of the sequence)
// mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames());
EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
#endif
}
@@ -1882,126 +1580,25 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) {
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
cfg_.rc_target_bitrate = 800;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 200 (half of the sequence)
// Since frame dropper is off, we can expect 150 (half of the sequence)
// mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
#endif
}
// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one
// generate via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables
// inter-layer prediction.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL2TLDynamicPatternChange) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
cfg_.ss_number_layers = 3;
cfg_.ts_number_layers = 2;
cfg_.ts_rate_decimator[0] = 2;
cfg_.ts_rate_decimator[1] = 1;
cfg_.g_error_resilient = 1;
cfg_.g_threads = 1;
cfg_.temporal_layering_mode = 2;
svc_params_.scaling_factor_num[0] = 72;
svc_params_.scaling_factor_den[0] = 288;
svc_params_.scaling_factor_num[1] = 144;
svc_params_.scaling_factor_den[1] = 288;
svc_params_.scaling_factor_num[2] = 288;
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
// Change SVC pattern on the fly.
update_pattern_ = 1;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
cfg_.rc_target_bitrate = 800;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 200 (half of the sequence)
// mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
#endif
}
// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and on
// the fly switching to 2 spatial layers and then back to 3. This switch is done
// by setting top spatial layer bitrate to 0, and then back to non-zero, during
// the sequence.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL_to_2SL_dynamic) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
cfg_.rc_min_quantizer = 0;
cfg_.rc_max_quantizer = 63;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
cfg_.ss_number_layers = 3;
cfg_.ts_number_layers = 1;
cfg_.ts_rate_decimator[0] = 1;
cfg_.g_error_resilient = 1;
cfg_.g_threads = 1;
cfg_.temporal_layering_mode = 0;
svc_params_.scaling_factor_num[0] = 72;
svc_params_.scaling_factor_den[0] = 288;
svc_params_.scaling_factor_num[1] = 144;
svc_params_.scaling_factor_den[1] = 288;
svc_params_.scaling_factor_num[2] = 288;
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
cfg_.rc_target_bitrate = 800;
ResetModel();
dynamic_drop_layer_ = true;
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
// Don't check rate targeting on top spatial layer since it will be skipped
// for part of the sequence.
CheckLayerRateTargeting(&cfg_, number_spatial_layers_ - 1,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
}
// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) {
@@ -2027,25 +1624,20 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) {
svc_params_.scaling_factor_num[2] = 288;
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 10;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
cfg_.rc_target_bitrate = 800;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
0, 400);
top_sl_width_ = 640;
top_sl_height_ = 480;
// For this 3 temporal layer case, pattern repeats every 4 frames, so choose
// 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
for (int j = 32; j <= 35; j++) {
cfg_.kf_max_dist = j;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78,
1.15);
ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30)
<< " The datarate for the file is lower than the target by too much!";
}
}
@@ -2075,25 +1667,22 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) {
svc_params_.scaling_factor_den[2] = 288;
cfg_.rc_dropframe_thresh = 0;
cfg_.kf_max_dist = 9999;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
top_sl_width_ = 1280;
top_sl_height_ = 720;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
cfg_.rc_target_bitrate = 800;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode,
layer_target_avg_bandwidth_, bits_in_buffer_model_);
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
<< " The datarate for the file is lower than the target by too much!";
#if CONFIG_VP9_DECODER
// Number of temporal layers > 1, so half of the frames in this SVC pattern
// will be non-reference frame and hence encoder will avoid loopfilter.
// Since frame dropper is off, we can expect 30 (half of the sequence)
// Since frame dropper is off, we can expect 150 (half of the sequence)
// mismatched frames.
EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames());
EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
#endif
}
@@ -2125,21 +1714,9 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) {
cfg_.layer_target_bitrate[0] = 300;
cfg_.layer_target_bitrate[1] = 1400;
cfg_.rc_target_bitrate = 1700;
number_spatial_layers_ = cfg_.ss_number_layers;
number_temporal_layers_ = cfg_.ts_number_layers;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
ResetModel();
layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30;
bits_in_buffer_model_[0] =
cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz;
layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30;
bits_in_buffer_model_[1] =
cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz;
::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
top_sl_width_ = 1280;
top_sl_height_ = 720;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
number_temporal_layers_, file_datarate_, 0.78, 1.15);
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
@@ -2152,9 +1729,6 @@ VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
::testing::Values(::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime),
::testing::Range(2, 9));
VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime,
::testing::Values(::libvpx_test::kRealTime),
::testing::Range(5, 9));
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser,
::testing::Values(::libvpx_test::kRealTime),


@@ -28,8 +28,8 @@
using libvpx_test::ACMRandom;
using libvpx_test::Buffer;
using std::tr1::make_tuple;
using std::tr1::tuple;
using std::tr1::make_tuple;
namespace {
typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride);

File diff suppressed because it is too large.


@@ -106,90 +106,4 @@ TEST(EncodeAPI, ImageSizeSetting) {
}
#endif
// Set up 2 spatial streams with 2 temporal layers per stream, and generate
// invalid configuration by setting the temporal layer rate allocation
// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of
// CONFIG_MULTI_RES_ENCODING.
TEST(EncodeAPI, MultiResEncode) {
static const vpx_codec_iface_t *kCodecs[] = {
#if CONFIG_VP8_ENCODER
&vpx_codec_vp8_cx_algo,
#endif
#if CONFIG_VP9_ENCODER
&vpx_codec_vp9_cx_algo,
#endif
};
const int width = 1280;
const int height = 720;
const int width_down = width / 2;
const int height_down = height / 2;
const int target_bitrate = 1000;
const int framerate = 30;
for (int c = 0; c < NELEMENTS(kCodecs); ++c) {
const vpx_codec_iface_t *const iface = kCodecs[c];
vpx_codec_ctx_t enc[2];
vpx_codec_enc_cfg_t cfg[2];
vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
memset(enc, 0, sizeof(enc));
for (int i = 0; i < 2; i++) {
vpx_codec_enc_config_default(iface, &cfg[i], 0);
}
/* Highest-resolution encoder settings */
cfg[0].g_w = width;
cfg[0].g_h = height;
cfg[0].rc_dropframe_thresh = 0;
cfg[0].rc_end_usage = VPX_CBR;
cfg[0].rc_resize_allowed = 0;
cfg[0].rc_min_quantizer = 2;
cfg[0].rc_max_quantizer = 56;
cfg[0].rc_undershoot_pct = 100;
cfg[0].rc_overshoot_pct = 15;
cfg[0].rc_buf_initial_sz = 500;
cfg[0].rc_buf_optimal_sz = 600;
cfg[0].rc_buf_sz = 1000;
cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
cfg[0].g_lag_in_frames = 0;
cfg[0].kf_mode = VPX_KF_AUTO;
cfg[0].kf_min_dist = 3000;
cfg[0].kf_max_dist = 3000;
cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */
cfg[0].g_timebase.num = 1; /* Set fps */
cfg[0].g_timebase.den = framerate;
memcpy(&cfg[1], &cfg[0], sizeof(cfg[0]));
cfg[1].rc_target_bitrate = 500;
cfg[1].g_w = width_down;
cfg[1].g_h = height_down;
for (int i = 0; i < 2; i++) {
cfg[i].ts_number_layers = 2;
cfg[i].ts_periodicity = 2;
cfg[i].ts_rate_decimator[0] = 2;
cfg[i].ts_rate_decimator[1] = 1;
cfg[i].ts_layer_id[0] = 0;
cfg[i].ts_layer_id[1] = 1;
// Invalid parameters.
cfg[i].ts_target_bitrate[0] = 0;
cfg[i].ts_target_bitrate[1] = 0;
}
// VP9 should report incapable, VP8 invalid for all configurations.
const char kVP9Name[] = "WebM Project VP9";
const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface),
sizeof(kVP9Name) - 1) == 0;
EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
for (int i = 0; i < 2; i++) {
vpx_codec_destroy(&enc[i]);
}
}
}
} // namespace


@@ -201,8 +201,6 @@ void EncoderTest::RunLoop(VideoSource *video) {
PreEncodeFrameHook(video, encoder.get());
encoder->EncodeFrame(video, frame_flags_);
PostEncodeFrameHook(encoder.get());
CxDataIterator iter = encoder->GetCxData();
bool has_cxdata = false;


@@ -128,11 +128,6 @@ class Encoder {
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@@ -142,12 +137,15 @@ class Encoder {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
#endif
#if CONFIG_VP8_ENCODER
void Control(int ctrl_id, vpx_roi_map_t *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
#endif
void Config(const vpx_codec_enc_cfg_t *cfg) {
const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@@ -221,8 +219,6 @@ class EncoderTest {
virtual void PreEncodeFrameHook(VideoSource * /*video*/,
Encoder * /*encoder*/) {}
virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {}
// Hook to be called on every compressed data packet.
virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}


@@ -675,9 +675,7 @@ INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
::testing::Values(make_tuple(&vpx_fdct8x8_neon,
&vpx_idct8x8_64_add_neon,
0, VPX_BITS_8)));
// TODO(linfengz): reenable these functions once test vector failures are
// addressed.
#if 0 // !CONFIG_VP9_HIGHBITDEPTH
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
NEON, FwdTrans8x8HT,
::testing::Values(

View File

@@ -174,4 +174,4 @@ INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
INSTANTIATE_TEST_CASE_P(MMI, IDCTTest,
::testing::Values(vp8_short_idct4x4llm_mmi));
#endif // HAVE_MMI
} // namespace
}

View File

@@ -123,7 +123,6 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
#if CONFIG_VP8_DECODER
const DecodeParam kVP8InvalidFileTests[] = {
{ 1, "invalid-bug-1443.ivf" },
{ 1, "invalid-token-partition.ivf" },
};
VP8_INSTANTIATE_TEST_CASE(InvalidFileTest,

View File

@@ -114,18 +114,6 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
}
}
uint8_t GetOuterThresh(ACMRandom *rnd) {
return static_cast<uint8_t>(rnd->RandRange(3 * MAX_LOOP_FILTER + 5));
}
uint8_t GetInnerThresh(ACMRandom *rnd) {
return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1));
}
uint8_t GetHevThresh(ACMRandom *rnd) {
return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4);
}
class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
public:
virtual ~Loop8Test6Param() {}
@@ -174,15 +162,15 @@ TEST_P(Loop8Test6Param, OperationCheck) {
int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = GetOuterThresh(&rnd);
uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t,
blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd);
tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t,
limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd);
tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t,
thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -233,15 +221,15 @@ TEST_P(Loop8Test6Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = GetOuterThresh(&rnd);
uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t,
blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd);
tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t,
limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd);
tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t,
thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -283,27 +271,27 @@ TEST_P(Loop8Test9Param, OperationCheck) {
int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = GetOuterThresh(&rnd);
uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t,
blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd);
tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t,
limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd);
tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t,
thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetOuterThresh(&rnd);
tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t,
blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd);
tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t,
limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd);
tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t,
thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -346,27 +334,27 @@ TEST_P(Loop8Test9Param, ValueCheck) {
int first_failure = -1;
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = GetOuterThresh(&rnd);
uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t,
blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd);
tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t,
limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd);
tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t,
thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetOuterThresh(&rnd);
tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
DECLARE_ALIGNED(16, const uint8_t,
blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetInnerThresh(&rnd);
tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
DECLARE_ALIGNED(16, const uint8_t,
limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
tmp = GetHevThresh(&rnd);
tmp = rnd.Rand8();
DECLARE_ALIGNED(16, const uint8_t,
thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
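The GetOuterThresh()/GetInnerThresh()/GetHevThresh() helpers introduced above centralize the sampling ranges for the three loop-filter parameters. Assuming ACMRandom::RandRange(n) draws uniformly from [0, n - 1] and MAX_LOOP_FILTER is 63 (its value in the library), the inclusive maxima work out as below; note the inline rnd(3 * MAX_LOOP_FILTER + 4) calls being replaced could never produce the top blimit value:

    #include <stdio.h>

    #define MAX_LOOP_FILTER 63 /* assumed, matching the library */

    int main(void) {
      /* GetOuterThresh: RandRange(3 * MAX_LOOP_FILTER + 5) -> [0, 193] */
      printf("blimit max: %d\n", 3 * MAX_LOOP_FILTER + 4);
      /* GetInnerThresh: RandRange(MAX_LOOP_FILTER + 1) -> [0, 63] */
      printf("limit max:  %d\n", MAX_LOOP_FILTER);
      /* GetHevThresh: RandRange(MAX_LOOP_FILTER + 1) >> 4 -> [0, 3] */
      printf("thresh max: %d\n", MAX_LOOP_FILTER >> 4);
      return 0;
    }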

View File

@@ -277,29 +277,12 @@ class ResizeTest
SetMode(GET_PARAM(1));
}
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
encode_frame_width_.push_back(pkt->data.frame.width[0]);
encode_frame_height_.push_back(pkt->data.frame.height[0]);
}
unsigned int GetFrameWidth(size_t idx) const {
return encode_frame_width_[idx];
}
unsigned int GetFrameHeight(size_t idx) const {
return encode_frame_height_[idx];
}
virtual void DecompressedFrameHook(const vpx_image_t &img,
vpx_codec_pts_t pts) {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
}
std::vector<FrameInfo> frame_info_list_;
std::vector<unsigned int> encode_frame_width_;
std::vector<unsigned int> encode_frame_height_;
};
TEST_P(ResizeTest, TestExternalResizeWorks) {
@@ -313,9 +296,6 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
const unsigned int frame = static_cast<unsigned>(info->pts);
unsigned int expected_w;
unsigned int expected_h;
const size_t idx = info - frame_info_list_.begin();
ASSERT_EQ(info->w, GetFrameWidth(idx));
ASSERT_EQ(info->h, GetFrameHeight(idx));
ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
&expected_h, 0);
EXPECT_EQ(expected_w, info->w)
@@ -484,23 +464,8 @@ class ResizeRealtimeTest
++mismatch_nframes_;
}
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
encode_frame_width_.push_back(pkt->data.frame.width[0]);
encode_frame_height_.push_back(pkt->data.frame.height[0]);
}
unsigned int GetMismatchFrames() { return mismatch_nframes_; }
unsigned int GetFrameWidth(size_t idx) const {
return encode_frame_width_[idx];
}
unsigned int GetFrameHeight(size_t idx) const {
return encode_frame_height_[idx];
}
void DefaultConfig() {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600;
@@ -528,8 +493,6 @@ class ResizeRealtimeTest
bool change_bitrate_;
double mismatch_psnr_;
int mismatch_nframes_;
std::vector<unsigned int> encode_frame_width_;
std::vector<unsigned int> encode_frame_height_;
};
TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
@@ -619,9 +582,6 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
int resize_count = 0;
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
const size_t idx = info - frame_info_list_.begin();
ASSERT_EQ(info->w, GetFrameWidth(idx));
ASSERT_EQ(info->h, GetFrameHeight(idx));
if (info->w != last_w || info->h != last_h) {
resize_count++;
if (resize_count == 1) {

View File

@@ -112,9 +112,8 @@ INSTANTIATE_TEST_CASE_P(
#endif // HAVE_SSE2
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(
MSA, SumSquaresTest,
::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
&vpx_sum_squares_2d_i16_msa)));
INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest, ::testing::Values(make_tuple(
&vpx_sum_squares_2d_i16_c,
&vpx_sum_squares_2d_i16_msa)));
#endif // HAVE_MSA
} // namespace

View File

@@ -734,8 +734,6 @@ endif # CONFIG_VP9_HIGHBITDEPTH
# Invalid files for testing libvpx error checking.
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm

View File

@@ -852,7 +852,5 @@ e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm
d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res
fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf
fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res
1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf
90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res
17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm
e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5

View File

@@ -61,6 +61,7 @@ int main(int argc, char **argv) {
#if !CONFIG_SHARED
// Shared library builds don't support whitebox tests
// that exercise internal symbols.
#if CONFIG_VP8
vp8_rtcd();
#endif // CONFIG_VP8

View File

@@ -27,8 +27,8 @@
namespace {
using libvpx_test::ACMRandom;
using std::string;
using libvpx_test::ACMRandom;
#if CONFIG_WEBM_IO

View File

@@ -59,7 +59,7 @@ const TestVideoParam kTestVectors[] = {
// Encoding modes tested
const libvpx_test::TestMode kEncodingModeVectors[] = {
::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime
::libvpx_test::kRealTime,
};
// Speed settings tested

View File

@@ -22,7 +22,7 @@ namespace {
// Encoding modes
const libvpx_test::TestMode kEncodingModeVectors[] = {
::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime
::libvpx_test::kRealTime,
};
// Encoding speeds

View File

@@ -14,9 +14,9 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "./vp9_rtcd.h"
#include "test/acm_random.h"
#include "test/buffer.h"
#include "test/clear_system_state.h"
@@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
uint16_t *eob, const int16_t *scan,
const int16_t *iscan);
typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
int /*max_size*/, bool /*is_fp*/>
int /*max_size*/>
QuantizeParam;
// Wrapper for FP version which does not use zbin or quant_shift.
@@ -69,15 +69,11 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
class VP9QuantizeBase {
public:
VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
: bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size)
: bit_depth_(bit_depth), max_size_(max_size) {
max_value_ = (1 << bit_depth_) - 1;
zbin_ptr_ =
reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
round_fp_ptr_ = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
quant_fp_ptr_ = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
round_ptr_ =
reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
quant_ptr_ =
@@ -90,15 +86,11 @@ class VP9QuantizeBase {
~VP9QuantizeBase() {
vpx_free(zbin_ptr_);
vpx_free(round_fp_ptr_);
vpx_free(quant_fp_ptr_);
vpx_free(round_ptr_);
vpx_free(quant_ptr_);
vpx_free(quant_shift_ptr_);
vpx_free(dequant_ptr_);
zbin_ptr_ = NULL;
round_fp_ptr_ = NULL;
quant_fp_ptr_ = NULL;
round_ptr_ = NULL;
quant_ptr_ = NULL;
quant_shift_ptr_ = NULL;
@@ -108,8 +100,6 @@ class VP9QuantizeBase {
protected:
int16_t *zbin_ptr_;
int16_t *round_fp_ptr_;
int16_t *quant_fp_ptr_;
int16_t *round_ptr_;
int16_t *quant_ptr_;
int16_t *quant_shift_ptr_;
@@ -117,136 +107,29 @@ class VP9QuantizeBase {
const vpx_bit_depth_t bit_depth_;
int max_value_;
const int max_size_;
const bool is_fp_;
};
class VP9QuantizeTest : public VP9QuantizeBase,
public ::testing::TestWithParam<QuantizeParam> {
public:
VP9QuantizeTest()
: VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)),
quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}
: VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3)), quantize_op_(GET_PARAM(0)),
ref_quantize_op_(GET_PARAM(1)) {}
protected:
const QuantizeFunc quantize_op_;
const QuantizeFunc ref_quantize_op_;
};
// This quantizer compares the AC coefficients to the quantization step size to
// determine if further multiplication operations are needed.
// Based on vp9_quantize_fp_sse2().
inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan, int is_32x32) {
int i, eob = -1;
const int thr = dequant_ptr[1] >> (1 + is_32x32);
(void)iscan;
(void)skip_block;
assert(!skip_block);
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
for (i = 0; i < n_coeffs; i += 16) {
int y;
int nzflag_cnt = 0;
int abs_coeff[16];
int coeff_sign[16];
// count nzflag for each row (16 tran_low_t)
for (y = 0; y < 16; ++y) {
const int rc = i + y;
const int coeff = coeff_ptr[rc];
coeff_sign[y] = (coeff >> 31);
abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y];
// The first 16 are skipped in the sse2 code. Do the same here to match.
if (i >= 16 && (abs_coeff[y] <= thr)) {
nzflag_cnt++;
}
}
for (y = 0; y < 16; ++y) {
const int rc = i + y;
// If all of the AC coeffs in a row have magnitude less than the
// quantization step_size/2, quantize to zero.
if (nzflag_cnt < 16) {
int tmp;
int _round;
if (is_32x32) {
_round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
} else {
_round = round_ptr[rc != 0];
}
tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX);
tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32);
qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y];
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (is_32x32) {
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
} else {
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
}
} else {
qcoeff_ptr[rc] = 0;
dqcoeff_ptr[rc] = 0;
}
}
}
// Scan for eob.
for (i = 0; i < n_coeffs; i++) {
// Use the scan order to find the correct eob.
const int rc = scan[i];
if (qcoeff_ptr[rc]) {
eob = i;
}
}
*eob_ptr = eob + 1;
}
void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
}
void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
}
void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
int16_t *quant, int16_t *quant_shift,
int16_t *dequant, int16_t *round_fp,
int16_t *quant_fp) {
// Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
const int max_qrounding_factor_fp = 64;
int16_t *dequant) {
for (int j = 0; j < 2; j++) {
// The range is 4 to 1828 in the VP9 tables.
const int qlookup = rnd->RandRange(1825) + 4;
round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
quant_fp[j] = (1 << 16) / qlookup;
// Values determined by deconstructing vp9_init_quantizer().
// zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
// values or U/V values of any bit depth. This is because y_delta is not
// factored into the vp9_ac_quant() call.
zbin[j] = rnd->RandRange(1200);
// round may be up to 685 for Y values or 914 for U/V.
round[j] = rnd->RandRange(914);
// quant ranges from 1 to -32703
@@ -258,8 +141,6 @@ void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
}
for (int j = 2; j < 8; j++) {
zbin[j] = zbin[1];
round_fp[j] = round_fp[1];
quant_fp[j] = quant_fp[1];
round[j] = round[1];
quant[j] = quant[1];
quant_shift[j] = quant_shift[1];
@@ -298,19 +179,19 @@ TEST_P(VP9QuantizeTest, OperationCheck) {
const int count = (4 << sz) * (4 << sz);
coeff.Set(&rnd, -max_value_, max_value_);
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
quant_fp_ptr_);
int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
scan_order->scan, scan_order->iscan);
quant_shift_ptr_, dequant_ptr_);
ASM_REGISTER_STATE_CHECK(quantize_op_(
coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
round_ptr_, quant_ptr_, quant_shift_ptr_,
ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
dequant_ptr_, &ref_eob, scan_order->scan,
scan_order->iscan);
ASM_REGISTER_STATE_CHECK(
quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
round_ptr_, quant_ptr_, quant_shift_ptr_,
qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
@@ -360,19 +241,19 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
coeff.TopLeftPixel()[rnd(count)] =
static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
quant_fp_ptr_);
int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
scan_order->scan, scan_order->iscan);
quant_shift_ptr_, dequant_ptr_);
ASM_REGISTER_STATE_CHECK(quantize_op_(
coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
round_ptr_, quant_ptr_, quant_shift_ptr_,
ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
dequant_ptr_, &ref_eob, scan_order->scan,
scan_order->iscan);
ASM_REGISTER_STATE_CHECK(
quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
round_ptr_, quant_ptr_, quant_shift_ptr_,
qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
@@ -418,10 +299,7 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) {
const int count = (4 << sz) * (4 << sz);
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
quant_fp_ptr_);
int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
quant_shift_ptr_, dequant_ptr_);
if (i == 0) {
// When |coeff values| are less than zbin the results are 0.
@@ -441,10 +319,10 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) {
vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
for (int j = 0; j < 100000000 / count; ++j) {
quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(),
dqcoeff.TopLeftPixel(), dequant_ptr_, &eob,
scan_order->scan, scan_order->iscan);
quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
round_ptr_, quant_ptr_, quant_shift_ptr_,
qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
dequant_ptr_, &eob, scan_order->scan, scan_order->iscan);
}
vpx_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
@@ -467,54 +345,50 @@ INSTANTIATE_TEST_CASE_P(
SSE2, VP9QuantizeTest,
::testing::Values(
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
VPX_BITS_8, 16, false),
VPX_BITS_8, 16),
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
VPX_BITS_10, 16, false),
VPX_BITS_10, 16),
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
VPX_BITS_12, 16, false),
VPX_BITS_12, 16),
make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32),
make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32),
make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
&vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32)));
#else
INSTANTIATE_TEST_CASE_P(
SSE2, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
VPX_BITS_8, 16, false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true)));
INSTANTIATE_TEST_CASE_P(SSE2, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_sse2,
&vpx_quantize_b_c,
VPX_BITS_8, 16)));
#endif // CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
DISABLED_SSE2, VP9QuantizeTest,
::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
16)));
#endif // HAVE_SSE2
#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
VPX_BITS_8, 16, false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
&QuantFPWrapper<quantize_fp_32x32_nz_c>,
VPX_BITS_8, 32, true)));
#else
INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_ssse3,
&vpx_quantize_b_c,
VPX_BITS_8, 16, false)));
#endif
VPX_BITS_8, 16)));
#if ARCH_X86_64
// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test.
INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest,
::testing::Values(make_tuple(
&vpx_quantize_b_32x32_ssse3,
&vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false)));
INSTANTIATE_TEST_CASE_P(
DISABLED_SSSE3, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3,
&vpx_quantize_b_32x32_c, VPX_BITS_8, 32),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
16),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
VPX_BITS_8, 32)));
#endif // ARCH_X86_64
#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
@@ -524,54 +398,36 @@ INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest,
INSTANTIATE_TEST_CASE_P(
AVX, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c,
VPX_BITS_8, 16, false),
VPX_BITS_8, 16),
// Even though SSSE3 and AVX do not match the reference
// code, we can keep them in sync with each other.
make_tuple(&vpx_quantize_b_32x32_avx,
&vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32,
false)));
&vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32)));
#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH
#if ARCH_X86_64 && HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
AVX2, VP9QuantizeTest,
::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true)));
#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
NEON, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
VPX_BITS_8, 16, false),
make_tuple(&vpx_quantize_b_32x32_neon,
&vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
false),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
16, true),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
VPX_BITS_8, 32, true)));
::testing::Values(
make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16),
make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
VPX_BITS_8, 32),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32)));
#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
// Only useful to compare "Speed" test results.
INSTANTIATE_TEST_CASE_P(
DISABLED_C, VP9QuantizeTest,
::testing::Values(
make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16),
make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
32, false),
32),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
make_tuple(&QuantFPWrapper<quantize_fp_32x32_nz_c>,
&QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32,
true),
&QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16),
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
true)));
&QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32)));
} // namespace
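A worked sketch of the arithmetic the fp path above exercises. quant_fp_nz() skips a 16-coefficient row when every AC magnitude is at most thr = dequant[1] >> (1 + is_32x32), and GenerateHelperArrays() derives round_fp/quant_fp from a table value qlookup. For a hypothetical qlookup of 64 (and assuming dequant[1] tracks the same step size):

    #include <stdio.h>

    int main(void) {
      const int max_qrounding_factor_fp = 64; /* from GenerateHelperArrays() */
      const int qlookup = 64;                 /* hypothetical table value */
      const int round_fp = (max_qrounding_factor_fp * qlookup) >> 7; /* 32 */
      const int quant_fp = (1 << 16) / qlookup;                      /* 1024 */
      const int dequant_ac = qlookup; /* assumption: dequant[1] == qlookup */
      printf("round_fp=%d quant_fp=%d thr(16x16)=%d thr(32x32)=%d\n",
             round_fp, quant_fp, dequant_ac >> 1, dequant_ac >> 2);
      return 0;
    }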

View File

@@ -47,7 +47,7 @@ class ScaleTest : public VpxScaleBase,
scale_fn_(&img_, &dst_img_, filter_type, phase_scaler));
}
void RunTest(INTERP_FILTER filter_type) {
void RunTest() {
static const int kNumSizesToTest = 20;
static const int kNumScaleFactorsToTest = 4;
static const int kSizesToTest[] = {
@@ -55,48 +55,50 @@ class ScaleTest : public VpxScaleBase,
22, 24, 26, 28, 30, 32, 34, 68, 128, 134
};
static const int kScaleFactors[] = { 1, 2, 3, 4 };
for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
for (int h = 0; h < kNumSizesToTest; ++h) {
const int src_height = kSizesToTest[h];
for (int w = 0; w < kNumSizesToTest; ++w) {
const int src_width = kSizesToTest[w];
for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
++sf_up_idx) {
const int sf_up = kScaleFactors[sf_up_idx];
for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
++sf_down_idx) {
const int sf_down = kScaleFactors[sf_down_idx];
const int dst_width = src_width * sf_up / sf_down;
const int dst_height = src_height * sf_up / sf_down;
if (sf_up == sf_down && sf_up != 1) {
continue;
for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
for (int h = 0; h < kNumSizesToTest; ++h) {
const int src_height = kSizesToTest[h];
for (int w = 0; w < kNumSizesToTest; ++w) {
const int src_width = kSizesToTest[w];
for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
++sf_up_idx) {
const int sf_up = kScaleFactors[sf_up_idx];
for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
++sf_down_idx) {
const int sf_down = kScaleFactors[sf_down_idx];
const int dst_width = src_width * sf_up / sf_down;
const int dst_height = src_height * sf_up / sf_down;
if (sf_up == sf_down && sf_up != 1) {
continue;
}
// I420 frame width and height must be even.
if (!dst_width || !dst_height || dst_width & 1 ||
dst_height & 1) {
continue;
}
// vpx_convolve8_c() has a restriction on the step, which cannot
// exceed 64 (ratio 1 to 4).
if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
continue;
}
ASSERT_NO_FATAL_FAILURE(ResetScaleImages(
src_width, src_height, dst_width, dst_height));
ReferenceScaleFrame(filter_type, phase_scaler);
ScaleFrame(filter_type, phase_scaler);
if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
ref_img_.frame_size)) {
printf(
"filter_type = %d, phase_scaler = %d, src_width = %4d, "
"src_height = %4d, dst_width = %4d, dst_height = %4d, "
"scale factor = %d:%d\n",
filter_type, phase_scaler, src_width, src_height,
dst_width, dst_height, sf_down, sf_up);
PrintDiff();
}
CompareImages(dst_img_);
DeallocScaleImages();
}
// I420 frame width and height must be even.
if (!dst_width || !dst_height || dst_width & 1 ||
dst_height & 1) {
continue;
}
// vpx_convolve8_c() has a restriction on the step, which cannot
// exceed 64 (ratio 1 to 4).
if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
continue;
}
ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height,
dst_width, dst_height));
ReferenceScaleFrame(filter_type, phase_scaler);
ScaleFrame(filter_type, phase_scaler);
if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
ref_img_.frame_size)) {
printf(
"filter_type = %d, phase_scaler = %d, src_width = %4d, "
"src_height = %4d, dst_width = %4d, dst_height = %4d, "
"scale factor = %d:%d\n",
filter_type, phase_scaler, src_width, src_height, dst_width,
dst_height, sf_down, sf_up);
PrintDiff();
}
CompareImages(dst_img_);
DeallocScaleImages();
}
}
}
@@ -143,10 +145,7 @@ class ScaleTest : public VpxScaleBase,
ScaleFrameFunc scale_fn_;
};
TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); }
TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); }
TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); }
TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); }
TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); }
TEST_P(ScaleTest, DISABLED_Speed) {
static const int kCountSpeedTestBlock = 100;
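The folded RunTest() above iterates every (filter_type, phase_scaler, size, ratio) combination and skips the ones the scaler cannot handle. The three skip conditions, extracted here into a standalone predicate for readability (same logic as the loop body):

    /* Sketch of the combination filter inside ScaleTest::RunTest(). */
    static int IsTestableScale(int src_w, int src_h, int sf_up, int sf_down) {
      const int dst_w = src_w * sf_up / sf_down;
      const int dst_h = src_h * sf_up / sf_down;
      /* 2:2, 3:3 and 4:4 duplicate the 1:1 case. */
      if (sf_up == sf_down && sf_up != 1) return 0;
      /* I420 frame dimensions must be non-zero and even. */
      if (!dst_w || !dst_h || (dst_w & 1) || (dst_h & 1)) return 0;
      /* vpx_convolve8_c() limits the step to 64, i.e. at most 4x down. */
      if (src_w > 4 * dst_w || src_h > 4 * dst_h) return 0;
      return 1;
    }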

View File

@@ -147,6 +147,7 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
// -----------------------------------------------------------------------------
// Multi-threaded decode tests
#if CONFIG_WEBM_IO
struct FileList {
const char *name;

tools/all_builds.py (new executable file, 72 lines)
View File

@@ -0,0 +1,72 @@
#!/usr/bin/python
import getopt
import subprocess
import sys
LONG_OPTIONS = ["shard=", "shards="]
BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental"
def RunCommand(command):
run = subprocess.Popen(command, shell=True)
output = run.communicate()
if run.returncode:
print "Non-zero return code: " + str(run.returncode) + " => exiting!"
sys.exit(1)
def list_of_experiments():
experiments = []
configure_file = open("configure")
list_start = False
for line in configure_file.read().split("\n"):
if line == 'EXPERIMENT_LIST="':
list_start = True
elif line == '"':
list_start = False
elif list_start:
currently_broken = ["csm"]
experiment = line[4:]
if experiment not in currently_broken:
experiments.append(experiment)
return experiments
def main(argv):
# Parse arguments
options = {"--shard": 0, "--shards": 1}
if "--" in argv:
opt_end_index = argv.index("--")
else:
opt_end_index = len(argv)
try:
o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS)
except getopt.GetoptError, err:
print str(err)
print "Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"%argv[0]
sys.exit(2)
options.update(o)
extra_args = argv[opt_end_index + 1:]
# Shard experiment list
shard = int(options["--shard"])
shards = int(options["--shards"])
experiments = list_of_experiments()
base_command = " ".join([BASE_COMMAND] + extra_args)
configs = [base_command]
configs += ["%s --enable-%s" % (base_command, e) for e in experiments]
my_configs = zip(configs, range(len(configs)))
my_configs = filter(lambda x: x[1] % shards == shard, my_configs)
my_configs = [e[0] for e in my_configs]
# Run configs for this shard
for config in my_configs:
test_build(config)
def test_build(configure_command):
print "\033[34m\033[47mTesting %s\033[0m" % (configure_command)
RunCommand(configure_command)
RunCommand("make clean")
RunCommand("make")
if __name__ == "__main__":
main(sys.argv)
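The script assigns configuration i to the shard where i % shards == shard, and everything after "--" is appended verbatim to each ./configure invocation. A plausible invocation for the first of four shards (the extra flag is only an example):

    tools/all_builds.py --shard=0 --shards=4 -- --disable-unit-tests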

tools/author_first_release.sh (new executable file, 15 lines)
View File

@@ -0,0 +1,15 @@
#!/bin/bash
##
## List the release each author first contributed to.
##
## Usage: author_first_release.sh [TAGS]
##
## If the TAGS arguments are unspecified, all tags reported by `git tag`
## will be considered.
##
tags=${@:-$(git tag)}
for tag in $tags; do
git shortlog -n -e -s $tag |
cut -f2- |
awk "{print \"${tag#v}\t\"\$0}"
done | sort -k2 | uniq -f2

tools/ftfy.sh (new executable file, 158 lines)
View File

@@ -0,0 +1,158 @@
#!/bin/sh
self="$0"
dirname_self=$(dirname "$self")
usage() {
cat <<EOF >&2
Usage: $self [option]
This script applies a whitespace transformation to the commit at HEAD. If no
options are given, then the modified files are left in the working tree.
Options:
-h, --help Shows this message
-n, --dry-run Shows a diff of the changes to be made.
--amend Squashes the changes into the commit at HEAD
This option will also reformat the commit message.
--commit Creates a new commit containing only the whitespace changes
--msg-only Reformat the commit message only, ignore the patch itself.
EOF
rm -f ${CLEAN_FILES}
exit 1
}
log() {
echo "${self##*/}: $@" >&2
}
vpx_style() {
for f; do
case "$f" in
*.h|*.c|*.cc)
clang-format -i --style=file "$f"
;;
esac
done
}
apply() {
[ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1"
}
commit() {
LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}')
if [ -z "$LAST_CHANGEID" ]; then
log "HEAD doesn't have a Change-Id, unable to generate a new commit"
exit 1
fi
# Build a deterministic Change-Id from the parent's
NEW_CHANGEID=${LAST_CHANGEID}-styled
NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin)
# Commit, preserving authorship from the parent commit.
git commit -a -C HEAD > /dev/null
git commit --amend -F- << EOF
Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9}
Change-Id: ${NEW_CHANGEID}
EOF
}
show_commit_msg_diff() {
if [ $DIFF_MSG_RESULT -ne 0 ]; then
log "Modified commit message:"
diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3
fi
}
amend() {
show_commit_msg_diff
if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then
git commit -a --amend -F "$NEW_COMMIT_MSG"
fi
}
diff_msg() {
git log -1 --format=%B > "$ORIG_COMMIT_MSG"
"${dirname_self}"/wrap-commit-msg.py \
< "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG"
cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG"
DIFF_MSG_RESULT=$?
}
# Temporary files
ORIG_DIFF=orig.diff.$$
MODIFIED_DIFF=modified.diff.$$
FINAL_DIFF=final.diff.$$
ORIG_COMMIT_MSG=orig.commit-msg.$$
NEW_COMMIT_MSG=new.commit-msg.$$
CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}"
CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}"
# Preconditions
[ $# -lt 2 ] || usage
if ! clang-format -version >/dev/null 2>&1; then
log "clang-format not found"
exit 1
fi
if ! git diff --quiet HEAD; then
log "Working tree is dirty, commit your changes first"
exit 1
fi
# Need to be in the root
cd "$(git rev-parse --show-toplevel)"
# Collect the original diff
git show > "${ORIG_DIFF}"
# Apply the style guide on new and modified files and collect its diff
for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do
case "$f" in
third_party/*) continue;;
esac
vpx_style "$f"
done
git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}"
# Intersect the two diffs
"${dirname_self}"/intersect-diffs.py \
"${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}"
INTERSECT_RESULT=$?
git reset --hard >/dev/null
# Fixup the commit message
diff_msg
# Handle options
if [ -n "$1" ]; then
case "$1" in
-h|--help) usage;;
-n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;;
--commit) apply "${FINAL_DIFF}"; commit;;
--amend) apply "${FINAL_DIFF}"; amend;;
--msg-only) amend;;
*) usage;;
esac
else
apply "${FINAL_DIFF}"
if ! git diff --quiet; then
log "Formatting changes applied, verify and commit."
log "See also: http://www.webmproject.org/code/contribute/conventions/"
git diff --stat
fi
fi
rm -f ${CLEAN_FILES}

View File

@@ -37,9 +37,7 @@ extern "C" {
#define SEGMENT_DELTADATA 0
#define SEGMENT_ABSDATA 1
typedef struct {
int r, c;
} POS;
typedef struct { int r, c; } POS;
#define PLANE_TYPE_Y_NO_DC 0
#define PLANE_TYPE_Y2 1
@@ -182,9 +180,6 @@ typedef struct {
unsigned int low_res_ref_frames[MAX_REF_FRAMES];
// The video frame counter value for the key frame, for lowest resolution.
unsigned int key_frame_counter_value;
// Flags to signal skipped encoding of previous and base layer stream.
unsigned int skip_encoding_prev_stream;
unsigned int skip_encoding_base_stream;
LOWER_RES_MB_INFO *mb_info;
} LOWER_RES_FRAME_INFO;
#endif

View File

@@ -6,7 +6,7 @@
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
*/
#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
#define VP8_COMMON_DEFAULT_COEF_PROBS_H_

View File

@@ -20,7 +20,8 @@ static void copy_and_extend_plane(unsigned char *s, /* source */
int et, /* extend top border */
int el, /* extend left border */
int eb, /* extend bottom border */
int er) { /* extend right border */
int er /* extend right border */
) {
int i;
unsigned char *src_ptr1, *src_ptr2;
unsigned char *dest_ptr1, *dest_ptr2;

View File

@@ -934,8 +934,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
s4 = s3 + p;
/* load quad-byte vectors
* memory is 4 byte aligned
*/
* memory is 4 byte aligned
*/
p2 = *((uint32_t *)(s1 - 4));
p6 = *((uint32_t *)(s1));
p1 = *((uint32_t *)(s2 - 4));
@@ -990,8 +990,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
:);
/* if (p1 - p4 == 0) and (p2 - p3 == 0)
* mask will be zero and filtering is not needed
*/
* mask will be zero and filtering is not needed
*/
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
thresh, &hev, &mask);
@@ -2102,8 +2102,8 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
s4 = s3 + p;
/* load quad-byte vectors
* memory is 4 byte aligned
*/
* memory is 4 byte aligned
*/
p2 = *((uint32_t *)(s1 - 4));
p6 = *((uint32_t *)(s1));
p1 = *((uint32_t *)(s2 - 4));

View File

@@ -12,7 +12,7 @@
#include "vpx_mem/vpx_mem.h"
void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
int stride, char *eobs) {
int stride, int8_t *eobs) {
int i, j;
for (i = 0; i < 4; i++) {
@@ -33,7 +33,8 @@ void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
}
void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu,
uint8_t *dstv, int stride, char *eobs) {
uint8_t *dstv, int stride,
int8_t *eobs) {
int i, j;
for (i = 0; i < 2; i++) {

View File

@@ -461,87 +461,96 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
);
}
/* clang-format off */
#define VP8_MBLOOP_HPSRAB \
"punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \
"psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \
"psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \
"packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
"xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
"xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
"punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" \
"psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" \
"psrah %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
"packsshb %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
#define VP8_MBLOOP_HPSRAB_ADD(reg) \
"punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \
"punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \
"pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \
"pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \
"paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \
"paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \
"psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
"psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \
"packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
/* clang-format on */
#define VP8_MBLOOP_HPSRAB_PMULHH(reg1, reg2) \
"pmulhh " #reg1 ", " #reg1 ", " #reg2 " \n\t"
#define VP8_MBLOOP_HPSRAB_ADD(reg) \
"xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
"xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
"punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp3], reg) \
VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp8], reg) \
"paddh %[ftmp3], %[ftmp3], %[ff_ph_003f] \n\t" \
"paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \
"psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t" \
"psrah %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
"packsshb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
void vp8_mbloop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
uint32_t tmp[1];
double ftmp[13];
mips_reg addr[2];
DECLARE_ALIGNED(8, const uint64_t, srct[1]);
double ftmp[10];
__asm__ volatile (
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
"1: \n\t"
"gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t"
/* ftmp1: p3 */
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
/* ftmp3: p2 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
/* ftmp4: p1 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
/* ftmp5: p0 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
/* ftmp6: q0 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
/* ftmp7: q1 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
/* ftmp8: q2 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
/* ftmp2: q3 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t"
"gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[addr1], %[addr0], %[tmp0])
"gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
"psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
/* ftmp4:p1 */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t"
"psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
/* ftmp5:p0 */
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp4], %[ftmp5] \n\t"
"sdc1 %[ftmp1], 0x00(%[srct]) \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
"psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t"
/* ftmp6:q0 */
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
/* ftmp7:q1 */
"gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
"pasubub %[ftmp1], %[ftmp7], %[ftmp6] \n\t"
"sdc1 %[ftmp1], 0x08(%[srct]) \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
MMI_ADDU(%[addr1], %[addr0], %[tmp0])
"gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
"pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
@@ -554,7 +563,9 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
"mtc1 %[tmp0], %[ftmp9] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t"
"gsldlc1 %[ftmp9], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[blimit]) \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
/* ftmp0: mask */
@@ -562,26 +573,29 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
"gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t"
"psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
"psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t"
"ldc1 %[ftmp1], 0x00(%[srct]) \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"ldc1 %[ftmp2], 0x08(%[srct]) \n\t"
"psubusb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
/* ftmp1: hev */
/* ftmp1:hev*/
"xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
"xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
"psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"and %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
"pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
"sdc1 %[ftmp2], 0x00(%[srct]) \n\t"
"and %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
"li %[tmp0], 0x0b \n\t"
@@ -592,71 +606,75 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
"paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t"
VP8_MBLOOP_HPSRAB
"psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
"ldc1 %[ftmp2], 0x00(%[srct]) \n\t"
"pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t"
"li %[tmp0], 0x07 \n\t"
"mtc1 %[tmp0], %[ftmp9] \n\t"
"xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
"psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
"psubsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
"xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
"gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
"gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
"paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
"psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
"paddsb %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
"psubsb %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
"xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
"gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
"xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
"xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
"psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
"xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
"xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
"gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[addr1], %[addr0], %[tmp0])
"gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
"gsldlc1 %[ftmp7], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
"xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
"psubsb %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
"xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp7], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[addr1], %[addr0], %[tmp0])
"gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
"addiu %[count], %[count], -0x01 \n\t"
MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
"bnez %[count], 1b \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
[src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
[src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
: [limit]"r"(limit), [blimit]"r"(blimit),
[srct]"r"(srct), [thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
[ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
[ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03),
[ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00),
[ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f)
[ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
[ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03),
[ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00),
[ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f)
: "memory"
);
}
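The psubusb/or chains above are the SIMD form of the scalar filter-mask test: a saturating subtract against limit is non-zero exactly when a neighbor difference exceeds it, and the combined p0/q0, p1/q1 term is checked against blimit. A scalar sketch of the same computation (modeled on the C reference in vp8/common/loopfilter_filters.c; the helper name here is illustrative):

    #include <stdlib.h>

    typedef unsigned char uc;

    /* Returns all-ones (-1) when the edge should be filtered, 0 otherwise. */
    static signed char filter_mask(uc limit, uc blimit, uc p3, uc p2, uc p1,
                                   uc p0, uc q0, uc q1, uc q2, uc q3) {
      signed char mask = 0;
      mask |= (abs(p3 - p2) > limit);
      mask |= (abs(p2 - p1) > limit);
      mask |= (abs(p1 - p0) > limit);
      mask |= (abs(q1 - q0) > limit);
      mask |= (abs(q2 - q1) > limit);
      mask |= (abs(q3 - q2) > limit);
      /* abs(p0-q0)*2 + abs(p1-q1)/2: the blimit term built with
         paddusb/psrlh in the MMI code above. */
      mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
      return mask - 1;
    }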
@@ -678,60 +696,64 @@ void vp8_mbloop_filter_vertical_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
mips_reg tmp[1];
mips_reg addr[2];
DECLARE_ALIGNED(8, const uint64_t, srct[1]);
double ftmp[14];
double ftmp[13];
__asm__ volatile (
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
"1: \n\t"
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
MMI_ADDU(%[addr0], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
"gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
"punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
"punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
"punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
"punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
"gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t"
"punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t"
"punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t"
"punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t"
"punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
"punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t"
"punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t"
"punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
"punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
"punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
"punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
"punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
"punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
"gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
"punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t"
"punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t"
"punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t"
"punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t"
"punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
"punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
"punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t"
"punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t"
"gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t"
/* ftmp9:q0 ftmp10:q1 */
"punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
"punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
@@ -749,61 +771,60 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
"gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t"
/* abs (q3-q2) */
"pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
"psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t"
"psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
/* abs (q2-q1) */
"pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp3: abs(q1-q0) */
"pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
"psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t"
"psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp4: abs(p1-p0) */
"pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
"psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t"
"psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p2-p1) */
"pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p3-p2) */
"pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t"
"gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t"
/* abs (p0-q0) * 2 */
/* abs (p0-q0) */
"pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* abs (p1-q1) / 2 */
/* abs (p1-q1) */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
"li %[tmp0], 0x01 \n\t"
"mtc1 %[tmp0], %[ftmp8] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
"psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
"gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t"
"psubusb %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"or %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
"xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
/* ftmp0: mask */
"pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
/* abs(p1-p0) - thresh */
"psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
/* abs(q1-q0) - thresh */
"psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
"gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t"
/* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */
"psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
"psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
"or %[ftmp3], %[ftmp4], %[ftmp3] \n\t"
"pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* ftmp1: hev */
"xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
/* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
"xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t"
"xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
"xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
@@ -816,30 +837,30 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* filter_value &= mask */
"and %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
/* Filter2 = filter_value & hev */
"and %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
/* filter_value &= ~hev */
"pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
"paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
"li %[tmp0], 0x0b \n\t"
"mtc1 %[tmp0], %[ftmp12] \n\t"
"xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
"xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
/* ftmp9: qs0 */
"psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t"
"xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
"xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
/* ftmp6: ps0 */
"paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"li %[tmp0], 0x07 \n\t"
@@ -851,10 +872,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t"
/* ftmp9: oq0 */
"xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t"
/* ftmp6: op0 */
"xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
@@ -863,10 +882,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t"
/* ftmp10: oq1 */
"xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
/* ftmp5: op1 */
"xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
@@ -874,10 +891,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t"
/* ftmp11: oq2 */
"xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t"
/* ftmp2: op2 */
"xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t"
"ldc1 %[ftmp12], 0x00(%[srct]) \n\t"
@@ -901,40 +916,41 @@ void vp8_mbloop_filter_vertical_edge_mmi(
"punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t"
"punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
"punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
"punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
"punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t"
"punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
"punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t"
"punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[addr0]) \n\t"
MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"addiu %[count], %[count], -0x01 \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
@@ -946,9 +962,9 @@ void vp8_mbloop_filter_vertical_edge_mmi(
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
[tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
[count]"+&r"(count)
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]),
[src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
: [limit]"r"(limit), [blimit]"r"(blimit),
[srct]"r"(srct), [thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step),

View File

@@ -86,7 +86,6 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
register double ftmp8 asm("$f18");
register double ftmp9 asm("$f20");
register double ftmp10 asm("$f22");
register double ftmp11 asm("$f24");
#else
register double fzero asm("$f0");
register double ftmp0 asm("$f1");
@@ -100,7 +99,6 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
register double ftmp8 asm("$f9");
register double ftmp9 asm("$f10");
register double ftmp10 asm("$f11");
register double ftmp11 asm("$f12");
#endif // _MIPS_SIM == _ABIO32
__asm__ volatile (
@@ -114,13 +112,11 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
"li %[tmp0], 0x07 \n\t"
"mtc1 %[tmp0], %[ftmp7] \n\t"
"li %[tmp0], 0x08 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
"mtc1 %[tmp0], %[ftmp10] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t"
"gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t"
"punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t"
@@ -129,21 +125,24 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
"pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
"gsldlc1 %[ftmp9], 0x06(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp9], -0x01(%[src_ptr]) \n\t"
"punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
"punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
"punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
"dsrl %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
"punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
"punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
"dsrl %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
"punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
@@ -164,9 +163,8 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
[ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
[ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
[ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
[ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]),
[output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
[src_ptr]"+&r"(src_ptr)
[tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
[output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
: [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
[vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width),
[ff_ph_40]"f"(ff_ph_40)
@@ -192,11 +190,6 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
register double ftmp6 asm("$f14");
register double ftmp7 asm("$f16");
register double ftmp8 asm("$f18");
register double ftmp9 asm("$f20");
register double ftmp10 asm("$f22");
register double ftmp11 asm("$f24");
register double ftmp12 asm("$f26");
register double ftmp13 asm("$f28");
#else
register double fzero asm("$f0");
register double ftmp0 asm("$f1");
@@ -208,11 +201,6 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
register double ftmp6 asm("$f7");
register double ftmp7 asm("$f8");
register double ftmp8 asm("$f9");
register double ftmp9 asm("$f10");
register double ftmp10 asm("$f11");
register double ftmp11 asm("$f12");
register double ftmp12 asm("$f13");
register double ftmp13 asm("$f14");
#endif // _MIPS_SIM == _ABIO32
__asm__ volatile (
@@ -222,56 +210,52 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
"ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t"
"ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
"ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[pixels_per_line_x2])
"xor %[fzero], %[fzero], %[fzero] \n\t"
"li %[tmp0], 0x07 \n\t"
"mtc1 %[tmp0], %[ftmp13] \n\t"
"mtc1 %[tmp0], %[ftmp7] \n\t"
/* In order to make full use of the memory load delay slot, the memory
 * loads and the arithmetic have been rearranged.
 */
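/* Concretely, in the rearranged version below: the six gsldlc1/gsldrc1
 * row loads into ftmp6..ftmp11 are all issued first, interleaved only
 * with address arithmetic, and the pmullh/paddsh filter chain runs
 * afterwards, so the load latency is hidden behind useful work instead
 * of stalling each multiply. */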
"1: \n\t"
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
"pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line])
"gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
"gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
"gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t"
"gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
"gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t"
"gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
"gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t"
"gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
"gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
"gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
"pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
"pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
"pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t"
"pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t"
"pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
"pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t"
"paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t"
"psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
"packushb %[ftmp12], %[ftmp12], %[fzero] \n\t"
"gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t"
"gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
"packushb %[ftmp8], %[ftmp8], %[fzero] \n\t"
"gsswlc1 %[ftmp8], 0x03(%[output_ptr]) \n\t"
"gsswrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t"
MMI_ADDIU(%[output_height], %[output_height], -0x01)
MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
@@ -281,11 +265,9 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
[ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4),
[ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
[ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
[ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
[ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12),
[ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr),
[output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
[tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]),
[src_ptr]"+&r"(src_ptr), [output_ptr]"+&r"(output_ptr),
[output_height]"+&r"(output_height)
: [pixels_per_line]"r"((mips_reg)pixels_per_line),
[pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
[pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
@@ -319,7 +301,6 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
"1: \n\t"
"gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
"punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t"
"gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t"
@@ -327,6 +308,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
"addiu %[output_height], %[output_height], -0x01 \n\t"
MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width])
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
"bnez %[output_height], 1b \n\t"
: [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
[ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr),
@@ -356,12 +338,12 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
"1: \n\t"
"gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
MMI_ADDIU(%[output_height], %[output_height], -0x01)
"packushb %[ftmp1], %[ftmp0], %[fzero] \n\t"
"gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t"
"gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
MMI_ADDIU(%[output_height], %[output_height], -0x01)
MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
"bnez %[output_height], 1b \n\t"
: [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
@@ -404,7 +386,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
} \
} else { \
for (i = 0; i < loop; ++i) { \
vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \
vp8_filter_block1dc_v6_mmi(FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, \
dst_pitch, n * 2, VFilter); \
} \
} \
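A note on the two call forms in the hunk above: one offsets the source pointer at the call site (FData2 + n * 2), the other rewinds two rows inside the kernel (the MMI_SUBU by pixels_per_line_x2 in vp8_filter_block1dc_v6_mmi). Since one row of the intermediate buffer holds n unsigned short elements (n * 2 bytes, the value passed as pixels_per_line), both reach the same first source row. A standalone sketch of the arithmetic, with first_source_row a hypothetical helper:

/* One row = n shorts, so +n * 2 elements is +2 rows, and rewinding 2 rows
 * (pixels_per_line_x2 == n * 4 bytes) returns to fdata2. The vertical
 * 6-tap filter needs exactly those two rows of context above its first
 * output row. */
static unsigned short *first_source_row(unsigned short *fdata2, int n) {
  unsigned short *at_call_site = fdata2 + n * 2; /* +2 rows */
  return at_call_site - 2 * n;                   /* -2 rows == fdata2 */
}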

View File

@@ -11,16 +11,28 @@
#include "entropy.h"
const int vp8_mode_contexts[6][4] = {
{ /* 0 */
7, 1, 1, 143 },
{ /* 1 */
14, 18, 14, 107 },
{ /* 2 */
135, 64, 57, 68 },
{ /* 3 */
60, 56, 128, 65 },
{ /* 4 */
159, 134, 128, 34 },
{ /* 5 */
234, 188, 128, 28 },
{
/* 0 */
7, 1, 1, 143,
},
{
/* 1 */
14, 18, 14, 107,
},
{
/* 2 */
135, 64, 57, 68,
},
{
/* 3 */
60, 56, 128, 65,
},
{
/* 4 */
159, 134, 128, 34,
},
{
/* 5 */
234, 188, 128, 28,
},
};

View File

@@ -1,13 +1,3 @@
##
## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
sub vp8_common_forward_decls() {
print <<EOF
/*

View File

@@ -6,7 +6,7 @@
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_

View File

@@ -95,7 +95,9 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
int src_pixels_per_line, int xoffset,
int yoffset, unsigned char *dst_ptr,
int dst_pitch) {
int dst_pitch
) {
DECLARE_ALIGNED(16, unsigned short,
FData2[24 * 24]); /* Temp data buffer used in filtering */
@@ -234,7 +236,9 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
int src_pixels_per_line, int xoffset,
int yoffset, unsigned char *dst_ptr,
int dst_pitch) {
int dst_pitch
) {
DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
if (xoffset) {
@@ -347,8 +351,8 @@ void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
yoffset);
} else {
/* ssse3 second-pass only function couldn't handle (xoffset==0 &&
* yoffset==0) case correctly. Add copy function here to guarantee
* six-tap function handles all possible offsets. */
int r;
for (r = 0; r < 4; ++r) {

View File

@@ -674,7 +674,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi,
static int read_is_valid(const unsigned char *start, size_t len,
const unsigned char *end) {
return len != 0 && end > start && len <= (size_t)(end - start);
return (start + len > start && start + len <= end);
}
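The two return statements above are the before/after of this change: one computes start + len, which can wrap around the address space (pointer overflow is undefined behavior in C) and let the check pass for an invalid span; the other compares len against the pointer difference, which keeps every operand in range. A standalone sketch of the safe form:

#include <stddef.h>

/* Overflow-safe span check (sketch; assumes start and end point into the
 * same buffer, so that end - start is well-defined). */
static int span_is_valid(const unsigned char *start, size_t len,
                         const unsigned char *end) {
  return len != 0 && end > start && len <= (size_t)(end - start);
}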
static unsigned int read_available_partition_size(

View File

@@ -34,9 +34,7 @@ typedef struct {
/* Structure used to hold all the overlaps of a macroblock. The overlaps of a
* macroblock are further divided into block overlaps.
*/
typedef struct {
B_OVERLAP overlaps[16];
} MB_OVERLAP;
typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP;
/* Structure for keeping track of motion vectors and which reference frame they
* refer to. Used for motion vector interpolation.

View File

@@ -31,9 +31,7 @@ typedef struct {
void *ptr2;
} DECODETHREAD_DATA;
typedef struct {
MACROBLOCKD mbd;
} MB_ROW_DEC;
typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC;
typedef struct {
int enabled;

View File

@@ -739,21 +739,24 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
/* Allocate memory for above_row buffers. */
CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
CHECK_MEM_ERROR(pbi->mt_yabove_row[i],
vpx_memalign(16, sizeof(unsigned char) *
(width + (VP8BORDERINPIXELS << 1))));
CHECK_MEM_ERROR(
pbi->mt_yabove_row[i],
vpx_memalign(
16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1))));
CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
CHECK_MEM_ERROR(pbi->mt_uabove_row[i],
vpx_memalign(16, sizeof(unsigned char) *
(uv_width + VP8BORDERINPIXELS)));
CHECK_MEM_ERROR(
pbi->mt_uabove_row[i],
vpx_memalign(16,
sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
for (i = 0; i < pc->mb_rows; ++i)
CHECK_MEM_ERROR(pbi->mt_vabove_row[i],
vpx_memalign(16, sizeof(unsigned char) *
(uv_width + VP8BORDERINPIXELS)));
CHECK_MEM_ERROR(
pbi->mt_vabove_row[i],
vpx_memalign(16,
sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
/* Allocate memory for left_col buffers. */
CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);

View File

@@ -9,12 +9,12 @@
*/
/****************************************************************************
*
* Module Title : boolhuff.h
*
* Description : Bool Coder header file.
*
****************************************************************************/
#ifndef VP8_ENCODER_BOOLHUFF_H_
#define VP8_ENCODER_BOOLHUFF_H_

View File

@@ -989,11 +989,11 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
bits_per_mb_at_this_q =
vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
bits_per_mb_at_this_q =
(int)(.5 + err_correction_factor * speed_correction *
cpi->twopass.est_max_qcorrection_factor *
cpi->twopass.section_max_qfactor *
(double)bits_per_mb_at_this_q);
bits_per_mb_at_this_q = (int)(.5 +
err_correction_factor * speed_correction *
cpi->twopass.est_max_qcorrection_factor *
cpi->twopass.section_max_qfactor *
(double)bits_per_mb_at_this_q);
/* Mode and motion overhead */
/* As Q rises in real encode loop rd code will force overhead down
@@ -1086,8 +1086,9 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
bits_per_mb_at_this_q =
(int)(.5 + err_correction_factor * speed_correction * clip_iifactor *
(double)bits_per_mb_at_this_q);
(int)(.5 +
err_correction_factor * speed_correction * clip_iifactor *
(double)bits_per_mb_at_this_q);
/* Mode and motion overhead */
/* As Q rises in real encode loop rd code will force overhead down
@@ -1272,8 +1273,9 @@ void vp8_init_second_pass(VP8_COMP *cpi) {
* sum duration is not. Its calculated based on the actual durations of
* all frames from the first pass.
*/
vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
cpi->twopass.total_stats.duration);
vp8_new_framerate(cpi,
10000000.0 * cpi->twopass.total_stats.count /
cpi->twopass.total_stats.duration);
cpi->output_framerate = cpi->framerate;
cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
@@ -1737,11 +1739,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
/* Don't break out very close to a key frame */
((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
(!flash_detected) &&
((mv_ratio_accumulator > 100.0) ||
(abs_mv_in_out_accumulator > 3.0) ||
(mv_in_out_accumulator < -2.0) ||
((boost_score - old_boost_score) < 2.0)))) {
(!flash_detected) && ((mv_ratio_accumulator > 100.0) ||
(abs_mv_in_out_accumulator > 3.0) ||
(mv_in_out_accumulator < -2.0) ||
((boost_score - old_boost_score) < 2.0)))) {
boost_score = old_boost_score;
break;
}
@@ -1814,9 +1815,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(next_frame.pcnt_inter > 0.75) &&
((mv_in_out_accumulator / (double)i > -0.2) ||
(mv_in_out_accumulator > -2.0)) &&
(cpi->gfu_boost > 100) &&
(cpi->twopass.gf_decay_rate <=
(ARF_DECAY_THRESH + (cpi->gfu_boost / 200))))
(cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <=
(ARF_DECAY_THRESH + (cpi->gfu_boost / 200))))
#endif
{
int Boost;

View File

@@ -2862,6 +2862,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
fclose(yframe);
}
#endif
/* return of 0 means drop frame */
#if !CONFIG_REALTIME_ONLY
/* Function to test for conditions that indicate we should loop
@@ -3363,6 +3364,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
(LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info;
if (cpi->oxcf.mr_encoder_id) {
// TODO(marpan): This constraint shouldn't be needed, as we would like
// to allow for key frame setting (forced or periodic) defined per
// spatial layer. For now, keep this in.
cm->frame_type = low_res_frame_info->frame_type;
// Check if lower resolution is available for motion vector reuse.
if (cm->frame_type != KEY_FRAME) {
cpi->mr_low_res_mv_avail = 1;
@@ -3387,16 +3393,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
== low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
*/
}
// Disable motion vector reuse (i.e., disable any usage of the low_res)
// if the previous lower stream is skipped/disabled.
if (low_res_frame_info->skip_encoding_prev_stream) {
cpi->mr_low_res_mv_avail = 0;
}
}
// This stream is not skipped (i.e., it's being encoded), so set this skip
// flag to 0. This is needed for the next stream (i.e., which is the next
// frame to be encoded).
low_res_frame_info->skip_encoding_prev_stream = 0;
// On a key frame: For the lowest resolution, keep track of the key frame
// counter value. For the higher resolutions, reset the current video
@@ -3802,7 +3799,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
/* Setup background Q adjustment for error resilient mode.
* For multi-layer encodes only enable this for the base layer.
*/
if (cpi->cyclic_refresh_mode_enabled) {
// Special case for screen_content_mode with golden frame updates.
int disable_cr_gf =
@@ -4785,6 +4782,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
cpi->temporal_pattern_counter++;
}
/* reset to normal state now that we are done. */
#if 0
{
char filename[512];
@@ -5000,13 +4999,10 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
// be received for that high layer, which will yield an incorrect
// frame rate (from time-stamp adjustment in above calculation).
if (cpi->oxcf.mr_encoder_id) {
if (!low_res_frame_info->skip_encoding_base_stream)
cpi->ref_framerate = low_res_frame_info->low_res_framerate;
cpi->ref_framerate = low_res_frame_info->low_res_framerate;
} else {
// Keep track of frame rate for lowest resolution.
low_res_frame_info->low_res_framerate = cpi->ref_framerate;
// The base stream is being encoded so set skip flag to 0.
low_res_frame_info->skip_encoding_base_stream = 0;
}
}
#endif

View File

@@ -741,10 +741,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
/* If the frame has big static background and current MB is in low
* motion area, its mode decision is biased to ZEROMV mode.
* No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
* At such speed settings, ZEROMV is already heavily favored.
*/
if (cpi->Speed < 12) {
calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
}

View File

@@ -996,7 +996,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
* bits on this frame even if it is a constructed arf.
* The active maximum quantizer ensures that an appropriate
* number of bits will be spent if needed for constructed ARFs.
*/
cpi->this_frame_target = 0;
}
@@ -1052,8 +1052,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
* overflow when values are large
*/
projected_size_based_on_q =
(int)(((.5 + rate_correction_factor *
vp8_bits_per_mb[cpi->common.frame_type][Q]) *
(int)(((.5 +
rate_correction_factor *
vp8_bits_per_mb[cpi->common.frame_type][Q]) *
cpi->common.MBs) /
(1 << BPER_MB_NORMBITS));

View File

@@ -23,7 +23,6 @@
#include "modecosts.h"
#include "encodeintra.h"
#include "pickinter.h"
#include "vp8/common/common.h"
#include "vp8/common/entropymode.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra.h"
@@ -770,9 +769,9 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
vp8_quantize_mbuv(x);
rate_to = rd_cost_mbuv(x);
this_rate =
rate_to + x->intra_uv_mode_cost[xd->frame_type]
[xd->mode_info_context->mbmi.uv_mode];
this_rate = rate_to +
x->intra_uv_mode_cost[xd->frame_type]
[xd->mode_info_context->mbmi.uv_mode];
this_distortion = vp8_mbuverror(x) / 4;
@@ -960,13 +959,19 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
vp8_variance_fn_ptr_t *v_fn_ptr;
ENTROPY_CONTEXT_PLANES t_above, t_left;
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
ENTROPY_CONTEXT *ta_b;
ENTROPY_CONTEXT *tl_b;
memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
vp8_zero(t_above_b);
vp8_zero(t_left_b);
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
ta_b = (ENTROPY_CONTEXT *)&t_above_b;
tl_b = (ENTROPY_CONTEXT *)&t_left_b;
br = 0;
bd = 0;
@@ -1146,13 +1151,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
mode_selected = this_mode;
best_label_rd = this_rd;
memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
}
} /*for each 4x4 mode*/
memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
bsi->ref_mv, x->mvcost);

View File

@@ -56,7 +56,8 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2],
static void vp8_treed_write(vp8_writer *const w, vp8_tree t,
const vp8_prob *const p, int v,
int n) { /* number of bits in v, assumed nonzero */
int n /* number of bits in v, assumed nonzero */
) {
vp8_tree_index i = 0;
do {
@@ -72,7 +73,8 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t,
}
static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v,
int n) { /* number of bits in v, assumed nonzero */
int n /* number of bits in v, assumed nonzero */
) {
int c = 0;
vp8_tree_index i = 0;

View File

@@ -802,20 +802,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
unsigned long deadline) {
vpx_codec_err_t res = VPX_CODEC_OK;
if (!ctx->cfg.rc_target_bitrate) {
#if CONFIG_MULTI_RES_ENCODING
if (!ctx->cpi) return VPX_CODEC_ERROR;
if (ctx->cpi->oxcf.mr_total_resolutions > 1) {
LOWER_RES_FRAME_INFO *low_res_frame_info =
(LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info;
if (!low_res_frame_info) return VPX_CODEC_ERROR;
low_res_frame_info->skip_encoding_prev_stream = 1;
if (ctx->cpi->oxcf.mr_encoder_id == 0)
low_res_frame_info->skip_encoding_base_stream = 1;
}
#endif
return res;
}
if (!ctx->cfg.rc_target_bitrate) return res;
if (img) res = validate_img(ctx, img);
@@ -915,8 +902,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
(unsigned long)((delta * ctx->cfg.g_timebase.den + round) /
ctx->cfg.g_timebase.num / 10000000);
pkt.data.frame.flags = lib_flags << 16;
pkt.data.frame.width[0] = cpi->common.Width;
pkt.data.frame.height[0] = cpi->common.Height;
if (lib_flags & FRAMEFLAGS_KEY) {
pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
@@ -1274,9 +1259,6 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = {
vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */
vp8e_encode, /* vpx_codec_encode_fn_t encode; */
vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */
vp8e_set_config,
NULL,
vp8e_get_preview,
vp8e_mr_alloc_mem,
vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem,
} /* encoder functions */
};

View File

@@ -200,9 +200,9 @@ static vpx_codec_err_t update_error_state(
static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
void *user_priv) {
/** vpx_img_wrap() doesn't allow specifying independent strides for
* the Y, U, and V planes, nor other alignment adjustments that
* might be representable by a YV12_BUFFER_CONFIG, so we just
* initialize all the fields.*/
img->fmt = VPX_IMG_FMT_I420;
img->w = yv12->y_stride;
img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;

View File

@@ -1,160 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/arm/neon/vp9_iht_neon.h"
#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
static INLINE void highbd_iadst4(int32x4_t *const io) {
const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
const int32x4_t sinpi = vld1q_s32(sinpis);
int32x4_t s[8];
s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
s[7] = vsubq_s32(io[0], io[2]);
s[7] = vaddq_s32(s[7], io[3]);
s[0] = vaddq_s32(s[0], s[3]);
s[0] = vaddq_s32(s[0], s[5]);
s[1] = vsubq_s32(s[1], s[4]);
s[1] = vsubq_s32(s[1], s[6]);
s[3] = s[2];
s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
io[0] = vaddq_s32(s[0], s[3]);
io[1] = vaddq_s32(s[1], s[3]);
io[2] = s[2];
io[3] = vaddq_s32(s[0], s[1]);
io[3] = vsubq_s32(io[3], s[3]);
io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
}
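For reference, highbd_iadst4 above vectorizes the scalar 4-point inverse ADST. A sketch of the same dataflow, with the sinpi_* values and the 14-bit rounding shift from vpx_dsp/txfm_common.h written out:

#include <stdint.h>

/* Scalar 4-point inverse ADST matching the dataflow above (sketch). */
static void iadst4_scalar(const int32_t in[4], int32_t out[4]) {
  const int64_t sinpi_1_9 = 5283, sinpi_2_9 = 9929;
  const int64_t sinpi_3_9 = 13377, sinpi_4_9 = 15212;
  const int64_t s0 = sinpi_1_9 * in[0] + sinpi_4_9 * in[2] + sinpi_2_9 * in[3];
  const int64_t s1 = sinpi_2_9 * in[0] - sinpi_1_9 * in[2] - sinpi_4_9 * in[3];
  const int64_t s3 = sinpi_3_9 * in[1];
  const int64_t s2 = sinpi_3_9 * (in[0] - in[2] + in[3]);
  /* vrshrq_n_s32(x, DCT_CONST_BITS): rounding shift by 14 */
  out[0] = (int32_t)((s0 + s3 + (1 << 13)) >> 14);
  out[1] = (int32_t)((s1 + s3 + (1 << 13)) >> 14);
  out[2] = (int32_t)((s2 + (1 << 13)) >> 14);
  out[3] = (int32_t)((s0 + s1 - s3 + (1 << 13)) >> 14);
}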
void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
int16x8_t a[2];
int32x4_t c[4];
c[0] = vld1q_s32(input);
c[1] = vld1q_s32(input + 4);
c[2] = vld1q_s32(input + 8);
c[3] = vld1q_s32(input + 12);
if (bd == 8) {
a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1]));
a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3]));
transpose_s16_4x4q(&a[0], &a[1]);
switch (tx_type) {
case DCT_DCT:
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
transpose_s16_4x4q(&a[0], &a[1]);
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
case ADST_DCT:
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
transpose_s16_4x4q(&a[0], &a[1]);
iadst4(a);
break;
case DCT_ADST:
iadst4(a);
transpose_s16_4x4q(&a[0], &a[1]);
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
default:
assert(tx_type == ADST_ADST);
iadst4(a);
transpose_s16_4x4q(&a[0], &a[1]);
iadst4(a);
break;
}
a[0] = vrshrq_n_s16(a[0], 4);
a[1] = vrshrq_n_s16(a[1], 4);
} else {
switch (tx_type) {
case DCT_DCT: {
const int32x4_t cospis = vld1q_s32(kCospi32);
if (bd == 10) {
idct4x4_16_kernel_bd10(cospis, c);
idct4x4_16_kernel_bd10(cospis, c);
} else {
idct4x4_16_kernel_bd12(cospis, c);
idct4x4_16_kernel_bd12(cospis, c);
}
break;
}
case ADST_DCT: {
const int32x4_t cospis = vld1q_s32(kCospi32);
if (bd == 10) {
idct4x4_16_kernel_bd10(cospis, c);
} else {
idct4x4_16_kernel_bd12(cospis, c);
}
transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
highbd_iadst4(c);
break;
}
case DCT_ADST: {
const int32x4_t cospis = vld1q_s32(kCospi32);
transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
highbd_iadst4(c);
if (bd == 10) {
idct4x4_16_kernel_bd10(cospis, c);
} else {
idct4x4_16_kernel_bd12(cospis, c);
}
break;
}
default: {
assert(tx_type == ADST_ADST);
transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
highbd_iadst4(c);
transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
highbd_iadst4(c);
break;
}
}
a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4));
a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4));
}
highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max);
}

View File

@@ -14,63 +14,206 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/arm/neon/vp9_iht_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
int32x4_t q8s32, q9s32;
int16x4x2_t d0x2s16, d1x2s16;
int32x4x2_t q0x2s32;
d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
q0x2s32 = vtrnq_s32(q8s32, q9s32);
*q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
*q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
}
static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
int16x4_t *d2s16) {
*d0s16 = vdup_n_s16(cospi_8_64);
*d1s16 = vdup_n_s16(cospi_16_64);
*d2s16 = vdup_n_s16(cospi_24_64);
}
static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
int16x4_t *d5s16, int16x8_t *q3s16) {
*d3s16 = vdup_n_s16(sinpi_1_9);
*d4s16 = vdup_n_s16(sinpi_2_9);
*q3s16 = vdupq_n_s16(sinpi_3_9);
*d5s16 = vdup_n_s16(sinpi_4_9);
}
static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
int16x4_t *d2s16, int16x8_t *q8s16,
int16x8_t *q9s16) {
int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
int16x4_t d26s16, d27s16, d28s16, d29s16;
int32x4_t q10s32, q13s32, q14s32, q15s32;
int16x8_t q13s16, q14s16;
d16s16 = vget_low_s16(*q8s16);
d17s16 = vget_high_s16(*q8s16);
d18s16 = vget_low_s16(*q9s16);
d19s16 = vget_high_s16(*q9s16);
d23s16 = vadd_s16(d16s16, d18s16);
d24s16 = vsub_s16(d16s16, d18s16);
q15s32 = vmull_s16(d17s16, *d2s16);
q10s32 = vmull_s16(d17s16, *d0s16);
q13s32 = vmull_s16(d23s16, *d1s16);
q14s32 = vmull_s16(d24s16, *d1s16);
q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
d26s16 = vrshrn_n_s32(q13s32, 14);
d27s16 = vrshrn_n_s32(q14s32, 14);
d29s16 = vrshrn_n_s32(q15s32, 14);
d28s16 = vrshrn_n_s32(q10s32, 14);
q13s16 = vcombine_s16(d26s16, d27s16);
q14s16 = vcombine_s16(d28s16, d29s16);
*q8s16 = vaddq_s16(q13s16, q14s16);
*q9s16 = vsubq_s16(q13s16, q14s16);
*q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp
}
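IDCT4x4_1D above is the standard 4-point inverse DCT butterfly: two cospi_16_64 terms for the even half and a cospi_8_64/cospi_24_64 rotation for the odd half. The same arithmetic in scalar form, as a sketch with the constant values written out:

#include <stdint.h>

/* Scalar 4-point inverse DCT matching IDCT4x4_1D (sketch). */
static void idct4_scalar(const int16_t in[4], int16_t out[4]) {
  const int cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;
  const int step0 = ((in[0] + in[2]) * cospi_16_64 + (1 << 13)) >> 14;
  const int step1 = ((in[0] - in[2]) * cospi_16_64 + (1 << 13)) >> 14;
  const int step2 = (in[1] * cospi_24_64 - in[3] * cospi_8_64 + (1 << 13)) >> 14;
  const int step3 = (in[1] * cospi_8_64 + in[3] * cospi_24_64 + (1 << 13)) >> 14;
  out[0] = (int16_t)(step0 + step3); /* q8, low half  */
  out[1] = (int16_t)(step1 + step2); /* q8, high half */
  out[2] = (int16_t)(step1 - step2); /* q9, halves swapped by the final vswp */
  out[3] = (int16_t)(step0 - step3);
}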
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
int16x4_t *d5s16, int16x8_t *q3s16,
int16x8_t *q8s16, int16x8_t *q9s16) {
int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
d6s16 = vget_low_s16(*q3s16);
d16s16 = vget_low_s16(*q8s16);
d17s16 = vget_high_s16(*q8s16);
d18s16 = vget_low_s16(*q9s16);
d19s16 = vget_high_s16(*q9s16);
q10s32 = vmull_s16(*d3s16, d16s16);
q11s32 = vmull_s16(*d4s16, d16s16);
q12s32 = vmull_s16(d6s16, d17s16);
q13s32 = vmull_s16(*d5s16, d18s16);
q14s32 = vmull_s16(*d3s16, d18s16);
q15s32 = vmovl_s16(d16s16);
q15s32 = vaddw_s16(q15s32, d19s16);
q8s32 = vmull_s16(*d4s16, d19s16);
q15s32 = vsubw_s16(q15s32, d18s16);
q9s32 = vmull_s16(*d5s16, d19s16);
q10s32 = vaddq_s32(q10s32, q13s32);
q10s32 = vaddq_s32(q10s32, q8s32);
q11s32 = vsubq_s32(q11s32, q14s32);
q8s32 = vdupq_n_s32(sinpi_3_9);
q11s32 = vsubq_s32(q11s32, q9s32);
q15s32 = vmulq_s32(q15s32, q8s32);
q13s32 = vaddq_s32(q10s32, q12s32);
q10s32 = vaddq_s32(q10s32, q11s32);
q14s32 = vaddq_s32(q11s32, q12s32);
q10s32 = vsubq_s32(q10s32, q12s32);
d16s16 = vrshrn_n_s32(q13s32, 14);
d17s16 = vrshrn_n_s32(q14s32, 14);
d18s16 = vrshrn_n_s32(q15s32, 14);
d19s16 = vrshrn_n_s32(q10s32, 14);
*q8s16 = vcombine_s16(d16s16, d17s16);
*q9s16 = vcombine_s16(d18s16, d19s16);
}
void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int16x8_t a[2];
uint8x8_t s[2], d[2];
uint16x8_t sum[2];
uint8x8_t d26u8, d27u8;
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
uint32x2_t d26u32, d27u32;
int16x8_t q3s16, q8s16, q9s16;
uint16x8_t q8u16, q9u16;
assert(!((intptr_t)dest % sizeof(uint32_t)));
assert(!(stride % sizeof(uint32_t)));
d26u32 = d27u32 = vdup_n_u32(0);
a[0] = load_tran_low_to_s16q(input);
a[1] = load_tran_low_to_s16q(input + 8);
transpose_s16_4x4q(&a[0], &a[1]);
q8s16 = vld1q_s16(input);
q9s16 = vld1q_s16(input + 8);
TRANSPOSE4X4(&q8s16, &q9s16);
switch (tx_type) {
case DCT_DCT:
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
transpose_s16_4x4q(&a[0], &a[1]);
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
case 0: // idct_idct is not supported. Fall back to C
vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
return;
case 1: // iadst_idct
// generate constants
GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
case ADST_DCT:
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
transpose_s16_4x4q(&a[0], &a[1]);
iadst4(a);
break;
// first transform rows
IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
case DCT_ADST:
iadst4(a);
transpose_s16_4x4q(&a[0], &a[1]);
idct4x4_16_kernel_bd8(a);
a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
// transpose the matrix
TRANSPOSE4X4(&q8s16, &q9s16);
default:
assert(tx_type == ADST_ADST);
iadst4(a);
transpose_s16_4x4q(&a[0], &a[1]);
iadst4(a);
// then transform columns
IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
break;
case 2: // idct_iadst
// generate constants
GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
// first transform rows
IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
// transpose the matrix
TRANSPOSE4X4(&q8s16, &q9s16);
// then transform columns
IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
break;
case 3: // iadst_iadst
// generate constants
GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
// first transform rows
IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
// transpose the matrix
TRANSPOSE4X4(&q8s16, &q9s16);
// then transform columns
IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
break;
default: // not reached: all four tx_types handled above
assert(0);
break;
}
a[0] = vrshrq_n_s16(a[0], 4);
a[1] = vrshrq_n_s16(a[1], 4);
s[0] = load_u8(dest, stride);
s[1] = load_u8(dest + 2 * stride, stride);
sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]);
sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]);
d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0]));
d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1]));
store_u8(dest, stride, d[0]);
store_u8(dest + 2 * stride, stride, d[1]);
q8s16 = vrshrq_n_s16(q8s16, 4);
q9s16 = vrshrq_n_s16(q9s16, 4);
d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
dest += stride;
d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
dest += stride;
d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
dest += stride;
d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
}
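Old and new versions of the function finish identically: the 1-D results are rounded by four bits, added to the predictor, and saturated to eight bits. A scalar sketch of that final stage:

#include <stdint.h>

/* Scalar equivalent of the final add/clip stage (sketch):
 * vrshrq_n_s16(x, 4) == (x + 8) >> 4, and vqmovun saturates to [0, 255]. */
static void add_residual_4x4(const int16_t res[16], uint8_t *dest,
                             int stride) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = dest[r * stride + c] + ((res[r * 4 + c] + 8) >> 4);
      dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}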

View File

@@ -14,199 +14,527 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
static INLINE void iadst_half_butterfly_neon(int16x8_t *const x,
const int16x4_t c) {
const int16x8_t sum = vaddq_s16(x[0], x[1]);
const int16x8_t sub = vsubq_s16(x[0], x[1]);
int32x4_t t0[2], t1[2];
static int16_t cospi_2_64 = 16305;
static int16_t cospi_4_64 = 16069;
static int16_t cospi_6_64 = 15679;
static int16_t cospi_8_64 = 15137;
static int16_t cospi_10_64 = 14449;
static int16_t cospi_12_64 = 13623;
static int16_t cospi_14_64 = 12665;
static int16_t cospi_16_64 = 11585;
static int16_t cospi_18_64 = 10394;
static int16_t cospi_20_64 = 9102;
static int16_t cospi_22_64 = 7723;
static int16_t cospi_24_64 = 6270;
static int16_t cospi_26_64 = 4756;
static int16_t cospi_28_64 = 3196;
static int16_t cospi_30_64 = 1606;
t0[0] = vmull_lane_s16(vget_low_s16(sum), c, 0);
t0[1] = vmull_lane_s16(vget_high_s16(sum), c, 0);
t1[0] = vmull_lane_s16(vget_low_s16(sub), c, 0);
t1[1] = vmull_lane_s16(vget_high_s16(sub), c, 0);
x[0] = dct_const_round_shift_low_8(t0);
x[1] = dct_const_round_shift_low_8(t1);
static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
int16x8_t *q10s16, int16x8_t *q11s16,
int16x8_t *q12s16, int16x8_t *q13s16,
int16x8_t *q14s16, int16x8_t *q15s16) {
int16x4_t d0s16, d1s16, d2s16, d3s16;
int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
d0s16 = vdup_n_s16(cospi_28_64);
d1s16 = vdup_n_s16(cospi_4_64);
d2s16 = vdup_n_s16(cospi_12_64);
d3s16 = vdup_n_s16(cospi_20_64);
d16s16 = vget_low_s16(*q8s16);
d17s16 = vget_high_s16(*q8s16);
d18s16 = vget_low_s16(*q9s16);
d19s16 = vget_high_s16(*q9s16);
d20s16 = vget_low_s16(*q10s16);
d21s16 = vget_high_s16(*q10s16);
d22s16 = vget_low_s16(*q11s16);
d23s16 = vget_high_s16(*q11s16);
d24s16 = vget_low_s16(*q12s16);
d25s16 = vget_high_s16(*q12s16);
d26s16 = vget_low_s16(*q13s16);
d27s16 = vget_high_s16(*q13s16);
d28s16 = vget_low_s16(*q14s16);
d29s16 = vget_high_s16(*q14s16);
d30s16 = vget_low_s16(*q15s16);
d31s16 = vget_high_s16(*q15s16);
q2s32 = vmull_s16(d18s16, d0s16);
q3s32 = vmull_s16(d19s16, d0s16);
q5s32 = vmull_s16(d26s16, d2s16);
q6s32 = vmull_s16(d27s16, d2s16);
q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
d8s16 = vrshrn_n_s32(q2s32, 14);
d9s16 = vrshrn_n_s32(q3s32, 14);
d10s16 = vrshrn_n_s32(q5s32, 14);
d11s16 = vrshrn_n_s32(q6s32, 14);
q4s16 = vcombine_s16(d8s16, d9s16);
q5s16 = vcombine_s16(d10s16, d11s16);
q2s32 = vmull_s16(d18s16, d1s16);
q3s32 = vmull_s16(d19s16, d1s16);
q9s32 = vmull_s16(d26s16, d3s16);
q13s32 = vmull_s16(d27s16, d3s16);
q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
d14s16 = vrshrn_n_s32(q2s32, 14);
d15s16 = vrshrn_n_s32(q3s32, 14);
d12s16 = vrshrn_n_s32(q9s32, 14);
d13s16 = vrshrn_n_s32(q13s32, 14);
q6s16 = vcombine_s16(d12s16, d13s16);
q7s16 = vcombine_s16(d14s16, d15s16);
d0s16 = vdup_n_s16(cospi_16_64);
q2s32 = vmull_s16(d16s16, d0s16);
q3s32 = vmull_s16(d17s16, d0s16);
q13s32 = vmull_s16(d16s16, d0s16);
q15s32 = vmull_s16(d17s16, d0s16);
q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
d0s16 = vdup_n_s16(cospi_24_64);
d1s16 = vdup_n_s16(cospi_8_64);
d18s16 = vrshrn_n_s32(q2s32, 14);
d19s16 = vrshrn_n_s32(q3s32, 14);
d22s16 = vrshrn_n_s32(q13s32, 14);
d23s16 = vrshrn_n_s32(q15s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
*q11s16 = vcombine_s16(d22s16, d23s16);
q2s32 = vmull_s16(d20s16, d0s16);
q3s32 = vmull_s16(d21s16, d0s16);
q8s32 = vmull_s16(d20s16, d1s16);
q12s32 = vmull_s16(d21s16, d1s16);
q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
d26s16 = vrshrn_n_s32(q2s32, 14);
d27s16 = vrshrn_n_s32(q3s32, 14);
d30s16 = vrshrn_n_s32(q8s32, 14);
d31s16 = vrshrn_n_s32(q12s32, 14);
*q13s16 = vcombine_s16(d26s16, d27s16);
*q15s16 = vcombine_s16(d30s16, d31s16);
q0s16 = vaddq_s16(*q9s16, *q15s16);
q1s16 = vaddq_s16(*q11s16, *q13s16);
q2s16 = vsubq_s16(*q11s16, *q13s16);
q3s16 = vsubq_s16(*q9s16, *q15s16);
*q13s16 = vsubq_s16(q4s16, q5s16);
q4s16 = vaddq_s16(q4s16, q5s16);
*q14s16 = vsubq_s16(q7s16, q6s16);
q7s16 = vaddq_s16(q7s16, q6s16);
d26s16 = vget_low_s16(*q13s16);
d27s16 = vget_high_s16(*q13s16);
d28s16 = vget_low_s16(*q14s16);
d29s16 = vget_high_s16(*q14s16);
d16s16 = vdup_n_s16(cospi_16_64);
q9s32 = vmull_s16(d28s16, d16s16);
q10s32 = vmull_s16(d29s16, d16s16);
q11s32 = vmull_s16(d28s16, d16s16);
q12s32 = vmull_s16(d29s16, d16s16);
q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
d10s16 = vrshrn_n_s32(q9s32, 14);
d11s16 = vrshrn_n_s32(q10s32, 14);
d12s16 = vrshrn_n_s32(q11s32, 14);
d13s16 = vrshrn_n_s32(q12s32, 14);
q5s16 = vcombine_s16(d10s16, d11s16);
q6s16 = vcombine_s16(d12s16, d13s16);
*q8s16 = vaddq_s16(q0s16, q7s16);
*q9s16 = vaddq_s16(q1s16, q6s16);
*q10s16 = vaddq_s16(q2s16, q5s16);
*q11s16 = vaddq_s16(q3s16, q4s16);
*q12s16 = vsubq_s16(q3s16, q4s16);
*q13s16 = vsubq_s16(q2s16, q5s16);
*q14s16 = vsubq_s16(q1s16, q6s16);
*q15s16 = vsubq_s16(q0s16, q7s16);
}
static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,
const int16x8_t in1,
const int16x4_t c,
int32x4_t *const s0,
int32x4_t *const s1) {
s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
int16x8_t *q10s16, int16x8_t *q11s16,
int16x8_t *q12s16, int16x8_t *q13s16,
int16x8_t *q14s16, int16x8_t *q15s16) {
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
int16x8_t q2s16, q4s16, q5s16, q6s16;
int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
}
d16s16 = vget_low_s16(*q8s16);
d17s16 = vget_high_s16(*q8s16);
d18s16 = vget_low_s16(*q9s16);
d19s16 = vget_high_s16(*q9s16);
d20s16 = vget_low_s16(*q10s16);
d21s16 = vget_high_s16(*q10s16);
d22s16 = vget_low_s16(*q11s16);
d23s16 = vget_high_s16(*q11s16);
d24s16 = vget_low_s16(*q12s16);
d25s16 = vget_high_s16(*q12s16);
d26s16 = vget_low_s16(*q13s16);
d27s16 = vget_high_s16(*q13s16);
d28s16 = vget_low_s16(*q14s16);
d29s16 = vget_high_s16(*q14s16);
d30s16 = vget_low_s16(*q15s16);
d31s16 = vget_high_s16(*q15s16);
static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0,
const int16x8_t in1,
const int16x4_t c,
int32x4_t *const s0,
int32x4_t *const s1) {
s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
d14s16 = vdup_n_s16(cospi_2_64);
d15s16 = vdup_n_s16(cospi_30_64);
s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
}
q1s32 = vmull_s16(d30s16, d14s16);
q2s32 = vmull_s16(d31s16, d14s16);
q3s32 = vmull_s16(d30s16, d15s16);
q4s32 = vmull_s16(d31s16, d15s16);
static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0,
const int16x8_t in1,
const int16x4_t c,
int32x4_t *const s0,
int32x4_t *const s1) {
s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
d30s16 = vdup_n_s16(cospi_18_64);
d31s16 = vdup_n_s16(cospi_14_64);
s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
}
q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
static INLINE int16x8_t add_dct_const_round_shift_low_8(
const int32x4_t *const in0, const int32x4_t *const in1) {
int32x4_t sum[2];
q5s32 = vmull_s16(d22s16, d30s16);
q6s32 = vmull_s16(d23s16, d30s16);
q7s32 = vmull_s16(d22s16, d31s16);
q8s32 = vmull_s16(d23s16, d31s16);
sum[0] = vaddq_s32(in0[0], in1[0]);
sum[1] = vaddq_s32(in0[1], in1[1]);
return dct_const_round_shift_low_8(sum);
}
q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
static INLINE int16x8_t sub_dct_const_round_shift_low_8(
const int32x4_t *const in0, const int32x4_t *const in1) {
int32x4_t sum[2];
q11s32 = vaddq_s32(q1s32, q5s32);
q12s32 = vaddq_s32(q2s32, q6s32);
q1s32 = vsubq_s32(q1s32, q5s32);
q2s32 = vsubq_s32(q2s32, q6s32);
sum[0] = vsubq_s32(in0[0], in1[0]);
sum[1] = vsubq_s32(in0[1], in1[1]);
return dct_const_round_shift_low_8(sum);
}
d22s16 = vrshrn_n_s32(q11s32, 14);
d23s16 = vrshrn_n_s32(q12s32, 14);
*q11s16 = vcombine_s16(d22s16, d23s16);
static INLINE void iadst8(int16x8_t *const io) {
const int16x4_t c0 =
create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
const int16x4_t c1 =
create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
const int16x4_t c2 =
create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
int16x8_t x[8], t[4];
int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
q12s32 = vaddq_s32(q3s32, q7s32);
q15s32 = vaddq_s32(q4s32, q8s32);
q3s32 = vsubq_s32(q3s32, q7s32);
q4s32 = vsubq_s32(q4s32, q8s32);
x[0] = io[7];
x[1] = io[0];
x[2] = io[5];
x[3] = io[2];
x[4] = io[3];
x[5] = io[4];
x[6] = io[1];
x[7] = io[6];
d2s16 = vrshrn_n_s32(q1s32, 14);
d3s16 = vrshrn_n_s32(q2s32, 14);
d24s16 = vrshrn_n_s32(q12s32, 14);
d25s16 = vrshrn_n_s32(q15s32, 14);
d6s16 = vrshrn_n_s32(q3s32, 14);
d7s16 = vrshrn_n_s32(q4s32, 14);
*q12s16 = vcombine_s16(d24s16, d25s16);
// stage 1
iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1);
iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3);
iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5);
iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7);
d0s16 = vdup_n_s16(cospi_10_64);
d1s16 = vdup_n_s16(cospi_22_64);
q4s32 = vmull_s16(d26s16, d0s16);
q5s32 = vmull_s16(d27s16, d0s16);
q2s32 = vmull_s16(d26s16, d1s16);
q6s32 = vmull_s16(d27s16, d1s16);
x[0] = add_dct_const_round_shift_low_8(s0, s4);
x[1] = add_dct_const_round_shift_low_8(s1, s5);
x[2] = add_dct_const_round_shift_low_8(s2, s6);
x[3] = add_dct_const_round_shift_low_8(s3, s7);
x[4] = sub_dct_const_round_shift_low_8(s0, s4);
x[5] = sub_dct_const_round_shift_low_8(s1, s5);
x[6] = sub_dct_const_round_shift_low_8(s2, s6);
x[7] = sub_dct_const_round_shift_low_8(s3, s7);
d30s16 = vdup_n_s16(cospi_26_64);
d31s16 = vdup_n_s16(cospi_6_64);
// stage 2
t[0] = x[0];
t[1] = x[1];
t[2] = x[2];
t[3] = x[3];
iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5);
iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6);
q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
x[0] = vaddq_s16(t[0], t[2]);
x[1] = vaddq_s16(t[1], t[3]);
x[2] = vsubq_s16(t[0], t[2]);
x[3] = vsubq_s16(t[1], t[3]);
x[4] = add_dct_const_round_shift_low_8(s4, s6);
x[5] = add_dct_const_round_shift_low_8(s5, s7);
x[6] = sub_dct_const_round_shift_low_8(s4, s6);
x[7] = sub_dct_const_round_shift_low_8(s5, s7);
q0s32 = vmull_s16(d18s16, d30s16);
q13s32 = vmull_s16(d19s16, d30s16);
// stage 3
iadst_half_butterfly_neon(x + 2, c2);
iadst_half_butterfly_neon(x + 6, c2);
q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
io[0] = x[0];
io[1] = vnegq_s16(x[4]);
io[2] = x[6];
io[3] = vnegq_s16(x[2]);
io[4] = x[3];
io[5] = vnegq_s16(x[7]);
io[6] = x[5];
io[7] = vnegq_s16(x[1]);
q10s32 = vmull_s16(d18s16, d31s16);
q9s32 = vmull_s16(d19s16, d31s16);
q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
q14s32 = vaddq_s32(q2s32, q10s32);
q15s32 = vaddq_s32(q6s32, q9s32);
q2s32 = vsubq_s32(q2s32, q10s32);
q6s32 = vsubq_s32(q6s32, q9s32);
d28s16 = vrshrn_n_s32(q14s32, 14);
d29s16 = vrshrn_n_s32(q15s32, 14);
d4s16 = vrshrn_n_s32(q2s32, 14);
d5s16 = vrshrn_n_s32(q6s32, 14);
*q14s16 = vcombine_s16(d28s16, d29s16);
q9s32 = vaddq_s32(q4s32, q0s32);
q10s32 = vaddq_s32(q5s32, q13s32);
q4s32 = vsubq_s32(q4s32, q0s32);
q5s32 = vsubq_s32(q5s32, q13s32);
d30s16 = vdup_n_s16(cospi_8_64);
d31s16 = vdup_n_s16(cospi_24_64);
d18s16 = vrshrn_n_s32(q9s32, 14);
d19s16 = vrshrn_n_s32(q10s32, 14);
d8s16 = vrshrn_n_s32(q4s32, 14);
d9s16 = vrshrn_n_s32(q5s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
q5s32 = vmull_s16(d2s16, d30s16);
q6s32 = vmull_s16(d3s16, d30s16);
q7s32 = vmull_s16(d2s16, d31s16);
q0s32 = vmull_s16(d3s16, d31s16);
q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
q1s32 = vmull_s16(d4s16, d30s16);
q3s32 = vmull_s16(d5s16, d30s16);
q10s32 = vmull_s16(d4s16, d31s16);
q2s32 = vmull_s16(d5s16, d31s16);
q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
*q8s16 = vaddq_s16(*q11s16, *q9s16);
*q11s16 = vsubq_s16(*q11s16, *q9s16);
q4s16 = vaddq_s16(*q12s16, *q14s16);
*q12s16 = vsubq_s16(*q12s16, *q14s16);
q14s32 = vaddq_s32(q5s32, q1s32);
q15s32 = vaddq_s32(q6s32, q3s32);
q5s32 = vsubq_s32(q5s32, q1s32);
q6s32 = vsubq_s32(q6s32, q3s32);
d18s16 = vrshrn_n_s32(q14s32, 14);
d19s16 = vrshrn_n_s32(q15s32, 14);
d10s16 = vrshrn_n_s32(q5s32, 14);
d11s16 = vrshrn_n_s32(q6s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
q1s32 = vaddq_s32(q7s32, q10s32);
q3s32 = vaddq_s32(q0s32, q2s32);
q7s32 = vsubq_s32(q7s32, q10s32);
q0s32 = vsubq_s32(q0s32, q2s32);
d28s16 = vrshrn_n_s32(q1s32, 14);
d29s16 = vrshrn_n_s32(q3s32, 14);
d14s16 = vrshrn_n_s32(q7s32, 14);
d15s16 = vrshrn_n_s32(q0s32, 14);
*q14s16 = vcombine_s16(d28s16, d29s16);
d30s16 = vdup_n_s16(cospi_16_64);
d22s16 = vget_low_s16(*q11s16);
d23s16 = vget_high_s16(*q11s16);
q2s32 = vmull_s16(d22s16, d30s16);
q3s32 = vmull_s16(d23s16, d30s16);
q13s32 = vmull_s16(d22s16, d30s16);
q1s32 = vmull_s16(d23s16, d30s16);
d24s16 = vget_low_s16(*q12s16);
d25s16 = vget_high_s16(*q12s16);
q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
d4s16 = vrshrn_n_s32(q2s32, 14);
d5s16 = vrshrn_n_s32(q3s32, 14);
d24s16 = vrshrn_n_s32(q13s32, 14);
d25s16 = vrshrn_n_s32(q1s32, 14);
q2s16 = vcombine_s16(d4s16, d5s16);
*q12s16 = vcombine_s16(d24s16, d25s16);
q13s32 = vmull_s16(d10s16, d30s16);
q1s32 = vmull_s16(d11s16, d30s16);
q11s32 = vmull_s16(d10s16, d30s16);
q0s32 = vmull_s16(d11s16, d30s16);
q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
d20s16 = vrshrn_n_s32(q13s32, 14);
d21s16 = vrshrn_n_s32(q1s32, 14);
d12s16 = vrshrn_n_s32(q11s32, 14);
d13s16 = vrshrn_n_s32(q0s32, 14);
*q10s16 = vcombine_s16(d20s16, d21s16);
q6s16 = vcombine_s16(d12s16, d13s16);
q5s16 = vdupq_n_s16(0);
*q9s16 = vsubq_s16(q5s16, *q9s16);
*q11s16 = vsubq_s16(q5s16, q2s16);
*q13s16 = vsubq_s16(q5s16, q6s16);
*q15s16 = vsubq_s16(q5s16, q4s16);
}
void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
const int16x8_t cospis = vld1q_s16(kCospi);
const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28
int16x8_t a[8];
int i;
uint8_t *d1, *d2;
uint8x8_t d0u8, d1u8, d2u8, d3u8;
uint64x1_t d0u64, d1u64, d2u64, d3u64;
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
uint16x8_t q8u16, q9u16, q10u16, q11u16;
a[0] = load_tran_low_to_s16q(input + 0 * 8);
a[1] = load_tran_low_to_s16q(input + 1 * 8);
a[2] = load_tran_low_to_s16q(input + 2 * 8);
a[3] = load_tran_low_to_s16q(input + 3 * 8);
a[4] = load_tran_low_to_s16q(input + 4 * 8);
a[5] = load_tran_low_to_s16q(input + 5 * 8);
a[6] = load_tran_low_to_s16q(input + 6 * 8);
a[7] = load_tran_low_to_s16q(input + 7 * 8);
q8s16 = vld1q_s16(input);
q9s16 = vld1q_s16(input + 8);
q10s16 = vld1q_s16(input + 8 * 2);
q11s16 = vld1q_s16(input + 8 * 3);
q12s16 = vld1q_s16(input + 8 * 4);
q13s16 = vld1q_s16(input + 8 * 5);
q14s16 = vld1q_s16(input + 8 * 6);
q15s16 = vld1q_s16(input + 8 * 7);
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
switch (tx_type) {
case DCT_DCT:
idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
break;
case 0: // idct_idct is not supported. Fall back to C
vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
return;
case 1: // iadst_idct
// generate IDCT constants
// GENERATE_IDCT_CONSTANTS
case ADST_DCT:
idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
iadst8(a);
break;
// first transform rows
IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
case DCT_ADST:
iadst8(a);
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
break;
// transpose the matrix
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
&q14s16, &q15s16);
default:
assert(tx_type == ADST_ADST);
iadst8(a);
transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
iadst8(a);
// generate IADST constants
// GENERATE_IADST_CONSTANTS
// then transform columns
IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
break;
case 2: // idct_iadst
// generate IADST constants
// GENERATE_IADST_CONSTANTS
// first transform rows
IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
// transpose the matrix
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
&q14s16, &q15s16);
// generate IDCT constants
// GENERATE_IDCT_CONSTANTS
// then transform columns
IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
break;
case 3: // iadst_iadst
// generate IADST constants
// GENERATE_IADST_CONSTANTS
// first transform rows
IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
// transpose the matrix
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
&q14s16, &q15s16);
// then transform columns
IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
break;
default: // iadst_idct
assert(0);
break;
}
idct8x8_add8x8_neon(a, dest, stride);
q8s16 = vrshrq_n_s16(q8s16, 5);
q9s16 = vrshrq_n_s16(q9s16, 5);
q10s16 = vrshrq_n_s16(q10s16, 5);
q11s16 = vrshrq_n_s16(q11s16, 5);
q12s16 = vrshrq_n_s16(q12s16, 5);
q13s16 = vrshrq_n_s16(q13s16, 5);
q14s16 = vrshrq_n_s16(q14s16, 5);
q15s16 = vrshrq_n_s16(q15s16, 5);
for (d1 = d2 = dest, i = 0; i < 2; i++) {
if (i != 0) {
q8s16 = q12s16;
q9s16 = q13s16;
q10s16 = q14s16;
q11s16 = q15s16;
}
d0u64 = vld1_u64((uint64_t *)d1);
d1 += stride;
d1u64 = vld1_u64((uint64_t *)d1);
d1 += stride;
d2u64 = vld1_u64((uint64_t *)d1);
d1 += stride;
d3u64 = vld1_u64((uint64_t *)d1);
d1 += stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
q10u16 =
vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
q11u16 =
vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
d2 += stride;
}
}
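/* The vrshrq_n_s16(x, 5) calls above are the final per-size output shift
 * (4 for 4x4, 5 for 8x8, 6 for 16x16 in this codebase); the widened add and
 * vqmovun_s16 then clip the sum to a pixel. A scalar sketch of the same
 * reconstruction step for one 8x8 value (illustrative helper name): */
static INLINE uint8_t add_residual_8x8(uint8_t pred, int16_t residual) {
  const int v = pred + ((residual + 16) >> 5);       /* rounding shift by 5 */
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* clip_pixel()        */
}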

@@ -1,60 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
#define VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
static INLINE void iadst4(int16x8_t *const io) {
const int32x4_t c3 = vdupq_n_s32(sinpi_3_9);
int16x4_t x[4];
int32x4_t s[8], output[4];
const int16x4_t c =
create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9);
x[0] = vget_low_s16(io[0]);
x[1] = vget_low_s16(io[1]);
x[2] = vget_high_s16(io[0]);
x[3] = vget_high_s16(io[1]);
s[0] = vmull_lane_s16(x[0], c, 0);
s[1] = vmull_lane_s16(x[0], c, 1);
s[2] = vmull_lane_s16(x[1], c, 2);
s[3] = vmull_lane_s16(x[2], c, 3);
s[4] = vmull_lane_s16(x[2], c, 0);
s[5] = vmull_lane_s16(x[3], c, 1);
s[6] = vmull_lane_s16(x[3], c, 3);
s[7] = vaddl_s16(x[0], x[3]);
s[7] = vsubw_s16(s[7], x[2]);
s[0] = vaddq_s32(s[0], s[3]);
s[0] = vaddq_s32(s[0], s[5]);
s[1] = vsubq_s32(s[1], s[4]);
s[1] = vsubq_s32(s[1], s[6]);
s[3] = s[2];
s[2] = vmulq_s32(c3, s[7]);
output[0] = vaddq_s32(s[0], s[3]);
output[1] = vaddq_s32(s[1], s[3]);
output[2] = s[2];
output[3] = vaddq_s32(s[0], s[1]);
output[3] = vsubq_s32(output[3], s[3]);
dct_const_round_shift_low_8_dual(output, &io[0], &io[1]);
}
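/* For reference, the same 4-point inverse ADST in scalar form, a sketch
 * following iadst4_c in vp9/common/vp9_idct.c (WRAPLOW omitted for brevity;
 * dct_const_round_shift() is the usual (x + (1 << 13)) >> 14): */
static void iadst4_scalar(const int16_t *in, int16_t *out) {
  const int64_t s0 = (int64_t)sinpi_1_9 * in[0] + (int64_t)sinpi_4_9 * in[2] +
                     (int64_t)sinpi_2_9 * in[3];
  const int64_t s1 = (int64_t)sinpi_2_9 * in[0] - (int64_t)sinpi_1_9 * in[2] -
                     (int64_t)sinpi_4_9 * in[3];
  const int64_t s2 = (int64_t)sinpi_3_9 * (in[0] - in[2] + in[3]);
  const int64_t s3 = (int64_t)sinpi_3_9 * in[1];
  out[0] = (int16_t)dct_const_round_shift(s0 + s3);
  out[1] = (int16_t)dct_const_round_shift(s1 + s3);
  out[2] = (int16_t)dct_const_round_shift(s2);
  out[3] = (int16_t)dct_const_round_shift(s0 + s1 - s3);
}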
#endif // VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_

@@ -42,7 +42,6 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254,
177, 153, 140, 133, 130, 129 };
#endif
/* clang-format off */
const uint8_t vp9_coefband_trans_8x8plus[1024] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
// beyond MAXBAND_INDEX+1 all values are filled as 5
@@ -86,7 +85,6 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = {
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
};
/* clang-format on */
const uint8_t vp9_coefband_trans_4x4[16] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,

@@ -137,6 +137,7 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
// 128 lists of probabilities are stored for the following ONE node probs:
// 1, 3, 5, 7, ..., 253, 255
// In between probabilities are interpolated linearly
#define COEFF_PROB_MODELS 255
#define UNCONSTRAINED_NODES 3
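/* A hypothetical sketch of the interpolation described above: lists are
 * stored for every odd node probability, and an even probability takes the
 * midpoint of its two stored neighbours. The helper name and signature are
 * illustrative, not the library's: */
static uint8_t interp_node_prob(const uint8_t model[128], int p /* 1..255 */) {
  const int i = (p - 1) >> 1;  /* model[i] holds the list for prob 2*i + 1 */
  if (p & 1) return model[i];  /* odd probabilities are stored exactly     */
  return (uint8_t)((model[i] + model[i + 1] + 1) >> 1); /* linear midpoint */
}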

@@ -186,19 +186,16 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] =
{ 93, 24, 99 }, // a split, l not split
{ 85, 119, 44 }, // l split, a not split
{ 62, 59, 67 }, // a/l both split
// 16x16 -> 8x8
{ 149, 53, 53 }, // a/l both not split
{ 94, 20, 48 }, // a split, l not split
{ 83, 53, 24 }, // l split, a not split
{ 52, 18, 18 }, // a/l both split
// 32x32 -> 16x16
{ 150, 40, 39 }, // a/l both not split
{ 78, 12, 26 }, // a split, l not split
{ 67, 33, 11 }, // l split, a not split
{ 24, 7, 5 }, // a/l both split
// 64x64 -> 32x32
{ 174, 35, 49 }, // a/l both not split
{ 68, 11, 27 }, // a split, l not split

@@ -22,7 +22,9 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10,
};
const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 };
const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-0, -1,
};
const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1,
4, -2, -3 };
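/* The tree arrays above pack two entries per node: a non-negative value
 * indexes the next node pair, a negative value is a leaf storing -token.
 * Decoding walks the array with one binary read per node, essentially
 * vpx_read_tree() in vpx_dsp/bitreader.h; this is a sketch, not a verbatim
 * copy: */
static INLINE int read_tree(vpx_reader *r, const vpx_tree_index *tree,
                            const vpx_prob *probs) {
  vpx_tree_index i = 0;
  while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) continue;
  return -i; /* leaf values are negated tokens */
}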

@@ -1174,7 +1174,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
}
// Disable filtering on the leftmost column
border_mask = ~(mi_col == 0 ? 1 : 0);
border_mask = ~(mi_col == 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
highbd_filter_selectively_vert(
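/* Worked example: the two forms of the assignment compute the same mask.
 * When mi_col == 0 the operand is 1, so border_mask == ~1 (all bits set
 * except bit 0) and the leftmost 8-pixel column is excluded from vertical
 * filtering; for any other mi_col, border_mask == ~0 and nothing is
 * masked. */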

@@ -229,8 +229,9 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
else
pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
} else {
pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
edge_mi->ref_frame[1] == GOLDEN_FRAME);
pred_context = 1 +
2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
edge_mi->ref_frame[1] == GOLDEN_FRAME);
}
} else { // inter/inter
const int above_has_second = has_second_ref(above_mi);

@@ -1,13 +1,3 @@
##
## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
sub vp9_common_forward_decls() {
print <<EOF
/*
@@ -67,13 +57,13 @@ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *outp
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
# Note that there are more specializations appended when
# CONFIG_VP9_HIGHBITDEPTH is off.
specialize qw/vp9_iht4x4_16_add neon sse2/;
specialize qw/vp9_iht4x4_16_add sse2/;
specialize qw/vp9_iht8x8_64_add sse2/;
specialize qw/vp9_iht16x16_256_add sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
# Note that these specializations are appended to the above ones.
specialize qw/vp9_iht4x4_16_add dspr2 msa/;
specialize qw/vp9_iht8x8_64_add dspr2 msa/;
specialize qw/vp9_iht4x4_16_add neon dspr2 msa/;
specialize qw/vp9_iht8x8_64_add neon dspr2 msa/;
specialize qw/vp9_iht16x16_256_add dspr2 msa/;
}
}
@@ -101,12 +91,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
}
}
#
@@ -129,7 +113,7 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp neon sse2 avx2/, "$ssse3_x86_64";
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64";

@@ -1,419 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
const int c,
__m128i *const s) {
const __m128i pair_c = pair_set_epi32(4 * c, 0);
__m128i x[2];
extend_64bit(in, x);
s[0] = _mm_mul_epi32(pair_c, x[0]);
s[1] = _mm_mul_epi32(pair_c, x[1]);
}
static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
const __m128i in1,
const int c0, const int c1,
__m128i *const s0,
__m128i *const s1) {
const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
__m128i t00[2], t01[2], t10[2], t11[2];
__m128i x0[2], x1[2];
extend_64bit(in0, x0);
extend_64bit(in1, x1);
t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
s0[0] = _mm_add_epi64(t00[0], t11[0]);
s0[1] = _mm_add_epi64(t00[1], t11[1]);
s1[0] = _mm_sub_epi64(t10[0], t01[0]);
s1[1] = _mm_sub_epi64(t10[1], t01[1]);
}
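/* Scalar sketch of the butterfly above, computed in 64-bit lanes:
 *   s0 = (int64_t)c0 * in0 + (int64_t)c1 * in1
 *   s1 = (int64_t)c1 * in0 - (int64_t)c0 * in1
 * The 4 * c pre-scale in pair_set_epi32() appears to let the later
 * dct_const_round_shift_64bit() round and shift by 16 bits (DCT_CONST_BITS
 * plus 2) with a cheap byte shift; the results are then narrowed back to
 * 32-bit lanes by pack_4(). */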
static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
__m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
__m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
// stage 1
highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
s11);
highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
s13);
highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
s15);
x0[0] = _mm_add_epi64(s0[0], s8[0]);
x0[1] = _mm_add_epi64(s0[1], s8[1]);
x1[0] = _mm_add_epi64(s1[0], s9[0]);
x1[1] = _mm_add_epi64(s1[1], s9[1]);
x2[0] = _mm_add_epi64(s2[0], s10[0]);
x2[1] = _mm_add_epi64(s2[1], s10[1]);
x3[0] = _mm_add_epi64(s3[0], s11[0]);
x3[1] = _mm_add_epi64(s3[1], s11[1]);
x4[0] = _mm_add_epi64(s4[0], s12[0]);
x4[1] = _mm_add_epi64(s4[1], s12[1]);
x5[0] = _mm_add_epi64(s5[0], s13[0]);
x5[1] = _mm_add_epi64(s5[1], s13[1]);
x6[0] = _mm_add_epi64(s6[0], s14[0]);
x6[1] = _mm_add_epi64(s6[1], s14[1]);
x7[0] = _mm_add_epi64(s7[0], s15[0]);
x7[1] = _mm_add_epi64(s7[1], s15[1]);
x8[0] = _mm_sub_epi64(s0[0], s8[0]);
x8[1] = _mm_sub_epi64(s0[1], s8[1]);
x9[0] = _mm_sub_epi64(s1[0], s9[0]);
x9[1] = _mm_sub_epi64(s1[1], s9[1]);
x10[0] = _mm_sub_epi64(s2[0], s10[0]);
x10[1] = _mm_sub_epi64(s2[1], s10[1]);
x11[0] = _mm_sub_epi64(s3[0], s11[0]);
x11[1] = _mm_sub_epi64(s3[1], s11[1]);
x12[0] = _mm_sub_epi64(s4[0], s12[0]);
x12[1] = _mm_sub_epi64(s4[1], s12[1]);
x13[0] = _mm_sub_epi64(s5[0], s13[0]);
x13[1] = _mm_sub_epi64(s5[1], s13[1]);
x14[0] = _mm_sub_epi64(s6[0], s14[0]);
x14[1] = _mm_sub_epi64(s6[1], s14[1]);
x15[0] = _mm_sub_epi64(s7[0], s15[0]);
x15[1] = _mm_sub_epi64(s7[1], s15[1]);
x0[0] = dct_const_round_shift_64bit(x0[0]);
x0[1] = dct_const_round_shift_64bit(x0[1]);
x1[0] = dct_const_round_shift_64bit(x1[0]);
x1[1] = dct_const_round_shift_64bit(x1[1]);
x2[0] = dct_const_round_shift_64bit(x2[0]);
x2[1] = dct_const_round_shift_64bit(x2[1]);
x3[0] = dct_const_round_shift_64bit(x3[0]);
x3[1] = dct_const_round_shift_64bit(x3[1]);
x4[0] = dct_const_round_shift_64bit(x4[0]);
x4[1] = dct_const_round_shift_64bit(x4[1]);
x5[0] = dct_const_round_shift_64bit(x5[0]);
x5[1] = dct_const_round_shift_64bit(x5[1]);
x6[0] = dct_const_round_shift_64bit(x6[0]);
x6[1] = dct_const_round_shift_64bit(x6[1]);
x7[0] = dct_const_round_shift_64bit(x7[0]);
x7[1] = dct_const_round_shift_64bit(x7[1]);
x8[0] = dct_const_round_shift_64bit(x8[0]);
x8[1] = dct_const_round_shift_64bit(x8[1]);
x9[0] = dct_const_round_shift_64bit(x9[0]);
x9[1] = dct_const_round_shift_64bit(x9[1]);
x10[0] = dct_const_round_shift_64bit(x10[0]);
x10[1] = dct_const_round_shift_64bit(x10[1]);
x11[0] = dct_const_round_shift_64bit(x11[0]);
x11[1] = dct_const_round_shift_64bit(x11[1]);
x12[0] = dct_const_round_shift_64bit(x12[0]);
x12[1] = dct_const_round_shift_64bit(x12[1]);
x13[0] = dct_const_round_shift_64bit(x13[0]);
x13[1] = dct_const_round_shift_64bit(x13[1]);
x14[0] = dct_const_round_shift_64bit(x14[0]);
x14[1] = dct_const_round_shift_64bit(x14[1]);
x15[0] = dct_const_round_shift_64bit(x15[0]);
x15[1] = dct_const_round_shift_64bit(x15[1]);
x0[0] = pack_4(x0[0], x0[1]);
x1[0] = pack_4(x1[0], x1[1]);
x2[0] = pack_4(x2[0], x2[1]);
x3[0] = pack_4(x3[0], x3[1]);
x4[0] = pack_4(x4[0], x4[1]);
x5[0] = pack_4(x5[0], x5[1]);
x6[0] = pack_4(x6[0], x6[1]);
x7[0] = pack_4(x7[0], x7[1]);
x8[0] = pack_4(x8[0], x8[1]);
x9[0] = pack_4(x9[0], x9[1]);
x10[0] = pack_4(x10[0], x10[1]);
x11[0] = pack_4(x11[0], x11[1]);
x12[0] = pack_4(x12[0], x12[1]);
x13[0] = pack_4(x13[0], x13[1]);
x14[0] = pack_4(x14[0], x14[1]);
x15[0] = pack_4(x15[0], x15[1]);
// stage 2
s0[0] = x0[0];
s1[0] = x1[0];
s2[0] = x2[0];
s3[0] = x3[0];
s4[0] = x4[0];
s5[0] = x5[0];
s6[0] = x6[0];
s7[0] = x7[0];
x0[0] = _mm_add_epi32(s0[0], s4[0]);
x1[0] = _mm_add_epi32(s1[0], s5[0]);
x2[0] = _mm_add_epi32(s2[0], s6[0]);
x3[0] = _mm_add_epi32(s3[0], s7[0]);
x4[0] = _mm_sub_epi32(s0[0], s4[0]);
x5[0] = _mm_sub_epi32(s1[0], s5[0]);
x6[0] = _mm_sub_epi32(s2[0], s6[0]);
x7[0] = _mm_sub_epi32(s3[0], s7[0]);
highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
s11);
highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
s12);
highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
s14);
x8[0] = _mm_add_epi64(s8[0], s12[0]);
x8[1] = _mm_add_epi64(s8[1], s12[1]);
x9[0] = _mm_add_epi64(s9[0], s13[0]);
x9[1] = _mm_add_epi64(s9[1], s13[1]);
x10[0] = _mm_add_epi64(s10[0], s14[0]);
x10[1] = _mm_add_epi64(s10[1], s14[1]);
x11[0] = _mm_add_epi64(s11[0], s15[0]);
x11[1] = _mm_add_epi64(s11[1], s15[1]);
x12[0] = _mm_sub_epi64(s8[0], s12[0]);
x12[1] = _mm_sub_epi64(s8[1], s12[1]);
x13[0] = _mm_sub_epi64(s9[0], s13[0]);
x13[1] = _mm_sub_epi64(s9[1], s13[1]);
x14[0] = _mm_sub_epi64(s10[0], s14[0]);
x14[1] = _mm_sub_epi64(s10[1], s14[1]);
x15[0] = _mm_sub_epi64(s11[0], s15[0]);
x15[1] = _mm_sub_epi64(s11[1], s15[1]);
x8[0] = dct_const_round_shift_64bit(x8[0]);
x8[1] = dct_const_round_shift_64bit(x8[1]);
x9[0] = dct_const_round_shift_64bit(x9[0]);
x9[1] = dct_const_round_shift_64bit(x9[1]);
x10[0] = dct_const_round_shift_64bit(x10[0]);
x10[1] = dct_const_round_shift_64bit(x10[1]);
x11[0] = dct_const_round_shift_64bit(x11[0]);
x11[1] = dct_const_round_shift_64bit(x11[1]);
x12[0] = dct_const_round_shift_64bit(x12[0]);
x12[1] = dct_const_round_shift_64bit(x12[1]);
x13[0] = dct_const_round_shift_64bit(x13[0]);
x13[1] = dct_const_round_shift_64bit(x13[1]);
x14[0] = dct_const_round_shift_64bit(x14[0]);
x14[1] = dct_const_round_shift_64bit(x14[1]);
x15[0] = dct_const_round_shift_64bit(x15[0]);
x15[1] = dct_const_round_shift_64bit(x15[1]);
x8[0] = pack_4(x8[0], x8[1]);
x9[0] = pack_4(x9[0], x9[1]);
x10[0] = pack_4(x10[0], x10[1]);
x11[0] = pack_4(x11[0], x11[1]);
x12[0] = pack_4(x12[0], x12[1]);
x13[0] = pack_4(x13[0], x13[1]);
x14[0] = pack_4(x14[0], x14[1]);
x15[0] = pack_4(x15[0], x15[1]);
// stage 3
s0[0] = x0[0];
s1[0] = x1[0];
s2[0] = x2[0];
s3[0] = x3[0];
highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
s8[0] = x8[0];
s9[0] = x9[0];
s10[0] = x10[0];
s11[0] = x11[0];
highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
s13);
highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
s14);
x0[0] = _mm_add_epi32(s0[0], s2[0]);
x1[0] = _mm_add_epi32(s1[0], s3[0]);
x2[0] = _mm_sub_epi32(s0[0], s2[0]);
x3[0] = _mm_sub_epi32(s1[0], s3[0]);
x4[0] = _mm_add_epi64(s4[0], s6[0]);
x4[1] = _mm_add_epi64(s4[1], s6[1]);
x5[0] = _mm_add_epi64(s5[0], s7[0]);
x5[1] = _mm_add_epi64(s5[1], s7[1]);
x6[0] = _mm_sub_epi64(s4[0], s6[0]);
x6[1] = _mm_sub_epi64(s4[1], s6[1]);
x7[0] = _mm_sub_epi64(s5[0], s7[0]);
x7[1] = _mm_sub_epi64(s5[1], s7[1]);
x4[0] = dct_const_round_shift_64bit(x4[0]);
x4[1] = dct_const_round_shift_64bit(x4[1]);
x5[0] = dct_const_round_shift_64bit(x5[0]);
x5[1] = dct_const_round_shift_64bit(x5[1]);
x6[0] = dct_const_round_shift_64bit(x6[0]);
x6[1] = dct_const_round_shift_64bit(x6[1]);
x7[0] = dct_const_round_shift_64bit(x7[0]);
x7[1] = dct_const_round_shift_64bit(x7[1]);
x4[0] = pack_4(x4[0], x4[1]);
x5[0] = pack_4(x5[0], x5[1]);
x6[0] = pack_4(x6[0], x6[1]);
x7[0] = pack_4(x7[0], x7[1]);
x8[0] = _mm_add_epi32(s8[0], s10[0]);
x9[0] = _mm_add_epi32(s9[0], s11[0]);
x10[0] = _mm_sub_epi32(s8[0], s10[0]);
x11[0] = _mm_sub_epi32(s9[0], s11[0]);
x12[0] = _mm_add_epi64(s12[0], s14[0]);
x12[1] = _mm_add_epi64(s12[1], s14[1]);
x13[0] = _mm_add_epi64(s13[0], s15[0]);
x13[1] = _mm_add_epi64(s13[1], s15[1]);
x14[0] = _mm_sub_epi64(s12[0], s14[0]);
x14[1] = _mm_sub_epi64(s12[1], s14[1]);
x15[0] = _mm_sub_epi64(s13[0], s15[0]);
x15[1] = _mm_sub_epi64(s13[1], s15[1]);
x12[0] = dct_const_round_shift_64bit(x12[0]);
x12[1] = dct_const_round_shift_64bit(x12[1]);
x13[0] = dct_const_round_shift_64bit(x13[0]);
x13[1] = dct_const_round_shift_64bit(x13[1]);
x14[0] = dct_const_round_shift_64bit(x14[0]);
x14[1] = dct_const_round_shift_64bit(x14[1]);
x15[0] = dct_const_round_shift_64bit(x15[0]);
x15[1] = dct_const_round_shift_64bit(x15[1]);
x12[0] = pack_4(x12[0], x12[1]);
x13[0] = pack_4(x13[0], x13[1]);
x14[0] = pack_4(x14[0], x14[1]);
x15[0] = pack_4(x15[0], x15[1]);
// stage 4
s2[0] = _mm_add_epi32(x2[0], x3[0]);
s3[0] = _mm_sub_epi32(x2[0], x3[0]);
s6[0] = _mm_add_epi32(x7[0], x6[0]);
s7[0] = _mm_sub_epi32(x7[0], x6[0]);
s10[0] = _mm_add_epi32(x11[0], x10[0]);
s11[0] = _mm_sub_epi32(x11[0], x10[0]);
s14[0] = _mm_add_epi32(x14[0], x15[0]);
s15[0] = _mm_sub_epi32(x14[0], x15[0]);
highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
x2[0] = dct_const_round_shift_64bit(s2[0]);
x2[1] = dct_const_round_shift_64bit(s2[1]);
x3[0] = dct_const_round_shift_64bit(s3[0]);
x3[1] = dct_const_round_shift_64bit(s3[1]);
x6[0] = dct_const_round_shift_64bit(s6[0]);
x6[1] = dct_const_round_shift_64bit(s6[1]);
x7[0] = dct_const_round_shift_64bit(s7[0]);
x7[1] = dct_const_round_shift_64bit(s7[1]);
x10[0] = dct_const_round_shift_64bit(s10[0]);
x10[1] = dct_const_round_shift_64bit(s10[1]);
x11[0] = dct_const_round_shift_64bit(s11[0]);
x11[1] = dct_const_round_shift_64bit(s11[1]);
x14[0] = dct_const_round_shift_64bit(s14[0]);
x14[1] = dct_const_round_shift_64bit(s14[1]);
x15[0] = dct_const_round_shift_64bit(s15[0]);
x15[1] = dct_const_round_shift_64bit(s15[1]);
x2[0] = pack_4(x2[0], x2[1]);
x3[0] = pack_4(x3[0], x3[1]);
x6[0] = pack_4(x6[0], x6[1]);
x7[0] = pack_4(x7[0], x7[1]);
x10[0] = pack_4(x10[0], x10[1]);
x11[0] = pack_4(x11[0], x11[1]);
x14[0] = pack_4(x14[0], x14[1]);
x15[0] = pack_4(x15[0], x15[1]);
io[0] = x0[0];
io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
io[2] = x12[0];
io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
io[4] = x6[0];
io[5] = x14[0];
io[6] = x10[0];
io[7] = x2[0];
io[8] = x3[0];
io[9] = x11[0];
io[10] = x15[0];
io[11] = x7[0];
io[12] = x5[0];
io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
io[14] = x9[0];
io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
}
void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
int i;
__m128i out[16], *in;
if (bd == 8) {
__m128i l[16], r[16];
in = l;
for (i = 0; i < 2; i++) {
highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
idct16_8col(in, in);
} else {
vpx_iadst16_8col_sse2(in);
}
in = r;
input += 128;
}
for (i = 0; i < 16; i += 8) {
int j;
transpose_16bit_8x8(l + i, out);
transpose_16bit_8x8(r + i, out + 8);
if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
idct16_8col(out, out);
} else {
vpx_iadst16_8col_sse2(out);
}
for (j = 0; j < 16; ++j) {
highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
dest += 8;
}
} else {
__m128i all[4][16];
for (i = 0; i < 4; i++) {
in = all[i];
highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
vpx_highbd_idct16_4col_sse4_1(in);
} else {
highbd_iadst16_4col_sse4_1(in);
}
input += 4 * 16;
}
for (i = 0; i < 16; i += 4) {
int j;
transpose_32bit_4x4(all[0] + i, out + 0);
transpose_32bit_4x4(all[1] + i, out + 4);
transpose_32bit_4x4(all[2] + i, out + 8);
transpose_32bit_4x4(all[3] + i, out + 12);
if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
vpx_highbd_idct16_4col_sse4_1(out);
} else {
highbd_iadst16_4col_sse4_1(out);
}
for (j = 0; j < 16; ++j) {
highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
}
}

@@ -1,131 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
static INLINE void highbd_iadst4_sse4_1(__m128i *const io) {
const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0);
const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0);
const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0);
const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0);
__m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2];
__m128i temp[2];
transpose_32bit_4x4(io, io);
extend_64bit(io[0], temp);
s0[0] = _mm_mul_epi32(pair_c1, temp[0]);
s0[1] = _mm_mul_epi32(pair_c1, temp[1]);
s1[0] = _mm_mul_epi32(pair_c2, temp[0]);
s1[1] = _mm_mul_epi32(pair_c2, temp[1]);
extend_64bit(io[1], temp);
s2[0] = _mm_mul_epi32(pair_c3, temp[0]);
s2[1] = _mm_mul_epi32(pair_c3, temp[1]);
extend_64bit(io[2], temp);
s3[0] = _mm_mul_epi32(pair_c4, temp[0]);
s3[1] = _mm_mul_epi32(pair_c4, temp[1]);
s4[0] = _mm_mul_epi32(pair_c1, temp[0]);
s4[1] = _mm_mul_epi32(pair_c1, temp[1]);
extend_64bit(io[3], temp);
s5[0] = _mm_mul_epi32(pair_c2, temp[0]);
s5[1] = _mm_mul_epi32(pair_c2, temp[1]);
s6[0] = _mm_mul_epi32(pair_c4, temp[0]);
s6[1] = _mm_mul_epi32(pair_c4, temp[1]);
t0[0] = _mm_add_epi64(s0[0], s3[0]);
t0[1] = _mm_add_epi64(s0[1], s3[1]);
t0[0] = _mm_add_epi64(t0[0], s5[0]);
t0[1] = _mm_add_epi64(t0[1], s5[1]);
t1[0] = _mm_sub_epi64(s1[0], s4[0]);
t1[1] = _mm_sub_epi64(s1[1], s4[1]);
t1[0] = _mm_sub_epi64(t1[0], s6[0]);
t1[1] = _mm_sub_epi64(t1[1], s6[1]);
temp[0] = _mm_sub_epi32(io[0], io[2]);
temp[0] = _mm_add_epi32(temp[0], io[3]);
extend_64bit(temp[0], temp);
t2[0] = _mm_mul_epi32(pair_c3, temp[0]);
t2[1] = _mm_mul_epi32(pair_c3, temp[1]);
s0[0] = _mm_add_epi64(t0[0], s2[0]);
s0[1] = _mm_add_epi64(t0[1], s2[1]);
s1[0] = _mm_add_epi64(t1[0], s2[0]);
s1[1] = _mm_add_epi64(t1[1], s2[1]);
s3[0] = _mm_add_epi64(t0[0], t1[0]);
s3[1] = _mm_add_epi64(t0[1], t1[1]);
s3[0] = _mm_sub_epi64(s3[0], s2[0]);
s3[1] = _mm_sub_epi64(s3[1], s2[1]);
s0[0] = dct_const_round_shift_64bit(s0[0]);
s0[1] = dct_const_round_shift_64bit(s0[1]);
s1[0] = dct_const_round_shift_64bit(s1[0]);
s1[1] = dct_const_round_shift_64bit(s1[1]);
s2[0] = dct_const_round_shift_64bit(t2[0]);
s2[1] = dct_const_round_shift_64bit(t2[1]);
s3[0] = dct_const_round_shift_64bit(s3[0]);
s3[1] = dct_const_round_shift_64bit(s3[1]);
io[0] = pack_4(s0[0], s0[1]);
io[1] = pack_4(s1[0], s1[1]);
io[2] = pack_4(s2[0], s2[1]);
io[3] = pack_4(s3[0], s3[1]);
}
void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
__m128i io[4];
io[0] = _mm_load_si128((const __m128i *)(input + 0));
io[1] = _mm_load_si128((const __m128i *)(input + 4));
io[2] = _mm_load_si128((const __m128i *)(input + 8));
io[3] = _mm_load_si128((const __m128i *)(input + 12));
if (bd == 8) {
__m128i io_short[2];
io_short[0] = _mm_packs_epi32(io[0], io[1]);
io_short[1] = _mm_packs_epi32(io[2], io[3]);
if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
idct4_sse2(io_short);
} else {
iadst4_sse2(io_short);
}
if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
idct4_sse2(io_short);
} else {
iadst4_sse2(io_short);
}
io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
io[0] = _mm_srai_epi16(io_short[0], 4);
io[1] = _mm_srai_epi16(io_short[1], 4);
} else {
if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
highbd_idct4_sse4_1(io);
} else {
highbd_iadst4_sse4_1(io);
}
if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
highbd_idct4_sse4_1(io);
} else {
highbd_iadst4_sse4_1(io);
}
io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
}
recon_and_store_4x4(io, dest, stride, bd);
}

@@ -1,255 +0,0 @@
/*
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
const int c,
__m128i *const s) {
const __m128i pair_c = pair_set_epi32(4 * c, 0);
__m128i x[2];
extend_64bit(in, x);
s[0] = _mm_mul_epi32(pair_c, x[0]);
s[1] = _mm_mul_epi32(pair_c, x[1]);
}
static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
const __m128i in1,
const int c0, const int c1,
__m128i *const s0,
__m128i *const s1) {
const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
__m128i t00[2], t01[2], t10[2], t11[2];
__m128i x0[2], x1[2];
extend_64bit(in0, x0);
extend_64bit(in1, x1);
t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
s0[0] = _mm_add_epi64(t00[0], t11[0]);
s0[1] = _mm_add_epi64(t00[1], t11[1]);
s1[0] = _mm_sub_epi64(t10[0], t01[0]);
s1[1] = _mm_sub_epi64(t10[1], t01[1]);
}
static void highbd_iadst8_sse4_1(__m128i *const io) {
__m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
__m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
transpose_32bit_4x4x2(io, io);
// stage 1
highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
x0[0] = _mm_add_epi64(s0[0], s4[0]);
x0[1] = _mm_add_epi64(s0[1], s4[1]);
x1[0] = _mm_add_epi64(s1[0], s5[0]);
x1[1] = _mm_add_epi64(s1[1], s5[1]);
x4[0] = _mm_sub_epi64(s0[0], s4[0]);
x4[1] = _mm_sub_epi64(s0[1], s4[1]);
x5[0] = _mm_sub_epi64(s1[0], s5[0]);
x5[1] = _mm_sub_epi64(s1[1], s5[1]);
highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
x2[0] = _mm_add_epi64(s2[0], s6[0]);
x2[1] = _mm_add_epi64(s2[1], s6[1]);
x3[0] = _mm_add_epi64(s3[0], s7[0]);
x3[1] = _mm_add_epi64(s3[1], s7[1]);
x6[0] = _mm_sub_epi64(s2[0], s6[0]);
x6[1] = _mm_sub_epi64(s2[1], s6[1]);
x7[0] = _mm_sub_epi64(s3[0], s7[0]);
x7[1] = _mm_sub_epi64(s3[1], s7[1]);
x0[0] = dct_const_round_shift_64bit(x0[0]);
x0[1] = dct_const_round_shift_64bit(x0[1]);
x1[0] = dct_const_round_shift_64bit(x1[0]);
x1[1] = dct_const_round_shift_64bit(x1[1]);
x2[0] = dct_const_round_shift_64bit(x2[0]);
x2[1] = dct_const_round_shift_64bit(x2[1]);
x3[0] = dct_const_round_shift_64bit(x3[0]);
x3[1] = dct_const_round_shift_64bit(x3[1]);
x4[0] = dct_const_round_shift_64bit(x4[0]);
x4[1] = dct_const_round_shift_64bit(x4[1]);
x5[0] = dct_const_round_shift_64bit(x5[0]);
x5[1] = dct_const_round_shift_64bit(x5[1]);
x6[0] = dct_const_round_shift_64bit(x6[0]);
x6[1] = dct_const_round_shift_64bit(x6[1]);
x7[0] = dct_const_round_shift_64bit(x7[0]);
x7[1] = dct_const_round_shift_64bit(x7[1]);
s0[0] = pack_4(x0[0], x0[1]); // s0 = x0;
s1[0] = pack_4(x1[0], x1[1]); // s1 = x1;
s2[0] = pack_4(x2[0], x2[1]); // s2 = x2;
s3[0] = pack_4(x3[0], x3[1]); // s3 = x3;
x4[0] = pack_4(x4[0], x4[1]);
x5[0] = pack_4(x5[0], x5[1]);
x6[0] = pack_4(x6[0], x6[1]);
x7[0] = pack_4(x7[0], x7[1]);
// stage 2
x0[0] = _mm_add_epi32(s0[0], s2[0]);
x1[0] = _mm_add_epi32(s1[0], s3[0]);
x2[0] = _mm_sub_epi32(s0[0], s2[0]);
x3[0] = _mm_sub_epi32(s1[0], s3[0]);
highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
x4[0] = _mm_add_epi64(s4[0], s6[0]);
x4[1] = _mm_add_epi64(s4[1], s6[1]);
x5[0] = _mm_add_epi64(s5[0], s7[0]);
x5[1] = _mm_add_epi64(s5[1], s7[1]);
x6[0] = _mm_sub_epi64(s4[0], s6[0]);
x6[1] = _mm_sub_epi64(s4[1], s6[1]);
x7[0] = _mm_sub_epi64(s5[0], s7[0]);
x7[1] = _mm_sub_epi64(s5[1], s7[1]);
x4[0] = dct_const_round_shift_64bit(x4[0]);
x4[1] = dct_const_round_shift_64bit(x4[1]);
x5[0] = dct_const_round_shift_64bit(x5[0]);
x5[1] = dct_const_round_shift_64bit(x5[1]);
x6[0] = dct_const_round_shift_64bit(x6[0]);
x6[1] = dct_const_round_shift_64bit(x6[1]);
x7[0] = dct_const_round_shift_64bit(x7[0]);
x7[1] = dct_const_round_shift_64bit(x7[1]);
x4[0] = pack_4(x4[0], x4[1]);
x5[0] = pack_4(x5[0], x5[1]);
x6[0] = pack_4(x6[0], x6[1]);
x7[0] = pack_4(x7[0], x7[1]);
// stage 3
s2[0] = _mm_add_epi32(x2[0], x3[0]);
s3[0] = _mm_sub_epi32(x2[0], x3[0]);
s6[0] = _mm_add_epi32(x6[0], x7[0]);
s7[0] = _mm_sub_epi32(x6[0], x7[0]);
highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
x2[0] = dct_const_round_shift_64bit(s2[0]);
x2[1] = dct_const_round_shift_64bit(s2[1]);
x3[0] = dct_const_round_shift_64bit(s3[0]);
x3[1] = dct_const_round_shift_64bit(s3[1]);
x6[0] = dct_const_round_shift_64bit(s6[0]);
x6[1] = dct_const_round_shift_64bit(s6[1]);
x7[0] = dct_const_round_shift_64bit(s7[0]);
x7[1] = dct_const_round_shift_64bit(s7[1]);
x2[0] = pack_4(x2[0], x2[1]);
x3[0] = pack_4(x3[0], x3[1]);
x6[0] = pack_4(x6[0], x6[1]);
x7[0] = pack_4(x7[0], x7[1]);
io[0] = x0[0];
io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
io[2] = x6[0];
io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
io[4] = x3[0];
io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
io[6] = x5[0];
io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
}
void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
__m128i io[16];
io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
if (bd == 8) {
__m128i io_short[8];
io_short[0] = _mm_packs_epi32(io[0], io[4]);
io_short[1] = _mm_packs_epi32(io[1], io[5]);
io_short[2] = _mm_packs_epi32(io[2], io[6]);
io_short[3] = _mm_packs_epi32(io[3], io[7]);
io_short[4] = _mm_packs_epi32(io[8], io[12]);
io_short[5] = _mm_packs_epi32(io[9], io[13]);
io_short[6] = _mm_packs_epi32(io[10], io[14]);
io_short[7] = _mm_packs_epi32(io[11], io[15]);
if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
vpx_idct8_sse2(io_short);
} else {
iadst8_sse2(io_short);
}
if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
vpx_idct8_sse2(io_short);
} else {
iadst8_sse2(io_short);
}
round_shift_8x8(io_short, io);
} else {
__m128i temp[4];
if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
vpx_highbd_idct8x8_half1d_sse4_1(io);
vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
} else {
highbd_iadst8_sse4_1(io);
highbd_iadst8_sse4_1(&io[8]);
}
temp[0] = io[4];
temp[1] = io[5];
temp[2] = io[6];
temp[3] = io[7];
io[4] = io[8];
io[5] = io[9];
io[6] = io[10];
io[7] = io[11];
if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
vpx_highbd_idct8x8_half1d_sse4_1(io);
io[8] = temp[0];
io[9] = temp[1];
io[10] = temp[2];
io[11] = temp[3];
vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
} else {
highbd_iadst8_sse4_1(io);
io[8] = temp[0];
io[9] = temp[1];
io[10] = temp[2];
io[11] = temp[3];
highbd_iadst8_sse4_1(&io[8]);
}
highbd_idct8x8_final_round(io);
}
recon_and_store_8x8(io, dest, stride, bd);
}

@@ -10,6 +10,8 @@
#include "./vp9_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
@@ -20,23 +22,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[1] = load_input_data8(input + 8);
switch (tx_type) {
case DCT_DCT:
case 0: // DCT_DCT
idct4_sse2(in);
idct4_sse2(in);
break;
case ADST_DCT:
case 1: // ADST_DCT
idct4_sse2(in);
iadst4_sse2(in);
break;
case DCT_ADST:
case 2: // DCT_ADST
iadst4_sse2(in);
idct4_sse2(in);
break;
default:
assert(tx_type == ADST_ADST);
case 3: // ADST_ADST
iadst4_sse2(in);
iadst4_sse2(in);
break;
default: assert(0); break;
}
// Final round and shift
@@ -65,23 +67,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[7] = load_input_data8(input + 8 * 7);
switch (tx_type) {
case DCT_DCT:
vpx_idct8_sse2(in);
vpx_idct8_sse2(in);
case 0: // DCT_DCT
idct8_sse2(in);
idct8_sse2(in);
break;
case ADST_DCT:
vpx_idct8_sse2(in);
case 1: // ADST_DCT
idct8_sse2(in);
iadst8_sse2(in);
break;
case DCT_ADST:
case 2: // DCT_ADST
iadst8_sse2(in);
vpx_idct8_sse2(in);
idct8_sse2(in);
break;
default:
assert(tx_type == ADST_ADST);
case 3: // ADST_ADST
iadst8_sse2(in);
iadst8_sse2(in);
break;
default: assert(0); break;
}
// Final rounding and shift
@@ -199,23 +201,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
load_buffer_8x16(input, in1);
switch (tx_type) {
case DCT_DCT:
case 0: // DCT_DCT
idct16_sse2(in0, in1);
idct16_sse2(in0, in1);
break;
case ADST_DCT:
case 1: // ADST_DCT
idct16_sse2(in0, in1);
iadst16_sse2(in0, in1);
break;
case DCT_ADST:
case 2: // DCT_ADST
iadst16_sse2(in0, in1);
idct16_sse2(in0, in1);
break;
default:
assert(tx_type == ADST_ADST);
case 3: // ADST_ADST
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
break;
default: assert(0); break;
}
write_buffer_8x16(dest, in0, stride);
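/* All three dispatchers above use the same tx_type mapping, running the row
 * pass first and the column pass second:
 *   DCT_DCT   (0): idct rows,  idct cols
 *   ADST_DCT  (1): idct rows,  iadst cols
 *   DCT_ADST  (2): iadst rows, idct cols
 *   ADST_ADST (3): iadst rows, iadst cols
 * That is, the first half of the enum name is the vertical (column)
 * transform and the second half the horizontal (row) transform. */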

@@ -464,6 +464,10 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5);
}
}
if (cpi->svc.spatial_layer_id > 0) {
cr->motion_thresh = 4;
cr->rate_boost_fac = 12;
}
if (cpi->oxcf.rc_mode == VPX_VBR) {
// To be adjusted for VBR mode, e.g., based on gf period and boost.
// For now use smaller qp-delta (than CBR), no second boosted seg, and

@@ -12,10 +12,7 @@
#include "vp9/encoder/vp9_encoder.h"
static const BLOCK_SIZE square[] = {
BLOCK_8X8,
BLOCK_16X16,
BLOCK_32X32,
BLOCK_64X64,
BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
};
static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,

@@ -189,12 +189,11 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv,
int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx,
int use_svc, int spatial_layer) {
int use_svc) {
const int sse_diff = (ctx->newmv_sse == UINT_MAX)
? 0
: ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
int frame;
int denoise_layer_idx = 0;
MACROBLOCKD *filter_mbd = &mb->e_mbd;
MODE_INFO *mi = filter_mbd->mi[0];
MODE_INFO saved_mi;
@@ -255,10 +254,6 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
frame = lst_fb_idx + 1;
else if (frame == GOLDEN_FRAME)
frame = gld_fb_idx + 1;
// Shift for the second spatial layer.
if (num_spatial_layers - spatial_layer == 2)
frame = frame + denoiser->num_ref_frames;
denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
}
if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
@@ -294,21 +289,18 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col);
filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
filter_mbd->plane[0].dst.buf = block_start(
denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer,
denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col);
filter_mbd->plane[0].dst.stride =
denoiser->mc_running_avg_y[denoise_layer_idx].y_stride;
filter_mbd->plane[1].dst.buf = block_start(
denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer,
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
filter_mbd->plane[1].dst.stride =
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
filter_mbd->plane[2].dst.buf = block_start(
denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer,
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
filter_mbd->plane[2].dst.stride =
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
filter_mbd->plane[0].dst.buf =
block_start(denoiser->mc_running_avg_y.y_buffer,
denoiser->mc_running_avg_y.y_stride, mi_row, mi_col);
filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
filter_mbd->plane[1].dst.buf =
block_start(denoiser->mc_running_avg_y.u_buffer,
denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
filter_mbd->plane[2].dst.buf =
block_start(denoiser->mc_running_avg_y.v_buffer,
denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
@@ -332,17 +324,9 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
int zeromv_filter = 0;
VP9_DENOISER *denoiser = &cpi->denoiser;
VP9_DENOISER_DECISION decision = COPY_BLOCK;
const int shift =
cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
? denoiser->num_ref_frames
: 0;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
const int denoise_layer_index =
cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
uint8_t *mc_avg_start =
block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
struct buf_2d src = mb->plane[0].src;
@@ -397,7 +381,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
&cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx,
cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id);
cpi->gld_fb_idx, cpi->use_svc);
if (decision == FILTER_BLOCK) {
decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
@@ -448,8 +432,7 @@ void vp9_denoiser_update_frame_info(
VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
int svc_base_is_key, int second_spatial_layer) {
const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
int svc_base_is_key) {
// Copy the source into the denoised reference buffers on a KEY_FRAME or
// if the just-encoded frame was resized. For SVC, copy the source if the
// base spatial layer was a key frame.
@@ -458,8 +441,8 @@ void vp9_denoiser_update_frame_info(
int i;
// Start at 1 so as not to overwrite the INTRA_FRAME
for (i = 1; i < denoiser->num_ref_frames; ++i) {
if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
copy_frame(&denoiser->running_avg_y[i + shift], &src);
if (denoiser->running_avg_y[i].buffer_alloc != NULL)
copy_frame(&denoiser->running_avg_y[i], &src);
}
denoiser->reset = 0;
return;
@@ -468,29 +451,29 @@ void vp9_denoiser_update_frame_info(
// If more than one refresh occurs, must copy frame buffer.
if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) {
if (refresh_alt_ref_frame) {
copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
&denoiser->running_avg_y[INTRA_FRAME + shift]);
copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME]);
}
if (refresh_golden_frame) {
copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
&denoiser->running_avg_y[INTRA_FRAME + shift]);
copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME]);
}
if (refresh_last_frame) {
copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
&denoiser->running_avg_y[INTRA_FRAME + shift]);
copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME]);
}
} else {
if (refresh_alt_ref_frame) {
swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
&denoiser->running_avg_y[INTRA_FRAME + shift]);
swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME]);
}
if (refresh_golden_frame) {
swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
&denoiser->running_avg_y[INTRA_FRAME + shift]);
swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME]);
}
if (refresh_last_frame) {
swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
&denoiser->running_avg_y[INTRA_FRAME + shift]);
swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1],
&denoiser->running_avg_y[INTRA_FRAME]);
}
}
}
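/* The copy-versus-swap split above is a cost optimization: with a single
 * refresh, the INTRA_FRAME running average can simply trade places with the
 * one refreshed slot, while multiple refreshes need real copies because one
 * source must land in several slots. A minimal sketch, assuming
 * swap_frame_buffer() just exchanges the two YV12 descriptors: */
static void swap_yv12(YV12_BUFFER_CONFIG *const a,
                      YV12_BUFFER_CONFIG *const b) {
  const YV12_BUFFER_CONFIG t = *a; /* O(1): trades pointers and strides, */
  *a = *b;                         /* no pixel data is copied            */
  *b = t;
}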
@@ -539,90 +522,44 @@ static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm,
}
int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
int svc_buf_shift, int refresh_alt,
int refresh_gld, int refresh_lst, int alt_fb_idx,
int gld_fb_idx, int lst_fb_idx) {
int refresh_alt, int refresh_gld, int refresh_lst,
int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) {
int fail = 0;
if (refresh_alt) {
// Increase the frame buffer index by 1 to map it to the buffer index in the
// denoiser.
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
alt_fb_idx + 1 + svc_buf_shift);
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, alt_fb_idx + 1);
if (fail) return 1;
}
if (refresh_gld) {
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
gld_fb_idx + 1 + svc_buf_shift);
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, gld_fb_idx + 1);
if (fail) return 1;
}
if (refresh_lst) {
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
lst_fb_idx + 1 + svc_buf_shift);
fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, lst_fb_idx + 1);
if (fail) return 1;
}
return 0;
}
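/* The recurring fb_idx + 1 above encodes the denoiser's buffer layout:
 * running_avg_y[0] shadows INTRA_FRAME, and codec frame buffer i lives in
 * slot i + 1. A hypothetical helper making that mapping explicit: */
static INLINE int denoiser_slot(int fb_idx) { return fb_idx + 1; }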
int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
int use_svc, int noise_sen, int width, int height,
int ssx, int ssy,
int vp9_denoiser_alloc(VP9_COMMON *cm, int use_svc, VP9_DENOISER *denoiser,
int width, int height, int ssx, int ssy,
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth,
#endif
int border) {
int i, layer, fail, init_num_ref_frames;
int i, fail, init_num_ref_frames;
const int legacy_byte_alignment = 0;
int num_layers = 1;
int scaled_width = width;
int scaled_height = height;
if (use_svc) {
LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
svc->number_temporal_layers +
svc->temporal_layer_id];
get_layer_resolution(width, height, lc->scaling_factor_num,
lc->scaling_factor_den, &scaled_width, &scaled_height);
// For SVC: only denoise at most 2 spatial (highest) layers.
if (noise_sen >= 2)
// Denoise from one spatial layer below the top.
svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0);
else
// Only denoise the top spatial layer.
svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0);
num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
}
assert(denoiser != NULL);
denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
denoiser->num_layers = num_layers;
CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
vpx_calloc(denoiser->num_ref_frames * num_layers,
sizeof(denoiser->running_avg_y[0])));
CHECK_MEM_ERROR(
cm, denoiser->mc_running_avg_y,
vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
for (layer = 0; layer < num_layers; ++layer) {
const int denoise_width = (layer == 0) ? width : scaled_width;
const int denoise_height = (layer == 0) ? height : scaled_height;
for (i = 0; i < init_num_ref_frames; ++i) {
fail = vpx_alloc_frame_buffer(
&denoiser->running_avg_y[i + denoiser->num_ref_frames * layer],
denoise_width, denoise_height, ssx, ssy,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
border, legacy_byte_alignment);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
}
#ifdef OUTPUT_YUV_DENOISED
make_grayscale(&denoiser->running_avg_y[i]);
#endif
}
fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer],
denoise_width, denoise_height, ssx, ssy,
cm, denoiser->running_avg_y,
vpx_calloc(denoiser->num_ref_frames, sizeof(denoiser->running_avg_y[0])));
for (i = 0; i < init_num_ref_frames; ++i) {
fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
ssx, ssy,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
@@ -631,10 +568,22 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
vp9_denoiser_free(denoiser);
return 1;
}
#ifdef OUTPUT_YUV_DENOISED
make_grayscale(&denoiser->running_avg_y[i]);
#endif
}
fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx,
ssy,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
border, legacy_byte_alignment);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
}
// denoiser->last_source is only used for noise estimation, so allocate it
// only for the top layer.
fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
@@ -660,18 +609,12 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
return;
}
denoiser->frame_buffer_initialized = 0;
for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
for (i = 0; i < denoiser->num_ref_frames; ++i) {
vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
}
vpx_free(denoiser->running_avg_y);
denoiser->running_avg_y = NULL;
for (i = 0; i < denoiser->num_layers; ++i) {
vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
}
vpx_free(denoiser->mc_running_avg_y);
denoiser->mc_running_avg_y = NULL;
vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
vpx_free_frame_buffer(&denoiser->last_source);
}


@@ -44,12 +44,11 @@ typedef enum vp9_denoiser_level {
typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG *running_avg_y;
YV12_BUFFER_CONFIG *mc_running_avg_y;
YV12_BUFFER_CONFIG mc_running_avg_y;
YV12_BUFFER_CONFIG last_source;
int frame_buffer_initialized;
int reset;
int num_ref_frames;
int num_layers;
VP9_DENOISER_LEVEL denoising_level;
VP9_DENOISER_LEVEL prev_denoising_level;
} VP9_DENOISER;
@@ -67,13 +66,12 @@ typedef struct {
} VP9_PICKMODE_CTX_DEN;
struct VP9_COMP;
struct SVC;
void vp9_denoiser_update_frame_info(
VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
int svc_base_is_key, int second_spatial_layer);
int svc_base_is_key);
void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
@@ -86,13 +84,11 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
PICK_MODE_CONTEXT *ctx);
int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
int svc_buf_shift, int refresh_alt,
int refresh_gld, int refresh_lst, int alt_fb_idx,
int gld_fb_idx, int lst_fb_idx);
int refresh_alt, int refresh_gld, int refresh_lst,
int alt_fb_idx, int gld_fb_idx, int lst_fb_idx);
int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
int use_svc, int noise_sen, int width, int height,
int ssx, int ssy,
int vp9_denoiser_alloc(VP9_COMMON *cm, int use_svc, VP9_DENOISER *denoiser,
int width, int height, int ssx, int ssy,
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth,
#endif


@@ -1513,9 +1513,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
}
}
if (is_key_frame ||
(low_res && vt.split[i].split[j].part_variances.none.variance >
threshold_4x4avg)) {
if (is_key_frame || (low_res &&
vt.split[i].split[j].part_variances.none.variance >
threshold_4x4avg)) {
force_split[split_index] = 0;
// Go down to 4x4 down-sampling for variance.
variance4x4downsample[i2 + j] = 1;
@@ -3403,10 +3403,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
// Rate and distortion based partition search termination clause.
if (!cpi->sf.ml_partition_search_early_termination &&
!x->e_mbd.lossless &&
((best_rdc.dist < (dist_breakout_thr >> 2)) ||
(best_rdc.dist < dist_breakout_thr &&
best_rdc.rate < rate_breakout_thr))) {
!x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
(best_rdc.dist < dist_breakout_thr &&
best_rdc.rate < rate_breakout_thr))) {
do_rect = 0;
}
}
@@ -4621,9 +4620,8 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
CHECK_MEM_ERROR(
cm, cpi->tile_data,
vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows *
sizeof(*cpi->tile_data)));
cpi->allocated_tiles = tile_cols * tile_rows;
for (tile_row = 0; tile_row < tile_rows; ++tile_row)


@@ -50,8 +50,7 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
}
static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
{ 10, 6 },
{ 8, 5 },
{ 10, 6 }, { 8, 5 },
};
// 'num' can be negative, but 'shift' must be non-negative.
@@ -201,9 +200,9 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
const int band_next = band_translate[i + 1];
const int token_next =
(i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS]
[ENTROPY_TOKENS] =
token_costs + band_next;
unsigned int(
*const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
token_costs + band_next;
token_cache[rc] = vp9_pt_energy_class[t0];
ctx_next = get_coef_context(nb, token_cache, i + 1);
token_tree_sel_next = (x == 0);


@@ -65,12 +65,12 @@
#define AM_SEGMENT_ID_INACTIVE 7
#define AM_SEGMENT_ID_ACTIVE 0
// Whether to use high precision mv for altref computation.
#define ALTREF_HIGH_PRECISION_MV 1
// Q threshold for high precision mv. Choose a very high value for now so that
// HIGH_PRECISION is always chosen.
#define HIGH_PRECISION_MV_QTHRESH 200
#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv
// for altref computation.
#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision
// mv. Choose a very high value for
// now so that HIGH_PRECISION is always
// chosen.
#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold
#define FRAME_RATE_FACTOR 8
@@ -437,37 +437,34 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) {
/* clang-format off */
const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
// sample rate size breadth bitrate cpb
{ LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 },
{ LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 },
{ LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 },
{ LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 },
{ LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 },
{ LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 },
{ LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 },
{ LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 },
{ LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 },
{ LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 },
{ LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 },
{ LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 },
{ LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 },
{ LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 },
{ LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 },
{ LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 },
{ LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 },
{ LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 },
{ LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 },
{ LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 },
// TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
// they are finalized (currently tentative).
{ LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 },
{ LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 },
{ LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 },
{ LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 },
{ LEVEL_5_2, 1176502272, 8912896, 180000, 90000, 8, 8, 10, 4 },
{ LEVEL_6, 1176502272, 35651584, 180000, 90000, 8, 16, 10, 4 },
{ LEVEL_6_1, 2353004544u, 35651584, 240000, 180000, 8, 16, 10, 4 },
{ LEVEL_6_2, 4706009088u, 35651584, 480000, 360000, 8, 16, 10, 4 },
};
/* clang-format on */
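Reading the rows above against the Vp9LevelSpec struct shown later in this diff, the columns are: level, max luma sample rate, max luma picture size, (on one side only) max luma picture breadth, average bitrate, max CPB size, compression ratio, and then the limits used elsewhere in this diff for column tiles, altref distance, and reference buffers. For example, the LEVEL_4 row caps pictures at 2228224 luma samples and allows at most 4 tile columns.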
static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
"The average bit-rate is too high.",
"The picture size is too large.",
"The picture width/height is too large.",
"The luma sample rate is too large.",
"The CPB size is too large.",
"The compression ratio is too small",
"Too many column tiles are used.",
"The alt-ref distance is too small.",
"Too many reference buffers are used."
};
static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] =
{ "The average bit-rate is too high.",
"The picture size is too large.",
"The luma sample rate is too large.",
"The CPB size is too large.",
"The compression ratio is too small",
"Too many column tiles are used.",
"The alt-ref distance is too small.",
"Too many reference buffers are used." };
static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
switch (mode) {
@@ -547,74 +544,6 @@ static void apply_active_map(VP9_COMP *cpi) {
}
}
static void apply_roi_map(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
struct segmentation *const seg = &cm->seg;
vpx_roi_map_t *roi = &cpi->roi;
const int *delta_q = roi->delta_q;
const int *delta_lf = roi->delta_lf;
const int *skip = roi->skip;
int ref_frame[8];
int internal_delta_q[MAX_SEGMENTS];
int i;
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
// TODO(jianj): Investigate why ROI is not working at speed < 5 or in
// non-realtime mode.
if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return;
if (!roi->enabled) return;
memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame));
vp9_enable_segmentation(seg);
vp9_clearall_segfeatures(seg);
// Select delta coding method;
seg->abs_delta = SEGMENT_DELTADATA;
memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols));
for (i = 0; i < MAX_SEGMENTS; ++i) {
// Translate the external delta q values to internal values.
internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i]));
if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i];
vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q);
vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
if (internal_delta_q[i] != 0) {
vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]);
}
if (delta_lf[i] != 0) {
vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF);
vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]);
}
if (skip[i] != 0) {
vp9_enable_segfeature(seg, i, SEG_LVL_SKIP);
vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]);
}
if (ref_frame[i] >= 0) {
int valid_ref = 1;
// ALTREF is not used as reference for nonrd_pickmode with 0 lag.
if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode)
valid_ref = 0;
// If GOLDEN is selected, make sure it's set as reference.
if (ref_frame[i] == GOLDEN_FRAME &&
!(cpi->ref_frame_flags & flag_list[ref_frame[i]])) {
valid_ref = 0;
}
// GOLDEN was updated in the previous encoded frame, so GOLDEN and LAST are
// the same reference.
if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0)
ref_frame[i] = LAST_FRAME;
if (valid_ref) {
vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME);
vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]);
}
}
}
roi->enabled = 1;
}
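The delta-q translation above preserves sign while converting the external 0-63 quantizer scale to the internal 0-255 qindex scale. A minimal sketch of just that step, reusing vp9_quantizer_to_qindex exactly as the function does:

static int roi_internal_delta_q(int external_delta_q) {
  /* Map the magnitude to a qindex, then restore the sign. */
  const int q = vp9_quantizer_to_qindex(abs(external_delta_q));
  return external_delta_q < 0 ? -q : q;
}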
static void init_level_info(Vp9LevelInfo *level_info) {
Vp9LevelStats *const level_stats = &level_info->level_stats;
Vp9LevelSpec *const level_spec = &level_info->level_spec;
@@ -625,13 +554,6 @@ static void init_level_info(Vp9LevelInfo *level_info) {
level_spec->min_altref_distance = INT_MAX;
}
static int check_seg_range(int seg_data[8], int range) {
return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range ||
abs(seg_data[2]) > range || abs(seg_data[3]) > range ||
abs(seg_data[4]) > range || abs(seg_data[5]) > range ||
abs(seg_data[6]) > range || abs(seg_data[7]) > range);
}
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
int i;
const Vp9LevelSpec *this_level;
@@ -644,8 +566,6 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
(double)this_level->max_luma_sample_rate *
(1 + SAMPLE_RATE_GRACE_P) ||
level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
level_spec->max_luma_picture_breadth >
this_level->max_luma_picture_breadth ||
level_spec->average_bitrate > this_level->average_bitrate ||
level_spec->max_cpb_size > this_level->max_cpb_size ||
level_spec->compression_ratio < this_level->compression_ratio ||
@@ -658,61 +578,6 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
}
int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
unsigned int cols, int delta_q[8], int delta_lf[8],
int skip[8], int ref_frame[8]) {
VP9_COMMON *cm = &cpi->common;
vpx_roi_map_t *roi = &cpi->roi;
const int range = 63;
const int ref_frame_range = 3; // Alt-ref
const int skip_range = 1;
const int frame_rows = cpi->common.mi_rows;
const int frame_cols = cpi->common.mi_cols;
// Check that the number of rows and columns match.
if (frame_rows != (int)rows || frame_cols != (int)cols) {
return -1;
}
if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
!check_seg_range(ref_frame, ref_frame_range) ||
!check_seg_range(skip, skip_range))
return -1;
// Also disable segmentation if no deltas are specified.
if (!map ||
(!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] |
delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] |
delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] |
delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] |
skip[5] | skip[6] | skip[7]) &&
(ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 &&
ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 &&
ref_frame[6] == -1 && ref_frame[7] == -1))) {
vp9_disable_segmentation(&cm->seg);
cpi->roi.enabled = 0;
return 0;
}
if (roi->roi_map) {
vpx_free(roi->roi_map);
roi->roi_map = NULL;
}
CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols));
// Copy to the ROI structure in the compressor.
memcpy(roi->roi_map, map, rows * cols);
memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0]));
memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0]));
memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0]));
memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0]));
roi->enabled = 1;
roi->rows = rows;
roi->cols = cols;
return 0;
}
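A hedged usage sketch for the vp9_set_roi_map API shown here; the segment values are illustrative, and map must hold mi_rows * mi_cols segment ids (0..7) to pass the dimension check:

static int set_simple_roi(VP9_COMP *cpi, unsigned char *map) {
  int delta_q[8] = { 0, -10, 0, 0, 0, 0, 0, 0 };        /* within +/-63 */
  int delta_lf[8] = { 0 };                               /* within +/-63 */
  int skip[8] = { 0 };                                   /* 0 or 1 */
  int ref_frame[8] = { -1, -1, -1, -1, -1, -1, -1, -1 }; /* -1 = unset */
  return vp9_set_roi_map(cpi, map, cpi->common.mi_rows, cpi->common.mi_cols,
                         delta_q, delta_lf, skip, ref_frame);
}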
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int cols) {
if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
@@ -947,9 +812,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->active_map.map);
cpi->active_map.map = NULL;
vpx_free(cpi->roi.roi_map);
cpi->roi.roi_map = NULL;
vpx_free(cpi->consec_zero_mv);
cpi->consec_zero_mv = NULL;
@@ -1254,9 +1116,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
// For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
// buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
// target of 1/4x1/4. number_spatial_layers must be greater than 2.
if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
cpi->svc.number_spatial_layers > 2) {
// target of 1/4x1/4.
if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) {
cpi->svc.scaled_temp_is_alloc = 1;
if (vpx_realloc_frame_buffer(
&cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1,
@@ -1358,8 +1219,8 @@ static void set_tile_limits(VP9_COMP *cpi) {
}
if (cpi->oxcf.target_level == LEVEL_AUTO) {
const int level_tile_cols =
log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
const uint32_t pic_size = cpi->common.width * cpi->common.height;
const int level_tile_cols = log_tile_cols_from_picsize_level(pic_size);
if (cm->log2_tile_cols > level_tile_cols) {
cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
}
@@ -1987,8 +1848,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
vp9_cyclic_refresh_reset_resize(cpi);
rc->rc_1_frame = 0;
rc->rc_2_frame = 0;
}
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
@@ -1999,24 +1858,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
(int)cpi->oxcf.target_bandwidth);
}
// Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the
// configuration change causes a large change in avg_frame_bandwidth.
// For SVC check for resetting based on spatial layer average bandwidth.
// Also reset buffer level to optimal level.
if (cm->current_video_frame > 0) {
if (cpi->use_svc) {
vp9_svc_check_reset_layer_rc_flag(cpi);
} else {
if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) ||
rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) {
rc->rc_1_frame = 0;
rc->rc_2_frame = 0;
rc->bits_off_target = rc->optimal_buffer_level;
rc->buffer_level = rc->optimal_buffer_level;
}
}
}
cpi->alt_ref_source = NULL;
rc->is_src_frame_alt_ref = 0;
@@ -2151,9 +1992,8 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
realloc_segmentation_maps(cpi);
CHECK_MEM_ERROR(
cm, cpi->skin_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols,
sizeof(cpi->skin_map[0])));
CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
@@ -3016,26 +2856,18 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
cpi->denoiser.denoising_level > kDenLowLow) {
int svc_base_is_key = 0;
int denoise_svc_second_layer = 0;
if (cpi->use_svc) {
int realloc_fail = 0;
const int svc_buf_shift =
cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
? cpi->denoiser.num_ref_frames
: 0;
int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
cpi->svc.temporal_layer_id,
cpi->svc.number_temporal_layers);
LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
svc_base_is_key = lc->is_key_frame;
denoise_svc_second_layer =
cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1
: 0;
// Check if we need to allocate extra buffers in the denoiser
// for
// Check if we need to allocate extra buffers in the denoiser for
// refreshed frames.
realloc_fail = vp9_denoiser_realloc_svc(
cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame,
cm, &cpi->denoiser, cpi->refresh_alt_ref_frame,
cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx,
cpi->gld_fb_idx, cpi->lst_fb_idx);
if (realloc_fail)
@@ -3046,8 +2878,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
&cpi->denoiser, *cpi->Source, cpi->common.frame_type,
cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame,
cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx,
cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key,
denoise_svc_second_layer);
cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key);
}
#endif
if (is_one_pass_cbr_svc(cpi)) {
@@ -3482,9 +3313,8 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
if (cpi->oxcf.noise_sensitivity > 0 &&
!cpi->denoiser.frame_buffer_initialized) {
if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc,
cpi->oxcf.noise_sensitivity, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
if (vp9_denoiser_alloc(cm, cpi->use_svc, &cpi->denoiser, cm->width,
cm->height, cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
@@ -3765,8 +3595,6 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
// it may be pretty bad for rate-control,
// and I should handle it somehow
vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
} else if (cpi->roi.enabled && cm->frame_type != KEY_FRAME) {
apply_roi_map(cpi);
}
apply_active_map(cpi);
@@ -4497,15 +4325,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
struct segmentation *const seg = &cm->seg;
TX_SIZE t;
// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
!cpi->svc.rc_drop_superframe && cpi->oxcf.target_bandwidth == 0) {
cpi->svc.skip_enhancement_layer = 1;
vp9_rc_postencode_update_drop_frame(cpi);
cpi->ext_refresh_frame_flags_pending = 0;
return;
}
set_ext_overrides(cpi);
vpx_clear_system_state();
@@ -4597,6 +4416,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
if (vp9_rc_drop_frame(cpi) ||
(is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) {
vp9_rc_postencode_update_drop_frame(cpi);
++cm->current_video_frame;
cpi->ext_refresh_frame_flags_pending = 0;
cpi->svc.rc_drop_superframe = 1;
cpi->last_frame_dropped = 1;
@@ -5009,7 +4829,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
int i, idx;
uint64_t luma_samples, dur_end;
const uint32_t luma_pic_size = cm->width * cm->height;
const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height);
LevelConstraint *const level_constraint = &cpi->level_constraint;
const int8_t level_index = level_constraint->level_index;
double cpb_data_size;
@@ -5113,11 +4932,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
level_spec->max_luma_picture_size = luma_pic_size;
}
// update max_luma_picture_breadth
if (luma_pic_breadth > level_spec->max_luma_picture_breadth) {
level_spec->max_luma_picture_breadth = luma_pic_breadth;
}
// update compression_ratio
level_spec->compression_ratio = (double)level_stats->total_uncompressed_size *
cm->bit_depth /
@@ -5138,15 +4952,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]);
}
if (level_spec->max_luma_picture_breadth >
vp9_level_defs[level_index].max_luma_picture_breadth) {
level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE);
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Failed to encode to the target level %d. %s",
vp9_level_defs[level_index].level,
level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]);
}
if ((double)level_spec->max_luma_sample_rate >
(double)vp9_level_defs[level_index].max_luma_sample_rate *
(1 + SAMPLE_RATE_GRACE_P)) {
@@ -5347,6 +5152,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cm->intra_only = 0;
// If the flags indicate an intra frame but the current picture is for a
// non-zero spatial layer, it should not be an intra picture.
// TODO(Won Kap): this needs to change if per-layer intra frame is
// allowed.
if ((source->flags & VPX_EFLAG_FORCE_KF) &&
cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
@@ -5479,6 +5286,21 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
#endif // CONFIG_REALTIME_ONLY
#if 1
{
VP9_COMMON *const cm = &cpi->common;
TWO_PASS *const twopass = &cpi->twopass;
GF_GROUP *const gf_group = &twopass->gf_group;
printf("Frame=%d, gf_group_update_type[gf_group_index=%d]=%d, "
"show_frame=%d\n",
cm->current_video_frame, gf_group->index,
gf_group->update_type[gf_group->index],
cm->show_frame);
}
#endif // 1
if (cm->refresh_frame_context)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
@@ -5512,6 +5334,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
if (oxcf->pass != 1) {
double samples = 0.0;
cpi->bytes += (int)(*size);
#if 1
{
printf("Frame %d: rate: %d\n",
cm->current_video_frame, (int)(*size));
}
#endif // 1
if (cm->show_frame) {
uint32_t bit_depth = 8;
@@ -5541,6 +5370,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->total_sq_error += psnr.sse[0];
cpi->total_samples += psnr.samples[0];
samples = psnr.samples[0];
#if 1
{
const int rddiv = cpi->rd.RDDIV;
const int rdmult = cpi->rd.RDMULT;
const int64_t rdcost = RDCOST(
rdmult, rddiv, (int)(*size) * 8, psnr.sse[0]);
printf("Frame %d: distortion: %" PRIu64 " rdcost: %" PRId64 "\n",
cm->current_video_frame, psnr.sse[0], rdcost);
printf("%d %d\n", rddiv, rdmult);
}
#endif // 1
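The debug RDCOST call above prices the frame with the usual Lagrangian trade-off J = D + lambda * R. A conceptual sketch only, deliberately ignoring the fixed-point scaling that rdmult and rddiv carry in vp9_rd.h:

static double rd_cost_concept(double lambda, double rate_bits,
                              double distortion_sse) {
  /* Lower J wins: distortion traded against rate at slope lambda. */
  return distortion_sse + lambda * rate_bits;
}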
{
PSNR_STATS psnr2;


@@ -383,7 +383,6 @@ typedef struct {
VP9_LEVEL level;
uint64_t max_luma_sample_rate;
uint32_t max_luma_picture_size;
uint32_t max_luma_picture_breadth;
double average_bitrate; // in kilobits per second
double max_cpb_size; // in kilobits
double compression_ratio;
@@ -423,15 +422,14 @@ typedef struct {
typedef enum {
BITRATE_TOO_LARGE = 0,
LUMA_PIC_SIZE_TOO_LARGE,
LUMA_PIC_BREADTH_TOO_LARGE,
LUMA_SAMPLE_RATE_TOO_LARGE,
CPB_TOO_LARGE,
COMPRESSION_RATIO_TOO_SMALL,
TOO_MANY_COLUMN_TILE,
ALTREF_DIST_TOO_SMALL,
TOO_MANY_REF_BUFFER,
TARGET_LEVEL_FAIL_IDS
LUMA_PIC_SIZE_TOO_LARGE = 1,
LUMA_SAMPLE_RATE_TOO_LARGE = 2,
CPB_TOO_LARGE = 3,
COMPRESSION_RATIO_TOO_SMALL = 4,
TOO_MANY_COLUMN_TILE = 5,
ALTREF_DIST_TOO_SMALL = 6,
TOO_MANY_REF_BUFFER = 7,
TARGET_LEVEL_FAIL_IDS = 8
} TARGET_LEVEL_FAIL_ID;
typedef struct {
@@ -723,8 +721,6 @@ typedef struct VP9_COMP {
uint8_t *count_arf_frame_usage;
uint8_t *count_lastgolden_frame_usage;
vpx_roi_map_t roi;
} VP9_COMP;
void vp9_initialize_enc(void);
@@ -870,8 +866,9 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
#if CONFIG_VP9_TEMPORAL_DENOISING
static INLINE int denoise_svc(const struct VP9_COMP *const cpi) {
return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >=
cpi->svc.first_layer_denoise));
return (!cpi->use_svc ||
(cpi->use_svc &&
cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
}
#endif
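Worked example of the gate above: with three spatial layers and noise sensitivity >= 2, vp9_denoiser_alloc (earlier in this diff) sets first_layer_denoise to 1, so one form of denoise_svc() is true for layers 1 and 2, while the other form restricts denoising to the single top layer.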
@@ -923,14 +920,10 @@ static INLINE int get_level_index(VP9_LEVEL level) {
// Return the log2 value of max column tiles corresponding to the level that
// the picture size fits into.
static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
uint32_t height) {
static INLINE int log_tile_cols_from_picsize_level(uint32_t pic_size) {
int i;
const uint32_t pic_size = width * height;
const uint32_t pic_breadth = VPXMAX(width, height);
for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
if (vp9_level_defs[i].max_luma_picture_size > pic_size) {
return get_msb(vp9_level_defs[i].max_col_tiles);
}
}
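Worked example against the vp9_level_defs table earlier in this diff: a 1920x1080 picture gives pic_size = 2073600, and the first row whose max_luma_picture_size covers it (and whose breadth covers 1920, on the side that checks breadth) is LEVEL_4 with 2228224 and max_col_tiles 4, so the function returns get_msb(4) = 2, i.e. up to four tile columns.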
@@ -939,10 +932,6 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
unsigned int cols, int delta_q[8], int delta_lf[8],
int skip[8], int ref_frame[8]);
void vp9_new_framerate(VP9_COMP *cpi, double framerate);
void vp9_set_row_mt(VP9_COMP *cpi);


@@ -66,8 +66,8 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
log2_tile_cols =
clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
if (cpi->oxcf.target_level == LEVEL_AUTO) {
const int level_tile_cols =
log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
const uint32_t pic_size = cpi->common.width * cpi->common.height;
const int level_tile_cols = log_tile_cols_from_picsize_level(pic_size);
if (log2_tile_cols > level_tile_cols) {
log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
}
@@ -390,9 +390,8 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
}
#if !CONFIG_REALTIME_ONLY
static int first_pass_worker_hook(void *arg1, void *arg2) {
EncWorkerData *const thread_data = (EncWorkerData *)arg1;
MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
static int first_pass_worker_hook(EncWorkerData *const thread_data,
MultiThreadHandle *multi_thread_ctxt) {
VP9_COMP *const cpi = thread_data->cpi;
const VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -471,8 +470,8 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
}
}
launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt,
num_workers);
launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook,
multi_thread_ctxt, num_workers);
first_tile_col = &cpi->tile_data[0];
for (i = 1; i < tile_cols; i++) {
@@ -481,9 +480,8 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
}
}
static int temporal_filter_worker_hook(void *arg1, void *arg2) {
EncWorkerData *const thread_data = (EncWorkerData *)arg1;
MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
static int temporal_filter_worker_hook(EncWorkerData *const thread_data,
MultiThreadHandle *multi_thread_ctxt) {
VP9_COMP *const cpi = thread_data->cpi;
const VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -555,14 +553,13 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
}
}
launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt,
num_workers);
launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook,
multi_thread_ctxt, num_workers);
}
#endif // !CONFIG_REALTIME_ONLY
static int enc_row_mt_worker_hook(void *arg1, void *arg2) {
EncWorkerData *const thread_data = (EncWorkerData *)arg1;
MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
static int enc_row_mt_worker_hook(EncWorkerData *const thread_data,
MultiThreadHandle *multi_thread_ctxt) {
VP9_COMP *const cpi = thread_data->cpi;
const VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -651,8 +648,8 @@ void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
}
}
launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt,
num_workers);
launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook,
multi_thread_ctxt, num_workers);
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
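For context on the hook-signature change in this file: the void*-pair form matches a generic VPxWorkerHook, with the casts recovering the typed contexts. A minimal sketch, assuming the VPxWorker convention that a nonzero return means success:

static int example_worker_hook(void *arg1, void *arg2) {
  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
  MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
  (void)multi_thread_ctxt; /* unused in this sketch */
  return thread_data != NULL; /* nonzero = success */
}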


@@ -44,6 +44,7 @@
#define COMPLEXITY_STATS_OUTPUT 0
#define FIRST_PASS_Q 10.0
#define GF_MAX_BOOST 96.0
#define INTRA_MODE_PENALTY 1024
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
@@ -731,8 +732,9 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
// Exclude any image dead zone
if (fp_acc_data->image_data_start_row > 0) {
fp_acc_data->intra_skip_count =
VPXMAX(0, fp_acc_data->intra_skip_count -
(fp_acc_data->image_data_start_row * cm->mb_cols * 2));
VPXMAX(0,
fp_acc_data->intra_skip_count -
(fp_acc_data->image_data_start_row * cm->mb_cols * 2));
}
fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
@@ -1947,7 +1949,6 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
}
#define BASELINE_ERR_PER_MB 12500.0
#define GF_MAX_BOOST 96.0
static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
double this_frame_mv_in_out) {
double frame_boost;
@@ -2237,6 +2238,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
}
gf_group->arf_update_idx[0] = arf_buffer_indices[0];
gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
// Step over the golden frame / overlay frame
if (EOF == input_stats(twopass, &frame_stats)) return;
}
// Deduct the boost bits for arf (or gf if it is not a key frame)
@@ -2281,8 +2285,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
// Define middle frame
mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
normal_frames =
rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
if (normal_frames > 1)
normal_frame_bits = (int)(total_group_bits / normal_frames);
else
@@ -2380,8 +2383,6 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise,
// Analyse and define a gf/arf group.
#define ARF_DECAY_BREAKOUT 0.10
#define ARF_ABS_ZOOM_THRESH 4.0
static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
@@ -2410,6 +2411,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
double mv_ratio_accumulator_thresh;
double mv_in_out_thresh;
double abs_mv_in_out_thresh;
double sr_accumulator = 0.0;
const double av_err = get_distribution_av_err(cpi, twopass);
@@ -2455,7 +2457,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Motion breakout threshold for loop below depends on image size.
mv_ratio_accumulator_thresh =
(cpi->initial_height + cpi->initial_width) / 4.0;
abs_mv_in_out_thresh = ARF_ABS_ZOOM_THRESH;
mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 300.0;
abs_mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 200.0;
// Set a maximum and minimum interval for the GF group.
// If the image appears almost completely static we can extend beyond this.
@@ -2540,17 +2543,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Update the accumulator for second ref error difference.
// This is intended to give an indication of how much the coded error is
// increasing over time.
if (i == 1) {
sr_accumulator += next_frame.coded_error;
} else {
sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error);
}
sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error);
sr_accumulator = VPXMAX(0.0, sr_accumulator);
}
// Break out conditions.
// Break at maximum of active_max_gf_interval unless almost totally static.
if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) &&
(i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
if (
// Break at active_max_gf_interval unless almost totally static.
((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
(
// Don't break out with a very short interval.
(i >= active_min_gf_interval) &&
@@ -2559,6 +2559,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(!flash_detected) &&
((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
(abs_mv_in_out_accumulator > abs_mv_in_out_thresh) ||
(mv_in_out_accumulator < -mv_in_out_thresh) ||
(sr_accumulator > next_frame.intra_error)))) {
break;
}
@@ -2570,8 +2571,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
// Should we use the alternate reference frame.
if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref &&
(i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) {
if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
(i >= rc->min_gf_interval)) {
const int forward_frames = (rc->frames_to_key - i >= i - 1)
? i - 1
: VPXMAX(0, rc->frames_to_key - i);
@@ -2599,10 +2600,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
#endif
// Set the interval until the next gf.
rc->baseline_gf_interval =
(twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH)
? (i - (is_key_frame || rc->source_alt_ref_pending))
: i;
// rc->baseline_gf_interval = 8;
rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
// Only encode the alt reference frame in the temporal base layer, so
// baseline_gf_interval should be a multiple of a temporal layer group
@@ -2700,26 +2699,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
#endif
}
// Intra / Inter threshold very low
#define VERY_LOW_II 1.5
// For clean slide transitions we expect a sharp single frame spike in error.
#define ERROR_SPIKE 5.0
// Slide show transition detection.
// Tests for the case where there is very low error on either side of the
// current frame but much higher just for this frame. This can help detect
// key frames in slide shows, even where the slides are pictures of
// different sizes.
// Also requires that intra and inter errors are very similar to help eliminate
// harmful false positives.
// It will not help if the transition is a fade or other multi-frame effect.
static int slide_transition(const FIRSTPASS_STATS *this_frame,
const FIRSTPASS_STATS *last_frame,
const FIRSTPASS_STATS *next_frame) {
return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
(this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
(this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
}
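Worked example of the thresholds: with VERY_LOW_II = 1.5 and ERROR_SPIKE = 5.0, a frame with intra_error = 120 and coded_error = 100, flanked by frames whose coded_error is 15, is flagged as a slide transition (120 < 150, and 100 > 75 against both neighbours).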
// Threshold for use of the lagging second reference frame. High second ref
// usage may point to a transient event like a flash or occlusion rather than
// a real scene cut.
@@ -2764,7 +2743,6 @@ static int test_candidate_kf(TWO_PASS *twopass,
if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
(next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
(slide_transition(this_frame, last_frame, next_frame)) ||
((pcnt_intra > MIN_INTRA_LEVEL) &&
(pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
((this_frame->intra_error /
@@ -2836,7 +2814,6 @@ static int test_candidate_kf(TWO_PASS *twopass,
#define FRAMES_TO_CHECK_DECAY 8
#define MIN_KF_TOT_BOOST 300
#define KF_BOOST_SCAN_MAX_FRAMES 32
#define KF_ABS_ZOOM_THRESH 6.0
#ifdef AGGRESSIVE_VBR
#define KF_MAX_FRAME_BOOST 80.0
@@ -2864,7 +2841,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double kf_group_err = 0.0;
double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
double sr_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
const double av_err = get_distribution_av_err(cpi, twopass);
vp9_zero(next_frame);
@@ -3029,14 +3005,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double zm_factor;
// Monitor for static sections.
// First frame in kf group the second ref indicator is invalid.
if (i > 0) {
zero_motion_accumulator = VPXMIN(
zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
} else {
zero_motion_accumulator =
next_frame.pcnt_inter - next_frame.pcnt_motion;
}
zero_motion_accumulator = VPXMIN(
zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
// Factor 0.75-1.25 based on how much of frame is static.
zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
@@ -3050,14 +3020,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
KF_MAX_FRAME_BOOST * zm_factor);
boost_score += frame_boost;
// Measure of zoom. Large zoom tends to indicate reduced boost.
abs_mv_in_out_accumulator +=
fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
if ((frame_boost < 25.00) ||
(abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH))
break;
if (frame_boost < 25.00) break;
} else {
break;
}
@@ -3072,16 +3035,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->section_intra_rating = calculate_section_intra_ratio(
start_position, twopass->stats_in_end, rc->frames_to_key);
// Special case for static / slide show content but don't apply
// if the kf group is very short.
if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
rc->kf_boost = VPXMAX((rc->frames_to_key * 100), MAX_KF_TOT_BOOST);
} else {
// Apply various clamps for min and max boost
rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
}
// Apply various clamps for min and max boost
rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
// Work out how many bits to allocate for the key frame itself.
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,


@@ -120,12 +120,12 @@ typedef enum {
typedef struct {
unsigned char index;
unsigned char first_inter_index;
RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1];
unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1];
RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
} GF_GROUP;
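For scale: assuming MAX_LAG_BUFFERS is 25 (its usual libvpx value), (MAX_LAG_BUFFERS * 2) + 1 sizes each array at 51 entries, while MAX_STATIC_GF_GROUP_LENGTH + 1 (250 + 1, per the definition in vp9_ratectrl.h later in this diff) sizes them at 251, the cost of representing static GF groups up to 250 frames long.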
typedef struct {


@@ -25,9 +25,7 @@ typedef struct {
} ref[MAX_REF_FRAMES];
} MBGRAPH_MB_STATS;
typedef struct {
MBGRAPH_MB_STATS *mb_stats;
} MBGRAPH_FRAME_STATS;
typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS;
struct VP9_COMP;


@@ -1785,10 +1785,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
}
static const MV search_pos[4] = {
{ -1, 0 },
{ 0, -1 },
{ 0, 1 },
{ 1, 0 },
{ -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
};
unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
@@ -1879,10 +1876,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
{
const uint8_t *const pos[4] = {
ref_buf - ref_stride,
ref_buf - 1,
ref_buf + 1,
ref_buf + ref_stride,
ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride,
};
cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
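The search_pos and pos arrays above enumerate the four one-pel cross neighbours so that a single sdx4df call can score them all at once. A hedged sketch of the selection step that typically follows such a batch SAD call (helper name is illustrative):

/* Pick the best of four neighbour SADs; returns the winning index,
 * or -1 if the current centre is still best. */
static int pick_best_neighbour(const unsigned int sad4[4],
                               unsigned int centre_sad) {
  int i, best = -1;
  for (i = 0; i < 4; ++i) {
    if (sad4[i] < centre_sad) {
      centre_sad = sad4[i];
      best = i;
    }
  }
  return best;
}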


@@ -21,15 +21,6 @@
#include "vp9/encoder/vp9_noise_estimate.h"
#include "vp9/encoder/vp9_encoder.h"
#if CONFIG_VP9_TEMPORAL_DENOISING
// For SVC: only do noise estimation on the top spatial layer.
static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) {
return (!cpi->use_svc ||
(cpi->use_svc &&
cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
}
#endif
void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
ne->enabled = 0;
ne->level = kLowLow;
@@ -54,7 +45,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) {
#endif
// Enable noise estimation if denoising is on.
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
cpi->common.width >= 320 && cpi->common.height >= 180)
return 1;
#endif
@@ -120,7 +111,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
// Estimate is between current source and last source.
YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) {
last_source = &cpi->denoiser.last_source;
// Tune these thresholds for different resolutions when denoising is
// enabled.
@@ -140,7 +131,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
(cpi->svc.number_spatial_layers == 1 &&
(ne->last_w != cm->width || ne->last_h != cm->height))) {
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
copy_frame(&cpi->denoiser.last_source, cpi->Source);
#endif
if (last_source != NULL) {
@@ -155,7 +146,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
ne->count = 0;
ne->num_frames_estimate = 10;
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
cpi->svc.current_superframe > 1) {
vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
copy_frame(&cpi->denoiser.last_source, cpi->Source);
@@ -258,7 +249,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
// Normalize.
avg_est = avg_est / num_samples;
// Update noise estimate.
ne->value = (int)((3 * ne->value + avg_est) >> 2);
ne->value = (int)((15 * ne->value + avg_est) >> 4);
ne->count++;
if (ne->count == ne->num_frames_estimate) {
// Reset counter and check noise level condition.
@@ -266,14 +257,14 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
ne->count = 0;
ne->level = vp9_noise_estimate_extract_level(ne);
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
#endif
}
}
}
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
copy_frame(&cpi->denoiser.last_source, cpi->Source);
#endif
}
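Both forms of the ne->value update above are fixed-point exponential moving averages; the change only moves the smoothing factor between 1/4 and 1/16 (smaller adapts more slowly but is less noisy). A minimal sketch of the shared shape, with alpha = 2^-alpha_shift:

static int ema_update(int value, int sample, int alpha_shift) {
  /* ((2^k - 1) * value + sample) >> k, i.e. value + (sample - value) / 2^k:
   * k = 2 gives (3 * v + s) >> 2, k = 4 gives (15 * v + s) >> 4. */
  return (int)((((int64_t)value << alpha_shift) - value + sample) >>
               alpha_shift);
}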


@@ -1488,6 +1488,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int skip_ref_find_pred[4] = { 0 };
unsigned int sse_zeromv_normalized = UINT_MAX;
unsigned int best_sse_sofar = UINT_MAX;
unsigned int thresh_svc_skip_golden = 500;
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_PICKMODE_CTX_DEN ctx_den;
int64_t zero_last_cost_orig = INT64_MAX;
@@ -1495,23 +1496,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
#endif
INTERP_FILTER filter_gf_svc = EIGHTTAP;
MV_REFERENCE_FRAME best_second_ref_frame = NONE;
const struct segmentation *const seg = &cm->seg;
int comp_modes = 0;
int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES;
int flag_svc_subpel = 0;
int svc_mv_col = 0;
int svc_mv_row = 0;
unsigned int thresh_svc_skip_golden = 500;
// Lower the skip threshold if the lower spatial layer is better quality
// relative to the current layer.
if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex > 150 &&
cm->base_qindex > cpi->svc.lower_layer_qindex + 15)
thresh_svc_skip_golden = 100;
// Increase the skip threshold if the lower spatial layer is lower quality
// relative to the current layer.
else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex < 140 &&
cm->base_qindex < cpi->svc.lower_layer_qindex - 20)
thresh_svc_skip_golden = 1000;
init_ref_frame_cost(cm, xd, ref_frame_cost);
@@ -1649,16 +1635,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME)
comp_modes = 2;
// If the segment reference frame feature is enabled and it's set to GOLDEN
// reference, then make sure we don't skip checking GOLDEN. This is to
// prevent the possibility of not picking any mode.
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
usable_ref_frame = GOLDEN_FRAME;
skip_ref_find_pred[GOLDEN_FRAME] = 0;
thresh_svc_skip_golden = 0;
}
for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
if (!skip_ref_find_pred[ref_frame]) {
find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
@@ -1671,18 +1647,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32)
x->sb_use_mv_part = 0;
// Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used
// an averaging filter for downsampling (phase = 8). If so, we will test
// a nonzero motion mode on the spatial (golden) reference.
// The nonzero motion is half a pixel shifted to the left and top (-4, -4).
if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
svc_force_zero_mode[GOLDEN_FRAME - 1] &&
cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) {
svc_mv_col = -4;
svc_mv_row = -4;
flag_svc_subpel = 1;
}
for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) {
int rate_mv = 0;
int mode_rd_thresh;
@@ -1696,7 +1660,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int inter_mv_mode = 0;
int skip_this_mv = 0;
int comp_pred = 0;
int force_gf_mv = 0;
PREDICTION_MODE this_mode;
second_ref_frame = NONE;
@@ -1717,29 +1680,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
comp_pred = 1;
}
if (ref_frame > usable_ref_frame) continue;
if (skip_ref_find_pred[ref_frame]) continue;
// If the segment reference frame feature is enabled then do nothing if the
// current ref frame is not allowed.
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
continue;
if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) {
force_gf_mv = 1;
// Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
// otherwise set NEWMV to (svc_mv_col, svc_mv_row).
if (this_mode == NEWMV) {
frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col;
frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row;
} else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col ||
frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) {
continue;
}
}
if (comp_pred) {
const struct segmentation *const seg = &cm->seg;
if (!cpi->allow_comp_inter_inter) continue;
// Skip compound inter modes if ARF is not available.
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
@@ -1748,6 +1690,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue;
}
if (ref_frame > usable_ref_frame) continue;
if (skip_ref_find_pred[ref_frame]) continue;
// For SVC, skip the golden (spatial) reference search if sse of zeromv_last
// is below threshold.
if (cpi->use_svc && ref_frame == GOLDEN_FRAME &&
@@ -1792,7 +1737,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
// is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
// later.
if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
frame_mv[this_mode][ref_frame].as_int != 0) {
continue;
}
@@ -1806,39 +1751,34 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
}
if (cpi->use_svc) {
if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] &&
if (svc_force_zero_mode[ref_frame - 1] &&
frame_mv[this_mode][ref_frame].as_int != 0)
continue;
}
// Disable this drop out case if the ref frame segment level feature is
// enabled for this segment. This is to prevent the possibility that we end
// up unable to pick any mode.
if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
if (sf->reference_masking &&
!(frame_mv[this_mode][ref_frame].as_int == 0 &&
ref_frame == LAST_FRAME)) {
if (usable_ref_frame < ALTREF_FRAME) {
if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
if ((cpi->ref_frame_flags & flag_list[i]))
if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
ref_frame_skip_mask |= (1 << ref_frame);
}
} else if (!cpi->rc.is_src_frame_alt_ref &&
!(frame_mv[this_mode][ref_frame].as_int == 0 &&
ref_frame == ALTREF_FRAME)) {
int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
if (((cpi->ref_frame_flags & flag_list[ref1]) &&
(x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
((cpi->ref_frame_flags & flag_list[ref2]) &&
(x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
ref_frame_skip_mask |= (1 << ref_frame);
if (sf->reference_masking &&
!(frame_mv[this_mode][ref_frame].as_int == 0 &&
ref_frame == LAST_FRAME)) {
if (usable_ref_frame < ALTREF_FRAME) {
if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
if ((cpi->ref_frame_flags & flag_list[i]))
if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
ref_frame_skip_mask |= (1 << ref_frame);
}
} else if (!cpi->rc.is_src_frame_alt_ref &&
!(frame_mv[this_mode][ref_frame].as_int == 0 &&
ref_frame == ALTREF_FRAME)) {
int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
if (((cpi->ref_frame_flags & flag_list[ref1]) &&
(x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
((cpi->ref_frame_flags & flag_list[ref2]) &&
(x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
ref_frame_skip_mask |= (1 << ref_frame);
}
if (ref_frame_skip_mask & (1 << ref_frame)) continue;
}
if (ref_frame_skip_mask & (1 << ref_frame)) continue;
// Select prediction reference frames.
for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -1868,7 +1808,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
&rd_thresh_freq_fact[mode_index])))
continue;
if (this_mode == NEWMV && !force_gf_mv) {
if (this_mode == NEWMV) {
if (ref_frame > LAST_FRAME && !cpi->use_svc &&
cpi->oxcf.rc_mode == VPX_CBR) {
int tmp_sad;
@@ -2009,7 +1949,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
pred_filter_search &&
(ref_frame == LAST_FRAME ||
(ref_frame == GOLDEN_FRAME && !force_gf_mv &&
(ref_frame == GOLDEN_FRAME &&
(cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
(((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
int pf_rate[3];
@@ -2233,11 +2173,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// For a spatial enhancement layer: perform intra prediction only if base
// layer is chosen as the reference. Always perform intra prediction if
// LAST is the only reference, or is_key_frame is set, or on base
// temporal layer.
// LAST is the only reference or is_key_frame is set.
if (cpi->svc.spatial_layer_id) {
perform_intra_pred =
cpi->svc.temporal_layer_id == 0 ||
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
!(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
(!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
@@ -2247,13 +2185,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
cpi->rc.is_src_frame_alt_ref)
perform_intra_pred = 0;
// If the segment reference frame feature is enabled and set then
// skip the intra prediction.
if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0)
perform_intra_pred = 0;
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
if (best_rdc.rdcost == INT64_MAX ||


@@ -31,13 +31,10 @@
#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_ratectrl.h"
// Max rate per frame for 1080P and below encodes if no level requirement is
// given. For larger formats, limit to MAX_MB_RATE bits per MB.
// 4Mbits is derived from the level requirement for level 4 (1080P 30), which
// requires that HW can sustain a rate of 16Mbits over a 4 frame group.
// If a lower level requirement is specified then it may override this value.
// Max rate target for 1080P and below encodes under normal circumstances
// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
#define MAX_MB_RATE 250
#define MAXRATE_1080P 4000000
#define MAXRATE_1080P 2025000
#define DEFAULT_KF_BOOST 2000
#define DEFAULT_GF_BOOST 2000
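Each MAXRATE_1080P value follows from its comment: 1920 * 1080 / (16 * 16) = 8100 macroblocks, and 8100 * MAX_MB_RATE (250) = 2025000 on one side, while the other side's 4000000 comes from the level 4 requirement of 16 Mbits sustained over a 4-frame group, i.e. 4 Mbits per frame.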
@@ -1103,9 +1100,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
// Baseline value derived from cpi->active_worst_quality and kf boost.
active_best_quality =
get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
active_best_quality /= 4;
}
// Allow somewhat lower kf minq with small image formats.
if ((cm->width * cm->height) <= (352 * 288)) {
@@ -1494,22 +1488,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref;
}
if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0;
rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
if (cpi->use_svc &&
cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
cpi->svc.lower_layer_qindex = cm->base_qindex;
}
void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
// Update buffer level with zero size, update frame counters, and return.
update_buffer_level(cpi, 0);
cpi->common.current_video_frame++;
cpi->rc.frames_since_key++;
cpi->rc.frames_to_key--;
cpi->rc.rc_2_frame = 0;
cpi->rc.rc_1_frame = 0;
cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
}
static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
@@ -1593,8 +1580,9 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
// Adjust boost and af_ratio based on avg_frame_low_motion, which varies
// between 0 and 100 (stationary, 100% zero/small motion).
rc->gfu_boost =
VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
(rc->avg_frame_low_motion + 100));
VPXMAX(500,
DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
(rc->avg_frame_low_motion + 100));
rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
}
adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
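Worked values for the boost formula above, with m = avg_frame_low_motion (illustrative arithmetic only):
/*   m = 100: 2000 * 200 / 200 = 2000 -> af_ratio = VPXMIN(15, 3 * 2000 / 400) = 15
 *   m =  50: 2000 * 100 / 150 = 1333 -> af_ratio = 3 * 1333 / 400 = 9
 *   m =   0: VPXMAX(500, 0)   =  500 -> af_ratio = VPXMAX(5, 3 * 500 / 400) = 5 */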
@@ -1869,8 +1857,13 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
cpi->framerate, rc->min_gf_interval);
// Extended max interval for genuinely static scenes like slide shows.
rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
// Extended interval for genuinely static scenes.
rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
if (is_altref_enabled(cpi)) {
if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
}
if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval;
@@ -1880,12 +1873,9 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
if (oxcf->target_level == LEVEL_AUTO) {
const uint32_t pic_size = cpi->common.width * cpi->common.height;
const uint32_t pic_breadth =
VPXMAX(cpi->common.width, cpi->common.height);
int i;
for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
if (vp9_level_defs[i].max_luma_picture_size > pic_size) {
if (rc->min_gf_interval <=
(int)vp9_level_defs[i].min_altref_distance) {
rc->min_gf_interval =
@@ -1914,12 +1904,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
// A maximum bitrate for a frame is defined.
// However this limit is extended if a very high rate is given on the command
// line or the rate cannot be achieved because of a user-specified max q
// (e.g. when the user specifies lossless encode).
//
// If a level is specified that requires a lower maximum rate then the level
// value takes precedence.
// The baseline for this aligns with HW implementations that
// can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
// per 16x16 MB (averaged over a frame). However this limit is extended if
// a very high rate is given on the command line or the rate cannot
// be achieved because of a user-specified max q (e.g. when the user
// specifies lossless encode).
vbr_max_bits =
(int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
100);
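Illustrative numbers for the cap just computed: with avg_frame_bandwidth = 100000 bits per frame and two_pass_vbrmax_section = 300 (a 300% per-frame allowance), vbr_max_bits = (100000 * 300) / 100 = 300000 bits, before the MAX_MB_RATE / level clamp described above is applied.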

View File

@@ -34,14 +34,6 @@ extern "C" {
#define FRAME_OVERHEAD_BITS 200
// Threshold used to define a KF group as static (e.g. a slide show).
// Essentially this means that no frame in the group has more than 1% of MBs
// that are not marked as coded with 0,0 motion in the first pass.
#define STATIC_KF_GROUP_THRESH 99
// The maximum duration of a GF group that is static (for example a slide show).
#define MAX_STATIC_GF_GROUP_LENGTH 250
typedef enum {
INTER_NORMAL = 0,
INTER_HIGH = 1,
@@ -160,8 +152,6 @@ typedef struct {
int rc_2_frame;
int q_1_frame;
int q_2_frame;
// Keep track of the last target average frame bandwidth.
int last_avg_frame_bandwidth;
// Auto frame-scaling variables.
FRAME_SCALE_LEVEL frame_size_selector;

View File

@@ -59,9 +59,7 @@ typedef struct {
MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;
typedef struct {
MV_REFERENCE_FRAME ref_frame[2];
} REF_DEFINITION;
typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
struct rdcost_block_args {
const VP9_COMP *cpi;

View File

@@ -37,16 +37,14 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
svc->scaled_one_half = 0;
svc->current_superframe = 0;
svc->non_reference_frame = 0;
svc->skip_enhancement_layer = 0;
for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
svc->ext_frame_flags[sl] = 0;
svc->ext_lst_fb_idx[sl] = 0;
svc->ext_gld_fb_idx[sl] = 1;
svc->ext_alt_fb_idx[sl] = 2;
svc->downsample_filter_type[sl] = BILINEAR;
svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter.
svc->downsample_filter_type[sl] = EIGHTTAP;
svc->downsample_filter_phase[sl] = 0; // Phase 8 would select the averaging filter.
}
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
@@ -155,8 +153,6 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
int sl, tl, layer = 0, spatial_layer_target;
float bitrate_alloc = 1.0;
cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
@@ -393,9 +389,9 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
.is_key_frame;
}
void get_layer_resolution(const int width_org, const int height_org,
const int num, const int den, int *width_out,
int *height_out) {
static void get_layer_resolution(const int width_org, const int height_org,
const int num, const int den, int *width_out,
int *height_out) {
int w, h;
if (width_out == NULL || height_out == NULL || den == 0) return;
@@ -549,8 +545,6 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
if (!spatial_id) {
cpi->ref_frame_flags = VP9_LAST_FLAG;
} else {
if (spatial_id == cpi->svc.number_spatial_layers - 1)
cpi->ext_refresh_alt_ref_frame = 0;
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
}
@@ -610,7 +604,6 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
int width = 0, height = 0;
LAYER_CONTEXT *lc = NULL;
cpi->svc.skip_enhancement_layer = 0;
if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1;
cpi->svc.force_zero_mode_spatial_ref = 1;
cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride;
@@ -663,14 +656,10 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
lc->scaling_factor_num, lc->scaling_factor_den, &width,
&height);
// For resolutions <= VGA: set phase of the filter = 8 (for symmetric
// For resolutions <= QVGA: set phase of the filter = 8 (for symmetric
// averaging filter), use bilinear for now.
if (width * height <= 640 * 480) {
if (width * height <= 320 * 240) {
cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR;
// Use EIGHTTAP_SMOOTH for low resolutions.
if (width * height <= 320 * 240)
cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] =
EIGHTTAP_SMOOTH;
cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8;
}
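For reference, the area thresholds in this hunk work out to 640 * 480 = 307200 pixels (VGA) and 320 * 240 = 76800 pixels (QVGA); both sides of the change switch to the symmetric averaging filter (phase 8) at their respective cutoff.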
@@ -872,28 +861,3 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
vp9_update_temporal_layer_framerate(cpi);
vp9_restore_layer_context(cpi);
}
void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
SVC *svc = &cpi->svc;
int sl, tl;
for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
// Check for reset based on avg_frame_bandwidth for spatial layer sl.
int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
svc->number_temporal_layers);
LAYER_CONTEXT *lc = &svc->layer_context[layer];
RATE_CONTROL *lrc = &lc->rc;
if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) ||
lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) {
// Reset for all temporal layers with spatial layer sl.
for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
LAYER_CONTEXT *lc = &svc->layer_context[layer];
RATE_CONTROL *lrc = &lc->rc;
lrc->rc_1_frame = 0;
lrc->rc_2_frame = 0;
lrc->bits_off_target = lrc->optimal_buffer_level;
lrc->buffer_level = lrc->optimal_buffer_level;
}
}
}
}
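The reset test above uses shifts for the ratio thresholds; spelled out with last_avg_frame_bandwidth = 100000 (illustrative):
/* (3 * 100000) >> 1 = 150000  -> reset if the new average exceeds 1.5x
 *  100000 >> 1      =  50000  -> reset if it falls below 0.5x */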

View File

@@ -49,7 +49,7 @@ typedef struct {
uint8_t speed;
} LAYER_CONTEXT;
typedef struct SVC {
typedef struct {
int spatial_layer_id;
int temporal_layer_id;
int number_spatial_layers;
@@ -99,12 +99,6 @@ typedef struct SVC {
BLOCK_SIZE *prev_partition_svc;
int mi_stride[VPX_MAX_LAYERS];
int first_layer_denoise;
int skip_enhancement_layer;
int lower_layer_qindex;
} SVC;
struct VP9_COMP;
@@ -134,10 +128,6 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi);
// Initialize second pass rc for spatial svc.
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
void get_layer_resolution(const int width_org, const int height_org,
const int num, const int den, int *width_out,
int *height_out);
// Increment number of video frames in layer
void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
@@ -158,8 +148,6 @@ void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi);
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -170,13 +170,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
default:
assert(tx_type == ADST_ADST);
case ADST_ADST:
load_buffer_4x4(input, in, stride);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
default: assert(0); break;
}
}
@@ -1097,14 +1097,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
default:
assert(tx_type == ADST_ADST);
case ADST_ADST:
load_buffer_8x8(input, in, stride);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
default: assert(0); break;
}
}
@@ -1963,13 +1963,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
default:
assert(tx_type == ADST_ADST);
case ADST_ADST:
load_buffer_16x16(input, in0, in1, stride);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
default: assert(0); break;
}
}
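For context on these switches: the 2-D FHT is computed as two 1-D passes, and tx_type selects the 1-D kernel used in each direction, which is why the ADST_ADST case simply runs the fadst pass twice. A sketch of the mapping (the exact row/column assignment is an implementation detail, hedged here):
/* DCT_DCT   : fdct,  fdct
 * ADST_DCT  : fadst, fdct
 * DCT_ADST  : fdct,  fadst
 * ADST_ADST : fadst, fadst  -- e.g. fadst16_sse2(in0, in1) twice above */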

View File

@@ -1,7 +1,7 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may

View File

@@ -1,140 +0,0 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <immintrin.h> // AVX2
#include "./vp9_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_dsp/x86/quantize_x86.h"
// Zero fill 16 positions in the output buffer.
static INLINE void store_zero_tran_low(tran_low_t *a) {
const __m256i zero = _mm256_setzero_si256();
#if CONFIG_VP9_HIGHBITDEPTH
_mm256_storeu_si256((__m256i *)(a), zero);
_mm256_storeu_si256((__m256i *)(a + 8), zero);
#else
_mm256_storeu_si256((__m256i *)(a), zero);
#endif
}
static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr,
__m256i *coeff256) {
const __m256i iscan = _mm256_loadu_si256(iscan_ptr);
const __m256i zero256 = _mm256_setzero_si256();
#if CONFIG_VP9_HIGHBITDEPTH
// The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as
// B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly.
const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8);
const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256);
#else
const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256);
#endif
const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256);
// Add one to convert from indices to counts
const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0);
return _mm256_and_si256(iscan_plus_one, nzero_coeff0);
}
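Lane by lane, scan_eob_256() turns scan indices into "index + 1 where nonzero" counts; a worked example on four 16-bit lanes (illustrative):
/* coeff           :  0   7   0  -3
 * nzero_coeff0    :  0  -1   0  -1   (-1 == 0xFFFF for nonzero lanes)
 * iscan           :  0   1   2   3   (scan position of each coefficient)
 * iscan - nzmask  :  0   2   2   4   (subtracting -1 adds one)
 * & nzmask        :  0   2   0   4   (zero lanes drop out)
 * The running max over all lanes (taken by the caller) is the eob. */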
void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan_ptr,
const int16_t *iscan_ptr) {
__m128i eob;
__m256i round256, quant256, dequant256;
__m256i eob256, thr256;
(void)scan_ptr;
(void)skip_block;
assert(!skip_block);
coeff_ptr += n_coeffs;
iscan_ptr += n_coeffs;
qcoeff_ptr += n_coeffs;
dqcoeff_ptr += n_coeffs;
n_coeffs = -n_coeffs;
{
__m256i coeff256;
// Setup global values
{
const __m128i round = _mm_load_si128((const __m128i *)round_ptr);
const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
round256 = _mm256_castsi128_si256(round);
round256 = _mm256_permute4x64_epi64(round256, 0x54);
quant256 = _mm256_castsi128_si256(quant);
quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
dequant256 = _mm256_castsi128_si256(dequant);
dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
}
{
__m256i qcoeff256;
__m256i qtmp256;
coeff256 = load_tran_low(coeff_ptr + n_coeffs);
qcoeff256 = _mm256_abs_epi16(coeff256);
qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
}
eob256 = scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256);
n_coeffs += 8 * 2;
}
// Drop the DC constants: broadcast the AC half of each constant vector.
dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
thr256 = _mm256_srai_epi16(dequant256, 1);
// AC only loop
while (n_coeffs < 0) {
__m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs);
__m256i qcoeff256 = _mm256_abs_epi16(coeff256);
int32_t nzflag =
_mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
if (nzflag) {
__m256i qtmp256;
qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
eob256 = _mm256_max_epi16(
eob256,
scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256));
} else {
store_zero_tran_low(qcoeff_ptr + n_coeffs);
store_zero_tran_low(dqcoeff_ptr + n_coeffs);
}
n_coeffs += 8 * 2;
}
eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
_mm256_extracti128_si256(eob256, 1));
*eob_ptr = accumulate_eob(eob);
}
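A scalar sketch of the math this kernel implements (reference only; the production scalar path is vp9_quantize_fp_c, and this sketch ignores the saturating add and the AC skip threshold used above):
#include <stdint.h>
#include <stdlib.h>
static void quantize_fp_sketch(const int16_t *coeff, int n,
                               const int16_t round[2], const int16_t quant[2],
                               const int16_t dequant[2], int16_t *qcoeff,
                               int16_t *dqcoeff, const int16_t *iscan,
                               uint16_t *eob) {
  int i;
  *eob = 0;
  for (i = 0; i < n; ++i) {
    const int k = i != 0;  /* index 0 = DC constants, others = AC */
    int q = ((abs(coeff[i]) + round[k]) * quant[k]) >> 16;  /* mulhi */
    if (coeff[i] < 0) q = -q;               /* _mm256_sign_epi16 */
    qcoeff[i] = (int16_t)q;
    dqcoeff[i] = (int16_t)(q * dequant[k]); /* _mm256_mullo_epi16 */
    if (q != 0 && iscan[i] + 1 > *eob) *eob = (uint16_t)(iscan[i] + 1);
  }
}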

Some files were not shown because too many files have changed in this diff.