disable vp9_iht8x8_64_add_neon

cherry picked from: commit 0685ec767c Author: James Zern <jzern@google.com> Date: Sat Mar 3 12:47:24 2018 -0800 this causes test vector failures BUG=webm:1403 Change-Id: I7d37a05fbf4641ea352c947053aa4eaeb7f5c318
VP9 ROI: reset use_roi_ in datarate test.
2018-03-06 16:12:19 -08:00 · 2018-02-10 08:39:43 -08:00 · 2018-02-09 14:47:00 -08:00 · 2018-02-09 19:01:52 +00:00 · 2018-02-09 10:55:46 -08:00 · 2018-02-09 18:54:55 +00:00
150 changed files with 5684 additions and 3815 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -1,12 +1,12 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Google
-# Generated with clang-format 4.0.1
+# Generated with clang-format 5.0.0
 AccessModifierOffset: -1
 AlignAfterOpenBracket: Align
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
-AlignEscapedNewlinesLeft: true
+AlignEscapedNewlines: Left
 AlignOperands:   true
 AlignTrailingComments: true
 AllowAllParametersOfDeclarationOnNextLine: true
@@ -33,14 +33,20 @@ BraceWrapping:
  BeforeCatch:     false
  BeforeElse:      false
  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
 BreakBeforeBinaryOperators: None
 BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
 BreakAfterJavaFieldAnnotations: false
 BreakStringLiterals: true
 ColumnLimit:     80
 CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
@@ -48,7 +54,11 @@ Cpp11BracedListStyle: false
 DerivePointerAlignment: false
 DisableFormat:   false
 ExperimentalAutoDetectBinPacking: false
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
 IncludeCategories:
  - Regex:           '^<.*\.h>'
    Priority:        1
@@ -70,6 +80,7 @@ NamespaceIndentation: None
 ObjCBlockIndentWidth: 2
 ObjCSpaceAfterProperty: false
 ObjCSpaceBeforeProtocolList: false
+PenaltyBreakAssignment: 2
 PenaltyBreakBeforeFirstCallParameter: 1
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
@@ -79,6 +90,7 @@ PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
 ReflowComments:  true
 SortIncludes:    false
+SortUsingDeclarations: true
 SpaceAfterCStyleCast: false
 SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
--- a/.mailmap
+++ b/.mailmap
@@ -3,6 +3,7 @@ Aℓex Converse <aconverse@google.com>
 Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Chris Cunningham <chcunningham@chromium.org>
 Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
 Deb Mukherjee <debargha@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
@@ -21,18 +22,21 @@ Marco Paniconi <marpan@google.com>
 Marco Paniconi <marpan@google.com> <marpan@chromium.org>
 Pascal Massimino <pascal.massimino@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
+Peter Boström <pbos@chromium.org> <pbos@google.com>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
 Sami Pietilä <samipietila@google.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
+Urvang Joshi <urvang@google.com> <urvang@chromium.org>
 Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <Yaowu Xu>
--- a/16
+++ b/16
@@ -3,13 +3,13 @@

 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
-Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
 Aℓex Converse <aconverse@google.com>
 Ahmad Sharif <asharif@google.com>
 Aleksey Vasenev <margtu-fivt@ya.ru>
 Alexander Potapenko <glider@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
+Alexandra Hájková <alexandra.khirnova@gmail.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@@ -17,6 +17,7 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
 Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
+Andrew Lewis <andrewlewis@google.com>
 Andrew Russell <anrussell@google.com>
 Angie Chiang <angiebird@google.com>
 Aron Rosenberg <arosenberg@logitech.com>
@@ -24,7 +25,9 @@ Attila Nagy <attilanagy@google.com>
 Brion Vibber <bvibber@wikimedia.org>
 changjun.yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
+Cheng Chen <chengchen@google.com>
 chm <chm@rock-chips.com>
+Chris Cunningham <chcunningham@chromium.org>
 Christian Duvivier <cduvivier@google.com>
 Daniele Castagna <dcastagna@chromium.org>
 Daniel Kang <ddkang@google.com>
@@ -46,10 +49,12 @@ Geza Lore <gezalore@gmail.com>
 Ghislain MARY <ghislainmary2@gmail.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
+Gregor Jasny <gjasny@gmail.com>
 Guillaume Martres <gmartres@google.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Hanno Böck <hanno@hboeck.de>
+Han Shen <shenhan@google.com>
 Henrik Lundin <hlundin@google.com>
 Hui Su <huisu@google.com>
 Ivan Krasin <krasin@chromium.org>
@@ -83,6 +88,7 @@ Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 Kaustubh Raste <kaustubh.raste@imgtec.com>
 KO Myung-Hun <komh@chollian.net>
+Kyle Siefring <kylesiefring@gmail.com>
 Lawrence Velázquez <larryv@macports.org>
 Linfeng Zhang <linfengz@google.com>
 Lou Quillio <louquillio@google.com>
@@ -101,6 +107,7 @@ Mikhal Shemer <mikhal@google.com>
 Min Chen <chenm003@gmail.com>
 Minghai Shang <minghai@google.com>
 Min Ye <yeemmi@google.com>
+Moriyoshi Koizumi <mozo@mozo.jp>
 Morton Jonuschat <yabawock@gmail.com>
 Nathan E. Egge <negge@mozilla.com>
 Nico Weber <thakis@chromium.org>
@@ -111,12 +118,15 @@ Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
 Pengchong Jin <pengchong@google.com>
-Peter Boström <pbos@google.com>
+Peter Boström <pbos@chromium.org>
+Peter Collingbourne <pcc@chromium.org>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
+Rafael de Lucena Valle <rafaeldelucena@gmail.com>
+Rahul Chaudhry <rahulchaudhry@google.com>
 Ralph Giles <giles@xiph.org>
 Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
 Rob Bradford <rob@linux.intel.com>
@@ -135,6 +145,7 @@ Shiyou Yin <yinshiyou-hf@loongson.cn>
 Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
+Sylvestre Ledru <sylvestre@mozilla.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
@@ -147,6 +158,7 @@ Tom Finegan <tomfinegan@google.com>
 Tristan Matthews <le.businessman@gmail.com>
 Urvang Joshi <urvang@google.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
+Vlad Tsyrklevich <vtsyrklevich@chromium.org>
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
--- a/25
+++ b/25
@@ -1,3 +1,28 @@
+2017-01-04 v1.7.0 "Mandarin Duck"
+  This release focused on high bit depth performance (10/12 bit) and vp9
+  encoding improvements.
+
+  - Upgrading:
+    This release is ABI incompatible due to new vp9 encoder features.
+
+    Frame parallel decoding for vp9 has been removed.
+
+  - Enhancements:
+    vp9 encoding supports additional threads with --row-mt. This can be greater
+    than the number of tiles.
+
+    Two new vp9 encoder options have been added:
+      --corpus-complexity
+      --tune-content=film
+
+    Additional tooling for respecting the vp9 "level" profiles has been added.
+
+  - Bug fixes:
+    A variety of fuzzing issues.
+    vp8 threading fix for ARM.
+    Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
+    Reject invalid multi resolution configurations.
+
 2017-01-09 v1.6.1 "Long Tailed Duck"
  This release improves upon the VP9 encoder and speeds up the encoding and
  decoding processes.
--- a/4
+++ b/4
@@ -1,4 +1,4 @@
-README - 26 January 2017
+README - 24 January 2018

 Welcome to the WebM VP8/VP9 Codec SDK!

@@ -63,6 +63,8 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv8-linux-gcc
    mips32-linux-gcc
    mips64-linux-gcc
+    ppc64-linux-gcc
+    ppc64le-linux-gcc
    sparc-solaris-gcc
    x86-android-gcc
    x86-darwin8-gcc
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -1,4 +1,13 @@
 #!/usr/bin/env perl
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##

 no strict 'refs';
 use warnings;
@@ -200,6 +209,7 @@ sub filter {
 sub common_top() {
  my $include_guard = uc($opts{sym})."_H_";
  print <<EOF;
+// This file is generated. Do not edit.
 #ifndef ${include_guard}
 #define ${include_guard}

--- a/build/make/version.sh
+++ b/build/make/version.sh
@@ -60,6 +60,7 @@ if [ ${bare} ]; then
    echo "${changelog_version}${git_version_id}" > $$.tmp
 else
    cat<<EOF>$$.tmp
+// This file is generated. Do not edit.
 #define VERSION_MAJOR  $major_version
 #define VERSION_MINOR  $minor_version
 #define VERSION_PATCH  $patch_version
--- a/2
+++ b/2
@@ -665,7 +665,7 @@ process_toolchain() {
             gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
             enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
             all_targets="${all_targets} solution"
-             INLINE="__forceinline"
+             INLINE="__inline"
        ;;
    esac

--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -429,8 +429,9 @@ static void set_rate_control_stats(struct RateControlStats *rc,
        rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl];
      if (tl > 0) {
        rc->layer_pfb[layer] =
-            1000.0 * (cfg->layer_target_bitrate[layer] -
-                      cfg->layer_target_bitrate[layer - 1]) /
+            1000.0 *
+            (cfg->layer_target_bitrate[layer] -
+             cfg->layer_target_bitrate[layer - 1]) /
            (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
      } else {
        rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
@@ -573,8 +574,8 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
      } else {
        if (is_key_frame) {
          ref_frame_config->frame_flags[sl] =
-              VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF |
-              VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+              VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+              VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF;
        } else {
          ref_frame_config->frame_flags[sl] =
              VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -588,14 +589,24 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
      } else {
        ref_frame_config->frame_flags[sl] =
            VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+        if (sl == num_spatial_layers - 1)
+          ref_frame_config->frame_flags[sl] =
+              VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_ARF |
+              VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
      }
    }
    if (tl == 0) {
      ref_frame_config->lst_fb_idx[sl] = sl;
-      if (sl)
-        ref_frame_config->gld_fb_idx[sl] = sl - 1;
-      else
+      if (sl) {
+        if (is_key_frame) {
+          ref_frame_config->lst_fb_idx[sl] = sl - 1;
+          ref_frame_config->gld_fb_idx[sl] = sl;
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = sl - 1;
+        }
+      } else {
        ref_frame_config->gld_fb_idx[sl] = 0;
+      }
      ref_frame_config->alt_fb_idx[sl] = 0;
    } else if (tl == 1) {
      ref_frame_config->lst_fb_idx[sl] = sl;
@@ -738,6 +749,8 @@ int main(int argc, const char **argv) {
      // the encode for the whole superframe. The encoder will internally loop
      // over all the spatial layers for the current superframe.
      vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+      // TODO(jianj): Fix the parameter passing for "is_key_frame" in
+      // set_frame_flags_bypass_model() for case of periodic key frames.
      set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id,
                                  svc_ctx.spatial_layers, frame_cnt == 0,
                                  &ref_frame_config);
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -26,19 +26,29 @@
 #include "../tools_common.h"
 #include "../video_writer.h"

-#define VP8_ROI_MAP 0
+#define ROI_MAP 0
+
+#define zero(Dest) memset(&Dest, 0, sizeof(Dest));

 static const char *exec_name;

 void usage_exit(void) { exit(EXIT_FAILURE); }

-// Denoiser states, for temporal denoising.
-enum denoiserState {
-  kDenoiserOff,
-  kDenoiserOnYOnly,
-  kDenoiserOnYUV,
-  kDenoiserOnYUVAggressive,
-  kDenoiserOnAdaptive
+// Denoiser states for vp8, for temporal denoising.
+enum denoiserStateVp8 {
+  kVp8DenoiserOff,
+  kVp8DenoiserOnYOnly,
+  kVp8DenoiserOnYUV,
+  kVp8DenoiserOnYUVAggressive,
+  kVp8DenoiserOnAdaptive
+};
+
+// Denoiser states for vp9, for temporal denoising.
+enum denoiserStateVp9 {
+  kVp9DenoiserOff,
+  kVp9DenoiserOnYOnly,
+  // For SVC: denoise the top two spatial layers.
+  kVp9DenoiserOnYTwoSpatialLayers
 };

 static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 };
@@ -91,9 +101,10 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc,
  for (i = 0; i < cfg->ts_number_layers; ++i) {
    if (i > 0) {
      rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
-      rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] -
-                                   rc->layer_target_bitrate[i - 1]) /
-                         (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
+      rc->layer_pfb[i] =
+          1000.0 *
+          (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
+          (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
    }
    rc->layer_input_frames[i] = 0;
    rc->layer_enc_frames[i] = 0;
@@ -156,38 +167,60 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
    die("Error: Number of input frames not equal to output! \n");
 }

-#if VP8_ROI_MAP
-static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) {
+#if ROI_MAP
+static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg,
+                        vpx_roi_map_t *roi) {
  unsigned int i, j;
-  memset(roi, 0, sizeof(*roi));
+  int block_size = 0;
+  uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0;
+  uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0;
+  if (!is_vp8 && !is_vp9) {
+    die("unsupported codec.");
+  }
+  zero(*roi);
+
+  block_size = is_vp9 && !is_vp8 ? 8 : 16;

  // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for
  // segment is 16x16 for vp8, 8x8 for vp9.
-  roi->rows = (cfg->g_h + 15) / 16;
-  roi->cols = (cfg->g_w + 15) / 16;
+  roi->rows = (cfg->g_h + block_size - 1) / block_size;
+  roi->cols = (cfg->g_w + block_size - 1) / block_size;

  // Applies delta QP on the segment blocks, varies from -63 to 63.
  // Setting to negative means lower QP (better quality).
  // Below we set delta_q to the extreme (-63) to show strong effect.
-  roi->delta_q[0] = 0;
+  // VP8 uses the first 4 segments. VP9 uses all 8 segments.
+  zero(roi->delta_q);
  roi->delta_q[1] = -63;
-  roi->delta_q[2] = 0;
-  roi->delta_q[3] = 0;

  // Applies delta loopfilter strength on the segment blocks, varies from -63 to
-  // 63. Setting to positive means stronger loopfilter.
-  roi->delta_lf[0] = 0;
-  roi->delta_lf[1] = 0;
-  roi->delta_lf[2] = 0;
-  roi->delta_lf[3] = 0;
+  // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4
+  // segments. VP9 uses all 8 segments.
+  zero(roi->delta_lf);

-  // Applies skip encoding threshold on the segment blocks, varies from 0 to
-  // UINT_MAX. Larger value means more skipping of encoding is possible.
-  // This skip threshold only applies on delta frames.
-  roi->static_threshold[0] = 0;
-  roi->static_threshold[1] = 0;
-  roi->static_threshold[2] = 0;
-  roi->static_threshold[3] = 0;
+  if (is_vp8) {
+    // Applies skip encoding threshold on the segment blocks, varies from 0 to
+    // UINT_MAX. Larger value means more skipping of encoding is possible.
+    // This skip threshold only applies on delta frames.
+    zero(roi->static_threshold);
+  }
+
+  if (is_vp9) {
+    // Apply skip segment. Setting to 1 means this block will be copied from
+    // previous frame.
+    zero(roi->skip);
+  }
+
+  if (is_vp9) {
+    // Apply ref frame segment.
+    // -1 : Do not apply this segment.
+    //  0 : Froce using intra.
+    //  1 : Force using last.
+    //  2 : Force using golden.
+    //  3 : Force using alfref but not used in non-rd pickmode for 0 lag.
+    memset(roi->ref_frame, -1, sizeof(roi->ref_frame));
+    roi->ref_frame[1] = 1;
+  }

  // Use 2 states: 1 is center square, 0 is the rest.
  roi->roi_map =
@@ -555,7 +588,7 @@ int main(int argc, char **argv) {
  int layering_mode = 0;
  int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 };
  int flag_periodicity = 1;
-#if VP8_ROI_MAP
+#if ROI_MAP
  vpx_roi_map_t roi;
 #endif
  vpx_svc_layer_id_t layer_id = { 0, 0 };
@@ -755,11 +788,11 @@ int main(int argc, char **argv) {

  if (strncmp(encoder->name, "vp8", 3) == 0) {
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
-    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff);
    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
    vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0);
-#if VP8_ROI_MAP
-    vp8_set_roi_map(&cfg, &roi);
+#if ROI_MAP
+    set_roi_map(encoder->name, &cfg, &roi);
    if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
      die_codec(&codec, "Failed to set ROI map");
 #endif
@@ -772,10 +805,16 @@ int main(int argc, char **argv) {
    vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0);
    vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
    vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
-    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff);
    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
    vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
+#if ROI_MAP
+    set_roi_map(encoder->name, &cfg, &roi);
+    if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
+      die_codec(&codec, "Failed to set ROI map");
+    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0);
+#endif
    // TODO(marpan/jianj): There is an issue with row-mt for low resolutons at
    // high speed settings, disable its use for those cases for now.
    if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7))
@@ -903,5 +942,8 @@ int main(int argc, char **argv) {
  for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]);

  vpx_img_free(&raw);
+#if ROI_MAP
+  free(roi.roi_map);
+#endif
  return EXIT_SUCCESS;
 }
--- a/libs.doxy_template
+++ b/libs.doxy_template
@@ -943,18 +943,6 @@ GENERATE_XML           = NO

 XML_OUTPUT             = xml

-# The XML_SCHEMA tag can be used to specify an XML schema,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_SCHEMA             =
-
-# The XML_DTD tag can be used to specify an XML DTD,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_DTD                =
-
 # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
 # dump the program listings (including syntax highlighting
 # and cross-referencing information) to the XML output. Note that
--- a/libs.mk
+++ b/libs.mk
@@ -233,8 +233,8 @@ OBJS-yes += $(LIBVPX_OBJS)
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)

-SO_VERSION_MAJOR := 4
-SO_VERSION_MINOR := 1
+SO_VERSION_MAJOR := 5
+SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@@ -215,7 +215,7 @@ using std::tr1::make_tuple;

 #if CONFIG_VP9_ENCODER
 const BlockinessParam c_vp9_tests[] = {
-  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
+  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238)
 };
 INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests));
 #endif
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@@ -205,7 +205,7 @@ using std::tr1::make_tuple;

 #if CONFIG_VP9_ENCODER
 const ConsistencyParam c_vp9_tests[] = {
-  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
+  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238)
 };
 INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
                        ::testing::ValuesIn(c_vp9_tests));
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -539,6 +539,7 @@ class DatarateTestVP9Large
    denoiser_offon_test_ = 0;
    denoiser_offon_period_ = -1;
    frame_parallel_decoding_mode_ = 1;
+    use_roi_ = 0;
  }

  //
@@ -621,6 +622,10 @@ class DatarateTestVP9Large
    encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
                     frame_parallel_decoding_mode_);

+    if (use_roi_) {
+      encoder->Control(VP9E_SET_ROI_MAP, &roi_);
+    }
+
    if (cfg_.ts_number_layers > 1) {
      if (video->frame() == 0) {
        encoder->Control(VP9E_SET_SVC, 1);
@@ -701,6 +706,8 @@ class DatarateTestVP9Large
  int denoiser_offon_test_;
  int denoiser_offon_period_;
  int frame_parallel_decoding_mode_;
+  bool use_roi_;
+  vpx_roi_map_t roi_;
 };

 // Check basic rate targeting for VBR mode with 0 lag.
@@ -1073,6 +1080,68 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
  }
 }

+class DatarateTestVP9RealTime : public DatarateTestVP9Large {
+ public:
+  virtual ~DatarateTestVP9RealTime() {}
+};
+
+// Check VP9 region of interest feature.
+TEST_P(DatarateTestVP9RealTime, RegionOfInterest) {
+  if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = true;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 7) / 8;
+  roi_.cols = (cfg_.g_w + 7) / 8;
+
+  roi_.delta_q[1] = -20;
+  roi_.delta_lf[1] = -20;
+  memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
+  roi_.ref_frame[1] = 1;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi_.roi_map = reinterpret_cast<uint8_t *>(
+      calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)));
+  ASSERT_TRUE(roi_.roi_map != NULL);
+
+  for (unsigned int i = 0; i < roi_.rows; ++i) {
+    for (unsigned int j = 0; j < roi_.cols; ++j) {
+      if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+          j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+        roi_.roi_map[i * roi_.cols + j] = 1;
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4)
+      << " The datarate for the file missed the target!";
+
+  free(roi_.roi_map);
+}
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
 class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large {
 public:
@@ -1216,18 +1285,78 @@ class DatarateOnePassCbrSvc
  }
  virtual void ResetModel() {
    last_pts_ = 0;
-    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
-    frame_number_ = 0;
-    first_drop_ = 0;
-    bits_total_ = 0;
    duration_ = 0.0;
    mismatch_psnr_ = 0.0;
    mismatch_nframes_ = 0;
    denoiser_on_ = 0;
    tune_content_ = 0;
    base_speed_setting_ = 5;
+    spatial_layer_id_ = 0;
+    temporal_layer_id_ = 0;
+    update_pattern_ = 0;
+    memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_));
+    memset(bits_total_, 0, sizeof(bits_total_));
+    memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_));
+    dynamic_drop_layer_ = false;
  }
  virtual void BeginPassHook(unsigned int /*pass*/) {}
+
+  // Example pattern for spatial layers and 2 temporal layers used in the
+  // bypass/flexible mode. The pattern corresponds to the pattern
+  // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
+  // non-flexible mode, except that we disable inter-layer prediction.
+  void set_frame_flags_bypass_mode(
+      int tl, int num_spatial_layers, int is_key_frame,
+      vpx_svc_ref_frame_config_t *ref_frame_config) {
+    for (int sl = 0; sl < num_spatial_layers; ++sl) {
+      if (!tl) {
+        if (!sl) {
+          ref_frame_config->frame_flags[sl] =
+              VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF |
+              VP8_EFLAG_NO_UPD_ARF;
+        } else {
+          if (is_key_frame) {
+            ref_frame_config->frame_flags[sl] =
+                VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+                VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF;
+          } else {
+            ref_frame_config->frame_flags[sl] =
+                VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF |
+                VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
+          }
+        }
+      } else if (tl == 1) {
+        if (!sl) {
+          ref_frame_config->frame_flags[sl] =
+              VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+              VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+        } else {
+          ref_frame_config->frame_flags[sl] =
+              VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
+              VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_REF_GF;
+        }
+      }
+      if (tl == 0) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        if (sl) {
+          if (is_key_frame) {
+            ref_frame_config->lst_fb_idx[sl] = sl - 1;
+            ref_frame_config->gld_fb_idx[sl] = sl;
+          } else {
+            ref_frame_config->gld_fb_idx[sl] = sl - 1;
+          }
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = 0;
+        }
+        ref_frame_config->alt_fb_idx[sl] = 0;
+      } else if (tl == 1) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
+        ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
+      }
+    }
+  }
+
  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
    if (video->frame() == 0) {
@@ -1252,36 +1381,137 @@ class DatarateOnePassCbrSvc
      encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1);
      encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
    }
+
+    if (update_pattern_ && video->frame() >= 100) {
+      vpx_svc_layer_id_t layer_id;
+      if (video->frame() == 100) {
+        cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+        encoder->Config(&cfg_);
+      }
+      // Set layer id since the pattern changed.
+      layer_id.spatial_layer_id = 0;
+      layer_id.temporal_layer_id = (video->frame() % 2 != 0);
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
+                                  number_spatial_layers_, 0, &ref_frame_config);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
+    }
+
+    if (dynamic_drop_layer_) {
+      if (video->frame() == 100) {
+        // Change layer bitrates to set top layer to 0. This will trigger skip
+        // encoding/dropping of top spatial layer.
+        cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[2];
+        cfg_.layer_target_bitrate[2] = 0;
+        encoder->Config(&cfg_);
+      } else if (video->frame() == 300) {
+        // Change layer bitrate on top layer to non-zero to start encoding it
+        // again.
+        cfg_.layer_target_bitrate[2] = 500;
+        cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[2];
+        encoder->Config(&cfg_);
+      }
+    }
    const vpx_rational_t tb = video->timebase();
    timebase_ = static_cast<double>(tb.num) / tb.den;
    duration_ = 0;
  }
+
+  virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+    vpx_svc_layer_id_t layer_id;
+    encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+    spatial_layer_id_ = layer_id.spatial_layer_id;
+    temporal_layer_id_ = layer_id.temporal_layer_id;
+    // Update buffer with per-layer target frame bandwidth, this is done
+    // for every frame passed to the encoder (encoded or dropped).
+    // For temporal layers, update the cumulative buffer level.
+    for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+      for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
+        const int layer = sl * number_temporal_layers_ + tl;
+        bits_in_buffer_model_[layer] +=
+            static_cast<int64_t>(layer_target_avg_bandwidth_[layer]);
+      }
+    }
+  }
+
+  vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
+                                         uint32_t sizes[8], int *count) {
+    uint8_t marker;
+    marker = *(data + data_sz - 1);
+    *count = 0;
+    if ((marker & 0xe0) == 0xc0) {
+      const uint32_t frames = (marker & 0x7) + 1;
+      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+      const size_t index_sz = 2 + mag * frames;
+      // This chunk is marked as having a superframe index but doesn't have
+      // enough data for it, thus it's an invalid superframe index.
+      if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
+      {
+        const uint8_t marker2 = *(data + data_sz - index_sz);
+        // This chunk is marked as having a superframe index but doesn't have
+        // the matching marker byte at the front of the index therefore it's an
+        // invalid chunk.
+        if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
+      }
+      {
+        uint32_t i, j;
+        const uint8_t *x = &data[data_sz - index_sz + 1];
+        for (i = 0; i < frames; ++i) {
+          uint32_t this_sz = 0;
+
+          for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
+          sizes[i] = this_sz;
+        }
+        *count = frames;
+      }
+    }
+    return VPX_CODEC_OK;
+  }
+
  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
-    if (last_pts_ == 0) duration = 1;
-    bits_in_buffer_model_ += static_cast<int64_t>(
-        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+    uint32_t sizes[8] = { 0 };
+    int count = 0;
+    last_pts_ = pkt->data.frame.pts;
    const bool key_frame =
        (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
-    if (!key_frame) {
-      // TODO(marpan): This check currently fails for some of the SVC tests,
-      // re-enable when issue (webm:1350) is resolved.
-      //  ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
-      //                                      << pkt->data.frame.pts;
+    parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf),
+                           pkt->data.frame.sz, sizes, &count);
+    if (!dynamic_drop_layer_) ASSERT_EQ(count, number_spatial_layers_);
+    for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+      sizes[sl] = sizes[sl] << 3;
+      // Update the total encoded bits per layer.
+      // For temporal layers, update the cumulative encoded bits per layer.
+      for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
+        const int layer = sl * number_temporal_layers_ + tl;
+        bits_total_[layer] += static_cast<int64_t>(sizes[sl]);
+        // Update the per-layer buffer level with the encoded frame size.
+        bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]);
+        // There should be no buffer underrun, except on the base
+        // temporal layer, since there may be key frames there.
+        if (!key_frame && tl > 0) {
+          ASSERT_GE(bits_in_buffer_model_[layer], 0)
+              << "Buffer Underrun at frame " << pkt->data.frame.pts;
+        }
+      }
+
+      ASSERT_EQ(pkt->data.frame.width[sl],
+                top_sl_width_ * svc_params_.scaling_factor_num[sl] /
+                    svc_params_.scaling_factor_den[sl]);
+
+      ASSERT_EQ(pkt->data.frame.height[sl],
+                top_sl_height_ * svc_params_.scaling_factor_num[sl] /
+                    svc_params_.scaling_factor_den[sl]);
    }
-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
-    bits_in_buffer_model_ -= static_cast<int64_t>(frame_size_in_bits);
-    bits_total_ += frame_size_in_bits;
-    if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
-    last_pts_ = pkt->data.frame.pts;
-    bits_in_last_frame_ = frame_size_in_bits;
-    ++frame_number_;
  }
+
  virtual void EndPassHook(void) {
-    if (bits_total_) {
-      const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
-      duration_ = (last_pts_ + 1) * timebase_;
-      file_datarate_ = file_size_in_kb / duration_;
+    for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+      for (int tl = 0; tl < number_temporal_layers_; ++tl) {
+        const int layer = sl * number_temporal_layers_ + tl;
+        const double file_size_in_kb = bits_total_[layer] / 1000.;
+        duration_ = (last_pts_ + 1) * timebase_;
+        file_datarate_[layer] = file_size_in_kb / duration_;
+      }
    }
  }

@@ -1294,13 +1524,11 @@ class DatarateOnePassCbrSvc
  unsigned int GetMismatchFrames() { return mismatch_nframes_; }

  vpx_codec_pts_t last_pts_;
-  int64_t bits_in_buffer_model_;
+  int64_t bits_in_buffer_model_[VPX_MAX_LAYERS];
  double timebase_;
-  int frame_number_;
-  vpx_codec_pts_t first_drop_;
-  int64_t bits_total_;
+  int64_t bits_total_[VPX_MAX_LAYERS];
  double duration_;
-  double file_datarate_;
+  double file_datarate_[VPX_MAX_LAYERS];
  size_t bits_in_last_frame_;
  vpx_svc_extra_cfg_t svc_params_;
  int speed_setting_;
@@ -1309,14 +1537,27 @@ class DatarateOnePassCbrSvc
  int denoiser_on_;
  int tune_content_;
  int base_speed_setting_;
+  int spatial_layer_id_;
+  int temporal_layer_id_;
+  int number_spatial_layers_;
+  int number_temporal_layers_;
+  int layer_target_avg_bandwidth_[VPX_MAX_LAYERS];
+  bool dynamic_drop_layer_;
+  unsigned int top_sl_width_;
+  unsigned int top_sl_height_;
+  vpx_svc_ref_frame_config_t ref_frame_config;
+  int update_pattern_;
 };
 static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
                                  const vpx_svc_extra_cfg_t *svc_params,
                                  int spatial_layers, int temporal_layers,
-                                  int temporal_layering_mode) {
+                                  int temporal_layering_mode,
+                                  int *layer_target_avg_bandwidth,
+                                  int64_t *bits_in_buffer_model) {
  int sl, spatial_layer_target;
  float total = 0;
  float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
+  float framerate = 30.0;
  for (sl = 0; sl < spatial_layers; ++sl) {
    if (svc_params->scaling_factor_den[sl] > 0) {
      alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 /
@@ -1336,8 +1577,41 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
    } else if (temporal_layering_mode == 2) {
      enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
      enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target;
+    } else if (temporal_layering_mode <= 1) {
+      enc_cfg->layer_target_bitrate[index] = spatial_layer_target;
    }
  }
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    for (int tl = 0; tl < temporal_layers; ++tl) {
+      const int layer = sl * temporal_layers + tl;
+      float layer_framerate = framerate;
+      if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2;
+      if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4;
+      if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2;
+      layer_target_avg_bandwidth[layer] = static_cast<int>(
+          enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate);
+      bits_in_buffer_model[layer] =
+          enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz;
+    }
+  }
+}
+
+static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg,
+                                    int number_spatial_layers,
+                                    int number_temporal_layers,
+                                    double *file_datarate,
+                                    double thresh_overshoot,
+                                    double thresh_undershoot) {
+  for (int sl = 0; sl < number_spatial_layers; ++sl)
+    for (int tl = 0; tl < number_temporal_layers; ++tl) {
+      const int layer = sl * number_temporal_layers + tl;
+      ASSERT_GE(cfg->layer_target_bitrate[layer],
+                file_datarate[layer] * thresh_overshoot)
+          << " The datarate for the file exceeds the target by too much!";
+      ASSERT_LE(cfg->layer_target_bitrate[layer],
+                file_datarate[layer] * thresh_undershoot)
+          << " The datarate for the file is lower than the target by too much!";
+    }
 }

 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
@@ -1363,14 +1637,21 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) {
  svc_params_.scaling_factor_den[1] = 288;
  cfg_.rc_dropframe_thresh = 10;
  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
  cfg_.rc_target_bitrate = 500;
  ResetModel();
  tune_content_ = 1;
  base_speed_setting_ = speed_setting_;
  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 }

@@ -1398,26 +1679,30 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) {
  svc_params_.scaling_factor_den[1] = 288;
  cfg_.rc_dropframe_thresh = 0;
  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 200);
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
  // TODO(marpan): Check that effective_datarate for each layer hits the
  // layer target_bitrate.
  for (int i = 200; i <= 800; i += 200) {
    cfg_.rc_target_bitrate = i;
    ResetModel();
    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                          cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                          layer_target_avg_bandwidth_, bits_in_buffer_model_);
    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-        << " The datarate for the file is lower than the target by too much!";
+    CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                            number_temporal_layers_, file_datarate_, 0.78,
+                            1.15);
 #if CONFIG_VP9_DECODER
    // Number of temporal layers > 1, so half of the frames in this SVC pattern
    // will be non-reference frame and hence encoder will avoid loopfilter.
-    // Since frame dropper is off, we can expcet 100 (half of the sequence)
+    // Since frame dropper is off, we can expect 200 (half of the sequence)
    // mismatched frames.
-    EXPECT_EQ(static_cast<unsigned int>(100), GetMismatchFrames());
+    EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
 #endif
  }
 }
@@ -1446,33 +1731,43 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) {
  svc_params_.scaling_factor_den[1] = 288;
  cfg_.rc_dropframe_thresh = 0;
  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
  // TODO(marpan): Check that effective_datarate for each layer hits the
  // layer target_bitrate.
-  for (int i = 600; i <= 1000; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    denoiser_on_ = 1;
-    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-        << " The datarate for the file is lower than the target by too much!";
+  // For SVC, noise_sen = 1 means denoising only the top spatial layer
+  // noise_sen = 2 means denoising the two top spatial layers.
+  for (int noise_sen = 1; noise_sen <= 2; noise_sen++) {
+    for (int i = 600; i <= 1000; i += 200) {
+      cfg_.rc_target_bitrate = i;
+      ResetModel();
+      denoiser_on_ = noise_sen;
+      assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+                            cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                            layer_target_avg_bandwidth_, bits_in_buffer_model_);
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                              number_temporal_layers_, file_datarate_, 0.78,
+                              1.15);
 #if CONFIG_VP9_DECODER
-    // Number of temporal layers > 1, so half of the frames in this SVC pattern
-    // will be non-reference frame and hence encoder will avoid loopfilter.
-    // Since frame dropper is off, we can expcet 150 (half of the sequence)
-    // mismatched frames.
-    EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
+      // Number of temporal layers > 1, so half of the frames in this SVC
+      // pattern
+      // will be non-reference frame and hence encoder will avoid loopfilter.
+      // Since frame dropper is off, we can expect 200 (half of the sequence)
+      // mismatched frames.
+      EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
 #endif
+    }
  }
 }

 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
 // temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
-TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) {
  cfg_.rc_buf_initial_sz = 500;
  cfg_.rc_buf_optimal_sz = 500;
  cfg_.rc_buf_sz = 1000;
@@ -1493,21 +1788,25 @@ TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) {
  svc_params_.scaling_factor_num[1] = 288;
  svc_params_.scaling_factor_den[1] = 288;
  cfg_.rc_dropframe_thresh = 10;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 200);
  cfg_.rc_target_bitrate = 400;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
  for (int j = 64; j <= 67; j++) {
    cfg_.kf_max_dist = j;
    ResetModel();
    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                          cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                          layer_target_avg_bandwidth_, bits_in_buffer_model_);
    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-        << " The datarate for the file is lower than the target by too much!";
+    CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                            number_temporal_layers_, file_datarate_, 0.78,
+                            1.15);
  }
 }

@@ -1535,22 +1834,25 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) {
  svc_params_.scaling_factor_den[1] = 288;
  cfg_.rc_dropframe_thresh = 0;
  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
  cfg_.rc_target_bitrate = 800;
  ResetModel();
  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
-      << " The datarate for the file exceeds the target by too much!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-      << " The datarate for the file is lower than the target by too much!";
+  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
 #if CONFIG_VP9_DECODER
  // Number of temporal layers > 1, so half of the frames in this SVC pattern
  // will be non-reference frame and hence encoder will avoid loopfilter.
-  // Since frame dropper is off, we can expcet 150 (half of the sequence)
+  // Since frame dropper is off, we can expect 30 (half of the sequence)
  // mismatched frames.
-  EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
+  EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames());
 #endif
 }

@@ -1580,25 +1882,126 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) {
  svc_params_.scaling_factor_den[2] = 288;
  cfg_.rc_dropframe_thresh = 0;
  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
  cfg_.rc_target_bitrate = 800;
  ResetModel();
  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
-      << " The datarate for the file exceeds the target by too much!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
-      << " The datarate for the file is lower than the target by too much!";
+  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
 #if CONFIG_VP9_DECODER
  // Number of temporal layers > 1, so half of the frames in this SVC pattern
  // will be non-reference frame and hence encoder will avoid loopfilter.
-  // Since frame dropper is off, we can expcet 150 (half of the sequence)
+  // Since frame dropper is off, we can expect 200 (half of the sequence)
  // mismatched frames.
-  EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
+  EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
 #endif
 }

+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one
+// generate via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables
+// inter-layer prediction.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL2TLDynamicPatternChange) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 2;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  // Change SVC pattern on the fly.
+  update_pattern_ = 1;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
+#if CONFIG_VP9_DECODER
+  // Number of temporal layers > 1, so half of the frames in this SVC pattern
+  // will be non-reference frame and hence encoder will avoid loopfilter.
+  // Since frame dropper is off, we can expect 200 (half of the sequence)
+  // mismatched frames.
+  EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and on
+// the fly switching to 2 spatial layers and then back to 3. This switch is done
+// by setting top spatial layer bitrate to 0, and then back to non-zero, during
+// the sequence.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL_to_2SL_dynamic) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 0;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  dynamic_drop_layer_ = true;
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Don't check rate targeting on top spatial layer since it will be skipped
+  // for part of the sequence.
+  CheckLayerRateTargeting(&cfg_, number_spatial_layers_ - 1,
+                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
+}
+
 // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
 // temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
 TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) {
@@ -1624,20 +2027,25 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) {
  svc_params_.scaling_factor_num[2] = 288;
  svc_params_.scaling_factor_den[2] = 288;
  cfg_.rc_dropframe_thresh = 10;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
  cfg_.rc_target_bitrate = 800;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
  for (int j = 32; j <= 35; j++) {
    cfg_.kf_max_dist = j;
    ResetModel();
    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                          cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                          layer_target_avg_bandwidth_, bits_in_buffer_model_);
    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30)
-        << " The datarate for the file is lower than the target by too much!";
+    CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                            number_temporal_layers_, file_datarate_, 0.78,
+                            1.15);
  }
 }

@@ -1667,22 +2075,25 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) {
  svc_params_.scaling_factor_den[2] = 288;
  cfg_.rc_dropframe_thresh = 0;
  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
  cfg_.rc_target_bitrate = 800;
  ResetModel();
  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                        layer_target_avg_bandwidth_, bits_in_buffer_model_);
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78)
-      << " The datarate for the file exceeds the target by too much!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
-      << " The datarate for the file is lower than the target by too much!";
+  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
 #if CONFIG_VP9_DECODER
  // Number of temporal layers > 1, so half of the frames in this SVC pattern
  // will be non-reference frame and hence encoder will avoid loopfilter.
-  // Since frame dropper is off, we can expcet 150 (half of the sequence)
+  // Since frame dropper is off, we can expect 30 (half of the sequence)
  // mismatched frames.
-  EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames());
+  EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames());
 #endif
 }

@@ -1714,9 +2125,21 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) {
  cfg_.layer_target_bitrate[0] = 300;
  cfg_.layer_target_bitrate[1] = 1400;
  cfg_.rc_target_bitrate = 1700;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
  ResetModel();
+  layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30;
+  bits_in_buffer_model_[0] =
+      cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz;
+  layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30;
+  bits_in_buffer_model_[1] =
+      cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                          number_temporal_layers_, file_datarate_, 0.78, 1.15);
  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 }

@@ -1729,6 +2152,9 @@ VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
                          ::testing::Values(::libvpx_test::kOnePassGood,
                                            ::libvpx_test::kRealTime),
                          ::testing::Range(2, 9));
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(5, 9));
 #if CONFIG_VP9_TEMPORAL_DENOISING
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser,
                          ::testing::Values(::libvpx_test::kRealTime),
--- a/test/dct_partial_test.cc
+++ b/test/dct_partial_test.cc
@@ -28,8 +28,8 @@

 using libvpx_test::ACMRandom;
 using libvpx_test::Buffer;
-using std::tr1::tuple;
 using std::tr1::make_tuple;
+using std::tr1::tuple;

 namespace {
 typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride);
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -106,4 +106,90 @@ TEST(EncodeAPI, ImageSizeSetting) {
 }
 #endif

+// Set up 2 spatial streams with 2 temporal layers per stream, and generate
+// invalid configuration by setting the temporal layer rate allocation
+// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of
+// CONFIG_MULTI_RES_ENCODING.
+TEST(EncodeAPI, MultiResEncode) {
+  static const vpx_codec_iface_t *kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+    &vpx_codec_vp8_cx_algo,
+#endif
+#if CONFIG_VP9_ENCODER
+    &vpx_codec_vp9_cx_algo,
+#endif
+  };
+  const int width = 1280;
+  const int height = 720;
+  const int width_down = width / 2;
+  const int height_down = height / 2;
+  const int target_bitrate = 1000;
+  const int framerate = 30;
+
+  for (int c = 0; c < NELEMENTS(kCodecs); ++c) {
+    const vpx_codec_iface_t *const iface = kCodecs[c];
+    vpx_codec_ctx_t enc[2];
+    vpx_codec_enc_cfg_t cfg[2];
+    vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
+
+    memset(enc, 0, sizeof(enc));
+
+    for (int i = 0; i < 2; i++) {
+      vpx_codec_enc_config_default(iface, &cfg[i], 0);
+    }
+
+    /* Highest-resolution encoder settings */
+    cfg[0].g_w = width;
+    cfg[0].g_h = height;
+    cfg[0].rc_dropframe_thresh = 0;
+    cfg[0].rc_end_usage = VPX_CBR;
+    cfg[0].rc_resize_allowed = 0;
+    cfg[0].rc_min_quantizer = 2;
+    cfg[0].rc_max_quantizer = 56;
+    cfg[0].rc_undershoot_pct = 100;
+    cfg[0].rc_overshoot_pct = 15;
+    cfg[0].rc_buf_initial_sz = 500;
+    cfg[0].rc_buf_optimal_sz = 600;
+    cfg[0].rc_buf_sz = 1000;
+    cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+    cfg[0].g_lag_in_frames = 0;
+
+    cfg[0].kf_mode = VPX_KF_AUTO;
+    cfg[0].kf_min_dist = 3000;
+    cfg[0].kf_max_dist = 3000;
+
+    cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */
+    cfg[0].g_timebase.num = 1;                 /* Set fps */
+    cfg[0].g_timebase.den = framerate;
+
+    memcpy(&cfg[1], &cfg[0], sizeof(cfg[0]));
+    cfg[1].rc_target_bitrate = 500;
+    cfg[1].g_w = width_down;
+    cfg[1].g_h = height_down;
+
+    for (int i = 0; i < 2; i++) {
+      cfg[i].ts_number_layers = 2;
+      cfg[i].ts_periodicity = 2;
+      cfg[i].ts_rate_decimator[0] = 2;
+      cfg[i].ts_rate_decimator[1] = 1;
+      cfg[i].ts_layer_id[0] = 0;
+      cfg[i].ts_layer_id[1] = 1;
+      // Invalid parameters.
+      cfg[i].ts_target_bitrate[0] = 0;
+      cfg[i].ts_target_bitrate[1] = 0;
+    }
+
+    // VP9 should report incapable, VP8 invalid for all configurations.
+    const char kVP9Name[] = "WebM Project VP9";
+    const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface),
+                                sizeof(kVP9Name) - 1) == 0;
+    EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
+
+    for (int i = 0; i < 2; i++) {
+      vpx_codec_destroy(&enc[i]);
+    }
+  }
+}
+
 }  // namespace
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -201,6 +201,8 @@ void EncoderTest::RunLoop(VideoSource *video) {
      PreEncodeFrameHook(video, encoder.get());
      encoder->EncodeFrame(video, frame_flags_);

+      PostEncodeFrameHook(encoder.get());
+
      CxDataIterator iter = encoder->GetCxData();

      bool has_cxdata = false;
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -128,6 +128,11 @@ class Encoder {
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
  }

+  void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
  void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@@ -137,15 +142,12 @@ class Encoder {
    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
  }
-#endif

-#if CONFIG_VP8_ENCODER
  void Control(int ctrl_id, vpx_roi_map_t *arg) {
    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
  }
 #endif
-
  void Config(const vpx_codec_enc_cfg_t *cfg) {
    const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@@ -219,6 +221,8 @@ class EncoderTest {
  virtual void PreEncodeFrameHook(VideoSource * /*video*/,
                                  Encoder * /*encoder*/) {}

+  virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {}
+
  // Hook to be called on every compressed data packet.
  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}

--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -675,7 +675,9 @@ INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
                        ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
                                                     &vpx_idct8x8_64_add_neon,
                                                     0, VPX_BITS_8)));
-#if !CONFIG_VP9_HIGHBITDEPTH
+// TODO(linfengz): reenable these functions once test vector failures are
+// addressed.
+#if 0   // !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
    NEON, FwdTrans8x8HT,
    ::testing::Values(
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -174,4 +174,4 @@ INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
 INSTANTIATE_TEST_CASE_P(MMI, IDCTTest,
                        ::testing::Values(vp8_short_idct4x4llm_mmi));
 #endif  // HAVE_MMI
-}
+}  // namespace
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -123,6 +123,7 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
 #if CONFIG_VP8_DECODER
 const DecodeParam kVP8InvalidFileTests[] = {
  { 1, "invalid-bug-1443.ivf" },
+  { 1, "invalid-token-partition.ivf" },
 };

 VP8_INSTANTIATE_TEST_CASE(InvalidFileTest,
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -114,6 +114,18 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
  }
 }

+uint8_t GetOuterThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(3 * MAX_LOOP_FILTER + 5));
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1));
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4);
+}
+
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
 public:
  virtual ~Loop8Test6Param() {}
@@ -162,15 +174,15 @@ TEST_P(Loop8Test6Param, OperationCheck) {
  int first_failure = -1;
  for (int i = 0; i < count_test_block; ++i) {
    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -221,15 +233,15 @@ TEST_P(Loop8Test6Param, ValueCheck) {

  for (int i = 0; i < count_test_block; ++i) {
    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -271,27 +283,27 @@ TEST_P(Loop8Test9Param, OperationCheck) {
  int first_failure = -1;
  for (int i = 0; i < count_test_block; ++i) {
    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -334,27 +346,27 @@ TEST_P(Loop8Test9Param, ValueCheck) {
  int first_failure = -1;
  for (int i = 0; i < count_test_block; ++i) {
    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -277,12 +277,29 @@ class ResizeTest
    SetMode(GET_PARAM(1));
  }

+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
+    ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
+    encode_frame_width_.push_back(pkt->data.frame.width[0]);
+    encode_frame_height_.push_back(pkt->data.frame.height[0]);
+  }
+
+  unsigned int GetFrameWidth(size_t idx) const {
+    return encode_frame_width_[idx];
+  }
+
+  unsigned int GetFrameHeight(size_t idx) const {
+    return encode_frame_height_[idx];
+  }
+
  virtual void DecompressedFrameHook(const vpx_image_t &img,
                                     vpx_codec_pts_t pts) {
    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
  }

  std::vector<FrameInfo> frame_info_list_;
+  std::vector<unsigned int> encode_frame_width_;
+  std::vector<unsigned int> encode_frame_height_;
 };

 TEST_P(ResizeTest, TestExternalResizeWorks) {
@@ -296,6 +313,9 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
    const unsigned int frame = static_cast<unsigned>(info->pts);
    unsigned int expected_w;
    unsigned int expected_h;
+    const size_t idx = info - frame_info_list_.begin();
+    ASSERT_EQ(info->w, GetFrameWidth(idx));
+    ASSERT_EQ(info->h, GetFrameHeight(idx));
    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
                        &expected_h, 0);
    EXPECT_EQ(expected_w, info->w)
@@ -464,8 +484,23 @@ class ResizeRealtimeTest
    ++mismatch_nframes_;
  }

+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
+    ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
+    encode_frame_width_.push_back(pkt->data.frame.width[0]);
+    encode_frame_height_.push_back(pkt->data.frame.height[0]);
+  }
+
  unsigned int GetMismatchFrames() { return mismatch_nframes_; }

+  unsigned int GetFrameWidth(size_t idx) const {
+    return encode_frame_width_[idx];
+  }
+
+  unsigned int GetFrameHeight(size_t idx) const {
+    return encode_frame_height_[idx];
+  }
+
  void DefaultConfig() {
    cfg_.rc_buf_initial_sz = 500;
    cfg_.rc_buf_optimal_sz = 600;
@@ -493,6 +528,8 @@ class ResizeRealtimeTest
  bool change_bitrate_;
  double mismatch_psnr_;
  int mismatch_nframes_;
+  std::vector<unsigned int> encode_frame_width_;
+  std::vector<unsigned int> encode_frame_height_;
 };

 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
@@ -582,6 +619,9 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
  int resize_count = 0;
  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
       info != frame_info_list_.end(); ++info) {
+    const size_t idx = info - frame_info_list_.begin();
+    ASSERT_EQ(info->w, GetFrameWidth(idx));
+    ASSERT_EQ(info->h, GetFrameHeight(idx));
    if (info->w != last_w || info->h != last_h) {
      resize_count++;
      if (resize_count == 1) {
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -112,8 +112,9 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_SSE2

 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest, ::testing::Values(make_tuple(
-                                                 &vpx_sum_squares_2d_i16_c,
-                                                 &vpx_sum_squares_2d_i16_msa)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, SumSquaresTest,
+    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+                                 &vpx_sum_squares_2d_i16_msa)));
 #endif  // HAVE_MSA
 }  // namespace
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -734,6 +734,8 @@ endif  # CONFIG_VP9_HIGHBITDEPTH
 # Invalid files for testing libvpx error checking.
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -852,5 +852,7 @@ e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res
 fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf
 fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res
+1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf
+90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res
 17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm
 e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -61,7 +61,6 @@ int main(int argc, char **argv) {
 #if !CONFIG_SHARED
 // Shared library builds don't support whitebox tests
 // that exercise internal symbols.
-
 #if CONFIG_VP8
  vp8_rtcd();
 #endif  // CONFIG_VP8
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@@ -27,8 +27,8 @@

 namespace {

-using std::string;
 using libvpx_test::ACMRandom;
+using std::string;

 #if CONFIG_WEBM_IO

--- a/test/vp9_end_to_end_test.cc
+++ b/test/vp9_end_to_end_test.cc
@@ -59,7 +59,7 @@ const TestVideoParam kTestVectors[] = {
 // Encoding modes tested
 const libvpx_test::TestMode kEncodingModeVectors[] = {
  ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
-  ::libvpx_test::kRealTime,
+  ::libvpx_test::kRealTime
 };

 // Speed settings tested
--- a/test/vp9_motion_vector_test.cc
+++ b/test/vp9_motion_vector_test.cc
@@ -22,7 +22,7 @@ namespace {
 // Encoding modes
 const libvpx_test::TestMode kEncodingModeVectors[] = {
  ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
-  ::libvpx_test::kRealTime,
+  ::libvpx_test::kRealTime
 };

 // Encoding speeds
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -14,9 +14,9 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
-#include "./vp9_rtcd.h"
 #include "test/acm_random.h"
 #include "test/buffer.h"
 #include "test/clear_system_state.h"
@@ -42,7 +42,7 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
                             uint16_t *eob, const int16_t *scan,
                             const int16_t *iscan);
 typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
-                        int /*max_size*/>
+                        int /*max_size*/, bool /*is_fp*/>
    QuantizeParam;

 // Wrapper for FP version which does not use zbin or quant_shift.
@@ -69,11 +69,15 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,

 class VP9QuantizeBase {
 public:
-  VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size)
-      : bit_depth_(bit_depth), max_size_(max_size) {
+  VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
+      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
    max_value_ = (1 << bit_depth_) - 1;
    zbin_ptr_ =
        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
+    round_fp_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
+    quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
    round_ptr_ =
        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
    quant_ptr_ =
@@ -86,11 +90,15 @@ class VP9QuantizeBase {

  ~VP9QuantizeBase() {
    vpx_free(zbin_ptr_);
+    vpx_free(round_fp_ptr_);
+    vpx_free(quant_fp_ptr_);
    vpx_free(round_ptr_);
    vpx_free(quant_ptr_);
    vpx_free(quant_shift_ptr_);
    vpx_free(dequant_ptr_);
    zbin_ptr_ = NULL;
+    round_fp_ptr_ = NULL;
+    quant_fp_ptr_ = NULL;
    round_ptr_ = NULL;
    quant_ptr_ = NULL;
    quant_shift_ptr_ = NULL;
@@ -100,6 +108,8 @@ class VP9QuantizeBase {

 protected:
  int16_t *zbin_ptr_;
+  int16_t *round_fp_ptr_;
+  int16_t *quant_fp_ptr_;
  int16_t *round_ptr_;
  int16_t *quant_ptr_;
  int16_t *quant_shift_ptr_;
@@ -107,29 +117,136 @@ class VP9QuantizeBase {
  const vpx_bit_depth_t bit_depth_;
  int max_value_;
  const int max_size_;
+  const bool is_fp_;
 };

 class VP9QuantizeTest : public VP9QuantizeBase,
                        public ::testing::TestWithParam<QuantizeParam> {
 public:
  VP9QuantizeTest()
-      : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3)), quantize_op_(GET_PARAM(0)),
-        ref_quantize_op_(GET_PARAM(1)) {}
+      : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)),
+        quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}

 protected:
  const QuantizeFunc quantize_op_;
  const QuantizeFunc ref_quantize_op_;
 };

+// This quantizer compares the AC coefficients to the quantization step size to
+// determine if further multiplication operations are needed.
+// Based on vp9_quantize_fp_sse2().
+inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        int skip_block, const int16_t *round_ptr,
+                        const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan, int is_32x32) {
+  int i, eob = -1;
+  const int thr = dequant_ptr[1] >> (1 + is_32x32);
+  (void)iscan;
+  (void)skip_block;
+  assert(!skip_block);
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < n_coeffs; i += 16) {
+    int y;
+    int nzflag_cnt = 0;
+    int abs_coeff[16];
+    int coeff_sign[16];
+
+    // count nzflag for each row (16 tran_low_t)
+    for (y = 0; y < 16; ++y) {
+      const int rc = i + y;
+      const int coeff = coeff_ptr[rc];
+      coeff_sign[y] = (coeff >> 31);
+      abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y];
+      // The first 16 are skipped in the sse2 code.  Do the same here to match.
+      if (i >= 16 && (abs_coeff[y] <= thr)) {
+        nzflag_cnt++;
+      }
+    }
+
+    for (y = 0; y < 16; ++y) {
+      const int rc = i + y;
+      // If all of the AC coeffs in a row has magnitude less than the
+      // quantization step_size/2, quantize to zero.
+      if (nzflag_cnt < 16) {
+        int tmp;
+        int _round;
+
+        if (is_32x32) {
+          _round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        } else {
+          _round = round_ptr[rc != 0];
+        }
+        tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX);
+        tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32);
+        qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y];
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (is_32x32) {
+          dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+        } else {
+          dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+        }
+      } else {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+      }
+    }
+  }
+
+  // Scan for eob.
+  for (i = 0; i < n_coeffs; i++) {
+    // Use the scan order to find the correct eob.
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan) {
+  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
+              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
+}
+
+void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *round_ptr,
+                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                            uint16_t *eob_ptr, const int16_t *scan,
+                            const int16_t *iscan) {
+  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
+              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
+}
+
 void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
                          int16_t *quant, int16_t *quant_shift,
-                          int16_t *dequant) {
+                          int16_t *dequant, int16_t *round_fp,
+                          int16_t *quant_fp) {
+  // Max when q == 0.  Otherwise, it is 48 for Y and 42 for U/V.
+  const int max_qrounding_factor_fp = 64;
+
  for (int j = 0; j < 2; j++) {
+    // The range is 4 to 1828 in the VP9 tables.
+    const int qlookup = rnd->RandRange(1825) + 4;
+    round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
+    quant_fp[j] = (1 << 16) / qlookup;
+
    // Values determined by deconstructing vp9_init_quantizer().
    // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
    // values or U/V values of any bit depth. This is because y_delta is not
    // factored into the vp9_ac_quant() call.
    zbin[j] = rnd->RandRange(1200);
+
    // round may be up to 685 for Y values or 914 for U/V.
    round[j] = rnd->RandRange(914);
    // quant ranges from 1 to -32703
@@ -141,6 +258,8 @@ void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
  }
  for (int j = 2; j < 8; j++) {
    zbin[j] = zbin[1];
+    round_fp[j] = round_fp[1];
+    quant_fp[j] = quant_fp[1];
    round[j] = round[1];
    quant[j] = quant[1];
    quant_shift[j] = quant_shift[1];
@@ -179,19 +298,19 @@ TEST_P(VP9QuantizeTest, OperationCheck) {
    const int count = (4 << sz) * (4 << sz);
    coeff.Set(&rnd, -max_value_, max_value_);
    GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
-                         quant_shift_ptr_, dequant_ptr_);
+                         quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                         quant_fp_ptr_);
+    int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+    int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+    ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+                     q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+                     scan_order->scan, scan_order->iscan);

-    ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
-                     round_ptr_, quant_ptr_, quant_shift_ptr_,
-                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &ref_eob, scan_order->scan,
-                     scan_order->iscan);
-
-    ASM_REGISTER_STATE_CHECK(
-        quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
-                     round_ptr_, quant_ptr_, quant_shift_ptr_,
-                     qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+    ASM_REGISTER_STATE_CHECK(quantize_op_(
+        coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
+        quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
+        dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));

    EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
    EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
@@ -241,19 +360,19 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
    coeff.TopLeftPixel()[rnd(count)] =
        static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
    GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
-                         quant_shift_ptr_, dequant_ptr_);
+                         quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                         quant_fp_ptr_);
+    int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+    int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+    ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+                     q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+                     scan_order->scan, scan_order->iscan);

-    ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
-                     round_ptr_, quant_ptr_, quant_shift_ptr_,
-                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &ref_eob, scan_order->scan,
-                     scan_order->iscan);
-
-    ASM_REGISTER_STATE_CHECK(
-        quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
-                     round_ptr_, quant_ptr_, quant_shift_ptr_,
-                     qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+    ASM_REGISTER_STATE_CHECK(quantize_op_(
+        coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
+        quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
+        dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));

    EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
    EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
@@ -299,7 +418,10 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) {
      const int count = (4 << sz) * (4 << sz);

      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
-                           quant_shift_ptr_, dequant_ptr_);
+                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                           quant_fp_ptr_);
+      int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+      int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;

      if (i == 0) {
        // When |coeff values| are less than zbin the results are 0.
@@ -319,10 +441,10 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) {
      vpx_usec_timer timer;
      vpx_usec_timer_start(&timer);
      for (int j = 0; j < 100000000 / count; ++j) {
-        quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_,
-                     round_ptr_, quant_ptr_, quant_shift_ptr_,
-                     qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &eob, scan_order->scan, scan_order->iscan);
+        quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+                     q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(),
+                     dqcoeff.TopLeftPixel(), dequant_ptr_, &eob,
+                     scan_order->scan, scan_order->iscan);
      }
      vpx_usec_timer_mark(&timer);
      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
@@ -345,50 +467,54 @@ INSTANTIATE_TEST_CASE_P(
    SSE2, VP9QuantizeTest,
    ::testing::Values(
        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16),
+                   VPX_BITS_8, 16, false),
        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16),
+                   VPX_BITS_10, 16, false),
        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16),
+                   VPX_BITS_12, 16, false),
        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32),
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32),
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32)));
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));

 #else
-INSTANTIATE_TEST_CASE_P(SSE2, VP9QuantizeTest,
-                        ::testing::Values(make_tuple(&vpx_quantize_b_sse2,
-                                                     &vpx_quantize_b_c,
-                                                     VPX_BITS_8, 16)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSE2, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
-                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
-                                 16)));
+    SSE2, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2

 #if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
+                                 &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                                 VPX_BITS_8, 32, true)));
+#else
 INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest,
                        ::testing::Values(make_tuple(&vpx_quantize_b_ssse3,
                                                     &vpx_quantize_b_c,
-                                                     VPX_BITS_8, 16)));
+                                                     VPX_BITS_8, 16, false)));
+#endif

 #if ARCH_X86_64
 // TODO(johannkoenig): SSSE3 optimizations do not yet pass this test.
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32),
-                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
-                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
-                                 16),
-                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
-                                 &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
-                                 VPX_BITS_8, 32)));
+INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest,
+                        ::testing::Values(make_tuple(
+                            &vpx_quantize_b_32x32_ssse3,
+                            &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false)));
 #endif  // ARCH_X86_64
 #endif  // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH

@@ -398,36 +524,54 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
    AVX, VP9QuantizeTest,
    ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16),
+                                 VPX_BITS_8, 16, false),
                      // Even though SSSE3 and AVX do not match the reference
                      // code, we can keep them in sync with each other.
                      make_tuple(&vpx_quantize_b_32x32_avx,
-                                 &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32)));
+                                 &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32,
+                                 false)));
 #endif  // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH

+#if ARCH_X86_64 && HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+
 // TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
 #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
    NEON, VP9QuantizeTest,
-    ::testing::Values(
-        make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16),
-        make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
-                   VPX_BITS_8, 32),
-        make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
-                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16),
-        make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
-                   &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32)));
+    ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&vpx_quantize_b_32x32_neon,
+                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+                                 false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+                                 &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+                                 VPX_BITS_8, 32, true)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH

 // Only useful to compare "Speed" test results.
 INSTANTIATE_TEST_CASE_P(
    DISABLED_C, VP9QuantizeTest,
    ::testing::Values(
-        make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16),
+        make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
        make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
-                   32),
+                   32, false),
        make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
-                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16),
+                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                   &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32,
+                   true),
        make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
-                   &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32)));
+                   &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+                   true)));
 }  // namespace
--- a/test/vp9_scale_test.cc
+++ b/test/vp9_scale_test.cc
@@ -47,7 +47,7 @@ class ScaleTest : public VpxScaleBase,
        scale_fn_(&img_, &dst_img_, filter_type, phase_scaler));
  }

-  void RunTest() {
+  void RunTest(INTERP_FILTER filter_type) {
    static const int kNumSizesToTest = 20;
    static const int kNumScaleFactorsToTest = 4;
    static const int kSizesToTest[] = {
@@ -55,50 +55,48 @@ class ScaleTest : public VpxScaleBase,
      22, 24, 26, 28, 30, 32, 34, 68, 128, 134
    };
    static const int kScaleFactors[] = { 1, 2, 3, 4 };
-    for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
-      for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
-        for (int h = 0; h < kNumSizesToTest; ++h) {
-          const int src_height = kSizesToTest[h];
-          for (int w = 0; w < kNumSizesToTest; ++w) {
-            const int src_width = kSizesToTest[w];
-            for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
-                 ++sf_up_idx) {
-              const int sf_up = kScaleFactors[sf_up_idx];
-              for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
-                   ++sf_down_idx) {
-                const int sf_down = kScaleFactors[sf_down_idx];
-                const int dst_width = src_width * sf_up / sf_down;
-                const int dst_height = src_height * sf_up / sf_down;
-                if (sf_up == sf_down && sf_up != 1) {
-                  continue;
-                }
-                // I420 frame width and height must be even.
-                if (!dst_width || !dst_height || dst_width & 1 ||
-                    dst_height & 1) {
-                  continue;
-                }
-                // vpx_convolve8_c() has restriction on the step which cannot
-                // exceed 64 (ratio 1 to 4).
-                if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
-                  continue;
-                }
-                ASSERT_NO_FATAL_FAILURE(ResetScaleImages(
-                    src_width, src_height, dst_width, dst_height));
-                ReferenceScaleFrame(filter_type, phase_scaler);
-                ScaleFrame(filter_type, phase_scaler);
-                if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
-                           ref_img_.frame_size)) {
-                  printf(
-                      "filter_type = %d, phase_scaler = %d, src_width = %4d, "
-                      "src_height = %4d, dst_width = %4d, dst_height = %4d, "
-                      "scale factor = %d:%d\n",
-                      filter_type, phase_scaler, src_width, src_height,
-                      dst_width, dst_height, sf_down, sf_up);
-                  PrintDiff();
-                }
-                CompareImages(dst_img_);
-                DeallocScaleImages();
+    for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
+      for (int h = 0; h < kNumSizesToTest; ++h) {
+        const int src_height = kSizesToTest[h];
+        for (int w = 0; w < kNumSizesToTest; ++w) {
+          const int src_width = kSizesToTest[w];
+          for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
+               ++sf_up_idx) {
+            const int sf_up = kScaleFactors[sf_up_idx];
+            for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
+                 ++sf_down_idx) {
+              const int sf_down = kScaleFactors[sf_down_idx];
+              const int dst_width = src_width * sf_up / sf_down;
+              const int dst_height = src_height * sf_up / sf_down;
+              if (sf_up == sf_down && sf_up != 1) {
+                continue;
              }
+              // I420 frame width and height must be even.
+              if (!dst_width || !dst_height || dst_width & 1 ||
+                  dst_height & 1) {
+                continue;
+              }
+              // vpx_convolve8_c() has restriction on the step which cannot
+              // exceed 64 (ratio 1 to 4).
+              if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
+                continue;
+              }
+              ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height,
+                                                       dst_width, dst_height));
+              ReferenceScaleFrame(filter_type, phase_scaler);
+              ScaleFrame(filter_type, phase_scaler);
+              if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
+                         ref_img_.frame_size)) {
+                printf(
+                    "filter_type = %d, phase_scaler = %d, src_width = %4d, "
+                    "src_height = %4d, dst_width = %4d, dst_height = %4d, "
+                    "scale factor = %d:%d\n",
+                    filter_type, phase_scaler, src_width, src_height, dst_width,
+                    dst_height, sf_down, sf_up);
+                PrintDiff();
+              }
+              CompareImages(dst_img_);
+              DeallocScaleImages();
            }
          }
        }
@@ -145,7 +143,10 @@ class ScaleTest : public VpxScaleBase,
  ScaleFrameFunc scale_fn_;
 };

-TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); }
+TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); }
+TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); }
+TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); }
+TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); }

 TEST_P(ScaleTest, DISABLED_Speed) {
  static const int kCountSpeedTestBlock = 100;
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -147,7 +147,6 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) {

 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests
-
 #if CONFIG_WEBM_IO
 struct FileList {
  const char *name;
--- a/tools/all_builds.py
+++ b/tools/all_builds.py
@@ -1,72 +0,0 @@
-#!/usr/bin/python
-
-import getopt
-import subprocess
-import sys
-
-LONG_OPTIONS = ["shard=", "shards="]
-BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental"
-
-def RunCommand(command):
-  run = subprocess.Popen(command, shell=True)
-  output = run.communicate()
-  if run.returncode:
-    print "Non-zero return code: " + str(run.returncode) + " => exiting!"
-    sys.exit(1)
-
-def list_of_experiments():
-  experiments = []
-  configure_file = open("configure")
-  list_start = False
-  for line in configure_file.read().split("\n"):
-    if line == 'EXPERIMENT_LIST="':
-      list_start = True
-    elif line == '"':
-      list_start = False
-    elif list_start:
-      currently_broken = ["csm"]
-      experiment = line[4:]
-      if experiment not in currently_broken:
-        experiments.append(experiment)
-  return experiments
-
-def main(argv):
-  # Parse arguments
-  options = {"--shard": 0, "--shards": 1}
-  if "--" in argv:
-    opt_end_index = argv.index("--")
-  else:
-    opt_end_index = len(argv)
-  try:
-    o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS)
-  except getopt.GetoptError, err:
-    print str(err)
-    print "Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"%argv[0]
-    sys.exit(2)
-
-  options.update(o)
-  extra_args = argv[opt_end_index + 1:]
-
-  # Shard experiment list
-  shard = int(options["--shard"])
-  shards = int(options["--shards"])
-  experiments = list_of_experiments()
-  base_command = " ".join([BASE_COMMAND] + extra_args)
-  configs = [base_command]
-  configs += ["%s --enable-%s" % (base_command, e) for e in experiments]
-  my_configs = zip(configs, range(len(configs)))
-  my_configs = filter(lambda x: x[1] % shards == shard, my_configs)
-  my_configs = [e[0] for e in my_configs]
-
-  # Run configs for this shard
-  for config in my_configs:
-    test_build(config)
-
-def test_build(configure_command):
-  print "\033[34m\033[47mTesting %s\033[0m" % (configure_command)
-  RunCommand(configure_command)
-  RunCommand("make clean")
-  RunCommand("make")
-
-if __name__ == "__main__":
-  main(sys.argv)
--- a/tools/author_first_release.sh
+++ b/tools/author_first_release.sh
@@ -1,15 +0,0 @@
-#!/bin/bash
-##
-## List the release each author first contributed to.
-##
-## Usage: author_first_release.sh [TAGS]
-##
-## If the TAGS arguments are unspecified, all tags reported by `git tag`
-## will be considered.
-##
-tags=${@:-$(git tag)}
-for tag in $tags; do
-  git shortlog -n -e -s $tag |
-      cut -f2- |
-      awk "{print \"${tag#v}\t\"\$0}"
-done | sort -k2  | uniq -f2
--- a/tools/ftfy.sh
+++ b/tools/ftfy.sh
@@ -1,158 +0,0 @@
-#!/bin/sh
-self="$0"
-dirname_self=$(dirname "$self")
-
-usage() {
-  cat <<EOF >&2
-Usage: $self [option]
-
-This script applies a whitespace transformation to the commit at HEAD. If no
-options are given, then the modified files are left in the working tree.
-
-Options:
-  -h, --help     Shows this message
-  -n, --dry-run  Shows a diff of the changes to be made.
-  --amend        Squashes the changes into the commit at HEAD
-                     This option will also reformat the commit message.
-  --commit       Creates a new commit containing only the whitespace changes
-  --msg-only     Reformat the commit message only, ignore the patch itself.
-
-EOF
-  rm -f ${CLEAN_FILES}
-  exit 1
-}
-
-
-log() {
-  echo "${self##*/}: $@" >&2
-}
-
-
-vpx_style() {
-  for f; do
-    case "$f" in
-      *.h|*.c|*.cc)
-        clang-format -i --style=file "$f"
-        ;;
-    esac
-  done
-}
-
-
-apply() {
-  [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1"
-}
-
-
-commit() {
-  LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}')
-  if [ -z "$LAST_CHANGEID" ]; then
-    log "HEAD doesn't have a Change-Id, unable to generate a new commit"
-    exit 1
-  fi
-
-  # Build a deterministic Change-Id from the parent's
-  NEW_CHANGEID=${LAST_CHANGEID}-styled
-  NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin)
-
-  # Commit, preserving authorship from the parent commit.
-  git commit -a -C HEAD > /dev/null
-  git commit --amend -F- << EOF
-Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9}
-
-Change-Id: ${NEW_CHANGEID}
-EOF
-}
-
-
-show_commit_msg_diff() {
-  if [ $DIFF_MSG_RESULT -ne 0 ]; then
-    log "Modified commit message:"
-    diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3
-  fi
-}
-
-
-amend() {
-  show_commit_msg_diff
-  if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then
-    git commit -a --amend -F "$NEW_COMMIT_MSG"
-  fi
-}
-
-
-diff_msg() {
-  git log -1 --format=%B > "$ORIG_COMMIT_MSG"
-  "${dirname_self}"/wrap-commit-msg.py \
-      < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG"
-  cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG"
-  DIFF_MSG_RESULT=$?
-}
-
-
-# Temporary files
-ORIG_DIFF=orig.diff.$$
-MODIFIED_DIFF=modified.diff.$$
-FINAL_DIFF=final.diff.$$
-ORIG_COMMIT_MSG=orig.commit-msg.$$
-NEW_COMMIT_MSG=new.commit-msg.$$
-CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}"
-CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}"
-
-# Preconditions
-[ $# -lt 2 ] || usage
-
-if ! clang-format -version >/dev/null 2>&1; then
-  log "clang-format not found"
-  exit 1
-fi
-
-if ! git diff --quiet HEAD; then
-  log "Working tree is dirty, commit your changes first"
-  exit 1
-fi
-
-# Need to be in the root
-cd "$(git rev-parse --show-toplevel)"
-
-# Collect the original diff
-git show > "${ORIG_DIFF}"
-
-# Apply the style guide on new and modified files and collect its diff
-for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do
-  case "$f" in
-    third_party/*) continue;;
-  esac
-  vpx_style "$f"
-done
-git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}"
-
-# Intersect the two diffs
-"${dirname_self}"/intersect-diffs.py \
-    "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}"
-INTERSECT_RESULT=$?
-git reset --hard >/dev/null
-
-# Fixup the commit message
-diff_msg
-
-# Handle options
-if [ -n "$1" ]; then
-  case "$1" in
-    -h|--help) usage;;
-    -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;;
-    --commit) apply "${FINAL_DIFF}"; commit;;
-    --amend) apply "${FINAL_DIFF}"; amend;;
-    --msg-only) amend;;
-    *) usage;;
-  esac
-else
-  apply "${FINAL_DIFF}"
-  if ! git diff --quiet; then
-    log "Formatting changes applied, verify and commit."
-    log "See also: http://www.webmproject.org/code/contribute/conventions/"
-    git diff --stat
-  fi
-fi
-
-rm -f ${CLEAN_FILES}
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -37,7 +37,9 @@ extern "C" {
 #define SEGMENT_DELTADATA 0
 #define SEGMENT_ABSDATA 1

-typedef struct { int r, c; } POS;
+typedef struct {
+  int r, c;
+} POS;

 #define PLANE_TYPE_Y_NO_DC 0
 #define PLANE_TYPE_Y2 1
@@ -180,6 +182,9 @@ typedef struct {
  unsigned int low_res_ref_frames[MAX_REF_FRAMES];
  // The video frame counter value for the key frame, for lowest resolution.
  unsigned int key_frame_counter_value;
+  // Flags to signal skipped encoding of previous and base layer stream.
+  unsigned int skip_encoding_prev_stream;
+  unsigned int skip_encoding_base_stream;
  LOWER_RES_MB_INFO *mb_info;
 } LOWER_RES_FRAME_INFO;
 #endif
--- a/vp8/common/default_coef_probs.h
+++ b/vp8/common/default_coef_probs.h
@@ -6,7 +6,7 @@
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
-*/
+ */

 #ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
 #define VP8_COMMON_DEFAULT_COEF_PROBS_H_
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -20,8 +20,7 @@ static void copy_and_extend_plane(unsigned char *s, /* source */
                                  int et,           /* extend top border */
                                  int el,           /* extend left border */
                                  int eb,           /* extend bottom border */
-                                  int er            /* extend right border */
-                                  ) {
+                                  int er) {         /* extend right border */
  int i;
  unsigned char *src_ptr1, *src_ptr2;
  unsigned char *dest_ptr1, *dest_ptr2;
--- a/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
+++ b/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
@@ -934,8 +934,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
  s4 = s3 + p;

  /* load quad-byte vectors
-  * memory is 4 byte aligned
-  */
+   * memory is 4 byte aligned
+   */
  p2 = *((uint32_t *)(s1 - 4));
  p6 = *((uint32_t *)(s1));
  p1 = *((uint32_t *)(s2 - 4));
@@ -990,8 +990,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
      :);

  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
-  * mask will be zero and filtering is not needed
-  */
+   * mask will be zero and filtering is not needed
+   */
  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                             thresh, &hev, &mask);
@@ -2102,8 +2102,8 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
  s4 = s3 + p;

  /* load quad-byte vectors
-  * memory is 4 byte aligned
-  */
+   * memory is 4 byte aligned
+   */
  p2 = *((uint32_t *)(s1 - 4));
  p6 = *((uint32_t *)(s1));
  p1 = *((uint32_t *)(s2 - 4));
--- a/vp8/common/mips/mmi/idct_blk_mmi.c
+++ b/vp8/common/mips/mmi/idct_blk_mmi.c
@@ -12,7 +12,7 @@
 #include "vpx_mem/vpx_mem.h"

 void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
-                                      int stride, int8_t *eobs) {
+                                      int stride, char *eobs) {
  int i, j;

  for (i = 0; i < 4; i++) {
@@ -33,8 +33,7 @@ void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
 }

 void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu,
-                                       uint8_t *dstv, int stride,
-                                       int8_t *eobs) {
+                                       uint8_t *dstv, int stride, char *eobs) {
  int i, j;

  for (i = 0; i < 2; i++) {
--- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c
+++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -461,96 +461,87 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
  );
 }

+/* clang-format off */
 #define VP8_MBLOOP_HPSRAB                                               \
-  "xor        %[ftmp3],   %[ftmp3],           %[ftmp3]            \n\t" \
-  "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
-  "punpcklbh  %[ftmp3],   %[ftmp3],           %[ftmp0]            \n\t" \
-  "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp0]            \n\t" \
-  "psrah      %[ftmp3],   %[ftmp3],           %[ftmp9]            \n\t" \
-  "psrah      %[ftmp8],   %[ftmp8],           %[ftmp9]            \n\t" \
-  "packsshb   %[ftmp0],   %[ftmp3],           %[ftmp8]            \n\t"
+  "punpcklbh  %[ftmp10],  %[ftmp10],          %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp0]            \n\t" \
+  "psrah      %[ftmp10],  %[ftmp10],          %[ftmp9]            \n\t" \
+  "psrah      %[ftmp11],  %[ftmp11],          %[ftmp9]            \n\t" \
+  "packsshb   %[ftmp0],   %[ftmp10],          %[ftmp11]            \n\t"

-#define VP8_MBLOOP_HPSRAB_PMULHH(reg1, reg2) \
-  "pmulhh   " #reg1 ",  " #reg1 ",  " #reg2 "                     \n\t"
-
-#define VP8_MBLOOP_HPSRAB_ADD(reg) \
-  "xor        %[ftmp3],   %[ftmp3],           %[ftmp3]            \n\t" \
-  "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
-  "punpcklbh  %[ftmp3],   %[ftmp3],           %[ftmp2]            \n\t" \
-  "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp2]            \n\t" \
-  VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp3], reg)                               \
-  VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp8], reg)                               \
-  "paddh      %[ftmp3],   %[ftmp3],           %[ff_ph_003f]       \n\t" \
-  "paddh      %[ftmp8],   %[ftmp8],           %[ff_ph_003f]       \n\t" \
-  "psrah      %[ftmp3],   %[ftmp3],           %[ftmp9]            \n\t" \
-  "psrah      %[ftmp8],   %[ftmp8],           %[ftmp9]            \n\t" \
-  "packsshb   %[ftmp3],   %[ftmp3],           %[ftmp8]            \n\t"
+#define VP8_MBLOOP_HPSRAB_ADD(reg)                                      \
+  "punpcklbh  %[ftmp1],   %[ftmp0],           %[ftmp12]           \n\t" \
+  "punpckhbh  %[ftmp2],   %[ftmp0],           %[ftmp12]           \n\t" \
+  "pmulhh     %[ftmp1],   %[ftmp1],         " #reg "              \n\t" \
+  "pmulhh     %[ftmp2],   %[ftmp2],         " #reg "              \n\t" \
+  "paddh      %[ftmp1],   %[ftmp1],           %[ff_ph_003f]       \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],           %[ff_ph_003f]       \n\t" \
+  "psrah      %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t" \
+  "psrah      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t" \
+  "packsshb   %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+/* clang-format on */

 void vp8_mbloop_filter_horizontal_edge_mmi(
    unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
    const unsigned char *limit, const unsigned char *thresh, int count) {
  uint32_t tmp[1];
-  mips_reg addr[2];
-  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
-  double ftmp[10];
+  double ftmp[13];

  __asm__ volatile (
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
    "1:                                                             \n\t"
    "gsldlc1    %[ftmp9],   0x07(%[limit])                          \n\t"
    "gsldrc1    %[ftmp9],   0x00(%[limit])                          \n\t"
+    /* ftmp1: p3 */
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp3: p2 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp3],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp3],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp4: p1 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp4],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp4],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp5: p0 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp6: q0 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp7: q1 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp8: q2 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp2: q3 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp2],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[src_ptr])                        \n\t"

-    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp12],  0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp12],  0x00(%[blimit])                         \n\t"

-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
-    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp1],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
-    MMI_SUBU(%[addr1], %[addr0], %[tmp0])
-    "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
    "pasubub    %[ftmp0],   %[ftmp1],           %[ftmp3]            \n\t"
    "psubusb    %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
-
-    /* ftmp4:p1 */
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
-    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
    "pasubub    %[ftmp1],   %[ftmp3],           %[ftmp4]            \n\t"
    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-
-    /* ftmp5:p0 */
-    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
-    "gsldlc1    %[ftmp5],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
-    "pasubub    %[ftmp1],   %[ftmp4],           %[ftmp5]            \n\t"
-    "sdc1       %[ftmp1],   0x00(%[srct])                           \n\t"
-    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "pasubub    %[ftmp10],  %[ftmp4],           %[ftmp5]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp10],          %[ftmp9]            \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-
-    /* ftmp6:q0 */
-    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
-    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
-
-    /* ftmp7:q1 */
-    "gsldlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
-    "pasubub    %[ftmp1],   %[ftmp7],           %[ftmp6]            \n\t"
-    "sdc1       %[ftmp1],   0x08(%[srct])                           \n\t"
-    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "pasubub    %[ftmp11],  %[ftmp7],           %[ftmp6]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp11],          %[ftmp9]            \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-
-    MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp8],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp8],   0x00(%[addr1])                          \n\t"
    "pasubub    %[ftmp1],   %[ftmp8],           %[ftmp7]            \n\t"
    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-
-    MMI_ADDU(%[addr1], %[addr0], %[tmp0])
-    "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
    "pasubub    %[ftmp1],   %[ftmp2],           %[ftmp8]            \n\t"
    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
@@ -563,9 +554,7 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
    "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
    "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
-    "gsldlc1    %[ftmp9],   0x07(%[blimit])                         \n\t"
-    "gsldrc1    %[ftmp9],   0x00(%[blimit])                         \n\t"
-    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp12]           \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
    "xor        %[ftmp9],   %[ftmp9],           %[ftmp9]            \n\t"
    /* ftmp0: mask */
@@ -573,29 +562,26 @@ void vp8_mbloop_filter_horizontal_edge_mmi(

    "gsldlc1    %[ftmp9],   0x07(%[thresh])                         \n\t"
    "gsldrc1    %[ftmp9],   0x00(%[thresh])                         \n\t"
-    "ldc1       %[ftmp1],   0x00(%[srct])                           \n\t"
-    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
-    "ldc1       %[ftmp2],   0x08(%[srct])                           \n\t"
-    "psubusb    %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp10],          %[ftmp9]            \n\t"
+    "psubusb    %[ftmp2],   %[ftmp11],          %[ftmp9]            \n\t"
    "paddb      %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
    "xor        %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
    "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
    "pcmpeqb    %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
-    /* ftmp1:hev*/
+    /* ftmp1: hev */
    "xor        %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"

    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
-
    "psubsb     %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
    "psubsb     %[ftmp9],   %[ftmp6],           %[ftmp5]            \n\t"
    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
    "and        %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
-    "sdc1       %[ftmp2],   0x00(%[srct])                           \n\t"
+    "pandn      %[ftmp12],  %[ftmp1],           %[ftmp2]            \n\t"
    "and        %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"

    "li         %[tmp0],    0x0b                                    \n\t"
@@ -606,75 +592,71 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
    "paddsb     %[ftmp0],   %[ftmp2],           %[ff_pb_04]         \n\t"
    VP8_MBLOOP_HPSRAB
    "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
-    "ldc1       %[ftmp2],   0x00(%[srct])                           \n\t"
-    "pandn      %[ftmp2],   %[ftmp1],           %[ftmp2]            \n\t"

    "li         %[tmp0],    0x07                                    \n\t"
    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "xor        %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
+
    VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
-    "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp3]            \n\t"
-    "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp3]            \n\t"
+    "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp1]            \n\t"
+    "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp1]            \n\t"
    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
-
-    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
-    "gssdlc1    %[ftmp5],   0x07(%[addr1])                          \n\t"
-    "gssdrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    "gssdlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
    "gssdlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
    "gssdrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+
    VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
-    "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp3]            \n\t"
-    "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp3]            \n\t"
+    "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp1]            \n\t"
+    "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp1]            \n\t"
    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
-
-    "gssdlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
-    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
-    "gssdlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
-    "gssdrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp4],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp4],   0x00(%[src_ptr])                        \n\t"

    VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
-    MMI_SUBU(%[addr1], %[addr0], %[tmp0])
-    "gsldlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
-    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
-    "gsldlc1    %[ftmp7],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp7],   0x00(%[addr1])                          \n\t"
+    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "xor        %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
+    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp1]            \n\t"
+    "psubsb     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
+    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "xor        %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+    "gssdlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp3],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp3],   0x00(%[src_ptr])                        \n\t"

-    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
-    "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp3]            \n\t"
-    "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp3]            \n\t"
-    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
-    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp7],   0x07(%[addr1])                          \n\t"
-    "gssdrc1    %[ftmp7],   0x00(%[addr1])                          \n\t"
-    MMI_SUBU(%[addr1], %[addr0], %[tmp0])
-    "gssdlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
-    "gssdrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
-
-    "addiu      %[count],   %[count],           -0x01               \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
    MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+    "addiu      %[count],   %[count],           -0x01               \n\t"
    "bnez       %[count],   1b                                      \n\t"
    : [ftmp0]"=&f"(ftmp[0]),              [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),              [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),              [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
-      [tmp0]"=&r"(tmp[0]),
-      [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
-    : [limit]"r"(limit),                [blimit]"r"(blimit),
-      [srct]"r"(srct),                  [thresh]"r"(thresh),
+      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count)
+    : [limit]"r"(limit),                  [blimit]"r"(blimit),
+      [thresh]"r"(thresh),
      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_pb_fe]"f"(ff_pb_fe),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_03]"f"(ff_pb_03),
-      [ff_ph_0900]"f"(ff_ph_0900),      [ff_ph_1b00]"f"(ff_ph_1b00),
-      [ff_ph_1200]"f"(ff_ph_1200),      [ff_ph_003f]"f"(ff_ph_003f)
+      [ff_pb_fe]"f"(ff_pb_fe),            [ff_pb_80]"f"(ff_pb_80),
+      [ff_pb_04]"f"(ff_pb_04),            [ff_pb_03]"f"(ff_pb_03),
+      [ff_ph_0900]"f"(ff_ph_0900),        [ff_ph_1b00]"f"(ff_ph_1b00),
+      [ff_ph_1200]"f"(ff_ph_1200),        [ff_ph_003f]"f"(ff_ph_003f)
    : "memory"
  );
 }
@@ -696,64 +678,60 @@ void vp8_mbloop_filter_vertical_edge_mmi(
    unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
    const unsigned char *limit, const unsigned char *thresh, int count) {
  mips_reg tmp[1];
-  mips_reg addr[2];
  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
-  double ftmp[13];
+  double ftmp[14];

  __asm__ volatile (
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
-    MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
    MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)

    "1:                                                             \n\t"
-    MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
-    MMI_ADDU(%[addr0], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp11],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp11],  0x00(%[addr0])                          \n\t"
-    MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
-    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
-    "punpcklbh  %[ftmp1],   %[ftmp11],          %[ftmp12]           \n\t"
-    "punpckhbh  %[ftmp2],   %[ftmp11],          %[ftmp12]           \n\t"
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"

-    "gsldlc1    %[ftmp11],  0x07(%[src_ptr])                        \n\t"
-    "gsldrc1    %[ftmp11],  0x00(%[src_ptr])                        \n\t"
-    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
-    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
-    "punpcklbh  %[ftmp3],   %[ftmp11],          %[ftmp12]           \n\t"
-    "punpckhbh  %[ftmp4],   %[ftmp11],          %[ftmp12]           \n\t"
+    "punpcklbh  %[ftmp11],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpckhbh  %[ftmp12],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp8]            \n\t"
+    "punpckhbh  %[ftmp10],  %[ftmp7],           %[ftmp8]            \n\t"

-    "punpcklhw  %[ftmp5],   %[ftmp4],           %[ftmp2]            \n\t"
-    "punpckhhw  %[ftmp6],   %[ftmp4],           %[ftmp2]            \n\t"
-    "punpcklhw  %[ftmp7],   %[ftmp3],           %[ftmp1]            \n\t"
-    "punpckhhw  %[ftmp8],   %[ftmp3],           %[ftmp1]            \n\t"
+    "punpcklhw  %[ftmp1],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpckhhw  %[ftmp2],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpcklhw  %[ftmp3],   %[ftmp11],          %[ftmp9]            \n\t"
+    "punpckhhw  %[ftmp4],   %[ftmp11],          %[ftmp9]            \n\t"

-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
-    MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp11],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp11],  0x00(%[addr0])                          \n\t"
-    MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
-    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
-    "punpcklbh  %[ftmp9],   %[ftmp11],          %[ftmp12]           \n\t"
-    "punpckhbh  %[ftmp10],  %[ftmp11],          %[ftmp12]           \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"

-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
-    MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp11],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp11],  0x00(%[addr0])                          \n\t"
-    MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
-    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
-    "punpcklbh  %[ftmp0],   %[ftmp11],          %[ftmp12]           \n\t"
-    "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp12]           \n\t"
+    "punpcklbh  %[ftmp11],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpckhbh  %[ftmp12],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp8]            \n\t"
+    "punpckhbh  %[ftmp10],  %[ftmp7],           %[ftmp8]            \n\t"

-    "punpcklhw  %[ftmp1],   %[ftmp11],          %[ftmp10]           \n\t"
-    "punpckhhw  %[ftmp2],   %[ftmp11],          %[ftmp10]           \n\t"
-    "punpcklhw  %[ftmp3],   %[ftmp0],           %[ftmp9]            \n\t"
-    "punpckhhw  %[ftmp4],   %[ftmp0],           %[ftmp9]            \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpcklhw  %[ftmp7],   %[ftmp11],          %[ftmp9]            \n\t"
+    "punpckhhw  %[ftmp8],   %[ftmp11],          %[ftmp9]            \n\t"

+    "gsldlc1    %[ftmp13],  0x07(%[limit])                          \n\t"
+    "gsldrc1    %[ftmp13],  0x00(%[limit])                          \n\t"
    /* ftmp9:q0  ftmp10:q1 */
    "punpcklwd  %[ftmp9],   %[ftmp1],           %[ftmp5]            \n\t"
    "punpckhwd  %[ftmp10],  %[ftmp1],           %[ftmp5]            \n\t"
@@ -771,60 +749,61 @@ void vp8_mbloop_filter_vertical_edge_mmi(
    "punpcklwd  %[ftmp5],   %[ftmp4],           %[ftmp8]            \n\t"
    "punpckhwd  %[ftmp6],   %[ftmp4],           %[ftmp8]            \n\t"

-    "gsldlc1    %[ftmp8],   0x07(%[limit])                          \n\t"
-    "gsldrc1    %[ftmp8],   0x00(%[limit])                          \n\t"
-
    /* abs (q3-q2) */
    "pasubub    %[ftmp7],   %[ftmp12],          %[ftmp11]           \n\t"
-    "psubusb    %[ftmp0],   %[ftmp7],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp0],   %[ftmp7],           %[ftmp13]           \n\t"
    /* abs (q2-q1) */
    "pasubub    %[ftmp7],   %[ftmp11],          %[ftmp10]           \n\t"
-    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
    /* ftmp3: abs(q1-q0) */
    "pasubub    %[ftmp3],   %[ftmp10],          %[ftmp9]            \n\t"
-    "psubusb    %[ftmp7],   %[ftmp3],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp3],           %[ftmp13]           \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
    /* ftmp4: abs(p1-p0) */
    "pasubub    %[ftmp4],   %[ftmp5],           %[ftmp6]            \n\t"
-    "psubusb    %[ftmp7],   %[ftmp4],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp4],           %[ftmp13]           \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
    /* abs (p2-p1) */
    "pasubub    %[ftmp7],   %[ftmp2],           %[ftmp5]            \n\t"
-    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
    /* abs (p3-p2) */
    "pasubub    %[ftmp7],   %[ftmp1],           %[ftmp2]            \n\t"
-    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
-    /* abs (p0-q0) */
+
+    "gsldlc1    %[ftmp13],  0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp13],  0x00(%[blimit])                         \n\t"
+    "gsldlc1    %[ftmp7],   0x07(%[thresh])                         \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[thresh])                         \n\t"
+    /* abs (p0-q0) * 2 */
    "pasubub    %[ftmp1],   %[ftmp9],           %[ftmp6]            \n\t"
    "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
-    /* abs (p1-q1) */
+    /* abs (p1-q1) / 2 */
    "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
    "and        %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
    "li         %[tmp0],    0x01                                    \n\t"
    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
    "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp8]            \n\t"
    "paddusb    %[ftmp12],  %[ftmp1],           %[ftmp12]           \n\t"
-
-    "gsldlc1    %[ftmp8],   0x07(%[blimit])                         \n\t"
-    "gsldrc1    %[ftmp8],   0x00(%[blimit])                         \n\t"
-    "psubusb    %[ftmp12],  %[ftmp12],          %[ftmp8]            \n\t"
+    "psubusb    %[ftmp12],  %[ftmp12],          %[ftmp13]           \n\t"
    "or         %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"
    "xor        %[ftmp12],  %[ftmp12],          %[ftmp12]           \n\t"
+    /* ftmp0: mask */
    "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"

-    "gsldlc1    %[ftmp8],   0x07(%[thresh])                         \n\t"
-    "gsldrc1    %[ftmp8],   0x00(%[thresh])                         \n\t"
-    /* ftmp3: abs(q1-q0)  ftmp4: abs(p1-p0) */
-    "psubusb    %[ftmp4],   %[ftmp4],           %[ftmp8]            \n\t"
-    "psubusb    %[ftmp3],   %[ftmp3],           %[ftmp8]            \n\t"
+    /* abs(p1-p0) - thresh */
+    "psubusb    %[ftmp4],   %[ftmp4],           %[ftmp7]            \n\t"
+    /* abs(q1-q0) - thresh */
+    "psubusb    %[ftmp3],   %[ftmp3],           %[ftmp7]            \n\t"
    "or         %[ftmp3],   %[ftmp4],           %[ftmp3]            \n\t"
    "pcmpeqb    %[ftmp3],   %[ftmp3],           %[ftmp12]           \n\t"
    "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    /* ftmp1: hev */
    "xor        %[ftmp1],   %[ftmp3],           %[ftmp1]            \n\t"

+    /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
    "xor        %[ftmp11],  %[ftmp11],          %[ff_pb_80]         \n\t"
    "xor        %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
    "xor        %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
@@ -837,30 +816,30 @@ void vp8_mbloop_filter_vertical_edge_mmi(
    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
+    /* filter_value &= mask */
    "and        %[ftmp0],   %[ftmp0],           %[ftmp3]            \n\t"
+    /* Filter2 = filter_value & hev */
    "and        %[ftmp3],   %[ftmp1],           %[ftmp0]            \n\t"
+    /* filter_value &= ~hev */
    "pandn      %[ftmp0],   %[ftmp1],           %[ftmp0]            \n\t"

    "paddsb     %[ftmp4],   %[ftmp3],           %[ff_pb_04]         \n\t"
    "li         %[tmp0],    0x0b                                    \n\t"
    "mtc1       %[tmp0],    %[ftmp12]                               \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t"
    "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp4]            \n\t"
    "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp4]            \n\t"
    "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
    "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t"
    "packsshb   %[ftmp4],   %[ftmp7],           %[ftmp8]            \n\t"
+    /* ftmp9: qs0 */
    "psubsb     %[ftmp9],   %[ftmp9],           %[ftmp4]            \n\t"
    "paddsb     %[ftmp3],   %[ftmp3],           %[ff_pb_03]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t"
    "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp3]            \n\t"
    "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp3]            \n\t"
    "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
    "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t"
    "packsshb   %[ftmp3],   %[ftmp7],           %[ftmp8]            \n\t"
-
+    /* ftmp6: ps0 */
    "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp3]            \n\t"

    "li         %[tmp0],    0x07                                    \n\t"
@@ -872,8 +851,10 @@ void vp8_mbloop_filter_vertical_edge_mmi(
    "pmulhh     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
    VP8_MBLOOP_VPSRAB_ADDT
    "psubsb     %[ftmp4],   %[ftmp9],           %[ftmp3]            \n\t"
+    /* ftmp9: oq0 */
    "xor        %[ftmp9],   %[ftmp4],           %[ff_pb_80]         \n\t"
    "paddsb     %[ftmp4],   %[ftmp6],           %[ftmp3]            \n\t"
+    /* ftmp6: op0 */
    "xor        %[ftmp6],   %[ftmp4],           %[ff_pb_80]         \n\t"

    VP8_MBLOOP_VPSRAB_ADDH
@@ -882,8 +863,10 @@ void vp8_mbloop_filter_vertical_edge_mmi(
    "pmulhh     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
    VP8_MBLOOP_VPSRAB_ADDT
    "psubsb     %[ftmp4],   %[ftmp10],          %[ftmp3]            \n\t"
+    /* ftmp10: oq1 */
    "xor        %[ftmp10],   %[ftmp4],          %[ff_pb_80]         \n\t"
    "paddsb     %[ftmp4],   %[ftmp5],           %[ftmp3]            \n\t"
+    /* ftmp5: op1 */
    "xor        %[ftmp5],   %[ftmp4],           %[ff_pb_80]         \n\t"

    VP8_MBLOOP_VPSRAB_ADDH
@@ -891,8 +874,10 @@ void vp8_mbloop_filter_vertical_edge_mmi(
    "pmulhh     %[ftmp8],   %[ftmp8],           %[ff_ph_0900]       \n\t"
    VP8_MBLOOP_VPSRAB_ADDT
    "psubsb     %[ftmp4],   %[ftmp11],          %[ftmp3]            \n\t"
+    /* ftmp11: oq2 */
    "xor        %[ftmp11],  %[ftmp4],           %[ff_pb_80]         \n\t"
    "paddsb     %[ftmp4],   %[ftmp2],           %[ftmp3]            \n\t"
+    /* ftmp2: op2 */
    "xor        %[ftmp2],   %[ftmp4],           %[ff_pb_80]         \n\t"

    "ldc1       %[ftmp12],  0x00(%[srct])                           \n\t"
@@ -916,41 +901,40 @@ void vp8_mbloop_filter_vertical_edge_mmi(
    "punpcklhw  %[ftmp10],  %[ftmp1],           %[ftmp3]            \n\t"
    "punpckhhw  %[ftmp11],  %[ftmp1],           %[ftmp3]            \n\t"

-    "punpcklwd  %[ftmp0],   %[ftmp6],           %[ftmp10]           \n\t"
-    "punpckhwd  %[ftmp1],   %[ftmp6],           %[ftmp10]           \n\t"
-
-    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
-    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
-    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
-    "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-
    "punpcklwd  %[ftmp0],   %[ftmp7],           %[ftmp11]           \n\t"
    "punpckhwd  %[ftmp1],   %[ftmp7],           %[ftmp11]           \n\t"
-    MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp0],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp0],   0x00(%[addr0])                          \n\t"
-    MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+
+    "punpcklwd  %[ftmp0],   %[ftmp6],           %[ftmp10]           \n\t"
+    "punpckhwd  %[ftmp1],   %[ftmp6],           %[ftmp10]           \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"

    "punpcklwd  %[ftmp1],   %[ftmp5],           %[ftmp9]            \n\t"
    "punpckhwd  %[ftmp0],   %[ftmp5],           %[ftmp9]            \n\t"
-    MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
-    "gssdlc1    %[ftmp0],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp0],   0x00(%[addr0])                          \n\t"
-    MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"

    "punpcklwd  %[ftmp1],   %[ftmp4],           %[ftmp8]            \n\t"
    "punpckhwd  %[ftmp0],   %[ftmp4],           %[ftmp8]            \n\t"
-    MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp0],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp0],   0x00(%[addr0])                          \n\t"
-    MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
    "addiu      %[count],   %[count],           -0x01               \n\t"

    MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
@@ -962,9 +946,9 @@ void vp8_mbloop_filter_vertical_edge_mmi(
      [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
-      [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
-      [addr0]"=&r"(addr[0]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [ftmp12]"=&f"(ftmp[12]),            [ftmp13]"=&f"(ftmp[13]),
+      [tmp0]"=&r"(tmp[0]),                [src_ptr]"+&r"(src_ptr),
+      [count]"+&r"(count)
    : [limit]"r"(limit),                [blimit]"r"(blimit),
      [srct]"r"(srct),                  [thresh]"r"(thresh),
      [src_pixel_step]"r"((mips_reg)src_pixel_step),
--- a/vp8/common/mips/mmi/sixtap_filter_mmi.c
+++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c
@@ -86,6 +86,7 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
  register double ftmp8 asm("$f18");
  register double ftmp9 asm("$f20");
  register double ftmp10 asm("$f22");
+  register double ftmp11 asm("$f24");
 #else
  register double fzero asm("$f0");
  register double ftmp0 asm("$f1");
@@ -99,6 +100,7 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
  register double ftmp8 asm("$f9");
  register double ftmp9 asm("$f10");
  register double ftmp10 asm("$f11");
+  register double ftmp11 asm("$f12");
 #endif  // _MIPS_SIM == _ABIO32

  __asm__ volatile (
@@ -112,11 +114,13 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
    "li         %[tmp0],        0x07                                  \n\t"
    "mtc1       %[tmp0],        %[ftmp7]                              \n\t"
    "li         %[tmp0],        0x08                                  \n\t"
-    "mtc1       %[tmp0],        %[ftmp10]                             \n\t"
+    "mtc1       %[tmp0],        %[ftmp11]                             \n\t"

    "1:                                                               \n\t"
    "gsldlc1    %[ftmp9],       0x05(%[src_ptr])                      \n\t"
-    "gsldrc1    %[ftmp9],      -0x02(%[src_ptr])                      \n\t"
+    "gsldrc1    %[ftmp9],       -0x02(%[src_ptr])                     \n\t"
+    "gsldlc1    %[ftmp10],      0x06(%[src_ptr])                      \n\t"
+    "gsldrc1    %[ftmp10],      -0x01(%[src_ptr])                     \n\t"

    "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
    "pmullh     %[ftmp8],       %[ftmp6],          %[ftmp0]           \n\t"
@@ -125,24 +129,21 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp4]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

-    "gsldlc1    %[ftmp9],       0x06(%[src_ptr])                      \n\t"
-    "gsldrc1    %[ftmp9],      -0x01(%[src_ptr])                      \n\t"
-
-    "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
+    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp1]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

-    "punpckhbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
+    "punpckhbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp5]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

-    "dsrl       %[ftmp9],       %[ftmp9],          %[ftmp10]          \n\t"
-    "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
+    "dsrl       %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
+    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp2]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

-    "dsrl       %[ftmp9],       %[ftmp9],          %[ftmp10]          \n\t"
-    "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
+    "dsrl       %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
+    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp3]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

@@ -163,8 +164,9 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
      [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6),
      [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8),
      [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
-      [tmp0]"=&r"(tmp[0]),              [src_ptr]"+&r"(src_ptr),
-      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
+      [ftmp11]"=&f"(ftmp11),            [tmp0]"=&r"(tmp[0]),
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
+      [src_ptr]"+&r"(src_ptr)
    : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width),
      [ff_ph_40]"f"(ff_ph_40)
@@ -190,6 +192,11 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
  register double ftmp6 asm("$f14");
  register double ftmp7 asm("$f16");
  register double ftmp8 asm("$f18");
+  register double ftmp9 asm("$f20");
+  register double ftmp10 asm("$f22");
+  register double ftmp11 asm("$f24");
+  register double ftmp12 asm("$f26");
+  register double ftmp13 asm("$f28");
 #else
  register double fzero asm("$f0");
  register double ftmp0 asm("$f1");
@@ -201,6 +208,11 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
  register double ftmp6 asm("$f7");
  register double ftmp7 asm("$f8");
  register double ftmp8 asm("$f9");
+  register double ftmp9 asm("$f10");
+  register double ftmp10 asm("$f11");
+  register double ftmp11 asm("$f12");
+  register double ftmp12 asm("$f13");
+  register double ftmp13 asm("$f14");
 #endif  // _MIPS_SIM == _ABIO32

  __asm__ volatile (
@@ -210,52 +222,56 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
    "ldc1       %[ftmp3],     0x30(%[vp8_filter])                     \n\t"
    "ldc1       %[ftmp4],     0x40(%[vp8_filter])                     \n\t"
    "ldc1       %[ftmp5],     0x50(%[vp8_filter])                     \n\t"
-    MMI_SUBU(%[src_ptr],   %[src_ptr],      %[pixels_per_line_x2])
    "xor        %[fzero],     %[fzero],        %[fzero]               \n\t"
    "li         %[tmp0],      0x07                                    \n\t"
-    "mtc1       %[tmp0],      %[ftmp7]                                \n\t"
+    "mtc1       %[tmp0],      %[ftmp13]                               \n\t"

+    /* In order to make full use of memory load delay slot,
+     * Operation of memory loading and calculating has been rearranged.
+     */
    "1:                                                               \n\t"
    "gsldlc1    %[ftmp6],     0x07(%[src_ptr])                        \n\t"
    "gsldrc1    %[ftmp6],     0x00(%[src_ptr])                        \n\t"
-    "pmullh     %[ftmp8],     %[ftmp6],        %[ftmp0]               \n\t"
-
    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line])
-    "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t"
-    "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp1]               \n\t"
-    "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t"
-
+    "gsldlc1    %[ftmp7],     0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp7],     0x00(%[addr0])                          \n\t"
    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2])
-    "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t"
-    "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp2]               \n\t"
-    "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t"
+    "gsldlc1    %[ftmp8],     0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp8],     0x00(%[addr0])                          \n\t"

    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4])
-    "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t"
-    "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp4]               \n\t"
-    "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t"
-
+    "gsldlc1    %[ftmp9],     0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp9],     0x00(%[addr0])                          \n\t"
    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2])
-    "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t"
-    "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp3]               \n\t"
-    "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t"
-
+    "gsldlc1    %[ftmp10],    0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp10],    0x00(%[addr0])                          \n\t"
    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4])
-    "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t"
-    "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp5]               \n\t"
-    "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t"
+    "gsldlc1    %[ftmp11],    0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp11],    0x00(%[addr0])                          \n\t"

-    "paddsh     %[ftmp8],     %[ftmp8],        %[ff_ph_40]            \n\t"
-    "psrah      %[ftmp8],     %[ftmp8],        %[ftmp7]               \n\t"
-    "packushb   %[ftmp8],     %[ftmp8],        %[fzero]               \n\t"
-    "gsswlc1    %[ftmp8],     0x03(%[output_ptr])                     \n\t"
-    "gsswrc1    %[ftmp8],     0x00(%[output_ptr])                     \n\t"
+    "pmullh     %[ftmp12],    %[ftmp6],        %[ftmp0]               \n\t"
+
+    "pmullh     %[ftmp7],     %[ftmp7],        %[ftmp1]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp7]               \n\t"
+
+    "pmullh     %[ftmp8],     %[ftmp8],        %[ftmp2]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp8]               \n\t"
+
+    "pmullh     %[ftmp9],     %[ftmp9],        %[ftmp4]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp9]               \n\t"
+
+    "pmullh     %[ftmp10],    %[ftmp10],       %[ftmp3]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp10]              \n\t"
+
+    "pmullh     %[ftmp11],    %[ftmp11],       %[ftmp5]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp11]              \n\t"
+
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ff_ph_40]            \n\t"
+    "psrah      %[ftmp12],    %[ftmp12],       %[ftmp13]              \n\t"
+    "packushb   %[ftmp12],    %[ftmp12],       %[fzero]               \n\t"
+    "gsswlc1    %[ftmp12],    0x03(%[output_ptr])                     \n\t"
+    "gsswrc1    %[ftmp12],    0x00(%[output_ptr])                     \n\t"

    MMI_ADDIU(%[output_height], %[output_height], -0x01)
    MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
@@ -265,9 +281,11 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
      [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4),
      [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6),
      [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8),
-      [tmp0]"=&r"(tmp[0]),              [addr0]"=&r"(addr[0]),
-      [src_ptr]"+&r"(src_ptr),          [output_ptr]"+&r"(output_ptr),
-      [output_height]"+&r"(output_height)
+      [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
+      [ftmp11]"=&f"(ftmp11),            [ftmp12]"=&f"(ftmp12),
+      [ftmp13]"=&f"(ftmp13),            [tmp0]"=&r"(tmp[0]),
+      [addr0]"=&r"(addr[0]),            [src_ptr]"+&r"(src_ptr),
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
    : [pixels_per_line]"r"((mips_reg)pixels_per_line),
      [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
      [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
@@ -301,6 +319,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
    "1:                                                               \n\t"
    "gsldlc1    %[ftmp0],       0x07(%[src_ptr])                      \n\t"
    "gsldrc1    %[ftmp0],       0x00(%[src_ptr])                      \n\t"
+    MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line])

    "punpcklbh  %[ftmp1],       %[ftmp0],          %[fzero]           \n\t"
    "gssdlc1    %[ftmp1],       0x07(%[output_ptr])                   \n\t"
@@ -308,7 +327,6 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(

    "addiu      %[output_height], %[output_height], -0x01             \n\t"
    MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width])
-    MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line])
    "bnez       %[output_height],               1b                    \n\t"
    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
      [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr),
@@ -338,12 +356,12 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
    "1:                                                               \n\t"
    "gsldlc1    %[ftmp0],     0x07(%[src_ptr])                        \n\t"
    "gsldrc1    %[ftmp0],     0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
+    MMI_ADDIU(%[output_height], %[output_height], -0x01)
    "packushb   %[ftmp1],     %[ftmp0],        %[fzero]               \n\t"
    "gsswlc1    %[ftmp1],     0x03(%[output_ptr])                     \n\t"
    "gsswrc1    %[ftmp1],     0x00(%[output_ptr])                     \n\t"

-    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
-    MMI_ADDIU(%[output_height], %[output_height], -0x01)
    MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
    "bnez       %[output_height], 1b                                  \n\t"
    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
@@ -386,7 +404,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
      }                                                                        \
    } else {                                                                   \
      for (i = 0; i < loop; ++i) {                                             \
-        vp8_filter_block1dc_v6_mmi(FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, \
+        vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m,         \
                                   dst_pitch, n * 2, VFilter);                 \
      }                                                                        \
    }                                                                          \
--- a/vp8/common/modecont.c
+++ b/vp8/common/modecont.c
@@ -11,28 +11,16 @@
 #include "entropy.h"

 const int vp8_mode_contexts[6][4] = {
-  {
-      /* 0 */
-      7, 1, 1, 143,
-  },
-  {
-      /* 1 */
-      14, 18, 14, 107,
-  },
-  {
-      /* 2 */
-      135, 64, 57, 68,
-  },
-  {
-      /* 3 */
-      60, 56, 128, 65,
-  },
-  {
-      /* 4 */
-      159, 134, 128, 34,
-  },
-  {
-      /* 5 */
-      234, 188, 128, 28,
-  },
+  { /* 0 */
+    7, 1, 1, 143 },
+  { /* 1 */
+    14, 18, 14, 107 },
+  { /* 2 */
+    135, 64, 57, 68 },
+  { /* 3 */
+    60, 56, 128, 65 },
+  { /* 4 */
+    159, 134, 128, 34 },
+  { /* 5 */
+    234, 188, 128, 28 },
 };
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
 sub vp8_common_forward_decls() {
 print <<EOF
 /*
--- a/vp8/common/vp8_entropymodedata.h
+++ b/vp8/common/vp8_entropymodedata.h
@@ -6,7 +6,7 @@
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
-*/
+ */

 #ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
 #define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -95,9 +95,7 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
                                  int src_pixels_per_line, int xoffset,
                                  int yoffset, unsigned char *dst_ptr,
-                                  int dst_pitch
-
-                                  ) {
+                                  int dst_pitch) {
  DECLARE_ALIGNED(16, unsigned short,
                  FData2[24 * 24]); /* Temp data bufffer used in filtering */

@@ -236,9 +234,7 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
                                   int src_pixels_per_line, int xoffset,
                                   int yoffset, unsigned char *dst_ptr,
-                                   int dst_pitch
-
-                                   ) {
+                                   int dst_pitch) {
  DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);

  if (xoffset) {
@@ -351,8 +347,8 @@ void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
                                   yoffset);
    } else {
      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
-        * yoffset==0) case correctly. Add copy function here to guarantee
-        * six-tap function handles all possible offsets. */
+       * yoffset==0) case correctly. Add copy function here to guarantee
+       * six-tap function handles all possible offsets. */
      int r;

      for (r = 0; r < 4; ++r) {
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -674,7 +674,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi,

 static int read_is_valid(const unsigned char *start, size_t len,
                         const unsigned char *end) {
-  return (start + len > start && start + len <= end);
+  return len != 0 && end > start && len <= (size_t)(end - start);
 }

 static unsigned int read_available_partition_size(
--- a/vp8/decoder/ec_types.h
+++ b/vp8/decoder/ec_types.h
@@ -34,7 +34,9 @@ typedef struct {
 /* Structure used to hold all the overlaps of a macroblock. The overlaps of a
 * macroblock is further divided into block overlaps.
 */
-typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP;
+typedef struct {
+  B_OVERLAP overlaps[16];
+} MB_OVERLAP;

 /* Structure for keeping track of motion vectors and which reference frame they
 * refer to. Used for motion vector interpolation.
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -31,7 +31,9 @@ typedef struct {
  void *ptr2;
 } DECODETHREAD_DATA;

-typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC;
+typedef struct {
+  MACROBLOCKD mbd;
+} MB_ROW_DEC;

 typedef struct {
  int enabled;
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -739,24 +739,21 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
    /* Allocate memory for above_row buffers. */
    CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(
-          pbi->mt_yabove_row[i],
-          vpx_memalign(
-              16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1))));
+      CHECK_MEM_ERROR(pbi->mt_yabove_row[i],
+                      vpx_memalign(16, sizeof(unsigned char) *
+                                           (width + (VP8BORDERINPIXELS << 1))));

    CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(
-          pbi->mt_uabove_row[i],
-          vpx_memalign(16,
-                       sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+      CHECK_MEM_ERROR(pbi->mt_uabove_row[i],
+                      vpx_memalign(16, sizeof(unsigned char) *
+                                           (uv_width + VP8BORDERINPIXELS)));

    CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
    for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(
-          pbi->mt_vabove_row[i],
-          vpx_memalign(16,
-                       sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+      CHECK_MEM_ERROR(pbi->mt_vabove_row[i],
+                      vpx_memalign(16, sizeof(unsigned char) *
+                                           (uv_width + VP8BORDERINPIXELS)));

    /* Allocate memory for left_col buffers. */
    CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -9,12 +9,12 @@
 */

 /****************************************************************************
-*
-*   Module Title :     boolhuff.h
-*
-*   Description  :     Bool Coder header file.
-*
-****************************************************************************/
+ *
+ *   Module Title :     boolhuff.h
+ *
+ *   Description  :     Bool Coder header file.
+ *
+ ****************************************************************************/
 #ifndef VP8_ENCODER_BOOLHUFF_H_
 #define VP8_ENCODER_BOOLHUFF_H_

--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -989,11 +989,11 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
    bits_per_mb_at_this_q =
        vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;

-    bits_per_mb_at_this_q = (int)(.5 +
-                                  err_correction_factor * speed_correction *
-                                      cpi->twopass.est_max_qcorrection_factor *
-                                      cpi->twopass.section_max_qfactor *
-                                      (double)bits_per_mb_at_this_q);
+    bits_per_mb_at_this_q =
+        (int)(.5 + err_correction_factor * speed_correction *
+                       cpi->twopass.est_max_qcorrection_factor *
+                       cpi->twopass.section_max_qfactor *
+                       (double)bits_per_mb_at_this_q);

    /* Mode and motion overhead */
    /* As Q rises in real encode loop rd code will force overhead down
@@ -1086,9 +1086,8 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
        vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;

    bits_per_mb_at_this_q =
-        (int)(.5 +
-              err_correction_factor * speed_correction * clip_iifactor *
-                  (double)bits_per_mb_at_this_q);
+        (int)(.5 + err_correction_factor * speed_correction * clip_iifactor *
+                       (double)bits_per_mb_at_this_q);

    /* Mode and motion overhead */
    /* As Q rises in real encode loop rd code will force overhead down
@@ -1273,9 +1272,8 @@ void vp8_init_second_pass(VP8_COMP *cpi) {
   * sum duration is not. Its calculated based on the actual durations of
   * all frames from the first pass.
   */
-  vp8_new_framerate(cpi,
-                    10000000.0 * cpi->twopass.total_stats.count /
-                        cpi->twopass.total_stats.duration);
+  vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
+                             cpi->twopass.total_stats.duration);

  cpi->output_framerate = cpi->framerate;
  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
@@ -1739,10 +1737,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
            /* Dont break out very close to a key frame */
            ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
            ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
-            (!flash_detected) && ((mv_ratio_accumulator > 100.0) ||
-                                  (abs_mv_in_out_accumulator > 3.0) ||
-                                  (mv_in_out_accumulator < -2.0) ||
-                                  ((boost_score - old_boost_score) < 2.0)))) {
+            (!flash_detected) &&
+            ((mv_ratio_accumulator > 100.0) ||
+             (abs_mv_in_out_accumulator > 3.0) ||
+             (mv_in_out_accumulator < -2.0) ||
+             ((boost_score - old_boost_score) < 2.0)))) {
      boost_score = old_boost_score;
      break;
    }
@@ -1815,8 +1814,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      (next_frame.pcnt_inter > 0.75) &&
      ((mv_in_out_accumulator / (double)i > -0.2) ||
       (mv_in_out_accumulator > -2.0)) &&
-      (cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <=
-                                 (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))))
+      (cpi->gfu_boost > 100) &&
+      (cpi->twopass.gf_decay_rate <=
+       (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))))
 #endif
  {
    int Boost;
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2862,7 +2862,6 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
    fclose(yframe);
 }
 #endif
-/* return of 0 means drop frame */

 #if !CONFIG_REALTIME_ONLY
 /* Function to test for conditions that indeicate we should loop
@@ -3364,11 +3363,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
        (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info;

    if (cpi->oxcf.mr_encoder_id) {
-      // TODO(marpan): This constraint shouldn't be needed, as we would like
-      // to allow for key frame setting (forced or periodic) defined per
-      // spatial layer. For now, keep this in.
-      cm->frame_type = low_res_frame_info->frame_type;
-
      // Check if lower resolution is available for motion vector reuse.
      if (cm->frame_type != KEY_FRAME) {
        cpi->mr_low_res_mv_avail = 1;
@@ -3393,7 +3387,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
                     == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
        */
      }
+      // Disable motion vector reuse (i.e., disable any usage of the low_res)
+      // if the previous lower stream is skipped/disabled.
+      if (low_res_frame_info->skip_encoding_prev_stream) {
+        cpi->mr_low_res_mv_avail = 0;
+      }
    }
+    // This stream is not skipped (i.e., it's being encoded), so set this skip
+    // flag to 0. This is needed for the next stream (i.e., which is the next
+    // frame to be encoded).
+    low_res_frame_info->skip_encoding_prev_stream = 0;

    // On a key frame: For the lowest resolution, keep track of the key frame
    // counter value. For the higher resolutions, reset the current video
@@ -3799,7 +3802,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,

  /* Setup background Q adjustment for error resilient mode.
   * For multi-layer encodes only enable this for the base layer.
-  */
+   */
  if (cpi->cyclic_refresh_mode_enabled) {
    // Special case for screen_content_mode with golden frame updates.
    int disable_cr_gf =
@@ -4782,8 +4785,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
    cpi->temporal_pattern_counter++;
  }

-/* reset to normal state now that we are done. */
-
 #if 0
    {
        char filename[512];
@@ -4999,10 +5000,13 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
        // be received for that high layer, which will yield an incorrect
        // frame rate (from time-stamp adjustment in above calculation).
        if (cpi->oxcf.mr_encoder_id) {
-          cpi->ref_framerate = low_res_frame_info->low_res_framerate;
+          if (!low_res_frame_info->skip_encoding_base_stream)
+            cpi->ref_framerate = low_res_frame_info->low_res_framerate;
        } else {
          // Keep track of frame rate for lowest resolution.
          low_res_frame_info->low_res_framerate = cpi->ref_framerate;
+          // The base stream is being encoded so set skip flag to 0.
+          low_res_frame_info->skip_encoding_base_stream = 0;
        }
      }
 #endif
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -741,10 +741,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
  x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

  /* If the frame has big static background and current MB is in low
-  *  motion area, its mode decision is biased to ZEROMV mode.
-  *  No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
-  *  At such speed settings, ZEROMV is already heavily favored.
-  */
+   *  motion area, its mode decision is biased to ZEROMV mode.
+   *  No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
+   *  At such speed settings, ZEROMV is already heavily favored.
+   */
  if (cpi->Speed < 12) {
    calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
  }
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -996,7 +996,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
           * bits on this frame even if it is a contructed arf.
           * The active maximum quantizer insures that an appropriate
           * number of bits will be spent if needed for contstructed ARFs.
-          */
+           */
          cpi->this_frame_target = 0;
        }

@@ -1052,9 +1052,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
   * overflow when values are large
   */
  projected_size_based_on_q =
-      (int)(((.5 +
-              rate_correction_factor *
-                  vp8_bits_per_mb[cpi->common.frame_type][Q]) *
+      (int)(((.5 + rate_correction_factor *
+                       vp8_bits_per_mb[cpi->common.frame_type][Q]) *
             cpi->common.MBs) /
            (1 << BPER_MB_NORMBITS));

--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -23,6 +23,7 @@
 #include "modecosts.h"
 #include "encodeintra.h"
 #include "pickinter.h"
+#include "vp8/common/common.h"
 #include "vp8/common/entropymode.h"
 #include "vp8/common/reconinter.h"
 #include "vp8/common/reconintra.h"
@@ -769,9 +770,9 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
    vp8_quantize_mbuv(x);

    rate_to = rd_cost_mbuv(x);
-    this_rate = rate_to +
-                x->intra_uv_mode_cost[xd->frame_type]
-                                     [xd->mode_info_context->mbmi.uv_mode];
+    this_rate =
+        rate_to + x->intra_uv_mode_cost[xd->frame_type]
+                                       [xd->mode_info_context->mbmi.uv_mode];

    this_distortion = vp8_mbuverror(x) / 4;

@@ -959,19 +960,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
  vp8_variance_fn_ptr_t *v_fn_ptr;

  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
  ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
-  ENTROPY_CONTEXT *ta_b;
-  ENTROPY_CONTEXT *tl_b;

  memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
  memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  ta_b = (ENTROPY_CONTEXT *)&t_above_b;
-  tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+  vp8_zero(t_above_b);
+  vp8_zero(t_left_b);

  br = 0;
  bd = 0;
@@ -1151,13 +1146,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
        mode_selected = this_mode;
        best_label_rd = this_rd;

-        memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
-        memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+        memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES));
+        memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES));
      }
    } /*for each 4x4 mode*/

-    memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
-    memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES));

    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
                bsi->ref_mv, x->mvcost);
--- a/vp8/encoder/treewriter.h
+++ b/vp8/encoder/treewriter.h
@@ -56,8 +56,7 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2],

 static void vp8_treed_write(vp8_writer *const w, vp8_tree t,
                            const vp8_prob *const p, int v,
-                            int n /* number of bits in v, assumed nonzero */
-                            ) {
+                            int n) { /* number of bits in v, assumed nonzero */
  vp8_tree_index i = 0;

  do {
@@ -73,8 +72,7 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t,
 }

 static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v,
-                          int n /* number of bits in v, assumed nonzero */
-                          ) {
+                          int n) { /* number of bits in v, assumed nonzero */
  int c = 0;
  vp8_tree_index i = 0;

--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -802,7 +802,20 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
                                   unsigned long deadline) {
  vpx_codec_err_t res = VPX_CODEC_OK;

-  if (!ctx->cfg.rc_target_bitrate) return res;
+  if (!ctx->cfg.rc_target_bitrate) {
+#if CONFIG_MULTI_RES_ENCODING
+    if (!ctx->cpi) return VPX_CODEC_ERROR;
+    if (ctx->cpi->oxcf.mr_total_resolutions > 1) {
+      LOWER_RES_FRAME_INFO *low_res_frame_info =
+          (LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info;
+      if (!low_res_frame_info) return VPX_CODEC_ERROR;
+      low_res_frame_info->skip_encoding_prev_stream = 1;
+      if (ctx->cpi->oxcf.mr_encoder_id == 0)
+        low_res_frame_info->skip_encoding_base_stream = 1;
+    }
+#endif
+    return res;
+  }

  if (img) res = validate_img(ctx, img);

@@ -902,6 +915,8 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
            (unsigned long)((delta * ctx->cfg.g_timebase.den + round) /
                            ctx->cfg.g_timebase.num / 10000000);
        pkt.data.frame.flags = lib_flags << 16;
+        pkt.data.frame.width[0] = cpi->common.Width;
+        pkt.data.frame.height[0] = cpi->common.Height;

        if (lib_flags & FRAMEFLAGS_KEY) {
          pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
@@ -1259,6 +1274,9 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = {
      vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
      vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
      vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   get_cx_data; */
-      vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem,
+      vp8e_set_config,
+      NULL,
+      vp8e_get_preview,
+      vp8e_mr_alloc_mem,
  } /* encoder functions */
 };
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -200,9 +200,9 @@ static vpx_codec_err_t update_error_state(
 static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
                            void *user_priv) {
  /** vpx_img_wrap() doesn't allow specifying independent strides for
-    * the Y, U, and V planes, nor other alignment adjustments that
-    * might be representable by a YV12_BUFFER_CONFIG, so we just
-    * initialize all the fields.*/
+   * the Y, U, and V planes, nor other alignment adjustments that
+   * might be representable by a YV12_BUFFER_CONFIG, so we just
+   * initialize all the fields.*/
  img->fmt = VPX_IMG_FMT_I420;
  img->w = yv12->y_stride;
  img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
--- a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
@@ -0,0 +1,160 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void highbd_iadst4(int32x4_t *const io) {
+  const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
+  const int32x4_t sinpi = vld1q_s32(sinpis);
+  int32x4_t s[8];
+
+  s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
+  s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
+  s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
+  s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
+  s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
+  s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
+  s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
+  s[7] = vsubq_s32(io[0], io[2]);
+  s[7] = vaddq_s32(s[7], io[3]);
+
+  s[0] = vaddq_s32(s[0], s[3]);
+  s[0] = vaddq_s32(s[0], s[5]);
+  s[1] = vsubq_s32(s[1], s[4]);
+  s[1] = vsubq_s32(s[1], s[6]);
+  s[3] = s[2];
+  s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
+
+  io[0] = vaddq_s32(s[0], s[3]);
+  io[1] = vaddq_s32(s[1], s[3]);
+  io[2] = s[2];
+  io[3] = vaddq_s32(s[0], s[1]);
+  io[3] = vsubq_s32(io[3], s[3]);
+  io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
+  io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
+  io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
+  io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
+}
+
+void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int tx_type, int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  int16x8_t a[2];
+  int32x4_t c[4];
+
+  c[0] = vld1q_s32(input);
+  c[1] = vld1q_s32(input + 4);
+  c[2] = vld1q_s32(input + 8);
+  c[3] = vld1q_s32(input + 12);
+
+  if (bd == 8) {
+    a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1]));
+    a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3]));
+    transpose_s16_4x4q(&a[0], &a[1]);
+
+    switch (tx_type) {
+      case DCT_DCT:
+        idct4x4_16_kernel_bd8(a);
+        a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+        transpose_s16_4x4q(&a[0], &a[1]);
+        idct4x4_16_kernel_bd8(a);
+        a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+        break;
+
+      case ADST_DCT:
+        idct4x4_16_kernel_bd8(a);
+        a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+        transpose_s16_4x4q(&a[0], &a[1]);
+        iadst4(a);
+        break;
+
+      case DCT_ADST:
+        iadst4(a);
+        transpose_s16_4x4q(&a[0], &a[1]);
+        idct4x4_16_kernel_bd8(a);
+        a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+        break;
+
+      default:
+        assert(tx_type == ADST_ADST);
+        iadst4(a);
+        transpose_s16_4x4q(&a[0], &a[1]);
+        iadst4(a);
+        break;
+    }
+    a[0] = vrshrq_n_s16(a[0], 4);
+    a[1] = vrshrq_n_s16(a[1], 4);
+  } else {
+    switch (tx_type) {
+      case DCT_DCT: {
+        const int32x4_t cospis = vld1q_s32(kCospi32);
+
+        if (bd == 10) {
+          idct4x4_16_kernel_bd10(cospis, c);
+          idct4x4_16_kernel_bd10(cospis, c);
+        } else {
+          idct4x4_16_kernel_bd12(cospis, c);
+          idct4x4_16_kernel_bd12(cospis, c);
+        }
+        break;
+      }
+
+      case ADST_DCT: {
+        const int32x4_t cospis = vld1q_s32(kCospi32);
+
+        if (bd == 10) {
+          idct4x4_16_kernel_bd10(cospis, c);
+        } else {
+          idct4x4_16_kernel_bd12(cospis, c);
+        }
+        transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+        highbd_iadst4(c);
+        break;
+      }
+
+      case DCT_ADST: {
+        const int32x4_t cospis = vld1q_s32(kCospi32);
+
+        transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+        highbd_iadst4(c);
+        if (bd == 10) {
+          idct4x4_16_kernel_bd10(cospis, c);
+        } else {
+          idct4x4_16_kernel_bd12(cospis, c);
+        }
+        break;
+      }
+
+      default: {
+        assert(tx_type == ADST_ADST);
+        transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+        highbd_iadst4(c);
+        transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+        highbd_iadst4(c);
+        break;
+      }
+    }
+    a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4));
+    a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4));
+  }
+
+  highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
+  highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max);
+}
--- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
+++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -14,206 +14,63 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/txfm_common.h"

-static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
-  int32x4_t q8s32, q9s32;
-  int16x4x2_t d0x2s16, d1x2s16;
-  int32x4x2_t q0x2s32;
-
-  d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
-  d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
-
-  q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
-  q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
-  q0x2s32 = vtrnq_s32(q8s32, q9s32);
-
-  *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
-  *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
-}
-
-static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
-                                             int16x4_t *d2s16) {
-  *d0s16 = vdup_n_s16(cospi_8_64);
-  *d1s16 = vdup_n_s16(cospi_16_64);
-  *d2s16 = vdup_n_s16(cospi_24_64);
-}
-
-static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
-                                           int16x4_t *d5s16, int16x8_t *q3s16) {
-  *d3s16 = vdup_n_s16(sinpi_1_9);
-  *d4s16 = vdup_n_s16(sinpi_2_9);
-  *q3s16 = vdupq_n_s16(sinpi_3_9);
-  *d5s16 = vdup_n_s16(sinpi_4_9);
-}
-
-static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
-                              int16x4_t *d2s16, int16x8_t *q8s16,
-                              int16x8_t *q9s16) {
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
-  int16x4_t d26s16, d27s16, d28s16, d29s16;
-  int32x4_t q10s32, q13s32, q14s32, q15s32;
-  int16x8_t q13s16, q14s16;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-
-  d23s16 = vadd_s16(d16s16, d18s16);
-  d24s16 = vsub_s16(d16s16, d18s16);
-
-  q15s32 = vmull_s16(d17s16, *d2s16);
-  q10s32 = vmull_s16(d17s16, *d0s16);
-  q13s32 = vmull_s16(d23s16, *d1s16);
-  q14s32 = vmull_s16(d24s16, *d1s16);
-  q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
-  q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
-
-  d26s16 = vrshrn_n_s32(q13s32, 14);
-  d27s16 = vrshrn_n_s32(q14s32, 14);
-  d29s16 = vrshrn_n_s32(q15s32, 14);
-  d28s16 = vrshrn_n_s32(q10s32, 14);
-
-  q13s16 = vcombine_s16(d26s16, d27s16);
-  q14s16 = vcombine_s16(d28s16, d29s16);
-  *q8s16 = vaddq_s16(q13s16, q14s16);
-  *q9s16 = vsubq_s16(q13s16, q14s16);
-  *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16));  // vswp
-}
-
-static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
-                               int16x4_t *d5s16, int16x8_t *q3s16,
-                               int16x8_t *q8s16, int16x8_t *q9s16) {
-  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
-  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
-
-  d6s16 = vget_low_s16(*q3s16);
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-
-  q10s32 = vmull_s16(*d3s16, d16s16);
-  q11s32 = vmull_s16(*d4s16, d16s16);
-  q12s32 = vmull_s16(d6s16, d17s16);
-  q13s32 = vmull_s16(*d5s16, d18s16);
-  q14s32 = vmull_s16(*d3s16, d18s16);
-  q15s32 = vmovl_s16(d16s16);
-  q15s32 = vaddw_s16(q15s32, d19s16);
-  q8s32 = vmull_s16(*d4s16, d19s16);
-  q15s32 = vsubw_s16(q15s32, d18s16);
-  q9s32 = vmull_s16(*d5s16, d19s16);
-
-  q10s32 = vaddq_s32(q10s32, q13s32);
-  q10s32 = vaddq_s32(q10s32, q8s32);
-  q11s32 = vsubq_s32(q11s32, q14s32);
-  q8s32 = vdupq_n_s32(sinpi_3_9);
-  q11s32 = vsubq_s32(q11s32, q9s32);
-  q15s32 = vmulq_s32(q15s32, q8s32);
-
-  q13s32 = vaddq_s32(q10s32, q12s32);
-  q10s32 = vaddq_s32(q10s32, q11s32);
-  q14s32 = vaddq_s32(q11s32, q12s32);
-  q10s32 = vsubq_s32(q10s32, q12s32);
-
-  d16s16 = vrshrn_n_s32(q13s32, 14);
-  d17s16 = vrshrn_n_s32(q14s32, 14);
-  d18s16 = vrshrn_n_s32(q15s32, 14);
-  d19s16 = vrshrn_n_s32(q10s32, 14);
-
-  *q8s16 = vcombine_s16(d16s16, d17s16);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-}
-
 void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
-  uint8x8_t d26u8, d27u8;
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
-  uint32x2_t d26u32, d27u32;
-  int16x8_t q3s16, q8s16, q9s16;
-  uint16x8_t q8u16, q9u16;
+  int16x8_t a[2];
+  uint8x8_t s[2], d[2];
+  uint16x8_t sum[2];

-  d26u32 = d27u32 = vdup_n_u32(0);
+  assert(!((intptr_t)dest % sizeof(uint32_t)));
+  assert(!(stride % sizeof(uint32_t)));

-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-
-  TRANSPOSE4X4(&q8s16, &q9s16);
+  a[0] = load_tran_low_to_s16q(input);
+  a[1] = load_tran_low_to_s16q(input + 8);
+  transpose_s16_4x4q(&a[0], &a[1]);

  switch (tx_type) {
-    case 0:  // idct_idct is not supported. Fall back to C
-      vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
-      return;
-    case 1:  // iadst_idct
-      // generate constants
-      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
-
-      // first transform rows
-      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+    case DCT_DCT:
+      idct4x4_16_kernel_bd8(a);
+      a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+      transpose_s16_4x4q(&a[0], &a[1]);
+      idct4x4_16_kernel_bd8(a);
+      a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
      break;
-    case 2:  // idct_iadst
-      // generate constantsyy
-      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

-      // first transform rows
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+    case ADST_DCT:
+      idct4x4_16_kernel_bd8(a);
+      a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+      transpose_s16_4x4q(&a[0], &a[1]);
+      iadst4(a);
      break;
-    case 3:  // iadst_iadst
-      // generate constants
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

-      // first transform rows
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+    case DCT_ADST:
+      iadst4(a);
+      transpose_s16_4x4q(&a[0], &a[1]);
+      idct4x4_16_kernel_bd8(a);
+      a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
      break;
-    default:  // iadst_idct
-      assert(0);
+
+    default:
+      assert(tx_type == ADST_ADST);
+      iadst4(a);
+      transpose_s16_4x4q(&a[0], &a[1]);
+      iadst4(a);
      break;
  }

-  q8s16 = vrshrq_n_s16(q8s16, 4);
-  q9s16 = vrshrq_n_s16(q9s16, 4);
-
-  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
-  dest += stride;
-  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
-  dest += stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
-  dest += stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
-
-  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
-  dest -= stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
-  dest -= stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
-  dest -= stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+  a[0] = vrshrq_n_s16(a[0], 4);
+  a[1] = vrshrq_n_s16(a[1], 4);
+  s[0] = load_u8(dest, stride);
+  s[1] = load_u8(dest + 2 * stride, stride);
+  sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]);
+  sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]);
+  d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0]));
+  d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1]));
+  store_u8(dest, stride, d[0]);
+  store_u8(dest + 2 * stride, stride, d[1]);
 }
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -14,527 +14,199 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"

-static int16_t cospi_2_64 = 16305;
-static int16_t cospi_4_64 = 16069;
-static int16_t cospi_6_64 = 15679;
-static int16_t cospi_8_64 = 15137;
-static int16_t cospi_10_64 = 14449;
-static int16_t cospi_12_64 = 13623;
-static int16_t cospi_14_64 = 12665;
-static int16_t cospi_16_64 = 11585;
-static int16_t cospi_18_64 = 10394;
-static int16_t cospi_20_64 = 9102;
-static int16_t cospi_22_64 = 7723;
-static int16_t cospi_24_64 = 6270;
-static int16_t cospi_26_64 = 4756;
-static int16_t cospi_28_64 = 3196;
-static int16_t cospi_30_64 = 1606;
+static INLINE void iadst_half_butterfly_neon(int16x8_t *const x,
+                                             const int16x4_t c) {
+  const int16x8_t sum = vaddq_s16(x[0], x[1]);
+  const int16x8_t sub = vsubq_s16(x[0], x[1]);
+  int32x4_t t0[2], t1[2];

-static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
-                              int16x8_t *q10s16, int16x8_t *q11s16,
-                              int16x8_t *q12s16, int16x8_t *q13s16,
-                              int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
-  d0s16 = vdup_n_s16(cospi_28_64);
-  d1s16 = vdup_n_s16(cospi_4_64);
-  d2s16 = vdup_n_s16(cospi_12_64);
-  d3s16 = vdup_n_s16(cospi_20_64);
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  q2s32 = vmull_s16(d18s16, d0s16);
-  q3s32 = vmull_s16(d19s16, d0s16);
-  q5s32 = vmull_s16(d26s16, d2s16);
-  q6s32 = vmull_s16(d27s16, d2s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
-  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
-  d8s16 = vrshrn_n_s32(q2s32, 14);
-  d9s16 = vrshrn_n_s32(q3s32, 14);
-  d10s16 = vrshrn_n_s32(q5s32, 14);
-  d11s16 = vrshrn_n_s32(q6s32, 14);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q2s32 = vmull_s16(d18s16, d1s16);
-  q3s32 = vmull_s16(d19s16, d1s16);
-  q9s32 = vmull_s16(d26s16, d3s16);
-  q13s32 = vmull_s16(d27s16, d3s16);
-
-  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
-  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
-  d14s16 = vrshrn_n_s32(q2s32, 14);
-  d15s16 = vrshrn_n_s32(q3s32, 14);
-  d12s16 = vrshrn_n_s32(q9s32, 14);
-  d13s16 = vrshrn_n_s32(q13s32, 14);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  d0s16 = vdup_n_s16(cospi_16_64);
-
-  q2s32 = vmull_s16(d16s16, d0s16);
-  q3s32 = vmull_s16(d17s16, d0s16);
-  q13s32 = vmull_s16(d16s16, d0s16);
-  q15s32 = vmull_s16(d17s16, d0s16);
-
-  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
-  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
-  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
-  d0s16 = vdup_n_s16(cospi_24_64);
-  d1s16 = vdup_n_s16(cospi_8_64);
-
-  d18s16 = vrshrn_n_s32(q2s32, 14);
-  d19s16 = vrshrn_n_s32(q3s32, 14);
-  d22s16 = vrshrn_n_s32(q13s32, 14);
-  d23s16 = vrshrn_n_s32(q15s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-  *q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q2s32 = vmull_s16(d20s16, d0s16);
-  q3s32 = vmull_s16(d21s16, d0s16);
-  q8s32 = vmull_s16(d20s16, d1s16);
-  q12s32 = vmull_s16(d21s16, d1s16);
-
-  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
-  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
-  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
-  d26s16 = vrshrn_n_s32(q2s32, 14);
-  d27s16 = vrshrn_n_s32(q3s32, 14);
-  d30s16 = vrshrn_n_s32(q8s32, 14);
-  d31s16 = vrshrn_n_s32(q12s32, 14);
-  *q13s16 = vcombine_s16(d26s16, d27s16);
-  *q15s16 = vcombine_s16(d30s16, d31s16);
-
-  q0s16 = vaddq_s16(*q9s16, *q15s16);
-  q1s16 = vaddq_s16(*q11s16, *q13s16);
-  q2s16 = vsubq_s16(*q11s16, *q13s16);
-  q3s16 = vsubq_s16(*q9s16, *q15s16);
-
-  *q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  *q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q7s16, q6s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-
-  d16s16 = vdup_n_s16(cospi_16_64);
-
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-  q11s32 = vmull_s16(d28s16, d16s16);
-  q12s32 = vmull_s16(d29s16, d16s16);
-
-  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
-  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-  d10s16 = vrshrn_n_s32(q9s32, 14);
-  d11s16 = vrshrn_n_s32(q10s32, 14);
-  d12s16 = vrshrn_n_s32(q11s32, 14);
-  d13s16 = vrshrn_n_s32(q12s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  *q8s16 = vaddq_s16(q0s16, q7s16);
-  *q9s16 = vaddq_s16(q1s16, q6s16);
-  *q10s16 = vaddq_s16(q2s16, q5s16);
-  *q11s16 = vaddq_s16(q3s16, q4s16);
-  *q12s16 = vsubq_s16(q3s16, q4s16);
-  *q13s16 = vsubq_s16(q2s16, q5s16);
-  *q14s16 = vsubq_s16(q1s16, q6s16);
-  *q15s16 = vsubq_s16(q0s16, q7s16);
+  t0[0] = vmull_lane_s16(vget_low_s16(sum), c, 0);
+  t0[1] = vmull_lane_s16(vget_high_s16(sum), c, 0);
+  t1[0] = vmull_lane_s16(vget_low_s16(sub), c, 0);
+  t1[1] = vmull_lane_s16(vget_high_s16(sub), c, 0);
+  x[0] = dct_const_round_shift_low_8(t0);
+  x[1] = dct_const_round_shift_low_8(t1);
 }

-static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
-                               int16x8_t *q10s16, int16x8_t *q11s16,
-                               int16x8_t *q12s16, int16x8_t *q13s16,
-                               int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q2s16, q4s16, q5s16, q6s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
-  int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,
+                                                 const int16x8_t in1,
+                                                 const int16x4_t c,
+                                                 int32x4_t *const s0,
+                                                 int32x4_t *const s1) {
+  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);

-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
+  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
+  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
+  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
+  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
+}

-  d14s16 = vdup_n_s16(cospi_2_64);
-  d15s16 = vdup_n_s16(cospi_30_64);
+static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0,
+                                                 const int16x8_t in1,
+                                                 const int16x4_t c,
+                                                 int32x4_t *const s0,
+                                                 int32x4_t *const s1) {
+  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);

-  q1s32 = vmull_s16(d30s16, d14s16);
-  q2s32 = vmull_s16(d31s16, d14s16);
-  q3s32 = vmull_s16(d30s16, d15s16);
-  q4s32 = vmull_s16(d31s16, d15s16);
+  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
+  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
+  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
+  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
+}

-  d30s16 = vdup_n_s16(cospi_18_64);
-  d31s16 = vdup_n_s16(cospi_14_64);
+static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0,
+                                                 const int16x8_t in1,
+                                                 const int16x4_t c,
+                                                 int32x4_t *const s0,
+                                                 int32x4_t *const s1) {
+  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);

-  q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
-  q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
-  q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
-  q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
+  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
+  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
+  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
+}

-  q5s32 = vmull_s16(d22s16, d30s16);
-  q6s32 = vmull_s16(d23s16, d30s16);
-  q7s32 = vmull_s16(d22s16, d31s16);
-  q8s32 = vmull_s16(d23s16, d31s16);
+static INLINE int16x8_t add_dct_const_round_shift_low_8(
+    const int32x4_t *const in0, const int32x4_t *const in1) {
+  int32x4_t sum[2];

-  q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
-  q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
-  q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
-  q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+  sum[0] = vaddq_s32(in0[0], in1[0]);
+  sum[1] = vaddq_s32(in0[1], in1[1]);
+  return dct_const_round_shift_low_8(sum);
+}

-  q11s32 = vaddq_s32(q1s32, q5s32);
-  q12s32 = vaddq_s32(q2s32, q6s32);
-  q1s32 = vsubq_s32(q1s32, q5s32);
-  q2s32 = vsubq_s32(q2s32, q6s32);
+static INLINE int16x8_t sub_dct_const_round_shift_low_8(
+    const int32x4_t *const in0, const int32x4_t *const in1) {
+  int32x4_t sum[2];

-  d22s16 = vrshrn_n_s32(q11s32, 14);
-  d23s16 = vrshrn_n_s32(q12s32, 14);
-  *q11s16 = vcombine_s16(d22s16, d23s16);
+  sum[0] = vsubq_s32(in0[0], in1[0]);
+  sum[1] = vsubq_s32(in0[1], in1[1]);
+  return dct_const_round_shift_low_8(sum);
+}

-  q12s32 = vaddq_s32(q3s32, q7s32);
-  q15s32 = vaddq_s32(q4s32, q8s32);
-  q3s32 = vsubq_s32(q3s32, q7s32);
-  q4s32 = vsubq_s32(q4s32, q8s32);
+static INLINE void iadst8(int16x8_t *const io) {
+  const int16x4_t c0 =
+      create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
+  const int16x4_t c1 =
+      create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
+  const int16x4_t c2 =
+      create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
+  int16x8_t x[8], t[4];
+  int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];

-  d2s16 = vrshrn_n_s32(q1s32, 14);
-  d3s16 = vrshrn_n_s32(q2s32, 14);
-  d24s16 = vrshrn_n_s32(q12s32, 14);
-  d25s16 = vrshrn_n_s32(q15s32, 14);
-  d6s16 = vrshrn_n_s32(q3s32, 14);
-  d7s16 = vrshrn_n_s32(q4s32, 14);
-  *q12s16 = vcombine_s16(d24s16, d25s16);
+  x[0] = io[7];
+  x[1] = io[0];
+  x[2] = io[5];
+  x[3] = io[2];
+  x[4] = io[3];
+  x[5] = io[4];
+  x[6] = io[1];
+  x[7] = io[6];

-  d0s16 = vdup_n_s16(cospi_10_64);
-  d1s16 = vdup_n_s16(cospi_22_64);
-  q4s32 = vmull_s16(d26s16, d0s16);
-  q5s32 = vmull_s16(d27s16, d0s16);
-  q2s32 = vmull_s16(d26s16, d1s16);
-  q6s32 = vmull_s16(d27s16, d1s16);
+  // stage 1
+  iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1);
+  iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3);
+  iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5);
+  iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7);

-  d30s16 = vdup_n_s16(cospi_26_64);
-  d31s16 = vdup_n_s16(cospi_6_64);
+  x[0] = add_dct_const_round_shift_low_8(s0, s4);
+  x[1] = add_dct_const_round_shift_low_8(s1, s5);
+  x[2] = add_dct_const_round_shift_low_8(s2, s6);
+  x[3] = add_dct_const_round_shift_low_8(s3, s7);
+  x[4] = sub_dct_const_round_shift_low_8(s0, s4);
+  x[5] = sub_dct_const_round_shift_low_8(s1, s5);
+  x[6] = sub_dct_const_round_shift_low_8(s2, s6);
+  x[7] = sub_dct_const_round_shift_low_8(s3, s7);

-  q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
-  q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
-  q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
-  q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+  // stage 2
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5);
+  iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6);

-  q0s32 = vmull_s16(d18s16, d30s16);
-  q13s32 = vmull_s16(d19s16, d30s16);
+  x[0] = vaddq_s16(t[0], t[2]);
+  x[1] = vaddq_s16(t[1], t[3]);
+  x[2] = vsubq_s16(t[0], t[2]);
+  x[3] = vsubq_s16(t[1], t[3]);
+  x[4] = add_dct_const_round_shift_low_8(s4, s6);
+  x[5] = add_dct_const_round_shift_low_8(s5, s7);
+  x[6] = sub_dct_const_round_shift_low_8(s4, s6);
+  x[7] = sub_dct_const_round_shift_low_8(s5, s7);

-  q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
-  q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+  // stage 3
+  iadst_half_butterfly_neon(x + 2, c2);
+  iadst_half_butterfly_neon(x + 6, c2);

-  q10s32 = vmull_s16(d18s16, d31s16);
-  q9s32 = vmull_s16(d19s16, d31s16);
-
-  q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
-  q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
-
-  q14s32 = vaddq_s32(q2s32, q10s32);
-  q15s32 = vaddq_s32(q6s32, q9s32);
-  q2s32 = vsubq_s32(q2s32, q10s32);
-  q6s32 = vsubq_s32(q6s32, q9s32);
-
-  d28s16 = vrshrn_n_s32(q14s32, 14);
-  d29s16 = vrshrn_n_s32(q15s32, 14);
-  d4s16 = vrshrn_n_s32(q2s32, 14);
-  d5s16 = vrshrn_n_s32(q6s32, 14);
-  *q14s16 = vcombine_s16(d28s16, d29s16);
-
-  q9s32 = vaddq_s32(q4s32, q0s32);
-  q10s32 = vaddq_s32(q5s32, q13s32);
-  q4s32 = vsubq_s32(q4s32, q0s32);
-  q5s32 = vsubq_s32(q5s32, q13s32);
-
-  d30s16 = vdup_n_s16(cospi_8_64);
-  d31s16 = vdup_n_s16(cospi_24_64);
-
-  d18s16 = vrshrn_n_s32(q9s32, 14);
-  d19s16 = vrshrn_n_s32(q10s32, 14);
-  d8s16 = vrshrn_n_s32(q4s32, 14);
-  d9s16 = vrshrn_n_s32(q5s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q5s32 = vmull_s16(d2s16, d30s16);
-  q6s32 = vmull_s16(d3s16, d30s16);
-  q7s32 = vmull_s16(d2s16, d31s16);
-  q0s32 = vmull_s16(d3s16, d31s16);
-
-  q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
-  q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
-  q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
-  q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
-
-  q1s32 = vmull_s16(d4s16, d30s16);
-  q3s32 = vmull_s16(d5s16, d30s16);
-  q10s32 = vmull_s16(d4s16, d31s16);
-  q2s32 = vmull_s16(d5s16, d31s16);
-
-  q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
-  q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
-  q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
-  q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
-
-  *q8s16 = vaddq_s16(*q11s16, *q9s16);
-  *q11s16 = vsubq_s16(*q11s16, *q9s16);
-  q4s16 = vaddq_s16(*q12s16, *q14s16);
-  *q12s16 = vsubq_s16(*q12s16, *q14s16);
-
-  q14s32 = vaddq_s32(q5s32, q1s32);
-  q15s32 = vaddq_s32(q6s32, q3s32);
-  q5s32 = vsubq_s32(q5s32, q1s32);
-  q6s32 = vsubq_s32(q6s32, q3s32);
-
-  d18s16 = vrshrn_n_s32(q14s32, 14);
-  d19s16 = vrshrn_n_s32(q15s32, 14);
-  d10s16 = vrshrn_n_s32(q5s32, 14);
-  d11s16 = vrshrn_n_s32(q6s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q1s32 = vaddq_s32(q7s32, q10s32);
-  q3s32 = vaddq_s32(q0s32, q2s32);
-  q7s32 = vsubq_s32(q7s32, q10s32);
-  q0s32 = vsubq_s32(q0s32, q2s32);
-
-  d28s16 = vrshrn_n_s32(q1s32, 14);
-  d29s16 = vrshrn_n_s32(q3s32, 14);
-  d14s16 = vrshrn_n_s32(q7s32, 14);
-  d15s16 = vrshrn_n_s32(q0s32, 14);
-  *q14s16 = vcombine_s16(d28s16, d29s16);
-
-  d30s16 = vdup_n_s16(cospi_16_64);
-
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  q2s32 = vmull_s16(d22s16, d30s16);
-  q3s32 = vmull_s16(d23s16, d30s16);
-  q13s32 = vmull_s16(d22s16, d30s16);
-  q1s32 = vmull_s16(d23s16, d30s16);
-
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
-  q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
-  q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
-  q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
-
-  d4s16 = vrshrn_n_s32(q2s32, 14);
-  d5s16 = vrshrn_n_s32(q3s32, 14);
-  d24s16 = vrshrn_n_s32(q13s32, 14);
-  d25s16 = vrshrn_n_s32(q1s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  *q12s16 = vcombine_s16(d24s16, d25s16);
-
-  q13s32 = vmull_s16(d10s16, d30s16);
-  q1s32 = vmull_s16(d11s16, d30s16);
-  q11s32 = vmull_s16(d10s16, d30s16);
-  q0s32 = vmull_s16(d11s16, d30s16);
-
-  q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
-  q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
-  q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
-  q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
-
-  d20s16 = vrshrn_n_s32(q13s32, 14);
-  d21s16 = vrshrn_n_s32(q1s32, 14);
-  d12s16 = vrshrn_n_s32(q11s32, 14);
-  d13s16 = vrshrn_n_s32(q0s32, 14);
-  *q10s16 = vcombine_s16(d20s16, d21s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  q5s16 = vdupq_n_s16(0);
-
-  *q9s16 = vsubq_s16(q5s16, *q9s16);
-  *q11s16 = vsubq_s16(q5s16, q2s16);
-  *q13s16 = vsubq_s16(q5s16, q6s16);
-  *q15s16 = vsubq_s16(q5s16, q4s16);
+  io[0] = x[0];
+  io[1] = vnegq_s16(x[4]);
+  io[2] = x[6];
+  io[3] = vnegq_s16(x[2]);
+  io[4] = x[3];
+  io[5] = vnegq_s16(x[7]);
+  io[6] = x[5];
+  io[7] = vnegq_s16(x[1]);
 }

 void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
-  int i;
-  uint8_t *d1, *d2;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  uint64x1_t d0u64, d1u64, d2u64, d3u64;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  uint16x8_t q8u16, q9u16, q10u16, q11u16;
+  const int16x8_t cospis = vld1q_s16(kCospi);
+  const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
+  const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
+  int16x8_t a[8];

-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-  q10s16 = vld1q_s16(input + 8 * 2);
-  q11s16 = vld1q_s16(input + 8 * 3);
-  q12s16 = vld1q_s16(input + 8 * 4);
-  q13s16 = vld1q_s16(input + 8 * 5);
-  q14s16 = vld1q_s16(input + 8 * 6);
-  q15s16 = vld1q_s16(input + 8 * 7);
+  a[0] = load_tran_low_to_s16q(input + 0 * 8);
+  a[1] = load_tran_low_to_s16q(input + 1 * 8);
+  a[2] = load_tran_low_to_s16q(input + 2 * 8);
+  a[3] = load_tran_low_to_s16q(input + 3 * 8);
+  a[4] = load_tran_low_to_s16q(input + 4 * 8);
+  a[5] = load_tran_low_to_s16q(input + 5 * 8);
+  a[6] = load_tran_low_to_s16q(input + 6 * 8);
+  a[7] = load_tran_low_to_s16q(input + 7 * 8);

-  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                    &q15s16);
+  transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);

  switch (tx_type) {
-    case 0:  // idct_idct is not supported. Fall back to C
-      vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
-      return;
-    case 1:  // iadst_idct
-      // generate IDCT constants
-      // GENERATE_IDCT_CONSTANTS
-
-      // first transform rows
-      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                 &q15s16);
-
-      // transpose the matrix
-      transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
-                        &q14s16, &q15s16);
-
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS
-
-      // then transform columns
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
+    case DCT_DCT:
+      idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
+      transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
      break;
-    case 2:  // idct_iadst
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS

-      // first transform rows
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
-
-      // transpose the matrix
-      transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
-                        &q14s16, &q15s16);
-
-      // generate IDCT constants
-      // GENERATE_IDCT_CONSTANTS
-
-      // then transform columns
-      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                 &q15s16);
+    case ADST_DCT:
+      idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
+      transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+      iadst8(a);
      break;
-    case 3:  // iadst_iadst
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS

-      // first transform rows
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
-
-      // transpose the matrix
-      transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
-                        &q14s16, &q15s16);
-
-      // then transform columns
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
+    case DCT_ADST:
+      iadst8(a);
+      transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
      break;
-    default:  // iadst_idct
-      assert(0);
+
+    default:
+      assert(tx_type == ADST_ADST);
+      iadst8(a);
+      transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+      iadst8(a);
      break;
  }

-  q8s16 = vrshrq_n_s16(q8s16, 5);
-  q9s16 = vrshrq_n_s16(q9s16, 5);
-  q10s16 = vrshrq_n_s16(q10s16, 5);
-  q11s16 = vrshrq_n_s16(q11s16, 5);
-  q12s16 = vrshrq_n_s16(q12s16, 5);
-  q13s16 = vrshrq_n_s16(q13s16, 5);
-  q14s16 = vrshrq_n_s16(q14s16, 5);
-  q15s16 = vrshrq_n_s16(q15s16, 5);
-
-  for (d1 = d2 = dest, i = 0; i < 2; i++) {
-    if (i != 0) {
-      q8s16 = q12s16;
-      q9s16 = q13s16;
-      q10s16 = q14s16;
-      q11s16 = q15s16;
-    }
-
-    d0u64 = vld1_u64((uint64_t *)d1);
-    d1 += stride;
-    d1u64 = vld1_u64((uint64_t *)d1);
-    d1 += stride;
-    d2u64 = vld1_u64((uint64_t *)d1);
-    d1 += stride;
-    d3u64 = vld1_u64((uint64_t *)d1);
-    d1 += stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-    q10u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-    q11u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-    d2 += stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-    d2 += stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += stride;
-  }
+  idct8x8_add8x8_neon(a, dest, stride);
 }
--- a/vp9/common/arm/neon/vp9_iht_neon.h
+++ b/vp9/common/arm/neon/vp9_iht_neon.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
+#define VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void iadst4(int16x8_t *const io) {
+  const int32x4_t c3 = vdupq_n_s32(sinpi_3_9);
+  int16x4_t x[4];
+  int32x4_t s[8], output[4];
+  const int16x4_t c =
+      create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9);
+
+  x[0] = vget_low_s16(io[0]);
+  x[1] = vget_low_s16(io[1]);
+  x[2] = vget_high_s16(io[0]);
+  x[3] = vget_high_s16(io[1]);
+
+  s[0] = vmull_lane_s16(x[0], c, 0);
+  s[1] = vmull_lane_s16(x[0], c, 1);
+  s[2] = vmull_lane_s16(x[1], c, 2);
+  s[3] = vmull_lane_s16(x[2], c, 3);
+  s[4] = vmull_lane_s16(x[2], c, 0);
+  s[5] = vmull_lane_s16(x[3], c, 1);
+  s[6] = vmull_lane_s16(x[3], c, 3);
+  s[7] = vaddl_s16(x[0], x[3]);
+  s[7] = vsubw_s16(s[7], x[2]);
+
+  s[0] = vaddq_s32(s[0], s[3]);
+  s[0] = vaddq_s32(s[0], s[5]);
+  s[1] = vsubq_s32(s[1], s[4]);
+  s[1] = vsubq_s32(s[1], s[6]);
+  s[3] = s[2];
+  s[2] = vmulq_s32(c3, s[7]);
+
+  output[0] = vaddq_s32(s[0], s[3]);
+  output[1] = vaddq_s32(s[1], s[3]);
+  output[2] = s[2];
+  output[3] = vaddq_s32(s[0], s[1]);
+  output[3] = vsubq_s32(output[3], s[3]);
+  dct_const_round_shift_low_8_dual(output, &io[0], &io[1]);
+}
+
+#endif  // VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254,
                                          177, 153, 140, 133, 130, 129 };
 #endif

+/* clang-format off */
 const uint8_t vp9_coefband_trans_8x8plus[1024] = {
  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
  // beyond MAXBAND_INDEX+1 all values are filled as 5
@@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 };
+/* clang-format on */

 const uint8_t vp9_coefband_trans_4x4[16] = {
  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
 // 128 lists of probabilities are stored for the following ONE node probs:
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
-
 #define COEFF_PROB_MODELS 255

 #define UNCONSTRAINED_NODES 3
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -186,16 +186,19 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] =
      { 93, 24, 99 },   // a split, l not split
      { 85, 119, 44 },  // l split, a not split
      { 62, 59, 67 },   // a/l both split
+
      // 16x16 -> 8x8
      { 149, 53, 53 },  // a/l both not split
      { 94, 20, 48 },   // a split, l not split
      { 83, 53, 24 },   // l split, a not split
      { 52, 18, 18 },   // a/l both split
+
      // 32x32 -> 16x16
      { 150, 40, 39 },  // a/l both not split
      { 78, 12, 26 },   // a split, l not split
      { 67, 33, 11 },   // l split, a not split
      { 24, 7, 5 },     // a/l both split
+
      // 64x64 -> 32x32
      { 174, 35, 49 },  // a/l both not split
      { 68, 11, 27 },   // a split, l not split
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
  18,          -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10,
 };

-const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-  -0, -1,
-};
+const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 };

 const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2,  -1,
                                                               4,  -2, -3 };
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1174,7 +1174,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
    }

    // Disable filtering on the leftmost column
-    border_mask = ~(mi_col == 0);
+    border_mask = ~(mi_col == 0 ? 1 : 0);
 #if CONFIG_VP9_HIGHBITDEPTH
    if (cm->use_highbitdepth) {
      highbd_filter_selectively_vert(
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -229,9 +229,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
        else
          pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
      } else {
-        pred_context = 1 +
-                       2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
-                            edge_mi->ref_frame[1] == GOLDEN_FRAME);
+        pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
+                                edge_mi->ref_frame[1] == GOLDEN_FRAME);
      }
    } else {  // inter/inter
      const int above_has_second = has_second_ref(above_mi);
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
 sub vp9_common_forward_decls() {
 print <<EOF
 /*
@@ -57,13 +67,13 @@ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *outp
 if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
  # Note that there are more specializations appended when
  # CONFIG_VP9_HIGHBITDEPTH is off.
-  specialize qw/vp9_iht4x4_16_add sse2/;
+  specialize qw/vp9_iht4x4_16_add neon sse2/;
  specialize qw/vp9_iht8x8_64_add sse2/;
  specialize qw/vp9_iht16x16_256_add sse2/;
  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
    # Note that these specializations are appended to the above ones.
-    specialize qw/vp9_iht4x4_16_add neon dspr2 msa/;
-    specialize qw/vp9_iht8x8_64_add neon dspr2 msa/;
+    specialize qw/vp9_iht4x4_16_add dspr2 msa/;
+    specialize qw/vp9_iht8x8_64_add dspr2 msa/;
    specialize qw/vp9_iht16x16_256_add dspr2 msa/;
  }
 }
@@ -91,6 +101,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";

  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
+
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+    specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
+    specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
+    specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
+  }
 }

 #
@@ -113,7 +129,7 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_
 add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";

 add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp neon sse2 avx2/, "$ssse3_x86_64";

 add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64";
--- a/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
+++ b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
@@ -0,0 +1,419 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+                                                      const int c,
+                                                      __m128i *const s) {
+  const __m128i pair_c = pair_set_epi32(4 * c, 0);
+  __m128i x[2];
+
+  extend_64bit(in, x);
+  s[0] = _mm_mul_epi32(pair_c, x[0]);
+  s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+                                                 const __m128i in1,
+                                                 const int c0, const int c1,
+                                                 __m128i *const s0,
+                                                 __m128i *const s1) {
+  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+  __m128i t00[2], t01[2], t10[2], t11[2];
+  __m128i x0[2], x1[2];
+
+  extend_64bit(in0, x0);
+  extend_64bit(in1, x1);
+  t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+  t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+  t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+  t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+  t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+  t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+  t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+  t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+  s0[0] = _mm_add_epi64(t00[0], t11[0]);
+  s0[1] = _mm_add_epi64(t00[1], t11[1]);
+  s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+  s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
+      s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
+      x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
+
+  // stage 1
+  highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
+  highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
+  highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
+  highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
+  highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
+  highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
+                                s11);
+  highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
+                                s13);
+  highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
+                                s15);
+
+  x0[0] = _mm_add_epi64(s0[0], s8[0]);
+  x0[1] = _mm_add_epi64(s0[1], s8[1]);
+  x1[0] = _mm_add_epi64(s1[0], s9[0]);
+  x1[1] = _mm_add_epi64(s1[1], s9[1]);
+  x2[0] = _mm_add_epi64(s2[0], s10[0]);
+  x2[1] = _mm_add_epi64(s2[1], s10[1]);
+  x3[0] = _mm_add_epi64(s3[0], s11[0]);
+  x3[1] = _mm_add_epi64(s3[1], s11[1]);
+  x4[0] = _mm_add_epi64(s4[0], s12[0]);
+  x4[1] = _mm_add_epi64(s4[1], s12[1]);
+  x5[0] = _mm_add_epi64(s5[0], s13[0]);
+  x5[1] = _mm_add_epi64(s5[1], s13[1]);
+  x6[0] = _mm_add_epi64(s6[0], s14[0]);
+  x6[1] = _mm_add_epi64(s6[1], s14[1]);
+  x7[0] = _mm_add_epi64(s7[0], s15[0]);
+  x7[1] = _mm_add_epi64(s7[1], s15[1]);
+  x8[0] = _mm_sub_epi64(s0[0], s8[0]);
+  x8[1] = _mm_sub_epi64(s0[1], s8[1]);
+  x9[0] = _mm_sub_epi64(s1[0], s9[0]);
+  x9[1] = _mm_sub_epi64(s1[1], s9[1]);
+  x10[0] = _mm_sub_epi64(s2[0], s10[0]);
+  x10[1] = _mm_sub_epi64(s2[1], s10[1]);
+  x11[0] = _mm_sub_epi64(s3[0], s11[0]);
+  x11[1] = _mm_sub_epi64(s3[1], s11[1]);
+  x12[0] = _mm_sub_epi64(s4[0], s12[0]);
+  x12[1] = _mm_sub_epi64(s4[1], s12[1]);
+  x13[0] = _mm_sub_epi64(s5[0], s13[0]);
+  x13[1] = _mm_sub_epi64(s5[1], s13[1]);
+  x14[0] = _mm_sub_epi64(s6[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s6[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s7[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s7[1], s15[1]);
+
+  x0[0] = dct_const_round_shift_64bit(x0[0]);
+  x0[1] = dct_const_round_shift_64bit(x0[1]);
+  x1[0] = dct_const_round_shift_64bit(x1[0]);
+  x1[1] = dct_const_round_shift_64bit(x1[1]);
+  x2[0] = dct_const_round_shift_64bit(x2[0]);
+  x2[1] = dct_const_round_shift_64bit(x2[1]);
+  x3[0] = dct_const_round_shift_64bit(x3[0]);
+  x3[1] = dct_const_round_shift_64bit(x3[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  x8[0] = dct_const_round_shift_64bit(x8[0]);
+  x8[1] = dct_const_round_shift_64bit(x8[1]);
+  x9[0] = dct_const_round_shift_64bit(x9[0]);
+  x9[1] = dct_const_round_shift_64bit(x9[1]);
+  x10[0] = dct_const_round_shift_64bit(x10[0]);
+  x10[1] = dct_const_round_shift_64bit(x10[1]);
+  x11[0] = dct_const_round_shift_64bit(x11[0]);
+  x11[1] = dct_const_round_shift_64bit(x11[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x0[0] = pack_4(x0[0], x0[1]);
+  x1[0] = pack_4(x1[0], x1[1]);
+  x2[0] = pack_4(x2[0], x2[1]);
+  x3[0] = pack_4(x3[0], x3[1]);
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x8[0] = pack_4(x8[0], x8[1]);
+  x9[0] = pack_4(x9[0], x9[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 2
+  s0[0] = x0[0];
+  s1[0] = x1[0];
+  s2[0] = x2[0];
+  s3[0] = x3[0];
+  s4[0] = x4[0];
+  s5[0] = x5[0];
+  s6[0] = x6[0];
+  s7[0] = x7[0];
+  x0[0] = _mm_add_epi32(s0[0], s4[0]);
+  x1[0] = _mm_add_epi32(s1[0], s5[0]);
+  x2[0] = _mm_add_epi32(s2[0], s6[0]);
+  x3[0] = _mm_add_epi32(s3[0], s7[0]);
+  x4[0] = _mm_sub_epi32(s0[0], s4[0]);
+  x5[0] = _mm_sub_epi32(s1[0], s5[0]);
+  x6[0] = _mm_sub_epi32(s2[0], s6[0]);
+  x7[0] = _mm_sub_epi32(s3[0], s7[0]);
+
+  highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
+  highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
+                                s11);
+  highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
+                                s12);
+  highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
+                                s14);
+
+  x8[0] = _mm_add_epi64(s8[0], s12[0]);
+  x8[1] = _mm_add_epi64(s8[1], s12[1]);
+  x9[0] = _mm_add_epi64(s9[0], s13[0]);
+  x9[1] = _mm_add_epi64(s9[1], s13[1]);
+  x10[0] = _mm_add_epi64(s10[0], s14[0]);
+  x10[1] = _mm_add_epi64(s10[1], s14[1]);
+  x11[0] = _mm_add_epi64(s11[0], s15[0]);
+  x11[1] = _mm_add_epi64(s11[1], s15[1]);
+  x12[0] = _mm_sub_epi64(s8[0], s12[0]);
+  x12[1] = _mm_sub_epi64(s8[1], s12[1]);
+  x13[0] = _mm_sub_epi64(s9[0], s13[0]);
+  x13[1] = _mm_sub_epi64(s9[1], s13[1]);
+  x14[0] = _mm_sub_epi64(s10[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s10[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s11[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s11[1], s15[1]);
+  x8[0] = dct_const_round_shift_64bit(x8[0]);
+  x8[1] = dct_const_round_shift_64bit(x8[1]);
+  x9[0] = dct_const_round_shift_64bit(x9[0]);
+  x9[1] = dct_const_round_shift_64bit(x9[1]);
+  x10[0] = dct_const_round_shift_64bit(x10[0]);
+  x10[1] = dct_const_round_shift_64bit(x10[1]);
+  x11[0] = dct_const_round_shift_64bit(x11[0]);
+  x11[1] = dct_const_round_shift_64bit(x11[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x8[0] = pack_4(x8[0], x8[1]);
+  x9[0] = pack_4(x9[0], x9[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 3
+  s0[0] = x0[0];
+  s1[0] = x1[0];
+  s2[0] = x2[0];
+  s3[0] = x3[0];
+  highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+  highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+  s8[0] = x8[0];
+  s9[0] = x9[0];
+  s10[0] = x10[0];
+  s11[0] = x11[0];
+  highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
+                                s13);
+  highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
+                                s14);
+
+  x0[0] = _mm_add_epi32(s0[0], s2[0]);
+  x1[0] = _mm_add_epi32(s1[0], s3[0]);
+  x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+  x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+  x4[0] = _mm_add_epi64(s4[0], s6[0]);
+  x4[1] = _mm_add_epi64(s4[1], s6[1]);
+  x5[0] = _mm_add_epi64(s5[0], s7[0]);
+  x5[1] = _mm_add_epi64(s5[1], s7[1]);
+  x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+  x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+  x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+  x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x8[0] = _mm_add_epi32(s8[0], s10[0]);
+  x9[0] = _mm_add_epi32(s9[0], s11[0]);
+  x10[0] = _mm_sub_epi32(s8[0], s10[0]);
+  x11[0] = _mm_sub_epi32(s9[0], s11[0]);
+  x12[0] = _mm_add_epi64(s12[0], s14[0]);
+  x12[1] = _mm_add_epi64(s12[1], s14[1]);
+  x13[0] = _mm_add_epi64(s13[0], s15[0]);
+  x13[1] = _mm_add_epi64(s13[1], s15[1]);
+  x14[0] = _mm_sub_epi64(s12[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s12[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s13[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s13[1], s15[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 4
+  s2[0] = _mm_add_epi32(x2[0], x3[0]);
+  s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+  s6[0] = _mm_add_epi32(x7[0], x6[0]);
+  s7[0] = _mm_sub_epi32(x7[0], x6[0]);
+  s10[0] = _mm_add_epi32(x11[0], x10[0]);
+  s11[0] = _mm_sub_epi32(x11[0], x10[0]);
+  s14[0] = _mm_add_epi32(x14[0], x15[0]);
+  s15[0] = _mm_sub_epi32(x14[0], x15[0]);
+  highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
+  highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+  highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+  highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+  highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
+  highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
+  highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
+  highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
+
+  x2[0] = dct_const_round_shift_64bit(s2[0]);
+  x2[1] = dct_const_round_shift_64bit(s2[1]);
+  x3[0] = dct_const_round_shift_64bit(s3[0]);
+  x3[1] = dct_const_round_shift_64bit(s3[1]);
+  x6[0] = dct_const_round_shift_64bit(s6[0]);
+  x6[1] = dct_const_round_shift_64bit(s6[1]);
+  x7[0] = dct_const_round_shift_64bit(s7[0]);
+  x7[1] = dct_const_round_shift_64bit(s7[1]);
+  x10[0] = dct_const_round_shift_64bit(s10[0]);
+  x10[1] = dct_const_round_shift_64bit(s10[1]);
+  x11[0] = dct_const_round_shift_64bit(s11[0]);
+  x11[1] = dct_const_round_shift_64bit(s11[1]);
+  x14[0] = dct_const_round_shift_64bit(s14[0]);
+  x14[1] = dct_const_round_shift_64bit(s14[1]);
+  x15[0] = dct_const_round_shift_64bit(s15[0]);
+  x15[1] = dct_const_round_shift_64bit(s15[1]);
+  x2[0] = pack_4(x2[0], x2[1]);
+  x3[0] = pack_4(x3[0], x3[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  io[0] = x0[0];
+  io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
+  io[2] = x12[0];
+  io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+  io[4] = x6[0];
+  io[5] = x14[0];
+  io[6] = x10[0];
+  io[7] = x2[0];
+  io[8] = x3[0];
+  io[9] = x11[0];
+  io[10] = x15[0];
+  io[11] = x7[0];
+  io[12] = x5[0];
+  io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
+  io[14] = x9[0];
+  io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int tx_type, int bd) {
+  int i;
+  __m128i out[16], *in;
+
+  if (bd == 8) {
+    __m128i l[16], r[16];
+
+    in = l;
+    for (i = 0; i < 2; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+      if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+        idct16_8col(in, in);
+      } else {
+        vpx_iadst16_8col_sse2(in);
+      }
+      in = r;
+      input += 128;
+    }
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(l + i, out);
+      transpose_16bit_8x8(r + i, out + 8);
+      if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+        idct16_8col(out, out);
+      } else {
+        vpx_iadst16_8col_sse2(out);
+      }
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[4][16];
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+      if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+        vpx_highbd_idct16_4col_sse4_1(in);
+      } else {
+        highbd_iadst16_4col_sse4_1(in);
+      }
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+        vpx_highbd_idct16_4col_sse4_1(out);
+      } else {
+        highbd_iadst16_4col_sse4_1(out);
+      }
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
--- a/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
+++ b/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
@@ -0,0 +1,131 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst4_sse4_1(__m128i *const io) {
+  const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0);
+  const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0);
+  const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0);
+  const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0);
+  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2];
+  __m128i temp[2];
+
+  transpose_32bit_4x4(io, io);
+
+  extend_64bit(io[0], temp);
+  s0[0] = _mm_mul_epi32(pair_c1, temp[0]);
+  s0[1] = _mm_mul_epi32(pair_c1, temp[1]);
+  s1[0] = _mm_mul_epi32(pair_c2, temp[0]);
+  s1[1] = _mm_mul_epi32(pair_c2, temp[1]);
+
+  extend_64bit(io[1], temp);
+  s2[0] = _mm_mul_epi32(pair_c3, temp[0]);
+  s2[1] = _mm_mul_epi32(pair_c3, temp[1]);
+
+  extend_64bit(io[2], temp);
+  s3[0] = _mm_mul_epi32(pair_c4, temp[0]);
+  s3[1] = _mm_mul_epi32(pair_c4, temp[1]);
+  s4[0] = _mm_mul_epi32(pair_c1, temp[0]);
+  s4[1] = _mm_mul_epi32(pair_c1, temp[1]);
+
+  extend_64bit(io[3], temp);
+  s5[0] = _mm_mul_epi32(pair_c2, temp[0]);
+  s5[1] = _mm_mul_epi32(pair_c2, temp[1]);
+  s6[0] = _mm_mul_epi32(pair_c4, temp[0]);
+  s6[1] = _mm_mul_epi32(pair_c4, temp[1]);
+
+  t0[0] = _mm_add_epi64(s0[0], s3[0]);
+  t0[1] = _mm_add_epi64(s0[1], s3[1]);
+  t0[0] = _mm_add_epi64(t0[0], s5[0]);
+  t0[1] = _mm_add_epi64(t0[1], s5[1]);
+  t1[0] = _mm_sub_epi64(s1[0], s4[0]);
+  t1[1] = _mm_sub_epi64(s1[1], s4[1]);
+  t1[0] = _mm_sub_epi64(t1[0], s6[0]);
+  t1[1] = _mm_sub_epi64(t1[1], s6[1]);
+  temp[0] = _mm_sub_epi32(io[0], io[2]);
+  temp[0] = _mm_add_epi32(temp[0], io[3]);
+  extend_64bit(temp[0], temp);
+  t2[0] = _mm_mul_epi32(pair_c3, temp[0]);
+  t2[1] = _mm_mul_epi32(pair_c3, temp[1]);
+
+  s0[0] = _mm_add_epi64(t0[0], s2[0]);
+  s0[1] = _mm_add_epi64(t0[1], s2[1]);
+  s1[0] = _mm_add_epi64(t1[0], s2[0]);
+  s1[1] = _mm_add_epi64(t1[1], s2[1]);
+  s3[0] = _mm_add_epi64(t0[0], t1[0]);
+  s3[1] = _mm_add_epi64(t0[1], t1[1]);
+  s3[0] = _mm_sub_epi64(s3[0], s2[0]);
+  s3[1] = _mm_sub_epi64(s3[1], s2[1]);
+
+  s0[0] = dct_const_round_shift_64bit(s0[0]);
+  s0[1] = dct_const_round_shift_64bit(s0[1]);
+  s1[0] = dct_const_round_shift_64bit(s1[0]);
+  s1[1] = dct_const_round_shift_64bit(s1[1]);
+  s2[0] = dct_const_round_shift_64bit(t2[0]);
+  s2[1] = dct_const_round_shift_64bit(t2[1]);
+  s3[0] = dct_const_round_shift_64bit(s3[0]);
+  s3[1] = dct_const_round_shift_64bit(s3[1]);
+  io[0] = pack_4(s0[0], s0[1]);
+  io[1] = pack_4(s1[0], s1[1]);
+  io[2] = pack_4(s2[0], s2[1]);
+  io[3] = pack_4(s3[0], s3[1]);
+}
+
+void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int tx_type, int bd) {
+  __m128i io[4];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 8));
+  io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+  if (bd == 8) {
+    __m128i io_short[2];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[1]);
+    io_short[1] = _mm_packs_epi32(io[2], io[3]);
+    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+      idct4_sse2(io_short);
+    } else {
+      iadst4_sse2(io_short);
+    }
+    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+      idct4_sse2(io_short);
+    } else {
+      iadst4_sse2(io_short);
+    }
+    io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+    io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+    io[0] = _mm_srai_epi16(io_short[0], 4);
+    io[1] = _mm_srai_epi16(io_short[1], 4);
+  } else {
+    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+      highbd_idct4_sse4_1(io);
+    } else {
+      highbd_iadst4_sse4_1(io);
+    }
+    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+      highbd_idct4_sse4_1(io);
+    } else {
+      highbd_iadst4_sse4_1(io);
+    }
+    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+  }
+
+  recon_and_store_4x4(io, dest, stride, bd);
+}
--- a/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
+++ b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
@@ -0,0 +1,255 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+                                                      const int c,
+                                                      __m128i *const s) {
+  const __m128i pair_c = pair_set_epi32(4 * c, 0);
+  __m128i x[2];
+
+  extend_64bit(in, x);
+  s[0] = _mm_mul_epi32(pair_c, x[0]);
+  s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+                                                 const __m128i in1,
+                                                 const int c0, const int c1,
+                                                 __m128i *const s0,
+                                                 __m128i *const s1) {
+  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+  __m128i t00[2], t01[2], t10[2], t11[2];
+  __m128i x0[2], x1[2];
+
+  extend_64bit(in0, x0);
+  extend_64bit(in1, x1);
+  t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+  t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+  t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+  t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+  t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+  t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+  t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+  t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+  s0[0] = _mm_add_epi64(t00[0], t11[0]);
+  s0[1] = _mm_add_epi64(t00[1], t11[1]);
+  s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+  s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst8_sse4_1(__m128i *const io) {
+  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+
+  transpose_32bit_4x4x2(io, io);
+
+  // stage 1
+  highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
+  highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
+  x0[0] = _mm_add_epi64(s0[0], s4[0]);
+  x0[1] = _mm_add_epi64(s0[1], s4[1]);
+  x1[0] = _mm_add_epi64(s1[0], s5[0]);
+  x1[1] = _mm_add_epi64(s1[1], s5[1]);
+  x4[0] = _mm_sub_epi64(s0[0], s4[0]);
+  x4[1] = _mm_sub_epi64(s0[1], s4[1]);
+  x5[0] = _mm_sub_epi64(s1[0], s5[0]);
+  x5[1] = _mm_sub_epi64(s1[1], s5[1]);
+
+  highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
+  highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
+  x2[0] = _mm_add_epi64(s2[0], s6[0]);
+  x2[1] = _mm_add_epi64(s2[1], s6[1]);
+  x3[0] = _mm_add_epi64(s3[0], s7[0]);
+  x3[1] = _mm_add_epi64(s3[1], s7[1]);
+  x6[0] = _mm_sub_epi64(s2[0], s6[0]);
+  x6[1] = _mm_sub_epi64(s2[1], s6[1]);
+  x7[0] = _mm_sub_epi64(s3[0], s7[0]);
+  x7[1] = _mm_sub_epi64(s3[1], s7[1]);
+
+  x0[0] = dct_const_round_shift_64bit(x0[0]);
+  x0[1] = dct_const_round_shift_64bit(x0[1]);
+  x1[0] = dct_const_round_shift_64bit(x1[0]);
+  x1[1] = dct_const_round_shift_64bit(x1[1]);
+  x2[0] = dct_const_round_shift_64bit(x2[0]);
+  x2[1] = dct_const_round_shift_64bit(x2[1]);
+  x3[0] = dct_const_round_shift_64bit(x3[0]);
+  x3[1] = dct_const_round_shift_64bit(x3[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  s0[0] = pack_4(x0[0], x0[1]);  // s0 = x0;
+  s1[0] = pack_4(x1[0], x1[1]);  // s1 = x1;
+  s2[0] = pack_4(x2[0], x2[1]);  // s2 = x2;
+  s3[0] = pack_4(x3[0], x3[1]);  // s3 = x3;
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+
+  // stage 2
+  x0[0] = _mm_add_epi32(s0[0], s2[0]);
+  x1[0] = _mm_add_epi32(s1[0], s3[0]);
+  x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+  x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+
+  highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+  highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+
+  x4[0] = _mm_add_epi64(s4[0], s6[0]);
+  x4[1] = _mm_add_epi64(s4[1], s6[1]);
+  x5[0] = _mm_add_epi64(s5[0], s7[0]);
+  x5[1] = _mm_add_epi64(s5[1], s7[1]);
+  x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+  x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+  x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+  x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+
+  // stage 3
+  s2[0] = _mm_add_epi32(x2[0], x3[0]);
+  s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+  s6[0] = _mm_add_epi32(x6[0], x7[0]);
+  s7[0] = _mm_sub_epi32(x6[0], x7[0]);
+  highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
+  highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+  highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+  highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+
+  x2[0] = dct_const_round_shift_64bit(s2[0]);
+  x2[1] = dct_const_round_shift_64bit(s2[1]);
+  x3[0] = dct_const_round_shift_64bit(s3[0]);
+  x3[1] = dct_const_round_shift_64bit(s3[1]);
+  x6[0] = dct_const_round_shift_64bit(s6[0]);
+  x6[1] = dct_const_round_shift_64bit(s6[1]);
+  x7[0] = dct_const_round_shift_64bit(s7[0]);
+  x7[1] = dct_const_round_shift_64bit(s7[1]);
+  x2[0] = pack_4(x2[0], x2[1]);
+  x3[0] = pack_4(x3[0], x3[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+
+  io[0] = x0[0];
+  io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+  io[2] = x6[0];
+  io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
+  io[4] = x3[0];
+  io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
+  io[6] = x5[0];
+  io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int tx_type, int bd) {
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+  io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+  io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+  io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+  io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+  io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+  io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+  io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+  io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+  io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[4]);
+    io_short[1] = _mm_packs_epi32(io[1], io[5]);
+    io_short[2] = _mm_packs_epi32(io[2], io[6]);
+    io_short[3] = _mm_packs_epi32(io[3], io[7]);
+    io_short[4] = _mm_packs_epi32(io[8], io[12]);
+    io_short[5] = _mm_packs_epi32(io[9], io[13]);
+    io_short[6] = _mm_packs_epi32(io[10], io[14]);
+    io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+      vpx_idct8_sse2(io_short);
+    } else {
+      iadst8_sse2(io_short);
+    }
+    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+      vpx_idct8_sse2(io_short);
+    } else {
+      iadst8_sse2(io_short);
+    }
+    round_shift_8x8(io_short, io);
+  } else {
+    __m128i temp[4];
+
+    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+      vpx_highbd_idct8x8_half1d_sse4_1(io);
+      vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+    } else {
+      highbd_iadst8_sse4_1(io);
+      highbd_iadst8_sse4_1(&io[8]);
+    }
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    io[4] = io[8];
+    io[5] = io[9];
+    io[6] = io[10];
+    io[7] = io[11];
+
+    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+      vpx_highbd_idct8x8_half1d_sse4_1(io);
+      io[8] = temp[0];
+      io[9] = temp[1];
+      io[10] = temp[2];
+      io[11] = temp[3];
+      vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+    } else {
+      highbd_iadst8_sse4_1(io);
+      io[8] = temp[0];
+      io[9] = temp[1];
+      io[10] = temp[2];
+      io[11] = temp[3];
+      highbd_iadst8_sse4_1(&io[8]);
+    }
+    highbd_idct8x8_final_round(io);
+  }
+  recon_and_store_8x8(io, dest, stride, bd);
+}
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -10,8 +10,6 @@

 #include "./vp9_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
-#include "vpx_ports/mem.h"

 void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
@@ -22,23 +20,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
  in[1] = load_input_data8(input + 8);

  switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
      idct4_sse2(in);
      idct4_sse2(in);
      break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
-    case 3:  // ADST_ADST
+    default:
+      assert(tx_type == ADST_ADST);
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
-    default: assert(0); break;
  }

  // Final round and shift
@@ -67,23 +65,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
  in[7] = load_input_data8(input + 8 * 7);

  switch (tx_type) {
-    case 0:  // DCT_DCT
-      idct8_sse2(in);
-      idct8_sse2(in);
+    case DCT_DCT:
+      vpx_idct8_sse2(in);
+      vpx_idct8_sse2(in);
      break;
-    case 1:  // ADST_DCT
-      idct8_sse2(in);
+    case ADST_DCT:
+      vpx_idct8_sse2(in);
      iadst8_sse2(in);
      break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
      iadst8_sse2(in);
-      idct8_sse2(in);
+      vpx_idct8_sse2(in);
      break;
-    case 3:  // ADST_ADST
+    default:
+      assert(tx_type == ADST_ADST);
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
-    default: assert(0); break;
  }

  // Final rounding and shift
@@ -201,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
  load_buffer_8x16(input, in1);

  switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
      idct16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
      idct16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
      iadst16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
-    case 3:  // ADST_ADST
+    default:
+      assert(tx_type == ADST_ADST);
      iadst16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
-    default: assert(0); break;
  }

  write_buffer_8x16(dest, in0, stride);
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -464,10 +464,6 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
      cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5);
    }
  }
-  if (cpi->svc.spatial_layer_id > 0) {
-    cr->motion_thresh = 4;
-    cr->rate_boost_fac = 12;
-  }
  if (cpi->oxcf.rc_mode == VPX_VBR) {
    // To be adjusted for VBR mode, e.g., based on gf period and boost.
    // For now use smaller qp-delta (than CBR), no second boosted seg, and
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -12,7 +12,10 @@
 #include "vp9/encoder/vp9_encoder.h"

 static const BLOCK_SIZE square[] = {
-  BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+  BLOCK_8X8,
+  BLOCK_16X16,
+  BLOCK_32X32,
+  BLOCK_64X64,
 };

 static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -189,11 +189,12 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
    int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
    int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv,
    int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx,
-    int use_svc) {
+    int use_svc, int spatial_layer) {
  const int sse_diff = (ctx->newmv_sse == UINT_MAX)
                           ? 0
                           : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
  int frame;
+  int denoise_layer_idx = 0;
  MACROBLOCKD *filter_mbd = &mb->e_mbd;
  MODE_INFO *mi = filter_mbd->mi[0];
  MODE_INFO saved_mi;
@@ -254,6 +255,10 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
      frame = lst_fb_idx + 1;
    else if (frame == GOLDEN_FRAME)
      frame = gld_fb_idx + 1;
+    // Shift for the second spatial layer.
+    if (num_spatial_layers - spatial_layer == 2)
+      frame = frame + denoiser->num_ref_frames;
+    denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
  }

  if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
@@ -289,18 +294,21 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
                  denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col);
  filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;

-  filter_mbd->plane[0].dst.buf =
-      block_start(denoiser->mc_running_avg_y.y_buffer,
-                  denoiser->mc_running_avg_y.y_stride, mi_row, mi_col);
-  filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
-  filter_mbd->plane[1].dst.buf =
-      block_start(denoiser->mc_running_avg_y.u_buffer,
-                  denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
-  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
-  filter_mbd->plane[2].dst.buf =
-      block_start(denoiser->mc_running_avg_y.v_buffer,
-                  denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
-  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
+  filter_mbd->plane[0].dst.buf = block_start(
+      denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer,
+      denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col);
+  filter_mbd->plane[0].dst.stride =
+      denoiser->mc_running_avg_y[denoise_layer_idx].y_stride;
+  filter_mbd->plane[1].dst.buf = block_start(
+      denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer,
+      denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
+  filter_mbd->plane[1].dst.stride =
+      denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
+  filter_mbd->plane[2].dst.buf = block_start(
+      denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer,
+      denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
+  filter_mbd->plane[2].dst.stride =
+      denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;

  set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
  vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
@@ -324,9 +332,17 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
  int zeromv_filter = 0;
  VP9_DENOISER *denoiser = &cpi->denoiser;
  VP9_DENOISER_DECISION decision = COPY_BLOCK;
-  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
-  YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
+
+  const int shift =
+      cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+          ? denoiser->num_ref_frames
+          : 0;
+  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
+  const int denoise_layer_index =
+      cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
+  YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
  uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+
  uint8_t *mc_avg_start =
      block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
  struct buf_2d src = mb->plane[0].src;
@@ -381,7 +397,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
        &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
        motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
        cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx,
-        cpi->gld_fb_idx, cpi->use_svc);
+        cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id);

  if (decision == FILTER_BLOCK) {
    decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
@@ -432,7 +448,8 @@ void vp9_denoiser_update_frame_info(
    VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
    int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
    int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
-    int svc_base_is_key) {
+    int svc_base_is_key, int second_spatial_layer) {
+  const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
  // Copy source into denoised reference buffers on KEY_FRAME or
  // if the just encoded frame was resized. For SVC, copy source if the base
  // spatial layer was key frame.
@@ -441,8 +458,8 @@ void vp9_denoiser_update_frame_info(
    int i;
    // Start at 1 so as not to overwrite the INTRA_FRAME
    for (i = 1; i < denoiser->num_ref_frames; ++i) {
-      if (denoiser->running_avg_y[i].buffer_alloc != NULL)
-        copy_frame(&denoiser->running_avg_y[i], &src);
+      if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
+        copy_frame(&denoiser->running_avg_y[i + shift], &src);
    }
    denoiser->reset = 0;
    return;
@@ -451,29 +468,29 @@ void vp9_denoiser_update_frame_info(
  // If more than one refresh occurs, must copy frame buffer.
  if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) {
    if (refresh_alt_ref_frame) {
-      copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1],
-                 &denoiser->running_avg_y[INTRA_FRAME]);
+      copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+                 &denoiser->running_avg_y[INTRA_FRAME + shift]);
    }
    if (refresh_golden_frame) {
-      copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1],
-                 &denoiser->running_avg_y[INTRA_FRAME]);
+      copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+                 &denoiser->running_avg_y[INTRA_FRAME + shift]);
    }
    if (refresh_last_frame) {
-      copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1],
-                 &denoiser->running_avg_y[INTRA_FRAME]);
+      copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+                 &denoiser->running_avg_y[INTRA_FRAME + shift]);
    }
  } else {
    if (refresh_alt_ref_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1],
-                        &denoiser->running_avg_y[INTRA_FRAME]);
+      swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+                        &denoiser->running_avg_y[INTRA_FRAME + shift]);
    }
    if (refresh_golden_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1],
-                        &denoiser->running_avg_y[INTRA_FRAME]);
+      swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+                        &denoiser->running_avg_y[INTRA_FRAME + shift]);
    }
    if (refresh_last_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1],
-                        &denoiser->running_avg_y[INTRA_FRAME]);
+      swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+                        &denoiser->running_avg_y[INTRA_FRAME + shift]);
    }
  }
 }
@@ -522,44 +539,90 @@ static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm,
 }

 int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
-                             int refresh_alt, int refresh_gld, int refresh_lst,
-                             int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) {
+                             int svc_buf_shift, int refresh_alt,
+                             int refresh_gld, int refresh_lst, int alt_fb_idx,
+                             int gld_fb_idx, int lst_fb_idx) {
  int fail = 0;
  if (refresh_alt) {
    // Increase the frame buffer index by 1 to map it to the buffer index in the
    // denoiser.
-    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, alt_fb_idx + 1);
+    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                           alt_fb_idx + 1 + svc_buf_shift);
    if (fail) return 1;
  }
  if (refresh_gld) {
-    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, gld_fb_idx + 1);
+    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                           gld_fb_idx + 1 + svc_buf_shift);
    if (fail) return 1;
  }
  if (refresh_lst) {
-    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, lst_fb_idx + 1);
+    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                           lst_fb_idx + 1 + svc_buf_shift);
    if (fail) return 1;
  }
  return 0;
 }

-int vp9_denoiser_alloc(VP9_COMMON *cm, int use_svc, VP9_DENOISER *denoiser,
-                       int width, int height, int ssx, int ssy,
+int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
+                       int use_svc, int noise_sen, int width, int height,
+                       int ssx, int ssy,
 #if CONFIG_VP9_HIGHBITDEPTH
                       int use_highbitdepth,
 #endif
                       int border) {
-  int i, fail, init_num_ref_frames;
+  int i, layer, fail, init_num_ref_frames;
  const int legacy_byte_alignment = 0;
+  int num_layers = 1;
+  int scaled_width = width;
+  int scaled_height = height;
+  if (use_svc) {
+    LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
+                                                svc->number_temporal_layers +
+                                            svc->temporal_layer_id];
+    get_layer_resolution(width, height, lc->scaling_factor_num,
+                         lc->scaling_factor_den, &scaled_width, &scaled_height);
+    // For SVC: only denoise at most 2 spatial (highest) layers.
+    if (noise_sen >= 2)
+      // Denoise from one spatial layer below the top.
+      svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0);
+    else
+      // Only denoise the top spatial layer.
+      svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0);
+    num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
+  }
  assert(denoiser != NULL);
-
  denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
  init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
+  denoiser->num_layers = num_layers;
+  CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+                  vpx_calloc(denoiser->num_ref_frames * num_layers,
+                             sizeof(denoiser->running_avg_y[0])));
  CHECK_MEM_ERROR(
-      cm, denoiser->running_avg_y,
-      vpx_calloc(denoiser->num_ref_frames, sizeof(denoiser->running_avg_y[0])));
-  for (i = 0; i < init_num_ref_frames; ++i) {
-    fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
-                                  ssx, ssy,
+      cm, denoiser->mc_running_avg_y,
+      vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
+
+  for (layer = 0; layer < num_layers; ++layer) {
+    const int denoise_width = (layer == 0) ? width : scaled_width;
+    const int denoise_height = (layer == 0) ? height : scaled_height;
+    for (i = 0; i < init_num_ref_frames; ++i) {
+      fail = vpx_alloc_frame_buffer(
+          &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer],
+          denoise_width, denoise_height, ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+          use_highbitdepth,
+#endif
+          border, legacy_byte_alignment);
+      if (fail) {
+        vp9_denoiser_free(denoiser);
+        return 1;
+      }
+#ifdef OUTPUT_YUV_DENOISED
+      make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+    }
+
+    fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer],
+                                  denoise_width, denoise_height, ssx, ssy,
 #if CONFIG_VP9_HIGHBITDEPTH
                                  use_highbitdepth,
 #endif
@@ -568,22 +631,10 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, int use_svc, VP9_DENOISER *denoiser,
      vp9_denoiser_free(denoiser);
      return 1;
    }
-#ifdef OUTPUT_YUV_DENOISED
-    make_grayscale(&denoiser->running_avg_y[i]);
-#endif
-  }
-
-  fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx,
-                                ssy,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                use_highbitdepth,
-#endif
-                                border, legacy_byte_alignment);
-  if (fail) {
-    vp9_denoiser_free(denoiser);
-    return 1;
  }

+  // denoiser->last_source only used for noise_estimation, so only for top
+  // layer.
  fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
 #if CONFIG_VP9_HIGHBITDEPTH
                                use_highbitdepth,
@@ -609,12 +660,18 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
    return;
  }
  denoiser->frame_buffer_initialized = 0;
-  for (i = 0; i < denoiser->num_ref_frames; ++i) {
+  for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
    vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
  }
  vpx_free(denoiser->running_avg_y);
  denoiser->running_avg_y = NULL;
-  vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
+
+  for (i = 0; i < denoiser->num_layers; ++i) {
+    vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
+  }
+
+  vpx_free(denoiser->mc_running_avg_y);
+  denoiser->mc_running_avg_y = NULL;
  vpx_free_frame_buffer(&denoiser->last_source);
 }

--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -44,11 +44,12 @@ typedef enum vp9_denoiser_level {

 typedef struct vp9_denoiser {
  YV12_BUFFER_CONFIG *running_avg_y;
-  YV12_BUFFER_CONFIG mc_running_avg_y;
+  YV12_BUFFER_CONFIG *mc_running_avg_y;
  YV12_BUFFER_CONFIG last_source;
  int frame_buffer_initialized;
  int reset;
  int num_ref_frames;
+  int num_layers;
  VP9_DENOISER_LEVEL denoising_level;
  VP9_DENOISER_LEVEL prev_denoising_level;
 } VP9_DENOISER;
@@ -66,12 +67,13 @@ typedef struct {
 } VP9_PICKMODE_CTX_DEN;

 struct VP9_COMP;
+struct SVC;

 void vp9_denoiser_update_frame_info(
    VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
    int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
    int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
-    int svc_base_is_key);
+    int svc_base_is_key, int second_spatial_layer);

 void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
                          int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
@@ -84,11 +86,13 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
                                     PICK_MODE_CONTEXT *ctx);

 int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
-                             int refresh_alt, int refresh_gld, int refresh_lst,
-                             int alt_fb_idx, int gld_fb_idx, int lst_fb_idx);
+                             int svc_buf_shift, int refresh_alt,
+                             int refresh_gld, int refresh_lst, int alt_fb_idx,
+                             int gld_fb_idx, int lst_fb_idx);

-int vp9_denoiser_alloc(VP9_COMMON *cm, int use_svc, VP9_DENOISER *denoiser,
-                       int width, int height, int ssx, int ssy,
+int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
+                       int use_svc, int noise_sen, int width, int height,
+                       int ssx, int ssy,
 #if CONFIG_VP9_HIGHBITDEPTH
                       int use_highbitdepth,
 #endif
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1513,9 +1513,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
          }
        }
      }
-      if (is_key_frame || (low_res &&
-                           vt.split[i].split[j].part_variances.none.variance >
-                               threshold_4x4avg)) {
+      if (is_key_frame ||
+          (low_res && vt.split[i].split[j].part_variances.none.variance >
+                          threshold_4x4avg)) {
        force_split[split_index] = 0;
        // Go down to 4x4 down-sampling for variance.
        variance4x4downsample[i2 + j] = 1;
@@ -3403,9 +3403,10 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,

        // Rate and distortion based partition search termination clause.
        if (!cpi->sf.ml_partition_search_early_termination &&
-            !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
-                                   (best_rdc.dist < dist_breakout_thr &&
-                                    best_rdc.rate < rate_breakout_thr))) {
+            !x->e_mbd.lossless &&
+            ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+             (best_rdc.dist < dist_breakout_thr &&
+              best_rdc.rate < rate_breakout_thr))) {
          do_rect = 0;
        }
      }
@@ -4620,8 +4621,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) {

  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
    if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
-    CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows *
-                                                   sizeof(*cpi->tile_data)));
+    CHECK_MEM_ERROR(
+        cm, cpi->tile_data,
+        vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
    cpi->allocated_tiles = tile_cols * tile_rows;

    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -50,7 +50,8 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
 }

 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
-  { 10, 6 }, { 8, 5 },
+  { 10, 6 },
+  { 8, 5 },
 };

 // 'num' can be negative, but 'shift' must be non-negative.
@@ -200,9 +201,9 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
          const int band_next = band_translate[i + 1];
          const int token_next =
              (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
-          unsigned int(
-              *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-              token_costs + band_next;
+          unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS]
+                                               [ENTROPY_TOKENS] =
+                                                   token_costs + band_next;
          token_cache[rc] = vp9_pt_energy_class[t0];
          ctx_next = get_coef_context(nb, token_cache, i + 1);
          token_tree_sel_next = (x == 0);
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -65,12 +65,12 @@
 #define AM_SEGMENT_ID_INACTIVE 7
 #define AM_SEGMENT_ID_ACTIVE 0

-#define ALTREF_HIGH_PRECISION_MV 1     // Whether to use high precision mv
-                                       //  for altref computation.
-#define HIGH_PRECISION_MV_QTHRESH 200  // Q threshold for high precision
-                                       // mv. Choose a very high value for
-                                       // now so that HIGH_PRECISION is always
-                                       // chosen.
+// Whether to use high precision mv for altref computation.
+#define ALTREF_HIGH_PRECISION_MV 1
+
+// Q threshold for high precision mv. Choose a very high value for now so that
+// HIGH_PRECISION is always chosen.
+#define HIGH_PRECISION_MV_QTHRESH 200

 #define FRAME_SIZE_FACTOR 128  // empirical params for context model threshold
 #define FRAME_RATE_FACTOR 8
@@ -437,34 +437,37 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) {

 /* clang-format off */
 const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
-  { LEVEL_1,   829440,      36864,    200,    400,    2, 1,  4,  8 },
-  { LEVEL_1_1, 2764800,     73728,    800,    1000,   2, 1,  4,  8 },
-  { LEVEL_2,   4608000,     122880,   1800,   1500,   2, 1,  4,  8 },
-  { LEVEL_2_1, 9216000,     245760,   3600,   2800,   2, 2,  4,  8 },
-  { LEVEL_3,   20736000,    552960,   7200,   6000,   2, 4,  4,  8 },
-  { LEVEL_3_1, 36864000,    983040,   12000,  10000,  2, 4,  4,  8 },
-  { LEVEL_4,   83558400,    2228224,  18000,  16000,  4, 4,  4,  8 },
-  { LEVEL_4_1, 160432128,   2228224,  30000,  18000,  4, 4,  5,  6 },
-  { LEVEL_5,   311951360,   8912896,  60000,  36000,  6, 8,  6,  4 },
-  { LEVEL_5_1, 588251136,   8912896,  120000, 46000,  8, 8,  10, 4 },
+  //         sample rate    size   breadth  bitrate  cpb
+  { LEVEL_1,   829440,      36864,    512,   200,    400,    2, 1,  4,  8 },
+  { LEVEL_1_1, 2764800,     73728,    768,   800,    1000,   2, 1,  4,  8 },
+  { LEVEL_2,   4608000,     122880,   960,   1800,   1500,   2, 1,  4,  8 },
+  { LEVEL_2_1, 9216000,     245760,   1344,  3600,   2800,   2, 2,  4,  8 },
+  { LEVEL_3,   20736000,    552960,   2048,  7200,   6000,   2, 4,  4,  8 },
+  { LEVEL_3_1, 36864000,    983040,   2752,  12000,  10000,  2, 4,  4,  8 },
+  { LEVEL_4,   83558400,    2228224,  4160,  18000,  16000,  4, 4,  4,  8 },
+  { LEVEL_4_1, 160432128,   2228224,  4160,  30000,  18000,  4, 4,  5,  6 },
+  { LEVEL_5,   311951360,   8912896,  8384,  60000,  36000,  6, 8,  6,  4 },
+  { LEVEL_5_1, 588251136,   8912896,  8384,  120000, 46000,  8, 8,  10, 4 },
  // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
  // they are finalized (currently tentative).
-  { LEVEL_5_2, 1176502272,  8912896,  180000, 90000,  8, 8,  10, 4 },
-  { LEVEL_6,   1176502272,  35651584, 180000, 90000,  8, 16, 10, 4 },
-  { LEVEL_6_1, 2353004544u, 35651584, 240000, 180000, 8, 16, 10, 4 },
-  { LEVEL_6_2, 4706009088u, 35651584, 480000, 360000, 8, 16, 10, 4 },
+  { LEVEL_5_2, 1176502272,  8912896,  8384,  180000, 90000,  8, 8,  10, 4 },
+  { LEVEL_6,   1176502272,  35651584, 16832, 180000, 90000,  8, 16, 10, 4 },
+  { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 },
+  { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 },
 };
 /* clang-format on */

-static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] =
-    { "The average bit-rate is too high.",
-      "The picture size is too large.",
-      "The luma sample rate is too large.",
-      "The CPB size is too large.",
-      "The compression ratio is too small",
-      "Too many column tiles are used.",
-      "The alt-ref distance is too small.",
-      "Too many reference buffers are used." };
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+  "The average bit-rate is too high.",
+  "The picture size is too large.",
+  "The picture width/height is too large.",
+  "The luma sample rate is too large.",
+  "The CPB size is too large.",
+  "The compression ratio is too small",
+  "Too many column tiles are used.",
+  "The alt-ref distance is too small.",
+  "Too many reference buffers are used."
+};

 static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
  switch (mode) {
@@ -544,6 +547,74 @@ static void apply_active_map(VP9_COMP *cpi) {
  }
 }

+static void apply_roi_map(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+  vpx_roi_map_t *roi = &cpi->roi;
+  const int *delta_q = roi->delta_q;
+  const int *delta_lf = roi->delta_lf;
+  const int *skip = roi->skip;
+  int ref_frame[8];
+  int internal_delta_q[MAX_SEGMENTS];
+  int i;
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+
+  // TODO(jianj): Investigate why ROI not working in speed < 5 or in non
+  // realtime mode.
+  if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return;
+  if (!roi->enabled) return;
+
+  memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame));
+
+  vp9_enable_segmentation(seg);
+  vp9_clearall_segfeatures(seg);
+  // Select delta coding method;
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols));
+
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    // Translate the external delta q values to internal values.
+    internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i]));
+    if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i];
+    vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q);
+    vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
+    if (internal_delta_q[i] != 0) {
+      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+      vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]);
+    }
+    if (delta_lf[i] != 0) {
+      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF);
+      vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]);
+    }
+    if (skip[i] != 0) {
+      vp9_enable_segfeature(seg, i, SEG_LVL_SKIP);
+      vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]);
+    }
+    if (ref_frame[i] >= 0) {
+      int valid_ref = 1;
+      // ALTREF is not used as reference for nonrd_pickmode with 0 lag.
+      if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode)
+        valid_ref = 0;
+      // If GOLDEN is selected, make sure it's set as reference.
+      if (ref_frame[i] == GOLDEN_FRAME &&
+          !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) {
+        valid_ref = 0;
+      }
+      // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are
+      // same reference.
+      if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0)
+        ref_frame[i] = LAST_FRAME;
+      if (valid_ref) {
+        vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME);
+        vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]);
+      }
+    }
+  }
+  roi->enabled = 1;
+}
+
 static void init_level_info(Vp9LevelInfo *level_info) {
  Vp9LevelStats *const level_stats = &level_info->level_stats;
  Vp9LevelSpec *const level_spec = &level_info->level_spec;
@@ -554,6 +625,13 @@ static void init_level_info(Vp9LevelInfo *level_info) {
  level_spec->min_altref_distance = INT_MAX;
 }

+static int check_seg_range(int seg_data[8], int range) {
+  return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range ||
+           abs(seg_data[2]) > range || abs(seg_data[3]) > range ||
+           abs(seg_data[4]) > range || abs(seg_data[5]) > range ||
+           abs(seg_data[6]) > range || abs(seg_data[7]) > range);
+}
+
 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
  int i;
  const Vp9LevelSpec *this_level;
@@ -566,6 +644,8 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
            (double)this_level->max_luma_sample_rate *
                (1 + SAMPLE_RATE_GRACE_P) ||
        level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
+        level_spec->max_luma_picture_breadth >
+            this_level->max_luma_picture_breadth ||
        level_spec->average_bitrate > this_level->average_bitrate ||
        level_spec->max_cpb_size > this_level->max_cpb_size ||
        level_spec->compression_ratio < this_level->compression_ratio ||
@@ -578,6 +658,61 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
  return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
 }

+int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
+                    unsigned int cols, int delta_q[8], int delta_lf[8],
+                    int skip[8], int ref_frame[8]) {
+  VP9_COMMON *cm = &cpi->common;
+  vpx_roi_map_t *roi = &cpi->roi;
+  const int range = 63;
+  const int ref_frame_range = 3;  // Alt-ref
+  const int skip_range = 1;
+  const int frame_rows = cpi->common.mi_rows;
+  const int frame_cols = cpi->common.mi_cols;
+
+  // Check number of rows and columns match
+  if (frame_rows != (int)rows || frame_cols != (int)cols) {
+    return -1;
+  }
+
+  if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
+      !check_seg_range(ref_frame, ref_frame_range) ||
+      !check_seg_range(skip, skip_range))
+    return -1;
+
+  // Also disable segmentation if no deltas are specified.
+  if (!map ||
+      (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] |
+         delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] |
+         delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] |
+         delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] |
+         skip[5] | skip[6] | skip[7]) &&
+       (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 &&
+        ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 &&
+        ref_frame[6] == -1 && ref_frame[7] == -1))) {
+    vp9_disable_segmentation(&cm->seg);
+    cpi->roi.enabled = 0;
+    return 0;
+  }
+
+  if (roi->roi_map) {
+    vpx_free(roi->roi_map);
+    roi->roi_map = NULL;
+  }
+  CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols));
+
+  // Copy to ROI sturcture in the compressor.
+  memcpy(roi->roi_map, map, rows * cols);
+  memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0]));
+  memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0]));
+  memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0]));
+  memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0]));
+  roi->enabled = 1;
+  roi->rows = rows;
+  roi->cols = cols;
+
+  return 0;
+}
+
 int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
                       int cols) {
  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
@@ -812,6 +947,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
  vpx_free(cpi->active_map.map);
  cpi->active_map.map = NULL;

+  vpx_free(cpi->roi.roi_map);
+  cpi->roi.roi_map = NULL;
+
  vpx_free(cpi->consec_zero_mv);
  cpi->consec_zero_mv = NULL;

@@ -1116,8 +1254,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {

  // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
  // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
-  // target of 1/4x1/4.
-  if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) {
+  // target of 1/4x1/4. number_spatial_layers must be greater than 2.
+  if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
+      cpi->svc.number_spatial_layers > 2) {
    cpi->svc.scaled_temp_is_alloc = 1;
    if (vpx_realloc_frame_buffer(
            &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1,
@@ -1219,8 +1358,8 @@ static void set_tile_limits(VP9_COMP *cpi) {
  }

  if (cpi->oxcf.target_level == LEVEL_AUTO) {
-    const uint32_t pic_size = cpi->common.width * cpi->common.height;
-    const int level_tile_cols = log_tile_cols_from_picsize_level(pic_size);
+    const int level_tile_cols =
+        log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
    if (cm->log2_tile_cols > level_tile_cols) {
      cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
    }
@@ -1848,6 +1987,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
           cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
      vp9_cyclic_refresh_reset_resize(cpi);
+    rc->rc_1_frame = 0;
+    rc->rc_2_frame = 0;
  }

  if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
@@ -1858,6 +1999,24 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
                                           (int)cpi->oxcf.target_bandwidth);
  }

+  // Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the
+  // configuration change has a large change in avg_frame_bandwidth.
+  // For SVC check for resetting based on spatial layer average bandwidth.
+  // Also reset buffer level to optimal level.
+  if (cm->current_video_frame > 0) {
+    if (cpi->use_svc) {
+      vp9_svc_check_reset_layer_rc_flag(cpi);
+    } else {
+      if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) ||
+          rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) {
+        rc->rc_1_frame = 0;
+        rc->rc_2_frame = 0;
+        rc->bits_off_target = rc->optimal_buffer_level;
+        rc->buffer_level = rc->optimal_buffer_level;
+      }
+    }
+  }
+
  cpi->alt_ref_source = NULL;
  rc->is_src_frame_alt_ref = 0;

@@ -1992,8 +2151,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,

  realloc_segmentation_maps(cpi);

-  CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols,
-                                                sizeof(cpi->skin_map[0])));
+  CHECK_MEM_ERROR(
+      cm, cpi->skin_map,
+      vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));

  CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());

@@ -2856,18 +3016,26 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
      cpi->denoiser.denoising_level > kDenLowLow) {
    int svc_base_is_key = 0;
+    int denoise_svc_second_layer = 0;
    if (cpi->use_svc) {
      int realloc_fail = 0;
+      const int svc_buf_shift =
+          cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+              ? cpi->denoiser.num_ref_frames
+              : 0;
      int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
                                   cpi->svc.temporal_layer_id,
                                   cpi->svc.number_temporal_layers);
      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
      svc_base_is_key = lc->is_key_frame;
-
-      // Check if we need to allocate extra buffers in the denoiser for
+      denoise_svc_second_layer =
+          cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1
+                                                                          : 0;
+      // Check if we need to allocate extra buffers in the denoiser
+      // for
      // refreshed frames.
      realloc_fail = vp9_denoiser_realloc_svc(
-          cm, &cpi->denoiser, cpi->refresh_alt_ref_frame,
+          cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame,
          cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx,
          cpi->gld_fb_idx, cpi->lst_fb_idx);
      if (realloc_fail)
@@ -2878,7 +3046,8 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
        &cpi->denoiser, *cpi->Source, cpi->common.frame_type,
        cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame,
        cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx,
-        cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key);
+        cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key,
+        denoise_svc_second_layer);
  }
 #endif
  if (is_one_pass_cbr_svc(cpi)) {
@@ -3313,8 +3482,9 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  if (cpi->oxcf.noise_sensitivity > 0 &&
      !cpi->denoiser.frame_buffer_initialized) {
-    if (vp9_denoiser_alloc(cm, cpi->use_svc, &cpi->denoiser, cm->width,
-                           cm->height, cm->subsampling_x, cm->subsampling_y,
+    if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc,
+                           cpi->oxcf.noise_sensitivity, cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
                           cm->use_highbitdepth,
 #endif
@@ -3595,6 +3765,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
    // it may be pretty bad for rate-control,
    // and I should handle it somehow
    vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
+  } else if (cpi->roi.enabled && cm->frame_type != KEY_FRAME) {
+    apply_roi_map(cpi);
  }

  apply_active_map(cpi);
@@ -4325,6 +4497,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
  struct segmentation *const seg = &cm->seg;
  TX_SIZE t;

+  // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
+  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
+      !cpi->svc.rc_drop_superframe && cpi->oxcf.target_bandwidth == 0) {
+    cpi->svc.skip_enhancement_layer = 1;
+    vp9_rc_postencode_update_drop_frame(cpi);
+    cpi->ext_refresh_frame_flags_pending = 0;
+    return;
+  }
+
  set_ext_overrides(cpi);
  vpx_clear_system_state();

@@ -4416,7 +4597,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
    if (vp9_rc_drop_frame(cpi) ||
        (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) {
      vp9_rc_postencode_update_drop_frame(cpi);
-      ++cm->current_video_frame;
      cpi->ext_refresh_frame_flags_pending = 0;
      cpi->svc.rc_drop_superframe = 1;
      cpi->last_frame_dropped = 1;
@@ -4829,6 +5009,7 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
  int i, idx;
  uint64_t luma_samples, dur_end;
  const uint32_t luma_pic_size = cm->width * cm->height;
+  const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height);
  LevelConstraint *const level_constraint = &cpi->level_constraint;
  const int8_t level_index = level_constraint->level_index;
  double cpb_data_size;
@@ -4932,6 +5113,11 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
    level_spec->max_luma_picture_size = luma_pic_size;
  }

+  // update max_luma_picture_breadth
+  if (luma_pic_breadth > level_spec->max_luma_picture_breadth) {
+    level_spec->max_luma_picture_breadth = luma_pic_breadth;
+  }
+
  // update compression_ratio
  level_spec->compression_ratio = (double)level_stats->total_uncompressed_size *
                                  cm->bit_depth /
@@ -4952,6 +5138,15 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
                         level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]);
    }

+    if (level_spec->max_luma_picture_breadth >
+        vp9_level_defs[level_index].max_luma_picture_breadth) {
+      level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE);
+      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                         "Failed to encode to the target level %d. %s",
+                         vp9_level_defs[level_index].level,
+                         level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]);
+    }
+
    if ((double)level_spec->max_luma_sample_rate >
        (double)vp9_level_defs[level_index].max_luma_sample_rate *
            (1 + SAMPLE_RATE_GRACE_P)) {
@@ -5152,8 +5347,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
      cm->intra_only = 0;
      // if the flags indicate intra frame, but if the current picture is for
      // non-zero spatial layer, it should not be an intra picture.
-      // TODO(Won Kap): this needs to change if per-layer intra frame is
-      // allowed.
      if ((source->flags & VPX_EFLAG_FORCE_KF) &&
          cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
        source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -383,6 +383,7 @@ typedef struct {
  VP9_LEVEL level;
  uint64_t max_luma_sample_rate;
  uint32_t max_luma_picture_size;
+  uint32_t max_luma_picture_breadth;
  double average_bitrate;  // in kilobits per second
  double max_cpb_size;     // in kilobits
  double compression_ratio;
@@ -422,14 +423,15 @@ typedef struct {

 typedef enum {
  BITRATE_TOO_LARGE = 0,
-  LUMA_PIC_SIZE_TOO_LARGE = 1,
-  LUMA_SAMPLE_RATE_TOO_LARGE = 2,
-  CPB_TOO_LARGE = 3,
-  COMPRESSION_RATIO_TOO_SMALL = 4,
-  TOO_MANY_COLUMN_TILE = 5,
-  ALTREF_DIST_TOO_SMALL = 6,
-  TOO_MANY_REF_BUFFER = 7,
-  TARGET_LEVEL_FAIL_IDS = 8
+  LUMA_PIC_SIZE_TOO_LARGE,
+  LUMA_PIC_BREADTH_TOO_LARGE,
+  LUMA_SAMPLE_RATE_TOO_LARGE,
+  CPB_TOO_LARGE,
+  COMPRESSION_RATIO_TOO_SMALL,
+  TOO_MANY_COLUMN_TILE,
+  ALTREF_DIST_TOO_SMALL,
+  TOO_MANY_REF_BUFFER,
+  TARGET_LEVEL_FAIL_IDS
 } TARGET_LEVEL_FAIL_ID;

 typedef struct {
@@ -721,6 +723,8 @@ typedef struct VP9_COMP {

  uint8_t *count_arf_frame_usage;
  uint8_t *count_lastgolden_frame_usage;
+
+  vpx_roi_map_t roi;
 } VP9_COMP;

 void vp9_initialize_enc(void);
@@ -866,9 +870,8 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {

 #if CONFIG_VP9_TEMPORAL_DENOISING
 static INLINE int denoise_svc(const struct VP9_COMP *const cpi) {
-  return (!cpi->use_svc ||
-          (cpi->use_svc &&
-           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+  return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >=
+                                                cpi->svc.first_layer_denoise));
 }
 #endif

@@ -920,10 +923,14 @@ static INLINE int get_level_index(VP9_LEVEL level) {

 // Return the log2 value of max column tiles corresponding to the level that
 // the picture size fits into.
-static INLINE int log_tile_cols_from_picsize_level(uint32_t pic_size) {
+static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
+                                                   uint32_t height) {
  int i;
+  const uint32_t pic_size = width * height;
+  const uint32_t pic_breadth = VPXMAX(width, height);
  for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
-    if (vp9_level_defs[i].max_luma_picture_size > pic_size) {
+    if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
+        vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
      return get_msb(vp9_level_defs[i].max_col_tiles);
    }
  }
@@ -932,6 +939,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t pic_size) {

 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);

+int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
+                    unsigned int cols, int delta_q[8], int delta_lf[8],
+                    int skip[8], int ref_frame[8]);
+
 void vp9_new_framerate(VP9_COMP *cpi, double framerate);

 void vp9_set_row_mt(VP9_COMP *cpi);
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -66,8 +66,8 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
  log2_tile_cols =
      clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
  if (cpi->oxcf.target_level == LEVEL_AUTO) {
-    const uint32_t pic_size = cpi->common.width * cpi->common.height;
-    const int level_tile_cols = log_tile_cols_from_picsize_level(pic_size);
+    const int level_tile_cols =
+        log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
    if (log2_tile_cols > level_tile_cols) {
      log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
    }
@@ -390,8 +390,9 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
 }

 #if !CONFIG_REALTIME_ONLY
-static int first_pass_worker_hook(EncWorkerData *const thread_data,
-                                  MultiThreadHandle *multi_thread_ctxt) {
+static int first_pass_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
  VP9_COMP *const cpi = thread_data->cpi;
  const VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
@@ -470,8 +471,8 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
    }
  }

-  launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook,
-                     multi_thread_ctxt, num_workers);
+  launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt,
+                     num_workers);

  first_tile_col = &cpi->tile_data[0];
  for (i = 1; i < tile_cols; i++) {
@@ -480,8 +481,9 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
  }
 }

-static int temporal_filter_worker_hook(EncWorkerData *const thread_data,
-                                       MultiThreadHandle *multi_thread_ctxt) {
+static int temporal_filter_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
  VP9_COMP *const cpi = thread_data->cpi;
  const VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
@@ -553,13 +555,14 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
    }
  }

-  launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook,
-                     multi_thread_ctxt, num_workers);
+  launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt,
+                     num_workers);
 }
 #endif  // !CONFIG_REALTIME_ONLY

-static int enc_row_mt_worker_hook(EncWorkerData *const thread_data,
-                                  MultiThreadHandle *multi_thread_ctxt) {
+static int enc_row_mt_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
  VP9_COMP *const cpi = thread_data->cpi;
  const VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
@@ -648,8 +651,8 @@ void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
    }
  }

-  launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook,
-                     multi_thread_ctxt, num_workers);
+  launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt,
+                     num_workers);

  for (i = 0; i < num_workers; i++) {
    VPxWorker *const worker = &cpi->workers[i];
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -44,7 +44,6 @@
 #define COMPLEXITY_STATS_OUTPUT 0

 #define FIRST_PASS_Q 10.0
-#define GF_MAX_BOOST 96.0
 #define INTRA_MODE_PENALTY 1024
 #define MIN_ARF_GF_BOOST 240
 #define MIN_DECAY_FACTOR 0.01
@@ -732,9 +731,8 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
  // Exclude any image dead zone
  if (fp_acc_data->image_data_start_row > 0) {
    fp_acc_data->intra_skip_count =
-        VPXMAX(0,
-               fp_acc_data->intra_skip_count -
-                   (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
+        VPXMAX(0, fp_acc_data->intra_skip_count -
+                      (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
  }

  fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
@@ -1949,6 +1947,7 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
 }

 #define BASELINE_ERR_PER_MB 12500.0
+#define GF_MAX_BOOST 96.0
 static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
                               double this_frame_mv_in_out) {
  double frame_boost;
@@ -2238,9 +2237,6 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
    }
    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
-
-    // Step over the golden frame / overlay frame
-    if (EOF == input_stats(twopass, &frame_stats)) return;
  }

  // Deduct the boost bits for arf (or gf if it is not a key frame)
@@ -2285,7 +2281,8 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
  // Define middle frame
  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;

-  normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+  normal_frames =
+      rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
  if (normal_frames > 1)
    normal_frame_bits = (int)(total_group_bits / normal_frames);
  else
@@ -2383,6 +2380,8 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise,

 // Analyse and define a gf/arf group.
 #define ARF_DECAY_BREAKOUT 0.10
+#define ARF_ABS_ZOOM_THRESH 4.0
+
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  VP9_COMMON *const cm = &cpi->common;
  RATE_CONTROL *const rc = &cpi->rc;
@@ -2411,7 +2410,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  double mv_in_out_accumulator = 0.0;
  double abs_mv_in_out_accumulator = 0.0;
  double mv_ratio_accumulator_thresh;
-  double mv_in_out_thresh;
  double abs_mv_in_out_thresh;
  double sr_accumulator = 0.0;
  const double av_err = get_distribution_av_err(cpi, twopass);
@@ -2457,8 +2455,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  // Motion breakout threshold for loop below depends on image size.
  mv_ratio_accumulator_thresh =
      (cpi->initial_height + cpi->initial_width) / 4.0;
-  mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 300.0;
-  abs_mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 200.0;
+  abs_mv_in_out_thresh = ARF_ABS_ZOOM_THRESH;

  // Set a maximum and minimum interval for the GF group.
  // If the image appears almost completely static we can extend beyond this.
@@ -2543,14 +2540,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      // Update the accumulator for second ref error difference.
      // This is intended to give an indication of how much the coded error is
      // increasing over time.
-      sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error);
-      sr_accumulator = VPXMAX(0.0, sr_accumulator);
+      if (i == 1) {
+        sr_accumulator += next_frame.coded_error;
+      } else {
+        sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error);
+      }
    }

    // Break out conditions.
-    if (
-        // Break at active_max_gf_interval unless almost totally static.
-        ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
+    // Break at maximum of active_max_gf_interval unless almost totally static.
+    if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) &&
+         (i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
        (
            // Don't break out with a very short interval.
            (i >= active_min_gf_interval) &&
@@ -2559,7 +2559,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
            (!flash_detected) &&
            ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
             (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) ||
-             (mv_in_out_accumulator < -mv_in_out_thresh) ||
             (sr_accumulator > next_frame.intra_error)))) {
      break;
    }
@@ -2571,8 +2570,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;

  // Should we use the alternate reference frame.
-  if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
-      (i >= rc->min_gf_interval)) {
+  if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref &&
+      (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) {
    const int forward_frames = (rc->frames_to_key - i >= i - 1)
                                   ? i - 1
                                   : VPXMAX(0, rc->frames_to_key - i);
@@ -2600,7 +2599,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 #endif

  // Set the interval until the next gf.
-  rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+  rc->baseline_gf_interval =
+      (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH)
+          ? (i - (is_key_frame || rc->source_alt_ref_pending))
+          : i;

  // Only encode alt reference frame in temporal base layer. So
  // baseline_gf_interval should be multiple of a temporal layer group
@@ -2698,6 +2700,26 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 #endif
 }

+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// Clean slide transitions we expect a sharp single frame spike in error.
+#define ERROR_SPIKE 5.0
+
+// Slide show transition detection.
+// Tests for case where there is very low error either side of the current frame
+// but much higher just for this frame. This can help detect key frames in
+// slide shows even where the slides are pictures of different sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+                            const FIRSTPASS_STATS *last_frame,
+                            const FIRSTPASS_STATS *next_frame) {
+  return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
+         (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
+         (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
+}
+
 // Threshold for use of the lagging second reference frame. High second ref
 // usage may point to a transient event like a flash or occlusion rather than
 // a real scene cut.
@@ -2742,6 +2764,7 @@ static int test_candidate_kf(TWO_PASS *twopass,
  if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
      (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       (slide_transition(this_frame, last_frame, next_frame)) ||
       ((pcnt_intra > MIN_INTRA_LEVEL) &&
        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
        ((this_frame->intra_error /
@@ -2813,6 +2836,7 @@ static int test_candidate_kf(TWO_PASS *twopass,
 #define FRAMES_TO_CHECK_DECAY 8
 #define MIN_KF_TOT_BOOST 300
 #define KF_BOOST_SCAN_MAX_FRAMES 32
+#define KF_ABS_ZOOM_THRESH 6.0

 #ifdef AGGRESSIVE_VBR
 #define KF_MAX_FRAME_BOOST 80.0
@@ -2840,6 +2864,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  double kf_group_err = 0.0;
  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
  double sr_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
  const double av_err = get_distribution_av_err(cpi, twopass);
  vp9_zero(next_frame);

@@ -3004,8 +3029,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      double zm_factor;

      // Monitor for static sections.
-      zero_motion_accumulator = VPXMIN(
-          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+      // First frame in kf group the second ref indicator is invalid.
+      if (i > 0) {
+        zero_motion_accumulator = VPXMIN(
+            zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+      } else {
+        zero_motion_accumulator =
+            next_frame.pcnt_inter - next_frame.pcnt_motion;
+      }

      // Factor 0.75-1.25 based on how much of frame is static.
      zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
@@ -3019,7 +3050,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
                                        KF_MAX_FRAME_BOOST * zm_factor);

      boost_score += frame_boost;
-      if (frame_boost < 25.00) break;
+
+      // Measure of zoom. Large zoom tends to indicate reduced boost.
+      abs_mv_in_out_accumulator +=
+          fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
+
+      if ((frame_boost < 25.00) ||
+          (abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH))
+        break;
    } else {
      break;
    }
@@ -3034,10 +3072,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  twopass->section_intra_rating = calculate_section_intra_ratio(
      start_position, twopass->stats_in_end, rc->frames_to_key);

-  // Apply various clamps for min and max boost
-  rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
-  rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
-  rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
+  // Special case for static / slide show content but dont apply
+  // if the kf group is very short.
+  if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
+    rc->kf_boost = VPXMAX((rc->frames_to_key * 100), MAX_KF_TOT_BOOST);
+  } else {
+    // Apply various clamps for min and max boost
+    rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
+    rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
+    rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
+  }

  // Work out how many bits to allocate for the key frame itself.
  kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -120,12 +120,12 @@ typedef enum {
 typedef struct {
  unsigned char index;
  unsigned char first_inter_index;
-  RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
-  FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
-  int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+  RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
+  FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1];
+  unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
+  unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
+  unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
+  int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1];
 } GF_GROUP;

 typedef struct {
--- a/vp9/encoder/vp9_mbgraph.h
+++ b/vp9/encoder/vp9_mbgraph.h
@@ -25,7 +25,9 @@ typedef struct {
  } ref[MAX_REF_FRAMES];
 } MBGRAPH_MB_STATS;

-typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS;
+typedef struct {
+  MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;

 struct VP9_COMP;

--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1785,7 +1785,10 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 }

 static const MV search_pos[4] = {
-  { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
+  { -1, 0 },
+  { 0, -1 },
+  { 0, 1 },
+  { 1, 0 },
 };

 unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
@@ -1876,7 +1879,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,

  {
    const uint8_t *const pos[4] = {
-      ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride,
+      ref_buf - ref_stride,
+      ref_buf - 1,
+      ref_buf + 1,
+      ref_buf + ref_stride,
    };

    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -21,6 +21,15 @@
 #include "vp9/encoder/vp9_noise_estimate.h"
 #include "vp9/encoder/vp9_encoder.h"

+#if CONFIG_VP9_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) {
+  return (!cpi->use_svc ||
+          (cpi->use_svc &&
+           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+}
+#endif
+
 void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
  ne->enabled = 0;
  ne->level = kLowLow;
@@ -45,7 +54,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) {
 #endif
 // Enable noise estimation if denoising is on.
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
      cpi->common.width >= 320 && cpi->common.height >= 180)
    return 1;
 #endif
@@ -111,7 +120,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
  // Estimate is between current source and last source.
  YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) {
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
    last_source = &cpi->denoiser.last_source;
    // Tune these thresholds for different resolutions when denoising is
    // enabled.
@@ -131,7 +140,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
      (cpi->svc.number_spatial_layers == 1 &&
       (ne->last_w != cm->width || ne->last_h != cm->height))) {
 #if CONFIG_VP9_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
+    if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
      copy_frame(&cpi->denoiser.last_source, cpi->Source);
 #endif
    if (last_source != NULL) {
@@ -146,7 +155,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
    ne->count = 0;
    ne->num_frames_estimate = 10;
 #if CONFIG_VP9_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+    if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
        cpi->svc.current_superframe > 1) {
      vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
      copy_frame(&cpi->denoiser.last_source, cpi->Source);
@@ -249,7 +258,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
      // Normalize.
      avg_est = avg_est / num_samples;
      // Update noise estimate.
-      ne->value = (int)((15 * ne->value + avg_est) >> 4);
+      ne->value = (int)((3 * ne->value + avg_est) >> 2);
      ne->count++;
      if (ne->count == ne->num_frames_estimate) {
        // Reset counter and check noise level condition.
@@ -257,14 +266,14 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
        ne->count = 0;
        ne->level = vp9_noise_estimate_extract_level(ne);
 #if CONFIG_VP9_TEMPORAL_DENOISING
-        if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
+        if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
          vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
 #endif
      }
    }
  }
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
    copy_frame(&cpi->denoiser.last_source, cpi->Source);
 #endif
 }
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1488,7 +1488,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
  int skip_ref_find_pred[4] = { 0 };
  unsigned int sse_zeromv_normalized = UINT_MAX;
  unsigned int best_sse_sofar = UINT_MAX;
-  unsigned int thresh_svc_skip_golden = 500;
 #if CONFIG_VP9_TEMPORAL_DENOISING
  VP9_PICKMODE_CTX_DEN ctx_den;
  int64_t zero_last_cost_orig = INT64_MAX;
@@ -1496,8 +1495,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
 #endif
  INTERP_FILTER filter_gf_svc = EIGHTTAP;
  MV_REFERENCE_FRAME best_second_ref_frame = NONE;
+  const struct segmentation *const seg = &cm->seg;
  int comp_modes = 0;
  int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES;
+  int flag_svc_subpel = 0;
+  int svc_mv_col = 0;
+  int svc_mv_row = 0;
+  unsigned int thresh_svc_skip_golden = 500;
+  // Lower the skip threshold if lower spatial layer is better quality relative
+  // to current layer.
+  if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex > 150 &&
+      cm->base_qindex > cpi->svc.lower_layer_qindex + 15)
+    thresh_svc_skip_golden = 100;
+  // Increase skip threshold if lower spatial layer is lower quality relative
+  // to current layer.
+  else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex < 140 &&
+           cm->base_qindex < cpi->svc.lower_layer_qindex - 20)
+    thresh_svc_skip_golden = 1000;

  init_ref_frame_cost(cm, xd, ref_frame_cost);

@@ -1635,6 +1649,16 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
      cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME)
    comp_modes = 2;

+  // If the segment reference frame feature is enabled and it's set to GOLDEN
+  // reference, then make sure we don't skip checking GOLDEN, this is to
+  // prevent possibility of not picking any mode.
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+    usable_ref_frame = GOLDEN_FRAME;
+    skip_ref_find_pred[GOLDEN_FRAME] = 0;
+    thresh_svc_skip_golden = 0;
+  }
+
  for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
    if (!skip_ref_find_pred[ref_frame]) {
      find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
@@ -1647,6 +1671,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
  if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32)
    x->sb_use_mv_part = 0;

+  // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used
+  // an averaging filter for downsampling (phase = 8). If so, we will test
+  // a nonzero motion mode on the spatial (goldeen) reference.
+  // The nonzero motion is half pixel shifted to left and top (-4, -4).
+  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
+      svc_force_zero_mode[GOLDEN_FRAME - 1] &&
+      cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) {
+    svc_mv_col = -4;
+    svc_mv_row = -4;
+    flag_svc_subpel = 1;
+  }
+
  for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) {
    int rate_mv = 0;
    int mode_rd_thresh;
@@ -1660,6 +1696,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
    int inter_mv_mode = 0;
    int skip_this_mv = 0;
    int comp_pred = 0;
+    int force_gf_mv = 0;
    PREDICTION_MODE this_mode;
    second_ref_frame = NONE;

@@ -1680,8 +1717,29 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
      comp_pred = 1;
    }

+    if (ref_frame > usable_ref_frame) continue;
+    if (skip_ref_find_pred[ref_frame]) continue;
+
+    // If the segment reference frame feature is enabled then do nothing if the
+    // current ref frame is not allowed.
+    if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+      continue;
+
+    if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) {
+      force_gf_mv = 1;
+      // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
+      // otherwise set NEWMV to (svc_mv_col, svc_mv_row).
+      if (this_mode == NEWMV) {
+        frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col;
+        frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row;
+      } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col ||
+                 frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) {
+        continue;
+      }
+    }
+
    if (comp_pred) {
-      const struct segmentation *const seg = &cm->seg;
      if (!cpi->allow_comp_inter_inter) continue;
      // Skip compound inter modes if ARF is not available.
      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
@@ -1690,9 +1748,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
      if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue;
    }

-    if (ref_frame > usable_ref_frame) continue;
-    if (skip_ref_find_pred[ref_frame]) continue;
-
    // For SVC, skip the golden (spatial) reference search if sse of zeromv_last
    // is below threshold.
    if (cpi->use_svc && ref_frame == GOLDEN_FRAME &&
@@ -1737,7 +1792,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
    // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
    // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
    // later.
-    if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
+    if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
        frame_mv[this_mode][ref_frame].as_int != 0) {
      continue;
    }
@@ -1751,34 +1806,39 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
    }

    if (cpi->use_svc) {
-      if (svc_force_zero_mode[ref_frame - 1] &&
+      if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] &&
          frame_mv[this_mode][ref_frame].as_int != 0)
        continue;
    }

-    if (sf->reference_masking &&
-        !(frame_mv[this_mode][ref_frame].as_int == 0 &&
-          ref_frame == LAST_FRAME)) {
-      if (usable_ref_frame < ALTREF_FRAME) {
-        if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
-          i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
-          if ((cpi->ref_frame_flags & flag_list[i]))
-            if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
-              ref_frame_skip_mask |= (1 << ref_frame);
+    // Disable this drop out case if the ref frame segment level feature is
+    // enabled for this segment. This is to prevent the possibility that we end
+    // up unable to pick any mode.
+    if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
+      if (sf->reference_masking &&
+          !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+            ref_frame == LAST_FRAME)) {
+        if (usable_ref_frame < ALTREF_FRAME) {
+          if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
+            i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+            if ((cpi->ref_frame_flags & flag_list[i]))
+              if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+                ref_frame_skip_mask |= (1 << ref_frame);
+          }
+        } else if (!cpi->rc.is_src_frame_alt_ref &&
+                   !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+                     ref_frame == ALTREF_FRAME)) {
+          int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
+          int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
+          if (((cpi->ref_frame_flags & flag_list[ref1]) &&
+               (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
+              ((cpi->ref_frame_flags & flag_list[ref2]) &&
+               (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
+            ref_frame_skip_mask |= (1 << ref_frame);
        }
-      } else if (!cpi->rc.is_src_frame_alt_ref &&
-                 !(frame_mv[this_mode][ref_frame].as_int == 0 &&
-                   ref_frame == ALTREF_FRAME)) {
-        int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
-        int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
-        if (((cpi->ref_frame_flags & flag_list[ref1]) &&
-             (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
-            ((cpi->ref_frame_flags & flag_list[ref2]) &&
-             (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
-          ref_frame_skip_mask |= (1 << ref_frame);
      }
+      if (ref_frame_skip_mask & (1 << ref_frame)) continue;
    }
-    if (ref_frame_skip_mask & (1 << ref_frame)) continue;

    // Select prediction reference frames.
    for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -1808,7 +1868,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
                             &rd_thresh_freq_fact[mode_index])))
      continue;

-    if (this_mode == NEWMV) {
+    if (this_mode == NEWMV && !force_gf_mv) {
      if (ref_frame > LAST_FRAME && !cpi->use_svc &&
          cpi->oxcf.rc_mode == VPX_CBR) {
        int tmp_sad;
@@ -1949,7 +2009,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
    if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
        pred_filter_search &&
        (ref_frame == LAST_FRAME ||
-         (ref_frame == GOLDEN_FRAME &&
+         (ref_frame == GOLDEN_FRAME && !force_gf_mv &&
          (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
        (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
      int pf_rate[3];
@@ -2173,9 +2233,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,

  // For spatial enhancemanent layer: perform intra prediction only if base
  // layer is chosen as the reference. Always perform intra prediction if
-  // LAST is the only reference or is_key_frame is set.
+  // LAST is the only reference, or is_key_frame is set, or on base
+  // temporal layer.
  if (cpi->svc.spatial_layer_id) {
    perform_intra_pred =
+        cpi->svc.temporal_layer_id == 0 ||
        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
        !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
        (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
@@ -2185,6 +2247,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
  if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
      cpi->rc.is_src_frame_alt_ref)
    perform_intra_pred = 0;
+
+  // If the segment reference frame feature is enabled and set then
+  // skip the intra prediction.
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0)
+    perform_intra_pred = 0;
+
  // Perform intra prediction search, if the best SAD is above a certain
  // threshold.
  if (best_rdc.rdcost == INT64_MAX ||
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -31,10 +31,13 @@
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_ratectrl.h"

-// Max rate target for 1080P and below encodes under normal circumstances
-// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+// Max rate per frame for 1080P and below encodes if no level requirement given.
+// For larger formats limit to MAX_MB_RATE bits per MB
+// 4Mbits is derived from the level requirement for level 4 (1080P 30) which
+// requires that HW can sustain a rate of 16Mbits over a 4 frame group.
+// If a lower level requirement is specified then this may over ride this value.
 #define MAX_MB_RATE 250
-#define MAXRATE_1080P 2025000
+#define MAXRATE_1080P 4000000

 #define DEFAULT_KF_BOOST 2000
 #define DEFAULT_GF_BOOST 2000
@@ -1100,6 +1103,9 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
      // Baseline value derived from cpi->active_worst_quality and kf boost.
      active_best_quality =
          get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+      if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+        active_best_quality /= 4;
+      }

      // Allow somewhat lower kf minq with small image formats.
      if ((cm->width * cm->height) <= (352 * 288)) {
@@ -1488,15 +1494,22 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
    cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref;
  }
  if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0;
+
+  rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+  if (cpi->use_svc &&
+      cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+    cpi->svc.lower_layer_qindex = cm->base_qindex;
 }

 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
  // Update buffer level with zero size, update frame counters, and return.
  update_buffer_level(cpi, 0);
+  cpi->common.current_video_frame++;
  cpi->rc.frames_since_key++;
  cpi->rc.frames_to_key--;
  cpi->rc.rc_2_frame = 0;
  cpi->rc.rc_1_frame = 0;
+  cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
 }

 static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
@@ -1580,9 +1593,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
      // Adjust boost and af_ratio based on avg_frame_low_motion, which varies
      // between 0 and 100 (stationary, 100% zero/small motion).
      rc->gfu_boost =
-          VPXMAX(500,
-                 DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
-                     (rc->avg_frame_low_motion + 100));
+          VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
+                          (rc->avg_frame_low_motion + 100));
      rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
    }
    adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
@@ -1857,13 +1869,8 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
      rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
          cpi->framerate, rc->min_gf_interval);

-    // Extended interval for genuinely static scenes
-    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
-
-    if (is_altref_enabled(cpi)) {
-      if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-        rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
-    }
+    // Extended max interval for genuinely static scenes like slide shows.
+    rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;

    if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
      rc->max_gf_interval = rc->static_scene_max_gf_interval;
@@ -1873,9 +1880,12 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,

    if (oxcf->target_level == LEVEL_AUTO) {
      const uint32_t pic_size = cpi->common.width * cpi->common.height;
+      const uint32_t pic_breadth =
+          VPXMAX(cpi->common.width, cpi->common.height);
      int i;
      for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
-        if (vp9_level_defs[i].max_luma_picture_size > pic_size) {
+        if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
+            vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
          if (rc->min_gf_interval <=
              (int)vp9_level_defs[i].min_altref_distance) {
            rc->min_gf_interval =
@@ -1904,12 +1914,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
      VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);

  // A maximum bitrate for a frame is defined.
-  // The baseline for this aligns with HW implementations that
-  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
-  // per 16x16 MB (averaged over a frame). However this limit is extended if
-  // a very high rate is given on the command line or the the rate cannnot
-  // be acheived because of a user specificed max q (e.g. when the user
-  // specifies lossless encode.
+  // However this limit is extended if a very high rate is given on the command
+  // line or the the rate cannnot be acheived because of a user specificed max q
+  // (e.g. when the user specifies lossless encode).
+  //
+  // If a level is specified that requires a lower maximum rate then the level
+  // value take precedence.
  vbr_max_bits =
      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
            100);
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -34,6 +34,14 @@ extern "C" {

 #define FRAME_OVERHEAD_BITS 200

+// Threshold used to define a KF group as static (e.g. a slide show).
+// Essentially this means that no frame in the group has more than 1% of MBs
+// that are not marked as coded with 0,0 motion in the first pass.
+#define STATIC_KF_GROUP_THRESH 99
+
+// The maximum duration of a GF group that is static (for example a slide show).
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
 typedef enum {
  INTER_NORMAL = 0,
  INTER_HIGH = 1,
@@ -152,6 +160,8 @@ typedef struct {
  int rc_2_frame;
  int q_1_frame;
  int q_2_frame;
+  // Keep track of the last target average frame bandwidth.
+  int last_avg_frame_bandwidth;

  // Auto frame-scaling variables.
  FRAME_SCALE_LEVEL frame_size_selector;
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -59,7 +59,9 @@ typedef struct {
  MV_REFERENCE_FRAME ref_frame[2];
 } MODE_DEFINITION;

-typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;

 struct rdcost_block_args {
  const VP9_COMP *cpi;
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -37,14 +37,16 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
  svc->scaled_one_half = 0;
  svc->current_superframe = 0;
  svc->non_reference_frame = 0;
+  svc->skip_enhancement_layer = 0;
+
  for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
    svc->ext_frame_flags[sl] = 0;
    svc->ext_lst_fb_idx[sl] = 0;
    svc->ext_gld_fb_idx[sl] = 1;
    svc->ext_alt_fb_idx[sl] = 2;
-    svc->downsample_filter_type[sl] = EIGHTTAP;
-    svc->downsample_filter_phase[sl] = 0;  // Set to 8 for averaging filter.
+    svc->downsample_filter_type[sl] = BILINEAR;
+    svc->downsample_filter_phase[sl] = 8;  // Set to 8 for averaging filter.
  }

  if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
@@ -153,6 +155,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
  int sl, tl, layer = 0, spatial_layer_target;
  float bitrate_alloc = 1.0;

+  cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
+
  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
    for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
      for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
@@ -389,9 +393,9 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
             .is_key_frame;
 }

-static void get_layer_resolution(const int width_org, const int height_org,
-                                 const int num, const int den, int *width_out,
-                                 int *height_out) {
+void get_layer_resolution(const int width_org, const int height_org,
+                          const int num, const int den, int *width_out,
+                          int *height_out) {
  int w, h;

  if (width_out == NULL || height_out == NULL || den == 0) return;
@@ -545,6 +549,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
    if (!spatial_id) {
      cpi->ref_frame_flags = VP9_LAST_FLAG;
    } else {
+      if (spatial_id == cpi->svc.number_spatial_layers - 1)
+        cpi->ext_refresh_alt_ref_frame = 0;
      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
    }
  }
@@ -604,6 +610,7 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
 int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
  int width = 0, height = 0;
  LAYER_CONTEXT *lc = NULL;
+  cpi->svc.skip_enhancement_layer = 0;
  if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1;
  cpi->svc.force_zero_mode_spatial_ref = 1;
  cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride;
@@ -656,10 +663,14 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
                       lc->scaling_factor_num, lc->scaling_factor_den, &width,
                       &height);

-  // For resolutions <= QVGA: set phase of the filter = 8 (for symmetric
+  // For resolutions <= VGA: set phase of the filter = 8 (for symmetric
  // averaging filter), use bilinear for now.
-  if (width * height <= 320 * 240) {
+  if (width * height <= 640 * 480) {
    cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR;
+    // Use Eightap_smooth for low resolutions.
+    if (width * height <= 320 * 240)
+      cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] =
+          EIGHTTAP_SMOOTH;
    cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8;
  }

@@ -861,3 +872,28 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
  vp9_update_temporal_layer_framerate(cpi);
  vp9_restore_layer_context(cpi);
 }
+
+void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
+  SVC *svc = &cpi->svc;
+  int sl, tl;
+  for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+    int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
+                                 svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    RATE_CONTROL *lrc = &lc->rc;
+    if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) ||
+        lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) {
+      // Reset for all temporal layers with spatial layer sl.
+      for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+        int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+        LAYER_CONTEXT *lc = &svc->layer_context[layer];
+        RATE_CONTROL *lrc = &lc->rc;
+        lrc->rc_1_frame = 0;
+        lrc->rc_2_frame = 0;
+        lrc->bits_off_target = lrc->optimal_buffer_level;
+        lrc->buffer_level = lrc->optimal_buffer_level;
+      }
+    }
+  }
+}
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -49,7 +49,7 @@ typedef struct {
  uint8_t speed;
 } LAYER_CONTEXT;

-typedef struct {
+typedef struct SVC {
  int spatial_layer_id;
  int temporal_layer_id;
  int number_spatial_layers;
@@ -99,6 +99,12 @@ typedef struct {

  BLOCK_SIZE *prev_partition_svc;
  int mi_stride[VPX_MAX_LAYERS];
+
+  int first_layer_denoise;
+
+  int skip_enhancement_layer;
+
+  int lower_layer_qindex;
 } SVC;

 struct VP9_COMP;
@@ -128,6 +134,10 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi);
 // Initialize second pass rc for spatial svc.
 void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);

+void get_layer_resolution(const int width_org, const int height_org,
+                          const int num, const int den, int *width_out,
+                          int *height_out);
+
 // Increment number of video frames in layer
 void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);

@@ -148,6 +158,8 @@ void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);

 void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);

+void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -170,13 +170,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
-    case ADST_ADST:
+    default:
+      assert(tx_type == ADST_ADST);
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
-    default: assert(0); break;
  }
 }

@@ -1097,14 +1097,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
-    case ADST_ADST:
+    default:
+      assert(tx_type == ADST_ADST);
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
-    default: assert(0); break;
  }
 }

@@ -1963,13 +1963,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
-    case ADST_ADST:
+    default:
+      assert(tx_type == ADST_ADST);
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
-    default: assert(0); break;
  }
 }
--- a/vp9/encoder/x86/vp9_error_avx2.c
+++ b/vp9/encoder/x86/vp9_error_avx2.c
@@ -1,7 +1,7 @@
 /*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
- *  Usee of this source code is governed by a BSD-style license
+ *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
--- a/vp9/encoder/x86/vp9_quantize_avx2.c
+++ b/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>  // AVX2
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_dsp/x86/quantize_x86.h"
+
+// Zero fill 8 positions in the output buffer.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+  const __m256i zero = _mm256_setzero_si256();
+#if CONFIG_VP9_HIGHBITDEPTH
+  _mm256_storeu_si256((__m256i *)(a), zero);
+  _mm256_storeu_si256((__m256i *)(a + 8), zero);
+#else
+  _mm256_storeu_si256((__m256i *)(a), zero);
+#endif
+}
+
+static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr,
+                                   __m256i *coeff256) {
+  const __m256i iscan = _mm256_loadu_si256(iscan_ptr);
+  const __m256i zero256 = _mm256_setzero_si256();
+#if CONFIG_VP9_HIGHBITDEPTH
+  // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as
+  // B1 A1 B0 A0.  Shuffle to B1 B0 A1 A0 in order to scan eob correctly.
+  const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8);
+  const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256);
+#else
+  const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256);
+#endif
+  const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256);
+  // Add one to convert from indices to counts
+  const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0);
+  return _mm256_and_si256(iscan_plus_one, nzero_coeff0);
+}
+
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t *round_ptr,
+                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                          uint16_t *eob_ptr, const int16_t *scan_ptr,
+                          const int16_t *iscan_ptr) {
+  __m128i eob;
+  __m256i round256, quant256, dequant256;
+  __m256i eob256, thr256;
+
+  (void)scan_ptr;
+  (void)skip_block;
+  assert(!skip_block);
+
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  {
+    __m256i coeff256;
+
+    // Setup global values
+    {
+      const __m128i round = _mm_load_si128((const __m128i *)round_ptr);
+      const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
+      const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+      round256 = _mm256_castsi128_si256(round);
+      round256 = _mm256_permute4x64_epi64(round256, 0x54);
+
+      quant256 = _mm256_castsi128_si256(quant);
+      quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
+
+      dequant256 = _mm256_castsi128_si256(dequant);
+      dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
+    }
+
+    {
+      __m256i qcoeff256;
+      __m256i qtmp256;
+      coeff256 = load_tran_low(coeff_ptr + n_coeffs);
+      qcoeff256 = _mm256_abs_epi16(coeff256);
+      qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
+      qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
+      qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
+      store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
+      coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
+      store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
+    }
+
+    eob256 = scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256);
+    n_coeffs += 8 * 2;
+  }
+
+  // remove dc constants
+  dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+  quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+  round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+
+  thr256 = _mm256_srai_epi16(dequant256, 1);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs);
+    __m256i qcoeff256 = _mm256_abs_epi16(coeff256);
+    int32_t nzflag =
+        _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
+
+    if (nzflag) {
+      __m256i qtmp256;
+      qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
+      qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
+      qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
+      store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
+      coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
+      store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
+      eob256 = _mm256_max_epi16(
+          eob256,
+          scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256));
+    } else {
+      store_zero_tran_low(qcoeff_ptr + n_coeffs);
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+    }
+    n_coeffs += 8 * 2;
+  }
+
+  eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
+                      _mm256_extracti128_si256(eob256, 1));
+
+  *eob_ptr = accumulate_eob(eob);
+}
--- a/Show More
+++ b/Show More