vp9: fix multiple run for vp9 datarate tests using one bitrate.

Change-Id: I1458bf25fadc23e8be5a9532a153d7129a53accf
Merge "vp9: refactor vp9 datarate test for better sharding."
2018-04-16 14:31:49 -07:00 · 2018-04-15 07:03:16 +00:00 · 2018-04-13 22:39:30 -07:00 · 2018-04-13 15:36:39 -07:00 · 2018-04-13 03:40:50 +00:00 · 2018-04-13 03:40:20 +00:00
624 changed files with 101265 additions and 61556 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,12 +1,12 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Google
-# Generated with clang-format 3.8.1
+# Generated with clang-format 5.0.0
 AccessModifierOffset: -1
 AlignAfterOpenBracket: Align
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
-AlignEscapedNewlinesLeft: true
+AlignEscapedNewlines: Left
 AlignOperands:   true
 AlignTrailingComments: true
 AllowAllParametersOfDeclarationOnNextLine: true
@ -33,12 +33,20 @@ BraceWrapping:
  BeforeCatch:     false
  BeforeElse:      false
  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
 BreakBeforeBinaryOperators: None
 BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
 ColumnLimit:     80
 CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
@ -46,7 +54,11 @@ Cpp11BracedListStyle: false
 DerivePointerAlignment: false
 DisableFormat:   false
 ExperimentalAutoDetectBinPacking: false
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
 IncludeCategories:
  - Regex:           '^<.*\.h>'
    Priority:        1
@ -54,9 +66,12 @@ IncludeCategories:
    Priority:        2
  - Regex:           '.*'
    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
 IndentCaseLabels: true
 IndentWidth:     2
 IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
 KeepEmptyLinesAtTheStartOfBlocks: false
 MacroBlockBegin: ''
 MacroBlockEnd:   ''
@ -65,6 +80,7 @@ NamespaceIndentation: None
 ObjCBlockIndentWidth: 2
 ObjCSpaceAfterProperty: false
 ObjCSpaceBeforeProtocolList: false
+PenaltyBreakAssignment: 2
 PenaltyBreakBeforeFirstCallParameter: 1
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
@ -74,7 +90,9 @@ PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
 ReflowComments:  true
 SortIncludes:    false
+SortUsingDeclarations: true
 SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
 SpaceBeforeParens: ControlStatements
 SpaceInEmptyParentheses: false
--- a/.mailmap
+++ b/.mailmap
@ -3,6 +3,7 @@ Aℓex Converse <aconverse@google.com>
 Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Chris Cunningham <chcunningham@chromium.org>
 Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
 Deb Mukherjee <debargha@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
@ -21,17 +22,21 @@ Marco Paniconi <marpan@google.com>
 Marco Paniconi <marpan@google.com> <marpan@chromium.org>
 Pascal Massimino <pascal.massimino@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
+Peter Boström <pbos@chromium.org> <pbos@google.com>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
 Sami Pietilä <samipietila@google.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
+Urvang Joshi <urvang@google.com> <urvang@chromium.org>
+Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <Yaowu Xu>
--- a/29
+++ b/29
@ -3,11 +3,13 @@

 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
-Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
 Aℓex Converse <aconverse@google.com>
 Ahmad Sharif <asharif@google.com>
+Aleksey Vasenev <margtu-fivt@ya.ru>
+Alexander Potapenko <glider@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
+Alexandra Hájková <alexandra.khirnova@gmail.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@ -15,6 +17,7 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
 Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
+Andrew Lewis <andrewlewis@google.com>
 Andrew Russell <anrussell@google.com>
 Angie Chiang <angiebird@google.com>
 Aron Rosenberg <arosenberg@logitech.com>
@ -22,11 +25,14 @@ Attila Nagy <attilanagy@google.com>
 Brion Vibber <bvibber@wikimedia.org>
 changjun.yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
+Cheng Chen <chengchen@google.com>
 chm <chm@rock-chips.com>
+Chris Cunningham <chcunningham@chromium.org>
 Christian Duvivier <cduvivier@google.com>
 Daniele Castagna <dcastagna@chromium.org>
 Daniel Kang <ddkang@google.com>
 Deb Mukherjee <debargha@google.com>
+Deepa K G <deepa.kg@ittiam.com>
 Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
 Dragan Mrdjan <dmrdjan@mips.com>
@ -37,17 +43,21 @@ Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
+Gabriel Marin <gmx@chromium.org>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Geza Lore <gezalore@gmail.com>
 Ghislain MARY <ghislainmary2@gmail.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
+Gregor Jasny <gjasny@gmail.com>
 Guillaume Martres <gmartres@google.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Hanno Böck <hanno@hboeck.de>
+Han Shen <shenhan@google.com>
 Henrik Lundin <hlundin@google.com>
 Hui Su <huisu@google.com>
+Ivan Krasin <krasin@chromium.org>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
 Jacky Chen <jackychen@google.com>
@ -61,6 +71,7 @@ Jean-Yves Avenard <jyavenard@mozilla.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
+Jerome Jiang <jianj@google.com>
 Jia Jia <jia.jia@linaro.org>
 Jian Zhou <zhoujian@google.com>
 Jim Bankoski <jimbankoski@google.com>
@ -77,6 +88,7 @@ Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 Kaustubh Raste <kaustubh.raste@imgtec.com>
 KO Myung-Hun <komh@chollian.net>
+Kyle Siefring <kylesiefring@gmail.com>
 Lawrence Velázquez <larryv@macports.org>
 Linfeng Zhang <linfengz@google.com>
 Lou Quillio <louquillio@google.com>
@ -92,8 +104,12 @@ Michael Kohler <michaelkohler@live.com>
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
+Min Chen <chenm003@gmail.com>
 Minghai Shang <minghai@google.com>
+Min Ye <yeemmi@google.com>
+Moriyoshi Koizumi <mozo@mozo.jp>
 Morton Jonuschat <yabawock@gmail.com>
+Nathan E. Egge <negge@mozilla.com>
 Nico Weber <thakis@chromium.org>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
@ -102,16 +118,22 @@ Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
 Pengchong Jin <pengchong@google.com>
+Peter Boström <pbos@chromium.org>
+Peter Collingbourne <pcc@chromium.org>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
+Rafael de Lucena Valle <rafaeldelucena@gmail.com>
+Rahul Chaudhry <rahulchaudhry@google.com>
 Ralph Giles <giles@xiph.org>
+Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
 Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rsbultje@gmail.com>
 Rui Ueyama <ruiu@google.com>
 Sami Pietilä <samipietila@google.com>
+Sarah Parker <sarahparker@google.com>
 Sasi Inguva <isasi@google.com>
 Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
@ -119,9 +141,11 @@ Sean McGovern <gseanmcg@gmail.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
 Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
+Sylvestre Ledru <sylvestre@mozilla.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
@ -131,7 +155,10 @@ Thijs Vermeir <thijsvermeir@gmail.com>
 Tim Kopp <tkopp@google.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
+Tristan Matthews <le.businessman@gmail.com>
+Urvang Joshi <urvang@google.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
+Vlad Tsyrklevich <vtsyrklevich@chromium.org>
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
--- a/41
+++ b/41
@ -1,3 +1,44 @@
+2017-01-04 v1.7.0 "Mandarin Duck"
+  This release focused on high bit depth performance (10/12 bit) and vp9
+  encoding improvements.
+
+  - Upgrading:
+    This release is ABI incompatible due to new vp9 encoder features.
+
+    Frame parallel decoding for vp9 has been removed.
+
+  - Enhancements:
+    vp9 encoding supports additional threads with --row-mt. This can be greater
+    than the number of tiles.
+
+    Two new vp9 encoder options have been added:
+      --corpus-complexity
+      --tune-content=film
+
+    Additional tooling for respecting the vp9 "level" profiles has been added.
+
+  - Bug fixes:
+    A variety of fuzzing issues.
+    vp8 threading fix for ARM.
+    Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
+    Reject invalid multi resolution configurations.
+
+2017-01-09 v1.6.1 "Long Tailed Duck"
+  This release improves upon the VP9 encoder and speeds up the encoding and
+  decoding processes.
+
+  - Upgrading:
+    This release is ABI compatible with 1.6.0.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding.
+    High bit depth builds now provide similar speed for 8 bit encode and decode
+    for x86 targets. Other platforms and higher bit depth improvements are in
+    progress.
+
+  - Bug Fixes:
+    A variety of fuzzing issues.
+
 2016-07-20 v1.6.0 "Khaki Campbell Duck"
  This release improves upon the VP9 encoder and speeds up the encoding and
  decoding processes.
--- a/45
+++ b/45
@ -1,4 +1,4 @@
-README - 20 July 2016
+README - 24 January 2018

 Welcome to the WebM VP8/VP9 Codec SDK!

@ -9,22 +9,26 @@ COMPILING THE APPLICATIONS/LIBRARIES:

  1. Prerequisites

-    * All x86 targets require the Yasm[1] assembler be installed.
-    * All Windows builds require that Cygwin[2] be installed.
-    * Building the documentation requires Doxygen[3]. If you do not
+    * All x86 targets require the Yasm[1] assembler be installed[2].
+    * All Windows builds require that Cygwin[3] be installed.
+    * Building the documentation requires Doxygen[4]. If you do not
      have this package, the install-docs option will be disabled.
-    * Downloading the data for the unit tests requires curl[4] and sha1sum.
+    * Downloading the data for the unit tests requires curl[5] and sha1sum.
      sha1sum is provided via the GNU coreutils, installed by default on
      many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
      available, a compatible version of sha1sum can be built from
-      source[5]. These requirements are optional if not running the unit
+      source[6]. These requirements are optional if not running the unit
      tests.

    [1]: http://www.tortall.net/projects/yasm
-    [2]: http://www.cygwin.com
-    [3]: http://www.doxygen.org
-    [4]: http://curl.haxx.se
-    [5]: http://www.microbrew.org/tools/md5sha1sum/
+    [2]: For Visual Studio the base yasm binary (not vsyasm) should be in the
+         PATH for Visual Studio. For VS2017 it is sufficient to rename
+         yasm-<version>-<arch>.exe to yasm.exe and place it in:
+         Program Files (x86)/Microsoft Visual Studio/2017/<level>/Common7/Tools/
+    [3]: http://www.cygwin.com
+    [4]: http://www.doxygen.org
+    [5]: http://curl.haxx.se
+    [6]: http://www.microbrew.org/tools/md5sha1sum/

  2. Out-of-tree builds
  Out of tree builds are a supported method of building the application. For
@ -41,12 +45,22 @@ COMPILING THE APPLICATIONS/LIBRARIES:
  used to get a list of supported options:
    $ ../libvpx/configure --help

-  4. Cross development
+  4. Compiler analyzers
+  Compilers have added sanitizers which instrument binaries with information
+  about address calculation, memory usage, threading, undefined behavior, and
+  other common errors. To simplify building libvpx with some of these features
+  use tools/set_analyzer_env.sh before running configure. It will set the
+  compiler and necessary flags for building as well as environment variables
+  read by the analyzer when testing the binaries.
+    $ source ../libvpx/tools/set_analyzer_env.sh address
+
+  5. Cross development
  For cross development, the most notable option is the --target option. The
  most up-to-date list of supported targets can be found at the bottom of the
  --help output of the configure script. As of this writing, the list of
  available targets is:

+    arm64-android-gcc
    arm64-darwin-gcc
    arm64-linux-gcc
    armv7-android-gcc
@ -57,10 +71,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv7-win32-vs11
    armv7-win32-vs12
    armv7-win32-vs14
+    armv7-win32-vs15
    armv7s-darwin-gcc
    armv8-linux-gcc
    mips32-linux-gcc
    mips64-linux-gcc
+    ppc64-linux-gcc
+    ppc64le-linux-gcc
    sparc-solaris-gcc
    x86-android-gcc
    x86-darwin8-gcc
@ -73,6 +90,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86-darwin13-gcc
    x86-darwin14-gcc
    x86-darwin15-gcc
+    x86-darwin16-gcc
    x86-iphonesimulator-gcc
    x86-linux-gcc
    x86-linux-icc
@ -83,6 +101,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86-win32-vs11
    x86-win32-vs12
    x86-win32-vs14
+    x86-win32-vs15
    x86_64-android-gcc
    x86_64-darwin9-gcc
    x86_64-darwin10-gcc
@ -91,6 +110,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86_64-darwin13-gcc
    x86_64-darwin14-gcc
    x86_64-darwin15-gcc
+    x86_64-darwin16-gcc
    x86_64-iphonesimulator-gcc
    x86_64-linux-gcc
    x86_64-linux-icc
@ -100,6 +120,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86_64-win64-vs11
    x86_64-win64-vs12
    x86_64-win64-vs14
+    x86_64-win64-vs15
    generic-gnu

  The generic-gnu target, in conjunction with the CROSS environment variable,
@ -115,7 +136,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
  environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
  passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS.

-  5. Configuration errors
+  6. Configuration errors
  If the configuration step fails, the first step is to look in the error log.
  This defaults to config.log. This should give a good indication of what went
  wrong. If not, contact us for support.
--- a/build/.gitattributes
+++ b/build/.gitattributes
@ -1,2 +0,0 @@
-*-vs8/*.rules -crlf
-*-msvs/*.rules -crlf
--- a/build/.gitignore
+++ b/build/.gitignore
@ -1 +0,0 @@
-x86*-win32-vs*
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@ -64,6 +64,9 @@ CONFIG_DIR := $(LOCAL_PATH)/
 LIBVPX_PATH := $(LOCAL_PATH)/libvpx
 ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
 ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
+ifneq ($(V),1)
+  qexec := @
+endif

 # Use the makefiles generated by upstream configure to determine which files to
 # build. Also set any architecture-specific flags.
@ -103,8 +106,8 @@ LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)

 .PRECIOUS: %.asm.S
 $(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm
-	@mkdir -p $(dir $@)
-	@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
+	$(qexec)mkdir -p $(dir $@)
+	$(qexec)$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@

 # For building *_rtcd.h, which have rules in libs.mk
 TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
@ -150,15 +153,27 @@ CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \
 LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)

 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+  ASM_INCLUDES := vpx_dsp/arm/idct_neon.asm.S
  CODEC_SRCS_ASM_NEON = $(foreach v, \
                        $(CODEC_SRCS_ASM_ARM_ALL),\
                        $(if $(findstring neon,$(v)),$(v),))
+  CODEC_SRCS_ASM_NEON := $(filter-out $(addprefix %, $(ASM_INCLUDES)), \
+                         $(CODEC_SRCS_ASM_NEON))
  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \
                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
                                $(CODEC_SRCS_ASM_NEON))
  LOCAL_SRC_FILES += $(patsubst %.S, \
                     %.S.neon, \
                     $(CODEC_SRCS_ASM_NEON_ADS2GAS))
+
+  NEON_ASM_TARGETS = $(patsubst %.S, \
+                     $(ASM_CNV_PATH)/libvpx/%.S, \
+                     $(CODEC_SRCS_ASM_NEON))
+# add a dependency to the full path to the ads2gas output to ensure the
+# includes are converted first.
+ifneq ($(strip $(NEON_ASM_TARGETS)),)
+$(NEON_ASM_TARGETS): $(addprefix $(ASM_CNV_PATH)/libvpx/, $(ASM_INCLUDES))
+endif
 endif

 LOCAL_CFLAGS += \
@ -187,7 +202,7 @@ $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
 $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h

 rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
-ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
+ifneq ($$(findstring $(TARGET_ARCH_ABI),$$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
 $$(rtcd_dep_template_SRCS): vpx_config.asm
 endif
 endef
@ -197,16 +212,17 @@ $(eval $(call rtcd_dep_template))
 .PHONY: clean
 clean:
 	@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
-	@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
-	@$(RM) -r $(ASM_CNV_PATH)
-	@$(RM) $(CLEAN-OBJS)
+	$(qexec)$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
+	$(qexec)$(RM) -r $(ASM_CNV_PATH)
+	$(qexec)$(RM) $(CLEAN-OBJS)

 ifeq ($(ENABLE_SHARED),1)
+  LOCAL_CFLAGS += -fPIC
  include $(BUILD_SHARED_LIBRARY)
 else
  include $(BUILD_STATIC_LIBRARY)
 endif

 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
-$(call import-module,cpufeatures)
+$(call import-module,android/cpufeatures)
 endif
--- a/build/make/Makefile
+++ b/build/make/Makefile
@ -124,6 +124,7 @@ ifeq ($(TOOLCHAIN), x86-os2-gcc)
 CFLAGS += -mstackrealign
 endif

+# x86[_64]
 $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
 $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
 $(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
@ -138,6 +139,12 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
 $(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
 $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
+$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
+
+# POWER
+$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
+$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx

 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@ -23,9 +23,11 @@ use lib $FindBin::Bin;
 use thumb;

 my $thumb = 0;
+my $elf = 1;

 foreach my $arg (@ARGV) {
    $thumb = 1 if ($arg eq "-thumb");
+    $elf = 0 if ($arg eq "-noelf");
 }

 print "@ This file was created from a .asm file\n";
@ -140,7 +142,11 @@ while (<STDIN>)

    # Make function visible to linker, and make additional symbol with
    # prepended underscore
-    s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
+    if ($elf) {
+        s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
+    } else {
+        s/EXPORT\s+\|([\$\w]*)\|/.global $1/;
+    }
    s/IMPORT\s+\|([\$\w]*)\|/.global $1/;

    s/EXPORT\s+([\$\w]*)/.global $1/;
@ -181,11 +187,16 @@ while (<STDIN>)
    # eabi_attributes numerical equivalents can be found in the
    # "ARM IHI 0045C" document.

-    # REQUIRE8 Stack is required to be 8-byte aligned
-    s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
+    if ($elf) {
+        # REQUIRE8 Stack is required to be 8-byte aligned
+        s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;

-    # PRESERVE8 Stack 8-byte align is preserved
-    s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
+        # PRESERVE8 Stack 8-byte align is preserved
+        s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
+    } else {
+        s/\sREQUIRE8//;
+        s/\sPRESERVE8//;
+    }

    # Use PROC and ENDP to give the symbols a .size directive.
    # This makes them show up properly in debugging tools like gdb and valgrind.
@ -202,7 +213,7 @@ while (<STDIN>)
        my $proc;
        s/\bENDP\b/@ $&/;
        $proc = pop(@proc_stack);
-        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc and $elf);
    }

    # EQU directive
@ -225,4 +236,4 @@ while (<STDIN>)
 }

 # Mark that this object doesn't need an executable stack.
-printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n");
+printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n") if $elf;
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@ -403,6 +403,23 @@ check_gcc_machine_option() {
  fi
 }

+# tests for -m$2, -m$3, -m$4... toggling the feature given in $1.
+check_gcc_machine_options() {
+  feature="$1"
+  shift
+  flags="-m$1"
+  shift
+  for opt in $*; do
+    flags="$flags -m$opt"
+  done
+
+  if enabled gcc && ! disabled "$feature" && ! check_cflags $flags; then
+    RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
+  else
+    soft_enable "$feature"
+  fi
+}
+
 write_common_config_banner() {
  print_webm_license config.mk "##" ""
  echo '# This file automatically generated by configure. Do not edit!' >> config.mk
@ -674,7 +691,6 @@ check_xcode_minimum_version() {
 process_common_toolchain() {
  if [ -z "$toolchain" ]; then
    gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
-
    # detect tgt_isa
    case "$gcctarget" in
      aarch64*)
@ -697,6 +713,18 @@ process_common_toolchain() {
      *sparc*)
        tgt_isa=sparc
        ;;
+      power*64*-*)
+        tgt_isa=ppc64
+        ;;
+      power*)
+        tgt_isa=ppc
+        ;;
+      *mips64el*)
+        tgt_isa=mips64
+        ;;
+      *mips32el*)
+        tgt_isa=mips32
+        ;;
    esac

    # detect tgt_os
@ -725,9 +753,20 @@ process_common_toolchain() {
        tgt_isa=x86_64
        tgt_os=darwin15
        ;;
+      *darwin16*)
+        tgt_isa=x86_64
+        tgt_os=darwin16
+        ;;
+      *darwin17*)
+        tgt_isa=x86_64
+        tgt_os=darwin17
+        ;;
      x86_64*mingw32*)
        tgt_os=win64
        ;;
+      x86_64*cygwin*)
+        tgt_os=win64
+        ;;
      *mingw32*|*cygwin*)
        [ -z "$tgt_isa" ] && tgt_isa=x86
        tgt_os=win32
@ -775,6 +814,9 @@ process_common_toolchain() {
    mips*)
      enable_feature mips
      ;;
+    ppc*)
+      enable_feature ppc
+      ;;
  esac

  # PIC is probably what we want when building shared libs
@ -843,6 +885,14 @@ process_common_toolchain() {
      add_cflags  "-mmacosx-version-min=10.11"
      add_ldflags "-mmacosx-version-min=10.11"
      ;;
+    *-darwin16-*)
+      add_cflags  "-mmacosx-version-min=10.12"
+      add_ldflags "-mmacosx-version-min=10.12"
+      ;;
+    *-darwin17-*)
+      add_cflags  "-mmacosx-version-min=10.13"
+      add_ldflags "-mmacosx-version-min=10.13"
+      ;;
    *-iphonesimulator-*)
      add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
      add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
@ -891,7 +941,6 @@ process_common_toolchain() {
          setup_gnu_toolchain
          arch_int=${tgt_isa##armv}
          arch_int=${arch_int%%te}
-          check_add_asflags --defsym ARCHITECTURE=${arch_int}
          tune_cflags="-mtune="
          if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
            if [ -z "${float_abi}" ]; then
@ -918,6 +967,19 @@ EOF

          enabled debug && add_asflags -g
          asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
+
+          case ${tgt_os} in
+            win*)
+              asm_conversion_cmd="$asm_conversion_cmd -noelf"
+              AS="$CC -c"
+              EXE_SFX=.exe
+              enable_feature thumb
+              ;;
+            *)
+              check_add_asflags --defsym ARCHITECTURE=${arch_int}
+              ;;
+          esac
+
          if enabled thumb; then
            asm_conversion_cmd="$asm_conversion_cmd -thumb"
            check_add_cflags -mthumb
@ -1144,12 +1206,25 @@ EOF
        fi
      fi

+      if enabled mmi; then
+        tgt_isa=loongson3a
+        check_add_ldflags -march=loongson3a
+      fi
+
      check_add_cflags -march=${tgt_isa}
      check_add_asflags -march=${tgt_isa}
      check_add_asflags -KPIC
      ;;
+    ppc*)
+      link_with_cc=gcc
+      setup_gnu_toolchain
+      check_gcc_machine_option "vsx"
+      ;;
    x86*)
      case  ${tgt_os} in
+        android)
+          soft_enable realtime_only
+          ;;
        win*)
          enabled gcc && add_cflags -fno-common
          ;;
@ -1202,6 +1277,13 @@ EOF
          AS=msvs
          msvs_arch_dir=x86-msvs
          vc_version=${tgt_cc##vs}
+          case $vc_version in
+            7|8|9|10|11|12|13|14)
+              echo "${tgt_cc} does not support avx512, disabling....."
+              RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 "
+              soft_disable avx512
+              ;;
+          esac
          case $vc_version in
            7|8|9|10)
              echo "${tgt_cc} does not support avx/avx2, disabling....."
@ -1246,8 +1328,12 @@ EOF
        elif disabled $ext; then
          disable_exts="yes"
        else
-          # use the shortened version for the flag: sse4_1 -> sse4
-          check_gcc_machine_option ${ext%_*} $ext
+          if [ "$ext" = "avx512" ]; then
+            check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl
+          else
+            # use the shortened version for the flag: sse4_1 -> sse4
+            check_gcc_machine_option ${ext%_*} $ext
+          fi
        fi
      done

@ -1273,7 +1359,6 @@ EOF
        esac
        log_echo "  using $AS"
      fi
-      [ "${AS##*/}" = nasm ] && add_asflags -Ox
      AS_SFX=.asm
      case  ${tgt_os} in
        win32)
@ -1282,7 +1367,7 @@ EOF
          EXE_SFX=.exe
          ;;
        win64)
-          add_asflags -f x64
+          add_asflags -f win64
          enabled debug && add_asflags -g cv8
          EXE_SFX=.exe
          ;;
@ -1309,7 +1394,8 @@ EOF
          add_cflags  ${sim_arch}
          add_ldflags ${sim_arch}

-          if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then
+          if [ "$(disabled external_build)" ] &&
+              [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then
            # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it
            # on is pointless (unless building a C-only lib). Warn the user, but
            # do nothing here.
@ -1416,6 +1502,10 @@ EOF
          echo "msa optimizations are available only for little endian platforms"
          disable_feature msa
        fi
+        if enabled mmi; then
+          echo "mmi optimizations are available only for little endian platforms"
+          disable_feature mmi
+        fi
      fi
      ;;
  esac
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@ -25,7 +25,7 @@ files.
 Options:
    --help                      Print this message
    --out=outfile               Redirect output to a file
-    --ver=version               Version (7,8,9,10,11,12,14) of visual studio to generate for
+    --ver=version               Version (7,8,9,10,11,12,14,15) of visual studio to generate for
    --target=isa-os-cc          Target specifier
 EOF
    exit 1
@ -215,7 +215,7 @@ for opt in "$@"; do
    ;;
    --ver=*) vs_ver="$optval"
             case $optval in
-             10|11|12|14)
+             10|11|12|14|15)
             ;;
             *) die Unrecognized Visual Studio Version in $opt
             ;;
@ -240,9 +240,12 @@ case "${vs_ver:-10}" in
    12) sln_vers="12.00"
       sln_vers_str="Visual Studio 2013"
    ;;
-    14) sln_vers="14.00"
+    14) sln_vers="12.00"
       sln_vers_str="Visual Studio 2015"
    ;;
+    15) sln_vers="12.00"
+       sln_vers_str="Visual Studio 2017"
+    ;;
 esac
 sfx=vcxproj

--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@ -34,7 +34,7 @@ Options:
    --name=project_name         Name of the project (required)
    --proj-guid=GUID            GUID to use for the project
    --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (10,11,12,14) of visual studio to generate for
+    --ver=version               Version (10,11,12,14,15) of visual studio to generate for
    --src-path-bare=dir         Path to root of source tree
    -Ipath/to/include           Additional include directories
    -DFLAG[=value]              Preprocessor macros to define
@ -168,7 +168,7 @@ for opt in "$@"; do
        --ver=*)
            vs_ver="$optval"
            case "$optval" in
-                10|11|12|14)
+                10|11|12|14|15)
                ;;
                *) die Unrecognized Visual Studio Version in $opt
                ;;
@ -218,7 +218,7 @@ guid=${guid:-`generate_uuid`}
 asm_use_custom_step=false
 uses_asm=${uses_asm:-false}
 case "${vs_ver:-11}" in
-    10|11|12|14)
+    10|11|12|14|15)
       asm_use_custom_step=$uses_asm
    ;;
 esac
@ -347,6 +347,9 @@ generate_vcxproj() {
            if [ "$vs_ver" = "14" ]; then
                tag_content PlatformToolset v140
            fi
+            if [ "$vs_ver" = "15" ]; then
+                tag_content PlatformToolset v141
+            fi
            tag_content CharacterSet Unicode
            if [ "$config" = "Release" ]; then
                tag_content WholeProgramOptimization true
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@ -35,8 +35,8 @@ ARM_TARGETS="arm64-darwin-gcc
             armv7s-darwin-gcc"
 SIM_TARGETS="x86-iphonesimulator-gcc
             x86_64-iphonesimulator-gcc"
-OSX_TARGETS="x86-darwin15-gcc
-             x86_64-darwin15-gcc"
+OSX_TARGETS="x86-darwin16-gcc
+             x86_64-darwin16-gcc"
 TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"

 # Configures for the target specified by $1, and invokes make with the dist
@ -271,7 +271,7 @@ cat << EOF
    --help: Display this message and exit.
    --enable-shared: Build a dynamic framework for use on iOS 8 or later.
    --extra-configure-args <args>: Extra args to pass when configuring libvpx.
-    --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+    --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86
              and x86_64. Allows linking to framework when builds target MacOSX
              instead of iOS.
    --preserve-build-output: Do not delete the build directory.
--- a/build/make/msvs_common.sh
+++ b/build/make/msvs_common.sh
@ -41,6 +41,15 @@ fix_path() {
 # Corrects the paths in file_list in one pass for efficiency.
 # $1 is the name of the array to be modified.
 fix_file_list() {
+    if [ "${FIXPATH}" = "echo_path" ] ; then
+      # When used with echo_path, fix_file_list is a no-op. Avoid warning about
+      # unsupported 'declare -n' when it is not important.
+      return 0
+    elif [ "${BASH_VERSINFO}" -lt 4 ] ; then
+      echo "Cygwin path conversion has failed. Please use a version of bash"
+      echo "which supports nameref (-n), introduced in bash 4.3"
+      return 1
+    fi
    declare -n array_ref=$1
    files=$(fix_path "${array_ref[@]}")
    local IFS=$'\n'
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@ -1,4 +1,13 @@
 #!/usr/bin/env perl
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##

 no strict 'refs';
 use warnings;
@ -200,6 +209,7 @@ sub filter {
 sub common_top() {
  my $include_guard = uc($opts{sym})."_H_";
  print <<EOF;
+// This file is generated. Do not edit.
 #ifndef ${include_guard}
 #define ${include_guard}

@ -335,6 +345,36 @@ EOF
  common_bottom;
 }

+sub ppc() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = ppc_simd_caps();
+    (void)flags;
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
 sub unoptimized() {
  determine_indirection "c";
  common_top;
@ -361,10 +401,10 @@ EOF

 &require("c");
 if ($opts{arch} eq 'x86') {
-  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/);
  x86;
 } elsif ($opts{arch} eq 'x86_64') {
-  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/);
  @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
  &require(@REQUIRES);
  x86;
@ -381,6 +421,10 @@ if ($opts{arch} eq 'x86') {
      @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
      last;
    }
+    if (/HAVE_MMI=yes/) {
+      @ALL_ARCHS = filter("$opts{arch}", qw/mmi/);
+      last;
+    }
  }
  close CONFIG_FILE;
  mips;
@ -390,6 +434,9 @@ if ($opts{arch} eq 'x86') {
 } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
  @ALL_ARCHS = filter(qw/neon/);
  arm;
+} elsif ($opts{arch} =~ /^ppc/ ) {
+  @ALL_ARCHS = filter(qw/vsx/);
+  ppc;
 } else {
  unoptimized;
 }
--- a/build/make/thumb.pm
+++ b/build/make/thumb.pm
@ -54,13 +54,6 @@ sub FixThumbInstructions($$)
    # "addne r0, r0, r2".
    s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;

-    # Convert a conditional addition to the pc register into a series of
-    # instructions. This converts "addlt pc, pc, r3, lsl #2" into
-    # "itttt lt", "movlt.n r12, pc", "addlt.w r12, #12",
-    # "addlt.w r12, r12, r3, lsl #2", "movlt.n pc, r12".
-    # This assumes that r12 is free at this point.
-    s/^(\s*)addlt(\s+)pc,\s*pc,\s*(\w+),\s*lsl\s*#(\d+)/$1itttt$2lt\n$1movlt.n$2r12, pc\n$1addlt.w$2r12, #12\n$1addlt.w$2r12, r12, $3, lsl #($4-$branch_shift_offset)\n$1movlt.n$2pc, r12/g;
-
    # Convert "mov pc, lr" into "bx lr", since the former only works
    # for switching from arm to thumb (and only in armv7), but not
    # from thumb to arm.
--- a/build/make/version.sh
+++ b/build/make/version.sh
@ -60,6 +60,7 @@ if [ ${bare} ]; then
    echo "${changelog_version}${git_version_id}" > $$.tmp
 else
    cat<<EOF>$$.tmp
+// This file is generated. Do not edit.
 #define VERSION_MAJOR  $major_version
 #define VERSION_MINOR  $minor_version
 #define VERSION_PATCH  $patch_version
--- a/codereview.settings
+++ b/codereview.settings
@ -1,5 +1,4 @@
-# This file is used by gcl to get repository specific information.
-GERRIT_HOST: chromium-review.googlesource.com
-GERRIT_PORT: 29418
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
 CODE_REVIEW_SERVER: chromium-review.googlesource.com
 GERRIT_SQUASH_UPLOADS: False
--- a/46
+++ b/46
@ -101,18 +101,23 @@ EOF
 all_platforms="${all_platforms} arm64-android-gcc"
 all_platforms="${all_platforms} arm64-darwin-gcc"
 all_platforms="${all_platforms} arm64-linux-gcc"
+all_platforms="${all_platforms} arm64-win64-gcc"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
 all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
+all_platforms="${all_platforms} armv7-win32-gcc"
 all_platforms="${all_platforms} armv7-win32-vs11"
 all_platforms="${all_platforms} armv7-win32-vs12"
 all_platforms="${all_platforms} armv7-win32-vs14"
+all_platforms="${all_platforms} armv7-win32-vs15"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} armv8-linux-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} mips64-linux-gcc"
+all_platforms="${all_platforms} ppc64-linux-gcc"
+all_platforms="${all_platforms} ppc64le-linux-gcc"
 all_platforms="${all_platforms} sparc-solaris-gcc"
 all_platforms="${all_platforms} x86-android-gcc"
 all_platforms="${all_platforms} x86-darwin8-gcc"
@ -125,6 +130,8 @@ all_platforms="${all_platforms} x86-darwin12-gcc"
 all_platforms="${all_platforms} x86-darwin13-gcc"
 all_platforms="${all_platforms} x86-darwin14-gcc"
 all_platforms="${all_platforms} x86-darwin15-gcc"
+all_platforms="${all_platforms} x86-darwin16-gcc"
+all_platforms="${all_platforms} x86-darwin17-gcc"
 all_platforms="${all_platforms} x86-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
@ -135,6 +142,7 @@ all_platforms="${all_platforms} x86-win32-vs10"
 all_platforms="${all_platforms} x86-win32-vs11"
 all_platforms="${all_platforms} x86-win32-vs12"
 all_platforms="${all_platforms} x86-win32-vs14"
+all_platforms="${all_platforms} x86-win32-vs15"
 all_platforms="${all_platforms} x86_64-android-gcc"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
@ -143,6 +151,8 @@ all_platforms="${all_platforms} x86_64-darwin12-gcc"
 all_platforms="${all_platforms} x86_64-darwin13-gcc"
 all_platforms="${all_platforms} x86_64-darwin14-gcc"
 all_platforms="${all_platforms} x86_64-darwin15-gcc"
+all_platforms="${all_platforms} x86_64-darwin16-gcc"
+all_platforms="${all_platforms} x86_64-darwin17-gcc"
 all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
@ -152,6 +162,7 @@ all_platforms="${all_platforms} x86_64-win64-vs10"
 all_platforms="${all_platforms} x86_64-win64-vs11"
 all_platforms="${all_platforms} x86_64-win64-vs12"
 all_platforms="${all_platforms} x86_64-win64-vs14"
+all_platforms="${all_platforms} x86_64-win64-vs15"
 all_platforms="${all_platforms} generic-gnu"

 # all_targets is a list of all targets that can be configured
@ -163,11 +174,14 @@ for t in ${all_targets}; do
    [ -f "${source_path}/${t}.mk" ] && enable_feature ${t}
 done

+if ! diff --version >/dev/null; then
+  die "diff missing: Try installing diffutils via your package manager."
+fi
+
 if ! perl --version >/dev/null; then
    die "Perl is required to build"
 fi

-
 if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then
  # test to see if source_path already configured
  if [ -f "${source_path}/vpx_config.h" ]; then
@ -223,6 +237,7 @@ ARCH_LIST="
    mips
    x86
    x86_64
+    ppc
 "
 ARCH_EXT_LIST_X86="
    mmx
@ -233,7 +248,13 @@ ARCH_EXT_LIST_X86="
    sse4_1
    avx
    avx2
+    avx512
 "
+
+ARCH_EXT_LIST_LOONGSON="
+    mmi
+"
+
 ARCH_EXT_LIST="
    neon
    neon_asm
@ -244,6 +265,10 @@ ARCH_EXT_LIST="
    mips64

    ${ARCH_EXT_LIST_X86}
+
+    vsx
+
+    ${ARCH_EXT_LIST_LOONGSON}
 "
 HAVE_LIST="
    ${ARCH_EXT_LIST}
@ -252,10 +277,8 @@ HAVE_LIST="
    unistd_h
 "
 EXPERIMENT_LIST="
-    spatial_svc
    fp_mb_stats
    emulate_hardware
-    misc_fixes
 "
 CONFIG_LIST="
    dependency_tracking
@ -310,6 +333,7 @@ CONFIG_LIST="
    better_hw_compatibility
    experimental
    size_limit
+    always_adjust_bpm
    ${EXPERIMENT_LIST}
 "
 CMDLINE_SELECT="
@ -369,6 +393,7 @@ CMDLINE_SELECT="
    better_hw_compatibility
    vp9_highbitdepth
    experimental
+    always_adjust_bpm
 "

 process_cmdline() {
@ -570,6 +595,7 @@ process_toolchain() {
        check_add_cflags -Wdeclaration-after-statement
        check_add_cflags -Wdisabled-optimization
        check_add_cflags -Wfloat-conversion
+        check_add_cflags -Wparentheses-equality
        check_add_cflags -Wpointer-arith
        check_add_cflags -Wtype-limits
        check_add_cflags -Wcast-qual
@ -583,16 +609,16 @@ process_toolchain() {
        # https://bugs.chromium.org/p/webm/issues/detail?id=1069
        check_add_cflags -Wextra
        # check_add_cflags also adds to cxxflags. gtest does not do well with
-        # -Wundef so add it explicitly to CFLAGS only.
+        # these flags so add them explicitly to CFLAGS only.
        check_cflags -Wundef && add_cflags_only -Wundef
+        check_cflags -Wframe-larger-than=52000 && \
+          add_cflags_only -Wframe-larger-than=52000
        if enabled mips || [ -z "${INLINE}" ]; then
          enabled extra_warnings || check_add_cflags -Wno-unused-function
        fi
-        if ! enabled vp9_highbitdepth; then
-          # Avoid this warning for third_party C++ sources. Some reorganization
-          # would be needed to apply this only to test/*.cc.
-          check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
-        fi
+        # Avoid this warning for third_party C++ sources. Some reorganization
+        # would be needed to apply this only to test/*.cc.
+        check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
    fi

    if enabled icc; then
@ -644,7 +670,7 @@ process_toolchain() {
             gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
             enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
             all_targets="${all_targets} solution"
-             INLINE="__forceinline"
+             INLINE="__inline"
        ;;
    esac

--- a/examples.mk
+++ b/examples.mk
@ -109,18 +109,17 @@ ifeq ($(CONFIG_WEBM_IO),yes)
 endif
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
 vpxenc.DESCRIPTION           = Full featured encoder
-ifeq ($(CONFIG_SPATIAL_SVC),yes)
-  EXAMPLES-$(CONFIG_VP9_ENCODER)      += vp9_spatial_svc_encoder.c
-  vp9_spatial_svc_encoder.SRCS        += args.c args.h
-  vp9_spatial_svc_encoder.SRCS        += ivfenc.c ivfenc.h
-  vp9_spatial_svc_encoder.SRCS        += tools_common.c tools_common.h
-  vp9_spatial_svc_encoder.SRCS        += video_common.h
-  vp9_spatial_svc_encoder.SRCS        += video_writer.h video_writer.c
-  vp9_spatial_svc_encoder.SRCS        += vpx_ports/msvc.h
-  vp9_spatial_svc_encoder.SRCS        += vpxstats.c vpxstats.h
-  vp9_spatial_svc_encoder.GUID        = 4A38598D-627D-4505-9C7B-D4020C84100D
-  vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
-endif
+
+EXAMPLES-$(CONFIG_VP9_ENCODER)      += vp9_spatial_svc_encoder.c
+vp9_spatial_svc_encoder.SRCS        += args.c args.h
+vp9_spatial_svc_encoder.SRCS        += ivfenc.c ivfenc.h
+vp9_spatial_svc_encoder.SRCS        += tools_common.c tools_common.h
+vp9_spatial_svc_encoder.SRCS        += video_common.h
+vp9_spatial_svc_encoder.SRCS        += video_writer.h video_writer.c
+vp9_spatial_svc_encoder.SRCS        += vpx_ports/msvc.h
+vp9_spatial_svc_encoder.SRCS        += vpxstats.c vpxstats.h
+vp9_spatial_svc_encoder.GUID        = 4A38598D-627D-4505-9C7B-D4020C84100D
+vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder

 ifneq ($(CONFIG_SHARED),yes)
 EXAMPLES-$(CONFIG_VP9_ENCODER)    += resize_util.c
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@ -151,7 +151,7 @@ static void write_ivf_frame_header(FILE *outfile,
  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return;

  pts = pkt->data.frame.pts;
-  mem_put_le32(header, pkt->data.frame.sz);
+  mem_put_le32(header, (int)pkt->data.frame.sz);
  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
  mem_put_le32(header + 8, pts >> 32);

@ -190,7 +190,7 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
      cfg->ts_layer_id[0] = 0;
      cfg->ts_layer_id[1] = 1;
      // Use 60/40 bit allocation as example.
-      cfg->ts_target_bitrate[0] = 0.6f * bitrate;
+      cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate);
      cfg->ts_target_bitrate[1] = bitrate;

      /* 0=L, 1=GF */
@ -240,9 +240,9 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
      cfg->ts_layer_id[1] = 2;
      cfg->ts_layer_id[2] = 1;
      cfg->ts_layer_id[3] = 2;
-      // Use 40/20/40 bit allocation as example.
-      cfg->ts_target_bitrate[0] = 0.4f * bitrate;
-      cfg->ts_target_bitrate[1] = 0.6f * bitrate;
+      // Use 45/20/35 bit allocation as example.
+      cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate);
+      cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate);
      cfg->ts_target_bitrate[2] = bitrate;

      /* 0=L, 1=GF, 2=ARF */
@ -294,8 +294,8 @@ int main(int argc, char **argv) {
  vpx_codec_err_t res[NUM_ENCODERS];

  int i;
-  long width;
-  long height;
+  int width;
+  int height;
  int length_frame;
  int frame_avail;
  int got_data;
@ -347,9 +347,9 @@ int main(int argc, char **argv) {

  printf("Using %s\n", vpx_codec_iface_name(interface));

-  width = strtol(argv[1], NULL, 0);
-  height = strtol(argv[2], NULL, 0);
-  framerate = strtol(argv[3], NULL, 0);
+  width = (int)strtol(argv[1], NULL, 0);
+  height = (int)strtol(argv[2], NULL, 0);
+  framerate = (int)strtol(argv[3], NULL, 0);

  if (width < 16 || width % 2 || height < 16 || height % 2)
    die("Invalid resolution: %ldx%ld", width, height);
@ -371,12 +371,13 @@ int main(int argc, char **argv) {

  // Bitrates per spatial layer: overwrite default rates above.
  for (i = 0; i < NUM_ENCODERS; i++) {
-    target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
+    target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
  }

  // Temporal layers per spatial layers: overwrite default settings above.
  for (i = 0; i < NUM_ENCODERS; i++) {
-    num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
+    num_temporal_layers[i] =
+        (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
    if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
      die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n",
          num_temporal_layers);
@ -391,9 +392,9 @@ int main(int argc, char **argv) {
    downsampled_input[i] = fopen(filename, "wb");
  }

-  key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
+  key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);

-  show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
+  show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);

  /* Populate default encoder configuration */
  for (i = 0; i < NUM_ENCODERS; i++) {
@ -460,7 +461,7 @@ int main(int argc, char **argv) {

  // Set the number of threads per encode/spatial layer.
  // (1, 1, 1) means no encoder threading.
-  cfg[0].g_threads = 2;
+  cfg[0].g_threads = 1;
  cfg[1].g_threads = 1;
  cfg[2].g_threads = 1;

@ -469,7 +470,7 @@ int main(int argc, char **argv) {
    if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
      die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);

-  if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
+  if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w)
    read_frame_p = read_frame;
  else
    read_frame_p = read_frame_by_row;
@ -507,9 +508,11 @@ int main(int argc, char **argv) {

  /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
  /* Enable denoising for the highest-resolution encoder. */
-  if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 4))
+  if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
    die_codec(&codec[0], "Failed to set noise_sensitivity");
-  for (i = 1; i < NUM_ENCODERS; i++) {
+  if (vpx_codec_control(&codec[1], VP8E_SET_NOISE_SENSITIVITY, 1))
+    die_codec(&codec[1], "Failed to set noise_sensitivity");
+  for (i = 2; i < NUM_ENCODERS; i++) {
    if (vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
      die_codec(&codec[i], "Failed to set noise_sensitivity");
  }
@ -556,7 +559,8 @@ int main(int argc, char **argv) {
        /* Write out down-sampled input. */
        length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2;
        if (fwrite(raw[i].planes[0], 1, length_frame,
-                   downsampled_input[NUM_ENCODERS - i - 1]) != length_frame) {
+                   downsampled_input[NUM_ENCODERS - i - 1]) !=
+            (unsigned int)length_frame) {
          return EXIT_FAILURE;
        }
      }
@ -617,10 +621,6 @@ int main(int argc, char **argv) {
            break;
          default: break;
        }
-        printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT &&
-                       (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)
-                   ? "K"
-                   : "");
        fflush(stdout);
      }
    }
@ -661,7 +661,6 @@ int main(int argc, char **argv) {
      write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1);
    fclose(outfile[i]);
  }
-  printf("\n");

  return EXIT_SUCCESS;
 }
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@ -168,7 +168,7 @@ void usage_exit(void) {
 static void parse_command_line(int argc, const char **argv_,
                               AppInput *app_input, SvcContext *svc_ctx,
                               vpx_codec_enc_cfg_t *enc_cfg) {
-  struct arg arg = { 0 };
+  struct arg arg;
  char **argv = NULL;
  char **argi = NULL;
  char **argj = NULL;
@ -429,8 +429,9 @@ static void set_rate_control_stats(struct RateControlStats *rc,
        rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl];
      if (tl > 0) {
        rc->layer_pfb[layer] =
-            1000.0 * (cfg->layer_target_bitrate[layer] -
-                      cfg->layer_target_bitrate[layer - 1]) /
+            1000.0 *
+            (cfg->layer_target_bitrate[layer] -
+             cfg->layer_target_bitrate[layer - 1]) /
            (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
      } else {
        rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
@ -502,14 +503,12 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
  printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
         rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate),
         perc_fluctuation);
-  if (frame_cnt != tot_num_frames)
-    die("Error: Number of input frames not equal to output encoded frames != "
-        "%d tot_num_frames = %d\n",
-        frame_cnt, tot_num_frames);
+  printf("Num of input, num of encoded (super) frames: %d %d \n", frame_cnt,
+         tot_num_frames);
 }

 vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
-                                       uint32_t sizes[8], int *count) {
+                                       uint64_t sizes[8], int *count) {
  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
  // it is a super frame index. If the last byte of real video compression
  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
@ -561,9 +560,10 @@ vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
 // bypass/flexible mode. The pattern corresponds to the pattern
 // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
 // non-flexible mode.
-void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
+void set_frame_flags_bypass_mode(int tl, int num_spatial_layers,
                                 int is_key_frame,
                                 vpx_svc_ref_frame_config_t *ref_frame_config) {
+  int sl;
  for (sl = 0; sl < num_spatial_layers; ++sl) {
    if (!tl) {
      if (!sl) {
@ -573,8 +573,8 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
      } else {
        if (is_key_frame) {
          ref_frame_config->frame_flags[sl] =
-              VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF |
-              VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+              VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+              VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF;
        } else {
          ref_frame_config->frame_flags[sl] =
              VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@ -588,14 +588,24 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
      } else {
        ref_frame_config->frame_flags[sl] =
            VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+        if (sl == num_spatial_layers - 1)
+          ref_frame_config->frame_flags[sl] =
+              VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_ARF |
+              VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
      }
    }
    if (tl == 0) {
      ref_frame_config->lst_fb_idx[sl] = sl;
-      if (sl)
-        ref_frame_config->gld_fb_idx[sl] = sl - 1;
-      else
+      if (sl) {
+        if (is_key_frame) {
+          ref_frame_config->lst_fb_idx[sl] = sl - 1;
+          ref_frame_config->gld_fb_idx[sl] = sl;
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = sl - 1;
+        }
+      } else {
        ref_frame_config->gld_fb_idx[sl] = 0;
+      }
      ref_frame_config->alt_fb_idx[sl] = 0;
    } else if (tl == 1) {
      ref_frame_config->lst_fb_idx[sl] = sl;
@ -606,9 +616,9 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
 }

 int main(int argc, const char **argv) {
-  AppInput app_input = { 0 };
+  AppInput app_input;
  VpxVideoWriter *writer = NULL;
-  VpxVideoInfo info = { 0 };
+  VpxVideoInfo info;
  vpx_codec_ctx_t codec;
  vpx_codec_enc_cfg_t enc_cfg;
  SvcContext svc_ctx;
@ -622,7 +632,7 @@ int main(int argc, const char **argv) {
  int end_of_stream = 0;
  int frames_received = 0;
 #if OUTPUT_RC_STATS
-  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL };
+  VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL };
  struct RateControlStats rc;
  vpx_svc_layer_id_t layer_id;
  vpx_svc_ref_frame_config_t ref_frame_config;
@ -634,14 +644,16 @@ int main(int argc, const char **argv) {
  struct vpx_usec_timer timer;
  int64_t cx_time = 0;
  memset(&svc_ctx, 0, sizeof(svc_ctx));
-  svc_ctx.log_print = 1;
+  memset(&app_input, 0, sizeof(AppInput));
+  memset(&info, 0, sizeof(VpxVideoInfo));
  exec_name = argv[0];
  parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg);

 // Allocate image buffer
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420
-                                                          : VPX_IMG_FMT_I42016,
+  if (!vpx_img_alloc(&raw,
+                     enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420
+                                                    : VPX_IMG_FMT_I42016,
                     enc_cfg.g_w, enc_cfg.g_h, 32)) {
    die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
  }
@ -660,6 +672,10 @@ int main(int argc, const char **argv) {
    die("Failed to initialize encoder\n");

 #if OUTPUT_RC_STATS
+  rc.window_count = 1;
+  rc.window_size = 15;  // Silence a static analysis warning.
+  rc.avg_st_encoding_bitrate = 0.0;
+  rc.variance_st_encoding_bitrate = 0.0;
  if (svc_ctx.output_rc_stat) {
    set_rate_control_stats(&rc, &enc_cfg);
    framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num;
@ -678,16 +694,16 @@ int main(int argc, const char **argv) {
      die("Failed to open %s for writing\n", app_input.output_filename);
  }
 #if OUTPUT_RC_STATS
-  // For now, just write temporal layer streams.
-  // TODO(wonkap): do spatial by re-writing superframe.
+  // Write out spatial layer stream.
+  // TODO(marpan/jianj): allow for writing each spatial and temporal stream.
  if (svc_ctx.output_rc_stat) {
-    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
+    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
      char file_name[PATH_MAX];

-      snprintf(file_name, sizeof(file_name), "%s_t%d.ivf",
-               app_input.output_filename, tl);
-      outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info);
-      if (!outfile[tl]) die("Failed to open %s for writing", file_name);
+      snprintf(file_name, sizeof(file_name), "%s_s%d.ivf",
+               app_input.output_filename, sl);
+      outfile[sl] = vpx_video_writer_open(file_name, kContainerIVF, &info);
+      if (!outfile[sl]) die("Failed to open %s for writing", file_name);
    }
  }
 #endif
@ -697,12 +713,22 @@ int main(int argc, const char **argv) {

  if (svc_ctx.speed != -1)
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
-  if (svc_ctx.threads)
-    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
+  if (svc_ctx.threads) {
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(svc_ctx.threads));
+    if (svc_ctx.threads > 1)
+      vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1);
+    else
+      vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0);
+  }
  if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
  if (svc_ctx.speed >= 5)
    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+  vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900);
+
+  vpx_codec_control(&codec, VP9E_SET_SVC_INTER_LAYER_PRED, 0);
+
+  vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);

  // Encode frames
  while (!end_of_stream) {
@ -731,7 +757,9 @@ int main(int argc, const char **argv) {
      // the encode for the whole superframe. The encoder will internally loop
      // over all the spatial layers for the current superframe.
      vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
-      set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id,
+      // TODO(jianj): Fix the parameter passing for "is_key_frame" in
+      // set_frame_flags_bypass_model() for case of periodic key frames.
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
                                  svc_ctx.spatial_layers, frame_cnt == 0,
                                  &ref_frame_config);
      vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG,
@ -742,6 +770,17 @@ int main(int argc, const char **argv) {
        ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
                                layer_id.temporal_layer_id];
      }
+    } else {
+      // For the fixed pattern SVC, temporal layer is given by superframe count.
+      unsigned int tl = 0;
+      if (enc_cfg.ts_number_layers == 2)
+        tl = (frame_cnt % 2 != 0);
+      else if (enc_cfg.ts_number_layers == 3) {
+        if (frame_cnt % 2 != 0) tl = 2;
+        if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) tl = 1;
+      }
+      for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl)
+        ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + tl];
    }

    vpx_usec_timer_start(&timer);
@ -751,7 +790,6 @@ int main(int argc, const char **argv) {
    vpx_usec_timer_mark(&timer);
    cx_time += vpx_usec_timer_elapsed(&timer);

-    printf("%s", vpx_svc_get_message(&svc_ctx));
    fflush(stdout);
    if (res != VPX_CODEC_OK) {
      die_codec(&codec, "Failed to encode frame");
@ -763,51 +801,64 @@ int main(int argc, const char **argv) {
          SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal;
          if (cx_pkt->data.frame.sz > 0) {
 #if OUTPUT_RC_STATS
-            uint32_t sizes[8];
+            uint64_t sizes[8];
+            uint64_t sizes_parsed[8];
            int count = 0;
+            vp9_zero(sizes);
+            vp9_zero(sizes_parsed);
 #endif
            vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf,
                                         cx_pkt->data.frame.sz,
                                         cx_pkt->data.frame.pts);
 #if OUTPUT_RC_STATS
-            // TODO(marpan/wonkap): Put this (to line728) in separate function.
+            // TODO(marpan): Put this (to line728) in separate function.
            if (svc_ctx.output_rc_stat) {
              vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
              parse_superframe_index(cx_pkt->data.frame.buf,
-                                     cx_pkt->data.frame.sz, sizes, &count);
-              // Note computing input_layer_frames here won't account for frame
-              // drops in rate control stats.
-              // TODO(marpan): Fix this for non-bypass mode so we can get stats
-              // for dropped frames.
+                                     cx_pkt->data.frame.sz, sizes_parsed,
+                                     &count);
+              if (enc_cfg.ss_number_layers == 1)
+                sizes[0] = cx_pkt->data.frame.sz;
              if (svc_ctx.temporal_layering_mode !=
                  VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+                int num_layers_encoded = 0;
                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-                  ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
-                                          layer_id.temporal_layer_id];
+                  sizes[sl] = 0;
+                  if (cx_pkt->data.frame.spatial_layer_encoded[sl]) {
+                    sizes[sl] = sizes_parsed[num_layers_encoded];
+                    num_layers_encoded++;
+                  }
+                }
+                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                  unsigned int sl2;
+                  uint64_t tot_size = 0;
+                  for (sl2 = 0; sl2 <= sl; ++sl2) {
+                    if (cx_pkt->data.frame.spatial_layer_encoded[sl2])
+                      tot_size += sizes[sl2];
+                  }
+                  if (tot_size > 0)
+                    vpx_video_writer_write_frame(
+                        outfile[sl], cx_pkt->data.frame.buf, (size_t)(tot_size),
+                        cx_pkt->data.frame.pts);
                }
              }
-              for (tl = layer_id.temporal_layer_id;
-                   tl < enc_cfg.ts_number_layers; ++tl) {
-                vpx_video_writer_write_frame(
-                    outfile[tl], cx_pkt->data.frame.buf, cx_pkt->data.frame.sz,
-                    cx_pkt->data.frame.pts);
-              }
-
              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-                for (tl = layer_id.temporal_layer_id;
-                     tl < enc_cfg.ts_number_layers; ++tl) {
-                  const int layer = sl * enc_cfg.ts_number_layers + tl;
-                  ++rc.layer_tot_enc_frames[layer];
-                  rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
-                  // Keep count of rate control stats per layer, for non-key
-                  // frames.
-                  if (tl == (unsigned int)layer_id.temporal_layer_id &&
-                      !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
-                    rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
-                    rc.layer_avg_rate_mismatch[layer] +=
-                        fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) /
-                        rc.layer_pfb[layer];
-                    ++rc.layer_enc_frames[layer];
+                if (cx_pkt->data.frame.spatial_layer_encoded[sl]) {
+                  for (tl = layer_id.temporal_layer_id;
+                       tl < enc_cfg.ts_number_layers; ++tl) {
+                    const int layer = sl * enc_cfg.ts_number_layers + tl;
+                    ++rc.layer_tot_enc_frames[layer];
+                    rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
+                    // Keep count of rate control stats per layer, for non-key
+                    // frames.
+                    if (tl == (unsigned int)layer_id.temporal_layer_id &&
+                        !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
+                      rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
+                      rc.layer_avg_rate_mismatch[layer] +=
+                          fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) /
+                          rc.layer_pfb[layer];
+                      ++rc.layer_enc_frames[layer];
+                    }
                  }
                }
              }
@ -816,9 +867,9 @@ int main(int argc, const char **argv) {
              // window of size rc->window, shifted by rc->window / 2.
              // Ignore first window segment, due to key frame.
              if (frame_cnt > (unsigned int)rc.window_size) {
-                tl = layer_id.temporal_layer_id;
                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-                  sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
+                  if (cx_pkt->data.frame.spatial_layer_encoded[sl])
+                    sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
                }
                if (frame_cnt % rc.window_size == 0) {
                  rc.window_count += 1;
@ -833,7 +884,6 @@ int main(int argc, const char **argv) {
              // Second shifted window.
              if (frame_cnt >
                  (unsigned int)(rc.window_size + rc.window_size / 2)) {
-                tl = layer_id.temporal_layer_id;
                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
                  sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
                }
@ -900,8 +950,8 @@ int main(int argc, const char **argv) {
  }
 #if OUTPUT_RC_STATS
  if (svc_ctx.output_rc_stat) {
-    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
-      vpx_video_writer_close(outfile[tl]);
+    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+      vpx_video_writer_close(outfile[sl]);
    }
  }
 #endif
@ -910,7 +960,7 @@ int main(int argc, const char **argv) {
         1000000 * (double)frame_cnt / (double)cx_time);
  vpx_img_free(&raw);
  // display average size, psnr
-  printf("%s", vpx_svc_dump_statistics(&svc_ctx));
+  vpx_svc_dump_statistics(&svc_ctx);
  vpx_svc_release(&svc_ctx);
  return EXIT_SUCCESS;
 }
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@ -22,21 +22,34 @@
 #include "../vpx_ports/vpx_timer.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
+#include "vpx_ports/bitops.h"

 #include "../tools_common.h"
 #include "../video_writer.h"

+#define ROI_MAP 0
+
+#define zero(Dest) memset(&Dest, 0, sizeof(Dest));
+
 static const char *exec_name;

 void usage_exit(void) { exit(EXIT_FAILURE); }

-// Denoiser states, for temporal denoising.
-enum denoiserState {
-  kDenoiserOff,
-  kDenoiserOnYOnly,
-  kDenoiserOnYUV,
-  kDenoiserOnYUVAggressive,
-  kDenoiserOnAdaptive
+// Denoiser states for vp8, for temporal denoising.
+enum denoiserStateVp8 {
+  kVp8DenoiserOff,
+  kVp8DenoiserOnYOnly,
+  kVp8DenoiserOnYUV,
+  kVp8DenoiserOnYUVAggressive,
+  kVp8DenoiserOnAdaptive
+};
+
+// Denoiser states for vp9, for temporal denoising.
+enum denoiserStateVp9 {
+  kVp9DenoiserOff,
+  kVp9DenoiserOnYOnly,
+  // For SVC: denoise the top two spatial layers.
+  kVp9DenoiserOnYTwoSpatialLayers
 };

 static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 };
@ -89,9 +102,10 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc,
  for (i = 0; i < cfg->ts_number_layers; ++i) {
    if (i > 0) {
      rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
-      rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] -
-                                   rc->layer_target_bitrate[i - 1]) /
-                         (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
+      rc->layer_pfb[i] =
+          1000.0 *
+          (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
+          (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
    }
    rc->layer_input_frames[i] = 0;
    rc->layer_enc_frames[i] = 0;
@ -154,6 +168,75 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
    die("Error: Number of input frames not equal to output! \n");
 }

+#if ROI_MAP
+static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg,
+                        vpx_roi_map_t *roi) {
+  unsigned int i, j;
+  int block_size = 0;
+  uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0;
+  uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0;
+  if (!is_vp8 && !is_vp9) {
+    die("unsupported codec.");
+  }
+  zero(*roi);
+
+  block_size = is_vp9 && !is_vp8 ? 8 : 16;
+
+  // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for
+  // segment is 16x16 for vp8, 8x8 for vp9.
+  roi->rows = (cfg->g_h + block_size - 1) / block_size;
+  roi->cols = (cfg->g_w + block_size - 1) / block_size;
+
+  // Applies delta QP on the segment blocks, varies from -63 to 63.
+  // Setting to negative means lower QP (better quality).
+  // Below we set delta_q to the extreme (-63) to show strong effect.
+  // VP8 uses the first 4 segments. VP9 uses all 8 segments.
+  zero(roi->delta_q);
+  roi->delta_q[1] = -63;
+
+  // Applies delta loopfilter strength on the segment blocks, varies from -63 to
+  // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4
+  // segments. VP9 uses all 8 segments.
+  zero(roi->delta_lf);
+
+  if (is_vp8) {
+    // Applies skip encoding threshold on the segment blocks, varies from 0 to
+    // UINT_MAX. Larger value means more skipping of encoding is possible.
+    // This skip threshold only applies on delta frames.
+    zero(roi->static_threshold);
+  }
+
+  if (is_vp9) {
+    // Apply skip segment. Setting to 1 means this block will be copied from
+    // previous frame.
+    zero(roi->skip);
+  }
+
+  if (is_vp9) {
+    // Apply ref frame segment.
+    // -1 : Do not apply this segment.
+    //  0 : Froce using intra.
+    //  1 : Force using last.
+    //  2 : Force using golden.
+    //  3 : Force using alfref but not used in non-rd pickmode for 0 lag.
+    memset(roi->ref_frame, -1, sizeof(roi->ref_frame));
+    roi->ref_frame[1] = 1;
+  }
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi->roi_map =
+      (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map));
+  for (i = 0; i < roi->rows; ++i) {
+    for (j = 0; j < roi->cols; ++j) {
+      if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) &&
+          j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) {
+        roi->roi_map[i * roi->cols + j] = 1;
+      }
+    }
+  }
+}
+#endif
+
 // Temporal scaling parameters:
 // NOTE: The 3 prediction frames cannot be used interchangeably due to
 // differences in the way they are handled throughout the code. The
@ -495,6 +578,7 @@ int main(int argc, char **argv) {
  vpx_codec_err_t res;
  unsigned int width;
  unsigned int height;
+  uint32_t error_resilient = 0;
  int speed;
  int frame_avail;
  int got_data;
@ -505,16 +589,15 @@ int main(int argc, char **argv) {
  int layering_mode = 0;
  int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 };
  int flag_periodicity = 1;
-#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
-  vpx_svc_layer_id_t layer_id = { 0, 0 };
-#else
-  vpx_svc_layer_id_t layer_id = { 0 };
+#if ROI_MAP
+  vpx_roi_map_t roi;
 #endif
+  vpx_svc_layer_id_t layer_id = { 0, 0 };
  const VpxInterface *encoder = NULL;
  FILE *infile = NULL;
  struct RateControlMetrics rc;
  int64_t cx_time = 0;
-  const int min_args_base = 12;
+  const int min_args_base = 13;
 #if CONFIG_VP9_HIGHBITDEPTH
  vpx_bit_depth_t bit_depth = VPX_BITS_8;
  int input_bit_depth = 8;
@ -526,17 +609,21 @@ int main(int argc, char **argv) {
  double sum_bitrate2 = 0.0;
  double framerate = 30.0;

+  zero(rc.layer_target_bitrate);
+
  exec_name = argv[0];
  // Check usage and arguments.
  if (argc < min_args) {
 #if CONFIG_VP9_HIGHBITDEPTH
    die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
-        "<rate_num> <rate_den> <speed> <frame_drop_threshold> <threads> <mode> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> "
+        "<error_resilient> <threads> <mode> "
        "<Rate_0> ... <Rate_nlayers-1> <bit-depth> \n",
        argv[0]);
 #else
    die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
-        "<rate_num> <rate_den> <speed> <frame_drop_threshold> <threads> <mode> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> "
+        "<error_resilient> <threads> <mode> "
        "<Rate_0> ... <Rate_nlayers-1> \n",
        argv[0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@ -553,9 +640,9 @@ int main(int argc, char **argv) {
    die("Invalid resolution: %d x %d", width, height);
  }

-  layering_mode = (int)strtol(argv[11], NULL, 0);
+  layering_mode = (int)strtol(argv[12], NULL, 0);
  if (layering_mode < 0 || layering_mode > 13) {
-    die("Invalid layering mode (0..12) %s", argv[11]);
+    die("Invalid layering mode (0..12) %s", argv[12]);
  }

  if (argc != min_args + mode_to_num_layers[layering_mode]) {
@ -619,11 +706,11 @@ int main(int argc, char **argv) {

  for (i = min_args_base;
       (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
-    rc.layer_target_bitrate[i - 12] = (int)strtol(argv[i], NULL, 0);
+    rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0);
    if (strncmp(encoder->name, "vp8", 3) == 0)
-      cfg.ts_target_bitrate[i - 12] = rc.layer_target_bitrate[i - 12];
+      cfg.ts_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
    else if (strncmp(encoder->name, "vp9", 3) == 0)
-      cfg.layer_target_bitrate[i - 12] = rc.layer_target_bitrate[i - 12];
+      cfg.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
  }

  // Real time parameters.
@ -634,7 +721,7 @@ int main(int argc, char **argv) {
  if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52;
  cfg.rc_undershoot_pct = 50;
  cfg.rc_overshoot_pct = 50;
-  cfg.rc_buf_initial_sz = 500;
+  cfg.rc_buf_initial_sz = 600;
  cfg.rc_buf_optimal_sz = 600;
  cfg.rc_buf_sz = 1000;

@ -642,10 +729,14 @@ int main(int argc, char **argv) {
  cfg.rc_resize_allowed = 0;

  // Use 1 thread as default.
-  cfg.g_threads = (unsigned int)strtoul(argv[10], NULL, 0);
+  cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0);

+  error_resilient = (uint32_t)strtoul(argv[10], NULL, 0);
+  if (error_resilient != 0 && error_resilient != 1) {
+    die("Invalid value for error resilient (0, 1): %d.", error_resilient);
+  }
  // Enable error resilient mode.
-  cfg.g_error_resilient = 1;
+  cfg.g_error_resilient = error_resilient;
  cfg.g_lag_in_frames = 0;
  cfg.kf_mode = VPX_KF_AUTO;

@ -700,18 +791,39 @@ int main(int argc, char **argv) {

  if (strncmp(encoder->name, "vp8", 3) == 0) {
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
-    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff);
    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+    vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0);
+#if ROI_MAP
+    set_roi_map(encoder->name, &cfg, &roi);
+    if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
+      die_codec(&codec, "Failed to set ROI map");
+#endif
+
  } else if (strncmp(encoder->name, "vp9", 3) == 0) {
    vpx_svc_extra_cfg_t svc_params;
    memset(&svc_params, 0, sizeof(svc_params));
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+    vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0);
+    vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
    vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
-    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff);
    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
    vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
-    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads));
+#if ROI_MAP
+    set_roi_map(encoder->name, &cfg, &roi);
+    if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
+      die_codec(&codec, "Failed to set ROI map");
+    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0);
+#endif
+    // TODO(marpan/jianj): There is an issue with row-mt for low resolutons at
+    // high speed settings, disable its use for those cases for now.
+    if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7))
+      vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1);
+    else
+      vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0);
    if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1 : 0))
      die_codec(&codec, "Failed to set SVC");
    for (i = 0; i < cfg.ts_number_layers; ++i) {
@ -730,7 +842,7 @@ int main(int argc, char **argv) {
  // For generating smaller key frames, use a smaller max_intra_size_pct
  // value, like 100 or 200.
  {
-    const int max_intra_size_pct = 900;
+    const int max_intra_size_pct = 1000;
    vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
                      max_intra_size_pct);
  }
@ -740,10 +852,8 @@ int main(int argc, char **argv) {
    struct vpx_usec_timer timer;
    vpx_codec_iter_t iter = NULL;
    const vpx_codec_cx_pkt_t *pkt;
-#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
    // Update the temporal layer_id. No spatial layers in this test.
    layer_id.spatial_layer_id = 0;
-#endif
    layer_id.temporal_layer_id =
        cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
    if (strncmp(encoder->name, "vp9", 3) == 0) {
@ -835,5 +945,8 @@ int main(int argc, char **argv) {
  for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]);

  vpx_img_free(&raw);
+#if ROI_MAP
+  free(roi.roi_map);
+#endif
  return EXIT_SUCCESS;
 }
--- a/libs.doxy_template
+++ b/libs.doxy_template
@ -943,18 +943,6 @@ GENERATE_XML           = NO

 XML_OUTPUT             = xml

-# The XML_SCHEMA tag can be used to specify an XML schema,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_SCHEMA             =
-
-# The XML_DTD tag can be used to specify an XML DTD,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_DTD                =
-
 # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
 # dump the program listings (including syntax highlighting
 # and cross-referencing information) to the XML output. Note that
--- a/libs.mk
+++ b/libs.mk
@ -88,7 +88,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
  CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
-  INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
+  INSTALL-LIBS-yes += include/vpx/svc_context.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
  CODEC_DOC_SECTIONS += vp9 vp9_encoder
@ -149,12 +149,11 @@ CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm
 endif
 CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
-ifeq ($(CONFIG_SPATIAL_SVC),yes)
-CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_spatial_svc
-endif
+CODEC_EXPORTS-$(CONFIG_VP9_ENCODER) += vpx/exports_spatial_svc
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec

 INSTALL-LIBS-yes += include/vpx/vpx_codec.h
@ -187,6 +186,13 @@ libvpx_srcs.txt:
 	@echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
 CLEAN-OBJS += libvpx_srcs.txt

+# Assembly files that are included, but don't define symbols themselves.
+# Filtered out to avoid Windows build warnings.
+ASM_INCLUDES := \
+    third_party/x86inc/x86inc.asm \
+    vpx_config.asm \
+    vpx_ports/x86_abi_support.asm \
+    vpx_dsp/x86/bitdepth_conversion_sse2.asm \

 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
@ -198,12 +204,7 @@ vpx.def: $(call enabled,CODEC_EXPORTS)
            --out=$@ $^
 CLEAN-OBJS += vpx.def

-# Assembly files that are included, but don't define symbols themselves.
-# Filtered out to avoid Visual Studio build warnings.
-ASM_INCLUDES := \
-    third_party/x86inc/x86inc.asm \
-    vpx_config.asm \
-    vpx_ports/x86_abi_support.asm \
+vpx.$(VCPROJ_SFX): VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^)

 vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
@ -217,7 +218,15 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
            --ver=$(CONFIG_VS_VERSION) \
            --src-path-bare="$(SRC_PATH_BARE)" \
            --out=$@ $(CFLAGS) \
-            $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
+            $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \
+            $(filter-out $(addprefix $(SRC_PATH_BARE)/, \
+                           vp8/%.c vp8/%.h vp9/%.c vp9/%.h vpx/% vpx_dsp/%), \
+              $(VCPROJ_SRCS)) \
            --src-path-bare="$(SRC_PATH_BARE)" \

 PROJECTS-yes += vpx.$(VCPROJ_SFX)
@ -227,12 +236,12 @@ vpx.$(VCPROJ_SFX): $(RTCD)

 endif
 else
-LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
+LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS)))
 OBJS-yes += $(LIBVPX_OBJS)
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)

-SO_VERSION_MAJOR := 4
+SO_VERSION_MAJOR := 5
 SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
@ -391,7 +400,7 @@ LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS))
 LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX)
 LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
                     $(call enabled,LIBVPX_TEST_DATA))
-libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
+libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1)

 TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX)
 TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
@ -404,8 +413,16 @@ CLEAN-OBJS += libvpx_test_srcs.txt

 $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
 	@echo "    [DOWNLOAD] $@"
-	$(qexec)trap 'rm -f $@' INT TERM &&\
-            curl -L -o $@ $(call libvpx_test_data_url,$(@F))
+	# Attempt to download the file using curl, retrying once if it fails for a
+	# partial file (18).
+	$(qexec)( \
+	  trap 'rm -f $@' INT TERM; \
+	  curl="curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \
+	  $$curl; \
+	  case "$$?" in \
+	    18) $$curl -C -;; \
+	  esac \
+	)

 testdata:: $(LIBVPX_TEST_DATA)
 	$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
--- a/rate_hist.c
+++ b/rate_hist.c
@ -37,7 +37,13 @@ struct rate_hist {
 struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg,
                                      const vpx_rational_t *fps) {
  int i;
-  struct rate_hist *hist = malloc(sizeof(*hist));
+  struct rate_hist *hist = calloc(1, sizeof(*hist));
+
+  if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 ||
+      fps->den == 0) {
+    destroy_rate_histogram(hist);
+    return NULL;
+  }

  // Determine the number of samples in the buffer. Use the file's framerate
  // to determine the number of frames in rc_buf_sz milliseconds, with an
@ -80,7 +86,11 @@ void update_rate_histogram(struct rate_hist *hist,
                      (uint64_t)cfg->g_timebase.num /
                      (uint64_t)cfg->g_timebase.den;

-  int idx = hist->frames++ % hist->samples;
+  int idx;
+
+  if (hist == NULL || cfg == NULL || pkt == NULL) return;
+
+  idx = hist->frames++ % hist->samples;
  hist->pts[idx] = now;
  hist->sz[idx] = (int)pkt->data.frame.sz;

@ -116,9 +126,14 @@ void update_rate_histogram(struct rate_hist *hist,
 static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets,
                              int *num_buckets) {
  int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0;
-  int buckets = *num_buckets;
+  int buckets;
  int i;

+  assert(bucket != NULL);
+  assert(num_buckets != NULL);
+
+  buckets = *num_buckets;
+
  /* Find the extrema for this list of buckets */
  big_bucket = small_bucket = 0;
  for (i = 0; i < buckets; i++) {
@ -181,6 +196,8 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets,
  const char *pat1, *pat2;
  int i;

+  assert(bucket != NULL);
+
  switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) {
    case 1:
    case 2:
@ -259,6 +276,8 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg,
  int i, scale;
  int buckets = 0;

+  if (hist == NULL || cfg == NULL) return;
+
  for (i = 0; i < RATE_BINS; i++) {
    if (hist->bucket[i].low == INT_MAX) continue;
    hist->bucket[buckets++] = hist->bucket[i];
--- a/test/acm_random.h
+++ b/test/acm_random.h
@ -11,6 +11,10 @@
 #ifndef TEST_ACM_RANDOM_H_
 #define TEST_ACM_RANDOM_H_

+#include <assert.h>
+
+#include <limits>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"

 #include "vpx/vpx_integer.h"
@ -50,6 +54,13 @@ class ACMRandom {
    return r < 128 ? r << 4 : r >> 4;
  }

+  uint32_t RandRange(const uint32_t range) {
+    // testing::internal::Random::Generate provides values in the range
+    // testing::internal::Random::kMaxRange.
+    assert(range <= testing::internal::Random::kMaxRange);
+    return random_.Generate(range);
+  }
+
  int PseudoUniform(int range) { return random_.Generate(range); }

  int operator()(int n) { return PseudoUniform(n); }
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@ -32,6 +32,7 @@ LOCAL_CPP_EXTENSION := .cc
 LOCAL_MODULE := gtest
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
+LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/
 LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
 include $(BUILD_STATIC_LIBRARY)

--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@ -14,6 +14,7 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"

@ -22,6 +23,7 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"

 using libvpx_test::ACMRandom;

@ -89,7 +91,7 @@ class AverageTestBase : public ::testing::Test {
 };
 typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch);

-typedef std::tr1::tuple<int, int, int, int, AverageFunction> AvgFunc;
+typedef ::testing::tuple<int, int, int, int, AverageFunction> AvgFunc;

 class AverageTest : public AverageTestBase,
                    public ::testing::WithParamInterface<AvgFunc> {
@ -120,7 +122,7 @@ class AverageTest : public AverageTestBase,
 typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
                              const int ref_stride, const int height);

-typedef std::tr1::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+typedef ::testing::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;

 class IntProRowTest : public AverageTestBase,
                      public ::testing::WithParamInterface<IntProRowParam> {
@ -162,7 +164,7 @@ class IntProRowTest : public AverageTestBase,

 typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);

-typedef std::tr1::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+typedef ::testing::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;

 class IntProColTest : public AverageTestBase,
                      public ::testing::WithParamInterface<IntProColParam> {
@ -186,8 +188,8 @@ class IntProColTest : public AverageTestBase,
  int16_t sum_c_;
 };

-typedef int (*SatdFunc)(const int16_t *coeffs, int length);
-typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
+typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
+typedef ::testing::tuple<int, SatdFunc> SatdTestParam;

 class SatdTest : public ::testing::Test,
                 public ::testing::WithParamInterface<SatdTestParam> {
@ -196,7 +198,7 @@ class SatdTest : public ::testing::Test,
    satd_size_ = GET_PARAM(0);
    satd_func_ = GET_PARAM(1);
    rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<int16_t *>(
+    src_ = reinterpret_cast<tran_low_t *>(
        vpx_memalign(16, sizeof(*src_) * satd_size_));
    ASSERT_TRUE(src_ != NULL);
  }
@ -206,12 +208,15 @@ class SatdTest : public ::testing::Test,
    vpx_free(src_);
  }

-  void FillConstant(const int16_t val) {
+  void FillConstant(const tran_low_t val) {
    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
  }

  void FillRandom() {
-    for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
+    for (int i = 0; i < satd_size_; ++i) {
+      const int16_t tmp = rnd_.Rand16();
+      src_[i] = (tran_low_t)tmp;
+    }
  }

  void Check(const int expected) {
@ -223,11 +228,66 @@ class SatdTest : public ::testing::Test,
  int satd_size_;

 private:
-  int16_t *src_;
+  tran_low_t *src_;
  SatdFunc satd_func_;
  ACMRandom rnd_;
 };

+typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff, int block_size);
+typedef ::testing::tuple<int, BlockErrorFunc> BlockErrorTestFPParam;
+
+class BlockErrorTestFP
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<BlockErrorTestFPParam> {
+ protected:
+  virtual void SetUp() {
+    txfm_size_ = GET_PARAM(0);
+    block_error_func_ = GET_PARAM(1);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    coeff_ = reinterpret_cast<tran_low_t *>(
+        vpx_memalign(16, sizeof(*coeff_) * txfm_size_));
+    dqcoeff_ = reinterpret_cast<tran_low_t *>(
+        vpx_memalign(16, sizeof(*dqcoeff_) * txfm_size_));
+    ASSERT_TRUE(coeff_ != NULL);
+    ASSERT_TRUE(dqcoeff_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+    vpx_free(coeff_);
+    vpx_free(dqcoeff_);
+  }
+
+  void FillConstant(const tran_low_t coeff_val, const tran_low_t dqcoeff_val) {
+    for (int i = 0; i < txfm_size_; ++i) coeff_[i] = coeff_val;
+    for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = dqcoeff_val;
+  }
+
+  void FillRandom() {
+    // Just two fixed seeds
+    rnd_.Reset(0xb0b9);
+    for (int i = 0; i < txfm_size_; ++i) coeff_[i] = rnd_.Rand16() >> 1;
+    rnd_.Reset(0xb0c8);
+    for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = rnd_.Rand16() >> 1;
+  }
+
+  void Check(const int64_t expected) {
+    int64_t total;
+    ASM_REGISTER_STATE_CHECK(
+        total = block_error_func_(coeff_, dqcoeff_, txfm_size_));
+    EXPECT_EQ(expected, total);
+  }
+
+  int txfm_size_;
+
+ private:
+  tran_low_t *coeff_;
+  tran_low_t *dqcoeff_;
+  BlockErrorFunc block_error_func_;
+  ACMRandom rnd_;
+};
+
 uint8_t *AverageTestBase::source_data_ = NULL;

 TEST_P(AverageTest, MinValue) {
@ -308,7 +368,67 @@ TEST_P(SatdTest, Random) {
  Check(expected);
 }

-using std::tr1::make_tuple;
+TEST_P(SatdTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 20000;
+  vpx_usec_timer timer;
+  DECLARE_ALIGNED(16, tran_low_t, coeff[1024]);
+  const int blocksize = GET_PARAM(0);
+
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    GET_PARAM(1)(coeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
+TEST_P(BlockErrorTestFP, MinValue) {
+  const int64_t kMin = -32640;
+  const int64_t expected = kMin * kMin * txfm_size_;
+  FillConstant(kMin, 0);
+  Check(expected);
+}
+
+TEST_P(BlockErrorTestFP, MaxValue) {
+  const int64_t kMax = 32640;
+  const int64_t expected = kMax * kMax * txfm_size_;
+  FillConstant(kMax, 0);
+  Check(expected);
+}
+
+TEST_P(BlockErrorTestFP, Random) {
+  int64_t expected;
+  switch (txfm_size_) {
+    case 16: expected = 2051681432; break;
+    case 64: expected = 11075114379; break;
+    case 256: expected = 44386271116; break;
+    case 1024: expected = 184774996089; break;
+    default:
+      FAIL() << "Invalid satd size (" << txfm_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
+TEST_P(BlockErrorTestFP, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 20000;
+  vpx_usec_timer timer;
+  DECLARE_ALIGNED(16, tran_low_t, coeff[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]);
+  const int blocksize = GET_PARAM(0);
+
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    GET_PARAM(1)(coeff, dqcoeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
+using ::testing::make_tuple;

 INSTANTIATE_TEST_CASE_P(
    C, AverageTest,
@ -321,6 +441,13 @@ INSTANTIATE_TEST_CASE_P(C, SatdTest,
                                          make_tuple(256, &vpx_satd_c),
                                          make_tuple(1024, &vpx_satd_c)));

+INSTANTIATE_TEST_CASE_P(
+    C, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_c),
+                      make_tuple(64, &vp9_block_error_fp_c),
+                      make_tuple(256, &vp9_block_error_fp_c),
+                      make_tuple(1024, &vp9_block_error_fp_c)));
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
    SSE2, AverageTest,
@ -350,6 +477,28 @@ INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
                                          make_tuple(64, &vpx_satd_sse2),
                                          make_tuple(256, &vpx_satd_sse2),
                                          make_tuple(1024, &vpx_satd_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2),
+                      make_tuple(64, &vp9_block_error_fp_sse2),
+                      make_tuple(256, &vp9_block_error_fp_sse2),
+                      make_tuple(1024, &vp9_block_error_fp_sse2)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, SatdTest,
+                        ::testing::Values(make_tuple(16, &vpx_satd_avx2),
+                                          make_tuple(64, &vpx_satd_avx2),
+                                          make_tuple(256, &vpx_satd_avx2),
+                                          make_tuple(1024, &vpx_satd_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),
+                      make_tuple(64, &vp9_block_error_fp_avx2),
+                      make_tuple(256, &vp9_block_error_fp_avx2),
+                      make_tuple(1024, &vp9_block_error_fp_avx2)));
 #endif

 #if HAVE_NEON
@ -381,7 +530,18 @@ INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
                                          make_tuple(64, &vpx_satd_neon),
                                          make_tuple(256, &vpx_satd_neon),
                                          make_tuple(1024, &vpx_satd_neon)));
-#endif
+
+// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are
+// in place.
+#if !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    NEON, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
+                      make_tuple(64, &vp9_block_error_fp_neon),
+                      make_tuple(256, &vp9_block_error_fp_neon),
+                      make_tuple(1024, &vp9_block_error_fp_neon)));
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON

 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
@ -392,6 +552,30 @@ INSTANTIATE_TEST_CASE_P(
                      make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa),
                      make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
                      make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
-#endif
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, IntProRowTest,
+    ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
+                      make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
+                      make_tuple(64, &vpx_int_pro_row_msa,
+                                 &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, IntProColTest,
+    ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
+                      make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
+                      make_tuple(64, &vpx_int_pro_col_msa,
+                                 &vpx_int_pro_col_c)));
+
+// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are
+// in place.
+#if !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
+                        ::testing::Values(make_tuple(16, &vpx_satd_msa),
+                                          make_tuple(64, &vpx_satd_msa),
+                                          make_tuple(256, &vpx_satd_msa),
+                                          make_tuple(1024, &vpx_satd_msa)));
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_MSA

 }  // namespace
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@ -141,7 +141,7 @@ class BlockinessTestBase : public ::testing::Test {
 };

 #if CONFIG_VP9_ENCODER
-typedef std::tr1::tuple<int, int> BlockinessParam;
+typedef ::testing::tuple<int, int> BlockinessParam;
 class BlockinessVP9Test
    : public BlockinessTestBase,
      public ::testing::WithParamInterface<BlockinessParam> {
@ -208,14 +208,14 @@ TEST_P(BlockinessVP9Test, WorstCaseBlockiness) {
 }
 #endif  // CONFIG_VP9_ENCODER

-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 //------------------------------------------------------------------------------
 // C functions

 #if CONFIG_VP9_ENCODER
 const BlockinessParam c_vp9_tests[] = {
-  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
+  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238)
 };
 INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests));
 #endif
--- a/test/buffer.h
+++ b/test/buffer.h
@ -0,0 +1,382 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_BUFFER_H_
+#define TEST_BUFFER_H_
+
+#include <stdio.h>
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace libvpx_test {
+
+template <typename T>
+class Buffer {
+ public:
+  Buffer(int width, int height, int top_padding, int left_padding,
+         int right_padding, int bottom_padding)
+      : width_(width), height_(height), top_padding_(top_padding),
+        left_padding_(left_padding), right_padding_(right_padding),
+        bottom_padding_(bottom_padding), alignment_(0), padding_value_(0),
+        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+
+  Buffer(int width, int height, int top_padding, int left_padding,
+         int right_padding, int bottom_padding, unsigned int alignment)
+      : width_(width), height_(height), top_padding_(top_padding),
+        left_padding_(left_padding), right_padding_(right_padding),
+        bottom_padding_(bottom_padding), alignment_(alignment),
+        padding_value_(0), stride_(0), raw_size_(0), num_elements_(0),
+        raw_buffer_(NULL) {}
+
+  Buffer(int width, int height, int padding)
+      : width_(width), height_(height), top_padding_(padding),
+        left_padding_(padding), right_padding_(padding),
+        bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0),
+        raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+
+  Buffer(int width, int height, int padding, unsigned int alignment)
+      : width_(width), height_(height), top_padding_(padding),
+        left_padding_(padding), right_padding_(padding),
+        bottom_padding_(padding), alignment_(alignment), padding_value_(0),
+        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+
+  ~Buffer() {
+    if (alignment_) {
+      vpx_free(raw_buffer_);
+    } else {
+      delete[] raw_buffer_;
+    }
+  }
+
+  T *TopLeftPixel() const;
+
+  int stride() const { return stride_; }
+
+  // Set the buffer (excluding padding) to 'value'.
+  void Set(const T value);
+
+  // Set the buffer (excluding padding) to the output of ACMRandom function
+  // 'rand_func'.
+  void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)());
+
+  // Set the buffer (excluding padding) to the output of ACMRandom function
+  // 'RandRange' with range 'low' to 'high' which typically must be within
+  // testing::internal::Random::kMaxRange (1u << 31). However, because we want
+  // to allow negative low (and high) values, it is restricted to INT32_MAX
+  // here.
+  void Set(ACMRandom *rand_class, const T low, const T high);
+
+  // Copy the contents of Buffer 'a' (excluding padding).
+  void CopyFrom(const Buffer<T> &a);
+
+  void DumpBuffer() const;
+
+  // Highlight the differences between two buffers if they are the same size.
+  void PrintDifference(const Buffer<T> &a) const;
+
+  bool HasPadding() const;
+
+  // Sets all the values in the buffer to 'padding_value'.
+  void SetPadding(const T padding_value);
+
+  // Checks if all the values (excluding padding) are equal to 'value' if the
+  // Buffers are the same size.
+  bool CheckValues(const T value) const;
+
+  // Check that padding matches the expected value or there is no padding.
+  bool CheckPadding() const;
+
+  // Compare the non-padding portion of two buffers if they are the same size.
+  bool CheckValues(const Buffer<T> &a) const;
+
+  bool Init() {
+    if (raw_buffer_ != NULL) return false;
+    EXPECT_GT(width_, 0);
+    EXPECT_GT(height_, 0);
+    EXPECT_GE(top_padding_, 0);
+    EXPECT_GE(left_padding_, 0);
+    EXPECT_GE(right_padding_, 0);
+    EXPECT_GE(bottom_padding_, 0);
+    stride_ = left_padding_ + width_ + right_padding_;
+    num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_);
+    raw_size_ = num_elements_ * sizeof(T);
+    if (alignment_) {
+      EXPECT_GE(alignment_, sizeof(T));
+      // Ensure alignment of the first value will be preserved.
+      EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u);
+      // Ensure alignment of the subsequent rows will be preserved when there is
+      // a stride.
+      if (stride_ != width_) {
+        EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u);
+      }
+      raw_buffer_ = reinterpret_cast<T *>(vpx_memalign(alignment_, raw_size_));
+    } else {
+      raw_buffer_ = new (std::nothrow) T[num_elements_];
+    }
+    EXPECT_TRUE(raw_buffer_ != NULL);
+    SetPadding(std::numeric_limits<T>::max());
+    return !::testing::Test::HasFailure();
+  }
+
+ private:
+  bool BufferSizesMatch(const Buffer<T> &a) const;
+
+  const int width_;
+  const int height_;
+  const int top_padding_;
+  const int left_padding_;
+  const int right_padding_;
+  const int bottom_padding_;
+  const unsigned int alignment_;
+  T padding_value_;
+  int stride_;
+  int raw_size_;
+  int num_elements_;
+  T *raw_buffer_;
+};
+
+template <typename T>
+T *Buffer<T>::TopLeftPixel() const {
+  if (!raw_buffer_) return NULL;
+  return raw_buffer_ + (top_padding_ * stride_) + left_padding_;
+}
+
+template <typename T>
+void Buffer<T>::Set(const T value) {
+  if (!raw_buffer_) return;
+  T *src = TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      src[width] = value;
+    }
+    src += stride_;
+  }
+}
+
+template <typename T>
+void Buffer<T>::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) {
+  if (!raw_buffer_) return;
+  T *src = TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      src[width] = (*rand_class.*rand_func)();
+    }
+    src += stride_;
+  }
+}
+
+template <typename T>
+void Buffer<T>::Set(ACMRandom *rand_class, const T low, const T high) {
+  if (!raw_buffer_) return;
+
+  EXPECT_LE(low, high);
+  EXPECT_LE(static_cast<int64_t>(high) - low,
+            std::numeric_limits<int32_t>::max());
+
+  T *src = TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      // 'low' will be promoted to unsigned given the return type of RandRange.
+      // Store the value as an int to avoid unsigned overflow warnings when
+      // 'low' is negative.
+      const int32_t value =
+          static_cast<int32_t>((*rand_class).RandRange(high - low));
+      src[width] = static_cast<T>(value + low);
+    }
+    src += stride_;
+  }
+}
+
+template <typename T>
+void Buffer<T>::CopyFrom(const Buffer<T> &a) {
+  if (!raw_buffer_) return;
+  if (!BufferSizesMatch(a)) return;
+
+  T *a_src = a.TopLeftPixel();
+  T *b_src = this->TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      b_src[width] = a_src[width];
+    }
+    a_src += a.stride();
+    b_src += this->stride();
+  }
+}
+
+template <typename T>
+void Buffer<T>::DumpBuffer() const {
+  if (!raw_buffer_) return;
+  for (int height = 0; height < height_ + top_padding_ + bottom_padding_;
+       ++height) {
+    for (int width = 0; width < stride_; ++width) {
+      printf("%4d", raw_buffer_[height + width * stride_]);
+    }
+    printf("\n");
+  }
+}
+
+template <typename T>
+bool Buffer<T>::HasPadding() const {
+  if (!raw_buffer_) return false;
+  return top_padding_ || left_padding_ || right_padding_ || bottom_padding_;
+}
+
+template <typename T>
+void Buffer<T>::PrintDifference(const Buffer<T> &a) const {
+  if (!raw_buffer_) return;
+  if (!BufferSizesMatch(a)) return;
+
+  T *a_src = a.TopLeftPixel();
+  T *b_src = TopLeftPixel();
+
+  printf("This buffer:\n");
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      if (a_src[width] != b_src[width]) {
+        printf("*%3d", b_src[width]);
+      } else {
+        printf("%4d", b_src[width]);
+      }
+    }
+    printf("\n");
+    a_src += a.stride();
+    b_src += this->stride();
+  }
+
+  a_src = a.TopLeftPixel();
+  b_src = TopLeftPixel();
+
+  printf("Reference buffer:\n");
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      if (a_src[width] != b_src[width]) {
+        printf("*%3d", a_src[width]);
+      } else {
+        printf("%4d", a_src[width]);
+      }
+    }
+    printf("\n");
+    a_src += a.stride();
+    b_src += this->stride();
+  }
+}
+
+template <typename T>
+void Buffer<T>::SetPadding(const T padding_value) {
+  if (!raw_buffer_) return;
+  padding_value_ = padding_value;
+
+  T *src = raw_buffer_;
+  for (int i = 0; i < num_elements_; ++i) {
+    src[i] = padding_value;
+  }
+}
+
+template <typename T>
+bool Buffer<T>::CheckValues(const T value) const {
+  if (!raw_buffer_) return false;
+  T *src = TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      if (value != src[width]) {
+        return false;
+      }
+    }
+    src += stride_;
+  }
+  return true;
+}
+
+template <typename T>
+bool Buffer<T>::CheckPadding() const {
+  if (!raw_buffer_) return false;
+  if (!HasPadding()) return true;
+
+  // Top padding.
+  T const *top = raw_buffer_;
+  for (int i = 0; i < stride_ * top_padding_; ++i) {
+    if (padding_value_ != top[i]) {
+      return false;
+    }
+  }
+
+  // Left padding.
+  T const *left = TopLeftPixel() - left_padding_;
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < left_padding_; ++width) {
+      if (padding_value_ != left[width]) {
+        return false;
+      }
+    }
+    left += stride_;
+  }
+
+  // Right padding.
+  T const *right = TopLeftPixel() + width_;
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < right_padding_; ++width) {
+      if (padding_value_ != right[width]) {
+        return false;
+      }
+    }
+    right += stride_;
+  }
+
+  // Bottom padding
+  T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_;
+  for (int i = 0; i < stride_ * bottom_padding_; ++i) {
+    if (padding_value_ != bottom[i]) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+template <typename T>
+bool Buffer<T>::CheckValues(const Buffer<T> &a) const {
+  if (!raw_buffer_) return false;
+  if (!BufferSizesMatch(a)) return false;
+
+  T *a_src = a.TopLeftPixel();
+  T *b_src = this->TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      if (a_src[width] != b_src[width]) {
+        return false;
+      }
+    }
+    a_src += a.stride();
+    b_src += this->stride();
+  }
+  return true;
+}
+
+template <typename T>
+bool Buffer<T>::BufferSizesMatch(const Buffer<T> &a) const {
+  if (!raw_buffer_) return false;
+  if (a.width_ != this->width_ || a.height_ != this->height_) {
+    printf(
+        "Reference buffer of size %dx%d does not match this buffer which is "
+        "size %dx%d\n",
+        a.width_, a.height_, this->width_, this->height_);
+    return false;
+  }
+
+  return true;
+}
+}  // namespace libvpx_test
+#endif  // TEST_BUFFER_H_
--- a/test/byte_alignment_test.cc
+++ b/test/byte_alignment_test.cc
@ -128,8 +128,8 @@ class ByteAlignmentTest
  // TODO(fgalligan): Move the MD5 testing code into another class.
  void OpenMd5File(const std::string &md5_file_name_) {
    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
-                                   << md5_file_name_;
+    ASSERT_TRUE(md5_file_ != NULL)
+        << "MD5 file open failed. Filename: " << md5_file_name_;
  }

  void CheckMd5(const vpx_image_t &img) {
@ -171,8 +171,9 @@ TEST_F(ByteAlignmentTest, SwitchByteAlignment) {
 TEST_P(ByteAlignmentTest, TestAlignment) {
  const ByteAlignmentTestParam t = GetParam();
  SetByteAlignment(t.byte_alignment, t.expected_value);
-  if (t.decode_remaining)
+  if (t.decode_remaining) {
    ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(t.byte_alignment));
+  }
 }

 INSTANTIATE_TEST_CASE_P(Alignments, ByteAlignmentTest,
--- a/test/clear_system_state.h
+++ b/test/clear_system_state.h
@ -11,19 +11,13 @@
 #define TEST_CLEAR_SYSTEM_STATE_H_

 #include "./vpx_config.h"
-#if ARCH_X86 || ARCH_X86_64
-#include "vpx_ports/x86.h"
-#endif
+#include "vpx_ports/system_state.h"

 namespace libvpx_test {

 // Reset system to a known state. This function should be used for all non-API
 // test cases.
-inline void ClearSystemState() {
-#if ARCH_X86 || ARCH_X86_64
-  vpx_reset_mmx_state();
-#endif
-}
+inline void ClearSystemState() { vpx_clear_system_state(); }

 }  // namespace libvpx_test
 #endif  // TEST_CLEAR_SYSTEM_STATE_H_
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@ -53,23 +53,22 @@ class CodecFactory {
 template <class T1>
 class CodecTestWithParam
    : public ::testing::TestWithParam<
-          std::tr1::tuple<const libvpx_test::CodecFactory *, T1> > {};
+          ::testing::tuple<const libvpx_test::CodecFactory *, T1> > {};

 template <class T1, class T2>
 class CodecTestWith2Params
    : public ::testing::TestWithParam<
-          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2> > {};
+          ::testing::tuple<const libvpx_test::CodecFactory *, T1, T2> > {};

 template <class T1, class T2, class T3>
 class CodecTestWith3Params
    : public ::testing::TestWithParam<
-          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3> > {};
+          ::testing::tuple<const libvpx_test::CodecFactory *, T1, T2, T3> > {};

 template <class T1, class T2, class T3, class T4>
 class CodecTestWith4Params
-    : public ::testing::TestWithParam<
-          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3, T4> > {
-};
+    : public ::testing::TestWithParam< ::testing::tuple<
+          const libvpx_test::CodecFactory *, T1, T2, T3, T4> > {};

 /*
 * VP8 Codec Definitions
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@ -0,0 +1,182 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h,
+                            const uint8_t *c, int c_stride);
+
+uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; }
+
+void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
+                    int width, int height, Buffer<uint8_t> *avg) {
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      avg->TopLeftPixel()[y * avg->stride() + x] =
+          avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x],
+                            ref.TopLeftPixel()[y * ref.stride() + x]);
+    }
+  }
+}
+
+class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
+ public:
+  virtual void SetUp() {
+    avg_pred_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  AvgPredFunc avg_pred_func_;
+  ACMRandom rnd_;
+};
+
+TEST_P(AvgPredTest, SizeCombinations) {
+  // This is called as part of the sub pixel variance. As such it must be one of
+  // the variance block sizes.
+
+  for (int width_pow = 2; width_pow <= 6; ++width_pow) {
+    for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
+         ++height_pow) {
+      // Don't test 4x2 or 64x128
+      if (height_pow == 1 || height_pow == 7) continue;
+
+      // The sse2 special-cases when ref width == stride, so make sure to test
+      // it.
+      for (int ref_padding = 0; ref_padding < 2; ref_padding++) {
+        const int width = 1 << width_pow;
+        const int height = 1 << height_pow;
+        // Only the reference buffer may have a stride not equal to width.
+        Buffer<uint8_t> ref =
+            Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+        ASSERT_TRUE(ref.Init());
+        Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+        ASSERT_TRUE(pred.Init());
+        Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+        ASSERT_TRUE(avg_ref.Init());
+        Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+        ASSERT_TRUE(avg_chk.Init());
+
+        ref.Set(&rnd_, &ACMRandom::Rand8);
+        pred.Set(&rnd_, &ACMRandom::Rand8);
+
+        reference_pred(pred, ref, width, height, &avg_ref);
+        ASM_REGISTER_STATE_CHECK(
+            avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width,
+                           height, ref.TopLeftPixel(), ref.stride()));
+
+        EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
+        if (HasFailure()) {
+          printf("Width: %d Height: %d\n", width, height);
+          avg_chk.PrintDifference(avg_ref);
+          return;
+        }
+      }
+    }
+  }
+}
+
+TEST_P(AvgPredTest, CompareReferenceRandom) {
+  const int width = 64;
+  const int height = 32;
+  Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8);
+  ASSERT_TRUE(ref.Init());
+  Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+  ASSERT_TRUE(pred.Init());
+  Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+  ASSERT_TRUE(avg_ref.Init());
+  Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+  ASSERT_TRUE(avg_chk.Init());
+
+  for (int i = 0; i < 500; ++i) {
+    ref.Set(&rnd_, &ACMRandom::Rand8);
+    pred.Set(&rnd_, &ACMRandom::Rand8);
+
+    reference_pred(pred, ref, width, height, &avg_ref);
+    ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(),
+                                            pred.TopLeftPixel(), width, height,
+                                            ref.TopLeftPixel(), ref.stride()));
+    EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
+    if (HasFailure()) {
+      printf("Width: %d Height: %d\n", width, height);
+      avg_chk.PrintDifference(avg_ref);
+      return;
+    }
+  }
+}
+
+TEST_P(AvgPredTest, DISABLED_Speed) {
+  for (int width_pow = 2; width_pow <= 6; ++width_pow) {
+    for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
+         ++height_pow) {
+      // Don't test 4x2 or 64x128
+      if (height_pow == 1 || height_pow == 7) continue;
+
+      for (int ref_padding = 0; ref_padding < 2; ref_padding++) {
+        const int width = 1 << width_pow;
+        const int height = 1 << height_pow;
+        Buffer<uint8_t> ref =
+            Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+        ASSERT_TRUE(ref.Init());
+        Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+        ASSERT_TRUE(pred.Init());
+        Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16);
+        ASSERT_TRUE(avg.Init());
+
+        ref.Set(&rnd_, &ACMRandom::Rand8);
+        pred.Set(&rnd_, &ACMRandom::Rand8);
+
+        vpx_usec_timer timer;
+        vpx_usec_timer_start(&timer);
+        for (int i = 0; i < 10000000 / (width * height); ++i) {
+          avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height,
+                         ref.TopLeftPixel(), ref.stride());
+        }
+        vpx_usec_timer_mark(&timer);
+
+        const int elapsed_time =
+            static_cast<int>(vpx_usec_timer_elapsed(&timer));
+        printf("Average Test (ref_padding: %d) %dx%d time: %5d us\n",
+               ref_padding, width, height, elapsed_time);
+      }
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, AvgPredTest,
+                        ::testing::Values(&vpx_comp_avg_pred_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest,
+                        ::testing::Values(&vpx_comp_avg_pred_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest,
+                        ::testing::Values(&vpx_comp_avg_pred_neon));
+#endif  // HAVE_NEON
+
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest,
+                        ::testing::Values(&vpx_comp_avg_pred_vsx));
+#endif  // HAVE_VSX
+}  // namespace
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@ -127,7 +127,7 @@ class ConsistencyTestBase : public ::testing::Test {
 };

 #if CONFIG_VP9_ENCODER
-typedef std::tr1::tuple<int, int> ConsistencyParam;
+typedef ::testing::tuple<int, int> ConsistencyParam;
 class ConsistencyVP9Test
    : public ConsistencyTestBase,
      public ::testing::WithParamInterface<ConsistencyParam> {
@ -198,14 +198,14 @@ TEST_P(ConsistencyVP9Test, ConsistencyIsZero) {
 }
 #endif  // CONFIG_VP9_ENCODER

-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 //------------------------------------------------------------------------------
 // C functions

 #if CONFIG_VP9_ENCODER
 const ConsistencyParam c_vp9_tests[] = {
-  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
+  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238)
 };
 INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
                        ::testing::ValuesIn(c_vp9_tests));
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@ -25,6 +25,7 @@
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"

 namespace {

@ -32,9 +33,9 @@ static const unsigned int kMaxDimension = 64;

 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h);
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h);

 typedef void (*WrapperFilterBlock2d8Func)(
    const uint8_t *src_ptr, const unsigned int src_stride,
@ -76,7 +77,7 @@ struct ConvolveFunctions {
  int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
 };

-typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+typedef ::testing::tuple<int, int, const ConvolveFunctions *> ConvolveParam;

 #define ALL_SIZES(convolve_fn)                                            \
  make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn),         \
@ -300,9 +301,9 @@ void wrapper_filter_average_block2d_8_c(
    filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr,
                               dst_stride, output_width, output_height);
  } else {
-    highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+    highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride,
                                      hfilter, vfilter,
-                                      CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+                                      CAST_TO_SHORTPTR(dst_ptr), dst_stride,
                                      output_width, output_height, use_highbd);
  }
 #else
@ -323,8 +324,8 @@ void wrapper_filter_block2d_8_c(const uint8_t *src_ptr,
    filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr,
                       dst_stride, output_width, output_height);
  } else {
-    highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, hfilter,
-                              vfilter, CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+    highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter,
+                              vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride,
                              output_width, output_height, use_highbd);
  }
 #else
@ -449,7 +450,9 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {

  void CheckGuardBlocks() {
    for (int i = 0; i < kOutputBufferSize; ++i) {
-      if (IsIndexInBorder(i)) EXPECT_EQ(255, output_[i]);
+      if (IsIndexInBorder(i)) {
+        EXPECT_EQ(255, output_[i]);
+      }
    }
  }

@ -459,7 +462,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
    if (UUT_->use_highbd_ == 0) {
      return input_ + offset;
    } else {
-      return CONVERT_TO_BYTEPTR(input16_) + offset;
+      return CAST_TO_BYTEPTR(input16_ + offset);
    }
 #else
    return input_ + offset;
@ -472,7 +475,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
    if (UUT_->use_highbd_ == 0) {
      return output_ + offset;
    } else {
-      return CONVERT_TO_BYTEPTR(output16_) + offset;
+      return CAST_TO_BYTEPTR(output16_ + offset);
    }
 #else
    return output_ + offset;
@ -485,7 +488,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
    if (UUT_->use_highbd_ == 0) {
      return output_ref_ + offset;
    } else {
-      return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
+      return CAST_TO_BYTEPTR(output16_ref_ + offset);
    }
 #else
    return output_ref_ + offset;
@ -497,7 +500,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
    if (UUT_->use_highbd_ == 0) {
      return list[index];
    } else {
-      return CONVERT_TO_SHORTPTR(list)[index];
+      return CAST_TO_SHORTPTR(list)[index];
    }
 #else
    return list[index];
@ -509,7 +512,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
    if (UUT_->use_highbd_ == 0) {
      list[index] = (uint8_t)val;
    } else {
-      CONVERT_TO_SHORTPTR(list)[index] = val;
+      CAST_TO_SHORTPTR(list)[index] = val;
    }
 #else
    list[index] = (uint8_t)val;
@ -539,12 +542,167 @@ uint16_t *ConvolveTest::output16_ref_ = NULL;

 TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }

+TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_avg_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_scale_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                  width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_vert_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                  width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_avg_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
 TEST_P(ConvolveTest, Copy) {
  uint8_t *const in = input();
  uint8_t *const out = output();

  ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride,
-                                          NULL, 0, NULL, 0, Width(), Height()));
+                                          NULL, 0, 0, 0, 0, Width(), Height()));

  CheckGuardBlocks();

@ -563,7 +721,7 @@ TEST_P(ConvolveTest, Avg) {
  CopyOutputToRef();

  ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride,
-                                          NULL, 0, NULL, 0, Width(), Height()));
+                                          NULL, 0, 0, 0, 0, Width(), Height()));

  CheckGuardBlocks();

@ -580,12 +738,10 @@ TEST_P(ConvolveTest, Avg) {
 TEST_P(ConvolveTest, CopyHoriz) {
  uint8_t *const in = input();
  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

  ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride,
-                                         filter8, 16, filter8, 16, Width(),
-                                         Height()));
+                                         vp9_filter_kernels[0], 0, 16, 0, 16,
+                                         Width(), Height()));

  CheckGuardBlocks();

@ -600,12 +756,10 @@ TEST_P(ConvolveTest, CopyHoriz) {
 TEST_P(ConvolveTest, CopyVert) {
  uint8_t *const in = input();
  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

  ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride,
-                                         filter8, 16, filter8, 16, Width(),
-                                         Height()));
+                                         vp9_filter_kernels[0], 0, 16, 0, 16,
+                                         Width(), Height()));

  CheckGuardBlocks();

@ -620,12 +774,10 @@ TEST_P(ConvolveTest, CopyVert) {
 TEST_P(ConvolveTest, Copy2D) {
  uint8_t *const in = input();
  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

  ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride,
-                                          filter8, 16, filter8, 16, Width(),
-                                          Height()));
+                                          vp9_filter_kernels[0], 0, 16, 0, 16,
+                                          Width(), Height()));

  CheckGuardBlocks();

@ -661,7 +813,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
  }
 }

-const int16_t kInvalidFilter[8] = { 0 };
 const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = {
  wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c
 };
@ -677,7 +828,7 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
    if (UUT_->use_highbd_ == 0) {
      ref = ref8;
    } else {
-      ref = CONVERT_TO_BYTEPTR(ref16);
+      ref = CAST_TO_BYTEPTR(ref16);
    }
 #else
    uint8_t ref[kOutputStride * kMaxDimension];
@ -714,21 +865,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
                                      Width(), Height(), UUT_->use_highbd_);

          if (filter_x && filter_y)
-            ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i](
-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                filters[filter_y], 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters,
+                              filter_x, 16, filter_y, 16, Width(), Height()));
          else if (filter_y)
-            ASM_REGISTER_STATE_CHECK(UUT_->v8_[i](
-                in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
-                filters[filter_y], 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0,
+                             16, filter_y, 16, Width(), Height()));
          else if (filter_x)
-            ASM_REGISTER_STATE_CHECK(UUT_->h8_[i](
-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                kInvalidFilter, 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters,
+                             filter_x, 16, 0, 16, Width(), Height()));
          else
-            ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](
-                in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                kInvalidFilter, 0, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out,
+                                                    kOutputStride, NULL, 0, 0,
+                                                    0, 0, Width(), Height()));

          CheckGuardBlocks();

@ -756,7 +907,7 @@ TEST_P(ConvolveTest, FilterExtremes) {
  if (UUT_->use_highbd_ == 0) {
    ref = ref8;
  } else {
-    ref = CONVERT_TO_BYTEPTR(ref16);
+    ref = CAST_TO_BYTEPTR(ref16);
  }
 #else
  uint8_t ref[kOutputStride * kMaxDimension];
@ -812,21 +963,21 @@ TEST_P(ConvolveTest, FilterExtremes) {
                                       filters[filter_y], ref, kOutputStride,
                                       Width(), Height(), UUT_->use_highbd_);
            if (filter_x && filter_y)
-              ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0](
-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                  filters[filter_y], 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters,
+                                filter_x, 16, filter_y, 16, Width(), Height()));
            else if (filter_y)
-              ASM_REGISTER_STATE_CHECK(UUT_->v8_[0](
-                  in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
-                  filters[filter_y], 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0,
+                               16, filter_y, 16, Width(), Height()));
            else if (filter_x)
-              ASM_REGISTER_STATE_CHECK(UUT_->h8_[0](
-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                  kInvalidFilter, 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters,
+                               filter_x, 16, 0, 16, Width(), Height()));
            else
-              ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](
-                  in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                  kInvalidFilter, 0, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out,
+                                                      kOutputStride, NULL, 0, 0,
+                                                      0, 0, Width(), Height()));

            for (int y = 0; y < Height(); ++y) {
              for (int x = 0; x < Width(); ++x)
@ -845,44 +996,63 @@ TEST_P(ConvolveTest, FilterExtremes) {

 /* This test exercises that enough rows and columns are filtered with every
   possible initial fractional positions and scaling steps. */
+#if !CONFIG_VP9_HIGHBITDEPTH
+static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c,
+                                                   vpx_scaled_avg_2d_c };
+
 TEST_P(ConvolveTest, CheckScalingFiltering) {
  uint8_t *const in = input();
  uint8_t *const out = output();
-  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+  uint8_t ref[kOutputStride * kMaxDimension];

-  SetConstantInput(127);
+  ::libvpx_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      const uint16_t r = prng.Rand8Extremes();
+      assign_val(in, y * kInputStride + x, r);
+    }
+  }

-  for (int frac = 0; frac < 16; ++frac) {
-    for (int step = 1; step <= 32; ++step) {
-      /* Test the horizontal and vertical filters in combination. */
-      ASM_REGISTER_STATE_CHECK(
-          UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac],
-                         step, eighttap[frac], step, Width(), Height()));
+  for (int i = 0; i < 2; ++i) {
+    for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
+      const InterpKernel *const eighttap = vp9_filter_kernels[filter_type];
+      for (int frac = 0; frac < 16; ++frac) {
+        for (int step = 1; step <= 32; ++step) {
+          /* Test the horizontal and vertical filters in combination. */
+          scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap,
+                               frac, step, frac, step, Width(), Height());
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap,
+                             frac, step, frac, step, Width(), Height()));

-      CheckGuardBlocks();
+          CheckGuardBlocks();

-      for (int y = 0; y < Height(); ++y) {
-        for (int x = 0; x < Width(); ++x) {
-          ASSERT_EQ(lookup(in, y * kInputStride + x),
-                    lookup(out, y * kOutputStride + x))
-              << "x == " << x << ", y == " << y << ", frac == " << frac
-              << ", step == " << step;
+          for (int y = 0; y < Height(); ++y) {
+            for (int x = 0; x < Width(); ++x) {
+              ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                        lookup(out, y * kOutputStride + x))
+                  << "x == " << x << ", y == " << y << ", frac == " << frac
+                  << ", step == " << step;
+            }
+          }
        }
      }
    }
  }
 }
+#endif

-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 #if CONFIG_VP9_HIGHBITDEPTH
 #define WRAP(func, bd)                                                       \
  void wrap_##func##_##bd(                                                   \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,    \
-      const int16_t *filter_y, int filter_y_stride, int w, int h) {          \
-    vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x,            \
-                      filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
+    vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride,   \
+                      reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \
+                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);         \
  }

 #if HAVE_SSE2 && ARCH_X86_64
@ -912,6 +1082,35 @@ WRAP(convolve8_sse2, 12)
 WRAP(convolve8_avg_sse2, 12)
 #endif  // HAVE_SSE2 && ARCH_X86_64

+#if HAVE_AVX2
+WRAP(convolve_copy_avx2, 8)
+WRAP(convolve_avg_avx2, 8)
+WRAP(convolve8_horiz_avx2, 8)
+WRAP(convolve8_avg_horiz_avx2, 8)
+WRAP(convolve8_vert_avx2, 8)
+WRAP(convolve8_avg_vert_avx2, 8)
+WRAP(convolve8_avx2, 8)
+WRAP(convolve8_avg_avx2, 8)
+
+WRAP(convolve_copy_avx2, 10)
+WRAP(convolve_avg_avx2, 10)
+WRAP(convolve8_avx2, 10)
+WRAP(convolve8_horiz_avx2, 10)
+WRAP(convolve8_vert_avx2, 10)
+WRAP(convolve8_avg_avx2, 10)
+WRAP(convolve8_avg_horiz_avx2, 10)
+WRAP(convolve8_avg_vert_avx2, 10)
+
+WRAP(convolve_copy_avx2, 12)
+WRAP(convolve_avg_avx2, 12)
+WRAP(convolve8_avx2, 12)
+WRAP(convolve8_horiz_avx2, 12)
+WRAP(convolve8_vert_avx2, 12)
+WRAP(convolve8_avg_avx2, 12)
+WRAP(convolve8_avg_horiz_avx2, 12)
+WRAP(convolve8_avg_vert_avx2, 12)
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
 WRAP(convolve_copy_neon, 8)
 WRAP(convolve_avg_neon, 8)
@ -1057,18 +1256,48 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve8_ssse3));
 #endif

-#if HAVE_AVX2 && HAVE_SSSE3
+#if HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_avx2(
+    wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8,
+    wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8,
+    wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8,
+    wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8,
+    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
+    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_avx2(
+    wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10,
+    wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10,
+    wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10,
+    wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10,
+    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+    wrap_convolve8_avg_c_10, 10);
+const ConvolveFunctions convolve12_avx2(
+    wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12,
+    wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12,
+    wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12,
+    wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12,
+    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+    wrap_convolve8_avg_c_12, 12);
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2),
+                                               ALL_SIZES(convolve10_avx2),
+                                               ALL_SIZES(convolve12_avx2) };
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_avx2));
+#else   // !CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_avx2(
    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
-    vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
-    vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
+    vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2,
+    vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2,
    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
-
 const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve8_avx2));
-#endif  // HAVE_AVX2 && HAVE_SSSE3
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_AVX2

 #if HAVE_NEON
 #if CONFIG_VP9_HIGHBITDEPTH
@ -1105,7 +1334,7 @@ const ConvolveFunctions convolve8_neon(
    vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon,
    vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon,
    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
-    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0);

 const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) };
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@ -1132,10 +1361,34 @@ const ConvolveFunctions convolve8_msa(
    vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa,
    vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa,
    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
-    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0);

 const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) };
 INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve8_msa));
 #endif  // HAVE_MSA
+
+#if HAVE_VSX
+const ConvolveFunctions convolve8_vsx(
+    vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx,
+    vpx_convolve8_avg_horiz_vsx, vpx_convolve8_vert_vsx,
+    vpx_convolve8_avg_vert_vsx, vpx_convolve8_vsx, vpx_convolve8_avg_vsx,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) };
+INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve_vsx));
+#endif  // HAVE_VSX
+
+#if HAVE_MMI
+const ConvolveFunctions convolve8_mmi(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_mmi,
+    vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_mmi,
+    vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+const ConvolveParam kArrayConvolve_mmi[] = { ALL_SIZES(convolve8_mmi) };
+INSTANTIATE_TEST_CASE_P(MMI, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve_mmi));
+#endif  // HAVE_MMI
 }  // namespace
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@ -229,9 +229,10 @@ typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                        int tx_type);

-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
+typedef ::testing::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t>
+    Dct16x16Param;
+typedef ::testing::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
+typedef ::testing::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
    Idct16x16Param;

 void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
@ -255,11 +256,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,

 #if CONFIG_VP9_HIGHBITDEPTH
 void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_c(in, out, stride, 10);
+  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_c(in, out, stride, 12);
+  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }

 void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
@ -273,36 +274,36 @@ void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
 }

 void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
 }

 void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
 }

 #if HAVE_SSE2
 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);
+  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);
+  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }

 void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
 }

 void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@ -353,7 +354,7 @@ class Trans16x16TestBase {
 #if CONFIG_VP9_HIGHBITDEPTH
      } else {
        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
      }

@ -475,10 +476,10 @@ class Trans16x16TestBase {
        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
 #if CONFIG_VP9_HIGHBITDEPTH
      } else {
-        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+        inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_,
                     tx_type_);
        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
      }
      if (bit_depth_ == VPX_BITS_8) {
@ -530,8 +531,7 @@ class Trans16x16TestBase {
        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
 #if CONFIG_VP9_HIGHBITDEPTH
      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16));
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
      }

@ -543,8 +543,8 @@ class Trans16x16TestBase {
        const uint32_t diff = dst[j] - src[j];
 #endif  // CONFIG_VP9_HIGHBITDEPTH
        const uint32_t error = diff * diff;
-        EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error
-                             << " at index " << j;
+        EXPECT_GE(1u, error)
+            << "Error: 16x16 IDCT has error " << error << " at index " << j;
      }
    }
  }
@ -585,9 +585,9 @@ class Trans16x16TestBase {
        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
      } else {
 #if CONFIG_VP9_HIGHBITDEPTH
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
      }

@ -745,67 +745,7 @@ TEST_P(InvTrans16x16DCT, CompareReference) {
  CompareInvReference(ref_txfm_, thresh_);
 }

-class PartialTrans16x16Test : public ::testing::TestWithParam<
-                                  std::tr1::tuple<FdctFunc, vpx_bit_depth_t> > {
- public:
-  virtual ~PartialTrans16x16Test() {}
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    bit_depth_ = GET_PARAM(1);
-  }
-
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  vpx_bit_depth_t bit_depth_;
-  FdctFunc fwd_txfm_;
-};
-
-TEST_P(PartialTrans16x16Test, Extremes) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  const int minval = -maxval;
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]);
-}
-
-TEST_P(PartialTrans16x16Test, Random) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  int sum = 0;
-  for (int i = 0; i < kNumCoeffs; ++i) {
-    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
-    input[i] = val;
-    sum += val;
-  }
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ(sum >> 1, output[0]);
-}
-
-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@ -837,11 +777,6 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
-    C, PartialTrans16x16Test,
-    ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8),
-                      make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10),
-                      make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12)));
 #else
 INSTANTIATE_TEST_CASE_P(
    C, Trans16x16HT,
@ -850,17 +785,14 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_c,
-                                                     VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH

-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
    NEON, Trans16x16DCT,
-    ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon,
-                                 0, VPX_BITS_8)));
-#endif
+    ::testing::Values(make_tuple(&vpx_fdct16x16_neon,
+                                 &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8)));
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
@ -877,9 +809,6 @@ INSTANTIATE_TEST_CASE_P(
                                 2, VPX_BITS_8),
                      make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2,
                                 3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@ -914,9 +843,6 @@ INSTANTIATE_TEST_CASE_P(
                                 &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
                      make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2,
                                 3167, VPX_BITS_12)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@ -932,8 +858,12 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
                   VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(VSX, Trans16x16DCT,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_c,
+                                                     &vpx_idct16x16_256_add_vsx,
+                                                     0, VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@ -66,16 +66,16 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
 typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
 typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);

-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
+typedef ::testing::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
    Trans32x32Param;

 #if CONFIG_VP9_HIGHBITDEPTH
 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10);
+  vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12);
+  vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH

@ -137,7 +137,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
 #if CONFIG_VP9_HIGHBITDEPTH
    } else {
      ASM_REGISTER_STATE_CHECK(
-          inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32));
+          inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32));
 #endif
    }

@ -275,7 +275,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
 #if CONFIG_VP9_HIGHBITDEPTH
    } else {
-      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32));
 #endif
    }
    for (int j = 0; j < kNumCoeffs; ++j) {
@ -292,68 +292,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
  }
 }

-class PartialTrans32x32Test
-    : public ::testing::TestWithParam<
-          std::tr1::tuple<FwdTxfmFunc, vpx_bit_depth_t> > {
- public:
-  virtual ~PartialTrans32x32Test() {}
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    bit_depth_ = GET_PARAM(1);
-  }
-
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  vpx_bit_depth_t bit_depth_;
-  FwdTxfmFunc fwd_txfm_;
-};
-
-TEST_P(PartialTrans32x32Test, Extremes) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  const int minval = -maxval;
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]);
-}
-
-TEST_P(PartialTrans32x32Test, Random) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  int sum = 0;
-  for (int i = 0; i < kNumCoeffs; ++i) {
-    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
-    input[i] = val;
-    sum += val;
-  }
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ(sum >> 3, output[0]);
-}
-
-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@ -366,11 +305,6 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
        make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1,
                   VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
-    C, PartialTrans32x32Test,
-    ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8),
-                      make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10),
-                      make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12)));
 #else
 INSTANTIATE_TEST_CASE_P(
    C, Trans32x32Test,
@ -378,19 +312,16 @@ INSTANTIATE_TEST_CASE_P(
                                 VPX_BITS_8),
                      make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c,
                                 1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test,
-                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_c,
-                                                     VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH

-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
    NEON, Trans32x32Test,
-    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon,
-                                 0, VPX_BITS_8),
-                      make_tuple(&vpx_fdct32x32_rd_c,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_neon,
+                                 &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8),
+                      make_tuple(&vpx_fdct32x32_rd_neon,
                                 &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
@ -399,9 +330,6 @@ INSTANTIATE_TEST_CASE_P(
                                 &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                      make_tuple(&vpx_fdct32x32_rd_sse2,
                                 &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
-                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@ -418,9 +346,6 @@ INSTANTIATE_TEST_CASE_P(
                   VPX_BITS_8),
        make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1,
                   VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
-                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@ -439,8 +364,14 @@ INSTANTIATE_TEST_CASE_P(
                                 &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8),
                      make_tuple(&vpx_fdct32x32_rd_msa,
                                 &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test,
-                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    VSX, Trans32x32Test,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx,
+                                 0, VPX_BITS_8),
+                      make_tuple(&vpx_fdct32x32_rd_c,
+                                 &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
--- a/test/dct_partial_test.cc
+++ b/test/dct_partial_test.cc
@ -0,0 +1,169 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
+
+namespace {
+typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+
+typedef tuple<PartialFdctFunc, int /* size */, vpx_bit_depth_t>
+    PartialFdctParam;
+
+tran_low_t partial_fdct_ref(const Buffer<int16_t> &in, int size) {
+  int64_t sum = 0;
+  for (int y = 0; y < size; ++y) {
+    for (int x = 0; x < size; ++x) {
+      sum += in.TopLeftPixel()[y * in.stride() + x];
+    }
+  }
+
+  switch (size) {
+    case 4: sum *= 2; break;
+    case 8: /*sum = sum;*/ break;
+    case 16: sum >>= 1; break;
+    case 32: sum >>= 3; break;
+  }
+
+  return static_cast<tran_low_t>(sum);
+}
+
+class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
+ public:
+  PartialFdctTest() {
+    fwd_txfm_ = GET_PARAM(0);
+    size_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int16_t maxvalue =
+        clip_pixel_highbd(std::numeric_limits<int16_t>::max(), bit_depth_);
+    const int16_t minvalue = -maxvalue;
+    Buffer<int16_t> input_block =
+        Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+    ASSERT_TRUE(input_block.Init());
+    Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(output_block.Init());
+
+    for (int i = 0; i < 100; ++i) {
+      if (i == 0) {
+        input_block.Set(maxvalue);
+      } else if (i == 1) {
+        input_block.Set(minvalue);
+      } else {
+        input_block.Set(&rnd, minvalue, maxvalue);
+      }
+
+      ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(),
+                                         output_block.TopLeftPixel(),
+                                         input_block.stride()));
+
+      EXPECT_EQ(partial_fdct_ref(input_block, size_),
+                output_block.TopLeftPixel()[0]);
+    }
+  }
+
+  PartialFdctFunc fwd_txfm_;
+  vpx_bit_depth_t bit_depth_;
+  int size_;
+};
+
+TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10),
+                      make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10),
+                      make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10),
+                      make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8)));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    NEON, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12),
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10),
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    NEON, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(MSA, PartialFdctTest,
+                        ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8,
+                                                     VPX_BITS_8)));
+#else   // !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    MSA, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_MSA
+}  // namespace
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@ -0,0 +1,728 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
+
+namespace {
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+                        int tx_type);
+typedef void (*FhtFuncRef)(const Buffer<int16_t> &in, Buffer<tran_low_t> *out,
+                           int size, int tx_type);
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+typedef void (*IhtWithBdFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                              int tx_type, int bd);
+
+template <FdctFunc fn>
+void fdct_wrapper(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+  (void)tx_type;
+  fn(in, out, stride);
+}
+
+template <IdctFunc fn>
+void idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type,
+                  int bd) {
+  (void)tx_type;
+  (void)bd;
+  fn(in, out, stride);
+}
+
+template <IhtFunc fn>
+void iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type,
+                 int bd) {
+  (void)bd;
+  fn(in, out, stride, tx_type);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*HighbdIdctFunc)(const tran_low_t *in, uint16_t *out, int stride,
+                               int bd);
+
+typedef void (*HighbdIhtFunc)(const tran_low_t *in, uint16_t *out, int stride,
+                              int tx_type, int bd);
+
+template <HighbdIdctFunc fn>
+void highbd_idct_wrapper(const tran_low_t *in, uint8_t *out, int stride,
+                         int tx_type, int bd) {
+  (void)tx_type;
+  fn(in, CAST_TO_SHORTPTR(out), stride, bd);
+}
+
+template <HighbdIhtFunc fn>
+void highbd_iht_wrapper(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type, int bd) {
+  fn(in, CAST_TO_SHORTPTR(out), stride, tx_type, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+struct FuncInfo {
+  FhtFunc ft_func;
+  IhtWithBdFunc it_func;
+  int size;
+  int pixel_size;
+};
+
+/* forward transform, inverse transform, size, transform type, bit depth */
+typedef tuple<int, const FuncInfo *, int, vpx_bit_depth_t> DctParam;
+
+void fdct_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+              int /*tx_type*/) {
+  const int16_t *i = in.TopLeftPixel();
+  const int i_stride = in.stride();
+  tran_low_t *o = out->TopLeftPixel();
+  if (size == 4) {
+    vpx_fdct4x4_c(i, o, i_stride);
+  } else if (size == 8) {
+    vpx_fdct8x8_c(i, o, i_stride);
+  } else if (size == 16) {
+    vpx_fdct16x16_c(i, o, i_stride);
+  } else if (size == 32) {
+    vpx_fdct32x32_c(i, o, i_stride);
+  }
+}
+
+void fht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+             int tx_type) {
+  const int16_t *i = in.TopLeftPixel();
+  const int i_stride = in.stride();
+  tran_low_t *o = out->TopLeftPixel();
+  if (size == 4) {
+    vp9_fht4x4_c(i, o, i_stride, tx_type);
+  } else if (size == 8) {
+    vp9_fht8x8_c(i, o, i_stride, tx_type);
+  } else if (size == 16) {
+    vp9_fht16x16_c(i, o, i_stride, tx_type);
+  }
+}
+
+void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+              int /*tx_type*/) {
+  ASSERT_EQ(size, 4);
+  vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride());
+}
+
+class TransTestBase : public ::testing::TestWithParam<DctParam> {
+ public:
+  virtual void SetUp() {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    const int idx = GET_PARAM(0);
+    const FuncInfo *func_info = &(GET_PARAM(1)[idx]);
+    tx_type_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    fwd_txfm_ = func_info->ft_func;
+    inv_txfm_ = func_info->it_func;
+    size_ = func_info->size;
+    pixel_size_ = func_info->pixel_size;
+    max_pixel_value_ = (1 << bit_depth_) - 1;
+
+    // Randomize stride_ to a value less than or equal to 1024
+    stride_ = rnd_(1024) + 1;
+    if (stride_ < size_) {
+      stride_ = size_;
+    }
+    // Align stride_ to 16 if it's bigger than 16.
+    if (stride_ > 16) {
+      stride_ &= ~15;
+    }
+
+    block_size_ = size_ * stride_;
+
+    src_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, pixel_size_ * block_size_));
+    ASSERT_TRUE(src_ != NULL);
+    dst_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, pixel_size_ * block_size_));
+    ASSERT_TRUE(dst_ != NULL);
+  }
+
+  virtual void TearDown() {
+    vpx_free(src_);
+    src_ = NULL;
+    vpx_free(dst_);
+    dst_ = NULL;
+    libvpx_test::ClearSystemState();
+  }
+
+  void InitMem() {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    if (pixel_size_ == 1) {
+      for (int j = 0; j < block_size_; ++j) {
+        src_[j] = rnd_.Rand16() & max_pixel_value_;
+      }
+      for (int j = 0; j < block_size_; ++j) {
+        dst_[j] = rnd_.Rand16() & max_pixel_value_;
+      }
+    } else {
+      ASSERT_EQ(pixel_size_, 2);
+      uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+      uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+      for (int j = 0; j < block_size_; ++j) {
+        src[j] = rnd_.Rand16() & max_pixel_value_;
+      }
+      for (int j = 0; j < block_size_; ++j) {
+        dst[j] = rnd_.Rand16() & max_pixel_value_;
+      }
+    }
+  }
+
+  void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) {
+    fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_);
+  }
+
+  void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) {
+    inv_txfm_(in.TopLeftPixel(), out, stride_, tx_type_, bit_depth_);
+  }
+
+ protected:
+  void RunAccuracyCheck(int limit) {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    Buffer<int16_t> test_input_block =
+        Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+    ASSERT_TRUE(test_input_block.Init());
+    Buffer<tran_low_t> test_temp_block =
+        Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(test_temp_block.Init());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+    for (int i = 0; i < count_test_block; ++i) {
+      InitMem();
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          if (pixel_size_ == 1) {
+            test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] =
+                src_[h * stride_ + w] - dst_[h * stride_ + w];
+          } else {
+            ASSERT_EQ(pixel_size_, 2);
+            const uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+            const uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+            test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] =
+                src[h * stride_ + w] - dst[h * stride_ + w];
+          }
+        }
+      }
+
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block));
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst_));
+
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          int diff;
+          if (pixel_size_ == 1) {
+            diff = dst_[h * stride_ + w] - src_[h * stride_ + w];
+          } else {
+            ASSERT_EQ(pixel_size_, 2);
+            const uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+            const uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+            diff = dst[h * stride_ + w] - src[h * stride_ + w];
+          }
+          const uint32_t error = diff * diff;
+          if (max_error < error) max_error = error;
+          total_error += error;
+        }
+      }
+    }
+
+    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
+        << "Error: " << size_ << "x" << size_
+        << " transform/inverse transform has an individual round trip error > "
+        << limit;
+
+    EXPECT_GE(count_test_block * limit, total_error)
+        << "Error: " << size_ << "x" << size_
+        << " transform/inverse transform has average round trip error > "
+        << limit << " per block";
+  }
+
+  void RunCoeffCheck() {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    Buffer<int16_t> input_block =
+        Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+    ASSERT_TRUE(input_block.Init());
+    Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0);
+    ASSERT_TRUE(output_ref_block.Init());
+    Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(output_block.Init());
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-max_pixel_value_,
+      // max_pixel_value_].
+      input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_);
+
+      fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block));
+
+      // The minimum quant value is 4.
+      EXPECT_TRUE(output_block.CheckValues(output_ref_block));
+      if (::testing::Test::HasFailure()) {
+        printf("Size: %d Transform type: %d\n", size_, tx_type_);
+        output_block.PrintDifference(output_ref_block);
+        return;
+      }
+    }
+  }
+
+  void RunMemCheck() {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    Buffer<int16_t> input_extreme_block =
+        Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+    ASSERT_TRUE(input_extreme_block.Init());
+    Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0);
+    ASSERT_TRUE(output_ref_block.Init());
+    Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(output_block.Init());
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with -max_pixel_value_ or max_pixel_value_.
+      if (i == 0) {
+        input_extreme_block.Set(max_pixel_value_);
+      } else if (i == 1) {
+        input_extreme_block.Set(-max_pixel_value_);
+      } else {
+        for (int h = 0; h < size_; ++h) {
+          for (int w = 0; w < size_; ++w) {
+            input_extreme_block
+                .TopLeftPixel()[h * input_extreme_block.stride() + w] =
+                rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_;
+          }
+        }
+      }
+
+      fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block));
+
+      // The minimum quant value is 4.
+      EXPECT_TRUE(output_block.CheckValues(output_ref_block));
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          EXPECT_GE(
+              4 * DCT_MAX_VALUE << (bit_depth_ - 8),
+              abs(output_block.TopLeftPixel()[h * output_block.stride() + w]))
+              << "Error: " << size_ << "x" << size_
+              << " transform has coefficient larger than 4*DCT_MAX_VALUE"
+              << " at " << w << "," << h;
+          if (::testing::Test::HasFailure()) {
+            printf("Size: %d Transform type: %d\n", size_, tx_type_);
+            output_block.DumpBuffer();
+            return;
+          }
+        }
+      }
+    }
+  }
+
+  void RunInvAccuracyCheck(int limit) {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    Buffer<int16_t> in = Buffer<int16_t>(size_, size_, 4);
+    ASSERT_TRUE(in.Init());
+    Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(coeff.Init());
+    Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(dst.Init());
+    Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0);
+    ASSERT_TRUE(src.Init());
+    Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(dst16.Init());
+    Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0);
+    ASSERT_TRUE(src16.Init());
+
+    for (int i = 0; i < count_test_block; ++i) {
+      InitMem();
+      // Initialize a test block with input range [-max_pixel_value_,
+      // max_pixel_value_].
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          if (pixel_size_ == 1) {
+            in.TopLeftPixel()[h * in.stride() + w] =
+                src_[h * stride_ + w] - dst_[h * stride_ + w];
+          } else {
+            ASSERT_EQ(pixel_size_, 2);
+            const uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+            const uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+            in.TopLeftPixel()[h * in.stride() + w] =
+                src[h * stride_ + w] - dst[h * stride_ + w];
+          }
+        }
+      }
+
+      fwd_txfm_ref(in, &coeff, size_, tx_type_);
+
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst_));
+
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          int diff;
+          if (pixel_size_ == 1) {
+            diff = dst_[h * stride_ + w] - src_[h * stride_ + w];
+          } else {
+            ASSERT_EQ(pixel_size_, 2);
+            const uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+            const uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+            diff = dst[h * stride_ + w] - src[h * stride_ + w];
+          }
+          const uint32_t error = diff * diff;
+          EXPECT_GE(static_cast<uint32_t>(limit), error)
+              << "Error: " << size_ << "x" << size_
+              << " inverse transform has error " << error << " at " << w << ","
+              << h;
+          if (::testing::Test::HasFailure()) {
+            printf("Size: %d Transform type: %d\n", size_, tx_type_);
+            return;
+          }
+        }
+      }
+    }
+  }
+
+  FhtFunc fwd_txfm_;
+  FhtFuncRef fwd_txfm_ref;
+  IhtWithBdFunc inv_txfm_;
+  ACMRandom rnd_;
+  uint8_t *src_;
+  uint8_t *dst_;
+  vpx_bit_depth_t bit_depth_;
+  int tx_type_;
+  int max_pixel_value_;
+  int size_;
+  int stride_;
+  int pixel_size_;
+  int block_size_;
+};
+
+/* -------------------------------------------------------------------------- */
+
+class TransDCT : public TransTestBase {
+ public:
+  TransDCT() { fwd_txfm_ref = fdct_ref; }
+};
+
+TEST_P(TransDCT, AccuracyCheck) {
+  int t = 1;
+  if (size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2) {
+    t = 2;
+  } else if (size_ == 32 && bit_depth_ > 10 && pixel_size_ == 2) {
+    t = 7;
+  }
+  RunAccuracyCheck(t);
+}
+
+TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransDCT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
+
+static const FuncInfo dct_c_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &fdct_wrapper<vpx_highbd_fdct4x4_c>,
+    &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_c>, 4, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct8x8_c>,
+    &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_c>, 8, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct16x16_c>,
+    &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_c>, 16, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct32x32_c>,
+    &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_c>, 32, 2 },
+#endif
+  { &fdct_wrapper<vpx_fdct4x4_c>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+  { &fdct_wrapper<vpx_fdct8x8_c>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
+  { &fdct_wrapper<vpx_fdct16x16_c>, &idct_wrapper<vpx_idct16x16_256_add_c>, 16,
+    1 },
+  { &fdct_wrapper<vpx_fdct32x32_c>, &idct_wrapper<vpx_idct32x32_1024_add_c>, 32,
+    1 }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    C, TransDCT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(dct_c_func_info) /
+                                             sizeof(dct_c_func_info[0]))),
+        ::testing::Values(dct_c_func_info), ::testing::Values(0),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+
+#if !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2
+static const FuncInfo dct_sse2_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &fdct_wrapper<vpx_highbd_fdct4x4_sse2>,
+    &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_sse2>, 4, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct8x8_sse2>,
+    &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_sse2>, 8, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct16x16_sse2>,
+    &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_sse2>, 16, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct32x32_sse2>,
+    &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, 32, 2 },
+#endif
+  { &fdct_wrapper<vpx_fdct4x4_sse2>, &idct_wrapper<vpx_idct4x4_16_add_sse2>, 4,
+    1 },
+  { &fdct_wrapper<vpx_fdct8x8_sse2>, &idct_wrapper<vpx_idct8x8_64_add_sse2>, 8,
+    1 },
+  { &fdct_wrapper<vpx_fdct16x16_sse2>,
+    &idct_wrapper<vpx_idct16x16_256_add_sse2>, 16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_sse2>,
+    &idct_wrapper<vpx_idct32x32_1024_add_sse2>, 32, 1 }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, TransDCT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(dct_sse2_func_info) /
+                                             sizeof(dct_sse2_func_info[0]))),
+        ::testing::Values(dct_sse2_func_info), ::testing::Values(0),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+// vpx_fdct8x8_ssse3 is only available in 64 bit builds.
+static const FuncInfo dct_ssse3_func_info = {
+  &fdct_wrapper<vpx_fdct8x8_ssse3>, &idct_wrapper<vpx_idct8x8_64_add_sse2>, 8, 1
+};
+
+// TODO(johannkoenig): high bit depth fdct8x8.
+INSTANTIATE_TEST_CASE_P(SSSE3, TransDCT,
+                        ::testing::Values(make_tuple(0, &dct_ssse3_func_info, 0,
+                                                     VPX_BITS_8)));
+#endif  // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_avx2_func_info = {
+  &fdct_wrapper<vpx_fdct32x32_avx2>, &idct_wrapper<vpx_idct32x32_1024_add_sse2>,
+  32, 1
+};
+
+// TODO(johannkoenig): high bit depth fdct32x32.
+INSTANTIATE_TEST_CASE_P(AVX2, TransDCT,
+                        ::testing::Values(make_tuple(0, &dct_avx2_func_info, 0,
+                                                     VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_NEON
+static const FuncInfo dct_neon_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_neon>, &idct_wrapper<vpx_idct4x4_16_add_neon>, 4,
+    1 },
+  { &fdct_wrapper<vpx_fdct8x8_neon>, &idct_wrapper<vpx_idct8x8_64_add_neon>, 8,
+    1 },
+  { &fdct_wrapper<vpx_fdct16x16_neon>,
+    &idct_wrapper<vpx_idct16x16_256_add_neon>, 16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_neon>,
+    &idct_wrapper<vpx_idct32x32_1024_add_neon>, 32, 1 }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, TransDCT,
+    ::testing::Combine(::testing::Range(0, 4),
+                       ::testing::Values(dct_neon_func_info),
+                       ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_NEON
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_msa_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_msa>, &idct_wrapper<vpx_idct4x4_16_add_msa>, 4,
+    1 },
+  { &fdct_wrapper<vpx_fdct8x8_msa>, &idct_wrapper<vpx_idct8x8_64_add_msa>, 8,
+    1 },
+  { &fdct_wrapper<vpx_fdct16x16_msa>, &idct_wrapper<vpx_idct16x16_256_add_msa>,
+    16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_msa>, &idct_wrapper<vpx_idct32x32_1024_add_msa>,
+    32, 1 }
+};
+
+INSTANTIATE_TEST_CASE_P(MSA, TransDCT,
+                        ::testing::Combine(::testing::Range(0, 4),
+                                           ::testing::Values(dct_msa_func_info),
+                                           ::testing::Values(0),
+                                           ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_vsx_func_info = {
+  &fdct_wrapper<vpx_fdct4x4_c>, &idct_wrapper<vpx_idct4x4_16_add_vsx>, 4, 1
+};
+
+INSTANTIATE_TEST_CASE_P(VSX, TransDCT,
+                        ::testing::Values(make_tuple(0, &dct_vsx_func_info, 0,
+                                                     VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
+
+#endif  // !CONFIG_EMULATE_HARDWARE
+
+/* -------------------------------------------------------------------------- */
+
+class TransHT : public TransTestBase {
+ public:
+  TransHT() { fwd_txfm_ref = fht_ref; }
+};
+
+TEST_P(TransHT, AccuracyCheck) {
+  RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2 ? 2 : 1);
+}
+
+TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
+
+static const FuncInfo ht_c_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_c>, 4,
+    2 },
+  { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8,
+    2 },
+  { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>,
+    16, 2 },
+#endif
+  { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_c>, 4, 1 },
+  { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_c>, 8, 1 },
+  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_c>, 16, 1 }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    C, TransHT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(ht_c_func_info) /
+                                             sizeof(ht_c_func_info[0]))),
+        ::testing::Values(ht_c_func_info), ::testing::Range(0, 4),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+
+#if !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON
+
+static const FuncInfo ht_neon_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
+    2 },
+  { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
+    2 },
+  { &vp9_highbd_fht16x16_c,
+    &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 },
+#endif
+  { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
+  { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
+  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, TransHT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(ht_neon_func_info) /
+                                             sizeof(ht_neon_func_info[0]))),
+        ::testing::Values(ht_neon_func_info), ::testing::Range(0, 4),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+#endif  // HAVE_NEON
+
+#if HAVE_SSE2
+
+static const FuncInfo ht_sse2_func_info[3] = {
+  { &vp9_fht4x4_sse2, &iht_wrapper<vp9_iht4x4_16_add_sse2>, 4, 1 },
+  { &vp9_fht8x8_sse2, &iht_wrapper<vp9_iht8x8_64_add_sse2>, 8, 1 },
+  { &vp9_fht16x16_sse2, &iht_wrapper<vp9_iht16x16_256_add_sse2>, 16, 1 }
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, TransHT,
+                        ::testing::Combine(::testing::Range(0, 3),
+                                           ::testing::Values(ht_sse2_func_info),
+                                           ::testing::Range(0, 4),
+                                           ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo ht_sse4_1_func_info[3] = {
+  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_sse4_1>,
+    4, 2 },
+  { vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>,
+    8, 2 },
+  { &vp9_highbd_fht16x16_c,
+    &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, 2 }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, TransHT,
+    ::testing::Combine(::testing::Range(0, 3),
+                       ::testing::Values(ht_sse4_1_func_info),
+                       ::testing::Range(0, 4),
+                       ::testing::Values(VPX_BITS_8, VPX_BITS_10,
+                                         VPX_BITS_12)));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // !CONFIG_EMULATE_HARDWARE
+
+/* -------------------------------------------------------------------------- */
+
+class TransWHT : public TransTestBase {
+ public:
+  TransWHT() { fwd_txfm_ref = fwht_ref; }
+};
+
+TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); }
+
+TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransWHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+
+static const FuncInfo wht_c_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &fdct_wrapper<vp9_highbd_fwht4x4_c>,
+    &highbd_idct_wrapper<vpx_highbd_iwht4x4_16_add_c>, 4, 2 },
+#endif
+  { &fdct_wrapper<vp9_fwht4x4_c>, &idct_wrapper<vpx_iwht4x4_16_add_c>, 4, 1 }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    C, TransWHT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(wht_c_func_info) /
+                                             sizeof(wht_c_func_info[0]))),
+        ::testing::Values(wht_c_func_info), ::testing::Values(0),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+
+#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+static const FuncInfo wht_sse2_func_info = {
+  &fdct_wrapper<vp9_fwht4x4_sse2>, &idct_wrapper<vpx_iwht4x4_16_add_sse2>, 4, 1
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, TransWHT,
+                        ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0,
+                                                     VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+}  // namespace
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc
@ -172,4 +172,21 @@ TEST(DecodeAPI, Vp9PeekSI) {
 }
 #endif  // CONFIG_VP9_DECODER

+TEST(DecodeAPI, HighBitDepthCapability) {
+// VP8 should not claim VP9 HBD as a capability.
+#if CONFIG_VP8_DECODER
+  const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_dx_algo);
+  EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0);
+#endif
+
+#if CONFIG_VP9_DECODER
+  const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_dx_algo);
+#if CONFIG_VP9_HIGHBITDEPTH
+  EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH);
+#else
+  EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0);
+#endif
+#endif
+}
+
 }  // namespace
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@ -21,7 +21,7 @@
 #include "./ivfenc.h"
 #include "./vpx_version.h"

-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 namespace {

@ -34,7 +34,7 @@ const char kNewEncodeOutputFile[] = "new_encode.ivf";
 /*
 DecodePerfTest takes a tuple of filename + number of threads to decode with
 */
-typedef std::tr1::tuple<const char *, unsigned> DecodePerfParam;
+typedef ::testing::tuple<const char *, unsigned> DecodePerfParam;

 const DecodePerfParam kVP9DecodePerfVectors[] = {
  make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1),
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@ -52,14 +52,15 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder,
    /* Vp8's implementation of PeekStream returns an error if the frame you
     * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
     * frame, which must be a keyframe. */
-    if (video->frame_number() == 0)
-      ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
-                                        << vpx_codec_err_to_string(res_peek);
+    if (video->frame_number() == 0) {
+      ASSERT_EQ(VPX_CODEC_OK, res_peek)
+          << "Peek return failed: " << vpx_codec_err_to_string(res_peek);
+    }
  } else {
    /* The Vp9 implementation of PeekStream returns an error only if the
     * data passed to it isn't a valid Vp9 chunk. */
-    ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
-                                      << vpx_codec_err_to_string(res_peek);
+    ASSERT_EQ(VPX_CODEC_OK, res_peek)
+        << "Peek return failed: " << vpx_codec_err_to_string(res_peek);
  }
 }

--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@ -62,4 +62,134 @@ TEST(EncodeAPI, InvalidParams) {
  }
 }

+TEST(EncodeAPI, HighBitDepthCapability) {
+// VP8 should not claim VP9 HBD as a capability.
+#if CONFIG_VP8_ENCODER
+  const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_cx_algo);
+  EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0);
+#endif
+
+#if CONFIG_VP9_ENCODER
+  const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_cx_algo);
+#if CONFIG_VP9_HIGHBITDEPTH
+  EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH);
+#else
+  EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0);
+#endif
+#endif
+}
+
+#if CONFIG_VP8_ENCODER
+TEST(EncodeAPI, ImageSizeSetting) {
+  const int width = 711;
+  const int height = 360;
+  const int bps = 12;
+  vpx_image_t img;
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+  uint8_t *img_buf = reinterpret_cast<uint8_t *>(
+      calloc(width * height * bps / 8, sizeof(*img_buf)));
+  vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0);
+
+  cfg.g_w = width;
+  cfg.g_h = height;
+
+  vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf);
+
+  vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0);
+
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0));
+
+  free(img_buf);
+
+  vpx_codec_destroy(&enc);
+}
+#endif
+
+// Set up 2 spatial streams with 2 temporal layers per stream, and generate
+// invalid configuration by setting the temporal layer rate allocation
+// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of
+// CONFIG_MULTI_RES_ENCODING.
+TEST(EncodeAPI, MultiResEncode) {
+  static const vpx_codec_iface_t *kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+    &vpx_codec_vp8_cx_algo,
+#endif
+#if CONFIG_VP9_ENCODER
+    &vpx_codec_vp9_cx_algo,
+#endif
+  };
+  const int width = 1280;
+  const int height = 720;
+  const int width_down = width / 2;
+  const int height_down = height / 2;
+  const int target_bitrate = 1000;
+  const int framerate = 30;
+
+  for (int c = 0; c < NELEMENTS(kCodecs); ++c) {
+    const vpx_codec_iface_t *const iface = kCodecs[c];
+    vpx_codec_ctx_t enc[2];
+    vpx_codec_enc_cfg_t cfg[2];
+    vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
+
+    memset(enc, 0, sizeof(enc));
+
+    for (int i = 0; i < 2; i++) {
+      vpx_codec_enc_config_default(iface, &cfg[i], 0);
+    }
+
+    /* Highest-resolution encoder settings */
+    cfg[0].g_w = width;
+    cfg[0].g_h = height;
+    cfg[0].rc_dropframe_thresh = 0;
+    cfg[0].rc_end_usage = VPX_CBR;
+    cfg[0].rc_resize_allowed = 0;
+    cfg[0].rc_min_quantizer = 2;
+    cfg[0].rc_max_quantizer = 56;
+    cfg[0].rc_undershoot_pct = 100;
+    cfg[0].rc_overshoot_pct = 15;
+    cfg[0].rc_buf_initial_sz = 500;
+    cfg[0].rc_buf_optimal_sz = 600;
+    cfg[0].rc_buf_sz = 1000;
+    cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+    cfg[0].g_lag_in_frames = 0;
+
+    cfg[0].kf_mode = VPX_KF_AUTO;
+    cfg[0].kf_min_dist = 3000;
+    cfg[0].kf_max_dist = 3000;
+
+    cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */
+    cfg[0].g_timebase.num = 1;                 /* Set fps */
+    cfg[0].g_timebase.den = framerate;
+
+    memcpy(&cfg[1], &cfg[0], sizeof(cfg[0]));
+    cfg[1].rc_target_bitrate = 500;
+    cfg[1].g_w = width_down;
+    cfg[1].g_h = height_down;
+
+    for (int i = 0; i < 2; i++) {
+      cfg[i].ts_number_layers = 2;
+      cfg[i].ts_periodicity = 2;
+      cfg[i].ts_rate_decimator[0] = 2;
+      cfg[i].ts_rate_decimator[1] = 1;
+      cfg[i].ts_layer_id[0] = 0;
+      cfg[i].ts_layer_id[1] = 1;
+      // Invalid parameters.
+      cfg[i].ts_target_bitrate[0] = 0;
+      cfg[i].ts_target_bitrate[1] = 0;
+    }
+
+    // VP9 should report incapable, VP8 invalid for all configurations.
+    const char kVP9Name[] = "WebM Project VP9";
+    const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface),
+                                sizeof(kVP9Name) - 1) == 0;
+    EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
+
+    for (int i = 0; i < 2; i++) {
+      vpx_codec_destroy(&enc[i]);
+    }
+  }
+}
+
 }  // namespace
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@ -201,6 +201,8 @@ void EncoderTest::RunLoop(VideoSource *video) {
      PreEncodeFrameHook(video, encoder.get());
      encoder->EncodeFrame(video, frame_flags_);

+      PostEncodeFrameHook();
+
      CxDataIterator iter = encoder->GetCxData();

      bool has_cxdata = false;
@ -226,6 +228,8 @@ void EncoderTest::RunLoop(VideoSource *video) {

          case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;

+          case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break;
+
          default: break;
        }
      }
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@ -128,17 +128,31 @@ class Encoder {
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
  }

+  void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
  void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
  }
+
+  void Control(int ctrl_id, struct vpx_svc_frame_drop *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
  void Control(int ctrl_id, vpx_active_map_t *arg) {
    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
  }
-#endif

+  void Control(int ctrl_id, vpx_roi_map_t *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+#endif
  void Config(const vpx_codec_enc_cfg_t *cfg) {
    const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
@ -212,12 +226,17 @@ class EncoderTest {
  virtual void PreEncodeFrameHook(VideoSource * /*video*/,
                                  Encoder * /*encoder*/) {}

+  virtual void PostEncodeFrameHook() {}
+
  // Hook to be called on every compressed data packet.
  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}

  // Hook to be called on every PSNR packet.
  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}

+  // Hook to be called on every first pass stats packet.
+  virtual void StatsPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}
+
  // Hook to determine whether the encode loop should continue.
  virtual bool Continue() const {
    return !(::testing::Test::HasFatalFailure() || abort_);
--- a/test/examples.sh
+++ b/test/examples.sh
@ -15,7 +15,7 @@
 example_tests=$(ls $(dirname $0)/*.sh)

 # List of script names to exclude.
-exclude_list="examples tools_common"
+exclude_list="examples stress tools_common"

 # Filter out the scripts in $exclude_list.
 for word in ${exclude_list}; do
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@ -34,7 +34,8 @@ struct ExternalFrameBuffer {
 // Class to manipulate a list of external frame buffers.
 class ExternalFrameBufferList {
 public:
-  ExternalFrameBufferList() : num_buffers_(0), ext_fb_list_(NULL) {}
+  ExternalFrameBufferList()
+      : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {}

  virtual ~ExternalFrameBufferList() {
    for (int i = 0; i < num_buffers_; ++i) {
@ -71,6 +72,8 @@ class ExternalFrameBufferList {
    }

    SetFrameBuffer(idx, fb);
+
+    num_used_buffers_++;
    return 0;
  }

@ -106,6 +109,7 @@ class ExternalFrameBufferList {
    }
    EXPECT_EQ(1, ext_fb->in_use);
    ext_fb->in_use = 0;
+    num_used_buffers_--;
    return 0;
  }

@ -121,6 +125,8 @@ class ExternalFrameBufferList {
    }
  }

+  int num_used_buffers() const { return num_used_buffers_; }
+
 private:
  // Returns the index of the first free frame buffer. Returns |num_buffers_|
  // if there are no free frame buffers.
@ -145,6 +151,7 @@ class ExternalFrameBufferList {
  }

  int num_buffers_;
+  int num_used_buffers_;
  ExternalFrameBuffer *ext_fb_list_;
 };

@ -220,8 +227,8 @@ class ExternalFrameBufferMD5Test

  void OpenMD5File(const std::string &md5_file_name_) {
    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
-                                   << md5_file_name_;
+    ASSERT_TRUE(md5_file_ != NULL)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
  }

  virtual void DecompressedFrameHook(const vpx_image_t &img,
@ -273,6 +280,7 @@ class ExternalFrameBufferMD5Test

 #if CONFIG_WEBM_IO
 const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm";

 // Class for testing passing in external frame buffers to libvpx.
 class ExternalFrameBufferTest : public ::testing::Test {
@ -292,7 +300,9 @@ class ExternalFrameBufferTest : public ::testing::Test {

  virtual void TearDown() {
    delete decoder_;
+    decoder_ = NULL;
    delete video_;
+    video_ = NULL;
  }

  // Passes the external frame buffer information to libvpx.
@ -325,7 +335,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
    return VPX_CODEC_OK;
  }

- private:
+ protected:
  void CheckDecodedFrames() {
    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
    const vpx_image_t *img = NULL;
@ -341,6 +351,25 @@ class ExternalFrameBufferTest : public ::testing::Test {
  int num_buffers_;
  ExternalFrameBufferList fb_list_;
 };
+
+class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
+ protected:
+  virtual void SetUp() {
+    video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+  }
+
+  virtual void CheckFrameBufferRelease() {
+    TearDown();
+    ASSERT_EQ(0, fb_list_.num_used_buffers());
+  }
+};
 #endif  // CONFIG_WEBM_IO

 // This test runs through the set of test vectors, and decodes them.
@ -419,6 +448,8 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
                                    release_vp9_frame_buffer));
  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+  // Only run this on long clips. Decoding a very short clip will return
+  // VPX_CODEC_OK even with only 2 buffers.
  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());
 }

@ -467,6 +498,15 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
                                    release_vp9_frame_buffer));
 }
+
+TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
+                                    release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+  CheckFrameBufferRelease();
+}
 #endif  // CONFIG_WEBM_IO

 VP9_INSTANTIATE_TEST_CASE(
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@ -1,512 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vpx/vpx_codec.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-using libvpx_test::ACMRandom;
-
-namespace {
-const int kNumCoeffs = 16;
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        int tx_type);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        int tx_type);
-
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
-
-void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 int /*tx_type*/) {
-  vpx_fdct4x4_c(in, out, stride);
-}
-
-void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
-  vp9_fht4x4_c(in, out, stride, tx_type);
-}
-
-void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 int /*tx_type*/) {
-  vp9_fwht4x4_c(in, out, stride);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_c(in, out, stride, 10);
-}
-
-void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_c(in, out, stride, 12);
-}
-
-void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);
-}
-
-void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);
-}
-
-void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10);
-}
-
-void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12);
-}
-
-#if HAVE_SSE2
-void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
-}
-
-void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
-}
-#endif  // HAVE_SSE2
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-class Trans4x4TestBase {
- public:
-  virtual ~Trans4x4TestBase() {}
-
- protected:
-  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
-
-  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
-
-  void RunAccuracyCheck(int limit) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    uint32_t max_error = 0;
-    int64_t total_error = 0;
-    const int count_test_block = 10000;
-    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
-      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          test_input_block[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          test_input_block[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
-      if (bit_depth_ == VPX_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        ASSERT_EQ(VPX_BITS_8, bit_depth_);
-        const int diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        if (max_error < error) max_error = error;
-        total_error += error;
-      }
-    }
-
-    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
-        << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit;
-
-    EXPECT_GE(count_test_block * limit, total_error)
-        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
-        << " per block";
-  }
-
-  void RunCoeffCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-      }
-
-      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-    }
-  }
-
-  void RunMemCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
-      }
-      if (i == 0) {
-        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
-      } else if (i == 1) {
-        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
-      }
-
-      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(input_extreme_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
-            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      }
-    }
-  }
-
-  void RunInvAccuracyCheck(int limit) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          in[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          in[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      fwd_txfm_ref(in, coeff, pitch_, tx_type_);
-
-      if (bit_depth_ == VPX_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        EXPECT_GE(static_cast<uint32_t>(limit), error)
-            << "Error: 4x4 IDCT has error " << error << " at index " << j;
-      }
-    }
-  }
-
-  int pitch_;
-  int tx_type_;
-  FhtFunc fwd_txfm_ref;
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-};
-
-class Trans4x4DCT : public Trans4x4TestBase,
-                    public ::testing::TestWithParam<Dct4x4Param> {
- public:
-  virtual ~Trans4x4DCT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 4;
-    fwd_txfm_ref = fdct4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(1); }
-
-TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-class Trans4x4HT : public Trans4x4TestBase,
-                   public ::testing::TestWithParam<Ht4x4Param> {
- public:
-  virtual ~Trans4x4HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 4;
-    fwd_txfm_ref = fht4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, tx_type_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, tx_type_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(1); }
-
-TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-class Trans4x4WHT : public Trans4x4TestBase,
-                    public ::testing::TestWithParam<Dct4x4Param> {
- public:
-  virtual ~Trans4x4WHT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 4;
-    fwd_txfm_ref = fwht4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0); }
-
-TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-using std::tr1::make_tuple;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4DCT,
-    ::testing::Values(
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_c,
-                                                     &vpx_idct4x4_16_add_c, 0,
-                                                     VPX_BITS_8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT,
-                        ::testing::Values(make_tuple(&vp9_fwht4x4_c,
-                                                     &vpx_iwht4x4_16_add_c, 0,
-                                                     VPX_BITS_8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_c,
-                                                     &vpx_idct4x4_16_add_neon,
-                                                     0, VPX_BITS_8)));
-#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    NEON, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
-#endif
-
-#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_sse2,
-                                                     &vpx_idct4x4_16_add_sse2,
-                                                     0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
-#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4DCT,
-    ::testing::Values(
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
-        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
-        make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
-
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_msa,
-                                                     &vpx_idct4x4_16_add_msa, 0,
-                                                     VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
-    MSA, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
-#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-}  // namespace
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@ -43,9 +43,9 @@ typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                        int tx_type);

-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
+typedef ::testing::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
+typedef ::testing::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
+typedef ::testing::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;

 void reference_8x8_dct_1d(const double in[8], double out[8]) {
  const double kInvSqrt2 = 0.707106781186547524400844362104;
@ -88,45 +88,45 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {

 #if CONFIG_VP9_HIGHBITDEPTH
 void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_c(in, out, stride, 10);
+  vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_c(in, out, stride, 12);
+  vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }

 void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
 }

 void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
 }

 #if HAVE_SSE2

 void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_c(in, out, stride, 10);
+  vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_c(in, out, stride, 12);
+  vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }

 void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
 }

 void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
 }

 void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@ -257,7 +257,7 @@ class FwdTrans8x8TestBase {
 #if CONFIG_VP9_HIGHBITDEPTH
      } else {
        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
      }

@ -340,7 +340,7 @@ class FwdTrans8x8TestBase {
 #if CONFIG_VP9_HIGHBITDEPTH
      } else {
        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
      }

@ -413,7 +413,7 @@ class FwdTrans8x8TestBase {
 #if CONFIG_VP9_HIGHBITDEPTH
      } else {
        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
      }

@ -497,9 +497,9 @@ class FwdTrans8x8TestBase {
        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
 #if CONFIG_VP9_HIGHBITDEPTH
      } else {
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
      }

@ -511,8 +511,8 @@ class FwdTrans8x8TestBase {
        const int diff = dst[j] - ref[j];
 #endif
        const uint32_t error = diff * diff;
-        EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error
-                             << " at index " << j;
+        EXPECT_EQ(0u, error)
+            << "Error: 8x8 IDCT has error " << error << " at index " << j;
      }
    }
  }
@ -628,7 +628,7 @@ TEST_P(InvTrans8x8DCT, CompareReference) {
  CompareInvReference(ref_txfm_, thresh_);
 }

-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@ -670,14 +670,13 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH

-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
                        ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
                                                     &vpx_idct8x8_64_add_neon,
                                                     0, VPX_BITS_8)));
-#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
    NEON, FwdTrans8x8HT,
    ::testing::Values(
@ -685,7 +684,8 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
@ -740,7 +740,7 @@ INSTANTIATE_TEST_CASE_P(
    !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
                        ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3,
-                                                     &vpx_idct8x8_64_add_ssse3,
+                                                     &vpx_idct8x8_64_add_sse2,
                                                     0, VPX_BITS_8)));
 #endif

@ -757,4 +757,11 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8),
        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(VSX, FwdTrans8x8DCT,
+                        ::testing::Values(make_tuple(&vpx_fdct8x8_c,
+                                                     &vpx_idct8x8_64_add_vsx, 0,
+                                                     VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@ -13,6 +13,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 #include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_timer.h"

 #include "test/acm_random.h"
 #include "test/register_state_check.h"
@ -21,7 +22,8 @@ namespace {

 using ::libvpx_test::ACMRandom;

-typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b);
+typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
+                             tran_low_t *b);

 void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
  int16_t b[8];
@ -46,18 +48,16 @@ void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
  out[5] = c[3] - c[7];
 }

-void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) {
+void reference_hadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
  int16_t buf[64];
-  for (int i = 0; i < 8; ++i) {
-    hadamard_loop(a + i, a_stride, buf + i * 8);
-  }
+  int16_t buf2[64];
+  for (int i = 0; i < 8; ++i) hadamard_loop(a + i, a_stride, buf + i * 8);
+  for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, 8, buf2 + i * 8);

-  for (int i = 0; i < 8; ++i) {
-    hadamard_loop(buf + i, 8, b + i * 8);
-  }
+  for (int i = 0; i < 64; ++i) b[i] = (tran_low_t)buf2[i];
 }

-void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
+void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
  /* The source is a 16x16 block. The destination is rearranged to 8x32.
   * Input is 9 bit. */
  reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
@ -68,16 +68,16 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
  /* Overlay the 8x8 blocks and combine. */
  for (int i = 0; i < 64; ++i) {
    /* 8x8 steps the range up to 15 bits. */
-    const int16_t a0 = b[0];
-    const int16_t a1 = b[64];
-    const int16_t a2 = b[128];
-    const int16_t a3 = b[192];
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[64];
+    const tran_low_t a2 = b[128];
+    const tran_low_t a3 = b[192];

    /* Prevent the result from escaping int16_t. */
-    const int16_t b0 = (a0 + a1) >> 1;
-    const int16_t b1 = (a0 - a1) >> 1;
-    const int16_t b2 = (a2 + a3) >> 1;
-    const int16_t b3 = (a2 - a3) >> 1;
+    const tran_low_t b0 = (a0 + a1) >> 1;
+    const tran_low_t b1 = (a0 - a1) >> 1;
+    const tran_low_t b2 = (a2 + a3) >> 1;
+    const tran_low_t b3 = (a2 - a3) >> 1;

    /* Store a 16 bit value. */
    b[0] = b0 + b2;
@ -101,12 +101,35 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
  ACMRandom rnd_;
 };

+void HadamardSpeedTest(const char *name, HadamardFunc const func,
+                       const int16_t *input, int stride, tran_low_t *output,
+                       int times) {
+  int i;
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (i = 0; i < times; ++i) {
+    func(input, stride, output);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("%s[%12d runs]: %d us\n", name, times, elapsed_time);
+}
+
 class Hadamard8x8Test : public HadamardTestBase {};

+void HadamardSpeedTest8x8(HadamardFunc const func, int times) {
+  DECLARE_ALIGNED(16, int16_t, input[64]);
+  DECLARE_ALIGNED(16, tran_low_t, output[64]);
+  memset(input, 1, sizeof(input));
+  HadamardSpeedTest("Hadamard8x8", func, input, 8, output, times);
+}
+
 TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
  DECLARE_ALIGNED(16, int16_t, a[64]);
-  DECLARE_ALIGNED(16, int16_t, b[64]);
-  int16_t b_ref[64];
+  DECLARE_ALIGNED(16, tran_low_t, b[64]);
+  tran_low_t b_ref[64];
  for (int i = 0; i < 64; ++i) {
    a[i] = rnd_.Rand9Signed();
  }
@ -124,8 +147,8 @@ TEST_P(Hadamard8x8Test, CompareReferenceRandom) {

 TEST_P(Hadamard8x8Test, VaryStride) {
  DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
-  DECLARE_ALIGNED(16, int16_t, b[64]);
-  int16_t b_ref[64];
+  DECLARE_ALIGNED(16, tran_low_t, b[64]);
+  tran_low_t b_ref[64];
  for (int i = 0; i < 64 * 8; ++i) {
    a[i] = rnd_.Rand9Signed();
  }
@ -144,6 +167,12 @@ TEST_P(Hadamard8x8Test, VaryStride) {
  }
 }

+TEST_P(Hadamard8x8Test, DISABLED_Speed) {
+  HadamardSpeedTest8x8(h_func_, 10);
+  HadamardSpeedTest8x8(h_func_, 10000);
+  HadamardSpeedTest8x8(h_func_, 10000000);
+}
+
 INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
                        ::testing::Values(&vpx_hadamard_8x8_c));

@ -162,12 +191,33 @@ INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
                        ::testing::Values(&vpx_hadamard_8x8_neon));
 #endif  // HAVE_NEON

+// TODO(jingning): Remove highbitdepth flag when the SIMD functions are
+// in place and turn on the unit test.
+#if !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_msa));
+#endif  // HAVE_MSA
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_vsx));
+#endif  // HAVE_VSX
+
 class Hadamard16x16Test : public HadamardTestBase {};

+void HadamardSpeedTest16x16(HadamardFunc const func, int times) {
+  DECLARE_ALIGNED(16, int16_t, input[256]);
+  DECLARE_ALIGNED(16, tran_low_t, output[256]);
+  memset(input, 1, sizeof(input));
+  HadamardSpeedTest("Hadamard16x16", func, input, 16, output, times);
+}
+
 TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
  DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
-  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
-  int16_t b_ref[16 * 16];
+  DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]);
+  tran_low_t b_ref[16 * 16];
  for (int i = 0; i < 16 * 16; ++i) {
    a[i] = rnd_.Rand9Signed();
  }
@ -185,8 +235,8 @@ TEST_P(Hadamard16x16Test, CompareReferenceRandom) {

 TEST_P(Hadamard16x16Test, VaryStride) {
  DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
-  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
-  int16_t b_ref[16 * 16];
+  DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]);
+  tran_low_t b_ref[16 * 16];
  for (int i = 0; i < 16 * 16 * 8; ++i) {
    a[i] = rnd_.Rand9Signed();
  }
@ -205,6 +255,12 @@ TEST_P(Hadamard16x16Test, VaryStride) {
  }
 }

+TEST_P(Hadamard16x16Test, DISABLED_Speed) {
+  HadamardSpeedTest16x16(h_func_, 10);
+  HadamardSpeedTest16x16(h_func_, 10000);
+  HadamardSpeedTest16x16(h_func_, 10000000);
+}
+
 INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
                        ::testing::Values(&vpx_hadamard_16x16_c));

@ -213,8 +269,25 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
                        ::testing::Values(&vpx_hadamard_16x16_sse2));
 #endif  // HAVE_SSE2

+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_avx2));
+#endif  // HAVE_AVX2
+
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_vsx));
+#endif  // HAVE_VSX
+
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
                        ::testing::Values(&vpx_hadamard_16x16_neon));
 #endif  // HAVE_NEON
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_msa));
+#endif  // HAVE_MSA
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@ -13,6 +13,7 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "vpx/vpx_integer.h"
@ -21,110 +22,156 @@ typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr,
                         int pred_stride, unsigned char *dst_ptr,
                         int dst_stride);
 namespace {
+
+using libvpx_test::Buffer;
+
 class IDCTTest : public ::testing::TestWithParam<IdctFunc> {
 protected:
  virtual void SetUp() {
-    int i;
-
    UUT = GetParam();
-    memset(input, 0, sizeof(input));
-    /* Set up guard blocks */
-    for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
+
+    input = new Buffer<int16_t>(4, 4, 0);
+    ASSERT_TRUE(input != NULL);
+    ASSERT_TRUE(input->Init());
+    predict = new Buffer<uint8_t>(4, 4, 3);
+    ASSERT_TRUE(predict != NULL);
+    ASSERT_TRUE(predict->Init());
+    output = new Buffer<uint8_t>(4, 4, 3);
+    ASSERT_TRUE(output != NULL);
+    ASSERT_TRUE(output->Init());
  }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  virtual void TearDown() {
+    delete input;
+    delete predict;
+    delete output;
+    libvpx_test::ClearSystemState();
+  }

  IdctFunc UUT;
-  int16_t input[16];
-  unsigned char output[256];
-  unsigned char predict[256];
+  Buffer<int16_t> *input;
+  Buffer<uint8_t> *predict;
+  Buffer<uint8_t> *output;
 };

-TEST_P(IDCTTest, TestGuardBlocks) {
-  int i;
-
-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(0, output[i]) << i;
-    else
-      EXPECT_EQ(255, output[i]);
-  }
-}
-
 TEST_P(IDCTTest, TestAllZeros) {
-  int i;
+  // When the input is '0' the output will be '0'.
+  input->Set(0);
+  predict->Set(0);
+  output->Set(0);

-  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(),
+                               predict->stride(), output->TopLeftPixel(),
+                               output->stride()));

-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(0, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
-  }
+  ASSERT_TRUE(input->CheckValues(0));
+  ASSERT_TRUE(input->CheckPadding());
+  ASSERT_TRUE(output->CheckValues(0));
+  ASSERT_TRUE(output->CheckPadding());
 }

 TEST_P(IDCTTest, TestAllOnes) {
-  int i;
+  input->Set(0);
+  // When the first element is '4' it will fill the output buffer with '1'.
+  input->TopLeftPixel()[0] = 4;
+  predict->Set(0);
+  output->Set(0);

-  input[0] = 4;
-  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(),
+                               predict->stride(), output->TopLeftPixel(),
+                               output->stride()));

-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(1, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
-  }
+  ASSERT_TRUE(output->CheckValues(1));
+  ASSERT_TRUE(output->CheckPadding());
 }

 TEST_P(IDCTTest, TestAddOne) {
-  int i;
+  // Set the transform output to '1' and make sure it gets added to the
+  // prediction buffer.
+  input->Set(0);
+  input->TopLeftPixel()[0] = 4;
+  output->Set(0);

-  for (i = 0; i < 256; i++) predict[i] = i;
-  input[0] = 4;
-  ASM_REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
-
-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(i + 1, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
+  uint8_t *pred = predict->TopLeftPixel();
+  for (int y = 0; y < 4; ++y) {
+    for (int x = 0; x < 4; ++x) {
+      pred[y * predict->stride() + x] = y * 4 + x;
+    }
  }
+
+  ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(),
+                               predict->stride(), output->TopLeftPixel(),
+                               output->stride()));
+
+  uint8_t const *out = output->TopLeftPixel();
+  for (int y = 0; y < 4; ++y) {
+    for (int x = 0; x < 4; ++x) {
+      EXPECT_EQ(1 + y * 4 + x, out[y * output->stride() + x]);
+    }
+  }
+
+  if (HasFailure()) {
+    output->DumpBuffer();
+  }
+
+  ASSERT_TRUE(output->CheckPadding());
 }

 TEST_P(IDCTTest, TestWithData) {
-  int i;
+  // Test a single known input.
+  predict->Set(0);

-  for (i = 0; i < 16; i++) input[i] = i;
-
-  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
-
-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) > 3 || i > 63)
-      EXPECT_EQ(255, output[i]) << "i==" << i;
-    else if (i == 0)
-      EXPECT_EQ(11, output[i]) << "i==" << i;
-    else if (i == 34)
-      EXPECT_EQ(1, output[i]) << "i==" << i;
-    else if (i == 2 || i == 17 || i == 32)
-      EXPECT_EQ(3, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(0, output[i]) << "i==" << i;
+  int16_t *in = input->TopLeftPixel();
+  for (int y = 0; y < 4; ++y) {
+    for (int x = 0; x < 4; ++x) {
+      in[y * input->stride() + x] = y * 4 + x;
+    }
  }
+
+  ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(),
+                               predict->stride(), output->TopLeftPixel(),
+                               output->stride()));
+
+  uint8_t *out = output->TopLeftPixel();
+  for (int y = 0; y < 4; ++y) {
+    for (int x = 0; x < 4; ++x) {
+      switch (y * 4 + x) {
+        case 0: EXPECT_EQ(11, out[y * output->stride() + x]); break;
+        case 2:
+        case 5:
+        case 8: EXPECT_EQ(3, out[y * output->stride() + x]); break;
+        case 10: EXPECT_EQ(1, out[y * output->stride() + x]); break;
+        default: EXPECT_EQ(0, out[y * output->stride() + x]);
+      }
+    }
+  }
+
+  if (HasFailure()) {
+    output->DumpBuffer();
+  }
+
+  ASSERT_TRUE(output->CheckPadding());
 }

 INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
+
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, IDCTTest,
                        ::testing::Values(vp8_short_idct4x4llm_neon));
-#endif
+#endif  // HAVE_NEON
+
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
                        ::testing::Values(vp8_short_idct4x4llm_mmx));
-#endif
+#endif  // HAVE_MMX
+
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
                        ::testing::Values(vp8_short_idct4x4llm_msa));
-#endif
-}
+#endif  // HAVE_MSA
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_mmi));
+#endif  // HAVE_MMI
+}  // namespace
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@ -45,8 +45,8 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,

  void OpenResFile(const std::string &res_file_name_) {
    res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
-    ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: "
-                                   << res_file_name_;
+    ASSERT_TRUE(res_file_ != NULL)
+        << "Result file open failed. Filename: " << res_file_name_;
  }

  virtual bool HandleDecodeResult(
@ -120,11 +120,24 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,

 TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }

+#if CONFIG_VP8_DECODER
+const DecodeParam kVP8InvalidFileTests[] = {
+  { 1, "invalid-bug-1443.ivf" },
+  { 1, "invalid-token-partition.ivf" },
+};
+
+VP8_INSTANTIATE_TEST_CASE(InvalidFileTest,
+                          ::testing::ValuesIn(kVP8InvalidFileTests));
+#endif  // CONFIG_VP8_DECODER
+
 #if CONFIG_VP9_DECODER
 const DecodeParam kVP9InvalidFileTests[] = {
  { 1, "invalid-vp90-02-v2.webm" },
 #if CONFIG_VP9_HIGHBITDEPTH
  { 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" },
+  { 1,
+    "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-."
+    "ivf" },
 #endif
  { 1, "invalid-vp90-03-v3.webm" },
  { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" },
@ -144,6 +157,7 @@ const DecodeParam kVP9InvalidFileTests[] = {
  { 1, "invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf" },
  { 1,
    "invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf" },
+  { 1, "invalid-crbug-667044.webm" },
 };

 VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
@ -163,12 +177,12 @@ class InvalidFileInvalidPeekTest : public InvalidFileTest {
 TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); }

 #if CONFIG_VP8_DECODER
-const DecodeParam kVP8InvalidFileTests[] = {
+const DecodeParam kVP8InvalidPeekTests[] = {
  { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" },
 };

 VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
-                          ::testing::ValuesIn(kVP8InvalidFileTests));
+                          ::testing::ValuesIn(kVP8InvalidPeekTests));
 #endif  // CONFIG_VP8_DECODER

 #if CONFIG_VP9_DECODER
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@ -47,8 +47,8 @@ class IVFVideoSource : public CompressedVideoSource {

  virtual void Begin() {
    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-                                     << file_name_;
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;

    // Read file header
    uint8_t file_hdr[kIvfFileHdrSize];
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@ -68,7 +68,9 @@ TEST_P(KeyframeTest, TestRandomVideoSource) {

  // In realtime mode - auto placed keyframes are exceedingly rare,  don't
  // bother with this check   if(GetParam() > 0)
-  if (GET_PARAM(1) > 0) EXPECT_GT(kf_count_, 1);
+  if (GET_PARAM(1) > 0) {
+    EXPECT_GT(kf_count_, 1);
+  }
 }

 TEST_P(KeyframeTest, TestDisableKeyframes) {
@ -128,15 +130,16 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {

  // In realtime mode - auto placed keyframes are exceedingly rare,  don't
  // bother with this check
-  if (GET_PARAM(1) > 0)
+  if (GET_PARAM(1) > 0) {
    EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes ";
+  }

  // Verify that keyframes match the file keyframes in the file.
  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
       iter != kf_pts_list_.end(); ++iter) {
    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
-      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
-                                     << *iter;
+      EXPECT_EQ(0, (*iter - 1) % 30)
+          << "Unexpected keyframe at frame " << *iter;
    else
      EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter;
  }
--- a/test/level_test.cc
+++ b/test/level_test.cc
@ -66,6 +66,36 @@ class LevelTest
  int level_;
 };

+TEST_P(LevelTest, TestTargetLevel11Large) {
+  ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime);
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       60);
+  target_level_ = 11;
+  cfg_.rc_target_bitrate = 150;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(target_level_, level_);
+}
+
+TEST_P(LevelTest, TestTargetLevel20Large) {
+  ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime);
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 60);
+  target_level_ = 20;
+  cfg_.rc_target_bitrate = 1200;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(target_level_, level_);
+}
+
+TEST_P(LevelTest, TestTargetLevel31Large) {
+  ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime);
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30,
+                                       1, 0, 60);
+  target_level_ = 31;
+  cfg_.rc_target_bitrate = 8000;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(target_level_, level_);
+}
+
 // Test for keeping level stats only
 TEST_P(LevelTest, TestTargetLevel0) {
  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
@ -73,11 +103,11 @@ TEST_P(LevelTest, TestTargetLevel0) {
  target_level_ = 0;
  min_gf_internal_ = 4;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(11, level_);
+  ASSERT_GE(11, level_);

  cfg_.rc_target_bitrate = 1600;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(20, level_);
+  ASSERT_GE(20, level_);
 }

 // Test for level control being turned off
@ -94,12 +124,13 @@ TEST_P(LevelTest, TestTargetLevelApi) {
  vpx_codec_ctx_t enc;
  vpx_codec_enc_cfg_t cfg;
  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0));
+  cfg.rc_target_bitrate = 100;
  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, codec, &cfg, 0));
  for (int level = 0; level <= 256; ++level) {
    if (level == 10 || level == 11 || level == 20 || level == 21 ||
        level == 30 || level == 31 || level == 40 || level == 41 ||
        level == 50 || level == 51 || level == 52 || level == 60 ||
-        level == 61 || level == 62 || level == 0 || level == 255)
+        level == 61 || level == 62 || level == 0 || level == 1 || level == 255)
      EXPECT_EQ(VPX_CODEC_OK,
                vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
    else
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@ -56,8 +56,8 @@ typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0,
                               const uint8_t *thresh1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH

-typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
-typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
+typedef ::testing::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
+typedef ::testing::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;

 void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
               const int mask, const int32_t p, const int i) {
@ -114,6 +114,18 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
  }
 }

+uint8_t GetOuterThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(3 * MAX_LOOP_FILTER + 5));
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1));
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4);
+}
+
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
 public:
  virtual ~Loop8Test6Param() {}
@ -162,15 +174,15 @@ TEST_P(Loop8Test6Param, OperationCheck) {
  int first_failure = -1;
  for (int i = 0; i < count_test_block; ++i) {
    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@ -221,15 +233,15 @@ TEST_P(Loop8Test6Param, ValueCheck) {

  for (int i = 0; i < count_test_block; ++i) {
    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@ -271,27 +283,27 @@ TEST_P(Loop8Test9Param, OperationCheck) {
  int first_failure = -1;
  for (int i = 0; i < count_test_block; ++i) {
    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@ -334,27 +346,27 @@ TEST_P(Loop8Test9Param, ValueCheck) {
  int first_failure = -1;
  for (int i = 0; i < count_test_block; ++i) {
    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    tmp = GetOuterThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
    DECLARE_ALIGNED(16, const uint8_t,
                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@ -390,7 +402,7 @@ TEST_P(Loop8Test9Param, ValueCheck) {
      << "First failed at test case " << first_failure;
 }

-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@ -107,10 +107,10 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
      int min_ref, max_ref, min, max;
      reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
-      EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
-                              << " and b_stride = " << b_stride;
-      EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
-                              << " and b_stride = " << b_stride;
+      EXPECT_EQ(max_ref, max)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+      EXPECT_EQ(min_ref, min)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
    }
  }
 }
@ -127,4 +127,9 @@ INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest,
                        ::testing::Values(&vpx_minmax_8x8_neon));
 #endif

+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_msa));
+#endif
+
 }  // namespace
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@ -7,21 +7,40 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
+#include <limits.h>
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"

+using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
+
 typedef void (*VpxPostProcDownAndAcrossMbRowFunc)(
    unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line,
    int dst_pixels_per_line, int cols, unsigned char *flimit, int size);

+typedef void (*VpxMbPostProcAcrossIpFunc)(unsigned char *src, int pitch,
+                                          int rows, int cols, int flimit);
+
+typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows,
+                                      int cols, int flimit);
+
 namespace {

+// Compute the filter level used in post proc from the loop filter strength
+int q2mbl(int x) {
+  if (x < 20) x = 20;
+
+  x = 50 + (x - 50) * 10 / 8;
+  return x * x / 3;
+}
+
 class VpxPostProcDownAndAcrossMbRowTest
    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
 public:
@ -37,25 +56,16 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
  const int block_height = 16;

  // 5-tap filter needs 2 padding rows above and below the block in the input.
-  const int input_width = block_width;
-  const int input_height = block_height + 4;
-  const int input_stride = input_width;
-  const int input_size = input_width * input_height;
+  Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
+  ASSERT_TRUE(src_image.Init());

  // Filter extends output block by 8 samples at left and right edges.
-  const int output_width = block_width + 16;
-  const int output_height = block_height;
-  const int output_stride = output_width;
-  const int output_size = output_width * output_height;
+  // Though the left padding is only 8 bytes, the assembly code tries to
+  // read 16 bytes before the pointer.
+  Buffer<uint8_t> dst_image =
+      Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
+  ASSERT_TRUE(dst_image.Init());

-  uint8_t *const src_image =
-      reinterpret_cast<uint8_t *>(vpx_calloc(input_size, 1));
-  uint8_t *const dst_image =
-      reinterpret_cast<uint8_t *>(vpx_calloc(output_size, 1));
-
-  // Pointers to top-left pixel of block in the input and output images.
-  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
-  uint8_t *const dst_image_ptr = dst_image + 8;
  uint8_t *const flimits =
      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
  (void)memset(flimits, 255, block_width);
@ -63,53 +73,412 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
  // Initialize pixels in the input:
  //   block pixels to value 1,
  //   border pixels to value 10.
-  (void)memset(src_image, 10, input_size);
-  uint8_t *pixel_ptr = src_image_ptr;
-  for (int i = 0; i < block_height; ++i) {
-    for (int j = 0; j < block_width; ++j) {
-      pixel_ptr[j] = 1;
-    }
-    pixel_ptr += input_stride;
-  }
+  src_image.SetPadding(10);
+  src_image.Set(1);

  // Initialize pixels in the output to 99.
-  (void)memset(dst_image, 99, output_size);
+  dst_image.Set(99);

-  ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr,
-                                      input_stride, output_stride, block_width,
-                                      flimits, 16));
+  ASM_REGISTER_STATE_CHECK(GetParam()(
+      src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(),
+      dst_image.stride(), block_width, flimits, 16));

  static const uint8_t kExpectedOutput[block_height] = {
    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
  };

-  pixel_ptr = dst_image_ptr;
+  uint8_t *pixel_ptr = dst_image.TopLeftPixel();
  for (int i = 0; i < block_height; ++i) {
    for (int j = 0; j < block_width; ++j) {
-      ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]);
+      ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j])
+          << "at (" << i << ", " << j << ")";
    }
-    pixel_ptr += output_stride;
+    pixel_ptr += dst_image.stride();
  }

-  vpx_free(src_image);
-  vpx_free(dst_image);
  vpx_free(flimits);
 };

+TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
+  // Size of the underlying data block that will be filtered.
+  // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V
+  // blocks are always a multiple of 8 wide and exactly 8 high.
+  const int block_width = 136;
+  const int block_height = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
+  Buffer<uint8_t> src_image =
+      Buffer<uint8_t>(block_width, block_height, 2, 2, 10, 2);
+  ASSERT_TRUE(src_image.Init());
+
+  // Filter extends output block by 8 samples at left and right edges.
+  // Though the left padding is only 8 bytes, there is 'above' padding as well
+  // so when the assembly code tries to read 16 bytes before the pointer it is
+  // not a problem.
+  // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
+  Buffer<uint8_t> dst_image =
+      Buffer<uint8_t>(block_width, block_height, 8, 8, 16, 8);
+  ASSERT_TRUE(dst_image.Init());
+  Buffer<uint8_t> dst_image_ref = Buffer<uint8_t>(block_width, block_height, 8);
+  ASSERT_TRUE(dst_image_ref.Init());
+
+  // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock
+  // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so
+  // it must be padded out.
+  const int flimits_width = block_width % 16 ? block_width + 8 : block_width;
+  uint8_t *const flimits =
+      reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
+
+  ACMRandom rnd;
+  rnd.Reset(ACMRandom::DeterministicSeed());
+  // Initialize pixels in the input:
+  //   block pixels to random values.
+  //   border pixels to value 10.
+  src_image.SetPadding(10);
+  src_image.Set(&rnd, &ACMRandom::Rand8);
+
+  for (int blocks = 0; blocks < block_width; blocks += 8) {
+    (void)memset(flimits, 0, sizeof(*flimits) * flimits_width);
+
+    for (int f = 0; f < 255; f++) {
+      (void)memset(flimits + blocks, f, sizeof(*flimits) * 8);
+
+      dst_image.Set(0);
+      dst_image_ref.Set(0);
+
+      vpx_post_proc_down_and_across_mb_row_c(
+          src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(),
+          src_image.stride(), dst_image_ref.stride(), block_width, flimits,
+          block_height);
+      ASM_REGISTER_STATE_CHECK(
+          GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(),
+                     src_image.stride(), dst_image.stride(), block_width,
+                     flimits, block_height));
+
+      ASSERT_TRUE(dst_image.CheckValues(dst_image_ref));
+    }
+  }
+
+  vpx_free(flimits);
+}
+
+class VpxMbPostProcAcrossIpTest
+    : public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
+ public:
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void SetCols(unsigned char *s, int rows, int cols, int src_width) {
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < cols; c++) {
+        s[c] = c;
+      }
+      s += src_width;
+    }
+  }
+
+  void RunComparison(const unsigned char *expected_output, unsigned char *src_c,
+                     int rows, int cols, int src_pitch) {
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < cols; c++) {
+        ASSERT_EQ(expected_output[c], src_c[c])
+            << "at (" << r << ", " << c << ")";
+      }
+      src_c += src_pitch;
+    }
+  }
+
+  void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width,
+                      int filter_level, const unsigned char *expected_output) {
+    ASM_REGISTER_STATE_CHECK(
+        GetParam()(s, src_width, rows, cols, filter_level));
+    RunComparison(expected_output, s, rows, cols, src_width);
+  }
+};
+
+TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) {
+  const int rows = 16;
+  const int cols = 16;
+
+  Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(src.Init());
+  src.SetPadding(10);
+  SetCols(src.TopLeftPixel(), rows, cols, src.stride());
+
+  Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols, rows, 0);
+  ASSERT_TRUE(expected_output.Init());
+  SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride());
+
+  RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0),
+                 expected_output.TopLeftPixel());
+}
+
+TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) {
+  const int rows = 16;
+  const int cols = 16;
+
+  Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(src.Init());
+  src.SetPadding(10);
+  SetCols(src.TopLeftPixel(), rows, cols, src.stride());
+
+  static const unsigned char kExpectedOutput[cols] = {
+    2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13
+  };
+
+  RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(70),
+                 kExpectedOutput);
+}
+
+TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) {
+  const int rows = 16;
+  const int cols = 16;
+
+  Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(src.Init());
+  src.SetPadding(10);
+  SetCols(src.TopLeftPixel(), rows, cols, src.stride());
+
+  static const unsigned char kExpectedOutput[cols] = {
+    2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13
+  };
+
+  RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), INT_MAX,
+                 kExpectedOutput);
+
+  SetCols(src.TopLeftPixel(), rows, cols, src.stride());
+
+  RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(100),
+                 kExpectedOutput);
+}
+
+TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) {
+  const int rows = 16;
+  const int cols = 16;
+
+  Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(c_mem.Init());
+  Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(asm_mem.Init());
+
+  // When level >= 100, the filter behaves the same as the level = INT_MAX
+  // When level < 20, it behaves the same as the level = 0
+  for (int level = 0; level < 100; level++) {
+    c_mem.SetPadding(10);
+    asm_mem.SetPadding(10);
+    SetCols(c_mem.TopLeftPixel(), rows, cols, c_mem.stride());
+    SetCols(asm_mem.TopLeftPixel(), rows, cols, asm_mem.stride());
+
+    vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows,
+                                cols, q2mbl(level));
+    ASM_REGISTER_STATE_CHECK(GetParam()(
+        asm_mem.TopLeftPixel(), asm_mem.stride(), rows, cols, q2mbl(level)));
+
+    ASSERT_TRUE(asm_mem.CheckValues(c_mem));
+  }
+}
+
+class VpxMbPostProcDownTest
+    : public ::testing::TestWithParam<VpxMbPostProcDownFunc> {
+ public:
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void SetRows(unsigned char *src_c, int rows, int cols, int src_width) {
+    for (int r = 0; r < rows; r++) {
+      memset(src_c, r, cols);
+      src_c += src_width;
+    }
+  }
+
+  void RunComparison(const unsigned char *expected_output, unsigned char *src_c,
+                     int rows, int cols, int src_pitch) {
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < cols; c++) {
+        ASSERT_EQ(expected_output[r * rows + c], src_c[c])
+            << "at (" << r << ", " << c << ")";
+      }
+      src_c += src_pitch;
+    }
+  }
+
+  void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width,
+                      int filter_level, const unsigned char *expected_output) {
+    ASM_REGISTER_STATE_CHECK(
+        GetParam()(s, src_width, rows, cols, filter_level));
+    RunComparison(expected_output, s, rows, cols, src_width);
+  }
+};
+
+TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) {
+  const int rows = 16;
+  const int cols = 16;
+
+  Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_c.Init());
+  src_c.SetPadding(10);
+
+  SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
+
+  static const unsigned char kExpectedOutput[rows * cols] = {
+    2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  2,
+    2,  3,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  3,  4,  4,  3,  3,  3,
+    4,  4,  3,  4,  4,  3,  3,  4,  5,  4,  4,  4,  4,  4,  4,  4,  5,  4,  4,
+    4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,
+    7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  8,  9,  9,  8,  8,  8,  9,
+    9,  8,  9,  9,  8,  8,  8,  9,  9,  10, 10, 9,  9,  9,  10, 10, 9,  10, 10,
+    9,  9,  9,  10, 10, 10, 11, 10, 10, 10, 11, 10, 11, 10, 11, 10, 10, 10, 11,
+    10, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 11, 11, 11, 11, 12, 11, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12,
+    13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13,
+    13, 13, 13, 13, 14, 13, 13, 13, 13
+  };
+
+  RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), INT_MAX,
+                 kExpectedOutput);
+
+  src_c.SetPadding(10);
+  SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
+  RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(100),
+                 kExpectedOutput);
+}
+
+TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) {
+  const int rows = 16;
+  const int cols = 16;
+
+  Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_c.Init());
+  src_c.SetPadding(10);
+
+  SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
+
+  static const unsigned char kExpectedOutput[rows * cols] = {
+    2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  2,
+    2,  3,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,
+    7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,
+    9,  9,  9,  9,  9,  9,  9,  9,  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 13, 12,
+    13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13,
+    13, 13, 13, 13, 14, 13, 13, 13, 13
+  };
+
+  RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(70),
+                 kExpectedOutput);
+}
+
+TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) {
+  const int rows = 16;
+  const int cols = 16;
+
+  Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_c.Init());
+  src_c.SetPadding(10);
+
+  SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
+
+  unsigned char *expected_output = new unsigned char[rows * cols];
+  ASSERT_TRUE(expected_output != NULL);
+  SetRows(expected_output, rows, cols, cols);
+
+  RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(0),
+                 expected_output);
+
+  delete[] expected_output;
+}
+
+TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) {
+  const int rows = 16;
+  const int cols = 16;
+
+  ACMRandom rnd;
+  rnd.Reset(ACMRandom::DeterministicSeed());
+
+  Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_c.Init());
+  Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_asm.Init());
+
+  for (int level = 0; level < 100; level++) {
+    src_c.SetPadding(10);
+    src_asm.SetPadding(10);
+    src_c.Set(&rnd, &ACMRandom::Rand8);
+    src_asm.CopyFrom(src_c);
+
+    vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols,
+                           q2mbl(level));
+    ASM_REGISTER_STATE_CHECK(GetParam()(
+        src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level)));
+    ASSERT_TRUE(src_asm.CheckValues(src_c));
+
+    src_c.SetPadding(10);
+    src_asm.SetPadding(10);
+    src_c.Set(&rnd, &ACMRandom::Rand8Extremes);
+    src_asm.CopyFrom(src_c);
+
+    vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols,
+                           q2mbl(level));
+    ASM_REGISTER_STATE_CHECK(GetParam()(
+        src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level)));
+    ASSERT_TRUE(src_asm.CheckValues(src_c));
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(
    C, VpxPostProcDownAndAcrossMbRowTest,
    ::testing::Values(vpx_post_proc_down_and_across_mb_row_c));

+INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcAcrossIpTest,
+                        ::testing::Values(vpx_mbpost_proc_across_ip_c));
+
+INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcDownTest,
+                        ::testing::Values(vpx_mbpost_proc_down_c));
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
    SSE2, VpxPostProcDownAndAcrossMbRowTest,
    ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
-#endif
+
+INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcAcrossIpTest,
+                        ::testing::Values(vpx_mbpost_proc_across_ip_sse2));
+
+INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest,
+                        ::testing::Values(vpx_mbpost_proc_down_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, VpxPostProcDownAndAcrossMbRowTest,
+    ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest,
+                        ::testing::Values(vpx_mbpost_proc_across_ip_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcDownTest,
+                        ::testing::Values(vpx_mbpost_proc_down_neon));
+#endif  // HAVE_NEON

 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
    MSA, VpxPostProcDownAndAcrossMbRowTest,
    ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
-#endif
+
+INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcAcrossIpTest,
+                        ::testing::Values(vpx_mbpost_proc_across_ip_msa));
+
+INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest,
+                        ::testing::Values(vpx_mbpost_proc_down_msa));
+#endif  // HAVE_MSA

 }  // namespace
--- a/test/predict_test.cc
+++ b/test/predict_test.cc
@ -24,14 +24,14 @@

 namespace {

+using ::testing::make_tuple;
 using libvpx_test::ACMRandom;
-using std::tr1::make_tuple;

 typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line,
                            int xoffset, int yoffset, uint8_t *dst_ptr,
                            int dst_pitch);

-typedef std::tr1::tuple<int, int, PredictFunc> PredictParam;
+typedef ::testing::tuple<int, int, PredictFunc> PredictParam;

 class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
 public:
@ -324,6 +324,15 @@ INSTANTIATE_TEST_CASE_P(
                      make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
 #endif

+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(
+    MMI, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi),
+                      make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi)));
+#endif
+
 class BilinearPredictTest : public PredictTestBase {};

 TEST_P(BilinearPredictTest, TestWithRandomData) {
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@ -33,10 +33,10 @@ const int kNumBlockEntries = 16;

 typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d);

-typedef std::tr1::tuple<VP8Quantize, VP8Quantize> VP8QuantizeParam;
+typedef ::testing::tuple<VP8Quantize, VP8Quantize> VP8QuantizeParam;

+using ::testing::make_tuple;
 using libvpx_test::ACMRandom;
-using std::tr1::make_tuple;

 // Create and populate a VP8_COMP instance which has a complete set of
 // quantization inputs as well as a second MACROBLOCKD for output.
@ -200,4 +200,12 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
        make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
 #endif  // HAVE_MSA
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(
+    MMI, QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c),
+        make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c)));
+#endif  // HAVE_MMI
 }  // namespace
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@ -28,11 +28,13 @@
 //   See platform implementations of RegisterStateCheckXXX for details.
 //

-#if defined(_WIN64)
+#if defined(_WIN64) && ARCH_X86_64

 #undef NOMINMAX
 #define NOMINMAX
+#ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
+#endif
 #include <windows.h>
 #include <winnt.h>

@ -111,8 +113,8 @@ class RegisterStateCheck {
    int64_t post_store[8];
    vpx_push_neon(post_store);
    for (int i = 0; i < 8; ++i) {
-      EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8
-                                              << " has been modified";
+      EXPECT_EQ(pre_store_[i], post_store[i])
+          << "d" << i + 8 << " has been modified";
    }
  }

@ -136,7 +138,7 @@ class RegisterStateCheck {};

 }  // namespace libvpx_test

-#endif  // _WIN64
+#endif  // _WIN64 && ARCH_X86_64

 #if ARCH_X86 || ARCH_X86_64
 #if defined(__GNUC__)
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@ -277,12 +277,29 @@ class ResizeTest
    SetMode(GET_PARAM(1));
  }

+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
+    ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
+    encode_frame_width_.push_back(pkt->data.frame.width[0]);
+    encode_frame_height_.push_back(pkt->data.frame.height[0]);
+  }
+
+  unsigned int GetFrameWidth(size_t idx) const {
+    return encode_frame_width_[idx];
+  }
+
+  unsigned int GetFrameHeight(size_t idx) const {
+    return encode_frame_height_[idx];
+  }
+
  virtual void DecompressedFrameHook(const vpx_image_t &img,
                                     vpx_codec_pts_t pts) {
    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
  }

  std::vector<FrameInfo> frame_info_list_;
+  std::vector<unsigned int> encode_frame_width_;
+  std::vector<unsigned int> encode_frame_height_;
 };

 TEST_P(ResizeTest, TestExternalResizeWorks) {
@ -296,12 +313,15 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
    const unsigned int frame = static_cast<unsigned>(info->pts);
    unsigned int expected_w;
    unsigned int expected_h;
+    const size_t idx = info - frame_info_list_.begin();
+    ASSERT_EQ(info->w, GetFrameWidth(idx));
+    ASSERT_EQ(info->h, GetFrameHeight(idx));
    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
                        &expected_h, 0);
-    EXPECT_EQ(expected_w, info->w) << "Frame " << frame
-                                   << " had unexpected width";
-    EXPECT_EQ(expected_h, info->h) << "Frame " << frame
-                                   << " had unexpected height";
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
  }
 }

@ -464,8 +484,23 @@ class ResizeRealtimeTest
    ++mismatch_nframes_;
  }

+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
+    ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
+    encode_frame_width_.push_back(pkt->data.frame.width[0]);
+    encode_frame_height_.push_back(pkt->data.frame.height[0]);
+  }
+
  unsigned int GetMismatchFrames() { return mismatch_nframes_; }

+  unsigned int GetFrameWidth(size_t idx) const {
+    return encode_frame_width_[idx];
+  }
+
+  unsigned int GetFrameHeight(size_t idx) const {
+    return encode_frame_height_[idx];
+  }
+
  void DefaultConfig() {
    cfg_.rc_buf_initial_sz = 500;
    cfg_.rc_buf_optimal_sz = 600;
@ -493,6 +528,8 @@ class ResizeRealtimeTest
  bool change_bitrate_;
  double mismatch_psnr_;
  int mismatch_nframes_;
+  std::vector<unsigned int> encode_frame_width_;
+  std::vector<unsigned int> encode_frame_height_;
 };

 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
@ -513,10 +550,10 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
    unsigned int expected_h;
    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
                        &expected_h, 1);
-    EXPECT_EQ(expected_w, info->w) << "Frame " << frame
-                                   << " had unexpected width";
-    EXPECT_EQ(expected_h, info->h) << "Frame " << frame
-                                   << " had unexpected height";
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
  }
 }
@ -582,6 +619,9 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
  int resize_count = 0;
  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
       info != frame_info_list_.end(); ++info) {
+    const size_t idx = info - frame_info_list_.begin();
+    ASSERT_EQ(info->w, GetFrameWidth(idx));
+    ASSERT_EQ(info->h, GetFrameHeight(idx));
    if (info->w != last_w || info->h != last_h) {
      resize_count++;
      if (resize_count == 1) {
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@ -23,6 +23,7 @@
 #include "vpx/vpx_codec.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"

 template <typename Function>
 struct TestParams {
@ -84,7 +85,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
    }
    mask_ = (1 << bit_depth_) - 1;
-    source_stride_ = (params_.width + 31) & ~31;
+    source_stride_ = (params_.width + 63) & ~63;
    reference_stride_ = params_.width * 2;
    rnd_.Reset(ACMRandom::DeterministicSeed());
  }
@ -108,7 +109,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {

 protected:
  // Handle blocks up to 4 blocks 64x64 with stride up to 128
-  static const int kDataAlignment = 16;
+  static const int kDataAlignment = 32;
  static const int kDataBlockSize = 64 * 128;
  static const int kDataBufferSize = 4 * kDataBlockSize;

@ -463,6 +464,38 @@ TEST_P(SADx4Test, SrcAlignedByWidth) {
  source_data_ = tmp_source_data;
 }

+TEST_P(SADx4Test, DISABLED_Speed) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height);
+  uint32_t reference_sad[4], exp_sad[4];
+  vpx_usec_timer timer;
+
+  memset(reference_sad, 0, sizeof(reference_sad));
+  SADs(exp_sad);
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    for (int block = 0; block < 4; ++block) {
+      reference_sad[block] = ReferenceSAD(block);
+    }
+  }
+  vpx_usec_timer_mark(&timer);
+  for (int block = 0; block < 4; ++block) {
+    EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block;
+  }
+  const int elapsed_time =
+      static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+  printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height,
+         bit_depth_, elapsed_time);
+
+  reference_stride_ = tmp_stride;
+}
+
 //------------------------------------------------------------------------------
 // C functions
 const SadMxNParam c_tests[] = {
@ -644,19 +677,50 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 #if HAVE_NEON
 const SadMxNParam neon_tests[] = {
  SadMxNParam(64, 64, &vpx_sad64x64_neon),
+  SadMxNParam(64, 32, &vpx_sad64x32_neon),
  SadMxNParam(32, 32, &vpx_sad32x32_neon),
+  SadMxNParam(16, 32, &vpx_sad16x32_neon),
  SadMxNParam(16, 16, &vpx_sad16x16_neon),
  SadMxNParam(16, 8, &vpx_sad16x8_neon),
  SadMxNParam(8, 16, &vpx_sad8x16_neon),
  SadMxNParam(8, 8, &vpx_sad8x8_neon),
+  SadMxNParam(8, 4, &vpx_sad8x4_neon),
+  SadMxNParam(4, 8, &vpx_sad4x8_neon),
  SadMxNParam(4, 4, &vpx_sad4x4_neon),
 };
 INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));

+const SadMxNAvgParam avg_neon_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon),
+  SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon),
+  SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon),
+  SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon),
+  SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon),
+  SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon),
+  SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon),
+  SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon),
+  SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon),
+  SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon),
+  SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon),
+};
+INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
+
 const SadMxNx4Param x4d_neon_tests[] = {
  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon),
  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon),
  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon),
+  SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon),
+  SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon),
+  SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon),
+  SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon),
+  SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon),
 };
 INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 #endif  // HAVE_NEON
@ -865,6 +929,14 @@ const SadMxNx4Param x4d_avx2_tests[] = {
 INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 #endif  // HAVE_AVX2

+#if HAVE_AVX512
+const SadMxNx4Param x4d_avx512_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512),
+};
+INSTANTIATE_TEST_CASE_P(AVX512, SADx4Test,
+                        ::testing::ValuesIn(x4d_avx512_tests));
+#endif  // HAVE_AVX512
+
 //------------------------------------------------------------------------------
 // MIPS functions
 #if HAVE_MSA
@ -920,4 +992,98 @@ const SadMxNx4Param x4d_msa_tests[] = {
 INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
 #endif  // HAVE_MSA

+//------------------------------------------------------------------------------
+// VSX functions
+#if HAVE_VSX
+const SadMxNParam vsx_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_vsx),
+  SadMxNParam(64, 32, &vpx_sad64x32_vsx),
+  SadMxNParam(32, 64, &vpx_sad32x64_vsx),
+  SadMxNParam(32, 32, &vpx_sad32x32_vsx),
+  SadMxNParam(32, 16, &vpx_sad32x16_vsx),
+  SadMxNParam(16, 32, &vpx_sad16x32_vsx),
+  SadMxNParam(16, 16, &vpx_sad16x16_vsx),
+  SadMxNParam(16, 8, &vpx_sad16x8_vsx),
+};
+INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests));
+
+const SadMxNAvgParam avg_vsx_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx),
+  SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx),
+  SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx),
+  SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx),
+  SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx),
+  SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx),
+};
+INSTANTIATE_TEST_CASE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests));
+
+const SadMxNx4Param x4d_vsx_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx),
+};
+INSTANTIATE_TEST_CASE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests));
+#endif  // HAVE_VSX
+
+//------------------------------------------------------------------------------
+// Loongson functions
+#if HAVE_MMI
+const SadMxNParam mmi_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_mmi),
+  SadMxNParam(64, 32, &vpx_sad64x32_mmi),
+  SadMxNParam(32, 64, &vpx_sad32x64_mmi),
+  SadMxNParam(32, 32, &vpx_sad32x32_mmi),
+  SadMxNParam(32, 16, &vpx_sad32x16_mmi),
+  SadMxNParam(16, 32, &vpx_sad16x32_mmi),
+  SadMxNParam(16, 16, &vpx_sad16x16_mmi),
+  SadMxNParam(16, 8, &vpx_sad16x8_mmi),
+  SadMxNParam(8, 16, &vpx_sad8x16_mmi),
+  SadMxNParam(8, 8, &vpx_sad8x8_mmi),
+  SadMxNParam(8, 4, &vpx_sad8x4_mmi),
+  SadMxNParam(4, 8, &vpx_sad4x8_mmi),
+  SadMxNParam(4, 4, &vpx_sad4x4_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests));
+
+const SadMxNAvgParam avg_mmi_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi),
+  SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi),
+  SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi),
+  SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi),
+  SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi),
+  SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi),
+  SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi),
+  SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi),
+  SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi),
+  SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi),
+  SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests));
+
+const SadMxNx4Param x4d_mmi_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi),
+  SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi),
+  SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi),
+  SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi),
+  SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi),
+  SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests));
+#endif  // HAVE_MMI
 }  // namespace
--- a/test/set_roi.cc
+++ b/test/set_roi.cc
@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) {
      if (deltas_valid != roi_retval) break;
    }

-    // Test that we report and error if cyclic refresh is enabled.
-    cpi.cyclic_refresh_mode_enabled = 1;
-    roi_retval =
-        vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols,
-                       delta_q, delta_lf, threshold);
-    EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error";
-    cpi.cyclic_refresh_mode_enabled = 0;
-
    // Test invalid number of rows or colums.
    roi_retval =
        vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1,
--- a/test/stress.sh
+++ b/test/stress.sh
@ -0,0 +1,169 @@
+#!/bin/sh
+##
+##  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file performs a stress test. It runs (STRESS_ONEPASS_MAX_JOBS,
+##  default=5) one, (STRESS_TWOPASS_MAX_JOBS, default=5) two pass &
+##  (STRESS_RT_MAX_JOBS, default=5) encodes and (STRESS_<codec>_DECODE_MAX_JOBS,
+##  default=30) decodes in parallel.
+
+. $(dirname $0)/tools_common.sh
+
+YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv"
+VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm"
+VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm"
+DATA_URL="http://downloads.webmproject.org/test_data/libvpx/"
+SHA1_FILE="$(dirname $0)/test-data.sha1"
+
+# Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is
+# cribbed from libs.mk.
+[ -x "$(which sha1sum)" ] && sha1sum=sha1sum
+[ -x "$(which shasum)" ] && sha1sum=shasum
+[ -x "$(which sha1)" ] && sha1sum=sha1
+
+# Download a file from the url and check its sha1sum.
+download_and_check_file() {
+  # Get the file from the file path.
+  local readonly root="${1#${LIBVPX_TEST_DATA_PATH}/}"
+
+  # Download the file using curl. Trap to insure non partial file.
+  (trap "rm -f $1" INT TERM \
+    && eval "curl --retry 1 -L -o $1 ${DATA_URL}${root} ${devnull}")
+
+  # Check the sha1 sum of the file.
+  if [ -n "${sha1sum}" ]; then
+    set -e
+    grep ${root} ${SHA1_FILE} \
+      | (cd ${LIBVPX_TEST_DATA_PATH}; ${sha1sum} -c);
+  fi
+}
+
+# Environment check: Make sure input is available.
+stress_verify_environment() {
+  if [ ! -e "${SHA1_FILE}" ] ; then
+    echo "Missing ${SHA1_FILE}"
+    return 1
+  fi
+  for file in "${YUV}" "${VP8}" "${VP9}"; do
+    if [ ! -e "${file}" ] ; then
+      download_and_check_file "${file}"
+    fi
+  done
+  if [ ! -e "${YUV}" ] || [ ! -e "${VP8}" ] || [ ! -e "${VP9}" ] ; then
+    elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path vpxenc)" ]; then
+    elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path vpxdec)" ]; then
+    elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# This function runs tests on libvpx that run multiple encodes and decodes
+# in parallel in hopes of catching synchronization and/or threading issues.
+stress() {
+  local readonly decoder="$(vpx_tool_path vpxdec)"
+  local readonly encoder="$(vpx_tool_path vpxenc)"
+  local readonly codec="$1"
+  local readonly webm="$2"
+  local readonly decode_count="$3"
+  local readonly threads="$4"
+  local readonly enc_args="$5"
+  local pids=""
+  local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5}
+  local onepass_max_jobs=${STRESS_ONEPASS_MAX_JOBS:-5}
+  local twopass_max_jobs=${STRESS_TWOPASS_MAX_JOBS:-5}
+
+  # Enable job control, so we can run multiple processes.
+  set -m
+
+  # Start $onepass_max_jobs encode jobs in parallel.
+  for i in $(seq ${onepass_max_jobs}); do
+    bitrate=$(($i * 20 + 300))
+    eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \
+      "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=1" \
+      "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.1pass.webm" \
+      "${enc_args}" ${devnull} &
+    pids="${pids} $!"
+  done
+
+  # Start $twopass_max_jobs encode jobs in parallel.
+  for i in $(seq ${twopass_max_jobs}); do
+    bitrate=$(($i * 20 + 300))
+    eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \
+      "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=2" \
+      "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.2pass.webm" \
+      "${enc_args}" ${devnull} &
+    pids="${pids} $!"
+  done
+
+  # Start $rt_max_jobs rt encode jobs in parallel.
+  for i in $(seq ${rt_max_jobs}); do
+    bitrate=$(($i * 20 + 300))
+    eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \
+      "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal " \
+      "--target-bitrate=${bitrate} --lag-in-frames=0 --error-resilient=1" \
+      "--kf-min-dist=3000 --kf-max-dist=3000 --cpu-used=-6 --static-thresh=1" \
+      "--end-usage=cbr --min-q=2 --max-q=56 --undershoot-pct=100" \
+      "--overshoot-pct=15 --buf-sz=1000 --buf-initial-sz=500" \
+      "--buf-optimal-sz=600 --max-intra-rate=900 --resize-allowed=0" \
+      "--drop-frame=0 --passes=1 --rt --noise-sensitivity=4" \
+      "-o ${VPX_TEST_OUTPUT_DIR}/${i}.rt.webm" ${devnull} &
+    pids="${pids} $!"
+  done
+
+  # Start $decode_count decode jobs in parallel.
+  for i in $(seq "${decode_count}"); do
+    eval "${decoder}" "-t ${threads}" "${webm}" "--noblit" ${devnull} &
+    pids="${pids} $!"
+  done
+
+  # Wait for all parallel jobs to finish.
+  fail=0
+  for job in "${pids}"; do
+    wait $job || fail=$(($fail + 1))
+  done
+  return $fail
+}
+
+vp8_stress_test() {
+  local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40}
+  if [ "$(vp8_decode_available)" = "yes" -a \
+       "$(vp8_encode_available)" = "yes" ]; then
+    stress vp8 "${VP8}" "${vp8_max_jobs}" 4
+  fi
+}
+
+vp9_stress() {
+  local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25}
+
+  if [ "$(vp9_decode_available)" = "yes" -a \
+       "$(vp9_encode_available)" = "yes" ]; then
+    stress vp9 "${VP9}" "${vp9_max_jobs}" "$@"
+  fi
+}
+
+vp9_stress_test() {
+  for threads in 4 8 100; do
+    vp9_stress "$threads" "--row-mt=0"
+  done
+}
+
+vp9_stress_test_row_mt() {
+  for threads in 4 8 100; do
+    vp9_stress "$threads" "--row-mt=1"
+  done
+}
+
+run_tests stress_verify_environment \
+  "vp8_stress_test vp9_stress_test vp9_stress_test_row_mt"
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@ -28,7 +28,7 @@ namespace {
 const int kNumIterations = 10000;

 typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);
-typedef std::tr1::tuple<SSI16Func, SSI16Func> SumSquaresParam;
+typedef ::testing::tuple<SSI16Func, SSI16Func> SumSquaresParam;

 class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {
 public:
@ -102,7 +102,7 @@ TEST_P(SumSquaresTest, ExtremeValues) {
  }
 }

-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
@ -110,4 +110,11 @@ INSTANTIATE_TEST_CASE_P(
    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
                                 &vpx_sum_squares_2d_i16_sse2)));
 #endif  // HAVE_SSE2
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, SumSquaresTest,
+    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+                                 &vpx_sum_squares_2d_i16_msa)));
+#endif  // HAVE_MSA
 }  // namespace
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@ -18,7 +18,7 @@ namespace {

 const int kTestMode = 0;

-typedef std::tr1::tuple<libvpx_test::TestMode, int> SuperframeTestParam;
+typedef ::testing::tuple<libvpx_test::TestMode, int> SuperframeTestParam;

 class SuperframeTest
    : public ::libvpx_test::EncoderTest,
@ -31,7 +31,7 @@ class SuperframeTest
  virtual void SetUp() {
    InitializeConfig();
    const SuperframeTestParam input = GET_PARAM(1);
-    const libvpx_test::TestMode mode = std::tr1::get<kTestMode>(input);
+    const libvpx_test::TestMode mode = ::testing::get<kTestMode>(input);
    SetMode(mode);
    sf_count_ = 0;
    sf_count_max_ = INT_MAX;
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@ -1,789 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/i420_video_source.h"
-
-#include "vp9/decoder/vp9_decoder.h"
-
-#include "vpx/svc_context.h"
-#include "vpx/vp8cx.h"
-#include "vpx/vpx_encoder.h"
-
-namespace {
-
-using libvpx_test::CodecFactory;
-using libvpx_test::Decoder;
-using libvpx_test::DxDataIterator;
-using libvpx_test::VP9CodecFactory;
-
-class SvcTest : public ::testing::Test {
- protected:
-  static const uint32_t kWidth = 352;
-  static const uint32_t kHeight = 288;
-
-  SvcTest()
-      : codec_iface_(0), test_file_name_("hantro_collage_w352h288.yuv"),
-        codec_initialized_(false), decoder_(0) {
-    memset(&svc_, 0, sizeof(svc_));
-    memset(&codec_, 0, sizeof(codec_));
-    memset(&codec_enc_, 0, sizeof(codec_enc_));
-  }
-
-  virtual ~SvcTest() {}
-
-  virtual void SetUp() {
-    svc_.log_level = SVC_LOG_DEBUG;
-    svc_.log_print = 0;
-
-    codec_iface_ = vpx_codec_vp9_cx();
-    const vpx_codec_err_t res =
-        vpx_codec_enc_config_default(codec_iface_, &codec_enc_, 0);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-
-    codec_enc_.g_w = kWidth;
-    codec_enc_.g_h = kHeight;
-    codec_enc_.g_timebase.num = 1;
-    codec_enc_.g_timebase.den = 60;
-    codec_enc_.kf_min_dist = 100;
-    codec_enc_.kf_max_dist = 100;
-
-    vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
-    VP9CodecFactory codec_factory;
-    decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
-
-    tile_columns_ = 0;
-    tile_rows_ = 0;
-  }
-
-  virtual void TearDown() {
-    ReleaseEncoder();
-    delete (decoder_);
-  }
-
-  void InitializeEncoder() {
-    const vpx_codec_err_t res =
-        vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-    vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4);  // Make the test faster
-    vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_);
-    vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_);
-    codec_initialized_ = true;
-  }
-
-  void ReleaseEncoder() {
-    vpx_svc_release(&svc_);
-    if (codec_initialized_) vpx_codec_destroy(&codec_);
-    codec_initialized_ = false;
-  }
-
-  void GetStatsData(std::string *const stats_buf) {
-    vpx_codec_iter_t iter = NULL;
-    const vpx_codec_cx_pkt_t *cx_pkt;
-
-    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
-      if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
-        EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
-        ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
-        stats_buf->append(static_cast<char *>(cx_pkt->data.twopass_stats.buf),
-                          cx_pkt->data.twopass_stats.sz);
-      }
-    }
-  }
-
-  void Pass1EncodeNFrames(const int n, const int layers,
-                          std::string *const stats_buf) {
-    vpx_codec_err_t res;
-
-    ASSERT_GT(n, 0);
-    ASSERT_GT(layers, 0);
-    svc_.spatial_layers = layers;
-    codec_enc_.g_pass = VPX_RC_FIRST_PASS;
-    InitializeEncoder();
-
-    libvpx_test::I420VideoSource video(
-        test_file_name_, codec_enc_.g_w, codec_enc_.g_h,
-        codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30);
-    video.Begin();
-
-    for (int i = 0; i < n; ++i) {
-      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                           video.duration(), VPX_DL_GOOD_QUALITY);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-      GetStatsData(stats_buf);
-      video.Next();
-    }
-
-    // Flush encoder and test EOS packet.
-    res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(), video.duration(),
-                         VPX_DL_GOOD_QUALITY);
-    ASSERT_EQ(VPX_CODEC_OK, res);
-    GetStatsData(stats_buf);
-
-    ReleaseEncoder();
-  }
-
-  void StoreFrames(const size_t max_frame_received,
-                   struct vpx_fixed_buf *const outputs,
-                   size_t *const frame_received) {
-    vpx_codec_iter_t iter = NULL;
-    const vpx_codec_cx_pkt_t *cx_pkt;
-
-    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
-      if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
-        const size_t frame_size = cx_pkt->data.frame.sz;
-
-        EXPECT_GT(frame_size, 0U);
-        ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
-        ASSERT_LT(*frame_received, max_frame_received);
-
-        if (*frame_received == 0)
-          EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
-
-        outputs[*frame_received].buf = malloc(frame_size + 16);
-        ASSERT_TRUE(outputs[*frame_received].buf != NULL);
-        memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
-               frame_size);
-        outputs[*frame_received].sz = frame_size;
-        ++(*frame_received);
-      }
-    }
-  }
-
-  void Pass2EncodeNFrames(std::string *const stats_buf, const int n,
-                          const int layers,
-                          struct vpx_fixed_buf *const outputs) {
-    vpx_codec_err_t res;
-    size_t frame_received = 0;
-
-    ASSERT_TRUE(outputs != NULL);
-    ASSERT_GT(n, 0);
-    ASSERT_GT(layers, 0);
-    svc_.spatial_layers = layers;
-    codec_enc_.rc_target_bitrate = 500;
-    if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {
-      ASSERT_TRUE(stats_buf != NULL);
-      ASSERT_GT(stats_buf->size(), 0U);
-      codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
-      codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
-    }
-    InitializeEncoder();
-
-    libvpx_test::I420VideoSource video(
-        test_file_name_, codec_enc_.g_w, codec_enc_.g_h,
-        codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30);
-    video.Begin();
-
-    for (int i = 0; i < n; ++i) {
-      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                           video.duration(), VPX_DL_GOOD_QUALITY);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-      StoreFrames(n, outputs, &frame_received);
-      video.Next();
-    }
-
-    // Flush encoder.
-    res = vpx_svc_encode(&svc_, &codec_, NULL, 0, video.duration(),
-                         VPX_DL_GOOD_QUALITY);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-    StoreFrames(n, outputs, &frame_received);
-
-    EXPECT_EQ(frame_received, static_cast<size_t>(n));
-
-    ReleaseEncoder();
-  }
-
-  void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
-    int decoded_frames = 0;
-    int received_frames = 0;
-
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(n, 0);
-
-    for (int i = 0; i < n; ++i) {
-      ASSERT_TRUE(inputs[i].buf != NULL);
-      ASSERT_GT(inputs[i].sz, 0U);
-      const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-          static_cast<const uint8_t *>(inputs[i].buf), inputs[i].sz);
-      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-      ++decoded_frames;
-
-      DxDataIterator dec_iter = decoder_->GetDxData();
-      while (dec_iter.Next() != NULL) {
-        ++received_frames;
-      }
-    }
-    EXPECT_EQ(decoded_frames, n);
-    EXPECT_EQ(received_frames, n);
-  }
-
-  void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
-                             const int num_super_frames,
-                             const int remained_spatial_layers) {
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(num_super_frames, 0);
-    ASSERT_GT(remained_spatial_layers, 0);
-
-    for (int i = 0; i < num_super_frames; ++i) {
-      uint32_t frame_sizes[8] = { 0 };
-      int frame_count = 0;
-      int frames_found = 0;
-      int frame;
-      ASSERT_TRUE(inputs[i].buf != NULL);
-      ASSERT_GT(inputs[i].sz, 0U);
-
-      vpx_codec_err_t res = vp9_parse_superframe_index(
-          static_cast<const uint8_t *>(inputs[i].buf), inputs[i].sz,
-          frame_sizes, &frame_count, NULL, NULL);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-
-      if (frame_count == 0) {
-        // There's no super frame but only a single frame.
-        ASSERT_EQ(1, remained_spatial_layers);
-      } else {
-        // Found a super frame.
-        uint8_t *frame_data = static_cast<uint8_t *>(inputs[i].buf);
-        uint8_t *frame_start = frame_data;
-        for (frame = 0; frame < frame_count; ++frame) {
-          // Looking for a visible frame.
-          if (frame_data[0] & 0x02) {
-            ++frames_found;
-            if (frames_found == remained_spatial_layers) break;
-          }
-          frame_data += frame_sizes[frame];
-        }
-        ASSERT_LT(frame, frame_count)
-            << "Couldn't find a visible frame. "
-            << "remained_spatial_layers: " << remained_spatial_layers
-            << "    super_frame: " << i;
-        if (frame == frame_count - 1) continue;
-
-        frame_data += frame_sizes[frame];
-
-        // We need to add one more frame for multiple frame contexts.
-        uint8_t marker =
-            static_cast<const uint8_t *>(inputs[i].buf)[inputs[i].sz - 1];
-        const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-        const size_t index_sz = 2 + mag * frame_count;
-        const size_t new_index_sz = 2 + mag * (frame + 1);
-        marker &= 0x0f8;
-        marker |= frame;
-
-        // Copy existing frame sizes.
-        memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
-                new_index_sz - 2);
-        // New marker.
-        frame_data[0] = marker;
-        frame_data += (mag * (frame + 1) + 1);
-
-        *frame_data++ = marker;
-        inputs[i].sz = frame_data - frame_start;
-      }
-    }
-  }
-
-  void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(n, 0);
-
-    for (int i = 0; i < n; ++i) {
-      free(inputs[i].buf);
-      inputs[i].buf = NULL;
-      inputs[i].sz = 0;
-    }
-  }
-
-  SvcContext svc_;
-  vpx_codec_ctx_t codec_;
-  struct vpx_codec_enc_cfg codec_enc_;
-  vpx_codec_iface_t *codec_iface_;
-  std::string test_file_name_;
-  bool codec_initialized_;
-  Decoder *decoder_;
-  int tile_columns_;
-  int tile_rows_;
-};
-
-TEST_F(SvcTest, SvcInit) {
-  // test missing parameters
-  vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-  res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-  res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 6;  // too many layers
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 0;  // use default layers
-  InitializeEncoder();
-  EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
-}
-
-TEST_F(SvcTest, InitTwoLayers) {
-  svc_.spatial_layers = 2;
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, InvalidOptions) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "not-an-option=1");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-}
-
-TEST_F(SvcTest, SetLayersOption) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  InitializeEncoder();
-  EXPECT_EQ(3, svc_.spatial_layers);
-}
-
-TEST_F(SvcTest, SetMultipleOptions) {
-  vpx_codec_err_t res =
-      vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  InitializeEncoder();
-  EXPECT_EQ(2, svc_.spatial_layers);
-}
-
-TEST_F(SvcTest, SetScaleFactorsOption) {
-  svc_.spatial_layers = 2;
-  vpx_codec_err_t res =
-      vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "scale-factors=1/3, 3*3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "scale-factors=1/3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, SetQuantizersOption) {
-  svc_.spatial_layers = 2;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "max-quantizers=nothing");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "min-quantizers=nothing");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "max-quantizers=40");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "min-quantizers=40");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "max-quantizers=30,30 min-quantizers=40,40");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "max-quantizers=40,40 min-quantizers=30,30");
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, SetAutoAltRefOption) {
-  svc_.spatial_layers = 5;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "auto-alt-refs=none");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1,1,0");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
-  InitializeEncoder();
-}
-
-// Test that decoder can handle an SVC frame as the first frame in a sequence.
-TEST_F(SvcTest, OnePassEncodeOneFrame) {
-  codec_enc_.g_pass = VPX_RC_ONE_PASS;
-  vpx_fixed_buf output = { 0 };
-  Pass2EncodeNFrames(NULL, 1, 2, &output);
-  DecodeNFrames(&output, 1);
-  FreeBitstreamBuffers(&output, 1);
-}
-
-TEST_F(SvcTest, OnePassEncodeThreeFrames) {
-  codec_enc_.g_pass = VPX_RC_ONE_PASS;
-  codec_enc_.g_lag_in_frames = 0;
-  vpx_fixed_buf outputs[3];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 3);
-  FreeBitstreamBuffers(&outputs[0], 3);
-}
-
-TEST_F(SvcTest, TwoPassEncode10Frames) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(20, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-  vpx_fixed_buf outputs[20];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 20);
-  FreeBitstreamBuffers(&outputs[0], 20);
-}
-
-TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropEnhancementLayers(&outputs[0], 10, 1);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 5, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
-
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 4);
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 3);
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 2);
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 1);
-  DecodeNFrames(&outputs[0], 10);
-
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2SNRLayers) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
-  Pass1EncodeNFrames(20, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1");
-  vpx_fixed_buf outputs[20];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 20);
-  FreeBitstreamBuffers(&outputs[0], 20);
-}
-
-TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
-  Pass1EncodeNFrames(20, 3, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1");
-  vpx_fixed_buf outputs[20];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
-  DecodeNFrames(&outputs[0], 20);
-  DropEnhancementLayers(&outputs[0], 20, 2);
-  DecodeNFrames(&outputs[0], 20);
-  DropEnhancementLayers(&outputs[0], 20, 1);
-  DecodeNFrames(&outputs[0], 20);
-
-  FreeBitstreamBuffers(&outputs[0], 20);
-}
-
-TEST_F(SvcTest, SetMultipleFrameContextsOption) {
-  svc_.spatial_layers = 5;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 2;
-  res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest,
-       TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropEnhancementLayers(&outputs[0], 10, 1);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest,
-       TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
-  Pass1EncodeNFrames(10, 3, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
-
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 2);
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 1);
-  DecodeNFrames(&outputs[0], 10);
-
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1 scale-factors=1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-
-  vpx_fixed_buf base_layer[5];
-  for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2];
-
-  DecodeNFrames(&base_layer[0], 5);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest,
-       TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1 scale-factors=1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-
-  vpx_fixed_buf base_layer[5];
-  for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2];
-
-  DecodeNFrames(&base_layer[0], 5);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
-  codec_enc_.g_w = 704;
-  codec_enc_.g_h = 144;
-  tile_columns_ = 1;
-  tile_rows_ = 1;
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  codec_enc_.g_error_resilient = 0;
-  codec_enc_.g_w = 704;
-  codec_enc_.g_h = 144;
-  tile_columns_ = 1;
-  tile_rows_ = 1;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1 scale-factors=1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-}  // namespace
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@ -0,0 +1,277 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride,
+                                   const uint8_t *b, unsigned int w,
+                                   unsigned int h, int filter_strength,
+                                   int filter_weight, unsigned int *accumulator,
+                                   uint16_t *count);
+
+// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply
+// filter based on strength and weight. Store the resulting filter amount in
+// 'count' and apply it to 'b' and store it in 'accumulator'.
+void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,
+                      int h, int filter_strength, int filter_weight,
+                      Buffer<unsigned int> *accumulator,
+                      Buffer<uint16_t> *count) {
+  Buffer<int> diff_sq = Buffer<int>(w, h, 0);
+  ASSERT_TRUE(diff_sq.Init());
+  diff_sq.Set(0);
+
+  int rounding = 0;
+  if (filter_strength > 0) {
+    rounding = 1 << (filter_strength - 1);
+  }
+
+  // Calculate all the differences. Avoids re-calculating a bunch of extra
+  // values.
+  for (int height = 0; height < h; ++height) {
+    for (int width = 0; width < w; ++width) {
+      int diff = a.TopLeftPixel()[height * a.stride() + width] -
+                 b.TopLeftPixel()[height * b.stride() + width];
+      diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff;
+    }
+  }
+
+  // For any given point, sum the neighboring values and calculate the
+  // modifier.
+  for (int height = 0; height < h; ++height) {
+    for (int width = 0; width < w; ++width) {
+      // Determine how many values are being summed.
+      int summed_values = 9;
+
+      if (height == 0 || height == (h - 1)) {
+        summed_values -= 3;
+      }
+
+      if (width == 0 || width == (w - 1)) {
+        if (summed_values == 6) {  // corner
+          summed_values -= 2;
+        } else {
+          summed_values -= 3;
+        }
+      }
+
+      // Sum the diff_sq of the surrounding values.
+      int sum = 0;
+      for (int idy = -1; idy <= 1; ++idy) {
+        for (int idx = -1; idx <= 1; ++idx) {
+          const int y = height + idy;
+          const int x = width + idx;
+
+          // If inside the border.
+          if (y >= 0 && y < h && x >= 0 && x < w) {
+            sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x];
+          }
+        }
+      }
+
+      sum *= 3;
+      sum /= summed_values;
+      sum += rounding;
+      sum >>= filter_strength;
+
+      // Clamp the value and invert it.
+      if (sum > 16) sum = 16;
+      sum = 16 - sum;
+
+      sum *= filter_weight;
+
+      count->TopLeftPixel()[height * count->stride() + width] += sum;
+      accumulator->TopLeftPixel()[height * accumulator->stride() + width] +=
+          sum * b.TopLeftPixel()[height * b.stride() + width];
+    }
+  }
+}
+
+class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> {
+ public:
+  virtual void SetUp() {
+    filter_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  TemporalFilterFunc filter_func_;
+  ACMRandom rnd_;
+};
+
+TEST_P(TemporalFilterTest, SizeCombinations) {
+  // Depending on subsampling this function may be called with values of 8 or 16
+  // for width and height, in any combination.
+  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+  ASSERT_TRUE(a.Init());
+
+  const int filter_weight = 2;
+  const int filter_strength = 6;
+
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      // The difference between the buffers must be small to pass the threshold
+      // to apply the filter.
+      a.Set(&rnd_, 0, 7);
+      b.Set(&rnd_, 0, 7);
+
+      accum_ref.Set(rnd_.Rand8());
+      accum_chk.CopyFrom(accum_ref);
+      count_ref.Set(rnd_.Rand8());
+      count_chk.CopyFrom(count_ref);
+      reference_filter(a, b, width, height, filter_strength, filter_weight,
+                       &accum_ref, &count_ref);
+      ASM_REGISTER_STATE_CHECK(
+          filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
+                       height, filter_strength, filter_weight,
+                       accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
+      EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
+      EXPECT_TRUE(count_chk.CheckValues(count_ref));
+      if (HasFailure()) {
+        printf("Width: %d Height: %d\n", width, height);
+        count_chk.PrintDifference(count_ref);
+        accum_chk.PrintDifference(accum_ref);
+        return;
+      }
+    }
+  }
+}
+
+TEST_P(TemporalFilterTest, CompareReferenceRandom) {
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
+      ASSERT_TRUE(a.Init());
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
+        for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
+          for (int repeat = 0; repeat < 100; ++repeat) {
+            if (repeat < 50) {
+              a.Set(&rnd_, 0, 7);
+              b.Set(&rnd_, 0, 7);
+            } else {
+              // Check large (but close) values as well.
+              a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
+                    std::numeric_limits<uint8_t>::max());
+              b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
+                    std::numeric_limits<uint8_t>::max());
+            }
+
+            accum_ref.Set(rnd_.Rand8());
+            accum_chk.CopyFrom(accum_ref);
+            count_ref.Set(rnd_.Rand8());
+            count_chk.CopyFrom(count_ref);
+            reference_filter(a, b, width, height, filter_strength,
+                             filter_weight, &accum_ref, &count_ref);
+            ASM_REGISTER_STATE_CHECK(filter_func_(
+                a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
+                filter_strength, filter_weight, accum_chk.TopLeftPixel(),
+                count_chk.TopLeftPixel()));
+            EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
+            EXPECT_TRUE(count_chk.CheckValues(count_ref));
+            if (HasFailure()) {
+              printf("Weight: %d Strength: %d\n", filter_weight,
+                     filter_strength);
+              count_chk.PrintDifference(count_ref);
+              accum_chk.PrintDifference(accum_ref);
+              return;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(TemporalFilterTest, DISABLED_Speed) {
+  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+  ASSERT_TRUE(a.Init());
+
+  const int filter_weight = 2;
+  const int filter_strength = 6;
+
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      a.Set(&rnd_, 0, 7);
+      b.Set(&rnd_, 0, 7);
+
+      accum_chk.Set(0);
+      count_chk.Set(0);
+
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      for (int i = 0; i < 10000; ++i) {
+        filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
+                     height, filter_strength, filter_weight,
+                     accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
+      }
+      vpx_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+      printf("Temporal filter %dx%d time: %5d us\n", width, height,
+             elapsed_time);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
+                        ::testing::Values(&vp9_temporal_filter_apply_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
+                        ::testing::Values(&vp9_temporal_filter_apply_sse4_1));
+#endif  // HAVE_SSE4_1
+}  // namespace
--- a/test/test-data.mk
+++ b/test/test-data.mk
@ -20,8 +20,10 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv

 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv

 # Test vectors
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
@ -730,6 +732,10 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
 endif  # CONFIG_VP9_HIGHBITDEPTH

 # Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm
@ -770,6 +776,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s367
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
@ -777,6 +785,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.web
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res

 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # Encode / Decode test
@ -811,7 +821,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv
@ -871,3 +880,5 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@ -6,6 +6,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
 c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res
 fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res
 d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm
@ -14,6 +16,7 @@ df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm
 4935c62becc68c13642a03db1e6d3e2331c1c612 *invalid-vp90-03-v3.webm.res
 d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
 3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
+9aa21d8b2cb9d39abe8a7bb6032dc66955fb4342 *noisy_clip_640_360.y4m
 a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m
 0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m
 ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m
@ -844,3 +847,12 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res
 7602e00378161ca36ae93cc6ee12dd30b5ba1e1d *vp90-2-22-svc_1280x720_3.ivf
 02e53e3eefbf25ec0929047fe50876acdeb040bd *vp90-2-22-svc_1280x720_3.ivf.md5
+6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm
+e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res
+fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf
+fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res
+1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf
+90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res
+17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm
+e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5
--- a/test/test.mk
+++ b/test/test.mk
@ -1,4 +1,5 @@
 LIBVPX_TEST_SRCS-yes += acm_random.h
+LIBVPX_TEST_SRCS-yes += buffer.h
 LIBVPX_TEST_SRCS-yes += clear_system_state.h
 LIBVPX_TEST_SRCS-yes += codec_factory.h
 LIBVPX_TEST_SRCS-yes += md5_helper.h
@ -21,7 +22,9 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += altref_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += alt_ref_aq_segment_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += vp8_datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += vp9_datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += svc_datarate_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
@ -38,7 +41,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
@ -47,6 +49,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc

 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
@ -122,6 +125,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += predict_test.cc
 LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.cc
+LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.h

 ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes)
 LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc
@ -149,25 +153,30 @@ LIBVPX_TEST_SRCS-yes                   += vp9_intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc
+ifneq ($(CONFIG_REALTIME_ONLY),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc

 ifeq ($(CONFIG_VP9_ENCODER),yes)
-LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
 endif

 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
-LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc
+LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc

--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@ -270,19 +270,22 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
 INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
                vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
                vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
-                vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
-                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
+                vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon,
+                vpx_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                vpx_dc_left_predictor_16x16_neon,
                vpx_dc_top_predictor_16x16_neon,
                vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
-                vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, NULL,
-                NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_neon)
+                vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon,
+                vpx_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_16x16_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
                vpx_dc_left_predictor_32x32_neon,
                vpx_dc_top_predictor_32x32_neon,
                vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
-                vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon,
+                vpx_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL,
                vpx_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON

@ -309,6 +312,31 @@ INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
                vpx_tm_predictor_32x32_msa)
 #endif  // HAVE_MSA

+#if HAVE_VSX
+INTRA_PRED_TEST(VSX, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_4x4_vsx, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_vsx)
+
+INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, NULL, NULL, NULL,
+                NULL, vpx_h_predictor_8x8_vsx, vpx_d45_predictor_8x8_vsx, NULL,
+                NULL, NULL, NULL, vpx_d63_predictor_8x8_vsx,
+                vpx_tm_predictor_8x8_vsx)
+
+INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx,
+                vpx_dc_left_predictor_16x16_vsx, vpx_dc_top_predictor_16x16_vsx,
+                vpx_dc_128_predictor_16x16_vsx, vpx_v_predictor_16x16_vsx,
+                vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, NULL,
+                NULL, NULL, NULL, vpx_d63_predictor_16x16_vsx,
+                vpx_tm_predictor_16x16_vsx)
+
+INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx,
+                vpx_dc_left_predictor_32x32_vsx, vpx_dc_top_predictor_32x32_vsx,
+                vpx_dc_128_predictor_32x32_vsx, vpx_v_predictor_32x32_vsx,
+                vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, NULL,
+                NULL, NULL, NULL, vpx_d63_predictor_32x32_vsx,
+                vpx_tm_predictor_32x32_vsx)
+#endif  // HAVE_VSX
+
 // -----------------------------------------------------------------------------

 #if CONFIG_VP9_HIGHBITDEPTH
@ -452,29 +480,107 @@ HIGHBD_INTRA_PRED_TEST(
    vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c)

 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
-                       vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL,
-                       vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c)
+HIGHBD_INTRA_PRED_TEST(
+    SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2,
+    vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2,
+    vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2,
+    vpx_highbd_h_predictor_4x4_sse2, NULL, vpx_highbd_d135_predictor_4x4_sse2,
+    vpx_highbd_d117_predictor_4x4_sse2, vpx_highbd_d153_predictor_4x4_sse2,
+    vpx_highbd_d207_predictor_4x4_sse2, vpx_highbd_d63_predictor_4x4_sse2,
+    vpx_highbd_tm_predictor_4x4_c)

 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
-                       vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL,
-                       vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
+                       vpx_highbd_dc_predictor_8x8_sse2,
+                       vpx_highbd_dc_left_predictor_8x8_sse2,
+                       vpx_highbd_dc_top_predictor_8x8_sse2,
+                       vpx_highbd_dc_128_predictor_8x8_sse2,
+                       vpx_highbd_v_predictor_8x8_sse2,
+                       vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
+                       NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)

 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
-                       vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL,
-                       vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL,
-                       vpx_highbd_tm_predictor_16x16_sse2)
+                       vpx_highbd_dc_predictor_16x16_sse2,
+                       vpx_highbd_dc_left_predictor_16x16_sse2,
+                       vpx_highbd_dc_top_predictor_16x16_sse2,
+                       vpx_highbd_dc_128_predictor_16x16_sse2,
+                       vpx_highbd_v_predictor_16x16_sse2,
+                       vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2)

 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
-                       vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL,
-                       vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL,
-                       vpx_highbd_tm_predictor_32x32_sse2)
+                       vpx_highbd_dc_predictor_32x32_sse2,
+                       vpx_highbd_dc_left_predictor_32x32_sse2,
+                       vpx_highbd_dc_top_predictor_32x32_sse2,
+                       vpx_highbd_dc_128_predictor_32x32_sse2,
+                       vpx_highbd_v_predictor_32x32_sse2,
+                       vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2

+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL,
+                       NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL,
+                       NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL,
+                       NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3,
+                       vpx_highbd_d135_predictor_8x8_ssse3,
+                       vpx_highbd_d117_predictor_8x8_ssse3,
+                       vpx_highbd_d153_predictor_8x8_ssse3,
+                       vpx_highbd_d207_predictor_8x8_ssse3,
+                       vpx_highbd_d63_predictor_8x8_ssse3, NULL)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL,
+                       NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3,
+                       vpx_highbd_d135_predictor_16x16_ssse3,
+                       vpx_highbd_d117_predictor_16x16_ssse3,
+                       vpx_highbd_d153_predictor_16x16_ssse3,
+                       vpx_highbd_d207_predictor_16x16_ssse3,
+                       vpx_highbd_d63_predictor_16x16_ssse3, NULL)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL,
+                       NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3,
+                       vpx_highbd_d135_predictor_32x32_ssse3,
+                       vpx_highbd_d117_predictor_32x32_ssse3,
+                       vpx_highbd_d153_predictor_32x32_ssse3,
+                       vpx_highbd_d207_predictor_32x32_ssse3,
+                       vpx_highbd_d63_predictor_32x32_ssse3, NULL)
+#endif  // HAVE_SSSE3
+
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon,
+    vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon,
+    vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
+    vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
+    vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+    vpx_highbd_tm_predictor_4x4_neon)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
+    vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
+    vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
+    vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
+    vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL,
+    vpx_highbd_tm_predictor_8x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
+                       vpx_highbd_dc_predictor_16x16_neon,
+                       vpx_highbd_dc_left_predictor_16x16_neon,
+                       vpx_highbd_dc_top_predictor_16x16_neon,
+                       vpx_highbd_dc_128_predictor_16x16_neon,
+                       vpx_highbd_v_predictor_16x16_neon,
+                       vpx_highbd_h_predictor_16x16_neon,
+                       vpx_highbd_d45_predictor_16x16_neon,
+                       vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
+                       NULL, vpx_highbd_tm_predictor_16x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
+                       vpx_highbd_dc_predictor_32x32_neon,
+                       vpx_highbd_dc_left_predictor_32x32_neon,
+                       vpx_highbd_dc_top_predictor_32x32_neon,
+                       vpx_highbd_dc_128_predictor_32x32_neon,
+                       vpx_highbd_v_predictor_32x32_neon,
+                       vpx_highbd_h_predictor_32x32_neon,
+                       vpx_highbd_d45_predictor_32x32_neon,
+                       vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
+                       NULL, vpx_highbd_tm_predictor_32x32_neon)
+#endif  // HAVE_NEON
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #include "test/test_libvpx.cc"
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@ -53,12 +53,14 @@ int main(int argc, char **argv) {
  }
  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*");
  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*");
+  if (!(simd_caps & HAS_AVX512)) {
+    append_negative_gtest_filter(":AVX512.*:AVX512/*");
+  }
 #endif  // ARCH_X86 || ARCH_X86_64

 #if !CONFIG_SHARED
 // Shared library builds don't support whitebox tests
 // that exercise internal symbols.
-
 #if CONFIG_VP8
  vp8_rtcd();
 #endif  // CONFIG_VP8
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@ -28,13 +28,10 @@

 namespace {

-enum DecodeMode { kSerialMode, kFrameParallelMode };
+const int kThreads = 0;
+const int kFileName = 1;

-const int kDecodeMode = 0;
-const int kThreads = 1;
-const int kFileName = 2;
-
-typedef std::tr1::tuple<int, int, const char *> DecodeParam;
+typedef ::testing::tuple<int, const char *> DecodeParam;

 class TestVectorTest : public ::libvpx_test::DecoderTest,
                       public ::libvpx_test::CodecTestWithParam<DecodeParam> {
@ -53,8 +50,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,

  void OpenMD5File(const std::string &md5_file_name_) {
    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
-                                   << md5_file_name_;
+    ASSERT_TRUE(md5_file_ != NULL)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
  }

  virtual void DecompressedFrameHook(const vpx_image_t &img,
@ -91,30 +88,15 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 // the test failed.
 TEST_P(TestVectorTest, MD5Match) {
  const DecodeParam input = GET_PARAM(1);
-  const std::string filename = std::tr1::get<kFileName>(input);
-  const int threads = std::tr1::get<kThreads>(input);
-  const int mode = std::tr1::get<kDecodeMode>(input);
+  const std::string filename = ::testing::get<kFileName>(input);
  vpx_codec_flags_t flags = 0;
  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
  char str[256];

-  if (mode == kFrameParallelMode) {
-    flags |= VPX_CODEC_USE_FRAME_THREADING;
-#if CONFIG_VP9_DECODER
-    // TODO(hkuang): Fix frame parallel decode bug. See issue 1086.
-    if (resize_clips_.find(filename) != resize_clips_.end()) {
-      printf("Skipping the test file: %s, due to frame parallel decode bug.\n",
-             filename.c_str());
-      return;
-    }
-#endif
-  }
+  cfg.threads = ::testing::get<kThreads>(input);

-  cfg.threads = threads;
-
-  snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
-           "file: %s  mode: %s threads: %d", filename.c_str(),
-           mode == 0 ? "Serial" : "Parallel", threads);
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
+           filename.c_str(), cfg.threads);
  SCOPED_TRACE(str);

  // Open compressed video file.
@ -145,38 +127,44 @@ TEST_P(TestVectorTest, MD5Match) {
  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
 }

-// Test VP8 decode in serial mode with single thread.
-// NOTE: VP8 only support serial mode.
 #if CONFIG_VP8_DECODER
 VP8_INSTANTIATE_TEST_CASE(
    TestVectorTest,
    ::testing::Combine(
-        ::testing::Values(0),  // Serial Mode.
        ::testing::Values(1),  // Single thread.
        ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                            libvpx_test::kVP8TestVectors +
                                libvpx_test::kNumVP8TestVectors)));
+
+// Test VP8 decode in with different numbers of threads.
+INSTANTIATE_TEST_CASE_P(
+    VP8MultiThreaded, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)),
+        ::testing::Combine(
+            ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
+                                libvpx_test::kVP8TestVectors +
+                                    libvpx_test::kNumVP8TestVectors))));
+
 #endif  // CONFIG_VP8_DECODER

-// Test VP9 decode in serial mode with single thread.
 #if CONFIG_VP9_DECODER
 VP9_INSTANTIATE_TEST_CASE(
    TestVectorTest,
    ::testing::Combine(
-        ::testing::Values(0),  // Serial Mode.
        ::testing::Values(1),  // Single thread.
        ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                            libvpx_test::kVP9TestVectors +
                                libvpx_test::kNumVP9TestVectors)));

-// Test VP9 decode in frame parallel mode with different number of threads.
 INSTANTIATE_TEST_CASE_P(
-    VP9MultiThreadedFrameParallel, TestVectorTest,
+    VP9MultiThreaded, TestVectorTest,
    ::testing::Combine(
        ::testing::Values(
            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
        ::testing::Combine(
-            ::testing::Values(1),    // Frame Parallel mode.
            ::testing::Range(2, 9),  // With 2 ~ 8 threads.
            ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                                libvpx_test::kVP9TestVectors +
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@ -371,6 +371,7 @@ const char *const kVP9TestVectors[] = {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
  "vp90-2-20-big_superframe-01.webm",
  "vp90-2-20-big_superframe-02.webm",
+  "vp90-2-22-svc_1280x720_1.webm",
  RESIZE_TEST_VECTORS
 };
 const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" };
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@ -54,7 +54,10 @@ twopass_encoder_vp9() {
  fi
 }

-twopass_encoder_tests="twopass_encoder_vp8
-                       twopass_encoder_vp9"

-run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
+if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then
+  twopass_encoder_tests="twopass_encoder_vp8
+                         twopass_encoder_vp9"
+
+  run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
+fi
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@ -27,8 +27,8 @@

 namespace {

-using std::string;
 using libvpx_test::ACMRandom;
+using std::string;

 #if CONFIG_WEBM_IO

--- a/test/util.h
+++ b/test/util.h
@ -17,7 +17,7 @@
 #include "vpx/vpx_image.h"

 // Macros
-#define GET_PARAM(k) std::tr1::get<k>(GetParam())
+#define GET_PARAM(k) ::testing::get<k>(GetParam())

 inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) {
  assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) &&
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
--- a/test/video_source.h
+++ b/test/video_source.h
@ -13,7 +13,9 @@
 #if defined(_WIN32)
 #undef NOMINMAX
 #define NOMINMAX
+#ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
+#endif
 #include <windows.h>
 #endif
 #include <cstdio>
--- a/test/vp8_datarate_test.cc
+++ b/test/vp8_datarate_test.cc
@ -0,0 +1,513 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
+
+namespace {
+
+class DatarateTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual ~DatarateTestLarge() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    ResetModel();
+  }
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    first_drop_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+    gf_boost_ = 0;
+    use_roi_ = false;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_);
+    }
+
+    if (use_roi_) {
+      encoder->Control(VP8E_SET_ROI_MAP, &roi_);
+    }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    }
+
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    // TODO(jimbankoski): Remove these lines when the issue:
+    // http://code.google.com/p/webm/issues/detail?id=496 is fixed.
+    // For now the codec assumes buffer starts at starting buffer rate
+    // plus one frame's time.
+    if (last_pts_ == 0) duration = 1;
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    /* Test the buffer model here before subtracting the frame. Do so because
+     * the way the leaky bucket model works in libvpx is to allow the buffer to
+     * empty - and then stop showing frames until we've got enough bits to
+     * show one. As noted in comment below (issue 495), this does not currently
+     * apply to key frames. For now exclude key frames in condition below. */
+    const bool key_frame =
+        (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0)
+          << "Buffer Underrun at frame " << pkt->data.frame.pts;
+    }
+
+    const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Subtract from the buffer the bits associated with a played back frame.
+    bits_in_buffer_model_ -= frame_size_in_bits;
+
+    // Update the running total of bits for end of test datarate checks.
+    bits_total_ += frame_size_in_bits;
+
+    // If first drop not set and we have a drop set it to this time.
+    if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+
+    // We update this so that we can calculate the datarate minus the last
+    // frame encoded in the file.
+    bits_in_last_frame_ = frame_size_in_bits;
+
+    ++frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    if (bits_total_) {
+      const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
+
+      duration_ = (last_pts_ + 1) * timebase_;
+
+      // Effective file datarate includes the time spent prebuffering.
+      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 /
+                            (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
+
+      file_datarate_ = file_size_in_kb / duration_;
+    }
+  }
+
+  vpx_codec_pts_t last_pts_;
+  int64_t bits_in_buffer_model_;
+  double timebase_;
+  int frame_number_;
+  vpx_codec_pts_t first_drop_;
+  int64_t bits_total_;
+  double duration_;
+  double file_datarate_;
+  double effective_datarate_;
+  int64_t bits_in_last_frame_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
+  int set_cpu_used_;
+  int gf_boost_;
+  bool use_roi_;
+  vpx_roi_map_t roi_;
+};
+
+#if CONFIG_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestLarge, DenoiserLevels) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  for (int j = 1; j < 5; ++j) {
+    // Run over the denoiser levels.
+    // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
+    // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
+    // denoiserOnAggressive, and denoiserOnAdaptive.
+    denoiser_on_ = j;
+    cfg_.rc_target_bitrate = 300;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestLarge, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+#endif  // CONFIG_TEMPORAL_DENOISING
+
+TEST_P(DatarateTestLarge, BasicBufferModel) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 pass cbr datarate control has a bug hidden by the small # of
+  // frames selected in this encode. The problem is that even if the buffer is
+  // negative we produce a keyframe on a cutscene. Ignoring datarate
+  // constraints
+  // TODO(jimbankoski): ( Fix when issue
+  // http://code.google.com/p/webm/issues/detail?id=495 is addressed. )
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  // There is an issue for low bitrates in real-time mode, where the
+  // effective_datarate slightly overshoots the target bitrate.
+  // This is same the issue as noted about (#495).
+  // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
+  // when the issue is resolved.
+  for (int i = 100; i < 800; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_max_quantizer = 36;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+
+  const int frame_count = 40;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, frame_count);
+
+  // Here we check that the first dropped frame gets earlier and earlier
+  // as the drop frame threshold is increased.
+
+  const int kDropFrameThreshTestStep = 30;
+  vpx_codec_pts_t last_drop = frame_count;
+  for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+  }
+}
+
+TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+class DatarateTestRealTime : public DatarateTestLarge {
+ public:
+  virtual ~DatarateTestRealTime() {}
+};
+
+#if CONFIG_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestRealTime, DenoiserLevels) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  for (int j = 1; j < 5; ++j) {
+    // Run over the denoiser levels.
+    // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
+    // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
+    // denoiserOnAggressive, and denoiserOnAdaptive.
+    denoiser_on_ = j;
+    cfg_.rc_target_bitrate = 300;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestRealTime, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+#endif  // CONFIG_TEMPORAL_DENOISING
+
+TEST_P(DatarateTestRealTime, BasicBufferModel) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 pass cbr datarate control has a bug hidden by the small # of
+  // frames selected in this encode. The problem is that even if the buffer is
+  // negative we produce a keyframe on a cutscene, ignoring datarate
+  // constraints
+  // TODO(jimbankoski): Fix when issue
+  // http://bugs.chromium.org/p/webm/issues/detail?id=495 is addressed.
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  // There is an issue for low bitrates in real-time mode, where the
+  // effective_datarate slightly overshoots the target bitrate.
+  // This is same the issue as noted above (#495).
+  // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
+  // when the issue is resolved.
+  for (int i = 100; i <= 700; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_max_quantizer = 36;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+
+  const int frame_count = 40;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, frame_count);
+
+  // Check that the first dropped frame gets earlier and earlier
+  // as the drop frame threshold is increased.
+
+  const int kDropFrameThreshTestStep = 30;
+  vpx_codec_pts_t last_drop = frame_count;
+  for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+  }
+}
+
+TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+TEST_P(DatarateTestRealTime, RegionOfInterest) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = true;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 15) / 16;
+  roi_.cols = (cfg_.g_w + 15) / 16;
+
+  roi_.delta_q[0] = 0;
+  roi_.delta_q[1] = -20;
+  roi_.delta_q[2] = 0;
+  roi_.delta_q[3] = 0;
+
+  roi_.delta_lf[0] = 0;
+  roi_.delta_lf[1] = -20;
+  roi_.delta_lf[2] = 0;
+  roi_.delta_lf[3] = 0;
+
+  roi_.static_threshold[0] = 0;
+  roi_.static_threshold[1] = 1000;
+  roi_.static_threshold[2] = 0;
+  roi_.static_threshold[3] = 0;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi_.roi_map =
+      (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map));
+  for (unsigned int i = 0; i < roi_.rows; ++i) {
+    for (unsigned int j = 0; j < roi_.cols; ++j) {
+      if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+          j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+        roi_.roi_map[i * roi_.cols + j] = 1;
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+
+  free(roi_.roi_map);
+}
+
+TEST_P(DatarateTestRealTime, GFBoost) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // Apply a gf boost.
+  gf_boost_ = 50;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES,
+                          ::testing::Values(0));
+VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Values(-6, -12));
+}  // namespace
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@ -17,12 +17,16 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"

 namespace {

+typedef void (*FdctFunc)(int16_t *a, int16_t *b, int a_stride);
+
 const int cospi8sqrt2minus1 = 20091;
 const int sinpi8sqrt2 = 35468;

@ -68,10 +72,21 @@ void reference_idct4x4(const int16_t *input, int16_t *output) {

 using libvpx_test::ACMRandom;

-TEST(VP8FdctTest, SignBiasCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
+class FdctTest : public ::testing::TestWithParam<FdctFunc> {
+ public:
+  virtual void SetUp() {
+    fdct_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  FdctFunc fdct_func_;
+  ACMRandom rnd_;
+};
+
+TEST_P(FdctTest, SignBiasCheck) {
  int16_t test_input_block[16];
-  int16_t test_output_block[16];
+  DECLARE_ALIGNED(16, int16_t, test_output_block[16]);
  const int pitch = 8;
  int count_sign_block[16][2];
  const int count_test_block = 1000000;
@ -81,10 +96,10 @@ TEST(VP8FdctTest, SignBiasCheck) {
  for (int i = 0; i < count_test_block; ++i) {
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 16; ++j) {
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = rnd_.Rand8() - rnd_.Rand8();
    }

-    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+    fdct_func_(test_input_block, test_output_block, pitch);

    for (int j = 0; j < 16; ++j) {
      if (test_output_block[j] < 0) {
@ -110,10 +125,10 @@ TEST(VP8FdctTest, SignBiasCheck) {
  for (int i = 0; i < count_test_block; ++i) {
    // Initialize a test block with input range [-15, 15].
    for (int j = 0; j < 16; ++j) {
-      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+      test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4);
    }

-    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+    fdct_func_(test_input_block, test_output_block, pitch);

    for (int j = 0; j < 16; ++j) {
      if (test_output_block[j] < 0) {
@ -135,23 +150,22 @@ TEST(VP8FdctTest, SignBiasCheck) {
      << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
 };

-TEST(VP8FdctTest, RoundTripErrorCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
+TEST_P(FdctTest, RoundTripErrorCheck) {
  int max_error = 0;
  double total_error = 0;
  const int count_test_block = 1000000;
  for (int i = 0; i < count_test_block; ++i) {
    int16_t test_input_block[16];
-    int16_t test_temp_block[16];
    int16_t test_output_block[16];
+    DECLARE_ALIGNED(16, int16_t, test_temp_block[16]);

    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 16; ++j) {
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = rnd_.Rand8() - rnd_.Rand8();
    }

    const int pitch = 8;
-    vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch);
+    fdct_func_(test_input_block, test_temp_block, pitch);
    reference_idct4x4(test_temp_block, test_output_block);

    for (int j = 0; j < 16; ++j) {
@ -169,4 +183,24 @@ TEST(VP8FdctTest, RoundTripErrorCheck) {
      << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
 };

+INSTANTIATE_TEST_CASE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, FdctTest,
+                        ::testing::Values(vp8_short_fdct4x4_neon));
+#endif  // HAVE_NEON
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, FdctTest,
+                        ::testing::Values(vp8_short_fdct4x4_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, FdctTest,
+                        ::testing::Values(vp8_short_fdct4x4_msa));
+#endif  // HAVE_MSA
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, FdctTest,
+                        ::testing::Values(vp8_short_fdct4x4_mmi));
+#endif  // HAVE_MMI
 }  // namespace
--- a/test/vp9_block_error_test.cc
+++ b/test/vp9_block_error_test.cc
@ -23,36 +23,36 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"

 using libvpx_test::ACMRandom;

 namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
 const int kNumIterations = 1000;

-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff,
+                                     const tran_low_t *dqcoeff,
+                                     intptr_t block_size, int64_t *ssz,
+                                     int bps);
+
+typedef ::testing::tuple<HBDBlockErrorFunc, HBDBlockErrorFunc, vpx_bit_depth_t>
+    BlockErrorParam;
+
+typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff,
-                                  intptr_t block_size, int64_t *ssz, int bps);
+                                  intptr_t block_size, int64_t *ssz);

-typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
-    ErrorBlockParam;
-
-// wrapper for 8-bit block error functions without a 'bps' param.
-typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff,
-                                        const tran_low_t *dqcoeff,
-                                        intptr_t block_size, int64_t *ssz);
-template <HighBdBlockError8bit fn>
-int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff,
-                                    const tran_low_t *dqcoeff,
-                                    intptr_t block_size, int64_t *ssz,
-                                    int bps) {
-  EXPECT_EQ(8, bps);
+template <BlockErrorFunc fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+                              const tran_low_t *dqcoeff, intptr_t block_size,
+                              int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
  return fn(coeff, dqcoeff, block_size, ssz);
 }

-class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
+class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
 public:
-  virtual ~ErrorBlockTest() {}
+  virtual ~BlockErrorTest() {}
  virtual void SetUp() {
    error_block_op_ = GET_PARAM(0);
    ref_error_block_op_ = GET_PARAM(1);
@ -63,11 +63,11 @@ class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {

 protected:
  vpx_bit_depth_t bit_depth_;
-  ErrorBlockFunc error_block_op_;
-  ErrorBlockFunc ref_error_block_op_;
+  HBDBlockErrorFunc error_block_op_;
+  HBDBlockErrorFunc ref_error_block_op_;
 };

-TEST_P(ErrorBlockTest, OperationCheck) {
+TEST_P(BlockErrorTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
@ -110,7 +110,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
      << "First failed at test case " << first_failure;
 }

-TEST_P(ErrorBlockTest, ExtremeValues) {
+TEST_P(BlockErrorTest, ExtremeValues) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
@ -168,32 +168,31 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
      << "First failed at test case " << first_failure;
 }

-using std::tr1::make_tuple;
+using ::testing::make_tuple;

 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, ErrorBlockTest,
-    ::testing::Values(
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_10),
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_12),
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_8),
-        make_tuple(
-            &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_sse2>,
-            &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
-            VPX_BITS_8)));
+const BlockErrorParam sse2_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_10),
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_12),
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_8),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&BlockError8BitWrapper<vp9_block_error_sse2>,
+             &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest,
+                        ::testing::ValuesIn(sse2_block_error_tests));
 #endif  // HAVE_SSE2

-#if HAVE_AVX
+#if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
-    AVX, ErrorBlockTest,
-    ::testing::Values(make_tuple(
-        &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_avx>,
-        &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
-        VPX_BITS_8)));
-#endif  // HAVE_AVX
-
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+    AVX2, BlockErrorTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_avx2>,
+                                 &BlockError8BitWrapper<vp9_block_error_c>,
+                                 VPX_BITS_8)));
+#endif  // HAVE_AVX2
 }  // namespace
--- a/test/vp9_datarate_test.cc
+++ b/test/vp9_datarate_test.cc
@ -0,0 +1,839 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
+ public:
+  explicit DatarateTestVP9(const ::libvpx_test::CodecFactory *codec)
+      : EncoderTest(codec) {}
+
+ protected:
+  virtual ~DatarateTestVP9() {}
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    tot_frame_number_ = 0;
+    first_drop_ = 0;
+    num_drops_ = 0;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
+    // For testing up to 3 layers.
+    for (int i = 0; i < 3; ++i) {
+      bits_total_[i] = 0;
+    }
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+    frame_parallel_decoding_mode_ = 1;
+    use_roi_ = false;
+  }
+
+  //
+  // Frame flags and layer id for temporal layers.
+  //
+
+  // For two layers, test pattern is:
+  //   1     3
+  // 0    2     .....
+  // For three layers, test pattern is:
+  //   1      3    5      7
+  //      2           6
+  // 0          4            ....
+  // LAST is always update on base/layer 0, GOLDEN is updated on layer 1.
+  // For this 3 layer example, the 2nd enhancement layer (layer 2) updates
+  // the altref frame.
+  static int GetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags =
+            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L and ARF; update L.
+        frame_flags =
+            VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L, G, ARF; update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARF.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  static int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+    }
+
+    encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads));
+    encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
+                     frame_parallel_decoding_mode_);
+
+    if (use_roi_) {
+      encoder->Control(VP9E_SET_ROI_MAP, &roi_);
+    }
+
+    if (cfg_.ts_number_layers > 1) {
+      if (video->frame() == 0) {
+        encoder->Control(VP9E_SET_SVC, 1);
+      }
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      layer_id.temporal_layer_id =
+          SetLayerId(video->frame(), cfg_.ts_number_layers);
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+    }
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    if (duration > 1) {
+      // If first drop not set and we have a drop set it to this time.
+      if (!first_drop_) first_drop_ = last_pts_ + 1;
+      // Update the number of frame drops.
+      num_drops_ += static_cast<int>(duration - 1);
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+
+    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    // Buffer should not go negative.
+    ASSERT_GE(bits_in_buffer_model_, 0)
+        << "Buffer Underrun at frame " << pkt->data.frame.pts;
+
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Update the total encoded bits. For temporal layers, update the cumulative
+    // encoded bits per layer.
+    for (int i = layer; i < static_cast<int>(cfg_.ts_number_layers); ++i) {
+      bits_total_[i] += frame_size_in_bits;
+    }
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+    ++frame_number_;
+    ++tot_frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
+         ++layer) {
+      duration_ = (last_pts_ + 1) * timebase_;
+      if (bits_total_[layer]) {
+        // Effective file datarate:
+        effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_;
+      }
+    }
+  }
+
+  vpx_codec_pts_t last_pts_;
+  double timebase_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
+  int64_t bits_total_[3];
+  double duration_;
+  double effective_datarate_[3];
+  int set_cpu_used_;
+  int64_t bits_in_buffer_model_;
+  vpx_codec_pts_t first_drop_;
+  int num_drops_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
+  int frame_parallel_decoding_mode_;
+  bool use_roi_;
+  vpx_roi_map_t roi_;
+};
+
+// Params: test mode, speed setting and index for bitrate array.
+class DatarateTestVP9Large
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
+ public:
+  DatarateTestVP9Large() : DatarateTestVP9(GET_PARAM(0)) {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    ResetModel();
+  }
+};
+
+// Params: test mode, speed setting.
+class DatarateTestVP9LargeOneBR
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateTestVP9LargeOneBR() : DatarateTestVP9(GET_PARAM(0)) {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for VBR mode with 0 lag.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(3);
+  if (bitrate_index > 1) return;
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for VBR mode with non-zero lag.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  // For non-zero lag, rate control will work (be within bounds) for
+  // real-time mode.
+  if (deadline_ == VPX_DL_REALTIME) {
+    cfg_.g_lag_in_frames = 15;
+  } else {
+    cfg_.g_lag_in_frames = 0;
+  }
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(3);
+  if (bitrate_index > 1) return;
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for VBR mode with non-zero lag, with
+// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs
+// since error_resilience is off.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZeroFrameParDecOff) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  // For non-zero lag, rate control will work (be within bounds) for
+  // real-time mode.
+  if (deadline_ == VPX_DL_REALTIME) {
+    cfg_.g_lag_in_frames = 15;
+  } else {
+    cfg_.g_lag_in_frames = 0;
+  }
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(3);
+  if (bitrate_index > 1) return;
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  frame_parallel_decoding_mode_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR mode.
+TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  const int bitrates[4] = { 150, 350, 550, 750 };
+  const int bitrate_index = GET_PARAM(3);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode
+// off( and error_resilience off).
+TEST_P(DatarateTestVP9Large, BasicRateTargetingFrameParDecOff) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  const int bitrates[4] = { 150, 350, 550, 750 };
+  const int bitrate_index = GET_PARAM(3);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  frame_parallel_decoding_mode_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR mode, with 2 threads and dropped frames.
+TEST_P(DatarateTestVP9LargeOneBR, BasicRateTargetingDropFramesMultiThreads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
+  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+  cfg_.g_profile = 1;
+  cfg_.g_timebase = video.timebase();
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  const int bitrates[4] = { 250, 450, 650, 850 };
+  const int bitrate_index = GET_PARAM(3);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+            effective_datarate_[0] * 0.80)
+      << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+            effective_datarate_[0] * 1.15)
+      << " The datarate for the file missed the target!"
+      << cfg_.rc_target_bitrate << " " << effective_datarate_;
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 50;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_lag_in_frames = 0;
+  // TODO(marpan): Investigate datarate target failures with a smaller keyframe
+  // interval (128).
+  cfg_.kf_max_dist = 9999;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  const int kDropFrameThreshTestStep = 30;
+  const int bitrates[2] = { 50, 150 };
+  const int bitrate_index = GET_PARAM(3);
+  if (bitrate_index > 1) return;
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  vpx_codec_pts_t last_drop = 140;
+  int last_num_drops = 0;
+  for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+        << " The datarate for the file is greater than target by too much!";
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    ASSERT_GE(num_drops_, last_num_drops * 0.85)
+        << " The number of dropped frames for drop_thresh " << i
+        << " < number of dropped frames for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+    last_num_drops = num_drops_;
+  }
+}  // namespace
+
+// Check basic rate targeting for 2 temporal layers.
+TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  if (deadline_ == VPX_DL_REALTIME) cfg_.g_error_resilient = 1;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  const int bitrates[4] = { 200, 400, 600, 800 };
+  const int bitrate_index = GET_PARAM(3);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  // 60-40 bitrate allocation for 2 temporal layers.
+  cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+  }
+}
+
+// Check basic rate targeting for 3 temporal layers.
+TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  const int bitrates[4] = { 200, 400, 600, 800 };
+  const int bitrate_index = GET_PARAM(3);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  // 40-20-40 bitrate allocation for 3 temporal layers.
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    // TODO(yaowu): Work out more stable rc control strategy and
+    //              Adjust the thresholds to be tighter than .75.
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    // TODO(yaowu): Work out more stable rc control strategy and
+    //              Adjust the thresholds to be tighter than 1.25.
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+  }
+}
+
+// Check basic rate targeting for 3 temporal layers, with frame dropping.
+// Only for one (low) bitrate with lower max_quantizer, and somewhat higher
+// frame drop threshold, to force frame dropping.
+TEST_P(DatarateTestVP9LargeOneBR,
+       BasicRateTargeting3TemporalLayersFrameDropping) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  // Set frame drop threshold and rc_max_quantizer to force some frame drops.
+  cfg_.rc_dropframe_thresh = 20;
+  cfg_.rc_max_quantizer = 45;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  // 40-20-40 bitrate allocation for 3 temporal layers.
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+    // Expect some frame drops in this test: for this 200 frames test,
+    // expect at least 10% and not more than 60% drops.
+    ASSERT_GE(num_drops_, 20);
+    ASSERT_LE(num_drops_, 130);
+  }
+}
+
+// Params: speed setting.
+class DatarateTestVP9RealTime : public DatarateTestVP9,
+                                public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {}
+  virtual ~DatarateTestVP9RealTime() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check VP9 region of interest feature.
+TEST_P(DatarateTestVP9RealTime, RegionOfInterest) {
+  if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = true;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 7) / 8;
+  roi_.cols = (cfg_.g_w + 7) / 8;
+
+  roi_.delta_q[1] = -20;
+  roi_.delta_lf[1] = -20;
+  memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
+  roi_.ref_frame[1] = 1;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi_.roi_map = reinterpret_cast<uint8_t *>(
+      calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)));
+  ASSERT_TRUE(roi_.roi_map != NULL);
+
+  for (unsigned int i = 0; i < roi_.rows; ++i) {
+    for (unsigned int j = 0; j < roi_.cols; ++j) {
+      if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+          j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+        roi_.roi_map[i * roi_.cols + j] = 1;
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4)
+      << " The datarate for the file missed the target!";
+
+  free(roi_.roi_map);
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Params: speed setting.
+class DatarateTestVP9LargeDenoiser : public DatarateTestVP9RealTime {
+ public:
+  virtual ~DatarateTestVP9LargeDenoiser() {}
+};
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on.
+TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on,
+// for clip with high noise level. Use 2 threads.
+TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: kDenoiserOnYOnly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on,
+// for 1280x720 clip with 4 threads.
+TEST_P(DatarateTestVP9LargeDenoiser, 4threads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 4;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
+                          ::testing::Values(::libvpx_test::kOnePassGood,
+                                            ::libvpx_test::kRealTime),
+                          ::testing::Range(2, 9), ::testing::Range(0, 4));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeOneBR,
+                          ::testing::Values(::libvpx_test::kOnePassGood,
+                                            ::libvpx_test::kRealTime),
+                          ::testing::Range(2, 9));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, ::testing::Range(5, 9));
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, ::testing::Range(5, 9));
+#endif
+}  // namespace
--- a/test/vp9_denoiser_sse2_test.cc
+++ b/test/vp9_denoiser_sse2_test.cc
@ -29,11 +29,22 @@ using libvpx_test::ACMRandom;
 namespace {

 const int kNumPixels = 64 * 64;
-class VP9DenoiserTest : public ::testing::TestWithParam<BLOCK_SIZE> {
+
+typedef int (*Vp9DenoiserFilterFunc)(const uint8_t *sig, int sig_stride,
+                                     const uint8_t *mc_avg, int mc_avg_stride,
+                                     uint8_t *avg, int avg_stride,
+                                     int increase_denoising, BLOCK_SIZE bs,
+                                     int motion_magnitude);
+typedef ::testing::tuple<Vp9DenoiserFilterFunc, BLOCK_SIZE>
+    VP9DenoiserTestParam;
+
+class VP9DenoiserTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<VP9DenoiserTestParam> {
 public:
  virtual ~VP9DenoiserTest() {}

-  virtual void SetUp() { bs_ = GetParam(); }
+  virtual void SetUp() { bs_ = GET_PARAM(1); }

  virtual void TearDown() { libvpx_test::ClearSystemState(); }

@ -76,9 +87,9 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
                                                   64, avg_block_c, 64, 0, bs_,
                                                   motion_magnitude_random));

-    ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2(
-        sig_block, 64, mc_avg_block, 64, avg_block_sse2, 64, 0, bs_,
-        motion_magnitude_random));
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 64, mc_avg_block, 64,
+                                          avg_block_sse2, 64, 0, bs_,
+                                          motion_magnitude_random));

    // Test bitexactness.
    for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) {
@ -89,10 +100,36 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
  }
 }

+using ::testing::make_tuple;
+
 // Test for all block size.
-INSTANTIATE_TEST_CASE_P(SSE2, VP9DenoiserTest,
-                        ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
-                                          BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
-                                          BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
-                                          BLOCK_64X64));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9DenoiserTest,
+    ::testing::Values(make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X8),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X16),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X8),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X16),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X32),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X16),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X32),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X64),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X32),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X64)));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP9DenoiserTest,
+    ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X8),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X64),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X64)));
+#endif
 }  // namespace
--- a/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/test/vp9_encoder_parms_get_to_decoder.cc
@ -99,9 +99,7 @@ class VpxEncoderParmsGetToDecoder
    vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
    vpx_codec_alg_priv_t *const priv =
        reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
-    FrameWorkerData *const worker_data =
-        reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
-    VP9_COMMON *const common = &worker_data->pbi->common;
+    VP9_COMMON *const common = &priv->pbi->common;

    if (encode_parms.lossless) {
      EXPECT_EQ(0, common->base_qindex);
--- a/test/vp9_end_to_end_test.cc
+++ b/test/vp9_end_to_end_test.cc
@ -59,7 +59,7 @@ const TestVideoParam kTestVectors[] = {
 // Encoding modes tested
 const libvpx_test::TestMode kEncodingModeVectors[] = {
  ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
-  ::libvpx_test::kRealTime,
+  ::libvpx_test::kRealTime
 };

 // Speed settings tested
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@ -16,8 +16,207 @@
 #include "test/md5_helper.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "vp9/encoder/vp9_firstpass.h"

 namespace {
+// FIRSTPASS_STATS struct:
+// {
+//   25 double members;
+//   1 int64_t member;
+// }
+// Whenever FIRSTPASS_STATS struct is modified, the following constants need to
+// be revisited.
+const int kDbl = 25;
+const int kInt = 1;
+const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t);
+
+class VPxFirstPassEncoderThreadTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  VPxFirstPassEncoderThreadTest()
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(0),
+        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+
+    row_mt_mode_ = 1;
+    first_pass_only_ = true;
+    firstpass_stats_.buf = NULL;
+    firstpass_stats_.sz = 0;
+  }
+  virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    cfg_.rc_end_usage = VPX_VBR;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    encoder_initialized_ = false;
+    abort_ = false;
+  }
+
+  virtual void EndPassHook() {
+    // For first pass stats test, only run first pass encoder.
+    if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS)
+      abort_ |= first_pass_only_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (!encoder_initialized_) {
+      // Encode in 2-pass mode.
+      encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
+
+      if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
+        encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_);
+
+      encoder_initialized_ = true;
+    }
+  }
+
+  virtual void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    const uint8_t *const pkt_buf =
+        reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf);
+    const size_t pkt_size = pkt->data.twopass_stats.sz;
+
+    // First pass stats size equals sizeof(FIRSTPASS_STATS)
+    EXPECT_EQ(pkt_size, kFirstPassStatsSz)
+        << "Error: First pass stats size doesn't equal kFirstPassStatsSz";
+
+    firstpass_stats_.buf =
+        realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size);
+    memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf,
+           pkt_size);
+    firstpass_stats_.sz += pkt_size;
+  }
+
+  bool encoder_initialized_;
+  int tiles_;
+  ::libvpx_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  int row_mt_mode_;
+  bool first_pass_only_;
+  vpx_fixed_buf_t firstpass_stats_;
+};
+
+static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) {
+  // fp_stats consists of 2 set of first pass encoding stats. These 2 set of
+  // stats are compared to check if the stats match or at least are very close.
+  FIRSTPASS_STATS *stats1 = reinterpret_cast<FIRSTPASS_STATS *>(fp_stats->buf);
+  int nframes_ = (int)(fp_stats->sz / sizeof(FIRSTPASS_STATS));
+  FIRSTPASS_STATS *stats2 = stats1 + nframes_ / 2;
+  int i, j;
+
+  // The total stats are also output and included in the first pass stats. Here
+  // ignore that in the comparison.
+  for (i = 0; i < (nframes_ / 2 - 1); ++i) {
+    const double *frame_stats1 = reinterpret_cast<double *>(stats1);
+    const double *frame_stats2 = reinterpret_cast<double *>(stats2);
+
+    for (j = 0; j < kDbl; ++j) {
+      ASSERT_LE(fabs(*frame_stats1 - *frame_stats2),
+                fabs(*frame_stats1) / factor)
+          << "First failure @ frame #" << i << " stat #" << j << " ("
+          << *frame_stats1 << " vs. " << *frame_stats2 << ")";
+      frame_stats1++;
+      frame_stats2++;
+    }
+
+    stats1++;
+    stats2++;
+  }
+
+  // Reset firstpass_stats_ to 0.
+  memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz);
+  fp_stats->sz = 0;
+}
+
+static void compare_fp_stats_md5(vpx_fixed_buf_t *fp_stats) {
+  // fp_stats consists of 2 set of first pass encoding stats. These 2 set of
+  // stats are compared to check if the stats match.
+  uint8_t *stats1 = reinterpret_cast<uint8_t *>(fp_stats->buf);
+  uint8_t *stats2 = stats1 + fp_stats->sz / 2;
+  ::libvpx_test::MD5 md5_row_mt_0, md5_row_mt_1;
+
+  md5_row_mt_0.Add(stats1, fp_stats->sz / 2);
+  const char *md5_row_mt_0_str = md5_row_mt_0.Get();
+
+  md5_row_mt_1.Add(stats2, fp_stats->sz / 2);
+  const char *md5_row_mt_1_str = md5_row_mt_1.Get();
+
+  // Check md5 match.
+  ASSERT_STREQ(md5_row_mt_0_str, md5_row_mt_1_str)
+      << "MD5 checksums don't match";
+
+  // Reset firstpass_stats_ to 0.
+  memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz);
+  fp_stats->sz = 0;
+}
+
+TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+
+  first_pass_only_ = true;
+  cfg_.rc_target_bitrate = 1000;
+
+  // Test row_mt_mode: 0 vs 1 at single thread case(threads = 1, tiles_ = 0)
+  tiles_ = 0;
+  cfg_.g_threads = 1;
+
+  row_mt_mode_ = 0;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  row_mt_mode_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if using or not using row-mt generates close stats.
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
+
+  // Test single thread vs multiple threads
+  row_mt_mode_ = 1;
+  tiles_ = 0;
+
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  cfg_.g_threads = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if single-thread and multi-thread stats are close enough.
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
+
+  // Bit exact test in row_mt mode.
+  // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
+  // result.
+  row_mt_mode_ = 1;
+  tiles_ = 2;
+
+  cfg_.g_threads = 2;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  cfg_.g_threads = 8;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if stats match with row-mt=0/1.
+  compare_fp_stats_md5(&firstpass_stats_);
+}
+
 class VPxEncoderThreadTest
    : public ::libvpx_test::EncoderTest,
      public ::libvpx_test::CodecTestWith4Params<libvpx_test::TestMode, int,
@ -29,6 +228,9 @@ class VPxEncoderThreadTest
        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
    init_flags_ = VPX_CODEC_USE_PSNR;
    md5_.clear();
+    row_mt_mode_ = 1;
+    psnr_ = 0.0;
+    nframes_ = 0;
  }
  virtual ~VPxEncoderThreadTest() {}

@ -37,7 +239,6 @@ class VPxEncoderThreadTest
    SetMode(encoding_mode_);

    if (encoding_mode_ != ::libvpx_test::kRealTime) {
-      cfg_.g_lag_in_frames = 3;
      cfg_.rc_end_usage = VPX_VBR;
      cfg_.rc_2pass_vbr_minsection_pct = 5;
      cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@ -52,6 +253,8 @@ class VPxEncoderThreadTest

  virtual void BeginPassHook(unsigned int /*pass*/) {
    encoder_initialized_ = false;
+    psnr_ = 0.0;
+    nframes_ = 0;
  }

  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
@ -70,10 +273,17 @@ class VPxEncoderThreadTest
        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
        encoder->Control(VP9E_SET_AQ_MODE, 3);
      }
+      encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_);
+
      encoder_initialized_ = true;
    }
  }

+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
  virtual void DecompressedFrameHook(const vpx_image_t &img,
                                     vpx_codec_pts_t /*pts*/) {
    ::libvpx_test::MD5 md5_res;
@ -92,43 +302,127 @@ class VPxEncoderThreadTest
    return true;
  }

+  double GetAveragePsnr() const { return nframes_ ? (psnr_ / nframes_) : 0.0; }
+
  bool encoder_initialized_;
  int tiles_;
  int threads_;
  ::libvpx_test::TestMode encoding_mode_;
  int set_cpu_used_;
+  int row_mt_mode_;
+  double psnr_;
+  unsigned int nframes_;
  std::vector<std::string> md5_;
 };

 TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
-  std::vector<std::string> single_thr_md5, multi_thr_md5;
-
  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20);
-
  cfg_.rc_target_bitrate = 1000;

+  // Part 1: Bit exact test for row_mt_mode_ = 0.
+  // This part keeps original unit tests done before row-mt code is checked in.
+  row_mt_mode_ = 0;
+
  // Encode using single thread.
  cfg_.g_threads = 1;
  init_flags_ = VPX_CODEC_USE_PSNR;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  single_thr_md5 = md5_;
+  const std::vector<std::string> single_thr_md5 = md5_;
  md5_.clear();

  // Encode using multiple threads.
  cfg_.g_threads = threads_;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  multi_thr_md5 = md5_;
+  const std::vector<std::string> multi_thr_md5 = md5_;
  md5_.clear();

  // Compare to check if two vectors are equal.
  ASSERT_EQ(single_thr_md5, multi_thr_md5);
+
+  // Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test.
+  row_mt_mode_ = 1;
+
+  // Encode using single thread
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  std::vector<std::string> row_mt_single_thr_md5 = md5_;
+  md5_.clear();
+
+  ASSERT_EQ(single_thr_md5, row_mt_single_thr_md5);
+
+  // Part 3: Bit exact test with row-mt on
+  // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
+  // result.
+  row_mt_mode_ = 1;
+  row_mt_single_thr_md5.clear();
+
+  // Encode using 2 threads.
+  cfg_.g_threads = 2;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  row_mt_single_thr_md5 = md5_;
+  md5_.clear();
+
+  // Encode using multiple threads.
+  cfg_.g_threads = threads_;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const std::vector<std::string> row_mt_multi_thr_md5 = md5_;
+  md5_.clear();
+
+  // Compare to check if two vectors are equal.
+  ASSERT_EQ(row_mt_single_thr_md5, row_mt_multi_thr_md5);
+
+  // Part 4: PSNR test with bit_match_mode_ = 0
+  row_mt_mode_ = 1;
+
+  // Encode using single thread.
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double single_thr_psnr = GetAveragePsnr();
+
+  // Encode using multiple threads.
+  cfg_.g_threads = threads_;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double multi_thr_psnr = GetAveragePsnr();
+
+  EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.1);
 }

-VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest,
-                          ::testing::Values(::libvpx_test::kTwoPassGood,
-                                            ::libvpx_test::kOnePassGood,
-                                            ::libvpx_test::kRealTime),
-                          ::testing::Range(0, 9),   // cpu_used
-                          ::testing::Range(0, 3),   // tile_columns
-                          ::testing::Range(2, 5));  // threads
+INSTANTIATE_TEST_CASE_P(
+    VP9, VPxFirstPassEncoderThreadTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Values(::libvpx_test::kTwoPassGood),
+        ::testing::Range(0, 4)));  // cpu_used
+
+// Split this into two instantiations so that we can distinguish
+// between very slow runs ( ie cpu_speed 0 ) vs ones that can be
+// run nightly by adding Large to the title.
+INSTANTIATE_TEST_CASE_P(
+    VP9, VPxEncoderThreadTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Values(::libvpx_test::kTwoPassGood,
+                          ::libvpx_test::kOnePassGood,
+                          ::libvpx_test::kRealTime),
+        ::testing::Range(3, 9),    // cpu_used
+        ::testing::Range(0, 3),    // tile_columns
+        ::testing::Range(2, 5)));  // threads
+
+INSTANTIATE_TEST_CASE_P(
+    VP9Large, VPxEncoderThreadTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Values(::libvpx_test::kTwoPassGood,
+                          ::libvpx_test::kOnePassGood,
+                          ::libvpx_test::kRealTime),
+        ::testing::Range(0, 3),    // cpu_used
+        ::testing::Range(0, 3),    // tile_columns
+        ::testing::Range(2, 5)));  // threads
+
 }  // namespace
--- a/test/vp9_frame_parallel_test.cc
+++ b/test/vp9_frame_parallel_test.cc
@ -1,217 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/ivf_video_source.h"
-#include "test/md5_helper.h"
-#include "test/util.h"
-#if CONFIG_WEBM_IO
-#include "test/webm_video_source.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-
-namespace {
-
-using std::string;
-
-#if CONFIG_WEBM_IO
-
-struct PauseFileList {
-  const char *name;
-  // md5 sum for decoded frames which does not include skipped frames.
-  const char *expected_md5;
-  const int pause_frame_num;
-};
-
-// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
-// seek to next key frame and then continue decoding until the end. Return
-// the md5 of the decoded frames which does not include skipped frames.
-string DecodeFileWithPause(const string &filename, int num_threads,
-                           int pause_num) {
-  libvpx_test::WebMVideoSource video(filename);
-  video.Init();
-  int in_frames = 0;
-  int out_frames = 0;
-
-  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-  cfg.threads = num_threads;
-  vpx_codec_flags_t flags = 0;
-  flags |= VPX_CODEC_USE_FRAME_THREADING;
-  libvpx_test::VP9Decoder decoder(cfg, flags);
-
-  libvpx_test::MD5 md5;
-  video.Begin();
-
-  do {
-    ++in_frames;
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-
-    // Pause at specified frame number.
-    if (in_frames == pause_num) {
-      // Flush the decoder and then seek to next key frame.
-      decoder.DecodeFrame(NULL, 0);
-      video.SeekToNextKeyFrame();
-    } else {
-      video.Next();
-    }
-
-    // Flush the decoder at the end of the video.
-    if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      ++out_frames;
-      md5.Add(img);
-    }
-  } while (video.cxdata() != NULL);
-
-  EXPECT_EQ(in_frames, out_frames)
-      << "Input frame count does not match output frame count";
-
-  return string(md5.Get());
-}
-
-void DecodeFilesWithPause(const PauseFileList files[]) {
-  for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5,
-                DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
-          << "threads = " << t;
-    }
-  }
-}
-
-TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
-  // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
-  // one key frame for every ten frames.
-  static const PauseFileList files[] = {
-    { "vp90-2-07-frame_parallel-1.webm", "6ea7c3875d67252e7caf2bc6e75b36b1",
-      6 },
-    { "vp90-2-07-frame_parallel-1.webm", "4bb634160c7356a8d7d4299b6dc83a45",
-      12 },
-    { "vp90-2-07-frame_parallel-1.webm", "89772591e6ef461f9fa754f916c78ed8",
-      26 },
-    { NULL, NULL, 0 },
-  };
-  DecodeFilesWithPause(files);
-}
-
-struct FileList {
-  const char *name;
-  // md5 sum for decoded frames which does not include corrupted frames.
-  const char *expected_md5;
-  // Expected number of decoded frames which does not include corrupted frames.
-  const int expected_frame_count;
-};
-
-// Decodes |filename| with |num_threads|. Return the md5 of the decoded
-// frames which does not include corrupted frames.
-string DecodeFile(const string &filename, int num_threads,
-                  int expected_frame_count) {
-  libvpx_test::WebMVideoSource video(filename);
-  video.Init();
-
-  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-  cfg.threads = num_threads;
-  const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
-  libvpx_test::VP9Decoder decoder(cfg, flags);
-
-  libvpx_test::MD5 md5;
-  video.Begin();
-
-  int out_frames = 0;
-  do {
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    // TODO(hkuang): frame parallel mode should return an error on corruption.
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-
-    video.Next();
-
-    // Flush the decoder at the end of the video.
-    if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      ++out_frames;
-      md5.Add(img);
-    }
-  } while (video.cxdata() != NULL);
-
-  EXPECT_EQ(expected_frame_count, out_frames)
-      << "Input frame count does not match expected output frame count";
-
-  return string(md5.Get());
-}
-
-void DecodeFiles(const FileList files[]) {
-  for (const FileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5,
-                DecodeFile(iter->name, t, iter->expected_frame_count))
-          << "threads = " << t;
-    }
-  }
-}
-
-TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
-  static const FileList files[] = {
-    // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 11th frame has corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-1.webm",
-      "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
-    // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 1st and 31st frames have
-    // corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-2.webm",
-      "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
-    // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 5th and 13th frames have
-    // corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-3.webm",
-      "8256544308de926b0681e04685b98677", 27 },
-    { NULL, NULL, 0 },
-  };
-  DecodeFiles(files);
-}
-
-TEST(VP9MultiThreadedFrameParallel, ValidFileTest) {
-  static const FileList files[] = {
-#if CONFIG_VP9_HIGHBITDEPTH
-    { "vp92-2-20-10bit-yuv420.webm", "a16b99df180c584e8db2ffeda987d293", 10 },
-#endif
-    { NULL, NULL, 0 },
-  };
-  DecodeFiles(files);
-}
-#endif  // CONFIG_WEBM_IO
-}  // namespace
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@ -235,8 +235,16 @@ INSTANTIATE_TEST_CASE_P(
                       8),
        IntraPredParam(&vpx_d45_predictor_16x16_neon,
                       &vpx_d45_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d45_predictor_32x32_neon,
+                       &vpx_d45_predictor_32x32_c, 32, 8),
        IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c,
                       4, 8),
+        IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c,
+                       8, 8),
+        IntraPredParam(&vpx_d135_predictor_16x16_neon,
+                       &vpx_d135_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d135_predictor_32x32_neon,
+                       &vpx_d135_predictor_32x32_c, 32, 8),
        IntraPredParam(&vpx_dc_128_predictor_4x4_neon,
                       &vpx_dc_128_predictor_4x4_c, 4, 8),
        IntraPredParam(&vpx_dc_128_predictor_8x8_neon,
@ -291,6 +299,139 @@ INSTANTIATE_TEST_CASE_P(
                       32, 8)));
 #endif  // HAVE_NEON

+#if HAVE_DSPR2
+INSTANTIATE_TEST_CASE_P(
+    DSPR2, VP9IntraPredTest,
+    ::testing::Values(IntraPredParam(&vpx_dc_predictor_4x4_dspr2,
+                                     &vpx_dc_predictor_4x4_c, 4, 8),
+                      IntraPredParam(&vpx_dc_predictor_8x8_dspr2,
+                                     &vpx_dc_predictor_8x8_c, 8, 8),
+                      IntraPredParam(&vpx_dc_predictor_16x16_dspr2,
+                                     &vpx_dc_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_h_predictor_4x4_dspr2,
+                                     &vpx_h_predictor_4x4_c, 4, 8),
+                      IntraPredParam(&vpx_h_predictor_8x8_dspr2,
+                                     &vpx_h_predictor_8x8_c, 8, 8),
+                      IntraPredParam(&vpx_h_predictor_16x16_dspr2,
+                                     &vpx_h_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_tm_predictor_4x4_dspr2,
+                                     &vpx_tm_predictor_4x4_c, 4, 8),
+                      IntraPredParam(&vpx_tm_predictor_8x8_dspr2,
+                                     &vpx_tm_predictor_8x8_c, 8, 8)));
+#endif  // HAVE_DSPR2
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, VP9IntraPredTest,
+    ::testing::Values(
+        IntraPredParam(&vpx_dc_128_predictor_4x4_msa,
+                       &vpx_dc_128_predictor_4x4_c, 4, 8),
+        IntraPredParam(&vpx_dc_128_predictor_8x8_msa,
+                       &vpx_dc_128_predictor_8x8_c, 8, 8),
+        IntraPredParam(&vpx_dc_128_predictor_16x16_msa,
+                       &vpx_dc_128_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_dc_128_predictor_32x32_msa,
+                       &vpx_dc_128_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_dc_left_predictor_4x4_msa,
+                       &vpx_dc_left_predictor_4x4_c, 4, 8),
+        IntraPredParam(&vpx_dc_left_predictor_8x8_msa,
+                       &vpx_dc_left_predictor_8x8_c, 8, 8),
+        IntraPredParam(&vpx_dc_left_predictor_16x16_msa,
+                       &vpx_dc_left_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_dc_left_predictor_32x32_msa,
+                       &vpx_dc_left_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_dc_predictor_4x4_msa, &vpx_dc_predictor_4x4_c, 4,
+                       8),
+        IntraPredParam(&vpx_dc_predictor_8x8_msa, &vpx_dc_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_dc_predictor_16x16_msa, &vpx_dc_predictor_16x16_c,
+                       16, 8),
+        IntraPredParam(&vpx_dc_predictor_32x32_msa, &vpx_dc_predictor_32x32_c,
+                       32, 8),
+        IntraPredParam(&vpx_dc_top_predictor_4x4_msa,
+                       &vpx_dc_top_predictor_4x4_c, 4, 8),
+        IntraPredParam(&vpx_dc_top_predictor_8x8_msa,
+                       &vpx_dc_top_predictor_8x8_c, 8, 8),
+        IntraPredParam(&vpx_dc_top_predictor_16x16_msa,
+                       &vpx_dc_top_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_dc_top_predictor_32x32_msa,
+                       &vpx_dc_top_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_h_predictor_4x4_msa, &vpx_h_predictor_4x4_c, 4, 8),
+        IntraPredParam(&vpx_h_predictor_8x8_msa, &vpx_h_predictor_8x8_c, 8, 8),
+        IntraPredParam(&vpx_h_predictor_16x16_msa, &vpx_h_predictor_16x16_c, 16,
+                       8),
+        IntraPredParam(&vpx_h_predictor_32x32_msa, &vpx_h_predictor_32x32_c, 32,
+                       8),
+        IntraPredParam(&vpx_tm_predictor_4x4_msa, &vpx_tm_predictor_4x4_c, 4,
+                       8),
+        IntraPredParam(&vpx_tm_predictor_8x8_msa, &vpx_tm_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_tm_predictor_16x16_msa, &vpx_tm_predictor_16x16_c,
+                       16, 8),
+        IntraPredParam(&vpx_tm_predictor_32x32_msa, &vpx_tm_predictor_32x32_c,
+                       32, 8),
+        IntraPredParam(&vpx_v_predictor_4x4_msa, &vpx_v_predictor_4x4_c, 4, 8),
+        IntraPredParam(&vpx_v_predictor_8x8_msa, &vpx_v_predictor_8x8_c, 8, 8),
+        IntraPredParam(&vpx_v_predictor_16x16_msa, &vpx_v_predictor_16x16_c, 16,
+                       8),
+        IntraPredParam(&vpx_v_predictor_32x32_msa, &vpx_v_predictor_32x32_c, 32,
+                       8)));
+#endif  // HAVE_MSA
+
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(
+    VSX, VP9IntraPredTest,
+    ::testing::Values(
+        IntraPredParam(&vpx_d45_predictor_8x8_vsx, &vpx_d45_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_d45_predictor_16x16_vsx, &vpx_d45_predictor_16x16_c,
+                       16, 8),
+        IntraPredParam(&vpx_d45_predictor_32x32_vsx, &vpx_d45_predictor_32x32_c,
+                       32, 8),
+        IntraPredParam(&vpx_d63_predictor_8x8_vsx, &vpx_d63_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_d63_predictor_16x16_vsx, &vpx_d63_predictor_16x16_c,
+                       16, 8),
+        IntraPredParam(&vpx_d63_predictor_32x32_vsx, &vpx_d63_predictor_32x32_c,
+                       32, 8),
+        IntraPredParam(&vpx_dc_128_predictor_16x16_vsx,
+                       &vpx_dc_128_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_dc_128_predictor_32x32_vsx,
+                       &vpx_dc_128_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_dc_left_predictor_16x16_vsx,
+                       &vpx_dc_left_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_dc_left_predictor_32x32_vsx,
+                       &vpx_dc_left_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_dc_predictor_8x8_vsx, &vpx_dc_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_dc_predictor_16x16_vsx, &vpx_dc_predictor_16x16_c,
+                       16, 8),
+        IntraPredParam(&vpx_dc_predictor_32x32_vsx, &vpx_dc_predictor_32x32_c,
+                       32, 8),
+        IntraPredParam(&vpx_dc_top_predictor_16x16_vsx,
+                       &vpx_dc_top_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_dc_top_predictor_32x32_vsx,
+                       &vpx_dc_top_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_h_predictor_4x4_vsx, &vpx_h_predictor_4x4_c, 4, 8),
+        IntraPredParam(&vpx_h_predictor_8x8_vsx, &vpx_h_predictor_8x8_c, 8, 8),
+        IntraPredParam(&vpx_h_predictor_16x16_vsx, &vpx_h_predictor_16x16_c, 16,
+                       8),
+        IntraPredParam(&vpx_h_predictor_32x32_vsx, &vpx_h_predictor_32x32_c, 32,
+                       8),
+        IntraPredParam(&vpx_tm_predictor_4x4_vsx, &vpx_tm_predictor_4x4_c, 4,
+                       8),
+        IntraPredParam(&vpx_tm_predictor_8x8_vsx, &vpx_tm_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_tm_predictor_16x16_vsx, &vpx_tm_predictor_16x16_c,
+                       16, 8),
+        IntraPredParam(&vpx_tm_predictor_32x32_vsx, &vpx_tm_predictor_32x32_c,
+                       32, 8),
+        IntraPredParam(&vpx_v_predictor_16x16_vsx, &vpx_v_predictor_16x16_c, 16,
+                       8),
+        IntraPredParam(&vpx_v_predictor_32x32_vsx, &vpx_v_predictor_32x32_c, 32,
+                       8)));
+#endif  // HAVE_VSX
+
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
                                const uint16_t *above, const uint16_t *left,
@ -326,10 +467,164 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
  RunTest(left_col, above_data, dst, ref_dst);
 }

+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_TO_C_8, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+                             &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+                             &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 8)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_TO_C_10, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+                             &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+                             &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 10)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_TO_C_12, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+                             &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+                             &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 12)));
+#endif  // HAVE_SSSE3
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
    SSE2_TO_C_8, VP9HighbdIntraPredTest,
    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 8),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
                             &vpx_highbd_dc_predictor_4x4_c, 4, 8),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@ -338,6 +633,14 @@ INSTANTIATE_TEST_CASE_P(
                             &vpx_highbd_dc_predictor_16x16_c, 16, 8),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
                             &vpx_highbd_dc_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 8),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
                             &vpx_highbd_tm_predictor_4x4_c, 4, 8),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@ -346,6 +649,14 @@ INSTANTIATE_TEST_CASE_P(
                             &vpx_highbd_tm_predictor_16x16_c, 16, 8),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
                             &vpx_highbd_tm_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 8),
        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
                             &vpx_highbd_v_predictor_4x4_c, 4, 8),
        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
@ -358,6 +669,32 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
    SSE2_TO_C_10, VP9HighbdIntraPredTest,
    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 10),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
                             &vpx_highbd_dc_predictor_4x4_c, 4, 10),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@ -366,6 +703,14 @@ INSTANTIATE_TEST_CASE_P(
                             &vpx_highbd_dc_predictor_16x16_c, 16, 10),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
                             &vpx_highbd_dc_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 10),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
                             &vpx_highbd_tm_predictor_4x4_c, 4, 10),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@ -374,6 +719,14 @@ INSTANTIATE_TEST_CASE_P(
                             &vpx_highbd_tm_predictor_16x16_c, 16, 10),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
                             &vpx_highbd_tm_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 10),
        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
                             &vpx_highbd_v_predictor_4x4_c, 4, 10),
        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
@ -386,6 +739,32 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
    SSE2_TO_C_12, VP9HighbdIntraPredTest,
    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 12),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
                             &vpx_highbd_dc_predictor_4x4_c, 4, 12),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@ -394,6 +773,14 @@ INSTANTIATE_TEST_CASE_P(
                             &vpx_highbd_dc_predictor_16x16_c, 16, 12),
        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
                             &vpx_highbd_dc_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 12),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
                             &vpx_highbd_tm_predictor_4x4_c, 4, 12),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@ -402,6 +789,14 @@ INSTANTIATE_TEST_CASE_P(
                             &vpx_highbd_tm_predictor_16x16_c, 16, 12),
        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
                             &vpx_highbd_tm_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 12),
        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
                             &vpx_highbd_v_predictor_4x4_c, 4, 12),
        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
@ -412,5 +807,235 @@ INSTANTIATE_TEST_CASE_P(
                             &vpx_highbd_v_predictor_32x32_c, 32, 12)));
 #endif  // HAVE_SSE2

+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON_TO_C_8, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon,
+                             &vpx_highbd_dc_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon,
+                             &vpx_highbd_dc_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon,
+                             &vpx_highbd_dc_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon,
+                             &vpx_highbd_dc_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 8)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON_TO_C_10, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon,
+                             &vpx_highbd_dc_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon,
+                             &vpx_highbd_dc_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon,
+                             &vpx_highbd_dc_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon,
+                             &vpx_highbd_dc_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 10)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON_TO_C_12, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon,
+                             &vpx_highbd_dc_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon,
+                             &vpx_highbd_dc_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon,
+                             &vpx_highbd_dc_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon,
+                             &vpx_highbd_dc_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 12)));
+#endif  // HAVE_NEON
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
--- a/test/vp9_motion_vector_test.cc
+++ b/test/vp9_motion_vector_test.cc
@ -0,0 +1,97 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+#define MAX_EXTREME_MV 1
+#define MIN_EXTREME_MV 2
+
+// Encoding modes
+const libvpx_test::TestMode kEncodingModeVectors[] = {
+  ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
+  ::libvpx_test::kRealTime
+};
+
+// Encoding speeds
+const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6 };
+
+// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV.
+const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV };
+
+class MotionVectorTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
+ protected:
+  MotionVectorTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
+
+  virtual ~MotionVectorTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 3;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int mv_test_mode_;
+};
+
+TEST_P(MotionVectorTestLarge, OverallTest) {
+  cfg_.rc_target_bitrate = 24000;
+  cfg_.g_profile = 0;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  video.reset(new libvpx_test::YUVVideoSource(
+      "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160,  // 2048, 1080,
+      30, 1, 0, 5));
+
+  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+VP9_INSTANTIATE_TEST_CASE(MotionVectorTestLarge,
+                          ::testing::ValuesIn(kEncodingModeVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors),
+                          ::testing::ValuesIn(kMVTestModes));
+}  // namespace
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@ -14,9 +14,11 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@ -24,11 +26,12 @@
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/vpx_timer.h"

 using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;

 namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
 const int number_of_iterations = 100;

 typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
@ -38,307 +41,537 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
                             tran_low_t *dqcoeff, const int16_t *dequant,
                             uint16_t *eob, const int16_t *scan,
                             const int16_t *iscan);
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
+typedef ::testing::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
+                         int /*max_size*/, bool /*is_fp*/>
    QuantizeParam;

-class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+// Wrapper for FP version which does not use zbin or quant_shift.
+typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
+                               int skip_block, const int16_t *round,
+                               const int16_t *quant, tran_low_t *qcoeff,
+                               tran_low_t *dqcoeff, const int16_t *dequant,
+                               uint16_t *eob, const int16_t *scan,
+                               const int16_t *iscan);
+
+template <QuantizeFPFunc fn>
+void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
+                    const int16_t *zbin, const int16_t *round,
+                    const int16_t *quant, const int16_t *quant_shift,
+                    tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                    const int16_t *dequant, uint16_t *eob, const int16_t *scan,
+                    const int16_t *iscan) {
+  (void)zbin;
+  (void)quant_shift;
+
+  fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob,
+     scan, iscan);
+}
+
+class VP9QuantizeBase {
 public:
-  virtual ~VP9QuantizeTest() {}
-  virtual void SetUp() {
-    quantize_op_ = GET_PARAM(0);
-    ref_quantize_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
+  VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
+      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
+    max_value_ = (1 << bit_depth_) - 1;
+    zbin_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
+    round_fp_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
+    quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
+    round_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
+    quant_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
+    quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
+    dequant_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
  }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-  QuantizeFunc quantize_op_;
-  QuantizeFunc ref_quantize_op_;
-};
-
-class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
- public:
-  virtual ~VP9Quantize32Test() {}
-  virtual void SetUp() {
-    quantize_op_ = GET_PARAM(0);
-    ref_quantize_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
+  ~VP9QuantizeBase() {
+    vpx_free(zbin_ptr_);
+    vpx_free(round_fp_ptr_);
+    vpx_free(quant_fp_ptr_);
+    vpx_free(round_ptr_);
+    vpx_free(quant_ptr_);
+    vpx_free(quant_shift_ptr_);
+    vpx_free(dequant_ptr_);
+    zbin_ptr_ = NULL;
+    round_fp_ptr_ = NULL;
+    quant_fp_ptr_ = NULL;
+    round_ptr_ = NULL;
+    quant_ptr_ = NULL;
+    quant_shift_ptr_ = NULL;
+    dequant_ptr_ = NULL;
+    libvpx_test::ClearSystemState();
  }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+ protected:
+  int16_t *zbin_ptr_;
+  int16_t *round_fp_ptr_;
+  int16_t *quant_fp_ptr_;
+  int16_t *round_ptr_;
+  int16_t *quant_ptr_;
+  int16_t *quant_shift_ptr_;
+  int16_t *dequant_ptr_;
+  const vpx_bit_depth_t bit_depth_;
+  int max_value_;
+  const int max_size_;
+  const bool is_fp_;
+};
+
+class VP9QuantizeTest : public VP9QuantizeBase,
+                        public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  VP9QuantizeTest()
+      : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)),
+        quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}

 protected:
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-  QuantizeFunc quantize_op_;
-  QuantizeFunc ref_quantize_op_;
+  const QuantizeFunc quantize_op_;
+  const QuantizeFunc ref_quantize_op_;
 };

+// This quantizer compares the AC coefficients to the quantization step size to
+// determine if further multiplication operations are needed.
+// Based on vp9_quantize_fp_sse2().
+inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        int skip_block, const int16_t *round_ptr,
+                        const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan, int is_32x32) {
+  int i, eob = -1;
+  const int thr = dequant_ptr[1] >> (1 + is_32x32);
+  (void)iscan;
+  (void)skip_block;
+  assert(!skip_block);
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < n_coeffs; i += 16) {
+    int y;
+    int nzflag_cnt = 0;
+    int abs_coeff[16];
+    int coeff_sign[16];
+
+    // count nzflag for each row (16 tran_low_t)
+    for (y = 0; y < 16; ++y) {
+      const int rc = i + y;
+      const int coeff = coeff_ptr[rc];
+      coeff_sign[y] = (coeff >> 31);
+      abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y];
+      // The first 16 are skipped in the sse2 code.  Do the same here to match.
+      if (i >= 16 && (abs_coeff[y] <= thr)) {
+        nzflag_cnt++;
+      }
+    }
+
+    for (y = 0; y < 16; ++y) {
+      const int rc = i + y;
+      // If all of the AC coeffs in a row has magnitude less than the
+      // quantization step_size/2, quantize to zero.
+      if (nzflag_cnt < 16) {
+        int tmp;
+        int _round;
+
+        if (is_32x32) {
+          _round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        } else {
+          _round = round_ptr[rc != 0];
+        }
+        tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX);
+        tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32);
+        qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y];
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (is_32x32) {
+          dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+        } else {
+          dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+        }
+      } else {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+      }
+    }
+  }
+
+  // Scan for eob.
+  for (i = 0; i < n_coeffs; i++) {
+    // Use the scan order to find the correct eob.
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan) {
+  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
+              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
+}
+
+void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *round_ptr,
+                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                            uint16_t *eob_ptr, const int16_t *scan,
+                            const int16_t *iscan) {
+  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
+              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
+}
+
+void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
+                          int16_t *quant, int16_t *quant_shift,
+                          int16_t *dequant, int16_t *round_fp,
+                          int16_t *quant_fp) {
+  // Max when q == 0.  Otherwise, it is 48 for Y and 42 for U/V.
+  const int max_qrounding_factor_fp = 64;
+
+  for (int j = 0; j < 2; j++) {
+    // The range is 4 to 1828 in the VP9 tables.
+    const int qlookup = rnd->RandRange(1825) + 4;
+    round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
+    quant_fp[j] = (1 << 16) / qlookup;
+
+    // Values determined by deconstructing vp9_init_quantizer().
+    // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
+    // values or U/V values of any bit depth. This is because y_delta is not
+    // factored into the vp9_ac_quant() call.
+    zbin[j] = rnd->RandRange(1200);
+
+    // round may be up to 685 for Y values or 914 for U/V.
+    round[j] = rnd->RandRange(914);
+    // quant ranges from 1 to -32703
+    quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
+    // quant_shift goes up to 1 << 16.
+    quant_shift[j] = rnd->RandRange(16384);
+    // dequant maxes out at 1828 for all cases.
+    dequant[j] = rnd->RandRange(1828);
+  }
+  for (int j = 2; j < 8; j++) {
+    zbin[j] = zbin[1];
+    round_fp[j] = round_fp[1];
+    quant_fp[j] = quant_fp[1];
+    round[j] = round[1];
+    quant[j] = quant[1];
+    quant_shift[j] = quant_shift[1];
+    dequant[j] = dequant[1];
+  }
+}
+
 TEST_P(VP9QuantizeTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
-  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
-  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
-  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
-  int err_count_total = 0;
-  int first_failure = -1;
-  for (int i = 0; i < number_of_iterations; ++i) {
-    const int skip_block = i == 0;
-    const TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
-    const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
-    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    const int count = (4 << sz) * (4 << sz);  // 16, 64, 256
-    int err_count = 0;
-    *eob_ptr = rnd.Rand16();
-    *ref_eob_ptr = *eob_ptr;
-    for (int j = 0; j < count; j++) {
-      coeff_ptr[j] = rnd.Rand16() & mask_;
-    }
-    for (int j = 0; j < 2; j++) {
-      zbin_ptr[j] = rnd.Rand16() & mask_;
-      round_ptr[j] = rnd.Rand16();
-      quant_ptr[j] = rnd.Rand16();
-      quant_shift_ptr[j] = rnd.Rand16();
-      dequant_ptr[j] = rnd.Rand16();
-    }
-    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
-                     ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
-                     scan_order->scan, scan_order->iscan);
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
-        quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
-        scan_order->scan, scan_order->iscan));
-    for (int j = 0; j < sz; ++j) {
-      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
-                   (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
-    }
-    err_count += (*ref_eob_ptr != *eob_ptr);
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
-  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Quantization Test, C output doesn't match SSE2 output. "
-      << "First failed at test case " << first_failure;
-}
+  Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
+  ASSERT_TRUE(coeff.Init());
+  Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(qcoeff.Init());
+  Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(dqcoeff.Init());
+  Buffer<tran_low_t> ref_qcoeff =
+      Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(ref_qcoeff.Init());
+  Buffer<tran_low_t> ref_dqcoeff =
+      Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(ref_dqcoeff.Init());
+  uint16_t eob, ref_eob;

-TEST_P(VP9Quantize32Test, OperationCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
-  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
-  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
-  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
-  int err_count_total = 0;
-  int first_failure = -1;
  for (int i = 0; i < number_of_iterations; ++i) {
-    const int skip_block = i == 0;
-    const TX_SIZE sz = TX_32X32;
-    const TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    // Test skip block for the first three iterations to catch all the different
+    // sizes.
+    const int skip_block = 0;
+    TX_SIZE sz;
+    if (max_size_ == 16) {
+      sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    } else {
+      sz = TX_32X32;
+    }
+    const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    const int count = (4 << sz) * (4 << sz);  // 1024
-    int err_count = 0;
-    *eob_ptr = rnd.Rand16();
-    *ref_eob_ptr = *eob_ptr;
-    for (int j = 0; j < count; j++) {
-      coeff_ptr[j] = rnd.Rand16() & mask_;
-    }
-    for (int j = 0; j < 2; j++) {
-      zbin_ptr[j] = rnd.Rand16() & mask_;
-      round_ptr[j] = rnd.Rand16();
-      quant_ptr[j] = rnd.Rand16();
-      quant_shift_ptr[j] = rnd.Rand16();
-      dequant_ptr[j] = rnd.Rand16();
-    }
-    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
-                     ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
+    const int count = (4 << sz) * (4 << sz);
+    coeff.Set(&rnd, -max_value_, max_value_);
+    GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+                         quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                         quant_fp_ptr_);
+    int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+    int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+    ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+                     q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
                     scan_order->scan, scan_order->iscan);
+
    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
-        quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
-        scan_order->scan, scan_order->iscan));
-    for (int j = 0; j < sz; ++j) {
-      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
-                   (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+        coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
+        quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
+        dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+
+    EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
+    EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
+
+    EXPECT_EQ(eob, ref_eob);
+
+    if (HasFailure()) {
+      printf("Failure on iteration %d.\n", i);
+      qcoeff.PrintDifference(ref_qcoeff);
+      dqcoeff.PrintDifference(ref_dqcoeff);
+      return;
    }
-    err_count += (*ref_eob_ptr != *eob_ptr);
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Quantization Test, C output doesn't match SSE2 output. "
-      << "First failed at test case " << first_failure;
 }

 TEST_P(VP9QuantizeTest, EOBCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
-  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
-  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
-  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
-  int err_count_total = 0;
-  int first_failure = -1;
+  Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
+  ASSERT_TRUE(coeff.Init());
+  Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(qcoeff.Init());
+  Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(dqcoeff.Init());
+  Buffer<tran_low_t> ref_qcoeff =
+      Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(ref_qcoeff.Init());
+  Buffer<tran_low_t> ref_dqcoeff =
+      Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(ref_dqcoeff.Init());
+  uint16_t eob, ref_eob;
+
  for (int i = 0; i < number_of_iterations; ++i) {
-    int skip_block = i == 0;
-    TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
-    TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const int skip_block = 0;
+    TX_SIZE sz;
+    if (max_size_ == 16) {
+      sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    } else {
+      sz = TX_32X32;
+    }
+    const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    int count = (4 << sz) * (4 << sz);  // 16, 64, 256
-    int err_count = 0;
-    *eob_ptr = rnd.Rand16();
-    *ref_eob_ptr = *eob_ptr;
+    int count = (4 << sz) * (4 << sz);
    // Two random entries
-    for (int j = 0; j < count; j++) {
-      coeff_ptr[j] = 0;
-    }
-    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
-    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
-    for (int j = 0; j < 2; j++) {
-      zbin_ptr[j] = rnd.Rand16() & mask_;
-      round_ptr[j] = rnd.Rand16();
-      quant_ptr[j] = rnd.Rand16();
-      quant_shift_ptr[j] = rnd.Rand16();
-      dequant_ptr[j] = rnd.Rand16();
-    }
-
-    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
-                     ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
+    coeff.Set(0);
+    coeff.TopLeftPixel()[rnd(count)] =
+        static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
+    coeff.TopLeftPixel()[rnd(count)] =
+        static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
+    GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+                         quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                         quant_fp_ptr_);
+    int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+    int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+    ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+                     q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
                     scan_order->scan, scan_order->iscan);
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
-        quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
-        scan_order->scan, scan_order->iscan));

-    for (int j = 0; j < sz; ++j) {
-      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
-                   (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(
+        coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
+        quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
+        dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+
+    EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
+    EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
+
+    EXPECT_EQ(eob, ref_eob);
+
+    if (HasFailure()) {
+      printf("Failure on iteration %d.\n", i);
+      qcoeff.PrintDifference(ref_qcoeff);
+      dqcoeff.PrintDifference(ref_dqcoeff);
+      return;
    }
-    err_count += (*ref_eob_ptr != *eob_ptr);
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Quantization Test, C output doesn't match SSE2 output. "
-      << "First failed at test case " << first_failure;
 }

-TEST_P(VP9Quantize32Test, EOBCheck) {
+TEST_P(VP9QuantizeTest, DISABLED_Speed) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
-  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
-  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
-  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
-  int err_count_total = 0;
-  int first_failure = -1;
-  for (int i = 0; i < number_of_iterations; ++i) {
-    int skip_block = i == 0;
-    TX_SIZE sz = TX_32X32;
-    TX_TYPE tx_type = (TX_TYPE)(i % 4);
-    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    int count = (4 << sz) * (4 << sz);  // 1024
-    int err_count = 0;
-    *eob_ptr = rnd.Rand16();
-    *ref_eob_ptr = *eob_ptr;
-    for (int j = 0; j < count; j++) {
-      coeff_ptr[j] = 0;
-    }
-    // Two random entries
-    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
-    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
-    for (int j = 0; j < 2; j++) {
-      zbin_ptr[j] = rnd.Rand16() & mask_;
-      round_ptr[j] = rnd.Rand16();
-      quant_ptr[j] = rnd.Rand16();
-      quant_shift_ptr[j] = rnd.Rand16();
-      dequant_ptr[j] = rnd.Rand16();
-    }
+  Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
+  ASSERT_TRUE(coeff.Init());
+  Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(qcoeff.Init());
+  Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(dqcoeff.Init());
+  uint16_t eob;
+  TX_SIZE starting_sz, ending_sz;

-    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
-                     ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
-                     scan_order->scan, scan_order->iscan);
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
-        quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
-        scan_order->scan, scan_order->iscan));
-
-    for (int j = 0; j < sz; ++j) {
-      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
-                   (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
-    }
-    err_count += (*ref_eob_ptr != *eob_ptr);
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
+  if (max_size_ == 16) {
+    starting_sz = TX_4X4;
+    ending_sz = TX_16X16;
+  } else {
+    starting_sz = TX_32X32;
+    ending_sz = TX_32X32;
+  }
+
+  for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
+    // zbin > coeff, zbin < coeff.
+    for (int i = 0; i < 2; ++i) {
+      const int skip_block = 0;
+      // TX_TYPE defines the scan order. That is not relevant to the speed test.
+      // Pick the first one.
+      const TX_TYPE tx_type = DCT_DCT;
+      const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+      const int count = (4 << sz) * (4 << sz);
+
+      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                           quant_fp_ptr_);
+      int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+      int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+
+      if (i == 0) {
+        // When |coeff values| are less than zbin the results are 0.
+        int threshold = 100;
+        if (max_size_ == 32) {
+          // For 32x32, the threshold is halved. Double it to keep the values
+          // from clearing it.
+          threshold = 200;
+        }
+        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
+        coeff.Set(&rnd, -99, 99);
+      } else if (i == 1) {
+        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
+        coeff.Set(&rnd, -500, 500);
+      }
+
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      for (int j = 0; j < 100000000 / count; ++j) {
+        quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
+                     q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(),
+                     dqcoeff.TopLeftPixel(), dequant_ptr_, &eob,
+                     scan_order->scan, scan_order->iscan);
+      }
+      vpx_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+      if (i == 0) printf("Bypass calculations.\n");
+      if (i == 1) printf("Full calculations.\n");
+      printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz,
+             elapsed_time / 1000);
+    }
+    printf("\n");
  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Quantization Test, C output doesn't match SSE2 output. "
-      << "First failed at test case " << first_failure;
 }
-using std::tr1::make_tuple;
+
+using ::testing::make_tuple;

 #if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds.
+// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8),
 INSTANTIATE_TEST_CASE_P(
    SSE2, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_highbd_quantize_b_sse2,
-                                 &vpx_highbd_quantize_b_c, VPX_BITS_8),
-                      make_tuple(&vpx_highbd_quantize_b_sse2,
-                                 &vpx_highbd_quantize_b_c, VPX_BITS_10),
-                      make_tuple(&vpx_highbd_quantize_b_sse2,
-                                 &vpx_highbd_quantize_b_c, VPX_BITS_12)));
+    ::testing::Values(
+        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_8, 16, false),
+        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_10, 16, false),
+        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_12, 16, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+
+#else
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9Quantize32Test,
-    ::testing::Values(make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8),
-                      make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10),
-                      make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12)));
-#endif  // HAVE_SSE2
+    SSE2, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
+                                 &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                                 VPX_BITS_8, 32, true)));
+#else
+INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest,
+                        ::testing::Values(make_tuple(&vpx_quantize_b_ssse3,
+                                                     &vpx_quantize_b_c,
+                                                     VPX_BITS_8, 16, false)));
+#endif
+
+#if ARCH_X86_64
+// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test.
+INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest,
+                        ::testing::Values(make_tuple(
+                            &vpx_quantize_b_32x32_ssse3,
+                            &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false)));
+#endif  // ARCH_X86_64
+#endif  // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+
+// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or
+// highbitdepth configurations.
+#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    AVX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      // Even though SSSE3 and AVX do not match the reference
+                      // code, we can keep them in sync with each other.
+                      make_tuple(&vpx_quantize_b_32x32_avx,
+                                 &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32,
+                                 false)));
+#endif  // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH
+
+#if ARCH_X86_64 && HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+
+// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&vpx_quantize_b_32x32_neon,
+                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+                                 false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+                                 &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+                                 VPX_BITS_8, 32, true)));
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
+
+// Only useful to compare "Speed" test results.
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_C, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
+        make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
+                   32, false),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
+                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                   &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32,
+                   true),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+                   &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+                   true)));
 }  // namespace
--- a/test/vp9_scale_test.cc
+++ b/test/vp9_scale_test.cc
@ -0,0 +1,215 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/vpx_scale_test.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/yv12config.h"
+
+namespace libvpx_test {
+
+typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst,
+                               INTERP_FILTER filter_type, int phase_scaler);
+
+class ScaleTest : public VpxScaleBase,
+                  public ::testing::TestWithParam<ScaleFrameFunc> {
+ public:
+  virtual ~ScaleTest() {}
+
+ protected:
+  virtual void SetUp() { scale_fn_ = GetParam(); }
+
+  void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) {
+    vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler);
+  }
+
+  void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) {
+    ASM_REGISTER_STATE_CHECK(
+        scale_fn_(&img_, &dst_img_, filter_type, phase_scaler));
+  }
+
+  void RunTest(INTERP_FILTER filter_type) {
+    static const int kNumSizesToTest = 20;
+    static const int kNumScaleFactorsToTest = 4;
+    static const int kSizesToTest[] = {
+      2,  4,  6,  8,  10, 12, 14, 16, 18,  20,
+      22, 24, 26, 28, 30, 32, 34, 68, 128, 134
+    };
+    static const int kScaleFactors[] = { 1, 2, 3, 4 };
+    for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
+      for (int h = 0; h < kNumSizesToTest; ++h) {
+        const int src_height = kSizesToTest[h];
+        for (int w = 0; w < kNumSizesToTest; ++w) {
+          const int src_width = kSizesToTest[w];
+          for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
+               ++sf_up_idx) {
+            const int sf_up = kScaleFactors[sf_up_idx];
+            for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
+                 ++sf_down_idx) {
+              const int sf_down = kScaleFactors[sf_down_idx];
+              const int dst_width = src_width * sf_up / sf_down;
+              const int dst_height = src_height * sf_up / sf_down;
+              if (sf_up == sf_down && sf_up != 1) {
+                continue;
+              }
+              // I420 frame width and height must be even.
+              if (!dst_width || !dst_height || dst_width & 1 ||
+                  dst_height & 1) {
+                continue;
+              }
+              // vpx_convolve8_c() has restriction on the step which cannot
+              // exceed 64 (ratio 1 to 4).
+              if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
+                continue;
+              }
+              ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height,
+                                                       dst_width, dst_height));
+              ReferenceScaleFrame(filter_type, phase_scaler);
+              ScaleFrame(filter_type, phase_scaler);
+              if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
+                         ref_img_.frame_size)) {
+                printf(
+                    "filter_type = %d, phase_scaler = %d, src_width = %4d, "
+                    "src_height = %4d, dst_width = %4d, dst_height = %4d, "
+                    "scale factor = %d:%d\n",
+                    filter_type, phase_scaler, src_width, src_height, dst_width,
+                    dst_height, sf_down, sf_up);
+                PrintDiff();
+              }
+              CompareImages(dst_img_);
+              DeallocScaleImages();
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt,
+                          const int stride, const int width, const int height,
+                          const int plane_idx) const {
+    for (int y = 0; y < height; y++) {
+      for (int x = 0; x < width; x++) {
+        if (ref[y * stride + x] != opt[y * stride + x]) {
+          printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx,
+                 y, x, ref[y * stride + x], opt[y * stride + x]);
+          break;
+        }
+      }
+    }
+  }
+
+  void PrintDiff() const {
+    assert(ref_img_.y_stride == dst_img_.y_stride);
+    assert(ref_img_.y_width == dst_img_.y_width);
+    assert(ref_img_.y_height == dst_img_.y_height);
+    assert(ref_img_.uv_stride == dst_img_.uv_stride);
+    assert(ref_img_.uv_width == dst_img_.uv_width);
+    assert(ref_img_.uv_height == dst_img_.uv_height);
+
+    if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
+               ref_img_.frame_size)) {
+      PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer,
+                         ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height,
+                         0);
+      PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer,
+                         ref_img_.uv_stride, ref_img_.uv_width,
+                         ref_img_.uv_height, 1);
+      PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer,
+                         ref_img_.uv_stride, ref_img_.uv_width,
+                         ref_img_.uv_height, 2);
+    }
+  }
+
+  ScaleFrameFunc scale_fn_;
+};
+
+TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); }
+TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); }
+TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); }
+TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); }
+
+TEST_P(ScaleTest, DISABLED_Speed) {
+  static const int kCountSpeedTestBlock = 100;
+  static const int kNumScaleFactorsToTest = 4;
+  static const int kScaleFactors[] = { 1, 2, 3, 4 };
+  const int src_width = 1280;
+  const int src_height = 720;
+  for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) {
+    for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) {
+      for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) {
+        const int sf_up = kScaleFactors[sf_up_idx];
+        for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
+             ++sf_down_idx) {
+          const int sf_down = kScaleFactors[sf_down_idx];
+          const int dst_width = src_width * sf_up / sf_down;
+          const int dst_height = src_height * sf_up / sf_down;
+          if (sf_up == sf_down && sf_up != 1) {
+            continue;
+          }
+          // I420 frame width and height must be even.
+          if (dst_width & 1 || dst_height & 1) {
+            continue;
+          }
+          ASSERT_NO_FATAL_FAILURE(
+              ResetScaleImages(src_width, src_height, dst_width, dst_height));
+          ASM_REGISTER_STATE_CHECK(
+              ReferenceScaleFrame(filter_type, phase_scaler));
+
+          vpx_usec_timer timer;
+          vpx_usec_timer_start(&timer);
+          for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+            ScaleFrame(filter_type, phase_scaler);
+          }
+          libvpx_test::ClearSystemState();
+          vpx_usec_timer_mark(&timer);
+          const int elapsed_time =
+              static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+          CompareImages(dst_img_);
+          DeallocScaleImages();
+
+          printf(
+              "filter_type = %d, phase_scaler = %d, src_width = %4d, "
+              "src_height = %4d, dst_width = %4d, dst_height = %4d, "
+              "scale factor = %d:%d, scale time: %5d ms\n",
+              filter_type, phase_scaler, src_width, src_height, dst_width,
+              dst_height, sf_down, sf_up, elapsed_time);
+        }
+      }
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, ScaleTest,
+                        ::testing::Values(vp9_scale_and_extend_frame_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, ScaleTest,
+                        ::testing::Values(vp9_scale_and_extend_frame_ssse3));
+#endif  // HAVE_SSSE3
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, ScaleTest,
+                        ::testing::Values(vp9_scale_and_extend_frame_neon));
+#endif  // HAVE_NEON
+
+}  // namespace libvpx_test
--- a/Show More
+++ b/Show More