Merge "Remove tile param"

Merge "mips msa vp9 temporal filter optimization"
Merge "vpxenc.sh: Add basic multithreaded frame parallel encode test."
2015-06-29 18:04:56 +00:00 · 2015-06-27 01:29:04 +00:00 · 2015-06-26 16:12:34 +00:00 · 2015-06-26 12:00:24 +05:30 · 2015-06-26 09:23:56 +05:30 · 2015-06-26 03:42:31 +00:00
415 changed files with 32861 additions and 26977 deletions
--- a/.mailmap
+++ b/.mailmap
@@ -1,18 +1,26 @@
 Adrian Grange <agrange@google.com>
+Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Jim Bankoski <jimbankoski@google.com>
-John Koleszar <jkoleszar@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
 Pascal Massimino <pascal.massimino@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Sami Pietilä <samipietila@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
-Ralph Giles <giles@xiph.org> <giles@entropywave.com>
-Ralph Giles <giles@xiph.org> <giles@mozilla.com>
-Alpha Lam <hclam@google.com> <hclam@chromium.org>
-Deb Mukherjee <debargha@google.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
--- a/29
+++ b/29
@@ -3,10 +3,11 @@

 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
 Ahmad Sharif <asharif@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <alex.converse@gmail.com>
+Alex Converse <aconverse@google.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@@ -14,44 +15,58 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
 Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
 changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
 chm <chm@rock-chips.com>
 Christian Duvivier <cduvivier@google.com>
 Daniel Kang <ddkang@google.com>
 Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
 Dragan Mrdjan <dmrdjan@mips.com>
-Erik Niemeyer <erik.a.niemeyer@gmail.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
 Guillaume Martres <gmartres@google.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
 Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
 Henrik Lundin <hlundin@google.com>
 Hui Su <huisu@google.com>
 Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+JackyChen <jackychen@google.com>
 James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
 James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Janne Salonen <jsalonen@google.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
 Jim Bankoski <jimbankoski@google.com>
 Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
+John Stark <jhnstrk@gmail.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
@@ -65,6 +80,7 @@ Michael Kohler <michaelkohler@live.com>
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
 Morton Jonuschat <yabawock@gmail.com>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
@@ -72,6 +88,8 @@ Patrik Westin <patrik.westin@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
@@ -79,22 +97,29 @@ Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
 Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rbultje@google.com>
+Rui Ueyama <ruiu@google.com>
 Sami Pietilä <samipietila@google.com>
 Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
+Tim Kopp <tkopp@google.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
 Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
 Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
 Google Inc.
 The Mozilla Foundation
 The Xiph.Org Foundation
--- a/28
+++ b/28
@@ -1,3 +1,31 @@
+xxxx-yy-zz v1.4.0 "Changes for next release"
+  vpxenc is changed to use VP9 by default.
+  Encoder controls added for 1 pass SVC.
+  Decoder control to toggle on/off loopfilter.
+
+2015-04-03 v1.4.0 "Indian Runner Duck"
+  This release includes significant improvements to the VP9 codec.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.3.0. It drops the compatibility
+    layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+    controls for VP9.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Multithreaded VP9 decoding (tile and frame-based)
+    Multithreaded VP9 encoding - on by default
+    YUV 4:2:2 and 4:4:4 support in VP9
+    10 and 12bit support in VP9
+    64bit ARM support by replacing ARM assembly with intrinsics
+
+  - Bug Fixes:
+    Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
+    files.
+
+  - Known Issues:
+    Frame Parallel decoding fails for segmented and non-420 files.
+
 2013-11-15 v1.3.0 "Forest"
  This release introduces the VP9 codec in a backward-compatible way.
  All existing users of VP8 can continue to use the library without
--- a/2
+++ b/2
@@ -17,7 +17,7 @@ or agree to the institution of patent litigation or any other patent
 enforcement activity against any entity (including a cross-claim or
 counterclaim in a lawsuit) alleging that any of these implementations of WebM
 or any code incorporated within any of these implementations of WebM
-constitutes direct or contributory patent infringement, or inducement of
+constitute direct or contributory patent infringement, or inducement of
 patent infringement, then any patent rights granted to you under this License
 for these implementations of WebM shall terminate as of the date such
 litigation is filed.
--- a/16
+++ b/16
@@ -1,4 +1,4 @@
-README - 30 May 2014
+README - 23 March 2015

 Welcome to the WebM VP8/VP9 Codec SDK!

@@ -62,12 +62,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv7s-darwin-gcc
    mips32-linux-gcc
    mips64-linux-gcc
-    ppc32-darwin8-gcc
-    ppc32-darwin9-gcc
-    ppc32-linux-gcc
-    ppc64-darwin8-gcc
-    ppc64-darwin9-gcc
-    ppc64-linux-gcc
    sparc-solaris-gcc
    x86-android-gcc
    x86-darwin8-gcc
@@ -78,6 +72,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86-darwin11-gcc
    x86-darwin12-gcc
    x86-darwin13-gcc
+    x86-darwin14-gcc
    x86-iphonesimulator-gcc
    x86-linux-gcc
    x86-linux-icc
@@ -95,6 +90,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86_64-darwin11-gcc
    x86_64-darwin12-gcc
    x86_64-darwin13-gcc
+    x86_64-darwin14-gcc
    x86_64-iphonesimulator-gcc
    x86_64-linux-gcc
    x86_64-linux-icc
@@ -105,12 +101,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86_64-win64-vs10
    x86_64-win64-vs11
    x86_64-win64-vs12
-    universal-darwin8-gcc
-    universal-darwin9-gcc
-    universal-darwin10-gcc
-    universal-darwin11-gcc
-    universal-darwin12-gcc
-    universal-darwin13-gcc
    generic-gnu

  The generic-gnu target, in conjunction with the CROSS environment variable,
--- a/args.c
+++ b/args.c
@@ -14,9 +14,7 @@
 #include <limits.h>
 #include "args.h"

-#ifdef _MSC_VER
-#define snprintf _snprintf
-#endif
+#include "vpx_ports/msvc.h"

 #if defined(__GNUC__) && __GNUC__
 extern void die(const char *fmt, ...) __attribute__((noreturn));
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -158,13 +158,12 @@ LOCAL_CFLAGS += \

 LOCAL_MODULE := libvpx

-LOCAL_LDLIBS := -llog
-
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
  LOCAL_STATIC_LIBRARIES := cpufeatures
 endif

 # Add a dependency to force generation of the RTCD files.
+define rtcd_dep_template
 ifeq ($(CONFIG_VP8), yes)
 $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp8_rtcd.h
 endif
@@ -172,10 +171,14 @@ ifeq ($(CONFIG_VP9), yes)
 $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp9_rtcd.h
 endif
 $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_scale_rtcd.h
+$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_dsp_rtcd.h

 ifeq ($(TARGET_ARCH_ABI),x86)
 $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_config.asm
 endif
+endef
+
+$(eval $(call rtcd_dep_template))

 .PHONY: clean
 clean:
@@ -184,7 +187,11 @@ clean:
 	@$(RM) -r $(ASM_CNV_PATH)
 	@$(RM) $(CLEAN-OBJS)

-include $(BUILD_SHARED_LIBRARY)
+ifeq ($(ENABLE_SHARED),1)
+  include $(BUILD_SHARED_LIBRARY)
+else
+  include $(BUILD_STATIC_LIBRARY)
+endif

 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
 $(call import-module,cpufeatures)
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -22,8 +22,10 @@ clean:: .DEFAULT
 exampletest: .DEFAULT
 install:: .DEFAULT
 test:: .DEFAULT
+test-no-data-check:: .DEFAULT
 testdata:: .DEFAULT
 utiltest: .DEFAULT
+exampletest-no-data-check utiltest-no-data-check: .DEFAULT


 # Note: md5sum is not installed on OS X, but openssl is. Openssl may not be
@@ -56,13 +58,10 @@ dist:
        fi
 endif

+# Since we invoke make recursively for multiple targets we need to include the
+# .mk file for the correct target, but only when $(target) is non-empty.
 ifneq ($(target),)
-# Normally, we want to build the filename from the target and the toolchain.
-# This disambiguates from the $(target).mk file that exists in the source tree.
-# However, the toolchain is part of the target in universal builds, so we
-# don't want to include TOOLCHAIN in that case. FAT_ARCHS is used to test
-# if we're in the universal case.
-include $(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk
+include $(target)-$(TOOLCHAIN).mk
 endif
 BUILD_ROOT?=.
 VPATH=$(SRC_PATH_BARE)
@@ -116,6 +115,9 @@ test::
 testdata::
 .PHONY: utiltest
 utiltest:
+.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
+test-no-data-check::
+exampletest-no-data-check utiltest-no-data-check:

 # Add compiler flags for intrinsic files
 ifeq ($(TOOLCHAIN), x86-os2-gcc)
@@ -313,18 +315,15 @@ $(1):
        $$(filter %.o,$$^) $$(extralibs)
 endef

-
-
-define lipo_lib_template
-$(1): $(addsuffix /$(1),$(FAT_ARCHS))
-	$(if $(quiet),@echo "    [LIPO] $$@")
-	$(qexec)libtool -static -o $$@ $$?
-endef
-
-define lipo_bin_template
-$(1): $(addsuffix /$(1),$(FAT_ARCHS))
-	$(if $(quiet),@echo "    [LIPO] $$@")
-	$(qexec)lipo -output $$@ -create $$?
+define dll_template
+# Not using a pattern rule here because we don't want to generate empty
+# archives when they are listed as a dependency in files not responsible
+# for creating them.
+$(1):
+	$(if $(quiet),@echo "    [LD] $$@")
+	$(qexec)$$(LD) -Zdll $$(LDFLAGS) \
+        -o $$@ \
+        $$(filter %.o,$$^) $$(extralibs) $$(EXPORTS_FILE)
 endef


@@ -383,8 +382,9 @@ LIBS=$(call enabled,LIBS)
 .libs: $(LIBS)
 	@touch $@
 $(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib))))
-$(foreach lib,$(filter %so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
-$(foreach lib,$(filter %$(VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dll,$(LIBS)),$(eval $(call dll_template,$(lib))))

 INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS)
 ifeq ($(MAKECMDGOALS),dist)
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -390,7 +390,7 @@ write_common_config_banner() {
 write_common_config_targets() {
  for t in ${all_targets}; do
    if enabled ${t}; then
-      if enabled universal || enabled child; then
+      if enabled child; then
        fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
      else
        fwrite config.mk "ALL_TARGETS += ${t}"
@@ -640,12 +640,6 @@ process_common_toolchain() {
      *i[3456]86*)
        tgt_isa=x86
        ;;
-      *powerpc64*)
-        tgt_isa=ppc64
-        ;;
-      *powerpc*)
-        tgt_isa=ppc32
-        ;;
      *sparc*)
        tgt_isa=sparc
        ;;
@@ -653,14 +647,6 @@ process_common_toolchain() {

    # detect tgt_os
    case "$gcctarget" in
-      *darwin8*)
-        tgt_isa=universal
-        tgt_os=darwin8
-        ;;
-      *darwin9*)
-        tgt_isa=universal
-        tgt_os=darwin9
-        ;;
      *darwin10*)
        tgt_isa=x86_64
        tgt_os=darwin10
@@ -742,6 +728,13 @@ process_common_toolchain() {
  # Handle darwin variants. Newer SDKs allow targeting older
  # platforms, so use the newest one available.
  case ${toolchain} in
+    arm*-darwin*)
+      ios_sdk_dir="$(show_darwin_sdk_path iphoneos)"
+      if [ -d "${ios_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${ios_sdk_dir}"
+        add_ldflags "-isysroot ${ios_sdk_dir}"
+      fi
+      ;;
    *-darwin*)
      osx_sdk_dir="$(show_darwin_sdk_path macosx)"
      if [ -d "${osx_sdk_dir}" ]; then
@@ -795,7 +788,6 @@ process_common_toolchain() {
  case ${toolchain} in
    sparc-solaris-*)
      add_extralibs -lposix4
-      disable_feature fast_unaligned
      ;;
    *-solaris-*)
      add_extralibs -lposix4
@@ -818,12 +810,17 @@ process_common_toolchain() {
          if disabled neon && enabled neon_asm; then
            die "Disabling neon while keeping neon-asm is not supported"
          fi
+          case ${toolchain} in
+            *-darwin*)
+              # Neon is guaranteed on iOS 6+ devices, while old media extensions
+              # no longer assemble with iOS 9 SDK
+              ;;
+            *)
              soft_enable media
-          soft_enable fast_unaligned
+          esac
          ;;
        armv6)
          soft_enable media
-          soft_enable fast_unaligned
          ;;
      esac

@@ -1039,30 +1036,32 @@ EOF
      tune_cflags="-mtune="
      if enabled dspr2; then
        check_add_cflags -mips32r2 -mdspr2
-        disable_feature fast_unaligned
+      fi
+
+      if enabled runtime_cpu_detect; then
+        disable_feature runtime_cpu_detect
      fi

      if [ -n "${tune_cpu}" ]; then
        case ${tune_cpu} in
          p5600)
-            add_cflags -mips32r5 -funroll-loops -mload-store-pairs
-            add_cflags -msched-weight -mhard-float
-            add_asflags -mips32r5 -mhard-float
+            check_add_cflags -mips32r5 -funroll-loops -mload-store-pairs
+            check_add_cflags -msched-weight -mhard-float -mfp64
+            check_add_asflags -mips32r5 -mhard-float -mfp64
+            check_add_ldflags -mfp64
            ;;
          i6400)
-            add_cflags -mips64r6 -mabi=64 -funroll-loops -mload-store-pairs
-            add_cflags -msched-weight -mhard-float
-            add_asflags -mips64r6 -mabi=64 -mhard-float
-            add_ldflags -mips64r6 -mabi=64
+            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight 
+            check_add_cflags  -mload-store-pairs -mhard-float -mfp64
+            check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
+            check_add_ldflags -mips64r6 -mabi=64 -mfp64
            ;;
        esac

        if enabled msa; then
-          add_cflags -mmsa -mfp64 -flax-vector-conversions
-          add_asflags -mmsa -mfp64 -flax-vector-conversions
-          add_ldflags -mmsa -mfp64 -flax-vector-conversions
-
-          disable_feature fast_unaligned
+          add_cflags -mmsa
+          add_asflags -mmsa
+          add_ldflags -mmsa
        fi
      fi

@@ -1070,29 +1069,6 @@ EOF
      check_add_asflags -march=${tgt_isa}
      check_add_asflags -KPIC
      ;;
-    ppc*)
-      enable_feature ppc
-      bits=${tgt_isa##ppc}
-      link_with_cc=gcc
-      setup_gnu_toolchain
-      add_asflags -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
-      soft_enable altivec
-      enabled altivec && add_cflags -maltivec
-
-      case "$tgt_os" in
-        linux*)
-          add_asflags -maltivec -mregnames -I"\$(dir \$<)linux"
-          ;;
-        darwin*)
-          darwin_arch="-arch ppc"
-          enabled ppc64 && darwin_arch="${darwin_arch}64"
-          add_cflags  ${darwin_arch} -m${bits} -fasm-blocks
-          add_asflags ${darwin_arch} -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
-          add_ldflags ${darwin_arch} -m${bits}
-          enabled altivec && add_cflags -faltivec
-          ;;
-      esac
-      ;;
    x86*)
      case  ${tgt_os} in
        win*)
@@ -1245,7 +1221,7 @@ EOF
          ;;
      esac
      ;;
-    universal*|*-gcc|generic-gnu)
+    *-gcc|generic-gnu)
      link_with_cc=gcc
      enable_feature gcc
      setup_gnu_toolchain
@@ -1329,11 +1305,15 @@ EOF
  # only for MIPS platforms
  case ${toolchain} in
    mips*)
-      if enabled dspr2; then
      if enabled big_endian; then
+        if enabled dspr2; then
          echo "dspr2 optimizations are available only for little endian platforms"
          disable_feature dspr2
        fi
+        if enabled msa; then
+          echo "msa optimizations are available only for little endian platforms"
+          disable_feature msa
+        fi
      fi
      ;;
  esac
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -263,8 +263,8 @@ case "$target" in
    ;;
    arm*)
        platforms[0]="ARM"
-        asm_Debug_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;"
    ;;
    *) die "Unsupported target $target!"
    ;;
--- a/114
+++ b/114
@@ -31,8 +31,6 @@ Advanced options:
  --size-limit=WxH                max size to allow in the decoder
  --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
  --sdk-path=PATH                 path to root of sdk (android builds only)
-  ${toggle_fast_unaligned}        don't use unaligned accesses, even when
-                                  supported by hardware [auto]
  ${toggle_codec_srcs}            in/exclude codec library source code
  ${toggle_debug_libs}            in/exclude debug version of libraries
  ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
@@ -40,7 +38,6 @@ Advanced options:
  ${toggle_vp8}                   VP8 codec support
  ${toggle_vp9}                   VP9 codec support
  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
-  ${toggle_mem_tracker}           track memory usage
  ${toggle_postproc}              postprocessing
  ${toggle_vp9_postproc}          vp9 specific postprocessing
  ${toggle_multithread}           multithreaded encoding and decoding
@@ -112,12 +109,6 @@ all_platforms="${all_platforms} armv7-win32-vs12"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} mips64-linux-gcc"
-all_platforms="${all_platforms} ppc32-darwin8-gcc"
-all_platforms="${all_platforms} ppc32-darwin9-gcc"
-all_platforms="${all_platforms} ppc32-linux-gcc"
-all_platforms="${all_platforms} ppc64-darwin8-gcc"
-all_platforms="${all_platforms} ppc64-darwin9-gcc"
-all_platforms="${all_platforms} ppc64-linux-gcc"
 all_platforms="${all_platforms} sparc-solaris-gcc"
 all_platforms="${all_platforms} x86-android-gcc"
 all_platforms="${all_platforms} x86-darwin8-gcc"
@@ -157,13 +148,6 @@ all_platforms="${all_platforms} x86_64-win64-vs9"
 all_platforms="${all_platforms} x86_64-win64-vs10"
 all_platforms="${all_platforms} x86_64-win64-vs11"
 all_platforms="${all_platforms} x86_64-win64-vs12"
-all_platforms="${all_platforms} universal-darwin8-gcc"
-all_platforms="${all_platforms} universal-darwin9-gcc"
-all_platforms="${all_platforms} universal-darwin10-gcc"
-all_platforms="${all_platforms} universal-darwin11-gcc"
-all_platforms="${all_platforms} universal-darwin12-gcc"
-all_platforms="${all_platforms} universal-darwin13-gcc"
-all_platforms="${all_platforms} universal-darwin14-gcc"
 all_platforms="${all_platforms} generic-gnu"

 # all_targets is a list of all targets that can be configured
@@ -200,6 +184,10 @@ if [ ${doxy_major:-0} -ge 1 ]; then
    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
 fi

+# disable codecs when their source directory does not exist
+[ -d "${source_path}/vp8" ] || disable_feature vp8
+[ -d "${source_path}/vp9" ] || disable_feature vp9
+
 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
@@ -210,45 +198,27 @@ enable_feature install_libs
 enable_feature static
 enable_feature optimizations
 enable_feature dependency_tracking
-enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
 enable_feature spatial_resampling
 enable_feature multithread
 enable_feature os_support
 enable_feature temporal_denoising

-[ -d "${source_path}/../include" ] && enable_feature alt_tree_layout
-for d in vp8 vp9; do
-    [ -d "${source_path}/${d}" ] && disable_feature alt_tree_layout;
-done
-
-if ! enabled alt_tree_layout; then
-# development environment
-[ -d "${source_path}/vp8" ] && CODECS="${CODECS} vp8_encoder vp8_decoder"
-[ -d "${source_path}/vp9" ] && CODECS="${CODECS} vp9_encoder vp9_decoder"
-else
-# customer environment
-[ -f "${source_path}/../include/vpx/vp8cx.h" ] && CODECS="${CODECS} vp8_encoder"
-[ -f "${source_path}/../include/vpx/vp8dx.h" ] && CODECS="${CODECS} vp8_decoder"
-[ -f "${source_path}/../include/vpx/vp9cx.h" ] && CODECS="${CODECS} vp9_encoder"
-[ -f "${source_path}/../include/vpx/vp9dx.h" ] && CODECS="${CODECS} vp9_decoder"
-[ -f "${source_path}/../include/vpx/vp8cx.h" ] || disable_feature vp8_encoder
-[ -f "${source_path}/../include/vpx/vp8dx.h" ] || disable_feature vp8_decoder
-[ -f "${source_path}/../include/vpx/vp9cx.h" ] || disable_feature vp9_encoder
-[ -f "${source_path}/../include/vpx/vp9dx.h" ] || disable_feature vp9_decoder
-
-[ -f "${source_path}/../lib/*/*mt.lib" ] && soft_enable static_msvcrt
-fi
-
-CODECS="$(echo ${CODECS} | tr ' ' '\n')"
-CODEC_FAMILIES="$(for c in ${CODECS}; do echo ${c%_*}; done | sort | uniq)"
+CODECS="
+    vp8_encoder
+    vp8_decoder
+    vp9_encoder
+    vp9_decoder
+"
+CODEC_FAMILIES="
+    vp8
+    vp9
+"

 ARCH_LIST="
    arm
    mips
    x86
    x86_64
-    ppc32
-    ppc64
 "
 ARCH_EXT_LIST="
    edsp
@@ -269,14 +239,11 @@ ARCH_EXT_LIST="
    sse4_1
    avx
    avx2
-
-    altivec
 "
 HAVE_LIST="
    ${ARCH_EXT_LIST}
    vpx_ports
    stdint_h
-    alt_tree_layout
    pthread_h
    sys_mman_h
    unistd_h
@@ -305,10 +272,6 @@ CONFIG_LIST="

    codec_srcs
    debug_libs
-    fast_unaligned
-    mem_manager
-    mem_tracker
-    mem_checks

    dequant_tokens
    dc_recon
@@ -370,7 +333,6 @@ CMDLINE_SELECT="
    libc
    as
    size_limit
-    fast_unaligned
    codec_srcs
    debug_libs

@@ -383,7 +345,6 @@ CMDLINE_SELECT="
    ${CODECS}
    ${CODEC_FAMILIES}
    static_msvcrt
-    mem_tracker
    spatial_resampling
    realtime_only
    onthefly_bitpacking
@@ -456,22 +417,8 @@ post_process_cmdline() {

 process_targets() {
    enabled child || write_common_config_banner
-    enabled universal || write_common_target_config_h  ${BUILD_PFX}vpx_config.h
-
-    # For fat binaries, call configure recursively to configure for each
-    # binary architecture to be included.
-    if enabled universal; then
-        # Call configure (ourselves) for each subarchitecture
-        for arch in $fat_bin_archs; do
-            BUILD_PFX=${arch}/ toolchain=${arch} $self --child $cmdline_args || exit $?
-        done
-    fi
-
-    # The write_common_config (config.mk) logic is deferred until after the
-    # recursive calls to configure complete, because we want our universal
-    # targets to be executed last.
+    write_common_target_config_h ${BUILD_PFX}vpx_config.h
    write_common_config_targets
-    enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk

    # Calculate the default distribution name, based on the enabled features
    cf=""
@@ -547,11 +494,11 @@ process_detect() {
        # Can only build shared libs on a subset of platforms. Doing this check
        # here rather than at option parse time because the target auto-detect
        # magic happens after the command line has been parsed.
-        if ! enabled linux; then
+        if ! enabled linux && ! enabled os2; then
            if enabled gnu; then
                echo "--enable-shared is only supported on ELF; assuming this is OK"
            else
-                die "--enable-shared only supported on ELF for now"
+                die "--enable-shared only supported on ELF and OS/2 for now"
            fi
        fi
    fi
@@ -616,30 +563,6 @@ EOF
 process_toolchain() {
    process_common_toolchain

-    # Handle universal binaries for this architecture
-    case $toolchain in
-        universal-darwin*)
-            darwin_ver=${tgt_os##darwin}
-
-            # Snow Leopard (10.6/darwin10) dropped support for PPC
-            # Include PPC support for all prior versions
-            if [ $darwin_ver -lt 10 ]; then
-                fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc"
-            fi
-
-            # Tiger (10.4/darwin8) brought support for x86
-            if [ $darwin_ver -ge 8 ]; then
-                fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}"
-            fi
-
-            # Leopard (10.5/darwin9) brought 64 bit support
-            if [ $darwin_ver -ge 9 ]; then
-                fat_bin_archs="$fat_bin_archs x86_64-${tgt_os}-${tgt_cc}"
-            fi
-            ;;
-    esac
-
-
    # Enable some useful compiler flags
    if enabled gcc; then
        enabled werror && check_add_cflags -Werror
@@ -727,7 +650,7 @@ process_toolchain() {
    esac

    # Other toolchain specific defaults
-    case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+    case $toolchain in x86*) soft_enable postproc;; esac

    if enabled postproc_visualizer; then
        enabled postproc || die "postproc_visualizer requires postproc to be enabled"
@@ -791,6 +714,7 @@ CONFIGURE_ARGS="$@"
 process "$@"
 print_webm_license ${BUILD_PFX}vpx_config.c "/*" " */"
 cat <<EOF >> ${BUILD_PFX}vpx_config.c
+#include "vpx/vpx_codec.h"
 static const char* const cfg = "$CONFIGURE_ARGS";
 const char *vpx_codec_build_config(void) {return cfg;}
 EOF
--- a/examples.mk
+++ b/examples.mk
@@ -56,6 +56,7 @@ UTILS-$(CONFIG_DECODERS)    += vpxdec.c
 vpxdec.SRCS                 += md5_utils.c md5_utils.h
 vpxdec.SRCS                 += vpx_ports/mem_ops.h
 vpxdec.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxdec.SRCS                 += vpx_ports/msvc.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h
@@ -80,6 +81,7 @@ vpxenc.SRCS                 += tools_common.c tools_common.h
 vpxenc.SRCS                 += warnings.c warnings.h
 vpxenc.SRCS                 += vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxenc.SRCS                 += vpx_ports/msvc.h
 vpxenc.SRCS                 += vpx_ports/vpx_timer.h
 vpxenc.SRCS                 += vpxstats.c vpxstats.h
 ifeq ($(CONFIG_LIBYUV),yes)
@@ -98,6 +100,7 @@ ifeq ($(CONFIG_SPATIAL_SVC),yes)
  vp9_spatial_svc_encoder.SRCS        += tools_common.c tools_common.h
  vp9_spatial_svc_encoder.SRCS        += video_common.h
  vp9_spatial_svc_encoder.SRCS        += video_writer.h video_writer.c
+  vp9_spatial_svc_encoder.SRCS        += vpx_ports/msvc.h
  vp9_spatial_svc_encoder.SRCS        += vpxstats.c vpxstats.h
  vp9_spatial_svc_encoder.GUID        = 4A38598D-627D-4505-9C7B-D4020C84100D
  vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
@@ -112,6 +115,7 @@ vpx_temporal_svc_encoder.SRCS        += ivfenc.c ivfenc.h
 vpx_temporal_svc_encoder.SRCS        += tools_common.c tools_common.h
 vpx_temporal_svc_encoder.SRCS        += video_common.h
 vpx_temporal_svc_encoder.SRCS        += video_writer.h video_writer.c
+vpx_temporal_svc_encoder.SRCS        += vpx_ports/msvc.h
 vpx_temporal_svc_encoder.GUID        = B18C08F2-A439-4502-A78E-849BE3D60947
 vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
 EXAMPLES-$(CONFIG_DECODERS)        += simple_decoder.c
@@ -122,6 +126,7 @@ simple_decoder.SRCS                += video_common.h
 simple_decoder.SRCS                += video_reader.h video_reader.c
 simple_decoder.SRCS                += vpx_ports/mem_ops.h
 simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
+simple_decoder.SRCS                += vpx_ports/msvc.h
 simple_decoder.DESCRIPTION          = Simplified decoder loop
 EXAMPLES-$(CONFIG_DECODERS)        += postproc.c
 postproc.SRCS                      += ivfdec.h ivfdec.c
@@ -130,6 +135,7 @@ postproc.SRCS                      += video_common.h
 postproc.SRCS                      += video_reader.h video_reader.c
 postproc.SRCS                      += vpx_ports/mem_ops.h
 postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
+postproc.SRCS                      += vpx_ports/msvc.h
 postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION                = Decoder postprocessor control
 EXAMPLES-$(CONFIG_DECODERS)        += decode_to_md5.c
@@ -140,6 +146,7 @@ decode_to_md5.SRCS                 += video_common.h
 decode_to_md5.SRCS                 += video_reader.h video_reader.c
 decode_to_md5.SRCS                 += vpx_ports/mem_ops.h
 decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
+decode_to_md5.SRCS                 += vpx_ports/msvc.h
 decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
 decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
 EXAMPLES-$(CONFIG_ENCODERS)     += simple_encoder.c
@@ -147,6 +154,7 @@ simple_encoder.SRCS             += ivfenc.h ivfenc.c
 simple_encoder.SRCS             += tools_common.h tools_common.c
 simple_encoder.SRCS             += video_common.h
 simple_encoder.SRCS             += video_writer.h video_writer.c
+simple_encoder.SRCS             += vpx_ports/msvc.h
 simple_encoder.GUID              = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
 simple_encoder.DESCRIPTION       = Simplified encoder loop
 EXAMPLES-$(CONFIG_VP9_ENCODER)  += vp9_lossless_encoder.c
@@ -154,6 +162,7 @@ vp9_lossless_encoder.SRCS       += ivfenc.h ivfenc.c
 vp9_lossless_encoder.SRCS       += tools_common.h tools_common.c
 vp9_lossless_encoder.SRCS       += video_common.h
 vp9_lossless_encoder.SRCS       += video_writer.h video_writer.c
+vp9_lossless_encoder.SRCS       += vpx_ports/msvc.h
 vp9_lossless_encoder.GUID        = B63C7C88-5348-46DC-A5A6-CC151EF93366
 vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
 EXAMPLES-$(CONFIG_ENCODERS)     += twopass_encoder.c
@@ -161,6 +170,7 @@ twopass_encoder.SRCS            += ivfenc.h ivfenc.c
 twopass_encoder.SRCS            += tools_common.h tools_common.c
 twopass_encoder.SRCS            += video_common.h
 twopass_encoder.SRCS            += video_writer.h video_writer.c
+twopass_encoder.SRCS            += vpx_ports/msvc.h
 twopass_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
 twopass_encoder.DESCRIPTION      = Two-pass encoder loop
 EXAMPLES-$(CONFIG_DECODERS)     += decode_with_drops.c
@@ -170,6 +180,7 @@ decode_with_drops.SRCS          += video_common.h
 decode_with_drops.SRCS          += video_reader.h video_reader.c
 decode_with_drops.SRCS          += vpx_ports/mem_ops.h
 decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
+decode_with_drops.SRCS          += vpx_ports/msvc.h
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
 EXAMPLES-$(CONFIG_ENCODERS)        += set_maps.c
@@ -177,6 +188,7 @@ set_maps.SRCS                      += ivfenc.h ivfenc.c
 set_maps.SRCS                      += tools_common.h tools_common.c
 set_maps.SRCS                      += video_common.h
 set_maps.SRCS                      += video_writer.h video_writer.c
+set_maps.SRCS                      += vpx_ports/msvc.h
 set_maps.GUID                       = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 set_maps.DESCRIPTION                = Set active and ROI maps
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8cx_set_ref.c
@@ -184,6 +196,7 @@ vp8cx_set_ref.SRCS                 += ivfenc.h ivfenc.c
 vp8cx_set_ref.SRCS                 += tools_common.h tools_common.c
 vp8cx_set_ref.SRCS                 += video_common.h
 vp8cx_set_ref.SRCS                 += video_writer.h video_writer.c
+vp8cx_set_ref.SRCS                 += vpx_ports/msvc.h
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame

@@ -194,6 +207,7 @@ EXAMPLES-$(CONFIG_VP8_ENCODER)          += vp8_multi_resolution_encoder.c
 vp8_multi_resolution_encoder.SRCS       += ivfenc.h ivfenc.c
 vp8_multi_resolution_encoder.SRCS       += tools_common.h tools_common.c
 vp8_multi_resolution_encoder.SRCS       += video_writer.h video_writer.c
+vp8_multi_resolution_encoder.SRCS       += vpx_ports/msvc.h
 vp8_multi_resolution_encoder.SRCS       += $(LIBYUV_SRCS)
 vp8_multi_resolution_encoder.GUID        = 04f8738e-63c8-423b-90fa-7c2703a374de
 vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
@@ -254,14 +268,6 @@ CODEC_EXTRA_LIBS=$(sort $(call enabled,CODEC_EXTRA_LIBS))
 $(foreach ex,$(ALL_EXAMPLES),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) examples.mk))


-# If this is a universal (fat) binary, then all the subarchitectures have
-# already been built and our job is to stitch them together. The
-# BUILD_OBJS variable indicates whether we should be building
-# (compiling, linking) the library. The LIPO_OBJS variable indicates
-# that we're stitching.
-$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_OBJS,BUILD_OBJS):=yes)
-
-
 # Create build/install dependencies for all examples. The common case
 # is handled here. The MSVS case is handled below.
 NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
@@ -269,24 +275,28 @@ DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(ALL_EXAMPLES:.c=$(EXE_SFX)))
 INSTALL-BINS-$(NOT_MSVS)   += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX)))
 DIST-SRCS-yes              += $(ALL_SRCS)
 INSTALL-SRCS-yes           += $(UTIL_SRCS)
-OBJS-$(NOT_MSVS)           += $(if $(BUILD_OBJS),$(call objs,$(ALL_SRCS)))
+OBJS-$(NOT_MSVS)           += $(call objs,$(ALL_SRCS))
 BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX)))


 # Instantiate linker template for all examples.
 CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
-SHARED_LIB_SUF=$(if $(filter darwin%,$(TGT_OS)),.dylib,.so)
+ifneq ($(filter darwin%,$(TGT_OS)),)
+SHARED_LIB_SUF=.dylib
+else
+ifneq ($(filter os2%,$(TGT_OS)),)
+SHARED_LIB_SUF=_dll.a
+else
+SHARED_LIB_SUF=.so
+endif
+endif
 CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
 $(foreach bin,$(BINS-yes),\
-    $(if $(BUILD_OBJS),$(eval $(bin):\
-        $(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\
-    $(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\
+    $(eval $(bin):$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF))\
+    $(eval $(call linker_template,$(bin),\
        $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
        -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
-        )))\
-    $(if $(LIPO_OBJS),$(eval $(call lipo_bin_template,$(bin))))\
-    )
-
+        )))

 # The following pairs define a mapping of locations in the distribution
 # tree to locations in the source/build trees.
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -71,7 +71,7 @@ static void print_md5(FILE *stream, unsigned char digest[16]) {

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
  exit(EXIT_FAILURE);
 }
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -65,7 +65,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "Usage: %s <infile> <outfile> <N-M|N/M>\n", exec_name);
  exit(EXIT_FAILURE);
 }
--- a/examples/postproc.c
+++ b/examples/postproc.c
@@ -52,7 +52,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
  exit(EXIT_FAILURE);
 }
--- a/examples/resize_util.c
+++ b/examples/resize_util.c
@@ -15,6 +15,7 @@
 #include <stdlib.h>
 #include <string.h>

+#include "../tools_common.h"
 #include "../vp9/encoder/vp9_resize.h"

 static const char *exec_name = NULL;
@@ -26,7 +27,7 @@ static void usage() {
  printf("<output_yuv> [<frames>]\n");
 }

-void usage_exit() {
+void usage_exit(void) {
  usage();
  exit(EXIT_FAILURE);
 }
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -55,7 +55,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
          exec_name);
  exit(EXIT_FAILURE);
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c
@@ -88,7 +88,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
  exit(EXIT_FAILURE);
 }
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -106,7 +106,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr,
          "Usage: %s <codec> <width> <height> <infile> <outfile> "
              "<keyframe-interval> [<error-resilient>]\nSee comments in "
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -58,7 +58,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
          exec_name);
  exit(EXIT_FAILURE);
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -37,15 +37,14 @@
 #include <unistd.h>
 #endif
 #include "vpx_ports/vpx_timer.h"
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
 #include "vpx_ports/mem_ops.h"
-#include "./tools_common.h"
+#include "../tools_common.h"
 #define interface (vpx_codec_vp8_cx())
 #define fourcc    0x30385056

-void usage_exit() {
+void usage_exit(void) {
  exit(EXIT_FAILURE);
 }

--- a/examples/vp8cx_set_ref.c
+++ b/examples/vp8cx_set_ref.c
@@ -58,7 +58,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile> <frame>\n",
          exec_name);
  exit(EXIT_FAILURE);
--- a/examples/vp9_lossless_encoder.c
+++ b/examples/vp9_lossless_encoder.c
@@ -20,7 +20,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "vp9_lossless_encoder: Example demonstrating VP9 lossless "
                  "encoding feature. Supports raw input only.\n");
  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -14,11 +14,13 @@
 * that benefit from a scalable bitstream.
 */

+#include <math.h>
 #include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>

+
 #include "../args.h"
 #include "../tools_common.h"
 #include "../video_writer.h"
@@ -27,11 +29,18 @@
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 #include "../vpxstats.h"
+#define OUTPUT_RC_STATS 1

 static const arg_def_t skip_frames_arg =
    ARG_DEF("s", "skip-frames", 1, "input frames to skip");
 static const arg_def_t frames_arg =
    ARG_DEF("f", "frames", 1, "number of frames to encode");
+static const arg_def_t threads_arg =
+    ARG_DEF("th", "threads", 1, "number of threads to use");
+#if OUTPUT_RC_STATS
+static const arg_def_t output_rc_stats_arg =
+    ARG_DEF("rcstat", "output_rc_stats", 1, "output rc stats");
+#endif
 static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "source width");
 static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "source height");
 static const arg_def_t timebase_arg =
@@ -42,6 +51,9 @@ static const arg_def_t spatial_layers_arg =
    ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers");
 static const arg_def_t temporal_layers_arg =
    ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers");
+static const arg_def_t temporal_layering_mode_arg =
+    ARG_DEF("tlm", "temporal-layering-mode", 1, "temporal layering scheme."
+        "VP9E_TEMPORAL_LAYERING_MODE");
 static const arg_def_t kf_dist_arg =
    ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
 static const arg_def_t scale_factors_arg =
@@ -65,6 +77,8 @@ static const arg_def_t lag_in_frame_arg =
        "generating any outputs");
 static const arg_def_t rc_end_usage_arg =
    ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q");
+static const arg_def_t speed_arg =
+    ARG_DEF("sp", "speed", 1, "speed configuration");

 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -85,10 +99,16 @@ static const arg_def_t *svc_args[] = {
  &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &spatial_layers_arg,
  &kf_dist_arg,       &scale_factors_arg, &passes_arg,      &pass_arg,
  &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
-  &max_bitrate_arg,   &temporal_layers_arg,                 &lag_in_frame_arg,
+  &max_bitrate_arg,   &temporal_layers_arg, &temporal_layering_mode_arg,
+  &lag_in_frame_arg,  &threads_arg,
+#if OUTPUT_RC_STATS
+  &output_rc_stats_arg,
+#endif
+
 #if CONFIG_VP9_HIGHBITDEPTH
  &bitdepth_arg,
 #endif
+  &speed_arg,
  &rc_end_usage_arg,  NULL
 };

@@ -102,6 +122,10 @@ static const uint32_t default_bitrate = 1000;
 static const uint32_t default_spatial_layers = 5;
 static const uint32_t default_temporal_layers = 1;
 static const uint32_t default_kf_dist = 100;
+static const uint32_t default_temporal_layering_mode = 0;
+static const uint32_t default_output_rc_stats = 0;
+static const int32_t default_speed = -1;  // -1 means use library default.
+static const uint32_t default_threads = 0;  // zero means use library default.

 typedef struct {
  const char *input_filename;
@@ -116,7 +140,7 @@ typedef struct {

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  fprintf(stderr, "Usage: %s <options> input_filename output_filename\n",
          exec_name);
  fprintf(stderr, "Options:\n");
@@ -143,6 +167,12 @@ static void parse_command_line(int argc, const char **argv_,
  svc_ctx->log_level = SVC_LOG_DEBUG;
  svc_ctx->spatial_layers = default_spatial_layers;
  svc_ctx->temporal_layers = default_temporal_layers;
+  svc_ctx->temporal_layering_mode = default_temporal_layering_mode;
+#if OUTPUT_RC_STATS
+  svc_ctx->output_rc_stat = default_output_rc_stats;
+#endif
+  svc_ctx->speed = default_speed;
+  svc_ctx->threads = default_threads;

  // start with default encoder configuration
  res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -184,6 +214,20 @@ static void parse_command_line(int argc, const char **argv_,
      svc_ctx->spatial_layers = arg_parse_uint(&arg);
    } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
      svc_ctx->temporal_layers = arg_parse_uint(&arg);
+#if OUTPUT_RC_STATS
+    } else if (arg_match(&arg, &output_rc_stats_arg, argi)) {
+      svc_ctx->output_rc_stat = arg_parse_uint(&arg);
+#endif
+    } else if (arg_match(&arg, &speed_arg, argi)) {
+      svc_ctx->speed = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &threads_arg, argi)) {
+      svc_ctx->threads = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layering_mode_arg, argi)) {
+      svc_ctx->temporal_layering_mode =
+          enc_cfg->temporal_layering_mode = arg_parse_int(&arg);
+      if (svc_ctx->temporal_layering_mode) {
+        enc_cfg->g_error_resilient = 1;
+      }
    } else if (arg_match(&arg, &kf_dist_arg, argi)) {
      enc_cfg->kf_min_dist = arg_parse_uint(&arg);
      enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
@@ -316,6 +360,185 @@ static void parse_command_line(int argc, const char **argv_,
      enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
 }

+#if OUTPUT_RC_STATS
+// For rate control encoding stats.
+struct RateControlStats {
+  // Number of input frames per layer.
+  int layer_input_frames[VPX_MAX_LAYERS];
+  // Total (cumulative) number of encoded frames per layer.
+  int layer_tot_enc_frames[VPX_MAX_LAYERS];
+  // Number of encoded non-key frames per layer.
+  int layer_enc_frames[VPX_MAX_LAYERS];
+  // Framerate per layer (cumulative).
+  double layer_framerate[VPX_MAX_LAYERS];
+  // Target average frame size per layer (per-frame-bandwidth per layer).
+  double layer_pfb[VPX_MAX_LAYERS];
+  // Actual average frame size per layer.
+  double layer_avg_frame_size[VPX_MAX_LAYERS];
+  // Average rate mismatch per layer (|target - actual| / target).
+  double layer_avg_rate_mismatch[VPX_MAX_LAYERS];
+  // Actual encoding bitrate per layer (cumulative).
+  double layer_encoding_bitrate[VPX_MAX_LAYERS];
+  // Average of the short-time encoder actual bitrate.
+  // TODO(marpan): Should we add these short-time stats for each layer?
+  double avg_st_encoding_bitrate;
+  // Variance of the short-time encoder actual bitrate.
+  double variance_st_encoding_bitrate;
+  // Window (number of frames) for computing short-time encoding bitrate.
+  int window_size;
+  // Number of window measurements.
+  int window_count;
+};
+
+// Note: these rate control stats assume only 1 key frame in the
+// sequence (i.e., first frame only).
+static void set_rate_control_stats(struct RateControlStats *rc,
+                                     vpx_codec_enc_cfg_t *cfg) {
+  unsigned int sl, tl;
+  // Set the layer (cumulative) framerate and the target layer (non-cumulative)
+  // per-frame-bandwidth, for the rate control encoding stats below.
+  const double framerate = cfg->g_timebase.den / cfg->g_timebase.num;
+
+  for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
+      const int layer = sl * cfg->ts_number_layers + tl;
+      const int tlayer0 = sl * cfg->ts_number_layers;
+      rc->layer_framerate[layer] =
+          framerate / cfg->ts_rate_decimator[tl];
+      if (tl > 0) {
+        rc->layer_pfb[layer] = 1000.0 *
+            (cfg->layer_target_bitrate[layer] -
+                cfg->layer_target_bitrate[layer - 1]) /
+            (rc->layer_framerate[layer] -
+                rc->layer_framerate[layer - 1]);
+      } else {
+        rc->layer_pfb[tlayer0] = 1000.0 *
+            cfg->layer_target_bitrate[tlayer0] /
+            rc->layer_framerate[tlayer0];
+      }
+      rc->layer_input_frames[layer] = 0;
+      rc->layer_enc_frames[layer] = 0;
+      rc->layer_tot_enc_frames[layer] = 0;
+      rc->layer_encoding_bitrate[layer] = 0.0;
+      rc->layer_avg_frame_size[layer] = 0.0;
+      rc->layer_avg_rate_mismatch[layer] = 0.0;
+    }
+  }
+  rc->window_count = 0;
+  rc->window_size = 15;
+  rc->avg_st_encoding_bitrate = 0.0;
+  rc->variance_st_encoding_bitrate = 0.0;
+}
+
+static void printout_rate_control_summary(struct RateControlStats *rc,
+                                          vpx_codec_enc_cfg_t *cfg,
+                                          int frame_cnt) {
+  unsigned int sl, tl;
+  int tot_num_frames = 0;
+  double perc_fluctuation = 0.0;
+  printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
+  printf("Rate control layer stats for sl%d tl%d layer(s):\n\n",
+      cfg->ss_number_layers, cfg->ts_number_layers);
+  for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
+      const int layer = sl * cfg->ts_number_layers + tl;
+      const int num_dropped = (tl > 0) ?
+          (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer]) :
+          (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] - 1);
+      if (!sl)
+        tot_num_frames += rc->layer_input_frames[layer];
+      rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
+          rc->layer_encoding_bitrate[layer] / tot_num_frames;
+      rc->layer_avg_frame_size[layer] = rc->layer_avg_frame_size[layer] /
+          rc->layer_enc_frames[layer];
+      rc->layer_avg_rate_mismatch[layer] =
+          100.0 * rc->layer_avg_rate_mismatch[layer] /
+          rc->layer_enc_frames[layer];
+      printf("For layer#: sl%d tl%d \n", sl, tl);
+      printf("Bitrate (target vs actual): %d %f.0 kbps\n",
+             cfg->layer_target_bitrate[layer],
+             rc->layer_encoding_bitrate[layer]);
+      printf("Average frame size (target vs actual): %f %f bits\n",
+             rc->layer_pfb[layer], rc->layer_avg_frame_size[layer]);
+      printf("Average rate_mismatch: %f\n",
+             rc->layer_avg_rate_mismatch[layer]);
+      printf("Number of input frames, encoded (non-key) frames, "
+          "and percent dropped frames: %d %d %f.0 \n",
+          rc->layer_input_frames[layer], rc->layer_enc_frames[layer],
+          100.0 * num_dropped / rc->layer_input_frames[layer]);
+      printf("\n");
+    }
+  }
+  rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count;
+  rc->variance_st_encoding_bitrate =
+      rc->variance_st_encoding_bitrate / rc->window_count -
+      (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+  perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) /
+      rc->avg_st_encoding_bitrate;
+  printf("Short-time stats, for window of %d frames: \n", rc->window_size);
+  printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
+         rc->avg_st_encoding_bitrate,
+         sqrt(rc->variance_st_encoding_bitrate),
+         perc_fluctuation);
+  if (frame_cnt != tot_num_frames)
+    die("Error: Number of input frames not equal to output encoded frames != "
+        "%d tot_num_frames = %d\n", frame_cnt, tot_num_frames);
+}
+
+vpx_codec_err_t parse_superframe_index(const uint8_t *data,
+                                       size_t data_sz,
+                                       uint32_t sizes[8], int *count) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a super frame index. If the last byte of real video compression
+  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index we have
+  // an invalid bitstream and need to return an error.
+
+  uint8_t marker;
+
+  marker = *(data + data_sz - 1);
+  *count = 0;
+
+
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
+
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
+
+    {
+      const uint8_t marker2 = *(data + data_sz - index_sz);
+
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; ++j)
+          this_sz |= (*x++) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+#endif
+
 int main(int argc, const char **argv) {
  AppInput app_input = {0};
  VpxVideoWriter *writer = NULL;
@@ -332,7 +555,15 @@ int main(int argc, const char **argv) {
  FILE *infile = NULL;
  int end_of_stream = 0;
  int frames_received = 0;
-
+#if OUTPUT_RC_STATS
+  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
+  struct RateControlStats rc;
+  vpx_svc_layer_id_t layer_id;
+  int sl, tl;
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate  = 30.0;
+#endif
  memset(&svc_ctx, 0, sizeof(svc_ctx));
  svc_ctx.log_print = 1;
  exec_name = argv[0];
@@ -359,6 +590,13 @@ int main(int argc, const char **argv) {
      VPX_CODEC_OK)
    die("Failed to initialize encoder\n");

+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    set_rate_control_stats(&rc, &enc_cfg);
+    framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num;
+  }
+#endif
+
  info.codec_fourcc = VP9_FOURCC;
  info.time_base.numerator = enc_cfg.g_timebase.num;
  info.time_base.denominator = enc_cfg.g_timebase.den;
@@ -370,11 +608,31 @@ int main(int argc, const char **argv) {
    if (!writer)
      die("Failed to open %s for writing\n", app_input.output_filename);
  }
+#if OUTPUT_RC_STATS
+  // For now, just write temporal layer streams.
+  // TODO(wonkap): do spatial by re-writing superframe.
+  if (svc_ctx.output_rc_stat) {
+    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
+      char file_name[PATH_MAX];
+
+      snprintf(file_name, sizeof(file_name), "%s_t%d.ivf",
+               app_input.output_filename, tl);
+      outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info);
+      if (!outfile[tl])
+        die("Failed to open %s for writing", file_name);
+    }
+  }
+#endif

  // skip initial frames
  for (i = 0; i < app_input.frames_to_skip; ++i)
    vpx_img_read(&raw, infile);

+  if (svc_ctx.speed != -1)
+    vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
+  if (svc_ctx.threads)
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
+
  // Encode frames
  while (!end_of_stream) {
    vpx_codec_iter_t iter = NULL;
@@ -386,7 +644,9 @@ int main(int argc, const char **argv) {
    }

    res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw),
-                         pts, frame_duration, VPX_DL_GOOD_QUALITY);
+                         pts, frame_duration, svc_ctx.speed >= 5 ?
+                         VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY);
+
    printf("%s", vpx_svc_get_message(&svc_ctx));
    if (res != VPX_CODEC_OK) {
      die_codec(&codec, "Failed to encode frame");
@@ -395,11 +655,90 @@ int main(int argc, const char **argv) {
    while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
      switch (cx_pkt->kind) {
        case VPX_CODEC_CX_FRAME_PKT: {
-          if (cx_pkt->data.frame.sz > 0)
+          if (cx_pkt->data.frame.sz > 0) {
+#if OUTPUT_RC_STATS
+            uint32_t sizes[8];
+            int count = 0;
+#endif
            vpx_video_writer_write_frame(writer,
                                         cx_pkt->data.frame.buf,
                                         cx_pkt->data.frame.sz,
                                         cx_pkt->data.frame.pts);
+#if OUTPUT_RC_STATS
+            // TODO(marpan/wonkap): Put this (to line728) in separate function.
+            if (svc_ctx.output_rc_stat) {
+              vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
+              parse_superframe_index(cx_pkt->data.frame.buf,
+                                     cx_pkt->data.frame.sz, sizes, &count);
+              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
+                                        layer_id.temporal_layer_id];
+              }
+              for (tl = layer_id.temporal_layer_id;
+                  tl < enc_cfg.ts_number_layers; ++tl) {
+                vpx_video_writer_write_frame(outfile[tl],
+                                             cx_pkt->data.frame.buf,
+                                             cx_pkt->data.frame.sz,
+                                             cx_pkt->data.frame.pts);
+              }
+
+              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                for (tl = layer_id.temporal_layer_id;
+                    tl < enc_cfg.ts_number_layers; ++tl) {
+                  const int layer = sl * enc_cfg.ts_number_layers + tl;
+                  ++rc.layer_tot_enc_frames[layer];
+                  rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
+                  // Keep count of rate control stats per layer, for non-key
+                  // frames.
+                  if (tl == layer_id.temporal_layer_id &&
+                      !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
+                    rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
+                    rc.layer_avg_rate_mismatch[layer] +=
+                        fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) /
+                        rc.layer_pfb[layer];
+                    ++rc.layer_enc_frames[layer];
+                  }
+                }
+              }
+
+              // Update for short-time encoding bitrate states, for moving
+              // window of size rc->window, shifted by rc->window / 2.
+              // Ignore first window segment, due to key frame.
+              if (frame_cnt > rc.window_size) {
+                tl = layer_id.temporal_layer_id;
+                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                  sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
+                }
+                if (frame_cnt % rc.window_size == 0) {
+                  rc.window_count += 1;
+                  rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+                  rc.variance_st_encoding_bitrate +=
+                      (sum_bitrate / rc.window_size) *
+                      (sum_bitrate / rc.window_size);
+                  sum_bitrate = 0.0;
+                }
+              }
+
+              // Second shifted window.
+              if (frame_cnt > rc.window_size + rc.window_size / 2) {
+               tl = layer_id.temporal_layer_id;
+               for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                 sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
+               }
+
+               if (frame_cnt > 2 * rc.window_size &&
+                  frame_cnt % rc.window_size == 0) {
+                 rc.window_count += 1;
+                 rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+                 rc.variance_st_encoding_bitrate +=
+                    (sum_bitrate2 / rc.window_size) *
+                    (sum_bitrate2 / rc.window_size);
+                 sum_bitrate2 = 0.0;
+               }
+              }
+            }
+#endif
+          }

          printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
                 !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
@@ -424,25 +763,30 @@ int main(int argc, const char **argv) {
      pts += frame_duration;
    }
  }
-
  printf("Processed %d frames\n", frame_cnt);
-
  fclose(infile);
+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    printout_rate_control_summary(&rc, &enc_cfg, frame_cnt);
+    printf("\n");
+  }
+#endif
  if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
-
  if (app_input.passes == 2)
    stats_close(&app_input.rc_stats, 1);
-
  if (writer) {
    vpx_video_writer_close(writer);
  }
-
+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
+      vpx_video_writer_close(outfile[tl]);
+    }
+  }
+#endif
  vpx_img_free(&raw);
-
  // display average size, psnr
  printf("%s", vpx_svc_dump_statistics(&svc_ctx));
-
  vpx_svc_release(&svc_ctx);
-
  return EXIT_SUCCESS;
 }
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -28,7 +28,7 @@

 static const char *exec_name;

-void usage_exit() {
+void usage_exit(void) {
  exit(EXIT_FAILURE);
 }

@@ -70,6 +70,7 @@ struct RateControlMetrics {
  int window_size;
  // Number of window measurements.
  int window_count;
+  int layer_target_bitrate[VPX_MAX_LAYERS];
 };

 // Note: these rate control metrics assume only 1 key frame in the
@@ -85,13 +86,13 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc,
  // per-frame-bandwidth, for the rate control encoding stats below.
  const double framerate = cfg->g_timebase.den / cfg->g_timebase.num;
  rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0];
-  rc->layer_pfb[0] = 1000.0 * cfg->ts_target_bitrate[0] /
+  rc->layer_pfb[0] = 1000.0 * rc->layer_target_bitrate[0] /
      rc->layer_framerate[0];
  for (i = 0; i < cfg->ts_number_layers; ++i) {
    if (i > 0) {
      rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
      rc->layer_pfb[i] = 1000.0 *
-          (cfg->ts_target_bitrate[i] - cfg->ts_target_bitrate[i - 1]) /
+          (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
          (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
    }
    rc->layer_input_frames[i] = 0;
@@ -128,7 +129,7 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
    rc->layer_avg_rate_mismatch[i] = 100.0 * rc->layer_avg_rate_mismatch[i] /
        rc->layer_enc_frames[i];
    printf("For layer#: %d \n", i);
-    printf("Bitrate (target vs actual): %d %f \n", cfg->ts_target_bitrate[i],
+    printf("Bitrate (target vs actual): %d %f \n", rc->layer_target_bitrate[i],
           rc->layer_encoding_bitrate[i]);
    printf("Average frame size (target vs actual): %f %f \n", rc->layer_pfb[i],
           rc->layer_avg_frame_size[i]);
@@ -597,13 +598,16 @@ int main(int argc, char **argv) {
  for (i = min_args_base;
       (int)i < min_args_base + mode_to_num_layers[layering_mode];
       ++i) {
-    cfg.ts_target_bitrate[i - 11] = strtol(argv[i], NULL, 0);
+    rc.layer_target_bitrate[i - 11] = strtol(argv[i], NULL, 0);
+    if (strncmp(encoder->name, "vp8", 3) == 0)
+      cfg.ts_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11];
+    else if (strncmp(encoder->name, "vp9", 3) == 0)
+      cfg.layer_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11];
  }

  // Real time parameters.
  cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0);
  cfg.rc_end_usage = VPX_CBR;
-  cfg.rc_resize_allowed = 0;
  cfg.rc_min_quantizer = 2;
  cfg.rc_max_quantizer = 56;
  if (strncmp(encoder->name, "vp9", 3) == 0)
@@ -614,6 +618,9 @@ int main(int argc, char **argv) {
  cfg.rc_buf_optimal_sz = 600;
  cfg.rc_buf_sz = 1000;

+  // Disable dynamic resizing by default.
+  cfg.rc_resize_allowed = 0;
+
  // Use 1 thread as default.
  cfg.g_threads = 1;

@@ -625,6 +632,8 @@ int main(int argc, char **argv) {
  // Disable automatic keyframe placement.
  cfg.kf_min_dist = cfg.kf_max_dist = 3000;

+  cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
  set_temporal_layer_pattern(layering_mode,
                             &cfg,
                             layer_flags,
@@ -633,8 +642,8 @@ int main(int argc, char **argv) {
  set_rate_control_metrics(&rc, &cfg);

  // Target bandwidth for the whole stream.
-  // Set to ts_target_bitrate for highest layer (total bitrate).
-  cfg.rc_target_bitrate = cfg.ts_target_bitrate[cfg.ts_number_layers - 1];
+  // Set to layer_target_bitrate for highest layer (total bitrate).
+  cfg.rc_target_bitrate = rc.layer_target_bitrate[cfg.ts_number_layers - 1];

  // Open input file.
  if (!(infile = fopen(argv[1], "rb"))) {
@@ -674,18 +683,25 @@ int main(int argc, char **argv) {

  if (strncmp(encoder->name, "vp8", 3) == 0) {
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
-    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
-    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
  } else if (strncmp(encoder->name, "vp9", 3) == 0) {
+    vpx_svc_extra_cfg_t svc_params;
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
    vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
-      vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
-      if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0)) {
+    if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0))
      die_codec(&codec, "Failed to set SVC");
+    for (i = 0; i < cfg.ts_number_layers; ++i) {
+      svc_params.max_quantizers[i] = cfg.rc_max_quantizer;
+      svc_params.min_quantizers[i] = cfg.rc_min_quantizer;
    }
+    svc_params.scaling_factor_num[0] = cfg.g_h;
+    svc_params.scaling_factor_den[0] = cfg.g_h;
+    vpx_codec_control(&codec, VP9E_SET_SVC_PARAMETERS, &svc_params);
  }
  if (strncmp(encoder->name, "vp8", 3) == 0) {
    vpx_codec_control(&codec, VP8E_SET_SCREEN_CONTENT_MODE, 0);
--- a/libs.mk
+++ b/libs.mk
@@ -25,7 +25,7 @@ $$(BUILD_PFX)$(1).h: $$(SRC_PATH_BARE)/$(2)
 	@echo "    [CREATE] $$@"
 	$$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.pl --arch=$$(TGT_ISA) \
          --sym=$(1) \
-          --config=$$(CONFIG_DIR)$$(target)$$(if $$(FAT_ARCHS),,-$$(TOOLCHAIN)).mk \
+          --config=$$(CONFIG_DIR)$$(target)-$$(TOOLCHAIN).mk \
          $$(RTCD_OPTIONS) $$^ > $$@
 CLEAN-OBJS += $$(BUILD_PFX)$(1).h
 RTCD += $$(BUILD_PFX)$(1).h
@@ -34,13 +34,6 @@ endef
 CODEC_SRCS-yes += CHANGELOG
 CODEC_SRCS-yes += libs.mk

-# If this is a universal (fat) binary, then all the subarchitectures have
-# already been built and our job is to stitch them together. The
-# BUILD_LIBVPX variable indicates whether we should be building
-# (compiling, linking) the library. The LIPO_LIBVPX variable indicates
-# that we're stitching.
-$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
-
 include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
 CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS))
 CODEC_DOC_SRCS += $(addprefix vpx/,$(call enabled,API_DOC_SRCS))
@@ -54,6 +47,9 @@ CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
 include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
 CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))

+include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
+CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
+
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
  VP8_PREFIX=vp8/
  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
@@ -137,18 +133,18 @@ INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Release/%)
 INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Debug/%)
 endif

-CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
-CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.pl
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops_aligned.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
-CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
+CODEC_SRCS-yes += build/make/version.sh
+CODEC_SRCS-yes += build/make/rtcd.pl
+CODEC_SRCS-yes += vpx_ports/emmintrin_compat.h
+CODEC_SRCS-yes += vpx_ports/mem_ops.h
+CODEC_SRCS-yes += vpx_ports/mem_ops_aligned.h
+CODEC_SRCS-yes += vpx_ports/vpx_once.h
+CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
 endif
-CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
+CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec

@@ -215,7 +211,7 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
            $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
            --src-path-bare="$(SRC_PATH_BARE)" \

-PROJECTS-$(BUILD_LIBVPX) += vpx.$(VCPROJ_SFX)
+PROJECTS-yes += vpx.$(VCPROJ_SFX)

 vpx.$(VCPROJ_SFX): vpx_config.asm
 vpx.$(VCPROJ_SFX): $(RTCD)
@@ -223,32 +219,42 @@ vpx.$(VCPROJ_SFX): $(RTCD)
 endif
 else
 LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
-OBJS-$(BUILD_LIBVPX) += $(LIBVPX_OBJS)
-LIBS-$(if $(BUILD_LIBVPX),$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
+OBJS-yes += $(LIBVPX_OBJS)
+LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)

-
-BUILD_LIBVPX_SO         := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
-
+SO_VERSION_MAJOR := 2
+SO_VERSION_MINOR := 0
+SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
-LIBVPX_SO               := libvpx.$(VERSION_MAJOR).dylib
+LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
+SHARED_LIB_SUF          := .dylib
 EXPORT_FILE             := libvpx.syms
 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
                             libvpx.dylib  )
 else
-LIBVPX_SO               := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
+ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS))
+LIBVPX_SO               := libvpx$(SO_VERSION_MAJOR).dll
+SHARED_LIB_SUF          := _dll.a
+EXPORT_FILE             := libvpx.def
+LIBVPX_SO_SYMLINKS      :=
+LIBVPX_SO_IMPLIB        := libvpx_dll.a
+else
+LIBVPX_SO               := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH)
+SHARED_LIB_SUF          := .so
 EXPORT_FILE             := libvpx.ver
-SYM_LINK                := libvpx.so
 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
-                             libvpx.so libvpx.so.$(VERSION_MAJOR) \
-                             libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR))
+                             libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \
+                             libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
+endif
 endif

-LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\
-                           $(notdir $(LIBVPX_SO_SYMLINKS))
+LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBVPX_SO)\
+                           $(notdir $(LIBVPX_SO_SYMLINKS)) \
+                           $(if $(LIBVPX_SO_IMPLIB), $(BUILD_PFX)$(LIBVPX_SO_IMPLIB))
 $(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE)
 $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
-$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)
+$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR)
 $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE)

 libvpx.ver: $(call enabled,CODEC_EXPORTS)
@@ -263,6 +269,19 @@ libvpx.syms: $(call enabled,CODEC_EXPORTS)
 	$(qexec)awk '{print "_"$$2}' $^ >$@
 CLEAN-OBJS += libvpx.syms

+libvpx.def: $(call enabled,CODEC_EXPORTS)
+	@echo "    [CREATE] $@"
+	$(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@
+	$(qexec)echo "DATA MULTIPLE NONSHARED" >> $@
+	$(qexec)echo "EXPORTS" >> $@
+	$(qexec)awk '!/vpx_svc_*/ {print "_"$$2}' $^ >>$@
+CLEAN-OBJS += libvpx.def
+
+libvpx_dll.a: $(LIBVPX_SO)
+	@echo "    [IMPLIB] $@"
+	$(qexec)emximp -o $@ $<
+CLEAN-OBJS += libvpx_dll.a
+
 define libvpx_symlink_template
 $(1): $(2)
 	@echo "    [LN]     $(2) $$@"
@@ -278,11 +297,12 @@ $(eval $(call libvpx_symlink_template,\
    $(LIBVPX_SO)))


-INSTALL-LIBS-$(BUILD_LIBVPX_SO) += $(LIBVPX_SO_SYMLINKS)
-INSTALL-LIBS-$(BUILD_LIBVPX_SO) += $(LIBSUBDIR)/$(LIBVPX_SO)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(if $(LIBVPX_SO_IMPLIB),$(LIBSUBDIR)/$(LIBVPX_SO_IMPLIB))


-LIBS-$(BUILD_LIBVPX) += vpx.pc
+LIBS-yes += vpx.pc
 vpx.pc: config.mk libs.mk
 	@echo "    [CREATE] $@"
 	$(qexec)echo '# pkg-config file from libvpx $(VERSION_STRING)' > $@
@@ -308,9 +328,6 @@ INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
 CLEAN-OBJS += vpx.pc
 endif

-LIBS-$(LIPO_LIBVPX) += libvpx.a
-$(eval $(if $(LIPO_LIBVPX),$(call lipo_lib_template,libvpx.a)))
-
 #
 # Rule to make assembler configuration file from C configuration file
 #
@@ -349,11 +366,15 @@ LIBVPX_TEST_DATA_PATH ?= .

 include $(SRC_PATH_BARE)/test/test.mk
 LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS))
-LIBVPX_TEST_BINS=./test_libvpx$(EXE_SFX)
+LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX)
 LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
                     $(call enabled,LIBVPX_TEST_DATA))
 libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)

+TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX)
+TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
+TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
+
 libvpx_test_srcs.txt:
 	@echo "    [CREATE] $@"
 	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
@@ -417,7 +438,25 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_

 PROJECTS-$(CONFIG_MSVS) += test_libvpx.$(VCPROJ_SFX)

-LIBVPX_TEST_BINS := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BINS)))
+LIBVPX_TEST_BIN := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BIN)))
+
+ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
+PROJECTS-$(CONFIG_MSVS) += test_intra_pred_speed.$(VCPROJ_SFX)
+test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
+	@echo "    [CREATE] $@"
+	$(qexec)$(GEN_VCPROJ) \
+            --exe \
+            --target=$(TOOLCHAIN) \
+            --name=test_intra_pred_speed \
+            -D_VARIADIC_MAX=10 \
+            --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
+            --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
+            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
+            -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
+endif  # TEST_INTRA_PRED_SPEED
 endif
 else

@@ -428,45 +467,54 @@ ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS))
 # Disabling pthreads globally will cause issues on darwin and possibly elsewhere
 $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
 endif
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
-OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS)
-LIBS-$(BUILD_LIBVPX) += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a
+GTEST_INCLUDES := -I$(SRC_PATH_BARE)/third_party/googletest/src
+GTEST_INCLUDES += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(GTEST_OBJS)
+LIBS-yes += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a
 $(BUILD_PFX)libgtest_g.a: $(GTEST_OBJS)

 LIBVPX_TEST_OBJS=$(sort $(call objs,$(LIBVPX_TEST_SRCS)))
-$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
-$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
-OBJS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_OBJS)
-BINS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_BINS)
+$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(LIBVPX_TEST_OBJS)
+BINS-yes += $(LIBVPX_TEST_BIN)

 CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
-CODEC_LIB_SUF=$(if $(CONFIG_SHARED),.so,.a)
-$(foreach bin,$(LIBVPX_TEST_BINS),\
-    $(if $(BUILD_LIBVPX),$(eval $(bin): \
-        lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
-    $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
+CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
+TEST_LIBS := lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a
+$(LIBVPX_TEST_BIN): $(TEST_LIBS)
+$(eval $(call linkerxx_template,$(LIBVPX_TEST_BIN), \
              $(LIBVPX_TEST_OBJS) \
-        -L. -lvpx -lgtest $(extralibs) -lm)\
-        )))\
-    $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\
+              -L. -lvpx -lgtest $(extralibs) -lm))

-endif
+ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
+$(TEST_INTRA_PRED_SPEED_OBJS) $(TEST_INTRA_PRED_SPEED_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(TEST_INTRA_PRED_SPEED_OBJS)
+BINS-yes += $(TEST_INTRA_PRED_SPEED_BIN)
+
+$(TEST_INTRA_PRED_SPEED_BIN): $(TEST_LIBS)
+$(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \
+              $(TEST_INTRA_PRED_SPEED_OBJS) \
+              -L. -lvpx -lgtest $(extralibs) -lm))
+endif  # TEST_INTRA_PRED_SPEED
+
+endif  # CONFIG_UNIT_TESTS

 # Install test sources only if codec source is included
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
    $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f))
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS)
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS)

 define test_shard_template
 test:: test_shard.$(1)
-test_shard.$(1): $(LIBVPX_TEST_BINS) testdata
+test-no-data-check:: test_shard_ndc.$(1)
+test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
 	@set -e; \
-	 for t in $(LIBVPX_TEST_BINS); do \
 	 export GTEST_SHARD_INDEX=$(1); \
 	 export GTEST_TOTAL_SHARDS=$(2); \
-	   $$$$t; \
-	 done
+	 $(LIBVPX_TEST_BIN)
+test_shard.$(1): testdata
 .PHONY: test_shard.$(1)
 endef

@@ -511,15 +559,16 @@ ifeq ($(CONFIG_MSVS),yes)
 # TODO(tomfinegan): Support running the debug versions of tools?
 TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
 endif
-utiltest: testdata
+utiltest utiltest-no-data-check:
 	$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(TEST_BIN_PATH)
 	$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(TEST_BIN_PATH)
+utiltest: testdata
 else
-utiltest:
+utiltest utiltest-no-data-check:
 	@echo Unit tests must be enabled to make the utiltest target.
 endif

@@ -537,11 +586,12 @@ ifeq ($(CONFIG_MSVS),yes)
 # TODO(tomfinegan): Support running the debug versions of tools?
 EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
 endif
-exampletest: examples testdata
+exampletest exampletest-no-data-check: examples
 	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(EXAMPLES_BIN_PATH)
+exampletest: testdata
 else
-exampletest:
+exampletest exampletest-no-data-check:
 	@echo Unit tests must be enabled to make the exampletest target.
 endif
--- a/md5_utils.c
+++ b/md5_utils.c
@@ -24,7 +24,7 @@

 #include "md5_utils.h"

-void
+static void
 byteSwap(UWORD32 *buf, unsigned words) {
  md5byte *p;

--- a/rate_hist.c
+++ b/rate_hist.c
@@ -88,6 +88,9 @@ void update_rate_histogram(struct rate_hist *hist,
  if (now < cfg->rc_buf_initial_sz)
    return;

+  if (!cfg->rc_target_bitrate)
+    return;
+
  then = now;

  /* Sum the size over the past rc_buf_sz ms */
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@@ -40,9 +40,17 @@ include $(CLEAR_VARS)
 LOCAL_ARM_MODE := arm
 LOCAL_MODULE := libvpx_test
 LOCAL_STATIC_LIBRARIES := gtest libwebm
-LOCAL_SHARED_LIBRARIES := vpx
+
+ifeq ($(ENABLE_SHARED),1)
+  LOCAL_SHARED_LIBRARIES := vpx
+else
+  LOCAL_STATIC_LIBRARIES += vpx
+endif
+
 include $(LOCAL_PATH)/test/test.mk
 LOCAL_C_INCLUDES := $(BINDINGS_DIR)
 FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
 LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
+# some test files depend on *_rtcd.h, ensure they're generated first.
+$(eval $(call rtcd_dep_template))
 include $(BUILD_EXECUTABLE)
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+
+extern "C"
+double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
+                          const unsigned char *img2, int img2_pitch,
+                          int width, int height);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class BlockinessTestBase : public ::testing::Test {
+ public:
+  BlockinessTestBase(int width, int height) : width_(width), height_(height) {}
+
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+    vpx_free(reference_data_);
+    reference_data_ = NULL;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle frames up to 640x480
+  static const int kDataAlignment = 16;
+  static const int kDataBufferSize = 640*480;
+
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant,
+                    int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = fill_constant;
+      }
+    }
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+    FillConstant(data, stride, fill_constant, width_, height_);
+  }
+
+  void FillRandom(uint8_t *data, int stride, int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    FillRandom(data, stride, width_, height_);
+  }
+
+  void FillRandomBlocky(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; h += 4) {
+      for (int w = 0; w < width_; w += 4) {
+        FillRandom(data + h * stride + w, stride, 4, 4);
+      }
+    }
+  }
+
+  void FillCheckerboard(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; h += 4) {
+      for (int w = 0; w < width_; w += 4) {
+        if (((h/4) ^ (w/4)) & 1)
+          FillConstant(data + h * stride + w, stride, 255, 4, 4);
+        else
+          FillConstant(data + h * stride + w, stride, 0, 4, 4);
+      }
+    }
+  }
+
+  void Blur(uint8_t *data, int stride, int taps) {
+    int sum = 0;
+    int half_taps = taps / 2;
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < taps; ++w) {
+        sum += data[w + h * stride];
+      }
+      for (int w = taps; w < width_; ++w) {
+        sum += data[w + h * stride] - data[w - taps + h * stride];
+        data[w - half_taps + h * stride] = (sum + half_taps) / taps;
+      }
+    }
+    for (int w = 0; w < width_; ++w) {
+      for (int h = 0; h < taps; ++h) {
+        sum += data[h + w * stride];
+      }
+      for (int h = taps; h < height_; ++h) {
+        sum += data[w + h * stride] - data[(h - taps) * stride + w];
+        data[(h - half_taps) * stride + w] = (sum + half_taps) / taps;
+      }
+    }
+  }
+  int width_, height_;
+  static uint8_t* source_data_;
+  int source_stride_;
+  static uint8_t* reference_data_;
+  int reference_stride_;
+
+  ACMRandom rnd_;
+};
+
+#if CONFIG_VP9_ENCODER
+typedef std::tr1::tuple<int, int> BlockinessParam;
+class BlockinessVP9Test
+    : public BlockinessTestBase,
+      public ::testing::WithParamInterface<BlockinessParam> {
+ public:
+  BlockinessVP9Test() : BlockinessTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  int CheckBlockiness() {
+    return vp9_get_blockiness(source_data_, source_stride_,
+                              reference_data_, reference_stride_,
+                              width_, height_);
+  }
+};
+#endif  // CONFIG_VP9_ENCODER
+
+uint8_t* BlockinessTestBase::source_data_ = NULL;
+uint8_t* BlockinessTestBase::reference_data_ = NULL;
+
+#if CONFIG_VP9_ENCODER
+TEST_P(BlockinessVP9Test, SourceBlockierThanReference) {
+  // Source is blockier than reference.
+  FillRandomBlocky(source_data_, source_stride_);
+  FillConstant(reference_data_, reference_stride_, 128);
+  int super_blocky = CheckBlockiness();
+
+  EXPECT_EQ(0, super_blocky) << "Blocky source should produce 0 blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, ReferenceBlockierThanSource) {
+  // Source is blockier than reference.
+  FillConstant(source_data_, source_stride_, 128);
+  FillRandomBlocky(reference_data_, reference_stride_);
+  int super_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, 0.0)
+      << "Blocky reference should score high for blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, BlurringDecreasesBlockiness) {
+  // Source is blockier than reference.
+  FillConstant(source_data_, source_stride_, 128);
+  FillRandomBlocky(reference_data_, reference_stride_);
+  int super_blocky = CheckBlockiness();
+
+  Blur(reference_data_, reference_stride_, 4);
+  int less_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, less_blocky)
+      << "A straight blur should decrease blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, WorstCaseBlockiness) {
+  // Source is blockier than reference.
+  FillConstant(source_data_, source_stride_, 128);
+  FillCheckerboard(reference_data_, reference_stride_);
+
+  int super_blocky = CheckBlockiness();
+
+  Blur(reference_data_, reference_stride_, 4);
+  int less_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, less_blocky)
+      << "A straight blur should decrease blockiness.";
+}
+#endif  // CONFIG_VP9_ENCODER
+
+
+using std::tr1::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+
+#if CONFIG_VP9_ENCODER
+const BlockinessParam c_vp9_tests[] = {
+  make_tuple(320, 240),
+  make_tuple(318, 242),
+  make_tuple(318, 238),
+};
+INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests));
+#endif
+
+}  // namespace
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@@ -0,0 +1,224 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vp9/encoder/vp9_ssim.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern "C"
+double vp9_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+                            uint8_t *img2, int img2_pitch,
+                            int width, int height,
+                            Ssimv *sv2, Metrics *m,
+                            int do_inconsistency);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class ConsistencyTestBase : public ::testing::Test {
+ public:
+  ConsistencyTestBase(int width, int height) : width_(width), height_(height) {}
+
+  static void SetUpTestCase() {
+    source_data_[0] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_[0] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    source_data_[1] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_[1] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    ssim_array_ = new Ssimv[kDataBufferSize / 16];
+  }
+
+  static void ClearSsim() {
+    memset(ssim_array_, 0, kDataBufferSize / 16);
+  }
+  static void TearDownTestCase() {
+    vpx_free(source_data_[0]);
+    source_data_[0] = NULL;
+    vpx_free(reference_data_[0]);
+    reference_data_[0] = NULL;
+    vpx_free(source_data_[1]);
+    source_data_[1] = NULL;
+    vpx_free(reference_data_[1]);
+    reference_data_[1] = NULL;
+
+    delete ssim_array_;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle frames up to 640x480
+  static const int kDataAlignment = 16;
+  static const int kDataBufferSize = 640*480;
+
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  void FillRandom(uint8_t *data, int stride, int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    FillRandom(data, stride, width_, height_);
+  }
+
+  void Copy(uint8_t *reference, uint8_t *source) {
+    memcpy(reference, source, kDataBufferSize);
+  }
+
+  void Blur(uint8_t *data, int stride, int taps) {
+    int sum = 0;
+    int half_taps = taps / 2;
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < taps; ++w) {
+        sum += data[w + h * stride];
+      }
+      for (int w = taps; w < width_; ++w) {
+        sum += data[w + h * stride] - data[w - taps + h * stride];
+        data[w - half_taps + h * stride] = (sum + half_taps) / taps;
+      }
+    }
+    for (int w = 0; w < width_; ++w) {
+      for (int h = 0; h < taps; ++h) {
+        sum += data[h + w * stride];
+      }
+      for (int h = taps; h < height_; ++h) {
+        sum += data[w + h * stride] - data[(h - taps) * stride + w];
+        data[(h - half_taps) * stride + w] = (sum + half_taps) / taps;
+      }
+    }
+  }
+  int width_, height_;
+  static uint8_t* source_data_[2];
+  int source_stride_;
+  static uint8_t* reference_data_[2];
+  int reference_stride_;
+  static Ssimv *ssim_array_;
+  Metrics metrics_;
+
+  ACMRandom rnd_;
+};
+
+#if CONFIG_VP9_ENCODER
+typedef std::tr1::tuple<int, int> ConsistencyParam;
+class ConsistencyVP9Test
+    : public ConsistencyTestBase,
+      public ::testing::WithParamInterface<ConsistencyParam> {
+ public:
+  ConsistencyVP9Test() : ConsistencyTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  double CheckConsistency(int frame) {
+    EXPECT_LT(frame, 2)<< "Frame to check has to be less than 2.";
+    return
+        vp9_get_ssim_metrics(source_data_[frame], source_stride_,
+                             reference_data_[frame], reference_stride_,
+                             width_, height_, ssim_array_, &metrics_, 1);
+  }
+};
+#endif  // CONFIG_VP9_ENCODER
+
+uint8_t* ConsistencyTestBase::source_data_[2] = {NULL, NULL};
+uint8_t* ConsistencyTestBase::reference_data_[2] = {NULL, NULL};
+Ssimv* ConsistencyTestBase::ssim_array_ = NULL;
+
+#if CONFIG_VP9_ENCODER
+TEST_P(ConsistencyVP9Test, ConsistencyIsZero) {
+  FillRandom(source_data_[0], source_stride_);
+  Copy(source_data_[1], source_data_[0]);
+  Copy(reference_data_[0], source_data_[0]);
+  Blur(reference_data_[0], reference_stride_, 3);
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 3);
+
+  double inconsistency = CheckConsistency(1);
+  inconsistency = CheckConsistency(0);
+  EXPECT_EQ(inconsistency, 0.0)
+      << "Should have 0 inconsistency if they are exactly the same.";
+
+  // If sources are not consistent reference frames inconsistency should
+  // be less than if the source is consistent.
+  FillRandom(source_data_[0], source_stride_);
+  FillRandom(source_data_[1], source_stride_);
+  FillRandom(reference_data_[0], reference_stride_);
+  FillRandom(reference_data_[1], reference_stride_);
+  CheckConsistency(0);
+  inconsistency = CheckConsistency(1);
+
+  Copy(source_data_[1], source_data_[0]);
+  CheckConsistency(0);
+  double inconsistency2 = CheckConsistency(1);
+  EXPECT_LT(inconsistency, inconsistency2)
+      << "Should have less inconsistency if source itself is inconsistent.";
+
+  // Less of a blur should be less inconsistent than more blur coming off a
+  // a frame with no blur.
+  ClearSsim();
+  FillRandom(source_data_[0], source_stride_);
+  Copy(source_data_[1], source_data_[0]);
+  Copy(reference_data_[0], source_data_[0]);
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 4);
+  CheckConsistency(0);
+  inconsistency = CheckConsistency(1);
+  ClearSsim();
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 8);
+  CheckConsistency(0);
+  inconsistency2 = CheckConsistency(1);
+
+  EXPECT_LT(inconsistency, inconsistency2)
+      << "Stronger Blur should produce more inconsistency.";
+}
+#endif  // CONFIG_VP9_ENCODER
+
+
+using std::tr1::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+
+#if CONFIG_VP9_ENCODER
+const ConsistencyParam c_vp9_tests[] = {
+  make_tuple(320, 240),
+  make_tuple(318, 242),
+  make_tuple(318, 238),
+};
+INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
+                        ::testing::ValuesIn(c_vp9_tests));
+#endif
+
+}  // namespace
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -398,9 +398,9 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
  }

  void CopyOutputToRef() {
-    vpx_memcpy(output_ref_, output_, kOutputBufferSize);
+    memcpy(output_ref_, output_, kOutputBufferSize);
 #if CONFIG_VP9_HIGHBITDEPTH
-    vpx_memcpy(output16_ref_, output16_, kOutputBufferSize);
+    memcpy(output16_ref_, output16_, kOutputBufferSize);
 #endif
  }

@@ -1814,4 +1814,27 @@ INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
    make_tuple(32, 64, &convolve8_dspr2),
    make_tuple(64, 64, &convolve8_dspr2)));
 #endif
+
+#if HAVE_MSA
+const ConvolveFunctions convolve8_msa(
+    vp9_convolve_copy_msa, vp9_convolve_avg_msa,
+    vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_msa,
+    vp9_convolve8_vert_msa, vp9_convolve8_avg_vert_msa,
+    vp9_convolve8_msa, vp9_convolve8_avg_msa, 0);
+
+INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_msa),
+    make_tuple(8, 4, &convolve8_msa),
+    make_tuple(4, 8, &convolve8_msa),
+    make_tuple(8, 8, &convolve8_msa),
+    make_tuple(16, 8, &convolve8_msa),
+    make_tuple(8, 16, &convolve8_msa),
+    make_tuple(16, 16, &convolve8_msa),
+    make_tuple(32, 16, &convolve8_msa),
+    make_tuple(16, 32, &convolve8_msa),
+    make_tuple(32, 32, &convolve8_msa),
+    make_tuple(64, 32, &convolve8_msa),
+    make_tuple(32, 64, &convolve8_msa),
+    make_tuple(64, 64, &convolve8_msa)));
+#endif  // HAVE_MSA
 }  // namespace
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -14,6 +14,7 @@
 #include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"

 namespace {

@@ -371,9 +372,7 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
        encoder->Control(VP9E_SET_SVC, 1);
      }
      vpx_svc_layer_id_t layer_id;
-#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
      layer_id.spatial_layer_id = 0;
-#endif
      frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
      layer_id.temporal_layer_id = SetLayerId(video->frame(),
                                              cfg_.ts_number_layers);
@@ -565,6 +564,8 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) {
  cfg_.ts_rate_decimator[0] = 2;
  cfg_.ts_rate_decimator[1] = 1;

+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
  if (deadline_ == VPX_DL_REALTIME)
    cfg_.g_error_resilient = 1;

@@ -574,14 +575,14 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) {
    cfg_.rc_target_bitrate = i;
    ResetModel();
    // 60-40 bitrate allocation for 2 temporal layers.
-    cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
-    cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+    cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate;
    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
    for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
-      ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
+      ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
          << " The datarate for the file is lower than target by too much, "
              "for layer: " << j;
-      ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
+      ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
          << " The datarate for the file is greater than target by too much, "
              "for layer: " << j;
    }
@@ -606,25 +607,27 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayers) {
  cfg_.ts_rate_decimator[1] = 2;
  cfg_.ts_rate_decimator[2] = 1;

+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                       30, 1, 0, 200);
  for (int i = 200; i <= 800; i += 200) {
    cfg_.rc_target_bitrate = i;
    ResetModel();
    // 40-20-40 bitrate allocation for 3 temporal layers.
-    cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
-    cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
-    cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+    cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
    for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
      // TODO(yaowu): Work out more stable rc control strategy and
      //              Adjust the thresholds to be tighter than .75.
-      ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.75)
+      ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75)
          << " The datarate for the file is lower than target by too much, "
              "for layer: " << j;
      // TODO(yaowu): Work out more stable rc control strategy and
      //              Adjust the thresholds to be tighter than 1.25.
-      ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.25)
+      ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25)
          << " The datarate for the file is greater than target by too much, "
              "for layer: " << j;
    }
@@ -652,20 +655,22 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
  cfg_.ts_rate_decimator[1] = 2;
  cfg_.ts_rate_decimator[2] = 1;

+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                       30, 1, 0, 200);
  cfg_.rc_target_bitrate = 200;
  ResetModel();
  // 40-20-40 bitrate allocation for 3 temporal layers.
-  cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
-  cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
-  cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
-    ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
        << " The datarate for the file is lower than target by too much, "
            "for layer: " << j;
-    ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
        << " The datarate for the file is greater than target by too much, "
            "for layer: " << j;
    // Expect some frame drops in this test: for this 200 frames test,
@@ -737,9 +742,178 @@ TEST_P(DatarateTestVP9Large, DenoiserOffOn) {
 }
 #endif  // CONFIG_VP9_TEMPORAL_DENOISING

+class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateOnePassCbrSvc() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~DatarateOnePassCbrSvc() {}
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    speed_setting_ = GET_PARAM(2);
+    ResetModel();
+  }
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    first_drop_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+  }
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+  }
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      int i;
+      for (i = 0; i < 2; ++i) {
+        svc_params_.max_quantizers[i] = 63;
+        svc_params_.min_quantizers[i] = 0;
+      }
+      svc_params_.scaling_factor_num[0] = 144;
+      svc_params_.scaling_factor_den[0] = 288;
+      svc_params_.scaling_factor_num[1] = 288;
+      svc_params_.scaling_factor_den[1] = 288;
+      encoder->Control(VP9E_SET_SVC, 1);
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+      encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 0);
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
+    }
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+    if (last_pts_ == 0)
+      duration = 1;
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+    const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY)
+                         ? true: false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+          << pkt->data.frame.pts;
+    }
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    bits_in_buffer_model_ -= frame_size_in_bits;
+    bits_total_ += frame_size_in_bits;
+    if (!first_drop_ && duration > 1)
+      first_drop_ = last_pts_ + 1;
+    last_pts_ = pkt->data.frame.pts;
+    bits_in_last_frame_ = frame_size_in_bits;
+    ++frame_number_;
+  }
+  virtual void EndPassHook(void) {
+    if (bits_total_) {
+      const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
+      duration_ = (last_pts_ + 1) * timebase_;
+      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
+          / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
+      file_datarate_ = file_size_in_kb / duration_;
+    }
+  }
+  vpx_codec_pts_t last_pts_;
+  int64_t bits_in_buffer_model_;
+  double timebase_;
+  int frame_number_;
+  vpx_codec_pts_t first_drop_;
+  int64_t bits_total_;
+  double duration_;
+  double file_datarate_;
+  double effective_datarate_;
+  size_t bits_in_last_frame_;
+  vpx_svc_extra_cfg_t svc_params_;
+  int speed_setting_;
+};
+static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
+    const vpx_svc_extra_cfg_t *svc_params,
+    int spatial_layers,
+    int temporal_layers,
+    int temporal_layering_mode,
+    unsigned int total_rate) {
+  int sl, spatial_layer_target;
+  float total = 0;
+  float alloc_ratio[VPX_MAX_LAYERS] = {0};
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    if (svc_params->scaling_factor_den[sl] > 0) {
+      alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] *
+          1.0 / svc_params->scaling_factor_den[sl]);
+      total += alloc_ratio[sl];
+    }
+  }
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    enc_cfg->ss_target_bitrate[sl] = spatial_layer_target =
+        (unsigned int)(enc_cfg->rc_target_bitrate *
+            alloc_ratio[sl] / total);
+    const int index = sl * temporal_layers;
+    if (temporal_layering_mode == 3) {
+      enc_cfg->layer_target_bitrate[index] =
+          spatial_layer_target >> 1;
+      enc_cfg->layer_target_bitrate[index + 1] =
+          (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
+      enc_cfg->layer_target_bitrate[index + 2] =
+          spatial_layer_target;
+    } else if (temporal_layering_mode == 2) {
+      enc_cfg->layer_target_bitrate[index] =
+          spatial_layer_target * 2 / 3;
+      enc_cfg->layer_target_bitrate[index + 1] =
+          spatial_layer_target;
+    }
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  // TODO(wonkap/marpan): No frame drop for now, we need to implement correct
+  // frame dropping for SVC.
+  cfg_.rc_dropframe_thresh = 0;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate. Also check if test can pass at lower bitrate (~200k).
+  for (int i = 400; i <= 800; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+        cfg_.rc_target_bitrate);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+            << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
+        << " The datarate for the file is lower than the target by too much!";
+  }
+}
+
 VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
                          ::testing::Values(::libvpx_test::kOnePassGood,
                                            ::libvpx_test::kRealTime),
                          ::testing::Range(2, 7));
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(5, 8));
 }  // namespace
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -20,8 +20,10 @@

 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"

 using libvpx_test::ACMRandom;

@@ -356,13 +358,13 @@ class Trans16x16TestBase {
    int64_t total_error = 0;
    const int count_test_block = 10000;
    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-      DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
 #endif

      // Initialize a test block with input range [-mask_, mask_].
@@ -416,9 +418,9 @@ class Trans16x16TestBase {
  void RunCoeffCheck() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);

    for (int i = 0; i < count_test_block; ++i) {
      // Initialize a test block with input range [-mask_, mask_].
@@ -437,15 +439,13 @@ class Trans16x16TestBase {
  void RunMemCheck() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);

    for (int i = 0; i < count_test_block; ++i) {
      // Initialize a test block with input range [-mask_, mask_].
      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
      }
      if (i == 0) {
@@ -472,24 +472,19 @@ class Trans16x16TestBase {
  void RunQuantCheck(int dc_thred, int ac_thred) {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 100000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);

-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
 #endif

    for (int i = 0; i < count_test_block; ++i) {
      // Initialize a test block with input range [-mask_, mask_].
      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8)
-          input_block[j] = rnd.Rand8() - rnd.Rand8();
-        else
-          input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
      }
      if (i == 0)
@@ -502,11 +497,11 @@ class Trans16x16TestBase {
      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);

      // clear reconstructed pixel buffers
-      vpx_memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
-      vpx_memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+      memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
+      memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
 #if CONFIG_VP9_HIGHBITDEPTH
-      vpx_memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
-      vpx_memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
+      memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
+      memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
 #endif

      // quantization with maximum allowed step sizes
@@ -539,13 +534,13 @@ class Trans16x16TestBase {
  void RunInvAccuracyCheck() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH

    for (int i = 0; i < count_test_block; ++i) {
@@ -599,12 +594,12 @@ class Trans16x16TestBase {
    const int count_test_block = 10000;
    const int eob = 10;
    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH

    for (int i = 0; i < count_test_block; ++i) {
@@ -934,11 +929,19 @@ INSTANTIATE_TEST_CASE_P(
                   &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

-#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, Trans16x16DCT,
+    MSA, Trans16x16DCT,
    ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0,
+        make_tuple(&vp9_fdct16x16_msa,
+                   &vp9_idct16x16_256_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
                   VPX_BITS_8)));
-#endif  // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -23,6 +23,7 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"

 using libvpx_test::ACMRandom;

@@ -119,13 +120,13 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
  uint32_t max_error = 0;
  int64_t total_error = 0;
  const int count_test_block = 10000;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+  DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
 #endif

  for (int i = 0; i < count_test_block; ++i) {
@@ -184,9 +185,9 @@ TEST_P(Trans32x32Test, CoeffCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 1000;

-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);

  for (int i = 0; i < count_test_block; ++i) {
    for (int j = 0; j < kNumCoeffs; ++j)
@@ -212,15 +213,13 @@ TEST_P(Trans32x32Test, MemCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 2000;

-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);

  for (int i = 0; i < count_test_block; ++i) {
    // Initialize a test block with input range [-mask_, mask_].
    for (int j = 0; j < kNumCoeffs; ++j) {
-      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
      input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
    }
    if (i == 0) {
@@ -257,13 +256,13 @@ TEST_P(Trans32x32Test, MemCheck) {
 TEST_P(Trans32x32Test, InverseAccuracy) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 1000;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+  DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
 #endif

  for (int i = 0; i < count_test_block; ++i) {
@@ -382,4 +381,14 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fdct32x32_rd_avx2,
                   &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
 #endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_fdct32x32_msa,
+                   &vp9_idct32x32_1024_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fdct32x32_rd_msa,
+                   &vp9_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -29,8 +29,6 @@ void Encoder::InitEncoder(VideoSource *video) {
    cfg_.g_timebase = video->timebase();
    cfg_.rc_twopass_stats_in = stats_->buf();

-    // Default to 1 thread.
-    cfg_.g_threads = 1;
    res = vpx_codec_enc_init(&encoder_, CodecInterface(), &cfg_,
                             init_flags_);
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -133,6 +133,10 @@ class Encoder {
    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
  }

+  void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
  void Control(int ctrl_id, vpx_active_map_t *arg) {
    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
@@ -183,7 +187,10 @@ class EncoderTest {
 protected:
  explicit EncoderTest(const CodecFactory *codec)
      : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
-        last_pts_(0) {}
+        last_pts_(0) {
+    // Default to 1 thread.
+    cfg_.g_threads = 1;
+  }

  virtual ~EncoderTest() {}

--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -22,6 +22,7 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"

 using libvpx_test::ACMRandom;

@@ -102,13 +103,13 @@ class Trans4x4TestBase {
    int64_t total_error = 0;
    const int count_test_block = 10000;
    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-      DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
 #endif

      // Initialize a test block with input range [-255, 255].
@@ -142,6 +143,7 @@ class Trans4x4TestBase {
        const uint32_t diff =
            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
+        ASSERT_EQ(VPX_BITS_8, bit_depth_);
        const uint32_t diff = dst[j] - src[j];
 #endif
        const uint32_t error = diff * diff;
@@ -163,9 +165,9 @@ class Trans4x4TestBase {
  void RunCoeffCheck() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 5000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);

    for (int i = 0; i < count_test_block; ++i) {
      // Initialize a test block with input range [-mask_, mask_].
@@ -184,15 +186,13 @@ class Trans4x4TestBase {
  void RunMemCheck() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 5000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);

    for (int i = 0; i < count_test_block; ++i) {
      // Initialize a test block with input range [-mask_, mask_].
      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
      }
      if (i == 0) {
@@ -219,13 +219,13 @@ class Trans4x4TestBase {
  void RunInvAccuracyCheck(int limit) {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
 #endif

    for (int i = 0; i < count_test_block; ++i) {
@@ -536,4 +536,18 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct4x4_msa, &vp9_idct4x4_16_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -20,11 +20,32 @@

 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {

 const int kNumCoeffs = 64;
 const double kPi = 3.141592653589793238462643383279502884;
+
+const int kSignBiasMaxDiff255 = 1500;
+const int kSignBiasMaxDiff15 = 10000;
+
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+                        int tx_type);
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
+typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
+
 void reference_8x8_dct_1d(const double in[8], double out[8], int stride) {
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 8; k++) {
@@ -59,23 +80,6 @@ void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
  }
 }

-using libvpx_test::ACMRandom;
-
-namespace {
-
-const int kSignBiasMaxDiff255 = 1500;
-const int kSignBiasMaxDiff15 = 10000;
-
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        int tx_type);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        int tx_type);
-
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;

 void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
  vp9_fdct8x8_c(in, out, stride);
@@ -139,8 +143,8 @@ class FwdTrans8x8TestBase {

  void RunSignBiasCheck() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_output_block, 64);
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_output_block[64]);
    int count_sign_block[64][2];
    const int count_test_block = 100000;

@@ -210,13 +214,13 @@ class FwdTrans8x8TestBase {
    int max_error = 0;
    int total_error = 0;
    const int count_test_block = 100000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]);
+    DECLARE_ALIGNED(16, uint8_t, dst[64]);
+    DECLARE_ALIGNED(16, uint8_t, src[64]);
 #if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);
+    DECLARE_ALIGNED(16, uint16_t, dst16[64]);
+    DECLARE_ALIGNED(16, uint16_t, src16[64]);
 #endif

    for (int i = 0; i < count_test_block; ++i) {
@@ -287,14 +291,14 @@ class FwdTrans8x8TestBase {
    int total_error = 0;
    int total_coeff_error = 0;
    const int count_test_block = 100000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_temp_block[64]);
+    DECLARE_ALIGNED(16, uint8_t, dst[64]);
+    DECLARE_ALIGNED(16, uint8_t, src[64]);
 #if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);
+    DECLARE_ALIGNED(16, uint16_t, dst16[64]);
+    DECLARE_ALIGNED(16, uint16_t, src16[64]);
 #endif

    for (int i = 0; i < count_test_block; ++i) {
@@ -376,13 +380,13 @@ class FwdTrans8x8TestBase {
  void RunInvAccuracyCheck() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
 #endif

    for (int i = 0; i < count_test_block; ++i) {
@@ -434,9 +438,9 @@ class FwdTrans8x8TestBase {
  void RunFwdAccuracyCheck() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_r, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff_r[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);

    for (int i = 0; i < count_test_block; ++i) {
      double out_r[kNumCoeffs];
@@ -464,12 +468,12 @@ void CompareInvReference(IdctFunc ref_txfm, int thresh) {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 10000;
    const int eob = 12;
-    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
 #endif
    const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;

@@ -777,4 +781,18 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,
                   VPX_BITS_8)));
 #endif
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct8x8_msa, &vp9_idct8x8_64_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -52,7 +52,7 @@ typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
                               const uint8_t *thresh1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH

-typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;
 typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;

 #if HAVE_SSE2
@@ -137,6 +137,20 @@ void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON_ASM

+#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh,
+                             int count) {
+  vp9_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
 public:
  virtual ~Loop8Test6Param() {}
@@ -144,6 +158,7 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
    loopfilter_op_ = GET_PARAM(0);
    ref_loopfilter_op_ = GET_PARAM(1);
    bit_depth_ = GET_PARAM(2);
+    count_ = GET_PARAM(3);
    mask_ = (1 << bit_depth_) - 1;
  }

@@ -151,6 +166,7 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {

 protected:
  int bit_depth_;
+  int count_;
  int mask_;
  loop_op_t loopfilter_op_;
  loop_op_t ref_loopfilter_op_;
@@ -180,11 +196,11 @@ TEST_P(Loop8Test6Param, OperationCheck) {
  const int count_test_block = number_of_iterations;
 #if CONFIG_VP9_HIGHBITDEPTH
  int32_t bd = bit_depth_;
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
 #else
-  DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
  int err_count_total = 0;
  int first_failure = -1;
@@ -206,7 +222,6 @@ TEST_P(Loop8Test6Param, OperationCheck) {
        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
    };
    int32_t p = kNumCoeffs/32;
-    int count = 1;

    uint16_t tmp_s[kNumCoeffs];
    int j = 0;
@@ -238,13 +253,13 @@ TEST_P(Loop8Test6Param, OperationCheck) {
      ref_s[j] = s[j];
    }
 #if CONFIG_VP9_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
    ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
 #else
-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
    ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
 #endif  // CONFIG_VP9_HIGHBITDEPTH

    for (int j = 0; j < kNumCoeffs; ++j) {
@@ -266,11 +281,11 @@ TEST_P(Loop8Test6Param, ValueCheck) {
  const int count_test_block = number_of_iterations;
 #if CONFIG_VP9_HIGHBITDEPTH
  const int32_t bd = bit_depth_;
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
 #else
-  DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
  int err_count_total = 0;
  int first_failure = -1;
@@ -279,7 +294,7 @@ TEST_P(Loop8Test6Param, ValueCheck) {
  // function of sharpness_lvl and the loopfilter lvl as:
  // block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
  // ...
-  // vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+  // memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
  //        SIMD_WIDTH);
  // This means that the largest value for mblim will occur when sharpness_lvl
  // is equal to 0, and lvl is equal to its greatest value (MAX_LOOP_FILTER).
@@ -305,19 +320,18 @@ TEST_P(Loop8Test6Param, ValueCheck) {
        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
    };
    int32_t p = kNumCoeffs / 32;
-    int count = 1;
    for (int j = 0; j < kNumCoeffs; ++j) {
      s[j] = rnd.Rand16() & mask_;
      ref_s[j] = s[j];
    }
 #if CONFIG_VP9_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
    ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
 #else
-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
    ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
    for (int j = 0; j < kNumCoeffs; ++j) {
      err_count += ref_s[j] != s[j];
@@ -338,11 +352,11 @@ TEST_P(Loop8Test9Param, OperationCheck) {
  const int count_test_block = number_of_iterations;
 #if CONFIG_VP9_HIGHBITDEPTH
  const int32_t bd = bit_depth_;
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
 #else
-  DECLARE_ALIGNED_ARRAY(8,  uint8_t,  s, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(8,  uint8_t,  ref_s, kNumCoeffs);
+  DECLARE_ALIGNED(8,  uint8_t,  s[kNumCoeffs]);
+  DECLARE_ALIGNED(8,  uint8_t,  ref_s[kNumCoeffs]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
  int err_count_total = 0;
  int first_failure = -1;
@@ -440,11 +454,11 @@ TEST_P(Loop8Test9Param, ValueCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = number_of_iterations;
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
 #else
-  DECLARE_ALIGNED_ARRAY(8,  uint8_t, s, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(8,  uint8_t, ref_s, kNumCoeffs);
+  DECLARE_ALIGNED(8,  uint8_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(8,  uint8_t, ref_s[kNumCoeffs]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
  int err_count_total = 0;
  int first_failure = -1;
@@ -521,55 +535,62 @@ INSTANTIATE_TEST_CASE_P(
    SSE2, Loop8Test6Param,
    ::testing::Values(
        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 8),
+                   &vp9_highbd_lpf_horizontal_4_c, 8, 1),
        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 8),
+                   &vp9_highbd_lpf_vertical_4_c, 8, 1),
        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 8),
+                   &vp9_highbd_lpf_horizontal_8_c, 8, 1),
        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 8),
+                   &vp9_highbd_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 8, 2),
        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 8),
+                   &vp9_highbd_lpf_vertical_8_c, 8, 1),
        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 8),
+                   &wrapper_vertical_16_c, 8, 1),
        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 10),
+                   &vp9_highbd_lpf_horizontal_4_c, 10, 1),
        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 10),
+                   &vp9_highbd_lpf_vertical_4_c, 10, 1),
        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 10),
+                   &vp9_highbd_lpf_horizontal_8_c, 10, 1),
        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 10),
+                   &vp9_highbd_lpf_horizontal_16_c, 10, 1),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 10, 2),
        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 10),
+                   &vp9_highbd_lpf_vertical_8_c, 10, 1),
        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 10),
+                   &wrapper_vertical_16_c, 10, 1),
        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 12),
+                   &vp9_highbd_lpf_horizontal_4_c, 12, 1),
        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 12),
+                   &vp9_highbd_lpf_vertical_4_c, 12, 1),
        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 12),
+                   &vp9_highbd_lpf_horizontal_8_c, 12, 1),
        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 12),
+                   &vp9_highbd_lpf_horizontal_16_c, 12, 1),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 12, 2),
        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 12),
+                   &vp9_highbd_lpf_vertical_8_c, 12, 1),
        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 12),
+                   &wrapper_vertical_16_c, 12, 1),
        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 8),
+                   &wrapper_vertical_16_dual_c, 8, 1),
        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 10),
+                   &wrapper_vertical_16_dual_c, 10, 1),
        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 12)));
+                   &wrapper_vertical_16_dual_c, 12, 1)));
 #else
 INSTANTIATE_TEST_CASE_P(
    SSE2, Loop8Test6Param,
    ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
-        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8),
-        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
+        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif

@@ -577,7 +598,9 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
    AVX2, Loop8Test6Param,
    ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8)));
+        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8,
+                   2)));
 #endif

 #if HAVE_SSE2
@@ -635,20 +658,22 @@ INSTANTIATE_TEST_CASE_P(
 // Using #if inside the macro is unsupported on MSVS but the tests are not
 // currently built for MSVS with ARM and NEON.
        make_tuple(&vp9_lpf_horizontal_16_neon,
-                   &vp9_lpf_horizontal_16_c, 8),
+                   &vp9_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vp9_lpf_horizontal_16_neon,
+                   &vp9_lpf_horizontal_16_c, 8, 2),
        make_tuple(&wrapper_vertical_16_neon,
-                   &wrapper_vertical_16_c, 8),
+                   &wrapper_vertical_16_c, 8, 1),
        make_tuple(&wrapper_vertical_16_dual_neon,
-                   &wrapper_vertical_16_dual_c, 8),
+                   &wrapper_vertical_16_dual_c, 8, 1),
        make_tuple(&vp9_lpf_horizontal_8_neon,
-                   &vp9_lpf_horizontal_8_c, 8),
+                   &vp9_lpf_horizontal_8_c, 8, 1),
        make_tuple(&vp9_lpf_vertical_8_neon,
-                   &vp9_lpf_vertical_8_c, 8),
+                   &vp9_lpf_vertical_8_c, 8, 1),
 #endif  // HAVE_NEON_ASM
        make_tuple(&vp9_lpf_horizontal_4_neon,
-                   &vp9_lpf_horizontal_4_c, 8),
+                   &vp9_lpf_horizontal_4_c, 8, 1),
        make_tuple(&vp9_lpf_vertical_4_neon,
-                   &vp9_lpf_vertical_4_c, 8)));
+                   &vp9_lpf_vertical_4_c, 8, 1)));
 INSTANTIATE_TEST_CASE_P(
    NEON, Loop8Test9Param,
    ::testing::Values(
@@ -665,4 +690,27 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON

+#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_CASE_P(
+    MSA, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vp9_lpf_horizontal_8_msa, &vp9_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vp9_lpf_vertical_8_msa, &vp9_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vp9_lpf_horizontal_4_dual_msa,
+                   &vp9_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vp9_lpf_horizontal_8_dual_msa,
+                   &vp9_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vp9_lpf_vertical_4_dual_msa,
+                   &vp9_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vp9_lpf_vertical_8_dual_msa,
+                   &vp9_lpf_vertical_8_dual_c, 8)));
+#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+
 }  // namespace
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -42,6 +42,10 @@ class MD5 {
    }
  }

+  void Add(const uint8_t *data, size_t size) {
+    MD5Update(&md5_, data, static_cast<uint32_t>(size));
+  }
+
  const char *Get(void) {
    static const char hex[16] = {
      '0', '1', '2', '3', '4', '5', '6', '7',
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -74,16 +74,16 @@ TEST_P(PartialIDctTest, RunQuantCheck) {
      FAIL() << "Wrong Size!";
      break;
  }
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);

  const int count_test_block = 1000;
  const int block_size = size * size;

-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kMaxNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);

  int max_error = 0;
  for (int i = 0; i < count_test_block; ++i) {
@@ -153,10 +153,10 @@ TEST_P(PartialIDctTest, ResultsMatch) {
      FAIL() << "Wrong Size!";
      break;
  }
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
  const int count_test_block = 1000;
  const int max_coeff = 32766 / 4;
  const int block_size = size * size;
@@ -230,7 +230,7 @@ INSTANTIATE_TEST_CASE_P(
                   &vp9_idct4x4_1_add_c,
                   TX_4X4, 1)));

-#if HAVE_NEON
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
    NEON, PartialIDctTest,
    ::testing::Values(
@@ -258,7 +258,7 @@ INSTANTIATE_TEST_CASE_P(
                   &vp9_idct4x4_16_add_c,
                   &vp9_idct4x4_1_add_neon,
                   TX_4X4, 1)));
-#endif  // HAVE_NEON
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
@@ -305,13 +305,38 @@ INSTANTIATE_TEST_CASE_P(
                   TX_8X8, 12)));
 #endif

-#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, PartialIDctTest,
+    MSA, PartialIDctTest,
    ::testing::Values(
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c,
+                   &vp9_idct32x32_34_add_msa,
+                   TX_32X32, 34),
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c,
+                   &vp9_idct32x32_1_add_msa,
+                   TX_32X32, 1),
        make_tuple(&vp9_fdct16x16_c,
                   &vp9_idct16x16_256_add_c,
-                   &vp9_idct16x16_10_add_ssse3,
-                   TX_16X16, 10)));
-#endif
+                   &vp9_idct16x16_10_add_msa,
+                   TX_16X16, 10),
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_c,
+                   &vp9_idct16x16_1_add_msa,
+                   TX_16X16, 1),
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
+                   &vp9_idct8x8_12_add_msa,
+                   TX_8X8, 10),
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
+                   &vp9_idct8x8_1_add_msa,
+                   TX_8X8, 1),
+        make_tuple(&vp9_fdct4x4_c,
+                   &vp9_idct4x4_16_add_c,
+                   &vp9_idct4x4_1_add_msa,
+                   TX_4X4, 1)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 }  // namespace
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -63,12 +63,12 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
  uint8_t *const dst_image_ptr = dst_image + 8;
  uint8_t *const flimits =
      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
-  (void)vpx_memset(flimits, 255, block_width);
+  (void)memset(flimits, 255, block_width);

  // Initialize pixels in the input:
  //   block pixels to value 1,
  //   border pixels to value 10.
-  (void)vpx_memset(src_image, 10, input_size);
+  (void)memset(src_image, 10, input_size);
  uint8_t *pixel_ptr = src_image_ptr;
  for (int i = 0; i < block_height; ++i) {
    for (int j = 0; j < block_width; ++j) {
@@ -78,7 +78,7 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
  }

  // Initialize pixels in the output to 99.
-  (void)vpx_memset(dst_image, 99, output_size);
+  (void)memset(dst_image, 99, output_size);

  ASM_REGISTER_STATE_CHECK(
      GetParam()(src_image_ptr, dst_image_ptr, input_stride,
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -56,7 +56,7 @@ class QuantizeTestBase {

    // The full configuration is necessary to generate the quantization tables.
    VP8_CONFIG vp8_config;
-    vpx_memset(&vp8_config, 0, sizeof(vp8_config));
+    memset(&vp8_config, 0, sizeof(vp8_config));

    vp8_comp_ = vp8_create_compressor(&vp8_config);

@@ -69,8 +69,7 @@ class QuantizeTestBase {
    // Copy macroblockd from the reference to get pre-set-up dequant values.
    macroblockd_dst_ = reinterpret_cast<MACROBLOCKD *>(
        vpx_memalign(32, sizeof(*macroblockd_dst_)));
-    vpx_memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd,
-               sizeof(*macroblockd_dst_));
+    memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
    // Fix block pointers - currently they point to the blocks in the reference
    // structure.
    vp8_setup_block_dptrs(macroblockd_dst_);
@@ -79,8 +78,7 @@ class QuantizeTestBase {
  void UpdateQuantizer(int q) {
    vp8_set_quantizer(vp8_comp_, q);

-    vpx_memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd,
-               sizeof(*macroblockd_dst_));
+    memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
    vp8_setup_block_dptrs(macroblockd_dst_);
  }

--- a/test/sad_test.cc
+++ b/test/sad_test.cc
--- a/test/set_roi.cc
+++ b/test/set_roi.cc
@@ -53,7 +53,7 @@ TEST(VP8RoiMapTest, ParameterCheck) {
  cpi.common.mb_rows = 240 >> 4;
  cpi.common.mb_cols = 320 >> 4;
  const int mbs = (cpi.common.mb_rows * cpi.common.mb_cols);
-  vpx_memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data));
+  memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data));

  // Segment map
  cpi.segmentation_map = reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
@@ -61,9 +61,9 @@ TEST(VP8RoiMapTest, ParameterCheck) {
  // Allocate memory for the source memory map.
  unsigned char *roi_map =
    reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
-  vpx_memset(&roi_map[mbs >> 2], 1, (mbs >> 2));
-  vpx_memset(&roi_map[mbs >> 1], 2, (mbs >> 2));
-  vpx_memset(&roi_map[mbs -(mbs >> 2)], 3, (mbs >> 2));
+  memset(&roi_map[mbs >> 2], 1, (mbs >> 2));
+  memset(&roi_map[mbs >> 1], 2, (mbs >> 2));
+  memset(&roi_map[mbs -(mbs >> 2)], 3, (mbs >> 2));

  // Do a test call with valid parameters.
  int roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -63,6 +63,9 @@ class SvcTest : public ::testing::Test {
    vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
    VP9CodecFactory codec_factory;
    decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
+
+    tile_columns_ = 0;
+    tile_rows_ = 0;
  }

  virtual void TearDown() {
@@ -75,6 +78,8 @@ class SvcTest : public ::testing::Test {
        vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
    EXPECT_EQ(VPX_CODEC_OK, res);
    vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4);  // Make the test faster
+    vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_);
+    vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_);
    codec_initialized_ = true;
  }

@@ -108,7 +113,8 @@ class SvcTest : public ::testing::Test {
    codec_enc_.g_pass = VPX_RC_FIRST_PASS;
    InitializeEncoder();

-    libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+    libvpx_test::I420VideoSource video(test_file_name_,
+                                       codec_enc_.g_w, codec_enc_.g_h,
                                       codec_enc_.g_timebase.den,
                                       codec_enc_.g_timebase.num, 0, 30);
    video.Begin();
@@ -176,7 +182,8 @@ class SvcTest : public ::testing::Test {
    }
    InitializeEncoder();

-    libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+    libvpx_test::I420VideoSource video(test_file_name_,
+                                       codec_enc_.g_w, codec_enc_.g_h,
                                       codec_enc_.g_timebase.den,
                                       codec_enc_.g_timebase.num, 0, 30);
    video.Begin();
@@ -310,6 +317,8 @@ class SvcTest : public ::testing::Test {
  std::string test_file_name_;
  bool codec_initialized_;
  Decoder *decoder_;
+  int tile_columns_;
+  int tile_rows_;
 };

 TEST_F(SvcTest, SvcInit) {
@@ -444,6 +453,7 @@ TEST_F(SvcTest, OnePassEncodeOneFrame) {

 TEST_F(SvcTest, OnePassEncodeThreeFrames) {
  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  codec_enc_.g_lag_in_frames = 0;
  vpx_fixed_buf outputs[3];
  memset(&outputs[0], 0, sizeof(outputs));
  Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
@@ -737,4 +747,51 @@ TEST_F(SvcTest,
  FreeBitstreamBuffers(&outputs[0], 10);
 }

+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  codec_enc_.g_w = 704;
+  codec_enc_.g_h = 144;
+  tile_columns_ = 1;
+  tile_rows_ = 1;
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  codec_enc_.g_w = 704;
+  codec_enc_.g_h = 144;
+  tile_columns_ = 1;
+  tile_rows_ = 1;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
 }  // namespace
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -12,6 +12,7 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420_a10-1.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
--- a/test/test.mk
+++ b/test/test.mk
@@ -66,6 +66,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../tools_common.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc
 endif

 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += decode_api_test.cc
@@ -150,6 +151,9 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += vp9_intrapred_test.cc

 ifeq ($(CONFIG_VP9_ENCODER),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
+
 endif

 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
@@ -160,6 +164,9 @@ endif # VP9

 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc

+TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9_DECODER) := test_intra_pred_speed.cc
+TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9_DECODER) += ../md5_utils.h ../md5_utils.c
+
 endif # CONFIG_SHARED

 include $(SRC_PATH_BARE)/test/test-data.mk
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -0,0 +1,384 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+//  Test and time VP9 intra-predictor functions
+
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/md5_helper.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+
+// -----------------------------------------------------------------------------
+
+namespace {
+
+typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
+                            const uint8_t *above, const uint8_t *left);
+
+const int kNumVp9IntraPredFuncs = 13;
+const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
+  "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED", "H_PRED",
+  "D45_PRED", "D135_PRED", "D117_PRED", "D153_PRED", "D207_PRED", "D63_PRED",
+  "TM_PRED"
+};
+
+void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs,
+                   const char *const pred_func_names[], int num_funcs,
+                   const char *const signatures[], int block_size,
+                   int num_pixels_per_test) {
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const int kBPS = 32;
+  const int kTotalPixels = 32 * kBPS;
+  DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, uint8_t, left[kBPS]);
+  DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]);
+  uint8_t *const above = above_mem + 16;
+  for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8();
+  for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8();
+  for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8();
+  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+
+  // some code assumes the top row has been extended:
+  // d45/d63 C-code, for instance, but not the assembly.
+  // TODO(jzern): this style of extension isn't strictly necessary.
+  ASSERT_LE(block_size, kBPS);
+  memset(above + block_size, above[block_size - 1], 2 * kBPS - block_size);
+
+  for (int k = 0; k < num_funcs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(src, ref_src, sizeof(src));
+    vpx_usec_timer timer;
+    vpx_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](src, kBPS, above, left);
+    }
+    libvpx_test::ClearSystemState();
+    vpx_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+    libvpx_test::MD5 md5;
+    md5.Add(src, sizeof(src));
+    printf("Mode %s[%12s]: %5d ms     MD5: %s\n", name, pred_func_names[k],
+           elapsed_time, md5.Get());
+    EXPECT_STREQ(signatures[k], md5.Get());
+  }
+}
+
+void TestIntraPred4(VpxPredFunc const *pred_funcs) {
+  static const int kNumVp9IntraFuncs = 13;
+  static const char *const kSignatures[kNumVp9IntraFuncs] = {
+    "4334156168b34ab599d9b5b30f522fe9",
+    "bc4649d5ba47c7ff178d92e475960fb0",
+    "8d316e5933326dcac24e1064794b5d12",
+    "a27270fed024eafd762c95de85f4da51",
+    "c33dff000d4256c2b8f3bf9e9bab14d2",
+    "44d8cddc2ad8f79b8ed3306051722b4f",
+    "eb54839b2bad6699d8946f01ec041cd0",
+    "ecb0d56ae5f677ea45127ce9d5c058e4",
+    "0b7936841f6813da818275944895b574",
+    "9117972ef64f91a58ff73e1731c81db2",
+    "c56d5e8c729e46825f46dd5d3b5d508a",
+    "c0889e2039bcf7bcb5d2f33cdca69adc",
+    "309a618577b27c648f9c5ee45252bc8f",
+  };
+  TestIntraPred("Intra4", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
+                kSignatures, 4, 4 * 4 * kNumVp9IntraFuncs);
+}
+
+void TestIntraPred8(VpxPredFunc const *pred_funcs) {
+  static const int kNumVp9IntraFuncs = 13;
+  static const char *const kSignatures[kNumVp9IntraFuncs] = {
+    "7694ddeeefed887faf9d339d18850928",
+    "7d726b1213591b99f736be6dec65065b",
+    "19c5711281357a485591aaf9c96c0a67",
+    "ba6b66877a089e71cd938e3b8c40caac",
+    "802440c93317e0f8ba93fab02ef74265",
+    "9e09a47a15deb0b9d8372824f9805080",
+    "b7c2d8c662268c0c427da412d7b0311d",
+    "78339c1c60bb1d67d248ab8c4da08b7f",
+    "5c97d70f7d47de1882a6cd86c165c8a9",
+    "8182bf60688b42205acd95e59e967157",
+    "08323400005a297f16d7e57e7fe1eaac",
+    "95f7bfc262329a5849eda66d8f7c68ce",
+    "815b75c8e0d91cc1ae766dc5d3e445a3",
+  };
+  TestIntraPred("Intra8", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
+                kSignatures, 8, 8 * 8 * kNumVp9IntraFuncs);
+}
+
+void TestIntraPred16(VpxPredFunc const *pred_funcs) {
+  static const int kNumVp9IntraFuncs = 13;
+  static const char *const kSignatures[kNumVp9IntraFuncs] = {
+    "b40dbb555d5d16a043dc361e6694fe53",
+    "fb08118cee3b6405d64c1fd68be878c6",
+    "6c190f341475c837cc38c2e566b64875",
+    "db5c34ccbe2c7f595d9b08b0dc2c698c",
+    "a62cbfd153a1f0b9fed13e62b8408a7a",
+    "143df5b4c89335e281103f610f5052e4",
+    "d87feb124107cdf2cfb147655aa0bb3c",
+    "7841fae7d4d47b519322e6a03eeed9dc",
+    "f6ebed3f71cbcf8d6d0516ce87e11093",
+    "3cc480297dbfeed01a1c2d78dd03d0c5",
+    "b9f69fa6532b372c545397dcb78ef311",
+    "a8fe1c70432f09d0c20c67bdb6432c4d",
+    "b8a41aa968ec108af447af4217cba91b",
+  };
+  TestIntraPred("Intra16", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
+                kSignatures, 16, 16 * 16 * kNumVp9IntraFuncs);
+}
+
+void TestIntraPred32(VpxPredFunc const *pred_funcs) {
+  static const int kNumVp9IntraFuncs = 13;
+  static const char *const kSignatures[kNumVp9IntraFuncs] = {
+    "558541656d84f9ae7896db655826febe",
+    "b3587a1f9a01495fa38c8cd3c8e2a1bf",
+    "4c6501e64f25aacc55a2a16c7e8f0255",
+    "b3b01379ba08916ef6b1b35f7d9ad51c",
+    "0f1eb38b6cbddb3d496199ef9f329071",
+    "911c06efb9ed1c3b4c104b232b55812f",
+    "9225beb0ddfa7a1d24eaa1be430a6654",
+    "0a6d584a44f8db9aa7ade2e2fdb9fc9e",
+    "b01c9076525216925f3456f034fb6eee",
+    "d267e20ad9e5cd2915d1a47254d3d149",
+    "ed012a4a5da71f36c2393023184a0e59",
+    "f162b51ed618d28b936974cff4391da5",
+    "9e1370c6d42e08d357d9612c93a71cfc",
+  };
+  TestIntraPred("Intra32", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
+                kSignatures, 32, 32 * 32 * kNumVp9IntraFuncs);
+}
+
+}  // namespace
+
+// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
+// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4.
+#define INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, v, h, \
+                        d45, d135, d117, d153, d207, d63, tm)               \
+  TEST(arch, test_func) {                                                   \
+    static const VpxPredFunc vp9_intra_pred[] = {                           \
+        dc,   dc_left, dc_top, dc_128, v,   h, d45,                         \
+        d135, d117,    d153,   d207,   d63, tm};                            \
+    test_func(vp9_intra_pred);                                              \
+  }
+
+// -----------------------------------------------------------------------------
+// 4x4
+
+INTRA_PRED_TEST(C, TestIntraPred4, vp9_dc_predictor_4x4_c,
+                vp9_dc_left_predictor_4x4_c, vp9_dc_top_predictor_4x4_c,
+                vp9_dc_128_predictor_4x4_c, vp9_v_predictor_4x4_c,
+                vp9_h_predictor_4x4_c, vp9_d45_predictor_4x4_c,
+                vp9_d135_predictor_4x4_c, vp9_d117_predictor_4x4_c,
+                vp9_d153_predictor_4x4_c, vp9_d207_predictor_4x4_c,
+                vp9_d63_predictor_4x4_c, vp9_tm_predictor_4x4_c)
+
+#if HAVE_SSE
+INTRA_PRED_TEST(SSE, TestIntraPred4, vp9_dc_predictor_4x4_sse,
+                vp9_dc_left_predictor_4x4_sse, vp9_dc_top_predictor_4x4_sse,
+                vp9_dc_128_predictor_4x4_sse, vp9_v_predictor_4x4_sse, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_sse)
+#endif  // HAVE_SSE
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
+                vp9_h_predictor_4x4_ssse3, vp9_d45_predictor_4x4_ssse3, NULL,
+                NULL, vp9_d153_predictor_4x4_ssse3,
+                vp9_d207_predictor_4x4_ssse3, vp9_d63_predictor_4x4_ssse3, NULL)
+#endif  // HAVE_SSSE3
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, vp9_dc_predictor_4x4_dspr2, NULL, NULL,
+                NULL, NULL, vp9_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
+                NULL, NULL, vp9_tm_predictor_4x4_dspr2)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
+                vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
+                vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
+                vp9_h_predictor_4x4_neon, vp9_d45_predictor_4x4_neon,
+                vp9_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_4x4_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred4, vp9_dc_predictor_4x4_msa,
+                vp9_dc_left_predictor_4x4_msa, vp9_dc_top_predictor_4x4_msa,
+                vp9_dc_128_predictor_4x4_msa, vp9_v_predictor_4x4_msa,
+                vp9_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_4x4_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 8x8
+
+INTRA_PRED_TEST(C, TestIntraPred8, vp9_dc_predictor_8x8_c,
+                vp9_dc_left_predictor_8x8_c, vp9_dc_top_predictor_8x8_c,
+                vp9_dc_128_predictor_8x8_c, vp9_v_predictor_8x8_c,
+                vp9_h_predictor_8x8_c, vp9_d45_predictor_8x8_c,
+                vp9_d135_predictor_8x8_c, vp9_d117_predictor_8x8_c,
+                vp9_d153_predictor_8x8_c, vp9_d207_predictor_8x8_c,
+                vp9_d63_predictor_8x8_c, vp9_tm_predictor_8x8_c)
+
+#if HAVE_SSE
+INTRA_PRED_TEST(SSE, TestIntraPred8, vp9_dc_predictor_8x8_sse,
+                vp9_dc_left_predictor_8x8_sse, vp9_dc_top_predictor_8x8_sse,
+                vp9_dc_128_predictor_8x8_sse, vp9_v_predictor_8x8_sse, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+#endif  // HAVE_SSE
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_8x8_sse2)
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
+                vp9_h_predictor_8x8_ssse3, vp9_d45_predictor_8x8_ssse3, NULL,
+                NULL, vp9_d153_predictor_8x8_ssse3,
+                vp9_d207_predictor_8x8_ssse3, vp9_d63_predictor_8x8_ssse3, NULL)
+#endif  // HAVE_SSSE3
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, vp9_dc_predictor_8x8_dspr2, NULL, NULL,
+                NULL, NULL, vp9_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
+                NULL, NULL, vp9_tm_predictor_8x8_c)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred8, vp9_dc_predictor_8x8_neon,
+                vp9_dc_left_predictor_8x8_neon, vp9_dc_top_predictor_8x8_neon,
+                vp9_dc_128_predictor_8x8_neon, vp9_v_predictor_8x8_neon,
+                vp9_h_predictor_8x8_neon, vp9_d45_predictor_8x8_neon, NULL,
+                NULL, NULL, NULL, NULL, vp9_tm_predictor_8x8_neon)
+
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred8, vp9_dc_predictor_8x8_msa,
+                vp9_dc_left_predictor_8x8_msa, vp9_dc_top_predictor_8x8_msa,
+                vp9_dc_128_predictor_8x8_msa, vp9_v_predictor_8x8_msa,
+                vp9_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_8x8_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 16x16
+
+INTRA_PRED_TEST(C, TestIntraPred16, vp9_dc_predictor_16x16_c,
+                vp9_dc_left_predictor_16x16_c, vp9_dc_top_predictor_16x16_c,
+                vp9_dc_128_predictor_16x16_c, vp9_v_predictor_16x16_c,
+                vp9_h_predictor_16x16_c, vp9_d45_predictor_16x16_c,
+                vp9_d135_predictor_16x16_c, vp9_d117_predictor_16x16_c,
+                vp9_d153_predictor_16x16_c, vp9_d207_predictor_16x16_c,
+                vp9_d63_predictor_16x16_c, vp9_tm_predictor_16x16_c)
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2, TestIntraPred16, vp9_dc_predictor_16x16_sse2,
+                vp9_dc_left_predictor_16x16_sse2,
+                vp9_dc_top_predictor_16x16_sse2,
+                vp9_dc_128_predictor_16x16_sse2, vp9_v_predictor_16x16_sse2,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_16x16_sse2)
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL,
+                vp9_h_predictor_16x16_ssse3, vp9_d45_predictor_16x16_ssse3,
+                NULL, NULL, vp9_d153_predictor_16x16_ssse3,
+                vp9_d207_predictor_16x16_ssse3, vp9_d63_predictor_16x16_ssse3,
+                NULL)
+#endif  // HAVE_SSSE3
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred16, vp9_dc_predictor_16x16_dspr2, NULL,
+                NULL, NULL, NULL, vp9_h_predictor_16x16_dspr2, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred16, vp9_dc_predictor_16x16_neon,
+                vp9_dc_left_predictor_16x16_neon,
+                vp9_dc_top_predictor_16x16_neon,
+                vp9_dc_128_predictor_16x16_neon, vp9_v_predictor_16x16_neon,
+                vp9_h_predictor_16x16_neon, vp9_d45_predictor_16x16_neon, NULL,
+                NULL, NULL, NULL, NULL, vp9_tm_predictor_16x16_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred16, vp9_dc_predictor_16x16_msa,
+                vp9_dc_left_predictor_16x16_msa, vp9_dc_top_predictor_16x16_msa,
+                vp9_dc_128_predictor_16x16_msa, vp9_v_predictor_16x16_msa,
+                vp9_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_16x16_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 32x32
+
+INTRA_PRED_TEST(C, TestIntraPred32, vp9_dc_predictor_32x32_c,
+                vp9_dc_left_predictor_32x32_c, vp9_dc_top_predictor_32x32_c,
+                vp9_dc_128_predictor_32x32_c, vp9_v_predictor_32x32_c,
+                vp9_h_predictor_32x32_c, vp9_d45_predictor_32x32_c,
+                vp9_d135_predictor_32x32_c, vp9_d117_predictor_32x32_c,
+                vp9_d153_predictor_32x32_c, vp9_d207_predictor_32x32_c,
+                vp9_d63_predictor_32x32_c, vp9_tm_predictor_32x32_c)
+
+#if HAVE_SSE2
+#if ARCH_X86_64
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vp9_dc_predictor_32x32_sse2,
+                vp9_dc_left_predictor_32x32_sse2,
+                vp9_dc_top_predictor_32x32_sse2,
+                vp9_dc_128_predictor_32x32_sse2, vp9_v_predictor_32x32_sse2,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_32x32_sse2)
+#else
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vp9_dc_predictor_32x32_sse2,
+                vp9_dc_left_predictor_32x32_sse2,
+                vp9_dc_top_predictor_32x32_sse2,
+                vp9_dc_128_predictor_32x32_sse2, vp9_v_predictor_32x32_sse2,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+#endif  // ARCH_X86_64
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
+                vp9_h_predictor_32x32_ssse3, vp9_d45_predictor_32x32_ssse3,
+                NULL, NULL, vp9_d153_predictor_32x32_ssse3,
+                vp9_d207_predictor_32x32_ssse3, vp9_d63_predictor_32x32_ssse3,
+                NULL)
+#endif  // HAVE_SSSE3
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred32, vp9_dc_predictor_32x32_neon,
+                vp9_dc_left_predictor_32x32_neon,
+                vp9_dc_top_predictor_32x32_neon,
+                vp9_dc_128_predictor_32x32_neon, vp9_v_predictor_32x32_neon,
+                vp9_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_32x32_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred32, vp9_dc_predictor_32x32_msa,
+                vp9_dc_left_predictor_32x32_msa, vp9_dc_top_predictor_32x32_msa,
+                vp9_dc_128_predictor_32x32_msa, vp9_v_predictor_32x32_msa,
+                vp9_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_32x32_msa)
+#endif  // HAVE_MSA
+
+#include "test/test_libvpx.cc"
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -15,10 +15,12 @@
 extern "C" {
 #if CONFIG_VP8
 extern void vp8_rtcd();
-#endif
+#endif  // CONFIG_VP8
 #if CONFIG_VP9
 extern void vp9_rtcd();
-#endif
+#endif  // CONFIG_VP9
+extern void vpx_dsp_rtcd();
+extern void vpx_scale_rtcd();
 }
 #include "third_party/googletest/src/include/gtest/gtest.h"

@@ -36,21 +38,21 @@ int main(int argc, char **argv) {
 #if ARCH_X86 || ARCH_X86_64
  const int simd_caps = x86_simd_caps();
  if (!(simd_caps & HAS_MMX))
-    append_negative_gtest_filter(":MMX/*");
+    append_negative_gtest_filter(":MMX.*:MMX/*");
  if (!(simd_caps & HAS_SSE))
-    append_negative_gtest_filter(":SSE/*");
+    append_negative_gtest_filter(":SSE.*:SSE/*");
  if (!(simd_caps & HAS_SSE2))
-    append_negative_gtest_filter(":SSE2/*");
+    append_negative_gtest_filter(":SSE2.*:SSE2/*");
  if (!(simd_caps & HAS_SSE3))
-    append_negative_gtest_filter(":SSE3/*");
+    append_negative_gtest_filter(":SSE3.*:SSE3/*");
  if (!(simd_caps & HAS_SSSE3))
-    append_negative_gtest_filter(":SSSE3/*");
+    append_negative_gtest_filter(":SSSE3.*:SSSE3/*");
  if (!(simd_caps & HAS_SSE4_1))
-    append_negative_gtest_filter(":SSE4_1/*");
+    append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*");
  if (!(simd_caps & HAS_AVX))
-    append_negative_gtest_filter(":AVX/*");
+    append_negative_gtest_filter(":AVX.*:AVX/*");
  if (!(simd_caps & HAS_AVX2))
-    append_negative_gtest_filter(":AVX2/*");
+    append_negative_gtest_filter(":AVX2.*:AVX2/*");
 #endif

 #if !CONFIG_SHARED
@@ -59,11 +61,13 @@ int main(int argc, char **argv) {

 #if CONFIG_VP8
  vp8_rtcd();
-#endif
+#endif  // CONFIG_VP8
 #if CONFIG_VP9
  vp9_rtcd();
-#endif
-#endif
+#endif  // CONFIG_VP9
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+#endif  // !CONFIG_SHARED

  return RUN_ALL_TESTS();
 }
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -165,7 +165,10 @@ const char *const kVP9TestVectors[] = {
  "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
  "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
  "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
+#if !CONFIG_SIZE_LIMIT || \
+    (DECODE_WIDTH_LIMIT >= 20400 && DECODE_HEIGHT_LIMIT >= 120)
  "vp90-2-13-largescaling.webm",
+#endif
  "vp90-2-14-resize-fp-tiles-1-16.webm",
  "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
  "vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm",
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -402,11 +402,15 @@ VP9_IVF_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-09-subpixel-00.ivf"

 VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
 VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
+VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"

 YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
 YUV_RAW_INPUT_WIDTH=352
 YUV_RAW_INPUT_HEIGHT=288

+Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
+Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+
 # Setup a trap function to clean up after tests complete.
 trap cleanup EXIT

@@ -428,6 +432,7 @@ vlog "$(basename "${0%.*}") test configuration:
  VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT}
  YUV_RAW_INPUT=${YUV_RAW_INPUT}
  YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH}
-  YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}"
+  YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}
+  Y4M_NOSQ_PAR_INPUT=${Y4M_NOSQ_PAR_INPUT}"

 fi  # End $VPX_TEST_TOOLS_COMMON_SH pseudo include guard.
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
--- a/test/vp8_denoiser_sse2_test.cc
+++ b/test/vp8_denoiser_sse2_test.cc
@@ -52,13 +52,13 @@ TEST_P(VP8DenoiserTest, BitexactCheck) {
  // mc_avg_block is the denoised reference block,
  // avg_block_c is the denoised result from C code,
  // avg_block_sse2 is the denoised result from SSE2 code.
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, sig_block_c, kNumPixels);
+  DECLARE_ALIGNED(16, uint8_t, sig_block_c[kNumPixels]);
  // Since in VP8 denoiser, the source signal will be changed,
  // we need another copy of the source signal as the input of sse2 code.
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, sig_block_sse2, kNumPixels);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, mc_avg_block, kNumPixels);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_c, kNumPixels);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_sse2, kNumPixels);
+  DECLARE_ALIGNED(16, uint8_t, sig_block_sse2[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]);

  for (int i = 0; i < count_test_block; ++i) {
    // Generate random motion magnitude, 20% of which exceed the threshold.
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -121,6 +121,79 @@ class AverageTest
  }
 };

+typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
+                              const int ref_stride, const int height);
+
+typedef std::tr1::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  IntProRowTest()
+    : AverageTestBase(16, GET_PARAM(0)),
+      hbuf_asm_(NULL),
+      hbuf_c_(NULL) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  virtual void SetUp() {
+    hbuf_asm_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    hbuf_c_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+  }
+
+  virtual void TearDown() {
+    vpx_free(hbuf_c_);
+    hbuf_c_ = NULL;
+    vpx_free(hbuf_asm_);
+    hbuf_asm_ = NULL;
+  }
+
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+typedef std::tr1::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t sum_asm_;
+  int16_t sum_c_;
+};
+

 uint8_t* AverageTestBase::source_data_ = NULL;

@@ -143,6 +216,36 @@ TEST_P(AverageTest, Random) {
  }
 }

+TEST_P(IntProRowTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
 using std::tr1::make_tuple;

 INSTANTIATE_TEST_CASE_P(
@@ -151,7 +254,6 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
        make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));

-
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
    SSE2, AverageTest,
@@ -163,6 +265,17 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));

+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
+        make_tuple(32, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
+        make_tuple(64, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProColTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
+        make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
+        make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
 #endif

 #if HAVE_NEON
@@ -175,4 +288,16 @@ INSTANTIATE_TEST_CASE_P(

 #endif

+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_msa),
+        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_msa),
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_msa),
+        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_msa),
+        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_msa),
+        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_msa)));
+#endif
+
 }  // namespace
--- a/test/vp9_denoiser_sse2_test.cc
+++ b/test/vp9_denoiser_sse2_test.cc
@@ -52,10 +52,10 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
  // mc_avg_block is the denoised reference block,
  // avg_block_c is the denoised result from C code,
  // avg_block_sse2 is the denoised result from SSE2 code.
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, sig_block, kNumPixels);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, mc_avg_block, kNumPixels);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_c, kNumPixels);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_sse2, kNumPixels);
+  DECLARE_ALIGNED(16, uint8_t, sig_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]);

  for (int i = 0; i < count_test_block; ++i) {
    // Generate random motion magnitude, 20% of which exceed the threshold.
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -21,6 +21,7 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -57,8 +58,8 @@ class ErrorBlockTest

 TEST_P(ErrorBlockTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff,   4096);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
  int err_count_total = 0;
  int first_failure = -1;
  intptr_t block_size;
@@ -90,8 +91,8 @@ TEST_P(ErrorBlockTest, OperationCheck) {

 TEST_P(ErrorBlockTest, ExtremeValues) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff,   4096);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
  int err_count_total = 0;
  int first_failure = -1;
  intptr_t block_size;
--- a/test/vp9_frame_parallel_test.cc
+++ b/test/vp9_frame_parallel_test.cc
@@ -29,7 +29,7 @@ using std::string;

 #if CONFIG_WEBM_IO

-struct FileList {
+struct PauseFileList {
  const char *name;
  // md5 sum for decoded frames which does not include skipped frames.
  const char *expected_md5;
@@ -39,7 +39,8 @@ struct FileList {
 // Decodes |filename| with |num_threads|. Pause at the specified frame_num,
 // seek to next key frame and then continue decoding until the end. Return
 // the md5 of the decoded frames which does not include skipped frames.
-string DecodeFile(const string &filename, int num_threads, int pause_num) {
+string DecodeFileWithPause(const string &filename, int num_threads,
+                           int pause_num) {
  libvpx_test::WebMVideoSource video(filename);
  video.Init();
  int in_frames = 0;
@@ -92,12 +93,12 @@ string DecodeFile(const string &filename, int num_threads, int pause_num) {
  return string(md5.Get());
 }

-void DecodeFiles(const FileList files[]) {
-  for (const FileList *iter = files; iter->name != NULL; ++iter) {
+void DecodeFilesWithPause(const PauseFileList files[]) {
+  for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
    SCOPED_TRACE(iter->name);
    for (int t = 2; t <= 8; ++t) {
      EXPECT_EQ(iter->expected_md5,
-                DecodeFile(iter->name, t, iter->pause_frame_num))
+                DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
          << "threads = " << t;
    }
  }
@@ -106,19 +107,19 @@ void DecodeFiles(const FileList files[]) {
 TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
  // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
  // one key frame for every ten frames.
-  static const FileList files[] = {
+  static const PauseFileList files[] = {
    { "vp90-2-07-frame_parallel-1.webm",
-      "6ea7c3875d67252e7caf2bc6e75b36b1", 6},
+      "6ea7c3875d67252e7caf2bc6e75b36b1", 6 },
    { "vp90-2-07-frame_parallel-1.webm",
-      "4bb634160c7356a8d7d4299b6dc83a45", 12},
+      "4bb634160c7356a8d7d4299b6dc83a45", 12 },
    { "vp90-2-07-frame_parallel-1.webm",
-      "89772591e6ef461f9fa754f916c78ed8", 26},
-    { NULL, NULL, 0},
+      "89772591e6ef461f9fa754f916c78ed8", 26 },
+    { NULL, NULL, 0 },
  };
-  DecodeFiles(files);
+  DecodeFilesWithPause(files);
 }

-struct InvalidFileList {
+struct FileList {
  const char *name;
  // md5 sum for decoded frames which does not include corrupted frames.
  const char *expected_md5;
@@ -128,7 +129,7 @@ struct InvalidFileList {

 // Decodes |filename| with |num_threads|. Return the md5 of the decoded
 // frames which does not include corrupted frames.
-string DecodeInvalidFile(const string &filename, int num_threads,
+string DecodeFile(const string &filename, int num_threads,
                  int expected_frame_count) {
  libvpx_test::WebMVideoSource video(filename);
  video.Init();
@@ -173,37 +174,47 @@ string DecodeInvalidFile(const string &filename, int num_threads,
  return string(md5.Get());
 }

-void DecodeInvalidFiles(const InvalidFileList files[]) {
-  for (const InvalidFileList *iter = files; iter->name != NULL; ++iter) {
+void DecodeFiles(const FileList files[]) {
+  for (const FileList *iter = files; iter->name != NULL; ++iter) {
    SCOPED_TRACE(iter->name);
    for (int t = 2; t <= 8; ++t) {
      EXPECT_EQ(iter->expected_md5,
-                DecodeInvalidFile(iter->name, t, iter->expected_frame_count))
+                DecodeFile(iter->name, t, iter->expected_frame_count))
          << "threads = " << t;
    }
  }
 }

 TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
-  static const InvalidFileList files[] = {
+  static const FileList files[] = {
    // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
    // one key frame for every ten frames. The 11th frame has corrupted data.
    { "invalid-vp90-2-07-frame_parallel-1.webm",
-      "0549d0f45f60deaef8eb708e6c0eb6cb", 30},
+      "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
    // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
    // one key frame for every ten frames. The 1st and 31st frames have
    // corrupted data.
    { "invalid-vp90-2-07-frame_parallel-2.webm",
-      "6a1f3cf6f9e7a364212fadb9580d525e", 20},
+      "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
    // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
    // one key frame for every ten frames. The 5th and 13th frames have
    // corrupted data.
    { "invalid-vp90-2-07-frame_parallel-3.webm",
-      "8256544308de926b0681e04685b98677", 27},
-    { NULL, NULL, 0},
+      "8256544308de926b0681e04685b98677", 27 },
+    { NULL, NULL, 0 },
  };
-  DecodeInvalidFiles(files);
+  DecodeFiles(files);
 }

+TEST(VP9MultiThreadedFrameParallel, ValidFileTest) {
+  static const FileList files[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+    { "vp92-2-20-10bit-yuv420.webm",
+      "a16b99df180c584e8db2ffeda987d293", 10 },
+#endif
+    { NULL, NULL, 0 },
+  };
+  DecodeFiles(files);
+}
 #endif  // CONFIG_WEBM_IO
 }  // namespace
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -120,10 +120,10 @@ class VP9IntraPredTest

 TEST_P(VP9IntraPredTest, IntraPredTests) {
  // max block size is 32
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 2*32);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 2*32+32);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst, 3 * 32 * 32);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_dst, 3 * 32 * 32);
+  DECLARE_ALIGNED(16, uint16_t, left_col[2*32]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[2*32+32]);
+  DECLARE_ALIGNED(16, uint16_t, dst[3 * 32 * 32]);
+  DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 32 * 32]);
  RunTest(left_col, above_data, dst, ref_dst);
 }

--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -21,6 +21,8 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -80,18 +82,18 @@ class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {

 TEST_P(VP9QuantizeTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
  int err_count_total = 0;
  int first_failure = -1;
  for (int i = 0; i < number_of_iterations; ++i) {
@@ -139,18 +141,18 @@ TEST_P(VP9QuantizeTest, OperationCheck) {

 TEST_P(VP9Quantize32Test, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
  int err_count_total = 0;
  int first_failure = -1;
  for (int i = 0; i < number_of_iterations; ++i) {
@@ -198,18 +200,18 @@ TEST_P(VP9Quantize32Test, OperationCheck) {

 TEST_P(VP9QuantizeTest, EOBCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
  int err_count_total = 0;
  int first_failure = -1;
  for (int i = 0; i < number_of_iterations; ++i) {
@@ -262,18 +264,18 @@ TEST_P(VP9QuantizeTest, EOBCheck) {

 TEST_P(VP9Quantize32Test, EOBCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
  int err_count_total = 0;
  int first_failure = -1;
  for (int i = 0; i < number_of_iterations; ++i) {
--- a/test/vp9_skip_loopfilter_test.cc
+++ b/test/vp9_skip_loopfilter_test.cc
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+namespace {
+
+const char kVp9TestFile[] = "vp90-2-08-tile_1x8_frame_parallel.webm";
+const char kVp9Md5File[] = "vp90-2-08-tile_1x8_frame_parallel.webm.md5";
+
+// Class for testing shutting off the loop filter.
+class SkipLoopFilterTest {
+ public:
+  SkipLoopFilterTest()
+      : video_(NULL),
+        decoder_(NULL),
+        md5_file_(NULL) {}
+
+  ~SkipLoopFilterTest() {
+    if (md5_file_ != NULL)
+      fclose(md5_file_);
+    delete decoder_;
+    delete video_;
+  }
+
+  // If |threads| > 0 then set the decoder with that number of threads.
+  void Init(int num_threads) {
+    expected_md5_[0] = '\0';
+    junk_[0] = '\0';
+    video_ = new libvpx_test::WebMVideoSource(kVp9TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    if (num_threads > 0)
+      cfg.threads = num_threads;
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+
+    OpenMd5File(kVp9Md5File);
+  }
+
+  // Set the VP9 skipLoopFilter control value.
+  void SetSkipLoopFilter(int value, vpx_codec_err_t expected_value) {
+    decoder_->Control(VP9_SET_SKIP_LOOP_FILTER, value, expected_value);
+  }
+
+  vpx_codec_err_t DecodeOneFrame() {
+    const vpx_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    if (res == VPX_CODEC_OK) {
+      ReadMd5();
+      video_->Next();
+    }
+    return res;
+  }
+
+  vpx_codec_err_t DecodeRemainingFrames() {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const vpx_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != VPX_CODEC_OK)
+        return res;
+      ReadMd5();
+    }
+    return VPX_CODEC_OK;
+  }
+
+  // Checks if MD5 matches or doesn't.
+  void CheckMd5(bool matches) {
+    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const vpx_image_t *img = dec_iter.Next();
+    CheckMd5Vpx(*img, matches);
+  }
+
+ private:
+  // TODO(fgalligan): Move the MD5 testing code into another class.
+  void OpenMd5File(const std::string &md5_file_name) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name);
+    ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
+        << md5_file_name;
+  }
+
+  // Reads the next line of the MD5 file.
+  void ReadMd5() {
+    ASSERT_TRUE(md5_file_ != NULL);
+    const int res = fscanf(md5_file_, "%s  %s", expected_md5_, junk_);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5_[32] = '\0';
+  }
+
+  // Checks if the last read MD5 matches |img| or doesn't.
+  void CheckMd5Vpx(const vpx_image_t &img, bool matches) {
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check MD5.
+    if (matches)
+      ASSERT_STREQ(expected_md5_, actual_md5) << "MD5 checksums don't match";
+    else
+      ASSERT_STRNE(expected_md5_, actual_md5) << "MD5 checksums match";
+  }
+
+  libvpx_test::WebMVideoSource *video_;
+  libvpx_test::VP9Decoder *decoder_;
+  FILE *md5_file_;
+  char expected_md5_[33];
+  char junk_[128];
+};
+
+TEST(SkipLoopFilterTest, ShutOffLoopFilter) {
+  const int non_zero_value = 1;
+  const int num_threads = 0;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+  skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(false);
+}
+
+TEST(SkipLoopFilterTest, ShutOffLoopFilterSingleThread) {
+  const int non_zero_value = 1;
+  const int num_threads = 1;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+  skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(false);
+}
+
+TEST(SkipLoopFilterTest, ShutOffLoopFilter8Threads) {
+  const int non_zero_value = 1;
+  const int num_threads = 8;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+  skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(false);
+}
+
+TEST(SkipLoopFilterTest, WithLoopFilter) {
+  const int non_zero_value = 1;
+  const int num_threads = 0;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+  skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
+  skip_loop_filter.SetSkipLoopFilter(0, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(true);
+}
+
+TEST(SkipLoopFilterTest, ToggleLoopFilter) {
+  const int num_threads = 0;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+
+  for (int i = 0; i < 10; ++i) {
+    skip_loop_filter.SetSkipLoopFilter(i % 2, VPX_CODEC_OK);
+    ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeOneFrame());
+  }
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(false);
+}
+
+}  // namespace
--- a/test/vpx_scale_test.cc
+++ b/test/vpx_scale_test.cc
@@ -33,10 +33,10 @@ class VpxScaleBase {
  void ResetImage(int width, int height) {
    width_ = width;
    height_ = height;
-    vpx_memset(&img_, 0, sizeof(img_));
+    memset(&img_, 0, sizeof(img_));
    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
                                             VP8BORDERINPIXELS));
-    vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
+    memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
    FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
              img_.y_stride);
    FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
@@ -44,15 +44,15 @@ class VpxScaleBase {
    FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
              img_.uv_stride);

-    vpx_memset(&ref_img_, 0, sizeof(ref_img_));
+    memset(&ref_img_, 0, sizeof(ref_img_));
    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
                                             VP8BORDERINPIXELS));
-    vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
+    memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);

-    vpx_memset(&cpy_img_, 0, sizeof(cpy_img_));
+    memset(&cpy_img_, 0, sizeof(cpy_img_));
    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_,
                                             VP8BORDERINPIXELS));
-    vpx_memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
+    memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
    ReferenceCopyFrame();
  }

@@ -87,8 +87,8 @@ class VpxScaleBase {

    // Fill the border pixels from the nearest image pixel.
    for (int y = 0; y < crop_height; ++y) {
-      vpx_memset(left, left[padding], padding);
-      vpx_memset(right, right[-1], right_extend);
+      memset(left, left[padding], padding);
+      memset(right, right[-1], right_extend);
      left += stride;
      right += stride;
    }
@@ -101,13 +101,13 @@ class VpxScaleBase {

    // The first row was already extended to the left and right. Copy it up.
    for (int y = 0; y < padding; ++y) {
-      vpx_memcpy(top, left, extend_width);
+      memcpy(top, left, extend_width);
      top += stride;
    }

    uint8_t *bottom = left + (crop_height * stride);
    for (int y = 0; y <  bottom_extend; ++y) {
-      vpx_memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
+      memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
      bottom += stride;
    }
  }
--- a/test/vpxdec.sh
+++ b/test/vpxdec.sh
@@ -17,7 +17,8 @@
 # Environment check: Make sure input is available.
 vpxdec_verify_environment() {
  if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \
-    [ ! -e "${VP9_FPM_WEBM_FILE}" ] ; then
+    [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \
+    [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then
    elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
    return 1
  fi
@@ -87,12 +88,29 @@ vpxdec_vp9_webm_frame_parallel() {
        --frame-parallel
    done
  fi
+}

+vpxdec_vp9_webm_less_than_50_frames() {
+  # ensure that reaching eof in webm_guess_framerate doesn't result in invalid
+  # frames in actual webm_read_frame calls.
+  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly decoder="$(vpx_tool_path vpxdec)"
+    local readonly expected=10
+    local readonly num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \
+      "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
+      | awk '/^[0-9]+ decoded frames/ { print $1 }')
+    if [ "$num_frames" -ne "$expected" ]; then
+      elog "Output frames ($num_frames) != expected ($expected)"
+      return 1
+    fi
+  fi
 }

 vpxdec_tests="vpxdec_vp8_ivf
              vpxdec_vp8_ivf_pipe_input
              vpxdec_vp9_webm
-              vpxdec_vp9_webm_frame_parallel"
+              vpxdec_vp9_webm_frame_parallel
+              vpxdec_vp9_webm_less_than_50_frames"

 run_tests vpxdec_verify_environment "${vpxdec_tests}"
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh
@@ -23,6 +23,13 @@ vpxenc_verify_environment() {
    elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH."
    return 1
  fi
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
+    if [ ! -e "${Y4M_NOSQ_PAR_INPUT}" ]; then
+      elog "The file ${Y4M_NOSQ_PAR_INPUT##*/} must exist in"
+      elog "LIBVPX_TEST_DATA_PATH."
+      return 1
+    fi
+  fi
  if [ -z "$(vpx_tool_path vpxenc)" ]; then
    elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent."
    return 1
@@ -49,6 +56,14 @@ yuv_input_hantro_collage() {
       --height="${YUV_RAW_INPUT_HEIGHT}""
 }

+y4m_input_non_square_par() {
+  echo ""${Y4M_NOSQ_PAR_INPUT}""
+}
+
+y4m_input_720p() {
+  echo ""${Y4M_720P_INPUT}""
+}
+
 # Echo default vpxenc real time encoding params. $1 is the codec, which defaults
 # to vp8 if unspecified.
 vpxenc_rt_params() {
@@ -57,7 +72,7 @@ vpxenc_rt_params() {
    --buf-initial-sz=500
    --buf-optimal-sz=600
    --buf-sz=1000
-    --cpu-used=-5
+    --cpu-used=-6
    --end-usage=cbr
    --error-resilient=1
    --kf-max-dist=90000
@@ -247,6 +262,63 @@ vpxenc_vp9_webm_rt() {
  fi
 }

+vpxenc_vp9_webm_rt_multithread_tiled() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm"
+    local readonly tilethread_min=2
+    local readonly tilethread_max=4
+    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+
+    for threads in ${num_threads}; do
+      for tile_cols in ${num_tile_cols}; do
+        vpxenc $(y4m_input_720p) \
+          $(vpxenc_rt_params vp9) \
+          --threads=${threads} \
+          --tile-columns=${tile_cols} \
+          --output="${output}"
+      done
+    done
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+
+    rm "${output}"
+  fi
+}
+
+vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm"
+    local readonly tilethread_min=2
+    local readonly tilethread_max=4
+    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+
+    for threads in ${num_threads}; do
+      for tile_cols in ${num_tile_cols}; do
+        vpxenc $(y4m_input_720p) \
+          $(vpxenc_rt_params vp9) \
+          --threads=${threads} \
+          --tile-columns=${tile_cols} \
+          --frame-parallel=1 \
+          --output="${output}"
+      done
+    done
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+
+    rm "${output}"
+  fi
+}
+
 vpxenc_vp9_webm_2pass() {
  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
     [ "$(webm_io_available)" = "yes" ]; then
@@ -320,6 +392,23 @@ vpxenc_vp9_webm_lag10_frames20() {
  fi
 }

+# TODO(fgalligan): Test that DisplayWidth is different than video width.
+vpxenc_vp9_webm_non_square_par() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm"
+    vpxenc $(y4m_input_non_square_par) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
 vpxenc_tests="vpxenc_vp8_ivf
              vpxenc_vp8_webm
              vpxenc_vp8_webm_rt
@@ -329,9 +418,12 @@ vpxenc_tests="vpxenc_vp8_ivf
              vpxenc_vp9_ivf
              vpxenc_vp9_webm
              vpxenc_vp9_webm_rt
+              vpxenc_vp9_webm_rt_multithread_tiled
+              vpxenc_vp9_webm_rt_multithread_tiled_frameparallel
              vpxenc_vp9_webm_2pass
              vpxenc_vp9_ivf_lossless
              vpxenc_vp9_ivf_minq0_maxq0
-              vpxenc_vp9_webm_lag10_frames20"
+              vpxenc_vp9_webm_lag10_frames20
+              vpxenc_vp9_webm_non_square_par"

 run_tests vpxenc_verify_environment "${vpxenc_tests}"
--- a/third_party/x86inc/README.libvpx
+++ b/third_party/x86inc/README.libvpx
@@ -9,3 +9,4 @@ defines that help automatically allow assembly to work cross-platform.

 Local Modifications:
 Some modifications to allow PIC to work with x86inc.
+Conditionally define program_name to allow overriding.
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -36,7 +36,9 @@

 %include "vpx_config.asm"

+%ifndef program_name
 %define program_name vp9
+%endif


 %define UNIX64 0
--- a/tools_common.c
+++ b/tools_common.c
@@ -140,7 +140,7 @@ static const VpxInterface vpx_encoders[] = {
 #endif
 };

-int get_vpx_encoder_count() {
+int get_vpx_encoder_count(void) {
  return sizeof(vpx_encoders) / sizeof(vpx_encoders[0]);
 }

@@ -170,7 +170,7 @@ static const VpxInterface vpx_decoders[] = {
 #endif
 };

-int get_vpx_decoder_count() {
+int get_vpx_decoder_count(void) {
  return sizeof(vpx_decoders) / sizeof(vpx_decoders[0]);
 }

--- a/tools_common.h
+++ b/tools_common.h
@@ -16,6 +16,7 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_image.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"

 #if CONFIG_ENCODERS
 #include "./y4minput.h"
@@ -34,7 +35,6 @@
 #if CONFIG_OS_SUPPORT
 #if defined(_MSC_VER)
 #include <io.h>  /* NOLINT */
-#define snprintf _snprintf
 #define isatty   _isatty
 #define fileno   _fileno
 #else
@@ -89,6 +89,7 @@ struct VpxInputContext {
  enum VideoFileType file_type;
  uint32_t width;
  uint32_t height;
+  struct VpxRational pixel_aspect_ratio;
  vpx_img_fmt_t fmt;
  vpx_bit_depth_t bit_depth;
  int only_i420;
@@ -119,7 +120,7 @@ void warn(const char *fmt, ...);
 void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;

 /* The tool including this file must define usage_exit() */
-void usage_exit() VPX_NO_RETURN;
+void usage_exit(void) VPX_NO_RETURN;

 #undef VPX_NO_RETURN

@@ -131,11 +132,11 @@ typedef struct VpxInterface {
  vpx_codec_iface_t *(*const codec_interface)();
 } VpxInterface;

-int get_vpx_encoder_count();
+int get_vpx_encoder_count(void);
 const VpxInterface *get_vpx_encoder_by_index(int i);
 const VpxInterface *get_vpx_encoder_by_name(const char *name);

-int get_vpx_decoder_count();
+int get_vpx_decoder_count(void);
 const VpxInterface *get_vpx_decoder_by_index(int i);
 const VpxInterface *get_vpx_decoder_by_name(const char *name);
 const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc);
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -10,6 +10,7 @@


 #include "vpx_config.h"
+#include "alloccommon.h"
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "onyxc_int.h"
@@ -103,8 +104,8 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
        goto allocation_fail;

    oci->post_proc_buffer_int_used = 0;
-    vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
-    vpx_memset(oci->post_proc_buffer.buffer_alloc, 128,
+    memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+    memset(oci->post_proc_buffer.buffer_alloc, 128,
           oci->post_proc_buffer.frame_size);

    /* Allocate buffer to store post-processing filter coefficients.
@@ -176,7 +177,7 @@ void vp8_create_common(VP8_COMMON *oci)
    oci->clamp_type = RECON_CLAMP_REQUIRED;

    /* Initialize reference frame sign bias structure to defaults */
-    vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+    memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));

    /* Default disable buffer to buffer copying */
    oci->copy_buffer_to_gf = 0;
--- a/vp8/common/arm/armv6/dequant_idct_v6.asm
+++ b/vp8/common/arm/armv6/dequant_idct_v6.asm
@@ -165,7 +165,7 @@ vp8_dequant_idct_loop2_v6
    str     r1, [r2], r12           ; store output to dst
    bne     vp8_dequant_idct_loop2_v6

-; vpx_memset
+; memset
    sub     r0, r0, #32
    add     sp, sp, #4

--- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
@@ -1,154 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance16x16_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp8_variance16x16_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-
-loop
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-    END
-
--- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
@@ -1,101 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance8x8_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp8_variance8x8_armv6| PROC
-
-    push    {r4-r10, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #8             ; set loop counter to 8 (=block height)
-    mov     r4, #0              ; initialize sum = 0
-    mov     r5, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels
-    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r6, r7          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-    ; calculate total sum
-    add    r4, r4, r6           ; add positive differences to sum
-    sub    r4, r4, r7           ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r0, #0x4]      ; load 4 src pixels
-    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r6, r7          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-    subs    r12, r12, #1        ; next row
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r8, [sp, #32]       ; get address of sse
-    mul     r1, r4, r4          ; sum * sum
-    str     r5, [r8]            ; store sse
-    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
-
-    pop     {r4-r10, pc}
-
-    ENDP
-
-    END
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -99,7 +99,7 @@ void vp8_sixtap_predict4x4_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(4, short, FData[12*4]); /* Temp data buffer used in filtering */


    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
@@ -147,7 +147,7 @@ void vp8_sixtap_predict8x8_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(4, short, FData[16*8]); /* Temp data buffer used in filtering */

    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
@@ -189,7 +189,7 @@ void vp8_sixtap_predict16x16_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(4, short, FData[24*16]);    /* Temp data buffer used in filtering */

    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
--- a/vp8/common/arm/neon/sad_neon.c
+++ b/vp8/common/arm/neon/sad_neon.c
@@ -1,184 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-unsigned int vp8_sad8x8_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x8_t d0, d8;
-    uint16x8_t q12;
-    uint32x4_t q1;
-    uint64x2_t q3;
-    uint32x2_t d5;
-    int i;
-
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(d0, d8);
-
-    for (i = 0; i < 7; i++) {
-        d0 = vld1_u8(src_ptr);
-        src_ptr += src_stride;
-        d8 = vld1_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, d0, d8);
-    }
-
-    q1 = vpaddlq_u16(q12);
-    q3 = vpaddlq_u32(q1);
-    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                  vreinterpret_u32_u64(vget_high_u64(q3)));
-
-    return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad8x16_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x8_t d0, d8;
-    uint16x8_t q12;
-    uint32x4_t q1;
-    uint64x2_t q3;
-    uint32x2_t d5;
-    int i;
-
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(d0, d8);
-
-    for (i = 0; i < 15; i++) {
-        d0 = vld1_u8(src_ptr);
-        src_ptr += src_stride;
-        d8 = vld1_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, d0, d8);
-    }
-
-    q1 = vpaddlq_u16(q12);
-    q3 = vpaddlq_u32(q1);
-    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                  vreinterpret_u32_u64(vget_high_u64(q3)));
-
-    return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad4x4_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x8_t d0, d8;
-    uint16x8_t q12;
-    uint32x2_t d1;
-    uint64x1_t d3;
-    int i;
-
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(d0, d8);
-
-    for (i = 0; i < 3; i++) {
-        d0 = vld1_u8(src_ptr);
-        src_ptr += src_stride;
-        d8 = vld1_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, d0, d8);
-    }
-
-    d1 = vpaddl_u16(vget_low_u16(q12));
-    d3 = vpaddl_u32(d1);
-
-    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
-}
-
-unsigned int vp8_sad16x16_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x16_t q0, q4;
-    uint16x8_t q12, q13;
-    uint32x4_t q1;
-    uint64x2_t q3;
-    uint32x2_t d5;
-    int i;
-
-    q0 = vld1q_u8(src_ptr);
-    src_ptr += src_stride;
-    q4 = vld1q_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
-    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
-    for (i = 0; i < 15; i++) {
-        q0 = vld1q_u8(src_ptr);
-        src_ptr += src_stride;
-        q4 = vld1q_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
-        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
-    }
-
-    q12 = vaddq_u16(q12, q13);
-    q1 = vpaddlq_u16(q12);
-    q3 = vpaddlq_u32(q1);
-    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                  vreinterpret_u32_u64(vget_high_u64(q3)));
-
-    return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad16x8_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x16_t q0, q4;
-    uint16x8_t q12, q13;
-    uint32x4_t q1;
-    uint64x2_t q3;
-    uint32x2_t d5;
-    int i;
-
-    q0 = vld1q_u8(src_ptr);
-    src_ptr += src_stride;
-    q4 = vld1q_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
-    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
-    for (i = 0; i < 7; i++) {
-        q0 = vld1q_u8(src_ptr);
-        src_ptr += src_stride;
-        q4 = vld1q_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
-        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
-    }
-
-    q12 = vaddq_u16(q12, q13);
-    q1 = vpaddlq_u16(q12);
-    q3 = vpaddlq_u32(q1);
-    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                  vreinterpret_u32_u64(vget_high_u64(q3)));
-
-    return vget_lane_u32(d5, 0);
-}
--- a/vp8/common/arm/neon/variance_neon.c
+++ b/vp8/common/arm/neon/variance_neon.c
@@ -1,320 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vpx_ports/mem.h"
-
-unsigned int vp8_variance16x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance16x8_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d2u8, d4u8, d6u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint16x8_t q11u16, q12u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
-        d0u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d2u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        d4u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d6u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(d0u8, d4u8);
-        q12u16 = vsubl_u8(d2u8, d6u8);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x8_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
-        d0u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d1u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d2u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d3u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-
-        d4u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d5u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d6u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d7u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        q11u16 = vsubl_u8(d0u8, d4u8);
-        q12u16 = vsubl_u8(d1u8, d5u8);
-        q13u16 = vsubl_u8(d2u8, d6u8);
-        q14u16 = vsubl_u8(d3u8, d7u8);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
--- a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+++ b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -12,7 +12,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"

-static const uint16_t bilinear_taps_coeff[8][2] = {
+static const uint8_t bilinear_taps_coeff[8][2] = {
    {128,   0},
    {112,  16},
    { 96,  32},
@@ -32,7 +32,7 @@ unsigned int vp8_sub_pixel_variance16x16_neon_func(
        int dst_pixels_per_line,
        unsigned int *sse) {
    int i;
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
+    DECLARE_ALIGNED(16, unsigned char, tmp[528]);
    unsigned char *tmpp;
    unsigned char *tmpp2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
@@ -911,12 +911,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_neon(
    return vget_lane_u32(d0u32, 0);
 }

-enum { kWidth8 = 8 };
-enum { kHeight8 = 8 };
-enum { kHeight8PlusOne = 9 };
-enum { kPixelStepOne = 1 };
-enum { kAlign16 = 16 };
-
 #define FILTER_BITS 7

 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
@@ -968,8 +962,8 @@ static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) / (8 * 8));
 }

 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -978,9 +972,9 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
-                                      const uint16_t *vpx_filter) {
-  const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
-  const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
+                                      const uint8_t *vpx_filter) {
+  const uint8x8_t f0 = vmov_n_u8(vpx_filter[0]);
+  const uint8x8_t f1 = vmov_n_u8(vpx_filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; ++i) {
    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
@@ -1003,21 +997,21 @@ unsigned int vp8_sub_pixel_variance8x8_neon(
        const unsigned char *dst,
        int dst_stride,
        unsigned int *sse) {
-  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8PlusOne * kWidth8);
-  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
+  DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
  if (xoffset == 0) {
-    var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8,
-                              kWidth8, bilinear_taps_coeff[yoffset]);
+    var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8,
+                              8, bilinear_taps_coeff[yoffset]);
  } else if (yoffset == 0) {
-    var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
-                              kHeight8PlusOne, kWidth8,
+    var_filter_block2d_bil_w8(src, temp2, src_stride, 1,
+                              9, 8,
                              bilinear_taps_coeff[xoffset]);
  } else {
-    var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
-                              kHeight8PlusOne, kWidth8,
+    var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+                              9, 8,
                              bilinear_taps_coeff[xoffset]);
-    var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
-                              kWidth8, bilinear_taps_coeff[yoffset]);
+    var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+                              8, bilinear_taps_coeff[yoffset]);
  }
-  return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+  return variance8x8_neon(temp2, 8, dst, dst_stride, sse);
 }
--- a/vp8/common/arm/variance_arm.c
+++ b/vp8/common/arm/variance_arm.c
@@ -9,10 +9,14 @@
 */

 #include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp8/common/variance.h"
 #include "vp8/common/filter.h"

+// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
+#if CONFIG_VP8_ENCODER
+
 #if HAVE_MEDIA
 #include "vp8/common/arm/bilinearfilter_arm.h"

@@ -40,7 +44,7 @@ unsigned int vp8_sub_pixel_variance8x8_armv6
    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                             8, 8, 8, VFilter);

-    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+    return vpx_variance8x8_media(second_pass, 8, dst_ptr,
                                 dst_pixels_per_line, sse);
 }

@@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                                 16, 16, 16, VFilter);

-        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+        var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
                                      dst_pixels_per_line, sse);
    }
    return var;
 }

-#endif /* HAVE_MEDIA */
+#endif  // HAVE_MEDIA


 #if HAVE_NEON
@@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon
    return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
 }

-#endif
+#endif  // HAVE_NEON
+#endif  // CONFIG_VP8_ENCODER
--- a/vp8/common/common.h
+++ b/vp8/common/common.h
@@ -29,19 +29,19 @@ extern "C" {

 #define vp8_copy( Dest, Src) { \
        assert( sizeof( Dest) == sizeof( Src)); \
-        vpx_memcpy( Dest, Src, sizeof( Src)); \
+        memcpy( Dest, Src, sizeof( Src)); \
    }

 /* Use this for variably-sized arrays. */

 #define vp8_copy_array( Dest, Src, N) { \
        assert( sizeof( *Dest) == sizeof( *Src)); \
-        vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+        memcpy( Dest, Src, N * sizeof( *Src)); \
    }

-#define vp8_zero( Dest)  vpx_memset( &Dest, 0, sizeof( Dest));
+#define vp8_zero( Dest)  memset( &Dest, 0, sizeof( Dest));

-#define vp8_zero_array( Dest, N)  vpx_memset( Dest, 0, N * sizeof( *Dest));
+#define vp8_zero_array( Dest, N)  memset( Dest, 0, N * sizeof( *Dest));


 #ifdef __cplusplus
--- a/vp8/common/copy_c.c
+++ b/vp8/common/copy_c.c
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+
+#include "./vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* Copy 2 macroblocks to a buffer */
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,
+                    unsigned char *dst_ptr, int dst_stride,
+                    int height)
+{
+    int r;
+
+    for (r = 0; r < height; r++)
+    {
+        memcpy(dst_ptr, src_ptr, 32);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+
+    }
+}
--- a/vp8/common/debugmodes.c
+++ b/vp8/common/debugmodes.c
@@ -81,7 +81,6 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
    fprintf(mvs, "\n");

    /* print out the block modes */
-    mb_index = 0;
    fprintf(mvs, "Mbs for Frame %d\n", frame);
    {
        int b_row;
@@ -129,7 +128,6 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f


    /* print out the block modes */
-    mb_index = 0;
    fprintf(mvs, "MVs for Frame %d\n", frame);
    {
        int b_row;
--- a/vp8/common/dequantize.c
+++ b/vp8/common/dequantize.c
@@ -38,6 +38,6 @@ void vp8_dequant_idct_add_c(short *input, short *dq,

    vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);

-    vpx_memset(input, 0, 32);
+    memset(input, 0, 32);

 }
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -183,7 +183,6 @@ const vp8_extra_bit_struct vp8_extra_bits[12] =

 void vp8_default_coef_probs(VP8_COMMON *pc)
 {
-    vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
-                   sizeof(default_coef_probs));
+    memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
 }

--- a/vp8/common/entropymode.c
+++ b/vp8/common/entropymode.c
@@ -159,13 +159,13 @@ const vp8_tree_index vp8_small_mvtree [14] =

 void vp8_init_mbmode_probs(VP8_COMMON *x)
 {
-    vpx_memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
-    vpx_memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
-    vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+    memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+    memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+    memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
 }

 void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
 {
-    vpx_memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+    memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
 }

--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -40,9 +40,9 @@ static void copy_and_extend_plane

    for (i = 0; i < h; i++)
    {
-        vpx_memset(dest_ptr1, src_ptr1[0], el);
-        vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
-        vpx_memset(dest_ptr2, src_ptr2[0], er);
+        memset(dest_ptr1, src_ptr1[0], el);
+        memcpy(dest_ptr1 + el, src_ptr1, w);
+        memset(dest_ptr2, src_ptr2[0], er);
        src_ptr1  += sp;
        src_ptr2  += sp;
        dest_ptr1 += dp;
@@ -60,13 +60,13 @@ static void copy_and_extend_plane

    for (i = 0; i < et; i++)
    {
-        vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+        memcpy(dest_ptr1, src_ptr1, linesize);
        dest_ptr1 += dp;
    }

    for (i = 0; i < eb; i++)
    {
-        vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+        memcpy(dest_ptr2, src_ptr2, linesize);
        dest_ptr2 += dp;
    }
 }
--- a/vp8/common/filter.c
+++ b/vp8/common/filter.c
@@ -10,6 +10,7 @@


 #include "filter.h"
+#include "./vp8_rtcd.h"

 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -17,6 +17,7 @@
 #include "vpx_ports/x86.h"
 #endif
 #include "vp8/common/onyxc_int.h"
+#include "vp8/common/systemdependent.h"

 #if CONFIG_MULTITHREAD
 #if HAVE_UNISTD_H && !defined(__OS2__)
--- a/vp8/common/idct_blk.c
+++ b/vp8/common/idct_blk.c
@@ -33,7 +33,7 @@ void vp8_dequant_idct_add_y_block_c
            else
            {
                vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
-                vpx_memset(q, 0, 2 * sizeof(q[0]));
+                memset(q, 0, 2 * sizeof(q[0]));
            }

            q   += 16;
@@ -59,7 +59,7 @@ void vp8_dequant_idct_add_uv_block_c
            else
            {
                vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
-                vpx_memset(q, 0, 2 * sizeof(q[0]));
+                memset(q, 0, 2 * sizeof(q[0]));
            }

            q    += 16;
@@ -78,7 +78,7 @@ void vp8_dequant_idct_add_uv_block_c
            else
            {
                vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
-                vpx_memset(q, 0, 2 * sizeof(q[0]));
+                memset(q, 0, 2 * sizeof(q[0]));
            }

            q    += 16;
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#include "./vp8_rtcd.h"

 /****************************************************************************
 * Notes:
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -82,10 +82,9 @@ void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
        if (block_inside_limit < 1)
            block_inside_limit = 1;

-        vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
-        vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
-                SIMD_WIDTH);
-        vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+        memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+        memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH);
+        memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
               SIMD_WIDTH);
    }
 }
@@ -105,7 +104,7 @@ void vp8_loop_filter_init(VP8_COMMON *cm)
    /* init hev threshold const vectors */
    for(i = 0; i < 4 ; i++)
    {
-        vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+        memset(lfi->hev_thr[i], i, SIMD_WIDTH);
    }
 }

@@ -151,7 +150,7 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm,
            /* we could get rid of this if we assume that deltas are set to
             * zero when not in use; encoder always uses deltas
             */
-            vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+            memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
            continue;
        }

--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -17,10 +17,11 @@
 * higher quality.
 */

-#include "postproc.h"
-#include "variance.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/postproc.h"
+#include "vp8/common/variance.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp8_rtcd.h"
 #include "vpx_scale/yv12config.h"

 #include <limits.h>
@@ -150,36 +151,36 @@ static void multiframe_quality_enhance_block

    if (blksize == 16)
    {
-        actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
 #ifdef USE_SSD
-        sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+        vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
        sad = (sse + 128)>>8;
-        usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+        vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
        usad = (sse + 32)>>6;
-        vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+        vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
        vsad = (sse + 32)>>6;
 #else
-        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
-        usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6;
-        vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6;
+        sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+        usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
+        vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6;
 #endif
    }
    else /* if (blksize == 8) */
    {
-        actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
 #ifdef USE_SSD
-        sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse));
+        vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
        sad = (sse + 32)>>6;
-        usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse));
+        vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
        usad = (sse + 8)>>4;
-        vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
+        vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
        vsad = (sse + 8)>>4;
 #else
-        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
-        usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4;
-        vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4;
+        sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
+        usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4;
+        vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4;
 #endif
    }

@@ -231,9 +232,9 @@ static void multiframe_quality_enhance_block
        {
            vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
            for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
-                vpx_memcpy(udp, up, uvblksize);
+                memcpy(udp, up, uvblksize);
            for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
-                vpx_memcpy(vdp, vp, uvblksize);
+                memcpy(vdp, vp, uvblksize);
        }
    }
 }
@@ -341,8 +342,8 @@ void vp8_multiframe_quality_enhance
                                for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride,
                                                        vp += show->uv_stride, vdp += dest->uv_stride)
                                {
-                                    vpx_memcpy(udp, up, 4);
-                                    vpx_memcpy(vdp, vp, 4);
+                                    memcpy(udp, up, 4);
+                                    memcpy(vdp, vp, 4);
                                }
                            }
                        }
--- a/vp8/common/mips/dspr2/dequantize_dspr2.c
+++ b/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -26,7 +26,7 @@ void vp8_dequant_idct_add_dspr2(short *input, short *dq,

    vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);

-    vpx_memset(input, 0, 32);
+    memset(input, 0, 32);

 }

--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -355,8 +355,8 @@ void vp8_deblock(VP8_COMMON                 *cm,
                else
                    mb_ppl = (unsigned char)ppl;

-                vpx_memset(ylptr, mb_ppl, 16);
-                vpx_memset(uvlptr, mb_ppl, 8);
+                memset(ylptr, mb_ppl, 16);
+                memset(uvlptr, mb_ppl, 8);

                ylptr += 16;
                uvlptr += 8;
@@ -403,7 +403,7 @@ void vp8_de_noise(VP8_COMMON                 *cm,
    (void) low_var_thresh;
    (void) flag;

-    vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
+    memset(limits, (unsigned char)ppl, 16 * mb_cols);

    /* TODO: The original code don't filter the 2 outer rows and columns. */
    for (mbr = 0; mbr < mb_rows; mbr++)
@@ -427,7 +427,7 @@ void vp8_de_noise(VP8_COMMON                 *cm,
    }
 }

-double vp8_gaussian(double sigma, double mu, double x)
+static double gaussian(double sigma, double mu, double x)
 {
    return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
           (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
@@ -455,7 +455,7 @@ static void fillrd(struct postproc_state *state, int q, int a)

        for (i = -32; i < 32; i++)
        {
-            const int v = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
+            const int v = (int)(.5 + 256 * gaussian(sigma, 0, i));

            if (v)
            {
@@ -763,7 +763,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
            /* insure that postproc is set to all 0's so that post proc
             * doesn't pull random data in from edge
             */
-            vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
+            memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);

        }
    }
--- a/vp8/common/ppc/copy_altivec.asm
+++ b/vp8/common/ppc/copy_altivec.asm
@@ -1,47 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;#  but the output will be.  So two reads and a perm
-;#  for the input, but only one store for the output.
-copy_mem16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xe000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-cp_16x16_loop:
-    lvsl    v0,  0, r3          ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v1, v1, v2, v0
-
-    stvx    v1,  0, r5
-
-    add     r3, r3, r4          ;# increment source pointer
-    add     r5, r5, r6          ;# increment destination pointer
-
-    bdnz    cp_16x16_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp8/common/ppc/filter_altivec.asm
+++ b/vp8/common/ppc/filter_altivec.asm
--- a/vp8/common/ppc/filter_bilinear_altivec.asm
+++ b/vp8/common/ppc/filter_bilinear_altivec.asm
@@ -1,677 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl bilinear_predict4x4_ppc
-    .globl bilinear_predict8x4_ppc
-    .globl bilinear_predict8x8_ppc
-    .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r9, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r9, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r9, r0
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro HFilter V
-    vperm   v24, v21, v21, v10  ;# v20 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, v11  ;# v21 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v16 v17 = evens, odds
-    vmrghh  \P0, v22, v23       ;# v18 v19 = 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
-
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_4x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_4x4_b:
-
-    stvx    v0, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v1, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_8x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_8x4_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-    hfilter_8 v4, 1
-    hfilter_8 v5, 1
-    hfilter_8 v6, 1
-    hfilter_8 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x8_b
-
-    hfilter_8 v8, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-    load_and_align_8  v5, 1
-    load_and_align_8  v6, 1
-    load_and_align_8  v7, 1
-    load_and_align_8  v8, 0
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-store_out_8x8_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v8 v9 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_16 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  1
-    load_and_align_16  v1,  1
-    load_and_align_16  v2,  1
-    load_and_align_16  v3,  1
-    load_and_align_16  v4,  1
-    load_and_align_16  v5,  1
-    load_and_align_16  v6,  1
-    load_and_align_16  v7,  1
-    load_and_align_16  v8,  1
-    load_and_align_16  v9,  1
-    load_and_align_16  v10, 1
-    load_and_align_16  v11, 1
-    load_and_align_16  v12, 1
-    load_and_align_16  v13, 1
-    load_and_align_16  v14, 1
-    load_and_align_16  v15, 1
-    load_and_align_16  v16, 0
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-store_out_16x16_b:
-
-    write_16 v0,  1
-    write_16 v1,  1
-    write_16 v2,  1
-    write_16 v3,  1
-    write_16 v4,  1
-    write_16 v5,  1
-    write_16 v6,  1
-    write_16 v7,  1
-    write_16 v8,  1
-    write_16 v9,  1
-    write_16 v10, 1
-    write_16 v11, 1
-    write_16 v12, 1
-    write_16 v13, 1
-    write_16 v14, 1
-    write_16 v15, 0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp8/common/ppc/idctllm_altivec.asm
+++ b/vp8/common/ppc/idctllm_altivec.asm
@@ -1,189 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-    .align 2
-short_idct4x4llm_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    load_c v8, sinpi8sqrt2, 0, r9, r10
-    load_c v9, cospi8sqrt2minus1, 0, r9, r10
-    load_c v10, hi_hi, 0, r9, r10
-    load_c v11, lo_lo, 0, r9, r10
-    load_c v12, shift_16, 0, r9, r10
-
-    li      r10,  16
-    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
-    lvx     v1, r10, r3         ;# input ip[8], ip[12]
-
-    ;# first pass
-    vupkhsh v2, v0
-    vupkhsh v3, v1
-    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
-
-    vupklsh v0, v0
-    vmulosh v4, v0, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vupklsh v1, v1
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v3, v1, v8
-    vsraw   v3, v3, v12
-    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v0, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v0
-
-    vaddsws v3, v3, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    ;# transpose input
-    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
-    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
-
-    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
-    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
-
-    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
-    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
-
-    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
-    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
-
-    ;# second pass
-    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
-
-    vmulosh v4, v1, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v3, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v3
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v2, v3, v8
-    vsraw   v2, v2, v12
-    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vaddsws v3, v2, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    vspltish v6, 4
-    vspltish v7, 3
-
-    vpkswss v0, v0, v1
-    vpkswss v1, v2, v3
-
-    vaddshs v0, v0, v6
-    vaddshs v1, v1, v6
-
-    vsrah   v0, v0, v7
-    vsrah   v1, v1, v7
-
-    ;# transpose output
-    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
-    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
-
-    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
-    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    stvx    v0,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    add     r4, r4, r5
-
-    stvx    v1,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 4
-sinpi8sqrt2:
-    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
-    .align 4
-cospi8sqrt2minus1:
-    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
-    .align 4
-shift_16:
-    .long      16,    16,    16,    16
-
-    .align 4
-hi_hi:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-lo_lo:
-    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ b/vp8/common/ppc/loopfilter_altivec.c
@@ -1,135 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
-    unsigned char *s,   // source pointer
-    int p,              // pitch
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
-    unsigned char *u,   // source pointer
-    unsigned char *v,   // source pointer
-    int p,              // pitch
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
-    unsigned char *s,   // source pointer
-    int p,              // pitch
-    const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-    if (u_ptr)
-        mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void)u_ptr;
-    (void)v_ptr;
-    (void)uv_stride;
-    loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-    if (u_ptr)
-        mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void)u_ptr;
-    (void)v_ptr;
-    (void)uv_stride;
-    loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    // These should all be done at once with one call, instead of 3
-    loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-    loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-    loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-    if (u_ptr)
-        loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void)u_ptr;
-    (void)v_ptr;
-    (void)uv_stride;
-    loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
-    loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
-    loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-    if (u_ptr)
-        loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void)u_ptr;
-    (void)v_ptr;
-    (void)uv_stride;
-    loop_filter_simple_vertical_edge_ppc(y_ptr + 4,  y_stride, lfi->flim);
-    loop_filter_simple_vertical_edge_ppc(y_ptr + 8,  y_stride, lfi->flim);
-    loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
--- a/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ b/vp8/common/ppc/loopfilter_filters_altivec.asm
--- a/Show More
+++ b/Show More