Compare commits

...

634 Commits

Author SHA1 Message Date
Marco
16ec39cf96 vp9-denoiser bugfix: Disable postproc-denoiser under temporal denoising.
The postproc vp9_denoise() is a spatial denoise/blur function.
It was not intended to be used if temporal denoising is enabled.

Change-Id: I97d2dcb941e7cc49bbafce99d9286beb2693249d
2016-02-24 17:06:33 -08:00
Marco
b520882f0e vp9-svc: Fix to avoid msan uninitialized value.
Move the logic for forcing zero_mode after the
(ref_frame & flag_list) check.
This was causing a memory leak under msan:
https://bugs.chromium.org/p/webrtc/issues/detail?id=5402

Change-Id: Ie9d243369f8ed7c332f46178275945331da4fd85
2016-01-06 11:34:57 -08:00
Yaowu Xu
2bd4f44409 Assert no mv clamping for scaled references
Under --enable-better-hw-compatibility, this commit adds the asserts
that no mv clamping is applied for scaled references, so when built
with this configure option, the decoder will assert if an input bitstream
triggers mv clamping for scaled reference frames.

Change-Id: I786e86a2bbbfb5bc2d2b706a31b0ffa8fe2eb0cb
2016-01-05 14:55:05 -08:00
Yaowu Xu
ce6d3f1de4 Merge "Assert no 8x4/4x8 partition for scaled references" 2016-01-05 20:35:46 +00:00
Marco Paniconi
e9e726f744 Merge "vp9-skin detection: Refactoring." 2016-01-05 16:56:54 +00:00
Yaowu Xu
03a021a6fc Assert no 8x4/4x8 partition for scaled references
This commit adds a new configure option:

--enable-better-hw-compatibility

The purpose of the configure option is to provide information on known
hardware decoder implementation bugs, so encoder implementers may
choose to implement their encoders in a way to avoid triggering these
decoder bugs.

The WebM team was made aware that a number of hardware decoders
have trouble handling the combination of scaled reference frames
and 8x4 or 4x8 partitions. This commit adds asserts to the vp9
decoder, so when built with the above configure option, the decoder can
assert if an input bitstream triggers such a decoder bug.

Change-Id: I386204cfa80ed16b50ebde57f886121ed76200bf
2016-01-04 18:33:37 -08:00
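
A minimal, self-contained sketch of the kind of guard this commit describes. The function, parameter, and macro names below (check_hw_compat, ref_is_scaled, CONFIG_BETTER_HW_COMPATIBILITY as the macro the configure flag would define) are assumptions for illustration, not the actual libvpx decoder code:

  #include <assert.h>

  /* Hypothetical stand-in for the real scale-factor check: a reference is
   * "scaled" when its dimensions differ from the current frame's. */
  static int ref_is_scaled(int ref_w, int ref_h, int cur_w, int cur_h) {
    return ref_w != cur_w || ref_h != cur_h;
  }

  static void check_hw_compat(int ref_w, int ref_h, int cur_w, int cur_h,
                              int bw, int bh) {
    const int scaled = ref_is_scaled(ref_w, ref_h, cur_w, cur_h);
    const int bad_partition = (bw == 8 && bh == 4) || (bw == 4 && bh == 8);
  #if CONFIG_BETTER_HW_COMPATIBILITY
    /* A scaled reference combined with an 8x4 or 4x8 partition is known to
     * trip some hardware decoders, so assert on that combination. */
    assert(!(scaled && bad_partition));
  #else
    (void)scaled;
    (void)bad_partition;
  #endif
  }
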
Yaowu Xu
ef77ce4407 Merge "vp10: only assume ONLY_4X4 if segmentation is disabled." 2016-01-05 02:29:05 +00:00
Yaowu Xu
0b769b2929 Merge "vp10: skip coding of txsz for lossless-segment blocks." 2016-01-05 02:28:58 +00:00
Marco
a8b7c6aad3 vp9-skin detection: Refactoring.
Add a function to compute the skin map for a given block, as it is
used in several places (cyclic refresh, noise estimation, and denoising).

Change-Id: Ied622908df43b6927f7fafc6c019d1867f2a24eb
2016-01-04 16:58:06 -08:00
Marco
e5dfca02a9 vp9-svc: Set initial values for ext_buffer/flag indices.
Set initial values for these parameters in vp9_init_layer_context().

This also fixes an issue in the svc-bypass mode when frame flags are
passed via vpx_codec_encode().

Change-Id: I0968f04672f8d3d2fe2cea6b8a23f79f80d7a8b1
2016-01-04 12:28:46 -08:00
Ronald S. Bultje
53a11656cd vp10: only assume ONLY_4X4 if segmentation is disabled.
Otherwise, per-segment lossless might mean that some segments are not
lossless and they could still want to use another mode. The per-block
tx points remain uncoded on blocks where (per the segment id) the Q
value implies lossless.

Change-Id: If210206ab1fe3dd11976797370c77f961f13dfa0
2016-01-04 15:21:02 -05:00
Ronald S. Bultje
d9439fdc36 vp10: skip coding of txsz for lossless-segment blocks.
Change-Id: Ic23c10b6d2a9fed3abe69c6bf10e910832444f2c
2016-01-04 15:21:02 -05:00
Jian Zhou
b8c2a4eb0c Merge "Code clean of highbd_tm_predictor_32x32" 2015-12-28 18:17:03 +00:00
Jian Zhou
dbe2d8c33c Merge changes I0139f8e9,I7d2545fc
* changes:
  Code clean of highbd_tm_predictor_16x16
  Code clean of highbd_dc_predictor_32x32
2015-12-28 18:16:13 +00:00
Jingning Han
c84d3abeb8 Merge "Fix sub8x8 motion search on scaled reference frame" 2015-12-23 02:34:18 +00:00
Jian Zhou
26a6ce4c6d Code clean of highbd_tm_predictor_32x32
Remove the ARCH_X86_64 constraint. No performance hit on either
big or small cores.

Change-Id: I39860b62b7a0ae4acaafdca7d68f3e5820133a81
2015-12-22 16:51:57 -08:00
Jian Zhou
355bfa2193 Code clean of highbd_tm_predictor_16x16
Remove the ARCH_X86_64 constraint.

Change-Id: I0139f8e998cc5525df55161c2054008d21ac24d4
2015-12-22 16:34:40 -08:00
Jian Zhou
a4c265f1b7 Code clean of highbd_dc_predictor_32x32
Remove the ARCH_X86_64 constraint.

Change-Id: I7d2545fc4f24eb352cf3e03082fc4d48d46fbb09
2015-12-22 16:06:54 -08:00
Marco Paniconi
a9dd8a7308 Merge "aq-mode=3: Don't reset segment if block is determined to be skin." 2015-12-22 20:18:24 +00:00
Marco
b121a3e7b8 aq-mode=3: Don't reset segment if block is determined to be skin.
For coding block sizes <=16X16, if the block is determined to be skin,
then always allow that block to be a candidate for refresh. So if that
block happens to be on the boost segment(s), the segment won't get reset to 0
and delta-q will be applied.

PSNR/SSIM metrics neutral (little/no change) on RTC clips.
Speed increase small/negligible (< 1%).
Some visual improvement on faces in a few RTC clips.

Change-Id: I6bf0fce6f39d820b491ce05d7c017ad168fce7d6
2015-12-22 10:23:44 -08:00
James Zern
cedb1db594 Merge "Code clean of highbd_tm_predictor_4x4" 2015-12-22 16:45:01 +00:00
James Zern
a097963f80 Merge "Code clean of highbd_dc_predictor_4x4" 2015-12-22 16:30:37 +00:00
Jian Zhou
52e7f4153b Merge "Code clean of highbd_v_predictor_4x4" 2015-12-21 18:07:48 +00:00
Yunqing Wang
b597e3e188 Merge "Fix for issue 1114 compile error" 2015-12-19 04:29:39 +00:00
James Zern
8b2ddbc728 sad_sse2: fix sad4xN(_avg) on windows
reduce the register count by 1 to avoid xmm6 and unnecessarily
penalizing the other users of the base macro

Change-Id: I59605c9a41a31c1b74f67ec06a40d1a7f92c4699
2015-12-18 19:19:32 -08:00
Jian Zhou
db11307502 Code clean of highbd_tm_predictor_4x4
Replace MMX with SSE2, reduce memory access to the left neighbor,
and unroll the loop.

Change-Id: I941be915af809025f121ecc6c6443f73c9903e70
2015-12-18 18:43:41 -08:00
Jian Zhou
c91dd55eda Code clean of highbd_v_predictor_4x4
MMX replaced with SSE2, same performance.

Change-Id: I2ab8f30a71e5fadbbc172fb385093dec1e11a696
2015-12-18 15:25:27 -08:00
Jian Zhou
8366b414dd Code clean of highbd_dc_predictor_4x4
MMX replaced with SSE2, same performance.

Change-Id: Ic57855254e26757191933c948fac6aa047fadafc
2015-12-18 12:45:23 -08:00
Marco Paniconi
f075fdc474 Merge "Non-rd speed >=5: Include H/V intra for bsize=16x16." 2015-12-18 17:45:49 +00:00
Peter de Rivaz
7361ef732b Fix for issue 1114 compile error
In a 32-bit build with --enable-shared, there is a lot of
register pressure and the register src_strideq is reused.
The code needs to use the stack-based version of src_stride,
but this doesn't compile when used in an lea instruction.

This patch also fixes a related segmentation fault caused by the
implementation using src_strideq even though it has been
reused.

This patch also fixes the HBD subpel variance tests that fail
when compiled without disable-optimizations.
These failures were caused by local variables in the assembler
routines colliding with the caller's stack frame.

Change-Id: Ice9d4dafdcbdc6038ad5ee7c1c09a8f06deca362
2015-12-18 09:43:22 +00:00
Jian Zhou
8f8a3b6a78 Merge "Code clean of sad4xN(_avg)_sse" 2015-12-18 01:39:20 +00:00
Marco
c8a2c31ec1 Non-rd speed >=5: Include H/V intra for bsize=16x16.
H/V intra modes were only enabled for bsize < 16x16;
enable them also for bsize=16x16.

Metrics are neutral with this change:
Overall very small gain (0.1%), small visual gain on some RTC clips.

Change-Id: Ib2d7a44382433bfc11cf324aa3cc5c382ea9e088
2015-12-17 17:18:44 -08:00
Jian Zhou
b158d9a649 Code clean of sad4xN(_avg)_sse
Replace MMX with SSE2 and reduce psadbw ops, which may help Silvermont.

Change-Id: Ic7aec15245c9e5b2f3903dc7631f38e60be7c93d
2015-12-17 11:10:42 -08:00
Marco Paniconi
685a6b602b Merge "vp9-svc: Fix to allow for 4x4 variance for low resolutions." 2015-12-16 23:04:26 +00:00
James Zern
a71dcd6f99 Merge "vpxenc: don't warn about libwebm availability if writing IVF." 2015-12-16 22:53:01 +00:00
Marco
f0961498a0 vp9-svc: Fix to allow for 4x4 variance for low resolutions.
Change-Id: I3ec08e10d9ebf6d8b8a03004a320523f926e5cc4
2015-12-16 13:38:41 -08:00
Yaowu Xu
e650129683 Move bit_depth init out of setup_quantization
This also fixes a compiling error under --enable-vp9_highbitdepth.

Change-Id: I9d1dcb95d3336d797eb3c23a4702c30b04355357
2015-12-16 11:43:11 -08:00
Ronald S. Bultje
3977507339 vpxenc: don't warn about libwebm availability if writing IVF.
Change-Id: I1a9635a9948458e6c83f5b58764b7e720d98e2ea
2015-12-16 13:35:59 -05:00
Marco Paniconi
f73a511d37 Merge "Non-rd variance partition: Lower the 64->32 force split threshold." 2015-12-16 16:48:07 +00:00
Marco
26fda00840 Non-rd variance partition: Lower the 64->32 force split threshold.
Change-Id: I837551bdf87197bee8a193353bb31f4cff794787
2015-12-15 17:29:01 -08:00
Yaowu Xu
eace551c87 Merge changes Icf9b57c3,I9e12da84,Idf5ee179
* changes:
  Fixed interval, fixed Q 1 pass test patch.
  1 pass VBR mode bug fix.
  Fixed interval, fixed Q 1 pass test patch.
2015-12-15 17:51:33 +00:00
Marco Paniconi
12084f6d57 Merge "Revert "Add "unknown" status for noise estimation."" 2015-12-15 16:46:06 +00:00
Marco Paniconi
f3e7539c67 Revert "Add "unknown" status for noise estimation."
This reverts commit e15fedb925.

Change-Id: Ibf2bce008c727a9754f88814b7630095fa7b8253
2015-12-15 16:44:40 +00:00
Marco Paniconi
93c0b879d4 Merge "SVC 1 pass mode: Constrain inter mode search within superframe." 2015-12-15 16:25:20 +00:00
Yaowu Xu
9232f69b26 Merge "Fix a enc/dec mismatch under CONFIG_MISC_FIXES" 2015-12-15 16:02:39 +00:00
Paul Wilkins
a5af49331d Merge "1 pass VBR mode bug fix." 2015-12-15 15:50:05 +00:00
paulwilkins
99309004bf Fixed interval, fixed Q 1 pass test patch.
For testing, implemented a fixed pattern and delta, 1 pass,
fixed Q, low delay mode.

This has not in any way been tuned or optimized.

Change-Id: Icf9b57c3bb16cc5c0726d5229009212af36eb6d9
2015-12-15 15:33:25 +00:00
paulwilkins
9ce611a764 1 pass VBR mode bug fix.
(copied from VP9)

The one pass VBR mode selects a Q range based on a
moving average of recent Q values. This calculation
should have been excluding arf overlay frames as these
are usually coded at the highest allowed value. Their
inclusion skews the average and can cause it to drift
upwards even when the clip as a whole is undershooting.

As such it can undermine correct adaptation of the allowed
Q range especially for easy content.

Change-Id: I9e12da84e12917e836b6e53ca4dfe4f150b9efb1
2015-12-15 15:02:40 +00:00
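
A tiny sketch of the idea behind this fix, with hypothetical names (update_avg_q, is_arf_overlay_frame) and an assumed smoothing weight; it is not the actual rate-control code:

  /* Update the running Q average used to pick the 1-pass VBR Q range, but
   * skip arf overlay frames: they are usually coded at the highest allowed
   * value and would drag the average (and the allowed range) upward. */
  static double update_avg_q(double avg_q, double frame_q,
                             int is_arf_overlay_frame) {
    if (is_arf_overlay_frame) return avg_q;  /* leave the average untouched */
    return 0.75 * avg_q + 0.25 * frame_q;    /* assumed smoothing weights */
  }
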
paulwilkins
fc50d95b2e Fixed interval, fixed Q 1 pass test patch.
For testing, implemented a fixed pattern and delta, 1 pass,
fixed Q, low delay mode.

This has not in any way been tuned or optimized.

Change-Id: Idf5ee179b277fa15d07a97f14f2ce5bbaae80a04
2015-12-15 15:00:38 +00:00
paulwilkins
cea5e1c1e3 1 pass VBR mode bug fix.
The one pass VBR mode selects a Q range based on a
moving average of recent Q values. This calculation
should have been excluding arf overlay frames as these
are usually coded at the highest allowed value. Their
inclusion skews the average and can cause it to drift
upwards even when the clip as a whole is undershooting.

As such it can undermine correct adaptation of the allowed
Q range especially for easy content.

Change-Id: I7d10fe4227262376aa2dc2a7aec0f1fd82bf11f9
2015-12-15 10:27:51 +00:00
Yaowu Xu
c7101830a6 Fix a enc/dec mismatch under CONFIG_MISC_FIXES
The culprit is that, on the decode side, the xd->lossless[i] setup was in the wrong
location, where segment features had not yet been decoded.

Also, on the encoder side, the transform mode was not set consistently
between when tx_mode is selected and how tx_mode is enforced in
tx size selection.

Change-Id: I4c4c32188fda7530cadab9b46d4201f33f7ceca3
2015-12-14 20:56:37 -08:00
James Zern
b81f04a0cc Merge "move vp9_avg to vpx_dsp" 2015-12-15 03:41:22 +00:00
Jacky Chen
b7654afb6b Merge "Add "unknown" status for noise estimation." 2015-12-15 00:41:23 +00:00
jackychen
e15fedb925 Add "unknown" status for noise estimation.
Change-Id: I0fe95332ccfa2e1ad2a01a8e7ddd631289e0f8eb
2015-12-14 15:38:20 -08:00
Marco
c760c33b99 SVC 1 pass mode: Constrain inter mode search within superframe.
Keep track of frame indexes for the references, and
constrain inter mode search for reference with same
temporal alignment.

Improves speed by about ~15%, no noticeable loss in
compression performance.

Change-Id: I5c407a8acca921234060c4fcef4afd7d734201c8
2015-12-14 15:19:29 -08:00
Marco Paniconi
c0c0edd9d7 Merge "Non-rd variance partition: Adjust logic for 32->16 force split." 2015-12-14 22:46:15 +00:00
James Zern
d36659cec7 move vp9_avg to vpx_dsp
Change-Id: I7bc991abea383db1f86c1bb0f2e849837b54d90f
2015-12-14 14:42:12 -08:00
Marco
6f17954f85 Non-rd variance partition: Adjust logic for 32->16 force split.
Lower the threshold for splitting 32x32->16x16 based on average variance,
and add a lower-bound condition for this split to occur. This prevents
unnecessary splitting for areas with very low variance.

Change-Id: Ibeb33b3d993632c2019f296eb87ef3b7e3568189
2015-12-14 12:54:10 -08:00
Jian Zhou
2404e3290e Merge "Code clean of tm_predictor_32x32" 2015-12-14 17:56:01 +00:00
Marco Paniconi
e19b7df8d3 Merge "Non-rd variance partition: Adjustments to reduce dragging artifact." 2015-12-12 02:59:33 +00:00
Marco
d4440614ae Non-rd variance partition: Adjustments to reduce dragging artifact.
For non-rd variance partition, speed >= 5:
Adjustments to reduce the dragging artifact of background areas near
slow-moving boundaries.

-Decrease base threshold under low source noise conditions.
-Add condition to split 64x64/32x32 based on average variances
of lower level blocks.

PSNR/SSIM metrics go down ~0.7/0.9% on average on RTC set.
Visually helps to reduce dragging artifact on some rtc clips.

Change-Id: If1f0a1aef1ddacd67464520ca070e167abf82fac
2015-12-11 16:16:02 -08:00
Jian Zhou
6e87880e7f Merge "Speed up tm_predictor_16x16" 2015-12-11 18:55:46 +00:00
Jian Zhou
88120481a4 Code clean of tm_predictor_32x32
Reallocate the xmm register usage so that ARCH_X86_64 is no longer required.
Reduce memory access to the left neighbor by half.
Speed up by single digits on big-core machines.

Change-Id: I392515ed8e8aeb02e6a717b3966b1ba13f5be990
2015-12-11 10:32:08 -08:00
Jingning Han
27bbfd652d Fix sub8x8 motion search on scaled reference frame
This commit makes the sub8x8 block rate-distortion optimization
scheme use precise motion compensated prediction to compute the rd
cost. It fixes a potential buffer overflow issue related to sub8x8
motion search on scaled reference frame.

Change-Id: I4274992ef4f54eaacfde60db045e269c13aaa2de
2015-12-11 10:08:51 -08:00
Jian Zhou
62f986265f Merge "SSE2 based h_predictor_32x32" 2015-12-11 18:02:34 +00:00
James Zern
ecb8dff768 Merge "dc_left_pred[48]: fix pic builds" 2015-12-11 02:48:11 +00:00
Jian Zhou
5604924945 Merge "Code clean of dc_left/top_predictor_16x16" 2015-12-11 01:53:44 +00:00
Yaowu Xu
f0bef772be Merge "Proper fix of a msvc complier warning" 2015-12-11 00:53:28 +00:00
Yunqing Wang
be0501c875 Merge "Minor cleanup" 2015-12-11 00:52:03 +00:00
Yaowu Xu
4d2cfeab36 Proper fix of a msvc complier warning
Change-Id: I701ab4993be7cfb15b61a1adbbaf5565bd14ae27
2015-12-10 16:29:01 -08:00
James Zern
40ee78bc19 dc_left_pred[48]: fix pic builds
GET_GOT modifies the stack pointer, so the offset for left's address will
be wrong if loaded afterward.

Change-Id: Iff9433aec45f5f6fe1a59ed8080c589bad429536
2015-12-10 15:44:31 -08:00
Yaowu Xu
5a81c5c4be Merge changes Iece22223,Iefad9d8d
* changes:
  Fix two msvc build issues
  Fix enc/dec mismatches for aq-mode 1 and 2
2015-12-10 23:32:32 +00:00
Yunqing Wang
cd08120d62 Minor cleanup
Removed unused GET_GOT_SAVE_ARG.

Change-Id: I0ae41c2d0dcd6d7d1c8dda05062fcdb737fd917d
2015-12-10 15:28:07 -08:00
Yunqing Wang
feeb116c92 Merge "Fix the win32 crash when GET_GOT is not defined" 2015-12-10 23:25:05 +00:00
Jingning Han
72760976a0 Merge "Sync high bit-depth temporal filter" 2015-12-10 22:54:59 +00:00
Yunqing Wang
322ea7ff5b Fix the win32 crash when GET_GOT is not defined
This patch continues to fix the win32 crash issue:
https://bugs.chromium.org/p/webm/issues/detail?id=1105

Johann's patch is here:
https://chromium-review.googlesource.com/#/c/316446/2

Change-Id: I7fe191c717e40df8602e229371321efb0d689375
2015-12-10 14:25:01 -08:00
Yaowu Xu
6786280807 Fix two msvc build issues
Change-Id: Iece22223773dd6d0f87f8f59827705acd2ebe2a4
2015-12-10 12:41:27 -08:00
Jian Zhou
4ec5953080 Code clean of dc_left/top_predictor_16x16
Remove some redundant code.

Change-Id: Ida2e8c0ce28770f7a9545ca014fe792b04295260
2015-12-10 11:59:58 -08:00
Yaowu Xu
221ed5e47b Fix enc/dec mismatches for aq-mode 1 and 2
Change-Id: Iefad9d8d96a08dcc788a5efdca2df6a815d1205f
2015-12-10 11:45:26 -08:00
Jian Zhou
c90a8a1a43 SSE2 based h_predictor_32x32
Relocate the function from SSSE3 to SSE2, unroll the loop from 16 to 8,
and reduce memory access to the left neighbor.
Speed up by single digits in ./test_intra_pred_speed on big-core
machines.

Change-Id: I2b7fc95ffc0c42145be2baca4dc77116dff1c960
2015-12-10 10:09:58 -08:00
Tom Finegan
7f79a83f17 Merge "iosbuild.sh: Support macosx targets in Xcode 7." 2015-12-10 16:45:01 +00:00
Paul Wilkins
449e46958c Merge "Backport temporal filter approach to VP9" 2015-12-10 09:47:25 +00:00
Jingning Han
d3c972403a Sync high bit-depth temporal filter
Change-Id: Ifdcfb91416be8189569f703bee9be253d7b3d9b6
2015-12-09 15:06:36 -08:00
Tom Finegan
acf580d2bb iosbuild.sh: Support macosx targets in Xcode 7.
Xcode 7 refuses to link to x86 and x86_64 code that's built for
the iPhone simulator, so add an extra command-line flag that forces iosbuild
to use darwin15 targets.

Change-Id: I2228d458f5cccf4d26866040380a974f88d9d360
2015-12-09 13:52:06 -08:00
Jingning Han
ece4fd5d22 Backport temporal filter approach to VP9
This commit enables the new temporal filter system for VP9. For
speed 1, it improves the compression performance:
derf  0.54%
stdhd 1.62%

Change-Id: I041760044def943e464345223790d4efad70b91e
2015-12-09 13:39:06 -08:00
Johann Koenig
420b9f5bd3 Merge "fix null pointer crash in Win32 because esp register is broken" 2015-12-09 19:31:12 +00:00
Yaowu Xu
74c67e3da3 Merge "Changes to exhaustive motion search." 2015-12-09 15:57:10 +00:00
Jacky Chen
d9bba21306 Merge "Add vp9_avg_4x4_neon and the unit test." 2015-12-09 06:09:33 +00:00
James Zern
3dc19feb29 Merge changes Id3c6cf5c,I7970575e,If3253a87
* changes:
  test.mk: simplify vp8/9 checks
  test.mk: regroup white box tests
  test.mk: enable test_intra_pred_speed unconditionally
2015-12-09 01:39:45 +00:00
James Zern
44fe73ec37 Merge "vp8: fix loop filter level clamping" 2015-12-09 01:38:09 +00:00
James Zern
e040c6c404 Merge "vp8: fix quantizer clamping" 2015-12-09 01:37:58 +00:00
jackychen
303f144eef Add vp9_avg_4x4_neon and the unit test.
Change-Id: I3ef9a9648841374ed3cc865a02053c14ad821a20
2015-12-08 17:23:36 -08:00
Marco Paniconi
835f16ea36 Merge "vp9 denoiser: Re-evaluate mode selection for golden reference." 2015-12-09 00:34:09 +00:00
paulwilkins
4e692bbee2 Changes to exhaustive motion search.
This change has been imported from VP9 and
alters the nature and use of exhaustive motion search.

Firstly any exhaustive search is preceded by a normal step search.
The exhaustive search is only carried out if the distortion resulting
from the step search is above a threshold value.

Secondly the simple +/- 64 exhaustive search is replaced by a
multi stage mesh based search where each stage has a range
and step/interval size. Subsequent stages use the best position from
the previous stage as the center of the search but use a reduced range
and interval size.

For example:
  stage 1: Range +/- 64 interval 4
  stage 2: Range +/- 32 interval 2
  stage 3: Range +/- 15 interval 1

This process, especially when it follows on from a normal step
search, has shown itself to be almost as effective as a full range
exhaustive search with step 1 but greatly lowers the computational
complexity such that it can be used in some cases for speeds 0-2.

This patch also removes a double exhaustive search for sub-8x8 blocks,
which also contained a bug (the two searches used different distortion
metrics).

For best quality in my test animation sequence this patch has almost
no impact on quality but improves encode speed by more than 5X.

Restricted use in good quality speeds 0-2 yields significant quality gains
on the animation test of 0.2 - 0.5 db with only a small impact on encode
speed. On most natural video clips, however, where the step search
is performing well, the quality gain and speed impact are small.

Change-Id: Iac24152ae239f42a246f39ee5f00fe62d193cb98
2015-12-08 16:54:42 +00:00
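
A self-contained sketch of the multi-stage mesh pattern described above (a range and interval per stage, each stage centered on the previous best). The types, names, and the sad callback are illustrative, not the libvpx motion search code:

  #include <stddef.h>

  typedef struct { int row, col; } mv_t;                  /* illustrative MV type */
  typedef unsigned (*sad_fn)(mv_t candidate, void *ctx);  /* cost of one candidate */

  typedef struct { int range; int interval; } mesh_stage;
  static const mesh_stage kStages[] = { { 64, 4 }, { 32, 2 }, { 15, 1 } };

  static mv_t mesh_search(mv_t start, sad_fn sad, void *ctx) {
    mv_t best = start;
    unsigned best_sad = sad(best, ctx);
    for (size_t s = 0; s < sizeof(kStages) / sizeof(kStages[0]); ++s) {
      const int range = kStages[s].range, step = kStages[s].interval;
      const mv_t center = best;  /* each stage searches around the previous best */
      for (int r = -range; r <= range; r += step) {
        for (int c = -range; c <= range; c += step) {
          const mv_t cand = { center.row + r, center.col + c };
          const unsigned cost = sad(cand, ctx);
          if (cost < best_sad) { best_sad = cost; best = cand; }
        }
      }
    }
    return best;
  }
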
Jian Zhou
aa5b517a39 Re-enable SSE2 based intra 4x4 prediction
The 4x4 intra predictor implemented with MMX is replaced with SSE2.
The segfault from change 315561 when decoding vp8 is taken care of.

Change-Id: I083a7cb4eb8982954c20865160f91ebec777ec76
2015-12-07 18:50:37 -08:00
Scott LaVarnway
c7e557b82c Merge "VP9: Add ssse3 version of vpx_idct32x32_135_add()" 2015-12-07 21:13:35 +00:00
Sergey Kolomenkin
5fc9688792 fix null pointer crash in Win32 because esp register is broken
https://bugs.chromium.org/p/webm/issues/detail?id=1105

Change-Id: I304ea85ea1f6474e26f074dc39dc0748b90d4d3d
2015-12-07 12:57:06 -08:00
Johann Koenig
14ea8848fb Merge "Strip redundant entries from .mailmap" 2015-12-07 18:14:05 +00:00
Johann
9fde1f2ee3 Strip redundant entries from .mailmap
Also prevent them from being reintroduced.

Change-Id: I4e16293c8185462b48e641f066d78449685e2854
2015-12-07 09:03:00 -08:00
paulwilkins
9d85ce8e0c Fix bug when overlaying middle arfs in multi-arf groups.
Fix copied over from VP9 master to VP10 master.
Do not reset the alt ref active flag when overlaying the middle
arf(s) of a multi arf group.

Change-Id: I1b7392107e7c675640d5ee1624012f39cc374c58
2015-12-07 15:23:46 +00:00
James Zern
79a9add666 Revert "MMX in intra 4x4 prediction replaced with SSE2"
This reverts commit 89a1efa4c4.

This causes a segfault when decoding vp8, in both 32- and 64-bit builds.

Change-Id: Idbb9bb28ab897e1d055340497c47b49a12231367
2015-12-05 10:20:39 -08:00
James Zern
a046ba21d8 test.mk: simplify vp8/9 checks
use CONFIG_VP[89] to protect white-box tests and drop redundant
uses of CONFIG_VP9 in variable assignments within that block

Change-Id: Id3c6cf5c7822aa161b19768b295f58829a1c6447
2015-12-04 18:44:45 -08:00
James Zern
2c9c2e0b8b test.mk: regroup white box tests
vp8/9/10/multi-config/unconditional

Change-Id: I7970575e997da0b68c6c54741a221fbba5ad0b08
2015-12-04 18:44:34 -08:00
Marco Paniconi
16a4fab9e2 Merge "Adjust variance threshold based on source noise level." 2015-12-05 00:06:14 +00:00
Angie Chiang
06bdcea606 Merge "comment out range_check of fdct in dct.c" 2015-12-04 23:38:35 +00:00
Marco
d5b3f29f3c Adjust variance threshold based on source noise level.
For non-rd variance partition: Adjust variance threhsold based
on noise level estimate. This change allows the adjustment to be
updated more frequently.

Change-Id: Ie2abf63bf3f1ee54d0bc4ff497298801fdb92b0d
2015-12-04 14:43:39 -08:00
Jian Zhou
589f3c7bc8 Merge changes Ie48229c2,Ib9f18468,I0c90e7c1
* changes:
  Speed up h_predictor_16x16
  Speed up h_predictor_8x8
  MMX in intra 8x8 prediction replaced with SSE2
2015-12-04 21:43:10 +00:00
Jian Zhou
e86c7c863e Speed up h_predictor_16x16
Relocate the function from SSSE3 to SSE2, unroll the loop from 8 to 4,
and reduce memory access to the left neighbor.
Speed up by >20% in ./test_intra_pred_speed.

Change-Id: Ie48229c2e32404706b722442942c84983bda74cc
2015-12-04 12:12:55 -08:00
Jian Zhou
da3f08fac3 Speed up h_predictor_8x8
Relocate the function from SSSE3 to SSE2, unroll the loop from 4 to 2,
and reduce memory access to the left neighbor.
Speed up by >20% in ./test_intra_pred_speed.

Change-Id: Ib9f1846819783b6e05e2a310c930eb844b2b4d2e
2015-12-04 11:36:44 -08:00
Marco Paniconi
64e46a033f Merge "Non-rd partition: Use force split on 16x16 for low resolutions." 2015-12-04 19:21:26 +00:00
Angie Chiang
08b157da8e comment out range_check of fdct in dct.c
The range_check is not used because the bit range
in fdct# is not correct. Since we are going to merge in a new version
of fdct# from nextgenv2, we won't fix the incorrect bit range now.

Change-Id: I54f27a6507f27bf475af302b4dbedc71c5385118
2015-12-04 10:54:31 -08:00
Jian Zhou
9f23a9c2e1 Merge "MMX in intra 4x4 prediction replaced with SSE2" 2015-12-04 18:50:58 +00:00
Marco
6490fc71a7 Non-rd partition: Use force split on 16x16 for low resolutions.
For low resolutions, when 4x4 downsampling is used for variance,
use the same force split (that is used for 8x8 downsampling) for 16x16 blocks.

No change in metrics. Small improvement visually.

Change-Id: I915b9895902d0b9a41e75d37fee1bf3714d2366d
2015-12-04 09:24:28 -08:00
Paul Wilkins
2b5baea8fd Merge "Fix bug when overlaying middle arfs in multi-arf groups." 2015-12-04 10:33:55 +00:00
Jian Zhou
aa2764abdd MMX in intra 8x8 prediction replaced with SSE2
8x8 Intra predictor implemented with MMX is replaced with SSE2.

Change-Id: I0c90e7c1e1e6942489ac2bfe58903b728aac7a52
2015-12-03 18:11:06 -08:00
Jian Zhou
89a1efa4c4 MMX in intra 4x4 prediction replaced with SSE2
4x4 Intra predictor implemented with MMX is replaced with SSE2.

Change-Id: Id57da2a7c38832d0356bc998790fc1989d39eafc
2015-12-03 16:40:23 -08:00
Marco Paniconi
6202ce5ada Merge "vp9-noise estimate: Move level setting to a function." 2015-12-04 00:24:49 +00:00
James Zern
2e693eb80e vp8: fix loop filter level clamping
the loop filter level is transmitted as 6 bits + sign, so it needs to be clamped in
the delta + absolute case.

BUG=https://bugzilla.mozilla.org/show_bug.cgi?id=1224363

Change-Id: Icbdca4fdbf043466429bd5c9d59dbe913bf153bc
2015-12-03 16:18:48 -08:00
James Zern
ff3674a15e vp8: fix quantizer clamping
the quantizer is transmitted as 7 bits + sign, so it needs to be clamped in
the delta + absolute case.

BUG=https://bugzilla.mozilla.org/show_bug.cgi?id=1224361

Change-Id: I9115f5d1d5cf7e0a1d149d79486d9d17de9b9639
2015-12-03 16:16:28 -08:00
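
A short sketch of the clamping these two fixes describe, with illustrative function names; the bit widths (6 bits + sign for the loop filter level, 7 bits + sign for the quantizer) come from the commit messages:

  static int clamp_int(int v, int lo, int hi) {
    return v < lo ? lo : (v > hi ? hi : v);
  }

  /* Applying a signed delta to an absolute value can leave the legal range,
   * so clamp to what each bitstream field can actually represent. */
  static int apply_filter_delta(int base_level, int delta) {
    return clamp_int(base_level + delta, 0, 63);    /* 6-bit loop filter level */
  }

  static int apply_q_delta(int base_q_index, int delta) {
    return clamp_int(base_q_index + delta, 0, 127); /* 7-bit quantizer index */
  }
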
Marco Paniconi
b38a7cd169 Merge "vp9-denoiser: Increase threshold for mode re-evaluation." 2015-12-03 23:52:46 +00:00
Marco
dd998adc7a vp9-denoiser: Increase threshold for mode re-evaluation.
Change-Id: I57a15aec1cb2d6638f5211d30c2c9f15fb62494f
2015-12-03 13:48:35 -08:00
Marco
b12e353424 vp9-noise estimate: Move level setting to a function.
This is so we may update the level at any time (e.g., to be used
for setting thresholds in variance-based partitioning).

Change-Id: I32caad2271b8e03017a531f9ea456a6dbb9d49c7
2015-12-03 13:11:49 -08:00
hui su
5d3327e891 Remove palette from VP10
Store it in nextgenv2 for now.

Change-Id: Iab0af0e15246758e3b6e8bde4a74b13c410576fc
2015-12-03 12:30:47 -08:00
paulwilkins
4a79503b3e Fix bug when overlaying middle arfs in multi-arf groups.
Do not reset the alt ref active flag when overlaying the middle
arf(s) of a multi arf group.

Change-Id: Ia55a55a376973f3fd17161429fd2afb07b4df31f
2015-12-03 15:19:02 +00:00
Jian Zhou
623e988add Merge "SSE2 speed up of h_predictor_4x4" 2015-12-02 18:49:00 +00:00
Scott LaVarnway
f0b0b1fe62 VP9: Add ssse3 version of vpx_idct32x32_135_add()
Change-Id: I9a780131efaad28cf1ad233ae64c5c319a329727
2015-12-02 04:50:46 -08:00
Debargha Mukherjee
f70095076b Fix a spatial svc test crash
Fixes a crash in the 2-pass spatial svc test that was introduced in:
https://chromium-review.googlesource.com/#/c/313571/6

Change-Id: Iab3e8225a8d159cd33f5849dffe6802e25038047
2015-12-01 17:17:51 -08:00
Debargha Mukherjee
7ceba7c26b Fix a spatial svc assert failure
Fixes a spatial svc rc assert failure introduced in:
https://chromium-review.googlesource.com/#/c/312959/1

Change-Id: I6096bfbc484859d71a5fb55e6a3248a31885af61
2015-12-01 14:24:50 -08:00
Debargha Mukherjee
01a2b40e95 Merge "Spatial SVC crash fix" 2015-12-01 21:24:46 +00:00
Debargha Mukherjee
d3409bad9a Fix a spatial svc bug related to scaling
Fixes a bug introduced in
https://chromium-review.googlesource.com/#/c/299482/5

Change-Id: If542c1a917380465dd9bc4ce5e32b0adbb20e340
2015-12-01 10:40:59 -08:00
Marco
1abf575f32 vp9 denoiser: Re-evaluate mode selection for golden reference.
Under certain denoising conditions, check for re-evaluation of
the zero_last mode if the best mode was the golden reference.

Change-Id: Ic6cdfd175eef2f7d68606300c7173ab6654b3f6e
2015-12-01 09:39:01 -08:00
Jian Zhou
c7fae5d893 Speed up tm_predictor_16x16
Reduce memory access to the left neighbor. Speed up by 10% in ./test_intra_pred_speed
with the same instruction size.

Change-Id: Ia33689d62476972cc82ebb06b50415aeccc95d15
2015-11-30 17:46:40 -08:00
Marco
f78b7daec4 Condition use of minmax in variance partition on speed setting.
For non-rd variance partition: only allow minmax computation
(which currently has no arm-neon optimization) for speeds < 8.

Performance loss is small: on the RTC set with speed 8, a few clips lose ~2/3%,
average loss is < 1%.

Change-Id: Ia9414f4d0b77dc83c3e73ca8de5d903f64b425ce
2015-11-30 17:23:32 -08:00
Scott LaVarnway
2669e05949 Merge "VPX: x86 asm version of vpx_idct32x32_1024_add()" 2015-11-30 23:28:27 +00:00
Marco Paniconi
23831545a0 Merge "vp9 denoiser: Fix to re-evaluate mode selection." 2015-11-30 19:00:39 +00:00
Jian Zhou
9d29d76280 SSE2 speed up of h_predictor_4x4
Relocate h_predictor_4x4 from SSSE3 to SSE2 with XMM registers.
Speed up by ~25% in ./test_intra_pred_speed.

Change-Id: I64e14c13b482a471449be3559bfb0da45cf88d9d
2015-11-30 10:08:05 -08:00
Marco
f1f74a4e6c vp9: Update to noise estimation for denoising.
Change the initial state of the noise level, and only update the
denoiser with the noise level when the estimate is done.

Change-Id: If44090d29949d3e4927e855d88241634cdb395dc
2015-11-30 10:03:20 -08:00
Marco
ad7e765319 vp9 denoiser: Fix to re-evaluate mode selection.
This fix allows reuse_inter_pred to be enabled.

Change-Id: I53f2bf1163bb0036ffb6df92117a86debdca11d1
2015-11-30 08:59:10 -08:00
Scott LaVarnway
0148e20c3c VPX: x86 asm version of vpx_idct32x32_1024_add()
Change-Id: I3ba4ede553e068bf116dce59d1317347988b3542
2015-11-25 10:11:29 -08:00
James Zern
1138b986c9 test.mk: enable test_intra_pred_speed unconditionally
vpx_dsp is currently included in all configurations

Change-Id: If3253a87d27f3e1abc94fbfe76f978c1172f3762
2015-11-24 22:29:12 -08:00
Marco Paniconi
610b413d7b Merge "vp9 denoiser: Re-evaluate ZEROMV after denoiser filtering." 2015-11-25 04:24:00 +00:00
Jian Zhou
901d20369a Merge "Speed up tm_predictor_8x8" 2015-11-25 02:34:07 +00:00
James Zern
adb033b57b Merge "configure: simplify x86 asm dependencies" 2015-11-25 02:19:47 +00:00
James Zern
fd51d90159 Merge changes Iaf8cbe95,I6748183d,I2a49811d
* changes:
  add vp9_satd_neon
  fix vp9_satd_sse2
  vp9_satd: return an int
2015-11-25 01:48:53 +00:00
Marco
5b0ddb931d vp9 denoiser: Re-evaluate ZEROMV after denoiser filtering.
For denoising, and for noise level above threshold, re-evaluate
ZEROMV for mode selection after denoising.
The current change only does this check if the selected best mode (before denoising)
was intra.

Change-Id: I4b1435b68d26c78f7597b995ee7bff0ddd5f9511
2015-11-24 17:30:32 -08:00
Debargha Mukherjee
e807517a93 Spatial SVC crash fix
Fixes a spatial_svc breakage introduced in
https://chromium-review.googlesource.com/#/c/305228/3.

Change-Id: I7f2cecbdca980addb85d5e58b58b5454f4730ada
2015-11-24 16:40:27 -08:00
James Zern
eb1d0f8d60 add vp9_satd_neon
~60-65% faster at the function level across block sizes

Change-Id: Iaf8cbe95731c43fdcbf68256e44284ba51a93893
2015-11-24 16:09:10 -08:00
Jian Zhou
f4621c5c8d Speed up tm_predictor_8x8
Left neighbor read from memory only once.
Speed up by ~20% in ./test_intra_pred_speed.

Change-Id: Ia1388630df6fed0dce9a6eeded6cb855bbc43505
2015-11-24 16:07:06 -08:00
Marco
fbd245c598 vp9-denoiser: Fix to reset frame_stats.
zeromv_lastref_sse was not reset.

Change-Id: I23c12e804d63dc7dc18514f6efe71de1d1acbd6a
2015-11-24 15:58:28 -08:00
Marco Paniconi
e99e4a64e0 Merge "vp9 non-rd pickmode: Fix logic in reference masking." 2015-11-24 19:14:35 +00:00
Alex Converse
b84fa548fb Merge "bitreader/writer: Change shift to signed" 2015-11-24 18:33:45 +00:00
Alex Converse
4b038ad2ef Merge "Deduplicate some high bit depth tables" 2015-11-24 18:24:32 +00:00
Marco
eb43c8ebfc vp9 non-rd pickmode: Fix logic in reference masking.
This change makes sure last reference with zero mv
is always checked for mode selection.

No change in metrics.

Change-Id: Iaf01877bf34272b966c78bfe18daad882a0a419e
2015-11-24 10:10:03 -08:00
Scott LaVarnway
b16a164c97 Merge "VPX: Removed unnecessary pmulhrsw in IDCT32X32_34" 2015-11-23 23:37:13 +00:00
Scott LaVarnway
26eb806342 Merge "VP9: Only zero counts when !frame_parallel_decoding_mode (2)" 2015-11-23 23:36:46 +00:00
Scott LaVarnway
2c3b737af6 VP9: Only zero counts when !frame_parallel_decoding_mode (2)
The counts are never used when frame_parallel_decoding_mode
is set.

Change-Id: I293aa68abadcdd30973adacb9f5f5a3aecf8daa2
2015-11-23 14:42:15 -08:00
Marco
b0027b96ae vp9-svc: Fix to allow setting qp-max/min per spatial and temporal layer.
Change-Id: Ic0ec32c1d7f7c08c9f956592dccbfd9060b1f624
2015-11-23 10:46:34 -08:00
Scott LaVarnway
97e6cc6198 VPX: Removed unnecessary pmulhrsw in IDCT32X32_34
and fixed macro name.

Change-Id: I306b98a2b4ec80b130ae80290b4cd9c7a5363311
2015-11-23 10:24:09 -08:00
James Zern
16eba81f69 Revert "Speed up h_predictor_4x4"
This reverts commit d76032ae87.

breaks 32-bit builds

Change-Id: If6266ec2a405b5a21d615112f0f37e8a71193858
2015-11-20 22:25:29 -08:00
James Zern
073dc71cd0 Merge "Use Interlocked calls in win32 once() implementation." 2015-11-21 01:40:11 +00:00
James Zern
1b10753ad7 Merge "Speed up h_predictor_4x4" 2015-11-21 01:12:42 +00:00
Marco
131c1600a9 vp9 denoiser: Bias to last reference for temporal filter.
Change-Id: I6a360a12e8da8cdcb8a779647512591612d64f31
2015-11-20 15:38:32 -08:00
James Zern
60760f710f fix vp9_satd_sse2
accumulate satd in 32-bits
+ add unit test

Change-Id: I6748183df3662ddb9d635f9641f9586f2fd38ad5
2015-11-20 14:35:46 -08:00
James Zern
3e0138edb7 vp9_satd: return an int
the final sum may use up to 26 bits

+ add a unit test
+ disable the sse2 as the result will rollover; this will be fixed in a
future commit

Change-Id: I2a49811dfaa06abfd9fa1e1e65ed7cd68e4c97ce
2015-11-20 14:35:38 -08:00
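
A minimal illustration of the point of the two vp9_satd changes above: accumulate in 32 bits and return an int, since the final sum can need up to 26 bits. The function name and prototype are illustrative, not the libvpx ones:

  #include <stdint.h>
  #include <stdlib.h>

  /* Sum of absolute coefficient values. With 16-bit inputs and up to
   * 32x32 = 1024 of them, the total can exceed 16 bits, so keep the
   * accumulator and the return type at 32 bits. */
  static int satd_accumulate(const int16_t *coeff, int length) {
    int total = 0;
    for (int i = 0; i < length; ++i) total += abs(coeff[i]);
    return total;
  }
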
Marco Paniconi
64a60ce3ba Merge "vp9-svc: Fix the setting of is_key_frame." 2015-11-20 18:29:15 +00:00
Alex Converse
612e3c8a0e Merge "Fix a signed shift overflow in vpx_rb_read_inv_signed_literal." 2015-11-20 17:42:05 +00:00
Alex Converse
d37c78819a Merge "Fix unsigned overflow in rd_variance_adjustment." 2015-11-20 17:41:58 +00:00
Marco
80a3e2615a vp9-svc: Fix the setting of is_key_frame.
Change only affects 1 pass CBR.
On a key frame, the temporal layer_id is reset to 0 for 1 pass CBR,
but since "layer" is reset, the svc.layer_context[layer].is_key_frame
was not correspondingly set properly.

Change-Id: I08f6da0a55ac7429ccfbaddfb7be14479e43543b
2015-11-20 08:51:13 -08:00
Scott LaVarnway
e7fc39fdf5 Merge "VPX: x86 asm version of vpx_idct32x32_34_add()" 2015-11-20 15:11:00 +00:00
Alex Converse
6aa2163b69 bitreader/writer: Change shift to signed
Silences several legal but suspicious unsigned overflows found with
clang -fsanitize=integer.

Change-Id: I69399751492a183167932b0a10751c433c32ca7b
2015-11-19 15:13:39 -08:00
Alex Converse
42b7c44b2f Fix a signed shift overflow in vpx_rb_read_inv_signed_literal.
Found with clang -fsanitize=integer

Change-Id: I17cb2166c06ff463abfaf9b0e6bc749d0d6fdf94
2015-11-19 15:04:20 -08:00
Alex Converse
b1fcd1751e Fix unsigned overflow in rd_variance_adjustment.
Found with clang -fsanitize=integer

Change-Id: I2538e7483cb2d5f06bceecbd3326bdd88bfecfa1
2015-11-19 15:00:59 -08:00
Jian Zhou
d76032ae87 Speed up h_predictor_4x4
Modify h_predictor_4x4 with XMM registers.
Speed up by ~25% in ./test_intra_pred_speed.

Change-Id: Id01c34c48e75b9d56dfc2e93af12cf0c0326a279
2015-11-19 11:34:22 -08:00
Paul Wilkins
f3f6b6fe3e Merge "Changes to best quality settings." 2015-11-19 16:13:43 +00:00
Jian Zhou
4993158ee5 Merge "Speed up tm_predictor_4x4" 2015-11-19 02:32:48 +00:00
Jian Zhou
79b68626ae Speed up tm_predictor_4x4
tm_predictor_4x4 is implemented with SSE2 using XMM registers.
Speed up by ~25% in ./test_intra_pred_speed.

Change-Id: I25074b78d476a2cb17f81cf654bdfd80df2070e0
2015-11-18 16:44:25 -08:00
Marco
eed5494fc6 vp9-svc: Fix to key frame counter for spatial layers.
Existing condition only applied to temporal layers.

Change-Id: Icef20a59d0afc61d4e14dea01aff4786fa9e41ae
2015-11-18 14:31:37 -08:00
Paul Wilkins
85aea16f17 Merge "Changes to exhaustive motion search." 2015-11-18 11:10:13 +00:00
Scott LaVarnway
ed833048c2 VPX: x86 asm version of vpx_idct32x32_34_add()
Change-Id: Ic81f38998fb1b8d33f5a5d7424c2c41002786cef
2015-11-17 17:42:24 -08:00
James Zern
6e6dbbc67d configure: simplify x86 asm dependencies
--disable-XXX has the effect of disabling all extensions above it, e.g.,
--disable-ssse3 disables ssse3-avx2.

Change-Id: If02b44ca71ee12e4acb12010db8593a7989f2a9d
2015-11-17 16:15:57 -08:00
Zoe Liu
8a782c7eac Fixed a few sanity checks.
Change-Id: Ieec4a7be5945dc6de192e2d8292ab978baf47f53
(cherry picked from commit 2096296421)
2015-11-17 22:54:03 +00:00
paulwilkins
8ba98516fd Changes to best quality settings.
Small changes to the best quality default speed trade-off.
Some speedup settings are worthwhile even for best quality, as they
have only a very small impact on quality but a significant impact on
encode time.

These changes give as much as a further 50-60% increase in encode
speed for my test animation clip with minimal impact on quality.

For this sequence these changes improve the best quality encode speed
to about the same level as good quality speed 0 in Q3 2015, whilst
retaining the large quality gain of over 1 dB.

For many natural videos, though, the quality difference from good 0
to best is much smaller.

Change-Id: I28b3840009d77e129817a78a7c41e29cb03e1132
2015-11-17 16:20:20 +00:00
jackychen
204cde580a Enable the resize test (down & up) by changing the bitrate.
Change-Id: I5a4f1f7b9de20fbfc28cb743dcd29c0eeca736f8
2015-11-13 16:46:00 -08:00
Ralph Giles
2635573a7f Use Interlocked calls in win32 once() implementation.
This is simpler than the previous scheme, which tried to allocate
the CRITICAL_SECTION struct in a thread-safe manner before it
could use it to run the wrapped function in a thread-safe manner.

Change-Id: I172e5544e5f16403a3a0e5e2b9104b1292a0d786
2015-11-13 13:04:36 -08:00
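
A simplified sketch of a win32 once() built directly on Interlocked primitives, as this commit describes (no CRITICAL_SECTION to bootstrap). The state encoding and spin-wait are illustrative, not the exact libvpx implementation:

  #include <windows.h>

  static void once(void (*func)(void)) {
    /* 0 = not started, 1 = in progress, 2 = done */
    static volatile LONG state = 0;
    if (InterlockedCompareExchange(&state, 1, 0) == 0) {
      func();                          /* we won the race: run the function */
      InterlockedExchange(&state, 2);  /* publish completion */
    } else {
      while (state != 2) Sleep(0);     /* wait until the winner finishes */
    }
  }
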
Marco
988fd77c1f Reduce sampling time for noise estimate.
Change-Id: I46abd85e2187b8f4c2846416a23fab26d9b9f67d
2015-11-13 08:11:30 -08:00
Marco
006fd19246 Fix resize internal test.
Temporary fix to make sure it always passes.

Change-Id: I56a0529986ad7049b6090f871c14e9e06d573d5f
2015-11-13 06:22:27 -08:00
Marco Paniconi
5f5d185d01 Merge "VP9 noise estimation: add frame level motion metrics and adjust thresholds." 2015-11-13 14:09:19 +00:00
paulwilkins
0149fb3d6b Changes to exhaustive motion search.
This change alters the nature and use of exhaustive motion search.

Firstly any exhaustive search is preceded by a normal step search.
The exhaustive search is only carried out if the distortion resulting
from the step search is above a threshold value.

Secondly the simple +/- 64 exhaustive search is replaced by a
multi stage mesh based search where each stage has a range
and step/interval size. Subsequent stages use the best position from
the previous stage as the center of the search but use a reduced range
and interval size.

For example:
  stage 1: Range +/- 64 interval 4
  stage 2: Range +/- 32 interval 2
  stage 3: Range +/- 15 interval 1

This process, especially when it follows on from a normal step
search, has shown itself to be almost as effective as a full range
exhaustive search with step 1 but greatly lowers the computational
complexity such that it can be used in some cases for speeds 0-2.

This patch also removes a double exhaustive search for sub-8x8 blocks,
which also contained a bug (the two searches used different distortion
metrics).

For best quality in my test animation sequence this patch has almost
no impact on quality but improves encode speed by more than 5X.

Restricted use in good quality speeds 0-2 yields significant quality gains
on the animation test of 0.2 - 0.5 db with only a small impact on encode
speed. On most clips though the quality gain and speed impact are small.

Change-Id: Id22967a840e996e1db273f6ac4ff03f4f52d49aa
2015-11-13 10:16:31 +00:00
JackyChen
6fb3d6db99 VP9 noise estimation: add frame level motion metrics and adjust thresholds.
Change-Id: Ia1aba00603b32cee6835951d3d8f740937cf20f4
2015-11-12 23:41:42 -08:00
James Zern
7501728327 Merge "libs.mk, testdata: rm redundant test of LIBVPX_TEST_DATA" 2015-11-13 06:49:00 +00:00
James Zern
34159b72d9 Merge "Add AVX vectorized vp9_diamond_search_sad" 2015-11-13 06:29:20 +00:00
Marco
419da5c734 Adjust variance threshold for 16x16 split at low resolutions.
Change-Id: I635e37f81237e9703d7d9a11ed76a043f4ec6eb0
2015-11-12 17:58:31 -08:00
Marco Paniconi
866c9357c2 Revert "Update to noise estimation."
This reverts commit 6b79a1e3e0.

Change-Id: I5a4923ca8a6de842855ce0725e92567ccbed6fb7
2015-11-13 00:13:32 +00:00
Marco
6b79a1e3e0 Update to noise estimation.
Add frame level global check and adjust some parameters.

Change-Id: I42103394f2d329781195d94ce6cbb5b3383eea17
2015-11-12 09:18:35 -08:00
Marco Paniconi
1b63238b67 Merge "Non-rd partition: reduce variance threshold for low resolutions." 2015-11-12 06:08:38 +00:00
Marco Paniconi
0941ff72a0 Merge "Adjust variance threshold for high noise condition." 2015-11-12 06:06:51 +00:00
Marco
384fc5e381 Adjust motion threshold to limit cyclic refresh.
Change-Id: Icfca27a567eb8929c312c6315856ee130d982a04
2015-11-11 18:22:21 -08:00
Marco
1827764450 Adjust variance threshold for high noise condition.
Change-Id: I91c722e480328ff95b8c57614d8176ccaceb2539
2015-11-11 18:06:21 -08:00
Marco Paniconi
4d38dbdfb5 Merge "vp9 denoiser: Add another noise level to denoising." 2015-11-11 20:40:29 +00:00
James Zern
9ecb99abf0 Merge "Revert "VPX: x86 asm version of vpx_idct32x32_34_add()"" 2015-11-11 20:39:12 +00:00
Marco
ff32369804 vp9 denoiser: Add another noise level to denoising.
Change-Id: Idc755ab54e4f78bb7d75bc97634c451804edad99
2015-11-11 11:21:26 -08:00
James Zern
0ccad4d649 Revert "VPX: x86 asm version of vpx_idct32x32_34_add()"
This reverts commit 9aeaa2016e.

This causes some test vectors to fail.

Change-Id: I3659a2068404ec5a0591fba5c88b1bec0c9059a4
2015-11-11 11:12:38 -08:00
James Zern
8f7bc45b5b Revert "VP9: Only zero counts when !frame_parallel_decoding_mode"
This reverts commit 380a5519cc.

This causes an assertion failure in debug_check_frame_counts(), which
probably isn't valid with this change; leaving the investigation for
later for now.

Change-Id: Ieda5ca811ed2fa50a0cc6935919a8d10dca996e0
2015-11-11 11:11:00 -08:00
Geza Lore
5eefd3ebfd Add AVX vectorized vp9_diamond_search_sad
This function now has an AVX intrinsics version which is about 80%
faster compared to the C implementation. This provides a 2-4% total
speed-up for encode, depending on encoding parameters. The function
utilizes 3 properties of the cost function lookup table, constructed
in 'cal_nmvjointsadcost' and 'cal_nmvsadcosts'.
For the joint cost:
  - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]
For the component costs:
  - For all i: mvsadcost[0][i] == mvsadcost[1][i]
        (equal per component cost)
  - For all i: mvsadcost[0][i] == mvsadcost[0][-i]
        (Cost function is even)
These must hold, otherwise the AVX version of the function cannot be used.

Change-Id: I6c2791d43022822a9e6ab43cd124a773946d0bdc
2015-11-11 14:03:47 +00:00
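
The three table properties listed above, written out as a self-contained check. The pointers are assumed to point at the zero-motion element of tables that extend in both directions (as the mvsadcost[0][-i] indexing implies); the names follow the commit text, but the function itself is illustrative:

  #include <assert.h>

  static void check_sad_cost_tables(const int *mvjointsadcost,
                                    const int *const mvsadcost[2],
                                    int max_mv) {
    int i;
    /* joint cost: indices 1, 2 and 3 must share one value */
    assert(mvjointsadcost[1] == mvjointsadcost[2]);
    assert(mvjointsadcost[2] == mvjointsadcost[3]);
    for (i = 0; i <= max_mv; ++i) {
      assert(mvsadcost[0][i] == mvsadcost[1][i]);   /* equal per-component cost */
      assert(mvsadcost[0][i] == mvsadcost[0][-i]);  /* cost function is even */
    }
  }
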
James Zern
ec45003a8f libs.mk, testdata: rm redundant test of LIBVPX_TEST_DATA
the return value of enabled, which may be empty, is handled by the for
loop. this avoids making an unnecessarily long command line which may
fail in certain cases.

Change-Id: Ib88ecbbe2c0f6d7debb600b4caed4884497263b1
2015-11-10 17:54:51 -08:00
Marco
064a9eca49 Non-rd partition: reduce variance threshold for low resolutions.
Change-Id: I06306905d187948a92f839357df5d21413823808
2015-11-10 15:42:58 -08:00
Marco Paniconi
79a194692f Merge "Add bias to zero/small motion for noisy source." 2015-11-10 23:10:31 +00:00
James Zern
e3efed7f4c Merge "convolve_copy_sse2: replace SSE w/SSE2 code" 2015-11-10 22:35:12 +00:00
Scott LaVarnway
f48321974b Merge "VPX: x86 asm version of vpx_idct32x32_34_add()" 2015-11-10 21:40:11 +00:00
Scott LaVarnway
9aeaa2016e VPX: x86 asm version of vpx_idct32x32_34_add()
Change-Id: I8a933c63b7fbf3c65e2c06dbdca9646cadd0b7cb
2015-11-10 11:54:56 -08:00
Marco
bd6bf25969 Add bias to zero/small motion for noisy source.
Change is only for real-time mode, speed >= 5, and non-screen content mode.
Add bias to zero/low motion for big blocks, if noise estimation
is enabled and noise level is above threshold.

Change-Id: I3a0a4608ede6aa535bda6eca528d20f8aba738e7
2015-11-10 11:23:40 -08:00
James Zern
40dab58941 convolve_copy_sse2: replace SSE w/SSE2 code
this should be neutral or slightly faster on modern (P4+) architectures

Change-Id: Iec4c080275941eb8c9e05a66a2daf0405d86a69b
2015-11-09 23:45:16 -08:00
JackyChen
19272d866b VP9 noise estimate: no noise estimate if the frame size changes.
Change-Id: I521f7b53c143d562a88fe7de330aa3f0ef09f414
2015-11-09 19:18:29 -08:00
Jacky Chen
394d6c122a Merge "VP9: add unit test for realtime external resize." 2015-11-10 03:05:30 +00:00
Johann
f937114402 Merge branch 'javanwhistlingduck'
Change-Id: Ib63fde31ae7b3f71e608830f7433113733b2a275
2015-11-09 17:00:37 -08:00
jackychen
55c8843791 VP9: add unit test for realtime external resize.
Change-Id: I9bfa80de73847d9be88b6ce9865d7bb5fafaaa57
2015-11-09 16:48:18 -08:00
Jacky Chen
7155f7ab78 Merge "VP9 dynamic resize: enable resize unit test(DownUp)." 2015-11-09 22:54:53 +00:00
James Zern
e1fbc886e1 Merge "VP9: Only zero counts when !frame_parallel_decoding_mode" 2015-11-09 22:23:34 +00:00
Johann
cbecf57f3e Release v1.5.0
Javan Whistling Duck release.

Change-Id: If44c9ca16a8188b68759325fbacc771365cb4af8
2015-11-09 14:12:38 -08:00
jackychen
0465aa45ea VP9 dynamic resize: enable resize unit test(DownUp).
The unit test requires a longer clip which is already in the repo.

Change-Id: Ic42e8d83e636fafd20d485a7f5f8422835319245
2015-11-09 14:04:58 -08:00
Marco Paniconi
cdec99b243 Merge "VP9 dynamic resize: increase waiting time after key frame." 2015-11-09 21:11:51 +00:00
jackychen
3c9a424e6e VP9 dynamic resize: increase waiting time after key frame.
For 1 pass CBR mode: increase the waiting time after a key frame
before we start sampling rate control behavior for determining
resize. This change needs to disable one internal resize (DownUp)
test temporarily since it requires a longer clip to do so.

Change-Id: If21beda1be23f169ee541ab4dd642f718347887a
2015-11-09 12:04:00 -08:00
Marco Paniconi
498fd551fd Merge "Use same bias (against non-zero mv for big blocks) for speed 5." 2015-11-09 19:29:35 +00:00
Alex Converse
d1a7c10325 Merge "Expand unconstrained nodes in pack_mb_tokens and loop on zeros." 2015-11-09 18:27:40 +00:00
Scott LaVarnway
380a5519cc VP9: Only zero counts when !frame_parallel_decoding_mode
The counts are never used when frame_parallel_decoding_mode
is set.

Change-Id: Ic7a566a048297f7373c9ffbb48929ea09eff674f
2015-11-09 10:14:13 -08:00
Marco
718654848a Use same bias (against non-zero mv for big blocks) for speed 5.
Use same setting for speed 5 (as it is for speed > 5).
Change is only for real-time (non-rd) mode.

Change-Id: I830250eac654328373cb318baa89d4f0e63942e1
2015-11-09 10:09:51 -08:00
James Zern
420e8d6d03 Merge changes I8c83b86d,Ic53b2ed5,I4acc8a84
* changes:
  variance_test: create fn pointers w/'&' ref
  sixtap_predict_test: create fn pointers w/'&' ref
  sad_test: create fn pointers w/'&' ref
2015-11-07 00:57:06 +00:00
Hui Su
908fbabe4e Merge "Use accurate bit cost for uv_mode in UV intra mode RD selection" 2015-11-07 00:22:50 +00:00
Alex Converse
70eb870cfe Expand unconstrained nodes in pack_mb_tokens and loop on zeros.
Reduces Linux perf estimated cycle count for pack_mb_tokens on a
lossless encode on my desktop from 61858501855 to 48154040219 or from
26% of the overall profile to 21%.

Change-Id: I9ca3426d7e3272bc7f7030abda4f0d0cec87fb4a
2015-11-06 16:00:10 -08:00
hui su
6ab6ac450b Use accurate bit cost for uv_mode in UV intra mode RD selection
On derflr, +0.1% for VP10; however, -0.03% on VP9.

Change-Id: I09c724232ede74254043d61d3cadc506256af0af
2015-11-06 14:45:43 -08:00
James Zern
eba14ddbe7 Merge "Revert "Add AVX vectorized vp9_diamond_search_sad"" 2015-11-06 22:37:20 +00:00
James Zern
30466f26b4 Revert "Add AVX vectorized vp9_diamond_search_sad"
This reverts commit f1342a7b07.

This breaks 32-bit builds:
 runtime error: load of misaligned address 0xf72fdd48 for type 'const
__m128i' (vector of 2 'long long' values), which requires 16 byte
alignment

+ _mm_set1_epi64x is incompatible with some versions of visual studio

Change-Id: I6f6fc3c11403344cef78d1c432cdc9147e5c1673
2015-11-06 13:15:01 -08:00
James Zern
837cea40fc variance_test: create fn pointers w/'&' ref
this helps some toolchains (vs9) resolve the type of the parameter

Change-Id: I8c83b86da53b1783cd18c0f765b67ba33da91d72
2015-11-06 11:04:11 -08:00
James Zern
ab5ce2e5ae sixtap_predict_test: create fn pointers w/'&' ref
this helps some toolchains (vs9) resolve the type of the parameter

Change-Id: Ic53b2ed5fbce05c5b5e633b4a4ef9ea75c55360a
2015-11-06 11:04:10 -08:00
Marco
5f041c01ed vp9: Disable noise estimate on resize trigger frame.
Change-Id: I35767a6320943582ee11d737b5f240cea2d01b25
2015-11-06 08:42:09 -08:00
James Zern
91606bbbe6 sad_test: create fn pointers w/'&' ref
this helps some toolchains (vs9) resolve the type of the parameter

Change-Id: I4acc8a844d1e55b766f66482bd6d32998174d70f
2015-11-05 23:53:24 -08:00
Marco Paniconi
d7bbe1a210 Merge "vp9: Updates to noise estimation." 2015-11-06 06:51:11 +00:00
Marco
1c724d01aa vp9: Updates to noise estimation.
Add threshold/condition on spatial_variance and brightness level.
Modification to normalization of block variance.
Change resolution limit below which we disable noise estimation.

Change-Id: If5be08a26ceda351242d8a58d2f0bc88c0a918f0
2015-11-05 18:19:01 -08:00
James Zern
892130f75b vp9_spatial_svc_encoder.sh: fix command line param
-l -> -sl, renamed in:
be3b08d [svc] Temporal svc with two pass rate control

Change-Id: I5a7b179b33d94e20e54825090659156dece928c0
2015-11-05 15:22:39 -08:00
Yunqing Wang
57cae22c1e Merge "Add AVX vectorized vp9_diamond_search_sad" 2015-11-05 20:17:13 +00:00
Geza Lore
f1342a7b07 Add AVX vectorized vp9_diamond_search_sad
This function now has an AVX intrinsics version which is about 80%
faster compared to the C implementation. This provides a 2-4% total
speed-up for encode, depending on encoding parameters. The function
utilizes 3 properties of the cost function lookup table, constructed
in 'cal_nmvjointsadcost' and 'cal_nmvsadcosts'.
For the joint cost:
  - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]
For the component costs:
  - For all i: mvsadcost[0][i] == mvsadcost[1][i]
        (equal per component cost)
  - For all i: mvsadcost[0][i] == mvsadcost[0][-i]
        (Cost function is even)
These must hold, otherwise the AVX version of the function cannot be used.

Change-Id: I184055b864c5a2dc37b2d8c5c9012eb801e9daf6
2015-11-05 10:02:17 +00:00
Marco Paniconi
c6641709a7 Merge "Bias against non-zero mv for large blocks." 2015-11-04 00:01:23 +00:00
Alex Converse
246e0eaa71 Deduplicate some high bit depth tables
Change-Id: I6977f7d155cc1e81ae2393933893caac6770821f
2015-11-03 15:40:44 -08:00
Marco
04a99cb36b Bias against non-zero mv for large blocks.
Change is only for real-time mode, speed > 5, and non-screen content mode.
Bias is based on block size and motion vector level (motion above some threshold).

Helps to improves stability in background from lightning changes.
PSNR/SSIM metrics on RTC set almost no change/neutral (within +/- 0.1).

Change-Id: I7eac13c1ae10be4ab1f40acc7f9f1df5653ece9d
2015-11-03 14:51:56 -08:00
Marco Paniconi
17534d2918 Merge "Update to encoder_breakout_test, for non-rd mode." 2015-11-03 22:40:53 +00:00
Yaowu Xu
5ff1008ed9 Merge "Fix a msvc warning" 2015-11-03 21:56:25 +00:00
Hui Su
3cbe767972 Merge "Generate intra prediction reference values only when necessary" 2015-11-03 20:55:14 +00:00
Marco Paniconi
73372cc09a Merge "Adjust threshold for datarate frame drop test." 2015-11-03 19:54:52 +00:00
Marco
9a7785b9d6 Update to encoder_breakout_test, for non-rd mode.
Only use non-zero threshold(s) for breakout if
the motion level of the current tested mode is low.

Change-Id: I22aae961cc42371b49d3f648560181cc54708502
2015-11-03 11:49:44 -08:00
Yaowu Xu
87e08f4d9f Fix a msvc warning
Change-Id: Id5b8f597fb275395232559fea7bfeb56912b88a1
2015-11-03 11:22:58 -08:00
Alex Converse
255bcf8697 Merge "misc fixes: Remove a wasted value." 2015-11-03 17:52:34 +00:00
Alex Converse
1796d1cc77 Merge "Add target for Mac OS X 10.11 'El Capitan'" 2015-11-03 17:50:34 +00:00
Marco
cb7b2a4f4b Adjust threshold for datarate frame drop test.
The current threshold is a little too strict.

Change-Id: I99ec1409d095e0c2fd3b7ab398742cabcc05700b
2015-11-03 08:17:21 -08:00
Jacky Chen
d73e6cef75 Merge "vpx_scale: fix the issue in msan test." 2015-11-02 23:37:23 +00:00
Alex Converse
080ad919df Add target for Mac OS X 10.11 'El Capitan'
Change-Id: I174f5b41be384894e41b8e2926cbf8fd0f8e21b2
2015-11-02 14:35:57 -08:00
Marco Paniconi
61f240c288 Merge "Move noise level estimate outside denoiser." 2015-11-02 22:08:01 +00:00
jackychen
fcb464671c vpx_scale: fix the issue in msan test.
Do memset to fix msan issue due to the access of uninitialized
memory.

BUG=https://code.google.com/p/chromium/issues/detail?id=549155

Change-Id: I02f995ede79e3574e72587cc078df1a0d11af002
2015-11-02 12:36:10 -08:00
Marco
c7da053d4b Move noise level estimate outside denoiser.
The source noise level estimate is also useful for
setting variance encoder parameters (variance thresholds,
qp-delta, mode selection, etc.), so allow it to be used even
if denoising is not on.

Change-Id: I4fe23d47607b4e17a35287057f489c29114beed1
2015-11-02 12:15:26 -08:00
hui su
16bf821dfc Move palette-based intra prediction out of misc-fixes
Change-Id: Ia59724413c4a4831390119a33d40a7d713b4b69f
2015-11-02 11:11:25 -08:00
hui su
e085fb643f Generate intra prediction reference values only when necessary
This can help increase encoding speed substantially.

Change-Id: Id0c009146e6e74d9365add71c7b10b9a57a84676
2015-11-02 10:26:50 -08:00
Marco
c2f6a7df8d vp9 denoiser: Don't estimate noise on resized trigger frame.
Change-Id: I60461f011d1aba0b1eb6584c6940f745221915f4
2015-11-02 09:11:35 -08:00
James Zern
bc98bf65e8 vp9_dx_iface: move struct defs to separate header
this avoids redefining vpx_codec_vp9_dx, vpx_codec_vp9_dx_algo in
vp9_encoder_parms_get_to_decoder.cc

Change-Id: I3b89e7a62497227ee32419f1a7d30e4c10a13c05
(cherry picked from commit ca163b85bb)
2015-10-31 12:23:53 -07:00
James Zern
8f9c9ab5c9 vp9_decodeframe.h: add missing include
Change-Id: I8ef772a016a79cab88bee8e9739530aa030baaa9
(cherry picked from commit 68ecfc1e62)
2015-10-31 12:23:53 -07:00
Debargha Mukherjee
9cafc46d9e Merge "Convert motion search config from AoS to SoA" 2015-10-30 20:57:10 +00:00
James Zern
082434b274 Merge changes I3b89e7a6,I8ef772a0
* changes:
  vp9_dx_iface: move struct defs to separate header
  vp9_decodeframe.h: add missing include
2015-10-30 05:50:58 +00:00
James Zern
ca163b85bb vp9_dx_iface: move struct defs to separate header
this avoids redefining vpx_codec_vp9_dx, vpx_codec_vp9_dx_algo in
vp9_encoder_parms_get_to_decoder.cc

Change-Id: I3b89e7a62497227ee32419f1a7d30e4c10a13c05
2015-10-29 17:55:35 -07:00
Alex Converse
d2967221d2 Merge "Make the zero handling in extend_to_full_distribution more explicit." 2015-10-30 00:37:33 +00:00
James Zern
68ecfc1e62 vp9_decodeframe.h: add missing include
Change-Id: I8ef772a016a79cab88bee8e9739530aa030baaa9
2015-10-29 16:41:25 -07:00
hui su
ede323a119 Specify feasible parameter values for lossless mode
Change-Id: I53d9719dcb81fa83fe3c920a552db5a0f1cacefa
2015-10-29 16:07:55 -07:00
Alex Converse
989193c797 Make the zero handling in extend_to_full_distribution more explicit.
The old workaround "p = 0 ? 0 : p -1" is misleading.

?: happens before =;
assigning back to p truncates to one byte.

Therefore it is equivalent to (p - 1) & 0xFF, but the check just exists
to work around a first-pass bug, so let's make the workaround more
clear.

https://bugs.chromium.org/p/webm/issues/detail?id=1089

Change-Id: I587c44dd61c1f3767543c0126376f881889935af
2015-10-29 14:46:55 -07:00
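
A one-function illustration of the equivalence described above, with a hypothetical name; because the probability is a single byte, the old ternary behaves like (p - 1) & 0xFF, and the zero check only papers over the first-pass bug mentioned in the message:

  #include <stdint.h>

  static uint8_t remap_model_prob(uint8_t p) {
    if (p == 0) return 0;     /* work around a first pass that can emit p == 0 */
    return (uint8_t)(p - 1);  /* the normal case: the byte-wide (p - 1) & 0xFF */
  }
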
Jacky Chen
039f241fc2 Merge "VP9_resizing: add limitation to the downscaling resolution." 2015-10-29 21:00:36 +00:00
Alex Converse
6f229b3e62 Merge "Shrink probability remap tables." 2015-10-29 19:58:24 +00:00
jackychen
dba2d5b3f3 VP9_resizing: add limitation to the downscaling resolution.
The width and height of the downscaling resolution should not be lower
than min_width and min_height, which can be set as needed; both
are 180 for now.

Change-Id: I34d06704ea51affbdd814246e22ee8d41d991f00
2015-10-29 09:42:44 -07:00
Jacky Chen
487023e94e Merge "VP9 decoder: Add more test vectors for resizing." 2015-10-29 16:00:15 +00:00
Marco
9cb73659d5 Update to vp9_spatial_svc_encoder.
Some fixes for rate control stats and bypass mode.

Change-Id: I28bed5467a681b8867cca55852d5d3a25d850f39
2015-10-29 08:21:10 -07:00
jackychen
d464e8a462 VP9 decoder: Add more test vectors for resizing.
Refer to doc "vp9-test-vectors".

BUG=https://code.google.com/p/webm/issues/detail?id=1086

Change-Id: I523d1f39141a3a86f113604cbdb9cd41cc2d6470
2015-10-28 21:26:00 -07:00
Marco Paniconi
9645cd4826 Merge "VP9-SVC: Allow frame dropping due to overshoot for spatial layers." 2015-10-28 21:59:17 +00:00
Alex Converse
e765969971 Merge "Revert "Replace the zero handling in extend_to_full_distribution."" 2015-10-28 20:48:52 +00:00
Johann Koenig
bb0bc06fa5 Merge "Skip AS detection when using --enable-external-build" 2015-10-28 19:18:49 +00:00
Johann Koenig
6f498956e5 Merge "Only set sysroot when alt_libc finds a directory" 2015-10-28 19:16:42 +00:00
Alex Converse
663960e757 Revert "Replace the zero handling in extend_to_full_distribution."
This reverts commit 7f56cb2978.

It causes uninitialized reads in the first pass setting up later cost tables.

Change-Id: I2df498df3f5c03eff359f79edf045aed0c618dc9
2015-10-28 11:51:40 -07:00
Hangyu Kuang
bd45af8bbb Add more resize test videos with larger resolution change intervals.
These videos change resolution every 10 frames versus every 3 frames in current
test sets.

Change-Id: Ic33f449fc9b6d2f480825d4715b8f63e70801232
2015-10-28 10:57:30 -07:00
Geza Lore
965a8dea0b Convert motion search config from AoS to SoA
This is a prerequisite for vectorizing vp9_diamond_search_sad_c.

Change-Id: I49cd9148782410ca8b16e8a468ca9e7c6d088410
2015-10-28 15:30:43 +00:00
Hangyu Kuang
f5f19a1fbd Merge "Add several new test vectors with small resolution." 2015-10-28 15:04:25 +00:00
Hangyu Kuang
0771a30e9e Add several new test vectors with small resolution.
Change-Id: I70b1b8162a0c9b8501358ba7d32fecd1dc020ab5
2015-10-27 17:46:48 -07:00
Marco
823a47ee3b Update to vp9-denoising.
Set increase_denoising parameter for temporal filter.

Change-Id: Id98bf160db98dfa9aedf76e20b43e6f7c783fb1c
2015-10-27 15:52:56 -07:00
Johann
a6f70b42b6 Only set sysroot when alt_libc finds a directory
Change-Id: Idc0a9adb4fb371272d6c8c98737f66c6cf209e37
2015-10-27 15:38:47 -07:00
Marco
4fb2ba2861 VP9-SVC: Allow frame dropping due to overshoot for spatial layers.
For 1 pass CBR mode.

Change-Id: I8bceb489a850ec26f05382eecb5c0c32a1bb8883
2015-10-27 14:51:47 -07:00
Alex Converse
0f059d6d65 misc fixes: Remove a wasted value.
Remove delta index 254 from probability remapping and subexp coding.
Saves 1-bit when the delta index is 129.

Change-Id: I88aba565fc766b1769165be458d2efd3ce45817e
2015-10-27 12:10:25 -07:00
Marco Paniconi
2de14eb942 Merge "Adjustments to vp9-denoising." 2015-10-27 19:10:01 +00:00
Alex Converse
a736bf6bfb Shrink probability remap tables.
Saves 2288 bytes in vp8+vp9 libvpx.a.

Change-Id: Iaa5712e59a9693ed58cea63de63781a96827e44e
2015-10-27 12:08:23 -07:00
Marco
8a2fc54508 Adjustments to vp9-denoising.
Adjust variance threshold, delta-qp, and intra penalty cost,
based on estimated noise level in source.

Replace denoising_on with a level value=L/M/H.

Change-Id: I0c017dae75a5d897367d2c42dec26f2f37e447c1
2015-10-27 10:44:19 -07:00
Yaowu Xu
c1b2d416d7 Merge "Reorder code to be consistent accross branches" 2015-10-27 17:07:50 +00:00
Alex Converse
89d10d8f3f Merge "Replace the zero handling in extend_to_full_distribution." 2015-10-27 16:54:49 +00:00
Yaowu Xu
9d8bde85cb Reorder code to be consistent accross branches
This is to make future merge a bit easier.

Change-Id: I1039de381d8fe7b9988b57c23d15d0cb5f2fcd32
2015-10-27 09:04:40 -07:00
Alex Converse
811be0df3a Fix VS build.
Add a cast on a double to unsigned assignment.

Change-Id: I4abce7cfa13e145ed0c71469844ac9b274aa1411
2015-10-26 23:13:03 -07:00
Johann
12f26bf0bc Skip AS detection when using --enable-external-build
The option exists specifically to allow for configurations
where the build environment is different from the configure
environment.

Change-Id: I95196fa3c49700251d10ff5d256dc7380e39d0c4
2015-10-26 16:43:59 -07:00
Marco Paniconi
dc9d36c0a6 Merge "Code cleanup for vp9-denoiser." 2015-10-26 20:52:16 +00:00
Paul Wilkins
cce3982c48 Merge "Incorrect frame used in KF boost loop." 2015-10-26 19:12:34 +00:00
Paul Wilkins
26abc15e04 Merge "Bug in clamping of base_frame_target." 2015-10-26 19:12:08 +00:00
Marco
f2845ed83c Code cleanup for vp9-denoiser.
Change-Id: Ibb573f50c4bf2cfb382b589803f3363db0ac1285
2015-10-26 12:04:54 -07:00
Alex Converse
7f56cb2978 Replace the zero handling in extend_to_full_distribution.
The old workaround "p = 0 ? 0 : p -1" is misleading.

The ?: operator is evaluated before the assignment, and assigning the
result back to p truncates it to one byte.

Therefore it is equivalent to (p - 1) & 0xFF. The check only exists to
work around a first-pass bug, so let's make the workaround clearer.

https://code.google.com/p/webm/issues/detail?id=1089

Change-Id: Ia6dcc8922e1acbac0eeca23a4d564a355c489572
2015-10-26 11:29:46 -07:00
Debargha Mukherjee
65dd056e41 Merge "Optimize vpx_quantize_{b,b_32x32} assembler." 2015-10-26 18:04:49 +00:00
Debargha Mukherjee
35cae7f1b3 Merge "Optimize vp9_highbd_block_error_8bit assembly." 2015-10-26 18:03:46 +00:00
Alex Converse
e34c7e3f59 Merge "palette: Replace rand() call with custom LCG." 2015-10-26 17:05:00 +00:00
Jingning Han
e1a056e163 Merge "Use explicit block position in foreach_transformed_block" 2015-10-26 16:25:56 +00:00
Alex Converse
171fd8999f palette: Replace rand() call with custom LCG.
The custom LCG is based on the POSIX recommend constants for a 16-bit
rand(). This implementation uses less computation than typical standard
library procedures which have been extended for 32-bit support, is
guaranteed to be reentrant, and identical everywhere.

Change-Id: I3140bbd566f44ab820d131c584a5d4ec6134c5a0
Ref: http://pubs.opengroup.org/onlinepubs/9699919799/functions/rand.html
2015-10-24 13:38:23 -07:00
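
For illustration, a self-contained C sketch of a reentrant 16-bit LCG built
from the constants in the POSIX rand() sample implementation (the function
name and exact form used in the palette code are assumptions here):

    #include <stdio.h>

    /* 16-bit LCG sketch using the POSIX sample rand() constants; the state is
     * caller-owned, so the generator is reentrant and identical everywhere. */
    static unsigned int lcg_rand16(unsigned int *state) {
      *state = *state * 1103515245u + 12345u;
      return (*state / 65536u) % 32768u;  /* 15-bit output, like rand() */
    }

    int main(void) {
      unsigned int seed = 1;
      int i;
      for (i = 0; i < 4; ++i) printf("%u\n", lcg_rand16(&seed));
      return 0;
    }
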
Paul Wilkins
762c0f2264 Bug in clamping of base_frame_target.
Bug relating to issue:- http://b/25090786

base_frame_target is supposed to track the idealized bit
allocation based on error score and not the actual bits
allocated to each frame.

The clamping of this value based on the VBR min and max pct values
was causing a bug where in some cases the loop that adjusts the
active max quantizer for each GF group was running out of bits at
the end of a KF group. This caused a spike in Q and some ugly artifacts.

A second change makes sure that the calculation of the active
Q range for a group DOES, however, take account of clamping.

Change-Id: I31035e97d18853530b0874b433c1da7703f607d1
2015-10-23 14:45:48 -07:00
Marco
d162934bdc VP9: Estimate noise level for denoiser.
Periodically estimate the noise level in the source, and only denoise
if the estimated noise level is above a threshold.

Change-Id: I54f967b3003b0c14d0b1d3dc83cb82ce8cc2d381
2015-10-23 11:03:30 -07:00
Jingning Han
caeb10bf06 Use explicit block position in foreach_transformed_block
Add the row and column index to the argument list of the unit functions
called by the foreach_transformed_block wrapper. This avoids repeatedly
re-deriving the position from the block index internally.

Change-Id: Ie7508acdac0b498487564639bc5cc6378a8a0df7
2015-10-23 09:19:17 -07:00
Ronald S. Bultje
f4af1a9af4 Merge "vp10: merge ext_ipred_bltr experiment into misc_fixes." 2015-10-22 21:14:20 +00:00
Ronald S. Bultje
806ae29d80 Merge "vp10: merge universal_hp experiment into misc_fixes." 2015-10-22 21:14:13 +00:00
Ronald S. Bultje
d6fc63ac31 Merge "Adjust superframe-is-optional unit test for vp10 superframe syntax." 2015-10-22 21:14:06 +00:00
Ronald S. Bultje
dbefcc0609 Merge "vp10: don't allow comp_inter_inter on keyframes." 2015-10-22 21:14:00 +00:00
Ronald S. Bultje
a857728267 Merge "vp10: fix tile size in remuxing step." 2015-10-22 21:12:44 +00:00
Ronald S. Bultje
40347d0c07 Merge "vp10: use correct constant for bw adaptation of seg pred probs." 2015-10-22 21:12:35 +00:00
Ronald S. Bultje
de4e2662d7 Merge "vp10: don't make right edge available across tile boundaries." 2015-10-22 21:12:25 +00:00
Ronald S. Bultje
69df584416 Merge "vp10: clip MVs before adding to find_ref_mvs() list." 2015-10-22 21:12:09 +00:00
Ronald S. Bultje
53dc9fd0a0 vp10: merge ext_ipred_bltr experiment into misc_fixes.
Change-Id: I2f2deb700748408b8278b7f5c29ee1f2e39785ec
2015-10-21 22:27:34 -04:00
Ronald S. Bultje
194c0a5cfb vp10: merge universal_hp experiment into misc_fixes.
Change-Id: I79fc3c0594535adc0056339c929cff69b8188760
2015-10-21 22:27:34 -04:00
Ronald S. Bultje
aa11256555 Adjust superframe-is-optional unit test for vp10 superframe syntax.
Change-Id: Ic64b6928af7ae8ecc987f845b0bf0faecdacb072
2015-10-21 22:27:28 -04:00
Paul Wilkins
4e887f032d Incorrect frame used in KF boost loop.
Fixes a bug in the calculation of the boost for key frames.

Change-Id: I75e9c96a9e86379239fbbbecb56ccd529783dc7c
2015-10-21 22:17:53 +01:00
Ronald S. Bultje
6a032503ca vp10: don't allow comp_inter_inter on keyframes.
Change-Id: Ibd0e13721a2bb71c532d20b36c42f4cccf5c5de2
2015-10-21 15:19:11 -04:00
Ronald S. Bultje
558d93f3a5 vp10: fix tile size in remuxing step.
Change-Id: Id48fb193bbdb3afed1d0db26c4ddded65a293b1b
2015-10-21 15:19:11 -04:00
Ronald S. Bultje
59058775fc vp10: use correct constant for bw adaptation of seg pred probs.
Change-Id: Idb869a77a126982814b8e7e288f952a65340e6be
2015-10-21 15:19:11 -04:00
Ronald S. Bultje
3d90819149 vp10: don't make right edge available across tile boundaries.
Change-Id: Ia81cf3858ef6c8d1fd4b1fb2dd9627906081129d
2015-10-21 15:19:11 -04:00
Geza Lore
aa8f85223b Optimize vp9_highbd_block_error_8bit assembly.
A new version of vp9_highbd_error_8bit is now available which is
optimized with AVX assembly. AVX itself does not buy us too much, but
the non-destructive 3 operand format encoding of the 128bit SSEn integer
instructions helps to eliminate move instructions. The Sandy Bridge
micro-architecture cannot eliminate move instructions in the processor
front end, so AVX will help on these machines.

Two further optimizations are applied:

1. The common case of computing block error on 4x4 blocks is optimized
as a special case.
2. All arithmetic is speculatively done on 32 bits only. At the end of
the loop, the code detects if overflow might have happened and if so,
the whole computation is re-executed using higher precision arithmetic.
This case however is extremely rare in real use, so we can achieve a
large net gain here.

The optimizations rely on the fact that the coefficients are in the
range [-(2^15-1), 2^15-1], and that the quantized coefficients always
have the same sign as the input coefficients (in the worst case they are
0). These are the same assumptions that the old SSE2 assembly code for
the non high bitdepth configuration relied on. The unit tests have been
updated to take this constraint into consideration when generating test
input data.

Change-Id: I57d9888a74715e7145a5d9987d67891ef68f39b7
2015-10-21 12:30:40 +01:00
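
A scalar C sketch of the speculative-precision idea (illustrative names; not
the AVX code itself): accumulate the squared error in 32 bits, flag a
potential overflow, and redo the sum in 64 bits only in that rare case.

    #include <stdint.h>

    /* Coefficients are assumed to lie in [-(2^15 - 1), 2^15 - 1], so each
     * squared difference fits in 32 unsigned bits; only the running sum can
     * overflow. */
    static int64_t block_error_sketch(const int16_t *coeff,
                                      const int16_t *dqcoeff, int n) {
      uint32_t sum32 = 0;
      int may_overflow = 0;
      int i;
      for (i = 0; i < n; ++i) {
        const int diff = coeff[i] - dqcoeff[i];
        const uint32_t ad = (uint32_t)(diff < 0 ? -diff : diff);
        const uint32_t sq = ad * ad;
        if (sum32 > UINT32_MAX - sq) may_overflow = 1;  /* sum would wrap */
        sum32 += sq;
      }
      if (!may_overflow) return (int64_t)sum32;
      /* Rare path: re-execute the whole computation in 64-bit arithmetic. */
      {
        int64_t sum64 = 0;
        for (i = 0; i < n; ++i) {
          const int64_t diff = coeff[i] - dqcoeff[i];
          sum64 += diff * diff;
        }
        return sum64;
      }
    }
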
Ronald S. Bultje
56cfbeefb4 Merge "vp10: disallow coding zero-sized tiles-in-frame/frames-in-superframe." 2015-10-20 19:58:00 +00:00
Ronald S. Bultje
293e20df91 vp10: clip MVs before adding to find_ref_mvs() list.
This causes the output of find_ref_mvs() to always be unique or zero.
A nice side-effect of this is that it also causes the output of
find_ref_mvs_sub8x8() to be unique-or-zero, and it will not ignore
available candidate MVs under certain conditions.

See issue 1012.

Change-Id: If4792789cb7885dbc9db420001d95f9b91b63bfa
2015-10-20 14:48:35 -04:00
Ronald S. Bultje
dec4405cfa vp10: disallow coding zero-sized tiles-in-frame/frames-in-superframe.
See issue 1088.

Change-Id: Icb15d33b4e316add848f210b50cbccd7c7847207
2015-10-20 14:48:31 -04:00
Marco
be3f2713ad Setting change in sample encoder: vpx_temporal_svc_encoder.c
Change-Id: Ifb384fa571eb08b516ed08fe05b8bca0c94b1edf
2015-10-20 10:40:20 -07:00
Hui Su
96b69deca5 Merge "VP10: some changes to palette mode" 2015-10-20 16:37:31 +00:00
Ronald S. Bultje
9897e1c27c Merge "vp10: write colorspace info for profile 0 intraonly frames." 2015-10-20 15:57:21 +00:00
Ronald S. Bultje
bafadaafbb Merge "vp10: per-segment lossless coding." 2015-10-20 15:57:12 +00:00
Ronald S. Bultje
92c4d8149a Merge "vp10: add extended-intra prediction edges experiment." 2015-10-20 15:57:05 +00:00
Ronald S. Bultje
1a64595780 Merge "vp10: allow MV refs to point outside visible image." 2015-10-20 15:56:56 +00:00
Ronald S. Bultje
4a7f012b95 Merge "vp10: allow forward updates for keyframe y intra mode probabilities." 2015-10-20 15:56:49 +00:00
Ronald S. Bultje
f441a652b7 Merge "vp10: merge keyframe/interframe uvintramode/partition probabilities." 2015-10-20 15:56:42 +00:00
Ronald S. Bultje
24517b9635 Merge "vp10: make segmentation probs use generic probability model." 2015-10-20 15:56:34 +00:00
Geza Lore
9cfba09ac0 Optimize vpx_quantize_{b,b_32x32} assembler.
Added optimization of the 8 bit assembly quantizer routines. This makes
these functions up to 100% faster, depending on encoding parameters.

This patch makes the encoder faster in both the high bitdepth and 8bit
configurations. In the high bitdepth configuration, it affects profile 0
only.

Based on my profiling using 1080p input the net gain is between 1-3% for
the 8 bit config, and around 2.5-4.5% for the high bitdepth config,
depending on target bitrate. The difference between the 8 bit and high
bitdepth configurations for the same encoder run is reduced by 1% in all
cases I have profiled.

Change-Id: I86714a6b7364da20cd468cd784247009663a5140
2015-10-20 10:11:19 +01:00
James Zern
849e54cedd Merge "vp8cx: remove deprecated reference/entropy controls" 2015-10-20 02:46:36 +00:00
Ronald S. Bultje
2a388b53f2 vp10: write colorspace info for profile 0 intraonly frames.
See issue 1087.

Change-Id: I231f6f12f870d0a56391daf1673536048418b207
2015-10-19 12:18:57 -04:00
James Zern
a046f56491 vp8cx: remove deprecated reference/entropy controls
VP8E_UPD_ENTROPY, VP8E_UPD_REFERENCE and VP8E_USE_REFERENCE have been
deprecated since the initial public release

Change-Id: Ied16b441eec13434d85f1ab115d49ccaf5f2f7b0
2015-10-16 17:02:36 -07:00
Ronald S. Bultje
60c58b5284 vp10: per-segment lossless coding.
Some more testing of this patch would probably be useful, but I
think the basics of it should work fine now.

See issue 1035.

Change-Id: I4a36d58f671c5391cb09d564581784a00ed26245
2015-10-16 19:30:39 -04:00
Ronald S. Bultje
c7dc1d78bf vp10: add extended-intra prediction edges experiment.
This experiment allows using full above/right edges for all transform
sizes whenever available (for d45/d63), and adds bottom/left edges for
d207.

See issue 1043.

Change-Id: I5cf7f345e783e8539bb6b6d2c9972fb1d6d0a78b
2015-10-16 19:30:39 -04:00
Ronald S. Bultje
dea998997f vp10: allow MV refs to point outside visible image.
In VP9, the ref MV had to point to a block that itself fully resided
within the visible image, i.e. all borders of the image had to be
within the visible borders of the coded frame. This is somewhat
illogical, and had obscure side effects, e.g. fairly reasonable motion
vectors such as 0,0 were clipped to negative values if the block
overhung the frame edge (such as the last rows on 1080p content),
which makes no sense whatsoever.

Instead, relax clamping constraints such that the ref MVs are allowed
to point to blocks exactly outside the visible edges in both Y as well
as UV planes, including the 8tap filter edges (that's why the offset is
8 pixels + block size).

See issue 1037.

Change-Id: I2683eb2a18b24955e4dcce36c2940aa2ba3a1061
2015-10-16 19:30:38 -04:00
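
A rough C sketch of the relaxed clamp described above (pixel units and
invented field names; the real vp10 code works in sub-pel units and uses its
own bounds): the referenced block may now lie just outside the visible frame
by the block size plus the 8-pixel filter edge.

    typedef struct { int row, col; } MV;

    static int clamp_int(int v, int lo, int hi) {
      return v < lo ? lo : (v > hi ? hi : v);
    }

    /* Allow the referenced block, including the 8-tap filter border, to sit
     * exactly outside the visible frame edges. */
    static void clamp_ref_mv_sketch(MV *mv, int bw, int bh, int x, int y,
                                    int frame_w, int frame_h) {
      const int border = 8;  /* interpolation filter edge */
      mv->col = clamp_int(mv->col, -(x + bw + border), frame_w - x + border);
      mv->row = clamp_int(mv->row, -(y + bh + border), frame_h - y + border);
    }
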
Ronald S. Bultje
1eb51a2010 vp10: allow forward updates for keyframe y intra mode probabilities.
See issue 1040 point 5.

Change-Id: I51a70b9eade39efba392a1457bd70a3c515525cb
2015-10-16 19:30:38 -04:00
Ronald S. Bultje
d8f3bb1837 vp10: merge keyframe/interframe uvintramode/partition probabilities.
This has various benefits:
- simplify implementations because we don't have to switch between
  multiple probability tables depending on frametype
- allows fw subexp and bw adaptivity for partitions/uvmode in keyframes

See issue 1040 point 5.

Change-Id: Ia566aa2863252d130cee9deedcf123bb2a0d3765
2015-10-16 19:30:38 -04:00
Ronald S. Bultje
6e5a1165be vp10: make segmentation probs use generic probability model.
Locate them (code-wise) in frame_context, and have them be updated
as any other probability using the subexp forward and adaptive bw
updates.

See issue 1040 point 1.

TODOs:
- real-world default probabilities
- why is counts sometimes NULL in the decoder? Does that mean bw
  adaptivity updates only work on some frames? (I haven't looked
  very closely yet, maybe this is a red herring.)

Change-Id: I23b57b4e5e7574b75f16eb64823b29c22fbab42e
2015-10-16 19:30:38 -04:00
Yaowu Xu
568429512e Add a new enum type vpx_color_range_t
to make the meaning of color_range obvious.

Change-Id: I303582e448b82b3203b497e27b22601cc718dfff
2015-10-16 16:27:18 -07:00
James Zern
7dd7a7da20 vpx/*.h: add VPX_CTRL_* preproc defines
allows controls to be tested for at compile-time

Change-Id: I1cd01287dc144392956c82e6dbac003f37703039
2015-10-16 18:47:20 +00:00
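
A usage sketch of these compile-time guards (VP8E_SET_CPUUSED is only an
example control here; any control with a VPX_CTRL_* guard can be probed the
same way):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* The VPX_CTRL_* macro lets applications test for a control at compile
     * time instead of relying on library version checks. */
    static void maybe_set_speed(vpx_codec_ctx_t *codec, int speed) {
    #ifdef VPX_CTRL_VP8E_SET_CPUUSED
      vpx_codec_control(codec, VP8E_SET_CPUUSED, speed);
    #else
      (void)codec;
      (void)speed;
    #endif
    }
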
James Zern
9ade6e1001 Merge "vpx/*.h, cosmetics: fix some typos" 2015-10-16 18:47:08 +00:00
hui su
17c817adfc VP10: some changes to palette mode
Account for rounding in distortion calculation in k-means;
carry out rounding before duplicate removal of base colors;
replace numbers with macros;
use prefix increment.

Slight coding gain (<0.1%) on screen_content testset.

Change-Id: Ie8bd241266da6b82c7b2874befc3a0c72b4fcd8c
2015-10-16 11:41:26 -07:00
Marco
b44c5cf639 Adjustment on limiting cyclic refresh on steady blocks.
Adjust the qp threshold and consec_zeromv threshold for
limiting cyclic refresh. Also increase the refresh period
when the limit amount is significant, along with some code cleanup.

Small gain in PSNR/SSIM metrics: ~0.25/0.3 gain on RTC set, speed 7.

Change only affects non-screen content.

Change-Id: I1ced87a89a132684c071e722616e445b2d18236a
2015-10-16 10:16:44 -07:00
Yaowu Xu
1832ba7509 Restore partial changes from previous commit
This portion was tested to have no effect on asan test failures.

Change-Id: I3de1dab7479148bdffc24c4568cb2e7e9963f099
2015-10-16 00:28:37 +00:00
hui su
aaf6f6215f Fix palette mode in multi-thread encoding setting
Fix a couple of memory-related errors. Also fix thread test failures.

Change-Id: I0103995f832cecf1dd2380000321ac7204f0cfc0
2015-10-15 15:00:57 -07:00
Jacky Chen
a5d74843eb Merge "VP9_resizing: adjust the threshold and another improvement." 2015-10-15 21:35:02 +00:00
Marco Paniconi
cff15f9d3c Merge "Fix resetting of cyclic refresh on dynamic resize change." 2015-10-15 21:09:06 +00:00
JackyChen
dc002cb7b4 VP9_resizing: adjust the threshold and another improvement.
Adjust the qp threshold based on the denoising setting; do not allow
scaling directly from the original resolution to one half and vice versa.

Change-Id: I032a9b22f8e1c88de6bb81cf8351367223a3e40d
2015-10-15 09:27:22 -07:00
Marco
d6bbda4bc2 Fix resetting of cyclic refresh on dynamic resize change.
Put the reset at the right place, during the setup and prior
to updating the map.

Change-Id: I75e550ae9d8cc15081330b8857edc04c23947875
2015-10-15 09:03:51 -07:00
Marco
1a0a10cf3d VP9: Rate control update for re-encode screen-content.
For the re-encoding (at max-qp) on the detected high-content change:
update rate correction factor, reset rate over/under-shoot flags,
and update/reset the rate control for layered coding.

Change-Id: I5dc72bb235427344dc87b5235f2b0f31704a034a
2015-10-15 08:26:15 -07:00
Yaowu Xu
15cc8bc72f Merge "fix a msvc compiler warning" 2015-10-15 14:39:01 +00:00
Yaowu Xu
3e1e3ac789 Merge "Fix two asan failures" 2015-10-15 14:38:05 +00:00
Yaowu Xu
8ced62f250 fix a msvc compiler warning
Change-Id: Ifd6581c1bdb8d8f4b2ecf676c1a3d385dc129abf
2015-10-15 01:05:13 +00:00
Yaowu Xu
4727fa2a75 Fix two asan failures
Change-Id: I57865e9604ac162ef0d97deb16e81ca436a98428
2015-10-14 18:03:31 -07:00
Johann
5d5cc0d082 Check for bswap* builtins before using
Canonical builtin checks for clang are to use
__has_builtin. Much less fragile than version checks.

https://code.google.com/p/webm/issues/detail?id=1082

Change-Id: I8151fb75899acdf1a935c23aad9441da99a9abcd
2015-10-14 15:37:53 -07:00
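
The preferred feature test looks roughly like this (sketch; the guard also
has to cope with compilers that lack __has_builtin entirely):

    #include <stdint.h>

    #ifndef __has_builtin
    #define __has_builtin(x) 0  /* compilers without __has_builtin */
    #endif

    static uint32_t swap32(uint32_t v) {
    #if __has_builtin(__builtin_bswap32)
      return __builtin_bswap32(v);  /* use the builtin when available */
    #else
      return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
             ((v << 8) & 0x00ff0000u) | (v << 24);
    #endif
    }
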
Johann
ec623a0bb7 Upstream Mozilla fix for older Apple clang builds
Also use the _mm_broadcastsi128_si256 intrisic for
Apple clang versions 4.[012]

https://bugzilla.mozilla.org/show_bug.cgi?id=1085607
https://code.google.com/p/webm/issues/detail?id=1082

Change-Id: I6bc821d8163387194ef663e94bfed91fa7281d88
2015-10-14 07:41:23 -07:00
Yaowu Xu
c2b8b5bfe2 Merge "Changes to partition breakout rules." 2015-10-13 22:31:56 +00:00
paulwilkins
cdc359989a Changes to partition breakout rules.
Changes to the breakout behavior for partition selection.
The biggest impact is on speed 0 where encode speed in
some cases more than doubles with typically less than 1%
impact on quality.

Speed 0 encode speed impact examples
Animation test clip: +128%
Park Joy: +59%
Old town Cross: +109%

Change-Id: I222720657e56cede1b2a5539096f788ffb2df3a1
2015-10-13 14:19:06 -07:00
Marco Paniconi
86c16df39d Merge "VP9-SVC: Bugfix to allow skipping lower layer(s) encoding." 2015-10-13 21:09:10 +00:00
Ronald S. Bultje
567c791d01 Merge "vp10: fix compiler warning with --enable-universal_hp." 2015-10-13 19:33:05 +00:00
Hui Su
fe0396cadc Merge "Fix compiler warnings" 2015-10-13 19:30:33 +00:00
Ronald S. Bultje
fa8ba206bf vp10: fix compiler warning with --enable-universal_hp.
Change-Id: I0d7ca20bdd0fc868b28b0755e3114a4499056f45
2015-10-13 14:05:47 -04:00
Hui Su
b9e31b5163 Merge "VP10: Add palette mode part 1" 2015-10-13 17:34:27 +00:00
hui su
6f31722950 Fix compiler warnings
Change-Id: I761256a8100d83abf1b937f3739580237e3fad2a
2015-10-13 10:33:17 -07:00
Marco
1ce01eaaf7 VP9-SVC: Bugfix to allow skipping lower layer(s) encoding.
The setting of svc->spatial_layer_to_encode was missing
in VP9E_SET_SVC_LAYER_ID.

Change-Id: I015b1a64adb9ef2644d6477a02d9d9364c8462b9
2015-10-12 16:11:34 -07:00
Ronald S. Bultje
00170953b1 vp10: allow forward updates for uv_mode probabilities.
See issue 1040 point 4.

Change-Id: I79e06bd71a27f45770c760c47dc71bc3767a77a0
2015-10-12 17:51:01 -04:00
Ronald S. Bultje
5f589826f3 vp10: allow bw adaptivity for skip/tx probabilities in keyframes.
See issue 1040 point 3.

Change-Id: Ieef6d326b7fb50ceca5936525b7c688225a11fd1
2015-10-12 17:51:01 -04:00
Ronald S. Bultje
fee146e60b vp10: don't write tile size marker bit if CONFIG_MISC_FIXES=0.
Change-Id: I41b13b8767e30da391c2c4da9a729ca7292b16b9
2015-10-12 17:50:57 -04:00
Ronald S. Bultje
1799f2f81d vp10: remove ref-MV-dependent use of HP.
This change (in a new config experiment: universal_hp) removes the
bitstream parsing dependency of the HP MV bit on the ref MV to be
coded. It also cleans up clearing of the HP bit in near/nearestMV,
since HP is always on if it's set in the frame header.

This admittedly doesn't clean up the crap that could be cleaned up,
but that's mostly because I think this needs some careful review;
not so much for coding style, but more from hardware people and from
the codec team on what we/you want. It would also be nice to get some
actual numbers on the real quality impact of this change. If, for
example, hardware people come up and tell us they don't actually care
anymore, we should probably just leave this code as-is and do nothing (i.e.
discard this patch).

See issue 1036.

Change-Id: Ic9b106f34422aa0f79de0c28125b72d566bd511a
2015-10-12 14:45:18 -04:00
Ronald S. Bultje
5b4805d6e9 vp10: remove clamp_mv2() call from vp10_find_best_ref_mvs().
This actually has no effect whatsoever, since the input MVs themselves
are clamped by clamp_mv_ref() already, which is significantly more
restrictive in its bounds.

Change-Id: I4a3a7b2b121ee422c56428c2a12d930c3813c06e
2015-10-12 14:45:18 -04:00
Ronald S. Bultje
2e45ce1493 vp10: update assertion/allocation for tokens.
We only write EOSB tokens if we write tokens (i.e. not for skip blocks),
and we write EOSB tokens per-plane instead of per block.

Change-Id: I8d7ee99f8ec50eb7ae809f9f9282c1c91dbf6537
2015-10-12 14:45:18 -04:00
hui su
5d011cb278 VP10: Add palette mode part 1
Add palette mode for keyframe luma channel. Palette mode is enabled
when using "--tune-content=screen" in encoding config parameters.

on screen_content testset:  +6.89%
on derlr                 :  +0.00%

Design doc (WIP):
https://goo.gl/lD4yJw

Change-Id: Ib368b216bfd3ea21c6c27436934ad87afdaa6f88
2015-10-12 10:02:17 -07:00
James Zern
ba7ea4456f tile_worker_hook: fix -Wclobbered warning
*tile should be marked volatile like the others due to the use of
setjmp()

Change-Id: I5dbf8e6792e4c0f34a683434b4fd06e3b4c75c4b
2015-10-10 11:17:08 -07:00
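
A minimal C illustration of why such locals need volatile (a generic sketch,
not the decoder code): any automatic variable modified between setjmp() and
the corresponding longjmp() has an indeterminate value afterwards unless it
is volatile, which is also what -Wclobbered flags.

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf jb;

    static void work_or_fail(int fail) {
      if (fail) longjmp(jb, 1);  /* stands in for the decoder's error handler */
    }

    int main(void) {
      volatile int done = 0;  /* volatile so the value survives longjmp() */
      int i;
      if (setjmp(jb)) {
        printf("failed after %d steps\n", done);
        return 1;
      }
      for (i = 0; i < 4; ++i) {
        work_or_fail(i == 2);
        done = i + 1;
      }
      return 0;
    }
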
James Zern
0b74e5d7af vpx/*.h, cosmetics: fix some typos
Change-Id: Ie9ead2c665c6c065a6b922ab66bae9be63483272
2015-10-09 16:33:15 -07:00
Alex Converse
0c00af126d Add vpx_highbd_convolve_{copy,avg}_sse2
single-threaded:
swanky (silvermont): ~1% faster overall
peppy (celeron,haswell): ~1.5% faster overall

Change-Id: Ib74f014374c63c9eaf2d38191cbd8e2edcc52073
2015-10-09 11:50:25 -07:00
Alex Converse
7e77938d72 Generate convolve_test wrapper functions with a macro
Change-Id: Iccb4cdc23c1845cf9cb7d69101c9f4f43675d368
2015-10-09 11:42:05 -07:00
James Zern
65055a5fbd Merge "vp9/decode_tiles_mt: remove unnecessary local" 2015-10-09 17:52:34 +00:00
Geza Lore
cbada4a982 Remove 4 mova insts from quantize_ssse3_x86_64.asm
Change-Id: If3cb9345b44162e600e6c74873e0cb4c207fc7fb
2015-10-09 07:52:04 -07:00
Debargha Mukherjee
94bedd013e Merge "Optimization of 8bit block error for high bitdepth" 2015-10-09 13:36:47 +00:00
Geza Lore
0134764fa6 Optimization of 8bit block error for high bitdepth
If high bit depth configuration is enabled, but encoding in profile 0,
the code now falls back on optimized SSE2 assembler to compute the
block errors, similar to when high bit depth is not enabled.

Change-Id: I471d1494e541de61a4008f852dbc0d548856484f
2015-10-08 14:05:25 -07:00
Jacky Chen
66bf686975 Merge "VP9 denoiser: use skin map to improve denoising." 2015-10-08 21:02:46 +00:00
jackychen
bafe1a2d67 VP9 denoiser: use skin map to improve denoising.
Only denoise at small motion if it's a skin block.

Change-Id: I6235cad9dd7f76ab40e7d9cdfe6180e619c20c6e
2015-10-08 12:17:25 -07:00
Ronald S. Bultje
95f8b81962 Merge "vp10: use subexp probability updates for MV probs." 2015-10-08 18:50:50 +00:00
Ronald S. Bultje
ca67339901 Merge "vp10: skip unreachable cat6 token extrabits." 2015-10-08 18:50:39 +00:00
Ronald S. Bultje
b60b15bc11 Merge "vp10: remove superframe size field for last frame in superframe." 2015-10-08 18:50:08 +00:00
Jacky Chen
0f6e9c5d3d Merge "vp9_skin_detection: fix some build warnings." 2015-10-08 18:15:10 +00:00
Ronald S. Bultje
5dd85e525d Merge "vp10: use superframe marker index/size mechanism for tile size." 2015-10-08 17:32:52 +00:00
jackychen
eaa101b502 vp9_skin_detection: fix some build warnings.
Change-Id: Ib779c083e9775dc9922ed6e104f6275bc453bef9
2015-10-08 09:51:34 -07:00
James Zern
50b20b90aa vp9/decode_tiles_mt: remove unnecessary local
reuse the common loop index

Change-Id: I9db45a93c219c2123917514cb8e9d4ea86454711
2015-10-07 17:46:13 -07:00
James Zern
a83e8ec008 Merge "vp9/tile_worker_hook: pass pbi directly" 2015-10-07 22:09:33 +00:00
James Zern
1f2acb7e40 Merge changes Iaee60826,I51cf1e39
* changes:
  vp9/tile_worker_hook: add multiple tile decoding
  invalid_file_test: loosen error check w/tile-threading
2015-10-07 22:09:21 +00:00
jackychen
b0a2ba2ffa VP9_denoiser: pass address in copy_frame to make it faster.
Change-Id: I65269ddb3ea5f911d5be38614b93c97be7e1ba76
2015-10-07 13:22:37 -07:00
Marco Paniconi
780ada18aa Merge "VP9 denoiser bug-fix: artifact caused by false buffer swap." 2015-10-07 19:08:07 +00:00
Alex Converse
061103dc82 Merge "vp9: simplify extrabits encoding" 2015-10-07 18:45:02 +00:00
James Zern
8b55eafed1 Merge "test/reg...check,video_source.h: avoid NOMINMAX redef" 2015-10-07 18:40:02 +00:00
James Zern
12de7e2a4a Merge "vpxdec: quiet signed/unsigned warning" 2015-10-07 18:26:36 +00:00
James Zern
05b4e18142 Merge changes I2965e786,I144bedde
* changes:
  vpx_memset16: drop unnecessary local
  vpx_memset16: quiet signed/unsigned warning
2015-10-07 18:26:15 +00:00
jackychen
7231c62c9f VP9 denoiser bug-fix: artifact caused by false buffer swap.
The artifact occurs periodically when the VP9 denoiser is on and
refresh_golden_frame happens. When refresh_golden_frame happens,
we should copy the frame buffer instead of swapping the pointers.

Change-Id: Ib3204c4b04db28ecf439c6d9e61f3d146f04196d
2015-10-07 11:16:15 -07:00
Marco Paniconi
d20f086be5 Merge "Move setting of refresh threshold outside loop." 2015-10-07 16:44:32 +00:00
Debargha Mukherjee
f3a73f1277 Merge "Backports highbitdepth accelerations into vp10" 2015-10-07 16:28:36 +00:00
James Zern
18bd24ba9d test/reg...check,video_source.h: avoid NOMINMAX redef
Some mingw32 configs define this. Force it on to ensure the
build succeeds.

Change-Id: I2cc490782b6a0736aa617e6a1457fc2bc984adbb
2015-10-06 23:05:15 -07:00
James Zern
fcf1609b7c vpxdec: quiet signed/unsigned warning
Change-Id: I93c56dfa547af9b2f2b96c4f85fd9862ea67af62
2015-10-06 22:56:34 -07:00
James Zern
d0f406366c vpx_memset16: drop unnecessary local
+ add a cast

Change-Id: I2965e7867223aa25bf688c988629ac57b4971905
2015-10-06 22:51:35 -07:00
James Zern
3554089838 vpx_memset16: quiet signed/unsigned warning
Change-Id: I144bedde7ea43f1b84360c1a7c8a042fd30abb6b
2015-10-06 22:48:18 -07:00
James Zern
0bd82af834 vp9/tile_worker_hook: pass pbi directly
reduces the size of TileWorkerData by reusing the storage in the worker
itself

Change-Id: If8a62fcb35167037c3da5814ab84fb81893f9cab
2015-10-06 20:14:24 -07:00
James Zern
1f4a6c8a4e vp9/tile_worker_hook: add multiple tile decoding
this reduces the number of synchronizations in decode_tiles_mt() and
improves overall performance when the number of threads is less than the
number of tiles

Change-Id: Iaee6082673dc187ffe0e3d91a701d1e470c62924
2015-10-06 20:13:54 -07:00
Marco
bc137ff67b Move setting of refresh threshold outside loop.
Small code cleanup. consec_zeromv refresh threshold
does not need to be computed for every super-block.

No change in behavior.

Change-Id: I8c4b1b28072f42b01d917fff6d1f62722f1e1554
2015-10-06 17:51:30 -07:00
James Zern
fb209003a8 invalid_file_test: loosen error check w/tile-threading
The serial decode check is too strict for tile-threaded decoding as
there is no guarantee on the decode order nor which specific error
will take precedence. Currently a tile-level error is not forwarded so
the frame will simply be marked corrupt.

Change-Id: I51cf1e39e44bedeac93746154b36a4ccb2f059b1
2015-10-06 16:40:20 -07:00
Alex Converse
2f7f482c77 vp9: simplify extrabits encoding
Change-Id: I5a2abd35cb303d8f6354b3119ab95acf90405116
2015-10-06 16:26:08 -07:00
Debargha Mukherjee
ce3f4ade67 Merge "SSSE3 optimisation for quantize in high bit depth" 2015-10-06 22:28:11 +00:00
Marco
7266bedc04 Add first_spatial_layer_to_encode to SVC.
Use the existing VP9_SET_SVC control to set the
first spatial layer to encode.

Since we loop over all spatial layers inside the encoder, the
setting of spatial_layer_id via VP9_SET_SVC has no relevance.
Use it instead to set the first_spatial_layer_to_encode,
which allows an application to skip encoding lower layer(s).

Change only affects the 1 pass CBR SVC.

Change-Id: I5d63ab713c3e250fdf42c637f38d5ec8f60cd1fb
2015-10-06 08:56:15 -07:00
Julia Robson
37c68efee2 SSSE3 optimisation for quantize in high bit depth
When configured with high bit depth enabled, the 8bit quantize
function stopped using optimised code. This made 8bit content
decode slowly. This commit re-enables the SSSE3 optimisations.

Change-Id: I194b505dd3f4c494e5c5e53e020f5d94534b16b5
2015-10-06 13:32:02 +01:00
Scott LaVarnway
b212094839 Merge "VPX: refactor vpx_idct32x32_1_add_sse2()" 2015-10-06 11:35:15 +00:00
Ronald S. Bultje
48178d2cf2 Merge "vp10: extend range for delta Q values." 2015-10-06 10:49:30 +00:00
Ronald S. Bultje
177e7b53e7 vp10: use subexp probability updates for MV probs.
See issue 1040 point 2.

Change-Id: I0b37fe74be764610696620f1fe296dc74e4806d7
2015-10-05 20:58:32 -04:00
Ronald S. Bultje
3461e8ce64 vp10: skip unreachable cat6 token extrabits.
We have historically added new bits to cat6 whenever we added a new
transform size (or bitdepth, for that matter). However, we have
always coded these new bits regardless of the actual transform size,
which means that for smaller transforms, we code bits that cannot
possibly be set. The coding (quality) impact of this is negligible,
but the bigger issue is that this allows creating bitstreams with
coefficient values that are nonsensical and can cause int overflows,
which then de facto become part of the bitstream spec. By not coding
these bits, we remove this possibility.

See issue 1065.

Change-Id: Ib3186eca2df6a7a15ddc60c8b55af182aadd964d
2015-10-05 20:58:32 -04:00
Ronald S. Bultje
d77a84bf52 vp10: remove superframe size field for last frame in superframe.
This is identical to what the tile size does for the last tile. See
issue 1042 (which covers generalizing the superframe/tile concepts).

Change-Id: I1f187d2e3b984e424e3b6d79201b8723069e1a50
2015-10-05 20:58:32 -04:00
Ronald S. Bultje
7460798ba5 vp10: use superframe marker index/size mechanism for tile size.
See issue 1042. Should provide slight bitstream savings in most cases
where tiles are being used.

Change-Id: Ie2808cf8ef30b3efe50804396900c4d63a3fa026
2015-10-05 20:58:32 -04:00
Ronald S. Bultje
612104bb8d vp10: extend range for delta Q values.
See issue 1051. 6 bits is fairly arbitrary but at least allows writing
delta Q values that are fairly normal in other codecs. I can extend to
8 if people want full range, although I personally don't have any need
for that.

Change-Id: I0a5a7c3d9b8eb3de4418430ab0e925d4a08cd7a0
2015-10-05 20:58:32 -04:00
jackychen
de53e6de49 Add the check of resolution in VP9 dynamic resizing.
The resolution check fixes an issue where resize_pending was reset
unnecessarily, causing a mismatch (not bit-exact) with the previous
one-step version.

Change-Id: I4e7660b3c8f34f59781e2e61ca30d61080c322de
2015-10-05 15:39:32 -07:00
Julia Robson
5e6533e707 SSE2 optimisation for quantize in high bit depth
When configured with high bit depth enabled, the 8bit quantize
function stopped using optimised code. This made 8bit content
decode slowly. This commit re-enables the SSE2 optimisation
(but not the SSSE3 optimisation).

Change-Id: Id015fe3c1c44580a4bff3f4bd985170f2806a9d9
2015-10-05 10:59:16 -07:00
Marco Paniconi
7777e7a8d5 Merge "Fix to denoiser with dynamic resize." 2015-10-05 14:14:35 +00:00
Marco Paniconi
3da6564f90 Merge "Stabilize the encoder buffer from going too negative." 2015-10-05 14:11:43 +00:00
Scott LaVarnway
23d1c06268 VPX: refactor vpx_idct32x32_1_add_sse2()
Change-Id: Ia1a2cac0e9dc05f3207b3433a6c1589fa7f2aee3
2015-10-05 06:33:42 -07:00
JackyChen
87b2495f95 Turn on two-steps scaling in VP9 encoder dynamic resizing.
First do a 3/4 scaling and then go down to 1/2 when necessary.

Change-Id: I5689c5228ca7e1606baea7f960eb24d0dab04d4d
2015-10-02 15:27:37 -07:00
Marco
86ede50943 Fix to denoiser with dynamic resize.
Temporary fix to denoiser when dynamic resizing is on.
 -Reallocate denoiser buffers on resized frame.
 -Force golden update on resized frame.
 -Don't denoise resized frame, and copy source into denoised buffers.

Change-Id: Ife7638173b76a1c49eac7da4f2a30c9c1f4e2000
2015-10-02 11:50:57 -07:00
Marco
37293583cd Stabilize the encoder buffer from going too negative.
For screen-content mode, with frame dropper off, put a limit
on how low encoder buffer can go.

Under hard slide changes, the buffer level can go too low and then
take a long time to come back up (in particular when frame-dropping
is not used), which will affect the active_worst and target frame size.

Change-Id: Ie9fca097e05cd71141f978ec687f852daf9de332
2015-10-02 11:07:59 -07:00
Ronald S. Bultje
ce3780251c vp10: make render_width/height referenceable through ref frames.
See issue 1079.

Change-Id: I754a61ec011c3508bbb26826cf8e11dbdfdd8379
2015-10-02 13:39:38 -04:00
Ronald S. Bultje
3fedf4a59b Merge "vp10: reimplement d45/4x4 to match vp8 instead of vp9." 2015-10-02 17:15:59 +00:00
Debargha Mukherjee
f18322262f Backports highbitdepth accelerations into vp10
Ports the changes in
https://chromium-review.googlesource.com/#/c/302372/3
into vp10.

Change-Id: I334c409f693691227ad16fc703c91899592dd8dc
2015-10-02 00:57:37 -07:00
Debargha Mukherjee
cb5c47f20d Merge "Accelerated transform in high bit depth" 2015-10-02 06:55:55 +00:00
Marco Paniconi
194b374bb6 Merge "Two-steps scaling in VP9 encoder dynamic resizing." 2015-10-02 03:20:22 +00:00
jackychen
ba06be3844 Two-steps scaling in VP9 encoder dynamic resizing.
Dynamic resizing now supports two-step scaling: first go down to
3/4 and then to 1/2. This feature is under a flag which controls the
switch between two-step scaling and one-step scaling (1/2 only).

Change-Id: I3a6c1d3d5668cf8e016a0a02aeca737565604a0f
2015-10-01 18:18:49 -07:00
hui su
06bdc7f6db Small cleanup
Change-Id: I5aeaa94b743f84738d288f8b027fec4c164f2ec3
2015-10-01 11:19:13 -07:00
Scott LaVarnway
dfeaaeb0ad Merge "VP9: remove plane_type from macroblockd_plane" 2015-10-01 17:31:10 +00:00
Ronald S. Bultje
62a1579525 vp10: reimplement d45/4x4 to match vp8 instead of vp9.
This is more a proof of concept than anything else. The problem here
isn't so much how to code it, but rather where to place the resulting
code. All intrapred DSP code lives in vpx_dsp, so do we want the vp10
specific intra pred functions to live there, or in vp10/?

See issue 1015.

Change-Id: I675f7badcc8e18fd99a9553910ecf3ddf81f0a05
2015-10-01 10:11:54 -04:00
Ronald S. Bultje
b1d85bf60f vp8: align left pixel array by 16 bytes.
The x86 simd expects this. Identical alignment can be found in vp9
and vp10 also. Fixes crashes on 32bit x86 systems.

Change-Id: I229c88d8f696acbef5337c8fa9503528df4e1c40
2015-10-01 10:11:54 -04:00
James Zern
20f43ddfde Merge "sixtap_predict_test: enable NEON tests" 2015-10-01 02:10:22 +00:00
Ronald S. Bultje
31498df5f0 Merge "vp8: change build_intra4x4_predictors() to use vpx_dsp." 2015-10-01 01:01:57 +00:00
Ronald S. Bultje
12238fe851 Merge "vp8: change build_intra_predictors_mbuv_s to use vpx_dsp." 2015-10-01 01:01:45 +00:00
Ronald S. Bultje
0462172ccf Merge "vp8: change build_intra_predictors_mby_s to use vpx_dsp." 2015-10-01 00:57:37 +00:00
Ronald S. Bultje
c26a9ecaa2 vp8: change build_intra4x4_predictors() to use vpx_dsp.
I've added a few new functions (d45e, d63e, he, ve) to cover the
filtered h/v 4x4 predictors that are vp8-specific, the "correct"
d45 with the correctly filtered bottom-right pixel (as opposed to
the unfiltered version in vp9), and the "broken" d63 with weirdly
filtered bottom-right pixels (which is correctly filtered in vp9).

There may be a minor performance impact on all systems because we
have to do an extra copy of the Above pixel array to incorporate
the topleft pixel in the same array (thus fitting the vpx_dsp API).
In addition, armv6 will have a more serious performance impact because
I removed the armv6/vp8-specific assembly. I'm not sure anyone
cares...

Change-Id: I7f9e5ebee11d8e21aca2cd517a69eefc181b2e86
2015-09-30 18:45:49 -04:00
Ronald S. Bultje
7cdcfee82c vp8: change build_intra_predictors_mbuv_s to use vpx_dsp.
Change-Id: I936c2430c3c5b1e0ab5dec0a20110525e925b5e4
2015-09-30 18:45:46 -04:00
Ronald S. Bultje
54d48955f6 vp8: change build_intra_predictors_mby_s to use vpx_dsp.
Change-Id: I2000820e0c04de2c975d370a0cf7145330289bb2
2015-09-30 18:45:40 -04:00
Scott LaVarnway
2f8625d824 VP9: remove plane_type from macroblockd_plane
Change-Id: Ia5072a3a92212d8565f33359f6c146469bdfbbec
2015-09-30 15:15:11 -07:00
Scott LaVarnway
13888e0eef Merge "VP9: remove plane_type checks in loopfilter functions" 2015-09-30 22:11:21 +00:00
James Zern
bdcfdebd68 Merge changes I264e75bf,Ifb0f41fb
* changes:
  vp9_loopfilter: remove unnecessary masks
  vp9_reset_lfm: harmonize function signature
2015-09-30 21:52:38 +00:00
James Zern
05c202a702 Merge changes I68c4f189,Ia5a752db
* changes:
  vp9_thread_test: clarify test case names
  vp9_thread_test: add non-frame-parallel files
2015-09-30 21:51:51 +00:00
James Zern
cd6d56e9a6 Merge "test/*.h: (windows) fix min/max conflict" 2015-09-30 19:55:36 +00:00
James Zern
a18cc591a5 vp9_loopfilter: remove unnecessary masks
Change-Id: I264e75bf3ddd083ee5311c50a37fb18fe634ddc3
2015-09-30 12:12:53 -07:00
James Zern
5d91201069 test/*.h: (windows) fix min/max conflict
define NOMINMAX to allow the std:: versions to be used; min/max will be
defined transitively via windows.h otherwise

Change-Id: I692b03fa3e70b7a53962d3fd209498f70f712fed
2015-09-29 23:03:26 -07:00
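
The pattern boils down to defining the macro before any windows.h include
(sketch; harmless on non-Windows builds):

    /* Keep windows.h from defining min()/max() macros so that names such as
     * std::min/std::max in the C++ test code remain usable. */
    #ifndef NOMINMAX
    #define NOMINMAX
    #endif
    #if defined(_WIN32)
    #include <windows.h>
    #endif
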
James Zern
a1914dbb31 vp9_reset_lfm: harmonize function signature
Change-Id: Ifb0f41fb43564a777be29b4c66443b366fa146a3
2015-09-29 20:46:37 -07:00
Alex Converse
aeae7fc903 Change dynamic_cast to static_cast to fix no-rtti build
Change-Id: Iad73b490b171cdda5c368ada69fb8eab2a86c156
2015-09-29 18:49:21 -07:00
Alex Converse
d2a953e02b Merge "Add a test for the interaction between active map and cyclic referesh." 2015-09-30 01:20:30 +00:00
Scott LaVarnway
18373264d9 VP9: remove plane_type checks in loopfilter functions
vp9_filter_block_plane_ss11() and vp9_filter_block_plane_non420()
are only called for the uv planes.

Change-Id: Iacd3b3242c8ce581edd37c8f06d95efc8a0f88a3
2015-09-29 15:54:33 -07:00
Scott LaVarnway
66de2b710f Merge "VP9: move loopfilter build masks to decode loop" 2015-09-29 21:40:48 +00:00
Tom Finegan
388a807e49 Merge "vpxenc: Allow non i420 input for VP10." 2015-09-29 18:56:21 +00:00
Marco Paniconi
0ca0a536f5 Merge "aq-mode for SVC: Add consec_zero_mv to layer context." 2015-09-29 17:47:39 +00:00
Tom Finegan
ed0d9dc836 vpxenc: Allow non i420 input for VP10.
BUG=https://code.google.com/p/webm/issues/detail?id=1066

Change-Id: I3bd26a516ef3d2742c523af570f639f9312df6df
2015-09-29 10:45:00 -07:00
Yaowu Xu
08ae94404f Merge "Fix a macro definition" 2015-09-29 17:22:49 +00:00
Tom Finegan
c0e2b5f473 Merge "build/make/iosbuild.sh: Remove jobs argument." 2015-09-29 17:08:55 +00:00
Marco
c05c58f8ff aq-mode for SVC: Add consec_zero_mv to layer context.
Change-Id: I63fadf1c7240d4b2893384f75c519311e9659d47
2015-09-29 10:01:53 -07:00
Yaowu Xu
45948a03c0 Fix a macro definition
to be consistent with the header file name.

Change-Id: I9634332a2b3fac7e7f3b7ef58821ea7c81c5c813
2015-09-29 09:34:42 -07:00
Scott LaVarnway
7718117104 VP9: move loopfilter build masks to decode loop
The loopfilter masks are now built in the decode loop.
This is done so we can eventually reduce the number of
MODE_INFO structs required by the decoder.

The encoder builds the masks for the entire frame prior
to calling the loopfilter.

Change-Id: Ia2146b07e0acb8c50203e586dfae0c4c5b316f11
2015-09-29 05:20:49 -07:00
Julia Robson
406030d1b0 Accelerated transform in high bit depth
When configured with high bitdepth enabled, the 8bit transform
stopped using optimised code. This made 8bit content decode slowly.

Change-Id: I67d91f9b212921d5320f949fc0a0d3f32f90c0ea
2015-09-28 21:09:16 -07:00
Marco Paniconi
7d28d12ef3 Merge "VP8: Update rate correction factor for drop_overshoot feature." 2015-09-28 19:53:10 +00:00
Marco
bd3088fd56 VP8: Update rate correction factor for drop_overshoot feature.
Update rate correction factor when we drop the frame due to overshoot.
This only applies when the drop_overshoot feature is on: screen_content_mode = 2.

Change-Id: I67e24de979b4c74744151d2ceb3cd75fec2a1e7a
2015-09-28 12:11:33 -07:00
Angie Chiang
e40a448e45 Merge "comment out fdct32" 2015-09-28 17:26:22 +00:00
Ronald S. Bultje
cc5dd3ec10 Merge "vp9/10: improve support for render_width/height." 2015-09-28 16:25:28 +00:00
Ronald S. Bultje
3db5721e21 Merge "Rename display_{size,width,height} to render_*." 2015-09-28 16:25:20 +00:00
Ronald S. Bultje
7238492235 Merge "vp10: code reference_mode in uncompressed header." 2015-09-28 16:23:11 +00:00
Ronald S. Bultje
2e3aa0587c Merge "vp10: split UV int4x4 loopfilter flag in one for each covered edge." 2015-09-28 16:23:00 +00:00
Ronald S. Bultje
812945a8f1 vp9/10: improve support for render_width/height.
In the decoder, map this to the output variable vpx_image_t.r_w/h.
This is intended as an improved version of VP9D_GET_DISPLAY_SIZE,
which doesn't work with parallel frame decoding. In the encoder,
map this to a codec control func (VP9E_SET_RENDER_SIZE) that takes
a w/h pair argument in an int[2] (identical to VP9D_GET_DISPLAY_SIZE).

Also add render_size to the encoder_param_get_to_decoder unit test.

See issue 1030.

Change-Id: I12124c13602d832bf4c44090db08c1009c94c7e8
2015-09-25 22:18:22 -04:00
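
An encoder-side usage sketch of the control described above (example values;
error handling omitted):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* The render size is passed as a width/height pair in an int[2],
     * mirroring the decoder-side VP9D_GET_DISPLAY_SIZE convention. */
    static vpx_codec_err_t set_render_size(vpx_codec_ctx_t *encoder) {
      int render_size[2] = { 1920, 1080 };  /* example render target size */
      return vpx_codec_control(encoder, VP9E_SET_RENDER_SIZE, render_size);
    }
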
James Zern
db2056f341 Merge "vp9/10 encoder: prevent NULL access on failure" 2015-09-26 01:52:52 +00:00
Ronald S. Bultje
36ffe64498 Rename display_{size,width,height} to render_*.
The name "display_*" (or "d_*") is used for non-compatible information
(that is, the cropped frame dimensions in pixels, as opposed to the
intended screen rendering surface size). Therefore, continuing to use
display_* would be confusing to end users. Instead, rename the field
to render_*, so that struct vpx_image can include it.

Change-Id: Iab8d2eae96492b71c4ea60c4bce8121cb2a1fe2d
2015-09-25 21:34:29 -04:00
Ronald S. Bultje
fcd6414e77 Merge "vp10: remove MACROBLOCK.{highbd_,}itxfm_add function pointer." 2015-09-26 01:20:14 +00:00
Ronald S. Bultje
690f662e26 Merge "vp10: remove MACROBLOCK.fwd_txm4x4 function pointer." 2015-09-26 01:19:49 +00:00
Angie Chiang
6a382101dd comment out fdct32
comment out fdct32
remove fdct32 test

Change-Id: I31c47fb435377465cd3265e39621ca50d3aae656
2015-09-25 18:18:27 -07:00
Ronald S. Bultje
8979e9e387 vp10: code reference_mode in uncompressed header.
See issue 1041 point 2.

Change-Id: I6fc6427b1a0edff828e39d43428e3271491f8ac5
2015-09-25 20:32:14 -04:00
Ronald S. Bultje
034c28b0a4 vp10: split UV int4x4 loopfilter flag in one for each covered edge.
In practice, this fixes the issue that if you have an odd number of
mi_cols, on the full right of the image, the UV int4x4 loopfilter
will be skipped over odd cols as well as odd rows (because it holds a
single variable for both edges).

See issue 1016.

Change-Id: Id53b501cbff9323a8239ed4775ae01fe91874b7e
2015-09-25 20:25:10 -04:00
James Zern
b945a532e5 Merge "Revert "remove static from fdct4/8/16/32"" 2015-09-26 00:12:43 +00:00
Ronald S. Bultje
bab8d38f7f vp10: remove MACROBLOCK.{highbd_,}itxfm_add function pointer.
This is preparatory work for allowing per-segment lossless coding.

See issue 1035.

Change-Id: I9487d02717ee3e766aee61a487780056bb35d2d3
2015-09-25 19:30:46 -04:00
Ronald S. Bultje
c74b33a413 vp10: remove MACROBLOCK.fwd_txm4x4 function pointer.
This is preparatory work for allowing per-segment lossless coding.

See issue 1035.

Change-Id: Idd72e2a42d90fa7319c10122032d1a7c7a54dc05
2015-09-25 19:30:46 -04:00
Tom Finegan
c6a419b490 build/make/iosbuild.sh: Remove jobs argument.
This can be handled via MAKEFLAGS.

Change-Id: I3a58a8a41f6570cb3b80c7c97e51735b82bf4ec9
2015-09-25 15:18:17 -07:00
Tom Finegan
7602232642 Merge "build/make/configure.sh: Embed bitcode in arm darwin targets." 2015-09-25 22:14:38 +00:00
Alex Converse
35fb3441f8 Add a test for the interaction between active map and cyclic referesh.
Fails with Icac63051bf37c7355e661837b57c257d58c764fc reverted.

Change-Id: I460d7a5a74faa4daace25f911f8dc5f68e16c951
2015-09-25 13:04:00 -07:00
James Zern
e7c949d32d Merge "vp9/10 decoder_remove: check pbi pointer" 2015-09-25 19:31:07 +00:00
Marco Paniconi
040395b944 Merge "VP8: Adjust rate correction factor for drop due to overshoot." 2015-09-25 18:59:58 +00:00
Marco
3f7656cc23 Limit cyclic refresh on steady background blocks.
Use the existing QP condition for limiting cyclic refresh, and add an
additional condition that the block has been encoded with zero/small
motion for x frames in a row (where x is at least several times the
refresh period). The additional condition only affects non-screen
content mode.

This helps to improve visual stability for noisy input, where on steady
background areas the application of delta_qp may lead to encoding the noise.

Also added a change to use the true skip (after encoding) to update the
last QP.

Change-Id: I234a1128d017d284cf767fdb58ef6c59d809f679
2015-09-25 10:40:35 -07:00
James Zern
7e54f0fe4b Merge "configure: reference the README for missing yasm" 2015-09-25 03:22:15 +00:00
James Zern
921c347ef6 vp9/10 decoder_remove: check pbi pointer
fixes crash on error

Change-Id: Ibb1ef5565fb833cdee1a49335473d98f1187ef43
2015-09-24 19:51:14 -07:00
Jacky Chen
ee72b6915e Merge "Change size on first frame and change config cause crash." 2015-09-25 01:04:07 +00:00
Marco
ece841f03f VP8: Adjust rate correction factor for drop due to overshoot.
Change-Id: Id70ca2e18a46247720eb631ae13a8430bd8b0954
2015-09-24 16:40:29 -07:00
Tom Finegan
9194f3c0cb build/make/configure.sh: Embed bitcode in arm darwin targets.
When the iOS SDK major version is 9 or higher:
- Pass -fembed-bitcode to compiler, assembler, and linker.
- Add a warning for simulator targets since yasm doesn't know
  what -fembed-bitcode means, and exits with an error.

BUG=https://code.google.com/p/webm/issues/detail?id=1075

Change-Id: I38c997a0225e53c5dd1b4ddf7935d21362953f76
2015-09-24 15:11:15 -07:00
Tom Finegan
20b770eecd Merge "build/make/configure.sh: Silence arm target Xcode7 link warnings." 2015-09-24 18:44:32 +00:00
Tom Finegan
4327d50904 Merge "build/make/configure.sh: Fix armv7 builds in Xcode7." 2015-09-24 18:44:23 +00:00
Tom Finegan
324bcbfaed build/make/configure.sh: Silence arm target Xcode7 link warnings.
Always add IOS_VERSION_MIN to darwin arm cflags. The warning occurred
because the default (9.0) does not match the value set by configure
(6.0).

BUG=https://code.google.com/p/webm/issues/detail?id=1075

Change-Id: Ia9085ceeca10e057f9eb781c14f07581bb6280a5
2015-09-23 18:42:32 -07:00
Tom Finegan
6cf994b924 build/make/configure.sh: Fix armv7 builds in Xcode7.
- Use the iphoneos SDK path (instead of macosx).
- Detect iOS SDK major version and disable media (armv6) when using
  iOS SDK version 9 or higher.

BUG=https://code.google.com/p/webm/issues/detail?id=1075

Change-Id: I12f77dbeee4c0084e8322f6841813da8b5e91c16
2015-09-23 18:42:21 -07:00
Tom Finegan
6002212d2b build/make/configure.sh: docs for soft_{dis|en}able.
Add function comments explaining what the functions do and do not do.

Change-Id: I23dea09f93bc5cdbea6a0077f90683a1df2f74dc
2015-09-23 18:34:40 -07:00
James Zern
078312979e vp9_thread_test: clarify test case names
rename Decode[2-4] to something more precise

Change-Id: I68c4f189796eb11ac1a5b7b682f24efb71708187
2015-09-23 18:31:36 -07:00
James Zern
f8a5ab5257 vp9_thread_test: add non-frame-parallel files
these have been supported in tile-threaded decoding since:
b3b7645 vp9_dthread: remove frame_parallel_decoding_mode requirement

Change-Id: Ia5a752db9be937153cf4830d9258752136356d1b
2015-09-23 18:31:35 -07:00
James Zern
cf8f6559ce vp9/10 encoder: prevent NULL access on failure
Change-Id: I1fc8e0b3d48675cd5428b7b36f7cc28ab32cbf71
2015-09-23 17:55:51 -07:00
James Zern
f3627c82d0 configure: reference the README for missing yasm
Change-Id: I2ad799901385011764affadeaddcc271df21509f
2015-09-23 17:51:42 -07:00
James Zern
e7c8b71a86 Revert "remove static from fdct4/8/16/32"
This reverts commit 8903b9fa83.

there is no reason for these to be global

Change-Id: I66a31c06f8426aeca348ef12d9b9ab59d6d5e55d
2015-09-23 17:45:57 -07:00
James Zern
af631e1f19 Merge "VP9: Remove frame_parallel_decoding_mode from macroblockd" 2015-09-24 00:33:16 +00:00
Marco Paniconi
30bd74cf74 Merge "Non-rd mode: Limit transform size for intra to 16x16." 2015-09-24 00:12:02 +00:00
Scott LaVarnway
5404978825 VP9: Remove frame_parallel_decoding_mode from macroblockd
Not used.

Change-Id: I71527d0ee43a5730f1a2527e7ab687a77a137db4
2015-09-23 16:06:46 -07:00
Hui Su
d5683faab9 Merge "Adjust rd calculation in choose_tx_size_from_rd" 2015-09-23 21:39:43 +00:00
Marco
9b51b3a9ca Adjust rate-boost threshold in cyclic refresh for seg#2.
Small gain in metrics (average ~0.2dB), small
reduction in rate fluctuation.

Change-Id: Id75bd89c168486f075308fb474ebd26e3bdfb85b
2015-09-23 11:52:55 -07:00
Marco
01860f6fe4 Non-rd mode: Limit transform size for intra to 16x16.
Limit transform size for intra to 16x16, for non-screen content mode.
Little/no change in speed or metrics.
The 32x32 intra block is rarely selected in the RTC (non-screen content) case,
but some visual improvement can be seen in some examples,
e.g., captured_video_dark_whd.yuv.

Change-Id: I68e2db87875343b3fb9bb407a7709f0088f84072
2015-09-23 10:59:24 -07:00
hui su
38cc168822 Adjust rd calculation in choose_tx_size_from_rd
Coding gain:
derflr 0.142%
hevclr 0.153%
hevcmr 0.124%

Change-Id: I63b56ae3a9002c3a266e10e2964135ed43b0ba53
2015-09-23 10:54:28 -07:00
Johann
90a109f0ee Restrict get_msb inputs
Add a warning and assert that inputs for get_msb must not be zero.

Change-Id: I8c6f289ff13248f6e3a8bc24aab3712ed33022a6
2015-09-22 00:24:01 +00:00
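
A sketch of the constrained helper (the libvpx version is built on
__builtin_clz; the portable branch here is for illustration only):

    #include <assert.h>

    /* Index of the most significant set bit; undefined for n == 0, hence the
     * assert added by the change above. */
    static int get_msb_sketch(unsigned int n) {
      assert(n != 0);
    #if defined(__GNUC__) || defined(__clang__)
      return 31 ^ __builtin_clz(n);
    #else
      {
        int msb = 0;
        while (n >>= 1) ++msb;
        return msb;
      }
    #endif
    }
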
Angie Chiang
36c4e8b27a Merge "remove static from fdct4/8/16/32" 2015-09-21 23:25:26 +00:00
Johann Koenig
90889b9a45 Merge "Remove vpx_filter_block1d16_v8_intrin_ssse3" 2015-09-21 19:17:18 +00:00
Angie Chiang
8903b9fa83 remove static from fdct4/8/16/32
remove static from fdct4/8/16/32 in vp10/encoder/dct.c
add prefix vp10_ to fdct4/8/16/32
add vp10/encoder/dct.h

Change-Id: I644827a191c1a7761850ec0b1da705638b618c66
2015-09-21 11:49:10 -07:00
Marco Paniconi
ce2b56cd69 Merge "Non-rd pickmode: Don't skip checking zeromv-last mode." 2015-09-21 18:26:28 +00:00
jackychen
55f092db09 Change size on first frame and change config cause crash.
Reallocation of mi buffer fails if change size on the first frame and
change config in subsequent frames. Add a condition for resolution
check to avoid assertion failure.

BUG=1074

Change-Id: Ie26ed816a57fa871ba27a72db9805baaaeaba9f3
2015-09-21 10:57:05 -07:00
Marco
38ad2dcea6 Non-rd pickmode: Don't skip checking zeromv-last mode.
Reference frame masking logic may skip checking zeromv-last mode.
Fix to avoid this and make sure zero-last is always checked.

No noticeable change in speed, and PSNR/SSIM metrics on RTC set overall
neutral (very small gain ~0.02).
Small visual improvement on a few RTC clips.

Change-Id: I26eacdc449126424001a4a64e5ac31949f064417
2015-09-21 10:32:23 -07:00
Jingning Han
67ec82a262 Merge "Create sub8x8 block inter prediction function" 2015-09-21 16:13:37 +00:00
James Zern
571b7c978e vp9_end_to_end_test: disable vp10 w/high bitdepth
the range check in dct.c (abs(input[i]) < (1 << bit)) will fail in many
cases. this was broken at the time this check was added

BUG=1076

Change-Id: I3df8c7a555e95567d73ac16acda997096ab8d6e2
2015-09-19 09:14:18 -07:00
Jingning Han
d6be2671ed Create sub8x8 block inter prediction function
Change-Id: Ib161e6fb3eb081f7176a1d969fed16a7d1ffc320
2015-09-18 16:31:36 -07:00
James Zern
57694362e0 Merge "configure: add --extra-cxxflags option" 2015-09-18 23:18:10 +00:00
Johann
dd4f953350 Remove vpx_filter_block1d16_v8_intrin_ssse3
This was rewritten and moved to vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
in 195883023b

Change-Id: I117ce983dae12006e302679ba7f175573dd9e874
2015-09-18 16:05:43 -07:00
Tom Finegan
cd82f69823 Merge "iosbuild: Enable PIC for x86 targets." 2015-09-18 19:38:55 +00:00
Tom Finegan
9a8fe58caf Merge "iosbuild: Add --test-link argument." 2015-09-18 19:38:45 +00:00
James Zern
e00470aef8 vp9_arf_freq_test: disable vp10 w/high bitdepth
the range check in dct.c (abs(input[i]) < (1 << bit)) will fail in the
25-29 range. this was broken at the time this check was added

Change-Id: I8ca9607f6cbdc8be7f47696ffeabbab3ac5727e2
2015-09-17 20:17:35 -07:00
Jingning Han
48b8023ef0 Merge "Refactor mbmi_ext structure" 2015-09-18 00:49:14 +00:00
Tom Finegan
69ceed8e3a iosbuild: Enable PIC for x86 targets.
Change-Id: I03b1e8f983f8cd87519aefda732210359b319c81
2015-09-17 16:22:07 -07:00
Tom Finegan
01276f4453 iosbuild: Add --test-link argument.
Shortcut arg for --extra-configure-args --enable-examples. Enables
the examples, and thus ensures that all versions of libvpx that
iosbuild.sh produces can actually be linked.

Change-Id: I2ddda094361bf0ac77f8d2ae542e4dc7b2cab158
2015-09-17 16:21:22 -07:00
Marco Paniconi
e12ec3615c Merge "Add SVC codec control to set frame flags and buffer indices." 2015-09-17 22:29:07 +00:00
James Zern
9d8decc162 Merge changes from topic 'tile-thread-cleanup'
* changes:
  vp9/decode_tiles_mt: move frame count accum from loop
  VP9Decoder: remove duplicate tile_worker_info
  vp9/decode_tiles_mt: move some inits from inner loop
  vp9_accumulate_frame_counts: pass counts directly
2015-09-17 22:00:23 +00:00
James Zern
e665d0bdd9 Merge "vpx_subpixel_8t_ssse3: fix reg counts/access" 2015-09-17 21:31:14 +00:00
James Zern
683b5a3161 vpx_subpixel_8t_ssse3: fix reg counts/access
fixes build on windows x64; previously 'heightq', i.e. the 64-bit register,
was accessed when only the 32-bit value was needed. given this comes from a
stack variable, the upper bits were undefined.

+ bump register/xmm counts; users of SETUP_LOCAL_VARS touch xmm13 in
64-bit builds and filter_block1d16_v* uses one extra temp variable

Change-Id: I9c768c0b2047481d1d3b11c2e16b2f8de6eb0d80
2015-09-17 12:27:34 -07:00
Jingning Han
c3bf837572 Refactor mbmi_ext structure
This commit removes mbmi_ext_base pointer from MACROBLOCK struct.
Its use case can be fully covered by cpi->mbmi_ext_base pointer.

Change-Id: I155351609336cf5b6145ed13c21b105052727f30
2015-09-17 09:51:45 -07:00
Marco
730cdefd3e Add SVC codec control to set frame flags and buffer indices.
Add SVC codec control to set the frame flags and buffer indices
for each spatial layer of the current (super)frame to be encoded.
This allows the application to set (and change on the fly) the
reference frame configuration for spatial layers.

Added an example layer pattern (spatial and temporal layers)
in vp9_spatial_svc_encoder for the bypass_mode using the new control.

Change-Id: I05f941897cae13fb9275b939d11f93941cb73bee
2015-09-17 09:37:15 -07:00
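A hedged usage sketch of the new control: the struct and field names below
appear in the vp9_spatial_svc_encoder changes further down in this diff, but
the control enum name is an assumption based on the feature description, not
a verbatim API reference.

  #include <string.h>
  #include "vpx/vp8cx.h"
  #include "vpx/vpx_encoder.h"

  /* Restrict spatial layer 0 to referencing and updating only the LAST
   * buffer, then hand the per-layer configuration to the encoder before
   * encoding the superframe. */
  static void set_layer0_refs(vpx_codec_ctx_t *codec) {
    vpx_svc_ref_frame_config_t ref_cfg;
    memset(&ref_cfg, 0, sizeof(ref_cfg));
    ref_cfg.frame_flags[0] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
                             VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
    ref_cfg.lst_fb_idx[0] = 0;
    ref_cfg.gld_fb_idx[0] = 0;
    ref_cfg.alt_fb_idx[0] = 0;
    vpx_codec_control(codec, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_cfg);
  }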
Ronald S. Bultje
50f944272c vp10: do sub8x8 block reconstruction in full subblocks.
This means that we don't reconstruct in 4x4 dimensions, but in
blocksize dimensions, e.g. 4x8 or 8x4. This may in some cases lead
to performance improvements. Also, if we decide to re-introduce
scalable coding support, this would fix the fact that you need to
re-scale the MV halfway through the block in sub8x8 non-4x4 blocks.

See issue 1013.

Change-Id: If39c890cad20dff96635720d8c75b910cafac495
2015-09-16 19:35:54 -04:00
Ronald S. Bultje
ed29c2f945 vp10: fix 4:2:2 chroma MVs for 8x4/4x4 blocks.
In vp9, the bottom MV would be the average of the topright and
bottomleft luma MV (instead of the bottomleft/bottomright luma MV).

See issue 993.

Change-Id: Ic91c0b195950e7b32fc26c84c04788a09321e391
2015-09-16 19:35:54 -04:00
Ronald S. Bultje
43be86dbff vp10: remove double MV value check.
This has virtually no effect on coding efficiency, but it is more
logical from a theoretical perspective (since it makes no sense to
me that you would exclude an MV from a list just because its sign-
inversed value is identical to a value already in the list), and it
also makes the code simpler (it removes a duplicate value check in
cases where signbias is equal between the two MVs being compared).

See issue 662.

Change-Id: I23e607c6de150b9f11d1372fb2868b813c322d37
2015-09-16 19:35:53 -04:00
Ronald S. Bultje
00a203b7bc vp10: move coding of tx_mode element to the non-arithcoded header.
See issue 1040 point 3.

Change-Id: If051b92c24a34d6a39861fb7d7180c5ca32f3d82
2015-09-16 19:35:53 -04:00
Ronald S. Bultje
a3df343cda vp10: code sign bit before absolute value in non-arithcoded header.
For reading, this makes the operation branchless, although it still
requires two shifts. For writing, this makes the operation as fast
as writing an unsigned value, branchlessly. This is also how other
codecs typically code signed, non-arithcoded bitstream elements.

See issue 1039.

Change-Id: I6a8182cc88a16842fb431688c38f6b52d7f24ead
2015-09-16 19:35:03 -04:00
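A minimal sketch of the branchless read described above, assuming the value
arrives as a (bits + 1)-bit field with the sign bit first; the names are
illustrative and this is not the exact libvpx routine.

  #include <stdio.h>

  /* Sign-extend the raw (bits + 1)-bit field with two shifts; the arithmetic
   * right shift propagates the sign bit, so no branch on the sign is needed. */
  static int inv_signed_from_field(unsigned field, int bits) {
    const int shift = (int)(sizeof(int) * 8) - (bits + 1);
    return ((int)(field << shift)) >> shift;
  }

  int main(void) {
    printf("%d\n", inv_signed_from_field(0x13, 4));  /* 5-bit 10011 -> -13 */
    return 0;
  }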
Ronald S. Bultje
3c8e04e939 Merge "vp10: don't reset contextual skip flag if block has no coefficients." 2015-09-16 20:55:14 +00:00
Ronald S. Bultje
623279169a Merge "Add support for color-range." 2015-09-16 20:26:10 +00:00
Jacky Chen
c21ce82832 Merge "VP9 dynamic resizing unit test with bitrate change." 2015-09-16 16:55:14 +00:00
Ronald S. Bultje
a5d930e464 vp10: don't reset contextual skip flag if block has no coefficients.
The implicitly changed value would be used for contextualizing future
skip flags of neighbour blocks (bottom/right), which is certainly not
what was intended. The original code stems from vp8, and was useful
in cases where coding of the skip flag was disabled. In vp9, the skip
flag is always coded. The result of this change is that for bitstream
parsing purposes, decoding of the skip flag becomes independent of
decoding of block coefficients.

See issue 1014.

Change-Id: I8629e6abe76f7c1d649f28cd6fe22a675ce4a15d
2015-09-16 06:41:51 -04:00
Ronald S. Bultje
eeb5ef0a24 Add support for color-range.
In decoder, export (eventually) into vpx_image_t.range field. In
encoder, use oxcf->color_range to set it (same way as for
color_space).

See issue 1059.

Change-Id: Ieabbb2a785fa58cc4044bd54eee66f328f3906ce
2015-09-16 06:41:46 -04:00
Ronald S. Bultje
e562c71783 Merge "vp10: fix entropy counts for the hp bit." 2015-09-16 01:53:44 +00:00
jackychen
ca8f8fd304 VP9 dynamic resizing unit test with bitrate change.
Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
Start at low target bitrate, raise the bitrate in the middle of the
clip, and verify that scaling up does occur after the bitrate change.

Change-Id: I7ad8c9a4c8288387d897dd6bdda592f142d8870c
2015-09-15 18:03:26 -07:00
Angie Chiang
8c1dce86e8 Merge "fix implicit declaration" 2015-09-16 00:20:43 +00:00
James Zern
c667593e1e Merge changes from topic 'fix-vp9-bitstream-test'
* changes:
  vp9_encoder_parms_get_to_decoder: cosmetics
  vp9...parms_get_to_decoder: remove unneeded func
  vp9...parms_get_to_decoder: fix EXPECT param order
  vp9_encoder_parms_get_to_decoder: delete dead code
  fix BitstreamParms test
  vp9_encoder_parms_get_to_decoder: remove vp10
  yuvconfig2image(): add explicit cast to avoid conv warning
  vp9/10 decoder_init: add missing alloc cast
  vp9/10: set color_space on preview frame
  vp10: add extern "C" to headers
  vp9: add extern "C" to headers
2015-09-15 23:14:34 +00:00
Marco Paniconi
9c82fc457e Merge "VP9 dynamic resizing unit test." 2015-09-15 22:26:03 +00:00
Marco Paniconi
f6097ef243 Merge "SVC fix to set worst/best_quality per layer." 2015-09-15 22:06:13 +00:00
jackychen
9ac42bc15c VP9 dynamic resizing unit test.
Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
Run at low bitrate, with resize_allowed = 1, and verify that we get
one resize down event.

Change-Id: Ic347be60972fa87f7d68310da2a055679788929d
2015-09-15 14:36:55 -07:00
Marco
15c43d9ac7 SVC fix to set worst/best_quality per layer.
Allow the worst/best_quality to be set per layer via the
VP9E_SET_SVC_PARAMETERS control.

Change-Id: Icba5ec8ac757152f3bb7860d6010d9174a7bd578
2015-09-15 14:16:07 -07:00
Marco
eb53c69ece Add cyclic refresh parameters to svc-layer context.
For 1 pass CBR spatial-SVC:
Add cyclic refresh parameters to the svc-layer context.

This allows cyclic refresh (aq-mode=3) to be applied to
the whole super-frame (all spatial layers).
This gives a performance improvement for spatial layer encoding.

Add the aq_mode on/off setting as a command line option.

Change-Id: Ib9c3b5ba3cb7851bfb8c37d4f911664bef38e165
2015-09-15 10:06:36 -07:00
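As the vp9_spatial_svc_encoder diff further down shows, the new setting is
exposed as an -aq / --aqmode argument and, together with speed >= 5, appears
to gate enabling aq-mode 3 on the encoder; a hedged invocation would simply
add "-aq 1" to the existing example encoder command line.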
Debargha Mukherjee
0e1b4fb941 Fix two pass svc encoding
Fixes temporal scalability. Updates were inadvertently turned
off for two-pass SVC, causing crashes due to gf_group.index
growing unchecked.

Change-Id: Iff759946bf61bbde70630347cc8fa4d51a8c2d2f
2015-09-15 06:11:24 -07:00
Yaowu Xu
e723e36da6 Merge "Remove leftover of "frame_parallel_decoding"" 2015-09-15 02:24:36 +00:00
Yaowu Xu
ee825f9372 Remove leftover of "frame_parallel_decoding"
The variable was removed by a previous commit, but this
instance was missed.

Change-Id: Ia34474b0be4945cc6cb9191f0d7cd24a99a4c22e
2015-09-14 18:10:08 -07:00
jackychen
419456617e Change parameters for VP9 dynamic resizing.
Use a smaller window in dynamic resizing and wait a shorter
time after a key frame.

Change-Id: I086f840cdec3c6bdaa9acfe11346d919e445973d
2015-09-14 16:17:52 -07:00
Alex Converse
0b762e0c0c Merge "CR: Don't attempt to read qindex_delta for segments CR is unaware of." 2015-09-14 22:59:24 +00:00
Alex Converse
575e81f7c9 CR: Don't attempt to read qindex_delta for segments CR is unaware of.
Found by the remoting VideoEncoderVpxTest.Vp9LossyUnchangedFrame unit
test under asan.

Change-Id: Icac63051bf37c7355e661837b57c257d58c764fc
2015-09-14 13:55:30 -07:00
Marco Paniconi
bb581f4e83 Merge "For 1 pass: always use the normative filter in vp9_scale_if_required()" 2015-09-14 20:36:34 +00:00
Ronald S. Bultje
1e9e9ce2dc vp10: fix entropy counts for the hp bit.
The counts didn't take usehp into account, which means that if the
scope of the refmv is too large for the hp bit to be coded, the value
(always 1) is still included in the stats. Therefore, the final
counts will not reflect the entropy of the coded bits, but rather the
entropy of the combination of coded bits and the implied value (which
is always 1). Fix that by only including counts if the hp bit is
actually coded.

See issue 1060.

Change-Id: I19a3adda4a8662a05f08a9e58d7e56ff979be11e
2015-09-14 16:13:59 -04:00
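A minimal sketch of the counting fix, with assumed type and field names
rather than the verbatim vp10 structures:

  typedef struct {
    unsigned int hp[2];  /* stats for the high-precision bit */
  } hp_counts;

  /* Only fold the hp bit into the stats when it was actually coded; when
   * high precision is disallowed for this reference MV the bit is implied
   * and contributes no entropy. */
  static void count_hp_bit(hp_counts *c, int hp_bit, int usehp) {
    if (usehp) ++c->hp[hp_bit];
  }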
Marco
4d1424faf9 For 1 pass: always use the normative filter in vp9_scale_if_required()
The normative (convolve8) filter is optimized/faster than
the non-normative one. Pass the choice of scaler (normative/non-normative)
to vp9_scale_if_required(), and always use the normative one for 1 pass.

Change-Id: I2b71d9ff18b3c7499b058d1325a9554de993dd52
2015-09-14 13:13:32 -07:00
Ronald S. Bultje
48f0168e95 Merge "vp10: merge frame_parallel_decoding_mode and refresh_frame_context." 2015-09-14 18:24:24 +00:00
James Zern
12355c4c4c configure: add --extra-cxxflags option
same usage as --extra-cflags

Change-Id: Iff2ed7b8ebb6e51610ee0851aeec08413367ab23
2015-09-12 10:25:28 -07:00
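A hedged usage example: ./configure --extra-cxxflags="-std=c++11" appends the
flag to CXXFLAGS (the particular flag here is illustrative), exactly as
--extra-cflags does for CFLAGS; per the configure hunk further down, the
build dies if the compiler rejects the requested flags.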
Angie Chiang
fe776ce61f add range_check for fdct in vp10
Unify the style of fdct4() fdct8() fdct16()
Add fdct32()
Add range_check() at each stage
Add unit test at ../../test/vp10_dct_test.cc

Change-Id: I13f76d9046c3ea473c82024b09a5bc8662e2c28e
2015-09-12 03:26:09 +00:00
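A minimal sketch of the kind of per-stage check described above, assuming a
32-bit intermediate coefficient type; symbol names are illustrative rather
than the exact vp10 helpers.

  #include <assert.h>
  #include <stdlib.h>

  typedef int tran_low_t;  /* assumption: 32-bit intermediate coefficients */

  /* After each butterfly stage, every intermediate value must stay below
   * 1 << bit in magnitude (compare the abs(input[i]) < (1 << bit) check
   * referenced by the test commits above). */
  static void range_check(const tran_low_t *input, int size, int bit) {
    int i;
    for (i = 0; i < size; ++i) assert(abs(input[i]) < (1 << bit));
  }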
James Zern
9759c3d542 third_party/libwebm: pull from upstream.
Upstream hash: 476366249e1fda7710a389cd41c57db42305e0d4

Changes from upstream since last update:
4763662 mkvparser: fix type warnings
267f71c mkvparser: SafeArrayAlloc fix type warning
f1a99d5 mkvparser: s/LONG_LONG_MAX/LLONG_MAX/ for compatibility
bff1aa5 mkvparser: add msvc compatibility for isnan/isinf

Change-Id: Ie0375e564fc74b3b296744d0039830d2f77b83b6
2015-09-11 19:02:24 -07:00
Ronald S. Bultje
d1474f02aa vp10: merge frame_parallel_decoding_mode and refresh_frame_context.
See issue 1030. The value of frame_parallel_decoding_mode was ignored
in vp9 if refresh_frame_context was 0, so instead make it a 3-member
enum where the dependency is obviously stated.

Change-Id: I37f0177e5759f54e2e6cc6217023d5681de92438
2015-09-11 19:33:46 -04:00
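A sketch of the three-member enum the message describes; the member names are
a best reading of the vp10 change and should be treated as illustrative.

  typedef enum {
    REFRESH_FRAME_CONTEXT_NONE,      /* old refresh_frame_context = 0 */
    REFRESH_FRAME_CONTEXT_FORWARD,   /* refresh = 1, frame-parallel mode:
                                        update from signalled header values */
    REFRESH_FRAME_CONTEXT_BACKWARD   /* refresh = 1, non-parallel mode:
                                        update from counts gathered while
                                        decoding the frame */
  } REFRESH_FRAME_CONTEXT_MODE;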
Ronald S. Bultje
c92c50f2fe vpxdec: remove implied --output-bit-depth=8 for --yv12.
Change-Id: I28c939db49334572476aa2b428ec93111d4e869d
2015-09-11 19:33:45 -04:00
Ronald S. Bultje
ef73bbf778 vp10: remove duplicate frame_parallel_decode field.
Keep the one in VP10_COMMON in favour of the one in VP10_DECODER.

Change-Id: Ia81983ccc95d83829dc815e28d9b1143e16e27b1
2015-09-11 18:37:24 -04:00
Ronald S. Bultje
eba342af87 Don't convert bitdepth for !single-file or MD5.
... unless --output-bit-depth was set.

Change-Id: I3482eaf12e245eec24427518fccdd173f890f4b4
2015-09-11 18:37:24 -04:00
Ronald S. Bultje
812fbc5ecb Merge "Make reset_frame_context an enum." 2015-09-11 22:36:49 +00:00
Marco Paniconi
cd9ae6d758 Merge "Avoid scaling last_source, unless needed." 2015-09-11 21:39:06 +00:00
Angie Chiang
894ab8be7e fix implicit declaration
include vpx_dsp_rtcd.h to avoid implicit declaration of
vp10_highbd_fdct32x32_rd_c

Change-Id: I0b9ad50381a302750138deab14d2d5ac31f286ee
2015-09-11 12:17:15 -07:00
Ronald S. Bultje
62da0bf162 Make reset_frame_context an enum.
In vp9, [0] and [1] had identical meaning, so merge them into a
single value. Make it impossible to code RESET_FRAME_CONTEXT_NONE
for intra_only frames, since that is a nonsensical combination.

See issue 1030.

Change-Id: If450c74162d35ca63a9d279beaa53ff9cdd6612b
2015-09-11 15:12:02 -04:00
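A sketch of the resulting enum, again with member names as a best reading of
the vp10 code (illustrative, not verbatim):

  typedef enum {
    RESET_FRAME_CONTEXT_NONE,     /* no reset; merges vp9's equivalent values
                                     0 and 1, and per the message can no
                                     longer be coded for intra-only frames */
    RESET_FRAME_CONTEXT_CURRENT,  /* reset only the context this frame uses */
    RESET_FRAME_CONTEXT_ALL       /* reset every frame context to defaults */
  } RESET_FRAME_CONTEXT_MODE;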
Marco
e8a4a3e2b1 Avoid scaling last_source, unless needed.
Save some encoding time, for the case of spatial layers
or under dynamic resizing mode.

Change-Id: If4a8eb6f0376c3d2dde8465fde6bfd86ab704920
2015-09-11 11:53:25 -07:00
James Zern
f79f71fc22 Merge "Fix vp10 high bit-depth build" 2015-09-11 18:27:49 +00:00
Jingning Han
481b834842 Fix vp10 high bit-depth build
Change-Id: Ie3daed0b282b43ef81d2f8797ac1f6e8bde7d65e
2015-09-11 08:56:29 -07:00
Marco
6ddbc845cc Remove unneeded/incorrect comment.
Change-Id: I5c923223c284ad4fda0c45572a66bebc8528dd1d
2015-09-11 08:49:13 -07:00
James Zern
d318d7cb6f Merge "build: modify default ARFLAGS / .a target" 2015-09-11 02:30:07 +00:00
Ronald S. Bultje
ad747e94d0 Merge "Add misc_fixes experiment." 2015-09-11 02:00:45 +00:00
Ronald S. Bultje
3ef3dcb8b6 Merge "Don't reset sign_bias fields in vp10_setup_past_independence()." 2015-09-11 02:00:30 +00:00
Angie Chiang
501efcad4a Merge "Isolate vp10's fwd_txfm from vp9" 2015-09-11 00:10:45 +00:00
Alex Converse
3c092e2474 Merge changes Ibb308526,I99e330f8
* changes:
  Prevent CR in screen mode from refreshing flat inter blocks forever.
  For screen content consider intra uv when color_sensitivity is set.
2015-09-10 23:04:46 +00:00
Jingning Han
b50e0badbc Merge "Take out reference_masking speed feature" 2015-09-10 23:03:06 +00:00
Jingning Han
b999f1509c Merge "Take out skip_encode speed feature in vp10" 2015-09-10 23:02:38 +00:00
Jingning Han
7f71d1e00a Merge "Remove speed features in vp10" 2015-09-10 23:02:27 +00:00
Angie Chiang
b0bfea4f5f Merge "Isolate vp10's inv_txfm from vp9" 2015-09-10 22:51:02 +00:00
Angie Chiang
ee5b80597e Isolate vp10's fwd_txfm from vp9
1) copy fwd_txfm related files from vpx_dsp to vp10

    vpx_dsp/fwd_txfm.h → vp10/common/vp10_fwd_txfm.h
    vpx_dsp/fwd_txfm.c → vp10/common/vp10_fwd_txfm.c
    vpx_dsp/x86/fwd_dct32x32_impl_sse2.h →  vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h
    vpx_dsp/x86/fwd_txfm_sse2.c →  vp10/common/x86/vp10_fwd_txfm_sse2.c
    vpx_dsp/x86/fwd_txfm_impl_sse2.h → vp10/common/vp10_fwd_txfm_impl_sse2.h

Change-Id: Ie9428b2ab1ffeb28e17981bb8a142ebe204f3bba
2015-09-10 15:19:43 -07:00
Angie Chiang
87175ed592 Isolate vp10's inv_txfm from vp9
1) copy the following files from vpx_dsp/ to vp10/common/
vp10_inv_txfm.c
vp10_inv_txfm.h
vp10_inv_txfm_sse2.c
vp10_inv_txfm_sse2.h

2) change the function prefix "vpx_" to "vp10_" in above files

3) add unit test at vp10_inv_txfm_test.cc

Change-Id: I206f10f60c8b27d872c84b7482c3bb1d1cb4b913
2015-09-10 15:08:37 -07:00
Alex Converse
3d6b8a667f Prevent CR in screen mode from refreshing flat inter blocks forever.
Take the minimum last_codec_q_map on inter skip.

Change-Id: Ibb308526dd19793bb359f51ebd7b48d8692903fd
2015-09-10 15:03:13 -07:00
Alex Converse
d5c0e366d7 For screen content consider intra uv when color_sensitivity is set.
Change-Id: I99e330f8a779b4d564c19ef4639a881cb68910ae
2015-09-10 15:03:09 -07:00
Jingning Han
1eb760e55d Take out reference_masking speed feature
This condition is not effectively in use. The actual reference
frame masking is done through a different route.

Change-Id: Ia59c843bcac7243dada92f0f67658d7ce43df5e8
2015-09-10 12:57:48 -07:00
James Zern
1b3d775366 build: modify default ARFLAGS / .a target
remove 'u' and specify all objects to allow objects with the same
basename to be added and an incremental rebuild to succeed

fixes issue #1067

Change-Id: Id0ebc89be826a026f1bbf21b4e32a2b1af45154d
2015-09-10 12:54:01 -07:00
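For context on the change visible in the Makefile and configure hunks below:
in make, $? expands to only the prerequisites newer than the target while $^
expands to all of them, so archiving with $^ (together with dropping the 'u'
update flag from ARFLAGS) re-adds every object on each build, which is what
lets an incremental rebuild succeed when different objects share a basename.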
Vignesh Venkatasubramanian
dd5510750a third_party/libwebm: pull from upstream.
Upstream hash: a58c32339e06e5d672a58cdd5844cea0a661e735

Changes from upstream since last update:
a58c323 mkvmuxer: Add codec id constant for VP10.
714f3c4 mkvparser: validate results in EBMLHeader::Parse.
cec98d4 mkvparser: Correct the ReadID implementation.
eb36ae4 Merge changes I029a268e,Ia272b150,I5c4d1bbc,Ia47a2478,I3a2e2226
229f493 Merge "mkvparser: Segment::AppendCluster asserts to error checks."
287faf9 Merge "mkvparser: Segment::DoLoadClusterUnknownSize asserts to error checks."
1a87b59 Merge "mkvparser: Segment assert clean up."
d26ec69 mkvparser: Cluster::Parse clean up.
f2029be mkvparser: Disallow EBML IDs equal to 0.
19f5694 mkvparser: Cluster::Load clean up.
27a07c1 mkvparser: Segment::Load asserts to error checks.
d0313dd mkvparser: Segment::PreloadCluster asserts to error checks.
b108695 mkvparser: Segment::AppendCluster asserts to error checks.
4630f80 mkvparser: Segment::DoLoadClusterUnknownSize asserts to error checks.
841a9b5 mkvparser: Segment assert clean up.
8c4ca2e Merge "mkvparser: Make mkvparser namespace usage uniform."
49ae6f0 Merge "mkvparser: Fix include order."
0735bb5 mkvparser: Make mkvparser namespace usage uniform.
93b24c4 mkvparser: Fix include order.
a57d660 sample_muxer: fix Segment memory leak on error
1c5bd94 mkvparser: Cues, change asserts to error checks.
7f77201 Merge "mkvparser: Add ReadID."
795fd56 mkvparser: set kMaxAllocSize explicitly
23bb18b mkvparser: Add ReadID.
7b57e37 mkvparser: add SafeArrayAlloc.
83a1f68 mkvparser: Remove buf_t typedef.
5074714 Merge changes Ia1265a63,I799d54df,Icfc582e4,I3425f608
b181105 Merge changes Ie4318152,I1e65f30f
06b4337 Block::Parse: replace pos asserts w/checks
b366a98 Cluster::ParseBlockGroup: replace pos asserts w/checks
2857b23 Tags::*::Parse: replace pos asserts w/checks
f1b2cfa Chapters::*::Parse: replace pos asserts w/checks
ca80629 Merge "mkvparser: Cues::PreloadCuePoint now returns bool."
6b4b297 Block::Parse: use int64 to aggregate laced frame sizes
c0d2c98 UnserializeFloat: check result for Inf/NaN
1a6dc4f mkvparser: Cues::PreloadCuePoint now returns bool.
275ac22 mkvparser: Cluster::Create clean up.
064f2ee Segment::PreloadCluster(): return a bool status
3778408 Segment::AppendCluster(): return a bool status
e86d046 mkvparser: check Cluster::Create() return
f9885b5 mkvparser: check allocations
21ee398 mkvparser: Segment::Load fail w/missing info/tracks
08fb654 Merge changes I264e68b2,Ife6190a4,Ibf37245f,I06efadb5,I88b5dfec, ...
c896095 mkvparser/Cluster: convert asserts to failure returns
680b4bf mkvparser/Tracks: convert asserts to failure returns
5889e6c mkvparser/Track: convert asserts to failure returns
5135c4c mkvparser/ContentEncoding: convert asserts to failure returns
b0e4f32 mkvparser/Cues: convert asserts to failure returns
13ccc7f mkvparser/UnserializeInt: fix sign flip
db3f9bb mkvparser/SeekHead: convert asserts to failure returns
8de3654 mkvparser/Segment: convert asserts to failure returns
fa2aa7d SeekHead::Parse(): fix assertion failure
d9bdade sample{,_muxer}: check SegmentInfo::GetInfo() return
07a9cf7 Merge "mkvparser: Remove some asserts."
c56ee29 mkvparser: Remove some asserts.
d901324 Merge "mkvparser: Remove some asserts from SegmentInfo::Parse."
7f7d898 Fix case sensitivity issue in iosbuild.sh.
42fe2cd mkvparser: Remove some asserts from SegmentInfo::Parse.
8bccd9c Merge "mkvparser: avoid rollover in UnserializeInt()."
7a2fa0d mkvparser: avoid rollover in UnserializeInt().
44f5ce6 mkvparser: Disallow durations in seconds greater than LONG_LONG_MAX.
b521e30 Merge "mkvparser: Segment::ParseHeaders() avoid rollover and bad int sizes."
7680e2a mkvparser: Check for errors in Match().
39a315f mkvparser: Segment::ParseHeaders() avoid rollover and bad int sizes.
f250ace mkvparser: Handle invalid lengths and rollover in ParseElementHeader().
cd96a76 mkvparser: Avoid rollover/truncation in UnserializeString().
8e8b3db Merge "mkvparser: Add error checking in Block::Parse."
82b7e5f sample: correct mbstowcs() error check
04d7809 sample: check allocation return
986b64b mkvparser: Add error checking in Block::Parse.

Change-Id: I39beef84962d6341f8ce53be06807b3e2068f777
2015-09-10 12:47:21 -07:00
Jingning Han
f137697c32 Take out skip_encode speed feature in vp10
Change-Id: Ic39d4523e78863c816b0fc85f56ea5ae5e0b3310
2015-09-10 12:45:39 -07:00
Jingning Han
4fa8e73249 Remove speed features in vp10
Take out speed features that affect the compression performance
to simplify the coding route. This commit removes the motion field
mode search used in speed 3.

Change-Id: Ifdf6862cb1ece8261125a56d9d89bcef60758c00
2015-09-10 12:25:33 -07:00
Vignesh Venkatasubramanian
09969ac9a2 webmdec: Handle codec id being NULL.
WebM files can have the CodecId missing from the track headers. Treat such
files as an unknown input file type in vpxdec.

Fixes issue #1064.

Change-Id: I6c3bb7b4bd3a4f5c244312482a5996f8b68db3f3
2015-09-10 10:44:59 -07:00
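A minimal sketch of the guard, with illustrative names (webmdec.cc itself
goes through the mkvparser track API; the helper below is an assumption):

  #include <string.h>

  /* A track whose CodecId element is absent cannot be classified, so report
   * it as unsupported and let vpxdec treat the file as an unknown type. */
  static int classify_webm_codec(const char *codec_id) {
    if (codec_id == NULL) return 0;
    if (strcmp(codec_id, "V_VP8") == 0) return 8;
    if (strcmp(codec_id, "V_VP9") == 0) return 9;
    return 0;
  }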
Marco Paniconi
2ff108aac6 Merge "vp8: Small adjustment to cyclic_refresh max_mbs_perframe." 2015-09-10 15:47:49 +00:00
James Zern
ba317bc9dc vp9_encoder_parms_get_to_decoder: cosmetics
fix indent, */& association, join a few lines

Change-Id: Idaca24b87b574788f9508168082d0ade3d4e9ecc
2015-09-10 00:21:59 -07:00
James Zern
fc4ddc0d00 vp9...parms_get_to_decoder: remove unneeded func
removes a redundant cast in the process

Change-Id: Ie3727a0938c0093f70f25a875c2c58671938d45c
2015-09-10 00:21:59 -07:00
James Zern
67774db59f vp9...parms_get_to_decoder: fix EXPECT param order
(expected, actual)

Change-Id: I449e7b6c51aa85cdde008d2fad5a9629970222a9
2015-09-10 00:21:58 -07:00
James Zern
21952bab12 vp9_encoder_parms_get_to_decoder: delete dead code
the only input is y4m, there's no need to test for yuv.

Change-Id: Ie5b55ea4af44ad79a55304ef5636a8ad7ed30bb8
2015-09-10 00:21:58 -07:00
James Zern
0fe900a543 fix BitstreamParms test
avoid duplicating internal structures and include vp9_dx_iface.c
directly. these had fallen out of sync after the frame-parallel branch
merge.

Change-Id: I604cfbffa95abe2a1c8e906a696f32436b1422ed
2015-09-10 00:21:57 -07:00
James Zern
7793a51ddc vp9_encoder_parms_get_to_decoder: remove vp10
this file needs to be reworked to remove the duplication of codec
internals + allow for divergence of vp9 and vp10

Change-Id: I6266b94ccfbc24dae30148f134804b52aa411b88
2015-09-10 00:21:56 -07:00
James Zern
58cb7886c3 yuvconfig2image(): add explicit cast to avoid conv warning
prevents an int -> vpx_img_fmt_t conversion warning with high-bitdepth
as it modifies the image format

Change-Id: Ie3135d031565312613a036a1e6937abb59760a7e
2015-09-10 00:19:18 -07:00
James Zern
a124bc7a81 vp9/10 decoder_init: add missing alloc cast
Change-Id: I1ba4400d67095f3a360fb7d97ee8d118d4f741fe
2015-09-09 23:15:59 -07:00
James Zern
a2e61adc96 vp9/10: set color_space on preview frame
Change-Id: If9176ce6ed3eb6c7ef8ffd1378456cb95b4aeb86
2015-09-09 23:15:59 -07:00
James Zern
55f5d557f2 vp10: add extern "C" to headers
Change-Id: Ie2e8b37fa01ce8d6b993684f431f3159d511cfb1
2015-09-09 23:15:59 -07:00
James Zern
b09aa3ac54 vp9: add extern "C" to headers
Change-Id: I1b6927ad820f99340985b094d415aaab14defaf4
2015-09-09 23:15:59 -07:00
James Zern
992d9a062a Merge "Fix ioc warnings related to sub8x8 reference frame" 2015-09-10 06:10:44 +00:00
Tom Finegan
8fa5ca4899 Merge "Revert "Fix building with iOS 9 beta SDK"" 2015-09-10 01:36:29 +00:00
Jingning Han
b6d71a308c Fix ioc warnings related to sub8x8 reference frame
Access scaled reference frame in the sub8x8 rate-distortion
optimization loop only when the current test mode is an inter mode.
This prevents an ioc warning triggered by using the intra_frame index
to fetch a scaled reference frame.

Change-Id: I6177ecc946651dd86c7ce362e3f65c4074444604
2015-09-09 15:48:00 -07:00
jackychen
f5617fd083 Change the qp threshold of VP9 dynamic resizing.
Change-Id: I1efe086191665ff8fa063f03d8e2032024dc090f
2015-09-09 15:47:07 -07:00
Marco
3140e90175 vp8: Small adjustment to cyclic_refresh max_mbs_perframe.
For 3 temporal layers, slightly reduce the
cyclic_refresh_mode_max_mbs_perframe parameter, from 20% to ~14%.
This gives a small increase in PSNR/SSIM metrics.

Change-Id: Ia216fa5474048f1ef7fe3db88cd60dfef2a1bf8a
2015-09-09 15:34:58 -07:00
Tom Finegan
d8808d365e Revert "Fix building with iOS 9 beta SDK"
This reverts commit 78637b6136.

Breaks armv7-darwin targets with current SDK (iOS 8/Xcode 6.4).

BUG=https://code.google.com/p/webm/issues/detail?id=1062

Change-Id: I58b27950f330557154d681a894114eadfbd3e593
2015-09-09 05:28:42 -07:00
Ronald S. Bultje
1589ecb0ae Add misc_fixes experiment.
Will be used to hold various trivial bitstream fixes.

Change-Id: Ic8ba07a2ae392db7c956ebae124913afe2ae4409
2015-09-08 14:05:08 -04:00
Ronald S. Bultje
e1d22db451 Don't reset sign_bias fields in vp10_setup_past_independence().
The fields are always coded in the frame itself, so there is never any
dependency on past frames. In practice, this fixes sign_bias being
ignored when error_resilient_mode=1.

See issue 1011.

Change-Id: I9d134ef6b445ced4d100fa735ce579855a0fa5af
2015-09-08 13:48:20 -04:00
James Zern
ad0ac045d5 vp9/decode_tiles_mt: move frame count accum from loop
the check performed within the while was redundant; simply place the
accumulation after all tiles are decoded.

Change-Id: I6a74e87257c775fd8bfc8ac4511e4a6ad8f18346
2015-09-04 20:24:29 -07:00
James Zern
5e1e6a9f17 VP9Decoder: remove duplicate tile_worker_info
unnecessary since: 86f4a3d Remove tile param

Change-Id: Iff75d3acf6c5aade833ea0a214c919279403cf97
2015-09-04 19:47:33 -07:00
James Zern
2d06b08cba vp9/decode_tiles_mt: move some inits from inner loop
worker copies of pbi/xd/counts only need to be initialized once

Change-Id: I0081a85b9c82d39573c22d2fd2c670ec2f7b8715
2015-09-04 19:47:33 -07:00
James Zern
0548046ae3 vp9_accumulate_frame_counts: pass counts directly
Change-Id: Ic3c6cfba5b1867c335f2834da936e20caec8597a
2015-09-04 19:47:33 -07:00
James Zern
14bc773199 sixtap_predict_test: enable NEON tests
the offending assembly code was deleted in:
08e38f0 VP8 for ARMv8 by using NEON intrinsics 14

the intrinsics currently pass.

fixes issue #725

Change-Id: I43e4263bef21f9d9008c51ffdfa39fcf10b8e776
2014-11-05 18:10:59 -08:00
271 changed files with 27153 additions and 11766 deletions


@@ -1,14 +1,18 @@
Adrian Grange <agrange@google.com> Adrian Grange <agrange@google.com>
Alex Converse <aconverse@google.com> <alex.converse@gmail.com> Aex Converse <aconverse@google.com>
Aex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com> Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org> Alpha Lam <hclam@google.com> <hclam@chromium.org>
Deb Mukherjee <debargha@google.com> Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com> Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
Guillaume Martres <gmartres@google.com> <smarter3@gmail.com> Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com> Hangyu Kuang <hkuang@google.com>
Hui Su <huisu@google.com>
Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com> Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com> Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com> Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
John Koleszar <jkoleszar@google.com> John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org> Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Marco Paniconi <marpan@google.com> Marco Paniconi <marpan@google.com>
@@ -17,10 +21,12 @@ Pascal Massimino <pascal.massimino@gmail.com>
Paul Wilkins <paulwilkins@google.com> Paul Wilkins <paulwilkins@google.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com> Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com> Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
Sami Pietilä <samipietila@google.com> Sami Pietilä <samipietila@google.com>
Tamar Levy <tamar.levy@intel.com> Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com> Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com> Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com> Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com> Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>

AUTHORS

@@ -5,9 +5,9 @@ Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com> Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
Adam Xu <adam@xuyaowu.com> Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com> Adrian Grange <agrange@google.com>
Aex Converse <aconverse@google.com>
Ahmad Sharif <asharif@google.com> Ahmad Sharif <asharif@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru> Alexander Voronov <avoronov@graphics.cs.msu.ru>
Alex Converse <aconverse@google.com>
Alexis Ballier <aballier@gentoo.org> Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com> Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com> Alpha Lam <hclam@google.com>
@@ -16,8 +16,10 @@ Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com> Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com> Andres Mejia <mcitadel@gmail.com>
Andrew Russell <anrussell@google.com> Andrew Russell <anrussell@google.com>
Angie Chiang <angiebird@google.com>
Aron Rosenberg <arosenberg@logitech.com> Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com> Attila Nagy <attilanagy@google.com>
Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com> changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com> Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com> chm <chm@rock-chips.com>
@@ -27,6 +29,7 @@ Deb Mukherjee <debargha@google.com>
Dim Temp <dimtemp0@gmail.com> Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com> Dmitry Kovalev <dkovalev@google.com>
Dragan Mrdjan <dmrdjan@mips.com> Dragan Mrdjan <dmrdjan@mips.com>
Ed Baker <edward.baker@intel.com>
Ehsan Akhgari <ehsan.akhgari@gmail.com> Ehsan Akhgari <ehsan.akhgari@gmail.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> Erik Niemeyer <erik.a.niemeyer@intel.com>
Fabio Pedretti <fabio.ped@libero.it> Fabio Pedretti <fabio.ped@libero.it>
@@ -34,6 +37,8 @@ Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com> Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com> Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com> Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org> Giuseppe Scrivano <gscrivano@gnu.org>
Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com> Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Guillaume Martres <gmartres@google.com> Guillaume Martres <gmartres@google.com>
@@ -44,7 +49,7 @@ Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com> Hui Su <huisu@google.com>
Ivan Maltz <ivanmaltz@google.com> Ivan Maltz <ivanmaltz@google.com>
Jacek Caban <cjacek@gmail.com> Jacek Caban <cjacek@gmail.com>
JackyChen <jackychen@google.com> Jacky Chen <jackychen@google.com>
James Berry <jamesberry@google.com> James Berry <jamesberry@google.com>
James Yu <james.yu@linaro.org> James Yu <james.yu@linaro.org>
James Zern <jzern@google.com> James Zern <jzern@google.com>
@@ -60,9 +65,11 @@ Jingning Han <jingning@google.com>
Joey Parrish <joeyparrish@google.com> Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@google.com> Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com> John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
John Stark <jhnstrk@gmail.com> John Stark <jhnstrk@gmail.com>
Joshua Bleecher Snyder <josh@treelinelabs.com> Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com> Joshua Litt <joshualitt@google.com>
Julia Robson <juliamrobson@gmail.com>
Justin Clift <justin@salasaga.org> Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com> Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net> KO Myung-Hun <komh@chollian.net>
@@ -82,6 +89,7 @@ Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com> Mikhal Shemer <mikhal@google.com>
Minghai Shang <minghai@google.com> Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com> Morton Jonuschat <yabawock@gmail.com>
Nico Weber <thakis@chromium.org>
Parag Salasakar <img.mips1@gmail.com> Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com> Pascal Massimino <pascal.massimino@gmail.com>
Patrik Westin <patrik.westin@gmail.com> Patrik Westin <patrik.westin@gmail.com>
@@ -96,7 +104,7 @@ Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org> Rafaël Carré <funman@videolan.org>
Ralph Giles <giles@xiph.org> Ralph Giles <giles@xiph.org>
Rob Bradford <rob@linux.intel.com> Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rbultje@google.com> Ronald S. Bultje <rsbultje@gmail.com>
Rui Ueyama <ruiu@google.com> Rui Ueyama <ruiu@google.com>
Sami Pietilä <samipietila@google.com> Sami Pietilä <samipietila@google.com>
Scott Graham <scottmg@chromium.org> Scott Graham <scottmg@chromium.org>
@@ -104,6 +112,7 @@ Scott LaVarnway <slavarnway@google.com>
Sean McGovern <gseanmcg@gmail.com> Sean McGovern <gseanmcg@gmail.com>
Sergey Ulanov <sergeyu@chromium.org> Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com> Shimon Doodkin <helpmepro1@gmail.com>
Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com> Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com> Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com> Taekhyun Kim <takim@nvidia.com>


@@ -1,7 +1,19 @@
-xxxx-yy-zz v1.4.0 "Changes for next release"
-  vpxenc is changed to use VP9 by default.
-  Encoder controls added for 1 pass SVC.
-  Decoder control to toggle on/off loopfilter.
+2015-11-09 v1.5.0 "Javan Whistling Duck"
+  This release improves upon the VP9 encoder and speeds up the encoding and
+  decoding processes.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.4.0. It drops deprecated VP8
+    controls and adds a variety of VP9 controls for testing.
+
+    The vpxenc utility now prefers VP9 by default.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Smaller library size by combining functions used by VP8 and VP9
+
+  - Bug Fixes:
+    A variety of fuzzing issues
 
 2015-04-03 v1.4.0 "Indian Runner Duck"
   This release includes significant improvements to the VP9 codec.


@@ -287,7 +287,7 @@ define archive_template
 # for creating them.
 $(1):
     $(if $(quiet),@echo "    [AR] $$@")
-    $(qexec)$$(AR) $$(ARFLAGS) $$@ $$?
+    $(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
 endef
 
 define so_template


@@ -73,6 +73,7 @@ Build options:
   --target=TARGET             target platform tuple [generic-gnu]
   --cpu=CPU                   optimize for a specific cpu rather than a family
   --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
+  --extra-cxxflags=ECXXFLAGS  add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
   ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
   ${toggle_werror}            treat warnings as errors, if possible
                               (not available with all compilers)
@@ -200,6 +201,10 @@ disabled(){
   eval test "x\$$1" = "xno"
 }
 
+# Iterates through positional parameters, checks to confirm the parameter has
+# not been explicitly (force) disabled, and enables the setting controlled by
+# the parameter when the setting is not disabled.
+# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
 soft_enable() {
   for var in $*; do
     if ! disabled $var; then
@@ -209,6 +214,10 @@ soft_enable() {
   done
 }
 
+# Iterates through positional parameters, checks to confirm the parameter has
+# not been explicitly (force) enabled, and disables the setting controlled by
+# the parameter when the setting is not enabled.
+# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
 soft_disable() {
   for var in $*; do
     if ! enabled $var; then
@@ -337,6 +346,10 @@ check_add_cflags() {
   check_cflags "$@" && add_cflags_only "$@"
 }
 
+check_add_cxxflags() {
+  check_cxxflags "$@" && add_cxxflags_only "$@"
+}
+
 check_add_asflags() {
   log add_asflags "$@"
   add_asflags "$@"
@@ -428,7 +441,7 @@ NM=${NM}
 CFLAGS = ${CFLAGS}
 CXXFLAGS = ${CXXFLAGS}
-ARFLAGS = -rus\$(if \$(quiet),c,v)
+ARFLAGS = -crs\$(if \$(quiet),,v)
 LDFLAGS = ${LDFLAGS}
 ASFLAGS = ${ASFLAGS}
 extralibs = ${extralibs}
@@ -503,6 +516,9 @@ process_common_cmdline() {
       --extra-cflags=*)
         extra_cflags="${optval}"
         ;;
+      --extra-cxxflags=*)
+        extra_cxxflags="${optval}"
+        ;;
       --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
         if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
@@ -617,6 +633,11 @@ show_darwin_sdk_path() {
   xcodebuild -sdk $1 -version Path 2>/dev/null
 }
 
+# Print the major version number of the Darwin SDK specified by $1.
+show_darwin_sdk_major_version() {
+  xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
+}
+
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@@ -667,6 +688,10 @@ process_common_toolchain() {
       tgt_isa=x86_64
       tgt_os=darwin14
       ;;
+    *darwin15*)
+      tgt_isa=x86_64
+      tgt_os=darwin15
+      ;;
     x86_64*mingw32*)
       tgt_os=win64
       ;;
@@ -729,13 +754,14 @@ process_common_toolchain() {
   # platforms, so use the newest one available.
   case ${toolchain} in
     arm*-darwin*)
-      ios_sdk_dir="$(show_darwin_sdk_path iphoneos)"
-      if [ -d "${ios_sdk_dir}" ]; then
-        add_cflags  "-isysroot ${ios_sdk_dir}"
-        add_ldflags "-isysroot ${ios_sdk_dir}"
+      add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+      iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)"
+      if [ -d "${iphoneos_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${iphoneos_sdk_dir}"
+        add_ldflags "-isysroot ${iphoneos_sdk_dir}"
       fi
       ;;
-    *-darwin*)
+    x86*-darwin*)
       osx_sdk_dir="$(show_darwin_sdk_path macosx)"
       if [ -d "${osx_sdk_dir}" ]; then
         add_cflags  "-isysroot ${osx_sdk_dir}"
@@ -773,6 +799,10 @@ process_common_toolchain() {
       add_cflags  "-mmacosx-version-min=10.10"
       add_ldflags "-mmacosx-version-min=10.10"
       ;;
+    *-darwin15-*)
+      add_cflags  "-mmacosx-version-min=10.11"
+      add_ldflags "-mmacosx-version-min=10.11"
+      ;;
     *-iphonesimulator-*)
       add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
       add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
@@ -811,16 +841,35 @@ process_common_toolchain() {
die "Disabling neon while keeping neon-asm is not supported" die "Disabling neon while keeping neon-asm is not supported"
fi fi
case ${toolchain} in case ${toolchain} in
# Apple iOS SDKs no longer support armv6 as of the version 9
# release (coincides with release of Xcode 7). Only enable media
# when using earlier SDK releases.
*-darwin*) *-darwin*)
# Neon is guaranteed on iOS 6+ devices, while old media extensions if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
# no longer assemble with iOS 9 SDK soft_enable media
else
soft_disable media
RTCD_OPTIONS="${RTCD_OPTIONS}--disable-media "
fi
;; ;;
*) *)
soft_enable media soft_enable media
;;
esac esac
;; ;;
armv6) armv6)
soft_enable media case ${toolchain} in
*-darwin*)
if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
soft_enable media
else
die "Your iOS SDK does not support armv6."
fi
;;
*)
soft_enable media
;;
esac
;; ;;
esac esac
@@ -938,8 +987,10 @@ EOF
awk '{ print $1 }' | tail -1` awk '{ print $1 }' | tail -1`
fi fi
add_cflags "--sysroot=${alt_libc}" if [ -d "${alt_libc}" ]; then
add_ldflags "--sysroot=${alt_libc}" add_cflags "--sysroot=${alt_libc}"
add_ldflags "--sysroot=${alt_libc}"
fi
# linker flag that routes around a CPU bug in some # linker flag that routes around a CPU bug in some
# Cortex-A8 implementations (NDK Dev Guide) # Cortex-A8 implementations (NDK Dev Guide)
@@ -1003,6 +1054,12 @@ EOF
done done
asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl" asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
check_add_cflags -fembed-bitcode
check_add_asflags -fembed-bitcode
check_add_ldflags -fembed-bitcode
fi
;; ;;
linux*) linux*)
@@ -1151,32 +1208,43 @@ EOF
soft_enable runtime_cpu_detect soft_enable runtime_cpu_detect
# We can't use 'check_cflags' until the compiler is configured and CC is # We can't use 'check_cflags' until the compiler is configured and CC is
# populated. # populated.
check_gcc_machine_option mmx for ext in ${ARCH_EXT_LIST_X86}; do
check_gcc_machine_option sse # disable higher order extensions to simplify asm dependencies
check_gcc_machine_option sse2 if [ "$disable_exts" = "yes" ]; then
check_gcc_machine_option sse3 if ! disabled $ext; then
check_gcc_machine_option ssse3 RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
check_gcc_machine_option sse4 sse4_1 disable_feature $ext
check_gcc_machine_option avx
check_gcc_machine_option avx2
case "${AS}" in
auto|"")
which nasm >/dev/null 2>&1 && AS=nasm
which yasm >/dev/null 2>&1 && AS=yasm
if [ "${AS}" = nasm ] ; then
# Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
# this check if they start shipping a compatible version.
apple=`nasm -v | grep "Apple"`
[ -n "${apple}" ] \
&& echo "Unsupported version of nasm: ${apple}" \
&& AS=""
fi fi
[ "${AS}" = auto ] || [ -z "${AS}" ] \ elif disabled $ext; then
&& die "Neither yasm nor nasm have been found" disable_exts="yes"
;; else
esac # use the shortened version for the flag: sse4_1 -> sse4
log_echo " using $AS" check_gcc_machine_option ${ext%_*} $ext
fi
done
if enabled external_build; then
log_echo " skipping assembler detection"
else
case "${AS}" in
auto|"")
which nasm >/dev/null 2>&1 && AS=nasm
which yasm >/dev/null 2>&1 && AS=yasm
if [ "${AS}" = nasm ] ; then
# Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
# this check if they start shipping a compatible version.
apple=`nasm -v | grep "Apple"`
[ -n "${apple}" ] \
&& echo "Unsupported version of nasm: ${apple}" \
&& AS=""
fi
[ "${AS}" = auto ] || [ -z "${AS}" ] \
&& die "Neither yasm nor nasm have been found." \
"See the prerequisites section in the README for more info."
;;
esac
log_echo " using $AS"
fi
[ "${AS##*/}" = nasm ] && add_asflags -Ox [ "${AS##*/}" = nasm ] && add_asflags -Ox
AS_SFX=.asm AS_SFX=.asm
case ${tgt_os} in case ${tgt_os} in
@@ -1212,6 +1280,13 @@ EOF
enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64" enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
add_cflags ${sim_arch} add_cflags ${sim_arch}
add_ldflags ${sim_arch} add_ldflags ${sim_arch}
if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then
# yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it
# on is pointless (unless building a C-only lib). Warn the user, but
# do nothing here.
log "Warning: Bitcode embed disabled for simulator targets."
fi
;; ;;
os2) os2)
add_asflags -f aout add_asflags -f aout


@@ -25,31 +25,42 @@ CONFIGURE_ARGS="--disable-docs
DIST_DIR="_dist" DIST_DIR="_dist"
FRAMEWORK_DIR="VPX.framework" FRAMEWORK_DIR="VPX.framework"
HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx" HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
MAKE_JOBS=1
SCRIPT_DIR=$(dirname "$0") SCRIPT_DIR=$(dirname "$0")
LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd) LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo) LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
ORIG_PWD="$(pwd)" ORIG_PWD="$(pwd)"
TARGETS="arm64-darwin-gcc ARM_TARGETS="arm64-darwin-gcc
armv7-darwin-gcc armv7-darwin-gcc
armv7s-darwin-gcc armv7s-darwin-gcc"
x86-iphonesimulator-gcc SIM_TARGETS="x86-iphonesimulator-gcc
x86_64-iphonesimulator-gcc" x86_64-iphonesimulator-gcc"
OSX_TARGETS="x86-darwin15-gcc
x86_64-darwin15-gcc"
TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
# Configures for the target specified by $1, and invokes make with the dist # Configures for the target specified by $1, and invokes make with the dist
# target using $DIST_DIR as the distribution output directory. # target using $DIST_DIR as the distribution output directory.
build_target() { build_target() {
local target="$1" local target="$1"
local old_pwd="$(pwd)" local old_pwd="$(pwd)"
local target_specific_flags=""
vlog "***Building target: ${target}***" vlog "***Building target: ${target}***"
case "${target}" in
x86-*)
target_specific_flags="--enable-pic"
vlog "Enabled PIC for ${target}"
;;
esac
mkdir "${target}" mkdir "${target}"
cd "${target}" cd "${target}"
eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \ eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${devnull} ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${target_specific_flags} \
${devnull}
export DIST_DIR export DIST_DIR
eval make -j ${MAKE_JOBS} dist ${devnull} eval make dist ${devnull}
cd "${old_pwd}" cd "${old_pwd}"
vlog "***Done building target: ${target}***" vlog "***Done building target: ${target}***"
@@ -189,16 +200,29 @@ cleanup() {
fi fi
} }
print_list() {
local indent="$1"
shift
local list="$@"
for entry in ${list}; do
echo "${indent}${entry}"
done
}
iosbuild_usage() { iosbuild_usage() {
cat << EOF cat << EOF
Usage: ${0##*/} [arguments] Usage: ${0##*/} [arguments]
--help: Display this message and exit. --help: Display this message and exit.
--extra-configure-args <args>: Extra args to pass when configuring libvpx. --extra-configure-args <args>: Extra args to pass when configuring libvpx.
--jobs: Number of make jobs. --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
and x86_64. Allows linking to framework when builds target MacOSX
instead of iOS.
--preserve-build-output: Do not delete the build directory. --preserve-build-output: Do not delete the build directory.
--show-build-output: Show output from each library build. --show-build-output: Show output from each library build.
--targets <targets>: Override default target list. Defaults: --targets <targets>: Override default target list. Defaults:
${TARGETS} $(print_list " " ${TARGETS})
--test-link: Confirms all targets can be linked. Functionally identical to
passing --enable-examples via --extra-configure-args.
--verbose: Output information about the environment and each stage of the --verbose: Output information about the environment and each stage of the
build. build.
EOF EOF
@@ -227,20 +251,22 @@ while [ -n "$1" ]; do
iosbuild_usage iosbuild_usage
exit exit
;; ;;
--jobs)
MAKE_JOBS="$2"
shift
;;
--preserve-build-output) --preserve-build-output)
PRESERVE_BUILD_OUTPUT=yes PRESERVE_BUILD_OUTPUT=yes
;; ;;
--show-build-output) --show-build-output)
devnull= devnull=
;; ;;
--test-link)
EXTRA_CONFIGURE_ARGS="${EXTRA_CONFIGURE_ARGS} --enable-examples"
;;
--targets) --targets)
TARGETS="$2" TARGETS="$2"
shift shift
;; ;;
--macosx)
TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
;;
--verbose) --verbose)
VERBOSE=yes VERBOSE=yes
;; ;;
@@ -260,15 +286,17 @@ cat << EOF
EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS} EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
FRAMEWORK_DIR=${FRAMEWORK_DIR} FRAMEWORK_DIR=${FRAMEWORK_DIR}
HEADER_DIR=${HEADER_DIR} HEADER_DIR=${HEADER_DIR}
MAKE_JOBS=${MAKE_JOBS}
PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR} LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR}
LIPO=${LIPO} LIPO=${LIPO}
MAKEFLAGS=${MAKEFLAGS}
ORIG_PWD=${ORIG_PWD} ORIG_PWD=${ORIG_PWD}
TARGETS="${TARGETS}" PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
TARGETS="$(print_list "" ${TARGETS})"
OSX_TARGETS="${OSX_TARGETS}"
SIM_TARGETS="${SIM_TARGETS}"
EOF EOF
fi fi
build_framework "${TARGETS}" build_framework "${TARGETS}"
echo "Successfully built '${FRAMEWORK_DIR}' for:" echo "Successfully built '${FRAMEWORK_DIR}' for:"
echo " ${TARGETS}" print_list "" ${TARGETS}

configure

@@ -35,6 +35,9 @@ Advanced options:
${toggle_debug_libs} in/exclude debug version of libraries ${toggle_debug_libs} in/exclude debug version of libraries
${toggle_static_msvcrt} use static MSVCRT (VS builds only) ${toggle_static_msvcrt} use static MSVCRT (VS builds only)
${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles ${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles
${toggle_better_hw_compatibility}
enable encoder to produce streams with better
hardware decoder compatibility
${toggle_vp8} VP8 codec support ${toggle_vp8} VP8 codec support
${toggle_vp9} VP9 codec support ${toggle_vp9} VP9 codec support
${toggle_vp10} VP10 codec support ${toggle_vp10} VP10 codec support
@@ -122,6 +125,7 @@ all_platforms="${all_platforms} x86-darwin11-gcc"
all_platforms="${all_platforms} x86-darwin12-gcc" all_platforms="${all_platforms} x86-darwin12-gcc"
all_platforms="${all_platforms} x86-darwin13-gcc" all_platforms="${all_platforms} x86-darwin13-gcc"
all_platforms="${all_platforms} x86-darwin14-gcc" all_platforms="${all_platforms} x86-darwin14-gcc"
all_platforms="${all_platforms} x86-darwin15-gcc"
all_platforms="${all_platforms} x86-iphonesimulator-gcc" all_platforms="${all_platforms} x86-iphonesimulator-gcc"
all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc" all_platforms="${all_platforms} x86-linux-icc"
@@ -142,6 +146,7 @@ all_platforms="${all_platforms} x86_64-darwin11-gcc"
all_platforms="${all_platforms} x86_64-darwin12-gcc" all_platforms="${all_platforms} x86_64-darwin12-gcc"
all_platforms="${all_platforms} x86_64-darwin13-gcc" all_platforms="${all_platforms} x86_64-darwin13-gcc"
all_platforms="${all_platforms} x86_64-darwin14-gcc" all_platforms="${all_platforms} x86_64-darwin14-gcc"
all_platforms="${all_platforms} x86_64-darwin15-gcc"
all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-linux-icc"
@@ -232,6 +237,16 @@ ARCH_LIST="
x86 x86
x86_64 x86_64
" "
ARCH_EXT_LIST_X86="
mmx
sse
sse2
sse3
ssse3
sse4_1
avx
avx2
"
ARCH_EXT_LIST=" ARCH_EXT_LIST="
edsp edsp
media media
@@ -243,14 +258,7 @@ ARCH_EXT_LIST="
msa msa
mips64 mips64
mmx ${ARCH_EXT_LIST_X86}
sse
sse2
sse3
ssse3
sse4_1
avx
avx2
" "
HAVE_LIST=" HAVE_LIST="
${ARCH_EXT_LIST} ${ARCH_EXT_LIST}
@@ -264,6 +272,7 @@ EXPERIMENT_LIST="
spatial_svc spatial_svc
fp_mb_stats fp_mb_stats
emulate_hardware emulate_hardware
misc_fixes
" "
CONFIG_LIST=" CONFIG_LIST="
dependency_tracking dependency_tracking
@@ -316,6 +325,7 @@ CONFIG_LIST="
vp9_temporal_denoising vp9_temporal_denoising
coefficient_range_checking coefficient_range_checking
vp9_highbitdepth vp9_highbitdepth
better_hw_compatibility
experimental experimental
size_limit size_limit
${EXPERIMENT_LIST} ${EXPERIMENT_LIST}
@@ -374,6 +384,7 @@ CMDLINE_SELECT="
temporal_denoising temporal_denoising
vp9_temporal_denoising vp9_temporal_denoising
coefficient_range_checking coefficient_range_checking
better_hw_compatibility
vp9_highbitdepth vp9_highbitdepth
experimental experimental
" "
@@ -722,6 +733,10 @@ EOF
check_add_cflags ${extra_cflags} || \ check_add_cflags ${extra_cflags} || \
die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler" die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
fi fi
if [ -n "${extra_cxxflags}" ]; then
check_add_cxxflags ${extra_cxxflags} || \
die "Requested extra CXXFLAGS '${extra_cxxflags}' not supported by compiler"
fi
} }


@@ -36,6 +36,8 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/source/scale_neon64.cc \ third_party/libyuv/source/scale_neon64.cc \
third_party/libyuv/source/scale_win.cc \ third_party/libyuv/source/scale_win.cc \
LIBWEBM_COMMON_SRCS += third_party/libwebm/webmids.hpp
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \ LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxerutil.cpp \ third_party/libwebm/mkvmuxerutil.cpp \
third_party/libwebm/mkvwriter.cpp \ third_party/libwebm/mkvwriter.cpp \
@@ -43,8 +45,7 @@ LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxertypes.hpp \ third_party/libwebm/mkvmuxertypes.hpp \
third_party/libwebm/mkvmuxerutil.hpp \ third_party/libwebm/mkvmuxerutil.hpp \
third_party/libwebm/mkvparser.hpp \ third_party/libwebm/mkvparser.hpp \
third_party/libwebm/mkvwriter.hpp \ third_party/libwebm/mkvwriter.hpp
third_party/libwebm/webmids.hpp
LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
third_party/libwebm/mkvreader.cpp \ third_party/libwebm/mkvreader.cpp \
@@ -68,6 +69,7 @@ ifeq ($(CONFIG_LIBYUV),yes)
vpxdec.SRCS += $(LIBYUV_SRCS) vpxdec.SRCS += $(LIBYUV_SRCS)
endif endif
ifeq ($(CONFIG_WEBM_IO),yes) ifeq ($(CONFIG_WEBM_IO),yes)
vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS) vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxdec.SRCS += webmdec.cc webmdec.h vpxdec.SRCS += webmdec.cc webmdec.h
endif endif
@@ -89,6 +91,7 @@ ifeq ($(CONFIG_LIBYUV),yes)
vpxenc.SRCS += $(LIBYUV_SRCS) vpxenc.SRCS += $(LIBYUV_SRCS)
endif endif
ifeq ($(CONFIG_WEBM_IO),yes) ifeq ($(CONFIG_WEBM_IO),yes)
vpxenc.SRCS += $(LIBWEBM_COMMON_SRCS)
vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS) vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxenc.SRCS += webmenc.cc webmenc.h vpxenc.SRCS += webmenc.cc webmenc.h
endif endif


@@ -80,6 +80,8 @@ static const arg_def_t rc_end_usage_arg =
 ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q");
 static const arg_def_t speed_arg =
 ARG_DEF("sp", "speed", 1, "speed configuration");
+static const arg_def_t aqmode_arg =
+ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -101,7 +103,7 @@ static const arg_def_t *svc_args[] = {
 &kf_dist_arg, &scale_factors_arg, &passes_arg, &pass_arg,
 &fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg,
 &max_bitrate_arg, &temporal_layers_arg, &temporal_layering_mode_arg,
-&lag_in_frame_arg, &threads_arg,
+&lag_in_frame_arg, &threads_arg, &aqmode_arg,
 #if OUTPUT_RC_STATS
 &output_rc_stats_arg,
 #endif
@@ -221,6 +223,8 @@ static void parse_command_line(int argc, const char **argv_,
 #endif
 } else if (arg_match(&arg, &speed_arg, argi)) {
 svc_ctx->speed = arg_parse_uint(&arg);
+} else if (arg_match(&arg, &aqmode_arg, argi)) {
+svc_ctx->aqmode = arg_parse_uint(&arg);
 } else if (arg_match(&arg, &threads_arg, argi)) {
 svc_ctx->threads = arg_parse_uint(&arg);
 } else if (arg_match(&arg, &temporal_layering_mode_arg, argi)) {
@@ -404,7 +408,10 @@ static void set_rate_control_stats(struct RateControlStats *rc,
 for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
 const int layer = sl * cfg->ts_number_layers + tl;
 const int tlayer0 = sl * cfg->ts_number_layers;
-rc->layer_framerate[layer] =
+if (cfg->ts_number_layers == 1)
+rc->layer_framerate[layer] = framerate;
+else
+rc->layer_framerate[layer] =
 framerate / cfg->ts_rate_decimator[tl];
 if (tl > 0) {
 rc->layer_pfb[layer] = 1000.0 *
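For illustration only (a hypothetical helper, not part of the patch), the per-layer framerate rule above amounts to:

/* With framerate = 30.0, 3 temporal layers and ts_rate_decimator[] = {4, 2, 1},
 * the layer framerates come out as 7.5, 15.0 and 30.0 fps; a single temporal
 * layer keeps the full framerate. */
static double layer_framerate(double framerate, int ts_number_layers,
                              const int *ts_rate_decimator, int tl) {
  if (ts_number_layers == 1) return framerate;
  return framerate / ts_rate_decimator[tl];
}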
@@ -540,6 +547,59 @@ vpx_codec_err_t parse_superframe_index(const uint8_t *data,
 }
 #endif
// Example pattern for spatial layers and 2 temporal layers used in the
// bypass/flexible mode. The pattern corresponds to the pattern
// VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
// non-flexible mode.
void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
int is_key_frame,
vpx_svc_ref_frame_config_t *ref_frame_config) {
for (sl = 0; sl < num_spatial_layers; ++sl) {
if (!tl) {
if (!sl) {
ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
} else {
if (is_key_frame) {
ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_LAST |
VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
} else {
ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_GF |
VP8_EFLAG_NO_UPD_ARF;
}
}
} else if (tl == 1) {
if (!sl) {
ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_GF |
VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_GF;
} else {
ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_ARF |
VP8_EFLAG_NO_UPD_LAST |
VP8_EFLAG_NO_UPD_GF;
}
}
if (tl == 0) {
ref_frame_config->lst_fb_idx[sl] = sl;
if (sl)
ref_frame_config->gld_fb_idx[sl] = sl - 1;
else
ref_frame_config->gld_fb_idx[sl] = 0;
ref_frame_config->alt_fb_idx[sl] = 0;
} else if (tl == 1) {
ref_frame_config->lst_fb_idx[sl] = sl;
ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
}
}
}
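For illustration only, tracing set_frame_flags_bypass_mode() above with num_spatial_layers = 2 on non-key frames gives this reference/update pattern:

/* TL0, SL0: refs LAST,        updates LAST  (lst=0, gld=0, alt=0)
 * TL0, SL1: refs LAST+GOLDEN, updates LAST  (lst=1, gld=0, alt=0)
 * TL1, SL0: refs LAST,        updates ALT   (lst=0, gld=1, alt=2)
 * TL1, SL1: refs LAST+GOLDEN, updates ALT   (lst=1, gld=2, alt=3) */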
 int main(int argc, const char **argv) {
 AppInput app_input = {0};
 VpxVideoWriter *writer = NULL;
@@ -560,6 +620,7 @@ int main(int argc, const char **argv) {
 VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
 struct RateControlStats rc;
 vpx_svc_layer_id_t layer_id;
+vpx_svc_ref_frame_config_t ref_frame_config;
 int sl, tl;
 double sum_bitrate = 0.0;
 double sum_bitrate2 = 0.0;
@@ -635,7 +696,7 @@ int main(int argc, const char **argv) {
 vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
 if (svc_ctx.threads)
 vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
-if (svc_ctx.speed >= 5)
+if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
 vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
@@ -649,6 +710,37 @@ int main(int argc, const char **argv) {
 end_of_stream = 1;
 }
// For BYPASS/FLEXIBLE mode, set the frame flags (reference and updates)
// and the buffer indices for each spatial layer of the current
// (super)frame to be encoded. The temporal layer_id for the current frame
// also needs to be set.
// TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS"
// mode to "VP9E_LAYERING_MODE_BYPASS".
if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
layer_id.spatial_layer_id = 0;
// Example for 2 temporal layers.
if (frame_cnt % 2 == 0)
layer_id.temporal_layer_id = 0;
else
layer_id.temporal_layer_id = 1;
// Note that we only set the temporal layer_id, since we are calling
// the encode for the whole superframe. The encoder will internally loop
// over all the spatial layers for the current superframe.
vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id,
svc_ctx.spatial_layers,
frame_cnt == 0,
&ref_frame_config);
vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG,
&ref_frame_config);
// Keep track of input frames, to account for frame drops in rate control
// stats/metrics.
for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
layer_id.temporal_layer_id];
}
}
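For illustration only, the frame_cnt % 2 rule above yields the same 0101 cadence as the non-flexible temporal_layering_mode == 2:

/* frame 0 -> TL0 (key-frame flags), frame 1 -> TL1, frame 2 -> TL0, ...
 * Only the temporal layer_id is set here; vpx_svc_encode() loops over all
 * spatial layers of the superframe internally. */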
 vpx_usec_timer_start(&timer);
 res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw),
 pts, frame_duration, svc_ctx.speed >= 5 ?
@@ -679,9 +771,16 @@ int main(int argc, const char **argv) {
 vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
 parse_superframe_index(cx_pkt->data.frame.buf,
 cx_pkt->data.frame.sz, sizes, &count);
-for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
-layer_id.temporal_layer_id];
+// Note computing input_layer_frames here won't account for frame
+// drops in rate control stats.
+// TODO(marpan): Fix this for non-bypass mode so we can get stats
+// for dropped frames.
+if (svc_ctx.temporal_layering_mode !=
+VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
+layer_id.temporal_layer_id];
+}
 }
 for (tl = layer_id.temporal_layer_id;
 tl < enc_cfg.ts_number_layers; ++tl) {
@@ -772,6 +871,16 @@ int main(int argc, const char **argv) {
 pts += frame_duration;
 }
 }
// Compensate for the extra frame count for the bypass mode.
if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
const int layer = sl * enc_cfg.ts_number_layers +
layer_id.temporal_layer_id;
--rc.layer_input_frames[layer];
}
}
printf("Processed %d frames\n", frame_cnt); printf("Processed %d frames\n", frame_cnt);
fclose(infile); fclose(infile);
#if OUTPUT_RC_STATS #if OUTPUT_RC_STATS


@@ -684,14 +684,14 @@ int main(int argc, char **argv) {
 if (strncmp(encoder->name, "vp8", 3) == 0) {
 vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
 vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
-vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
 } else if (strncmp(encoder->name, "vp9", 3) == 0) {
 vpx_svc_extra_cfg_t svc_params;
 vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
 vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
 vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
 vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
-vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
 vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
 vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
 if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0))

libs.mk

@@ -260,7 +260,7 @@ OBJS-yes += $(LIBVPX_OBJS)
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
-SO_VERSION_MAJOR := 2
+SO_VERSION_MAJOR := 3
 SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
@@ -429,12 +429,10 @@ testdata:: $(LIBVPX_TEST_DATA)
 if [ -n "$${sha1sum}" ]; then\
 set -e;\
 echo "Checking test data:";\
-if [ -n "$(LIBVPX_TEST_DATA)" ]; then\
-for f in $(call enabled,LIBVPX_TEST_DATA); do\
-grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-(cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
-done; \
-fi; \
+for f in $(call enabled,LIBVPX_TEST_DATA); do\
+grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
+(cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
+done; \
 else\
 echo "Skipping test data integrity check, sha1sum not found.";\
 fi


@@ -0,0 +1,127 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <algorithm>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/util.h"
#include "test/y4m_video_source.h"
namespace {
// Check if any pixel in a 16x16 macroblock varies between frames.
int CheckMb(const vpx_image_t &current, const vpx_image_t &previous,
int mb_r, int mb_c) {
for (int plane = 0; plane < 3; plane++) {
int r = 16 * mb_r;
int c0 = 16 * mb_c;
int r_top = std::min(r + 16, static_cast<int>(current.d_h));
int c_top = std::min(c0 + 16, static_cast<int>(current.d_w));
r = std::max(r, 0);
c0 = std::max(c0, 0);
if (plane > 0 && current.x_chroma_shift) {
c_top = (c_top + 1) >> 1;
c0 >>= 1;
}
if (plane > 0 && current.y_chroma_shift) {
r_top = (r_top + 1) >> 1;
r >>= 1;
}
for (; r < r_top; ++r) {
for (int c = c0; c < c_top; ++c) {
if (current.planes[plane][current.stride[plane] * r + c] !=
previous.planes[plane][previous.stride[plane] * r + c])
return 1;
}
}
}
return 0;
}
void GenerateMap(int mb_rows, int mb_cols, const vpx_image_t &current,
const vpx_image_t &previous, uint8_t *map) {
for (int mb_r = 0; mb_r < mb_rows; ++mb_r) {
for (int mb_c = 0; mb_c < mb_cols; ++mb_c) {
map[mb_r * mb_cols + mb_c] = CheckMb(current, previous, mb_r, mb_c);
}
}
}
const int kAqModeCyclicRefresh = 3;
class ActiveMapRefreshTest
: public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
protected:
ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {}
virtual ~ActiveMapRefreshTest() {}
virtual void SetUp() {
InitializeConfig();
SetMode(GET_PARAM(1));
cpu_used_ = GET_PARAM(2);
}
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
::libvpx_test::Y4mVideoSource *y4m_video =
static_cast<libvpx_test::Y4mVideoSource *>(video);
if (video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
encoder->Control(VP9E_SET_AQ_MODE, kAqModeCyclicRefresh);
} else if (video->frame() >= 2 && video->img()) {
vpx_image_t *current = video->img();
vpx_image_t *previous = y4m_holder_->img();
ASSERT_TRUE(previous != NULL);
vpx_active_map_t map = vpx_active_map_t();
const int width = static_cast<int>(current->d_w);
const int height = static_cast<int>(current->d_h);
const int mb_width = (width + 15) / 16;
const int mb_height = (height + 15) / 16;
uint8_t *active_map = new uint8_t[mb_width * mb_height];
GenerateMap(mb_height, mb_width, *current, *previous, active_map);
map.cols = mb_width;
map.rows = mb_height;
map.active_map = active_map;
encoder->Control(VP8E_SET_ACTIVEMAP, &map);
delete[] active_map;
}
if (video->img()) {
y4m_video->SwapBuffers(y4m_holder_);
}
}
int cpu_used_;
::libvpx_test::Y4mVideoSource *y4m_holder_;
};
TEST_P(ActiveMapRefreshTest, Test) {
cfg_.g_lag_in_frames = 0;
cfg_.g_profile = 1;
cfg_.rc_target_bitrate = 600;
cfg_.rc_resize_allowed = 0;
cfg_.rc_min_quantizer = 8;
cfg_.rc_max_quantizer = 30;
cfg_.g_pass = VPX_RC_ONE_PASS;
cfg_.rc_end_usage = VPX_CBR;
cfg_.kf_max_dist = 90000;
::libvpx_test::Y4mVideoSource video("desktop_credits.y4m", 0, 30);
::libvpx_test::Y4mVideoSource video_holder("desktop_credits.y4m", 0, 30);
video_holder.Begin();
y4m_holder_ = &video_holder;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
VP9_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest,
::testing::Values(::libvpx_test::kRealTime),
::testing::Range(5, 6));
} // namespace


@@ -15,9 +15,7 @@
#include "third_party/googletest/src/include/gtest/gtest.h" #include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h" #include "./vpx_config.h"
#if CONFIG_VP9_ENCODER #include "./vpx_dsp_rtcd.h"
#include "./vp9_rtcd.h"
#endif
#include "test/acm_random.h" #include "test/acm_random.h"
#include "test/clear_system_state.h" #include "test/clear_system_state.h"
@@ -194,6 +192,48 @@ class IntProColTest
int16_t sum_c_; int16_t sum_c_;
}; };
typedef int (*SatdFunc)(const int16_t *coeffs, int length);
typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
class SatdTest
: public ::testing::Test,
public ::testing::WithParamInterface<SatdTestParam> {
protected:
virtual void SetUp() {
satd_size_ = GET_PARAM(0);
satd_func_ = GET_PARAM(1);
rnd_.Reset(ACMRandom::DeterministicSeed());
src_ = reinterpret_cast<int16_t*>(
vpx_memalign(16, sizeof(*src_) * satd_size_));
ASSERT_TRUE(src_ != NULL);
}
virtual void TearDown() {
libvpx_test::ClearSystemState();
vpx_free(src_);
}
void FillConstant(const int16_t val) {
for (int i = 0; i < satd_size_; ++i) src_[i] = val;
}
void FillRandom() {
for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
}
void Check(const int expected) {
int total;
ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_));
EXPECT_EQ(expected, total);
}
int satd_size_;
private:
int16_t *src_;
SatdFunc satd_func_;
ACMRandom rnd_;
};
 uint8_t* AverageTestBase::source_data_ = NULL;
@@ -246,69 +286,126 @@ TEST_P(IntProColTest, Random) {
 RunComparison();
 }
TEST_P(SatdTest, MinValue) {
const int kMin = -32640;
const int expected = -kMin * satd_size_;
FillConstant(kMin);
Check(expected);
}
TEST_P(SatdTest, MaxValue) {
const int kMax = 32640;
const int expected = kMax * satd_size_;
FillConstant(kMax);
Check(expected);
}
TEST_P(SatdTest, Random) {
int expected;
switch (satd_size_) {
case 16: expected = 205298; break;
case 64: expected = 1113950; break;
case 256: expected = 4268415; break;
case 1024: expected = 16954082; break;
default:
FAIL() << "Invalid satd size (" << satd_size_
<< ") valid: 16/64/256/1024";
}
FillRandom();
Check(expected);
}
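For illustration only, the constant-input expectations above imply that vpx_satd() reduces to a sum of absolute coefficient values, e.g.:

/* satd_size_ = 64, FillConstant(-32640):
 * expected = -kMin * satd_size_ = 32640 * 64 = 2088960. */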
 using std::tr1::make_tuple;
 INSTANTIATE_TEST_CASE_P(
 C, AverageTest,
 ::testing::Values(
-make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
-make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
+make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
+make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
INSTANTIATE_TEST_CASE_P(
C, SatdTest,
::testing::Values(
make_tuple(16, &vpx_satd_c),
make_tuple(64, &vpx_satd_c),
make_tuple(256, &vpx_satd_c),
make_tuple(1024, &vpx_satd_c)));
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
 SSE2, AverageTest,
 ::testing::Values(
-make_tuple(16, 16, 0, 8, &vp9_avg_8x8_sse2),
-make_tuple(16, 16, 5, 8, &vp9_avg_8x8_sse2),
-make_tuple(32, 32, 15, 8, &vp9_avg_8x8_sse2),
-make_tuple(16, 16, 0, 4, &vp9_avg_4x4_sse2),
-make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
-make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));
+make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2),
+make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2),
+make_tuple(32, 32, 15, 8, &vpx_avg_8x8_sse2),
+make_tuple(16, 16, 0, 4, &vpx_avg_4x4_sse2),
+make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2),
+make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2)));
 INSTANTIATE_TEST_CASE_P(
 SSE2, IntProRowTest, ::testing::Values(
-make_tuple(16, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
-make_tuple(32, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
-make_tuple(64, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c)));
+make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+make_tuple(64, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c)));
 INSTANTIATE_TEST_CASE_P(
 SSE2, IntProColTest, ::testing::Values(
-make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
-make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
-make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
+make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c)));
INSTANTIATE_TEST_CASE_P(
SSE2, SatdTest,
::testing::Values(
make_tuple(16, &vpx_satd_sse2),
make_tuple(64, &vpx_satd_sse2),
make_tuple(256, &vpx_satd_sse2),
make_tuple(1024, &vpx_satd_sse2)));
 #endif
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
 NEON, AverageTest,
 ::testing::Values(
-make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
-make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
-make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon),
+make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon),
+make_tuple(32, 32, 15, 8, &vpx_avg_8x8_neon),
+make_tuple(16, 16, 0, 4, &vpx_avg_4x4_neon),
+make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
+make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));
 INSTANTIATE_TEST_CASE_P(
 NEON, IntProRowTest, ::testing::Values(
-make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
-make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
-make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
+make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+make_tuple(64, &vpx_int_pro_row_neon, &vpx_int_pro_row_c)));
 INSTANTIATE_TEST_CASE_P(
 NEON, IntProColTest, ::testing::Values(
-make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
-make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
-make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c)));
INSTANTIATE_TEST_CASE_P(
NEON, SatdTest,
::testing::Values(
make_tuple(16, &vpx_satd_neon),
make_tuple(64, &vpx_satd_neon),
make_tuple(256, &vpx_satd_neon),
make_tuple(1024, &vpx_satd_neon)));
 #endif
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
 MSA, AverageTest,
 ::testing::Values(
-make_tuple(16, 16, 0, 8, &vp9_avg_8x8_msa),
-make_tuple(16, 16, 5, 8, &vp9_avg_8x8_msa),
-make_tuple(32, 32, 15, 8, &vp9_avg_8x8_msa),
-make_tuple(16, 16, 0, 4, &vp9_avg_4x4_msa),
-make_tuple(16, 16, 5, 4, &vp9_avg_4x4_msa),
-make_tuple(32, 32, 15, 4, &vp9_avg_4x4_msa)));
+make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa),
+make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa),
+make_tuple(32, 32, 15, 8, &vpx_avg_8x8_msa),
+make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa),
+make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
+make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
 #endif
 } // namespace


@@ -960,511 +960,72 @@ TEST_P(ConvolveTest, CheckScalingFiltering) {
 using std::tr1::make_tuple;
 #if CONFIG_VP9_HIGHBITDEPTH
#define WRAP(func, bd) \
void wrap_ ## func ## _ ## bd(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, \
int filter_x_stride, \
const int16_t *filter_y, \
int filter_y_stride, \
int w, int h) { \
vpx_highbd_ ## func(src, src_stride, dst, dst_stride, filter_x, \
filter_x_stride, filter_y, filter_y_stride, \
w, h, bd); \
}
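For illustration only, one expansion of the WRAP macro above, WRAP(convolve8_horiz_sse2, 8), is equivalent to (whitespace aside):

void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int filter_x_stride,
                                 const int16_t *filter_y, int filter_y_stride,
                                 int w, int h) {
  vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
                                  filter_x_stride, filter_y, filter_y_stride,
                                  w, h, 8);
}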
 #if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_USE_X86INC
+WRAP(convolve_copy_sse2, 8)
+WRAP(convolve_avg_sse2, 8)
+WRAP(convolve_copy_sse2, 10)
+WRAP(convolve_avg_sse2, 10)
+WRAP(convolve_copy_sse2, 12)
+WRAP(convolve_avg_sse2, 12)
+#endif // CONFIG_USE_X86INC
+WRAP(convolve8_horiz_sse2, 8)
+WRAP(convolve8_avg_horiz_sse2, 8)
+WRAP(convolve8_vert_sse2, 8)
+WRAP(convolve8_avg_vert_sse2, 8)
+WRAP(convolve8_sse2, 8)
+WRAP(convolve8_avg_sse2, 8)
+WRAP(convolve8_horiz_sse2, 10)
+WRAP(convolve8_avg_horiz_sse2, 10)
+WRAP(convolve8_vert_sse2, 10)
+WRAP(convolve8_avg_vert_sse2, 10)
+WRAP(convolve8_sse2, 10)
+WRAP(convolve8_avg_sse2, 10)
+WRAP(convolve8_horiz_sse2, 12)
+WRAP(convolve8_avg_horiz_sse2, 12)
+WRAP(convolve8_vert_sse2, 12)
+WRAP(convolve8_avg_vert_sse2, 12)
+WRAP(convolve8_sse2, 12)
+WRAP(convolve8_avg_sse2, 12)
void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
filter_x_stride, filter_y, filter_y_stride,
w, h, 8);
}
void wrap_convolve8_avg_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_avg_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_avg_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_avg_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_avg_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_avg_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_avg_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_avg_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
 #endif // HAVE_SSE2 && ARCH_X86_64
+WRAP(convolve_copy_c, 8)
+WRAP(convolve_avg_c, 8)
+WRAP(convolve8_horiz_c, 8)
+WRAP(convolve8_avg_horiz_c, 8)
+WRAP(convolve8_vert_c, 8)
+WRAP(convolve8_avg_vert_c, 8)
+WRAP(convolve8_c, 8)
+WRAP(convolve8_avg_c, 8)
+WRAP(convolve_copy_c, 10)
+WRAP(convolve_avg_c, 10)
+WRAP(convolve8_horiz_c, 10)
+WRAP(convolve8_avg_horiz_c, 10)
+WRAP(convolve8_vert_c, 10)
+WRAP(convolve8_avg_vert_c, 10)
+WRAP(convolve8_c, 10)
+WRAP(convolve8_avg_c, 10)
+WRAP(convolve_copy_c, 12)
+WRAP(convolve_avg_c, 12)
+WRAP(convolve8_horiz_c, 12)
+WRAP(convolve8_avg_horiz_c, 12)
+WRAP(convolve8_vert_c, 12)
+WRAP(convolve8_avg_vert_c, 12)
+WRAP(convolve8_c, 12)
+WRAP(convolve8_avg_c, 12)
+#undef WRAP
void wrap_convolve_copy_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_avg_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_avg_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
void wrap_convolve_copy_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_avg_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_avg_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
void wrap_convolve_copy_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_avg_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_avg_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x,
int filter_x_stride,
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
 const ConvolveFunctions convolve8_c(
 wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
@@ -1563,7 +1124,11 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_sse2(
+#if CONFIG_USE_X86INC
+wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
+#else
 wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
+#endif // CONFIG_USE_X86INC
 wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
 wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
 wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
@@ -1571,7 +1136,11 @@ const ConvolveFunctions convolve8_sse2(
 wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
 wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
 const ConvolveFunctions convolve10_sse2(
+#if CONFIG_USE_X86INC
+wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
+#else
 wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+#endif // CONFIG_USE_X86INC
 wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
 wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
 wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
@@ -1579,7 +1148,11 @@ const ConvolveFunctions convolve10_sse2(
 wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
 wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
 const ConvolveFunctions convolve12_sse2(
+#if CONFIG_USE_X86INC
+wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
+#else
 wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+#endif // CONFIG_USE_X86INC
 wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
 wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
 wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,


@@ -538,7 +538,7 @@ TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
<< " The first dropped frame for drop_thresh " << i << " The first dropped frame for drop_thresh " << i
<< " > first dropped frame for drop_thresh " << " > first dropped frame for drop_thresh "
<< i - kDropFrameThreshTestStep; << i - kDropFrameThreshTestStep;
ASSERT_GE(num_drops_, last_num_drops * 0.90) ASSERT_GE(num_drops_, last_num_drops * 0.85)
<< " The number of dropped frames for drop_thresh " << i << " The number of dropped frames for drop_thresh " << i
<< " < number of dropped frames for drop_thresh " << " < number of dropped frames for drop_thresh "
<< i - kDropFrameThreshTestStep; << i - kDropFrameThreshTestStep;
@@ -770,7 +770,7 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest,
::libvpx_test::Encoder *encoder) { ::libvpx_test::Encoder *encoder) {
if (video->frame() == 0) { if (video->frame() == 0) {
int i; int i;
for (i = 0; i < 2; ++i) { for (i = 0; i < VPX_MAX_LAYERS; ++i) {
svc_params_.max_quantizers[i] = 63; svc_params_.max_quantizers[i] = 63;
svc_params_.min_quantizers[i] = 0; svc_params_.min_quantizers[i] = 0;
} }


@@ -124,6 +124,11 @@ class Encoder {
 ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
 }
+void Control(int ctrl_id, int *arg) {
+const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+}
 void Control(int ctrl_id, struct vpx_scaling_mode *arg) {
 const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
 ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();


@@ -1,406 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
namespace {
using libvpx_test::ACMRandom;
class IntraPredBase {
public:
virtual ~IntraPredBase() { libvpx_test::ClearSystemState(); }
protected:
void SetupMacroblock(MACROBLOCKD *mbptr,
MODE_INFO *miptr,
uint8_t *data,
int block_size,
int stride,
int num_planes) {
mbptr_ = mbptr;
miptr_ = miptr;
mbptr_->up_available = 1;
mbptr_->left_available = 1;
mbptr_->mode_info_context = miptr_;
stride_ = stride;
block_size_ = block_size;
num_planes_ = num_planes;
for (int p = 0; p < num_planes; p++)
data_ptr_[p] = data + stride * (block_size + 1) * p +
stride + block_size;
}
void FillRandom() {
// Fill edges with random data
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int p = 0; p < num_planes_; p++) {
for (int x = -1 ; x <= block_size_; x++)
data_ptr_[p][x - stride_] = rnd.Rand8();
for (int y = 0; y < block_size_; y++)
data_ptr_[p][y * stride_ - 1] = rnd.Rand8();
}
}
virtual void Predict(MB_PREDICTION_MODE mode) = 0;
void SetLeftUnavailable() {
mbptr_->left_available = 0;
for (int p = 0; p < num_planes_; p++)
for (int i = -1; i < block_size_; ++i)
data_ptr_[p][stride_ * i - 1] = 129;
}
void SetTopUnavailable() {
mbptr_->up_available = 0;
for (int p = 0; p < num_planes_; p++)
memset(&data_ptr_[p][-1 - stride_], 127, block_size_ + 2);
}
void SetTopLeftUnavailable() {
SetLeftUnavailable();
SetTopUnavailable();
}
int BlockSizeLog2Min1() const {
switch (block_size_) {
case 16:
return 3;
case 8:
return 2;
default:
return 0;
}
}
// check DC prediction output against a reference
void CheckDCPrediction() const {
for (int p = 0; p < num_planes_; p++) {
// calculate expected DC
int expected;
if (mbptr_->up_available || mbptr_->left_available) {
int sum = 0, shift = BlockSizeLog2Min1() + mbptr_->up_available +
mbptr_->left_available;
if (mbptr_->up_available)
for (int x = 0; x < block_size_; x++)
sum += data_ptr_[p][x - stride_];
if (mbptr_->left_available)
for (int y = 0; y < block_size_; y++)
sum += data_ptr_[p][y * stride_ - 1];
expected = (sum + (1 << (shift - 1))) >> shift;
} else {
expected = 0x80;
}
// check that all subsequent lines are equal to the first
for (int y = 1; y < block_size_; ++y)
ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],
block_size_));
// within the first line, ensure that each pixel has the same value
for (int x = 1; x < block_size_; ++x)
ASSERT_EQ(data_ptr_[p][0], data_ptr_[p][x]);
// now ensure that that pixel has the expected (DC) value
ASSERT_EQ(expected, data_ptr_[p][0]);
}
}
// check V prediction output against a reference
void CheckVPrediction() const {
// check that all lines equal the top border
for (int p = 0; p < num_planes_; p++)
for (int y = 0; y < block_size_; y++)
ASSERT_EQ(0, memcmp(&data_ptr_[p][-stride_],
&data_ptr_[p][y * stride_], block_size_));
}
// check H prediction output against a reference
void CheckHPrediction() const {
// for each line, ensure that each pixel is equal to the left border
for (int p = 0; p < num_planes_; p++)
for (int y = 0; y < block_size_; y++)
for (int x = 0; x < block_size_; x++)
ASSERT_EQ(data_ptr_[p][-1 + y * stride_],
data_ptr_[p][x + y * stride_]);
}
static int ClipByte(int value) {
if (value > 255)
return 255;
else if (value < 0)
return 0;
return value;
}
// check TM prediction output against a reference
void CheckTMPrediction() const {
for (int p = 0; p < num_planes_; p++)
for (int y = 0; y < block_size_; y++)
for (int x = 0; x < block_size_; x++) {
const int expected = ClipByte(data_ptr_[p][x - stride_]
+ data_ptr_[p][stride_ * y - 1]
- data_ptr_[p][-1 - stride_]);
ASSERT_EQ(expected, data_ptr_[p][y * stride_ + x]);
}
}
// Actual test
void RunTest() {
{
SCOPED_TRACE("DC_PRED");
FillRandom();
Predict(DC_PRED);
CheckDCPrediction();
}
{
SCOPED_TRACE("DC_PRED LEFT");
FillRandom();
SetLeftUnavailable();
Predict(DC_PRED);
CheckDCPrediction();
}
{
SCOPED_TRACE("DC_PRED TOP");
FillRandom();
SetTopUnavailable();
Predict(DC_PRED);
CheckDCPrediction();
}
{
SCOPED_TRACE("DC_PRED TOP_LEFT");
FillRandom();
SetTopLeftUnavailable();
Predict(DC_PRED);
CheckDCPrediction();
}
{
SCOPED_TRACE("H_PRED");
FillRandom();
Predict(H_PRED);
CheckHPrediction();
}
{
SCOPED_TRACE("V_PRED");
FillRandom();
Predict(V_PRED);
CheckVPrediction();
}
{
SCOPED_TRACE("TM_PRED");
FillRandom();
Predict(TM_PRED);
CheckTMPrediction();
}
}
MACROBLOCKD *mbptr_;
MODE_INFO *miptr_;
uint8_t *data_ptr_[2]; // in the case of Y, only [0] is used
int stride_;
int block_size_;
int num_planes_;
};
typedef void (*IntraPredYFunc)(MACROBLOCKD *x,
uint8_t *yabove_row,
uint8_t *yleft,
int left_stride,
uint8_t *ypred_ptr,
int y_stride);
class IntraPredYTest
: public IntraPredBase,
public ::testing::TestWithParam<IntraPredYFunc> {
public:
static void SetUpTestCase() {
mb_ = reinterpret_cast<MACROBLOCKD*>(
vpx_memalign(32, sizeof(MACROBLOCKD)));
mi_ = reinterpret_cast<MODE_INFO*>(
vpx_memalign(32, sizeof(MODE_INFO)));
data_array_ = reinterpret_cast<uint8_t*>(
vpx_memalign(kDataAlignment, kDataBufferSize));
}
static void TearDownTestCase() {
vpx_free(data_array_);
vpx_free(mi_);
vpx_free(mb_);
data_array_ = NULL;
}
protected:
static const int kBlockSize = 16;
static const int kDataAlignment = 16;
static const int kStride = kBlockSize * 3;
// We use 48 so that the data pointer of the first pixel in each row of
// each macroblock is 16-byte aligned, and this gives us access to the
// top-left and top-right corner pixels belonging to the top-left/right
// macroblocks.
// We use 17 lines so we have one line above us for top-prediction.
static const int kDataBufferSize = kStride * (kBlockSize + 1);
virtual void SetUp() {
pred_fn_ = GetParam();
SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 1);
}
virtual void Predict(MB_PREDICTION_MODE mode) {
mbptr_->mode_info_context->mbmi.mode = mode;
ASM_REGISTER_STATE_CHECK(pred_fn_(mbptr_,
data_ptr_[0] - kStride,
data_ptr_[0] - 1, kStride,
data_ptr_[0], kStride));
}
IntraPredYFunc pred_fn_;
static uint8_t* data_array_;
static MACROBLOCKD * mb_;
static MODE_INFO *mi_;
};
MACROBLOCKD* IntraPredYTest::mb_ = NULL;
MODE_INFO* IntraPredYTest::mi_ = NULL;
uint8_t* IntraPredYTest::data_array_ = NULL;
TEST_P(IntraPredYTest, IntraPredTests) {
RunTest();
}
INSTANTIATE_TEST_CASE_P(C, IntraPredYTest,
::testing::Values(
vp8_build_intra_predictors_mby_s_c));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, IntraPredYTest,
::testing::Values(
vp8_build_intra_predictors_mby_s_sse2));
#endif
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest,
::testing::Values(
vp8_build_intra_predictors_mby_s_ssse3));
#endif
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest,
::testing::Values(
vp8_build_intra_predictors_mby_s_neon));
#endif
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(MSA, IntraPredYTest,
::testing::Values(
vp8_build_intra_predictors_mby_s_msa));
#endif
typedef void (*IntraPredUvFunc)(MACROBLOCKD *x,
uint8_t *uabove_row,
uint8_t *vabove_row,
uint8_t *uleft,
uint8_t *vleft,
int left_stride,
uint8_t *upred_ptr,
uint8_t *vpred_ptr,
int pred_stride);
class IntraPredUVTest
: public IntraPredBase,
public ::testing::TestWithParam<IntraPredUvFunc> {
public:
static void SetUpTestCase() {
mb_ = reinterpret_cast<MACROBLOCKD*>(
vpx_memalign(32, sizeof(MACROBLOCKD)));
mi_ = reinterpret_cast<MODE_INFO*>(
vpx_memalign(32, sizeof(MODE_INFO)));
data_array_ = reinterpret_cast<uint8_t*>(
vpx_memalign(kDataAlignment, kDataBufferSize));
}
static void TearDownTestCase() {
vpx_free(data_array_);
vpx_free(mi_);
vpx_free(mb_);
data_array_ = NULL;
}
protected:
static const int kBlockSize = 8;
static const int kDataAlignment = 8;
static const int kStride = kBlockSize * 3;
// We use 24 so that the data pointer of the first pixel in each row of
// each macroblock is 8-byte aligned, and this gives us access to the
// top-left and top-right corner pixels belonging to the top-left/right
// macroblocks.
// We use 9 lines so we have one line above us for top-prediction.
// [0] = U, [1] = V
static const int kDataBufferSize = 2 * kStride * (kBlockSize + 1);
virtual void SetUp() {
pred_fn_ = GetParam();
SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 2);
}
virtual void Predict(MB_PREDICTION_MODE mode) {
mbptr_->mode_info_context->mbmi.uv_mode = mode;
pred_fn_(mbptr_, data_ptr_[0] - kStride, data_ptr_[1] - kStride,
data_ptr_[0] - 1, data_ptr_[1] - 1, kStride,
data_ptr_[0], data_ptr_[1], kStride);
}
IntraPredUvFunc pred_fn_;
// We use 24 so that the data pointer of the first pixel in each row of
// each macroblock is 8-byte aligned, and this gives us access to the
// top-left and top-right corner pixels belonging to the top-left/right
// macroblocks.
// We use 9 lines so we have one line above us for top-prediction.
// [0] = U, [1] = V
static uint8_t* data_array_;
static MACROBLOCKD* mb_;
static MODE_INFO* mi_;
};
MACROBLOCKD* IntraPredUVTest::mb_ = NULL;
MODE_INFO* IntraPredUVTest::mi_ = NULL;
uint8_t* IntraPredUVTest::data_array_ = NULL;
TEST_P(IntraPredUVTest, IntraPredTests) {
RunTest();
}
INSTANTIATE_TEST_CASE_P(C, IntraPredUVTest,
::testing::Values(
vp8_build_intra_predictors_mbuv_s_c));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, IntraPredUVTest,
::testing::Values(
vp8_build_intra_predictors_mbuv_s_sse2));
#endif
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest,
::testing::Values(
vp8_build_intra_predictors_mbuv_s_ssse3));
#endif
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest,
::testing::Values(
vp8_build_intra_predictors_mbuv_s_neon));
#endif
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(MSA, IntraPredUVTest,
::testing::Values(
vp8_build_intra_predictors_mbuv_s_msa));
#endif
} // namespace


@@ -63,9 +63,22 @@ class InvalidFileTest
 EXPECT_NE(res, EOF) << "Read result data failed";
 // Check results match.
-EXPECT_EQ(expected_res_dec, res_dec)
-<< "Results don't match: frame number = " << video.frame_number()
-<< ". (" << decoder->DecodeError() << ")";
+const DecodeParam input = GET_PARAM(1);
+if (input.threads > 1) {
+// The serial decode check is too strict for tile-threaded decoding as
+// there is no guarantee on the decode order nor which specific error
+// will take precedence. Currently a tile-level error is not forwarded so
+// the frame will simply be marked corrupt.
+EXPECT_TRUE(res_dec == expected_res_dec ||
+res_dec == VPX_CODEC_CORRUPT_FRAME)
+<< "Results don't match: frame number = " << video.frame_number()
+<< ". (" << decoder->DecodeError() << "). Expected: "
+<< expected_res_dec << " or " << VPX_CODEC_CORRUPT_FRAME;
+} else {
+EXPECT_EQ(expected_res_dec, res_dec)
+<< "Results don't match: frame number = " << video.frame_number()
+<< ". (" << decoder->DecodeError() << ")";
+}
 return !HasFailure();
 }


@@ -30,7 +30,9 @@
 #if defined(_WIN64)
-#define _WIN32_LEAN_AND_MEAN
+#undef NOMINMAX
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #include <winnt.h>


@@ -81,6 +81,15 @@ static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
 const unsigned int kInitialWidth = 320;
 const unsigned int kInitialHeight = 240;
+struct FrameInfo {
+FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
+: pts(_pts), w(_w), h(_h) {}
+vpx_codec_pts_t pts;
+unsigned int w;
+unsigned int h;
+};
 unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
 if (frame < 10)
 return val;
@@ -120,15 +129,6 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
 virtual ~ResizeTest() {}
-struct FrameInfo {
-FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
-: pts(_pts), w(_w), h(_h) {}
-vpx_codec_pts_t pts;
-unsigned int w;
-unsigned int h;
-};
 virtual void SetUp() {
 InitializeConfig();
 SetMode(GET_PARAM(1));
@@ -196,13 +196,27 @@ class ResizeInternalTest : public ResizeTest {
 virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
 libvpx_test::Encoder *encoder) {
-if (video->frame() == kStepDownFrame) {
-struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
-encoder->Control(VP8E_SET_SCALEMODE, &mode);
-}
-if (video->frame() == kStepUpFrame) {
-struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
-encoder->Control(VP8E_SET_SCALEMODE, &mode);
-}
+if (change_config_) {
+int new_q = 60;
+if (video->frame() == 0) {
+struct vpx_scaling_mode mode = {VP8E_ONETWO, VP8E_ONETWO};
+encoder->Control(VP8E_SET_SCALEMODE, &mode);
+}
+if (video->frame() == 1) {
+struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
+encoder->Control(VP8E_SET_SCALEMODE, &mode);
+cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = new_q;
+encoder->Config(&cfg_);
+}
+} else {
+if (video->frame() == kStepDownFrame) {
+struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
+encoder->Control(VP8E_SET_SCALEMODE, &mode);
+}
+if (video->frame() == kStepUpFrame) {
+struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
+encoder->Control(VP8E_SET_SCALEMODE, &mode);
+}
+}
 }
@@ -227,6 +241,7 @@ class ResizeInternalTest : public ResizeTest {
 #endif
 double frame0_psnr_;
+bool change_config_;
 #if WRITE_COMPRESSED_STREAM
 FILE *outfile_;
 unsigned int out_frames_;
@@ -237,6 +252,7 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
 ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
 30, 1, 0, 10);
 init_flags_ = VPX_CODEC_USE_PSNR;
+change_config_ = false;
 // q picked such that initial keyframe on this clip is ~30dB PSNR
 cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
@@ -261,6 +277,164 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
}
}
TEST_P(ResizeInternalTest, TestInternalResizeChangeConfig) {
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 10);
cfg_.g_w = 352;
cfg_.g_h = 288;
change_config_ = true;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
class ResizeRealtimeTest : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
protected:
ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
virtual ~ResizeRealtimeTest() {}
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 0) {
encoder->Control(VP9E_SET_AQ_MODE, 3);
encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
}
if (change_bitrate_ && video->frame() == 120) {
change_bitrate_ = false;
cfg_.rc_target_bitrate = 500;
encoder->Config(&cfg_);
}
}
virtual void SetUp() {
InitializeConfig();
SetMode(GET_PARAM(1));
set_cpu_used_ = GET_PARAM(2);
}
virtual void DecompressedFrameHook(const vpx_image_t &img,
vpx_codec_pts_t pts) {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
}
void DefaultConfig() {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600;
cfg_.rc_buf_sz = 1000;
cfg_.rc_min_quantizer = 2;
cfg_.rc_max_quantizer = 56;
cfg_.rc_undershoot_pct = 50;
cfg_.rc_overshoot_pct = 50;
cfg_.rc_end_usage = VPX_CBR;
cfg_.kf_mode = VPX_KF_AUTO;
cfg_.g_lag_in_frames = 0;
cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
// Enable dropped frames.
cfg_.rc_dropframe_thresh = 1;
// Enable error_resilience mode.
cfg_.g_error_resilient = 1;
// Enable dynamic resizing.
cfg_.rc_resize_allowed = 1;
// Run at low bitrate.
cfg_.rc_target_bitrate = 200;
}
std::vector< FrameInfo > frame_info_list_;
int set_cpu_used_;
bool change_bitrate_;
};
TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
ResizingVideoSource video;
DefaultConfig();
change_bitrate_ = false;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
const unsigned int frame = static_cast<unsigned>(info->pts);
const unsigned int expected_w = ScaleForFrameNumber(frame, kInitialWidth);
const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
EXPECT_EQ(expected_w, info->w)
<< "Frame " << frame << " had unexpected width";
EXPECT_EQ(expected_h, info->h)
<< "Frame " << frame << " had unexpected height";
}
}
// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
// Run at low bitrate, with resize_allowed = 1, and verify that we get
// one resize down event.
TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 299);
DefaultConfig();
cfg_.g_w = 352;
cfg_.g_h = 288;
change_bitrate_ = false;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
unsigned int last_w = cfg_.g_w;
unsigned int last_h = cfg_.g_h;
int resize_count = 0;
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
if (info->w != last_w || info->h != last_h) {
// Verify that resize down occurs.
ASSERT_LT(info->w, last_w);
ASSERT_LT(info->h, last_h);
last_w = info->w;
last_h = info->h;
resize_count++;
}
}
// Verify that we get 1 resize down event in this test.
ASSERT_EQ(1, resize_count) << "Resizing should occur.";
}
// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
// Start at low target bitrate, raise the bitrate in the middle of the clip,
// scaling-up should occur after bitrate changed.
TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 359);
DefaultConfig();
cfg_.g_w = 352;
cfg_.g_h = 288;
change_bitrate_ = true;
// Disable dropped frames.
cfg_.rc_dropframe_thresh = 0;
// Starting bitrate low.
cfg_.rc_target_bitrate = 80;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
unsigned int last_w = cfg_.g_w;
unsigned int last_h = cfg_.g_h;
int resize_count = 0;
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
if (info->w != last_w || info->h != last_h) {
resize_count++;
if (resize_count == 1) {
// Verify that resize down occurs.
ASSERT_LT(info->w, last_w);
ASSERT_LT(info->h, last_h);
} else if (resize_count == 2) {
// Verify that resize up occurs.
ASSERT_GT(info->w, last_w);
ASSERT_GT(info->h, last_h);
}
last_w = info->w;
last_h = info->h;
}
}
// Verify that we get 2 resize events in this test.
ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
}
vpx_img_fmt_t CspForFrameNumber(int frame) {
if (frame < 10)
return VPX_IMG_FMT_I420;
@@ -371,6 +545,9 @@ VP9_INSTANTIATE_TEST_CASE(ResizeTest,
::testing::Values(::libvpx_test::kRealTime));
VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
::testing::Values(::libvpx_test::kOnePassBest));
VP9_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
::testing::Values(::libvpx_test::kRealTime),
::testing::Range(5, 9));
VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
::testing::Values(::libvpx_test::kRealTime));
} // namespace
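Note: TestExternalResizeWorks above feeds the encoder from a ResizingVideoSource whose frame dimensions follow ScaleForFrameNumber(); that class is defined earlier in resize_test.cc and is not part of the hunks shown here. As a rough, hedged sketch only (assuming the DummyVideoSource helpers from test/video_source.h: SetSize, FillFrame, frame_, limit_; the limit of 60 frames is an assumption for this note), such a source looks approximately like this:

// Sketch: emits frames whose size changes with the frame number, so the
// DecompressedFrameHook can compare decoded sizes against ScaleForFrameNumber().
class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
 public:
  ResizingVideoSource() {
    SetSize(kInitialWidth, kInitialHeight);
    limit_ = 60;  // number of frames to produce (assumed for this sketch)
  }
  virtual ~ResizingVideoSource() {}

 protected:
  virtual void Next() {
    ++frame_;
    SetSize(ScaleForFrameNumber(frame_, kInitialWidth),
            ScaleForFrameNumber(frame_, kInitialHeight));
    FillFrame();
  }
};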

File diff suppressed because it is too large

View File

@@ -186,70 +186,48 @@ TEST_P(SixtapPredictTest, TestWithRandomData) {
using std::tr1::make_tuple;
-const SixtapPredictFunc sixtap_16x16_c = vp8_sixtap_predict16x16_c;
-const SixtapPredictFunc sixtap_8x8_c = vp8_sixtap_predict8x8_c;
-const SixtapPredictFunc sixtap_8x4_c = vp8_sixtap_predict8x4_c;
-const SixtapPredictFunc sixtap_4x4_c = vp8_sixtap_predict4x4_c;
INSTANTIATE_TEST_CASE_P(
C, SixtapPredictTest, ::testing::Values(
-make_tuple(16, 16, sixtap_16x16_c),
-make_tuple(8, 8, sixtap_8x8_c),
-make_tuple(8, 4, sixtap_8x4_c),
-make_tuple(4, 4, sixtap_4x4_c)));
make_tuple(16, 16, &vp8_sixtap_predict16x16_c),
make_tuple(8, 8, &vp8_sixtap_predict8x8_c),
make_tuple(8, 4, &vp8_sixtap_predict8x4_c),
make_tuple(4, 4, &vp8_sixtap_predict4x4_c)));
#if HAVE_NEON
-const SixtapPredictFunc sixtap_16x16_neon = vp8_sixtap_predict16x16_neon;
-const SixtapPredictFunc sixtap_8x8_neon = vp8_sixtap_predict8x8_neon;
-const SixtapPredictFunc sixtap_8x4_neon = vp8_sixtap_predict8x4_neon;
INSTANTIATE_TEST_CASE_P(
-DISABLED_NEON, SixtapPredictTest, ::testing::Values(
-make_tuple(16, 16, sixtap_16x16_neon),
-make_tuple(8, 8, sixtap_8x8_neon),
-make_tuple(8, 4, sixtap_8x4_neon)));
NEON, SixtapPredictTest, ::testing::Values(
make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),
make_tuple(8, 8, &vp8_sixtap_predict8x8_neon),
make_tuple(8, 4, &vp8_sixtap_predict8x4_neon)));
#endif
#if HAVE_MMX
-const SixtapPredictFunc sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx;
-const SixtapPredictFunc sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx;
-const SixtapPredictFunc sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx;
-const SixtapPredictFunc sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx;
INSTANTIATE_TEST_CASE_P(
MMX, SixtapPredictTest, ::testing::Values(
-make_tuple(16, 16, sixtap_16x16_mmx),
-make_tuple(8, 8, sixtap_8x8_mmx),
-make_tuple(8, 4, sixtap_8x4_mmx),
-make_tuple(4, 4, sixtap_4x4_mmx)));
make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
#endif
#if HAVE_SSE2
-const SixtapPredictFunc sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2;
-const SixtapPredictFunc sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2;
-const SixtapPredictFunc sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, SixtapPredictTest, ::testing::Values(
-make_tuple(16, 16, sixtap_16x16_sse2),
-make_tuple(8, 8, sixtap_8x8_sse2),
-make_tuple(8, 4, sixtap_8x4_sse2)));
make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2),
make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2),
make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2)));
#endif
#if HAVE_SSSE3
-const SixtapPredictFunc sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3;
-const SixtapPredictFunc sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3;
-const SixtapPredictFunc sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3;
-const SixtapPredictFunc sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3;
INSTANTIATE_TEST_CASE_P(
SSSE3, SixtapPredictTest, ::testing::Values(
-make_tuple(16, 16, sixtap_16x16_ssse3),
-make_tuple(8, 8, sixtap_8x8_ssse3),
-make_tuple(8, 4, sixtap_8x4_ssse3),
-make_tuple(4, 4, sixtap_4x4_ssse3)));
make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3),
make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3),
make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3),
make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3)));
#endif
#if HAVE_MSA
-const SixtapPredictFunc sixtap_16x16_msa = vp8_sixtap_predict16x16_msa;
-const SixtapPredictFunc sixtap_8x8_msa = vp8_sixtap_predict8x8_msa;
-const SixtapPredictFunc sixtap_8x4_msa = vp8_sixtap_predict8x4_msa;
-const SixtapPredictFunc sixtap_4x4_msa = vp8_sixtap_predict4x4_msa;
INSTANTIATE_TEST_CASE_P(
MSA, SixtapPredictTest, ::testing::Values(
-make_tuple(16, 16, sixtap_16x16_msa),
-make_tuple(8, 8, sixtap_8x8_msa),
-make_tuple(8, 4, sixtap_8x4_msa),
-make_tuple(4, 4, sixtap_4x4_msa)));
make_tuple(16, 16, &vp8_sixtap_predict16x16_msa),
make_tuple(8, 8, &vp8_sixtap_predict8x8_msa),
make_tuple(8, 4, &vp8_sixtap_predict8x4_msa),
make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
#endif
} // namespace

View File

@@ -16,8 +16,13 @@
namespace {
const int kTestMode = 0;
const int kSuperframeSyntax = 1;
typedef std::tr1::tuple<libvpx_test::TestMode,int> SuperframeTestParam;
class SuperframeTest : public ::libvpx_test::EncoderTest,
-public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
public ::libvpx_test::CodecTestWithParam<SuperframeTestParam> {
protected:
SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(NULL),
last_sf_pts_(0) {}
@@ -25,9 +30,13 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
virtual void SetUp() {
InitializeConfig();
-SetMode(GET_PARAM(1));
const SuperframeTestParam input = GET_PARAM(1);
const libvpx_test::TestMode mode = std::tr1::get<kTestMode>(input);
const int syntax = std::tr1::get<kSuperframeSyntax>(input);
SetMode(mode);
sf_count_ = 0;
sf_count_max_ = INT_MAX;
is_vp10_style_superframe_ = syntax;
}
virtual void TearDown() {
@@ -50,7 +59,8 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
const uint8_t marker = buffer[pkt->data.frame.sz - 1];
const int frames = (marker & 0x7) + 1;
const int mag = ((marker >> 3) & 3) + 1;
-const unsigned int index_sz = 2 + mag * frames;
const unsigned int index_sz =
2 + mag * (frames - is_vp10_style_superframe_);
if ((marker & 0xe0) == 0xc0 &&
pkt->data.frame.sz >= index_sz &&
buffer[pkt->data.frame.sz - index_sz] == marker) {
@@ -75,6 +85,7 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
return pkt;
}
int is_vp10_style_superframe_;
int sf_count_;
int sf_count_max_;
vpx_codec_cx_pkt_t modified_pkt_;
@@ -92,9 +103,11 @@ TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
EXPECT_EQ(sf_count_, 1);
}
-VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Values(
-::libvpx_test::kTwoPassGood));
VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
::testing::Values(::libvpx_test::kTwoPassGood),
::testing::Values(0)));
-VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Values(
-::libvpx_test::kTwoPassGood));
VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
::testing::Values(::libvpx_test::kTwoPassGood),
::testing::Values(CONFIG_MISC_FIXES)));
} // namespace
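Note: the index_sz change above follows the VP10 (CONFIG_MISC_FIXES) superframe syntax, in which the last frame's size is dropped from the trailing index. As a rough illustration of the layout the test checks, here is a minimal, self-contained sketch of a reader for that index. ParseSuperframeIndex is a hypothetical helper written for this note, not a libvpx function, and the vp10_style flag simply mirrors the is_vp10_style_superframe_ offset used above.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper: returns the frame sizes stored in a trailing superframe
// index, or an empty vector when no valid index is present. vp10_style == true
// mirrors the CONFIG_MISC_FIXES syntax, where the last frame's size is omitted
// from the index (hence "frames - 1" entries).
std::vector<size_t> ParseSuperframeIndex(const uint8_t *data, size_t data_sz,
                                         bool vp10_style) {
  std::vector<size_t> sizes;
  if (data_sz == 0) return sizes;
  const uint8_t marker = data[data_sz - 1];
  if ((marker & 0xe0) != 0xc0) return sizes;   // superframe marker is 110xxxxx
  const int frames = (marker & 0x7) + 1;       // frames in the superframe
  const int mag = ((marker >> 3) & 0x3) + 1;   // bytes used per frame size
  const int entries = frames - (vp10_style ? 1 : 0);
  const size_t index_sz = 2 + static_cast<size_t>(mag) * entries;
  if (data_sz < index_sz || data[data_sz - index_sz] != marker) return sizes;
  const uint8_t *x = data + data_sz - index_sz + 1;  // skip leading marker copy
  for (int i = 0; i < entries; ++i) {
    size_t sz = 0;
    for (int b = 0; b < mag; ++b) sz |= static_cast<size_t>(*x++) << (b * 8);
    sizes.push_back(sz);
  }
  return sizes;
}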

View File

@@ -18,6 +18,7 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
@@ -417,6 +418,18 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-130x132.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-130x132.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x130.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x130.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x132.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x132.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-178x180.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-178x180.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x178.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x178.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x180.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x180.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
@@ -641,6 +654,34 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-8.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-8.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-1.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-1.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-8.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-8.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-1.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-1.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-8.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-8.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-1.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-1.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
@@ -768,3 +809,53 @@ endif # CONFIG_ENCODE_PERF_TESTS
# sort and remove duplicates
LIBVPX_TEST_DATA-yes := $(sort $(LIBVPX_TEST_DATA-yes))
# VP9 dynamic resizing test (decoder)
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_3-4.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5

View File

@@ -743,3 +743,92 @@ d06285d109ecbaef63b0cbcc44d70a129186f51c *invalid-vp90-2-03-size-224x196.webm.iv
e60d859b0ef2b331b21740cf6cb83fabe469b079 *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf
0ae808dca4d3c1152a9576e14830b6faa39f1b4a *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res
9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
85771f6ab44e4a0226e206c0cde8351dd5918953 *vp90-2-02-size-130x132.webm
512dad5eabbed37b4bbbc64ce153f1a5484427b8 *vp90-2-02-size-130x132.webm.md5
01f7127d40360289db63b27f61cb9afcda350e95 *vp90-2-02-size-132x130.webm
4a94275328ae076cf60f966c097a8721010fbf5a *vp90-2-02-size-132x130.webm.md5
f41c0400b5716b4b70552c40dd03d44be131e1cc *vp90-2-02-size-132x132.webm
1a69e989f697e424bfe3e3e8a77bb0c0992c8e47 *vp90-2-02-size-132x132.webm.md5
94a5cbfacacba100e0c5f7861c72a1b417feca0f *vp90-2-02-size-178x180.webm
dedfecf1d784bcf70629592fa5e6f01d5441ccc9 *vp90-2-02-size-178x180.webm.md5
4828b62478c04014bba3095a83106911a71cf387 *vp90-2-02-size-180x178.webm
423da2b861050c969d78ed8e8f8f14045d1d8199 *vp90-2-02-size-180x178.webm.md5
338f7c9282f43e29940f5391118aadd17e4f9234 *vp90-2-02-size-180x180.webm
6c2ef013392310778dca5dd5351160eca66b0a60 *vp90-2-02-size-180x180.webm.md5
679fa7d6807e936ff937d7b282e7dbd8ac76447e *vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm
fc7267ab8fc2bf5d6c234e34ee6c078a967b4888 *vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm.md5
9d33a137c819792209c5ce4e4e1ee5da73d574fe *vp90-2-14-resize-10frames-fp-tiles-1-2.webm
0c78a154956a8605d050bdd75e0dcc4d39c040a6 *vp90-2-14-resize-10frames-fp-tiles-1-2.webm.md5
d6a8d8c57f66a91d23e8e7df480f9ae841e56c37 *vp90-2-14-resize-10frames-fp-tiles-1-4.webm
e9b4e8c7b33b5fda745d340c3f47e6623ae40cf2 *vp90-2-14-resize-10frames-fp-tiles-1-4.webm.md5
aa6fe043a0c4a42b49c87ebbe812d4afd9945bec *vp90-2-14-resize-10frames-fp-tiles-1-8.webm
028520578994c2d013d4c0129033d4f2ff31bbe0 *vp90-2-14-resize-10frames-fp-tiles-1-8.webm.md5
d1d5463c9ea7b5cc5f609ddedccddf656f348d1a *vp90-2-14-resize-10frames-fp-tiles-2-1.webm
92d5872f5bdffbed721703b7e959b4f885e3d77a *vp90-2-14-resize-10frames-fp-tiles-2-1.webm.md5
677cb29de1215d97346015af5807a9b1faad54cf *vp90-2-14-resize-10frames-fp-tiles-2-4.webm
a5db19f977094ec3fd60b4f7671b3e6740225e12 *vp90-2-14-resize-10frames-fp-tiles-2-4.webm.md5
cdd3c52ba21067efdbb2de917fe2a965bf27332e *vp90-2-14-resize-10frames-fp-tiles-2-8.webm
db17ec5d894ea8b8d0b7f32206d0dd3d46dcfa6d *vp90-2-14-resize-10frames-fp-tiles-2-8.webm.md5
0f6093c472125d05b764d7d1965c1d56771c0ea2 *vp90-2-14-resize-10frames-fp-tiles-4-1.webm
bc7c79e1bee07926dd970462ce6f64fc30eec3e1 *vp90-2-14-resize-10frames-fp-tiles-4-1.webm.md5
c5142e2bff4091338196c8ea8bc9266e64f548bc *vp90-2-14-resize-10frames-fp-tiles-4-2.webm
22aa3dd430b69fd3d92f6561bac86deeed90486d *vp90-2-14-resize-10frames-fp-tiles-4-2.webm.md5
ede8b1466d2f26e1b1bd9602addb9cd1017e1d8c *vp90-2-14-resize-10frames-fp-tiles-4-8.webm
508d5ebb9c0eac2a4100281a3ee052ec2fc19217 *vp90-2-14-resize-10frames-fp-tiles-4-8.webm.md5
2b292e3392854cd1d76ae597a6f53656cf741cfa *vp90-2-14-resize-10frames-fp-tiles-8-1.webm
1c24e54fa19e94e1722f24676404444e941c3d31 *vp90-2-14-resize-10frames-fp-tiles-8-1.webm.md5
61beda21064e09634564caa6697ab90bd53c9af7 *vp90-2-14-resize-10frames-fp-tiles-8-2.webm
9c0657b4d9e1d0e4c9d28a90e5a8630a65519124 *vp90-2-14-resize-10frames-fp-tiles-8-2.webm.md5
1758c50a11a7c92522749b4a251664705f1f0d4b *vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm
4f454a06750614314ae15a44087b79016fe2db97 *vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm.md5
3920c95ba94f1f048a731d9d9b416043b44aa4bd *vp90-2-14-resize-10frames-fp-tiles-8-4.webm
4eb347a0456d2c49a1e1d8de5aa1c51acc39887e *vp90-2-14-resize-10frames-fp-tiles-8-4.webm.md5
4b95a74c032a473b6683d7ad5754db1b0ec378e9 *vp90-2-21-resize_inter_1280x720_5_1-2.webm
a7826dd386bedfe69d02736969bfb47fb6a40a5e *vp90-2-21-resize_inter_1280x720_5_1-2.webm.md5
5cfff79e82c4d69964ccb8e75b4f0c53b9295167 *vp90-2-21-resize_inter_1280x720_5_3-4.webm
a18f57db4a25e1f543a99f2ceb182e00db0ee22f *vp90-2-21-resize_inter_1280x720_5_3-4.webm.md5
d26db0811bf30eb4131d928669713e2485f8e833 *vp90-2-21-resize_inter_1280x720_7_1-2.webm
fd6f9f332cd5bea4c0f0d57be4297bea493cc5a1 *vp90-2-21-resize_inter_1280x720_7_1-2.webm.md5
5c7d73d4d268e2ba9593b31cb091fd339505c7fd *vp90-2-21-resize_inter_1280x720_7_3-4.webm
7bbb949cabc1e70dadcc74582739f63b833034e0 *vp90-2-21-resize_inter_1280x720_7_3-4.webm.md5
f2d2a41a60eb894aff0c5854afca15931f1445a8 *vp90-2-21-resize_inter_1920x1080_5_1-2.webm
66d7789992613ac9d678ff905ff1059daa1b89e4 *vp90-2-21-resize_inter_1920x1080_5_1-2.webm.md5
764edb75fe7dd64e73a1b4f3b4b2b1bf237a4dea *vp90-2-21-resize_inter_1920x1080_5_3-4.webm
f78bea1075983fd990e7f25d4f31438f9b5efa34 *vp90-2-21-resize_inter_1920x1080_5_3-4.webm.md5
96496f2ade764a5de9f0c27917c7df1f120fb2ef *vp90-2-21-resize_inter_1920x1080_7_1-2.webm
2632b635135ed5ecd67fd22dec7990d29c4f4cb5 *vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5
74889ea42001bf41428cb742ca74e65129c886dc *vp90-2-21-resize_inter_1920x1080_7_3-4.webm
d2cf3b25956415bb579d368e7098097e482dd73a *vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
4658986a8ce36ebfcc80a1903e446eaab3985336 *vp90-2-21-resize_inter_320x180_5_1-2.webm
8a3d8cf325109ffa913cc9426c32eea8c202a09a *vp90-2-21-resize_inter_320x180_5_1-2.webm.md5
16303aa45176520ee42c2c425247aadc1506b881 *vp90-2-21-resize_inter_320x180_5_3-4.webm
41cab1ddf7715b680a4dbce42faa9bcd72af4e5c *vp90-2-21-resize_inter_320x180_5_3-4.webm.md5
56648adcee66dd0e5cb6ac947f5ee1b9cc8ba129 *vp90-2-21-resize_inter_320x180_7_1-2.webm
70047377787003cc03dda7b2394e6d7eaa666d9e *vp90-2-21-resize_inter_320x180_7_1-2.webm.md5
d2ff99165488499cc55f75929f1ce5ca9c9e359b *vp90-2-21-resize_inter_320x180_7_3-4.webm
e69019e378114a4643db283b66d1a7e304761a56 *vp90-2-21-resize_inter_320x180_7_3-4.webm.md5
4834d129bed0f4289d3a88f2ae3a1736f77621b0 *vp90-2-21-resize_inter_320x240_5_1-2.webm
a75653c53d22b623c1927fc0088da21dafef21f4 *vp90-2-21-resize_inter_320x240_5_1-2.webm.md5
19818e1b7fd1c1e63d8873c31b0babe29dd33ba6 *vp90-2-21-resize_inter_320x240_5_3-4.webm
8d89814ff469a186312111651b16601dfbce4336 *vp90-2-21-resize_inter_320x240_5_3-4.webm.md5
ac8057bae52498f324ce92a074d5f8207cc4a4a7 *vp90-2-21-resize_inter_320x240_7_1-2.webm
2643440898c83c08cc47bc744245af696b877c24 *vp90-2-21-resize_inter_320x240_7_1-2.webm.md5
cf4a4cd38ac8b18c42d8c25a3daafdb39132256b *vp90-2-21-resize_inter_320x240_7_3-4.webm
70ba8ec9120b26e9b0ffa2c79b432f16cbcb50ec *vp90-2-21-resize_inter_320x240_7_3-4.webm.md5
669f10409fe1c4a054010162ca47773ea1fdbead *vp90-2-21-resize_inter_640x360_5_1-2.webm
6355a04249004a35fb386dd1024214234f044383 *vp90-2-21-resize_inter_640x360_5_1-2.webm.md5
c23763b950b8247c1775d1f8158d93716197676c *vp90-2-21-resize_inter_640x360_5_3-4.webm
59e6fc381e3ec3b7bdaac586334e0bc944d18fb6 *vp90-2-21-resize_inter_640x360_5_3-4.webm.md5
71b45cbfdd068baa1f679a69e5e6f421d256a85f *vp90-2-21-resize_inter_640x360_7_1-2.webm
1416fc761b690c54a955c4cf017fa078520e8c18 *vp90-2-21-resize_inter_640x360_7_1-2.webm.md5
6c409903279448a697e4db63bab1061784bcd8d2 *vp90-2-21-resize_inter_640x360_7_3-4.webm
60de1299793433a630b71130cf76c9f5965758e2 *vp90-2-21-resize_inter_640x360_7_3-4.webm.md5
852b597b8af096d90c80bf0ed6ed3b336b851f19 *vp90-2-21-resize_inter_640x480_5_1-2.webm
f6856f19236ee46ed462bd0a2e7e72b9c3b9cea6 *vp90-2-21-resize_inter_640x480_5_1-2.webm.md5
792a16c6f60043bd8dceb515f0b95b8891647858 *vp90-2-21-resize_inter_640x480_5_3-4.webm
68ffe59877e9a7863805e1c0a3ce18ce037d7c9d *vp90-2-21-resize_inter_640x480_5_3-4.webm.md5
61e044c4759972a35ea3db8c1478a988910a4ef4 *vp90-2-21-resize_inter_640x480_7_1-2.webm
7739bfca167b1b43fea72f807f01e097b7cb98d8 *vp90-2-21-resize_inter_640x480_7_1-2.webm.md5
7291af354b4418917eee00e3a7e366086a0b7a10 *vp90-2-21-resize_inter_640x480_7_3-4.webm
4a18b09ccb36564193f0215f599d745d95bb558c *vp90-2-21-resize_inter_640x480_7_3-4.webm.md5

View File

@@ -36,6 +36,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += invalid_file_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
@@ -91,10 +92,9 @@ endif
## shared library builds don't make these functions accessible.
##
ifeq ($(CONFIG_SHARED),)
-LIBVPX_TEST_SRCS-$(CONFIG_VP9) += lpf_8_test.cc
## VP8
-ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
ifeq ($(CONFIG_VP8),yes)
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
@@ -104,13 +104,12 @@ endif
LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-yes += idct_test.cc
-LIBVPX_TEST_SRCS-yes += intrapred_test.cc
LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc
@@ -121,7 +120,7 @@ endif
endif # VP8
## VP9
-ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
ifeq ($(CONFIG_VP9),yes)
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
@@ -134,25 +133,24 @@ LIBVPX_TEST_SRCS-yes += vp9_boolcoder_test.cc
LIBVPX_TEST_SRCS-yes += vp9_encoder_parms_get_to_decoder.cc
endif
-LIBVPX_TEST_SRCS-$(CONFIG_VP9) += convolve_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
LIBVPX_TEST_SRCS-yes += convolve_test.cc
LIBVPX_TEST_SRCS-yes += lpf_8_test.cc
LIBVPX_TEST_SRCS-yes += vp9_intrapred_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
ifeq ($(CONFIG_VP9_ENCODER),yes)
LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
endif
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
@@ -162,10 +160,24 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc
endif # VP9
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
-TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
-TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
## VP10
ifeq ($(CONFIG_VP10),yes)
LIBVPX_TEST_SRCS-yes += vp10_inv_txfm_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
endif # VP10
## Multi-codec / unconditional whitebox tests.
ifeq ($(findstring yes,$(CONFIG_VP9_ENCODER)$(CONFIG_VP10_ENCODER)),yes)
LIBVPX_TEST_SRCS-yes += avg_test.cc
endif
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
endif # CONFIG_SHARED

View File

@@ -187,18 +187,19 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
-#if HAVE_SSE && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
-vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
-vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
-NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse)
-#endif // HAVE_SSE && CONFIG_USE_X86INC
#if HAVE_SSE2 && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
vpx_tm_predictor_4x4_sse2)
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
#if HAVE_SSSE3 && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
-vpx_h_predictor_4x4_ssse3, vpx_d45_predictor_4x4_ssse3, NULL, NULL,
-NULL, vpx_d153_predictor_4x4_ssse3,
-vpx_d207_predictor_4x4_ssse3, vpx_d63_predictor_4x4_ssse3, NULL)
vpx_d45_predictor_4x4_ssse3, NULL, NULL,
vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3,
vpx_d63_predictor_4x4_ssse3, NULL)
#endif // HAVE_SSSE3 && CONFIG_USE_X86INC
#if HAVE_DSPR2
@@ -235,23 +236,19 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)
-#if HAVE_SSE && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE, TestIntraPred8, vpx_dc_predictor_8x8_sse,
-vpx_dc_left_predictor_8x8_sse, vpx_dc_top_predictor_8x8_sse,
-vpx_dc_128_predictor_8x8_sse, vpx_v_predictor_8x8_sse, NULL,
-NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-#endif // HAVE_SSE && CONFIG_USE_X86INC
#if HAVE_SSE2 && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE2, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL,
NULL, vpx_tm_predictor_8x8_sse2)
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
#if HAVE_SSSE3 && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
-vpx_h_predictor_8x8_ssse3, vpx_d45_predictor_8x8_ssse3, NULL, NULL,
-NULL, vpx_d153_predictor_8x8_ssse3,
-vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
vpx_d45_predictor_8x8_ssse3, NULL, NULL,
vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
vpx_d63_predictor_8x8_ssse3, NULL)
#endif // HAVE_SSSE3 && CONFIG_USE_X86INC
#if HAVE_DSPR2
@@ -293,13 +290,13 @@ INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
vpx_dc_left_predictor_16x16_sse2,
vpx_dc_top_predictor_16x16_sse2,
vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
-NULL, NULL, NULL, NULL, NULL, NULL, NULL,
vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
vpx_tm_predictor_16x16_sse2)
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
#if HAVE_SSSE3 && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL,
-vpx_h_predictor_16x16_ssse3, vpx_d45_predictor_16x16_ssse3, NULL,
vpx_d45_predictor_16x16_ssse3,
NULL, NULL, vpx_d153_predictor_16x16_ssse3,
vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
NULL)
@@ -340,28 +337,19 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
#if HAVE_SSE2 && CONFIG_USE_X86INC
-#if ARCH_X86_64
INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
vpx_dc_left_predictor_32x32_sse2,
vpx_dc_top_predictor_32x32_sse2,
vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-vpx_tm_predictor_32x32_sse2)
-#else
-INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
-vpx_dc_left_predictor_32x32_sse2,
-vpx_dc_top_predictor_32x32_sse2,
-vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-#endif // ARCH_X86_64
vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
NULL, vpx_tm_predictor_32x32_sse2)
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
#if HAVE_SSSE3 && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
-vpx_h_predictor_32x32_ssse3, vpx_d45_predictor_32x32_ssse3, NULL,
-NULL, NULL, vpx_d153_predictor_32x32_ssse3,
-vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3,
-NULL)
vpx_d45_predictor_32x32_ssse3, NULL, NULL,
vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
vpx_d63_predictor_32x32_ssse3, NULL)
#endif // HAVE_SSSE3 && CONFIG_USE_X86INC
#if HAVE_NEON

View File

@@ -10,6 +10,7 @@
#include <cstdio>
#include <cstdlib>
#include <set>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "../tools_common.h"
@@ -44,6 +45,12 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
TestVectorTest()
: DecoderTest(GET_PARAM(0)),
md5_file_(NULL) {
#if CONFIG_VP9_DECODER
resize_clips_.insert(
::libvpx_test::kVP9TestVectorsResize,
::libvpx_test::kVP9TestVectorsResize +
::libvpx_test::kNumVP9TestVectorsResize);
#endif
}
virtual ~TestVectorTest() {
@@ -77,6 +84,10 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
<< "Md5 checksums don't match: frame number = " << frame_number; << "Md5 checksums don't match: frame number = " << frame_number;
} }
#if CONFIG_VP9_DECODER
std::set<std::string> resize_clips_;
#endif
private:
FILE *md5_file_;
};
@@ -97,6 +108,14 @@ TEST_P(TestVectorTest, MD5Match) {
if (mode == kFrameParallelMode) {
flags |= VPX_CODEC_USE_FRAME_THREADING;
#if CONFIG_VP9_DECODER
// TODO(hkuang): Fix frame parallel decode bug. See issue 1086.
if (resize_clips_.find(filename) != resize_clips_.end()) {
printf("Skipping the test file: %s, due to frame parallel decode bug.\n",
filename.c_str());
return;
}
#endif
}
cfg.threads = threads;

View File

@@ -52,6 +52,31 @@ const char *const kVP8TestVectors[] = {
const int kNumVP8TestVectors = NELEMENTS(kVP8TestVectors);
#endif // CONFIG_VP8_DECODER
#if CONFIG_VP9_DECODER
#define RESIZE_TEST_VECTORS "vp90-2-21-resize_inter_320x180_5_1-2.webm", \
"vp90-2-21-resize_inter_320x180_5_3-4.webm", \
"vp90-2-21-resize_inter_320x180_7_1-2.webm", \
"vp90-2-21-resize_inter_320x180_7_3-4.webm", \
"vp90-2-21-resize_inter_320x240_5_1-2.webm", \
"vp90-2-21-resize_inter_320x240_5_3-4.webm", \
"vp90-2-21-resize_inter_320x240_7_1-2.webm", \
"vp90-2-21-resize_inter_320x240_7_3-4.webm", \
"vp90-2-21-resize_inter_640x360_5_1-2.webm", \
"vp90-2-21-resize_inter_640x360_5_3-4.webm", \
"vp90-2-21-resize_inter_640x360_7_1-2.webm", \
"vp90-2-21-resize_inter_640x360_7_3-4.webm", \
"vp90-2-21-resize_inter_640x480_5_1-2.webm", \
"vp90-2-21-resize_inter_640x480_5_3-4.webm", \
"vp90-2-21-resize_inter_640x480_7_1-2.webm", \
"vp90-2-21-resize_inter_640x480_7_3-4.webm", \
"vp90-2-21-resize_inter_1280x720_5_1-2.webm", \
"vp90-2-21-resize_inter_1280x720_5_3-4.webm", \
"vp90-2-21-resize_inter_1280x720_7_1-2.webm", \
"vp90-2-21-resize_inter_1280x720_7_3-4.webm", \
"vp90-2-21-resize_inter_1920x1080_5_1-2.webm", \
"vp90-2-21-resize_inter_1920x1080_5_3-4.webm", \
"vp90-2-21-resize_inter_1920x1080_7_1-2.webm", \
"vp90-2-21-resize_inter_1920x1080_7_3-4.webm",
const char *const kVP9TestVectors[] = {
"vp90-2-00-quantizer-00.webm", "vp90-2-00-quantizer-01.webm",
"vp90-2-00-quantizer-02.webm", "vp90-2-00-quantizer-03.webm",
@@ -120,7 +145,10 @@ const char *const kVP9TestVectors[] = {
"vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm", "vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm",
"vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm", "vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm",
"vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm", "vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm",
"vp90-2-02-size-66x66.webm", "vp90-2-03-size-196x196.webm", "vp90-2-02-size-66x66.webm", "vp90-2-02-size-130x132.webm",
"vp90-2-02-size-132x130.webm", "vp90-2-02-size-132x132.webm",
"vp90-2-02-size-178x180.webm", "vp90-2-02-size-180x178.webm",
"vp90-2-02-size-180x180.webm", "vp90-2-03-size-196x196.webm",
"vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm", "vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm",
"vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm", "vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm",
"vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm", "vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm",
@@ -182,6 +210,20 @@ const char *const kVP9TestVectors[] = {
"vp90-2-14-resize-fp-tiles-4-2.webm", "vp90-2-14-resize-fp-tiles-4-8.webm", "vp90-2-14-resize-fp-tiles-4-2.webm", "vp90-2-14-resize-fp-tiles-4-8.webm",
"vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm", "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
"vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm", "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
"vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm",
"vp90-2-14-resize-10frames-fp-tiles-1-2.webm",
"vp90-2-14-resize-10frames-fp-tiles-1-4.webm",
"vp90-2-14-resize-10frames-fp-tiles-1-8.webm",
"vp90-2-14-resize-10frames-fp-tiles-2-1.webm",
"vp90-2-14-resize-10frames-fp-tiles-2-4.webm",
"vp90-2-14-resize-10frames-fp-tiles-2-8.webm",
"vp90-2-14-resize-10frames-fp-tiles-4-1.webm",
"vp90-2-14-resize-10frames-fp-tiles-4-2.webm",
"vp90-2-14-resize-10frames-fp-tiles-4-8.webm",
"vp90-2-14-resize-10frames-fp-tiles-8-1.webm",
"vp90-2-14-resize-10frames-fp-tiles-8-2.webm",
"vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm",
"vp90-2-14-resize-10frames-fp-tiles-8-4.webm",
"vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm", "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
"vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm", "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm",
"vp90-2-18-resize.ivf", "vp90-2-19-skip.webm", "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
@@ -193,10 +235,16 @@ const char *const kVP9TestVectors[] = {
"vp93-2-20-10bit-yuv422.webm", "vp93-2-20-12bit-yuv422.webm", "vp93-2-20-10bit-yuv422.webm", "vp93-2-20-12bit-yuv422.webm",
"vp93-2-20-10bit-yuv440.webm", "vp93-2-20-12bit-yuv440.webm", "vp93-2-20-10bit-yuv440.webm", "vp93-2-20-12bit-yuv440.webm",
"vp93-2-20-10bit-yuv444.webm", "vp93-2-20-12bit-yuv444.webm", "vp93-2-20-10bit-yuv444.webm", "vp93-2-20-12bit-yuv444.webm",
#endif // CONFIG_VP9_HIGHBITDEPTH` #endif // CONFIG_VP9_HIGHBITDEPTH
"vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm", "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm",
RESIZE_TEST_VECTORS
};
const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
const char *const kVP9TestVectorsResize[] = {
RESIZE_TEST_VECTORS
};
const int kNumVP9TestVectorsResize = NELEMENTS(kVP9TestVectorsResize);
#undef RESIZE_TEST_VECTORS
#endif // CONFIG_VP9_DECODER
} // namespace libvpx_test

View File

@@ -23,6 +23,8 @@ extern const char *const kVP8TestVectors[];
#if CONFIG_VP9_DECODER
extern const int kNumVP9TestVectors;
extern const char *const kVP9TestVectors[];
extern const int kNumVP9TestVectorsResize;
extern const char *const kVP9TestVectorsResize[];
#endif // CONFIG_VP9_DECODER
} // namespace libvpx_test

File diff suppressed because it is too large

View File

@@ -11,6 +11,9 @@
#define TEST_VIDEO_SOURCE_H_
#if defined(_WIN32)
#undef NOMINMAX
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include <cstdio>

test/vp10_dct_test.cc (new file, 111 lines)

@@ -0,0 +1,111 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <new>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "vpx_ports/msvc.h"
#undef CONFIG_COEFFICIENT_RANGE_CHECKING
#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
#include "vp10/encoder/dct.c"
using libvpx_test::ACMRandom;
namespace {
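// reference_dct_1d below computes a plain (unnormalized) DCT-II:
//   out[k] = sum_{n=0}^{size-1} in[n] * cos(PI * (2n + 1) * k / (2 * size)),
// with the k == 0 term additionally scaled by 1/sqrt(2); the tests below
// compare the integer fdct4/fdct8/fdct16 outputs against this
// double-precision model.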
void reference_dct_1d(const double *in, double *out, int size) {
const double PI = 3.141592653589793238462643383279502884;
const double kInvSqrt2 = 0.707106781186547524400844362104;
for (int k = 0; k < size; ++k) {
out[k] = 0;
for (int n = 0; n < size; ++n) {
out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size));
}
if (k == 0)
out[k] = out[k] * kInvSqrt2;
}
}
typedef void (*FdctFuncRef)(const double *in, double *out, int size);
typedef void (*IdctFuncRef)(const double *in, double *out, int size);
typedef void (*FdctFunc)(const tran_low_t *in, tran_low_t *out);
typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
class TransTestBase {
public:
virtual ~TransTestBase() {}
protected:
void RunFwdAccuracyCheck() {
tran_low_t *input = new tran_low_t[txfm_size_];
tran_low_t *output = new tran_low_t[txfm_size_];
double *ref_input = new double[txfm_size_];
double *ref_output = new double[txfm_size_];
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 5000;
for (int ti = 0; ti < count_test_block; ++ti) {
for (int ni = 0; ni < txfm_size_; ++ni) {
input[ni] = rnd.Rand8() - rnd.Rand8();
ref_input[ni] = static_cast<double>(input[ni]);
}
fwd_txfm_(input, output);
fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
for (int ni = 0; ni < txfm_size_; ++ni) {
EXPECT_LE(
abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
max_error_);
}
}
delete[] input;
delete[] output;
delete[] ref_input;
delete[] ref_output;
}
double max_error_;
int txfm_size_;
FdctFunc fwd_txfm_;
FdctFuncRef fwd_txfm_ref_;
};
typedef std::tr1::tuple<FdctFunc, FdctFuncRef, int, int> FdctParam;
class Vp10FwdTxfm
: public TransTestBase,
public ::testing::TestWithParam<FdctParam> {
public:
virtual void SetUp() {
fwd_txfm_ = GET_PARAM(0);
fwd_txfm_ref_ = GET_PARAM(1);
txfm_size_ = GET_PARAM(2);
max_error_ = GET_PARAM(3);
}
virtual void TearDown() {}
};
TEST_P(Vp10FwdTxfm, RunFwdAccuracyCheck) {
RunFwdAccuracyCheck();
}
INSTANTIATE_TEST_CASE_P(
C, Vp10FwdTxfm,
::testing::Values(
FdctParam(&fdct4, &reference_dct_1d, 4, 1),
FdctParam(&fdct8, &reference_dct_1d, 8, 1),
FdctParam(&fdct16, &reference_dct_1d, 16, 2)));
} // namespace

test/vp10_inv_txfm_test.cc (new file, 321 lines)

@@ -0,0 +1,321 @@
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vp10/common/blockd.h"
#include "vp10/common/scan.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/vp10_inv_txfm.h"
using libvpx_test::ACMRandom;
namespace {
const double PI = 3.141592653589793238462643383279502884;
const double kInvSqrt2 = 0.707106781186547524400844362104;
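// reference_idct_1d below inverts the DCT-II model used by the forward
// transform tests:
//   out[n] = sum_{k=0}^{size-1} c(k) * in[k] * cos(PI * (2n + 1) * k / (2 * size)),
// where c(0) = 1/sqrt(2) and c(k) = 1 otherwise; vp10_idct4/8/16/32_c are
// checked against it in double precision further down.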
void reference_idct_1d(const double *in, double *out, int size) {
for (int n = 0; n < size; ++n) {
out[n] = 0;
for (int k = 0; k < size; ++k) {
if (k == 0)
out[n] += kInvSqrt2 * in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
else
out[n] += in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
}
}
}
typedef void (*IdctFuncRef)(const double *in, double *out, int size);
typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
class TransTestBase {
public:
virtual ~TransTestBase() {}
protected:
void RunInvAccuracyCheck() {
tran_low_t *input = new tran_low_t[txfm_size_];
tran_low_t *output = new tran_low_t[txfm_size_];
double *ref_input = new double[txfm_size_];
double *ref_output = new double[txfm_size_];
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 5000;
for (int ti = 0; ti < count_test_block; ++ti) {
for (int ni = 0; ni < txfm_size_; ++ni) {
input[ni] = rnd.Rand8() - rnd.Rand8();
ref_input[ni] = static_cast<double>(input[ni]);
}
inv_txfm_(input, output);
inv_txfm_ref_(ref_input, ref_output, txfm_size_);
for (int ni = 0; ni < txfm_size_; ++ni) {
EXPECT_LE(
abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
max_error_);
}
}
delete[] input;
delete[] output;
delete[] ref_input;
delete[] ref_output;
}
double max_error_;
int txfm_size_;
IdctFunc inv_txfm_;
IdctFuncRef inv_txfm_ref_;
};
typedef std::tr1::tuple<IdctFunc, IdctFuncRef, int, int> IdctParam;
class Vp10InvTxfm
: public TransTestBase,
public ::testing::TestWithParam<IdctParam> {
public:
virtual void SetUp() {
inv_txfm_ = GET_PARAM(0);
inv_txfm_ref_ = GET_PARAM(1);
txfm_size_ = GET_PARAM(2);
max_error_ = GET_PARAM(3);
}
virtual void TearDown() {}
};
TEST_P(Vp10InvTxfm, RunInvAccuracyCheck) {
RunInvAccuracyCheck();
}
INSTANTIATE_TEST_CASE_P(
C, Vp10InvTxfm,
::testing::Values(
IdctParam(&vp10_idct4_c, &reference_idct_1d, 4, 1),
IdctParam(&vp10_idct8_c, &reference_idct_1d, 8, 2),
IdctParam(&vp10_idct16_c, &reference_idct_1d, 16, 4),
IdctParam(&vp10_idct32_c, &reference_idct_1d, 32, 6))
);
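// Each IdctParam above is (function under test, double-precision reference,
// transform size, maximum absolute per-coefficient error). The larger
// transforms are given looser bounds, presumably because rounding error in
// the fixed-point butterflies accumulates over more stages.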
typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
typedef std::tr1::tuple<FwdTxfmFunc,
InvTxfmFunc,
InvTxfmFunc,
TX_SIZE, int> PartialInvTxfmParam;
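// A PartialInvTxfmParam bundles (forward transform, full inverse transform,
// partial inverse transform specialized for few nonzero coefficients,
// transform size, number of nonzero coefficients in scan order). The tests
// below verify that the partial and full inverse transforms reconstruct
// identical pixels whenever only that many leading coefficients are nonzero.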
const int kMaxNumCoeffs = 1024;
class Vp10PartialIDctTest
: public ::testing::TestWithParam<PartialInvTxfmParam> {
public:
virtual ~Vp10PartialIDctTest() {}
virtual void SetUp() {
ftxfm_ = GET_PARAM(0);
full_itxfm_ = GET_PARAM(1);
partial_itxfm_ = GET_PARAM(2);
tx_size_ = GET_PARAM(3);
last_nonzero_ = GET_PARAM(4);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int last_nonzero_;
TX_SIZE tx_size_;
FwdTxfmFunc ftxfm_;
InvTxfmFunc full_itxfm_;
InvTxfmFunc partial_itxfm_;
};
TEST_P(Vp10PartialIDctTest, RunQuantCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int size;
switch (tx_size_) {
case TX_4X4:
size = 4;
break;
case TX_8X8:
size = 8;
break;
case TX_16X16:
size = 16;
break;
case TX_32X32:
size = 32;
break;
default:
FAIL() << "Wrong Size!";
break;
}
DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
const int count_test_block = 1000;
const int block_size = size * size;
DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
// clear out destination buffer
memset(dst1, 0, sizeof(*dst1) * block_size);
memset(dst2, 0, sizeof(*dst2) * block_size);
memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
if (i == 0) {
for (int j = 0; j < block_size; ++j)
input_extreme_block[j] = 255;
} else if (i == 1) {
for (int j = 0; j < block_size; ++j)
input_extreme_block[j] = -255;
} else {
for (int j = 0; j < block_size; ++j) {
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
}
ftxfm_(input_extreme_block, output_ref_block, size);
// quantization with maximum allowed step sizes
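// (1336 and 1828 appear to be the largest DC and AC dequantizer step sizes
// in the 8-bit quantizer tables; dividing and re-multiplying by them
// emulates the coarsest quantization the encoder could apply.)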
test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
for (int j = 1; j < last_nonzero_; ++j)
test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
= (output_ref_block[j] / 1828) * 1828;
}
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
for (int j = 0; j < block_size; ++j) {
const int diff = dst1[j] - dst2[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
}
}
EXPECT_EQ(0, max_error)
<< "Error: partial inverse transform produces different results";
}
TEST_P(Vp10PartialIDctTest, ResultsMatch) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int size;
switch (tx_size_) {
case TX_4X4:
size = 4;
break;
case TX_8X8:
size = 8;
break;
case TX_16X16:
size = 16;
break;
case TX_32X32:
size = 32;
break;
default:
FAIL() << "Wrong Size!";
break;
}
DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
const int count_test_block = 1000;
const int max_coeff = 32766 / 4;
const int block_size = size * size;
int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
// clear out destination buffer
memset(dst1, 0, sizeof(*dst1) * block_size);
memset(dst2, 0, sizeof(*dst2) * block_size);
memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
int max_energy_leftover = max_coeff * max_coeff;
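// Draw coefficients under a shared energy budget: each coefficient may use
// at most the energy left over from the ones before it, so the block's total
// squared magnitude never exceeds max_coeff^2, which is intended to keep the
// reconstruction within range for both inverse transforms.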
for (int j = 0; j < last_nonzero_; ++j) {
int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
(rnd.Rand16() - 32768) / 65536);
max_energy_leftover -= coef * coef;
if (max_energy_leftover < 0) {
max_energy_leftover = 0;
coef = 0;
}
test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
}
memcpy(test_coef_block2, test_coef_block1,
sizeof(*test_coef_block2) * block_size);
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
for (int j = 0; j < block_size; ++j) {
const int diff = dst1[j] - dst2[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
}
}
EXPECT_EQ(0, max_error)
<< "Error: partial inverse transform produces different results";
}
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
C, Vp10PartialIDctTest,
::testing::Values(
make_tuple(&vpx_fdct32x32_c,
&vp10_idct32x32_1024_add_c,
&vp10_idct32x32_34_add_c,
TX_32X32, 34),
make_tuple(&vpx_fdct32x32_c,
&vp10_idct32x32_1024_add_c,
&vp10_idct32x32_1_add_c,
TX_32X32, 1),
make_tuple(&vpx_fdct16x16_c,
&vp10_idct16x16_256_add_c,
&vp10_idct16x16_10_add_c,
TX_16X16, 10),
make_tuple(&vpx_fdct16x16_c,
&vp10_idct16x16_256_add_c,
&vp10_idct16x16_1_add_c,
TX_16X16, 1),
make_tuple(&vpx_fdct8x8_c,
&vp10_idct8x8_64_add_c,
&vp10_idct8x8_12_add_c,
TX_8X8, 12),
make_tuple(&vpx_fdct8x8_c,
&vp10_idct8x8_64_add_c,
&vp10_idct8x8_1_add_c,
TX_8X8, 1),
make_tuple(&vpx_fdct4x4_c,
&vp10_idct4x4_16_add_c,
&vp10_idct4x4_1_add_c,
TX_4X4, 1)));
} // namespace


@@ -230,9 +230,23 @@ VP9_INSTANTIATE_TEST_CASE(
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
+#if CONFIG_VP9_HIGHBITDEPTH
+# if CONFIG_VP10_ENCODER
+// TODO(angiebird): 25-29 fail in high bitdepth mode.
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_VP10, ArfFreqTest,
+    ::testing::Combine(
+        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
+            &libvpx_test::kVP10)),
+        ::testing::ValuesIn(kTestVectors),
+        ::testing::ValuesIn(kEncodeVectors),
+        ::testing::ValuesIn(kMinArfVectors)));
+# endif  // CONFIG_VP10_ENCODER
+#else
 VP10_INSTANTIATE_TEST_CASE(
     ArfFreqTest,
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace


@@ -14,38 +14,10 @@
#include "test/encode_test_driver.h" #include "test/encode_test_driver.h"
#include "test/util.h" #include "test/util.h"
#include "test/y4m_video_source.h" #include "test/y4m_video_source.h"
#include "test/yuv_video_source.h" #include "vp9/vp9_dx_iface.h"
#include "vp9/decoder/vp9_decoder.h"
typedef vpx_codec_stream_info_t vp9_stream_info_t;
struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_dec_cfg_t cfg;
vp9_stream_info_t si;
struct VP9Decoder *pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
int img_avail;
int flushed;
int invert_tile_order;
int frame_parallel_decode;
// External frame buffer info to save for VP9 common.
void *ext_priv; // Private data associated with the external frame buffers.
vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
};
static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
return (vpx_codec_alg_priv_t *)ctx->priv;
}
namespace { namespace {
const unsigned int kFramerate = 50;
const int kCpuUsed = 2; const int kCpuUsed = 2;
struct EncodePerfTestVideo { struct EncodePerfTestVideo {
@@ -66,35 +38,27 @@ struct EncodeParameters {
int32_t lossless; int32_t lossless;
int32_t error_resilient; int32_t error_resilient;
int32_t frame_parallel; int32_t frame_parallel;
vpx_color_range_t color_range;
vpx_color_space_t cs; vpx_color_space_t cs;
int render_size[2];
// TODO(JBB): quantizers / bitrate // TODO(JBB): quantizers / bitrate
}; };
const EncodeParameters kVP9EncodeParameterSet[] = { const EncodeParameters kVP9EncodeParameterSet[] = {
{0, 0, 0, 1, 0, VPX_CS_BT_601}, {0, 0, 0, 1, 0, VPX_CR_STUDIO_RANGE, VPX_CS_BT_601},
{0, 0, 0, 0, 0, VPX_CS_BT_709}, {0, 0, 0, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_709},
{0, 0, 1, 0, 0, VPX_CS_BT_2020}, {0, 0, 1, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_2020},
{0, 2, 0, 0, 1, VPX_CS_UNKNOWN}, {0, 2, 0, 0, 1, VPX_CR_STUDIO_RANGE, VPX_CS_UNKNOWN, { 640, 480 }},
// TODO(JBB): Test profiles (requires more work). // TODO(JBB): Test profiles (requires more work).
}; };
int is_extension_y4m(const char *filename) {
const char *dot = strrchr(filename, '.');
if (!dot || dot == filename)
return 0;
else
return !strcmp(dot, ".y4m");
}
class VpxEncoderParmsGetToDecoder class VpxEncoderParmsGetToDecoder
: public ::libvpx_test::EncoderTest, : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWith2Params<EncodeParameters, \ public ::libvpx_test::CodecTestWith2Params<EncodeParameters,
EncodePerfTestVideo> { EncodePerfTestVideo> {
protected: protected:
VpxEncoderParmsGetToDecoder() VpxEncoderParmsGetToDecoder()
: EncoderTest(GET_PARAM(0)), : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
encode_parms(GET_PARAM(1)) {
}
virtual ~VpxEncoderParmsGetToDecoder() {} virtual ~VpxEncoderParmsGetToDecoder() {}
@@ -112,6 +76,7 @@ class VpxEncoderParmsGetToDecoder
::libvpx_test::Encoder *encoder) { ::libvpx_test::Encoder *encoder) {
if (video->frame() == 1) { if (video->frame() == 1) {
encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range);
encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless); encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
encode_parms.frame_parallel); encode_parms.frame_parallel);
@@ -122,37 +87,44 @@ class VpxEncoderParmsGetToDecoder
encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
encoder->Control(VP8E_SET_ARNR_TYPE, 3); encoder->Control(VP8E_SET_ARNR_TYPE, 3);
if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0)
encoder->Control(VP9E_SET_RENDER_SIZE, encode_parms.render_size);
} }
} }
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
const libvpx_test::VideoSource& video, const libvpx_test::VideoSource &video,
libvpx_test::Decoder *decoder) { libvpx_test::Decoder *decoder) {
vpx_codec_ctx_t* vp9_decoder = decoder->GetDecoder(); vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
vpx_codec_alg_priv_t* priv = vpx_codec_alg_priv_t *const priv =
(vpx_codec_alg_priv_t*) get_alg_priv(vp9_decoder); reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
FrameWorkerData *const worker_data =
VP9Decoder* pbi = priv->pbi; reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
VP9_COMMON* common = &pbi->common; VP9_COMMON *const common = &worker_data->pbi->common;
if (encode_parms.lossless) { if (encode_parms.lossless) {
EXPECT_EQ(common->base_qindex, 0); EXPECT_EQ(0, common->base_qindex);
EXPECT_EQ(common->y_dc_delta_q, 0); EXPECT_EQ(0, common->y_dc_delta_q);
EXPECT_EQ(common->uv_dc_delta_q, 0); EXPECT_EQ(0, common->uv_dc_delta_q);
EXPECT_EQ(common->uv_ac_delta_q, 0); EXPECT_EQ(0, common->uv_ac_delta_q);
EXPECT_EQ(common->tx_mode, ONLY_4X4); EXPECT_EQ(ONLY_4X4, common->tx_mode);
} }
EXPECT_EQ(common->error_resilient_mode, encode_parms.error_resilient); EXPECT_EQ(encode_parms.error_resilient, common->error_resilient_mode);
if (encode_parms.error_resilient) { if (encode_parms.error_resilient) {
EXPECT_EQ(common->frame_parallel_decoding_mode, 1); EXPECT_EQ(1, common->frame_parallel_decoding_mode);
EXPECT_EQ(common->use_prev_frame_mvs, 0); EXPECT_EQ(0, common->use_prev_frame_mvs);
} else { } else {
EXPECT_EQ(common->frame_parallel_decoding_mode, EXPECT_EQ(encode_parms.frame_parallel,
encode_parms.frame_parallel); common->frame_parallel_decoding_mode);
} }
EXPECT_EQ(common->color_space, encode_parms.cs); EXPECT_EQ(encode_parms.color_range, common->color_range);
EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols); EXPECT_EQ(encode_parms.cs, common->color_space);
EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows); if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
EXPECT_EQ(encode_parms.render_size[0], common->render_width);
EXPECT_EQ(encode_parms.render_size[1], common->render_height);
}
EXPECT_EQ(encode_parms.tile_cols, common->log2_tile_cols);
EXPECT_EQ(encode_parms.tile_rows, common->log2_tile_rows);
EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
return VPX_CODEC_OK == res_dec; return VPX_CODEC_OK == res_dec;
@@ -164,35 +136,18 @@ class VpxEncoderParmsGetToDecoder
EncodeParameters encode_parms; EncodeParameters encode_parms;
}; };
// TODO(hkuang): This test conflicts with frame parallel decode. So disable it TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) {
// for now until fix.
TEST_P(VpxEncoderParmsGetToDecoder, DISABLED_BitstreamParms) {
init_flags_ = VPX_CODEC_USE_PSNR; init_flags_ = VPX_CODEC_USE_PSNR;
libvpx_test::VideoSource *video; libvpx_test::VideoSource *const video =
if (is_extension_y4m(test_video_.name)) { new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames);
video = new libvpx_test::Y4mVideoSource(test_video_.name, ASSERT_TRUE(video != NULL);
0, test_video_.frames);
} else {
video = new libvpx_test::YUVVideoSource(test_video_.name,
VPX_IMG_FMT_I420,
test_video_.width,
test_video_.height,
kFramerate, 1, 0,
test_video_.frames);
}
ASSERT_NO_FATAL_FAILURE(RunLoop(video)); ASSERT_NO_FATAL_FAILURE(RunLoop(video));
delete(video); delete video;
} }
VP9_INSTANTIATE_TEST_CASE( VP9_INSTANTIATE_TEST_CASE(VpxEncoderParmsGetToDecoder,
VpxEncoderParmsGetToDecoder, ::testing::ValuesIn(kVP9EncodeParameterSet),
::testing::ValuesIn(kVP9EncodeParameterSet), ::testing::ValuesIn(kVP9EncodePerfTestVectors));
::testing::ValuesIn(kVP9EncodePerfTestVectors));
VP10_INSTANTIATE_TEST_CASE(
VpxEncoderParmsGetToDecoder,
::testing::ValuesIn(kVP9EncodeParameterSet),
::testing::ValuesIn(kVP9EncodePerfTestVectors));
} // namespace } // namespace


@@ -187,9 +187,23 @@ VP9_INSTANTIATE_TEST_CASE(
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kCpuUsedVectors));
+#if CONFIG_VP9_HIGHBITDEPTH
+# if CONFIG_VP10_ENCODER
+// TODO(angiebird): many fail in high bitdepth mode.
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_VP10, EndToEndTestLarge,
+    ::testing::Combine(
+        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
+            &libvpx_test::kVP10)),
+        ::testing::ValuesIn(kEncodingModeVectors),
+        ::testing::ValuesIn(kTestVectors),
+        ::testing::ValuesIn(kCpuUsedVectors)));
+# endif  // CONFIG_VP10_ENCODER
+#else
 VP10_INSTANTIATE_TEST_CASE(
     EndToEndTestLarge,
     ::testing::ValuesIn(kEncodingModeVectors),
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kCpuUsedVectors));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace


@@ -67,12 +67,22 @@ TEST_P(ErrorBlockTest, OperationCheck) {
int64_t ret; int64_t ret;
int64_t ref_ssz; int64_t ref_ssz;
int64_t ref_ret; int64_t ref_ret;
const int msb = bit_depth_ + 8 - 1;
for (int i = 0; i < kNumIterations; ++i) { for (int i = 0; i < kNumIterations; ++i) {
int err_count = 0; int err_count = 0;
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) { for (int j = 0; j < block_size; j++) {
coeff[j] = rnd(2 << 20) - (1 << 20); // coeff and dqcoeff will always have at least the same sign, and this
dqcoeff[j] = rnd(2 << 20) - (1 << 20); // can be used for optimization, so generate test input precisely.
if (rnd(2)) {
// Positive number
coeff[j] = rnd(1 << msb);
dqcoeff[j] = rnd(1 << msb);
} else {
// Negative number
coeff[j] = -rnd(1 << msb);
dqcoeff[j] = -rnd(1 << msb);
}
} }
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_); bit_depth_);
@@ -85,7 +95,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
err_count_total += err_count; err_count_total += err_count;
} }
EXPECT_EQ(0, err_count_total) EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. " << "Error: Error Block Test, C output doesn't match optimized output. "
<< "First failed at test case " << first_failure; << "First failed at test case " << first_failure;
} }
@@ -100,23 +110,36 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
int64_t ret; int64_t ret;
int64_t ref_ssz; int64_t ref_ssz;
int64_t ref_ret; int64_t ref_ret;
int max_val = ((1 << 20) - 1); const int msb = bit_depth_ + 8 - 1;
int max_val = ((1 << msb) - 1);
for (int i = 0; i < kNumIterations; ++i) { for (int i = 0; i < kNumIterations; ++i) {
int err_count = 0; int err_count = 0;
int k = (i / 9) % 5; int k = (i / 9) % 9;
// Change the maximum coeff value, to test different bit boundaries // Change the maximum coeff value, to test different bit boundaries
if ( k == 4 && (i % 9) == 0 ) { if ( k == 8 && (i % 9) == 0 ) {
max_val >>= 1; max_val >>= 1;
} }
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) { for (int j = 0; j < block_size; j++) {
if (k < 4) { // Test at maximum values if (k < 4) {
coeff[j] = k % 2 ? max_val : -max_val; // Test at positive maximum values
dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val; coeff[j] = k % 2 ? max_val : 0;
dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
} else if (k < 8) {
// Test at negative maximum values
coeff[j] = k % 2 ? -max_val : 0;
dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
} else { } else {
coeff[j] = rnd(2 << 14) - (1 << 14); if (rnd(2)) {
dqcoeff[j] = rnd(2 << 14) - (1 << 14); // Positive number
coeff[j] = rnd(1 << 14);
dqcoeff[j] = rnd(1 << 14);
} else {
// Negative number
coeff[j] = -rnd(1 << 14);
dqcoeff[j] = -rnd(1 << 14);
}
} }
} }
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
@@ -130,13 +153,30 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
err_count_total += err_count; err_count_total += err_count;
} }
EXPECT_EQ(0, err_count_total) EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. " << "Error: Error Block Test, C output doesn't match optimized output. "
<< "First failed at test case " << first_failure; << "First failed at test case " << first_failure;
} }
using std::tr1::make_tuple; using std::tr1::make_tuple;
#if CONFIG_USE_X86INC
int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bps) {
assert(bps == 8);
return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
}
#if HAVE_SSE2 #if HAVE_SSE2
int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bps) {
assert(bps == 8);
return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
}
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
SSE2, ErrorBlockTest, SSE2, ErrorBlockTest,
::testing::Values( ::testing::Values(
@@ -145,7 +185,27 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_highbd_block_error_sse2, make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_12), &vp9_highbd_block_error_c, VPX_BITS_12),
make_tuple(&vp9_highbd_block_error_sse2, make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_8))); &vp9_highbd_block_error_c, VPX_BITS_8),
make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
&wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
#endif // HAVE_SSE2 #endif // HAVE_SSE2
#if HAVE_AVX
int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bps) {
assert(bps == 8);
return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
}
INSTANTIATE_TEST_CASE_P(
AVX, ErrorBlockTest,
::testing::Values(
make_tuple(&wrap_vp9_highbd_block_error_8bit_avx,
&wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
#endif // HAVE_AVX
#endif // CONFIG_USE_X86INC
#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace } // namespace


@@ -132,7 +132,6 @@ using std::tr1::make_tuple;
#if HAVE_SSE2 #if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_USE_X86INC #if CONFIG_USE_X86INC
#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
::testing::Values( ::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -141,13 +140,13 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
&vpx_highbd_tm_predictor_16x16_c, 16, 8), &vpx_highbd_tm_predictor_16x16_c, 16, 8),
make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32, 8), &vpx_highbd_tm_predictor_32x32_c, 32, 8),
make_tuple(&vpx_highbd_dc_predictor_4x4_sse, make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 8), &vpx_highbd_dc_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 8), &vpx_highbd_dc_predictor_8x8_c, 8, 8),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16, 8), &vpx_highbd_dc_predictor_16x16_c, 16, 8),
make_tuple(&vpx_highbd_v_predictor_4x4_sse, make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 8), &vpx_highbd_v_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2, make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 8), &vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -155,34 +154,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
&vpx_highbd_v_predictor_16x16_c, 16, 8), &vpx_highbd_v_predictor_16x16_c, 16, 8),
make_tuple(&vpx_highbd_v_predictor_32x32_sse2, make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
&vpx_highbd_v_predictor_32x32_c, 32, 8), &vpx_highbd_v_predictor_32x32_c, 32, 8),
make_tuple(&vpx_highbd_tm_predictor_4x4_sse, make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
&vpx_highbd_tm_predictor_4x4_c, 4, 8), &vpx_highbd_tm_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 8))); &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
#else
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
&vpx_highbd_dc_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 8),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16, 8),
make_tuple(&vpx_highbd_v_predictor_4x4_sse,
&vpx_highbd_v_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 8),
make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
&vpx_highbd_v_predictor_16x16_c, 16, 8),
make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
&vpx_highbd_v_predictor_32x32_c, 32, 8),
make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
&vpx_highbd_tm_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 8)));
#endif // !ARCH_X86_64
#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
::testing::Values( ::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -194,14 +170,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32, &vpx_highbd_tm_predictor_32x32_c, 32,
10), 10),
make_tuple(&vpx_highbd_dc_predictor_4x4_sse, make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 10), &vpx_highbd_dc_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 10), &vpx_highbd_dc_predictor_8x8_c, 8, 10),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16, &vpx_highbd_dc_predictor_16x16_c, 16,
10), 10),
make_tuple(&vpx_highbd_v_predictor_4x4_sse, make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 10), &vpx_highbd_v_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2, make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 10), &vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -211,35 +187,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
make_tuple(&vpx_highbd_v_predictor_32x32_sse2, make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
&vpx_highbd_v_predictor_32x32_c, 32, &vpx_highbd_v_predictor_32x32_c, 32,
10), 10),
make_tuple(&vpx_highbd_tm_predictor_4x4_sse, make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
&vpx_highbd_tm_predictor_4x4_c, 4, 10), &vpx_highbd_tm_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 10))); &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
#else
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
&vpx_highbd_dc_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 10),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16,
10),
make_tuple(&vpx_highbd_v_predictor_4x4_sse,
&vpx_highbd_v_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 10),
make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
&vpx_highbd_v_predictor_16x16_c, 16, 10),
make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
&vpx_highbd_v_predictor_32x32_c, 32, 10),
make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
&vpx_highbd_tm_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 10)));
#endif // !ARCH_X86_64
#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
::testing::Values( ::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -251,14 +203,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
&vpx_highbd_tm_predictor_32x32_c, 32, &vpx_highbd_tm_predictor_32x32_c, 32,
12), 12),
make_tuple(&vpx_highbd_dc_predictor_4x4_sse, make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
&vpx_highbd_dc_predictor_4x4_c, 4, 12), &vpx_highbd_dc_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 12), &vpx_highbd_dc_predictor_8x8_c, 8, 12),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16, &vpx_highbd_dc_predictor_16x16_c, 16,
12), 12),
make_tuple(&vpx_highbd_v_predictor_4x4_sse, make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
&vpx_highbd_v_predictor_4x4_c, 4, 12), &vpx_highbd_v_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2, make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 12), &vpx_highbd_v_predictor_8x8_c, 8, 12),
@@ -268,33 +220,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
make_tuple(&vpx_highbd_v_predictor_32x32_sse2, make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
&vpx_highbd_v_predictor_32x32_c, 32, &vpx_highbd_v_predictor_32x32_c, 32,
12), 12),
make_tuple(&vpx_highbd_tm_predictor_4x4_sse, make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
&vpx_highbd_tm_predictor_4x4_c, 4, 12), &vpx_highbd_tm_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 12))); &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
#else
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
&vpx_highbd_dc_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
&vpx_highbd_dc_predictor_8x8_c, 8, 12),
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
&vpx_highbd_dc_predictor_16x16_c, 16,
12),
make_tuple(&vpx_highbd_v_predictor_4x4_sse,
&vpx_highbd_v_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
&vpx_highbd_v_predictor_8x8_c, 8, 12),
make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
&vpx_highbd_v_predictor_16x16_c, 16, 12),
make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
&vpx_highbd_v_predictor_32x32_c, 32, 12),
make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
&vpx_highbd_tm_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 12)));
#endif // !ARCH_X86_64
#endif // CONFIG_USE_X86INC #endif // CONFIG_USE_X86INC
#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2 #endif // HAVE_SSE2


@@ -54,7 +54,7 @@ vp9_spatial_svc() {
if [ "$(vp9_encode_available)" = "yes" ]; then if [ "$(vp9_encode_available)" = "yes" ]; then
local readonly test_name="vp9_spatial_svc" local readonly test_name="vp9_spatial_svc"
for layers in $(seq 1 ${vp9_ssvc_test_layers}); do for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
vp9_spatial_svc_encoder "${test_name}" -l ${layers} vp9_spatial_svc_encoder "${test_name}" -sl ${layers}
done done
fi fi
} }


@@ -190,7 +190,7 @@ string DecodeFile(const string& filename, int num_threads) {
 void DecodeFiles(const FileList files[]) {
   for (const FileList *iter = files; iter->name != NULL; ++iter) {
     SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
+    for (int t = 1; t <= 8; ++t) {
       EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
           << "threads = " << t;
     }
@@ -235,13 +235,13 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) {
   EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
 }
-TEST(VP9DecodeMultiThreadedTest, Decode) {
+TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) {
   // no tiles or frame parallel; this exercises loop filter threading.
   EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
             DecodeFile("vp90-2-03-size-226x226.webm", 2));
 }
-TEST(VP9DecodeMultiThreadedTest, Decode2) {
+TEST(VP9DecodeMultiThreadedTest, FrameParallel) {
   static const FileList files[] = {
     { "vp90-2-08-tile_1x2_frame_parallel.webm",
       "68ede6abd66bae0a2edf2eb9232241b6" },
@@ -255,8 +255,7 @@ TEST(VP9DecodeMultiThreadedTest, Decode2) {
   DecodeFiles(files);
 }
-// Test tile quantity changes within one file.
-TEST(VP9DecodeMultiThreadedTest, Decode3) {
+TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) {
   static const FileList files[] = {
     { "vp90-2-14-resize-fp-tiles-1-16.webm",
       "0cd5e632c326297e975f38949c31ea94" },
@@ -307,6 +306,19 @@ TEST(VP9DecodeMultiThreadedTest, Decode3) {
   DecodeFiles(files);
 }
+TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) {
+  static const FileList files[] = {
+    { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
+    { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
+    { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
+    { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
+    { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
+    { NULL, NULL }
+  };
+  DecodeFiles(files);
+}
 #endif  // CONFIG_WEBM_IO
 INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());


@@ -9,6 +9,7 @@
  */
 #ifndef TEST_Y4M_VIDEO_SOURCE_H_
 #define TEST_Y4M_VIDEO_SOURCE_H_
+#include <algorithm>
 #include <string>
 #include "test/video_source.h"
@@ -91,6 +92,18 @@ class Y4mVideoSource : public VideoSource {
     y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
   }
+  // Swap buffers with another y4m source. This allows reading a new frame
+  // while keeping the old frame around. A whole Y4mSource is required and
+  // not just a vpx_image_t because of how the y4m reader manipulates
+  // vpx_image_t internals.
+  void SwapBuffers(Y4mVideoSource *other) {
+    std::swap(other->y4m_.dst_buf, y4m_.dst_buf);
+    vpx_image_t *tmp;
+    tmp = other->img_.release();
+    other->img_.reset(img_.release());
+    img_.reset(tmp);
+  }
  protected:
   void CloseSource() {
     y4m_input_close(&y4m_);


@@ -1,7 +1,10 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 2dec09426ab62b794464cc9971bd135b4d313e65
+Version: 476366249e1fda7710a389cd41c57db42305e0d4
 License: BSD
 License File: LICENSE.txt
 Description:
 libwebm is used to handle WebM container I/O.
+Local Changes:
+* <none>


@@ -528,7 +528,7 @@ class Tracks {
  public:
   // Audio and video type defined by the Matroska specs.
   enum { kVideo = 0x1, kAudio = 0x2 };
+  // Opus, Vorbis, VP8, and VP9 codec ids defined by the Matroska specs.
   static const char kOpusCodecId[];
   static const char kVorbisCodecId[];
   static const char kVp8CodecId[];

File diff suppressed because it is too large.

@@ -9,12 +9,13 @@
 #ifndef MKVPARSER_HPP
 #define MKVPARSER_HPP
-#include <cstdlib>
-#include <cstdio>
 #include <cstddef>
+#include <cstdio>
+#include <cstdlib>
 namespace mkvparser {
+const int E_PARSE_FAILED = -1;
 const int E_FILE_FORMAT_INVALID = -2;
 const int E_BUFFER_NOT_FULL = -3;
@@ -27,8 +28,11 @@ class IMkvReader {
   virtual ~IMkvReader();
 };
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+                                             unsigned long long element_size);
 long long GetUIntLength(IMkvReader*, long long, long&);
 long long ReadUInt(IMkvReader*, long long, long&);
+long long ReadID(IMkvReader* pReader, long long pos, long& len);
 long long UnserializeUInt(IMkvReader*, long long pos, long long size);
 long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
@@ -833,7 +837,7 @@ class Cues {
  private:
   bool Init() const;
-  void PreloadCuePoint(long&, long long) const;
+  bool PreloadCuePoint(long&, long long) const;
   mutable CuePoint** m_cue_points;
   mutable long m_count;
@@ -999,8 +1003,8 @@ class Segment {
   long DoLoadClusterUnknownSize(long long&, long&);
   long DoParseNext(const Cluster*&, long long&, long&);
-  void AppendCluster(Cluster*);
-  void PreloadCluster(Cluster*, ptrdiff_t);
+  bool AppendCluster(Cluster*);
+  bool PreloadCluster(Cluster*, ptrdiff_t);
   // void ParseSeekHead(long long pos, long long size);
   // void ParseSeekEntry(long long pos, long long size);


@@ -41,6 +41,7 @@ enum MkvId {
   kMkvTimecodeScale = 0x2AD7B1,
   kMkvDuration = 0x4489,
   kMkvDateUTC = 0x4461,
+  kMkvTitle = 0x7BA9,
   kMkvMuxingApp = 0x4D80,
   kMkvWritingApp = 0x5741,
   // Cluster
@@ -107,9 +108,16 @@ enum MkvId {
   kMkvContentEncodingOrder = 0x5031,
   kMkvContentEncodingScope = 0x5032,
   kMkvContentEncodingType = 0x5033,
+  kMkvContentCompression = 0x5034,
+  kMkvContentCompAlgo = 0x4254,
+  kMkvContentCompSettings = 0x4255,
   kMkvContentEncryption = 0x5035,
   kMkvContentEncAlgo = 0x47E1,
   kMkvContentEncKeyID = 0x47E2,
+  kMkvContentSignature = 0x47E3,
+  kMkvContentSigKeyID = 0x47E4,
+  kMkvContentSigAlgo = 0x47E5,
+  kMkvContentSigHashAlgo = 0x47E6,
   kMkvContentEncAESSettings = 0x47E7,
   kMkvAESSettingsCipherMode = 0x47E8,
   kMkvAESSettingsCipherInitData = 0x47E9,


@@ -119,7 +119,7 @@
 %if ABI_IS_32BIT
 %if CONFIG_PIC=1
 %ifidn __OUTPUT_FORMAT__,elf32
-%define GET_GOT_SAVE_ARG 1
+%define GET_GOT_DEFINED 1
 %define WRT_PLT wrt ..plt
 %macro GET_GOT 1
 extern _GLOBAL_OFFSET_TABLE_
@@ -138,7 +138,7 @@
 %define RESTORE_GOT pop %1
 %endmacro
 %elifidn __OUTPUT_FORMAT__,macho32
-%define GET_GOT_SAVE_ARG 1
+%define GET_GOT_DEFINED 1
 %macro GET_GOT 1
 push %1
 call %%get_got
@@ -149,6 +149,8 @@
 %undef RESTORE_GOT
 %define RESTORE_GOT pop %1
 %endmacro
+%else
+%define GET_GOT_DEFINED 0
 %endif
 %endif


@@ -6,7 +6,7 @@ cat <<EOF
 # This file is automatically generated from the git commit history
 # by tools/gen_authors.sh.
-$(git log --pretty=format:"%aN <%aE>" | sort | uniq)
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google)
 Google Inc.
 The Mozilla Foundation
 The Xiph.Org Foundation


@@ -66,7 +66,7 @@ void vp10_foreach_transformed_block_in_plane(
   for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
     // Skip visiting the sub blocks that are wholly within the UMV.
     for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) {
-      visit(plane, i, plane_bsize, tx_size, arg);
+      visit(plane, i, r, c, plane_bsize, tx_size, arg);
       i += step;
     }
     i += extra_step;


@@ -70,6 +70,9 @@ typedef struct {
PREDICTION_MODE mode; PREDICTION_MODE mode;
TX_SIZE tx_size; TX_SIZE tx_size;
int8_t skip; int8_t skip;
#if CONFIG_MISC_FIXES
int8_t has_no_coeffs;
#endif
int8_t segment_id; int8_t segment_id;
int8_t seg_id_predicted; // valid only when temporal_update is enabled int8_t seg_id_predicted; // valid only when temporal_update is enabled
@@ -128,6 +131,7 @@ struct macroblockd_plane {
ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *above_context;
ENTROPY_CONTEXT *left_context; ENTROPY_CONTEXT *left_context;
int16_t seg_dequant[MAX_SEGMENTS][2]; int16_t seg_dequant[MAX_SEGMENTS][2];
uint8_t *color_index_map;
// number of 4x4s in current block // number of 4x4s in current block
uint16_t n4_w, n4_h; uint16_t n4_w, n4_h;
@@ -167,8 +171,6 @@ typedef struct macroblockd {
int up_available; int up_available;
int left_available; int left_available;
const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
/* Distance of MB away from frame edges */ /* Distance of MB away from frame edges */
int mb_to_left_edge; int mb_to_left_edge;
int mb_to_right_edge; int mb_to_right_edge;
@@ -176,7 +178,6 @@ typedef struct macroblockd {
int mb_to_bottom_edge; int mb_to_bottom_edge;
FRAME_CONTEXT *fc; FRAME_CONTEXT *fc;
int frame_parallel_decoding_mode;
/* pointers to reference frames */ /* pointers to reference frames */
RefBuffer *block_refs[2]; RefBuffer *block_refs[2];
@@ -195,7 +196,7 @@ typedef struct macroblockd {
int bd; int bd;
#endif #endif
int lossless; int lossless[MAX_SEGMENTS];
int corrupted; int corrupted;
struct vpx_internal_error_info *error_info; struct vpx_internal_error_info *error_info;
@@ -224,8 +225,8 @@ static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd,
const MODE_INFO *const mi = xd->mi[0]; const MODE_INFO *const mi = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi; const MB_MODE_INFO *const mbmi = &mi->mbmi;
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi) || if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
mbmi->tx_size >= TX_32X32) is_inter_block(mbmi) || mbmi->tx_size >= TX_32X32)
return DCT_DCT; return DCT_DCT;
return intra_mode_to_tx_type_lookup[get_y_mode(mi, block_idx)]; return intra_mode_to_tx_type_lookup[get_y_mode(mi, block_idx)];
@@ -266,16 +267,8 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
} }
} }
static INLINE const vpx_prob *get_y_mode_probs(const MODE_INFO *mi,
const MODE_INFO *above_mi,
const MODE_INFO *left_mi,
int block) {
const PREDICTION_MODE above = vp10_above_block_mode(mi, above_mi, block);
const PREDICTION_MODE left = vp10_left_block_mode(mi, left_mi, block);
return vp10_kf_y_mode_prob[above][left];
}
typedef void (*foreach_transformed_block_visitor)(int plane, int block, typedef void (*foreach_transformed_block_visitor)(int plane, int block,
int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, TX_SIZE tx_size,
void *arg); void *arg);
@@ -289,17 +282,6 @@ void vp10_foreach_transformed_block(
const MACROBLOCKD* const xd, BLOCK_SIZE bsize, const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit, void *arg); foreach_transformed_block_visitor visit, void *arg);
static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, int block,
int *x, int *y) {
const int bwl = b_width_log2_lookup[plane_bsize];
const int tx_cols_log2 = bwl - tx_size;
const int tx_cols = 1 << tx_cols_log2;
const int raster_mb = block >> (tx_size << 1);
*x = (raster_mb & (tx_cols - 1)) << tx_size;
*y = (raster_mb >> tx_cols_log2) << tx_size;
}
void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
int aoff, int loff); int aoff, int loff);


@@ -31,6 +31,8 @@ static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
 // Log 2 conversion lookup tables for modeinfo width and height
 static const uint8_t mi_width_log2_lookup[BLOCK_SIZES] =
   {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
+static const uint8_t mi_height_log2_lookup[BLOCK_SIZES] =
+  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
 static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
 static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =


@@ -403,7 +403,6 @@ const vpx_prob vp10_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
   {255, 241, 243, 255, 236, 255, 252, 254},
   {255, 243, 245, 255, 237, 255, 252, 254},
   {255, 246, 247, 255, 239, 255, 253, 255},
-  {255, 246, 247, 255, 239, 255, 253, 255},
 };
 static const vp10_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
@@ -743,7 +742,9 @@ static const vp10_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
 };
 static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
-  memcpy(probs, vp10_pareto8_full[p = 0 ? 0 : p - 1],
+  // TODO(aconverse): model[PIVOT_NODE] should never be zero.
+  // https://code.google.com/p/webm/issues/detail?id=1089
+  memcpy(probs, vp10_pareto8_full[p == 0 ? 254 : p - 1],
          MODEL_NODES * sizeof(vpx_prob));
 }


@@ -153,7 +153,7 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
-#define COEFF_PROB_MODELS 256
+#define COEFF_PROB_MODELS 255
 #define UNCONSTRAINED_NODES 3


@@ -127,6 +127,7 @@ const vpx_prob vp10_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] =
} }
}; };
#if !CONFIG_MISC_FIXES
const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = { const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
{ 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc { 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc
{ 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v { 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v
@@ -139,6 +140,7 @@ const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
{ 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63 { 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63
{ 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm { 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm
}; };
#endif
static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = { static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
{ 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8 { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8
@@ -147,7 +149,7 @@ static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
{ 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32 { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32
}; };
static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { static const vpx_prob default_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
{ 120, 7, 76, 176, 208, 126, 28, 54, 103 }, // y = dc { 120, 7, 76, 176, 208, 126, 28, 54, 103 }, // y = dc
{ 48, 12, 154, 155, 139, 90, 34, 117, 119 }, // y = v { 48, 12, 154, 155, 139, 90, 34, 117, 119 }, // y = v
{ 67, 6, 25, 204, 243, 158, 13, 21, 96 }, // y = h { 67, 6, 25, 204, 243, 158, 13, 21, 96 }, // y = h
@@ -160,6 +162,7 @@ static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm
}; };
#if !CONFIG_MISC_FIXES
const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS] const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = { [PARTITION_TYPES - 1] = {
// 8x8 -> 4x4 // 8x8 -> 4x4
@@ -183,6 +186,7 @@ const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
{ 57, 15, 9 }, // l split, a not split { 57, 15, 9 }, // l split, a not split
{ 12, 3, 3 }, // a/l both split { 12, 3, 3 }, // a/l both split
}; };
#endif
static const vpx_prob default_partition_probs[PARTITION_CONTEXTS] static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = { [PARTITION_TYPES - 1] = {
@@ -314,8 +318,16 @@ static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
{ 149, 144, }, { 149, 144, },
}; };
#if CONFIG_MISC_FIXES
// FIXME(someone) need real defaults here
static const struct segmentation_probs default_seg_probs = {
{ 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128 },
};
#endif
static void init_mode_probs(FRAME_CONTEXT *fc) { static void init_mode_probs(FRAME_CONTEXT *fc) {
vp10_copy(fc->uv_mode_prob, default_if_uv_probs); vp10_copy(fc->uv_mode_prob, default_uv_probs);
vp10_copy(fc->y_mode_prob, default_if_y_probs); vp10_copy(fc->y_mode_prob, default_if_y_probs);
vp10_copy(fc->switchable_interp_prob, default_switchable_interp_prob); vp10_copy(fc->switchable_interp_prob, default_switchable_interp_prob);
vp10_copy(fc->partition_prob, default_partition_probs); vp10_copy(fc->partition_prob, default_partition_probs);
@@ -326,6 +338,10 @@ static void init_mode_probs(FRAME_CONTEXT *fc) {
fc->tx_probs = default_tx_probs; fc->tx_probs = default_tx_probs;
vp10_copy(fc->skip_probs, default_skip_probs); vp10_copy(fc->skip_probs, default_skip_probs);
vp10_copy(fc->inter_mode_probs, default_inter_mode_probs); vp10_copy(fc->inter_mode_probs, default_inter_mode_probs);
#if CONFIG_MISC_FIXES
vp10_copy(fc->seg.tree_probs, default_seg_probs.tree_probs);
vp10_copy(fc->seg.pred_probs, default_seg_probs.pred_probs);
#endif
} }
const vpx_tree_index vp10_switchable_interp_tree const vpx_tree_index vp10_switchable_interp_tree
@@ -334,7 +350,7 @@ const vpx_tree_index vp10_switchable_interp_tree
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
}; };
void vp10_adapt_mode_probs(VP10_COMMON *cm) { void vp10_adapt_inter_frame_probs(VP10_COMMON *cm) {
int i, j; int i, j;
FRAME_CONTEXT *fc = cm->fc; FRAME_CONTEXT *fc = cm->fc;
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
@@ -362,6 +378,7 @@ void vp10_adapt_mode_probs(VP10_COMMON *cm) {
vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->y_mode_prob[i], vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->y_mode_prob[i],
counts->y_mode[i], fc->y_mode_prob[i]); counts->y_mode[i], fc->y_mode_prob[i]);
#if !CONFIG_MISC_FIXES
for (i = 0; i < INTRA_MODES; ++i) for (i = 0; i < INTRA_MODES; ++i)
vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i], vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i],
counts->uv_mode[i], fc->uv_mode_prob[i]); counts->uv_mode[i], fc->uv_mode_prob[i]);
@@ -369,6 +386,7 @@ void vp10_adapt_mode_probs(VP10_COMMON *cm) {
for (i = 0; i < PARTITION_CONTEXTS; i++) for (i = 0; i < PARTITION_CONTEXTS; i++)
vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i], vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
counts->partition[i], fc->partition_prob[i]); counts->partition[i], fc->partition_prob[i]);
#endif
if (cm->interp_filter == SWITCHABLE) { if (cm->interp_filter == SWITCHABLE) {
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
@@ -377,6 +395,13 @@ void vp10_adapt_mode_probs(VP10_COMMON *cm) {
counts->switchable_interp[i], counts->switchable_interp[i],
fc->switchable_interp_prob[i]); fc->switchable_interp_prob[i]);
} }
}
void vp10_adapt_intra_frame_probs(VP10_COMMON *cm) {
int i;
FRAME_CONTEXT *fc = cm->fc;
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
const FRAME_COUNTS *counts = &cm->counts;
if (cm->tx_mode == TX_MODE_SELECT) { if (cm->tx_mode == TX_MODE_SELECT) {
int j; int j;
@@ -405,6 +430,28 @@ void vp10_adapt_mode_probs(VP10_COMMON *cm) {
for (i = 0; i < SKIP_CONTEXTS; ++i) for (i = 0; i < SKIP_CONTEXTS; ++i)
fc->skip_probs[i] = mode_mv_merge_probs( fc->skip_probs[i] = mode_mv_merge_probs(
pre_fc->skip_probs[i], counts->skip[i]); pre_fc->skip_probs[i], counts->skip[i]);
#if CONFIG_MISC_FIXES
if (cm->seg.temporal_update) {
for (i = 0; i < PREDICTION_PROBS; i++)
fc->seg.pred_probs[i] = mode_mv_merge_probs(pre_fc->seg.pred_probs[i],
counts->seg.pred[i]);
vpx_tree_merge_probs(vp10_segment_tree, pre_fc->seg.tree_probs,
counts->seg.tree_mispred, fc->seg.tree_probs);
} else {
vpx_tree_merge_probs(vp10_segment_tree, pre_fc->seg.tree_probs,
counts->seg.tree_total, fc->seg.tree_probs);
}
for (i = 0; i < INTRA_MODES; ++i)
vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i],
counts->uv_mode[i], fc->uv_mode_prob[i]);
for (i = 0; i < PARTITION_CONTEXTS; i++)
vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
counts->partition[i], fc->partition_prob[i]);
#endif
} }
static void set_default_lf_deltas(struct loopfilter *lf) { static void set_default_lf_deltas(struct loopfilter *lf) {
@@ -448,12 +495,12 @@ void vp10_setup_past_independence(VP10_COMMON *cm) {
vp10_init_mv_probs(cm); vp10_init_mv_probs(cm);
cm->fc->initialized = 1; cm->fc->initialized = 1;
if (cm->frame_type == KEY_FRAME || if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
cm->error_resilient_mode || cm->reset_frame_context == 3) { cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
// Reset all frame contexts. // Reset all frame contexts.
for (i = 0; i < FRAME_CONTEXTS; ++i) for (i = 0; i < FRAME_CONTEXTS; ++i)
cm->frame_contexts[i] = *cm->fc; cm->frame_contexts[i] = *cm->fc;
} else if (cm->reset_frame_context == 2) { } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
// Reset only the frame context specified in the frame header. // Reset only the frame context specified in the frame header.
cm->frame_contexts[cm->frame_context_idx] = *cm->fc; cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
} }
@@ -463,7 +510,5 @@ void vp10_setup_past_independence(VP10_COMMON *cm) {
memset(cm->prev_mip, 0, memset(cm->prev_mip, 0,
cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
vp10_zero(cm->ref_frame_sign_bias);
cm->frame_context_idx = 0; cm->frame_context_idx = 0;
} }


@@ -14,6 +14,7 @@
#include "vp10/common/entropy.h" #include "vp10/common/entropy.h"
#include "vp10/common/entropymv.h" #include "vp10/common/entropymv.h"
#include "vp10/common/filter.h" #include "vp10/common/filter.h"
#include "vp10/common/seg_common.h"
#include "vpx_dsp/vpx_filter.h" #include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus #ifdef __cplusplus
@@ -41,6 +42,12 @@ struct tx_counts {
unsigned int tx_totals[TX_SIZES];
};
struct seg_counts {
unsigned int tree_total[MAX_SEGMENTS];
unsigned int tree_mispred[MAX_SEGMENTS];
unsigned int pred[PREDICTION_PROBS][2];
};
typedef struct frame_contexts {
vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
@@ -56,10 +63,14 @@ typedef struct frame_contexts {
struct tx_probs tx_probs;
vpx_prob skip_probs[SKIP_CONTEXTS];
nmv_context nmvc;
#if CONFIG_MISC_FIXES
struct segmentation_probs seg;
#endif
int initialized;
} FRAME_CONTEXT;
typedef struct FRAME_COUNTS {
unsigned int kf_y_mode[INTRA_MODES][INTRA_MODES][INTRA_MODES];
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
@@ -76,22 +87,30 @@ typedef struct FRAME_COUNTS {
struct tx_counts tx;
unsigned int skip[SKIP_CONTEXTS][2];
nmv_context_counts mv;
#if CONFIG_MISC_FIXES
struct seg_counts seg;
#endif
} FRAME_COUNTS;
extern const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vpx_prob vp10_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
#if !CONFIG_MISC_FIXES
extern const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1];
#endif
extern const vpx_tree_index vp10_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
extern const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)];
extern const vpx_tree_index vp10_partition_tree[TREE_SIZE(PARTITION_TYPES)];
extern const vpx_tree_index vp10_switchable_interp_tree
[TREE_SIZE(SWITCHABLE_FILTERS)];
void vp10_setup_past_independence(struct VP10Common *cm);
void vp10_adapt_intra_frame_probs(struct VP10Common *cm);
void vp10_adapt_inter_frame_probs(struct VP10Common *cm);
void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
@@ -100,6 +119,15 @@ void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]);
static INLINE int vp10_ceil_log2(int n) {
int i = 1, p = 2;
while (p < n) {
i++;
p = p << 1;
}
return i;
}
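A quick sanity check of the helper above (a standalone sketch, not part of the patch): vp10_ceil_log2(n) returns how many bits are needed so that 2^result >= n, with the caveat that it never returns 0.

#include <assert.h>
#include <stdio.h>

/* Standalone copy of the helper, for illustration only. */
static int ceil_log2(int n) {
  int i = 1, p = 2;
  while (p < n) {
    i++;
    p = p << 1;
  }
  return i;
}

int main(void) {
  assert(ceil_log2(2) == 1);
  assert(ceil_log2(4) == 2);
  assert(ceil_log2(5) == 3);   /* rounds up to the next power of two */
  assert(ceil_log2(16) == 4);
  assert(ceil_log2(1) == 1);   /* note: returns 1, not 0, for n <= 2 */
  printf("ceil_log2 checks passed\n");
  return 0;
}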
#ifdef __cplusplus
}  // extern "C"
#endif


@@ -128,8 +128,13 @@ MV_CLASS_TYPE vp10_get_mv_class(int z, int *offset) {
}
int vp10_use_mv_hp(const MV *ref) {
#if CONFIG_MISC_FIXES
(void) ref;
return 1;
#else
return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
(abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
#endif
}
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
@@ -161,17 +166,19 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
}
}
void vp10_inc_mv(const MV *mv, nmv_context_counts *counts, const int usehp) {
if (counts != NULL) {
const MV_JOINT_TYPE j = vp10_get_mv_joint(mv);
++counts->joints[j];
if (mv_joint_vertical(j)) {
inc_mv_component(mv->row, &counts->comps[0], 1,
!CONFIG_MISC_FIXES || usehp);
}
if (mv_joint_horizontal(j)) {
inc_mv_component(mv->col, &counts->comps[1], 1,
!CONFIG_MISC_FIXES || usehp);
}
}
}


@@ -124,7 +124,7 @@ typedef struct {
nmv_component_counts comps[2];
} nmv_context_counts;
void vp10_inc_mv(const MV *mv, nmv_context_counts *mvctx, const int usehp);
#ifdef __cplusplus
}  // extern "C"


@@ -179,21 +179,24 @@ void vp10_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
}
void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
int stride, int eob, TX_TYPE tx_type, int lossless) {
if (lossless) {
assert(tx_type == DCT_DCT);
vp10_iwht4x4_add(input, dest, stride, eob);
} else {
switch (tx_type) {
case DCT_DCT:
vp10_idct4x4_add(input, dest, stride, eob);
break;
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
vp10_iht4x4_16_add(input, dest, stride, tx_type);
break;
default:
assert(0);
break;
}
}
}
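With this change the 4x4 inverse transform chooses WHT versus DCT/ADST internally, so callers pass a lossless flag instead of a function pointer. A hypothetical call site might look like the sketch below; the function and variable names are illustrative, not taken from the patch.

/* Hypothetical caller sketch: the lossless decision is made once per block
 * and passed down as a flag. */
static void reconstruct_4x4_block(tran_low_t *dqcoeff, uint8_t *dst,
                                  int stride, int eob, TX_TYPE tx_type,
                                  int block_is_lossless) {
  vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
                        block_is_lossless);
}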
@@ -418,21 +421,24 @@ void vp10_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type,
int lossless) {
if (lossless) {
assert(tx_type == DCT_DCT);
vp10_highbd_iwht4x4_add(input, dest, stride, eob, bd);
} else {
switch (tx_type) {
case DCT_DCT:
vp10_highbd_idct4x4_add(input, dest, stride, eob, bd);
break;
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
break;
default:
assert(0);
break;
}
}
}


@@ -44,9 +44,7 @@ void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
int stride, int eob, TX_TYPE tx_type, int lossless);
void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
int stride, int eob, TX_TYPE tx_type);
void vp10_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
@@ -67,9 +65,7 @@ void vp10_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd);
void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type,
int lossless);
void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type);
void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,


@@ -719,7 +719,11 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
uint64_t *const int_4x4_y = &lfm->int_4x4_y;
uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
#if CONFIG_MISC_FIXES
uint16_t *const int_4x4_uv = &lfm->left_int_4x4_uv;
#else
uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
#endif
int i;
// If filter level is 0 we don't loop filter.
@@ -754,8 +758,13 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
#if CONFIG_MISC_FIXES
if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi))
return;
#else
if (mbmi->skip && is_inter_block(mbmi))
return;
#endif
// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
@@ -812,8 +821,13 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
#if CONFIG_MISC_FIXES
if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi))
return;
#else
if (mbmi->skip && is_inter_block(mbmi))
return;
#endif
*above_y |= (size_mask[block_size] &
above_64x64_txform_mask[tx_size_y]) << shift_y;
@@ -1005,7 +1019,11 @@ void vp10_setup_mask(VP10_COMMON *const cm, const int mi_row, const int mi_col,
lfm->above_uv[i] &= mask_uv;
}
lfm->int_4x4_y &= mask_y;
#if CONFIG_MISC_FIXES
lfm->above_int_4x4_uv = lfm->left_int_4x4_uv & mask_uv;
#else
lfm->int_4x4_uv &= mask_uv;
#endif
// We don't apply a wide loop filter on the last uv block row. If set
// apply the shorter one instead.
@@ -1039,7 +1057,11 @@ void vp10_setup_mask(VP10_COMMON *const cm, const int mi_row, const int mi_col,
lfm->above_uv[i] &= mask_uv;
}
lfm->int_4x4_y &= mask_y;
#if CONFIG_MISC_FIXES
lfm->left_int_4x4_uv &= mask_uv_int;
#else
lfm->int_4x4_uv &= mask_uv_int;
#endif
// We don't apply a wide loop filter on the last uv column. If set
// apply the shorter one instead.
@@ -1069,7 +1091,11 @@ void vp10_setup_mask(VP10_COMMON *const cm, const int mi_row, const int mi_col,
assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_8X8]));
assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
#if CONFIG_MISC_FIXES
assert(!(lfm->left_int_4x4_uv & lfm->left_uv[TX_16X16]));
#else
assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
#endif
assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
@@ -1077,7 +1103,11 @@ void vp10_setup_mask(VP10_COMMON *const cm, const int mi_row, const int mi_col,
assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
#if CONFIG_MISC_FIXES
assert(!(lfm->above_int_4x4_uv & lfm->above_uv[TX_16X16]));
#else
assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
#endif
}
static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -1432,7 +1462,11 @@ void vp10_filter_block_plane_ss11(VP10_COMMON *const cm,
uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
#if CONFIG_MISC_FIXES
uint16_t mask_4x4_int = lfm->left_int_4x4_uv;
#else
uint16_t mask_4x4_int = lfm->int_4x4_uv;
#endif
assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
@@ -1484,7 +1518,11 @@ void vp10_filter_block_plane_ss11(VP10_COMMON *const cm,
mask_16x16 = lfm->above_uv[TX_16X16];
mask_8x8 = lfm->above_uv[TX_8X8];
mask_4x4 = lfm->above_uv[TX_4X4];
#if CONFIG_MISC_FIXES
mask_4x4_int = lfm->above_int_4x4_uv;
#else
mask_4x4_int = lfm->int_4x4_uv;
#endif
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;


@@ -80,7 +80,12 @@ typedef struct {
uint64_t int_4x4_y;
uint16_t left_uv[TX_SIZES];
uint16_t above_uv[TX_SIZES];
#if CONFIG_MISC_FIXES
uint16_t left_int_4x4_uv;
uint16_t above_int_4x4_uv;
#else
uint16_t int_4x4_uv;
#endif
uint8_t lfl_y[64];
uint8_t lfl_uv[16];
} LOOP_FILTER_MASK;


@@ -27,9 +27,13 @@ static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
const TileInfo *const tile = &xd->tile;
const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type] << 3;
const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type] << 3;
#if !CONFIG_MISC_FIXES
// Blank the reference vector list
memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
#endif
// The nearest 2 blocks are treated differently
// if the size < 8x8 we get the mv from the bmi substructure,
@@ -46,10 +50,10 @@ static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
if (candidate->ref_frame[0] == ref_frame)
ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
refmv_count, mv_ref_list, bw, bh, xd, Done);
else if (candidate->ref_frame[1] == ref_frame)
ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
refmv_count, mv_ref_list, bw, bh, xd, Done);
}
}
@@ -64,9 +68,11 @@ static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list,
bw, bh, xd, Done);
else if (candidate->ref_frame[1] == ref_frame)
ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list,
bw, bh, xd, Done);
}
}
@@ -88,9 +94,11 @@ static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
}
if (prev_frame_mvs->ref_frame[0] == ref_frame) {
ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list,
bw, bh, xd, Done);
} else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list,
bw, bh, xd, Done);
}
}
@@ -106,7 +114,7 @@ static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
// If the candidate is INTRA we don't want to consider its mv.
IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
refmv_count, mv_ref_list, bw, bh, xd, Done);
}
}
}
@@ -121,19 +129,21 @@ static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
mv.as_mv.row *= -1;
mv.as_mv.col *= -1;
}
ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done);
}
if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
#if !CONFIG_MISC_FIXES
prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int &&
#endif
prev_frame_mvs->ref_frame[1] != ref_frame) {
int_mv mv = prev_frame_mvs->mv[1];
if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
ref_sign_bias[ref_frame]) {
mv.as_mv.row *= -1;
mv.as_mv.col *= -1;
}
ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done);
}
}
@@ -141,9 +151,14 @@ static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
mode_context[ref_frame] = counter_to_context[context_counter];
#if CONFIG_MISC_FIXES
for (i = refmv_count; i < MAX_MV_REF_CANDIDATES; ++i)
mv_ref_list[i].as_int = 0;
#else
// Clamp vectors
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
clamp_mv_ref(&mv_ref_list[i].as_mv, bw, bh, xd);
#endif
}
void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
@@ -166,14 +181,13 @@ static void lower_mv_precision(MV *mv, int allow_hp) {
}
}
void vp10_find_best_ref_mvs(int allow_hp,
int_mv *mvlist, int_mv *nearest_mv,
int_mv *near_mv) {
int i;
// Make sure all the candidates are properly clamped etc
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
lower_mv_precision(&mvlist[i].as_mv, allow_hp);
}
*nearest_mv = mvlist[0];
*near_mv = mvlist[1];


@@ -17,10 +17,6 @@
extern "C" { extern "C" {
#endif #endif
#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3)
#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
VP9_INTERP_EXTEND) << 3)
#define MVREF_NEIGHBOURS 8
typedef struct position {
@@ -123,13 +119,26 @@ static const int idx_n_column_to_subblock[4][2] = {
};
// clamp_mv_ref
#if CONFIG_MISC_FIXES
#define MV_BORDER (8 << 3)  // Allow 8 pels in 1/8th pel units
#else
#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
#endif
static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
#if CONFIG_MISC_FIXES
clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
xd->mb_to_right_edge + bw * 8 + MV_BORDER,
xd->mb_to_top_edge - bh * 8 - MV_BORDER,
xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
#else
(void) bw;
(void) bh;
clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
xd->mb_to_right_edge + MV_BORDER,
xd->mb_to_top_edge - MV_BORDER,
xd->mb_to_bottom_edge + MV_BORDER);
#endif
}
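A quick arithmetic check of the new bounds (an illustrative, standalone sketch, not code from the patch): motion vectors are in 1/8-pel units, so MV_BORDER = 8 << 3 = 64 is 8 whole pixels under CONFIG_MISC_FIXES, and the bw * 8 / bh * 8 terms convert the block width/height in pixels into the same units.

#include <stdio.h>

/* Illustrative numbers only: a 16x16 block whose top-left corner sits at
 * pixel column 32 of the frame.  mb_to_left_edge is the (negative) distance
 * to the left frame edge in 1/8-pel units, as used by clamp_mv_ref(). */
int main(void) {
  const int bw = 16;                      /* block width in pixels */
  const int mb_to_left_edge = -(32 * 8);  /* -256 in 1/8-pel units */
  const int mv_border = 8 << 3;           /* 64 = 8 pixels, CONFIG_MISC_FIXES */
  const int min_col = mb_to_left_edge - bw * 8 - mv_border;
  /* min_col = -256 - 128 - 64 = -448, i.e. -56 pixels: the predictor may
   * start 56 pixels left of the block, so the 16-pixel-wide predictor can
   * sit entirely outside the frame, but no more than MV_BORDER (8 pixels)
   * beyond the left edge. */
  printf("minimum column MV in 1/8-pel units: %d\n", min_col);
  return 0;
}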
// This function returns either the appropriate sub block or block's mv // This function returns either the appropriate sub block or block's mv
@@ -155,35 +164,41 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
return mv;
}
#if CONFIG_MISC_FIXES
#define CLIP_IN_ADD(mv, bw, bh, xd) clamp_mv_ref(mv, bw, bh, xd)
#else
#define CLIP_IN_ADD(mv, bw, bh, xd) do {} while (0)
#endif
// This macro is used to add a motion vector to the mv_ref list if it isn't
// already in the list. If it's the second motion vector it will also
// skip all additional processing and jump to Done!
#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done) \
do { \
(mv_ref_list)[(refmv_count)] = (mv); \
CLIP_IN_ADD(&(mv_ref_list)[(refmv_count)].as_mv, (bw), (bh), (xd)); \
if (refmv_count && (mv_ref_list)[1].as_int != (mv_ref_list)[0].as_int) { \
(refmv_count) = 2; \
goto Done; \
} \
(refmv_count) = 1; \
} while (0)
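The reworked macro always writes the candidate into the current slot, clamps it in place, and only declares the list complete once two distinct vectors have been collected. A simplified, standalone analog of that counting logic (illustrative only; the real macro also clamps the vector and uses goto to exit early):

#include <assert.h>

static int add_candidate(int list[2], int count, int mv) {
  list[count] = mv;                           /* always write the current slot */
  if (count && list[1] != list[0]) return 2;  /* second distinct mv: done */
  return 1;                                   /* first mv, or a duplicate */
}

int main(void) {
  int list[2] = { 0, 0 };
  int n = 0;
  n = add_candidate(list, n, 5);   /* first candidate */
  assert(n == 1);
  n = add_candidate(list, n, 5);   /* duplicate is ignored */
  assert(n == 1);
  n = add_candidate(list, n, 7);   /* second distinct candidate */
  assert(n == 2 && list[0] == 5 && list[1] == 7);
  return 0;
}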
// If either reference frame is different, not INTRA, and they
// are different from each other, scale and add the mv to our list.
#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \
mv_ref_list, bw, bh, xd, Done) \
do { \
if (is_inter_block(mbmi)) { \
if ((mbmi)->ref_frame[0] != ref_frame) \
ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
refmv_count, mv_ref_list, bw, bh, xd, Done); \
if (has_second_ref(mbmi) && \
(CONFIG_MISC_FIXES || \
(mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) && \
(mbmi)->ref_frame[1] != ref_frame) \
ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
refmv_count, mv_ref_list, bw, bh, xd, Done); \
} \
} while (0)
@@ -199,14 +214,6 @@ static INLINE int is_inside(const TileInfo *const tile,
mi_col + mi_pos->col >= tile->mi_col_end);
}
// TODO(jingning): this mv clamping function should be block size dependent.
static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
xd->mb_to_top_edge - LEFT_TOP_MARGIN,
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
@@ -217,7 +224,7 @@ void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
// Check a list of motion vectors by SAD score, using a number of rows of
// pixels above and a number of columns of pixels to the left, to select the
// one with the best score to use as the ref motion vector.
void vp10_find_best_ref_mvs(int allow_hp,
int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,


@@ -57,6 +57,29 @@ typedef enum {
REFERENCE_MODES = 3,
} REFERENCE_MODE;
typedef enum {
RESET_FRAME_CONTEXT_NONE = 0,
RESET_FRAME_CONTEXT_CURRENT = 1,
RESET_FRAME_CONTEXT_ALL = 2,
} RESET_FRAME_CONTEXT_MODE;
typedef enum {
/**
* Don't update frame context
*/
REFRESH_FRAME_CONTEXT_OFF,
/**
* Update frame context to values resulting from forward probability
* updates signaled in the frame header
*/
REFRESH_FRAME_CONTEXT_FORWARD,
/**
* Update frame context to values resulting from backward probability
* updates based on entropy/counts in the decoded frame
*/
REFRESH_FRAME_CONTEXT_BACKWARD,
} REFRESH_FRAME_CONTEXT_MODE;
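The refresh modes above control what happens to cm->fc once a frame finishes decoding. The following is a hedged sketch of how a decoder might branch on them; the adapt_* names match the declarations introduced in this change, but the surrounding control flow is illustrative only, not copied from the patch.

/* Illustrative only: roughly how a decoder could act on the refresh mode. */
static void maybe_refresh_frame_context(VP10_COMMON *cm, int intra_only) {
  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
    /* Backward adaptation: fold the counts gathered while decoding this
     * frame back into the probabilities. */
    if (intra_only)
      vp10_adapt_intra_frame_probs(cm);
    else
      vp10_adapt_inter_frame_probs(cm);
  }
  if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
}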
typedef struct {
int_mv mv[2];
MV_REFERENCE_FRAME ref_frame[2];
@@ -106,10 +129,11 @@ typedef struct BufferPool {
typedef struct VP10Common {
struct vpx_internal_error_info error;
vpx_color_space_t color_space;
int color_range;
int width;
int height;
int render_width;
int render_height;
int last_width;
int last_height;
@@ -161,10 +185,8 @@ typedef struct VP10Common {
int allow_high_precision_mv;
// Flag signaling which frame contexts should be reset to default values.
RESET_FRAME_CONTEXT_MODE reset_frame_context;
// MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
// MODE_INFO (8-pixel) units.
@@ -222,15 +244,18 @@ typedef struct VP10Common {
loop_filter_info_n lf_info;
// Flag signaling how frame contexts should be updated at the end of
// a frame decode
REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
int ref_frame_sign_bias[MAX_REF_FRAMES];  /* Two state 0, 1 */
struct loopfilter lf;
struct segmentation seg;
#if !CONFIG_MISC_FIXES
struct segmentation_probs segp;
#endif
// TODO(hkuang): Remove this as it is the same as frame_parallel_decode
// in pbi.
int frame_parallel_decode;  // frame-based threading.
// Context probabilities for reference frame prediction
@@ -255,9 +280,9 @@ typedef struct VP10Common {
#endif
int error_resilient_mode;
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
int tile_sz_mag;
int byte_alignment;
int skip_loop_filter;
@@ -275,6 +300,11 @@ typedef struct VP10Common {
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
int above_context_alloc_cols;
// scratch memory for intraonly/keyframe forward updates from default tables
// - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
// each keyframe and not used afterwards
vpx_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
} VP10_COMMON;
// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -347,14 +377,6 @@ static INLINE int frame_is_intra_only(const VP10_COMMON *const cm) {
return cm->frame_type == KEY_FRAME || cm->intra_only;
}
static INLINE void set_partition_probs(const VP10_COMMON *const cm,
MACROBLOCKD *const xd) {
xd->partition_probs =
frame_is_intra_only(cm) ?
&vp10_kf_partition_probs[0] :
(const vpx_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
}
static INLINE void vp10_init_macroblockd(VP10_COMMON *cm, MACROBLOCKD *xd,
tran_low_t *dqcoeff) {
int i;
@@ -370,19 +392,11 @@ static INLINE void vp10_init_macroblockd(VP10_COMMON *cm, MACROBLOCKD *xd,
memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
}
xd->fc = cm->fc;
xd->frame_parallel_decoding_mode = cm->frame_parallel_decoding_mode;
}
xd->above_seg_context = cm->above_seg_context;
xd->mi_stride = cm->mi_stride;
xd->error_info = &cm->error;
set_partition_probs(cm, xd);
}
static INLINE const vpx_prob* get_partition_probs(const MACROBLOCKD *xd,
int ctx) {
return xd->partition_probs[ctx];
}
static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
@@ -432,6 +446,16 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
}
}
static INLINE const vpx_prob *get_y_mode_probs(const VP10_COMMON *cm,
const MODE_INFO *mi,
const MODE_INFO *above_mi,
const MODE_INFO *left_mi,
int block) {
const PREDICTION_MODE above = vp10_above_block_mode(mi, above_mi, block);
const PREDICTION_MODE left = vp10_left_block_mode(mi, left_mi, block);
return cm->kf_y_prob[above][left];
}
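A hedged sketch of how this helper might be used on a keyframe, together with the cm->kf_y_prob scratch memory added above: the vpx_read_tree and vp10_intra_mode_tree symbols are existing libvpx/vp10 names, but the surrounding function and the exact call site are hypothetical, not taken from the patch.

/* Illustrative only: read a keyframe luma intra mode using the
 * context-dependent probabilities returned by get_y_mode_probs(). */
static PREDICTION_MODE read_kf_y_mode(VP10_COMMON *cm, MACROBLOCKD *xd,
                                      MODE_INFO *mi, vpx_reader *r,
                                      int block) {
  const MODE_INFO *above_mi = xd->above_mi;
  const MODE_INFO *left_mi = xd->left_mi;
  const vpx_prob *probs = get_y_mode_probs(cm, mi, above_mi, left_mi, block);
  return (PREDICTION_MODE)vpx_read_tree(r, vp10_intra_mode_tree, probs);
}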
static INLINE void update_partition_context(MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE subsize,


@@ -48,9 +48,9 @@ static INLINE int vp10_get_pred_context_seg_id(const MACROBLOCKD *xd) {
return above_sip + left_sip;
}
static INLINE vpx_prob vp10_get_pred_prob_seg_id(
const struct segmentation_probs *segp, const MACROBLOCKD *xd) {
return segp->pred_probs[vp10_get_pred_context_seg_id(xd)];
}
static INLINE int vp10_get_skip_context(const MACROBLOCKD *xd) {


@@ -128,6 +128,53 @@ void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
}
}
void vp10_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane,
int i, int ir, int ic,
int mi_row, int mi_col) {
struct macroblockd_plane *const pd = &xd->plane[plane];
MODE_INFO *const mi = xd->mi[0];
const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
uint8_t *const dst = &pd->dst.buf[(ir * pd->dst.stride + ic) << 2];
int ref;
const int is_compound = has_second_ref(&mi->mbmi);
const InterpKernel *kernel = vp10_filter_kernels[mi->mbmi.interp_filter];
for (ref = 0; ref < 1 + is_compound; ++ref) {
const uint8_t *pre =
&pd->pre[ref].buf[(ir * pd->pre[ref].stride + ic) << 2];
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp10_highbd_build_inter_predictor(pre, pd->pre[ref].stride,
dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv,
&xd->block_refs[ref]->sf, width, height,
ref, kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE + 4 * ic,
mi_row * MI_SIZE + 4 * ir, xd->bd);
} else {
vp10_build_inter_predictor(pre, pd->pre[ref].stride,
dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv,
&xd->block_refs[ref]->sf, width, height, ref,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE + 4 * ic,
mi_row * MI_SIZE + 4 * ir);
}
#else
vp10_build_inter_predictor(pre, pd->pre[ref].stride,
dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv,
&xd->block_refs[ref]->sf, width, height, ref,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE + 4 * ic,
mi_row * MI_SIZE + 4 * ir);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col,
int plane_from, int plane_to) {
@@ -135,20 +182,26 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
for (plane = plane_from; plane <= plane_to; ++plane) {
const struct macroblockd_plane *pd = &xd->plane[plane];
const int bw = 4 * num_4x4_blocks_wide_lookup[bsize] >> pd->subsampling_x;
const int bh = 4 * num_4x4_blocks_high_lookup[bsize] >> pd->subsampling_y;
if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
const PARTITION_TYPE bp = bsize - xd->mi[0]->mbmi.sb_type;
const int have_vsplit = bp != PARTITION_HORZ;
const int have_hsplit = bp != PARTITION_VERT;
const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
const int pw = 8 >> (have_vsplit | pd->subsampling_x);
const int ph = 8 >> (have_hsplit | pd->subsampling_y);
int x, y;
assert(bp != PARTITION_NONE && bp < PARTITION_TYPES);
assert(bsize == BLOCK_8X8);
assert(pw * num_4x4_w == bw && ph * num_4x4_h == bh);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors(xd, plane, y * 2 + x, bw, bh,
4 * x, 4 * y, pw, ph, mi_x, mi_y);
} else {
build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh, mi_x, mi_y);
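To sanity-check the sub-8x8 arithmetic above, here is a standalone sketch (the enum values mirror vp10's partition ordering; everything else is illustrative): a BLOCK_4X8 split of an 8x8 block yields a 2x1 grid of 4x8 luma predictions, while a 4:2:0 chroma plane collapses to a single 4x4 prediction.

#include <stdio.h>

/* Partition values as in vp10: NONE=0, HORZ=1, VERT=2, SPLIT=3. */
enum { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT };

static void sub8x8_layout(int bp, int ss_x, int ss_y) {
  const int have_vsplit = bp != PARTITION_HORZ;
  const int have_hsplit = bp != PARTITION_VERT;
  const int num_4x4_w = 2 >> ((!have_vsplit) | ss_x);
  const int num_4x4_h = 2 >> ((!have_hsplit) | ss_y);
  const int pw = 8 >> (have_vsplit | ss_x);
  const int ph = 8 >> (have_hsplit | ss_y);
  printf("bp=%d ss=(%d,%d): %dx%d predictions of %dx%d pixels\n",
         bp, ss_x, ss_y, num_4x4_w, num_4x4_h, pw, ph);
}

int main(void) {
  sub8x8_layout(PARTITION_VERT, 0, 0);   /* BLOCK_4X8 luma: 2x1 of 4x8 */
  sub8x8_layout(PARTITION_VERT, 1, 1);   /* BLOCK_4X8 chroma 4:2:0: 1x1 of 4x4 */
  sub8x8_layout(PARTITION_SPLIT, 0, 0);  /* BLOCK_4X4 luma: 2x2 of 4x4 */
  return 0;
}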


@@ -131,6 +131,10 @@ void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int x, int y, int w, int h,
int mi_x, int mi_y);
void vp10_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane,
int i, int ir, int ic,
int mi_row, int mi_col);
void vp10_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);


@@ -21,6 +21,28 @@
#include "vp10/common/reconintra.h" #include "vp10/common/reconintra.h"
#include "vp10/common/onyxc_int.h" #include "vp10/common/onyxc_int.h"
#if CONFIG_MISC_FIXES
enum {
NEED_LEFT = 1 << 1,
NEED_ABOVE = 1 << 2,
NEED_ABOVERIGHT = 1 << 3,
NEED_ABOVELEFT = 1 << 4,
NEED_BOTTOMLEFT = 1 << 5,
};
static const uint8_t extend_modes[INTRA_MODES] = {
NEED_ABOVE | NEED_LEFT, // DC
NEED_ABOVE, // V
NEED_LEFT, // H
NEED_ABOVE | NEED_ABOVERIGHT, // D45
NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135
NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D117
NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D153
NEED_LEFT | NEED_BOTTOMLEFT, // D207
NEED_ABOVE | NEED_ABOVERIGHT, // D63
NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // TM
};
#else
enum {
NEED_LEFT = 1 << 1,
NEED_ABOVE = 1 << 2,
@@ -39,6 +61,134 @@ static const uint8_t extend_modes[INTRA_MODES] = {
NEED_ABOVERIGHT,  // D63
NEED_LEFT | NEED_ABOVE,  // TM
};
#endif
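The NEED_* flags encode which neighbor samples each intra predictor reads. For example, under CONFIG_MISC_FIXES extend_modes[D135_PRED] is NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT: the down-right diagonal predictor needs the left column, the above row and the above-left corner, but neither the above-right nor the bottom-left extension. A small standalone check (constants copied from the table above; the program itself is illustrative):

#include <assert.h>

enum {
  NEED_LEFT = 1 << 1,
  NEED_ABOVE = 1 << 2,
  NEED_ABOVERIGHT = 1 << 3,
  NEED_ABOVELEFT = 1 << 4,
  NEED_BOTTOMLEFT = 1 << 5,
};

int main(void) {
  const int d135 = NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT;
  const int d207 = NEED_LEFT | NEED_BOTTOMLEFT;
  assert((d135 & NEED_ABOVERIGHT) == 0);  /* no above-right samples needed */
  assert((d207 & NEED_ABOVE) == 0);       /* purely left-based predictor */
  return 0;
}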
#if CONFIG_MISC_FIXES
static const uint8_t orders_64x64[1] = { 0 };
static const uint8_t orders_64x32[2] = { 0, 1 };
static const uint8_t orders_32x64[2] = { 0, 1 };
static const uint8_t orders_32x32[4] = {
0, 1,
2, 3,
};
static const uint8_t orders_32x16[8] = {
0, 2,
1, 3,
4, 6,
5, 7,
};
static const uint8_t orders_16x32[8] = {
0, 1, 2, 3,
4, 5, 6, 7,
};
static const uint8_t orders_16x16[16] = {
0, 1, 4, 5,
2, 3, 6, 7,
8, 9, 12, 13,
10, 11, 14, 15,
};
static const uint8_t orders_16x8[32] = {
0, 2, 8, 10,
1, 3, 9, 11,
4, 6, 12, 14,
5, 7, 13, 15,
16, 18, 24, 26,
17, 19, 25, 27,
20, 22, 28, 30,
21, 23, 29, 31,
};
static const uint8_t orders_8x16[32] = {
0, 1, 2, 3, 8, 9, 10, 11,
4, 5, 6, 7, 12, 13, 14, 15,
16, 17, 18, 19, 24, 25, 26, 27,
20, 21, 22, 23, 28, 29, 30, 31,
};
static const uint8_t orders_8x8[64] = {
0, 1, 4, 5, 16, 17, 20, 21,
2, 3, 6, 7, 18, 19, 22, 23,
8, 9, 12, 13, 24, 25, 28, 29,
10, 11, 14, 15, 26, 27, 30, 31,
32, 33, 36, 37, 48, 49, 52, 53,
34, 35, 38, 39, 50, 51, 54, 55,
40, 41, 44, 45, 56, 57, 60, 61,
42, 43, 46, 47, 58, 59, 62, 63,
};
static const uint8_t *const orders[BLOCK_SIZES] = {
orders_8x8, orders_8x8, orders_8x8, orders_8x8,
orders_8x16, orders_16x8, orders_16x16,
orders_16x32, orders_32x16, orders_32x32,
orders_32x64, orders_64x32, orders_64x64,
};
static int vp10_has_right(BLOCK_SIZE bsize, int mi_row, int mi_col,
int right_available,
TX_SIZE txsz, int y, int x, int ss_x) {
if (y == 0) {
int wl = mi_width_log2_lookup[bsize];
int hl = mi_height_log2_lookup[bsize];
int w = 1 << (wl + 1 - ss_x);
int step = 1 << txsz;
const uint8_t *order = orders[bsize];
int my_order, tr_order;
if (x + step < w)
return 1;
mi_row = (mi_row & 7) >> hl;
mi_col = (mi_col & 7) >> wl;
if (mi_row == 0)
return right_available;
if (((mi_col + 1) << wl) >= 8)
return 0;
my_order = order[((mi_row + 0) << (3 - wl)) + mi_col + 0];
tr_order = order[((mi_row - 1) << (3 - wl)) + mi_col + 1];
return my_order > tr_order && right_available;
} else {
int wl = mi_width_log2_lookup[bsize];
int w = 1 << (wl + 1 - ss_x);
int step = 1 << txsz;
return x + step < w;
}
}
static int vp10_has_bottom(BLOCK_SIZE bsize, int mi_row, int mi_col,
int bottom_available, TX_SIZE txsz,
int y, int x, int ss_y) {
if (x == 0) {
int wl = mi_width_log2_lookup[bsize];
int hl = mi_height_log2_lookup[bsize];
int h = 1 << (hl + 1 - ss_y);
int step = 1 << txsz;
const uint8_t *order = orders[bsize];
int my_order, bl_order;
mi_row = (mi_row & 7) >> hl;
mi_col = (mi_col & 7) >> wl;
if (mi_col == 0)
return bottom_available &&
(mi_row << (hl + !ss_y)) + y + step < (8 << !ss_y);
if (((mi_row + 1) << hl) >= 8)
return 0;
if (y + step < h)
return 1;
my_order = order[((mi_row + 0) << (3 - wl)) + mi_col + 0];
bl_order = order[((mi_row + 1) << (3 - wl)) + mi_col - 1];
return bl_order < my_order && bottom_available;
} else {
return 0;
}
}
#endif
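The orders tables give the coding (z-scan) index of each block of a given size inside its 64x64 superblock; vp10_has_right() and vp10_has_bottom() compare those indices to decide whether the above-right or bottom-left neighbor has already been reconstructed. A small standalone check (the table is copied from orders_16x16 above; the scenario is illustrative):

#include <assert.h>

/* orders_16x16 from above: coding order of 16x16 blocks within a 64x64
 * superblock, laid out as a 4x4 grid in raster order. */
static const unsigned char orders_16x16[16] = {
  0, 1, 4, 5,
  2, 3, 6, 7,
  8, 9, 12, 13,
  10, 11, 14, 15,
};

int main(void) {
  /* A 16x16 block in row 1, column 2 of its superblock... */
  const int my_order = orders_16x16[1 * 4 + 2];   /* 6 */
  /* ...and its above-right neighbour in row 0, column 3. */
  const int tr_order = orders_16x16[0 * 4 + 3];   /* 5 */
  /* The neighbour was coded earlier, so its pixels can be used for the
   * above-right extension (subject to the frame/tile boundary checks). */
  assert(my_order > tr_order);
  return 0;
}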
typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left);
@@ -55,17 +205,26 @@ static intra_high_pred_fn dc_pred_high[2][2][4];
#endif  // CONFIG_VP9_HIGHBITDEPTH
static void vp10_init_intra_predictors_internal(void) {
#define INIT_NO_4X4(p, type) \
p[TX_8X8] = vpx_##type##_predictor_8x8; \
p[TX_16X16] = vpx_##type##_predictor_16x16; \
p[TX_32X32] = vpx_##type##_predictor_32x32
#define INIT_ALL_SIZES(p, type) \
p[TX_4X4] = vpx_##type##_predictor_4x4; \
INIT_NO_4X4(p, type)
INIT_ALL_SIZES(pred[V_PRED], v);
INIT_ALL_SIZES(pred[H_PRED], h);
#if CONFIG_MISC_FIXES
INIT_ALL_SIZES(pred[D207_PRED], d207e);
INIT_ALL_SIZES(pred[D45_PRED], d45e);
INIT_ALL_SIZES(pred[D63_PRED], d63e);
#else
INIT_ALL_SIZES(pred[D207_PRED], d207);
INIT_ALL_SIZES(pred[D45_PRED], d45);
INIT_ALL_SIZES(pred[D63_PRED], d63);
#endif
INIT_ALL_SIZES(pred[D117_PRED], d117);
INIT_ALL_SIZES(pred[D135_PRED], d135);
INIT_ALL_SIZES(pred[D153_PRED], d153);
@@ -79,9 +238,15 @@ static void vp10_init_intra_predictors_internal(void) {
#if CONFIG_VP9_HIGHBITDEPTH
INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
#if CONFIG_MISC_FIXES
INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207e);
INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45e);
INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
#else
INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207);
INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45);
INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
#endif
INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
@@ -96,6 +261,13 @@ static void vp10_init_intra_predictors_internal(void) {
#undef intra_pred_allsizes
}
#if CONFIG_MISC_FIXES
static INLINE void memset16(uint16_t *dst, int val, int n) {
while (n--)
*dst++ = val;
}
#endif
#if CONFIG_VP9_HIGHBITDEPTH
static void build_intra_predictors_high(const MACROBLOCKD *xd,
const uint8_t *ref8,
@@ -104,23 +276,38 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
int dst_stride,
PREDICTION_MODE mode,
TX_SIZE tx_size,
#if CONFIG_MISC_FIXES
int n_top_px, int n_topright_px,
int n_left_px, int n_bottomleft_px,
#else
int up_available,
int left_available,
int right_available,
#endif
int x, int y,
int plane, int bd) {
int i;
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
#if CONFIG_MISC_FIXES
DECLARE_ALIGNED(16, uint16_t, left_col[32]);
#else
DECLARE_ALIGNED(16, uint16_t, left_col[64]);
#endif
DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]);
uint16_t *above_row = above_data + 16;
const uint16_t *const_above_row = above_row;
const int bs = 4 << tx_size;
#if CONFIG_MISC_FIXES
const uint16_t *above_ref = ref - ref_stride;
#else
int frame_width, frame_height;
int x0, y0;
const struct macroblockd_plane *const pd = &xd->plane[plane];
#endif
const int need_left = extend_modes[mode] & NEED_LEFT;
const int need_above = extend_modes[mode] & NEED_ABOVE;
const int need_aboveright = extend_modes[mode] & NEED_ABOVERIGHT;
int base = 128 << (bd - 8);
// 127 127 127 .. 127 127 127 127 127 127
// 129 A B .. Y Z
// 129 C D .. W X
// 129 E F .. U V
// 129 G H .. S T T T T T
#if CONFIG_MISC_FIXES
(void) x;
(void) y;
(void) plane;
(void) need_left;
(void) need_above;
(void) need_aboveright;
// NEED_LEFT
if (extend_modes[mode] & NEED_LEFT) {
const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
i = 0;
if (n_left_px > 0) {
for (; i < n_left_px; i++)
left_col[i] = ref[i * ref_stride - 1];
if (need_bottom && n_bottomleft_px > 0) {
assert(i == bs);
for (; i < bs + n_bottomleft_px; i++)
left_col[i] = ref[i * ref_stride - 1];
}
if (i < (bs << need_bottom))
memset16(&left_col[i], left_col[i - 1], (bs << need_bottom) - i);
} else {
memset16(left_col, base + 1, bs << need_bottom);
}
}
// NEED_ABOVE
if (extend_modes[mode] & NEED_ABOVE) {
const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
if (n_top_px > 0) {
memcpy(above_row, above_ref, n_top_px * 2);
i = n_top_px;
if (need_right && n_topright_px > 0) {
assert(n_top_px == bs);
memcpy(above_row + bs, above_ref + bs, n_topright_px * 2);
i += n_topright_px;
}
if (i < (bs << need_right))
memset16(&above_row[i], above_row[i - 1], (bs << need_right) - i);
} else {
memset16(above_row, base - 1, bs << need_right);
}
}
if (extend_modes[mode] & NEED_ABOVELEFT) {
above_row[-1] = n_top_px > 0 ?
(n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
}
#else
// Get current frame pointer, width and height.
if (plane == 0) {
frame_width = xd->cur_buf->y_width;
@@ -264,8 +378,207 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
// NEED_LEFT
if (need_left) {
if (left_available) {
if (xd->mb_to_bottom_edge < 0) {
/* slower path if the block needs border extension */
if (y0 + bs <= frame_height) {
for (i = 0; i < bs; ++i)
left_col[i] = ref[i * ref_stride - 1];
} else {
const int extend_bottom = frame_height - y0;
for (i = 0; i < extend_bottom; ++i)
left_col[i] = ref[i * ref_stride - 1];
for (; i < bs; ++i)
left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
}
} else {
/* faster path if the block does not need extension */
for (i = 0; i < bs; ++i)
left_col[i] = ref[i * ref_stride - 1];
}
} else {
// TODO(Peter): this value should probably change for high bitdepth
vpx_memset16(left_col, base + 1, bs);
}
}
// NEED_ABOVE
if (need_above) {
if (up_available) {
const uint16_t *above_ref = ref - ref_stride;
if (xd->mb_to_right_edge < 0) {
/* slower path if the block needs border extension */
if (x0 + bs <= frame_width) {
memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
} else if (x0 <= frame_width) {
const int r = frame_width - x0;
memcpy(above_row, above_ref, r * sizeof(above_row[0]));
vpx_memset16(above_row + r, above_row[r - 1], x0 + bs - frame_width);
}
} else {
/* faster path if the block does not need extension */
if (bs == 4 && right_available && left_available) {
const_above_row = above_ref;
} else {
memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
}
}
above_row[-1] = left_available ? above_ref[-1] : (base + 1);
} else {
vpx_memset16(above_row, base - 1, bs);
above_row[-1] = base - 1;
}
}
// NEED_ABOVERIGHT
if (need_aboveright) {
if (up_available) {
const uint16_t *above_ref = ref - ref_stride;
if (xd->mb_to_right_edge < 0) {
/* slower path if the block needs border extension */
if (x0 + 2 * bs <= frame_width) {
if (right_available && bs == 4) {
memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0]));
} else {
memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 + bs <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
memcpy(above_row, above_ref, r * sizeof(above_row[0]));
vpx_memset16(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 <= frame_width) {
const int r = frame_width - x0;
memcpy(above_row, above_ref, r * sizeof(above_row[0]));
vpx_memset16(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
}
// TODO(Peter) this value should probably change for high bitdepth
above_row[-1] = left_available ? above_ref[-1] : (base + 1);
} else {
/* faster path if the block does not need extension */
if (bs == 4 && right_available && left_available) {
const_above_row = above_ref;
} else {
memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
if (bs == 4 && right_available)
memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
else
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
// TODO(Peter): this value should probably change for high bitdepth
above_row[-1] = left_available ? above_ref[-1] : (base + 1);
}
}
} else {
vpx_memset16(above_row, base - 1, bs * 2);
// TODO(Peter): this value should probably change for high bitdepth
above_row[-1] = base - 1;
}
}
#endif
// predict
if (mode == DC_PRED) {
#if CONFIG_MISC_FIXES
dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
const_above_row,
left_col, xd->bd);
#else
dc_pred_high[left_available][up_available][tx_size](dst, dst_stride,
const_above_row,
left_col, xd->bd);
#endif
} else {
pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col,
xd->bd);
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
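For reference, the dc_pred[have_left][have_top][tx_size] tables used above dispatch to one of four DC variants depending on which borders exist. The stand-alone sketch below (illustrative only, not library code) mirrors that behaviour for the 8-bit path: average the available above/left samples, or fall back to 128 when neither border is available; the high-bitdepth table uses a bit-depth dependent midpoint instead of 128.

#include <stdint.h>

/* Illustrative DC predictor: average of the available above/left
 * neighbors, or 128 when neither border is available (8-bit path). */
static void dc_predictor_sketch(uint8_t *dst, int stride, int bs,
                                const uint8_t *above, const uint8_t *left,
                                int have_above, int have_left) {
  int i, r, c, sum = 0, count = 0, dc = 128;
  if (have_above) {
    for (i = 0; i < bs; ++i) sum += above[i];
    count += bs;
  }
  if (have_left) {
    for (i = 0; i < bs; ++i) sum += left[i];
    count += bs;
  }
  if (count) dc = (sum + (count >> 1)) / count;  /* rounded average */
  for (r = 0; r < bs; ++r)
    for (c = 0; c < bs; ++c)
      dst[r * stride + c] = (uint8_t)dc;
}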
static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
int ref_stride, uint8_t *dst, int dst_stride,
PREDICTION_MODE mode, TX_SIZE tx_size,
#if CONFIG_MISC_FIXES
int n_top_px, int n_topright_px,
int n_left_px, int n_bottomleft_px,
#else
int up_available, int left_available,
int right_available,
#endif
int x, int y, int plane) {
int i;
#if CONFIG_MISC_FIXES
DECLARE_ALIGNED(16, uint8_t, left_col[64]);
const uint8_t *above_ref = ref - ref_stride;
#else
DECLARE_ALIGNED(16, uint8_t, left_col[32]);
int frame_width, frame_height;
int x0, y0;
const struct macroblockd_plane *const pd = &xd->plane[plane];
#endif
DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
// 127 127 127 .. 127 127 127 127 127 127
// 129 A B .. Y Z
// 129 C D .. W X
// 129 E F .. U V
// 129 G H .. S T T T T T
// ..
#if CONFIG_MISC_FIXES
(void) xd;
(void) x;
(void) y;
(void) plane;
assert(n_top_px >= 0);
assert(n_topright_px >= 0);
assert(n_left_px >= 0);
assert(n_bottomleft_px >= 0);
#else
// Get current frame pointer, width and height.
if (plane == 0) {
frame_width = xd->cur_buf->y_width;
frame_height = xd->cur_buf->y_height;
} else {
frame_width = xd->cur_buf->uv_width;
frame_height = xd->cur_buf->uv_height;
}
// Get block position in current frame.
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
#endif
// NEED_LEFT
if (extend_modes[mode] & NEED_LEFT) {
#if CONFIG_MISC_FIXES
const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
i = 0;
if (n_left_px > 0) {
for (; i < n_left_px; i++)
left_col[i] = ref[i * ref_stride - 1];
if (need_bottom && n_bottomleft_px > 0) {
assert(i == bs);
for (; i < bs + n_bottomleft_px; i++)
left_col[i] = ref[i * ref_stride - 1];
}
if (i < (bs << need_bottom))
memset(&left_col[i], left_col[i - 1], (bs << need_bottom) - i);
} else {
memset(left_col, 129, bs << need_bottom);
}
#else
if (left_available) {
if (xd->mb_to_bottom_edge < 0) {
/* slower path if the block needs border extension */
@@ -287,10 +600,27 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
} else {
memset(left_col, 129, bs);
}
#endif
}
// NEED_ABOVE
if (extend_modes[mode] & NEED_ABOVE) {
#if CONFIG_MISC_FIXES
const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
if (n_top_px > 0) {
memcpy(above_row, above_ref, n_top_px);
i = n_top_px;
if (need_right && n_topright_px > 0) {
assert(n_top_px == bs);
memcpy(above_row + bs, above_ref + bs, n_topright_px);
i += n_topright_px;
}
if (i < (bs << need_right))
memset(&above_row[i], above_row[i - 1], (bs << need_right) - i);
} else {
memset(above_row, 127, bs << need_right);
}
#else
if (up_available) {
const uint8_t *above_ref = ref - ref_stride;
if (xd->mb_to_right_edge < 0) {
@@ -315,8 +645,14 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
memset(above_row, 127, bs);
above_row[-1] = 127;
}
#endif
}
#if CONFIG_MISC_FIXES
if (extend_modes[mode] & NEED_ABOVELEFT) {
above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
}
#else
// NEED_ABOVERIGHT
if (extend_modes[mode] & NEED_ABOVERIGHT) {
if (up_available) {
@@ -362,29 +698,83 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
above_row[-1] = 127;
}
}
#endif
// predict
if (mode == DC_PRED) {
#if CONFIG_MISC_FIXES
dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
const_above_row, left_col);
#else
dc_pred[left_available][up_available][tx_size](dst, dst_stride,
const_above_row, left_col);
#endif
} else {
pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
}
}
void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,
int aoff, int loff, int plane) {
const int bw = (1 << bwl_in);
const int txw = (1 << tx_size);
const int have_top = loff || xd->up_available;
const int have_left = aoff || xd->left_available;
const int have_right = (aoff + txw) < bw;
const int x = aoff * 4;
const int y = loff * 4;
#if CONFIG_MISC_FIXES
const int bw = VPXMAX(2, 1 << bwl_in);
const int bh = VPXMAX(2, 1 << bhl_in);
const int mi_row = -xd->mb_to_top_edge >> 6;
const int mi_col = -xd->mb_to_left_edge >> 6;
const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int right_available =
mi_col + (bw >> !pd->subsampling_x) < xd->tile.mi_col_end;
const int have_right = vp10_has_right(bsize, mi_row, mi_col,
right_available,
tx_size, loff, aoff,
pd->subsampling_x);
const int have_bottom = vp10_has_bottom(bsize, mi_row, mi_col,
xd->mb_to_bottom_edge > 0,
tx_size, loff, aoff,
pd->subsampling_y);
const int wpx = 4 * bw;
const int hpx = 4 * bh;
const int txpx = 4 * txw;
int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + (wpx - x - txpx);
int yd =
(xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + (hpx - y - txpx);
#else
const int bw = (1 << bwl_in);
const int have_right = (aoff + txw) < bw;
#endif // CONFIG_MISC_FIXES
#if CONFIG_MISC_FIXES
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
tx_size,
have_top ? VPXMIN(txpx, xr + txpx) : 0,
have_top && have_right ? VPXMIN(txpx, xr) : 0,
have_left ? VPXMIN(txpx, yd + txpx) : 0,
have_bottom && have_left ? VPXMIN(txpx, yd) : 0,
x, y, plane, xd->bd);
return;
}
#endif
build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
tx_size,
have_top ? VPXMIN(txpx, xr + txpx) : 0,
have_top && have_right ? VPXMIN(txpx, xr) : 0,
have_left ? VPXMIN(txpx, yd + txpx) : 0,
have_bottom && have_left ? VPXMIN(txpx, yd) : 0,
x, y, plane);
#else // CONFIG_MISC_FIXES
(void) bhl_in;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
@@ -395,6 +785,7 @@ void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
#endif
build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
have_top, have_left, have_right, x, y, plane);
#endif // CONFIG_MISC_FIXES
}
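To make the VPXMIN() clamps above concrete, suppose a 16x16 luma transform block overhangs the right frame border by 8 pixels with nothing to its right inside the superblock; xr then works out to -8, so only the 8 existing above pixels are read and have_right suppresses the top-right fetch entirely. A small stand-alone illustration with hypothetical numbers (not library code):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  /* Hypothetical 16x16 luma tx block overhanging the right frame edge
   * by 8 pixels, with no further blocks to its right in the superblock. */
  const int txpx = 16;
  const int xr = -8;                 /* 8 pixels missing past the right edge */
  const int have_right = 0;          /* vp10_has_right() fails at the border */
  printf("n_top_px      = %d\n", MIN(txpx, xr + txpx));            /* 8 */
  printf("n_topright_px = %d\n", have_right ? MIN(txpx, xr) : 0);  /* 0 */
  return 0;
}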
void vp10_init_intra_predictors(void) {


@@ -20,7 +20,7 @@ extern "C" {
void vp10_init_intra_predictors(void);
void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride,


@@ -695,6 +695,13 @@ DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_32x32[1024]) = {
1023,
};
const scan_order vp10_default_scan_orders[TX_SIZES] = {
{default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
{default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
{default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
{default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
};
const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES] = {
{ // TX_4X4
{default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},


@@ -29,6 +29,7 @@ typedef struct {
const int16_t *neighbors;
} scan_order;
extern const scan_order vp10_default_scan_orders[TX_SIZES];
extern const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES];
static INLINE int get_coef_context(const int16_t *neighbors,


@@ -42,13 +42,15 @@ struct segmentation {
uint8_t abs_delta;
uint8_t temporal_update;
vpx_prob tree_probs[SEG_TREE_PROBS];
vpx_prob pred_probs[PREDICTION_PROBS];
int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
unsigned int feature_mask[MAX_SEGMENTS];
};
struct segmentation_probs {
vpx_prob tree_probs[SEG_TREE_PROBS];
vpx_prob pred_probs[PREDICTION_PROBS];
};
static INLINE int segfeature_active(const struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id) {


@@ -434,4 +434,15 @@ void vp10_accumulate_frame_counts(VP10_COMMON *cm, FRAME_COUNTS *counts,
for (i = 0; i < MV_FP_SIZE; i++)
comps->fp[i] += comps_t->fp[i];
}
#if CONFIG_MISC_FIXES
for (i = 0; i < PREDICTION_PROBS; i++)
for (j = 0; j < 2; j++)
cm->counts.seg.pred[i][j] += counts->seg.pred[i][j];
for (i = 0; i < MAX_SEGMENTS; i++) {
cm->counts.seg.tree_total[i] += counts->seg.tree_total[i];
cm->counts.seg.tree_mispred[i] += counts->seg.tree_mispred[i];
}
#endif
}


@@ -14,6 +14,10 @@
#include "vp10/common/loopfilter.h" #include "vp10/common/loopfilter.h"
#include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_thread.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP10Common;
struct FRAME_COUNTS;
@@ -54,4 +58,8 @@ void vp10_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
void vp10_accumulate_frame_counts(struct VP10Common *cm,
struct FRAME_COUNTS *counts, int is_dec);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_COMMON_LOOPFILTER_THREAD_H_

vp10/common/vp10_fwd_txfm.c (new file, 824 lines)

@@ -0,0 +1,824 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp10/common/vp10_fwd_txfm.h"
void vp10_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int pass;
// We need an intermediate buffer between passes.
tran_low_t intermediate[4 * 4];
const int16_t *in_pass0 = input;
const tran_low_t *in = NULL;
tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
tran_high_t input[4]; // canbe16
tran_high_t step[4]; // canbe16
tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 4; ++i) {
// Load inputs.
if (0 == pass) {
input[0] = in_pass0[0 * stride] * 16;
input[1] = in_pass0[1 * stride] * 16;
input[2] = in_pass0[2 * stride] * 16;
input[3] = in_pass0[3 * stride] * 16;
if (i == 0 && input[0]) {
input[0] += 1;
}
} else {
input[0] = in[0 * 4];
input[1] = in[1 * 4];
input[2] = in[2 * 4];
input[3] = in[3 * 4];
}
// Transform.
step[0] = input[0] + input[3];
step[1] = input[1] + input[2];
step[2] = input[1] - input[2];
step[3] = input[0] - input[3];
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
out[0] = (tran_low_t)fdct_round_shift(temp1);
out[2] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
out[1] = (tran_low_t)fdct_round_shift(temp1);
out[3] = (tran_low_t)fdct_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
in_pass0++;
in++;
out += 4;
}
// Setup in/out for next pass.
in = intermediate;
out = output;
}
{
int i, j;
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
}
}
}
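As a usage sketch (not part of the library), the 4x4 forward transform above can be driven directly on a single residual block. The prototype normally comes from the generated vp10_rtcd.h and tran_low_t from the vpx_dsp headers; both are assumed here to keep the example compilable inside the source tree.

/* Hedged usage sketch, not library code. */
#include <stdio.h>
#include "vpx_dsp/txfm_common.h"   /* pulls in tran_low_t */

void vp10_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);

int main(void) {
  /* 4x4 residual block, stride of 4 samples; values chosen arbitrarily. */
  const int16_t residual[4 * 4] = {
     3, -1,  2,  0,
     1,  0, -2,  4,
    -3,  2,  1, -1,
     0,  1,  0,  2,
  };
  tran_low_t coeff[4 * 4];
  vp10_fdct4x4_c(residual, coeff, 4 /* stride in samples */);
  printf("DC coefficient: %d\n", (int)coeff[0]);
  return 0;
}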
void vp10_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 4; ++r)
for (c = 0; c < 4; ++c)
sum += input[r * stride + c];
output[0] = sum << 1;
output[1] = 0;
}
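Worked example of this DC-only shortcut: for a flat 4x4 block whose samples are all 10, the sum is 160 and output[0] = 160 << 1 = 320, which lines up with the DC term the full vp10_fdct4x4_c produces for the same flat block. A minimal check:

#include <stdio.h>

int main(void) {
  /* Flat 4x4 block where every residual sample is 10. */
  int r, c, sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += 10;
  printf("sum = %d, DC-only output[0] = %d\n", sum, sum << 1);  /* 160, 320 */
  return 0;
}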
void vp10_fdct8x8_c(const int16_t *input,
tran_low_t *final_output, int stride) {
int i, j;
tran_low_t intermediate[64];
int pass;
tran_low_t *output = intermediate;
const tran_low_t *in = NULL;
// Transform columns
for (pass = 0; pass < 2; ++pass) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
int i;
for (i = 0; i < 8; i++) {
// stage 1
if (pass == 0) {
s0 = (input[0 * stride] + input[7 * stride]) * 4;
s1 = (input[1 * stride] + input[6 * stride]) * 4;
s2 = (input[2 * stride] + input[5 * stride]) * 4;
s3 = (input[3 * stride] + input[4 * stride]) * 4;
s4 = (input[3 * stride] - input[4 * stride]) * 4;
s5 = (input[2 * stride] - input[5 * stride]) * 4;
s6 = (input[1 * stride] - input[6 * stride]) * 4;
s7 = (input[0 * stride] - input[7 * stride]) * 4;
++input;
} else {
s0 = in[0 * 8] + in[7 * 8];
s1 = in[1 * 8] + in[6 * 8];
s2 = in[2 * 8] + in[5 * 8];
s3 = in[3 * 8] + in[4 * 8];
s4 = in[3 * 8] - in[4 * 8];
s5 = in[2 * 8] - in[5 * 8];
s6 = in[1 * 8] - in[6 * 8];
s7 = in[0 * 8] - in[7 * 8];
++in;
}
// fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
output[0] = (tran_low_t)fdct_round_shift(t0);
output[2] = (tran_low_t)fdct_round_shift(t2);
output[4] = (tran_low_t)fdct_round_shift(t1);
output[6] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// Stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
output[1] = (tran_low_t)fdct_round_shift(t0);
output[3] = (tran_low_t)fdct_round_shift(t2);
output[5] = (tran_low_t)fdct_round_shift(t1);
output[7] = (tran_low_t)fdct_round_shift(t3);
output += 8;
}
in = intermediate;
output = final_output;
}
// Rows
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
}
void vp10_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 8; ++r)
for (c = 0; c < 8; ++c)
sum += input[r * stride + c];
output[0] = sum;
output[1] = 0;
}
void vp10_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int pass;
// We need an intermediate buffer between passes.
tran_low_t intermediate[256];
const int16_t *in_pass0 = input;
const tran_low_t *in = NULL;
tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
tran_high_t input[8]; // canbe16
tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 16; i++) {
if (0 == pass) {
// Calculate input for the first 8 results.
input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
// Calculate input for the next 8 results.
step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
} else {
// Calculate input for the first 8 results.
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
// Calculate input for the next 8 results.
step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
}
// Work on the first eight values; fdct8(input, even_results);
{
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = input[0] + input[7];
s1 = input[1] + input[6];
s2 = input[2] + input[5];
s3 = input[3] + input[4];
s4 = input[3] - input[4];
s5 = input[2] - input[5];
s6 = input[1] - input[6];
s7 = input[0] - input[7];
// fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
out[0] = (tran_low_t)fdct_round_shift(t0);
out[4] = (tran_low_t)fdct_round_shift(t2);
out[8] = (tran_low_t)fdct_round_shift(t1);
out[12] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// Stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
out[2] = (tran_low_t)fdct_round_shift(t0);
out[6] = (tran_low_t)fdct_round_shift(t2);
out[10] = (tran_low_t)fdct_round_shift(t1);
out[14] = (tran_low_t)fdct_round_shift(t3);
}
// Work on the next eight values; step1 -> odd_results
{
// step 2
temp1 = (step1[5] - step1[2]) * cospi_16_64;
temp2 = (step1[4] - step1[3]) * cospi_16_64;
step2[2] = fdct_round_shift(temp1);
step2[3] = fdct_round_shift(temp2);
temp1 = (step1[4] + step1[3]) * cospi_16_64;
temp2 = (step1[5] + step1[2]) * cospi_16_64;
step2[4] = fdct_round_shift(temp1);
step2[5] = fdct_round_shift(temp2);
// step 3
step3[0] = step1[0] + step2[3];
step3[1] = step1[1] + step2[2];
step3[2] = step1[1] - step2[2];
step3[3] = step1[0] - step2[3];
step3[4] = step1[7] - step2[4];
step3[5] = step1[6] - step2[5];
step3[6] = step1[6] + step2[5];
step3[7] = step1[7] + step2[4];
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
step1[2] = step3[3] + step2[2];
step1[3] = step3[3] - step2[2];
step1[4] = step3[4] - step2[5];
step1[5] = step3[4] + step2[5];
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
out[1] = (tran_low_t)fdct_round_shift(temp1);
out[9] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
out[5] = (tran_low_t)fdct_round_shift(temp1);
out[13] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
out[3] = (tran_low_t)fdct_round_shift(temp1);
out[11] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
out[7] = (tran_low_t)fdct_round_shift(temp1);
out[15] = (tran_low_t)fdct_round_shift(temp2);
}
// Do next column (which is a transposed row in second/horizontal pass)
in++;
in_pass0++;
out += 16;
}
// Setup in/out for next pass.
in = intermediate;
out = output;
}
}
void vp10_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 16; ++r)
for (c = 0; c < 16; ++c)
sum += input[r * stride + c];
output[0] = sum >> 1;
output[1] = 0;
}
static INLINE tran_high_t dct_32_round(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
// TODO(debargha, peter.derivaz): Find new bounds for this assert,
// and make the bounds consts.
// assert(-131072 <= rv && rv <= 131071);
return rv;
}
static INLINE tran_high_t half_round_shift(tran_high_t input) {
tran_high_t rv = (input + 1 + (input < 0)) >> 2;
return rv;
}
void vp10_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
step[1] = input[1] + input[(32 - 2)];
step[2] = input[2] + input[(32 - 3)];
step[3] = input[3] + input[(32 - 4)];
step[4] = input[4] + input[(32 - 5)];
step[5] = input[5] + input[(32 - 6)];
step[6] = input[6] + input[(32 - 7)];
step[7] = input[7] + input[(32 - 8)];
step[8] = input[8] + input[(32 - 9)];
step[9] = input[9] + input[(32 - 10)];
step[10] = input[10] + input[(32 - 11)];
step[11] = input[11] + input[(32 - 12)];
step[12] = input[12] + input[(32 - 13)];
step[13] = input[13] + input[(32 - 14)];
step[14] = input[14] + input[(32 - 15)];
step[15] = input[15] + input[(32 - 16)];
step[16] = -input[16] + input[(32 - 17)];
step[17] = -input[17] + input[(32 - 18)];
step[18] = -input[18] + input[(32 - 19)];
step[19] = -input[19] + input[(32 - 20)];
step[20] = -input[20] + input[(32 - 21)];
step[21] = -input[21] + input[(32 - 22)];
step[22] = -input[22] + input[(32 - 23)];
step[23] = -input[23] + input[(32 - 24)];
step[24] = -input[24] + input[(32 - 25)];
step[25] = -input[25] + input[(32 - 26)];
step[26] = -input[26] + input[(32 - 27)];
step[27] = -input[27] + input[(32 - 28)];
step[28] = -input[28] + input[(32 - 29)];
step[29] = -input[29] + input[(32 - 30)];
step[30] = -input[30] + input[(32 - 31)];
step[31] = -input[31] + input[(32 - 32)];
// Stage 2
output[0] = step[0] + step[16 - 1];
output[1] = step[1] + step[16 - 2];
output[2] = step[2] + step[16 - 3];
output[3] = step[3] + step[16 - 4];
output[4] = step[4] + step[16 - 5];
output[5] = step[5] + step[16 - 6];
output[6] = step[6] + step[16 - 7];
output[7] = step[7] + step[16 - 8];
output[8] = -step[8] + step[16 - 9];
output[9] = -step[9] + step[16 - 10];
output[10] = -step[10] + step[16 - 11];
output[11] = -step[11] + step[16 - 12];
output[12] = -step[12] + step[16 - 13];
output[13] = -step[13] + step[16 - 14];
output[14] = -step[14] + step[16 - 15];
output[15] = -step[15] + step[16 - 16];
output[16] = step[16];
output[17] = step[17];
output[18] = step[18];
output[19] = step[19];
output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
output[28] = step[28];
output[29] = step[29];
output[30] = step[30];
output[31] = step[31];
// dump the magnitude by 4, hence the intermediate values are within
// the range of 16 bits.
if (round) {
output[0] = half_round_shift(output[0]);
output[1] = half_round_shift(output[1]);
output[2] = half_round_shift(output[2]);
output[3] = half_round_shift(output[3]);
output[4] = half_round_shift(output[4]);
output[5] = half_round_shift(output[5]);
output[6] = half_round_shift(output[6]);
output[7] = half_round_shift(output[7]);
output[8] = half_round_shift(output[8]);
output[9] = half_round_shift(output[9]);
output[10] = half_round_shift(output[10]);
output[11] = half_round_shift(output[11]);
output[12] = half_round_shift(output[12]);
output[13] = half_round_shift(output[13]);
output[14] = half_round_shift(output[14]);
output[15] = half_round_shift(output[15]);
output[16] = half_round_shift(output[16]);
output[17] = half_round_shift(output[17]);
output[18] = half_round_shift(output[18]);
output[19] = half_round_shift(output[19]);
output[20] = half_round_shift(output[20]);
output[21] = half_round_shift(output[21]);
output[22] = half_round_shift(output[22]);
output[23] = half_round_shift(output[23]);
output[24] = half_round_shift(output[24]);
output[25] = half_round_shift(output[25]);
output[26] = half_round_shift(output[26]);
output[27] = half_round_shift(output[27]);
output[28] = half_round_shift(output[28]);
output[29] = half_round_shift(output[29]);
output[30] = half_round_shift(output[30]);
output[31] = half_round_shift(output[31]);
}
// Stage 3
step[0] = output[0] + output[(8 - 1)];
step[1] = output[1] + output[(8 - 2)];
step[2] = output[2] + output[(8 - 3)];
step[3] = output[3] + output[(8 - 4)];
step[4] = -output[4] + output[(8 - 5)];
step[5] = -output[5] + output[(8 - 6)];
step[6] = -output[6] + output[(8 - 7)];
step[7] = -output[7] + output[(8 - 8)];
step[8] = output[8];
step[9] = output[9];
step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
step[14] = output[14];
step[15] = output[15];
step[16] = output[16] + output[23];
step[17] = output[17] + output[22];
step[18] = output[18] + output[21];
step[19] = output[19] + output[20];
step[20] = -output[20] + output[19];
step[21] = -output[21] + output[18];
step[22] = -output[22] + output[17];
step[23] = -output[23] + output[16];
step[24] = -output[24] + output[31];
step[25] = -output[25] + output[30];
step[26] = -output[26] + output[29];
step[27] = -output[27] + output[28];
step[28] = output[28] + output[27];
step[29] = output[29] + output[26];
step[30] = output[30] + output[25];
step[31] = output[31] + output[24];
// Stage 4
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
output[2] = -step[2] + step[1];
output[3] = -step[3] + step[0];
output[4] = step[4];
output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
output[7] = step[7];
output[8] = step[8] + step[11];
output[9] = step[9] + step[10];
output[10] = -step[10] + step[9];
output[11] = -step[11] + step[8];
output[12] = -step[12] + step[15];
output[13] = -step[13] + step[14];
output[14] = step[14] + step[13];
output[15] = step[15] + step[12];
output[16] = step[16];
output[17] = step[17];
output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
output[22] = step[22];
output[23] = step[23];
output[24] = step[24];
output[25] = step[25];
output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
output[30] = step[30];
output[31] = step[31];
// Stage 5
step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
step[4] = output[4] + output[5];
step[5] = -output[5] + output[4];
step[6] = -output[6] + output[7];
step[7] = output[7] + output[6];
step[8] = output[8];
step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
step[11] = output[11];
step[12] = output[12];
step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
step[15] = output[15];
step[16] = output[16] + output[19];
step[17] = output[17] + output[18];
step[18] = -output[18] + output[17];
step[19] = -output[19] + output[16];
step[20] = -output[20] + output[23];
step[21] = -output[21] + output[22];
step[22] = output[22] + output[21];
step[23] = output[23] + output[20];
step[24] = output[24] + output[27];
step[25] = output[25] + output[26];
step[26] = -output[26] + output[25];
step[27] = -output[27] + output[24];
step[28] = -output[28] + output[31];
step[29] = -output[29] + output[30];
step[30] = output[30] + output[29];
step[31] = output[31] + output[28];
// Stage 6
output[0] = step[0];
output[1] = step[1];
output[2] = step[2];
output[3] = step[3];
output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
output[8] = step[8] + step[9];
output[9] = -step[9] + step[8];
output[10] = -step[10] + step[11];
output[11] = step[11] + step[10];
output[12] = step[12] + step[13];
output[13] = -step[13] + step[12];
output[14] = -step[14] + step[15];
output[15] = step[15] + step[14];
output[16] = step[16];
output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
output[19] = step[19];
output[20] = step[20];
output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
output[23] = step[23];
output[24] = step[24];
output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
output[27] = step[27];
output[28] = step[28];
output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
output[31] = step[31];
// Stage 7
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
step[4] = output[4];
step[5] = output[5];
step[6] = output[6];
step[7] = output[7];
step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
step[16] = output[16] + output[17];
step[17] = -output[17] + output[16];
step[18] = -output[18] + output[19];
step[19] = output[19] + output[18];
step[20] = output[20] + output[21];
step[21] = -output[21] + output[20];
step[22] = -output[22] + output[23];
step[23] = output[23] + output[22];
step[24] = output[24] + output[25];
step[25] = -output[25] + output[24];
step[26] = -output[26] + output[27];
step[27] = output[27] + output[26];
step[28] = output[28] + output[29];
step[29] = -output[29] + output[28];
step[30] = -output[30] + output[31];
step[31] = output[31] + output[30];
// Final stage --- outputs indices are bit-reversed.
output[0] = step[0];
output[16] = step[1];
output[8] = step[2];
output[24] = step[3];
output[4] = step[4];
output[20] = step[5];
output[12] = step[6];
output[28] = step[7];
output[2] = step[8];
output[18] = step[9];
output[10] = step[10];
output[26] = step[11];
output[6] = step[12];
output[22] = step[13];
output[14] = step[14];
output[30] = step[15];
output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
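The "bit-reversed" note in the final stage above means the destination index is the 5-bit reversal of the step index: step[1] (00001b) lands at output[16] (10000b), step[12] (01100b) at output[6] (00110b), and so on, matching the assignments in the listing. A quick stand-alone check:

#include <stdio.h>

/* Reverse the low 5 bits of k: the index mapping used by the final stage. */
static int bitrev5(int k) {
  int i, r = 0;
  for (i = 0; i < 5; ++i)
    r |= ((k >> i) & 1) << (4 - i);
  return r;
}

int main(void) {
  /* step[1] -> output[16], step[12] -> output[6], as in the listing above. */
  printf("%d %d\n", bitrev5(1), bitrev5(12));   /* prints 16 6 */
  return 0;
}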
void vp10_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
tran_high_t output[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
vp10_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
// Rows
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
vp10_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] =
(tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
}
}
// Note that although we use dct_32_round in dct32 computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
void vp10_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
tran_high_t output[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
vp10_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
// PS: also change code in vp10_dsp/x86/vp10_dct_sse2.c
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
// Rows
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
vp10_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
out[j + i * 32] = (tran_low_t)temp_out[j];
}
}
void vp10_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 32; ++r)
for (c = 0; c < 32; ++c)
sum += input[r * stride + c];
output[0] = sum >> 3;
output[1] = 0;
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp10_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
int stride) {
vp10_fdct4x4_c(input, output, stride);
}
void vp10_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
int stride) {
vp10_fdct8x8_c(input, final_output, stride);
}
void vp10_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
int stride) {
vp10_fdct8x8_1_c(input, final_output, stride);
}
void vp10_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
int stride) {
vp10_fdct16x16_c(input, output, stride);
}
void vp10_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
int stride) {
vp10_fdct16x16_1_c(input, output, stride);
}
void vp10_highbd_fdct32x32_c(const int16_t *input,
tran_low_t *out, int stride) {
vp10_fdct32x32_c(input, out, stride);
}
void vp10_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
int stride) {
vp10_fdct32x32_rd_c(input, out, stride);
}
void vp10_highbd_fdct32x32_1_c(const int16_t *input,
tran_low_t *out, int stride) {
vp10_fdct32x32_1_c(input, out, stride);
}
#endif // CONFIG_VP9_HIGHBITDEPTH

vp10/common/vp10_fwd_txfm.h (new file, 18 lines)

@@ -0,0 +1,18 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP10_COMMON_VP10_FWD_TXFM_H_
#define VP10_COMMON_VP10_FWD_TXFM_H_
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/fwd_txfm.h"
void vp10_fdct32(const tran_high_t *input, tran_high_t *output, int round);
#endif // VP10_COMMON_VP10_FWD_TXFM_H_

vp10/common/vp10_inv_txfm.c (new file, 2499 lines)

File diff suppressed because it is too large.

vp10/common/vp10_inv_txfm.h (new file, 122 lines)

@@ -0,0 +1,122 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_INV_TXFM_H_
#define VPX_DSP_INV_TXFM_H_
#include <assert.h>
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
static INLINE tran_low_t check_range(tran_high_t input) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
// this range for every intermediate coefficient can burdensome for a decoder,
// therefore the following assertion is only enabled when configured with
// --enable-coefficient-range-checking.
assert(INT16_MIN <= input);
assert(input <= INT16_MAX);
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
return (tran_low_t)input;
}
static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
return check_range(rv);
}
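For a concrete feel for dct_const_round_shift(): with DCT_CONST_BITS = 14 (as in vpx_dsp/txfm_common.h) and cospi_16_64 = 11585, multiplying a value of 64 and rounding back gives 45, i.e. roughly 64 * cos(pi/4). A stand-alone restatement of the arithmetic:

#include <stdio.h>
#include <stdint.h>

/* Stand-alone restatement of the rounding in dct_const_round_shift,
 * assuming DCT_CONST_BITS == 14 as in vpx_dsp/txfm_common.h. */
#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  const int64_t cospi_16_64 = 11585;              /* round(2^14 * cos(pi/4)) */
  const int64_t product = 64 * cospi_16_64;       /* 741440 */
  printf("%lld\n", (long long)ROUND_POWER_OF_TWO(product, DCT_CONST_BITS));  /* 45 */
  return 0;
}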
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE tran_low_t highbd_check_range(tran_high_t input,
int bd) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid highbitdepth VP9 streams, intermediate stage coefficients will
// stay within the ranges:
// - 8 bit: signed 16 bit integer
// - 10 bit: signed 18 bit integer
// - 12 bit: signed 20 bit integer
const int32_t int_max = (1 << (7 + bd)) - 1;
const int32_t int_min = -int_max - 1;
assert(int_min <= input);
assert(input <= int_max);
(void) int_min;
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
(void) bd;
return (tran_low_t)input;
}
static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
int bd) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
return highbd_check_range(rv, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) ((int32_t)(x))
#endif // CONFIG_EMULATE_HARDWARE
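A concrete illustration of the emulate-hardware WRAPLOW() behaviour above: at bd = 8 the coefficient is sign-extended from its low 16 bits, so 40000 wraps to -25536; at bd = 10 the wrap happens at the 18-bit boundary instead. The helper below is a stand-alone, well-defined restatement of that wrap, not the macro from this header.

#include <stdio.h>
#include <stdint.h>

/* Sign-extend x to the low (8 + bd) bits -- the wrap that the
 * emulate-hardware WRAPLOW() macro achieves with shifts. */
static int32_t wraplow_emulated(int64_t x, int bd) {
  const int64_t range = (int64_t)1 << (8 + bd);        /* 2^16 at bd == 8 */
  int64_t v = (int64_t)((uint64_t)x & (uint64_t)(range - 1));
  if (v >= range / 2) v -= range;                       /* sign-extend */
  return (int32_t)v;
}

int main(void) {
  printf("%d\n", wraplow_emulated(40000, 8));    /* -25536: 16-bit wrap */
  printf("%d\n", wraplow_emulated(200000, 10));  /* -62144: 18-bit wrap */
  return 0;
}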
void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct8_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct16_c(const tran_low_t *input, tran_low_t *output);
void vp10_idct32_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output);
void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output);
#if CONFIG_VP9_HIGHBITDEPTH
void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
trans = WRAPLOW(trans, bd);
return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
trans = WRAPLOW(trans, 8);
return clip_pixel(WRAPLOW(dest + trans, 8));
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VPX_DSP_INV_TXFM_H_


@@ -87,14 +87,127 @@ specialize qw/vp10_filter_by_weight8x8 sse2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure # Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp10_iht4x4_16_add/;
add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp10_iht8x8_64_add/;
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add/;
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4/;
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4_1/;
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8/;
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8_1/;
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16/;
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16_1/;
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32/;
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_rd/;
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_1/;
add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct4x4/;
add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct8x8/;
add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct8x8_1/;
add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct16x16/;
add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct16x16_1/;
add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32/;
add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32_rd/;
add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32_1/;
} else {
add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp10_iht4x4_16_add sse2/;
add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp10_iht8x8_64_add sse2/;
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add/;
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4 sse2/;
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4_1 sse2/;
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8 sse2/;
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8_1 sse2/;
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16 sse2/;
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16_1 sse2/;
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32 sse2/;
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_rd sse2/;
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_1 sse2/;
add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct4x4 sse2/;
add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct8x8 sse2/;
add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct8x8_1/;
add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct16x16 sse2/;
add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct16x16_1/;
add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32 sse2/;
add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32_rd sse2/;
add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_highbd_fdct32x32_1/;
}
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
@@ -106,6 +219,33 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add/;
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4/;
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4_1/;
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8/;
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8_1/;
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16/;
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16_1/;
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32/;
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_rd/;
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_1/;
} else {
add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp10_iht4x4_16_add sse2 neon dspr2 msa/;
@@ -115,6 +255,33 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp10_iht16x16_256_add sse2 dspr2 msa/;
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4 sse2/;
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct4x4_1 sse2/;
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8 sse2/;
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct8x8_1 sse2/;
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16 sse2/;
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct16x16_1 sse2/;
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32 sse2/;
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_rd sse2/;
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp10_fdct32x32_1 sse2/;
}
}
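Each add_proto/specialize pair above feeds the rtcd generator: the prototype becomes a function pointer that vp10_rtcd() binds to the best implementation the CPU supports, falling back to the _c version otherwise. The stand-alone sketch below shows the general shape of that dispatch for one entry; the stub names and the have_sse2 flag are placeholders, not the generated vp10_rtcd.h.

#include <stdint.h>
#include <stdio.h>

/* Stub kernels standing in for the real C and SSE2 forward transforms. */
static void fdct8x8_c_stub(const int16_t *in, int32_t *out, int stride) {
  (void)in; (void)out; (void)stride;  /* real work lives in vp10_fdct8x8_c */
}
static void fdct8x8_sse2_stub(const int16_t *in, int32_t *out, int stride) {
  (void)in; (void)out; (void)stride;  /* real work lives in vp10_fdct8x8_sse2 */
}

/* The generated header exposes a pointer like this for each prototype. */
static void (*vp10_fdct8x8_dispatch)(const int16_t *, int32_t *, int);

static void setup_rtcd_sketch(int have_sse2) {
  vp10_fdct8x8_dispatch = fdct8x8_c_stub;        /* C fallback first */
  if (have_sse2)
    vp10_fdct8x8_dispatch = fdct8x8_sse2_stub;   /* promote when SSE2 is present */
}

int main(void) {
  setup_rtcd_sketch(1);
  printf("dispatch bound: %s\n",
         vp10_fdct8x8_dispatch == fdct8x8_sse2_stub ? "sse2" : "c");
  return 0;
}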
@@ -184,42 +351,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
add_proto qw/unsigned int vp10_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp10_avg_8x8 sse2 neon msa/;
add_proto qw/unsigned int vp10_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp10_avg_4x4 sse2 msa/;
add_proto qw/void vp10_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vp10_minmax_8x8 sse2/;
add_proto qw/void vp10_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
specialize qw/vp10_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vp10_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
specialize qw/vp10_hadamard_16x16 sse2/;
add_proto qw/int16_t vp10_satd/, "const int16_t *coeff, int length";
specialize qw/vp10_satd sse2/;
add_proto qw/void vp10_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp10_int_pro_row sse2 neon/;
add_proto qw/int16_t vp10_int_pro_col/, "uint8_t const *ref, const int width";
specialize qw/vp10_int_pro_col sse2 neon/;
add_proto qw/int vp10_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
specialize qw/vp10_vector_var neon sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp10_highbd_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp10_highbd_avg_8x8/;
add_proto qw/unsigned int vp10_highbd_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp10_highbd_avg_4x4/;
add_proto qw/void vp10_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vp10_highbd_minmax_8x8/;
}
# ENCODEMB INVOKE # ENCODEMB INVOKE
# #
@@ -289,6 +420,188 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc"; specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
} }
# Inverse transform
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add/;
add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_64_add/;
add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_12_add/;
add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_1_add/;
add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_256_add/;
add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_10_add/;
add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1024_add/;
add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_34_add/;
add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1_add/;
add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_1_add/;
add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_16_add/;
add_proto qw/void vp10_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_1_add/;
add_proto qw/void vp10_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_1_add/;
add_proto qw/void vp10_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_1_add/;
add_proto qw/void vp10_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_1024_add/;
add_proto qw/void vp10_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_34_add/;
add_proto qw/void vp10_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct32x32_1_add/;
add_proto qw/void vp10_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_iwht4x4_1_add/;
add_proto qw/void vp10_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_iwht4x4_16_add/;
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_16_add/;
add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_64_add/;
add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_10_add/;
add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_256_add/;
add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_10_add/;
} else {
add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct4x4_16_add sse2/;
add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_64_add sse2/;
add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct8x8_10_add sse2/;
add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_256_add sse2/;
add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp10_highbd_idct16x16_10_add sse2/;
} # CONFIG_EMULATE_HARDWARE
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add/;
add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_64_add/;
add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_12_add/;
add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_1_add/;
add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_256_add/;
add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_10_add/;
add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1024_add/;
add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_34_add/;
add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1_add/;
add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_1_add/;
add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_16_add/;
} else {
add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_1_add sse2/;
add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct4x4_16_add sse2/;
add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_1_add sse2/;
add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_64_add sse2/;
add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct8x8_12_add sse2/;
add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_1_add sse2/;
add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_256_add sse2/;
add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct16x16_10_add sse2/;
add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1024_add sse2/;
add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_34_add sse2/;
add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_idct32x32_1_add sse2/;
add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_1_add/;
add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp10_iwht4x4_16_add/;
} # CONFIG_EMULATE_HARDWARE
} # CONFIG_VP9_HIGHBITDEPTH
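These blocks are rtcd (run-time CPU detection) definitions: add_proto declares a function's C prototype and specialize lists which optimized variants may be dispatched at run time; when CONFIG_EMULATE_HARDWARE is enabled no SIMD names are listed, so only the C implementations are ever used. As a rough sketch only (simplified; the real header is emitted by the build from these perl entries), one of the prototypes above resolves approximately like this:

/* Simplified sketch of what rtcd generates for
 *   add_proto qw/void vp10_idct4x4_16_add/, "...";
 *   specialize qw/vp10_idct4x4_16_add sse2/;
 * Illustrative only -- not the literal generated header. */
void vp10_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest,
                           int dest_stride);
void vp10_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int dest_stride);
RTCD_EXTERN void (*vp10_idct4x4_16_add)(const tran_low_t *input,
                                        uint8_t *dest, int dest_stride);

static void setup_rtcd_internal(void) {
  const int flags = x86_simd_caps();
  vp10_idct4x4_16_add = vp10_idct4x4_16_add_c;        /* C fallback */
  if (flags & HAS_SSE2)
    vp10_idct4x4_16_add = vp10_idct4x4_16_add_sse2;   /* only if listed */
}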
# #
# Motion search # Motion search
# #



@@ -12,14 +12,14 @@
#include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) { int tx_type) {
__m128i in[2]; __m128i in[2];
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8); const __m128i eight = _mm_set1_epi16(8);
in[0] = _mm_loadu_si128((const __m128i *)(input)); in[0] = load_input_data(input);
in[1] = _mm_loadu_si128((const __m128i *)(input + 8)); in[1] = load_input_data(input + 8);
switch (tx_type) { switch (tx_type) {
case 0: // DCT_DCT case 0: // DCT_DCT
@@ -77,21 +77,21 @@ void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
} }
} }
void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) { int tx_type) {
__m128i in[8]; __m128i in[8];
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 4); const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data // load input data
in[0] = _mm_load_si128((const __m128i *)input); in[0] = load_input_data(input);
in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); in[1] = load_input_data(input + 8 * 1);
in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); in[2] = load_input_data(input + 8 * 2);
in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); in[3] = load_input_data(input + 8 * 3);
in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); in[4] = load_input_data(input + 8 * 4);
in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); in[5] = load_input_data(input + 8 * 5);
in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); in[6] = load_input_data(input + 8 * 6);
in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); in[7] = load_input_data(input + 8 * 7);
switch (tx_type) { switch (tx_type) {
case 0: // DCT_DCT case 0: // DCT_DCT
@@ -144,8 +144,8 @@ void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
RECON_AND_STORE(dest + 7 * stride, in[7]); RECON_AND_STORE(dest + 7 * stride, in[7]);
} }
void vp10_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
int tx_type) { int stride, int tx_type) {
__m128i in0[16], in1[16]; __m128i in0[16], in1[16];
load_buffer_8x16(input, in0); load_buffer_8x16(input, in0);

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,271 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
void vp10_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0, in1;
__m128i tmp;
const __m128i zero = _mm_setzero_si128();
in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
(input + 2 * stride)));
in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
(input + 3 * stride)));
tmp = _mm_add_epi16(in0, in1);
in0 = _mm_unpacklo_epi16(zero, tmp);
in1 = _mm_unpackhi_epi16(zero, tmp);
in0 = _mm_srai_epi32(in0, 16);
in1 = _mm_srai_epi32(in1, 16);
tmp = _mm_add_epi32(in0, in1);
in0 = _mm_unpacklo_epi32(tmp, zero);
in1 = _mm_unpackhi_epi32(tmp, zero);
tmp = _mm_add_epi32(in0, in1);
in0 = _mm_srli_si128(tmp, 8);
in1 = _mm_add_epi32(tmp, in0);
in0 = _mm_slli_epi32(in1, 1);
store_output(&in0, output);
}
void vp10_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
__m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
__m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
__m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
__m128i u0, u1, sum;
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
sum = _mm_add_epi16(u0, u1);
in0 = _mm_add_epi16(in0, in1);
in2 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, in0);
u0 = _mm_setzero_si128();
sum = _mm_add_epi16(sum, in2);
in0 = _mm_unpacklo_epi16(u0, sum);
in1 = _mm_unpackhi_epi16(u0, sum);
in0 = _mm_srai_epi32(in0, 16);
in1 = _mm_srai_epi32(in1, 16);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_unpacklo_epi32(sum, u0);
in1 = _mm_unpackhi_epi32(sum, u0);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
store_output(&in1, output);
}
void vp10_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
int stride) {
__m128i in0, in1, in2, in3;
__m128i u0, u1;
__m128i sum = _mm_setzero_si128();
int i;
for (i = 0; i < 2; ++i) {
input += 8 * i;
in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
}
u0 = _mm_setzero_si128();
in0 = _mm_unpacklo_epi16(u0, sum);
in1 = _mm_unpackhi_epi16(u0, sum);
in0 = _mm_srai_epi32(in0, 16);
in1 = _mm_srai_epi32(in1, 16);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_unpacklo_epi32(sum, u0);
in1 = _mm_unpackhi_epi32(sum, u0);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 1);
store_output(&in1, output);
}
void vp10_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
int stride) {
__m128i in0, in1, in2, in3;
__m128i u0, u1;
__m128i sum = _mm_setzero_si128();
int i;
for (i = 0; i < 8; ++i) {
in0 = _mm_load_si128((const __m128i *)(input + 0));
in1 = _mm_load_si128((const __m128i *)(input + 8));
in2 = _mm_load_si128((const __m128i *)(input + 16));
in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 0));
in1 = _mm_load_si128((const __m128i *)(input + 8));
in2 = _mm_load_si128((const __m128i *)(input + 16));
in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 0));
in1 = _mm_load_si128((const __m128i *)(input + 8));
in2 = _mm_load_si128((const __m128i *)(input + 16));
in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 0));
in1 = _mm_load_si128((const __m128i *)(input + 8));
in2 = _mm_load_si128((const __m128i *)(input + 16));
in3 = _mm_load_si128((const __m128i *)(input + 24));
input += stride;
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
}
u0 = _mm_setzero_si128();
in0 = _mm_unpacklo_epi16(u0, sum);
in1 = _mm_unpackhi_epi16(u0, sum);
in0 = _mm_srai_epi32(in0, 16);
in1 = _mm_srai_epi32(in1, 16);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_unpacklo_epi32(sum, u0);
in1 = _mm_unpackhi_epi32(sum, u0);
sum = _mm_add_epi32(in0, in1);
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 3);
store_output(&in1, output);
}
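The four functions above are the DC-only ("_1") forward-transform paths: each one just accumulates the block and applies a per-size scale before writing the DC coefficient. A scalar sketch of what the SSE2 code computes (illustrative; output[0] is the only meaningful coefficient):

/* Scalar sketch (illustrative) of the DC-only paths above. */
static int32_t block_sum(const int16_t *input, int stride, int n) {
  int r, c;
  int32_t sum = 0;
  for (r = 0; r < n; ++r)
    for (c = 0; c < n; ++c)
      sum += input[r * stride + c];
  return sum;
}
/* vp10_fdct4x4_1:   output[0] = block_sum(input, stride, 4)  << 1;
 * vp10_fdct8x8_1:   output[0] = block_sum(input, stride, 8);
 * vp10_fdct16x16_1: output[0] = block_sum(input, stride, 16) >> 1;
 * vp10_fdct32x32_1: output[0] = block_sum(input, stride, 32) >> 3;  */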
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vp10_fdct4x4_sse2
#define FDCT8x8_2D vp10_fdct8x8_sse2
#define FDCT16x16_2D vp10_fdct16x16_sse2
#include "vp10/common/x86/vp10_fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D
#define FDCT32x32_2D vp10_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vp10_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#if CONFIG_VP9_HIGHBITDEPTH
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vp10_highbd_fdct4x4_sse2
#define FDCT8x8_2D vp10_highbd_fdct8x8_sse2
#define FDCT16x16_2D vp10_highbd_fdct16x16_sse2
#include "vp10/common/x86/vp10_fwd_txfm_impl_sse2.h" // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D
#define FDCT32x32_2D vp10_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vp10_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif // CONFIG_VP9_HIGHBITDEPTH

File diff suppressed because it is too large


@@ -0,0 +1,184 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
#define VPX_DSP_X86_INV_TXFM_SSE2_H_
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp10/common/vp10_inv_txfm.h"
// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
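For reference, the unpack/interleave sequence above is just an 8x8 transpose of 16-bit lanes; a scalar sketch (illustrative, not part of the patch):

/* Scalar equivalent (illustrative) of array_transpose_8x8: each __m128i
 * holds one row of eight int16_t values, and the result is the matrix
 * transpose res[j][i] = in[i][j]. */
static void transpose_8x8_ref(const int16_t in[8][8], int16_t res[8][8]) {
  int i, j;
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 8; ++j)
      res[j][i] = in[i][j];
}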
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
\
in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
}
static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
}
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
__m128i tbuf[8];
array_transpose_8x8(res0, res0);
array_transpose_8x8(res1, tbuf);
array_transpose_8x8(res0 + 8, res1);
array_transpose_8x8(res1 + 8, res1 + 8);
res0[8] = tbuf[0];
res0[9] = tbuf[1];
res0[10] = tbuf[2];
res0[11] = tbuf[3];
res0[12] = tbuf[4];
res0[13] = tbuf[5];
res0[14] = tbuf[6];
res0[15] = tbuf[7];
}
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}
#define RECON_AND_STORE(dest, in_x) \
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
d0 = _mm_add_epi16(in_x, d0); \
d0 = _mm_packus_epi16(d0, d0); \
_mm_storel_epi64((__m128i *)(dest), d0); \
}
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
// Final rounding and shift
in[0] = _mm_adds_epi16(in[0], final_rounding);
in[1] = _mm_adds_epi16(in[1], final_rounding);
in[2] = _mm_adds_epi16(in[2], final_rounding);
in[3] = _mm_adds_epi16(in[3], final_rounding);
in[4] = _mm_adds_epi16(in[4], final_rounding);
in[5] = _mm_adds_epi16(in[5], final_rounding);
in[6] = _mm_adds_epi16(in[6], final_rounding);
in[7] = _mm_adds_epi16(in[7], final_rounding);
in[8] = _mm_adds_epi16(in[8], final_rounding);
in[9] = _mm_adds_epi16(in[9], final_rounding);
in[10] = _mm_adds_epi16(in[10], final_rounding);
in[11] = _mm_adds_epi16(in[11], final_rounding);
in[12] = _mm_adds_epi16(in[12], final_rounding);
in[13] = _mm_adds_epi16(in[13], final_rounding);
in[14] = _mm_adds_epi16(in[14], final_rounding);
in[15] = _mm_adds_epi16(in[15], final_rounding);
in[0] = _mm_srai_epi16(in[0], 6);
in[1] = _mm_srai_epi16(in[1], 6);
in[2] = _mm_srai_epi16(in[2], 6);
in[3] = _mm_srai_epi16(in[3], 6);
in[4] = _mm_srai_epi16(in[4], 6);
in[5] = _mm_srai_epi16(in[5], 6);
in[6] = _mm_srai_epi16(in[6], 6);
in[7] = _mm_srai_epi16(in[7], 6);
in[8] = _mm_srai_epi16(in[8], 6);
in[9] = _mm_srai_epi16(in[9], 6);
in[10] = _mm_srai_epi16(in[10], 6);
in[11] = _mm_srai_epi16(in[11], 6);
in[12] = _mm_srai_epi16(in[12], 6);
in[13] = _mm_srai_epi16(in[13], 6);
in[14] = _mm_srai_epi16(in[14], 6);
in[15] = _mm_srai_epi16(in[15], 6);
RECON_AND_STORE(dest + 0 * stride, in[0]);
RECON_AND_STORE(dest + 1 * stride, in[1]);
RECON_AND_STORE(dest + 2 * stride, in[2]);
RECON_AND_STORE(dest + 3 * stride, in[3]);
RECON_AND_STORE(dest + 4 * stride, in[4]);
RECON_AND_STORE(dest + 5 * stride, in[5]);
RECON_AND_STORE(dest + 6 * stride, in[6]);
RECON_AND_STORE(dest + 7 * stride, in[7]);
RECON_AND_STORE(dest + 8 * stride, in[8]);
RECON_AND_STORE(dest + 9 * stride, in[9]);
RECON_AND_STORE(dest + 10 * stride, in[10]);
RECON_AND_STORE(dest + 11 * stride, in[11]);
RECON_AND_STORE(dest + 12 * stride, in[12]);
RECON_AND_STORE(dest + 13 * stride, in[13]);
RECON_AND_STORE(dest + 14 * stride, in[14]);
RECON_AND_STORE(dest + 15 * stride, in[15]);
}
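At this point the inverse-transform output is still scaled by 64, so the adds-then-shift above rounds each residual to the nearest integer ((x + 32) >> 6) before RECON_AND_STORE adds it to the predictor and clamps to 8 bits. A scalar sketch (illustrative) of one reconstructed row:

/* Scalar sketch (illustrative) of the rounding + reconstruction above. */
static void recon_row_ref(uint8_t *dest, const int16_t *residual, int w) {
  int i;
  for (i = 0; i < w; ++i) {
    const int v = dest[i] + ((residual[i] + 32) >> 6); /* round, add pred */
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* clamp 0..255 */
  }
}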
void idct4_sse2(__m128i *in);
void idct8_sse2(__m128i *in);
void idct16_sse2(__m128i *in0, __m128i *in1);
void iadst4_sse2(__m128i *in);
void iadst8_sse2(__m128i *in);
void iadst16_sse2(__m128i *in0, __m128i *in1);
#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_


@@ -47,6 +47,8 @@
static int is_compound_reference_allowed(const VP10_COMMON *cm) { static int is_compound_reference_allowed(const VP10_COMMON *cm) {
int i; int i;
if (frame_is_intra_only(cm))
return 0;
for (i = 1; i < REFS_PER_FRAME; ++i) for (i = 1; i < REFS_PER_FRAME; ++i)
if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
return 1; return 1;
@@ -81,12 +83,18 @@ static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) {
return data > max ? max : data; return data > max ? max : data;
} }
#if CONFIG_MISC_FIXES
static TX_MODE read_tx_mode(struct vpx_read_bit_buffer *rb) {
return vpx_rb_read_bit(rb) ? TX_MODE_SELECT : vpx_rb_read_literal(rb, 2);
}
#else
static TX_MODE read_tx_mode(vpx_reader *r) { static TX_MODE read_tx_mode(vpx_reader *r) {
TX_MODE tx_mode = vpx_read_literal(r, 2); TX_MODE tx_mode = vpx_read_literal(r, 2);
if (tx_mode == ALLOW_32X32) if (tx_mode == ALLOW_32X32)
tx_mode += vpx_read_bit(r); tx_mode += vpx_read_bit(r);
return tx_mode; return tx_mode;
} }
#endif
static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) { static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
int i, j; int i, j;
@@ -118,6 +126,18 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
vp10_diff_update_prob(r, &fc->inter_mode_probs[i][j]); vp10_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
} }
#if CONFIG_MISC_FIXES
static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
struct vpx_read_bit_buffer *rb) {
if (is_compound_reference_allowed(cm)) {
return vpx_rb_read_bit(rb) ? REFERENCE_MODE_SELECT
: (vpx_rb_read_bit(rb) ? COMPOUND_REFERENCE
: SINGLE_REFERENCE);
} else {
return SINGLE_REFERENCE;
}
}
#else
static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm, static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
vpx_reader *r) { vpx_reader *r) {
if (is_compound_reference_allowed(cm)) { if (is_compound_reference_allowed(cm)) {
@@ -128,6 +148,7 @@ static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
return SINGLE_REFERENCE; return SINGLE_REFERENCE;
} }
} }
#endif
static void read_frame_reference_mode_probs(VP10_COMMON *cm, vpx_reader *r) { static void read_frame_reference_mode_probs(VP10_COMMON *cm, vpx_reader *r) {
FRAME_CONTEXT *const fc = cm->fc; FRAME_CONTEXT *const fc = cm->fc;
@@ -151,8 +172,12 @@ static void read_frame_reference_mode_probs(VP10_COMMON *cm, vpx_reader *r) {
static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) { static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
int i; int i;
for (i = 0; i < n; ++i) for (i = 0; i < n; ++i)
#if CONFIG_MISC_FIXES
vp10_diff_update_prob(r, &p[i]);
#else
if (vpx_read(r, MV_UPDATE_PROB)) if (vpx_read(r, MV_UPDATE_PROB))
p[i] = (vpx_read_literal(r, 7) << 1) | 1; p[i] = (vpx_read_literal(r, 7) << 1) | 1;
#endif
} }
static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) { static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
@@ -190,6 +215,7 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
int eob, int block) { int eob, int block) {
struct macroblockd_plane *const pd = &xd->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane];
TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block); TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
const int seg_id = xd->mi[0]->mbmi.segment_id;
if (eob > 0) { if (eob > 0) {
tran_low_t *const dqcoeff = pd->dqcoeff; tran_low_t *const dqcoeff = pd->dqcoeff;
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
@@ -197,9 +223,7 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
switch (tx_size) { switch (tx_size) {
case TX_4X4: case TX_4X4:
vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd, vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
tx_type, xd->lossless ? tx_type, xd->lossless[seg_id]);
vp10_highbd_iwht4x4_add :
vp10_highbd_idct4x4_add);
break; break;
case TX_8X8: case TX_8X8:
vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd, vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
@@ -222,8 +246,7 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
switch (tx_size) { switch (tx_size) {
case TX_4X4: case TX_4X4:
vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type, vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
xd->lossless ? vp10_iwht4x4_add : xd->lossless[seg_id]);
vp10_idct4x4_add);
break; break;
case TX_8X8: case TX_8X8:
vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type); vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
@@ -261,6 +284,7 @@ static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
uint8_t *dst, int stride, uint8_t *dst, int stride,
int eob) { int eob) {
struct macroblockd_plane *const pd = &xd->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane];
const int seg_id = xd->mi[0]->mbmi.segment_id;
if (eob > 0) { if (eob > 0) {
tran_low_t *const dqcoeff = pd->dqcoeff; tran_low_t *const dqcoeff = pd->dqcoeff;
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
@@ -268,9 +292,7 @@ static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
switch (tx_size) { switch (tx_size) {
case TX_4X4: case TX_4X4:
vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd, vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
tx_type, xd->lossless ? tx_type, xd->lossless[seg_id]);
vp10_highbd_iwht4x4_add :
vp10_highbd_idct4x4_add);
break; break;
case TX_8X8: case TX_8X8:
vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd, vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
@@ -293,8 +315,7 @@ static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
switch (tx_size) { switch (tx_size) {
case TX_4X4: case TX_4X4:
vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type, vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
xd->lossless ? vp10_iwht4x4_add : xd->lossless[seg_id]);
vp10_idct4x4_add);
break; break;
case TX_8X8: case TX_8X8:
vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type); vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
@@ -343,7 +364,7 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
if (plane == 0) if (plane == 0)
mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
vp10_predict_intra_block(xd, pd->n4_wl, tx_size, mode, vp10_predict_intra_block(xd, pd->n4_wl, pd->n4_hl, tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride, dst, pd->dst.stride, dst, pd->dst.stride,
col, row, plane); col, row, plane);
@@ -527,6 +548,7 @@ static void dec_build_inter_predictors(VP10Decoder *const pbi, MACROBLOCKD *xd,
struct buf_2d *dst_buf, const MV* mv, struct buf_2d *dst_buf, const MV* mv,
RefCntBuffer *ref_frame_buf, RefCntBuffer *ref_frame_buf,
int is_scaled, int ref) { int is_scaled, int ref) {
VP10_COMMON *const cm = &pbi->common;
struct macroblockd_plane *const pd = &xd->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
MV32 scaled_mv; MV32 scaled_mv;
@@ -623,7 +645,7 @@ static void dec_build_inter_predictors(VP10Decoder *const pbi, MACROBLOCKD *xd,
// Wait until reference block is ready. Pad 7 more pixels as last 7 // Wait until reference block is ready. Pad 7 more pixels as last 7
// pixels of each superblock row can be changed by next superblock row. // pixels of each superblock row can be changed by next superblock row.
if (pbi->frame_parallel_decode) if (cm->frame_parallel_decode)
vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1)); VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
@@ -650,7 +672,7 @@ static void dec_build_inter_predictors(VP10Decoder *const pbi, MACROBLOCKD *xd,
} else { } else {
// Wait until reference block is ready. Pad 7 more pixels as last 7 // Wait until reference block is ready. Pad 7 more pixels as last 7
// pixels of each superblock row can be changed by next superblock row. // pixels of each superblock row can be changed by next superblock row.
if (pbi->frame_parallel_decode) { if (cm->frame_parallel_decode) {
const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1)); VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
@@ -700,12 +722,19 @@ static void dec_build_inter_predictors_sb(VP10Decoder *const pbi,
const int is_scaled = vp10_is_scaled(sf); const int is_scaled = vp10_is_scaled(sf);
if (sb_type < BLOCK_8X8) { if (sb_type < BLOCK_8X8) {
int i = 0, x, y; const PARTITION_TYPE bp = BLOCK_8X8 - sb_type;
const int have_vsplit = bp != PARTITION_HORZ;
const int have_hsplit = bp != PARTITION_VERT;
const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
const int pw = 8 >> (have_vsplit | pd->subsampling_x);
const int ph = 8 >> (have_hsplit | pd->subsampling_y);
int x, y;
for (y = 0; y < num_4x4_h; ++y) { for (y = 0; y < num_4x4_h; ++y) {
for (x = 0; x < num_4x4_w; ++x) { for (x = 0; x < num_4x4_w; ++x) {
const MV mv = average_split_mvs(pd, mi, ref, i++); const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4, dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel, 4 * x, 4 * y, pw, ph, mi_x, mi_y, kernel,
sf, pre_buf, dst_buf, &mv, sf, pre_buf, dst_buf, &mv,
ref_frame_buf, is_scaled, ref); ref_frame_buf, is_scaled, ref);
} }
@@ -857,7 +886,11 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd,
} }
if (!less8x8 && eobtotal == 0) if (!less8x8 && eobtotal == 0)
#if CONFIG_MISC_FIXES
mbmi->has_no_coeffs = 1; // skip loopfilter
#else
mbmi->skip = 1; // skip loopfilter mbmi->skip = 1; // skip loopfilter
#endif
} }
} }
@@ -890,11 +923,11 @@ static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
memset(left_ctx, partition_context_lookup[subsize].left, bw); memset(left_ctx, partition_context_lookup[subsize].left, bw);
} }
static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, static PARTITION_TYPE read_partition(VP10_COMMON *cm, MACROBLOCKD *xd,
vpx_reader *r, int mi_row, int mi_col, vpx_reader *r,
int has_rows, int has_cols, int bsl) { int has_rows, int has_cols, int bsl) {
const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl); const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
const vpx_prob *const probs = get_partition_probs(xd, ctx); const vpx_prob *const probs = cm->fc->partition_prob[ctx];
FRAME_COUNTS *counts = xd->counts; FRAME_COUNTS *counts = xd->counts;
PARTITION_TYPE p; PARTITION_TYPE p;
@@ -929,7 +962,7 @@ static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return; return;
partition = read_partition(xd, mi_row, mi_col, r, has_rows, has_cols, partition = read_partition(cm, xd, mi_row, mi_col, r, has_rows, has_cols,
n8x8_l2); n8x8_l2);
subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition); subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition);
if (!hbs) { if (!hbs) {
@@ -1015,6 +1048,9 @@ static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
static void setup_segmentation(VP10_COMMON *const cm, static void setup_segmentation(VP10_COMMON *const cm,
struct vpx_read_bit_buffer *rb) { struct vpx_read_bit_buffer *rb) {
struct segmentation *const seg = &cm->seg; struct segmentation *const seg = &cm->seg;
#if !CONFIG_MISC_FIXES
struct segmentation_probs *const segp = &cm->segp;
#endif
int i, j; int i, j;
seg->update_map = 0; seg->update_map = 0;
@@ -1031,23 +1067,26 @@ static void setup_segmentation(VP10_COMMON *const cm,
seg->update_map = vpx_rb_read_bit(rb); seg->update_map = vpx_rb_read_bit(rb);
} }
if (seg->update_map) { if (seg->update_map) {
#if !CONFIG_MISC_FIXES
for (i = 0; i < SEG_TREE_PROBS; i++) for (i = 0; i < SEG_TREE_PROBS; i++)
seg->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8) segp->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
: MAX_PROB; : MAX_PROB;
#endif
if (frame_is_intra_only(cm) || cm->error_resilient_mode) { if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
seg->temporal_update = 0; seg->temporal_update = 0;
} else { } else {
seg->temporal_update = vpx_rb_read_bit(rb); seg->temporal_update = vpx_rb_read_bit(rb);
} }
#if !CONFIG_MISC_FIXES
if (seg->temporal_update) { if (seg->temporal_update) {
for (i = 0; i < PREDICTION_PROBS; i++) for (i = 0; i < PREDICTION_PROBS; i++)
seg->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8) segp->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
: MAX_PROB; : MAX_PROB;
} else { } else {
for (i = 0; i < PREDICTION_PROBS; i++) for (i = 0; i < PREDICTION_PROBS; i++)
seg->pred_probs[i] = MAX_PROB; segp->pred_probs[i] = MAX_PROB;
} }
#endif
} }
// Segmentation data update // Segmentation data update
@@ -1090,34 +1129,27 @@ static void setup_loopfilter(struct loopfilter *lf,
for (i = 0; i < MAX_REF_FRAMES; i++) for (i = 0; i < MAX_REF_FRAMES; i++)
if (vpx_rb_read_bit(rb)) if (vpx_rb_read_bit(rb))
lf->ref_deltas[i] = vpx_rb_read_signed_literal(rb, 6); lf->ref_deltas[i] = vpx_rb_read_inv_signed_literal(rb, 6);
for (i = 0; i < MAX_MODE_LF_DELTAS; i++) for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
if (vpx_rb_read_bit(rb)) if (vpx_rb_read_bit(rb))
lf->mode_deltas[i] = vpx_rb_read_signed_literal(rb, 6); lf->mode_deltas[i] = vpx_rb_read_inv_signed_literal(rb, 6);
} }
} }
} }
static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) { static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
return vpx_rb_read_bit(rb) ? vpx_rb_read_signed_literal(rb, 4) : 0; return vpx_rb_read_bit(rb) ?
vpx_rb_read_inv_signed_literal(rb, CONFIG_MISC_FIXES ? 6 : 4) : 0;
} }
static void setup_quantization(VP10_COMMON *const cm, MACROBLOCKD *const xd, static void setup_quantization(VP10_COMMON *const cm,
struct vpx_read_bit_buffer *rb) { struct vpx_read_bit_buffer *rb) {
cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS); cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS);
cm->y_dc_delta_q = read_delta_q(rb); cm->y_dc_delta_q = read_delta_q(rb);
cm->uv_dc_delta_q = read_delta_q(rb); cm->uv_dc_delta_q = read_delta_q(rb);
cm->uv_ac_delta_q = read_delta_q(rb); cm->uv_ac_delta_q = read_delta_q(rb);
cm->dequant_bit_depth = cm->bit_depth; cm->dequant_bit_depth = cm->bit_depth;
xd->lossless = cm->base_qindex == 0 &&
cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
#if CONFIG_VP9_HIGHBITDEPTH
xd->bd = (int)cm->bit_depth;
#endif
} }
static void setup_segmentation_dequant(VP10_COMMON *const cm) { static void setup_segmentation_dequant(VP10_COMMON *const cm) {
@@ -1151,12 +1183,12 @@ static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
return vpx_rb_read_bit(rb) ? SWITCHABLE : vpx_rb_read_literal(rb, 2); return vpx_rb_read_bit(rb) ? SWITCHABLE : vpx_rb_read_literal(rb, 2);
} }
static void setup_display_size(VP10_COMMON *cm, static void setup_render_size(VP10_COMMON *cm,
struct vpx_read_bit_buffer *rb) { struct vpx_read_bit_buffer *rb) {
cm->display_width = cm->width; cm->render_width = cm->width;
cm->display_height = cm->height; cm->render_height = cm->height;
if (vpx_rb_read_bit(rb)) if (vpx_rb_read_bit(rb))
vp10_read_frame_size(rb, &cm->display_width, &cm->display_height); vp10_read_frame_size(rb, &cm->render_width, &cm->render_height);
} }
static void resize_mv_buffer(VP10_COMMON *cm) { static void resize_mv_buffer(VP10_COMMON *cm) {
@@ -1204,7 +1236,7 @@ static void setup_frame_size(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
BufferPool *const pool = cm->buffer_pool; BufferPool *const pool = cm->buffer_pool;
vp10_read_frame_size(rb, &width, &height); vp10_read_frame_size(rb, &width, &height);
resize_context_buffers(cm, width, height); resize_context_buffers(cm, width, height);
setup_display_size(cm, rb); setup_render_size(cm, rb);
lock_buffer_pool(pool); lock_buffer_pool(pool);
if (vpx_realloc_frame_buffer( if (vpx_realloc_frame_buffer(
@@ -1227,6 +1259,9 @@ static void setup_frame_size(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
} }
static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth, static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
@@ -1248,13 +1283,21 @@ static void setup_frame_size_with_refs(VP10_COMMON *cm,
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
width = buf->y_crop_width; width = buf->y_crop_width;
height = buf->y_crop_height; height = buf->y_crop_height;
#if CONFIG_MISC_FIXES
cm->render_width = buf->render_width;
cm->render_height = buf->render_height;
#endif
found = 1; found = 1;
break; break;
} }
} }
if (!found) if (!found) {
vp10_read_frame_size(rb, &width, &height); vp10_read_frame_size(rb, &width, &height);
#if CONFIG_MISC_FIXES
setup_render_size(cm, rb);
#endif
}
if (width <= 0 || height <= 0) if (width <= 0 || height <= 0)
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@@ -1285,7 +1328,9 @@ static void setup_frame_size_with_refs(VP10_COMMON *cm,
} }
resize_context_buffers(cm, width, height); resize_context_buffers(cm, width, height);
setup_display_size(cm, rb); #if !CONFIG_MISC_FIXES
setup_render_size(cm, rb);
#endif
lock_buffer_pool(pool); lock_buffer_pool(pool);
if (vpx_realloc_frame_buffer( if (vpx_realloc_frame_buffer(
@@ -1308,6 +1353,9 @@ static void setup_frame_size_with_refs(VP10_COMMON *cm,
pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
} }
static void setup_tile_info(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) { static void setup_tile_info(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
@@ -1328,6 +1376,15 @@ static void setup_tile_info(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
cm->log2_tile_rows = vpx_rb_read_bit(rb); cm->log2_tile_rows = vpx_rb_read_bit(rb);
if (cm->log2_tile_rows) if (cm->log2_tile_rows)
cm->log2_tile_rows += vpx_rb_read_bit(rb); cm->log2_tile_rows += vpx_rb_read_bit(rb);
#if CONFIG_MISC_FIXES
// tile size magnitude
if (cm->log2_tile_rows > 0 || cm->log2_tile_cols > 0) {
cm->tile_sz_mag = vpx_rb_read_literal(rb, 2);
}
#else
cm->tile_sz_mag = 3;
#endif
} }
typedef struct TileBuffer { typedef struct TileBuffer {
@@ -1336,10 +1393,27 @@ typedef struct TileBuffer {
int col; // only used with multi-threaded decoding int col; // only used with multi-threaded decoding
} TileBuffer; } TileBuffer;
static int mem_get_varsize(const uint8_t *data, const int mag) {
switch (mag) {
case 0:
return data[0];
case 1:
return mem_get_le16(data);
case 2:
return mem_get_le24(data);
case 3:
return mem_get_le32(data);
}
assert("Invalid tile size marker value" && 0);
return -1;
}
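mem_get_varsize reads a tile length stored in tile_sz_mag + 1 little-endian bytes (1 to 4), replacing the fixed 4-byte big-endian field that get_tile_buffer used before this change. A small usage sketch (byte values illustrative, not from the patch):

/* With tile_sz_mag == 1 the tile length occupies two little-endian bytes. */
const uint8_t hdr[2] = { 0x34, 0x12 };
const int tile_size = mem_get_varsize(hdr, 1); /* 0x1234; *data then
                                                   advances by tile_sz_mag + 1 */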
// Reads the next tile returning its size and adjusting '*data' accordingly // Reads the next tile returning its size and adjusting '*data' accordingly
// based on 'is_last'. // based on 'is_last'.
static void get_tile_buffer(const uint8_t *const data_end, static void get_tile_buffer(const uint8_t *const data_end,
int is_last, const int tile_sz_mag, int is_last,
struct vpx_internal_error_info *error_info, struct vpx_internal_error_info *error_info,
const uint8_t **data, const uint8_t **data,
vpx_decrypt_cb decrypt_cb, void *decrypt_state, vpx_decrypt_cb decrypt_cb, void *decrypt_state,
@@ -1353,12 +1427,12 @@ static void get_tile_buffer(const uint8_t *const data_end,
if (decrypt_cb) { if (decrypt_cb) {
uint8_t be_data[4]; uint8_t be_data[4];
decrypt_cb(decrypt_state, *data, be_data, 4); decrypt_cb(decrypt_state, *data, be_data, tile_sz_mag + 1);
size = mem_get_be32(be_data); size = mem_get_varsize(be_data, tile_sz_mag) + CONFIG_MISC_FIXES;
} else { } else {
size = mem_get_be32(*data); size = mem_get_varsize(*data, tile_sz_mag) + CONFIG_MISC_FIXES;
} }
*data += 4; *data += tile_sz_mag + 1;
if (size > (size_t)(data_end - *data)) if (size > (size_t)(data_end - *data))
vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
@@ -1384,7 +1458,8 @@ static void get_tile_buffers(VP10Decoder *pbi,
const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1); const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1);
TileBuffer *const buf = &tile_buffers[r][c]; TileBuffer *const buf = &tile_buffers[r][c];
buf->col = c; buf->col = c;
get_tile_buffer(data_end, is_last, &pbi->common.error, &data, get_tile_buffer(data_end, pbi->common.tile_sz_mag,
is_last, &pbi->common.error, &data,
pbi->decrypt_cb, pbi->decrypt_state, buf); pbi->decrypt_cb, pbi->decrypt_state, buf);
} }
} }
@@ -1453,14 +1528,17 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi,
tile_data->cm = cm; tile_data->cm = cm;
tile_data->xd = pbi->mb; tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0; tile_data->xd.corrupted = 0;
tile_data->xd.counts = cm->frame_parallel_decoding_mode ? tile_data->xd.counts =
NULL : &cm->counts; cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
&cm->counts : NULL;
vp10_zero(tile_data->dqcoeff); vp10_zero(tile_data->dqcoeff);
vp10_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col); vp10_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error, setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader, pbi->decrypt_cb, &tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state); pbi->decrypt_state);
vp10_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); vp10_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
} }
} }
@@ -1509,7 +1587,7 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi,
// After loopfiltering, the last 7 row pixels in each superblock row may // After loopfiltering, the last 7 row pixels in each superblock row may
// still be changed by the longest loopfilter of the next superblock // still be changed by the longest loopfilter of the next superblock
// row. // row.
if (pbi->frame_parallel_decode) if (cm->frame_parallel_decode)
vp10_frameworker_broadcast(pbi->cur_buf, vp10_frameworker_broadcast(pbi->cur_buf,
mi_row << MI_BLOCK_SIZE_LOG2); mi_row << MI_BLOCK_SIZE_LOG2);
} }
@@ -1527,7 +1605,7 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi,
// Get last tile data. // Get last tile data.
tile_data = pbi->tile_data + tile_cols * tile_rows - 1; tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
if (pbi->frame_parallel_decode) if (cm->frame_parallel_decode)
vp10_frameworker_broadcast(pbi->cur_buf, INT_MAX); vp10_frameworker_broadcast(pbi->cur_buf, INT_MAX);
return vpx_reader_find_end(&tile_data->bit_reader); return vpx_reader_find_end(&tile_data->bit_reader);
} }
@@ -1651,7 +1729,7 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
} }
// Initialize thread frame counts. // Initialize thread frame counts.
if (!cm->frame_parallel_decoding_mode) { if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
int i; int i;
for (i = 0; i < num_workers; ++i) { for (i = 0; i < num_workers; ++i) {
@@ -1673,8 +1751,9 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
tile_data->pbi = pbi; tile_data->pbi = pbi;
tile_data->xd = pbi->mb; tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0; tile_data->xd.corrupted = 0;
tile_data->xd.counts = cm->frame_parallel_decoding_mode ? tile_data->xd.counts =
0 : &tile_data->counts; cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
&tile_data->counts : NULL;
vp10_zero(tile_data->dqcoeff); vp10_zero(tile_data->dqcoeff);
vp10_tile_init(tile, cm, 0, buf->col); vp10_tile_init(tile, cm, 0, buf->col);
vp10_tile_init(&tile_data->xd.tile, cm, 0, buf->col); vp10_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
@@ -1682,6 +1761,8 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
&tile_data->bit_reader, pbi->decrypt_cb, &tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state); pbi->decrypt_state);
vp10_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); vp10_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
worker->had_error = 0; worker->had_error = 0;
if (i == num_workers - 1 || n == tile_cols - 1) { if (i == num_workers - 1 || n == tile_cols - 1) {
@@ -1713,7 +1794,8 @@ static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
} }
// Accumulate thread frame counts. // Accumulate thread frame counts.
if (n >= tile_cols && !cm->frame_parallel_decoding_mode) { if (n >= tile_cols &&
cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
for (i = 0; i < num_workers; ++i) { for (i = 0; i < num_workers; ++i) {
TileWorkerData *const tile_data = TileWorkerData *const tile_data =
(TileWorkerData*)pbi->tile_workers[i].data1; (TileWorkerData*)pbi->tile_workers[i].data1;
@@ -1745,7 +1827,8 @@ static void read_bitdepth_colorspace_sampling(
} }
cm->color_space = vpx_rb_read_literal(rb, 3); cm->color_space = vpx_rb_read_literal(rb, 3);
if (cm->color_space != VPX_CS_SRGB) { if (cm->color_space != VPX_CS_SRGB) {
vpx_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range // [16,235] (including xvycc) vs [0,255] range
cm->color_range = vpx_rb_read_bit(rb);
if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
cm->subsampling_x = vpx_rb_read_bit(rb); cm->subsampling_x = vpx_rb_read_bit(rb);
cm->subsampling_y = vpx_rb_read_bit(rb); cm->subsampling_y = vpx_rb_read_bit(rb);
@@ -1776,6 +1859,7 @@ static void read_bitdepth_colorspace_sampling(
static size_t read_uncompressed_header(VP10Decoder *pbi, static size_t read_uncompressed_header(VP10Decoder *pbi,
struct vpx_read_bit_buffer *rb) { struct vpx_read_bit_buffer *rb) {
VP10_COMMON *const cm = &pbi->common; VP10_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
BufferPool *const pool = cm->buffer_pool; BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = pool->frame_bufs; RefCntBuffer *const frame_bufs = pool->frame_bufs;
int i, mask, ref_index = 0; int i, mask, ref_index = 0;
@@ -1817,7 +1901,7 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
cm->lf.filter_level = 0; cm->lf.filter_level = 0;
cm->show_frame = 1; cm->show_frame = 1;
if (pbi->frame_parallel_decode) { if (cm->frame_parallel_decode) {
for (i = 0; i < REF_FRAMES; ++i) for (i = 0; i < REF_FRAMES; ++i)
cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
} }
@@ -1849,13 +1933,41 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
} else { } else {
cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb); cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
cm->reset_frame_context = cm->error_resilient_mode ? if (cm->error_resilient_mode) {
0 : vpx_rb_read_literal(rb, 2); cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
} else {
#if CONFIG_MISC_FIXES
if (cm->intra_only) {
cm->reset_frame_context =
vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
: RESET_FRAME_CONTEXT_CURRENT;
} else {
cm->reset_frame_context =
vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_CURRENT
: RESET_FRAME_CONTEXT_NONE;
if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT)
cm->reset_frame_context =
vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
: RESET_FRAME_CONTEXT_CURRENT;
}
#else
static const RESET_FRAME_CONTEXT_MODE reset_frame_context_conv_tbl[4] = {
RESET_FRAME_CONTEXT_NONE, RESET_FRAME_CONTEXT_NONE,
RESET_FRAME_CONTEXT_CURRENT, RESET_FRAME_CONTEXT_ALL
};
cm->reset_frame_context =
reset_frame_context_conv_tbl[vpx_rb_read_literal(rb, 2)];
#endif
}
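For readability, a minimal sketch (not part of the patch) of the bit layout implied by the new CONFIG_MISC_FIXES reset_frame_context reads above, expressed as the bits an encoder would emit. The enum and function names below are hypothetical stand-ins for the RESET_FRAME_CONTEXT_* names used in the patch.

typedef enum {
  SKETCH_RESET_NONE,     /* keep all frame contexts        */
  SKETCH_RESET_CURRENT,  /* reset only the current context */
  SKETCH_RESET_ALL       /* reset every frame context      */
} sketch_reset_mode;

/* Returns how many bits would be emitted; fills bits[] with them. */
static int sketch_reset_bits(int intra_only, sketch_reset_mode mode,
                             int bits[2]) {
  if (intra_only) {
    bits[0] = (mode == SKETCH_RESET_ALL);   /* 0 means CURRENT */
    return 1;
  }
  bits[0] = (mode != SKETCH_RESET_NONE);
  if (!bits[0]) return 1;                   /* NONE: a single 0 bit */
  bits[1] = (mode == SKETCH_RESET_ALL);     /* 0 means CURRENT */
  return 2;
}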
if (cm->intra_only) { if (cm->intra_only) {
if (!vp10_read_sync_code(rb)) if (!vp10_read_sync_code(rb))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame sync code"); "Invalid frame sync code");
#if CONFIG_MISC_FIXES
read_bitdepth_colorspace_sampling(cm, rb);
#else
if (cm->profile > PROFILE_0) { if (cm->profile > PROFILE_0) {
read_bitdepth_colorspace_sampling(cm, rb); read_bitdepth_colorspace_sampling(cm, rb);
} else { } else {
@@ -1864,12 +1976,14 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
// specifies that the default color format should be YUV 4:2:0 in this // specifies that the default color format should be YUV 4:2:0 in this
// case (normative). // case (normative).
cm->color_space = VPX_CS_BT_601; cm->color_space = VPX_CS_BT_601;
cm->color_range = 0;
cm->subsampling_y = cm->subsampling_x = 1; cm->subsampling_y = cm->subsampling_x = 1;
cm->bit_depth = VPX_BITS_8; cm->bit_depth = VPX_BITS_8;
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth = 0; cm->use_highbitdepth = 0;
#endif #endif
} }
#endif
pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES); pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
setup_frame_size(cm, rb); setup_frame_size(cm, rb);
@@ -1914,6 +2028,9 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
get_frame_new_buffer(cm)->bit_depth = cm->bit_depth; get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
#endif #endif
get_frame_new_buffer(cm)->color_space = cm->color_space; get_frame_new_buffer(cm)->color_space = cm->color_space;
get_frame_new_buffer(cm)->color_range = cm->color_range;
get_frame_new_buffer(cm)->render_width = cm->render_width;
get_frame_new_buffer(cm)->render_height = cm->render_height;
if (pbi->need_resync) { if (pbi->need_resync) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@@ -1922,11 +2039,20 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
} }
if (!cm->error_resilient_mode) { if (!cm->error_resilient_mode) {
cm->refresh_frame_context = vpx_rb_read_bit(rb); cm->refresh_frame_context =
cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb); vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
: REFRESH_FRAME_CONTEXT_OFF;
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
cm->refresh_frame_context =
vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
: REFRESH_FRAME_CONTEXT_BACKWARD;
#if !CONFIG_MISC_FIXES
} else {
vpx_rb_read_bit(rb); // parallel decoding mode flag
#endif
}
} else { } else {
cm->refresh_frame_context = 0; cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
cm->frame_parallel_decoding_mode = 1;
} }
// This flag will be overridden by the call to vp10_setup_past_independence // This flag will be overridden by the call to vp10_setup_past_independence
@@ -1961,9 +2087,32 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
vp10_setup_past_independence(cm); vp10_setup_past_independence(cm);
setup_loopfilter(&cm->lf, rb); setup_loopfilter(&cm->lf, rb);
setup_quantization(cm, &pbi->mb, rb); setup_quantization(cm, rb);
#if CONFIG_VP9_HIGHBITDEPTH
xd->bd = (int)cm->bit_depth;
#endif
setup_segmentation(cm, rb); setup_segmentation(cm, rb);
{
int i;
for (i = 0; i < MAX_SEGMENTS; ++i) {
const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
cm->base_qindex;
xd->lossless[i] = qindex == 0 &&
cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
}
}
setup_segmentation_dequant(cm); setup_segmentation_dequant(cm);
#if CONFIG_MISC_FIXES
cm->tx_mode = (!cm->seg.enabled && xd->lossless[0]) ? ONLY_4X4
: read_tx_mode(rb);
cm->reference_mode = read_frame_reference_mode(cm, rb);
#endif
setup_tile_info(cm, rb); setup_tile_info(cm, rb);
sz = vpx_rb_read_literal(rb, 16); sz = vpx_rb_read_literal(rb, 16);
@@ -1978,17 +2127,21 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data, static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
size_t partition_size) { size_t partition_size) {
VP10_COMMON *const cm = &pbi->common; VP10_COMMON *const cm = &pbi->common;
#if !CONFIG_MISC_FIXES
MACROBLOCKD *const xd = &pbi->mb; MACROBLOCKD *const xd = &pbi->mb;
#endif
FRAME_CONTEXT *const fc = cm->fc; FRAME_CONTEXT *const fc = cm->fc;
vpx_reader r; vpx_reader r;
int k; int k, i, j;
if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb, if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb,
pbi->decrypt_state)) pbi->decrypt_state))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder 0"); "Failed to allocate bool decoder 0");
cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r); #if !CONFIG_MISC_FIXES
cm->tx_mode = xd->lossless[0] ? ONLY_4X4 : read_tx_mode(&r);
#endif
if (cm->tx_mode == TX_MODE_SELECT) if (cm->tx_mode == TX_MODE_SELECT)
read_tx_mode_probs(&fc->tx_probs, &r); read_tx_mode_probs(&fc->tx_probs, &r);
read_coef_probs(fc, cm->tx_mode, &r); read_coef_probs(fc, cm->tx_mode, &r);
@@ -1996,9 +2149,35 @@ static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
for (k = 0; k < SKIP_CONTEXTS; ++k) for (k = 0; k < SKIP_CONTEXTS; ++k)
vp10_diff_update_prob(&r, &fc->skip_probs[k]); vp10_diff_update_prob(&r, &fc->skip_probs[k]);
if (!frame_is_intra_only(cm)) { #if CONFIG_MISC_FIXES
if (cm->seg.enabled) {
if (cm->seg.temporal_update) {
for (k = 0; k < PREDICTION_PROBS; k++)
vp10_diff_update_prob(&r, &cm->fc->seg.pred_probs[k]);
}
for (k = 0; k < MAX_SEGMENTS - 1; k++)
vp10_diff_update_prob(&r, &cm->fc->seg.tree_probs[k]);
}
for (j = 0; j < INTRA_MODES; j++)
for (i = 0; i < INTRA_MODES - 1; ++i)
vp10_diff_update_prob(&r, &fc->uv_mode_prob[j][i]);
for (j = 0; j < PARTITION_CONTEXTS; ++j)
for (i = 0; i < PARTITION_TYPES - 1; ++i)
vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
#endif
if (frame_is_intra_only(cm)) {
vp10_copy(cm->kf_y_prob, vp10_kf_y_mode_prob);
#if CONFIG_MISC_FIXES
for (k = 0; k < INTRA_MODES; k++)
for (j = 0; j < INTRA_MODES; j++)
for (i = 0; i < INTRA_MODES - 1; ++i)
vp10_diff_update_prob(&r, &cm->kf_y_prob[k][j][i]);
#endif
} else {
nmv_context *const nmvc = &fc->nmvc; nmv_context *const nmvc = &fc->nmvc;
int i, j;
read_inter_mode_probs(fc, &r); read_inter_mode_probs(fc, &r);
@@ -2008,7 +2187,9 @@ static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
for (i = 0; i < INTRA_INTER_CONTEXTS; i++) for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp10_diff_update_prob(&r, &fc->intra_inter_prob[i]); vp10_diff_update_prob(&r, &fc->intra_inter_prob[i]);
#if !CONFIG_MISC_FIXES
cm->reference_mode = read_frame_reference_mode(cm, &r); cm->reference_mode = read_frame_reference_mode(cm, &r);
#endif
if (cm->reference_mode != SINGLE_REFERENCE) if (cm->reference_mode != SINGLE_REFERENCE)
setup_compound_reference_mode(cm); setup_compound_reference_mode(cm);
read_frame_reference_mode_probs(cm, &r); read_frame_reference_mode_probs(cm, &r);
@@ -2017,9 +2198,11 @@ static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
for (i = 0; i < INTRA_MODES - 1; ++i) for (i = 0; i < INTRA_MODES - 1; ++i)
vp10_diff_update_prob(&r, &fc->y_mode_prob[j][i]); vp10_diff_update_prob(&r, &fc->y_mode_prob[j][i]);
#if !CONFIG_MISC_FIXES
for (j = 0; j < PARTITION_CONTEXTS; ++j) for (j = 0; j < PARTITION_CONTEXTS; ++j)
for (i = 0; i < PARTITION_TYPES - 1; ++i) for (i = 0; i < PARTITION_TYPES - 1; ++i)
vp10_diff_update_prob(&r, &fc->partition_prob[j][i]); vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
#endif
read_mv_probs(nmvc, cm->allow_high_precision_mv, &r); read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
} }
@@ -2035,7 +2218,8 @@ static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
static void debug_check_frame_counts(const VP10_COMMON *const cm) { static void debug_check_frame_counts(const VP10_COMMON *const cm) {
FRAME_COUNTS zero_counts; FRAME_COUNTS zero_counts;
vp10_zero(zero_counts); vp10_zero(zero_counts);
assert(cm->frame_parallel_decoding_mode || cm->error_resilient_mode); assert(cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD ||
cm->error_resilient_mode);
assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode, assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode,
sizeof(cm->counts.y_mode))); sizeof(cm->counts.y_mode)));
assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode, assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode,
@@ -2161,10 +2345,11 @@ void vp10_decode_frame(VP10Decoder *pbi,
// If encoded in frame parallel mode, frame context is ready after decoding // If encoded in frame parallel mode, frame context is ready after decoding
// the frame header. // the frame header.
if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { if (cm->frame_parallel_decode &&
cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD) {
VPxWorker *const worker = pbi->frame_worker_owner; VPxWorker *const worker = pbi->frame_worker_owner;
FrameWorkerData *const frame_worker_data = worker->data1; FrameWorkerData *const frame_worker_data = worker->data1;
if (cm->refresh_frame_context) { if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
context_updated = 1; context_updated = 1;
cm->frame_contexts[cm->frame_context_idx] = *cm->fc; cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
} }
@@ -2198,11 +2383,17 @@ void vp10_decode_frame(VP10Decoder *pbi,
} }
if (!xd->corrupted) { if (!xd->corrupted) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
vp10_adapt_coef_probs(cm); vp10_adapt_coef_probs(cm);
#if CONFIG_MISC_FIXES
vp10_adapt_intra_frame_probs(cm);
#endif
if (!frame_is_intra_only(cm)) { if (!frame_is_intra_only(cm)) {
vp10_adapt_mode_probs(cm); #if !CONFIG_MISC_FIXES
vp10_adapt_intra_frame_probs(cm);
#endif
vp10_adapt_inter_frame_probs(cm);
vp10_adapt_mv_probs(cm, cm->allow_high_precision_mv); vp10_adapt_mv_probs(cm, cm->allow_high_precision_mv);
} }
} else { } else {
@@ -2214,6 +2405,7 @@ void vp10_decode_frame(VP10Decoder *pbi,
} }
// Non frame parallel update frame context here. // Non frame parallel update frame context here.
if (cm->refresh_frame_context && !context_updated) if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF &&
!context_updated)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc; cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
} }


@@ -24,6 +24,19 @@
#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_dsp_common.h"
static INLINE int read_uniform(vpx_reader *r, int n) {
int l = get_unsigned_bits(n);
int m = (1 << l) - n;
int v = vpx_read_literal(r, l-1);
assert(l != 0);
if (v < m)
return v;
else
return (v << 1) - m + vpx_read_literal(r, 1);
}
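A minimal sketch (not part of the patch) of the cost of the quasi-uniform code that read_uniform() above decodes: a symbol v in [0, n) takes either l-1 or l raw bits, where l is the bit width of n. The helper name is hypothetical.

static int sketch_uniform_bits(int n, int v) {
  int l = 0, m;
  while ((1 << l) <= n) ++l;  /* for n > 0 this equals get_unsigned_bits(n) */
  m = (1 << l) - n;
  /* e.g. n = 9: l = 4, m = 7, so symbols 0..6 cost 3 bits, 7..8 cost 4 bits */
  return v < m ? l - 1 : l;
}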
static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) { static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
return (PREDICTION_MODE)vpx_read_tree(r, vp10_intra_mode_tree, p); return (PREDICTION_MODE)vpx_read_tree(r, vp10_intra_mode_tree, p);
} }
@@ -60,8 +73,9 @@ static PREDICTION_MODE read_inter_mode(VP10_COMMON *cm, MACROBLOCKD *xd,
return NEARESTMV + mode; return NEARESTMV + mode;
} }
static int read_segment_id(vpx_reader *r, const struct segmentation *seg) { static int read_segment_id(vpx_reader *r,
return vpx_read_tree(r, vp10_segment_tree, seg->tree_probs); const struct segmentation_probs *segp) {
return vpx_read_tree(r, vp10_segment_tree, segp->tree_probs);
} }
static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd, static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
@@ -86,6 +100,8 @@ static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
TX_MODE tx_mode = cm->tx_mode; TX_MODE tx_mode = cm->tx_mode;
BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
if (xd->lossless[xd->mi[0]->mbmi.segment_id])
return TX_4X4;
if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
return read_selected_tx_size(cm, xd, max_tx_size, r); return read_selected_tx_size(cm, xd, max_tx_size, r);
else else
@@ -116,18 +132,32 @@ static void set_segment_id(VP10_COMMON *cm, int mi_offset,
cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
} }
static int read_intra_segment_id(VP10_COMMON *const cm, int mi_offset, static int read_intra_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
int x_mis, int y_mis, int mi_offset, int x_mis, int y_mis,
vpx_reader *r) { vpx_reader *r) {
struct segmentation *const seg = &cm->seg; struct segmentation *const seg = &cm->seg;
#if CONFIG_MISC_FIXES
FRAME_COUNTS *counts = xd->counts;
struct segmentation_probs *const segp = &cm->fc->seg;
#else
struct segmentation_probs *const segp = &cm->segp;
#endif
int segment_id; int segment_id;
#if !CONFIG_MISC_FIXES
(void) xd;
#endif
if (!seg->enabled) if (!seg->enabled)
return 0; // Default for disabled segmentation return 0; // Default for disabled segmentation
assert(seg->update_map && !seg->temporal_update); assert(seg->update_map && !seg->temporal_update);
segment_id = read_segment_id(r, seg); segment_id = read_segment_id(r, segp);
#if CONFIG_MISC_FIXES
if (counts)
++counts->seg.tree_total[segment_id];
#endif
set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
return segment_id; return segment_id;
} }
@@ -147,6 +177,12 @@ static void copy_segment_id(const VP10_COMMON *cm,
static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd, static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
int mi_row, int mi_col, vpx_reader *r) { int mi_row, int mi_col, vpx_reader *r) {
struct segmentation *const seg = &cm->seg; struct segmentation *const seg = &cm->seg;
#if CONFIG_MISC_FIXES
FRAME_COUNTS *counts = xd->counts;
struct segmentation_probs *const segp = &cm->fc->seg;
#else
struct segmentation_probs *const segp = &cm->segp;
#endif
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
int predicted_segment_id, segment_id; int predicted_segment_id, segment_id;
const int mi_offset = mi_row * cm->mi_cols + mi_col; const int mi_offset = mi_row * cm->mi_cols + mi_col;
@@ -171,12 +207,28 @@ static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
} }
if (seg->temporal_update) { if (seg->temporal_update) {
const vpx_prob pred_prob = vp10_get_pred_prob_seg_id(seg, xd); const int ctx = vp10_get_pred_context_seg_id(xd);
const vpx_prob pred_prob = segp->pred_probs[ctx];
mbmi->seg_id_predicted = vpx_read(r, pred_prob); mbmi->seg_id_predicted = vpx_read(r, pred_prob);
segment_id = mbmi->seg_id_predicted ? predicted_segment_id #if CONFIG_MISC_FIXES
: read_segment_id(r, seg); if (counts)
++counts->seg.pred[ctx][mbmi->seg_id_predicted];
#endif
if (mbmi->seg_id_predicted) {
segment_id = predicted_segment_id;
} else {
segment_id = read_segment_id(r, segp);
#if CONFIG_MISC_FIXES
if (counts)
++counts->seg.tree_mispred[segment_id];
#endif
}
} else { } else {
segment_id = read_segment_id(r, seg); segment_id = read_segment_id(r, segp);
#if CONFIG_MISC_FIXES
if (counts)
++counts->seg.tree_total[segment_id];
#endif
} }
set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
return segment_id; return segment_id;
@@ -213,7 +265,7 @@ static void read_intra_frame_mode_info(VP10_COMMON *const cm,
const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw); const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh); const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
mbmi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r); mbmi->segment_id = read_intra_segment_id(cm, xd, mi_offset, x_mis, y_mis, r);
mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
mbmi->tx_size = read_tx_size(cm, xd, 1, r); mbmi->tx_size = read_tx_size(cm, xd, 1, r);
mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[0] = INTRA_FRAME;
@@ -223,27 +275,27 @@ static void read_intra_frame_mode_info(VP10_COMMON *const cm,
case BLOCK_4X4: case BLOCK_4X4:
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
mi->bmi[i].as_mode = mi->bmi[i].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i)); read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, i));
mbmi->mode = mi->bmi[3].as_mode; mbmi->mode = mi->bmi[3].as_mode;
break; break;
case BLOCK_4X8: case BLOCK_4X8:
mi->bmi[0].as_mode = mi->bmi[2].as_mode = mi->bmi[0].as_mode = mi->bmi[2].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode = mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1)); read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 1));
break; break;
case BLOCK_8X4: case BLOCK_8X4:
mi->bmi[0].as_mode = mi->bmi[1].as_mode = mi->bmi[0].as_mode = mi->bmi[1].as_mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode = mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 2)); read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 2));
break; break;
default: default:
mbmi->mode = read_intra_mode(r, mbmi->mode = read_intra_mode(r,
get_y_mode_probs(mi, above_mi, left_mi, 0)); get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
} }
mbmi->uv_mode = read_intra_mode(r, vp10_kf_uv_mode_prob[mbmi->mode]); mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
} }
static int read_mv_component(vpx_reader *r, static int read_mv_component(vpx_reader *r,
@@ -294,7 +346,7 @@ static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
if (mv_joint_horizontal(joint_type)) if (mv_joint_horizontal(joint_type))
diff.col = read_mv_component(r, &ctx->comps[1], use_hp); diff.col = read_mv_component(r, &ctx->comps[1], use_hp);
vp10_inc_mv(&diff, counts); vp10_inc_mv(&diff, counts, use_hp);
mv->row = ref->row + diff.row; mv->row = ref->row + diff.row;
mv->col = ref->col + diff.col; mv->col = ref->col + diff.col;
@@ -523,8 +575,8 @@ static void read_inter_block_mode_info(VP10Decoder *const pbi,
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
for (ref = 0; ref < 1 + is_compound; ++ref) { for (ref = 0; ref < 1 + is_compound; ++ref) {
vp10_find_best_ref_mvs(xd, allow_hp, ref_mvs[mbmi->ref_frame[ref]], vp10_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]],
&nearestmv[ref], &nearmv[ref]); &nearestmv[ref], &nearmv[ref]);
} }
} }


@@ -126,6 +126,9 @@ VP10Decoder *vp10_decoder_create(BufferPool *const pool) {
void vp10_decoder_remove(VP10Decoder *pbi) { void vp10_decoder_remove(VP10Decoder *pbi) {
int i; int i;
if (!pbi)
return;
vpx_get_worker_interface()->end(&pbi->lf_worker); vpx_get_worker_interface()->end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1); vpx_free(pbi->lf_worker.data1);
vpx_free(pbi->tile_data); vpx_free(pbi->tile_data);
@@ -258,7 +261,7 @@ static void swap_frame_buffers(VP10Decoder *pbi) {
pbi->hold_ref_buf = 0; pbi->hold_ref_buf = 0;
cm->frame_to_show = get_frame_new_buffer(cm); cm->frame_to_show = get_frame_new_buffer(cm);
if (!pbi->frame_parallel_decode || !cm->show_frame) { if (!cm->frame_parallel_decode || !cm->show_frame) {
lock_buffer_pool(pool); lock_buffer_pool(pool);
--frame_bufs[cm->new_fb_idx].ref_count; --frame_bufs[cm->new_fb_idx].ref_count;
unlock_buffer_pool(pool); unlock_buffer_pool(pool);
@@ -297,7 +300,7 @@ int vp10_receive_compressed_data(VP10Decoder *pbi,
// Check if the previous frame was a frame without any references to it. // Check if the previous frame was a frame without any references to it.
// Release frame buffer if not decoding in frame parallel mode. // Release frame buffer if not decoding in frame parallel mode.
if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0
&& frame_bufs[cm->new_fb_idx].ref_count == 0) && frame_bufs[cm->new_fb_idx].ref_count == 0)
pool->release_fb_cb(pool->cb_priv, pool->release_fb_cb(pool->cb_priv,
&frame_bufs[cm->new_fb_idx].raw_frame_buffer); &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
@@ -310,7 +313,7 @@ int vp10_receive_compressed_data(VP10Decoder *pbi,
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
pbi->hold_ref_buf = 0; pbi->hold_ref_buf = 0;
if (pbi->frame_parallel_decode) { if (cm->frame_parallel_decode) {
VPxWorker *const worker = pbi->frame_worker_owner; VPxWorker *const worker = pbi->frame_worker_owner;
vp10_frameworker_lock_stats(worker); vp10_frameworker_lock_stats(worker);
frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
@@ -379,12 +382,12 @@ int vp10_receive_compressed_data(VP10Decoder *pbi,
if (!cm->show_existing_frame) { if (!cm->show_existing_frame) {
cm->last_show_frame = cm->show_frame; cm->last_show_frame = cm->show_frame;
cm->prev_frame = cm->cur_frame; cm->prev_frame = cm->cur_frame;
if (cm->seg.enabled && !pbi->frame_parallel_decode) if (cm->seg.enabled && !cm->frame_parallel_decode)
vp10_swap_current_and_last_seg_map(cm); vp10_swap_current_and_last_seg_map(cm);
} }
// Update progress in frame parallel decode. // Update progress in frame parallel decode.
if (pbi->frame_parallel_decode) { if (cm->frame_parallel_decode) {
// Need to lock the mutex here as another thread may // Need to lock the mutex here as another thread may
// be accessing this buffer. // be accessing this buffer.
VPxWorker *const worker = pbi->frame_worker_owner; VPxWorker *const worker = pbi->frame_worker_owner;
@@ -456,6 +459,9 @@ vpx_codec_err_t vp10_parse_superframe_index(const uint8_t *data,
// an invalid bitstream and need to return an error. // an invalid bitstream and need to return an error.
uint8_t marker; uint8_t marker;
#if CONFIG_MISC_FIXES
size_t frame_sz_sum = 0;
#endif
assert(data_sz); assert(data_sz);
marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1); marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
@@ -464,7 +470,7 @@ vpx_codec_err_t vp10_parse_superframe_index(const uint8_t *data,
if ((marker & 0xe0) == 0xc0) { if ((marker & 0xe0) == 0xc0) {
const uint32_t frames = (marker & 0x7) + 1; const uint32_t frames = (marker & 0x7) + 1;
const uint32_t mag = ((marker >> 3) & 0x3) + 1; const uint32_t mag = ((marker >> 3) & 0x3) + 1;
const size_t index_sz = 2 + mag * frames; const size_t index_sz = 2 + mag * (frames - CONFIG_MISC_FIXES);
// This chunk is marked as having a superframe index but doesn't have // This chunk is marked as having a superframe index but doesn't have
// enough data for it, thus it's an invalid superframe index. // enough data for it, thus it's an invalid superframe index.
@@ -495,13 +501,20 @@ vpx_codec_err_t vp10_parse_superframe_index(const uint8_t *data,
x = clear_buffer; x = clear_buffer;
} }
for (i = 0; i < frames; ++i) { for (i = 0; i < frames - CONFIG_MISC_FIXES; ++i) {
uint32_t this_sz = 0; uint32_t this_sz = 0;
for (j = 0; j < mag; ++j) for (j = 0; j < mag; ++j)
this_sz |= (*x++) << (j * 8); this_sz |= (*x++) << (j * 8);
this_sz += CONFIG_MISC_FIXES;
sizes[i] = this_sz; sizes[i] = this_sz;
#if CONFIG_MISC_FIXES
frame_sz_sum += this_sz;
#endif
} }
#if CONFIG_MISC_FIXES
sizes[i] = data_sz - index_sz - frame_sz_sum;
#endif
*count = frames; *count = frames;
} }
} }
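As a reading aid, a small sketch (not part of the patch; assumptions noted) of the superframe index trailer the loop above parses: the chunk ends with a marker byte 0b110MMFFF, where MM+1 is the number of bytes per stored size and FFF+1 is the frame count; with CONFIG_MISC_FIXES the last frame's size is no longer stored and is inferred from the chunk size. The helper name is hypothetical.

#include <stddef.h>
#include <stdint.h>

/* Size in bytes of the index trailer, or 0 if the marker is not one. */
static size_t sketch_superframe_index_size(uint8_t marker, int misc_fixes) {
  const uint32_t frames = (marker & 0x7) + 1;
  const uint32_t mag = ((marker >> 3) & 0x3) + 1;
  if ((marker & 0xe0) != 0xc0) return 0;
  return 2 + mag * (frames - (misc_fixes ? 1 : 0));  /* two marker bytes */
}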


@@ -34,6 +34,7 @@ typedef struct TileData {
DECLARE_ALIGNED(16, MACROBLOCKD, xd); DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */ /* dqcoeff are shared by all the planes. So planes must be decoded serially */
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
} TileData; } TileData;
typedef struct TileWorkerData { typedef struct TileWorkerData {
@@ -43,6 +44,7 @@ typedef struct TileWorkerData {
DECLARE_ALIGNED(16, MACROBLOCKD, xd); DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */ /* dqcoeff are shared by all the planes. So planes must be decoded serially */
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
struct vpx_internal_error_info error_info; struct vpx_internal_error_info error_info;
} TileWorkerData; } TileWorkerData;
@@ -55,8 +57,6 @@ typedef struct VP10Decoder {
int refresh_frame_flags; int refresh_frame_flags;
int frame_parallel_decode; // frame-based threading.
// TODO(hkuang): Combine this with cur_buf in macroblockd as they are // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
// the same. // the same.
RefCntBuffer *cur_buf; // Current decoding frame buffer. RefCntBuffer *cur_buf; // Current decoding frame buffer.

View File

@@ -163,26 +163,33 @@ static int decode_coefs(const MACROBLOCKD *xd,
case CATEGORY5_TOKEN: case CATEGORY5_TOKEN:
val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r); val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
break; break;
case CATEGORY6_TOKEN: case CATEGORY6_TOKEN: {
#if CONFIG_MISC_FIXES
const int skip_bits = TX_SIZES - 1 - tx_size;
#else
const int skip_bits = 0;
#endif
const uint8_t *cat6p = cat6_prob + skip_bits;
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
switch (xd->bd) { switch (xd->bd) {
case VPX_BITS_8: case VPX_BITS_8:
val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r); val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
break; break;
case VPX_BITS_10: case VPX_BITS_10:
val = CAT6_MIN_VAL + read_coeff(cat6_prob, 16, r); val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, r);
break; break;
case VPX_BITS_12: case VPX_BITS_12:
val = CAT6_MIN_VAL + read_coeff(cat6_prob, 18, r); val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, r);
break; break;
default: default:
assert(0); assert(0);
return -1; return -1;
} }
#else #else
val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r); val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
#endif #endif
break; break;
}
} }
} }
v = (val * dqv) >> dq_shift; v = (val * dqv) >> dq_shift;
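A short sketch (not part of the patch) of why the CAT6 path above can drop bits: the largest coefficient a small transform can produce needs fewer extra bits, so with CONFIG_MISC_FIXES the top TX_SIZES - 1 - tx_size bits are known to be zero and are neither written nor read. The helper below assumes TX_SIZES == 4 and the 8-bit case (14 extra bits at 32x32); its name is hypothetical.

static int sketch_cat6_extra_bits_8bit(int tx_size /* 0 = 4x4 .. 3 = 32x32 */) {
  const int skip_bits = 4 /* TX_SIZES */ - 1 - tx_size;
  return 14 - skip_bits;  /* 4x4 -> 11 bits, 32x32 -> 14 bits */
}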


@@ -23,13 +23,13 @@ static int inv_recenter_nonneg(int v, int m) {
static int decode_uniform(vpx_reader *r) { static int decode_uniform(vpx_reader *r) {
const int l = 8; const int l = 8;
const int m = (1 << l) - 191; const int m = (1 << l) - 191 + CONFIG_MISC_FIXES;
const int v = vpx_read_literal(r, l - 1); const int v = vpx_read_literal(r, l - 1);
return v < m ? v : (v << 1) - m + vpx_read_bit(r); return v < m ? v : (v << 1) - m + vpx_read_bit(r);
} }
static int inv_remap_prob(int v, int m) { static int inv_remap_prob(int v, int m) {
static int inv_map_table[MAX_PROB] = { static uint8_t inv_map_table[MAX_PROB - CONFIG_MISC_FIXES] = {
7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189, 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189,
202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27,
@@ -46,7 +46,10 @@ static int inv_remap_prob(int v, int m) {
191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222, 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 253 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
#if !CONFIG_MISC_FIXES
253
#endif
}; };
assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0]))); assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
v = inv_map_table[v]; v = inv_map_table[v];


@@ -15,6 +15,10 @@
#include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_thread.h"
#include "vpx/internal/vpx_codec_internal.h" #include "vpx/internal/vpx_codec_internal.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP10Common; struct VP10Common;
struct VP10Decoder; struct VP10Decoder;
@@ -63,4 +67,8 @@ void vp10_frameworker_broadcast(RefCntBuffer *const buf, int row);
void vp10_frameworker_copy_context(VPxWorker *const dst_worker, void vp10_frameworker_copy_context(VPxWorker *const dst_worker,
VPxWorker *const src_worker); VPxWorker *const src_worker);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_DECODER_DTHREAD_H_ #endif // VP10_DECODER_DTHREAD_H_


@@ -1,160 +0,0 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./vp10_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
const uint32x4_t a = vpaddlq_u16(v_16x8);
const uint64x2_t b = vpaddlq_u32(a);
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
vreinterpret_u32_u64(vget_high_u64(b)));
return vget_lane_u32(c, 0);
}
unsigned int vp10_avg_8x8_neon(const uint8_t *s, int p) {
uint8x8_t v_s0 = vld1_u8(s);
const uint8x8_t v_s1 = vld1_u8(s + p);
uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
v_s0 = vld1_u8(s + 2 * p);
v_sum = vaddw_u8(v_sum, v_s0);
v_s0 = vld1_u8(s + 3 * p);
v_sum = vaddw_u8(v_sum, v_s0);
v_s0 = vld1_u8(s + 4 * p);
v_sum = vaddw_u8(v_sum, v_s0);
v_s0 = vld1_u8(s + 5 * p);
v_sum = vaddw_u8(v_sum, v_s0);
v_s0 = vld1_u8(s + 6 * p);
v_sum = vaddw_u8(v_sum, v_s0);
v_s0 = vld1_u8(s + 7 * p);
v_sum = vaddw_u8(v_sum, v_s0);
return (horizontal_add_u16x8(v_sum) + 32) >> 6;
}
void vp10_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
const int ref_stride, const int height) {
int i;
uint16x8_t vec_sum_lo = vdupq_n_u16(0);
uint16x8_t vec_sum_hi = vdupq_n_u16(0);
const int shift_factor = ((height >> 5) + 3) * -1;
const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
for (i = 0; i < height; i += 8) {
const uint8x16_t vec_row1 = vld1q_u8(ref);
const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
ref += ref_stride * 8;
}
vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
hbuf += 8;
vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
}
int16_t vp10_int_pro_col_neon(uint8_t const *ref, const int width) {
int i;
uint16x8_t vec_sum = vdupq_n_u16(0);
for (i = 0; i < width; i += 16) {
const uint8x16_t vec_row = vld1q_u8(ref);
vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
ref += 16;
}
return horizontal_add_u16x8(vec_sum);
}
// ref, src = [0, 510] - max diff = 16-bits
// bwl = {2, 3, 4}, width = {16, 32, 64}
int vp10_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
int width = 4 << bwl;
int32x4_t sse = vdupq_n_s32(0);
int16x8_t total = vdupq_n_s16(0);
assert(width >= 8);
assert((width % 8) == 0);
do {
const int16x8_t r = vld1q_s16(ref);
const int16x8_t s = vld1q_s16(src);
const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits.
const int16x4_t diff_lo = vget_low_s16(diff);
const int16x4_t diff_hi = vget_high_s16(diff);
sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits.
sse = vmlal_s16(sse, diff_hi, diff_hi);
total = vaddq_s16(total, diff); // dynamic range 16 bits.
ref += 8;
src += 8;
width -= 8;
} while (width != 0);
{
// Note: 'total''s pairwise addition could be implemented similarly to
// horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
// with the summation of 'sse' performed better on a Cortex-A15.
const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total'
const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
const int32x2_t t2 = vpadd_s32(t1, t1);
const int t = vget_lane_s32(t2, 0);
const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'.
const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
vreinterpret_s32_s64(vget_high_s64(s0)));
const int s = vget_lane_s32(s1, 0);
const int shift_factor = bwl + 2;
return s - ((t * t) >> shift_factor);
}
}


@@ -45,6 +45,19 @@ static const struct vp10_token partition_encodings[PARTITION_TYPES] =
static const struct vp10_token inter_mode_encodings[INTER_MODES] = static const struct vp10_token inter_mode_encodings[INTER_MODES] =
{{2, 2}, {6, 3}, {0, 1}, {7, 3}}; {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
static INLINE void write_uniform(vpx_writer *w, int n, int v) {
int l = get_unsigned_bits(n);
int m = (1 << l) - n;
if (l == 0)
return;
if (v < m) {
vpx_write_literal(w, v, l - 1);
} else {
vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
vpx_write_literal(w, (v - m) & 1, 1);
}
}
static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode, static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
const vpx_prob *probs) { const vpx_prob *probs) {
vp10_write_token(w, vp10_intra_mode_tree, probs, &intra_mode_encodings[mode]); vp10_write_token(w, vp10_intra_mode_tree, probs, &intra_mode_encodings[mode]);
@@ -122,8 +135,11 @@ static void update_switchable_interp_probs(VP10_COMMON *cm, vpx_writer *w,
static void pack_mb_tokens(vpx_writer *w, static void pack_mb_tokens(vpx_writer *w,
TOKENEXTRA **tp, const TOKENEXTRA *const stop, TOKENEXTRA **tp, const TOKENEXTRA *const stop,
vpx_bit_depth_t bit_depth) { vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
TOKENEXTRA *p = *tp; TOKENEXTRA *p = *tp;
#if !CONFIG_MISC_FIXES
(void) tx;
#endif
while (p < stop && p->token != EOSB_TOKEN) { while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token; const int t = p->token;
@@ -171,6 +187,12 @@ static void pack_mb_tokens(vpx_writer *w,
if (b->base_val) { if (b->base_val) {
const int e = p->extra, l = b->len; const int e = p->extra, l = b->len;
#if CONFIG_MISC_FIXES
int skip_bits =
(b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
#else
int skip_bits = 0;
#endif
if (l) { if (l) {
const unsigned char *pb = b->prob; const unsigned char *pb = b->prob;
@@ -180,7 +202,12 @@ static void pack_mb_tokens(vpx_writer *w,
do { do {
const int bb = (v >> --n) & 1; const int bb = (v >> --n) & 1;
vpx_write(w, bb, pb[i >> 1]); if (skip_bits) {
skip_bits--;
assert(!bb);
} else {
vpx_write(w, bb, pb[i >> 1]);
}
i = b->tree[i + bb]; i = b->tree[i + bb];
} while (n); } while (n);
} }
@@ -190,13 +217,14 @@ static void pack_mb_tokens(vpx_writer *w,
++p; ++p;
} }
*tp = p + (p->token == EOSB_TOKEN); *tp = p;
} }
static void write_segment_id(vpx_writer *w, const struct segmentation *seg, static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
const struct segmentation_probs *segp,
int segment_id) { int segment_id) {
if (seg->enabled && seg->update_map) if (seg->enabled && seg->update_map)
vp10_write_tree(w, vp10_segment_tree, seg->tree_probs, segment_id, 3, 0); vp10_write_tree(w, vp10_segment_tree, segp->tree_probs, segment_id, 3, 0);
} }
// This function encodes the reference frame // This function encodes the reference frame
@@ -242,6 +270,11 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
const MACROBLOCK *const x = &cpi->td.mb; const MACROBLOCK *const x = &cpi->td.mb;
const MACROBLOCKD *const xd = &x->e_mbd; const MACROBLOCKD *const xd = &x->e_mbd;
const struct segmentation *const seg = &cm->seg; const struct segmentation *const seg = &cm->seg;
#if CONFIG_MISC_FIXES
const struct segmentation_probs *const segp = &cm->fc->seg;
#else
const struct segmentation_probs *const segp = &cm->segp;
#endif
const MB_MODE_INFO *const mbmi = &mi->mbmi; const MB_MODE_INFO *const mbmi = &mi->mbmi;
const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
const PREDICTION_MODE mode = mbmi->mode; const PREDICTION_MODE mode = mbmi->mode;
@@ -255,12 +288,12 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
if (seg->update_map) { if (seg->update_map) {
if (seg->temporal_update) { if (seg->temporal_update) {
const int pred_flag = mbmi->seg_id_predicted; const int pred_flag = mbmi->seg_id_predicted;
vpx_prob pred_prob = vp10_get_pred_prob_seg_id(seg, xd); vpx_prob pred_prob = vp10_get_pred_prob_seg_id(segp, xd);
vpx_write(w, pred_flag, pred_prob); vpx_write(w, pred_flag, pred_prob);
if (!pred_flag) if (!pred_flag)
write_segment_id(w, seg, segment_id); write_segment_id(w, seg, segp, segment_id);
} else { } else {
write_segment_id(w, seg, segment_id); write_segment_id(w, seg, segp, segment_id);
} }
} }
@@ -270,7 +303,7 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd)); vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
!(is_inter && skip)) { !(is_inter && skip) && !xd->lossless[segment_id]) {
write_selected_tx_size(cm, xd, w); write_selected_tx_size(cm, xd, w);
} }
@@ -342,6 +375,11 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd, static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
MODE_INFO **mi_8x8, vpx_writer *w) { MODE_INFO **mi_8x8, vpx_writer *w) {
const struct segmentation *const seg = &cm->seg; const struct segmentation *const seg = &cm->seg;
#if CONFIG_MISC_FIXES
const struct segmentation_probs *const segp = &cm->fc->seg;
#else
const struct segmentation_probs *const segp = &cm->segp;
#endif
const MODE_INFO *const mi = mi_8x8[0]; const MODE_INFO *const mi = mi_8x8[0];
const MODE_INFO *const above_mi = xd->above_mi; const MODE_INFO *const above_mi = xd->above_mi;
const MODE_INFO *const left_mi = xd->left_mi; const MODE_INFO *const left_mi = xd->left_mi;
@@ -349,15 +387,17 @@ static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
const BLOCK_SIZE bsize = mbmi->sb_type; const BLOCK_SIZE bsize = mbmi->sb_type;
if (seg->update_map) if (seg->update_map)
write_segment_id(w, seg, mbmi->segment_id); write_segment_id(w, seg, segp, mbmi->segment_id);
write_skip(cm, xd, mbmi->segment_id, mi, w); write_skip(cm, xd, mbmi->segment_id, mi, w);
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
!xd->lossless[mbmi->segment_id])
write_selected_tx_size(cm, xd, w); write_selected_tx_size(cm, xd, w);
if (bsize >= BLOCK_8X8) { if (bsize >= BLOCK_8X8) {
write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0)); write_intra_mode(w, mbmi->mode,
get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
} else { } else {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -367,12 +407,12 @@ static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
for (idx = 0; idx < 2; idx += num_4x4_w) { for (idx = 0; idx < 2; idx += num_4x4_w) {
const int block = idy * 2 + idx; const int block = idy * 2 + idx;
write_intra_mode(w, mi->bmi[block].as_mode, write_intra_mode(w, mi->bmi[block].as_mode,
get_y_mode_probs(mi, above_mi, left_mi, block)); get_y_mode_probs(cm, mi, above_mi, left_mi, block));
} }
} }
} }
write_intra_mode(w, mbmi->uv_mode, vp10_kf_uv_mode_prob[mbmi->mode]); write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
} }
static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile, static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
@@ -382,12 +422,12 @@ static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
const VP10_COMMON *const cm = &cpi->common; const VP10_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
MODE_INFO *m; MODE_INFO *m;
int plane;
xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
m = xd->mi[0]; m = xd->mi[0];
cpi->td.mb.mbmi_ext = cpi->td.mb.mbmi_ext_base + cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
(mi_row * cm->mi_cols + mi_col);
set_mi_row_col(xd, tile, set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
@@ -399,8 +439,16 @@ static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
pack_inter_mode_mvs(cpi, m, w); pack_inter_mode_mvs(cpi, m, w);
} }
assert(*tok < tok_end); if (!m->mbmi.skip) {
pack_mb_tokens(w, tok, tok_end, cm->bit_depth); assert(*tok < tok_end);
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
: m->mbmi.tx_size;
pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
(*tok)++;
}
}
} }
static void write_partition(const VP10_COMMON *const cm, static void write_partition(const VP10_COMMON *const cm,
@@ -408,7 +456,7 @@ static void write_partition(const VP10_COMMON *const cm,
int hbs, int mi_row, int mi_col, int hbs, int mi_row, int mi_col,
PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) { PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) {
const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
const vpx_prob *const probs = xd->partition_probs[ctx]; const vpx_prob *const probs = cm->fc->partition_prob[ctx];
const int has_rows = (mi_row + hbs) < cm->mi_rows; const int has_rows = (mi_row + hbs) < cm->mi_rows;
const int has_cols = (mi_col + hbs) < cm->mi_cols; const int has_cols = (mi_col + hbs) < cm->mi_cols;
@@ -486,12 +534,9 @@ static void write_modes_sb(VP10_COMP *cpi,
static void write_modes(VP10_COMP *cpi, static void write_modes(VP10_COMP *cpi,
const TileInfo *const tile, vpx_writer *w, const TileInfo *const tile, vpx_writer *w,
TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) { TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
const VP10_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int mi_row, mi_col; int mi_row, mi_col;
set_partition_probs(cm, xd);
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) { mi_row += MI_BLOCK_SIZE) {
vp10_zero(xd->left_seg_context); vp10_zero(xd->left_seg_context);
@@ -714,8 +759,7 @@ static void encode_loopfilter(struct loopfilter *lf,
vpx_wb_write_bit(wb, changed); vpx_wb_write_bit(wb, changed);
if (changed) { if (changed) {
lf->last_ref_deltas[i] = delta; lf->last_ref_deltas[i] = delta;
vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6); vpx_wb_write_inv_signed_literal(wb, delta, 6);
vpx_wb_write_bit(wb, delta < 0);
} }
} }
@@ -725,8 +769,7 @@ static void encode_loopfilter(struct loopfilter *lf,
vpx_wb_write_bit(wb, changed); vpx_wb_write_bit(wb, changed);
if (changed) { if (changed) {
lf->last_mode_deltas[i] = delta; lf->last_mode_deltas[i] = delta;
vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6); vpx_wb_write_inv_signed_literal(wb, delta, 6);
vpx_wb_write_bit(wb, delta < 0);
} }
} }
} }
@@ -736,8 +779,7 @@ static void encode_loopfilter(struct loopfilter *lf,
static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) { static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
if (delta_q != 0) { if (delta_q != 0) {
vpx_wb_write_bit(wb, 1); vpx_wb_write_bit(wb, 1);
vpx_wb_write_literal(wb, abs(delta_q), 4); vpx_wb_write_inv_signed_literal(wb, delta_q, CONFIG_MISC_FIXES ? 6 : 4);
vpx_wb_write_bit(wb, delta_q < 0);
} else { } else {
vpx_wb_write_bit(wb, 0); vpx_wb_write_bit(wb, 0);
} }
@@ -756,6 +798,9 @@ static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
int i, j; int i, j;
const struct segmentation *seg = &cm->seg; const struct segmentation *seg = &cm->seg;
#if !CONFIG_MISC_FIXES
const struct segmentation_probs *segp = &cm->segp;
#endif
vpx_wb_write_bit(wb, seg->enabled); vpx_wb_write_bit(wb, seg->enabled);
if (!seg->enabled) if (!seg->enabled)
@@ -770,14 +815,16 @@ static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
if (seg->update_map) { if (seg->update_map) {
// Select the coding strategy (temporal or spatial) // Select the coding strategy (temporal or spatial)
vp10_choose_segmap_coding_method(cm, xd); vp10_choose_segmap_coding_method(cm, xd);
#if !CONFIG_MISC_FIXES
// Write out probabilities used to decode unpredicted macro-block segments // Write out probabilities used to decode unpredicted macro-block segments
for (i = 0; i < SEG_TREE_PROBS; i++) { for (i = 0; i < SEG_TREE_PROBS; i++) {
const int prob = seg->tree_probs[i]; const int prob = segp->tree_probs[i];
const int update = prob != MAX_PROB; const int update = prob != MAX_PROB;
vpx_wb_write_bit(wb, update); vpx_wb_write_bit(wb, update);
if (update) if (update)
vpx_wb_write_literal(wb, prob, 8); vpx_wb_write_literal(wb, prob, 8);
} }
#endif
// Write out the chosen coding method. // Write out the chosen coding method.
if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
@@ -785,15 +832,18 @@ static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
} else { } else {
assert(seg->temporal_update == 0); assert(seg->temporal_update == 0);
} }
#if !CONFIG_MISC_FIXES
if (seg->temporal_update) { if (seg->temporal_update) {
for (i = 0; i < PREDICTION_PROBS; i++) { for (i = 0; i < PREDICTION_PROBS; i++) {
const int prob = seg->pred_probs[i]; const int prob = segp->pred_probs[i];
const int update = prob != MAX_PROB; const int update = prob != MAX_PROB;
vpx_wb_write_bit(wb, update); vpx_wb_write_bit(wb, update);
if (update) if (update)
vpx_wb_write_literal(wb, prob, 8); vpx_wb_write_literal(wb, prob, 8);
} }
} }
#endif
} }
// Segmentation data // Segmentation data
@@ -821,14 +871,45 @@ static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
} }
} }
static void encode_txfm_probs(VP10_COMMON *cm, vpx_writer *w, #if CONFIG_MISC_FIXES
FRAME_COUNTS *counts) { static void update_seg_probs(VP10_COMP *cpi, vpx_writer *w) {
// Mode VP10_COMMON *cm = &cpi->common;
vpx_write_literal(w, VPXMIN(cm->tx_mode, ALLOW_32X32), 2);
if (cm->tx_mode >= ALLOW_32X32) if (!cpi->common.seg.enabled)
vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT); return;
if (cpi->common.seg.temporal_update) {
int i;
for (i = 0; i < PREDICTION_PROBS; i++)
vp10_cond_prob_diff_update(w, &cm->fc->seg.pred_probs[i],
cm->counts.seg.pred[i]);
prob_diff_update(vp10_segment_tree, cm->fc->seg.tree_probs,
cm->counts.seg.tree_mispred, MAX_SEGMENTS, w);
} else {
prob_diff_update(vp10_segment_tree, cm->fc->seg.tree_probs,
cm->counts.seg.tree_total, MAX_SEGMENTS, w);
}
}
static void write_txfm_mode(TX_MODE mode, struct vpx_write_bit_buffer *wb) {
vpx_wb_write_bit(wb, mode == TX_MODE_SELECT);
if (mode != TX_MODE_SELECT)
vpx_wb_write_literal(wb, mode, 2);
}
#else
static void write_txfm_mode(TX_MODE mode, struct vpx_writer *wb) {
vpx_write_literal(wb, VPXMIN(mode, ALLOW_32X32), 2);
if (mode >= ALLOW_32X32)
vpx_write_bit(wb, mode == TX_MODE_SELECT);
}
#endif
static void update_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
FRAME_COUNTS *counts) {
// Probabilities
if (cm->tx_mode == TX_MODE_SELECT) { if (cm->tx_mode == TX_MODE_SELECT) {
int i, j; int i, j;
unsigned int ct_8x8p[TX_SIZES - 3][2]; unsigned int ct_8x8p[TX_SIZES - 3][2];
@@ -933,7 +1014,8 @@ static int get_refresh_mask(VP10_COMP *cpi) {
} }
} }
static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr) { static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr,
unsigned int *max_tile_sz) {
VP10_COMMON *const cm = &cpi->common; VP10_COMMON *const cm = &cpi->common;
vpx_writer residual_bc; vpx_writer residual_bc;
int tile_row, tile_col; int tile_row, tile_col;
@@ -941,6 +1023,7 @@ static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr) {
size_t total_size = 0; size_t total_size = 0;
const int tile_cols = 1 << cm->log2_tile_cols; const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows; const int tile_rows = 1 << cm->log2_tile_rows;
unsigned int max_tile = 0;
memset(cm->above_seg_context, 0, memset(cm->above_seg_context, 0,
sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
@@ -963,26 +1046,32 @@ static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr) {
assert(tok == tok_end); assert(tok == tok_end);
vpx_stop_encode(&residual_bc); vpx_stop_encode(&residual_bc);
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
unsigned int tile_sz;
// size of this tile // size of this tile
mem_put_be32(data_ptr + total_size, residual_bc.pos); assert(residual_bc.pos > 0);
tile_sz = residual_bc.pos - CONFIG_MISC_FIXES;
mem_put_le32(data_ptr + total_size, tile_sz);
max_tile = max_tile > tile_sz ? max_tile : tile_sz;
total_size += 4; total_size += 4;
} }
total_size += residual_bc.pos; total_size += residual_bc.pos;
} }
} }
*max_tile_sz = max_tile;
return total_size; return total_size;
} }
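A minimal sketch (not part of the patch) of the tile-size field as encode_tiles() now writes it: the per-tile size is stored little-endian, and with CONFIG_MISC_FIXES it is stored minus one (the assert above guarantees residual_bc.pos > 0). The function name is hypothetical; the byte stores are equivalent to mem_put_le32().

#include <stdint.h>

static void sketch_put_tile_size(uint8_t *dst, unsigned int tile_sz_bytes) {
  const unsigned int coded = tile_sz_bytes - 1;  /* residual_bc.pos - 1 */
  dst[0] = (uint8_t)(coded & 0xff);
  dst[1] = (uint8_t)((coded >> 8) & 0xff);
  dst[2] = (uint8_t)((coded >> 16) & 0xff);
  dst[3] = (uint8_t)((coded >> 24) & 0xff);
}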
static void write_display_size(const VP10_COMMON *cm, static void write_render_size(const VP10_COMMON *cm,
struct vpx_write_bit_buffer *wb) { struct vpx_write_bit_buffer *wb) {
const int scaling_active = cm->width != cm->display_width || const int scaling_active = cm->width != cm->render_width ||
cm->height != cm->display_height; cm->height != cm->render_height;
vpx_wb_write_bit(wb, scaling_active); vpx_wb_write_bit(wb, scaling_active);
if (scaling_active) { if (scaling_active) {
vpx_wb_write_literal(wb, cm->display_width - 1, 16); vpx_wb_write_literal(wb, cm->render_width - 1, 16);
vpx_wb_write_literal(wb, cm->display_height - 1, 16); vpx_wb_write_literal(wb, cm->render_height - 1, 16);
} }
} }
@@ -991,7 +1080,7 @@ static void write_frame_size(const VP10_COMMON *cm,
vpx_wb_write_literal(wb, cm->width - 1, 16); vpx_wb_write_literal(wb, cm->width - 1, 16);
vpx_wb_write_literal(wb, cm->height - 1, 16); vpx_wb_write_literal(wb, cm->height - 1, 16);
write_display_size(cm, wb); write_render_size(cm, wb);
} }
static void write_frame_size_with_refs(VP10_COMP *cpi, static void write_frame_size_with_refs(VP10_COMP *cpi,
@@ -1006,6 +1095,10 @@ static void write_frame_size_with_refs(VP10_COMP *cpi,
if (cfg != NULL) { if (cfg != NULL) {
found = cm->width == cfg->y_crop_width && found = cm->width == cfg->y_crop_width &&
cm->height == cfg->y_crop_height; cm->height == cfg->y_crop_height;
#if CONFIG_MISC_FIXES
found &= cm->render_width == cfg->render_width &&
cm->render_height == cfg->render_height;
#endif
} }
vpx_wb_write_bit(wb, found); vpx_wb_write_bit(wb, found);
if (found) { if (found) {
@@ -1016,9 +1109,15 @@ static void write_frame_size_with_refs(VP10_COMP *cpi,
if (!found) { if (!found) {
vpx_wb_write_literal(wb, cm->width - 1, 16); vpx_wb_write_literal(wb, cm->width - 1, 16);
vpx_wb_write_literal(wb, cm->height - 1, 16); vpx_wb_write_literal(wb, cm->height - 1, 16);
#if CONFIG_MISC_FIXES
write_render_size(cm, wb);
#endif
} }
write_display_size(cm, wb); #if !CONFIG_MISC_FIXES
write_render_size(cm, wb);
#endif
} }
static void write_sync_code(struct vpx_write_bit_buffer *wb) { static void write_sync_code(struct vpx_write_bit_buffer *wb) {
@@ -1055,7 +1154,8 @@ static void write_bitdepth_colorspace_sampling(
} }
vpx_wb_write_literal(wb, cm->color_space, 3); vpx_wb_write_literal(wb, cm->color_space, 3);
if (cm->color_space != VPX_CS_SRGB) { if (cm->color_space != VPX_CS_SRGB) {
vpx_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
vpx_wb_write_bit(wb, cm->color_range);
if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
vpx_wb_write_bit(wb, cm->subsampling_x); vpx_wb_write_bit(wb, cm->subsampling_x);
@@ -1092,16 +1192,37 @@ static void write_uncompressed_header(VP10_COMP *cpi,
if (!cm->show_frame) if (!cm->show_frame)
vpx_wb_write_bit(wb, cm->intra_only); vpx_wb_write_bit(wb, cm->intra_only);
if (!cm->error_resilient_mode) if (!cm->error_resilient_mode) {
vpx_wb_write_literal(wb, cm->reset_frame_context, 2); #if CONFIG_MISC_FIXES
if (cm->intra_only) {
vpx_wb_write_bit(wb,
cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
} else {
vpx_wb_write_bit(wb,
cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
vpx_wb_write_bit(wb,
cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
}
#else
static const int reset_frame_context_conv_tbl[3] = { 0, 2, 3 };
vpx_wb_write_literal(wb,
reset_frame_context_conv_tbl[cm->reset_frame_context], 2);
#endif
}
if (cm->intra_only) { if (cm->intra_only) {
write_sync_code(wb); write_sync_code(wb);
#if CONFIG_MISC_FIXES
write_bitdepth_colorspace_sampling(cm, wb);
#else
// Note for profile 0, 420 8bpp is assumed. // Note for profile 0, 420 8bpp is assumed.
if (cm->profile > PROFILE_0) { if (cm->profile > PROFILE_0) {
write_bitdepth_colorspace_sampling(cm, wb); write_bitdepth_colorspace_sampling(cm, wb);
} }
#endif
vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
write_frame_size(cm, wb); write_frame_size(cm, wb);
@@ -1125,8 +1246,13 @@ static void write_uncompressed_header(VP10_COMP *cpi,
} }
if (!cm->error_resilient_mode) { if (!cm->error_resilient_mode) {
vpx_wb_write_bit(wb, cm->refresh_frame_context); vpx_wb_write_bit(wb,
vpx_wb_write_bit(wb, cm->frame_parallel_decoding_mode); cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF);
#if CONFIG_MISC_FIXES
if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
#endif
vpx_wb_write_bit(wb, cm->refresh_frame_context !=
REFRESH_FRAME_CONTEXT_BACKWARD);
} }
vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2); vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
@@ -1134,30 +1260,69 @@ static void write_uncompressed_header(VP10_COMP *cpi,
encode_loopfilter(&cm->lf, wb);
encode_quantization(cm, wb);
encode_segmentation(cm, xd, wb);
#if CONFIG_MISC_FIXES
if (!cm->seg.enabled && xd->lossless[0])
cm->tx_mode = TX_4X4;
else
write_txfm_mode(cm->tx_mode, wb);
if (cpi->allow_comp_inter_inter) {
const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
vpx_wb_write_bit(wb, use_hybrid_pred);
if (!use_hybrid_pred)
vpx_wb_write_bit(wb, use_compound_pred);
}
#endif
write_tile_info(cm, wb);
}
static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
VP10_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
FRAME_CONTEXT *const fc = cm->fc;
FRAME_COUNTS *counts = cpi->td.counts;
vpx_writer header_bc;
int i;
#if CONFIG_MISC_FIXES
int j;
#endif
vpx_start_encode(&header_bc, data);
#if !CONFIG_MISC_FIXES
if (cpi->td.mb.e_mbd.lossless[0]) {
cm->tx_mode = TX_4X4;
} else {
write_txfm_mode(cm->tx_mode, &header_bc);
update_txfm_probs(cm, &header_bc, counts);
}
#else
update_txfm_probs(cm, &header_bc, counts);
#endif
update_coef_probs(cpi, &header_bc);
update_skip_probs(cm, &header_bc, counts);
#if CONFIG_MISC_FIXES
update_seg_probs(cpi, &header_bc);
for (i = 0; i < INTRA_MODES; ++i)
prob_diff_update(vp10_intra_mode_tree, fc->uv_mode_prob[i],
counts->uv_mode[i], INTRA_MODES, &header_bc);
for (i = 0; i < PARTITION_CONTEXTS; ++i)
prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
counts->partition[i], PARTITION_TYPES, &header_bc);
#endif
if (frame_is_intra_only(cm)) {
vp10_copy(cm->kf_y_prob, vp10_kf_y_mode_prob);
#if CONFIG_MISC_FIXES
for (i = 0; i < INTRA_MODES; ++i)
for (j = 0; j < INTRA_MODES; ++j)
prob_diff_update(vp10_intra_mode_tree, cm->kf_y_prob[i][j],
counts->kf_y_mode[i][j], INTRA_MODES, &header_bc);
#endif
} else {
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
prob_diff_update(vp10_inter_mode_tree, cm->fc->inter_mode_probs[i],
counts->inter_mode[i], INTER_MODES, &header_bc);
@@ -1170,8 +1335,9 @@ static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
counts->intra_inter[i]);
if (cpi->allow_comp_inter_inter) {
const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
#if !CONFIG_MISC_FIXES
const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
vpx_write_bit(&header_bc, use_compound_pred);
if (use_compound_pred) {
@@ -1181,6 +1347,12 @@ static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
vp10_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
counts->comp_inter[i]);
}
#else
if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp10_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
counts->comp_inter[i]);
#endif
}
if (cm->reference_mode != COMPOUND_REFERENCE) {
@@ -1201,9 +1373,11 @@ static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
prob_diff_update(vp10_intra_mode_tree, cm->fc->y_mode_prob[i],
counts->y_mode[i], INTRA_MODES, &header_bc);
#if !CONFIG_MISC_FIXES
for (i = 0; i < PARTITION_CONTEXTS; ++i)
prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
counts->partition[i], PARTITION_TYPES, &header_bc);
#endif
vp10_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
&counts->mv);
@@ -1215,15 +1389,67 @@ static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
return header_bc.pos;
}
#if CONFIG_MISC_FIXES
static int remux_tiles(uint8_t *dest, const int sz,
const int n_tiles, const int mag) {
int rpos = 0, wpos = 0, n;
for (n = 0; n < n_tiles; n++) {
int tile_sz;
if (n == n_tiles - 1) {
tile_sz = sz - rpos;
} else {
tile_sz = mem_get_le32(&dest[rpos]) + 1;
rpos += 4;
switch (mag) {
case 0:
dest[wpos] = tile_sz - 1;
break;
case 1:
mem_put_le16(&dest[wpos], tile_sz - 1);
break;
case 2:
mem_put_le24(&dest[wpos], tile_sz - 1);
break;
case 3: // remuxing should only happen if mag < 3
default:
assert("Invalid value for tile size magnitude" && 0);
}
wpos += mag + 1;
}
memmove(&dest[wpos], &dest[rpos], tile_sz);
wpos += tile_sz;
rpos += tile_sz;
}
assert(rpos > wpos);
assert(rpos == sz);
return wpos;
}
#endif
void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) {
uint8_t *data = dest;
size_t first_part_size, uncompressed_hdr_size, data_sz;
struct vpx_write_bit_buffer wb = {data, 0};
struct vpx_write_bit_buffer saved_wb;
unsigned int max_tile;
#if CONFIG_MISC_FIXES
VP10_COMMON *const cm = &cpi->common;
const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
const int have_tiles = n_log2_tiles > 0;
#else
const int have_tiles = 0; // we have tiles, but we don't want to write a
// tile size marker in the header
#endif
write_uncompressed_header(cpi, &wb);
saved_wb = wb;
// don't know in advance first part. size
vpx_wb_write_literal(&wb, 0, 16 + have_tiles * 2);
uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
data += uncompressed_hdr_size;
@@ -1232,10 +1458,32 @@ void vp10_pack_bitstream(VP10_COMP *cpi, uint8_t *dest, size_t *size) {
first_part_size = write_compressed_header(cpi, data);
data += first_part_size;
data_sz = encode_tiles(cpi, data, &max_tile);
#if CONFIG_MISC_FIXES
if (max_tile > 0) {
int mag;
unsigned int mask;
// Choose the (tile size) magnitude
for (mag = 0, mask = 0xff; mag < 4; mag++) {
if (max_tile <= mask)
break;
mask <<= 8;
mask |= 0xff;
}
assert(n_log2_tiles > 0);
vpx_wb_write_literal(&saved_wb, mag, 2);
if (mag < 3)
data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
} else {
assert(n_log2_tiles == 0);
}
#endif
data += data_sz;
// TODO(jbb): Figure out what to do if first_part_size > 16 bits.
vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
*size = data - dest;
}
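A note on the tile packing above: encode_tiles() initially records each tile's length in 4 bytes, the mag/mask loop then picks the smallest byte width that still holds the largest tile, and remux_tiles() rewrites the size fields with mag + 1 bytes (only when mag < 3, since 4-byte fields are already in place); the 2-bit mag written back into saved_wb tells the decoder how wide each tile-size field is. Below is a minimal standalone sketch of the magnitude choice; the helper name and sample values are illustrative only, not part of the library.

#include <assert.h>
#include <stdio.h>

/* Illustrative helper mirroring the mag/mask loop in vp10_pack_bitstream:
 * pick how many bytes (mag + 1) are needed to store the largest tile size. */
static int choose_tile_size_mag(unsigned int max_tile) {
  int mag;
  unsigned int mask = 0xff;
  for (mag = 0; mag < 4; mag++) {
    if (max_tile <= mask) break;
    mask = (mask << 8) | 0xff;
  }
  return mag;  /* 0..3, i.e. tile sizes stored in 1..4 bytes */
}

int main(void) {
  assert(choose_tile_size_mag(200) == 0);    /* fits in one byte */
  assert(choose_tile_size_mag(70000) == 2);  /* needs three bytes */
  printf("mag for a 70000-byte largest tile: %d\n", choose_tile_size_mag(70000));
  return 0;
}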


@@ -18,7 +18,7 @@ extern "C" {
#include "vp10/encoder/encoder.h" #include "vp10/encoder/encoder.h"
void vp10_pack_bitstream(VP10_COMP *cpi, uint8_t *dest, size_t *size); void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size);
static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) { static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) {
return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&


@@ -58,7 +58,6 @@ struct macroblock {
MACROBLOCKD e_mbd;
MB_MODE_INFO_EXT *mbmi_ext;
MB_MODE_INFO_EXT *mbmi_ext_base;
int skip_block;
int select_tx_size;
int skip_recode;
@@ -71,6 +70,8 @@ struct macroblock {
int rddiv;
int rdmult;
int mb_energy;
int * m_search_count_ptr;
int * ex_search_count_ptr;
// These are set to their default values at the beginning, and then adjusted
// further in the encoding process.
@@ -115,7 +116,6 @@ struct macroblock {
// indicate if it is in the rd search loop or encoding process
int use_lp32x32fdct;
int skip_encode;
// use fast quantization process
int quant_fp;
@@ -134,13 +134,6 @@ struct macroblock {
// Strong color activity detection. Used in RTC coding mode to enhance
// the visual quality at the boundary of moving color objects.
uint8_t color_sensitivity[2];
void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd);
#endif
};
#ifdef __cplusplus


@@ -30,13 +30,13 @@ static void alloc_mode_context(VP10_COMMON *cm, int num_4x4_blk,
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
@@ -61,6 +61,11 @@ static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
ctx->eobs[i][k] = 0;
}
}
for (i = 0; i < 2; ++i) {
vpx_free(ctx->color_index_map[i]);
ctx->color_index_map[i] = 0;
}
}
static void alloc_tree_contexts(VP10_COMMON *cm, PC_TREE *tree,


@@ -14,6 +14,10 @@
#include "vp10/common/blockd.h" #include "vp10/common/blockd.h"
#include "vp10/encoder/block.h" #include "vp10/encoder/block.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP10_COMP;
struct VP10Common;
struct ThreadData;
@@ -23,6 +27,7 @@ typedef struct {
MODE_INFO mic;
MB_MODE_INFO_EXT mbmi_ext;
uint8_t *zcoeff_blk;
uint8_t *color_index_map[2];
tran_low_t *coeff[MAX_MB_PLANE][3];
tran_low_t *qcoeff[MAX_MB_PLANE][3];
tran_low_t *dqcoeff[MAX_MB_PLANE][3];
@@ -84,4 +89,8 @@ typedef struct PC_TREE {
void vp10_setup_pc_tree(struct VP10Common *cm, struct ThreadData *td);
void vp10_free_pc_tree(struct ThreadData *td);
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* VP10_ENCODER_CONTEXT_TREE_H_ */


@@ -20,218 +20,711 @@
#include "vpx_dsp/fwd_txfm.h" #include "vpx_dsp/fwd_txfm.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
static INLINE void range_check(const tran_low_t *input, const int size,
const int bit) {
#if 0 // CONFIG_COEFFICIENT_RANGE_CHECKING
// TODO(angiebird): the range_check is not used because the bit range
// in fdct# is not correct. Since we are going to merge in a new version
// of fdct# from nextgenv2, we won't fix the incorrect bit range now.
int i;
for (i = 0; i < size; ++i) {
assert(abs(input[i]) < (1 << bit));
}
#else
(void)input;
(void)size;
(void)bit;
#endif
}
static void fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[4];
// stage 0
range_check(input, 4, 14);
// stage 1
output[0] = input[0] + input[3];
output[1] = input[1] + input[2];
output[2] = input[1] - input[2];
output[3] = input[0] - input[3];
range_check(output, 4, 15);
// stage 2
temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
step[0] = (tran_low_t)fdct_round_shift(temp);
temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
step[1] = (tran_low_t)fdct_round_shift(temp);
temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
step[2] = (tran_low_t)fdct_round_shift(temp);
temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
step[3] = (tran_low_t)fdct_round_shift(temp);
range_check(step, 4, 16);
// stage 3
output[0] = step[0];
output[1] = step[2];
output[2] = step[1];
output[3] = step[3];
range_check(output, 4, 16);
}
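For context on the arithmetic in these rewritten transforms: the cospi_*_64 constants and fdct_round_shift() follow the usual libvpx fixed-point convention, cosine values scaled by 2^14 (DCT_CONST_BITS) and rounded back out after each multiply stage. A small self-contained sketch of that convention follows; the helper names are stand-ins, and only the rounding rule mirrors the code above.

#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#define DCT_CONST_BITS 14  /* assumed convention: cos() scaled by 2^14 */

/* Stand-in for the cospi_n_64 tables: round(cos(n*pi/64) * 2^14). */
static int cospi_n_64(int n) {
  return (int)lrint(cos(n * M_PI / 64.0) * (1 << DCT_CONST_BITS));
}

/* Stand-in for fdct_round_shift(): add half, then shift the scale back out. */
static long long round_shift(long long x) {
  return (x + (1LL << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}

int main(void) {
  /* cospi_16_64 should come out as 11585, i.e. cos(pi/4) in Q14. */
  printf("cospi_16_64 = %d\n", cospi_n_64(16));
  /* One stage-2 butterfly from fdct4 above: (a + b) * cospi_16_64, rescaled. */
  long long a = 100, b = 23;
  printf("butterfly output = %lld\n", round_shift((a + b) * cospi_n_64(16)));
  return 0;
}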
static void fdct8(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[8];
// stage 0
range_check(input, 8, 13);
// stage 1
output[0] = input[0] + input[7];
output[1] = input[1] + input[6];
output[2] = input[2] + input[5];
output[3] = input[3] + input[4];
output[4] = input[3] - input[4];
output[5] = input[2] - input[5];
output[6] = input[1] - input[6];
output[7] = input[0] - input[7];
range_check(output, 8, 14);
// stage 2
step[0] = output[0] + output[3];
step[1] = output[1] + output[2];
step[2] = output[1] - output[2];
step[3] = output[0] - output[3];
step[4] = output[4];
temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
step[5] = (tran_low_t)fdct_round_shift(temp);
temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
step[6] = (tran_low_t)fdct_round_shift(temp);
step[7] = output[7];
range_check(step, 8, 15);
// stage 3
temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
output[0] = (tran_low_t)fdct_round_shift(temp);
temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
output[1] = (tran_low_t)fdct_round_shift(temp);
temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
output[2] = (tran_low_t)fdct_round_shift(temp);
temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
output[3] = (tran_low_t)fdct_round_shift(temp);
output[4] = step[4] + step[5];
output[5] = step[4] - step[5];
output[6] = step[7] - step[6];
output[7] = step[7] + step[6];
range_check(output, 8, 16);
// stage 4
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
step[4] = (tran_low_t)fdct_round_shift(temp);
temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
step[5] = (tran_low_t)fdct_round_shift(temp);
temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
step[6] = (tran_low_t)fdct_round_shift(temp);
temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
step[7] = (tran_low_t)fdct_round_shift(temp);
range_check(step, 8, 16);
// stage 5
output[0] = step[0];
output[1] = step[4];
output[2] = step[2];
output[3] = step[6];
output[4] = step[1];
output[5] = step[5];
output[6] = step[3];
output[7] = step[7];
range_check(output, 8, 16);
} }
static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { static void fdct16(const tran_low_t *input, tran_low_t *output) {
tran_high_t step1[8]; // canbe16 tran_high_t temp;
tran_high_t step2[8]; // canbe16 tran_low_t step[16];
tran_high_t step3[8]; // canbe16
tran_high_t input[8]; // canbe16
tran_high_t temp1, temp2; // needs32
// step 1 // stage 0
input[0] = in[0] + in[15]; range_check(input, 16, 13);
input[1] = in[1] + in[14];
input[2] = in[2] + in[13];
input[3] = in[3] + in[12];
input[4] = in[4] + in[11];
input[5] = in[5] + in[10];
input[6] = in[6] + in[ 9];
input[7] = in[7] + in[ 8];
step1[0] = in[7] - in[ 8]; // stage 1
step1[1] = in[6] - in[ 9]; output[0] = input[0] + input[15];
step1[2] = in[5] - in[10]; output[1] = input[1] + input[14];
step1[3] = in[4] - in[11]; output[2] = input[2] + input[13];
step1[4] = in[3] - in[12]; output[3] = input[3] + input[12];
step1[5] = in[2] - in[13]; output[4] = input[4] + input[11];
step1[6] = in[1] - in[14]; output[5] = input[5] + input[10];
step1[7] = in[0] - in[15]; output[6] = input[6] + input[9];
output[7] = input[7] + input[8];
output[8] = input[7] - input[8];
output[9] = input[6] - input[9];
output[10] = input[5] - input[10];
output[11] = input[4] - input[11];
output[12] = input[3] - input[12];
output[13] = input[2] - input[13];
output[14] = input[1] - input[14];
output[15] = input[0] - input[15];
// fdct8(step, step); range_check(output, 16, 14);
{
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
// stage 1 // stage 2
s0 = input[0] + input[7]; step[0] = output[0] + output[7];
s1 = input[1] + input[6]; step[1] = output[1] + output[6];
s2 = input[2] + input[5]; step[2] = output[2] + output[5];
s3 = input[3] + input[4]; step[3] = output[3] + output[4];
s4 = input[3] - input[4]; step[4] = output[3] - output[4];
s5 = input[2] - input[5]; step[5] = output[2] - output[5];
s6 = input[1] - input[6]; step[6] = output[1] - output[6];
s7 = input[0] - input[7]; step[7] = output[0] - output[7];
step[8] = output[8];
step[9] = output[9];
temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
step[10] = (tran_low_t)fdct_round_shift(temp);
temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
step[11] = (tran_low_t)fdct_round_shift(temp);
temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
step[12] = (tran_low_t)fdct_round_shift(temp);
temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
step[13] = (tran_low_t)fdct_round_shift(temp);
step[14] = output[14];
step[15] = output[15];
// fdct4(step, step); range_check(step, 16, 15);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
out[0] = (tran_low_t)fdct_round_shift(t0);
out[4] = (tran_low_t)fdct_round_shift(t2);
out[8] = (tran_low_t)fdct_round_shift(t1);
out[12] = (tran_low_t)fdct_round_shift(t3);
// Stage 2 // stage 3
t0 = (s6 - s5) * cospi_16_64; output[0] = step[0] + step[3];
t1 = (s6 + s5) * cospi_16_64; output[1] = step[1] + step[2];
t2 = fdct_round_shift(t0); output[2] = step[1] - step[2];
t3 = fdct_round_shift(t1); output[3] = step[0] - step[3];
output[4] = step[4];
temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
output[5] = (tran_low_t)fdct_round_shift(temp);
temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
output[6] = (tran_low_t)fdct_round_shift(temp);
output[7] = step[7];
output[8] = step[8] + step[11];
output[9] = step[9] + step[10];
output[10] = step[9] - step[10];
output[11] = step[8] - step[11];
output[12] = step[15] - step[12];
output[13] = step[14] - step[13];
output[14] = step[14] + step[13];
output[15] = step[15] + step[12];
// Stage 3 range_check(output, 16, 16);
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// Stage 4 // stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64; temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64; step[0] = (tran_low_t)fdct_round_shift(temp);
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; step[1] = (tran_low_t)fdct_round_shift(temp);
out[2] = (tran_low_t)fdct_round_shift(t0); temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
out[6] = (tran_low_t)fdct_round_shift(t2); step[2] = (tran_low_t)fdct_round_shift(temp);
out[10] = (tran_low_t)fdct_round_shift(t1); temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
out[14] = (tran_low_t)fdct_round_shift(t3); step[3] = (tran_low_t)fdct_round_shift(temp);
} step[4] = output[4] + output[5];
step[5] = output[4] - output[5];
step[6] = output[7] - output[6];
step[7] = output[7] + output[6];
step[8] = output[8];
temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
step[9] = (tran_low_t)fdct_round_shift(temp);
temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
step[10] = (tran_low_t)fdct_round_shift(temp);
step[11] = output[11];
step[12] = output[12];
temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
step[13] = (tran_low_t)fdct_round_shift(temp);
temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
step[14] = (tran_low_t)fdct_round_shift(temp);
step[15] = output[15];
// step 2 range_check(step, 16, 16);
temp1 = (step1[5] - step1[2]) * cospi_16_64;
temp2 = (step1[4] - step1[3]) * cospi_16_64;
step2[2] = fdct_round_shift(temp1);
step2[3] = fdct_round_shift(temp2);
temp1 = (step1[4] + step1[3]) * cospi_16_64;
temp2 = (step1[5] + step1[2]) * cospi_16_64;
step2[4] = fdct_round_shift(temp1);
step2[5] = fdct_round_shift(temp2);
// step 3 // stage 5
step3[0] = step1[0] + step2[3]; output[0] = step[0];
step3[1] = step1[1] + step2[2]; output[1] = step[1];
step3[2] = step1[1] - step2[2]; output[2] = step[2];
step3[3] = step1[0] - step2[3]; output[3] = step[3];
step3[4] = step1[7] - step2[4]; temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
step3[5] = step1[6] - step2[5]; output[4] = (tran_low_t)fdct_round_shift(temp);
step3[6] = step1[6] + step2[5]; temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
step3[7] = step1[7] + step2[4]; output[5] = (tran_low_t)fdct_round_shift(temp);
temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
output[6] = (tran_low_t)fdct_round_shift(temp);
temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
output[7] = (tran_low_t)fdct_round_shift(temp);
output[8] = step[8] + step[9];
output[9] = step[8] - step[9];
output[10] = step[11] - step[10];
output[11] = step[11] + step[10];
output[12] = step[12] + step[13];
output[13] = step[12] - step[13];
output[14] = step[15] - step[14];
output[15] = step[15] + step[14];
// step 4 range_check(output, 16, 16);
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
// step 5 // stage 6
step1[0] = step3[0] + step2[1]; step[0] = output[0];
step1[1] = step3[0] - step2[1]; step[1] = output[1];
step1[2] = step3[3] + step2[2]; step[2] = output[2];
step1[3] = step3[3] - step2[2]; step[3] = output[3];
step1[4] = step3[4] - step2[5]; step[4] = output[4];
step1[5] = step3[4] + step2[5]; step[5] = output[5];
step1[6] = step3[7] - step2[6]; step[6] = output[6];
step1[7] = step3[7] + step2[6]; step[7] = output[7];
temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
step[8] = (tran_low_t)fdct_round_shift(temp);
temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
step[9] = (tran_low_t)fdct_round_shift(temp);
temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
step[10] = (tran_low_t)fdct_round_shift(temp);
temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
step[11] = (tran_low_t)fdct_round_shift(temp);
temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
step[12] = (tran_low_t)fdct_round_shift(temp);
temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
step[13] = (tran_low_t)fdct_round_shift(temp);
temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
step[14] = (tran_low_t)fdct_round_shift(temp);
temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
step[15] = (tran_low_t)fdct_round_shift(temp);
// step 6 range_check(step, 16, 16);
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
out[1] = (tran_low_t)fdct_round_shift(temp1);
out[9] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; // stage 7
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; output[0] = step[0];
out[5] = (tran_low_t)fdct_round_shift(temp1); output[1] = step[8];
out[13] = (tran_low_t)fdct_round_shift(temp2); output[2] = step[4];
output[3] = step[12];
output[4] = step[2];
output[5] = step[10];
output[6] = step[6];
output[7] = step[14];
output[8] = step[1];
output[9] = step[9];
output[10] = step[5];
output[11] = step[13];
output[12] = step[3];
output[13] = step[11];
output[14] = step[7];
output[15] = step[15];
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; range_check(output, 16, 16);
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
out[3] = (tran_low_t)fdct_round_shift(temp1);
out[11] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
out[7] = (tran_low_t)fdct_round_shift(temp1);
out[15] = (tran_low_t)fdct_round_shift(temp2);
} }
/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
static void fdct32(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[32];
// stage 0
range_check(input, 32, 14);
// stage 1
output[0] = input[0] + input[31];
output[1] = input[1] + input[30];
output[2] = input[2] + input[29];
output[3] = input[3] + input[28];
output[4] = input[4] + input[27];
output[5] = input[5] + input[26];
output[6] = input[6] + input[25];
output[7] = input[7] + input[24];
output[8] = input[8] + input[23];
output[9] = input[9] + input[22];
output[10] = input[10] + input[21];
output[11] = input[11] + input[20];
output[12] = input[12] + input[19];
output[13] = input[13] + input[18];
output[14] = input[14] + input[17];
output[15] = input[15] + input[16];
output[16] = input[15] - input[16];
output[17] = input[14] - input[17];
output[18] = input[13] - input[18];
output[19] = input[12] - input[19];
output[20] = input[11] - input[20];
output[21] = input[10] - input[21];
output[22] = input[9] - input[22];
output[23] = input[8] - input[23];
output[24] = input[7] - input[24];
output[25] = input[6] - input[25];
output[26] = input[5] - input[26];
output[27] = input[4] - input[27];
output[28] = input[3] - input[28];
output[29] = input[2] - input[29];
output[30] = input[1] - input[30];
output[31] = input[0] - input[31];
range_check(output, 32, 15);
// stage 2
step[0] = output[0] + output[15];
step[1] = output[1] + output[14];
step[2] = output[2] + output[13];
step[3] = output[3] + output[12];
step[4] = output[4] + output[11];
step[5] = output[5] + output[10];
step[6] = output[6] + output[9];
step[7] = output[7] + output[8];
step[8] = output[7] - output[8];
step[9] = output[6] - output[9];
step[10] = output[5] - output[10];
step[11] = output[4] - output[11];
step[12] = output[3] - output[12];
step[13] = output[2] - output[13];
step[14] = output[1] - output[14];
step[15] = output[0] - output[15];
step[16] = output[16];
step[17] = output[17];
step[18] = output[18];
step[19] = output[19];
temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
step[20] = (tran_low_t)fdct_round_shift(temp);
temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
step[21] = (tran_low_t)fdct_round_shift(temp);
temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
step[22] = (tran_low_t)fdct_round_shift(temp);
temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
step[23] = (tran_low_t)fdct_round_shift(temp);
temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
step[24] = (tran_low_t)fdct_round_shift(temp);
temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
step[25] = (tran_low_t)fdct_round_shift(temp);
temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
step[26] = (tran_low_t)fdct_round_shift(temp);
temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
step[27] = (tran_low_t)fdct_round_shift(temp);
step[28] = output[28];
step[29] = output[29];
step[30] = output[30];
step[31] = output[31];
range_check(step, 32, 16);
// stage 3
output[0] = step[0] + step[7];
output[1] = step[1] + step[6];
output[2] = step[2] + step[5];
output[3] = step[3] + step[4];
output[4] = step[3] - step[4];
output[5] = step[2] - step[5];
output[6] = step[1] - step[6];
output[7] = step[0] - step[7];
output[8] = step[8];
output[9] = step[9];
temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
output[10] = (tran_low_t)fdct_round_shift(temp);
temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
output[11] = (tran_low_t)fdct_round_shift(temp);
temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
output[12] = (tran_low_t)fdct_round_shift(temp);
temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
output[13] = (tran_low_t)fdct_round_shift(temp);
output[14] = step[14];
output[15] = step[15];
output[16] = step[16] + step[23];
output[17] = step[17] + step[22];
output[18] = step[18] + step[21];
output[19] = step[19] + step[20];
output[20] = step[19] - step[20];
output[21] = step[18] - step[21];
output[22] = step[17] - step[22];
output[23] = step[16] - step[23];
output[24] = step[31] - step[24];
output[25] = step[30] - step[25];
output[26] = step[29] - step[26];
output[27] = step[28] - step[27];
output[28] = step[28] + step[27];
output[29] = step[29] + step[26];
output[30] = step[30] + step[25];
output[31] = step[31] + step[24];
range_check(output, 32, 17);
// stage 4
step[0] = output[0] + output[3];
step[1] = output[1] + output[2];
step[2] = output[1] - output[2];
step[3] = output[0] - output[3];
step[4] = output[4];
temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
step[5] = (tran_low_t)fdct_round_shift(temp);
temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
step[6] = (tran_low_t)fdct_round_shift(temp);
step[7] = output[7];
step[8] = output[8] + output[11];
step[9] = output[9] + output[10];
step[10] = output[9] - output[10];
step[11] = output[8] - output[11];
step[12] = output[15] - output[12];
step[13] = output[14] - output[13];
step[14] = output[14] + output[13];
step[15] = output[15] + output[12];
step[16] = output[16];
step[17] = output[17];
temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
step[18] = (tran_low_t)fdct_round_shift(temp);
temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
step[19] = (tran_low_t)fdct_round_shift(temp);
temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
step[20] = (tran_low_t)fdct_round_shift(temp);
temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
step[21] = (tran_low_t)fdct_round_shift(temp);
step[22] = output[22];
step[23] = output[23];
step[24] = output[24];
step[25] = output[25];
temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
step[26] = (tran_low_t)fdct_round_shift(temp);
temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
step[27] = (tran_low_t)fdct_round_shift(temp);
temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
step[28] = (tran_low_t)fdct_round_shift(temp);
temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
step[29] = (tran_low_t)fdct_round_shift(temp);
step[30] = output[30];
step[31] = output[31];
range_check(step, 32, 18);
// stage 5
temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
output[0] = (tran_low_t)fdct_round_shift(temp);
temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
output[1] = (tran_low_t)fdct_round_shift(temp);
temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
output[2] = (tran_low_t)fdct_round_shift(temp);
temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
output[3] = (tran_low_t)fdct_round_shift(temp);
output[4] = step[4] + step[5];
output[5] = step[4] - step[5];
output[6] = step[7] - step[6];
output[7] = step[7] + step[6];
output[8] = step[8];
temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
output[9] = (tran_low_t)fdct_round_shift(temp);
temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
output[10] = (tran_low_t)fdct_round_shift(temp);
output[11] = step[11];
output[12] = step[12];
temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
output[13] = (tran_low_t)fdct_round_shift(temp);
temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
output[14] = (tran_low_t)fdct_round_shift(temp);
output[15] = step[15];
output[16] = step[16] + step[19];
output[17] = step[17] + step[18];
output[18] = step[17] - step[18];
output[19] = step[16] - step[19];
output[20] = step[23] - step[20];
output[21] = step[22] - step[21];
output[22] = step[22] + step[21];
output[23] = step[23] + step[20];
output[24] = step[24] + step[27];
output[25] = step[25] + step[26];
output[26] = step[25] - step[26];
output[27] = step[24] - step[27];
output[28] = step[31] - step[28];
output[29] = step[30] - step[29];
output[30] = step[30] + step[29];
output[31] = step[31] + step[28];
range_check(output, 32, 18);
// stage 6
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
step[4] = (tran_low_t)fdct_round_shift(temp);
temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
step[5] = (tran_low_t)fdct_round_shift(temp);
temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
step[6] = (tran_low_t)fdct_round_shift(temp);
temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
step[7] = (tran_low_t)fdct_round_shift(temp);
step[8] = output[8] + output[9];
step[9] = output[8] - output[9];
step[10] = output[11] - output[10];
step[11] = output[11] + output[10];
step[12] = output[12] + output[13];
step[13] = output[12] - output[13];
step[14] = output[15] - output[14];
step[15] = output[15] + output[14];
step[16] = output[16];
temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
step[17] = (tran_low_t)fdct_round_shift(temp);
temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
step[18] = (tran_low_t)fdct_round_shift(temp);
step[19] = output[19];
step[20] = output[20];
temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
step[21] = (tran_low_t)fdct_round_shift(temp);
temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
step[22] = (tran_low_t)fdct_round_shift(temp);
step[23] = output[23];
step[24] = output[24];
temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
step[25] = (tran_low_t)fdct_round_shift(temp);
temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
step[26] = (tran_low_t)fdct_round_shift(temp);
step[27] = output[27];
step[28] = output[28];
temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
step[29] = (tran_low_t)fdct_round_shift(temp);
temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
step[30] = (tran_low_t)fdct_round_shift(temp);
step[31] = output[31];
range_check(step, 32, 18);
// stage 7
output[0] = step[0];
output[1] = step[1];
output[2] = step[2];
output[3] = step[3];
output[4] = step[4];
output[5] = step[5];
output[6] = step[6];
output[7] = step[7];
temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
output[8] = (tran_low_t)fdct_round_shift(temp);
temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
output[9] = (tran_low_t)fdct_round_shift(temp);
temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
output[10] = (tran_low_t)fdct_round_shift(temp);
temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
output[11] = (tran_low_t)fdct_round_shift(temp);
temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
output[12] = (tran_low_t)fdct_round_shift(temp);
temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
output[13] = (tran_low_t)fdct_round_shift(temp);
temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
output[14] = (tran_low_t)fdct_round_shift(temp);
temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
output[15] = (tran_low_t)fdct_round_shift(temp);
output[16] = step[16] + step[17];
output[17] = step[16] - step[17];
output[18] = step[19] - step[18];
output[19] = step[19] + step[18];
output[20] = step[20] + step[21];
output[21] = step[20] - step[21];
output[22] = step[23] - step[22];
output[23] = step[23] + step[22];
output[24] = step[24] + step[25];
output[25] = step[24] - step[25];
output[26] = step[27] - step[26];
output[27] = step[27] + step[26];
output[28] = step[28] + step[29];
output[29] = step[28] - step[29];
output[30] = step[31] - step[30];
output[31] = step[31] + step[30];
range_check(output, 32, 18);
// stage 8
step[0] = output[0];
step[1] = output[1];
step[2] = output[2];
step[3] = output[3];
step[4] = output[4];
step[5] = output[5];
step[6] = output[6];
step[7] = output[7];
step[8] = output[8];
step[9] = output[9];
step[10] = output[10];
step[11] = output[11];
step[12] = output[12];
step[13] = output[13];
step[14] = output[14];
step[15] = output[15];
temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
step[16] = (tran_low_t)fdct_round_shift(temp);
temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
step[17] = (tran_low_t)fdct_round_shift(temp);
temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
step[18] = (tran_low_t)fdct_round_shift(temp);
temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
step[19] = (tran_low_t)fdct_round_shift(temp);
temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
step[20] = (tran_low_t)fdct_round_shift(temp);
temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
step[21] = (tran_low_t)fdct_round_shift(temp);
temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
step[22] = (tran_low_t)fdct_round_shift(temp);
temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
step[23] = (tran_low_t)fdct_round_shift(temp);
temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
step[24] = (tran_low_t)fdct_round_shift(temp);
temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
step[25] = (tran_low_t)fdct_round_shift(temp);
temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
step[26] = (tran_low_t)fdct_round_shift(temp);
temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
step[27] = (tran_low_t)fdct_round_shift(temp);
temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
step[28] = (tran_low_t)fdct_round_shift(temp);
temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
step[29] = (tran_low_t)fdct_round_shift(temp);
temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
step[30] = (tran_low_t)fdct_round_shift(temp);
temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
step[31] = (tran_low_t)fdct_round_shift(temp);
range_check(step, 32, 18);
// stage 9
output[0] = step[0];
output[1] = step[16];
output[2] = step[8];
output[3] = step[24];
output[4] = step[4];
output[5] = step[20];
output[6] = step[12];
output[7] = step[28];
output[8] = step[2];
output[9] = step[18];
output[10] = step[10];
output[11] = step[26];
output[12] = step[6];
output[13] = step[22];
output[14] = step[14];
output[15] = step[30];
output[16] = step[1];
output[17] = step[17];
output[18] = step[9];
output[19] = step[25];
output[20] = step[5];
output[21] = step[21];
output[22] = step[13];
output[23] = step[29];
output[24] = step[3];
output[25] = step[19];
output[26] = step[11];
output[27] = step[27];
output[28] = step[7];
output[29] = step[23];
output[30] = step[15];
output[31] = step[31];
range_check(output, 32, 18);
}
*/
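The refactor above trades the grouped butterfly formulation for explicit per-stage arrays guarded by range_check(), so the nominal bit growth of each stage (14, then 15, then 16 bits in the 4-point case) stays documented even while the asserts are compiled out. A minimal sketch of that guard pattern follows; the names are stand-ins, and the bound |x| < 2^bit is taken from the disabled assert in range_check().

#include <assert.h>
#include <stdlib.h>

typedef int tran_low_t_demo;  /* stand-in for the library's tran_low_t */

/* Same shape as the range_check() added above: every intermediate value
 * must fit the stage's nominal bit width, i.e. |input[i]| < 2^bit. */
static void range_check_demo(const tran_low_t_demo *input, int size, int bit) {
  int i;
  for (i = 0; i < size; ++i) assert(abs(input[i]) < (1 << bit));
}

int main(void) {
  /* fdct4 above declares 14 bits on input, 15 after stage 1, 16 after stage 2. */
  tran_low_t_demo in[4] = { 8191, -8191, 4000, -4000 };
  range_check_demo(in, 4, 14);  /* passes: all values are within the 14-bit bound */
  return 0;
}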
static void fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -607,19 +1100,19 @@ void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
// stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

Some files were not shown because too many files have changed in this diff.