Hide global symbols for macho32/64

Added hiding global symbols for macho32 and macho64 in x86inc.asm. This was done to fix exported symbol issue in Chrome build. Change-Id: I08d5c559b985b82f655b537469fee125615e78c0
Correct handling of show_bit in uncompressed header.
2013-10-30 13:12:15 -07:00 · 2013-10-29 13:59:35 -07:00 · 2013-10-29 11:34:19 -07:00 · 2013-10-29 10:28:25 -07:00 · 2013-10-29 10:28:25 -07:00 · 2013-10-29 10:28:25 -07:00
266 changed files with 31447 additions and 19894 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.a
 *.asm.s
 *.d
+*.gcno
+*.gcda
 *.o
 *~
 /*.ivf
@@ -14,7 +16,7 @@
 /.install-*
 /.libs
 /Makefile
-/config.err
+/config.log
 /config.mk
 /decode_to_md5
 /decode_to_md5.c
--- a/36
+++ b/36
@@ -1,7 +1,7 @@
 vpx Multi-Format Codec SDK
-README - 21 June 2012
+README - 1 August 2013

-Welcome to the WebM VP8 Codec SDK!
+Welcome to the WebM VP8/VP9 Codec SDK!

 COMPILING THE APPLICATIONS/LIBRARIES:
  The build system used is similar to autotools. Building generally consists of
@@ -53,33 +53,63 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv5te-android-gcc
    armv5te-linux-rvct
    armv5te-linux-gcc
+    armv5te-none-rvct
    armv6-darwin-gcc
    armv6-linux-rvct
    armv6-linux-gcc
+    armv6-none-rvct
    armv7-android-gcc
+    armv7-darwin-gcc
    armv7-linux-rvct
    armv7-linux-gcc
+    armv7-none-rvct
+    armv7-win32-vs11
    mips32-linux-gcc
    ppc32-darwin8-gcc
    ppc32-darwin9-gcc
+    ppc32-linux-gcc
    ppc64-darwin8-gcc
    ppc64-darwin9-gcc
    ppc64-linux-gcc
+    sparc-solaris-gcc
+    x86-android-gcc
    x86-darwin8-gcc
    x86-darwin8-icc
    x86-darwin9-gcc
    x86-darwin9-icc
+    x86-darwin10-gcc
+    x86-darwin11-gcc
+    x86-darwin12-gcc
+    x86-darwin13-gcc
    x86-linux-gcc
    x86-linux-icc
+    x86-os2-gcc
    x86-solaris-gcc
+    x86-win32-gcc
    x86-win32-vs7
    x86-win32-vs8
+    x86-win32-vs9
+    x86-win32-vs10
+    x86-win32-vs11
    x86_64-darwin9-gcc
+    x86_64-darwin10-gcc
+    x86_64-darwin11-gcc
+    x86_64-darwin12-gcc
+    x86_64-darwin13-gcc
    x86_64-linux-gcc
+    x86_64-linux-icc
    x86_64-solaris-gcc
+    x86_64-win64-gcc
    x86_64-win64-vs8
+    x86_64-win64-vs9
+    x86_64-win64-vs10
+    x86_64-win64-vs11
    universal-darwin8-gcc
    universal-darwin9-gcc
+    universal-darwin10-gcc
+    universal-darwin11-gcc
+    universal-darwin12-gcc
+    universal-darwin13-gcc
    generic-gnu

  The generic-gnu target, in conjunction with the CROSS environment variable,
@@ -97,7 +127,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:

  5. Configuration errors
  If the configuration step fails, the first step is to look in the error log.
-  This defaults to config.err. This should give a good indication of what went
+  This defaults to config.log. This should give a good indication of what went
  wrong. If not, contact us for support.

 SUPPORT
--- a/build/arm-msvs/obj_int_extract.bat
+++ b/build/arm-msvs/obj_int_extract.bat
@@ -7,18 +7,7 @@ REM   in the file PATENTS.  All contributing project authors may
 REM   be found in the AUTHORS file in the root of the source tree.
 echo on

-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/common/vp9_asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/decoder/vp9_asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/encoder/vp9_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
-obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
-obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
-
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/common/vp8_asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/decoder/vp8_asm_dec_offsets.c"
 cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
-obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
 obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"

 cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c"
--- a/build/make/armlink_adapter.sh
+++ b/build/make/armlink_adapter.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
@@ -13,20 +13,20 @@
 verbose=0
 set -- $*
 for i; do
-    if [ "$i" == "-o" ]; then
+    if [ "$i" = "-o" ]; then
        on_of=1
-    elif [ "$i" == "-v" ]; then
+    elif [ "$i" = "-v" ]; then
        verbose=1
-    elif [ "$i" == "-g" ]; then
+    elif [ "$i" = "-g" ]; then
        args="${args} --debug"
-    elif [ "$on_of" == "1" ]; then
+    elif [ "$on_of" = "1" ]; then
        outfile=$i
        on_of=0
    elif [ -f "$i" ]; then
        infiles="$infiles $i"
-    elif [ "${i:0:2}" == "-l" ]; then
+    elif [ "${i#-l}" != "$i" ]; then
        libs="$libs ${i#-l}"
-    elif [ "${i:0:2}" == "-L" ]; then
+    elif [ "${i#-L}" != "$i" ]; then
        libpaths="${libpaths} ${i#-L}"
    else
        args="${args} ${i}"
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  configure.sh
 ##
@@ -75,7 +75,7 @@ Options:

 Build options:
  --help                      print this message
-  --log=yes|no|FILE           file configure log is written to [config.err]
+  --log=yes|no|FILE           file configure log is written to [config.log]
  --target=TARGET             target platform tuple [generic-gnu]
  --cpu=CPU                   optimize for a specific cpu rather than a family
  --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
@@ -198,11 +198,11 @@ add_extralibs() {
 #
 # Boolean Manipulation Functions
 #
-enable(){
+enable_feature(){
    set_all yes $*
 }

-disable(){
+disable_feature(){
    set_all no $*
 }

@@ -219,7 +219,7 @@ soft_enable() {
    for var in $*; do
        if ! disabled $var; then
            log_echo "  enabling $var"
-            enable $var
+            enable_feature $var
        fi
    done
 }
@@ -228,7 +228,7 @@ soft_disable() {
    for var in $*; do
        if ! enabled $var; then
            log_echo "  disabling $var"
-            disable $var
+            disable_feature $var
        fi
    done
 }
@@ -251,10 +251,10 @@ tolower(){
 # Temporary File Functions
 #
 source_path=${0%/*}
-enable source_path_used
+enable_feature source_path_used
 if test -z "$source_path" -o "$source_path" = "." ; then
    source_path="`pwd`"
-    disable source_path_used
+    disable_feature source_path_used
 fi

 if test ! -z "$TMPDIR" ; then
@@ -264,12 +264,13 @@ elif test ! -z "$TEMPDIR" ; then
 else
    TMPDIRx="/tmp"
 fi
-TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
-TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
-TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
-TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
-TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
-TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"
+RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
+TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
+TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc"
+TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o"
+TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
+TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"

 clean_temp_files() {
    rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
@@ -316,8 +317,8 @@ check_header(){
    header=$1
    shift
    var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-    disable $var
-    check_cpp "$@" <<EOF && enable $var
+    disable_feature $var
+    check_cpp "$@" <<EOF && enable_feature $var
 #include "$header"
 int x;
 EOF
@@ -479,7 +480,7 @@ process_common_cmdline() {
    for opt in "$@"; do
        optval="${opt#*=}"
        case "$opt" in
-        --child) enable child
+        --child) enable_feature child
        ;;
        --log*)
        logging="$optval"
@@ -491,7 +492,7 @@ process_common_cmdline() {
        ;;
        --target=*) toolchain="${toolchain:-${optval}}"
        ;;
-        --force-target=*) toolchain="${toolchain:-${optval}}"; enable force_toolchain
+        --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
        ;;
        --cpu)
        ;;
@@ -511,7 +512,7 @@ process_common_cmdline() {
          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
            die_unknown $opt
        fi
-        $action $option
+        ${action}_feature $option
        ;;
        --require-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
@@ -523,11 +524,11 @@ process_common_cmdline() {
        ;;
        --force-enable-?*|--force-disable-?*)
        eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
-        $action $option
+        ${action}_feature $option
        ;;
        --libc=*)
        [ -d "${optval}" ] || die "Not a directory: ${optval}"
-        disable builtin_libc
+        disable_feature builtin_libc
        alt_libc="${optval}"
        ;;
        --as=*)
@@ -653,6 +654,10 @@ process_common_toolchain() {
                tgt_isa=x86_64
                tgt_os=darwin12
                ;;
+            *darwin13*)
+                tgt_isa=x86_64
+                tgt_os=darwin13
+                ;;
            x86_64*mingw32*)
                tgt_os=win64
                ;;
@@ -692,13 +697,13 @@ process_common_toolchain() {

    # Mark the specific ISA requested as enabled
    soft_enable ${tgt_isa}
-    enable ${tgt_os}
-    enable ${tgt_cc}
+    enable_feature ${tgt_os}
+    enable_feature ${tgt_cc}

    # Enable the architecture family
    case ${tgt_isa} in
-        arm*) enable arm;;
-        mips*) enable mips;;
+        arm*) enable_feature arm;;
+        mips*) enable_feature mips;;
    esac

    # PIC is probably what we want when building shared libs
@@ -751,13 +756,17 @@ process_common_toolchain() {
            add_cflags  "-mmacosx-version-min=10.8"
            add_ldflags "-mmacosx-version-min=10.8"
            ;;
+        *-darwin13-*)
+            add_cflags  "-mmacosx-version-min=10.9"
+            add_ldflags "-mmacosx-version-min=10.9"
+            ;;
    esac

    # Handle Solaris variants. Solaris 10 needs -lposix4
    case ${toolchain} in
        sparc-solaris-*)
            add_extralibs -lposix4
-            disable fast_unaligned
+            disable_feature fast_unaligned
            ;;
        *-solaris-*)
            add_extralibs -lposix4
@@ -782,7 +791,7 @@ process_common_toolchain() {
            ;;
        armv5te)
            soft_enable edsp
-            disable fast_unaligned
+            disable_feature fast_unaligned
            ;;
        esac

@@ -797,7 +806,7 @@ process_common_toolchain() {
            arch_int=${arch_int%%te}
            check_add_asflags --defsym ARCHITECTURE=${arch_int}
            tune_cflags="-mtune="
-            if [ ${tgt_isa} == "armv7" ]; then
+            if [ ${tgt_isa} = "armv7" ]; then
                if [ -z "${float_abi}" ]; then
                    check_cpp <<EOF && float_abi=hard || float_abi=softfp
 #ifndef __ARM_PCS_VFP
@@ -834,8 +843,8 @@ EOF
            asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
            AS_SFX=.s
            msvs_arch_dir=arm-msvs
-            disable multithread
-            disable unit_tests
+            disable_feature multithread
+            disable_feature unit_tests
            ;;
        rvct)
            CC=armcc
@@ -847,7 +856,7 @@ EOF
            tune_cflags="--cpu="
            tune_asflags="--cpu="
            if [ -z "${tune_cpu}" ]; then
-                if [ ${tgt_isa} == "armv7" ]; then
+                if [ ${tgt_isa} = "armv7" ]; then
                    if enabled neon
                    then
                        check_add_cflags --fpu=softvfp+vfpv3
@@ -872,8 +881,8 @@ EOF

        case ${tgt_os} in
        none*)
-            disable multithread
-            disable os_support
+            disable_feature multithread
+            disable_feature os_support
            ;;

        android*)
@@ -905,9 +914,9 @@ EOF
            # Cortex-A8 implementations (NDK Dev Guide)
            add_ldflags "-Wl,--fix-cortex-a8"

-            enable pic
+            enable_feature pic
            soft_enable realtime_only
-            if [ ${tgt_isa} == "armv7" ]; then
+            if [ ${tgt_isa} = "armv7" ]; then
                soft_enable runtime_cpu_detect
            fi
            if enabled runtime_cpu_detect; then
@@ -961,7 +970,7 @@ EOF
         ;;

        linux*)
-            enable linux
+            enable_feature linux
            if enabled rvct; then
                # Check if we have CodeSourcery GCC in PATH. Needed for
                # libraries
@@ -992,14 +1001,14 @@ EOF
        tune_cflags="-mtune="
        if enabled dspr2; then
            check_add_cflags -mips32r2 -mdspr2
-            disable fast_unaligned
+            disable_feature fast_unaligned
        fi
        check_add_cflags -march=${tgt_isa}
        check_add_asflags -march=${tgt_isa}
        check_add_asflags -KPIC
    ;;
    ppc*)
-        enable ppc
+        enable_feature ppc
        bits=${tgt_isa##ppc}
        link_with_cc=gcc
        setup_gnu_toolchain
@@ -1147,7 +1156,7 @@ EOF
    ;;
    universal*|*-gcc|generic-gnu)
        link_with_cc=gcc
-        enable gcc
+        enable_feature gcc
    setup_gnu_toolchain
    ;;
    esac
@@ -1181,6 +1190,12 @@ EOF
        fi
    fi

+    # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
+    echo "  checking here for x86inc \"${tgt_isa}\" \"$pic\" "
+    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}"  ]; then
+      soft_enable use_x86inc
+    fi
+
    # Position Independent Code (PIC) support, for building relocatable
    # shared objects
    enabled gcc && enabled pic && check_add_cflags -fPIC
@@ -1190,14 +1205,14 @@ EOF
    enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0

    # Check for strip utility variant
-    ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip
+    ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip

    # Try to determine target endianness
    check_cc <<EOF
    unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
 EOF
    [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
-        grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian
+        grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian

    # Try to find which inline keywords are supported
    check_cc <<EOF && INLINE="inline"
@@ -1222,7 +1237,7 @@ EOF
            if enabled dspr2; then
                if enabled big_endian; then
                    echo "dspr2 optimizations are available only for little endian platforms"
-                    disable dspr2
+                    disable_feature dspr2
                fi
            fi
        ;;
@@ -1273,8 +1288,8 @@ print_config_h() {

 print_webm_license() {
    local destination=$1
-    local prefix=$2
-    local suffix=$3
+    local prefix="$2"
+    local suffix="$3"
    shift 3
    cat <<EOF > ${destination}
 ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
@@ -1295,8 +1310,8 @@ process_detect() {
    true;
 }

-enable logging
-logfile="config.err"
+enable_feature logging
+logfile="config.log"
 self=$0
 process() {
    cmdline_args="$@"
--- a/build/make/gen_asm_deps.sh
+++ b/build/make/gen_asm_deps.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -381,7 +381,7 @@ generate_vcproj() {
                            RuntimeLibrary="$debug_runtime" \
                            UsePrecompiledHeader="0" \
                            WarningLevel="3" \
-                            DebugInformationFormat="1" \
+                            DebugInformationFormat="2" \
                            $warn_64bit \

                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
@@ -395,7 +395,7 @@ generate_vcproj() {
                            RuntimeLibrary="$debug_runtime" \
                            UsePrecompiledHeader="0" \
                            WarningLevel="3" \
-                            DebugInformationFormat="1" \
+                            DebugInformationFormat="2" \
                            $warn_64bit \

                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@@ -72,10 +72,21 @@ parse_project() {
    eval "${var}_name=$name"
    eval "${var}_guid=$guid"

-    # assume that all projects have the same list of possible configurations,
-    # so overwriting old config_lists is not a problem
-    config_list=`grep -A1 '<Configuration' $file |
-        grep Name | cut -d\" -f2`
+    if [ "$sfx" = "vcproj" ]; then
+        cur_config_list=`grep -A1 '<Configuration' $file |
+            grep Name | cut -d\" -f2`
+    else
+        cur_config_list=`grep -B1 'Label="Configuration"' $file |
+            grep Condition | cut -d\' -f4`
+    fi
+    new_config_list=$(for i in $config_list $cur_config_list; do
+        echo $i
+    done | sort | uniq)
+    if [ "$config_list" != "" ] && [ "$config_list" != "$new_config_list" ]; then
+        mixed_platforms=1
+    fi
+    config_list="$new_config_list"
+    eval "${var}_config_list=\"$cur_config_list\""
    proj_list="${proj_list} ${var}"
 }

@@ -125,6 +136,11 @@ process_global() {
    indent_push
    IFS_bak=${IFS}
    IFS=$'\r'$'\n'
+    if [ "$mixed_platforms" != "" ]; then
+        config_list="
+Release|Mixed Platforms
+Debug|Mixed Platforms"
+    fi
    for config in ${config_list}; do
        echo "${indent}$config = $config"
    done
@@ -139,10 +155,17 @@ process_global() {
    indent_push
    for proj in ${proj_list}; do
        eval "local proj_guid=\${${proj}_guid}"
+        eval "local proj_config_list=\${${proj}_config_list}"
        IFS=$'\r'$'\n'
-        for config in ${config_list}; do
-            echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
-            echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"
+        for config in ${proj_config_list}; do
+            if [ "$mixed_platforms" != "" ]; then
+                local c=${config%%|*}
+                echo "${indent}${proj_guid}.${c}|Mixed Platforms.ActiveCfg = ${config}"
+                echo "${indent}${proj_guid}.${c}|Mixed Platforms.Build.0 = ${config}"
+            else
+                echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
+                echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"
+            fi

        done
        IFS=${IFS_bak}
@@ -168,9 +191,14 @@ process_makefile() {
    IFS=$'\r'$'\n'
    local TAB=$'\t'
    cat <<EOF
-found_devenv := \$(shell which devenv.com >/dev/null 2>&1 && echo yes)
+ifeq (\$(CONFIG_VS_VERSION),7)
+MSBUILD_TOOL := devenv.com
+else
+MSBUILD_TOOL := msbuild.exe
+endif
+found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes)
 .nodevenv.once:
-${TAB}@echo "  * devenv.com not found in path."
+${TAB}@echo "  * \$(MSBUILD_TOOL) not found in path."
 ${TAB}@echo "  * "
 ${TAB}@echo "  * You will have to build all configurations manually using the"
 ${TAB}@echo "  * Visual Studio IDE. To allow make to build them automatically,"
@@ -195,16 +223,17 @@ ${TAB}rm -rf "$platform"/"$config"
 ifneq (\$(found_devenv),)
  ifeq (\$(CONFIG_VS_VERSION),7)
 $nows_sln_config: $outfile
-${TAB}devenv.com $outfile -build "$config"
+${TAB}\$(MSBUILD_TOOL) $outfile -build "$config"

  else
 $nows_sln_config: $outfile
-${TAB}devenv.com $outfile -build "$sln_config"
+${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\
+${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform"

  endif
 else
 $nows_sln_config: $outfile .nodevenv.once
-${TAB}@echo "  * Skipping build of $sln_config (devenv.com not in path)."
+${TAB}@echo "  * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)."
 ${TAB}@echo "  * "
 endif

--- a/build/make/version.sh
+++ b/build/make/version.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -7,17 +7,6 @@ REM   in the file PATENTS.  All contributing project authors may
 REM   be found in the AUTHORS file in the root of the source tree.
 echo on

-cl /I "./" /I "%1" /nologo /c "%1/vp9/common/vp9_asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/vp9_asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/vp9_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
-obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
-obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
-
-cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c"
 cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
-obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
 obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"

--- a/99
+++ b/99
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  configure
 ##
@@ -38,6 +38,7 @@ Advanced options:
  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
  ${toggle_mem_tracker}           track memory usage
  ${toggle_postproc}              postprocessing
+  ${toggle_vp9_postproc}          vp9 specific postprocessing
  ${toggle_multithread}           multithreaded encoding and decoding
  ${toggle_spatial_resampling}    spatial sampling (scaling) support
  ${toggle_realtime_only}         enable this option while building for real-time encoding
@@ -115,6 +116,7 @@ all_platforms="${all_platforms} x86-darwin9-icc"
 all_platforms="${all_platforms} x86-darwin10-gcc"
 all_platforms="${all_platforms} x86-darwin11-gcc"
 all_platforms="${all_platforms} x86-darwin12-gcc"
+all_platforms="${all_platforms} x86-darwin13-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
 all_platforms="${all_platforms} x86-os2-gcc"
@@ -129,6 +131,7 @@ all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-darwin11-gcc"
 all_platforms="${all_platforms} x86_64-darwin12-gcc"
+all_platforms="${all_platforms} x86_64-darwin13-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -142,6 +145,7 @@ all_platforms="${all_platforms} universal-darwin9-gcc"
 all_platforms="${all_platforms} universal-darwin10-gcc"
 all_platforms="${all_platforms} universal-darwin11-gcc"
 all_platforms="${all_platforms} universal-darwin12-gcc"
+all_platforms="${all_platforms} universal-darwin13-gcc"
 all_platforms="${all_platforms} generic-gnu"

 # all_targets is a list of all targets that can be configured
@@ -150,7 +154,7 @@ all_targets="libs examples docs"

 # all targets available are enabled, by default.
 for t in ${all_targets}; do
-    [ -f ${source_path}/${t}.mk ] && enable ${t}
+    [ -f ${source_path}/${t}.mk ] && enable_feature ${t}
 done

 # check installed doxygen version
@@ -161,30 +165,30 @@ if [ ${doxy_major:-0} -ge 1 ]; then
    doxy_minor=${doxy_version%%.*}
    doxy_patch=${doxy_version##*.}

-    [ $doxy_major -gt 1 ] && enable doxygen
-    [ $doxy_minor -gt 5 ] && enable doxygen
-    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable doxygen
+    [ $doxy_major -gt 1 ] && enable_feature doxygen
+    [ $doxy_minor -gt 5 ] && enable_feature doxygen
+    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
 fi

 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
-enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs
-enable install_bins
-enable install_libs
+enabled doxygen && php -v >/dev/null 2>&1 && enable_feature install_docs
+enable_feature install_bins
+enable_feature install_libs

-enable static
-enable optimizations
-enable fast_unaligned #allow unaligned accesses, if supported by hw
-enable md5
-enable spatial_resampling
-enable multithread
-enable os_support
-enable temporal_denoising
+enable_feature static
+enable_feature optimizations
+enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
+enable_feature md5
+enable_feature spatial_resampling
+enable_feature multithread
+enable_feature os_support
+enable_feature temporal_denoising

-[ -d ${source_path}/../include ] && enable alt_tree_layout
+[ -d ${source_path}/../include ] && enable_feature alt_tree_layout
 for d in vp8 vp9; do
-    [ -d ${source_path}/${d} ] && disable alt_tree_layout;
+    [ -d ${source_path}/${d} ] && disable_feature alt_tree_layout;
 done

 if ! enabled alt_tree_layout; then
@@ -197,10 +201,10 @@ else
 [ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
 [ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder"
 [ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder"
-[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder
-[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder
-[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder
-[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder
+[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable_feature vp8_encoder
+[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable_feature vp8_decoder
+[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable_feature vp9_encoder
+[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable_feature vp9_decoder

 [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
 fi
@@ -247,7 +251,6 @@ EXPERIMENT_LIST="
    multiple_arf
    non420
    alpha
-    balanced_coeftree
 "
 CONFIG_LIST="
    external_build
@@ -255,6 +258,7 @@ CONFIG_LIST="
    install_bins
    install_libs
    install_srcs
+    use_x86inc
    debug
    gprof
    gcov
@@ -276,6 +280,7 @@ CONFIG_LIST="
    dc_recon
    runtime_cpu_detect
    postproc
+    vp9_postproc
    multithread
    internal_stats
    ${CODECS}
@@ -311,6 +316,7 @@ CMDLINE_SELECT="
    gprof
    gcov
    pic
+    use_x86inc
    optimizations
    ccache
    runtime_cpu_detect
@@ -329,6 +335,7 @@ CMDLINE_SELECT="
    dequant_tokens
    dc_recon
    postproc
+    vp9_postproc
    multithread
    internal_stats
    ${CODECS}
@@ -354,12 +361,12 @@ process_cmdline() {
    for opt do
        optval="${opt#*=}"
        case "$opt" in
-        --disable-codecs) for c in ${CODECS}; do disable $c; done ;;
+        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
            if enabled experimental; then
-                $action $option
+                ${action}_feature $option
            else
                log_echo "Ignoring $opt -- not in experimental mode."
            fi
@@ -380,8 +387,8 @@ post_process_cmdline() {
    # If the codec family is enabled, enable all components of that family.
    log_echo "Configuring selected codecs"
    for c in ${CODECS}; do
-        disabled ${c%%_*} && disable ${c}
-        enabled ${c%%_*} && enable ${c}
+        disabled ${c%%_*} && disable_feature ${c}
+        enabled ${c%%_*} && enable_feature ${c}
    done

    # Enable all detected codecs, if they haven't been disabled
@@ -389,12 +396,12 @@ post_process_cmdline() {

    # Enable the codec family if any component of that family is enabled
    for c in ${CODECS}; do
-        enabled $c && enable ${c%_*}
+        enabled $c && enable_feature ${c%_*}
    done

    # Set the {en,de}coders variable if any algorithm in that class is enabled
    for c in ${CODECS}; do
-        enabled ${c} && enable ${c##*_}s
+        enabled ${c} && enable_feature ${c##*_}s
    done
 }

@@ -434,7 +441,7 @@ process_targets() {
    done
    enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
    enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
-    ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
+    ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
    ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
    ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
    DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
@@ -504,13 +511,13 @@ process_detect() {
    fi
    if [ -z "$CC" ] || enabled external_build; then
        echo "Bypassing toolchain for environment detection."
-        enable external_build
+        enable_feature external_build
        check_header() {
            log fake_check_header "$@"
            header=$1
            shift
            var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-            disable $var
+            disable_feature $var
            # Headers common to all environments
            case $header in
                stdio.h)
@@ -522,7 +529,7 @@ process_detect() {
                        [ -f "${d##-I}/$header" ] && result=true && break
                    done
                    ${result:-true}
-            esac && enable $var
+            esac && enable_feature $var

            # Specialize windows and POSIX environments.
            case $toolchain in
@@ -530,7 +537,7 @@ process_detect() {
                    case $header-$toolchain in
                        stdint*-gcc) true;;
                        *) false;;
-                    esac && enable $var
+                    esac && enable_feature $var
                    ;;
                *)
                    case $header in
@@ -539,7 +546,7 @@ process_detect() {
                        sys/mman.h) true;;
                        unistd.h) true;;
                        *) false;;
-                    esac && enable $var
+                    esac && enable_feature $var
            esac
            enabled $var
        }
@@ -557,7 +564,7 @@ EOF
    check_header sys/mman.h
    check_header unistd.h # for sysconf(3) and friends.

-    check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports
+    check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
 }

 process_toolchain() {
@@ -639,14 +646,18 @@ process_toolchain() {
    # ccache only really works on gcc toolchains
    enabled gcc || soft_disable ccache
    if enabled mips; then
-        enable dequant_tokens
-        enable dc_recon
+        enable_feature dequant_tokens
+        enable_feature dc_recon
+    fi
+
+    if enabled internal_stats; then
+        enable_feature vp9_postproc
    fi

    # Enable the postbuild target if building for visual studio.
    case "$tgt_cc" in
-        vs*) enable msvs
-             enable solution
+        vs*) enable_feature msvs
+             enable_feature solution
             vs_version=${tgt_cc##vs}
             case $vs_version in
             [789])
@@ -682,6 +693,14 @@ process_toolchain() {
            # iOS/ARM builds do not work with gtest. This does not match
            # x86 targets.
        ;;
+        *-win*)
+            # Some mingw toolchains don't have pthread available by default.
+            # Treat these more like visual studio where threading in gtest
+            # would be disabled for the same reason.
+            check_cxx "$@" <<EOF && soft_enable unit_tests
+int z;
+EOF
+        ;;
        *)
            enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
 int z;
--- a/examples.mk
+++ b/examples.mk
@@ -49,6 +49,9 @@ vpxenc.DESCRIPTION           = Full featured encoder
 UTILS-$(CONFIG_VP8_ENCODER)    += vp8_scalable_patterns.c
 vp8_scalable_patterns.GUID   = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
 vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder
+UTILS-$(CONFIG_VP8_ENCODER)    += vp9_spatial_scalable_encoder.c
+vp8_scalable_patterns.GUID   = 4A38598D-627D-4505-9C7B-D4020C84100D
+vp8_scalable_patterns.DESCRIPTION = Spatial Scalable Encoder

 # Clean up old ivfenc, ivfdec binaries.
 ifeq ($(CONFIG_MSVS),yes)
--- a/libs.mk
+++ b/libs.mk
@@ -57,6 +57,13 @@ CLEAN-OBJS += $$(BUILD_PFX)$(1).h
 RTCD += $$(BUILD_PFX)$(1).h
 endef

+# x86inc.asm is not compatible with pic 32bit builds. Restrict
+# files which use it to 64bit builds or 32bit without pic
+USE_X86INC = no
+ifeq ($(CONFIG_USE_X86INC),yes)
+  USE_X86INC = yes
+endif
+
 CODEC_SRCS-yes += CHANGELOG
 CODEC_SRCS-yes += libs.mk

@@ -383,7 +390,12 @@ LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
                     $(call enabled,LIBVPX_TEST_DATA))
 libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)

-$(LIBVPX_TEST_DATA):
+libvpx_test_srcs.txt:
+	@echo "    [CREATE] $@"
+	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
+CLEAN-OBJS += libvpx_test_srcs.txt
+
+$(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
 	@echo "    [DOWNLOAD] $@"
 	$(qexec)trap 'rm -f $@' INT TERM &&\
            curl -L -o $@ $(call libvpx_test_data_url,$(@F))
@@ -443,6 +455,10 @@ else
 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
 GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS))
 GTEST_OBJS=$(call objs,$(GTEST_SRCS))
+ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS))
+# Disabling pthreads globally will cause issues on darwin and possibly elsewhere
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
+endif
 $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
 $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
 OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS)
@@ -467,7 +483,7 @@ $(foreach bin,$(LIBVPX_TEST_BINS),\
        lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
    $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
        $(LIBVPX_TEST_OBJS) \
-        -L. -lvpx -lgtest -lpthread -lm)\
+        -L. -lvpx -lgtest $(extralibs) -lm)\
        )))\
    $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\

--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef LIBVPX_TEST_ACM_RANDOM_H_
-#define LIBVPX_TEST_ACM_RANDOM_H_
+#ifndef TEST_ACM_RANDOM_H_
+#define TEST_ACM_RANDOM_H_

 #include "third_party/googletest/src/include/gtest/gtest.h"

@@ -59,4 +59,4 @@ class ACMRandom {

 }  // namespace libvpx_test

-#endif  // LIBVPX_TEST_ACM_RANDOM_H_
+#endif  // TEST_ACM_RANDOM_H_
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -33,10 +33,6 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
    altref_count_ = 0;
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
    if (video->frame() == 1) {
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -27,14 +27,10 @@ class BordersTest : public ::libvpx_test::EncoderTest,
    SetMode(GET_PARAM(1));
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
-    if ( video->frame() == 1) {
-      encoder->Control(VP8E_SET_CPUUSED, 0);
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, 1);
      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
--- a/test/clear_system_state.h
+++ b/test/clear_system_state.h
@@ -10,7 +10,7 @@
 #ifndef TEST_CLEAR_SYSTEM_STATE_H_
 #define TEST_CLEAR_SYSTEM_STATE_H_

-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 # include "vpx_ports/x86.h"
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -134,14 +134,14 @@ class VP8CodecFactory : public CodecFactory {

 const libvpx_test::VP8CodecFactory kVP8;

-#define VP8_INSTANTIATE_TEST_CASE(test, params)\
+#define VP8_INSTANTIATE_TEST_CASE(test, ...)\
  INSTANTIATE_TEST_CASE_P(VP8, test, \
      ::testing::Combine( \
          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
              &libvpx_test::kVP8)), \
-          params))
+          __VA_ARGS__))
 #else
-#define VP8_INSTANTIATE_TEST_CASE(test, params)
+#define VP8_INSTANTIATE_TEST_CASE(test, ...)
 #endif  // CONFIG_VP8


@@ -216,14 +216,14 @@ class VP9CodecFactory : public CodecFactory {

 const libvpx_test::VP9CodecFactory kVP9;

-#define VP9_INSTANTIATE_TEST_CASE(test, params)\
+#define VP9_INSTANTIATE_TEST_CASE(test, ...)\
  INSTANTIATE_TEST_CASE_P(VP9, test, \
      ::testing::Combine( \
          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
               &libvpx_test::kVP9)), \
-          params))
+          __VA_ARGS__))
 #else
-#define VP9_INSTANTIATE_TEST_CASE(test, params)
+#define VP9_INSTANTIATE_TEST_CASE(test, ...)
 #endif  // CONFIG_VP9


--- a/test/config_test.cc
+++ b/test/config_test.cc
@@ -40,10 +40,6 @@ class ConfigTest : public ::libvpx_test::EncoderTest,
    ++frame_count_out_;
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  unsigned int frame_count_in_;
  unsigned int frame_count_out_;
  unsigned int frame_count_max_;
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#include <string.h>
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -22,8 +23,8 @@ extern "C" {
 }

 namespace {
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int filter_x_stride,
                              const int16_t *filter_y, int filter_y_stride,
                              int w, int h);
@@ -187,7 +188,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {

 protected:
  static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = 128;
+  static const int kOuterBlockSize = 256;
  static const int kInputStride = kOuterBlockSize;
  static const int kOutputStride = kOuterBlockSize;
  static const int kMaxDimension = 64;
@@ -211,7 +212,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {

  virtual void SetUp() {
    UUT_ = GET_PARAM(2);
-    /* Set up guard blocks for an inner block cetered in the outer block */
+    /* Set up guard blocks for an inner block centered in the outer block */
    for (int i = 0; i < kOutputBufferSize; ++i) {
      if (IsIndexInBorder(i))
        output_[i] = 255;
@@ -224,6 +225,10 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
      input_[i] = prng.Rand8Extremes();
  }

+  void SetConstantInput(int value) {
+    memset(input_, value, kInputBufferSize);
+  }
+
  void CheckGuardBlocks() {
    for (int i = 0; i < kOutputBufferSize; ++i) {
      if (IsIndexInBorder(i))
@@ -456,45 +461,86 @@ DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
    { 128}
 };

+/* This test exercises the horizontal and vertical filter functions. */
 TEST_P(ConvolveTest, ChangeFilterWorks) {
  uint8_t* const in = input();
  uint8_t* const out = output();
+
+  /* Assume that the first input sample is at the 8/16th position. */
+  const int kInitialSubPelOffset = 8;
+
+  /* Filters are 8-tap, so the first filter tap will be applied to the pixel
+   * at position -3 with respect to the current filtering position. Since
+   * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8,
+   * which is non-zero only in the last tap. So, applying the filter at the
+   * current input position will result in an output equal to the pixel at
+   * offset +4 (-3 + 7) with respect to the current filtering position.
+   */
  const int kPixelSelected = 4;

+  /* Assume that each output pixel requires us to step on by 17/16th pixels in
+   * the input.
+   */
+  const int kInputPixelStep = 17;
+
+  /* The filters are setup in such a way that the expected output produces
+   * sets of 8 identical output samples. As the filter position moves to the
+   * next 1/16th pixel position the only active (=128) filter tap moves one
+   * position to the left, resulting in the same input pixel being replicated
+   * in to the output for 8 consecutive samples. After each set of 8 positions
+   * the filters select a different input pixel. kFilterPeriodAdjust below
+   * computes which input pixel is written to the output for a specified
+   * x or y position.
+   */
+
+  /* Test the horizontal filter. */
  REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[8], 17, kChangeFilters[4], 16,
-                                 Width(), Height()));
+                                 kChangeFilters[kInitialSubPelOffset],
+                                 kInputPixelStep, NULL, 0, Width(), Height()));

  for (int x = 0; x < Width(); ++x) {
-    const int kQ4StepAdjust = x >> 4;
    const int kFilterPeriodAdjust = (x >> 3) << 3;
-    const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
-    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x;
+    const int ref_x =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjust * kInputPixelStep)
+                          >> SUBPEL_BITS);
+    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width();
  }

+  /* Test the vertical filter. */
  REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[4], 16, kChangeFilters[8], 17,
-                                 Width(), Height()));
+                                 NULL, 0, kChangeFilters[kInitialSubPelOffset],
+                                 kInputPixelStep, Width(), Height()));

  for (int y = 0; y < Height(); ++y) {
-    const int kQ4StepAdjust = y >> 4;
    const int kFilterPeriodAdjust = (y >> 3) << 3;
-    const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    const int ref_y =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjust * kInputPixelStep)
+                          >> SUBPEL_BITS);
    ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
  }

+  /* Test the horizontal and vertical filters in combination. */
  REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                  kChangeFilters[8], 17, kChangeFilters[8], 17,
+                                  kChangeFilters[kInitialSubPelOffset],
+                                  kInputPixelStep,
+                                  kChangeFilters[kInitialSubPelOffset],
+                                  kInputPixelStep,
                                  Width(), Height()));

  for (int y = 0; y < Height(); ++y) {
-    const int kQ4StepAdjustY = y >> 4;
    const int kFilterPeriodAdjustY = (y >> 3) << 3;
-    const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected;
+    const int ref_y =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjustY * kInputPixelStep)
+                          >> SUBPEL_BITS);
    for (int x = 0; x < Width(); ++x) {
-      const int kQ4StepAdjustX = x >> 4;
      const int kFilterPeriodAdjustX = (x >> 3) << 3;
-      const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected;
+      const int ref_x =
+          kPixelSelected + ((kInitialSubPelOffset
+              + kFilterPeriodAdjustX * kInputPixelStep)
+                            >> SUBPEL_BITS);

      ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
          << "x == " << x << ", y == " << y;
@@ -502,6 +548,34 @@ TEST_P(ConvolveTest, ChangeFilterWorks) {
  }
 }

+/* This test exercises that enough rows and columns are filtered with every
+   possible initial fractional positions and scaling steps. */
+TEST_P(ConvolveTest, CheckScalingFiltering) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+
+  SetConstantInput(127);
+
+  for (int frac = 0; frac < 16; ++frac) {
+    for (int step = 1; step <= 32; ++step) {
+      /* Test the horizontal and vertical filters in combination. */
+      REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                                      vp9_sub_pel_filters_8[frac], step,
+                                      vp9_sub_pel_filters_8[frac], step,
+                                      Width(), Height()));
+
+      CheckGuardBlocks();
+
+      for (int y = 0; y < Height(); ++y) {
+        for (int x = 0; x < Width(); ++x) {
+          ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x])
+              << "x == " << x << ", y == " << y
+              << ", frac == " << frac << ", step == " << step;
+        }
+      }
+    }
+  }
+}

 using std::tr1::make_tuple;

@@ -527,9 +601,9 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(

 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
-    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,
-    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,
-    vp9_convolve8_ssse3, vp9_convolve8_avg_c);
+    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
+    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3,
+    vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3);

 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
    make_tuple(4, 4, &convolve8_ssse3),
@@ -546,4 +620,26 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
    make_tuple(32, 64, &convolve8_ssse3),
    make_tuple(64, 64, &convolve8_ssse3)));
 #endif
+
+#if HAVE_NEON
+const ConvolveFunctions convolve8_neon(
+    vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
+    vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
+    vp9_convolve8_neon, vp9_convolve8_avg_neon);
+
+INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_neon),
+    make_tuple(8, 4, &convolve8_neon),
+    make_tuple(4, 8, &convolve8_neon),
+    make_tuple(8, 8, &convolve8_neon),
+    make_tuple(16, 8, &convolve8_neon),
+    make_tuple(8, 16, &convolve8_neon),
+    make_tuple(16, 16, &convolve8_neon),
+    make_tuple(32, 16, &convolve8_neon),
+    make_tuple(16, 32, &convolve8_neon),
+    make_tuple(32, 32, &convolve8_neon),
+    make_tuple(64, 32, &convolve8_neon),
+    make_tuple(32, 64, &convolve8_neon),
+    make_tuple(64, 64, &convolve8_neon)));
+#endif
 }  // namespace
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class CpuSpeedTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<
+        libvpx_test::TestMode, int> {
+ protected:
+  CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+    }
+  }
+  int set_cpu_used_;
+};
+
+TEST_P(CpuSpeedTest, TestQ0) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q.  This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       20);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q.  This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 12000;
+  cfg_.rc_max_quantizer = 10;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(CpuSpeedTest, TestLowBitrate) {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q.  This pushes the encoder to producing
+  // lots of small partitions which might will test the other condition.
+
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+using std::tr1::make_tuple;
+
+#define VP9_FACTORY \
+  static_cast<const libvpx_test::CodecFactory*> (&libvpx_test::kVP9)
+
+VP9_INSTANTIATE_TEST_CASE(
+    CpuSpeedTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Range(0, 5));
+}  // namespace
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -42,10 +42,6 @@ class CQTest : public ::libvpx_test::EncoderTest,
    n_frames_ = 0;
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
    if (video->frame() == 1) {
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -36,10 +36,6 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
    duration_ = 0.0;
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
    const vpx_rational_t tb = video->timebase();
@@ -79,7 +75,7 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
    bits_in_buffer_model_ -= frame_size_in_bits;

    // Update the running total of bits for end of test datarate checks.
-    bits_total_ += frame_size_in_bits ;
+    bits_total_ += frame_size_in_bits;

    // If first drop not set and we have a drop set it to this time.
    if (!first_drop_ && duration > 1)
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -13,14 +13,16 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"

 extern "C" {
 #include "vp9/common/vp9_entropy.h"
-#include "vp9_rtcd.h"
-void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
+#include "./vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);
 }
-
-#include "acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -30,12 +32,13 @@ namespace {
 #ifdef _MSC_VER
 static int round(double x) {
  if (x < 0)
-    return (int)ceil(x - 0.5);
+    return static_cast<int>(ceil(x - 0.5));
  else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif

+const int kNumCoeffs = 256;
 const double PI = 3.1415926535898;
 void reference2_16x16_idct_2d(double *input, double *output) {
  double x;
@@ -44,7 +47,9 @@ void reference2_16x16_idct_2d(double *input, double *output) {
      double s = 0;
      for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
-          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256;
+          x = cos(PI * j * (l + 0.5) / 16.0) *
+              cos(PI * i * (k + 0.5) / 16.0) *
+              input[i * 16 + j] / 256;
          if (i != 0)
            x *= sqrt(2.0);
          if (j != 0)
@@ -58,23 +63,23 @@ void reference2_16x16_idct_2d(double *input, double *output) {
 }


-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
+const double C1 = 0.995184726672197;
+const double C2 = 0.98078528040323;
+const double C3 = 0.956940335732209;
+const double C4 = 0.923879532511287;
+const double C5 = 0.881921264348355;
+const double C6 = 0.831469612302545;
+const double C7 = 0.773010453362737;
+const double C8 = 0.707106781186548;
+const double C9 = 0.634393284163646;
+const double C10 = 0.555570233019602;
+const double C11 = 0.471396736825998;
+const double C12 = 0.38268343236509;
+const double C13 = 0.290284677254462;
+const double C14 = 0.195090322016128;
+const double C15 = 0.098017140329561;

-static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
+void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  double step[16];
  double intermediate[16];
  double temp1, temp2;
@@ -107,36 +112,36 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[6] = step[1] - step[6];
  output[7] = step[0] - step[7];

-  temp1 = step[ 8]*C7;
-  temp2 = step[15]*C9;
+  temp1 = step[ 8] * C7;
+  temp2 = step[15] * C9;
  output[ 8] = temp1 + temp2;

-  temp1 = step[ 9]*C11;
-  temp2 = step[14]*C5;
+  temp1 = step[ 9] * C11;
+  temp2 = step[14] * C5;
  output[ 9] = temp1 - temp2;

-  temp1 = step[10]*C3;
-  temp2 = step[13]*C13;
+  temp1 = step[10] * C3;
+  temp2 = step[13] * C13;
  output[10] = temp1 + temp2;

-  temp1 = step[11]*C15;
-  temp2 = step[12]*C1;
+  temp1 = step[11] * C15;
+  temp2 = step[12] * C1;
  output[11] = temp1 - temp2;

-  temp1 = step[11]*C1;
-  temp2 = step[12]*C15;
+  temp1 = step[11] * C1;
+  temp2 = step[12] * C15;
  output[12] = temp2 + temp1;

-  temp1 = step[10]*C13;
-  temp2 = step[13]*C3;
+  temp1 = step[10] * C13;
+  temp2 = step[13] * C3;
  output[13] = temp2 - temp1;

-  temp1 = step[ 9]*C5;
-  temp2 = step[14]*C11;
+  temp1 = step[ 9] * C5;
+  temp2 = step[14] * C11;
  output[14] = temp2 + temp1;

-  temp1 = step[ 8]*C9;
-  temp2 = step[15]*C7;
+  temp1 = step[ 8] * C9;
+  temp2 = step[15] * C7;
  output[15] = temp2 - temp1;

  // step 3
@@ -145,20 +150,20 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  step[ 2] = output[1] - output[2];
  step[ 3] = output[0] - output[3];

-  temp1 = output[4]*C14;
-  temp2 = output[7]*C2;
+  temp1 = output[4] * C14;
+  temp2 = output[7] * C2;
  step[ 4] = temp1 + temp2;

-  temp1 = output[5]*C10;
-  temp2 = output[6]*C6;
+  temp1 = output[5] * C10;
+  temp2 = output[6] * C6;
  step[ 5] = temp1 + temp2;

-  temp1 = output[5]*C6;
-  temp2 = output[6]*C10;
+  temp1 = output[5] * C6;
+  temp2 = output[6] * C10;
  step[ 6] = temp2 - temp1;

-  temp1 = output[4]*C2;
-  temp2 = output[7]*C14;
+  temp1 = output[4] * C2;
+  temp2 = output[7] * C14;
  step[ 7] = temp2 - temp1;

  step[ 8] = output[ 8] + output[11];
@@ -175,18 +180,18 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[ 0] = (step[ 0] + step[ 1]);
  output[ 8] = (step[ 0] - step[ 1]);

-  temp1 = step[2]*C12;
-  temp2 = step[3]*C4;
+  temp1 = step[2] * C12;
+  temp2 = step[3] * C4;
  temp1 = temp1 + temp2;
-  output[ 4] = 2*(temp1*C8);
+  output[ 4] = 2*(temp1 * C8);

-  temp1 = step[2]*C4;
-  temp2 = step[3]*C12;
+  temp1 = step[2] * C4;
+  temp2 = step[3] * C12;
  temp1 = temp2 - temp1;
-  output[12] = 2*(temp1*C8);
+  output[12] = 2 * (temp1 * C8);

-  output[ 2] = 2*((step[4] + step[ 5])*C8);
-  output[14] = 2*((step[7] - step[ 6])*C8);
+  output[ 2] = 2 * ((step[4] + step[ 5]) * C8);
+  output[14] = 2 * ((step[7] - step[ 6]) * C8);

  temp1 = step[4] - step[5];
  temp2 = step[6] + step[7];
@@ -196,17 +201,17 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  intermediate[8] = step[8] + step[14];
  intermediate[9] = step[9] + step[15];

-  temp1 = intermediate[8]*C12;
-  temp2 = intermediate[9]*C4;
+  temp1 = intermediate[8] * C12;
+  temp2 = intermediate[9] * C4;
  temp1 = temp1 - temp2;
-  output[3] = 2*(temp1*C8);
+  output[3] = 2 * (temp1 * C8);

-  temp1 = intermediate[8]*C4;
-  temp2 = intermediate[9]*C12;
+  temp1 = intermediate[8] * C4;
+  temp2 = intermediate[9] * C12;
  temp1 = temp2 + temp1;
-  output[13] = 2*(temp1*C8);
+  output[13] = 2 * (temp1 * C8);

-  output[ 9] = 2*((step[10] + step[11])*C8);
+  output[ 9] = 2 * ((step[10] + step[11]) * C8);

  intermediate[11] = step[10] - step[11];
  intermediate[12] = step[12] + step[13];
@@ -217,150 +222,300 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[15] = (intermediate[11] + intermediate[12]);
  output[ 1] = -(intermediate[11] - intermediate[12]);

-  output[ 7] = 2*(intermediate[13]*C8);
+  output[ 7] = 2 * (intermediate[13] * C8);

-  temp1 = intermediate[14]*C12;
-  temp2 = intermediate[15]*C4;
+  temp1 = intermediate[14] * C12;
+  temp2 = intermediate[15] * C4;
  temp1 = temp1 - temp2;
-  output[11] = -2*(temp1*C8);
+  output[11] = -2 * (temp1 * C8);

-  temp1 = intermediate[14]*C4;
-  temp2 = intermediate[15]*C12;
+  temp1 = intermediate[14] * C4;
+  temp2 = intermediate[15] * C12;
  temp1 = temp2 + temp1;
-  output[ 5] = 2*(temp1*C8);
+  output[ 5] = 2 * (temp1 * C8);
 }

-static void reference_16x16_dct_1d(double in[16], double out[16]) {
-  const double kPi = 3.141592653589793238462643383279502884;
-  const double kInvSqrt2 = 0.707106781186547524400844362104;
-  for (int k = 0; k < 16; k++) {
-    out[k] = 0.0;
-    for (int n = 0; n < 16; n++)
-      out[k] += in[n]*cos(kPi*(2*n+1)*k/32.0);
-    if (k == 0)
-      out[k] = out[k]*kInvSqrt2;
-  }
-}
-
-void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) {
+void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
  // First transform columns
  for (int i = 0; i < 16; ++i) {
    double temp_in[16], temp_out[16];
    for (int j = 0; j < 16; ++j)
-      temp_in[j] = input[j*16 + i];
+      temp_in[j] = input[j * 16 + i];
    butterfly_16x16_dct_1d(temp_in, temp_out);
    for (int j = 0; j < 16; ++j)
-      output[j*16 + i] = temp_out[j];
+      output[j * 16 + i] = temp_out[j];
  }
  // Then transform rows
  for (int i = 0; i < 16; ++i) {
    double temp_in[16], temp_out[16];
    for (int j = 0; j < 16; ++j)
-      temp_in[j] = output[j + i*16];
+      temp_in[j] = output[j + i * 16];
    butterfly_16x16_dct_1d(temp_in, temp_out);
    // Scale by some magic number
    for (int j = 0; j < 16; ++j)
-      output[j + i*16] = temp_out[j]/2;
+      output[j + i * 16] = temp_out[j]/2;
  }
 }

+typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
+typedef void (*idct_t)(int16_t *in, uint8_t *out, int stride);
+typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
+typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);

-TEST(VP9Idct16x16Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t in[256], coeff[256];
-    uint8_t dst[256], src[256];
-    double out_r[256];
-
-    for (int j = 0; j < 256; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 256; ++j)
-      in[j] = src[j] - dst[j];
-
-    reference_16x16_dct_2d(in, out_r);
-    for (int j = 0; j < 256; j++)
-      coeff[j] = round(out_r[j]);
-    vp9_short_idct16x16_add_c(coeff, dst, 16);
-    for (int j = 0; j < 256; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      EXPECT_GE(1, error)
-          << "Error: 16x16 IDCT has error " << error
-          << " at index " << j;
-    }
-  }
+void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
+  vp9_short_fdct16x16_c(in, out, stride);
 }

-// we need enable fdct test once we re-do the 16 point fdct.
-TEST(VP9Fdct16x16Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int max_error = 0;
-  double total_error = 0;
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[256];
-    int16_t test_temp_block[256];
-    uint8_t dst[256], src[256];
+void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
+  vp9_short_fht16x16_c(in, out, stride, tx_type);
+}

-    for (int j = 0; j < 256; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
+class Trans16x16TestBase {
+ public:
+  virtual ~Trans16x16TestBase() {}
+
+ protected:
+  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
+
+  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+
+  void RunAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+    for (int i = 0; i < count_test_block; ++i) {
+      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        test_input_block[j] = src[j] - dst[j];
+      }
+
+      REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                      test_temp_block, pitch_));
+      REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = dst[j] - src[j];
+        const uint32_t error = diff * diff;
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 256; ++j)
-      test_input_block[j] = src[j] - dst[j];

-    const int pitch = 32;
-    vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
+    EXPECT_GE(1u, max_error)
+        << "Error: 16x16 FHT/IHT has an individual round trip error > 1";

-    for (int j = 0; j < 256; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
+    EXPECT_GE(count_test_block , total_error)
+        << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
+  }
+
+  void RunCoeffCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+      REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+
+      // The minimum quant value is 4.
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
    }
  }

-  EXPECT_GE(1, max_error)
-      << "Error: 16x16 FDCT/IDCT has an individual round trip error > 1";
+  void RunMemCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

-  EXPECT_GE(count_test_block , total_error)
-      << "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block";
-}
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+      }
+      if (i == 0)
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = 255;
+      if (i == 1)
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = -255;

-TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t input_block[256], input_extreme_block[256];
-    int16_t output_block[256], output_extreme_block[256];
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+      REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                      output_block, pitch_));

-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 256; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-    }
-    if (i == 0)
-      for (int j = 0; j < 256; ++j)
-        input_extreme_block[j] = 255;
-
-    const int pitch = 32;
-    vp9_short_fdct16x16_c(input_block, output_block, pitch);
-    vp9_short_fdct16x16_c(input_extreme_block, output_extreme_block, pitch);
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < 256; ++j) {
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
-          << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
-          << "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE";
+      // The minimum quant value is 4.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      }
    }
  }
+
+  void RunInvAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        in[j] = src[j] - dst[j];
+      }
+
+      reference_16x16_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff[j] = round(out_r[j]);
+
+      const int pitch = 32;
+      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch));
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = dst[j] - src[j];
+        const uint32_t error = diff * diff;
+        EXPECT_GE(1u, error)
+            << "Error: 16x16 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+  int pitch_;
+  int tx_type_;
+  fht_t fwd_txfm_ref;
+};
+
+class Trans16x16DCT : public Trans16x16TestBase,
+                      public PARAMS(fdct_t, idct_t, int) {
+ public:
+  virtual ~Trans16x16DCT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 32;
+    fwd_txfm_ref = fdct16x16_ref;
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride >> 1);
+  }
+
+  fdct_t fwd_txfm_;
+  idct_t inv_txfm_;
+};
+
+TEST_P(Trans16x16DCT, AccuracyCheck) {
+  RunAccuracyCheck();
 }
+
+TEST_P(Trans16x16DCT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+TEST_P(Trans16x16DCT, MemCheck) {
+  RunMemCheck();
+}
+
+TEST_P(Trans16x16DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
+class Trans16x16HT : public Trans16x16TestBase,
+                     public PARAMS(fht_t, iht_t, int) {
+ public:
+  virtual ~Trans16x16HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 16;
+    fwd_txfm_ref = fht16x16_ref;
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  fht_t fwd_txfm_;
+  iht_t inv_txfm_;
+};
+
+TEST_P(Trans16x16HT, AccuracyCheck) {
+  RunAccuracyCheck();
+}
+
+TEST_P(Trans16x16HT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+TEST_P(Trans16x16HT, MemCheck) {
+  RunMemCheck();
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct16x16_sse2, &vp9_short_idct16x16_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3)));
+#endif
 }  // namespace
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -13,15 +13,17 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"

 extern "C" {
+#include "./vpx_config.h"
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
-  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }

-#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -30,35 +32,15 @@ namespace {
 #ifdef _MSC_VER
 static int round(double x) {
  if (x < 0)
-    return (int)ceil(x - 0.5);
+    return static_cast<int>(ceil(x - 0.5));
  else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif

-static const double kPi = 3.141592653589793238462643383279502884;
-static void reference2_32x32_idct_2d(double *input, double *output) {
-  double x;
-  for (int l = 0; l < 32; ++l) {
-    for (int k = 0; k < 32; ++k) {
-      double s = 0;
-      for (int i = 0; i < 32; ++i) {
-        for (int j = 0; j < 32; ++j) {
-          x = cos(kPi * j * (l + 0.5) / 32.0) *
-              cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
-          if (i != 0)
-            x *= sqrt(2.0);
-          if (j != 0)
-            x *= sqrt(2.0);
-          s += x;
-        }
-      }
-      output[k * 32 + l] = s / 4;
-    }
-  }
-}
-
-static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
+const int kNumCoeffs = 1024;
+const double kPi = 3.141592653589793238462643383279502884;
+void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 32; k++) {
    out[k] = 0.0;
@@ -69,7 +51,8 @@ static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
  }
 }

-static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
+void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
+                            double output[kNumCoeffs]) {
  // First transform columns
  for (int i = 0; i < 32; ++i) {
    double temp_in[32], temp_out[32];
@@ -91,27 +74,165 @@ static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
  }
 }

-TEST(VP9Idct32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t in[1024], coeff[1024];
-    uint8_t dst[1024], src[1024];
-    double out_r[1024];
+typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
+typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);

-    for (int j = 0; j < 1024; ++j) {
+class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
+ public:
+  virtual ~Trans32x32Test() {}
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_  = GET_PARAM(2);  // 0: high precision forward transform
+                               // 1: low precision version for rd loop
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int version_;
+  fwd_txfm_t fwd_txfm_;
+  inv_txfm_t inv_txfm_;
+};
+
+TEST_P(Trans32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  uint32_t max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < kNumCoeffs; ++j) {
      src[j] = rnd.Rand8();
      dst[j] = rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
    }
+
+    const int pitch = 64;
+    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
+    REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      const uint32_t diff = dst[j] - src[j];
+      const uint32_t error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  if (version_ == 1) {
+    max_error /= 2;
+    total_error /= 45;
+  }
+
+  EXPECT_GE(1u, max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+      << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
+}
+
+TEST_P(Trans32x32Test, CoeffCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    for (int j = 0; j < kNumCoeffs; ++j)
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
+
+    if (version_ == 0) {
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+    } else {
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, MemCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 2000;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j)
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = 255;
+    if (i == 1)
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = -255;
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (version_ == 0) {
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+      } else {
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+      }
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than "
+          << "4*DCT_MAX_VALUE";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, InverseAccuracy) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    double out_r[kNumCoeffs];
+
+    // Initialize a test block with input range [-255, 255]
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
      in[j] = src[j] - dst[j];
+    }

    reference_32x32_dct_2d(in, out_r);
-    for (int j = 0; j < 1024; j++)
+    for (int j = 0; j < kNumCoeffs; ++j)
      coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_add_c(coeff, dst, 32);
-    for (int j = 0; j < 1024; ++j) {
+    REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+    for (int j = 0; j < kNumCoeffs; ++j) {
      const int diff = dst[j] - src[j];
      const int error = diff * diff;
      EXPECT_GE(1, error)
@@ -121,72 +242,21 @@ TEST(VP9Idct32x32Test, AccuracyCheck) {
  }
 }

-TEST(VP9Fdct32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  unsigned int max_error = 0;
-  int64_t total_error = 0;
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[1024];
-    int16_t test_temp_block[1024];
-    uint8_t dst[1024], src[1024];
+using std::tr1::make_tuple;

-    for (int j = 0; j < 1024; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = src[j] - dst[j];
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));

-    const int pitch = 64;
-    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
-
-    for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = dst[j] - src[j];
-      const unsigned error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
-  }
-
-  EXPECT_GE(1u, max_error)
-      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
-
-  EXPECT_GE(count_test_block, total_error)
-      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
-}
-
-TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t input_block[1024], input_extreme_block[1024];
-    int16_t output_block[1024], output_extreme_block[1024];
-
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-    }
-    if (i == 0)
-      for (int j = 0; j < 1024; ++j)
-        input_extreme_block[j] = 255;
-
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_block, output_block, pitch);
-    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < 1024; ++j) {
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
-          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
-          << "Error: 32x32 FDCT extreme has coefficient larger than "
-             "4*DCT_MAX_VALUE";
-    }
-  }
-}
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct32x32_sse2,
+                   &vp9_short_idct32x32_add_sse2, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_sse2,
+                   &vp9_short_idct32x32_add_sse2, 1)));
+#endif
 }  // namespace
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -12,7 +12,7 @@
 #define TEST_DECODE_TEST_DRIVER_H_
 #include <cstring>
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"

 namespace libvpx_test {
@@ -36,9 +36,8 @@ class DxDataIterator {
 };

 // Provides a simplified interface to manage one video decoding.
-//
-// TODO: similar to Encoder class, the exact services should be
-// added as more tests are added.
+// Similar to Encoder class, the exact services should be added
+// as more tests are added.
 class Decoder {
 public:
  Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/decode_test_driver.h"
@@ -114,19 +114,19 @@ static bool compare_img(const vpx_image_t *img1,
  const unsigned int height_y = img1->d_h;
  unsigned int i;
  for (i = 0; i < height_y; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
-                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                     width_y) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                    img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                    width_y) == 0) && match;
  const unsigned int width_uv  = (img1->d_w + 1) >> 1;
  const unsigned int height_uv = (img1->d_h + 1) >> 1;
  for (i = 0; i <  height_uv; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
-                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
-                     width_uv) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                    img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                    width_uv) == 0) && match;
  for (i = 0; i < height_uv; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
-                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
-                     width_uv) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                    img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                    width_uv) == 0) && match;
  return match;
 }

@@ -158,7 +158,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
    bool again;
    for (again = true, video->Begin(); again; video->Next()) {
-      again = video->img() != NULL;
+      again = (video->img() != NULL);

      PreEncodeFrameHook(video);
      PreEncodeFrameHook(video, encoder);
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -190,7 +190,9 @@ class EncoderTest {
  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}

  // Hook to determine whether the encode loop should continue.
-  virtual bool Continue() const { return !abort_; }
+  virtual bool Continue() const {
+    return !(::testing::Test::HasFatalFailure() || abort_);
+  }

  const CodecFactory   *codec_;
  // Hook to determine whether to decode frame after encoding
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -50,10 +50,6 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
    mismatch_nframes_ = 0;
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
    psnr_ += pkt->data.psnr.psnr[0];
    nframes_++;
@@ -66,7 +62,7 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
    if (droppable_nframes_ > 0 &&
        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
      for (unsigned int i = 0; i < droppable_nframes_; ++i) {
-        if (droppable_frames_[i] == nframes_) {
+        if (droppable_frames_[i] == video->frame()) {
          std::cout << "             Encoding droppable frame: "
                    << droppable_frames_[i] << "\n";
          frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
@@ -152,7 +148,7 @@ TEST_P(ErrorResilienceTest, OnVersusOff) {
  const vpx_rational timebase = { 33333333, 1000000000 };
  cfg_.g_timebase = timebase;
  cfg_.rc_target_bitrate = 2000;
-  cfg_.g_lag_in_frames = 25;
+  cfg_.g_lag_in_frames = 10;

  init_flags_ = VPX_CODEC_USE_PSNR;

@@ -183,6 +179,9 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
  const vpx_rational timebase = { 33333333, 1000000000 };
  cfg_.g_timebase = timebase;
  cfg_.rc_target_bitrate = 500;
+  // FIXME(debargha): Fix this to work for any lag.
+  // Currently this test only works for lag = 0
+  cfg_.g_lag_in_frames = 0;

  init_flags_ = VPX_CODEC_USE_PSNR;

--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -15,68 +15,69 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 extern "C" {
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 }

-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"

 using libvpx_test::ACMRandom;

 namespace {
-void fdct4x4(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
+void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+             int stride, int /*tx_type*/) {
  vp9_short_fdct4x4_c(in, out, stride);
 }
-void idct4x4_add(int16_t *in, int16_t *out, uint8_t *dst,
-                 int stride, int tx_type) {
+void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+                 int stride, int /*tx_type*/) {
  vp9_short_idct4x4_add_c(out, dst, stride >> 1);
 }
-void fht4x4(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
+void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+            int stride, int tx_type) {
  vp9_short_fht4x4_c(in, out, stride >> 1, tx_type);
 }
-void iht4x4_add(int16_t *in, int16_t *out, uint8_t *dst,
+void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                int stride, int tx_type) {
  vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type);
 }

 class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
 public:
-  FwdTrans4x4Test() {SetUpTestTxfm();}
-  ~FwdTrans4x4Test() {}
-
-  void SetUpTestTxfm() {
-    tx_type = GetParam();
-    if (tx_type == 0) {
-      fwd_txfm = fdct4x4;
-      inv_txfm = idct4x4_add;
+  virtual ~FwdTrans4x4Test() {}
+  virtual void SetUp() {
+    tx_type_ = GetParam();
+    if (tx_type_ == 0) {
+      fwd_txfm_ = fdct4x4;
+      inv_txfm_ = idct4x4_add;
    } else {
-      fwd_txfm = fht4x4;
-      inv_txfm = iht4x4_add;
+      fwd_txfm_ = fht4x4;
+      inv_txfm_ = iht4x4_add;
    }
  }

 protected:
  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
                  int stride, int tx_type) {
-    (*fwd_txfm)(in, out, dst, stride, tx_type);
+    (*fwd_txfm_)(in, out, dst, stride, tx_type);
  }

  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
                  int stride, int tx_type) {
-    (*inv_txfm)(in, out, dst, stride, tx_type);
+    (*inv_txfm_)(in, out, dst, stride, tx_type);
  }

-  int tx_type;
-  void (*fwd_txfm)(int16_t *in, int16_t *out, uint8_t *dst,
+  int tx_type_;
+  void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
                   int stride, int tx_type);
-  void (*inv_txfm)(int16_t *in, int16_t *out, uint8_t *dst,
+  void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
                   int stride, int tx_type);
 };

 TEST_P(FwdTrans4x4Test, SignBiasCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int16_t test_input_block[16];
-  int16_t test_output_block[16];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
  const int pitch = 8;
  int count_sign_block[16][2];
  const int count_test_block = 1000000;
@@ -87,7 +88,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
    for (int j = 0; j < 16; ++j)
      test_input_block[j] = rnd.Rand8() - rnd.Rand8();

-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type);
+    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);

    for (int j = 0; j < 16; ++j) {
      if (test_output_block[j] < 0)
@@ -103,7 +104,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
    EXPECT_TRUE(bias_acceptable)
        << "Error: 4x4 FDCT/FHT has a sign bias > 1%"
        << " for input range [-255, 255] at index " << j
-        << " tx_type " << tx_type;
+        << " tx_type " << tx_type_;
  }

  memset(count_sign_block, 0, sizeof(count_sign_block));
@@ -112,7 +113,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
    for (int j = 0; j < 16; ++j)
      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);

-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type);
+    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);

    for (int j = 0; j < 16; ++j) {
      if (test_output_block[j] < 0)
@@ -135,12 +136,13 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  int max_error = 0;
-  double total_error = 0;
+  int total_error = 0;
  const int count_test_block = 1000000;
  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[16];
-    int16_t test_temp_block[16];
-    uint8_t dst[16], src[16];
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);

    for (int j = 0; j < 16; ++j) {
      src[j] = rnd.Rand8();
@@ -151,10 +153,10 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
      test_input_block[j] = src[j] - dst[j];

    const int pitch = 8;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type);
+    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);

    for (int j = 0; j < 16; ++j) {
-        if(test_temp_block[j] > 0) {
+        if (test_temp_block[j] > 0) {
          test_temp_block[j] += 2;
          test_temp_block[j] /= 4;
          test_temp_block[j] *= 4;
@@ -166,7 +168,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
    }

    // inverse transform and reconstruct the pixel block
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type);
+    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);

    for (int j = 0; j < 16; ++j) {
      const int diff = dst[j] - src[j];
@@ -181,7 +183,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {

  EXPECT_GE(count_test_block, total_error)
      << "Error: FDCT/IDCT or FHT/IHT has average "
-          "roundtrip error > 1 per block";
+      << "roundtrip error > 1 per block";
 }

 INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4));
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -13,23 +13,78 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "vpx_ports/mem.h"

 extern "C" {
-#include "vp9_rtcd.h"
-void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
+#include "./vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
 }

-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;

 namespace {
+void fdct8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+             int stride, int /*tx_type*/) {
+  vp9_short_fdct8x8_c(in, out, stride);
+}
+void idct8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+                 int stride, int /*tx_type*/) {
+  vp9_short_idct8x8_add_c(out, dst, stride >> 1);
+}
+void fht8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+            int stride, int tx_type) {
+  // TODO(jingning): need to refactor this to test both _c and _sse2 functions,
+  // when we have all inverse dct functions done sse2.
+#if HAVE_SSE2
+  vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type);
+#else
+  vp9_short_fht8x8_c(in, out, stride >> 1, tx_type);
+#endif
+}
+void iht8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+                int stride, int tx_type) {
+  vp9_short_iht8x8_add_c(out, dst, stride >> 1, tx_type);
+}

-TEST(VP9Fdct8x8Test, SignBiasCheck) {
+class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
+ public:
+  virtual ~FwdTrans8x8Test() {}
+  virtual void SetUp() {
+    tx_type_ = GetParam();
+    if (tx_type_ == 0) {
+      fwd_txfm = fdct8x8;
+      inv_txfm = idct8x8_add;
+    } else {
+      fwd_txfm = fht8x8;
+      inv_txfm = iht8x8_add;
+    }
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+    (*fwd_txfm)(in, out, dst, stride, tx_type);
+  }
+  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+    (*inv_txfm)(in, out, dst, stride, tx_type);
+  }
+
+  int tx_type_;
+  void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+  void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+};
+
+TEST_P(FwdTrans8x8Test, SignBiasCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int16_t test_input_block[64];
-  int16_t test_output_block[64];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
  const int pitch = 16;
  int count_sign_block[64][2];
  const int count_test_block = 100000;
@@ -40,8 +95,9 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 64; ++j)
      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-
-    vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_output_block,
+                   NULL, pitch, tx_type_));

    for (int j = 0; j < 64; ++j) {
      if (test_output_block[j] < 0)
@@ -55,7 +111,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
    const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
    const int max_diff = 1125;
    EXPECT_LT(diff, max_diff)
-        << "Error: 8x8 FDCT has a sign bias > "
+        << "Error: 8x8 FDCT/FHT has a sign bias > "
        << 1. * max_diff / count_test_block * 100 << "%"
        << " for input range [-255, 255] at index " << j
        << " count0: " << count_sign_block[j][0]
@@ -69,8 +125,9 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
    // Initialize a test block with input range [-15, 15].
    for (int j = 0; j < 64; ++j)
      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
-
-    vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_output_block,
+                   NULL, pitch, tx_type_));

    for (int j = 0; j < 64; ++j) {
      if (test_output_block[j] < 0)
@@ -84,24 +141,25 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
    const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
    const int max_diff = 10000;
    EXPECT_LT(diff, max_diff)
-        << "Error: 4x4 FDCT has a sign bias > "
+        << "Error: 4x4 FDCT/FHT has a sign bias > "
        << 1. * max_diff / count_test_block * 100 << "%"
        << " for input range [-15, 15] at index " << j
        << " count0: " << count_sign_block[j][0]
        << " count1: " << count_sign_block[j][1]
        << " diff: " << diff;
  }
-};
+}

-TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
+TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int max_error = 0;
-  double total_error = 0;
+  int total_error = 0;
  const int count_test_block = 100000;
  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[64];
-    int16_t test_temp_block[64];
-    uint8_t dst[64], src[64];
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);

    for (int j = 0; j < 64; ++j) {
      src[j] = rnd.Rand8();
@@ -112,9 +170,11 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
      test_input_block[j] = src[j] - dst[j];

    const int pitch = 16;
-    vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
-    for (int j = 0; j < 64; ++j){
-        if(test_temp_block[j] > 0) {
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
+    for (int j = 0; j < 64; ++j) {
+        if (test_temp_block[j] > 0) {
          test_temp_block[j] += 2;
          test_temp_block[j] /= 4;
          test_temp_block[j] *= 4;
@@ -124,7 +184,9 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
          test_temp_block[j] *= 4;
        }
    }
-    vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
+    REGISTER_STATE_CHECK(
+        RunInvTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));

    for (int j = 0; j < 64; ++j) {
      const int diff = dst[j] - src[j];
@@ -136,21 +198,23 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
  }

  EXPECT_GE(1, max_error)
-      << "Error: 8x8 FDCT/IDCT has an individual roundtrip error > 1";
+    << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";

  EXPECT_GE(count_test_block/5, total_error)
-      << "Error: 8x8 FDCT/IDCT has average roundtrip error > 1/5 per block";
-};
+    << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
+        "error > 1/5 per block";
+}

-TEST(VP9Fdct8x8Test, ExtremalCheck) {
+TEST_P(FwdTrans8x8Test, ExtremalCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int max_error = 0;
-  double total_error = 0;
+  int total_error = 0;
  const int count_test_block = 100000;
  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[64];
-    int16_t test_temp_block[64];
-    uint8_t dst[64], src[64];
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);

    for (int j = 0; j < 64; ++j) {
      src[j] = rnd.Rand8() % 2 ? 255 : 0;
@@ -161,8 +225,12 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) {
      test_input_block[j] = src[j] - dst[j];

    const int pitch = 16;
-    vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
+    REGISTER_STATE_CHECK(
+        RunInvTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));

    for (int j = 0; j < 64; ++j) {
      const int diff = dst[j] - src[j];
@@ -173,13 +241,14 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) {
    }

    EXPECT_GE(1, max_error)
-        << "Error: Extremal 8x8 FDCT/IDCT has an"
+        << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has an"
        << " individual roundtrip error > 1";

    EXPECT_GE(count_test_block/5, total_error)
-        << "Error: Extremal 8x8 FDCT/IDCT has average"
+        << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
        << " roundtrip error > 1/5 per block";
  }
-};
+}

+INSTANTIATE_TEST_CASE_P(VP9, FwdTrans8x8Test, ::testing::Range(0, 4));
 }  // namespace
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@@ -11,6 +11,7 @@
 #define TEST_I420_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <cstdlib>
+#include <string>

 #include "test/video_source.h"

@@ -34,7 +35,6 @@ class I420VideoSource : public VideoSource {
        height_(0),
        framerate_numerator_(rate_numerator),
        framerate_denominator_(rate_denominator) {
-
    // This initializes raw_sz_, width_, height_ and allocates an img.
    SetSize(width, height);
  }
@@ -49,7 +49,7 @@ class I420VideoSource : public VideoSource {
    if (input_file_)
      fclose(input_file_);
    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
        << file_name_;
    if (start_) {
      fseek(input_file_, raw_sz_ * start_, SEEK_SET);
@@ -92,6 +92,7 @@ class I420VideoSource : public VideoSource {
  }

  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
    // Read a frame from input_file.
    if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
      limit_ = frame_;
@@ -108,8 +109,8 @@ class I420VideoSource : public VideoSource {
  unsigned int frame_;
  unsigned int width_;
  unsigned int height_;
-  unsigned int framerate_numerator_;
-  unsigned int framerate_denominator_;
+  int framerate_numerator_;
+  int framerate_denominator_;
 };

 }  // namespace libvpx_test
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -15,10 +15,10 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 extern "C" {
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 }

-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -27,10 +27,10 @@ namespace {

 #ifdef _MSC_VER
 static int round(double x) {
-  if(x < 0)
-    return (int)ceil(x - 0.5);
+  if (x < 0)
+    return static_cast<int>(ceil(x - 0.5));
  else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif

--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -8,7 +8,6 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
 extern "C" {
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
@@ -17,105 +16,101 @@ extern "C" {
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"

-typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
+#include "vpx/vpx_integer.h"
+
+typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr,
                          int pred_stride, unsigned char *dst_ptr,
                          int dst_stride);
 namespace {
 class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
-  protected:
-    virtual void SetUp() {
-        int i;
+ protected:
+  virtual void SetUp() {
+    int i;

-        UUT = GetParam();
-        memset(input, 0, sizeof(input));
-        /* Set up guard blocks */
-        for (i = 0; i < 256; i++)
-            output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
-    }
+    UUT = GetParam();
+    memset(input, 0, sizeof(input));
+    /* Set up guard blocks */
+    for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
+  }

-    virtual void TearDown() {
-      libvpx_test::ClearSystemState();
-    }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }

-    idct_fn_t UUT;
-    short input[16];
-    unsigned char output[256];
-    unsigned char predict[256];
+  idct_fn_t UUT;
+  int16_t input[16];
+  unsigned char output[256];
+  unsigned char predict[256];
 };

 TEST_P(IDCTTest, TestGuardBlocks) {
-    int i;
+  int i;

-    for (i = 0; i < 256; i++)
-        if ((i & 0xF) < 4 && i < 64)
-            EXPECT_EQ(0, output[i]) << i;
-        else
-            EXPECT_EQ(255, output[i]);
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) < 4 && i < 64)
+      EXPECT_EQ(0, output[i]) << i;
+    else
+      EXPECT_EQ(255, output[i]);
 }

 TEST_P(IDCTTest, TestAllZeros) {
-    int i;
+  int i;

-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-    for (i = 0; i < 256; i++)
-        if ((i & 0xF) < 4 && i < 64)
-            EXPECT_EQ(0, output[i]) << "i==" << i;
-        else
-            EXPECT_EQ(255, output[i]) << "i==" << i;
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) < 4 && i < 64)
+      EXPECT_EQ(0, output[i]) << "i==" << i;
+    else
+      EXPECT_EQ(255, output[i]) << "i==" << i;
 }

 TEST_P(IDCTTest, TestAllOnes) {
-    int i;
+  int i;

-    input[0] = 4;
-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  input[0] = 4;
+  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-    for (i = 0; i < 256; i++)
-        if ((i & 0xF) < 4 && i < 64)
-            EXPECT_EQ(1, output[i]) << "i==" << i;
-        else
-            EXPECT_EQ(255, output[i]) << "i==" << i;
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) < 4 && i < 64)
+      EXPECT_EQ(1, output[i]) << "i==" << i;
+    else
+      EXPECT_EQ(255, output[i]) << "i==" << i;
 }

 TEST_P(IDCTTest, TestAddOne) {
-    int i;
+  int i;

-    for (i = 0; i < 256; i++)
-        predict[i] = i;
-    input[0] = 4;
-    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
+  for (i = 0; i < 256; i++) predict[i] = i;
+  input[0] = 4;
+  REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));

-    for (i = 0; i < 256; i++)
-        if ((i & 0xF) < 4 && i < 64)
-            EXPECT_EQ(i+1, output[i]) << "i==" << i;
-        else
-            EXPECT_EQ(255, output[i]) << "i==" << i;
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) < 4 && i < 64)
+      EXPECT_EQ(i + 1, output[i]) << "i==" << i;
+    else
+      EXPECT_EQ(255, output[i]) << "i==" << i;
 }

 TEST_P(IDCTTest, TestWithData) {
-    int i;
+  int i;

-    for (i = 0; i < 16; i++)
-        input[i] = i;
+  for (i = 0; i < 16; i++) input[i] = i;

-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-    for (i = 0; i < 256; i++)
-        if ((i & 0xF) > 3 || i > 63)
-            EXPECT_EQ(255, output[i]) << "i==" << i;
-        else if (i == 0)
-            EXPECT_EQ(11, output[i]) << "i==" << i;
-        else if (i == 34)
-            EXPECT_EQ(1, output[i]) << "i==" << i;
-        else if (i == 2 || i == 17 || i == 32)
-            EXPECT_EQ(3, output[i]) << "i==" << i;
-        else
-            EXPECT_EQ(0, output[i]) << "i==" << i;
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) > 3 || i > 63)
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+    else if (i == 0)
+      EXPECT_EQ(11, output[i]) << "i==" << i;
+    else if (i == 34)
+      EXPECT_EQ(1, output[i]) << "i==" << i;
+    else if (i == 2 || i == 17 || i == 32)
+      EXPECT_EQ(3, output[i]) << "i==" << i;
+    else
+      EXPECT_EQ(0, output[i]) << "i==" << i;
 }

-INSTANTIATE_TEST_CASE_P(C, IDCTTest,
-                        ::testing::Values(vp8_short_idct4x4llm_c));
+INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
                        ::testing::Values(vp8_short_idct4x4llm_mmx));
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -15,8 +15,8 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"
 }
@@ -27,6 +27,8 @@ using libvpx_test::ACMRandom;

 class IntraPredBase {
 public:
+  virtual ~IntraPredBase() {}
+
  virtual void TearDown() {
    libvpx_test::ClearSystemState();
  }
@@ -104,9 +106,9 @@ class IntraPredBase {
          for (int y = 0; y < block_size_; y++)
            sum += data_ptr_[p][y * stride_ - 1];
        expected = (sum + (1 << (shift - 1))) >> shift;
-      } else
+      } else {
        expected = 0x80;
-
+      }
      // check that all subsequent lines are equal to the first
      for (int y = 1; y < block_size_; ++y)
        ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -28,7 +28,7 @@ static unsigned int MemGetLe32(const uint8_t *mem) {
 // so that we can do actual file decodes.
 class IVFVideoSource : public CompressedVideoSource {
 public:
-  IVFVideoSource(const std::string &file_name)
+  explicit IVFVideoSource(const std::string &file_name)
      : file_name_(file_name),
        input_file_(NULL),
        compressed_frame_buf_(NULL),
@@ -47,12 +47,13 @@ class IVFVideoSource : public CompressedVideoSource {
  virtual void Init() {
    // Allocate a buffer for read in the compressed video frame.
    compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
-    ASSERT_TRUE(compressed_frame_buf_) << "Allocate frame buffer failed";
+    ASSERT_TRUE(compressed_frame_buf_ != NULL)
+        << "Allocate frame buffer failed";
  }

  virtual void Begin() {
    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
        << file_name_;

    // Read file header
@@ -72,6 +73,7 @@ class IVFVideoSource : public CompressedVideoSource {
  }

  void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
    uint8_t frame_hdr[kIvfFrameHdrSize];
    // Check frame header and read a frame from input_file.
    if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_)
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@@ -31,10 +31,6 @@ class KeyframeTest : public ::libvpx_test::EncoderTest,
    set_cpu_used_ = 0;
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
    if (kf_do_force_kf_)
@@ -136,7 +132,6 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
  // Verify that keyframes match the file keyframes in the file.
  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
       iter != kf_pts_list_.end(); ++iter) {
-
    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
        << *iter;
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef LIBVPX_TEST_MD5_HELPER_H_
-#define LIBVPX_TEST_MD5_HELPER_H_
+#ifndef TEST_MD5_HELPER_H_
+#define TEST_MD5_HELPER_H_

 extern "C" {
 #include "./md5_utils.h"
@@ -25,9 +25,15 @@ class MD5 {

  void Add(const vpx_image_t *img) {
    for (int plane = 0; plane < 3; ++plane) {
-      uint8_t *buf = img->planes[plane];
-      const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
-      const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+      const uint8_t *buf = img->planes[plane];
+      // Calculate the width and height to do the md5 check. For the chroma
+      // plane, we never want to round down and thus skip a pixel so if
+      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
+      // This works only for chroma_shift of 0 and 1.
+      const int h = plane ? (img->d_h + img->y_chroma_shift) >>
+                    img->y_chroma_shift : img->d_h;
+      const int w = plane ? (img->d_w + img->x_chroma_shift) >>
+                    img->x_chroma_shift : img->d_w;

      for (int y = 0; y < h; ++y) {
        MD5Update(&md5_, buf, w);
@@ -61,4 +67,4 @@ class MD5 {

 }  // namespace libvpx_test

-#endif  // LIBVPX_TEST_MD5_HELPER_H_
+#endif  // TEST_MD5_HELPER_H_
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -11,8 +11,8 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 }
@@ -63,7 +63,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
  // Pointers to top-left pixel of block in the input and output images.
  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
  uint8_t *const dst_image_ptr = dst_image + 8;
-  uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  uint8_t *const flimits =
+      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
  (void)vpx_memset(flimits, 255, block_width);

  // Initialize pixels in the input:
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
-#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#ifndef TEST_REGISTER_STATE_CHECK_H_
+#define TEST_REGISTER_STATE_CHECK_H_

 #ifdef _WIN64

@@ -92,4 +92,4 @@ class RegisterStateCheck {};

 #endif  // _WIN64

-#endif  // LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#endif  // TEST_REGISTER_STATE_CHECK_H_
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -16,8 +16,68 @@
 #include "test/video_source.h"
 #include "test/util.h"

+// Enable(1) or Disable(0) writing of the compressed bitstream.
+#define WRITE_COMPRESSED_STREAM 0
+
 namespace {

+#if WRITE_COMPRESSED_STREAM
+static void mem_put_le16(char *const mem, const unsigned int val) {
+  mem[0] = val;
+  mem[1] = val >> 8;
+}
+
+static void mem_put_le32(char *const mem, const unsigned int val) {
+  mem[0] = val;
+  mem[1] = val >> 8;
+  mem[2] = val >> 16;
+  mem[3] = val >> 24;
+}
+
+static void write_ivf_file_header(const vpx_codec_enc_cfg_t *const cfg,
+                                  int frame_cnt, FILE *const outfile) {
+  char header[32];
+
+  header[0] = 'D';
+  header[1] = 'K';
+  header[2] = 'I';
+  header[3] = 'F';
+  mem_put_le16(header + 4,  0);                   /* version */
+  mem_put_le16(header + 6,  32);                  /* headersize */
+  mem_put_le32(header + 8,  0x30395056);          /* fourcc (vp9) */
+  mem_put_le16(header + 12, cfg->g_w);            /* width */
+  mem_put_le16(header + 14, cfg->g_h);            /* height */
+  mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
+  mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
+  mem_put_le32(header + 24, frame_cnt);           /* length */
+  mem_put_le32(header + 28, 0);                   /* unused */
+
+  (void)fwrite(header, 1, 32, outfile);
+}
+
+static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
+  char header[4];
+  mem_put_le32(header, static_cast<unsigned int>(size));
+  (void)fwrite(header, 1, 4, outfile);
+}
+
+static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
+                                   FILE *const outfile) {
+  char header[12];
+  vpx_codec_pts_t pts;
+
+  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+    return;
+
+  pts = pkt->data.frame.pts;
+  mem_put_le32(header, static_cast<unsigned int>(pkt->data.frame.sz));
+  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
+  mem_put_le32(header + 8, pts >> 32);
+
+  (void)fwrite(header, 1, 12, outfile);
+}
+#endif  // WRITE_COMPRESSED_STREAM
+
 const unsigned int kInitialWidth = 320;
 const unsigned int kInitialHeight = 240;

@@ -42,6 +102,8 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
    limit_ = 60;
  }

+  virtual ~ResizingVideoSource() {}
+
 protected:
  virtual void Next() {
    ++frame_;
@@ -56,13 +118,15 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
 protected:
  ResizeTest() : EncoderTest(GET_PARAM(0)) {}

+  virtual ~ResizeTest() {}
+
  struct FrameInfo {
    FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
        : pts(_pts), w(_w), h(_h) {}

    vpx_codec_pts_t pts;
-    unsigned int    w;
-    unsigned int    h;
+    unsigned int w;
+    unsigned int h;
  };

  virtual void SetUp() {
@@ -70,10 +134,6 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
    SetMode(GET_PARAM(1));
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  virtual void DecompressedFrameHook(const vpx_image_t &img,
                                     vpx_codec_pts_t pts) {
    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
@@ -99,17 +159,47 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
  }
 }

+const unsigned int kStepDownFrame = 3;
+const unsigned int kStepUpFrame = 6;
+
 class ResizeInternalTest : public ResizeTest {
 protected:
+#if WRITE_COMPRESSED_STREAM
+  ResizeInternalTest()
+      : ResizeTest(),
+        frame0_psnr_(0.0),
+        outfile_(NULL),
+        out_frames_(0) {}
+#else
  ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
+#endif
+
+  virtual ~ResizeInternalTest() {}
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+#if WRITE_COMPRESSED_STREAM
+    outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
+#endif
+  }
+
+  virtual void EndPassHook() {
+#if WRITE_COMPRESSED_STREAM
+    if (outfile_) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        write_ivf_file_header(&cfg_, out_frames_, outfile_);
+      fclose(outfile_);
+      outfile_ = NULL;
+    }
+#endif
+  }

  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
-    if (video->frame() == 3) {
+    if (video->frame() == kStepDownFrame) {
      struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
      encoder->Control(VP8E_SET_SCALEMODE, &mode);
    }
-    if (video->frame() == 6) {
+    if (video->frame() == kStepUpFrame) {
      struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
      encoder->Control(VP8E_SET_SCALEMODE, &mode);
    }
@@ -121,21 +211,46 @@ class ResizeInternalTest : public ResizeTest {
    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0);
  }

+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+#if WRITE_COMPRESSED_STREAM
+    ++out_frames_;
+
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0)
+      write_ivf_file_header(&cfg_, 0, outfile_);
+
+    // Write frame header and data.
+    write_ivf_frame_header(pkt, outfile_);
+    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+#endif
+  }
+
  double frame0_psnr_;
+#if WRITE_COMPRESSED_STREAM
+  FILE *outfile_;
+  unsigned int out_frames_;
+#endif
 };

 TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                       30, 1, 0, 10);
  init_flags_ = VPX_CODEC_USE_PSNR;
+
  // q picked such that initial keyframe on this clip is ~30dB PSNR
  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+
+  // If the number of frames being encoded is smaller than g_lag_in_frames
+  // the encoded frame is unavailable using the current API. Comparing
+  // frames to detect mismatch would then not be possible. Set
+  // g_lag_in_frames = 0 to get around this.
+  cfg_.g_lag_in_frames = 0;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

  for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin();
       info != frame_info_list_.end(); ++info) {
    const vpx_codec_pts_t pts = info->pts;
-    if (pts >= 3 && pts < 6) {
+    if (pts >= kStepDownFrame && pts < kStepUpFrame) {
      ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
      ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";
    } else {
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -17,7 +17,6 @@ extern "C" {
 #include "./vpx_config.h"
 #if CONFIG_VP8_ENCODER
 #include "./vp8_rtcd.h"
-//#include "vp8/common/blockd.h"
 #endif
 #if CONFIG_VP9_ENCODER
 #include "./vp9_rtcd.h"
@@ -428,6 +427,7 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));

 #if HAVE_SSE
 #if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
 const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
 const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
@@ -441,6 +441,7 @@ INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
                        make_tuple(4, 4, sad_4x4x4d_sse)));
 #endif
 #endif
+#endif

 #if HAVE_SSE2
 #if CONFIG_VP8_ENCODER
@@ -451,14 +452,20 @@ const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;
 const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
 #endif
 #if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
 const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
+const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
+const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
 const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
+const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
+const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
 const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
-const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
 const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
+const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
 const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
 const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
 #endif
+#endif
 const sad_m_by_n_test_param_t sse2_tests[] = {
 #if CONFIG_VP8_ENCODER
  make_tuple(16, 16, sad_16x16_wmt),
@@ -468,18 +475,25 @@ const sad_m_by_n_test_param_t sse2_tests[] = {
  make_tuple(4, 4, sad_4x4_wmt),
 #endif
 #if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
  make_tuple(64, 64, sad_64x64_sse2_vp9),
+  make_tuple(64, 32, sad_64x32_sse2_vp9),
+  make_tuple(32, 64, sad_32x64_sse2_vp9),
  make_tuple(32, 32, sad_32x32_sse2_vp9),
+  make_tuple(32, 16, sad_32x16_sse2_vp9),
+  make_tuple(16, 32, sad_16x32_sse2_vp9),
  make_tuple(16, 16, sad_16x16_sse2_vp9),
-  make_tuple(8, 16, sad_8x16_sse2_vp9),
  make_tuple(16, 8, sad_16x8_sse2_vp9),
+  make_tuple(8, 16, sad_8x16_sse2_vp9),
  make_tuple(8, 8, sad_8x8_sse2_vp9),
  make_tuple(8, 4, sad_8x4_sse2_vp9),
 #endif
+#endif
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));

 #if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
@@ -505,6 +519,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
                        make_tuple(8, 4, sad_8x4x4d_sse2)));
 #endif
 #endif
+#endif

 #if HAVE_SSE3
 #if CONFIG_VP8_ENCODER
@@ -523,9 +538,11 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
 #endif

 #if HAVE_SSSE3
+#if CONFIG_USE_X86INC
 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
                        make_tuple(16, 16, sad_16x16_sse3)));
 #endif
+#endif

 }  // namespace
--- a/test/set_roi.cc
+++ b/test/set_roi.cc
@@ -17,15 +17,19 @@
 #include <sys/types.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 extern "C" {
 #include "vp8/encoder/onyx_int.h"
 }

+using libvpx_test::ACMRandom;
+
 namespace {

 TEST(Vp8RoiMapTest, ParameterCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
  int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
  unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
@@ -121,10 +125,10 @@ TEST(Vp8RoiMapTest, ParameterCheck) {
    for (int i = 0; i < 1000; ++i) {
      int rand_deltas[4];
      int deltas_valid;
-      rand_deltas[0] = (rand() % 160) - 80;
-      rand_deltas[1] = (rand() % 160) - 80;
-      rand_deltas[2] = (rand() % 160) - 80;
-      rand_deltas[3] = (rand() % 160) - 80;
+      rand_deltas[0] = rnd(160) - 80;
+      rand_deltas[1] = rnd(160) - 80;
+      rand_deltas[2] = rnd(160) - 80;
+      rand_deltas[3] = rnd(160) - 80;

      deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
                      (abs(rand_deltas[1]) <= 63) &&
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -13,8 +13,8 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vp8/encoder/block.h"
 #include "vpx_mem/vpx_mem.h"
@@ -51,7 +51,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
  bd.predictor = reinterpret_cast<unsigned char*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));

-  for(int i = 0; kSrcStride[i] > 0; ++i) {
+  for (int i = 0; kSrcStride[i] > 0; ++i) {
    // start at block0
    be.src = 0;
    be.base_src = &source;
@@ -61,7 +61,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
    int16_t *src_diff = be.src_diff;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
-        src_diff[c] = 0xa5a5;
+        src_diff[c] = static_cast<int16_t>(0xa5a5);
      }
      src_diff += kDiffPredStride;
    }
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -33,10 +33,6 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
    delete[] modified_buf_;
  }

-  virtual bool Continue() const {
-    return !HasFatalFailure() && !abort_;
-  }
-
  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
    if (video->frame() == 1) {
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -520,3 +520,12 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f  vp90-2-03-size-226x202.webm
 83c6d8f2969b759e10e5c6542baca1265c874c29  vp90-2-03-size-226x224.webm.md5
 fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce  vp90-2-03-size-226x226.webm
 94ad19b8b699cea105e2ff18f0df2afd7242bcf7  vp90-2-03-size-226x226.webm.md5
+b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
+65f45ec9a55537aac76104818278e0978f94a678  vp90-2-03-deltaq.webm.md5
+4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba  vp90-2-05-resize.ivf
+7f6d8879336239a43dbb6c9f13178cb11cf7ed09  vp90-2-05-resize.ivf.md5
+bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe  vp90-2-06-bilinear.webm
+f6235f937552e11d8eb331ec55da6b3aa596b9ac  vp90-2-06-bilinear.webm.md5
+495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
+65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
+
--- a/test/test.mk
+++ b/test/test.mk
@@ -24,7 +24,9 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
@@ -87,6 +89,7 @@ LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 endif

 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
@@ -626,3 +629,11 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <string>
-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 #include "vpx_ports/x86.h"
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
 #endif

 #if !CONFIG_SHARED
-  /* Shared library builds don't support whitebox tests that exercise internal symbols. */
+// Shared library builds don't support whitebox tests
+// that exercise internal symbols.
+
 #if CONFIG_VP8
  vp8_rtcd();
 #endif
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -159,7 +159,11 @@ const char *kVP9TestVectors[] = {
  "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
  "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
  "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
-  "vp90-2-03-size-226x226.webm"
+  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
+  "vp90-2-05-resize.ivf",        "vp90-2-06-bilinear.webm",
+#if CONFIG_NON420
+  "vp91-2-04-yv444.webm"
+#endif
 };
 #endif

@@ -181,6 +185,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,

  virtual void DecompressedFrameHook(const vpx_image_t& img,
                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
    char expected_md5[33];
    char junk[128];

--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -23,10 +23,13 @@ extern "C" {

 namespace {
 class TileIndependenceTest : public ::libvpx_test::EncoderTest,
-    public ::libvpx_test::CodecTestWithParam<int> {
+                             public ::libvpx_test::CodecTestWithParam<int> {
 protected:
-  TileIndependenceTest() : EncoderTest(GET_PARAM(0)), n_tiles_(GET_PARAM(1)),
-      md5_fw_order_(), md5_inv_order_() {
+  TileIndependenceTest()
+      : EncoderTest(GET_PARAM(0)),
+        md5_fw_order_(),
+        md5_inv_order_(),
+        n_tiles_(GET_PARAM(1)) {
    init_flags_ = VPX_CODEC_USE_PSNR;
    vpx_codec_dec_cfg_t cfg;
    cfg.w = 704;
@@ -56,9 +59,8 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,

  void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                 ::libvpx_test::MD5 *md5) {
-    const vpx_codec_err_t res =
-        dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
-                         pkt->data.frame.sz);
+    const vpx_codec_err_t res = dec->DecodeFrame(
+        reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
    if (res != VPX_CODEC_OK) {
      abort_ = true;
      ASSERT_EQ(VPX_CODEC_OK, res);
@@ -72,11 +74,11 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
    UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
  }

- private:
-  int n_tiles_;
- protected:
  ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;
  ::libvpx_test::Decoder *fw_dec_, *inv_dec_;
+
+ private:
+  int n_tiles_;
 };

 // run an encode with 2 or 4 tiles, and do the decode both in normal and
@@ -93,7 +95,7 @@ TEST_P(TileIndependenceTest, MD5Match) {
                                     timebase.den, timebase.num, 0, 30);
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

-  const char *md5_fw_str  = md5_fw_order_.Get();
+  const char *md5_fw_str = md5_fw_order_.Get();
  const char *md5_inv_str = md5_inv_order_.Get();

  // could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer
@@ -102,7 +104,6 @@ TEST_P(TileIndependenceTest, MD5Match) {
  ASSERT_STREQ(md5_fw_str, md5_inv_str);
 }

-VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest,
-                          ::testing::Range(0, 2, 1));
+VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));

 }  // namespace
--- a/test/util.h
+++ b/test/util.h
@@ -37,7 +37,7 @@ static double compute_psnr(const vpx_image_t *img1,
                  img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j];
      sqrerr += d * d;
    }
-  double mse = sqrerr / (width_y * height_y);
+  double mse = static_cast<double>(sqrerr) / (width_y * height_y);
  double psnr = 100.0;
  if (mse > 0.0) {
    psnr = 10 * log10(255.0 * 255.0 / mse);
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -16,16 +16,16 @@
 #include "test/register_state_check.h"

 #include "vpx/vpx_integer.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_VP8_ENCODER
 # include "vp8/common/variance.h"
-# include "vp8_rtcd.h"
+# include "./vp8_rtcd.h"
 #endif
 #if CONFIG_VP9_ENCODER
 # include "vp9/encoder/vp9_variance.h"
-# include "vp9_rtcd.h"
+# include "./vp9_rtcd.h"
 #endif
 }
 #include "test/acm_random.h"
@@ -107,8 +107,8 @@ static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
 }

 template<typename VarianceFunctionType>
-class VarianceTest :
-    public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
+class VarianceTest
+    : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
 public:
  virtual void SetUp() {
    const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
@@ -191,9 +191,9 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
 }

 template<typename SubpelVarianceFunctionType>
-class SubpelVarianceTest :
-    public ::testing::TestWithParam<tuple<int, int,
-                                          SubpelVarianceFunctionType> > {
+class SubpelVarianceTest
+    : public ::testing::TestWithParam<tuple<int, int,
+                                            SubpelVarianceFunctionType> > {
 public:
  virtual void SetUp() {
    const tuple<int, int, SubpelVarianceFunctionType>& params =
@@ -218,6 +218,7 @@ class SubpelVarianceTest :
    vpx_free(src_);
    delete[] ref_;
    vpx_free(sec_);
+    libvpx_test::ClearSystemState();
  }

 protected:
@@ -482,6 +483,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif

 #if HAVE_SSE2
+#if CONFIG_USE_X86INC
 const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
 const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
 const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
@@ -595,8 +597,11 @@ INSTANTIATE_TEST_CASE_P(
                      make_tuple(6, 5, subpel_avg_variance64x32_sse2),
                      make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
 #endif
+#endif

 #if HAVE_SSSE3
+#if CONFIG_USE_X86INC
+
 const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 =
    vp9_sub_pixel_variance4x4_ssse3;
 const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 =
@@ -681,6 +686,7 @@ INSTANTIATE_TEST_CASE_P(
                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
 #endif
+#endif
 #endif  // CONFIG_VP9_ENCODER

 }  // namespace vp9
--- a/test/vp8_boolcoder_test.cc
+++ b/test/vp8_boolcoder_test.cc
@@ -8,10 +8,6 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-extern "C" {
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/decoder/dboolhuff.h"
-}

 #include <math.h>
 #include <stddef.h>
@@ -24,6 +20,11 @@ extern "C" {
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_integer.h"

+extern "C" {
+#include "vp8/encoder/boolhuff.h"
+#include "vp8/decoder/dboolhuff.h"
+}
+
 namespace {
 const int num_tests = 10;

@@ -44,7 +45,7 @@ void encrypt_buffer(uint8_t *buffer, int size) {

 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                           uint8_t *output, int count) {
-  int offset = input - (uint8_t *)decrypt_state;
+  int offset = input - reinterpret_cast<uint8_t *>(decrypt_state);
  for (int i = 0; i < count; i++) {
    output[i] = input[i] ^ secret_key[(offset + i) & 15];
  }
@@ -58,10 +59,10 @@ TEST(VP8, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
    for (int method = 0; method <= 7; ++method) {   // we generate various proba
-      const int bits_to_test = 1000;
-      uint8_t probas[bits_to_test];
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];

-      for (int i = 0; i < bits_to_test; ++i) {
+      for (int i = 0; i < kBitsToTest; ++i) {
        const int parity = i & 1;
        probas[i] =
            (method == 0) ? 0 : (method == 1) ? 255 :
@@ -76,14 +77,14 @@ TEST(VP8, TestBitIO) {
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
-        const int buffer_size = 10000;
+        const int kBufferSize = 10000;
        ACMRandom bit_rnd(random_seed);
        BOOL_CODER bw;
-        uint8_t bw_buffer[buffer_size];
-        vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size);
+        uint8_t bw_buffer[kBufferSize];
+        vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
@@ -98,19 +99,20 @@ TEST(VP8, TestBitIO) {
 #if CONFIG_DECRYPT
        encrypt_buffer(bw_buffer, buffer_size);
        vp8dx_start_decode(&br, bw_buffer, buffer_size,
-                           test_decrypt_cb, (void *)bw_buffer);
+                           test_decrypt_cb,
+                           reinterpret_cast<void *>(bw_buffer));
 #else
-        vp8dx_start_decode(&br, bw_buffer, buffer_size, NULL, NULL);
+        vp8dx_start_decode(&br, bw_buffer, kBufferSize, NULL, NULL);
 #endif
        bit_rnd.Reset(random_seed);
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit)
-              << "pos: "<< i << " / " << bits_to_test
+              << "pos: "<< i << " / " << kBitsToTest
              << " bit_method: " << bit_method
              << " method: " << method;
        }
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc
@@ -26,7 +26,8 @@ const uint8_t test_key[16] = {
  0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
 };

-void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) {
+void encrypt_buffer(const uint8_t *src, uint8_t *dst,
+                    int size, int offset = 0) {
  for (int i = 0; i < size; ++i) {
    dst[i] = src[i] ^ test_key[(offset + i) & 15];
  }
@@ -34,10 +35,11 @@ void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0)

 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                     uint8_t *output, int count) {
-  encrypt_buffer(input, output, count, input - (uint8_t *)decrypt_state);
+  encrypt_buffer(input, output, count,
+                 input - reinterpret_cast<uint8_t *>(decrypt_state));
 }

-} // namespace
+}  // namespace

 namespace libvpx_test {

--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -18,7 +18,7 @@


 extern "C" {
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
 }

 #include "test/acm_random.h"
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc
@@ -19,7 +19,7 @@ extern "C" {
 #include "vp9/decoder/vp9_dboolhuff.h"
 }

-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -32,10 +32,10 @@ TEST(VP9, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
    for (int method = 0; method <= 7; ++method) {   // we generate various proba
-      const int bits_to_test = 1000;
-      uint8_t probas[bits_to_test];
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];

-      for (int i = 0; i < bits_to_test; ++i) {
+      for (int i = 0; i < kBitsToTest; ++i) {
        const int parity = i & 1;
        probas[i] =
          (method == 0) ? 0 : (method == 1) ? 255 :
@@ -50,14 +50,14 @@ TEST(VP9, TestBitIO) {
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
-        const int buffer_size = 10000;
+        const int kBufferSize = 10000;
        ACMRandom bit_rnd(random_seed);
        vp9_writer bw;
-        uint8_t bw_buffer[buffer_size];
+        uint8_t bw_buffer[kBufferSize];
        vp9_start_encode(&bw, bw_buffer);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
@@ -72,16 +72,16 @@ TEST(VP9, TestBitIO) {
        GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);

        vp9_reader br;
-        vp9_reader_init(&br, bw_buffer, buffer_size);
+        vp9_reader_init(&br, bw_buffer, kBufferSize);
        bit_rnd.Reset(random_seed);
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
-              << "pos: " << i << " / " << bits_to_test
+              << "pos: " << i << " / " << kBitsToTest
              << " bit_method: " << bit_method
              << " method: " << method;
        }
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -0,0 +1,75 @@
+/*
+  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+
+  Use of this source code is governed by a BSD-style license
+  that can be found in the LICENSE file in the root of the source
+  tree. An additional intellectual property rights grant can be found
+  in the file PATENTS.  All contributing project authors may
+  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+
+class LossLessTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  LossLessTest() : EncoderTest(GET_PARAM(0)),
+                   psnr_(kMaxPsnr),
+                   nframes_(0),
+                   encoding_mode_(GET_PARAM(1)) {
+  }
+
+  virtual ~LossLessTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < psnr_)
+      psnr_= pkt->data.psnr.psnr[0];
+  }
+
+  double GetMinPsnr() const {
+      return psnr_;
+  }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libvpx_test::TestMode encoding_mode_;
+};
+
+TEST_P(LossLessTest, TestLossLessEncoding) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 0;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  // intentionally changed the dimension for better testing coverage
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284,
+                                     timebase.den, timebase.num, 0, 30);
+
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES);
+}  // namespace
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -39,8 +39,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  // FIXME(rbultje) split in its own file
-  for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES;
-       bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
    const int block_width  = 4 << b_width_log2(bsize);
    const int block_height = 4 << b_height_log2(bsize);
    int16_t *diff = reinterpret_cast<int16_t *>(
@@ -93,9 +93,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
 INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
                        ::testing::Values(vp9_subtract_block_c));

-#if HAVE_SSE2
+#if HAVE_SSE2 && CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
                        ::testing::Values(vp9_subtract_block_sse2));
 #endif
-
 }  // namespace vp9
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/decoder/vp9_thread.h"
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/webm_video_source.h"
+
+namespace {
+
+class VP9WorkerThreadTest : public ::testing::Test {
+ protected:
+  virtual ~VP9WorkerThreadTest() {}
+  virtual void SetUp() {
+    vp9_worker_init(&worker_);
+  }
+
+  virtual void TearDown() {
+    vp9_worker_end(&worker_);
+  }
+
+  VP9Worker worker_;
+};
+
+int ThreadHook(void* data, void* return_value) {
+  int* const hook_data = reinterpret_cast<int*>(data);
+  *hook_data = 5;
+  return *reinterpret_cast<int*>(return_value);
+}
+
+TEST_F(VP9WorkerThreadTest, HookSuccess) {
+  EXPECT_TRUE(vp9_worker_sync(&worker_));  // should be a no-op.
+
+  for (int i = 0; i < 2; ++i) {
+    EXPECT_TRUE(vp9_worker_reset(&worker_));
+
+    int hook_data = 0;
+    int return_value = 1;  // return successfully from the hook
+    worker_.hook = ThreadHook;
+    worker_.data1 = &hook_data;
+    worker_.data2 = &return_value;
+
+    vp9_worker_launch(&worker_);
+    EXPECT_TRUE(vp9_worker_sync(&worker_));
+    EXPECT_FALSE(worker_.had_error);
+    EXPECT_EQ(5, hook_data);
+
+    EXPECT_TRUE(vp9_worker_sync(&worker_));  // should be a no-op.
+  }
+}
+
+TEST_F(VP9WorkerThreadTest, HookFailure) {
+  EXPECT_TRUE(vp9_worker_reset(&worker_));
+
+  int hook_data = 0;
+  int return_value = 0;  // return failure from the hook
+  worker_.hook = ThreadHook;
+  worker_.data1 = &hook_data;
+  worker_.data2 = &return_value;
+
+  vp9_worker_launch(&worker_);
+  EXPECT_FALSE(vp9_worker_sync(&worker_));
+  EXPECT_TRUE(worker_.had_error);
+
+  // Ensure _reset() clears the error and _launch() can be called again.
+  return_value = 1;
+  EXPECT_TRUE(vp9_worker_reset(&worker_));
+  EXPECT_FALSE(worker_.had_error);
+  vp9_worker_launch(&worker_);
+  EXPECT_TRUE(vp9_worker_sync(&worker_));
+  EXPECT_FALSE(worker_.had_error);
+}
+
+TEST(VP9DecodeMTTest, MTDecode) {
+  libvpx_test::WebMVideoSource video("vp90-2-03-size-226x226.webm");
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = {0};
+  cfg.threads = 2;
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  libvpx_test::MD5 md5;
+  for (video.Begin(); video.cxdata(); video.Next()) {
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next())) {
+      md5.Add(img);
+    }
+  }
+  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", md5.Get());
+}
+
+}  // namespace
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -99,7 +99,7 @@ class WebMVideoSource : public CompressedVideoSource {

  virtual void Begin() {
    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
        << file_name_;

    nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb,
@@ -130,6 +130,7 @@ class WebMVideoSource : public CompressedVideoSource {
  }

  void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
    if (chunk_ >= chunks_) {
      unsigned int track;

--- a/third_party/libyuv/source/scale.c
+++ b/third_party/libyuv/source/scale.c
@@ -1370,12 +1370,12 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    shr        eax, 1
    cmp        eax, 0
    je         xloop1
-    cmp        eax, 128
+    cmp        eax, 64
    je         xloop2

-    shr        eax, 1
    mov        ah,al
    neg        al
    add        al, 128
@@ -2132,11 +2132,11 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr,
    "mov    0x14(%esp),%edx                    \n"
    "mov    0x18(%esp),%ecx                    \n"
    "mov    0x1c(%esp),%eax                    \n"
+    "shr    %eax                               \n"
    "cmp    $0x0,%eax                          \n"
    "je     2f                                 \n"
-    "cmp    $0x80,%eax                         \n"
+    "cmp    $0x40,%eax                         \n"
    "je     3f                                 \n"
-    "shr    %eax                               \n"
    "mov    %al,%ah                            \n"
    "neg    %al                                \n"
    "add    $0x80,%al                          \n"
@@ -2662,6 +2662,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                  const uint8* src_ptr, int src_stride,
                                  int dst_width, int source_y_fraction) {
+  source_y_fraction >>= 1;
  if (source_y_fraction == 0) {
    asm volatile (
   "1:"
@@ -2680,7 +2681,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
      : "memory", "cc", "rax"
    );
    return;
-  } else if (source_y_fraction == 128) {
+  } else if (source_y_fraction == 64) {
    asm volatile (
    "1:"
      "movdqa     (%1),%%xmm0                  \n"
@@ -2703,7 +2704,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  } else {
    asm volatile (
      "mov        %3,%%eax                     \n"
-      "shr        %%eax                        \n"
      "mov        %%al,%%ah                    \n"
      "neg        %%al                         \n"
      "add        $0x80,%%al                   \n"
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -97,21 +97,91 @@
    %endif
 %endmacro

-%if WIN64
-    %define PIC
-%elifidn __OUTPUT_FORMAT__,macho64
-    %define PIC
-%elif ARCH_X86_64 == 0
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-    %undef PIC
-%elif CONFIG_PIC
-    %define PIC
+; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
+; from original code is added in for 64bit.
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
 %endif
+
+%if ABI_IS_32BIT
+  %if CONFIG_PIC=1
+  %ifidn __OUTPUT_FORMAT__,elf32
+    %define GET_GOT_SAVE_ARG 1
+    %define WRT_PLT wrt ..plt
+    %macro GET_GOT 1
+      extern _GLOBAL_OFFSET_TABLE_
+      push %1
+      call %%get_got
+      %%sub_offset:
+      jmp %%exitGG
+      %%get_got:
+      mov %1, [esp]
+      add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+      ret
+      %%exitGG:
+      %undef GLOBAL
+      %define GLOBAL(x) x + %1 wrt ..gotoff
+      %undef RESTORE_GOT
+      %define RESTORE_GOT pop %1
+    %endmacro
+  %elifidn __OUTPUT_FORMAT__,macho32
+    %define GET_GOT_SAVE_ARG 1
+    %macro GET_GOT 1
+      push %1
+      call %%get_got
+      %%get_got:
+      pop  %1
+      %undef GLOBAL
+      %define GLOBAL(x) x + %1 - %%get_got
+      %undef RESTORE_GOT
+      %define RESTORE_GOT pop %1
+    %endmacro
+  %endif
+  %endif
+
+  %if ARCH_X86_64 == 0
+    %undef PIC
+  %endif
+
+%else
+  %macro GET_GOT 1
+  %endmacro
+  %define GLOBAL(x) rel x
+  %define WRT_PLT wrt ..plt
+
+  %if WIN64
+    %define PIC
+  %elifidn __OUTPUT_FORMAT__,macho64
+    %define PIC
+  %elif CONFIG_PIC
+    %define PIC
+  %endif
+%endif
+
+%ifnmacro GET_GOT
+    %macro GET_GOT 1
+    %endmacro
+    %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT
+%define RESTORE_GOT
+%endif
+%ifndef WRT_PLT
+%define WRT_PLT
+%endif
+
 %ifdef PIC
    default rel
 %endif
+; Done with PIC macros

 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
 %ifndef __NASM_VER__
@@ -528,6 +598,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,elf64
        global %1:function hidden
+    %elifidn __OUTPUT_FORMAT__,macho32
+        global %1:private_extern
+    %elifidn __OUTPUT_FORMAT__,macho64
+        global %1:private_extern
    %else
        global %1
    %endif
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -173,7 +173,6 @@ void vp8_create_common(VP8_COMMON *oci)
    oci->use_bilinear_mc_filter = 0;
    oci->full_pixel = 0;
    oci->multi_token_partition = ONE_PARTITION;
-    oci->clr_type = REG_YUV;
    oci->clamp_type = RECON_CLAMP_REQUIRED;

    /* Initialize reference frame sign bias structure to defaults */
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -41,7 +41,8 @@ extern "C"
    {
        USAGE_STREAM_FROM_SERVER    = 0x0,
        USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
-        USAGE_CONSTRAINED_QUALITY   = 0x2
+        USAGE_CONSTRAINED_QUALITY   = 0x2,
+        USAGE_CONSTANT_QUALITY      = 0x3
    } END_USAGE;


--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -72,7 +72,6 @@ typedef struct VP8Common
    int horiz_scale;
    int vert_scale;

-    YUV_TYPE clr_type;
    CLAMP_TYPE  clamp_type;

    YV12_BUFFER_CONFIG *frame_to_show;
@@ -115,9 +114,6 @@ typedef struct VP8Common
    int uvdc_delta_q;
    int uvac_delta_q;

-    unsigned int frames_since_golden;
-    unsigned int frames_till_alt_ref_frame;
-
    /* We allocate a MODE_INFO struct for each macroblock, together with
       an extra row on top and column on the left to simplify prediction. */

@@ -157,7 +153,6 @@ typedef struct VP8Common

    unsigned int current_video_frame;

-    int near_boffset[3];
    int version;

    TOKEN_PARTITION multi_token_partition;
@@ -165,8 +160,10 @@ typedef struct VP8Common
 #ifdef PACKET_TESTING
    VP8_HEADER oh;
 #endif
+#if CONFIG_POSTPROC_VISUALIZER
    double bitrate;
    double framerate;
+#endif

 #if CONFIG_MULTITHREAD
    int processor_core_count;
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -923,7 +923,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
    if (flags & VP8D_DEBUG_TXT_RATE_INFO)
    {
        char message[512];
-        sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
+        sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate);
        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
    }

--- a/vp8/common/vp8_asm_com_offsets.c
+++ b/vp8/common/vp8_asm_com_offsets.c
@@ -1,52 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-#include "vp8/common/blockd.h"
-
-#if CONFIG_POSTPROC
-#include "postproc.h"
-#endif /* CONFIG_POSTPROC */
-
-BEGIN
-
-#if CONFIG_POSTPROC
-/* mfqe.c / filter_by_weight */
-DEFINE(MFQE_PRECISION_VAL,                      MFQE_PRECISION);
-#endif /* CONFIG_POSTPROC */
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
-
-#if HAVE_MEDIA
-/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */
-ct_assert(B_DC_PRED, B_DC_PRED == 0);
-ct_assert(B_TM_PRED, B_TM_PRED == 1);
-ct_assert(B_VE_PRED, B_VE_PRED == 2);
-ct_assert(B_HE_PRED, B_HE_PRED == 3);
-ct_assert(B_LD_PRED, B_LD_PRED == 4);
-ct_assert(B_RD_PRED, B_RD_PRED == 5);
-ct_assert(B_VR_PRED, B_VR_PRED == 6);
-ct_assert(B_VL_PRED, B_VL_PRED == 7);
-ct_assert(B_HD_PRED, B_HD_PRED == 8);
-ct_assert(B_HU_PRED, B_HU_PRED == 9);
-#endif
-
-#if HAVE_SSE2
-#if CONFIG_POSTPROC
-/* vp8_filter_by_weight16x16 and 8x8 */
-ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
-#endif /* CONFIG_POSTPROC */
-#endif /* HAVE_SSE2 */
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -512,15 +512,15 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi)
                else
                {
                    mbmi->mode =  NEARMV;
-                    vp8_clamp_mv2(&near_mvs[CNT_NEAR], &pbi->mb);
                    mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
+                    vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
                }
            }
            else
            {
                mbmi->mode =  NEARESTMV;
-                vp8_clamp_mv2(&near_mvs[CNT_NEAREST], &pbi->mb);
                mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
+                vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
            }
        }
        else
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -1026,7 +1026,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        const unsigned char *clear = data;
        if (pbi->decrypt_cb)
        {
-            int n = data_end - data;
+            int n = (int)(data_end - data);
            if (n > 10) n = 10;
            pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
            clear = clear_buffer;
@@ -1095,7 +1095,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                           "Failed to allocate bool decoder 0");
    if (pc->frame_type == KEY_FRAME) {
-        pc->clr_type    = (YUV_TYPE)vp8_read_bit(bc);
+        (void)vp8_read_bit(bc);  // colorspace
        pc->clamp_type  = (CLAMP_TYPE)vp8_read_bit(bc);
    }

--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -430,7 +430,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_st
    *time_stamp = pbi->last_time_stamp;
    *time_end_stamp = 0;

-    sd->clrtype = pbi->common.clr_type;
 #if CONFIG_POSTPROC
    ret = vp8_post_proc_frame(&pbi->common, sd, flags);
 #else
--- a/vp8/decoder/vp8_asm_dec_offsets.c
+++ b/vp8/decoder/vp8_asm_dec_offsets.c
@@ -1,26 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "onyxd_int.h"
-
-BEGIN
-
-DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
-DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
-DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
-DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
-DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1322,7 +1322,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
        vp8_start_encode(bc, cx_data, cx_data_end);

        /* signal clr type */
-        vp8_write_bit(bc, pc->clr_type);
+        vp8_write_bit(bc, 0);
        vp8_write_bit(bc, pc->clamp_type);

    }
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -1325,7 +1325,7 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta
    return Q;
 }

-extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
+extern void vp8_new_framerate(VP8_COMP *cpi, double framerate);

 void vp8_init_second_pass(VP8_COMP *cpi)
 {
@@ -1349,9 +1349,9 @@ void vp8_init_second_pass(VP8_COMP *cpi)
     * sum duration is not. Its calculated based on the actual durations of
     * all frames from the first pass.
     */
-    vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
+    vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);

-    cpi->output_frame_rate = cpi->frame_rate;
+    cpi->output_framerate = cpi->framerate;
    cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
    cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);

@@ -2398,7 +2398,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    target_frame_size += cpi->min_frame_bandwidth;

    /* Every other frame gets a few extra bits */
-    if ( (cpi->common.frames_since_golden & 0x01) &&
+    if ( (cpi->frames_since_golden & 0x01) &&
         (cpi->frames_till_gf_update_due > 0) )
    {
        target_frame_size += cpi->twopass.alt_extra_bits;
@@ -2529,7 +2529,7 @@ void vp8_second_pass(VP8_COMP *cpi)

    /* Set nominal per second bandwidth for this frame */
    cpi->target_bandwidth = (int)
-    (cpi->per_frame_bandwidth * cpi->output_frame_rate);
+    (cpi->per_frame_bandwidth * cpi->output_framerate);
    if (cpi->target_bandwidth < 0)
        cpi->target_bandwidth = 0;

@@ -3185,7 +3185,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        /* Convert to a per second bitrate */
        cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
-                                      cpi->output_frame_rate);
+                                      cpi->output_framerate);
    }

    /* Note the total error score of the kf group minus the key frame itself */
@@ -3224,7 +3224,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        cpi->common.vert_scale = NORMAL;

        /* Calculate Average bits per frame. */
-        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
+        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate);

        /* CBR... Use the clip average as the target for deciding resample */
        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
@@ -3299,7 +3299,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        }
        else
        {
-            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
+            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate));
            int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;

            /* If triggered last time the threshold for triggering again is
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -301,11 +301,11 @@ static int rescale(int val, int num, int denom)
 static void init_temporal_layer_context(VP8_COMP *cpi,
                                        VP8_CONFIG *oxcf,
                                        const int layer,
-                                        double prev_layer_frame_rate)
+                                        double prev_layer_framerate)
 {
    LAYER_CONTEXT *lc = &cpi->layer_context[layer];

-    lc->frame_rate = cpi->output_frame_rate / cpi->oxcf.rate_decimator[layer];
+    lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
    lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;

    lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
@@ -335,7 +335,7 @@ static void init_temporal_layer_context(VP8_COMP *cpi,
      lc->avg_frame_size_for_layer =
          (int)((cpi->oxcf.target_bitrate[layer] -
                cpi->oxcf.target_bitrate[layer-1]) * 1000 /
-                (lc->frame_rate - prev_layer_frame_rate));
+                (lc->framerate - prev_layer_framerate));

     lc->active_worst_quality         = cpi->oxcf.worst_allowed_q;
     lc->active_best_quality          = cpi->oxcf.best_allowed_q;
@@ -363,7 +363,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
                                        const int prev_num_layers)
 {
    int i;
-    double prev_layer_frame_rate = 0;
+    double prev_layer_framerate = 0;
    const int curr_num_layers = cpi->oxcf.number_of_layers;
    // If the previous state was 1 layer, get current layer context from cpi.
    // We need this to set the layer context for the new layers below.
@@ -377,7 +377,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
        LAYER_CONTEXT *lc = &cpi->layer_context[i];
        if (i >= prev_num_layers)
        {
-           init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate);
+           init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
        }
        // The initial buffer levels are set based on their starting levels.
        // We could set the buffer levels based on the previous state (normalized
@@ -403,8 +403,8 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
            lc->bits_off_target = lc->buffer_level;
            restore_layer_context(cpi, 0);
        }
-        prev_layer_frame_rate =  cpi->output_frame_rate /
-                                 cpi->oxcf.rate_decimator[i];
+        prev_layer_framerate = cpi->output_framerate /
+                               cpi->oxcf.rate_decimator[i];
    }
 }

@@ -1282,21 +1282,21 @@ int vp8_reverse_trans(int x)

    return 63;
 }
-void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
+void vp8_new_framerate(VP8_COMP *cpi, double framerate)
 {
    if(framerate < .1)
        framerate = 30;

-    cpi->frame_rate             = framerate;
-    cpi->output_frame_rate      = framerate;
+    cpi->framerate              = framerate;
+    cpi->output_framerate       = framerate;
    cpi->per_frame_bandwidth    = (int)(cpi->oxcf.target_bandwidth /
-                                  cpi->output_frame_rate);
+                                  cpi->output_framerate);
    cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
    cpi->min_frame_bandwidth    = (int)(cpi->av_per_frame_bandwidth *
                                  cpi->oxcf.two_pass_vbrmin_section / 100);

    /* Set Maximum gf/arf interval */
-    cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
+    cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2);

    if(cpi->max_gf_interval < 12)
        cpi->max_gf_interval = 12;
@@ -1337,13 +1337,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
     * seems like a reasonable framerate, then use that as a guess, otherwise
     * use 30.
     */
-    cpi->frame_rate = (double)(oxcf->timebase.den) /
-                      (double)(oxcf->timebase.num);
+    cpi->framerate = (double)(oxcf->timebase.den) /
+                     (double)(oxcf->timebase.num);

-    if (cpi->frame_rate > 180)
-        cpi->frame_rate = 30;
+    if (cpi->framerate > 180)
+        cpi->framerate = 30;

-    cpi->ref_frame_rate = cpi->frame_rate;
+    cpi->ref_framerate = cpi->framerate;

    /* change includes all joint functionality */
    vp8_change_config(cpi, oxcf);
@@ -1369,13 +1369,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
    if (cpi->oxcf.number_of_layers > 1)
    {
        unsigned int i;
-        double prev_layer_frame_rate=0;
+        double prev_layer_framerate=0;

        for (i=0; i<cpi->oxcf.number_of_layers; i++)
        {
-            init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate);
-            prev_layer_frame_rate = cpi->output_frame_rate /
-                                    cpi->oxcf.rate_decimator[i];
+            init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+            prev_layer_framerate = cpi->output_framerate /
+                                   cpi->oxcf.rate_decimator[i];
        }
    }

@@ -1399,14 +1399,14 @@ static void update_layer_contexts (VP8_COMP *cpi)
    if (oxcf->number_of_layers > 1)
    {
        unsigned int i;
-        double prev_layer_frame_rate=0;
+        double prev_layer_framerate=0;

        for (i=0; i<oxcf->number_of_layers; i++)
        {
            LAYER_CONTEXT *lc = &cpi->layer_context[i];

-            lc->frame_rate =
-                cpi->ref_frame_rate / oxcf->rate_decimator[i];
+            lc->framerate =
+                cpi->ref_framerate / oxcf->rate_decimator[i];
            lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;

            lc->starting_buffer_level = rescale(
@@ -1432,9 +1432,9 @@ static void update_layer_contexts (VP8_COMP *cpi)
                lc->avg_frame_size_for_layer =
                   (int)((oxcf->target_bitrate[i] -
                          oxcf->target_bitrate[i-1]) * 1000 /
-                          (lc->frame_rate - prev_layer_frame_rate));
+                          (lc->framerate - prev_layer_framerate));

-            prev_layer_frame_rate = lc->frame_rate;
+            prev_layer_framerate = lc->framerate;
        }
    }
 }
@@ -1625,7 +1625,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
                    cpi->oxcf.target_bandwidth, 1000);

    /* Set up frame rate and related parameters rate control values. */
-    vp8_new_frame_rate(cpi, cpi->frame_rate);
+    vp8_new_framerate(cpi, cpi->framerate);

    /* Set absolute upper and lower quality limits */
    cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
@@ -1945,7 +1945,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)

    for (i = 0; i < KEY_FRAME_CONTEXT; i++)
    {
-        cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+        cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate;
    }

 #ifdef OUTPUT_YUV_SRC
@@ -2273,7 +2273,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
        {
            extern int count_mb_seg[4];
            FILE *f = fopen("modes.stt", "a");
-            double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+            double dr = (double)cpi->framerate * (double)bytes * (double)8 / (double)count / (double)1000 ;
            fprintf(f, "intra_mode in Intra Frames:\n");
            fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
            fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
@@ -2750,7 +2750,7 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi)
    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;

    /* this frame refreshes means next frames don't unless specified by user */
-    cpi->common.frames_since_golden = 0;
+    cpi->frames_since_golden = 0;

    /* Clear the alternate reference update pending flag. */
    cpi->source_alt_ref_pending = 0;
@@ -2802,7 +2802,7 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
         * user
         */
        cm->refresh_golden_frame = 0;
-        cpi->common.frames_since_golden = 0;
+        cpi->frames_since_golden = 0;

        cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
        cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
@@ -2834,12 +2834,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
        if (cpi->frames_till_gf_update_due > 0)
            cpi->frames_till_gf_update_due--;

-        if (cpi->common.frames_till_alt_ref_frame)
-            cpi->common.frames_till_alt_ref_frame --;
+        if (cpi->frames_till_alt_ref_frame)
+            cpi->frames_till_alt_ref_frame --;

-        cpi->common.frames_since_golden ++;
+        cpi->frames_since_golden ++;

-        if (cpi->common.frames_since_golden > 1)
+        if (cpi->frames_since_golden > 1)
        {
            cpi->recent_ref_frame_usage[INTRA_FRAME] +=
                cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME];
@@ -2890,11 +2890,11 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
            cpi->prob_last_coded = 200;
            cpi->prob_gf_coded = 1;
        }
-        else if (cpi->common.frames_since_golden == 0)
+        else if (cpi->frames_since_golden == 0)
        {
            cpi->prob_last_coded = 214;
        }
-        else if (cpi->common.frames_since_golden == 1)
+        else if (cpi->frames_since_golden == 1)
        {
            cpi->prob_last_coded = 192;
            cpi->prob_gf_coded = 220;
@@ -3368,12 +3368,12 @@ static void encode_frame_to_data_rate
            cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
            /* per second target bitrate */
            cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
-                                          cpi->output_frame_rate);
+                                          cpi->output_framerate);
        }
    }
    else
 #endif
-        cpi->per_frame_bandwidth  = (int)(cpi->target_bandwidth / cpi->output_frame_rate);
+        cpi->per_frame_bandwidth  = (int)(cpi->target_bandwidth / cpi->output_framerate);

    /* Default turn off buffer to buffer copying */
    cm->copy_buffer_to_gf = 0;
@@ -4557,7 +4557,7 @@ static void encode_frame_to_data_rate
        {
            LAYER_CONTEXT *lc = &cpi->layer_context[i];
            int bits_off_for_this_layer =
-               (int)(lc->target_bandwidth / lc->frame_rate -
+               (int)(lc->target_bandwidth / lc->framerate -
                     cpi->projected_frame_size);

            lc->bits_off_target += bits_off_for_this_layer;
@@ -4805,7 +4805,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
    {
        double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
            *cpi->oxcf.two_pass_vbrmin_section / 100);
-        cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate);
+        cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->framerate);
    }
 }
 #endif
@@ -4821,8 +4821,10 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
 {
 #if HAVE_NEON
    int64_t store_reg[8];
-#endif
+#if CONFIG_RUNTIME_CPU_DETECT
    VP8_COMMON            *cm = &cpi->common;
+#endif
+#endif
    struct vpx_usec_timer  timer;
    int                    res = 0;

@@ -4848,7 +4850,6 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
    if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
                          frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL))
        res = -1;
-    cm->clr_type = sd->clrtype;
    vpx_usec_timer_mark(&timer);
    cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);

@@ -4933,7 +4934,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                                              cpi->frames_till_gf_update_due);
                force_src_buffer = &cpi->alt_ref_buffer;
            }
-            cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+            cpi->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
            cm->refresh_alt_ref_frame = 1;
            cm->refresh_golden_frame = 0;
            cm->refresh_last_frame = 0;
@@ -5038,7 +5039,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
        if (this_duration)
        {
            if (step)
-                cpi->ref_frame_rate = 10000000.0 / this_duration;
+                cpi->ref_framerate = 10000000.0 / this_duration;
            else
            {
                double avg_duration, interval;
@@ -5052,11 +5053,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                if(interval > 10000000.0)
                    interval = 10000000;

-                avg_duration = 10000000.0 / cpi->ref_frame_rate;
+                avg_duration = 10000000.0 / cpi->ref_framerate;
                avg_duration *= (interval - avg_duration + this_duration);
                avg_duration /= interval;

-                cpi->ref_frame_rate = 10000000.0 / avg_duration;
+                cpi->ref_framerate = 10000000.0 / avg_duration;
            }

            if (cpi->oxcf.number_of_layers > 1)
@@ -5067,12 +5068,12 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                for (i=0; i<cpi->oxcf.number_of_layers; i++)
                {
                    LAYER_CONTEXT *lc = &cpi->layer_context[i];
-                    lc->frame_rate = cpi->ref_frame_rate /
-                                  cpi->oxcf.rate_decimator[i];
+                    lc->framerate = cpi->ref_framerate /
+                                    cpi->oxcf.rate_decimator[i];
                }
            }
            else
-                vp8_new_frame_rate(cpi, cpi->ref_frame_rate);
+                vp8_new_framerate(cpi, cpi->ref_framerate);
        }

        cpi->last_time_stamp_seen = cpi->source->ts_start;
@@ -5089,7 +5090,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
        layer = cpi->oxcf.layer_id[
                cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
        restore_layer_context (cpi, layer);
-        vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate);
+        vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
    }

    if (cpi->compressor_speed == 2)
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -232,7 +232,7 @@ enum
 typedef struct
 {
    /* Layer configuration */
-    double frame_rate;
+    double framerate;
    int target_bandwidth;

    /* Layer specific coding parameters */
@@ -320,6 +320,7 @@ typedef struct VP8_COMP
    YV12_BUFFER_CONFIG scaled_source;
    YV12_BUFFER_CONFIG *last_frame_unscaled_source;

+    unsigned int frames_till_alt_ref_frame;
    /* frame in src_buffers has been identified to be encoded as an alt ref */
    int source_alt_ref_pending;
    /* an alt ref frame has been encoded and is usable */
@@ -369,6 +370,7 @@ typedef struct VP8_COMP
    double key_frame_rate_correction_factor;
    double gf_rate_correction_factor;

+    unsigned int frames_since_golden;
    /* Count down till next GF */
    int frames_till_gf_update_due;

@@ -401,7 +403,7 @@ typedef struct VP8_COMP
    /* Minimum allocation that should be used for any frame */
    int min_frame_bandwidth;
    int inter_frame_target;
-    double output_frame_rate;
+    double output_framerate;
    int64_t last_time_stamp_seen;
    int64_t last_end_time_stamp_seen;
    int64_t first_time_stamp_ever;
@@ -415,8 +417,8 @@ typedef struct VP8_COMP

    int buffered_mode;

-    double frame_rate;
-    double ref_frame_rate;
+    double framerate;
+    double ref_framerate;
    int64_t buffer_level;
    int64_t bits_off_target;

--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -313,7 +313,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    /* Get baseline error score */

    /* Copy the unfiltered / processed recon buffer to the new buffer */
-    vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+    vpx_yv12_copy_y(saved_frame, cm->frame_to_show);

    vp8cx_set_alt_lf_level(cpi, filt_mid);
    vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
@@ -339,7 +339,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
            if(ss_err[filt_low] == 0)
            {
                /* Get Low filter error score */
-                vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
                vp8cx_set_alt_lf_level(cpi, filt_low);
                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);

@@ -367,7 +367,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
        {
            if(ss_err[filt_high] == 0)
            {
-                vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
                vp8cx_set_alt_lf_level(cpi, filt_high);
                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);

--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -234,7 +234,7 @@ void vp8_save_coding_context(VP8_COMP *cpi)
    cc->frames_since_key          = cpi->frames_since_key;
    cc->filter_level             = cpi->common.filter_level;
    cc->frames_till_gf_update_due   = cpi->frames_till_gf_update_due;
-    cc->frames_since_golden       = cpi->common.frames_since_golden;
+    cc->frames_since_golden       = cpi->frames_since_golden;

    vp8_copy(cc->mvc,      cpi->common.fc.mvc);
    vp8_copy(cc->mvcosts,  cpi->rd_costs.mvcosts);
@@ -271,7 +271,7 @@ void vp8_restore_coding_context(VP8_COMP *cpi)
    cpi->frames_since_key         =   cc->frames_since_key;
    cpi->common.filter_level     =   cc->filter_level;
    cpi->frames_till_gf_update_due  =   cc->frames_till_gf_update_due;
-    cpi->common.frames_since_golden       =   cc->frames_since_golden;
+    cpi->frames_since_golden       =   cc->frames_since_golden;

    vp8_copy(cpi->common.fc.mvc, cc->mvc);

@@ -388,7 +388,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
        int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
        /* Boost depends somewhat on frame rate: only used for 1 layer case. */
        if (cpi->oxcf.number_of_layers == 1) {
-          kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16));
+          kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
        }
        else {
          /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
@@ -399,9 +399,9 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
        kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;

        /* frame separation adjustment ( down) */
-        if (cpi->frames_since_key  < cpi->output_frame_rate / 2)
+        if (cpi->frames_since_key  < cpi->output_framerate / 2)
            kf_boost = (int)(kf_boost
-                       * cpi->frames_since_key / (cpi->output_frame_rate / 2));
+                       * cpi->frames_since_key / (cpi->output_framerate / 2));

        /* Minimal target size is |2* per_frame_bandwidth|. */
        if (kf_boost < 16)
@@ -715,7 +715,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
                if (Adjustment > (cpi->this_frame_target - min_frame_target))
                    Adjustment = (cpi->this_frame_target - min_frame_target);

-                if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1))
+                if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1))
                    cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
                else
                    cpi->this_frame_target -= Adjustment;
@@ -1360,7 +1360,7 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi)
         * whichever is smaller.
         */
        int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
-        av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2;
+        av_key_frame_frequency = 1 + (int)cpi->output_framerate * 2;

        if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
            av_key_frame_frequency = key_freq;
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -341,7 +341,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue)

 void vp8_auto_select_speed(VP8_COMP *cpi)
 {
-    int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);
+    int milliseconds_for_compress = (int)(1000000 / cpi->framerate);

    milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;

--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -66,7 +66,6 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
 VP8_COMMON_SRCS-yes += common/variance_c.c
 VP8_COMMON_SRCS-yes += common/variance.h
-VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c
 VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h


@@ -192,7 +191,4 @@ VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance8x8_neon$(A
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)

-$(eval $(call asm_offsets_template,\
-         vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c))
-
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -153,7 +153,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 #else
    RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
 #endif
-    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
+    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
    RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
    RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
@@ -204,7 +204,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
    RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
    RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
    RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
-    if(finalize && cfg->rc_end_usage == VPX_CQ)
+    if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
        RANGE_CHECK(vp8_cfg, cq_level,
                    cfg->rc_min_quantizer, cfg->rc_max_quantizer);

@@ -327,17 +327,14 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    oxcf->resample_up_water_mark   = cfg.rc_resize_up_thresh;
    oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;

-    if (cfg.rc_end_usage == VPX_VBR)
-    {
-        oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
-    }
-    else if (cfg.rc_end_usage == VPX_CBR)
-    {
-        oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
-    }
-    else if (cfg.rc_end_usage == VPX_CQ)
-    {
-        oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+    if (cfg.rc_end_usage == VPX_VBR) {
+      oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+    } else if (cfg.rc_end_usage == VPX_CBR) {
+      oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+    } else if (cfg.rc_end_usage == VPX_CQ) {
+      oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+    } else if (cfg.rc_end_usage == VPX_Q) {
+      oxcf->end_usage = USAGE_CONSTANT_QUALITY;
    }

    oxcf->target_bandwidth         = cfg.rc_target_bitrate;
@@ -695,7 +692,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
    yv12->uv_stride = img->stride[VPX_PLANE_U];

    yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-    yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
    return res;
 }

@@ -1079,11 +1075,7 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
        ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
        ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;

-        if (sd.clrtype == REG_YUV)
-            ctx->preview_img.fmt = VPX_IMG_FMT_I420;
-        else
-            ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
+        ctx->preview_img.fmt = VPX_IMG_FMT_I420;
        ctx->preview_img.x_chroma_shift = 1;
        ctx->preview_img.y_chroma_shift = 1;

@@ -1277,7 +1269,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
        1,                  /* g_delete_first_pass_file */
        "vp8.fpf"           /* first pass filename */
 #endif
-
+        VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
        1,                  /* ts_number_layers */
        {0},                /* ts_target_bitrate */
        {0},                /* ts_rate_decimator */
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -41,15 +41,6 @@ typedef enum

 static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);

-typedef struct
-{
-    unsigned int   id;
-    unsigned long  sz;
-    unsigned int   align;
-    unsigned int   flags;
-    unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
-} mem_req_t;
-
 static const mem_req_t vp8_mem_req_segs[] =
 {
    {VP8_SEG_ALG_PRIV,    0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
@@ -93,65 +84,6 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_
    return sizeof(vpx_codec_alg_priv_t);
 }

-
-static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap)
-{
-    free(mmap->priv);
-}
-
-static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap)
-{
-    vpx_codec_err_t  res;
-    unsigned int   align;
-
-    align = mmap->align ? mmap->align - 1 : 0;
-
-    if (mmap->flags & VPX_CODEC_MEM_ZERO)
-        mmap->priv = calloc(1, mmap->sz + align);
-    else
-        mmap->priv = malloc(mmap->sz + align);
-
-    res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
-    mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
-    mmap->dtor = vp8_mmap_dtor;
-    return res;
-}
-
-static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
-        const vpx_codec_mmap_t        *mmaps,
-        vpx_codec_flags_t              init_flags)
-{
-    int i;
-    vpx_codec_err_t res = VPX_CODEC_OK;
-
-    for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++)
-    {
-        /* Ensure the segment has been allocated */
-        if (!mmaps[i].base)
-        {
-            res = VPX_CODEC_MEM_ERROR;
-            break;
-        }
-
-        /* Verify variable size segment is big enough for the current si. */
-        if (vp8_mem_req_segs[i].calc_sz)
-        {
-            vpx_codec_dec_cfg_t cfg;
-
-            cfg.w = si->w;
-            cfg.h = si->h;
-
-            if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags))
-            {
-                res = VPX_CODEC_MEM_ERROR;
-                break;
-            }
-        }
-    }
-
-    return res;
-}
-
 static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
 {
    int i;
@@ -178,16 +110,6 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
    }
 }

-static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id)
-{
-    int i;
-
-    for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
-        if (ctx->mmaps[i].id == id)
-            return ctx->mmaps[i].base;
-
-    return NULL;
-}
 static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
 {
    /* nothing to clean up */
@@ -214,7 +136,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
        mmap.align = vp8_mem_req_segs[0].align;
        mmap.flags = vp8_mem_req_segs[0].flags;

-        res = vp8_mmap_alloc(&mmap);
+        res = vpx_mmap_alloc(&mmap);
        if (res != VPX_CODEC_OK) return res;

        vp8_init_ctx(ctx, &mmap);
@@ -366,8 +288,7 @@ static void yuvconfig2image(vpx_image_t               *img,
      * the Y, U, and V planes, nor other alignment adjustments that
      * might be representable by a YV12_BUFFER_CONFIG, so we just
      * initialize all the fields.*/
-    img->fmt = yv12->clrtype == REG_YUV ?
-        VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+    img->fmt = VPX_IMG_FMT_I420;
    img->w = yv12->y_stride;
    img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
    img->d_w = yv12->y_width;
@@ -488,7 +409,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
                ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
                                   ctx->base.init_flags);

-            res = vp8_mmap_alloc(&ctx->mmaps[i]);
+            res = vpx_mmap_alloc(&ctx->mmaps[i]);
        }

        if (!res)
@@ -500,7 +421,9 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
    /* Initialize the decoder instance on the first frame*/
    if (!res && !ctx->decoder_init)
    {
-        res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
+        res = vpx_validate_mmaps(&ctx->si, ctx->mmaps,
+                                 vp8_mem_req_segs, NELEMENTS(vp8_mem_req_segs),
+                                 ctx->base.init_flags);

        if (!res)
        {
@@ -797,8 +720,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
    yv12->uv_stride = img->stride[VPX_PLANE_U];

    yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
-    yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
-
    return res;
 }

--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -35,9 +35,5 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h
 VP8_DX_SRCS-yes += decoder/treereader.h
 VP8_DX_SRCS-yes += decoder/onyxd_if.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
-VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c

 VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
-
-$(eval $(call asm_offsets_template,\
-         vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c))
--- a/vp9/common/arm/neon/vp9_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_avg_neon.asm
@@ -0,0 +1,116 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_convolve_avg_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_convolve_avg_neon| PROC
+    push                {r4-r6, lr}
+    ldrd                r4, r5, [sp, #32]
+    mov                 r6, r2
+
+    cmp                 r4, #32
+    bgt                 avg64
+    beq                 avg32
+    cmp                 r4, #8
+    bgt                 avg16
+    beq                 avg8
+    b                   avg4
+
+avg64
+    sub                 lr, r1, #32
+    sub                 r4, r3, #32
+avg64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    pld                 [r2, r3]
+    vld1.8              {q8-q9},   [r6@128]!
+    vld1.8              {q10-q11}, [r6@128], r4
+    vrhadd.u8           q0, q0, q8
+    vrhadd.u8           q1, q1, q9
+    vrhadd.u8           q2, q2, q10
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r4
+    subs                r5, r5, #1
+    bgt                 avg64_h
+    pop                 {r4-r6, pc}
+
+avg32
+    vld1.8              {q0-q1}, [r0], r1
+    vld1.8              {q2-q3}, [r0], r1
+    vld1.8              {q8-q9},   [r6@128], r3
+    vld1.8              {q10-q11}, [r6@128], r3
+    pld                 [r0]
+    vrhadd.u8           q0, q0, q8
+    pld                 [r0, r1]
+    vrhadd.u8           q1, q1, q9
+    pld                 [r6]
+    vrhadd.u8           q2, q2, q10
+    pld                 [r6, r3]
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 avg32
+    pop                 {r4-r6, pc}
+
+avg16
+    vld1.8              {q0}, [r0], r1
+    vld1.8              {q1}, [r0], r1
+    vld1.8              {q2}, [r6@128], r3
+    vld1.8              {q3}, [r6@128], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q2
+    pld                 [r6]
+    pld                 [r6, r3]
+    vrhadd.u8           q1, q1, q3
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 avg16
+    pop                 {r4-r6, pc}
+
+avg8
+    vld1.8              {d0}, [r0], r1
+    vld1.8              {d1}, [r0], r1
+    vld1.8              {d2}, [r6@64], r3
+    vld1.8              {d3}, [r6@64], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q1
+    pld                 [r6]
+    pld                 [r6, r3]
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d1}, [r2@64], r3
+    subs                r5, r5, #2
+    bgt                 avg8
+    pop                 {r4-r6, pc}
+
+avg4
+    vld1.32             {d0[0]}, [r0], r1
+    vld1.32             {d0[1]}, [r0], r1
+    vld1.32             {d2[0]}, [r6@32], r3
+    vld1.32             {d2[1]}, [r6@32], r3
+    vrhadd.u8           d0, d0, d2
+    vst1.32             {d0[0]}, [r2@32], r3
+    vst1.32             {d0[1]}, [r2@32], r3
+    subs                r5, r5, #2
+    bgt                 avg4
+    pop                 {r4-r6, pc}
+    ENDP
+
+    END
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -0,0 +1,302 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ; These functions are only valid when:
+    ; x_step_q4 == 16
+    ; w%4 == 0
+    ; h%4 == 0
+    ; taps == 8
+    ; VP9_FILTER_WEIGHT == 128
+    ; VP9_FILTER_SHIFT == 7
+
+    EXPORT  |vp9_convolve8_avg_horiz_neon|
+    EXPORT  |vp9_convolve8_avg_vert_neon|
+    IMPORT  |vp9_convolve8_avg_horiz_c|
+    IMPORT  |vp9_convolve8_avg_vert_c|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Multiply and accumulate by q0
+    MACRO
+    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+    vmull.s16 $dst, $src0, d0[0]
+    vmlal.s16 $dst, $src1, d0[1]
+    vmlal.s16 $dst, $src2, d0[2]
+    vmlal.s16 $dst, $src3, d0[3]
+    vmlal.s16 $dst, $src4, d1[0]
+    vmlal.s16 $dst, $src5, d1[1]
+    vmlal.s16 $dst, $src6, d1[2]
+    vmlal.s16 $dst, $src7, d1[3]
+    MEND
+
+; r0    const uint8_t *src
+; r1    int src_stride
+; r2    uint8_t *dst
+; r3    int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4           ; unused
+; sp[]int w
+; sp[]int h
+
+|vp9_convolve8_avg_horiz_neon| PROC
+    ldr             r12, [sp, #4]           ; x_step_q4
+    cmp             r12, #16
+    bne             vp9_convolve8_avg_horiz_c
+
+    push            {r4-r10, lr}
+
+    sub             r0, r0, #3              ; adjust for taps
+
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h
+
+    vld1.s16        {q0}, [r5]              ; filter_x
+
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7
+    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
+
+    mov             r10, r6                 ; w loop counter
+
+loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8
+
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+loop_horiz
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
+
+    ; slightly out of order load to match the existing data
+    vld1.u32        {d6[0]}, [r2], r3
+    vld1.u32        {d7[0]}, [r2], r3
+    vld1.u32        {d6[1]}, [r2], r3
+    vld1.u32        {d7[1]}, [r2], r3
+
+    sub             r2, r2, r3, lsl #2      ; reset for store
+
+    ; src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    ; average the new value and the dst value
+    vrhadd.u8       q1, q1, q3
+
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_horiz
+
+    ; outer loop
+    mov             r6, r10                 ; restore w counter
+    add             r0, r0, r9              ; src += src_stride * 4 - w
+    add             r2, r2, r12             ; dst += dst_stride * 4 - w
+    subs            r7, r7, #4              ; h -= 4
+    bgt loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+    ENDP
+
+|vp9_convolve8_avg_vert_neon| PROC
+    ldr             r12, [sp, #12]
+    cmp             r12, #16
+    bne             vp9_convolve8_avg_vert_c
+
+    push            {r4-r8, lr}
+
+    ; adjust for taps
+    sub             r0, r0, r1
+    sub             r0, r0, r1, lsl #1
+
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h
+
+    vld1.s16        {q0}, [r4]              ; filter_y
+
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1
+
+loop_vert_h
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 ; h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+
+    vmovl.u8        q8, d16
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    vld1.u32        {d6[0]}, [r5@32], r3
+    vld1.u32        {d6[1]}, [r8@32], r3
+    vld1.u32        {d7[0]}, [r5@32], r3
+    vld1.u32        {d7[1]}, [r8@32], r3
+
+    pld             [r7]
+    pld             [r4]
+
+    ; src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; average the new value and the dst value
+    vrhadd.u8       q1, q1, q3
+
+    sub             r5, r5, r3, lsl #1      ; reset for store
+    sub             r8, r8, r3, lsl #1
+
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
+    bgt             loop_vert
+
+    ; outer loop
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_vert_h
+
+    pop             {r4-r8, pc}
+
+    ENDP
+    END
--- a/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_neon.asm
@@ -0,0 +1,280 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ; These functions are only valid when:
+    ; x_step_q4 == 16
+    ; w%4 == 0
+    ; h%4 == 0
+    ; taps == 8
+    ; VP9_FILTER_WEIGHT == 128
+    ; VP9_FILTER_SHIFT == 7
+
+    EXPORT  |vp9_convolve8_horiz_neon|
+    EXPORT  |vp9_convolve8_vert_neon|
+    IMPORT  |vp9_convolve8_horiz_c|
+    IMPORT  |vp9_convolve8_vert_c|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Multiply and accumulate by q0
+    MACRO
+    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+    vmull.s16 $dst, $src0, d0[0]
+    vmlal.s16 $dst, $src1, d0[1]
+    vmlal.s16 $dst, $src2, d0[2]
+    vmlal.s16 $dst, $src3, d0[3]
+    vmlal.s16 $dst, $src4, d1[0]
+    vmlal.s16 $dst, $src5, d1[1]
+    vmlal.s16 $dst, $src6, d1[2]
+    vmlal.s16 $dst, $src7, d1[3]
+    MEND
+
+; r0    const uint8_t *src
+; r1    int src_stride
+; r2    uint8_t *dst
+; r3    int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4           ; unused
+; sp[]int w
+; sp[]int h
+
+|vp9_convolve8_horiz_neon| PROC
+    ldr             r12, [sp, #4]           ; x_step_q4
+    cmp             r12, #16
+    bne             vp9_convolve8_horiz_c
+
+    push            {r4-r10, lr}
+
+    sub             r0, r0, #3              ; adjust for taps
+
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h
+
+    vld1.s16        {q0}, [r5]              ; filter_x
+
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7
+    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
+
+    mov             r10, r6                 ; w loop counter
+
+loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8
+
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+loop_horiz
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
+
+    ; src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_horiz
+
+    ; outer loop
+    mov             r6, r10                 ; restore w counter
+    add             r0, r0, r9              ; src += src_stride * 4 - w
+    add             r2, r2, r12             ; dst += dst_stride * 4 - w
+    subs            r7, r7, #4              ; h -= 4
+    bgt loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+    ENDP
+
+|vp9_convolve8_vert_neon| PROC
+    ldr             r12, [sp, #12]
+    cmp             r12, #16
+    bne             vp9_convolve8_vert_c
+
+    push            {r4-r8, lr}
+
+    ; adjust for taps
+    sub             r0, r0, r1
+    sub             r0, r0, r1, lsl #1
+
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h
+
+    vld1.s16        {q0}, [r4]              ; filter_y
+
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1
+
+loop_vert_h
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 ; h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+
+    vmovl.u8        q8, d16
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    ; src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r7]
+    pld             [r4]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
+    bgt             loop_vert
+
+    ; outer loop
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_vert_h
+
+    pop             {r4-r8, pc}
+
+    ENDP
+    END
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -0,0 +1,78 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx_ports/mem.h"
+
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
+   */
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+
+  // Account for the vertical phase needing 3 lines prior and 4 lines post
+  int intermediate_height = h + 7;
+
+  if (x_step_q4 != 16 || y_step_q4 != 16)
+    return vp9_convolve8_c(src, src_stride,
+                           dst, dst_stride,
+                           filter_x, x_step_q4,
+                           filter_y, y_step_q4,
+                           w, h);
+
+  /* Filter starting 3 lines back. The neon implementation will ignore the
+   * given height and filter a multiple of 4 lines. Since this goes in to
+   * the temp buffer which has lots of extra room and is subsequently discarded
+   * this is safe if somewhat less than ideal.
+   */
+  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                           temp, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data */
+  vp9_convolve8_vert_neon(temp + 64 * 3, 64,
+                          dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+}
+
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+  int intermediate_height = h + 7;
+
+  if (x_step_q4 != 16 || y_step_q4 != 16)
+    return vp9_convolve8_avg_c(src, src_stride,
+                               dst, dst_stride,
+                               filter_x, x_step_q4,
+                               filter_y, y_step_q4,
+                               w, h);
+
+  /* This implementation has the same issues as above. In addition, we only want
+   * to average the values after both passes.
+   */
+  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                           temp, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, intermediate_height);
+  vp9_convolve8_avg_vert_neon(temp + 64 * 3,
+                              64, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+}
--- a/vp9/common/arm/neon/vp9_copy_neon.asm
+++ b/vp9/common/arm/neon/vp9_copy_neon.asm
@@ -0,0 +1,84 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_convolve_copy_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_convolve_copy_neon| PROC
+    push                {r4-r5, lr}
+    ldrd                r4, r5, [sp, #28]
+
+    cmp                 r4, #32
+    bgt                 copy64
+    beq                 copy32
+    cmp                 r4, #8
+    bgt                 copy16
+    beq                 copy8
+    b                   copy4
+
+copy64
+    sub                 lr, r1, #32
+    sub                 r3, r3, #32
+copy64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #1
+    bgt                 copy64_h
+    pop                 {r4-r5, pc}
+
+copy32
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q2-q3}, [r0], r1
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 copy32
+    pop                 {r4-r5, pc}
+
+copy16
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q1}, [r0], r1
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 copy16
+    pop                 {r4-r5, pc}
+
+copy8
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d2}, [r0], r1
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d2}, [r2@64], r3
+    subs                r5, r5, #2
+    bgt                 copy8
+    pop                 {r4-r5, pc}
+
+copy4
+    ldr                 r12, [r0], r1
+    str                 r12, [r2], r3
+    subs                r5, r5, #1
+    bgt                 copy4
+    pop                 {r4-r5, pc}
+    ENDP
+
+    END
--- a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
@@ -0,0 +1,69 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_dc_only_idct_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
+;                            uint8_t *dst_ptr, int pitch, int stride)
+;
+; r0  int input_dc
+; r1  uint8_t *pred_ptr
+; r2  uint8_t *dst_ptr
+; r3  int pitch
+; sp  int stride
+
+|vp9_dc_only_idct_add_neon| PROC
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    mul              r0, r0, r12               ; input_dc * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; ROUND_POWER_OF_TWO(out, 4)
+    add              r0, r0, #8                ; + (1 <<((4) - 1))
+    asr              r0, r0, #4                ; >> 4
+
+    vdup.16         q0, r0;                   ; duplicate a1
+    ldr              r12, [sp]                 ; load stride
+
+    vld1.32         {d2[0]}, [r1], r3
+    vld1.32         {d2[1]}, [r1], r3
+    vld1.32         {d4[0]}, [r1], r3
+    vld1.32         {d4[1]}, [r1]
+
+    vaddw.u8        q1, q0, d2                ; a1 + pred_ptr[c]
+    vaddw.u8        q2, q0, d4
+
+    vqmovun.s16     d2, q1                    ; clip_pixel
+    vqmovun.s16     d4, q2
+
+    vst1.32         {d2[0]}, [r2], r12
+    vst1.32         {d2[1]}, [r2], r12
+    vst1.32         {d4[0]}, [r2], r12
+    vst1.32         {d4[1]}, [r2]
+
+    bx               lr
+    ENDP             ; |vp9_dc_only_idct_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -0,0 +1,169 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
+                                               int16_t *output,
+                                               int output_stride);
+extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
+                                               int16_t *output,
+                                               int16_t *pass1Output,
+                                               int16_t skip_adding,
+                                               uint8_t *dest,
+                                               int dest_stride);
+extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
+                                               int16_t *output,
+                                               int output_stride);
+extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
+                                               int16_t *output,
+                                               int16_t *pass1Output,
+                                               int16_t skip_adding,
+                                               uint8_t *dest,
+                                               int dest_stride);
+extern void save_neon_registers();
+extern void restore_neon_registers();
+
+
+void vp9_short_idct16x16_add_neon(int16_t *input,
+                                  uint8_t *dest, int dest_stride) {
+  int16_t pass1_output[16*16] = {0};
+  int16_t row_idct_output[16*16] = {0};
+
+  // save d8-d15 register values.
+  save_neon_registers();
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vp9_short_idct16x16_add_neon_pass2(input+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     0,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the lower 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     0,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,
+                                     dest+8,
+                                     dest_stride);
+
+  // restore d8-d15 register values.
+  restore_neon_registers();
+
+  return;
+}
+
+void vp9_short_idct10_16x16_add_neon(int16_t *input,
+                                  uint8_t *dest, int dest_stride) {
+  int16_t pass1_output[16*16] = {0};
+  int16_t row_idct_output[16*16] = {0};
+
+  // save d8-d15 register values.
+  save_neon_registers();
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vp9_short_idct10_16x16_add_neon_pass2(input+1,
+                                        row_idct_output,
+                                        pass1_output,
+                                        0,
+                                        dest,
+                                        dest_stride);
+
+  /* Skip Parallel idct on the lower 8 rows as they are all 0s */
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,
+                                     dest+8,
+                                     dest_stride);
+
+  // restore d8-d15 register values.
+  restore_neon_registers();
+
+  return;
+}
--- a/vp9/common/arm/neon/vp9_idct32x32_neon.c
+++ b/vp9/common/arm/neon/vp9_idct32x32_neon.c
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common.h"
+
+// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
+                                           int16_t *output, int16_t *input);
+extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
+
+
+// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+extern void save_neon_registers();
+extern void restore_neon_registers();
+
+void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
+  // TODO(cd): move the creation of these buffers within the ASM file
+  // internal buffer used to transpose 8 lines into before transforming them
+  int16_t transpose_buffer[32 * 8];
+  // results of the first pass (transpose and transform rows)
+  int16_t pass1[32 * 32];
+  // results of the second pass (transpose and transform columns)
+  int16_t pass2[32 * 32];
+
+  // save register we need to preserve
+  save_neon_registers();
+  // process rows
+  idct32_transpose_and_transform(transpose_buffer, pass1, input);
+  // process columns
+  // TODO(cd): do these two steps/passes within the ASM file
+  idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
+  // combine and add to dest
+  // TODO(cd): integrate this within the last storage step of the second pass
+  idct32_combine_add(dest, pass2, dest_stride);
+  // restore register we need to preserve
+  restore_neon_registers();
+}
+
+// TODO(cd): Eliminate this file altogether when everything is in ASM file
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.asm
@@ -0,0 +1,708 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_loop_filter_horizontal_edge_neon|
+    EXPORT  |vp9_loop_filter_vertical_edge_neon|
+    EXPORT  |vp9_mbloop_filter_horizontal_edge_neon|
+    EXPORT  |vp9_mbloop_filter_vertical_edge_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s,
+;                                           int p /* pitch */,
+;                                           const uint8_t *blimit,
+;                                           const uint8_t *limit,
+;                                           const uint8_t *thresh,
+;                                           int count)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_loop_filter_horizontal_edge_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r12, [sp, #8]              ; load count
+    ldr         r2, [sp, #4]               ; load thresh
+    add         r1, r1, r1                 ; double pitch
+
+    cmp         r12, #0
+    beq         end_vp9_lf_h_edge
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+count_lf_h_loop
+    sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
+    add         r3, r2, r1, lsr #1         ; set to 3 lines down
+
+    vld1.u8     {d3}, [r2@64], r1          ; p3
+    vld1.u8     {d4}, [r3@64], r1          ; p2
+    vld1.u8     {d5}, [r2@64], r1          ; p1
+    vld1.u8     {d6}, [r3@64], r1          ; p0
+    vld1.u8     {d7}, [r2@64], r1          ; q0
+    vld1.u8     {d16}, [r3@64], r1         ; q1
+    vld1.u8     {d17}, [r2@64]             ; q2
+    vld1.u8     {d18}, [r3@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r3, r3, r1, lsl #1
+
+    bl          vp9_loop_filter_neon
+
+    vst1.u8     {d4}, [r2@64], r1          ; store op1
+    vst1.u8     {d5}, [r3@64], r1          ; store op0
+    vst1.u8     {d6}, [r2@64], r1          ; store oq0
+    vst1.u8     {d7}, [r3@64], r1          ; store oq1
+
+    add         r0, r0, #8
+    subs        r12, r12, #1
+    bne         count_lf_h_loop
+
+end_vp9_lf_h_edge
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_neon|
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_vertical_edge_neon(uint8_t *s,
+;                                         int p /* pitch */,
+;                                         const uint8_t *blimit,
+;                                         const uint8_t *limit,
+;                                         const uint8_t *thresh,
+;                                         int count)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_loop_filter_vertical_edge_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    ldr         r12, [sp, #8]             ; load count
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #4]              ; load thresh
+    sub         r2, r0, #4                ; move s pointer down by 4 columns
+    cmp         r12, #0
+    beq         end_vp9_lf_v_edge
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
+
+count_lf_v_loop
+    vld1.u8     {d3}, [r2], r1             ; load s data
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose to 8x16 matrix
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    bl          vp9_loop_filter_neon
+
+    sub         r0, r0, #2
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+    add         r0, r0, r1, lsl #3         ; s += pitch * 8
+    subs        r12, r12, #1
+    subne       r2, r0, #4                 ; move s pointer down by 4 columns
+    bne         count_lf_v_loop
+
+end_vp9_lf_v_edge
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d4    op1
+; d5    op0
+; d6    oq0
+; d7    oq1
+|vp9_loop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
+    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
+    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
+
+    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
+
+    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
+
+    vmov.u8     d18, #0x80
+
+    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
+
+    ; hevmask
+    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
+
+    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
+    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
+
+    veor        d7, d7, d18                 ; qs0
+
+    vcge.u8     d23, d1, d23                ; abs(m1) > limit
+
+    ; filter() function
+    ; convert to signed
+
+    vshr.u8     d28, d28, #1                ; a = a / 2
+    veor        d6, d6, d18                 ; ps0
+
+    veor        d5, d5, d18                 ; ps1
+    vqadd.u8    d17, d17, d28               ; a = b + a
+
+    veor        d16, d16, d18               ; qs1
+
+    vmov.u8     d19, #3
+
+    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
+
+    vcge.u8     d17, d0, d17                ; a > blimit
+
+    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
+    vorr        d22, d21, d22               ; hevmask
+
+    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
+
+    vand        d27, d27, d22               ; filter &= hev
+    vand        d23, d23, d17               ; filter_mask
+
+    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d17, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d27, q12
+
+    vand        d27, d27, d23               ; filter &= mask
+
+    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
+    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
+    vshr.s8     d28, d28, #3                ; filter2 >>= 3
+    vshr.s8     d27, d27, #3                ; filter1 >>= 3
+
+    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
+    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
+
+    veor        d6, d26, d18                ; *oq0 = u^0x80
+
+    vbic        d27, d27, d22               ; filter &= ~hev
+
+    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
+    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
+
+    veor        d5, d19, d18                ; *op0 = u^0x80
+    veor        d4, d21, d18                ; *op1 = u^0x80
+    veor        d7, d20, d18                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_neon|
+
+; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p,
+;                                             const uint8_t *blimit,
+;                                             const uint8_t *limit,
+;                                             const uint8_t *thresh,
+;                                             int count)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_mbloop_filter_horizontal_edge_neon| PROC
+    push        {r4-r5, lr}
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r12, [sp, #16]             ; load count
+    ldr         r2, [sp, #12]              ; load thresh
+    add         r1, r1, r1                 ; double pitch
+
+    cmp         r12, #0
+    beq         end_vp9_mblf_h_edge
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+count_mblf_h_loop
+    sub         r3, r0, r1, lsl #1         ; move src pointer down by 4 lines
+    add         r2, r3, r1, lsr #1         ; set to 3 lines down
+
+    vld1.u8     {d3}, [r3@64], r1          ; p3
+    vld1.u8     {d4}, [r2@64], r1          ; p2
+    vld1.u8     {d5}, [r3@64], r1          ; p1
+    vld1.u8     {d6}, [r2@64], r1          ; p0
+    vld1.u8     {d7}, [r3@64], r1          ; q0
+    vld1.u8     {d16}, [r2@64], r1         ; q1
+    vld1.u8     {d17}, [r3@64]             ; q2
+    vld1.u8     {d18}, [r2@64], r1         ; q3
+
+    sub         r3, r3, r1, lsl #1
+    sub         r2, r2, r1, lsl #2
+
+    bl          vp9_mbloop_filter_neon
+
+    vst1.u8     {d0}, [r2@64], r1          ; store op2
+    vst1.u8     {d1}, [r3@64], r1          ; store op1
+    vst1.u8     {d2}, [r2@64], r1          ; store op0
+    vst1.u8     {d3}, [r3@64], r1          ; store oq0
+    vst1.u8     {d4}, [r2@64], r1          ; store oq1
+    vst1.u8     {d5}, [r3@64], r1          ; store oq2
+
+    add         r0, r0, #8
+    subs        r12, r12, #1
+    bne         count_mblf_h_loop
+
+end_vp9_mblf_h_edge
+    pop         {r4-r5, pc}
+
+    ENDP        ; |vp9_mbloop_filter_horizontal_edge_neon|
+
+; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s,
+;                                           int pitch,
+;                                           const uint8_t *blimit,
+;                                           const uint8_t *limit,
+;                                           const uint8_t *thresh,
+;                                           int count)
+;
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_mbloop_filter_vertical_edge_neon| PROC
+    push        {r4-r5, lr}
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    ldr         r12, [sp, #16]            ; load count
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #12]             ; load thresh
+    sub         r2, r0, #4                ; move s pointer down by 4 columns
+    cmp         r12, #0
+    beq         end_vp9_mblf_v_edge
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
+
+count_mblf_v_loop
+    vld1.u8     {d3}, [r2], r1             ; load s data
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose to 8x16 matrix
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    sub         r2, r0, #3
+    add         r3, r0, #1
+
+    bl          vp9_mbloop_filter_neon
+
+    ;store op2, op1, op0, oq0
+    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
+    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
+    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
+    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
+    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
+    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
+    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
+    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
+
+    ;store oq1, oq2
+    vst2.8      {d4[0], d5[0]}, [r3], r1
+    vst2.8      {d4[1], d5[1]}, [r3], r1
+    vst2.8      {d4[2], d5[2]}, [r3], r1
+    vst2.8      {d4[3], d5[3]}, [r3], r1
+    vst2.8      {d4[4], d5[4]}, [r3], r1
+    vst2.8      {d4[5], d5[5]}, [r3], r1
+    vst2.8      {d4[6], d5[6]}, [r3], r1
+    vst2.8      {d4[7], d5[7]}, [r3]
+
+    add         r0, r0, r1, lsl #3         ; s += pitch * 8
+    subs        r12, r12, #1
+    subne       r2, r0, #4                 ; move s pointer down by 4 columns
+    bne         count_mblf_v_loop
+
+end_vp9_mblf_v_edge
+    pop         {r4-r5, pc}
+    ENDP        ; |vp9_mbloop_filter_vertical_edge_neon|
+
+; void vp9_mbloop_filter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d0    op2
+; d1    op1
+; d2    op0
+; d3    oq0
+; d4    oq1
+; d5    oq2
+|vp9_mbloop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
+    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
+    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
+
+    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
+
+    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
+
+    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
+
+    vmax.u8     d19, d19, d20
+
+    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
+    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
+    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
+
+    vmax.u8     d19, d19, d23
+
+    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
+    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
+
+    ; abs () > limit
+    vcge.u8     d19, d1, d19
+
+    ; only compare the largest value to thresh
+    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
+    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
+
+    vshr.u8     d23, d23, #1               ; a = a / 2
+
+    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
+
+    vqadd.u8    d24, d24, d23              ; a = b + a
+
+    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
+
+    vmov.u8     d23, #1
+    vcge.u8     d24, d0, d24               ; a > blimit
+
+    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
+
+    vcge.u8     d20, d23, d20              ; flat
+
+    vand        d19, d19, d24              ; mask
+
+    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
+
+    vand        d20, d20, d19              ; flat & mask
+
+    vmov.u8     d22, #0x80
+
+    vorr        d23, d21, d23              ; hev
+
+    ; This instruction will truncate the "flat & mask" masks down to 4 bits
+    ; each to fit into one 32 bit arm register. The values are stored in
+    ; q10.64[0].
+    vshrn.u16   d30, q10, #4
+    vmov.u32    r4, d30[0]                 ; flat & mask 4bits
+
+    adds        r5, r4, #1                 ; Check for all 1's
+
+    ; If mask and flat are 1's for all vectors, then we only need to execute
+    ; the power branch for all vectors.
+    beq         power_branch_only
+
+    cmp         r4, #0                     ; Check for 0, set flag for later
+
+    ; mbfilter() function
+    ; filter() function
+    ; convert to signed
+    veor        d21, d7, d22               ; qs0
+    veor        d24, d6, d22               ; ps0
+    veor        d25, d5, d22               ; ps1
+    veor        d26, d16, d22              ; qs1
+
+    vmov.u8     d27, #3
+
+    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
+
+    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
+
+    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
+
+    vand        d29, d29, d23              ; filter &= hev
+
+    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d29, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d28, q15
+
+    vand        d28, d28, d19              ; filter &= mask
+
+    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
+    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
+    vshr.s8     d30, d30, #3               ; filter2 >>= 3
+    vshr.s8     d29, d29, #3               ; filter1 >>= 3
+
+    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
+    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: ++filter1 >> 1
+    vrshr.s8    d29, d29, #1
+    vbic        d29, d29, d23              ; filter &= ~hev
+
+    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
+    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
+
+    ; If mask and flat are 0's for all vectors, then we only need to execute
+    ; the filter branch for all vectors.
+    beq         filter_branch_only
+
+    ; If mask and flat are mixed then we must perform both branches and
+    ; combine the data.
+    veor        d24, d24, d22              ; *f_op0 = u^0x80
+    veor        d21, d21, d22              ; *f_oq0 = u^0x80
+    veor        d25, d25, d22              ; *f_op1 = u^0x80
+    veor        d26, d26, d22              ; *f_oq1 = u^0x80
+
+    ; At this point we have already executed the filter branch. The filter
+    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
+    ; branch and combine the data.
+    vmov.u8     d23, #2
+    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
+    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
+    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
+
+    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
+
+    vaddw.u8    q14, d5                    ; r_op2 += p1
+
+    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
+
+    vqrshrn.u16 d30, q14, #3               ; r_op2
+
+    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
+    vsubw.u8    q14, d4                    ; r_op1 -= p2
+    vaddw.u8    q14, d5                    ; r_op1 += p1
+    vaddw.u8    q14, d16                   ; r_op1 += q1
+
+    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
+
+    vqrshrn.u16 d31, q14, #3               ; r_op1
+
+    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
+    vsubw.u8    q14, d5                    ; r_op0 -= p1
+    vaddw.u8    q14, d6                    ; r_op0 += p0
+    vaddw.u8    q14, d17                   ; r_op0 += q2
+
+    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
+
+    vqrshrn.u16 d23, q14, #3               ; r_op0
+
+    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
+    vsubw.u8    q14, d6                    ; r_oq0 -= p0
+    vaddw.u8    q14, d7                    ; r_oq0 += q0
+
+    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
+
+    vaddw.u8    q14, d18                   ; oq0 += q3
+
+    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
+
+    vqrshrn.u16 d22, q14, #3               ; r_oq0
+
+    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
+    vsubw.u8    q14, d7                    ; r_oq1 -= q0
+    vaddw.u8    q14, d16                   ; r_oq1 += q1
+
+    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
+
+    vaddw.u8    q14, d18                   ; r_oq1 += q3
+
+    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
+
+    vqrshrn.u16 d6, q14, #3                ; r_oq1
+
+    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
+    vsubw.u8    q14, d16                   ; r_oq2 -= q1
+    vaddw.u8    q14, d17                   ; r_oq2 += q2
+    vaddw.u8    q14, d18                   ; r_oq2 += q3
+
+    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
+
+    vqrshrn.u16 d7, q14, #3                ; r_oq2
+
+    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
+    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
+    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
+
+    bx          lr
+
+power_branch_only
+    vmov.u8     d27, #3
+    vmov.u8     d21, #2
+    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
+    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
+    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
+    vaddw.u8    q14, d5                    ; op2 += p1
+    vqrshrn.u16 d0, q14, #3                ; op2
+
+    vsubw.u8    q14, d3                    ; op1 = op2 - p3
+    vsubw.u8    q14, d4                    ; op1 -= p2
+    vaddw.u8    q14, d5                    ; op1 += p1
+    vaddw.u8    q14, d16                   ; op1 += q1
+    vqrshrn.u16 d1, q14, #3                ; op1
+
+    vsubw.u8    q14, d3                    ; op0 = op1 - p3
+    vsubw.u8    q14, d5                    ; op0 -= p1
+    vaddw.u8    q14, d6                    ; op0 += p0
+    vaddw.u8    q14, d17                   ; op0 += q2
+    vqrshrn.u16 d2, q14, #3                ; op0
+
+    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
+    vsubw.u8    q14, d6                    ; oq0 -= p0
+    vaddw.u8    q14, d7                    ; oq0 += q0
+    vaddw.u8    q14, d18                   ; oq0 += q3
+    vqrshrn.u16 d3, q14, #3                ; oq0
+
+    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
+    vsubw.u8    q14, d7                    ; oq1 -= q0
+    vaddw.u8    q14, d16                   ; oq1 += q1
+    vaddw.u8    q14, d18                   ; oq1 += q3
+    vqrshrn.u16 d4, q14, #3                ; oq1
+
+    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
+    vsubw.u8    q14, d16                   ; oq2 -= q1
+    vaddw.u8    q14, d17                   ; oq2 += q2
+    vaddw.u8    q14, d18                   ; oq2 += q3
+    vqrshrn.u16 d5, q14, #3                ; oq2
+
+    bx          lr
+
+filter_branch_only
+    ; TODO(fgalligan): See if we can rearange registers so we do not need to
+    ; do the 2 vswp.
+    vswp        d0, d4                      ; op2
+    vswp        d5, d17                     ; oq2
+    veor        d2, d24, d22                ; *op0 = u^0x80
+    veor        d3, d21, d22                ; *oq0 = u^0x80
+    veor        d1, d25, d22                ; *op1 = u^0x80
+    veor        d4, d26, d22                ; *oq1 = u^0x80
+
+    bx          lr
+
+    ENDP        ; |vp9_mbloop_filter_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
@@ -0,0 +1,603 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_mb_lpf_horizontal_edge_w_neon|
+    EXPORT  |vp9_mb_lpf_vertical_edge_w_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p,
+;                                        const uint8_t *blimit,
+;                                        const uint8_t *limit,
+;                                        const uint8_t *thresh
+;                                        int count)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vp9_mb_lpf_horizontal_edge_w_neon| PROC
+    push        {r4-r8, lr}
+    vpush       {d8-d15}
+    ldr         r4, [sp, #88]              ; load thresh
+    ldr         r12, [sp, #92]             ; load count
+
+h_count
+    vld1.8      {d16[]}, [r2]              ; load *blimit
+    vld1.8      {d17[]}, [r3]              ; load *limit
+    vld1.8      {d18[]}, [r4]              ; load *thresh
+
+    sub         r8, r0, r1, lsl #3         ; move src pointer down by 8 lines
+
+    vld1.u8     {d0}, [r8@64], r1          ; p7
+    vld1.u8     {d1}, [r8@64], r1          ; p6
+    vld1.u8     {d2}, [r8@64], r1          ; p5
+    vld1.u8     {d3}, [r8@64], r1          ; p4
+    vld1.u8     {d4}, [r8@64], r1          ; p3
+    vld1.u8     {d5}, [r8@64], r1          ; p2
+    vld1.u8     {d6}, [r8@64], r1          ; p1
+    vld1.u8     {d7}, [r8@64], r1          ; p0
+    vld1.u8     {d8}, [r8@64], r1          ; q0
+    vld1.u8     {d9}, [r8@64], r1          ; q1
+    vld1.u8     {d10}, [r8@64], r1         ; q2
+    vld1.u8     {d11}, [r8@64], r1         ; q3
+    vld1.u8     {d12}, [r8@64], r1         ; q4
+    vld1.u8     {d13}, [r8@64], r1         ; q5
+    vld1.u8     {d14}, [r8@64], r1         ; q6
+    vld1.u8     {d15}, [r8@64], r1         ; q7
+
+    bl          vp9_wide_mbfilter_neon
+
+    tst         r7, #1
+    beq         h_mbfilter
+
+    ; flat && mask were not set for any of the channels. Just store the values
+    ; from filter.
+    sub         r8, r0, r1, lsl #1
+
+    vst1.u8     {d25}, [r8@64], r1         ; store op1
+    vst1.u8     {d24}, [r8@64], r1         ; store op0
+    vst1.u8     {d23}, [r8@64], r1         ; store oq0
+    vst1.u8     {d26}, [r8@64], r1         ; store oq1
+
+    b           h_next
+
+h_mbfilter
+    tst         r7, #2
+    beq         h_wide_mbfilter
+
+    ; flat2 was not set for any of the channels. Just store the values from
+    ; mbfilter.
+    sub         r8, r0, r1, lsl #1
+    sub         r8, r8, r1
+
+    vst1.u8     {d18}, [r8@64], r1         ; store op2
+    vst1.u8     {d19}, [r8@64], r1         ; store op1
+    vst1.u8     {d20}, [r8@64], r1         ; store op0
+    vst1.u8     {d21}, [r8@64], r1         ; store oq0
+    vst1.u8     {d22}, [r8@64], r1         ; store oq1
+    vst1.u8     {d23}, [r8@64], r1         ; store oq2
+
+    b           h_next
+
+h_wide_mbfilter
+    sub         r8, r0, r1, lsl #3
+    add         r8, r8, r1
+
+    vst1.u8     {d16}, [r8@64], r1         ; store op6
+    vst1.u8     {d24}, [r8@64], r1         ; store op5
+    vst1.u8     {d25}, [r8@64], r1         ; store op4
+    vst1.u8     {d26}, [r8@64], r1         ; store op3
+    vst1.u8     {d27}, [r8@64], r1         ; store op2
+    vst1.u8     {d18}, [r8@64], r1         ; store op1
+    vst1.u8     {d19}, [r8@64], r1         ; store op0
+    vst1.u8     {d20}, [r8@64], r1         ; store oq0
+    vst1.u8     {d21}, [r8@64], r1         ; store oq1
+    vst1.u8     {d22}, [r8@64], r1         ; store oq2
+    vst1.u8     {d23}, [r8@64], r1         ; store oq3
+    vst1.u8     {d1}, [r8@64], r1          ; store oq4
+    vst1.u8     {d2}, [r8@64], r1          ; store oq5
+    vst1.u8     {d3}, [r8@64], r1          ; store oq6
+
+h_next
+    add         r0, r0, #8
+    subs        r12, r12, #1
+    bne         h_count
+
+    vpop        {d8-d15}
+    pop         {r4-r8, pc}
+
+    ENDP        ; |vp9_mb_lpf_horizontal_edge_w_neon|
+
+; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p,
+;                                        const uint8_t *blimit,
+;                                        const uint8_t *limit,
+;                                        const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vp9_mb_lpf_vertical_edge_w_neon| PROC
+    push        {r4-r8, lr}
+    vpush       {d8-d15}
+    ldr         r4, [sp, #88]              ; load thresh
+
+    vld1.8      {d16[]}, [r2]              ; load *blimit
+    vld1.8      {d17[]}, [r3]              ; load *limit
+    vld1.8      {d18[]}, [r4]              ; load *thresh
+
+    sub         r8, r0, #8
+
+    vld1.8      {d0}, [r8@64], r1
+    vld1.8      {d8}, [r0@64], r1
+    vld1.8      {d1}, [r8@64], r1
+    vld1.8      {d9}, [r0@64], r1
+    vld1.8      {d2}, [r8@64], r1
+    vld1.8      {d10}, [r0@64], r1
+    vld1.8      {d3}, [r8@64], r1
+    vld1.8      {d11}, [r0@64], r1
+    vld1.8      {d4}, [r8@64], r1
+    vld1.8      {d12}, [r0@64], r1
+    vld1.8      {d5}, [r8@64], r1
+    vld1.8      {d13}, [r0@64], r1
+    vld1.8      {d6}, [r8@64], r1
+    vld1.8      {d14}, [r0@64], r1
+    vld1.8      {d7}, [r8@64], r1
+    vld1.8      {d15}, [r0@64], r1
+
+    sub         r0, r0, r1, lsl #3
+
+    vtrn.32     q0, q2
+    vtrn.32     q1, q3
+    vtrn.32     q4, q6
+    vtrn.32     q5, q7
+
+    vtrn.16     q0, q1
+    vtrn.16     q2, q3
+    vtrn.16     q4, q5
+    vtrn.16     q6, q7
+
+    vtrn.8      d0, d1
+    vtrn.8      d2, d3
+    vtrn.8      d4, d5
+    vtrn.8      d6, d7
+
+    vtrn.8      d8, d9
+    vtrn.8      d10, d11
+    vtrn.8      d12, d13
+    vtrn.8      d14, d15
+
+    bl          vp9_wide_mbfilter_neon
+
+    tst         r7, #1
+    beq         v_mbfilter
+
+    ; flat && mask were not set for any of the channels. Just store the values
+    ; from filter.
+    sub         r8, r0, #2
+
+    vswp        d23, d25
+
+    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
+    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
+    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
+    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
+    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
+    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
+    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
+    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
+
+    b           v_end
+
+v_mbfilter
+    tst         r7, #2
+    beq         v_wide_mbfilter
+
+    ; flat2 was not set for any of the channels. Just store the values from
+    ; mbfilter.
+    sub         r8, r0, #3
+
+    vst3.8      {d18[0], d19[0], d20[0]}, [r8], r1
+    vst3.8      {d21[0], d22[0], d23[0]}, [r0], r1
+    vst3.8      {d18[1], d19[1], d20[1]}, [r8], r1
+    vst3.8      {d21[1], d22[1], d23[1]}, [r0], r1
+    vst3.8      {d18[2], d19[2], d20[2]}, [r8], r1
+    vst3.8      {d21[2], d22[2], d23[2]}, [r0], r1
+    vst3.8      {d18[3], d19[3], d20[3]}, [r8], r1
+    vst3.8      {d21[3], d22[3], d23[3]}, [r0], r1
+    vst3.8      {d18[4], d19[4], d20[4]}, [r8], r1
+    vst3.8      {d21[4], d22[4], d23[4]}, [r0], r1
+    vst3.8      {d18[5], d19[5], d20[5]}, [r8], r1
+    vst3.8      {d21[5], d22[5], d23[5]}, [r0], r1
+    vst3.8      {d18[6], d19[6], d20[6]}, [r8], r1
+    vst3.8      {d21[6], d22[6], d23[6]}, [r0], r1
+    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
+    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1
+
+    b           v_end
+
+v_wide_mbfilter
+    sub         r8, r0, #8
+
+    vtrn.32     d0,  d26
+    vtrn.32     d16, d27
+    vtrn.32     d24, d18
+    vtrn.32     d25, d19
+
+    vtrn.16     d0,  d24
+    vtrn.16     d16, d25
+    vtrn.16     d26, d18
+    vtrn.16     d27, d19
+
+    vtrn.8      d0,  d16
+    vtrn.8      d24, d25
+    vtrn.8      d26, d27
+    vtrn.8      d18, d19
+
+    vtrn.32     d20, d1
+    vtrn.32     d21, d2
+    vtrn.32     d22, d3
+    vtrn.32     d23, d15
+
+    vtrn.16     d20, d22
+    vtrn.16     d21, d23
+    vtrn.16     d1,  d3
+    vtrn.16     d2,  d15
+
+    vtrn.8      d20, d21
+    vtrn.8      d22, d23
+    vtrn.8      d1,  d2
+    vtrn.8      d3,  d15
+
+    vst1.8      {d0}, [r8@64], r1
+    vst1.8      {d20}, [r0@64], r1
+    vst1.8      {d16}, [r8@64], r1
+    vst1.8      {d21}, [r0@64], r1
+    vst1.8      {d24}, [r8@64], r1
+    vst1.8      {d22}, [r0@64], r1
+    vst1.8      {d25}, [r8@64], r1
+    vst1.8      {d23}, [r0@64], r1
+    vst1.8      {d26}, [r8@64], r1
+    vst1.8      {d1}, [r0@64], r1
+    vst1.8      {d27}, [r8@64], r1
+    vst1.8      {d2}, [r0@64], r1
+    vst1.8      {d18}, [r8@64], r1
+    vst1.8      {d3}, [r0@64], r1
+    vst1.8      {d19}, [r8@64], r1
+    vst1.8      {d15}, [r0@64], r1
+
+v_end
+    vpop        {d8-d15}
+    pop         {r4-r8, pc}
+
+    ENDP        ; |vp9_mb_lpf_vertical_edge_w_neon|
+
+; void vp9_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store.
+;
+; r0-r3 PRESERVE
+; d16    blimit
+; d17    limit
+; d18    thresh
+; d0    p7
+; d1    p6
+; d2    p5
+; d3    p4
+; d4    p3
+; d5    p2
+; d6    p1
+; d7    p0
+; d8    q0
+; d9    q1
+; d10   q2
+; d11   q3
+; d12   q4
+; d13   q5
+; d14   q6
+; d15   q7
+|vp9_wide_mbfilter_neon| PROC
+    mov         r7, #0
+
+    ; filter_mask
+    vabd.u8     d19, d4, d5                ; abs(p3 - p2)
+    vabd.u8     d20, d5, d6                ; abs(p2 - p1)
+    vabd.u8     d21, d6, d7                ; abs(p1 - p0)
+    vabd.u8     d22, d9, d8                ; abs(q1 - q0)
+    vabd.u8     d23, d10, d9               ; abs(q2 - q1)
+    vabd.u8     d24, d11, d10              ; abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
+    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
+    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
+    vmax.u8     d19, d19, d20
+
+    vabd.u8     d24, d7, d8                ; abs(p0 - q0)
+
+    vmax.u8     d19, d19, d23
+
+    vabd.u8     d23, d6, d9                ; a = abs(p1 - q1)
+    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
+
+    ; abs () > limit
+    vcge.u8     d19, d17, d19
+
+    ; flatmask4
+    vabd.u8     d25, d7, d5                ; abs(p0 - p2)
+    vabd.u8     d26, d8, d10               ; abs(q0 - q2)
+    vabd.u8     d27, d4, d7                ; abs(p3 - p0)
+    vabd.u8     d28, d11, d8               ; abs(q3 - q0)
+
+    ; only compare the largest value to thresh
+    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
+    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
+    vmax.u8     d25, d25, d26
+    vmax.u8     d20, d20, d25
+
+    vshr.u8     d23, d23, #1               ; a = a / 2
+    vqadd.u8    d24, d24, d23              ; a = b + a
+
+    vmov.u8     d30, #1
+    vcge.u8     d24, d16, d24              ; (a > blimit * 2 + limit) * -1
+
+    vcge.u8     d20, d30, d20              ; flat
+
+    vand        d19, d19, d24              ; mask
+
+    ; hevmask
+    vcgt.u8     d21, d21, d18              ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d18              ; (abs(q1 - q0) > thresh)*-1
+    vorr        d21, d21, d22              ; hev
+
+    vand        d16, d20, d19              ; flat && mask
+    vmov        r5, r6, d16
+
+    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
+    vabd.u8     d23, d12, d8               ; abs(q4 - q0)
+    vabd.u8     d24, d7, d2                ; abs(p0 - p5)
+    vabd.u8     d25, d8, d13               ; abs(q0 - q5)
+    vabd.u8     d26, d1, d7                ; abs(p6 - p0)
+    vabd.u8     d27, d14, d8               ; abs(q6 - q0)
+    vabd.u8     d28, d0, d7                ; abs(p7 - p0)
+    vabd.u8     d29, d15, d8               ; abs(q7 - q0)
+
+    ; only compare the largest value to thresh
+    vmax.u8     d22, d22, d23              ; max(abs(p4 - p0), abs(q4 - q0))
+    vmax.u8     d23, d24, d25              ; max(abs(p0 - p5), abs(q0 - q5))
+    vmax.u8     d24, d26, d27              ; max(abs(p6 - p0), abs(q6 - q0))
+    vmax.u8     d25, d28, d29              ; max(abs(p7 - p0), abs(q7 - q0))
+
+    vmax.u8     d26, d22, d23
+    vmax.u8     d27, d24, d25
+    vmax.u8     d23, d26, d27
+
+    vcge.u8     d18, d30, d23              ; flat2
+
+    vmov.u8     d22, #0x80
+
+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #1                 ; Only do filter branch
+
+    vand        d17, d18, d16              ; flat2 && flat && mask
+    vmov        r5, r6, d17
+
+    ; mbfilter() function
+
+    ; filter() function
+    ; convert to signed
+    veor        d23, d8, d22               ; qs0
+    veor        d24, d7, d22               ; ps0
+    veor        d25, d6, d22               ; ps1
+    veor        d26, d9, d22               ; qs1
+
+    vmov.u8     d27, #3
+
+    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
+    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
+    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
+    vand        d29, d29, d21              ; filter &= hev
+    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
+    vmov.u8     d29, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d28, q15
+
+    vand        d28, d28, d19              ; filter &= mask
+
+    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
+    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
+    vshr.s8     d30, d30, #3               ; filter2 >>= 3
+    vshr.s8     d29, d29, #3               ; filter1 >>= 3
+
+
+    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
+    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: ++filter1 >> 1
+    vrshr.s8    d29, d29, #1
+    vbic        d29, d29, d21              ; filter &= ~hev
+
+    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
+    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
+
+    veor        d24, d24, d22              ; *f_op0 = u^0x80
+    veor        d23, d23, d22              ; *f_oq0 = u^0x80
+    veor        d25, d25, d22              ; *f_op1 = u^0x80
+    veor        d26, d26, d22              ; *f_oq1 = u^0x80
+
+    tst         r7, #1
+    bxne        lr
+
+    ; mbfilter flat && mask branch
+    ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
+    ; and using vibt on the q's?
+    vmov.u8     d29, #2
+    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
+    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
+    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+    vaddl.u8    q10, d4, d5
+    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+    vaddl.u8    q14, d6, d9
+    vqrshrn.u16 d18, q15, #3               ; r_op2
+
+    vsub.i16    q15, q10
+    vaddl.u8    q10, d4, d6
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d7, d10
+    vqrshrn.u16 d19, q15, #3               ; r_op1
+
+    vsub.i16    q15, q10
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d8, d11
+    vqrshrn.u16 d20, q15, #3               ; r_op0
+
+    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
+    vsubw.u8    q15, d7                    ; oq0 -= p0
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d9, d11
+    vqrshrn.u16 d21, q15, #3               ; r_oq0
+
+    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
+    vsubw.u8    q15, d8                    ; oq1 -= q0
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d10, d11
+    vqrshrn.u16 d22, q15, #3               ; r_oq1
+
+    vsubw.u8    q15, d6                    ; oq2 = oq0 - p1
+    vsubw.u8    q15, d9                    ; oq2 -= q1
+    vadd.i16    q15, q14
+    vqrshrn.u16 d27, q15, #3               ; r_oq2
+
+    ; Filter does not set op2 or oq2, so use p2 and q2.
+    vbif        d18, d5, d16               ; t_op2 |= p2 & ~(flat & mask)
+    vbif        d19, d25, d16              ; t_op1 |= f_op1 & ~(flat & mask)
+    vbif        d20, d24, d16              ; t_op0 |= f_op0 & ~(flat & mask)
+    vbif        d21, d23, d16              ; t_oq0 |= f_oq0 & ~(flat & mask)
+    vbif        d22, d26, d16              ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+    vbit        d23, d27, d16              ; t_oq2 |= r_oq2 & (flat & mask)
+    vbif        d23, d10, d16              ; t_oq2 |= q2 & ~(flat & mask)
+
+    tst         r7, #2
+    bxne        lr
+
+    ; wide_mbfilter flat2 && flat && mask branch
+    vmov.u8     d16, #7
+    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
+    vaddl.u8    q12, d2, d3
+    vaddl.u8    q13, d4, d5
+    vaddl.u8    q14, d1, d6
+    vmlal.u8    q15, d0, d16               ; op6 += p7 * 3
+    vadd.i16    q12, q13
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d2, d9
+    vadd.i16    q15, q12
+    vaddl.u8    q12, d0, d1
+    vaddw.u8    q15, d1
+    vaddl.u8    q13, d0, d2
+    vadd.i16    q14, q15, q14
+    vqrshrn.u16 d16, q15, #4               ; w_op6
+
+    vsub.i16    q15, q14, q12
+    vaddl.u8    q14, d3, d10
+    vqrshrn.u16 d24, q15, #4               ; w_op5
+
+    vsub.i16    q15, q13
+    vaddl.u8    q13, d0, d3
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d4, d11
+    vqrshrn.u16 d25, q15, #4               ; w_op4
+
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d0, d4
+    vsub.i16    q15, q13
+    vsub.i16    q14, q15, q14
+    vqrshrn.u16 d26, q15, #4               ; w_op3
+
+    vaddw.u8    q15, q14, d5               ; op2 += p2
+    vaddl.u8    q14, d0, d5
+    vaddw.u8    q15, d12                   ; op2 += q4
+    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
+    vqrshrn.u16 d27, q15, #4               ; w_op2
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d0, d6
+    vaddw.u8    q15, d6                    ; op1 += p1
+    vaddw.u8    q15, d13                   ; op1 += q5
+    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
+    vqrshrn.u16 d18, q15, #4               ; w_op1
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d0, d7
+    vaddw.u8    q15, d7                    ; op0 += p0
+    vaddw.u8    q15, d14                   ; op0 += q6
+    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
+    vqrshrn.u16 d19, q15, #4               ; w_op0
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d1, d8
+    vaddw.u8    q15, d8                    ; oq0 += q0
+    vaddw.u8    q15, d15                   ; oq0 += q7
+    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
+    vqrshrn.u16 d20, q15, #4               ; w_oq0
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d2, d9
+    vaddw.u8    q15, d9                    ; oq1 += q1
+    vaddl.u8    q4, d10, d15
+    vaddw.u8    q15, d15                   ; oq1 += q7
+    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
+    vqrshrn.u16 d21, q15, #4               ; w_oq1
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d3, d10
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d11, d15
+    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
+    vqrshrn.u16 d22, q15, #4               ; w_oq2
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d4, d11
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d12, d15
+    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
+    vqrshrn.u16 d23, q15, #4               ; w_oq3
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d5, d12
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d13, d15
+    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
+    vqrshrn.u16 d1, q15, #4                ; w_oq4
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d6, d13
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d14, d15
+    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
+    vqrshrn.u16 d2, q15, #4                ; w_oq5
+
+    vsub.i16    q15, q14
+    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
+    vadd.i16    q15, q4
+    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
+    vqrshrn.u16 d3, q15, #4                ; w_oq6
+    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
+    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
+    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
+
+    bx          lr
+    ENDP        ; |vp9_wide_mbfilter_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -0,0 +1,198 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct16x16_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;                                    int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct16x16_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asr              r0, r0, #6                ; >> 6
+
+    vdup.s16         q0, r0                    ; duplicate a1
+    mov              r0, #8
+    sub              r2, #8
+
+    ; load destination data row0 - row3
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row4 - row7
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row8 - row11
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row12 - row15
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vp9_short_idct16x16_1_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
--- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
--- a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
@@ -0,0 +1,68 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct4x4_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct4x4_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 4)
+    add              r0, r0, #8                ; + (1 <<((4) - 1))
+    asr              r0, r0, #4                ; >> 4
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    vld1.32          {d2[0]}, [r1], r2
+    vld1.32          {d2[1]}, [r1], r2
+    vld1.32          {d4[0]}, [r1], r2
+    vld1.32          {d4[1]}, [r1]
+
+    vaddw.u8         q8, q0, d2                ; dest[x] + a1
+    vaddw.u8         q9, q0, d4
+
+    vqmovun.s16      d6, q8                    ; clip_pixel
+    vqmovun.s16      d7, q9
+
+    vst1.32          {d6[0]}, [r12], r2
+    vst1.32          {d6[1]}, [r12], r2
+    vst1.32          {d7[0]}, [r12], r2
+    vst1.32          {d7[1]}, [r12]
+
+    bx               lr
+    ENDP             ; |vp9_short_idct4x4_1_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
@@ -0,0 +1,190 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_idct4x4_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct4x4_add_neon| PROC
+
+    ; The 2D transform is done with two passes which are actually pretty
+    ; similar. We first transform the rows. This is done by transposing
+    ; the inputs, doing an SIMD column transform (the columns are the
+    ; transposed rows) and then transpose the results (so that it goes back
+    ; in normal/row positions). Then, we transform the columns by doing
+    ; another SIMD column transform.
+    ; So, two passes of a transpose followed by a column transform.
+
+    ; load the inputs into q8-q9, d16-d19
+    vld1.s16        {q8,q9}, [r0]!
+
+    ; generate scalar constants
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov             r0, #0x3b00
+    add             r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov             r3, #0x2d00
+    add             r3, #0x41
+    ; cospi_24_64 = 6270 = 0x 187e
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; transpose the input data
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+
+    ; generate constant vectors
+    vdup.16         d20, r0         ; replicate cospi_8_64
+    vdup.16         d21, r3         ; replicate cospi_16_64
+
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    vdup.16         d22, r12        ; replicate cospi_24_64
+
+    ; do the transform on transposed rows
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+    vswp     d18, d19
+
+    ; transpose the results
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    ; do the transform on columns
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+
+    ; The results are in two registers, one of them being swapped. This will
+    ; be taken care of by loading the 'dest' value in a swapped fashion and
+    ; also storing them in the same swapped fashion.
+    ; temp_out[0, 1] = d16, d17 = q8
+    ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16 q8, q8, #4
+    vrshr.s16 q9, q9, #4
+
+    vld1.32 {d26[0]}, [r1], r2
+    vld1.32 {d26[1]}, [r1], r2
+    vld1.32 {d27[1]}, [r1], r2
+    vld1.32 {d27[0]}, [r1]  ; no post-increment
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8 q8, q8, d26
+    vaddw.u8 q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb r2, r2, #0
+    vst1.32 {d27[0]}, [r1], r2
+    vst1.32 {d27[1]}, [r1], r2
+    vst1.32 {d26[1]}, [r1], r2
+    vst1.32 {d26[0]}, [r1]  ; no post-increment
+    bx              lr
+    ENDP  ; |vp9_short_idct4x4_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -0,0 +1,88 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct8x8_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct8x8_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 5)
+    add              r0, r0, #16               ; + (1 <<((5) - 1))
+    asr              r0, r0, #5                ; >> 5
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    ; load destination data
+    vld1.64          {d2}, [r1], r2
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r2
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r2
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r2
+    vld1.64          {d17}, [r1]
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vp9_short_idct8x8_1_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -0,0 +1,519 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_idct8x8_add_neon|
+    EXPORT  |vp9_short_idct10_8x8_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
+    ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
+    ; This macro will touch q0-q7 registers and use them as buffer during
+    ; calculation.
+    MACRO
+    IDCT8x8_1D
+    ; stage 1
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64
+
+    ; input[1] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2
+
+    ; input[1]*cospi_28_64-input[7]*cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; input[1] * cospi_4_64
+    vmull.s16       q2, d18, d1
+    vmull.s16       q3, d19, d1
+
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3
+
+    ; input[1]*cospi_4_64+input[7]*cospi_28_64
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    ; stage 2 & stage 3 - even half
+    vdup.16         d0, r7                    ; duplicate cospi_16_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14              ; >> 14
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q2, d16, d0
+    vmull.s16       q3, d17, d0
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0
+
+    ; (input[0] + input[2]) * cospi_16_64
+    vmlal.s16       q2,  d24, d0
+    vmlal.s16       q3, d25, d0
+
+    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0
+
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d18, q2, #14              ; >> 14
+    vqrshrn.s32     d19, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d22, q13, #14              ; >> 14
+    vqrshrn.s32     d23, q15, #14              ; >> 14
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    ; input[1] * cospi_24_64
+    vmull.s16       q2, d20, d0
+    vmull.s16       q3, d21, d0
+
+    ; input[1] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlsl.s16       q2, d28, d1
+    vmlsl.s16       q3, d29, d1
+
+    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d26, q2, #14              ; >> 14
+    vqrshrn.s32     d27, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14              ; >> 14
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+    MEND
+
+    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct8x8_add_neon| PROC
+    push            {r4-r9}
+    vpush           {d8-d15}
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    IDCT8x8_1D
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vp9_short_idct8x8_add_neon|
+
+;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct10_8x8_add_neon| PROC
+    push            {r4-r9}
+    vpush           {d8-d15}
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    ; stage 1
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
+    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
+    ; to double the constants before multiplying to compensate this.
+    mov             r12, r3, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
+    mov             r12, r4, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_28_64)
+    vqrdmulh.s16    q4, q9, q0
+
+    mov             r12, r6, lsl #1
+    rsb             r12, #0
+    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_4_64)
+    vqrdmulh.s16    q7, q9, q1
+
+    mov             r12, r5, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
+
+    ; dct_const_round_shift(- input[3] * cospi_20_64)
+    vqrdmulh.s16    q5, q11, q0
+
+    mov             r12, r7, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
+
+    ; dct_const_round_shift(input[3] * cospi_12_64)
+    vqrdmulh.s16    q6, q11, q1
+
+    ; stage 2 & stage 3 - even half
+    mov             r12, r8, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrdmulh.s16    q9, q8, q0
+
+    mov             r12, r9, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_24_64)
+    vqrdmulh.s16    q13, q10, q1
+
+    ; dct_const_round_shift(input[1] * cospi_8_64)
+    vqrdmulh.s16    q15, q10, q0
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vp9_short_idct10_8x8_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_iht4x4_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
+    ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
+    ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
+    ; into d16-d19 registers. This macro will touch q10- q15 registers and use
+    ; them as buffer during calculation.
+    MACRO
+    IDCT4x4_1D
+    ; stage 1
+    vadd.s16    d23, d16, d18   ; (input[0] + input[2])
+    vsub.s16    d24, d16, d18   ; (input[0] - input[2])
+
+    vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
+    vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
+    vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
+    vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q10, #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16    q8,  q13, q14
+    vsub.s16    q9,  q13, q14
+    vswp        d18, d19
+    MEND
+
+    ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
+    ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
+    ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
+    ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
+    ; q14,q15 registers and use them as buffer during calculation.
+    MACRO
+    IADST4x4_1D
+    vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
+    vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
+    vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
+    vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
+    vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
+    vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
+    vaddw.s16   q15, q15, d19   ; x0 + x3
+    vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3
+    vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
+    vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3
+
+    vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
+    vadd.s32    q10, q10, q8
+    vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
+    vdup.32     q8, r0          ; duplicate sinpi_3_9
+    vsub.s32    q11, q11, q9
+    vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
+
+    vadd.s32    q13, q10, q12   ; s0 = x0 + x3
+    vadd.s32    q10, q10, q11   ; x0 + x1
+    vadd.s32    q14, q11, q12   ; s1 = x1 + x3
+    vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d16, q13, #14
+    vqrshrn.s32 d17, q14, #14
+    vqrshrn.s32 d18, q15, #14
+    vqrshrn.s32 d19, q10, #14
+    MEND
+
+    ; Generate cosine constants in d6 - d8 for the IDCT
+    MACRO
+    GENERATE_COSINE_CONSTANTS
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov         r0, #0x3b00
+    add         r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov         r3, #0x2d00
+    add         r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+    mov         r12, #0x1800
+    add         r12, #0x7e
+
+    ; generate constant vectors
+    vdup.16     d0, r0          ; duplicate cospi_8_64
+    vdup.16     d1, r3          ; duplicate cospi_16_64
+    vdup.16     d2, r12         ; duplicate cospi_24_64
+    MEND
+
+    ; Generate sine constants in d1 - d4 for the IADST.
+    MACRO
+    GENERATE_SINE_CONSTANTS
+    ; sinpi_1_9 = 5283 = 0x14A3
+    mov         r0, #0x1400
+    add         r0, #0xa3
+    ; sinpi_2_9 = 9929 = 0x26C9
+    mov         r3, #0x2600
+    add         r3, #0xc9
+    ; sinpi_4_9 = 15212 = 0x3B6C
+    mov         r12, #0x3b00
+    add         r12, #0x6c
+
+    ; generate constant vectors
+    vdup.16     d3, r0          ; duplicate sinpi_1_9
+
+    ; sinpi_3_9 = 13377 = 0x3441
+    mov         r0, #0x3400
+    add         r0, #0x41
+
+    vdup.16     d4, r3          ; duplicate sinpi_2_9
+    vdup.16     d5, r12         ; duplicate sinpi_4_9
+    vdup.16     q3, r0          ; duplicate sinpi_3_9
+    MEND
+
+    ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
+    MACRO
+    TRANSPOSE4X4
+    vtrn.16     d16, d17
+    vtrn.16     d18, d19
+    vtrn.32     q8, q9
+    MEND
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;                               int dest_stride, int tx_type)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride
+; r3  int tx_type)
+; This function will only handle tx_type of 1,2,3.
+|vp9_short_iht4x4_add_neon| PROC
+
+    ; load the inputs into d16-d19
+    vld1.s16    {q8,q9}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE4X4
+
+    ; decide the type of transform
+    cmp         r3, #2
+    beq         idct_iadst
+    cmp         r3, #3
+    beq         iadst_iadst
+
+iadst_idct
+    ; generate constants
+    GENERATE_COSINE_CONSTANTS
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IDCT4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IADST4x4_1D
+
+    b end_vp9_short_iht4x4_add_neon
+
+idct_iadst
+    ; generate constants
+    GENERATE_COSINE_CONSTANTS
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IADST4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IDCT4x4_1D
+
+    b end_vp9_short_iht4x4_add_neon
+
+iadst_iadst
+    ; generate constants
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IADST4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IADST4x4_1D
+
+end_vp9_short_iht4x4_add_neon
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16   q8, q8, #4
+    vrshr.s16   q9, q9, #4
+
+    vld1.32     {d26[0]}, [r1], r2
+    vld1.32     {d26[1]}, [r1], r2
+    vld1.32     {d27[0]}, [r1], r2
+    vld1.32     {d27[1]}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8    q8, q8, d26
+    vaddw.u8    q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb         r2, r2, #0
+    vst1.32     {d27[1]}, [r1], r2
+    vst1.32     {d27[0]}, [r1], r2
+    vst1.32     {d26[1]}, [r1], r2
+    vst1.32     {d26[0]}, [r1]  ; no post-increment
+    bx          lr
+    ENDP  ; |vp9_short_iht4x4_add_neon|
+
+    END
--- a/Show More
+++ b/Show More