libvpx: enable building for iOS devices (armv7)

Allow output of gas syntax assembly directly from obj_int_extract

Change-Id: I33a747e87ef1c590a8766dea17f8cb2497e54591
Replace generated quant tables with static lookup tables.
2013-07-19 14:05:59 -07:00 · 2013-07-16 14:04:41 -07:00 · 2013-07-16 14:04:41 -07:00 · 2013-07-16 14:04:39 -07:00 · 2013-07-16 12:41:10 -07:00 · 2013-07-16 12:40:48 -07:00
264 changed files with 19822 additions and 31282 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,6 @@
 *.a
 *.asm.s
 *.d
-*.gcno
-*.gcda
 *.o
 *~
 /*.ivf
@@ -16,7 +14,7 @@
 /.install-*
 /.libs
 /Makefile
-/config.log
+/config.err
 /config.mk
 /decode_to_md5
 /decode_to_md5.c
--- a/README
+++ b/README
@@ -1,7 +1,7 @@
 vpx Multi-Format Codec SDK
-README - 1 August 2013
+README - 21 June 2012

-Welcome to the WebM VP8/VP9 Codec SDK!
+Welcome to the WebM VP8 Codec SDK!

 COMPILING THE APPLICATIONS/LIBRARIES:
  The build system used is similar to autotools. Building generally consists of
@@ -53,63 +53,33 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv5te-android-gcc
    armv5te-linux-rvct
    armv5te-linux-gcc
-    armv5te-none-rvct
    armv6-darwin-gcc
    armv6-linux-rvct
    armv6-linux-gcc
-    armv6-none-rvct
    armv7-android-gcc
-    armv7-darwin-gcc
    armv7-linux-rvct
    armv7-linux-gcc
-    armv7-none-rvct
-    armv7-win32-vs11
    mips32-linux-gcc
    ppc32-darwin8-gcc
    ppc32-darwin9-gcc
-    ppc32-linux-gcc
    ppc64-darwin8-gcc
    ppc64-darwin9-gcc
    ppc64-linux-gcc
-    sparc-solaris-gcc
-    x86-android-gcc
    x86-darwin8-gcc
    x86-darwin8-icc
    x86-darwin9-gcc
    x86-darwin9-icc
-    x86-darwin10-gcc
-    x86-darwin11-gcc
-    x86-darwin12-gcc
-    x86-darwin13-gcc
    x86-linux-gcc
    x86-linux-icc
-    x86-os2-gcc
    x86-solaris-gcc
-    x86-win32-gcc
    x86-win32-vs7
    x86-win32-vs8
-    x86-win32-vs9
-    x86-win32-vs10
-    x86-win32-vs11
    x86_64-darwin9-gcc
-    x86_64-darwin10-gcc
-    x86_64-darwin11-gcc
-    x86_64-darwin12-gcc
-    x86_64-darwin13-gcc
    x86_64-linux-gcc
-    x86_64-linux-icc
    x86_64-solaris-gcc
-    x86_64-win64-gcc
    x86_64-win64-vs8
-    x86_64-win64-vs9
-    x86_64-win64-vs10
-    x86_64-win64-vs11
    universal-darwin8-gcc
    universal-darwin9-gcc
-    universal-darwin10-gcc
-    universal-darwin11-gcc
-    universal-darwin12-gcc
-    universal-darwin13-gcc
    generic-gnu

  The generic-gnu target, in conjunction with the CROSS environment variable,
@@ -127,7 +97,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:

  5. Configuration errors
  If the configuration step fails, the first step is to look in the error log.
-  This defaults to config.log. This should give a good indication of what went
+  This defaults to config.err. This should give a good indication of what went
  wrong. If not, contact us for support.

 SUPPORT
--- a/build/arm-msvs/obj_int_extract.bat
+++ b/build/arm-msvs/obj_int_extract.bat
@@ -7,7 +7,18 @@ REM   in the file PATENTS.  All contributing project authors may
 REM   be found in the AUTHORS file in the root of the source tree.
 echo on

+cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/common/vp9_asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/decoder/vp9_asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/encoder/vp9_asm_enc_offsets.c"
+obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
+obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
+obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
+
+cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/common/vp8_asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/decoder/vp8_asm_dec_offsets.c"
 cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c"
+obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
+obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
 obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"

 cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c"
--- a/build/make/armlink_adapter.sh
+++ b/build/make/armlink_adapter.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
@@ -13,20 +13,20 @@
 verbose=0
 set -- $*
 for i; do
-    if [ "$i" = "-o" ]; then
+    if [ "$i" == "-o" ]; then
        on_of=1
-    elif [ "$i" = "-v" ]; then
+    elif [ "$i" == "-v" ]; then
        verbose=1
-    elif [ "$i" = "-g" ]; then
+    elif [ "$i" == "-g" ]; then
        args="${args} --debug"
-    elif [ "$on_of" = "1" ]; then
+    elif [ "$on_of" == "1" ]; then
        outfile=$i
        on_of=0
    elif [ -f "$i" ]; then
        infiles="$infiles $i"
-    elif [ "${i#-l}" != "$i" ]; then
+    elif [ "${i:0:2}" == "-l" ]; then
        libs="$libs ${i#-l}"
-    elif [ "${i#-L}" != "$i" ]; then
+    elif [ "${i:0:2}" == "-L" ]; then
        libpaths="${libpaths} ${i#-L}"
    else
        args="${args} ${i}"
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ##  configure.sh
 ##
@@ -75,7 +75,7 @@ Options:

 Build options:
  --help                      print this message
-  --log=yes|no|FILE           file configure log is written to [config.log]
+  --log=yes|no|FILE           file configure log is written to [config.err]
  --target=TARGET             target platform tuple [generic-gnu]
  --cpu=CPU                   optimize for a specific cpu rather than a family
  --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
@@ -198,11 +198,11 @@ add_extralibs() {
 #
 # Boolean Manipulation Functions
 #
-enable_feature(){
+enable(){
    set_all yes $*
 }

-disable_feature(){
+disable(){
    set_all no $*
 }

@@ -219,7 +219,7 @@ soft_enable() {
    for var in $*; do
        if ! disabled $var; then
            log_echo "  enabling $var"
-            enable_feature $var
+            enable $var
        fi
    done
 }
@@ -228,7 +228,7 @@ soft_disable() {
    for var in $*; do
        if ! enabled $var; then
            log_echo "  disabling $var"
-            disable_feature $var
+            disable $var
        fi
    done
 }
@@ -251,10 +251,10 @@ tolower(){
 # Temporary File Functions
 #
 source_path=${0%/*}
-enable_feature source_path_used
+enable source_path_used
 if test -z "$source_path" -o "$source_path" = "." ; then
    source_path="`pwd`"
-    disable_feature source_path_used
+    disable source_path_used
 fi

 if test ! -z "$TMPDIR" ; then
@@ -264,13 +264,12 @@ elif test ! -z "$TEMPDIR" ; then
 else
    TMPDIRx="/tmp"
 fi
-RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
-TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
-TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c"
-TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc"
-TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o"
-TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
-TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"
+TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
+TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
+TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
+TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
+TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"

 clean_temp_files() {
    rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
@@ -317,8 +316,8 @@ check_header(){
    header=$1
    shift
    var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-    disable_feature $var
-    check_cpp "$@" <<EOF && enable_feature $var
+    disable $var
+    check_cpp "$@" <<EOF && enable $var
 #include "$header"
 int x;
 EOF
@@ -480,7 +479,7 @@ process_common_cmdline() {
    for opt in "$@"; do
        optval="${opt#*=}"
        case "$opt" in
-        --child) enable_feature child
+        --child) enable child
        ;;
        --log*)
        logging="$optval"
@@ -492,7 +491,7 @@ process_common_cmdline() {
        ;;
        --target=*) toolchain="${toolchain:-${optval}}"
        ;;
-        --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
+        --force-target=*) toolchain="${toolchain:-${optval}}"; enable force_toolchain
        ;;
        --cpu)
        ;;
@@ -512,7 +511,7 @@ process_common_cmdline() {
          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
            die_unknown $opt
        fi
-        ${action}_feature $option
+        $action $option
        ;;
        --require-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
@@ -524,11 +523,11 @@ process_common_cmdline() {
        ;;
        --force-enable-?*|--force-disable-?*)
        eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
-        ${action}_feature $option
+        $action $option
        ;;
        --libc=*)
        [ -d "${optval}" ] || die "Not a directory: ${optval}"
-        disable_feature builtin_libc
+        disable builtin_libc
        alt_libc="${optval}"
        ;;
        --as=*)
@@ -654,10 +653,6 @@ process_common_toolchain() {
                tgt_isa=x86_64
                tgt_os=darwin12
                ;;
-            *darwin13*)
-                tgt_isa=x86_64
-                tgt_os=darwin13
-                ;;
            x86_64*mingw32*)
                tgt_os=win64
                ;;
@@ -697,13 +692,13 @@ process_common_toolchain() {

    # Mark the specific ISA requested as enabled
    soft_enable ${tgt_isa}
-    enable_feature ${tgt_os}
-    enable_feature ${tgt_cc}
+    enable ${tgt_os}
+    enable ${tgt_cc}

    # Enable the architecture family
    case ${tgt_isa} in
-        arm*) enable_feature arm;;
-        mips*) enable_feature mips;;
+        arm*) enable arm;;
+        mips*) enable mips;;
    esac

    # PIC is probably what we want when building shared libs
@@ -756,17 +751,13 @@ process_common_toolchain() {
            add_cflags  "-mmacosx-version-min=10.8"
            add_ldflags "-mmacosx-version-min=10.8"
            ;;
-        *-darwin13-*)
-            add_cflags  "-mmacosx-version-min=10.9"
-            add_ldflags "-mmacosx-version-min=10.9"
-            ;;
    esac

    # Handle Solaris variants. Solaris 10 needs -lposix4
    case ${toolchain} in
        sparc-solaris-*)
            add_extralibs -lposix4
-            disable_feature fast_unaligned
+            disable fast_unaligned
            ;;
        *-solaris-*)
            add_extralibs -lposix4
@@ -791,7 +782,7 @@ process_common_toolchain() {
            ;;
        armv5te)
            soft_enable edsp
-            disable_feature fast_unaligned
+            disable fast_unaligned
            ;;
        esac

@@ -806,7 +797,7 @@ process_common_toolchain() {
            arch_int=${arch_int%%te}
            check_add_asflags --defsym ARCHITECTURE=${arch_int}
            tune_cflags="-mtune="
-            if [ ${tgt_isa} = "armv7" ]; then
+            if [ ${tgt_isa} == "armv7" ]; then
                if [ -z "${float_abi}" ]; then
                    check_cpp <<EOF && float_abi=hard || float_abi=softfp
 #ifndef __ARM_PCS_VFP
@@ -843,8 +834,8 @@ EOF
            asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
            AS_SFX=.s
            msvs_arch_dir=arm-msvs
-            disable_feature multithread
-            disable_feature unit_tests
+            disable multithread
+            disable unit_tests
            ;;
        rvct)
            CC=armcc
@@ -856,7 +847,7 @@ EOF
            tune_cflags="--cpu="
            tune_asflags="--cpu="
            if [ -z "${tune_cpu}" ]; then
-                if [ ${tgt_isa} = "armv7" ]; then
+                if [ ${tgt_isa} == "armv7" ]; then
                    if enabled neon
                    then
                        check_add_cflags --fpu=softvfp+vfpv3
@@ -881,8 +872,8 @@ EOF

        case ${tgt_os} in
        none*)
-            disable_feature multithread
-            disable_feature os_support
+            disable multithread
+            disable os_support
            ;;

        android*)
@@ -914,9 +905,9 @@ EOF
            # Cortex-A8 implementations (NDK Dev Guide)
            add_ldflags "-Wl,--fix-cortex-a8"

-            enable_feature pic
+            enable pic
            soft_enable realtime_only
-            if [ ${tgt_isa} = "armv7" ]; then
+            if [ ${tgt_isa} == "armv7" ]; then
                soft_enable runtime_cpu_detect
            fi
            if enabled runtime_cpu_detect; then
@@ -970,7 +961,7 @@ EOF
         ;;

        linux*)
-            enable_feature linux
+            enable linux
            if enabled rvct; then
                # Check if we have CodeSourcery GCC in PATH. Needed for
                # libraries
@@ -1001,14 +992,14 @@ EOF
        tune_cflags="-mtune="
        if enabled dspr2; then
            check_add_cflags -mips32r2 -mdspr2
-            disable_feature fast_unaligned
+            disable fast_unaligned
        fi
        check_add_cflags -march=${tgt_isa}
        check_add_asflags -march=${tgt_isa}
        check_add_asflags -KPIC
    ;;
    ppc*)
-        enable_feature ppc
+        enable ppc
        bits=${tgt_isa##ppc}
        link_with_cc=gcc
        setup_gnu_toolchain
@@ -1156,7 +1147,7 @@ EOF
    ;;
    universal*|*-gcc|generic-gnu)
        link_with_cc=gcc
-        enable_feature gcc
+        enable gcc
    setup_gnu_toolchain
    ;;
    esac
@@ -1190,12 +1181,6 @@ EOF
        fi
    fi

-    # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
-    echo "  checking here for x86inc \"${tgt_isa}\" \"$pic\" "
-    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}"  ]; then
-      soft_enable use_x86inc
-    fi
-
    # Position Independent Code (PIC) support, for building relocatable
    # shared objects
    enabled gcc && enabled pic && check_add_cflags -fPIC
@@ -1205,14 +1190,14 @@ EOF
    enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0

    # Check for strip utility variant
-    ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
+    ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip

    # Try to determine target endianness
    check_cc <<EOF
    unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
 EOF
    [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
-        grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian
+        grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian

    # Try to find which inline keywords are supported
    check_cc <<EOF && INLINE="inline"
@@ -1237,7 +1222,7 @@ EOF
            if enabled dspr2; then
                if enabled big_endian; then
                    echo "dspr2 optimizations are available only for little endian platforms"
-                    disable_feature dspr2
+                    disable dspr2
                fi
            fi
        ;;
@@ -1288,8 +1273,8 @@ print_config_h() {

 print_webm_license() {
    local destination=$1
-    local prefix="$2"
-    local suffix="$3"
+    local prefix=$2
+    local suffix=$3
    shift 3
    cat <<EOF > ${destination}
 ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
@@ -1310,8 +1295,8 @@ process_detect() {
    true;
 }

-enable_feature logging
-logfile="config.log"
+enable logging
+logfile="config.err"
 self=$0
 process() {
    cmdline_args="$@"
--- a/build/make/gen_asm_deps.sh
+++ b/build/make/gen_asm_deps.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -381,7 +381,7 @@ generate_vcproj() {
                            RuntimeLibrary="$debug_runtime" \
                            UsePrecompiledHeader="0" \
                            WarningLevel="3" \
-                            DebugInformationFormat="2" \
+                            DebugInformationFormat="1" \
                            $warn_64bit \

                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
@@ -395,7 +395,7 @@ generate_vcproj() {
                            RuntimeLibrary="$debug_runtime" \
                            UsePrecompiledHeader="0" \
                            WarningLevel="3" \
-                            DebugInformationFormat="2" \
+                            DebugInformationFormat="1" \
                            $warn_64bit \

                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@@ -72,21 +72,10 @@ parse_project() {
    eval "${var}_name=$name"
    eval "${var}_guid=$guid"

-    if [ "$sfx" = "vcproj" ]; then
-        cur_config_list=`grep -A1 '<Configuration' $file |
-            grep Name | cut -d\" -f2`
-    else
-        cur_config_list=`grep -B1 'Label="Configuration"' $file |
-            grep Condition | cut -d\' -f4`
-    fi
-    new_config_list=$(for i in $config_list $cur_config_list; do
-        echo $i
-    done | sort | uniq)
-    if [ "$config_list" != "" ] && [ "$config_list" != "$new_config_list" ]; then
-        mixed_platforms=1
-    fi
-    config_list="$new_config_list"
-    eval "${var}_config_list=\"$cur_config_list\""
+    # assume that all projects have the same list of possible configurations,
+    # so overwriting old config_lists is not a problem
+    config_list=`grep -A1 '<Configuration' $file |
+        grep Name | cut -d\" -f2`
    proj_list="${proj_list} ${var}"
 }

@@ -136,11 +125,6 @@ process_global() {
    indent_push
    IFS_bak=${IFS}
    IFS=$'\r'$'\n'
-    if [ "$mixed_platforms" != "" ]; then
-        config_list="
-Release|Mixed Platforms
-Debug|Mixed Platforms"
-    fi
    for config in ${config_list}; do
        echo "${indent}$config = $config"
    done
@@ -155,17 +139,10 @@ Debug|Mixed Platforms"
    indent_push
    for proj in ${proj_list}; do
        eval "local proj_guid=\${${proj}_guid}"
-        eval "local proj_config_list=\${${proj}_config_list}"
        IFS=$'\r'$'\n'
-        for config in ${proj_config_list}; do
-            if [ "$mixed_platforms" != "" ]; then
-                local c=${config%%|*}
-                echo "${indent}${proj_guid}.${c}|Mixed Platforms.ActiveCfg = ${config}"
-                echo "${indent}${proj_guid}.${c}|Mixed Platforms.Build.0 = ${config}"
-            else
-                echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
-                echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"
-            fi
+        for config in ${config_list}; do
+            echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
+            echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"

        done
        IFS=${IFS_bak}
@@ -191,14 +168,9 @@ process_makefile() {
    IFS=$'\r'$'\n'
    local TAB=$'\t'
    cat <<EOF
-ifeq (\$(CONFIG_VS_VERSION),7)
-MSBUILD_TOOL := devenv.com
-else
-MSBUILD_TOOL := msbuild.exe
-endif
-found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes)
+found_devenv := \$(shell which devenv.com >/dev/null 2>&1 && echo yes)
 .nodevenv.once:
-${TAB}@echo "  * \$(MSBUILD_TOOL) not found in path."
+${TAB}@echo "  * devenv.com not found in path."
 ${TAB}@echo "  * "
 ${TAB}@echo "  * You will have to build all configurations manually using the"
 ${TAB}@echo "  * Visual Studio IDE. To allow make to build them automatically,"
@@ -223,17 +195,16 @@ ${TAB}rm -rf "$platform"/"$config"
 ifneq (\$(found_devenv),)
  ifeq (\$(CONFIG_VS_VERSION),7)
 $nows_sln_config: $outfile
-${TAB}\$(MSBUILD_TOOL) $outfile -build "$config"
+${TAB}devenv.com $outfile -build "$config"

  else
 $nows_sln_config: $outfile
-${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\
-${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform"
+${TAB}devenv.com $outfile -build "$sln_config"

  endif
 else
 $nows_sln_config: $outfile .nodevenv.once
-${TAB}@echo "  * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)."
+${TAB}@echo "  * Skipping build of $sln_config (devenv.com not in path)."
 ${TAB}@echo "  * "
 endif

--- a/build/make/version.sh
+++ b/build/make/version.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -7,6 +7,17 @@ REM   in the file PATENTS.  All contributing project authors may
 REM   be found in the AUTHORS file in the root of the source tree.
 echo on

+cl /I "./" /I "%1" /nologo /c "%1/vp9/common/vp9_asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/vp9_asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/vp9_asm_enc_offsets.c"
+obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
+obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
+obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
+
+cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c"
 cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
+obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
+obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
 obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"

--- a/configure
+++ b/configure
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 ##
 ##  configure
 ##
@@ -38,7 +38,6 @@ Advanced options:
  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
  ${toggle_mem_tracker}           track memory usage
  ${toggle_postproc}              postprocessing
-  ${toggle_vp9_postproc}          vp9 specific postprocessing
  ${toggle_multithread}           multithreaded encoding and decoding
  ${toggle_spatial_resampling}    spatial sampling (scaling) support
  ${toggle_realtime_only}         enable this option while building for real-time encoding
@@ -116,7 +115,6 @@ all_platforms="${all_platforms} x86-darwin9-icc"
 all_platforms="${all_platforms} x86-darwin10-gcc"
 all_platforms="${all_platforms} x86-darwin11-gcc"
 all_platforms="${all_platforms} x86-darwin12-gcc"
-all_platforms="${all_platforms} x86-darwin13-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
 all_platforms="${all_platforms} x86-os2-gcc"
@@ -131,7 +129,6 @@ all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-darwin11-gcc"
 all_platforms="${all_platforms} x86_64-darwin12-gcc"
-all_platforms="${all_platforms} x86_64-darwin13-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -145,7 +142,6 @@ all_platforms="${all_platforms} universal-darwin9-gcc"
 all_platforms="${all_platforms} universal-darwin10-gcc"
 all_platforms="${all_platforms} universal-darwin11-gcc"
 all_platforms="${all_platforms} universal-darwin12-gcc"
-all_platforms="${all_platforms} universal-darwin13-gcc"
 all_platforms="${all_platforms} generic-gnu"

 # all_targets is a list of all targets that can be configured
@@ -154,7 +150,7 @@ all_targets="libs examples docs"

 # all targets available are enabled, by default.
 for t in ${all_targets}; do
-    [ -f ${source_path}/${t}.mk ] && enable_feature ${t}
+    [ -f ${source_path}/${t}.mk ] && enable ${t}
 done

 # check installed doxygen version
@@ -165,30 +161,30 @@ if [ ${doxy_major:-0} -ge 1 ]; then
    doxy_minor=${doxy_version%%.*}
    doxy_patch=${doxy_version##*.}

-    [ $doxy_major -gt 1 ] && enable_feature doxygen
-    [ $doxy_minor -gt 5 ] && enable_feature doxygen
-    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
+    [ $doxy_major -gt 1 ] && enable doxygen
+    [ $doxy_minor -gt 5 ] && enable doxygen
+    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable doxygen
 fi

 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
-enabled doxygen && php -v >/dev/null 2>&1 && enable_feature install_docs
-enable_feature install_bins
-enable_feature install_libs
+enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs
+enable install_bins
+enable install_libs

-enable_feature static
-enable_feature optimizations
-enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
-enable_feature md5
-enable_feature spatial_resampling
-enable_feature multithread
-enable_feature os_support
-enable_feature temporal_denoising
+enable static
+enable optimizations
+enable fast_unaligned #allow unaligned accesses, if supported by hw
+enable md5
+enable spatial_resampling
+enable multithread
+enable os_support
+enable temporal_denoising

-[ -d ${source_path}/../include ] && enable_feature alt_tree_layout
+[ -d ${source_path}/../include ] && enable alt_tree_layout
 for d in vp8 vp9; do
-    [ -d ${source_path}/${d} ] && disable_feature alt_tree_layout;
+    [ -d ${source_path}/${d} ] && disable alt_tree_layout;
 done

 if ! enabled alt_tree_layout; then
@@ -201,10 +197,10 @@ else
 [ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
 [ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder"
 [ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder"
-[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable_feature vp8_encoder
-[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable_feature vp8_decoder
-[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable_feature vp9_encoder
-[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable_feature vp9_decoder
+[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder
+[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder
+[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder
+[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder

 [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
 fi
@@ -251,6 +247,7 @@ EXPERIMENT_LIST="
    multiple_arf
    non420
    alpha
+    balanced_coeftree
 "
 CONFIG_LIST="
    external_build
@@ -258,7 +255,6 @@ CONFIG_LIST="
    install_bins
    install_libs
    install_srcs
-    use_x86inc
    debug
    gprof
    gcov
@@ -280,7 +276,6 @@ CONFIG_LIST="
    dc_recon
    runtime_cpu_detect
    postproc
-    vp9_postproc
    multithread
    internal_stats
    ${CODECS}
@@ -316,7 +311,6 @@ CMDLINE_SELECT="
    gprof
    gcov
    pic
-    use_x86inc
    optimizations
    ccache
    runtime_cpu_detect
@@ -335,7 +329,6 @@ CMDLINE_SELECT="
    dequant_tokens
    dc_recon
    postproc
-    vp9_postproc
    multithread
    internal_stats
    ${CODECS}
@@ -361,12 +354,12 @@ process_cmdline() {
    for opt do
        optval="${opt#*=}"
        case "$opt" in
-        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
+        --disable-codecs) for c in ${CODECS}; do disable $c; done ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
            if enabled experimental; then
-                ${action}_feature $option
+                $action $option
            else
                log_echo "Ignoring $opt -- not in experimental mode."
            fi
@@ -387,8 +380,8 @@ post_process_cmdline() {
    # If the codec family is enabled, enable all components of that family.
    log_echo "Configuring selected codecs"
    for c in ${CODECS}; do
-        disabled ${c%%_*} && disable_feature ${c}
-        enabled ${c%%_*} && enable_feature ${c}
+        disabled ${c%%_*} && disable ${c}
+        enabled ${c%%_*} && enable ${c}
    done

    # Enable all detected codecs, if they haven't been disabled
@@ -396,12 +389,12 @@ post_process_cmdline() {

    # Enable the codec family if any component of that family is enabled
    for c in ${CODECS}; do
-        enabled $c && enable_feature ${c%_*}
+        enabled $c && enable ${c%_*}
    done

    # Set the {en,de}coders variable if any algorithm in that class is enabled
    for c in ${CODECS}; do
-        enabled ${c} && enable_feature ${c##*_}s
+        enabled ${c} && enable ${c##*_}s
    done
 }

@@ -441,7 +434,7 @@ process_targets() {
    done
    enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
    enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
-    ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
+    ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
    ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
    ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
    DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
@@ -511,13 +504,13 @@ process_detect() {
    fi
    if [ -z "$CC" ] || enabled external_build; then
        echo "Bypassing toolchain for environment detection."
-        enable_feature external_build
+        enable external_build
        check_header() {
            log fake_check_header "$@"
            header=$1
            shift
            var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-            disable_feature $var
+            disable $var
            # Headers common to all environments
            case $header in
                stdio.h)
@@ -529,7 +522,7 @@ process_detect() {
                        [ -f "${d##-I}/$header" ] && result=true && break
                    done
                    ${result:-true}
-            esac && enable_feature $var
+            esac && enable $var

            # Specialize windows and POSIX environments.
            case $toolchain in
@@ -537,7 +530,7 @@ process_detect() {
                    case $header-$toolchain in
                        stdint*-gcc) true;;
                        *) false;;
-                    esac && enable_feature $var
+                    esac && enable $var
                    ;;
                *)
                    case $header in
@@ -546,7 +539,7 @@ process_detect() {
                        sys/mman.h) true;;
                        unistd.h) true;;
                        *) false;;
-                    esac && enable_feature $var
+                    esac && enable $var
            esac
            enabled $var
        }
@@ -564,7 +557,7 @@ EOF
    check_header sys/mman.h
    check_header unistd.h # for sysconf(3) and friends.

-    check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
+    check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports
 }

 process_toolchain() {
@@ -646,18 +639,14 @@ process_toolchain() {
    # ccache only really works on gcc toolchains
    enabled gcc || soft_disable ccache
    if enabled mips; then
-        enable_feature dequant_tokens
-        enable_feature dc_recon
-    fi
-
-    if enabled internal_stats; then
-        enable_feature vp9_postproc
+        enable dequant_tokens
+        enable dc_recon
    fi

    # Enable the postbuild target if building for visual studio.
    case "$tgt_cc" in
-        vs*) enable_feature msvs
-             enable_feature solution
+        vs*) enable msvs
+             enable solution
             vs_version=${tgt_cc##vs}
             case $vs_version in
             [789])
@@ -693,14 +682,6 @@ process_toolchain() {
            # iOS/ARM builds do not work with gtest. This does not match
            # x86 targets.
        ;;
-        *-win*)
-            # Some mingw toolchains don't have pthread available by default.
-            # Treat these more like visual studio where threading in gtest
-            # would be disabled for the same reason.
-            check_cxx "$@" <<EOF && soft_enable unit_tests
-int z;
-EOF
-        ;;
        *)
            enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
 int z;
--- a/examples.mk
+++ b/examples.mk
@@ -49,9 +49,6 @@ vpxenc.DESCRIPTION           = Full featured encoder
 UTILS-$(CONFIG_VP8_ENCODER)    += vp8_scalable_patterns.c
 vp8_scalable_patterns.GUID   = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
 vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder
-UTILS-$(CONFIG_VP8_ENCODER)    += vp9_spatial_scalable_encoder.c
-vp8_scalable_patterns.GUID   = 4A38598D-627D-4505-9C7B-D4020C84100D
-vp8_scalable_patterns.DESCRIPTION = Spatial Scalable Encoder

 # Clean up old ivfenc, ivfdec binaries.
 ifeq ($(CONFIG_MSVS),yes)
--- a/libs.mk
+++ b/libs.mk
@@ -57,13 +57,6 @@ CLEAN-OBJS += $$(BUILD_PFX)$(1).h
 RTCD += $$(BUILD_PFX)$(1).h
 endef

-# x86inc.asm is not compatible with pic 32bit builds. Restrict
-# files which use it to 64bit builds or 32bit without pic
-USE_X86INC = no
-ifeq ($(CONFIG_USE_X86INC),yes)
-  USE_X86INC = yes
-endif
-
 CODEC_SRCS-yes += CHANGELOG
 CODEC_SRCS-yes += libs.mk

@@ -390,11 +383,6 @@ LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
                     $(call enabled,LIBVPX_TEST_DATA))
 libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)

-libvpx_test_srcs.txt:
-	@echo "    [CREATE] $@"
-	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
-CLEAN-OBJS += libvpx_test_srcs.txt
-
 $(LIBVPX_TEST_DATA):
 	@echo "    [DOWNLOAD] $@"
 	$(qexec)trap 'rm -f $@' INT TERM &&\
@@ -455,10 +443,6 @@ else
 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
 GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS))
 GTEST_OBJS=$(call objs,$(GTEST_SRCS))
-ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS))
-# Disabling pthreads globally will cause issues on darwin and possibly elsewhere
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
-endif
 $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
 $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
 OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS)
@@ -483,7 +467,7 @@ $(foreach bin,$(LIBVPX_TEST_BINS),\
        lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
    $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
        $(LIBVPX_TEST_OBJS) \
-        -L. -lvpx -lgtest $(extralibs) -lm)\
+        -L. -lvpx -lgtest -lpthread -lm)\
        )))\
    $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\

--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef TEST_ACM_RANDOM_H_
-#define TEST_ACM_RANDOM_H_
+#ifndef LIBVPX_TEST_ACM_RANDOM_H_
+#define LIBVPX_TEST_ACM_RANDOM_H_

 #include "third_party/googletest/src/include/gtest/gtest.h"

@@ -59,4 +59,4 @@ class ACMRandom {

 }  // namespace libvpx_test

-#endif  // TEST_ACM_RANDOM_H_
+#endif  // LIBVPX_TEST_ACM_RANDOM_H_
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -33,6 +33,10 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
    altref_count_ = 0;
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
    if (video->frame() == 1) {
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -27,10 +27,14 @@ class BordersTest : public ::libvpx_test::EncoderTest,
    SetMode(GET_PARAM(1));
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
-      encoder->Control(VP8E_SET_CPUUSED, 1);
+    if ( video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, 0);
      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
--- a/test/clear_system_state.h
+++ b/test/clear_system_state.h
@@ -10,7 +10,7 @@
 #ifndef TEST_CLEAR_SYSTEM_STATE_H_
 #define TEST_CLEAR_SYSTEM_STATE_H_

-#include "./vpx_config.h"
+#include "vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 # include "vpx_ports/x86.h"
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -134,14 +134,14 @@ class VP8CodecFactory : public CodecFactory {

 const libvpx_test::VP8CodecFactory kVP8;

-#define VP8_INSTANTIATE_TEST_CASE(test, ...)\
+#define VP8_INSTANTIATE_TEST_CASE(test, params)\
  INSTANTIATE_TEST_CASE_P(VP8, test, \
      ::testing::Combine( \
          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
              &libvpx_test::kVP8)), \
-          __VA_ARGS__))
+          params))
 #else
-#define VP8_INSTANTIATE_TEST_CASE(test, ...)
+#define VP8_INSTANTIATE_TEST_CASE(test, params)
 #endif  // CONFIG_VP8


@@ -216,14 +216,14 @@ class VP9CodecFactory : public CodecFactory {

 const libvpx_test::VP9CodecFactory kVP9;

-#define VP9_INSTANTIATE_TEST_CASE(test, ...)\
+#define VP9_INSTANTIATE_TEST_CASE(test, params)\
  INSTANTIATE_TEST_CASE_P(VP9, test, \
      ::testing::Combine( \
          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
               &libvpx_test::kVP9)), \
-          __VA_ARGS__))
+          params))
 #else
-#define VP9_INSTANTIATE_TEST_CASE(test, ...)
+#define VP9_INSTANTIATE_TEST_CASE(test, params)
 #endif  // CONFIG_VP9


--- a/test/config_test.cc
+++ b/test/config_test.cc
@@ -40,6 +40,10 @@ class ConfigTest : public ::libvpx_test::EncoderTest,
    ++frame_count_out_;
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  unsigned int frame_count_in_;
  unsigned int frame_count_out_;
  unsigned int frame_count_max_;
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -8,7 +8,6 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include <string.h>
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -23,8 +22,8 @@ extern "C" {
 }

 namespace {
-typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
                              const int16_t *filter_x, int filter_x_stride,
                              const int16_t *filter_y, int filter_y_stride,
                              int w, int h);
@@ -188,7 +187,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {

 protected:
  static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = 256;
+  static const int kOuterBlockSize = 128;
  static const int kInputStride = kOuterBlockSize;
  static const int kOutputStride = kOuterBlockSize;
  static const int kMaxDimension = 64;
@@ -212,7 +211,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {

  virtual void SetUp() {
    UUT_ = GET_PARAM(2);
-    /* Set up guard blocks for an inner block centered in the outer block */
+    /* Set up guard blocks for an inner block cetered in the outer block */
    for (int i = 0; i < kOutputBufferSize; ++i) {
      if (IsIndexInBorder(i))
        output_[i] = 255;
@@ -225,10 +224,6 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
      input_[i] = prng.Rand8Extremes();
  }

-  void SetConstantInput(int value) {
-    memset(input_, value, kInputBufferSize);
-  }
-
  void CheckGuardBlocks() {
    for (int i = 0; i < kOutputBufferSize; ++i) {
      if (IsIndexInBorder(i))
@@ -461,86 +456,45 @@ DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
    { 128}
 };

-/* This test exercises the horizontal and vertical filter functions. */
 TEST_P(ConvolveTest, ChangeFilterWorks) {
  uint8_t* const in = input();
  uint8_t* const out = output();
-
-  /* Assume that the first input sample is at the 8/16th position. */
-  const int kInitialSubPelOffset = 8;
-
-  /* Filters are 8-tap, so the first filter tap will be applied to the pixel
-   * at position -3 with respect to the current filtering position. Since
-   * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8,
-   * which is non-zero only in the last tap. So, applying the filter at the
-   * current input position will result in an output equal to the pixel at
-   * offset +4 (-3 + 7) with respect to the current filtering position.
-   */
  const int kPixelSelected = 4;

-  /* Assume that each output pixel requires us to step on by 17/16th pixels in
-   * the input.
-   */
-  const int kInputPixelStep = 17;
-
-  /* The filters are setup in such a way that the expected output produces
-   * sets of 8 identical output samples. As the filter position moves to the
-   * next 1/16th pixel position the only active (=128) filter tap moves one
-   * position to the left, resulting in the same input pixel being replicated
-   * in to the output for 8 consecutive samples. After each set of 8 positions
-   * the filters select a different input pixel. kFilterPeriodAdjust below
-   * computes which input pixel is written to the output for a specified
-   * x or y position.
-   */
-
-  /* Test the horizontal filter. */
  REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[kInitialSubPelOffset],
-                                 kInputPixelStep, NULL, 0, Width(), Height()));
+                                 kChangeFilters[8], 17, kChangeFilters[4], 16,
+                                 Width(), Height()));

  for (int x = 0; x < Width(); ++x) {
+    const int kQ4StepAdjust = x >> 4;
    const int kFilterPeriodAdjust = (x >> 3) << 3;
-    const int ref_x =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjust * kInputPixelStep)
-                          >> SUBPEL_BITS);
-    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width();
+    const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x;
  }

-  /* Test the vertical filter. */
  REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
-                                 NULL, 0, kChangeFilters[kInitialSubPelOffset],
-                                 kInputPixelStep, Width(), Height()));
+                                 kChangeFilters[4], 16, kChangeFilters[8], 17,
+                                 Width(), Height()));

  for (int y = 0; y < Height(); ++y) {
+    const int kQ4StepAdjust = y >> 4;
    const int kFilterPeriodAdjust = (y >> 3) << 3;
-    const int ref_y =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjust * kInputPixelStep)
-                          >> SUBPEL_BITS);
+    const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
    ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
  }

-  /* Test the horizontal and vertical filters in combination. */
  REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                  kChangeFilters[kInitialSubPelOffset],
-                                  kInputPixelStep,
-                                  kChangeFilters[kInitialSubPelOffset],
-                                  kInputPixelStep,
+                                  kChangeFilters[8], 17, kChangeFilters[8], 17,
                                  Width(), Height()));

  for (int y = 0; y < Height(); ++y) {
+    const int kQ4StepAdjustY = y >> 4;
    const int kFilterPeriodAdjustY = (y >> 3) << 3;
-    const int ref_y =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjustY * kInputPixelStep)
-                          >> SUBPEL_BITS);
+    const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected;
    for (int x = 0; x < Width(); ++x) {
+      const int kQ4StepAdjustX = x >> 4;
      const int kFilterPeriodAdjustX = (x >> 3) << 3;
-      const int ref_x =
-          kPixelSelected + ((kInitialSubPelOffset
-              + kFilterPeriodAdjustX * kInputPixelStep)
-                            >> SUBPEL_BITS);
+      const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected;

      ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
          << "x == " << x << ", y == " << y;
@@ -548,34 +502,6 @@ TEST_P(ConvolveTest, ChangeFilterWorks) {
  }
 }

-/* This test exercises that enough rows and columns are filtered with every
-   possible initial fractional positions and scaling steps. */
-TEST_P(ConvolveTest, CheckScalingFiltering) {
-  uint8_t* const in = input();
-  uint8_t* const out = output();
-
-  SetConstantInput(127);
-
-  for (int frac = 0; frac < 16; ++frac) {
-    for (int step = 1; step <= 32; ++step) {
-      /* Test the horizontal and vertical filters in combination. */
-      REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                      vp9_sub_pel_filters_8[frac], step,
-                                      vp9_sub_pel_filters_8[frac], step,
-                                      Width(), Height()));
-
-      CheckGuardBlocks();
-
-      for (int y = 0; y < Height(); ++y) {
-        for (int x = 0; x < Width(); ++x) {
-          ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x])
-              << "x == " << x << ", y == " << y
-              << ", frac == " << frac << ", step == " << step;
-        }
-      }
-    }
-  }
-}

 using std::tr1::make_tuple;

@@ -601,9 +527,9 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(

 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
-    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
-    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3,
-    vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3);
+    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,
+    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,
+    vp9_convolve8_ssse3, vp9_convolve8_avg_c);

 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
    make_tuple(4, 4, &convolve8_ssse3),
@@ -620,26 +546,4 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
    make_tuple(32, 64, &convolve8_ssse3),
    make_tuple(64, 64, &convolve8_ssse3)));
 #endif
-
-#if HAVE_NEON
-const ConvolveFunctions convolve8_neon(
-    vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
-    vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
-    vp9_convolve8_neon, vp9_convolve8_avg_neon);
-
-INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_neon),
-    make_tuple(8, 4, &convolve8_neon),
-    make_tuple(4, 8, &convolve8_neon),
-    make_tuple(8, 8, &convolve8_neon),
-    make_tuple(16, 8, &convolve8_neon),
-    make_tuple(8, 16, &convolve8_neon),
-    make_tuple(16, 16, &convolve8_neon),
-    make_tuple(32, 16, &convolve8_neon),
-    make_tuple(16, 32, &convolve8_neon),
-    make_tuple(32, 32, &convolve8_neon),
-    make_tuple(64, 32, &convolve8_neon),
-    make_tuple(32, 64, &convolve8_neon),
-    make_tuple(64, 64, &convolve8_neon)));
-#endif
 }  // namespace
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -1,112 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include <climits>
-#include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/i420_video_source.h"
-#include "test/util.h"
-
-namespace {
-
-class CpuSpeedTest : public ::libvpx_test::EncoderTest,
-    public ::libvpx_test::CodecTestWith2Params<
-        libvpx_test::TestMode, int> {
- protected:
-  CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
-  }
-
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
-      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
-      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
-      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
-      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
-      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
-    }
-  }
-
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
-    }
-  }
-  int set_cpu_used_;
-};
-
-TEST_P(CpuSpeedTest, TestQ0) {
-  // Validate that this non multiple of 64 wide clip encodes and decodes
-  // without a mismatch when passing in a very low max q.  This pushes
-  // the encoder to producing lots of big partitions which will likely
-  // extend into the border and test the border condition.
-  cfg_.g_lag_in_frames = 25;
-  cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
-  cfg_.rc_target_bitrate = 400;
-  cfg_.rc_max_quantizer = 0;
-  cfg_.rc_min_quantizer = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       20);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
-
-
-TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
-  // Validate that this non multiple of 64 wide clip encodes and decodes
-  // without a mismatch when passing in a very low max q.  This pushes
-  // the encoder to producing lots of big partitions which will likely
-  // extend into the border and test the border condition.
-  cfg_.g_lag_in_frames = 25;
-  cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
-  cfg_.rc_target_bitrate = 12000;
-  cfg_.rc_max_quantizer = 10;
-  cfg_.rc_min_quantizer = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       40);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
-TEST_P(CpuSpeedTest, TestLowBitrate) {
-  // Validate that this clip encodes and decodes without a mismatch
-  // when passing in a very high min q.  This pushes the encoder to producing
-  // lots of small partitions which might will test the other condition.
-
-  cfg_.g_lag_in_frames = 25;
-  cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
-  cfg_.rc_target_bitrate = 200;
-  cfg_.rc_min_quantizer = 40;
-
-  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       40);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
-
-using std::tr1::make_tuple;
-
-#define VP9_FACTORY \
-  static_cast<const libvpx_test::CodecFactory*> (&libvpx_test::kVP9)
-
-VP9_INSTANTIATE_TEST_CASE(
-    CpuSpeedTest,
-    ::testing::Values(::libvpx_test::kTwoPassGood),
-    ::testing::Range(0, 5));
-}  // namespace
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -42,6 +42,10 @@ class CQTest : public ::libvpx_test::EncoderTest,
    n_frames_ = 0;
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
    if (video->frame() == 1) {
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -36,6 +36,10 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
    duration_ = 0.0;
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
    const vpx_rational_t tb = video->timebase();
@@ -75,7 +79,7 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
    bits_in_buffer_model_ -= frame_size_in_bits;

    // Update the running total of bits for end of test datarate checks.
-    bits_total_ += frame_size_in_bits;
+    bits_total_ += frame_size_in_bits ;

    // If first drop not set and we have a drop set it to this time.
    if (!first_drop_ && duration > 1)
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -13,16 +13,14 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"

 extern "C" {
 #include "vp9/common/vp9_entropy.h"
-#include "./vp9_rtcd.h"
-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);
+#include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
 }
+
+#include "acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -32,13 +30,12 @@ namespace {
 #ifdef _MSC_VER
 static int round(double x) {
  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
+    return (int)ceil(x - 0.5);
  else
-    return static_cast<int>(floor(x + 0.5));
+    return (int)floor(x + 0.5);
 }
 #endif

-const int kNumCoeffs = 256;
 const double PI = 3.1415926535898;
 void reference2_16x16_idct_2d(double *input, double *output) {
  double x;
@@ -47,9 +44,7 @@ void reference2_16x16_idct_2d(double *input, double *output) {
      double s = 0;
      for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
-          x = cos(PI * j * (l + 0.5) / 16.0) *
-              cos(PI * i * (k + 0.5) / 16.0) *
-              input[i * 16 + j] / 256;
+          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256;
          if (i != 0)
            x *= sqrt(2.0);
          if (j != 0)
@@ -63,23 +58,23 @@ void reference2_16x16_idct_2d(double *input, double *output) {
 }


-const double C1 = 0.995184726672197;
-const double C2 = 0.98078528040323;
-const double C3 = 0.956940335732209;
-const double C4 = 0.923879532511287;
-const double C5 = 0.881921264348355;
-const double C6 = 0.831469612302545;
-const double C7 = 0.773010453362737;
-const double C8 = 0.707106781186548;
-const double C9 = 0.634393284163646;
-const double C10 = 0.555570233019602;
-const double C11 = 0.471396736825998;
-const double C12 = 0.38268343236509;
-const double C13 = 0.290284677254462;
-const double C14 = 0.195090322016128;
-const double C15 = 0.098017140329561;
+static const double C1 = 0.995184726672197;
+static const double C2 = 0.98078528040323;
+static const double C3 = 0.956940335732209;
+static const double C4 = 0.923879532511287;
+static const double C5 = 0.881921264348355;
+static const double C6 = 0.831469612302545;
+static const double C7 = 0.773010453362737;
+static const double C8 = 0.707106781186548;
+static const double C9 = 0.634393284163646;
+static const double C10 = 0.555570233019602;
+static const double C11 = 0.471396736825998;
+static const double C12 = 0.38268343236509;
+static const double C13 = 0.290284677254462;
+static const double C14 = 0.195090322016128;
+static const double C15 = 0.098017140329561;

-void butterfly_16x16_dct_1d(double input[16], double output[16]) {
+static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  double step[16];
  double intermediate[16];
  double temp1, temp2;
@@ -112,36 +107,36 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[6] = step[1] - step[6];
  output[7] = step[0] - step[7];

-  temp1 = step[ 8] * C7;
-  temp2 = step[15] * C9;
+  temp1 = step[ 8]*C7;
+  temp2 = step[15]*C9;
  output[ 8] = temp1 + temp2;

-  temp1 = step[ 9] * C11;
-  temp2 = step[14] * C5;
+  temp1 = step[ 9]*C11;
+  temp2 = step[14]*C5;
  output[ 9] = temp1 - temp2;

-  temp1 = step[10] * C3;
-  temp2 = step[13] * C13;
+  temp1 = step[10]*C3;
+  temp2 = step[13]*C13;
  output[10] = temp1 + temp2;

-  temp1 = step[11] * C15;
-  temp2 = step[12] * C1;
+  temp1 = step[11]*C15;
+  temp2 = step[12]*C1;
  output[11] = temp1 - temp2;

-  temp1 = step[11] * C1;
-  temp2 = step[12] * C15;
+  temp1 = step[11]*C1;
+  temp2 = step[12]*C15;
  output[12] = temp2 + temp1;

-  temp1 = step[10] * C13;
-  temp2 = step[13] * C3;
+  temp1 = step[10]*C13;
+  temp2 = step[13]*C3;
  output[13] = temp2 - temp1;

-  temp1 = step[ 9] * C5;
-  temp2 = step[14] * C11;
+  temp1 = step[ 9]*C5;
+  temp2 = step[14]*C11;
  output[14] = temp2 + temp1;

-  temp1 = step[ 8] * C9;
-  temp2 = step[15] * C7;
+  temp1 = step[ 8]*C9;
+  temp2 = step[15]*C7;
  output[15] = temp2 - temp1;

  // step 3
@@ -150,20 +145,20 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  step[ 2] = output[1] - output[2];
  step[ 3] = output[0] - output[3];

-  temp1 = output[4] * C14;
-  temp2 = output[7] * C2;
+  temp1 = output[4]*C14;
+  temp2 = output[7]*C2;
  step[ 4] = temp1 + temp2;

-  temp1 = output[5] * C10;
-  temp2 = output[6] * C6;
+  temp1 = output[5]*C10;
+  temp2 = output[6]*C6;
  step[ 5] = temp1 + temp2;

-  temp1 = output[5] * C6;
-  temp2 = output[6] * C10;
+  temp1 = output[5]*C6;
+  temp2 = output[6]*C10;
  step[ 6] = temp2 - temp1;

-  temp1 = output[4] * C2;
-  temp2 = output[7] * C14;
+  temp1 = output[4]*C2;
+  temp2 = output[7]*C14;
  step[ 7] = temp2 - temp1;

  step[ 8] = output[ 8] + output[11];
@@ -180,18 +175,18 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[ 0] = (step[ 0] + step[ 1]);
  output[ 8] = (step[ 0] - step[ 1]);

-  temp1 = step[2] * C12;
-  temp2 = step[3] * C4;
+  temp1 = step[2]*C12;
+  temp2 = step[3]*C4;
  temp1 = temp1 + temp2;
-  output[ 4] = 2*(temp1 * C8);
+  output[ 4] = 2*(temp1*C8);

-  temp1 = step[2] * C4;
-  temp2 = step[3] * C12;
+  temp1 = step[2]*C4;
+  temp2 = step[3]*C12;
  temp1 = temp2 - temp1;
-  output[12] = 2 * (temp1 * C8);
+  output[12] = 2*(temp1*C8);

-  output[ 2] = 2 * ((step[4] + step[ 5]) * C8);
-  output[14] = 2 * ((step[7] - step[ 6]) * C8);
+  output[ 2] = 2*((step[4] + step[ 5])*C8);
+  output[14] = 2*((step[7] - step[ 6])*C8);

  temp1 = step[4] - step[5];
  temp2 = step[6] + step[7];
@@ -201,17 +196,17 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  intermediate[8] = step[8] + step[14];
  intermediate[9] = step[9] + step[15];

-  temp1 = intermediate[8] * C12;
-  temp2 = intermediate[9] * C4;
+  temp1 = intermediate[8]*C12;
+  temp2 = intermediate[9]*C4;
  temp1 = temp1 - temp2;
-  output[3] = 2 * (temp1 * C8);
+  output[3] = 2*(temp1*C8);

-  temp1 = intermediate[8] * C4;
-  temp2 = intermediate[9] * C12;
+  temp1 = intermediate[8]*C4;
+  temp2 = intermediate[9]*C12;
  temp1 = temp2 + temp1;
-  output[13] = 2 * (temp1 * C8);
+  output[13] = 2*(temp1*C8);

-  output[ 9] = 2 * ((step[10] + step[11]) * C8);
+  output[ 9] = 2*((step[10] + step[11])*C8);

  intermediate[11] = step[10] - step[11];
  intermediate[12] = step[12] + step[13];
@@ -222,300 +217,150 @@ void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[15] = (intermediate[11] + intermediate[12]);
  output[ 1] = -(intermediate[11] - intermediate[12]);

-  output[ 7] = 2 * (intermediate[13] * C8);
+  output[ 7] = 2*(intermediate[13]*C8);

-  temp1 = intermediate[14] * C12;
-  temp2 = intermediate[15] * C4;
+  temp1 = intermediate[14]*C12;
+  temp2 = intermediate[15]*C4;
  temp1 = temp1 - temp2;
-  output[11] = -2 * (temp1 * C8);
+  output[11] = -2*(temp1*C8);

-  temp1 = intermediate[14] * C4;
-  temp2 = intermediate[15] * C12;
+  temp1 = intermediate[14]*C4;
+  temp2 = intermediate[15]*C12;
  temp1 = temp2 + temp1;
-  output[ 5] = 2 * (temp1 * C8);
+  output[ 5] = 2*(temp1*C8);
 }

-void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
+static void reference_16x16_dct_1d(double in[16], double out[16]) {
+  const double kPi = 3.141592653589793238462643383279502884;
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < 16; k++) {
+    out[k] = 0.0;
+    for (int n = 0; n < 16; n++)
+      out[k] += in[n]*cos(kPi*(2*n+1)*k/32.0);
+    if (k == 0)
+      out[k] = out[k]*kInvSqrt2;
+  }
+}
+
+void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) {
  // First transform columns
  for (int i = 0; i < 16; ++i) {
    double temp_in[16], temp_out[16];
    for (int j = 0; j < 16; ++j)
-      temp_in[j] = input[j * 16 + i];
+      temp_in[j] = input[j*16 + i];
    butterfly_16x16_dct_1d(temp_in, temp_out);
    for (int j = 0; j < 16; ++j)
-      output[j * 16 + i] = temp_out[j];
+      output[j*16 + i] = temp_out[j];
  }
  // Then transform rows
  for (int i = 0; i < 16; ++i) {
    double temp_in[16], temp_out[16];
    for (int j = 0; j < 16; ++j)
-      temp_in[j] = output[j + i * 16];
+      temp_in[j] = output[j + i*16];
    butterfly_16x16_dct_1d(temp_in, temp_out);
    // Scale by some magic number
    for (int j = 0; j < 16; ++j)
-      output[j + i * 16] = temp_out[j]/2;
+      output[j + i*16] = temp_out[j]/2;
  }
 }

-typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*idct_t)(int16_t *in, uint8_t *out, int stride);
-typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
-typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);

-void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_short_fdct16x16_c(in, out, stride);
-}
+TEST(VP9Idct16x16Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t in[256], coeff[256];
+    uint8_t dst[256], src[256];
+    double out_r[256];

-void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_short_fht16x16_c(in, out, stride, tx_type);
-}
-
-class Trans16x16TestBase {
- public:
-  virtual ~Trans16x16TestBase() {}
-
- protected:
-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
-
-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
-
-  void RunAccuracyCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    uint32_t max_error = 0;
-    int64_t total_error = 0;
-    const int count_test_block = 10000;
-    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
-      }
-
-      REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
-                                      test_temp_block, pitch_));
-      REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        const uint32_t diff = dst[j] - src[j];
-        const uint32_t error = diff * diff;
-        if (max_error < error)
-          max_error = error;
-        total_error += error;
-      }
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
    }
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 256; ++j)
+      in[j] = src[j] - dst[j];

-    EXPECT_GE(1u, max_error)
-        << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
-
-    EXPECT_GE(count_test_block , total_error)
-        << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
+    reference_16x16_dct_2d(in, out_r);
+    for (int j = 0; j < 256; j++)
+      coeff[j] = round(out_r[j]);
+    vp9_short_idct16x16_add_c(coeff, dst, 16);
+    for (int j = 0; j < 256; ++j) {
+      const int diff = dst[j] - src[j];
+      const int error = diff * diff;
+      EXPECT_GE(1, error)
+          << "Error: 16x16 IDCT has error " << error
+          << " at index " << j;
+    }
  }
+}

-  void RunCoeffCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+// We need to enable the fdct test once we re-do the 16-point fdct.
+TEST(VP9Fdct16x16Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int max_error = 0;
+  double total_error = 0;
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[256];
+    int16_t test_temp_block[256];
+    uint8_t dst[256], src[256];

-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 256; ++j)
+      test_input_block[j] = src[j] - dst[j];

-      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+    const int pitch = 32;
+    vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
+    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);

-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
+    for (int j = 0; j < 256; ++j) {
+      const int diff = dst[j] - src[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
    }
  }

-  void RunMemCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+  EXPECT_GE(1, max_error)
+      << "Error: 16x16 FDCT/IDCT has an individual round trip error > 1";

-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-      }
-      if (i == 0)
-        for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
-      if (i == 1)
-        for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+  EXPECT_GE(count_test_block , total_error)
+      << "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block";
+}

-      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
-                                      output_block, pitch_));
+TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t input_block[256], input_extreme_block[256];
+    int16_t output_block[256], output_extreme_block[256];

-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
-            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      }
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 256; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < 256; ++j)
+        input_extreme_block[j] = 255;
+
+    const int pitch = 32;
+    vp9_short_fdct16x16_c(input_block, output_block, pitch);
+    vp9_short_fdct16x16_c(input_extreme_block, output_extreme_block, pitch);
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < 256; ++j) {
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
+          << "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE";
    }
  }
-
-  void RunInvAccuracyCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      double out_r[kNumCoeffs];
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
-      }
-
-      reference_16x16_dct_2d(in, out_r);
-      for (int j = 0; j < kNumCoeffs; ++j)
-        coeff[j] = round(out_r[j]);
-
-      const int pitch = 32;
-      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch));
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        const uint32_t diff = dst[j] - src[j];
-        const uint32_t error = diff * diff;
-        EXPECT_GE(1u, error)
-            << "Error: 16x16 IDCT has error " << error
-            << " at index " << j;
-      }
-    }
-  }
-  int pitch_;
-  int tx_type_;
-  fht_t fwd_txfm_ref;
-};
-
-class Trans16x16DCT : public Trans16x16TestBase,
-                      public PARAMS(fdct_t, idct_t, int) {
- public:
-  virtual ~Trans16x16DCT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_  = GET_PARAM(2);
-    pitch_    = 32;
-    fwd_txfm_ref = fdct16x16_ref;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride >> 1);
-  }
-
-  fdct_t fwd_txfm_;
-  idct_t inv_txfm_;
-};
-
-TEST_P(Trans16x16DCT, AccuracyCheck) {
-  RunAccuracyCheck();
 }
-
-TEST_P(Trans16x16DCT, CoeffCheck) {
-  RunCoeffCheck();
-}
-
-TEST_P(Trans16x16DCT, MemCheck) {
-  RunMemCheck();
-}
-
-TEST_P(Trans16x16DCT, InvAccuracyCheck) {
-  RunInvAccuracyCheck();
-}
-
-class Trans16x16HT : public Trans16x16TestBase,
-                     public PARAMS(fht_t, iht_t, int) {
- public:
-  virtual ~Trans16x16HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_  = GET_PARAM(2);
-    pitch_    = 16;
-    fwd_txfm_ref = fht16x16_ref;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
-    fwd_txfm_(in, out, stride, tx_type_);
-  }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, tx_type_);
-  }
-
-  fht_t fwd_txfm_;
-  iht_t inv_txfm_;
-};
-
-TEST_P(Trans16x16HT, AccuracyCheck) {
-  RunAccuracyCheck();
-}
-
-TEST_P(Trans16x16HT, CoeffCheck) {
-  RunCoeffCheck();
-}
-
-TEST_P(Trans16x16HT, MemCheck) {
-  RunMemCheck();
-}
-
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
-    C, Trans16x16DCT,
-    ::testing::Values(
-        make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0)));
-INSTANTIATE_TEST_CASE_P(
-    C, Trans16x16HT,
-    ::testing::Values(
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3)));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans16x16DCT,
-    ::testing::Values(
-        make_tuple(&vp9_short_fdct16x16_sse2, &vp9_short_idct16x16_add_c, 0)));
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans16x16HT,
-    ::testing::Values(
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3)));
-#endif
 }  // namespace
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -13,17 +13,15 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"

 extern "C" {
-#include "./vpx_config.h"
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
+  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
+  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }

+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -32,15 +30,35 @@ namespace {
 #ifdef _MSC_VER
 static int round(double x) {
  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
+    return (int)ceil(x - 0.5);
  else
-    return static_cast<int>(floor(x + 0.5));
+    return (int)floor(x + 0.5);
 }
 #endif

-const int kNumCoeffs = 1024;
-const double kPi = 3.141592653589793238462643383279502884;
-void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
+static const double kPi = 3.141592653589793238462643383279502884;
+static void reference2_32x32_idct_2d(double *input, double *output) {
+  double x;
+  for (int l = 0; l < 32; ++l) {
+    for (int k = 0; k < 32; ++k) {
+      double s = 0;
+      for (int i = 0; i < 32; ++i) {
+        for (int j = 0; j < 32; ++j) {
+          x = cos(kPi * j * (l + 0.5) / 32.0) *
+              cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
+          if (i != 0)
+            x *= sqrt(2.0);
+          if (j != 0)
+            x *= sqrt(2.0);
+          s += x;
+        }
+      }
+      output[k * 32 + l] = s / 4;
+    }
+  }
+}
+
+static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 32; k++) {
    out[k] = 0.0;
@@ -51,8 +69,7 @@ void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
  }
 }

-void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
-                            double output[kNumCoeffs]) {
+static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
  // First transform columns
  for (int i = 0; i < 32; ++i) {
    double temp_in[32], temp_out[32];
@@ -74,165 +91,27 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
  }
 }

-typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);
-
-class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
- public:
-  virtual ~Trans32x32Test() {}
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    version_  = GET_PARAM(2);  // 0: high precision forward transform
-                               // 1: low precision version for rd loop
-  }
-
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  int version_;
-  fwd_txfm_t fwd_txfm_;
-  inv_txfm_t inv_txfm_;
-};
-
-TEST_P(Trans32x32Test, AccuracyCheck) {
+TEST(VP9Idct32x32Test, AccuracyCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  uint32_t max_error = 0;
-  int64_t total_error = 0;
  const int count_test_block = 1000;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
-
  for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < kNumCoeffs; ++j) {
+    int16_t in[1024], coeff[1024];
+    uint8_t dst[1024], src[1024];
+    double out_r[1024];
+
+    for (int j = 0; j < 1024; ++j) {
      src[j] = rnd.Rand8();
      dst[j] = rnd.Rand8();
-      test_input_block[j] = src[j] - dst[j];
    }
-
-    const int pitch = 64;
-    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
-    REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
-
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      const uint32_t diff = dst[j] - src[j];
-      const uint32_t error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
-  }
-
-  if (version_ == 1) {
-    max_error /= 2;
-    total_error /= 45;
-  }
-
-  EXPECT_GE(1u, max_error)
-      << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
-
-  EXPECT_GE(count_test_block, total_error)
-      << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
-}
-
-TEST_P(Trans32x32Test, CoeffCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-
-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
-
-  for (int i = 0; i < count_test_block; ++i) {
-    for (int j = 0; j < kNumCoeffs; ++j)
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
-    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
-
-    if (version_ == 0) {
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j])
-            << "Error: 32x32 FDCT versions have mismatched coefficients";
-    } else {
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
-            << "Error: 32x32 FDCT rd has mismatched coefficients";
-    }
-  }
-}
-
-TEST_P(Trans32x32Test, MemCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 2000;
-
-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
-
-  for (int i = 0; i < count_test_block; ++i) {
    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
-    }
-    if (i == 0)
-      for (int j = 0; j < kNumCoeffs; ++j)
-        input_extreme_block[j] = 255;
-    if (i == 1)
-      for (int j = 0; j < kNumCoeffs; ++j)
-        input_extreme_block[j] = -255;
-
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
-    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      if (version_ == 0) {
-        EXPECT_EQ(output_block[j], output_ref_block[j])
-            << "Error: 32x32 FDCT versions have mismatched coefficients";
-      } else {
-        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
-            << "Error: 32x32 FDCT rd has mismatched coefficients";
-      }
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
-          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
-          << "Error: 32x32 FDCT has coefficient larger than "
-          << "4*DCT_MAX_VALUE";
-    }
-  }
-}
-
-TEST_P(Trans32x32Test, InverseAccuracy) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
-
-  for (int i = 0; i < count_test_block; ++i) {
-    double out_r[kNumCoeffs];
-
-    // Initialize a test block with input range [-255, 255]
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
+    for (int j = 0; j < 1024; ++j)
      in[j] = src[j] - dst[j];
-    }

    reference_32x32_dct_2d(in, out_r);
-    for (int j = 0; j < kNumCoeffs; ++j)
+    for (int j = 0; j < 1024; j++)
      coeff[j] = round(out_r[j]);
-    REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
-    for (int j = 0; j < kNumCoeffs; ++j) {
+    vp9_short_idct32x32_add_c(coeff, dst, 32);
+    for (int j = 0; j < 1024; ++j) {
      const int diff = dst[j] - src[j];
      const int error = diff * diff;
      EXPECT_GE(1, error)
@@ -242,21 +121,72 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
  }
 }

-using std::tr1::make_tuple;
+TEST(VP9Fdct32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  unsigned int max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[1024];
+    int16_t test_temp_block[1024];
+    uint8_t dst[1024], src[1024];

-INSTANTIATE_TEST_CASE_P(
-    C, Trans32x32Test,
-    ::testing::Values(
-        make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
-        make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j)
+      test_input_block[j] = src[j] - dst[j];

-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans32x32Test,
-    ::testing::Values(
-        make_tuple(&vp9_short_fdct32x32_sse2,
-                   &vp9_short_idct32x32_add_sse2, 0),
-        make_tuple(&vp9_short_fdct32x32_rd_sse2,
-                   &vp9_short_idct32x32_add_sse2, 1)));
-#endif
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
+    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
+
+    for (int j = 0; j < 1024; ++j) {
+      const unsigned diff = dst[j] - src[j];
+      const unsigned error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1u, max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
+}
+
+TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t input_block[1024], input_extreme_block[1024];
+    int16_t output_block[1024], output_extreme_block[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < 1024; ++j)
+        input_extreme_block[j] = 255;
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_block, output_block, pitch);
+    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < 1024; ++j) {
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
+          << "Error: 32x32 FDCT extreme has coefficient larger than "
+             "4*DCT_MAX_VALUE";
+    }
+  }
+}
 }  // namespace
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -12,7 +12,7 @@
 #define TEST_DECODE_TEST_DRIVER_H_
 #include <cstring>
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
+#include "vpx_config.h"
 #include "vpx/vpx_decoder.h"

 namespace libvpx_test {
@@ -36,8 +36,9 @@ class DxDataIterator {
 };

 // Provides a simplified interface to manage one video decoding.
-// Similar to Encoder class, the exact services should be added
-// as more tests are added.
+//
+// TODO: As with the Encoder class, the exact services should be
+// added as more tests are added.
 class Decoder {
 public:
  Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "./vpx_config.h"
+#include "vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/decode_test_driver.h"
@@ -114,19 +114,19 @@ static bool compare_img(const vpx_image_t *img1,
  const unsigned int height_y = img1->d_h;
  unsigned int i;
  for (i = 0; i < height_y; ++i)
-    match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
-                    img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                    width_y) == 0) && match;
+    match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                     width_y) == 0) && match;
  const unsigned int width_uv  = (img1->d_w + 1) >> 1;
  const unsigned int height_uv = (img1->d_h + 1) >> 1;
  for (i = 0; i <  height_uv; ++i)
-    match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
-                    img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
-                    width_uv) == 0) && match;
+    match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                     width_uv) == 0) && match;
  for (i = 0; i < height_uv; ++i)
-    match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
-                    img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
-                    width_uv) == 0) && match;
+    match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                     width_uv) == 0) && match;
  return match;
 }

@@ -158,7 +158,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
    bool again;
    for (again = true, video->Begin(); again; video->Next()) {
-      again = (video->img() != NULL);
+      again = video->img() != NULL;

      PreEncodeFrameHook(video);
      PreEncodeFrameHook(video, encoder);
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -190,9 +190,7 @@ class EncoderTest {
  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}

  // Hook to determine whether the encode loop should continue.
-  virtual bool Continue() const {
-    return !(::testing::Test::HasFatalFailure() || abort_);
-  }
+  virtual bool Continue() const { return !abort_; }

  const CodecFactory   *codec_;
  // Hook to determine whether to decode frame after encoding
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -50,6 +50,10 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
    mismatch_nframes_ = 0;
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
    psnr_ += pkt->data.psnr.psnr[0];
    nframes_++;
@@ -62,7 +66,7 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
    if (droppable_nframes_ > 0 &&
        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
      for (unsigned int i = 0; i < droppable_nframes_; ++i) {
-        if (droppable_frames_[i] == video->frame()) {
+        if (droppable_frames_[i] == nframes_) {
          std::cout << "             Encoding droppable frame: "
                    << droppable_frames_[i] << "\n";
          frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
@@ -148,7 +152,7 @@ TEST_P(ErrorResilienceTest, OnVersusOff) {
  const vpx_rational timebase = { 33333333, 1000000000 };
  cfg_.g_timebase = timebase;
  cfg_.rc_target_bitrate = 2000;
-  cfg_.g_lag_in_frames = 10;
+  cfg_.g_lag_in_frames = 25;

  init_flags_ = VPX_CODEC_USE_PSNR;

@@ -179,9 +183,6 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
  const vpx_rational timebase = { 33333333, 1000000000 };
  cfg_.g_timebase = timebase;
  cfg_.rc_target_bitrate = 500;
-  // FIXME(debargha): Fix this to work for any lag.
-  // Currently this test only works for lag = 0
-  cfg_.g_lag_in_frames = 0;

  init_flags_ = VPX_CODEC_USE_PSNR;

--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -15,69 +15,68 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 extern "C" {
-#include "./vp9_rtcd.h"
+#include "vp9_rtcd.h"
 }

-#include "test/acm_random.h"
+#include "acm_random.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"

 using libvpx_test::ACMRandom;

 namespace {
-void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-             int stride, int /*tx_type*/) {
+void fdct4x4(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
  vp9_short_fdct4x4_c(in, out, stride);
 }
-void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                 int stride, int /*tx_type*/) {
+void idct4x4_add(int16_t *in, int16_t *out, uint8_t *dst,
+                 int stride, int tx_type) {
  vp9_short_idct4x4_add_c(out, dst, stride >> 1);
 }
-void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-            int stride, int tx_type) {
+void fht4x4(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
  vp9_short_fht4x4_c(in, out, stride >> 1, tx_type);
 }
-void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+void iht4x4_add(int16_t *in, int16_t *out, uint8_t *dst,
                int stride, int tx_type) {
  vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type);
 }

 class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
 public:
-  virtual ~FwdTrans4x4Test() {}
-  virtual void SetUp() {
-    tx_type_ = GetParam();
-    if (tx_type_ == 0) {
-      fwd_txfm_ = fdct4x4;
-      inv_txfm_ = idct4x4_add;
+  FwdTrans4x4Test() {SetUpTestTxfm();}
+  ~FwdTrans4x4Test() {}
+
+  void SetUpTestTxfm() {
+    tx_type = GetParam();
+    if (tx_type == 0) {
+      fwd_txfm = fdct4x4;
+      inv_txfm = idct4x4_add;
    } else {
-      fwd_txfm_ = fht4x4;
-      inv_txfm_ = iht4x4_add;
+      fwd_txfm = fht4x4;
+      inv_txfm = iht4x4_add;
    }
  }

 protected:
  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
                  int stride, int tx_type) {
-    (*fwd_txfm_)(in, out, dst, stride, tx_type);
+    (*fwd_txfm)(in, out, dst, stride, tx_type);
  }

  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
                  int stride, int tx_type) {
-    (*inv_txfm_)(in, out, dst, stride, tx_type);
+    (*inv_txfm)(in, out, dst, stride, tx_type);
  }

-  int tx_type_;
-  void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
+  int tx_type;
+  void (*fwd_txfm)(int16_t *in, int16_t *out, uint8_t *dst,
                   int stride, int tx_type);
-  void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
+  void (*inv_txfm)(int16_t *in, int16_t *out, uint8_t *dst,
                   int stride, int tx_type);
 };

 TEST_P(FwdTrans4x4Test, SignBiasCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
+  int16_t test_input_block[16];
+  int16_t test_output_block[16];
  const int pitch = 8;
  int count_sign_block[16][2];
  const int count_test_block = 1000000;
@@ -88,7 +87,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
    for (int j = 0; j < 16; ++j)
      test_input_block[j] = rnd.Rand8() - rnd.Rand8();

-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type);

    for (int j = 0; j < 16; ++j) {
      if (test_output_block[j] < 0)
@@ -104,7 +103,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
    EXPECT_TRUE(bias_acceptable)
        << "Error: 4x4 FDCT/FHT has a sign bias > 1%"
        << " for input range [-255, 255] at index " << j
-        << " tx_type " << tx_type_;
+        << " tx_type " << tx_type;
  }

  memset(count_sign_block, 0, sizeof(count_sign_block));
@@ -113,7 +112,7 @@ TEST_P(FwdTrans4x4Test, SignBiasCheck) {
    for (int j = 0; j < 16; ++j)
      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);

-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type);

    for (int j = 0; j < 16; ++j) {
      if (test_output_block[j] < 0)
@@ -136,13 +135,12 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  int max_error = 0;
-  int total_error = 0;
+  double total_error = 0;
  const int count_test_block = 1000000;
  for (int i = 0; i < count_test_block; ++i) {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);
+    int16_t test_input_block[16];
+    int16_t test_temp_block[16];
+    uint8_t dst[16], src[16];

    for (int j = 0; j < 16; ++j) {
      src[j] = rnd.Rand8();
@@ -153,10 +151,10 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
      test_input_block[j] = src[j] - dst[j];

    const int pitch = 8;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type);

    for (int j = 0; j < 16; ++j) {
-        if (test_temp_block[j] > 0) {
+        if(test_temp_block[j] > 0) {
          test_temp_block[j] += 2;
          test_temp_block[j] /= 4;
          test_temp_block[j] *= 4;
@@ -168,7 +166,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
    }

    // inverse transform and reconstruct the pixel block
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type);

    for (int j = 0; j < 16; ++j) {
      const int diff = dst[j] - src[j];
@@ -183,7 +181,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {

  EXPECT_GE(count_test_block, total_error)
      << "Error: FDCT/IDCT or FHT/IHT has average "
-      << "roundtrip error > 1 per block";
+          "roundtrip error > 1 per block";
 }

 INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4));
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -13,78 +13,23 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "vpx_ports/mem.h"

 extern "C" {
-#include "./vp9_rtcd.h"
-void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
+#include "vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
 }

-#include "test/acm_random.h"
+#include "acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;

 namespace {
-void fdct8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-             int stride, int /*tx_type*/) {
-  vp9_short_fdct8x8_c(in, out, stride);
-}
-void idct8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                 int stride, int /*tx_type*/) {
-  vp9_short_idct8x8_add_c(out, dst, stride >> 1);
-}
-void fht8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-            int stride, int tx_type) {
-  // TODO(jingning): need to refactor this to test both _c and _sse2 functions,
-  // when we have all inverse dct functions done sse2.
-#if HAVE_SSE2
-  vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type);
-#else
-  vp9_short_fht8x8_c(in, out, stride >> 1, tx_type);
-#endif
-}
-void iht8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                int stride, int tx_type) {
-  vp9_short_iht8x8_add_c(out, dst, stride >> 1, tx_type);
-}

-class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
- public:
-  virtual ~FwdTrans8x8Test() {}
-  virtual void SetUp() {
-    tx_type_ = GetParam();
-    if (tx_type_ == 0) {
-      fwd_txfm = fdct8x8;
-      inv_txfm = idct8x8_add;
-    } else {
-      fwd_txfm = fht8x8;
-      inv_txfm = iht8x8_add;
-    }
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-    (*fwd_txfm)(in, out, dst, stride, tx_type);
-  }
-  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-    (*inv_txfm)(in, out, dst, stride, tx_type);
-  }
-
-  int tx_type_;
-  void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
-  void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
-};
-
-TEST_P(FwdTrans8x8Test, SignBiasCheck) {
+TEST(VP9Fdct8x8Test, SignBiasCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
+  int16_t test_input_block[64];
+  int16_t test_output_block[64];
  const int pitch = 16;
  int count_sign_block[64][2];
  const int count_test_block = 100000;
@@ -95,9 +40,8 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 64; ++j)
      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-    REGISTER_STATE_CHECK(
-        RunFwdTxfm(test_input_block, test_output_block,
-                   NULL, pitch, tx_type_));
+
+    vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);

    for (int j = 0; j < 64; ++j) {
      if (test_output_block[j] < 0)
@@ -111,7 +55,7 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
    const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
    const int max_diff = 1125;
    EXPECT_LT(diff, max_diff)
-        << "Error: 8x8 FDCT/FHT has a sign bias > "
+        << "Error: 8x8 FDCT has a sign bias > "
        << 1. * max_diff / count_test_block * 100 << "%"
        << " for input range [-255, 255] at index " << j
        << " count0: " << count_sign_block[j][0]
@@ -125,9 +69,8 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
    // Initialize a test block with input range [-15, 15].
    for (int j = 0; j < 64; ++j)
      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
-    REGISTER_STATE_CHECK(
-        RunFwdTxfm(test_input_block, test_output_block,
-                   NULL, pitch, tx_type_));
+
+    vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);

    for (int j = 0; j < 64; ++j) {
      if (test_output_block[j] < 0)
@@ -141,25 +84,24 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) {
    const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
    const int max_diff = 10000;
    EXPECT_LT(diff, max_diff)
-        << "Error: 4x4 FDCT/FHT has a sign bias > "
+        << "Error: 8x8 FDCT has a sign bias > "
        << 1. * max_diff / count_test_block * 100 << "%"
        << " for input range [-15, 15] at index " << j
        << " count0: " << count_sign_block[j][0]
        << " count1: " << count_sign_block[j][1]
        << " diff: " << diff;
  }
-}
+};

-TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
+TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int max_error = 0;
-  int total_error = 0;
+  double total_error = 0;
  const int count_test_block = 100000;
  for (int i = 0; i < count_test_block; ++i) {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+    int16_t test_input_block[64];
+    int16_t test_temp_block[64];
+    uint8_t dst[64], src[64];

    for (int j = 0; j < 64; ++j) {
      src[j] = rnd.Rand8();
@@ -170,11 +112,9 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
      test_input_block[j] = src[j] - dst[j];

    const int pitch = 16;
-    REGISTER_STATE_CHECK(
-        RunFwdTxfm(test_input_block, test_temp_block,
-                   dst, pitch, tx_type_));
-    for (int j = 0; j < 64; ++j) {
-        if (test_temp_block[j] > 0) {
+    vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
+    for (int j = 0; j < 64; ++j){
+        if(test_temp_block[j] > 0) {
          test_temp_block[j] += 2;
          test_temp_block[j] /= 4;
          test_temp_block[j] *= 4;
@@ -184,9 +124,7 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
          test_temp_block[j] *= 4;
        }
    }
-    REGISTER_STATE_CHECK(
-        RunInvTxfm(test_input_block, test_temp_block,
-                   dst, pitch, tx_type_));
+    vp9_short_idct8x8_add_c(test_temp_block, dst, 8);

    for (int j = 0; j < 64; ++j) {
      const int diff = dst[j] - src[j];
@@ -198,23 +136,21 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
  }

  EXPECT_GE(1, max_error)
-    << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
+      << "Error: 8x8 FDCT/IDCT has an individual roundtrip error > 1";

  EXPECT_GE(count_test_block/5, total_error)
-    << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
-        "error > 1/5 per block";
-}
+      << "Error: 8x8 FDCT/IDCT has average roundtrip error > 1/5 per block";
+};

-TEST_P(FwdTrans8x8Test, ExtremalCheck) {
+TEST(VP9Fdct8x8Test, ExtremalCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int max_error = 0;
-  int total_error = 0;
+  double total_error = 0;
  const int count_test_block = 100000;
  for (int i = 0; i < count_test_block; ++i) {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+    int16_t test_input_block[64];
+    int16_t test_temp_block[64];
+    uint8_t dst[64], src[64];

    for (int j = 0; j < 64; ++j) {
      src[j] = rnd.Rand8() % 2 ? 255 : 0;
@@ -225,12 +161,8 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) {
      test_input_block[j] = src[j] - dst[j];

    const int pitch = 16;
-    REGISTER_STATE_CHECK(
-        RunFwdTxfm(test_input_block, test_temp_block,
-                   dst, pitch, tx_type_));
-    REGISTER_STATE_CHECK(
-        RunInvTxfm(test_input_block, test_temp_block,
-                   dst, pitch, tx_type_));
+    vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
+    vp9_short_idct8x8_add_c(test_temp_block, dst, 8);

    for (int j = 0; j < 64; ++j) {
      const int diff = dst[j] - src[j];
@@ -241,14 +173,13 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) {
    }

    EXPECT_GE(1, max_error)
-        << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has an"
+        << "Error: Extremal 8x8 FDCT/IDCT has an"
        << " individual roundtrip error > 1";

    EXPECT_GE(count_test_block/5, total_error)
-        << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
+        << "Error: Extremal 8x8 FDCT/IDCT has average"
        << " roundtrip error > 1/5 per block";
  }
-}
+};

-INSTANTIATE_TEST_CASE_P(VP9, FwdTrans8x8Test, ::testing::Range(0, 4));
 }  // namespace
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@@ -11,7 +11,6 @@
 #define TEST_I420_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <cstdlib>
-#include <string>

 #include "test/video_source.h"

@@ -35,6 +34,7 @@ class I420VideoSource : public VideoSource {
        height_(0),
        framerate_numerator_(rate_numerator),
        framerate_denominator_(rate_denominator) {
+
    // This initializes raw_sz_, width_, height_ and allocates an img.
    SetSize(width, height);
  }
@@ -49,7 +49,7 @@ class I420VideoSource : public VideoSource {
    if (input_file_)
      fclose(input_file_);
    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
        << file_name_;
    if (start_) {
      fseek(input_file_, raw_sz_ * start_, SEEK_SET);
@@ -92,7 +92,6 @@ class I420VideoSource : public VideoSource {
  }

  virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
    // Read a frame from input_file.
    if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
      limit_ = frame_;
@@ -109,8 +108,8 @@ class I420VideoSource : public VideoSource {
  unsigned int frame_;
  unsigned int width_;
  unsigned int height_;
-  int framerate_numerator_;
-  int framerate_denominator_;
+  unsigned int framerate_numerator_;
+  unsigned int framerate_denominator_;
 };

 }  // namespace libvpx_test
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -15,10 +15,10 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 extern "C" {
-#include "./vp9_rtcd.h"
+#include "vp9_rtcd.h"
 }

-#include "test/acm_random.h"
+#include "acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -27,10 +27,10 @@ namespace {

 #ifdef _MSC_VER
 static int round(double x) {
-  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
+  if(x < 0)
+    return (int)ceil(x - 0.5);
  else
-    return static_cast<int>(floor(x + 0.5));
+    return (int)floor(x + 0.5);
 }
 #endif

--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+
 extern "C" {
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
@@ -16,101 +17,105 @@ extern "C" {
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"

-#include "vpx/vpx_integer.h"
-
-typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr,
+typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
                          int pred_stride, unsigned char *dst_ptr,
                          int dst_stride);
 namespace {
 class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
- protected:
-  virtual void SetUp() {
-    int i;
+  protected:
+    virtual void SetUp() {
+        int i;

-    UUT = GetParam();
-    memset(input, 0, sizeof(input));
-    /* Set up guard blocks */
-    for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
-  }
+        UUT = GetParam();
+        memset(input, 0, sizeof(input));
+        /* Set up guard blocks */
+        for (i = 0; i < 256; i++)
+            output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
+    }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+    virtual void TearDown() {
+      libvpx_test::ClearSystemState();
+    }

-  idct_fn_t UUT;
-  int16_t input[16];
-  unsigned char output[256];
-  unsigned char predict[256];
+    idct_fn_t UUT;
+    short input[16];
+    unsigned char output[256];
+    unsigned char predict[256];
 };

 TEST_P(IDCTTest, TestGuardBlocks) {
-  int i;
+    int i;

-  for (i = 0; i < 256; i++)
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(0, output[i]) << i;
-    else
-      EXPECT_EQ(255, output[i]);
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
+            EXPECT_EQ(0, output[i]) << i;
+        else
+            EXPECT_EQ(255, output[i]);
 }

 TEST_P(IDCTTest, TestAllZeros) {
-  int i;
+    int i;

-  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-  for (i = 0; i < 256; i++)
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(0, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
+            EXPECT_EQ(0, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
 }

 TEST_P(IDCTTest, TestAllOnes) {
-  int i;
+    int i;

-  input[0] = 4;
-  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+    input[0] = 4;
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-  for (i = 0; i < 256; i++)
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(1, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
+            EXPECT_EQ(1, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
 }

 TEST_P(IDCTTest, TestAddOne) {
-  int i;
+    int i;

-  for (i = 0; i < 256; i++) predict[i] = i;
-  input[0] = 4;
-  REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
+    for (i = 0; i < 256; i++)
+        predict[i] = i;
+    input[0] = 4;
+    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));

-  for (i = 0; i < 256; i++)
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(i + 1, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
+            EXPECT_EQ(i+1, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
 }

 TEST_P(IDCTTest, TestWithData) {
-  int i;
+    int i;

-  for (i = 0; i < 16; i++) input[i] = i;
+    for (i = 0; i < 16; i++)
+        input[i] = i;

-  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-  for (i = 0; i < 256; i++)
-    if ((i & 0xF) > 3 || i > 63)
-      EXPECT_EQ(255, output[i]) << "i==" << i;
-    else if (i == 0)
-      EXPECT_EQ(11, output[i]) << "i==" << i;
-    else if (i == 34)
-      EXPECT_EQ(1, output[i]) << "i==" << i;
-    else if (i == 2 || i == 17 || i == 32)
-      EXPECT_EQ(3, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(0, output[i]) << "i==" << i;
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) > 3 || i > 63)
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+        else if (i == 0)
+            EXPECT_EQ(11, output[i]) << "i==" << i;
+        else if (i == 34)
+            EXPECT_EQ(1, output[i]) << "i==" << i;
+        else if (i == 2 || i == 17 || i == 32)
+            EXPECT_EQ(3, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(0, output[i]) << "i==" << i;
 }

-INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
+INSTANTIATE_TEST_CASE_P(C, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_c));
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
                        ::testing::Values(vp8_short_idct4x4llm_mmx));
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -15,8 +15,8 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"
 }
@@ -27,8 +27,6 @@ using libvpx_test::ACMRandom;

 class IntraPredBase {
 public:
-  virtual ~IntraPredBase() {}
-
  virtual void TearDown() {
    libvpx_test::ClearSystemState();
  }
@@ -106,9 +104,9 @@ class IntraPredBase {
          for (int y = 0; y < block_size_; y++)
            sum += data_ptr_[p][y * stride_ - 1];
        expected = (sum + (1 << (shift - 1))) >> shift;
-      } else {
+      } else
        expected = 0x80;
-      }
+
      // check that all subsequent lines are equal to the first
      for (int y = 1; y < block_size_; ++y)
        ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -28,7 +28,7 @@ static unsigned int MemGetLe32(const uint8_t *mem) {
 // so that we can do actual file decodes.
 class IVFVideoSource : public CompressedVideoSource {
 public:
-  explicit IVFVideoSource(const std::string &file_name)
+  IVFVideoSource(const std::string &file_name)
      : file_name_(file_name),
        input_file_(NULL),
        compressed_frame_buf_(NULL),
@@ -47,13 +47,12 @@ class IVFVideoSource : public CompressedVideoSource {
  virtual void Init() {
    // Allocate a buffer for read in the compressed video frame.
    compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
-    ASSERT_TRUE(compressed_frame_buf_ != NULL)
-        << "Allocate frame buffer failed";
+    ASSERT_TRUE(compressed_frame_buf_) << "Allocate frame buffer failed";
  }

  virtual void Begin() {
    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
        << file_name_;

    // Read file header
@@ -73,7 +72,6 @@ class IVFVideoSource : public CompressedVideoSource {
  }

  void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
    uint8_t frame_hdr[kIvfFrameHdrSize];
    // Check frame header and read a frame from input_file.
    if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_)
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@@ -31,6 +31,10 @@ class KeyframeTest : public ::libvpx_test::EncoderTest,
    set_cpu_used_ = 0;
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
    if (kf_do_force_kf_)
@@ -132,6 +136,7 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
  // Verify that keyframes match the file keyframes in the file.
  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
       iter != kf_pts_list_.end(); ++iter) {
+
    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
        << *iter;
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef TEST_MD5_HELPER_H_
-#define TEST_MD5_HELPER_H_
+#ifndef LIBVPX_TEST_MD5_HELPER_H_
+#define LIBVPX_TEST_MD5_HELPER_H_

 extern "C" {
 #include "./md5_utils.h"
@@ -25,15 +25,9 @@ class MD5 {

  void Add(const vpx_image_t *img) {
    for (int plane = 0; plane < 3; ++plane) {
-      const uint8_t *buf = img->planes[plane];
-      // Calculate the width and height to do the md5 check. For the chroma
-      // plane, we never want to round down and thus skip a pixel so if
-      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
-      // This works only for chroma_shift of 0 and 1.
-      const int h = plane ? (img->d_h + img->y_chroma_shift) >>
-                    img->y_chroma_shift : img->d_h;
-      const int w = plane ? (img->d_w + img->x_chroma_shift) >>
-                    img->x_chroma_shift : img->d_w;
+      uint8_t *buf = img->planes[plane];
+      const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
+      const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;

      for (int y = 0; y < h; ++y) {
        MD5Update(&md5_, buf, w);
@@ -67,4 +61,4 @@ class MD5 {

 }  // namespace libvpx_test

-#endif  // TEST_MD5_HELPER_H_
+#endif  // LIBVPX_TEST_MD5_HELPER_H_
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -11,8 +11,8 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 }
@@ -63,8 +63,7 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
  // Pointers to top-left pixel of block in the input and output images.
  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
  uint8_t *const dst_image_ptr = dst_image + 8;
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
  (void)vpx_memset(flimits, 255, block_width);

  // Initialize pixels in the input:
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef TEST_REGISTER_STATE_CHECK_H_
-#define TEST_REGISTER_STATE_CHECK_H_
+#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_

 #ifdef _WIN64

@@ -92,4 +92,4 @@ class RegisterStateCheck {};

 #endif  // _WIN64

-#endif  // TEST_REGISTER_STATE_CHECK_H_
+#endif  // LIBVPX_TEST_REGISTER_STATE_CHECK_H_
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -16,68 +16,8 @@
 #include "test/video_source.h"
 #include "test/util.h"

-// Enable(1) or Disable(0) writing of the compressed bitstream.
-#define WRITE_COMPRESSED_STREAM 0
-
 namespace {

-#if WRITE_COMPRESSED_STREAM
-static void mem_put_le16(char *const mem, const unsigned int val) {
-  mem[0] = val;
-  mem[1] = val >> 8;
-}
-
-static void mem_put_le32(char *const mem, const unsigned int val) {
-  mem[0] = val;
-  mem[1] = val >> 8;
-  mem[2] = val >> 16;
-  mem[3] = val >> 24;
-}
-
-static void write_ivf_file_header(const vpx_codec_enc_cfg_t *const cfg,
-                                  int frame_cnt, FILE *const outfile) {
-  char header[32];
-
-  header[0] = 'D';
-  header[1] = 'K';
-  header[2] = 'I';
-  header[3] = 'F';
-  mem_put_le16(header + 4,  0);                   /* version */
-  mem_put_le16(header + 6,  32);                  /* headersize */
-  mem_put_le32(header + 8,  0x30395056);          /* fourcc (vp9) */
-  mem_put_le16(header + 12, cfg->g_w);            /* width */
-  mem_put_le16(header + 14, cfg->g_h);            /* height */
-  mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
-  mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
-  mem_put_le32(header + 24, frame_cnt);           /* length */
-  mem_put_le32(header + 28, 0);                   /* unused */
-
-  (void)fwrite(header, 1, 32, outfile);
-}
-
-static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
-  char header[4];
-  mem_put_le32(header, static_cast<unsigned int>(size));
-  (void)fwrite(header, 1, 4, outfile);
-}
-
-static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
-                                   FILE *const outfile) {
-  char header[12];
-  vpx_codec_pts_t pts;
-
-  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
-    return;
-
-  pts = pkt->data.frame.pts;
-  mem_put_le32(header, static_cast<unsigned int>(pkt->data.frame.sz));
-  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
-  mem_put_le32(header + 8, pts >> 32);
-
-  (void)fwrite(header, 1, 12, outfile);
-}
-#endif  // WRITE_COMPRESSED_STREAM
-
 const unsigned int kInitialWidth = 320;
 const unsigned int kInitialHeight = 240;

@@ -102,8 +42,6 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
    limit_ = 60;
  }

-  virtual ~ResizingVideoSource() {}
-
 protected:
  virtual void Next() {
    ++frame_;
@@ -118,15 +56,13 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
 protected:
  ResizeTest() : EncoderTest(GET_PARAM(0)) {}

-  virtual ~ResizeTest() {}
-
  struct FrameInfo {
    FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
        : pts(_pts), w(_w), h(_h) {}

    vpx_codec_pts_t pts;
-    unsigned int w;
-    unsigned int h;
+    unsigned int    w;
+    unsigned int    h;
  };

  virtual void SetUp() {
@@ -134,6 +70,10 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
    SetMode(GET_PARAM(1));
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  virtual void DecompressedFrameHook(const vpx_image_t &img,
                                     vpx_codec_pts_t pts) {
    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
@@ -159,47 +99,17 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
  }
 }

-const unsigned int kStepDownFrame = 3;
-const unsigned int kStepUpFrame = 6;
-
 class ResizeInternalTest : public ResizeTest {
 protected:
-#if WRITE_COMPRESSED_STREAM
-  ResizeInternalTest()
-      : ResizeTest(),
-        frame0_psnr_(0.0),
-        outfile_(NULL),
-        out_frames_(0) {}
-#else
  ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
-#endif
-
-  virtual ~ResizeInternalTest() {}
-
-  virtual void BeginPassHook(unsigned int /*pass*/) {
-#if WRITE_COMPRESSED_STREAM
-    outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
-#endif
-  }
-
-  virtual void EndPassHook() {
-#if WRITE_COMPRESSED_STREAM
-    if (outfile_) {
-      if (!fseek(outfile_, 0, SEEK_SET))
-        write_ivf_file_header(&cfg_, out_frames_, outfile_);
-      fclose(outfile_);
-      outfile_ = NULL;
-    }
-#endif
-  }

  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
-    if (video->frame() == kStepDownFrame) {
+    if (video->frame() == 3) {
      struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
      encoder->Control(VP8E_SET_SCALEMODE, &mode);
    }
-    if (video->frame() == kStepUpFrame) {
+    if (video->frame() == 6) {
      struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
      encoder->Control(VP8E_SET_SCALEMODE, &mode);
    }
@@ -211,46 +121,21 @@ class ResizeInternalTest : public ResizeTest {
    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0);
  }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-#if WRITE_COMPRESSED_STREAM
-    ++out_frames_;
-
-    // Write initial file header if first frame.
-    if (pkt->data.frame.pts == 0)
-      write_ivf_file_header(&cfg_, 0, outfile_);
-
-    // Write frame header and data.
-    write_ivf_frame_header(pkt, outfile_);
-    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
-#endif
-  }
-
  double frame0_psnr_;
-#if WRITE_COMPRESSED_STREAM
-  FILE *outfile_;
-  unsigned int out_frames_;
-#endif
 };

 TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                       30, 1, 0, 10);
  init_flags_ = VPX_CODEC_USE_PSNR;
-
  // q picked such that initial keyframe on this clip is ~30dB PSNR
  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
-
-  // If the number of frames being encoded is smaller than g_lag_in_frames
-  // the encoded frame is unavailable using the current API. Comparing
-  // frames to detect mismatch would then not be possible. Set
-  // g_lag_in_frames = 0 to get around this.
-  cfg_.g_lag_in_frames = 0;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

  for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin();
       info != frame_info_list_.end(); ++info) {
    const vpx_codec_pts_t pts = info->pts;
-    if (pts >= kStepDownFrame && pts < kStepUpFrame) {
+    if (pts >= 3 && pts < 6) {
      ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
      ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";
    } else {
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -17,6 +17,7 @@ extern "C" {
 #include "./vpx_config.h"
 #if CONFIG_VP8_ENCODER
 #include "./vp8_rtcd.h"
+//#include "vp8/common/blockd.h"
 #endif
 #if CONFIG_VP9_ENCODER
 #include "./vp9_rtcd.h"
@@ -427,7 +428,6 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));

 #if HAVE_SSE
 #if CONFIG_VP9_ENCODER
-#if CONFIG_USE_X86INC
 const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
 const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
@@ -441,7 +441,6 @@ INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
                        make_tuple(4, 4, sad_4x4x4d_sse)));
 #endif
 #endif
-#endif

 #if HAVE_SSE2
 #if CONFIG_VP8_ENCODER
@@ -452,20 +451,14 @@ const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;
 const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
 #endif
 #if CONFIG_VP9_ENCODER
-#if CONFIG_USE_X86INC
 const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
-const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
-const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
 const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
-const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
-const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
 const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
-const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
 const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
+const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
 const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
 const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
 #endif
-#endif
 const sad_m_by_n_test_param_t sse2_tests[] = {
 #if CONFIG_VP8_ENCODER
  make_tuple(16, 16, sad_16x16_wmt),
@@ -475,25 +468,18 @@ const sad_m_by_n_test_param_t sse2_tests[] = {
  make_tuple(4, 4, sad_4x4_wmt),
 #endif
 #if CONFIG_VP9_ENCODER
-#if CONFIG_USE_X86INC
  make_tuple(64, 64, sad_64x64_sse2_vp9),
-  make_tuple(64, 32, sad_64x32_sse2_vp9),
-  make_tuple(32, 64, sad_32x64_sse2_vp9),
  make_tuple(32, 32, sad_32x32_sse2_vp9),
-  make_tuple(32, 16, sad_32x16_sse2_vp9),
-  make_tuple(16, 32, sad_16x32_sse2_vp9),
  make_tuple(16, 16, sad_16x16_sse2_vp9),
-  make_tuple(16, 8, sad_16x8_sse2_vp9),
  make_tuple(8, 16, sad_8x16_sse2_vp9),
+  make_tuple(16, 8, sad_16x8_sse2_vp9),
  make_tuple(8, 8, sad_8x8_sse2_vp9),
  make_tuple(8, 4, sad_8x4_sse2_vp9),
 #endif
-#endif
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));

 #if CONFIG_VP9_ENCODER
-#if CONFIG_USE_X86INC
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
@@ -519,7 +505,6 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
                        make_tuple(8, 4, sad_8x4x4d_sse2)));
 #endif
 #endif
-#endif

 #if HAVE_SSE3
 #if CONFIG_VP8_ENCODER
@@ -538,11 +523,9 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
 #endif

 #if HAVE_SSSE3
-#if CONFIG_USE_X86INC
 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
                        make_tuple(16, 16, sad_16x16_sse3)));
 #endif
-#endif

 }  // namespace
--- a/test/set_roi.cc
+++ b/test/set_roi.cc
@@ -17,19 +17,15 @@
 #include <sys/types.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 extern "C" {
 #include "vp8/encoder/onyx_int.h"
 }

-using libvpx_test::ACMRandom;
-
 namespace {

 TEST(Vp8RoiMapTest, ParameterCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
  int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
  unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
@@ -125,10 +121,10 @@ TEST(Vp8RoiMapTest, ParameterCheck) {
    for (int i = 0; i < 1000; ++i) {
      int rand_deltas[4];
      int deltas_valid;
-      rand_deltas[0] = rnd(160) - 80;
-      rand_deltas[1] = rnd(160) - 80;
-      rand_deltas[2] = rnd(160) - 80;
-      rand_deltas[3] = rnd(160) - 80;
+      rand_deltas[0] = (rand() % 160) - 80;
+      rand_deltas[1] = (rand() % 160) - 80;
+      rand_deltas[2] = (rand() % 160) - 80;
+      rand_deltas[3] = (rand() % 160) - 80;

      deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
                      (abs(rand_deltas[1]) <= 63) &&
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -13,8 +13,8 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 extern "C" {
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vp8/encoder/block.h"
 #include "vpx_mem/vpx_mem.h"
@@ -51,7 +51,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
  bd.predictor = reinterpret_cast<unsigned char*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));

-  for (int i = 0; kSrcStride[i] > 0; ++i) {
+  for(int i = 0; kSrcStride[i] > 0; ++i) {
    // start at block0
    be.src = 0;
    be.base_src = &source;
@@ -61,7 +61,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
    int16_t *src_diff = be.src_diff;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
-        src_diff[c] = static_cast<int16_t>(0xa5a5);
+        src_diff[c] = 0xa5a5;
      }
      src_diff += kDiffPredStride;
    }
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -33,6 +33,10 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
    delete[] modified_buf_;
  }

+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
    if (video->frame() == 1) {
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -520,12 +520,3 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f  vp90-2-03-size-226x202.webm
 83c6d8f2969b759e10e5c6542baca1265c874c29  vp90-2-03-size-226x224.webm.md5
 fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce  vp90-2-03-size-226x226.webm
 94ad19b8b699cea105e2ff18f0df2afd7242bcf7  vp90-2-03-size-226x226.webm.md5
-b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
-65f45ec9a55537aac76104818278e0978f94a678  vp90-2-03-deltaq.webm.md5
-4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba  vp90-2-05-resize.ivf
-7f6d8879336239a43dbb6c9f13178cb11cf7ed09  vp90-2-05-resize.ivf.md5
-bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe  vp90-2-06-bilinear.webm
-f6235f937552e11d8eb331ec55da6b3aa596b9ac  vp90-2-06-bilinear.webm.md5
-495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
-65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
-
--- a/test/test.mk
+++ b/test/test.mk
@@ -24,9 +24,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
@@ -89,7 +87,6 @@ LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 endif

 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
@@ -629,11 +626,3 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <string>
-#include "./vpx_config.h"
+#include "vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 #include "vpx_ports/x86.h"
@@ -48,9 +48,7 @@ int main(int argc, char **argv) {
 #endif

 #if !CONFIG_SHARED
-// Shared library builds don't support whitebox tests
-// that exercise internal symbols.
-
+  /* Shared library builds don't support whitebox tests that exercise internal symbols. */
 #if CONFIG_VP8
  vp8_rtcd();
 #endif
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -159,11 +159,7 @@ const char *kVP9TestVectors[] = {
  "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
  "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
  "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
-  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
-  "vp90-2-05-resize.ivf",        "vp90-2-06-bilinear.webm",
-#if CONFIG_NON420
-  "vp91-2-04-yv444.webm"
-#endif
+  "vp90-2-03-size-226x226.webm"
 };
 #endif

@@ -185,7 +181,6 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,

  virtual void DecompressedFrameHook(const vpx_image_t& img,
                                     const unsigned int frame_number) {
-    ASSERT_TRUE(md5_file_ != NULL);
    char expected_md5[33];
    char junk[128];

--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -23,13 +23,10 @@ extern "C" {

 namespace {
 class TileIndependenceTest : public ::libvpx_test::EncoderTest,
-                             public ::libvpx_test::CodecTestWithParam<int> {
+    public ::libvpx_test::CodecTestWithParam<int> {
 protected:
-  TileIndependenceTest()
-      : EncoderTest(GET_PARAM(0)),
-        md5_fw_order_(),
-        md5_inv_order_(),
-        n_tiles_(GET_PARAM(1)) {
+  TileIndependenceTest() : EncoderTest(GET_PARAM(0)), n_tiles_(GET_PARAM(1)),
+      md5_fw_order_(), md5_inv_order_() {
    init_flags_ = VPX_CODEC_USE_PSNR;
    vpx_codec_dec_cfg_t cfg;
    cfg.w = 704;
@@ -59,8 +56,9 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,

  void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                 ::libvpx_test::MD5 *md5) {
-    const vpx_codec_err_t res = dec->DecodeFrame(
-        reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
+    const vpx_codec_err_t res =
+        dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+                         pkt->data.frame.sz);
    if (res != VPX_CODEC_OK) {
      abort_ = true;
      ASSERT_EQ(VPX_CODEC_OK, res);
@@ -74,11 +72,11 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
    UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
  }

-  ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;
-  ::libvpx_test::Decoder *fw_dec_, *inv_dec_;
-
 private:
  int n_tiles_;
+ protected:
+  ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;
+  ::libvpx_test::Decoder *fw_dec_, *inv_dec_;
 };

 // run an encode with 2 or 4 tiles, and do the decode both in normal and
@@ -95,7 +93,7 @@ TEST_P(TileIndependenceTest, MD5Match) {
                                     timebase.den, timebase.num, 0, 30);
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

-  const char *md5_fw_str = md5_fw_order_.Get();
+  const char *md5_fw_str  = md5_fw_order_.Get();
  const char *md5_inv_str = md5_inv_order_.Get();

  // could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer
@@ -104,6 +102,7 @@ TEST_P(TileIndependenceTest, MD5Match) {
  ASSERT_STREQ(md5_fw_str, md5_inv_str);
 }

-VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
+VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest,
+                          ::testing::Range(0, 2, 1));

 }  // namespace
--- a/test/util.h
+++ b/test/util.h
@@ -37,7 +37,7 @@ static double compute_psnr(const vpx_image_t *img1,
                  img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j];
      sqrerr += d * d;
    }
-  double mse = static_cast<double>(sqrerr) / (width_y * height_y);
+  double mse = sqrerr / (width_y * height_y);
  double psnr = 100.0;
  if (mse > 0.0) {
    psnr = 10 * log10(255.0 * 255.0 / mse);
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -16,16 +16,16 @@
 #include "test/register_state_check.h"

 #include "vpx/vpx_integer.h"
-#include "./vpx_config.h"
+#include "vpx_config.h"
 extern "C" {
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_VP8_ENCODER
 # include "vp8/common/variance.h"
-# include "./vp8_rtcd.h"
+# include "vp8_rtcd.h"
 #endif
 #if CONFIG_VP9_ENCODER
 # include "vp9/encoder/vp9_variance.h"
-# include "./vp9_rtcd.h"
+# include "vp9_rtcd.h"
 #endif
 }
 #include "test/acm_random.h"
@@ -107,8 +107,8 @@ static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
 }

 template<typename VarianceFunctionType>
-class VarianceTest
-    : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
+class VarianceTest :
+    public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
 public:
  virtual void SetUp() {
    const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
@@ -191,9 +191,9 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
 }

 template<typename SubpelVarianceFunctionType>
-class SubpelVarianceTest
-    : public ::testing::TestWithParam<tuple<int, int,
-                                            SubpelVarianceFunctionType> > {
+class SubpelVarianceTest :
+    public ::testing::TestWithParam<tuple<int, int,
+                                          SubpelVarianceFunctionType> > {
 public:
  virtual void SetUp() {
    const tuple<int, int, SubpelVarianceFunctionType>& params =
@@ -218,7 +218,6 @@ class SubpelVarianceTest
    vpx_free(src_);
    delete[] ref_;
    vpx_free(sec_);
-    libvpx_test::ClearSystemState();
  }

 protected:
@@ -483,7 +482,6 @@ INSTANTIATE_TEST_CASE_P(
 #endif

 #if HAVE_SSE2
-#if CONFIG_USE_X86INC
 const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
 const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
 const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
@@ -597,11 +595,8 @@ INSTANTIATE_TEST_CASE_P(
                      make_tuple(6, 5, subpel_avg_variance64x32_sse2),
                      make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
 #endif
-#endif

 #if HAVE_SSSE3
-#if CONFIG_USE_X86INC
-
 const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 =
    vp9_sub_pixel_variance4x4_ssse3;
 const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 =
@@ -686,7 +681,6 @@ INSTANTIATE_TEST_CASE_P(
                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
 #endif
-#endif
 #endif  // CONFIG_VP9_ENCODER

 }  // namespace vp9
--- a/test/vp8_boolcoder_test.cc
+++ b/test/vp8_boolcoder_test.cc
@@ -8,6 +8,10 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+extern "C" {
+#include "vp8/encoder/boolhuff.h"
+#include "vp8/decoder/dboolhuff.h"
+}

 #include <math.h>
 #include <stddef.h>
@@ -20,11 +24,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_integer.h"

-extern "C" {
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/decoder/dboolhuff.h"
-}
-
 namespace {
 const int num_tests = 10;

@@ -45,7 +44,7 @@ void encrypt_buffer(uint8_t *buffer, int size) {

 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                           uint8_t *output, int count) {
-  int offset = input - reinterpret_cast<uint8_t *>(decrypt_state);
+  int offset = input - (uint8_t *)decrypt_state;
  for (int i = 0; i < count; i++) {
    output[i] = input[i] ^ secret_key[(offset + i) & 15];
  }
@@ -59,10 +58,10 @@ TEST(VP8, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
    for (int method = 0; method <= 7; ++method) {   // we generate various proba
-      const int kBitsToTest = 1000;
-      uint8_t probas[kBitsToTest];
+      const int bits_to_test = 1000;
+      uint8_t probas[bits_to_test];

-      for (int i = 0; i < kBitsToTest; ++i) {
+      for (int i = 0; i < bits_to_test; ++i) {
        const int parity = i & 1;
        probas[i] =
            (method == 0) ? 0 : (method == 1) ? 255 :
@@ -77,14 +76,14 @@ TEST(VP8, TestBitIO) {
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
-        const int kBufferSize = 10000;
+        const int buffer_size = 10000;
        ACMRandom bit_rnd(random_seed);
        BOOL_CODER bw;
-        uint8_t bw_buffer[kBufferSize];
-        vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize);
+        uint8_t bw_buffer[buffer_size];
+        vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
-        for (int i = 0; i < kBitsToTest; ++i) {
+        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
@@ -99,20 +98,19 @@ TEST(VP8, TestBitIO) {
 #if CONFIG_DECRYPT
        encrypt_buffer(bw_buffer, buffer_size);
        vp8dx_start_decode(&br, bw_buffer, buffer_size,
-                           test_decrypt_cb,
-                           reinterpret_cast<void *>(bw_buffer));
+                           test_decrypt_cb, (void *)bw_buffer);
 #else
-        vp8dx_start_decode(&br, bw_buffer, kBufferSize, NULL, NULL);
+        vp8dx_start_decode(&br, bw_buffer, buffer_size, NULL, NULL);
 #endif
        bit_rnd.Reset(random_seed);
-        for (int i = 0; i < kBitsToTest; ++i) {
+        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit)
-              << "pos: "<< i << " / " << kBitsToTest
+              << "pos: "<< i << " / " << bits_to_test
              << " bit_method: " << bit_method
              << " method: " << method;
        }
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc
@@ -26,8 +26,7 @@ const uint8_t test_key[16] = {
  0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
 };

-void encrypt_buffer(const uint8_t *src, uint8_t *dst,
-                    int size, int offset = 0) {
+void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) {
  for (int i = 0; i < size; ++i) {
    dst[i] = src[i] ^ test_key[(offset + i) & 15];
  }
@@ -35,11 +34,10 @@ void encrypt_buffer(const uint8_t *src, uint8_t *dst,

 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                     uint8_t *output, int count) {
-  encrypt_buffer(input, output, count,
-                 input - reinterpret_cast<uint8_t *>(decrypt_state));
+  encrypt_buffer(input, output, count, input - (uint8_t *)decrypt_state);
 }

-}  // namespace
+} // namespace

 namespace libvpx_test {

--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -18,7 +18,7 @@


 extern "C" {
-#include "./vp8_rtcd.h"
+#include "vp8_rtcd.h"
 }

 #include "test/acm_random.h"
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc
@@ -19,7 +19,7 @@ extern "C" {
 #include "vp9/decoder/vp9_dboolhuff.h"
 }

-#include "test/acm_random.h"
+#include "acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -32,10 +32,10 @@ TEST(VP9, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
    for (int method = 0; method <= 7; ++method) {   // we generate various proba
-      const int kBitsToTest = 1000;
-      uint8_t probas[kBitsToTest];
+      const int bits_to_test = 1000;
+      uint8_t probas[bits_to_test];

-      for (int i = 0; i < kBitsToTest; ++i) {
+      for (int i = 0; i < bits_to_test; ++i) {
        const int parity = i & 1;
        probas[i] =
          (method == 0) ? 0 : (method == 1) ? 255 :
@@ -50,14 +50,14 @@ TEST(VP9, TestBitIO) {
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
-        const int kBufferSize = 10000;
+        const int buffer_size = 10000;
        ACMRandom bit_rnd(random_seed);
        vp9_writer bw;
-        uint8_t bw_buffer[kBufferSize];
+        uint8_t bw_buffer[buffer_size];
        vp9_start_encode(&bw, bw_buffer);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
-        for (int i = 0; i < kBitsToTest; ++i) {
+        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
@@ -72,16 +72,16 @@ TEST(VP9, TestBitIO) {
        GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);

        vp9_reader br;
-        vp9_reader_init(&br, bw_buffer, kBufferSize);
+        vp9_reader_init(&br, bw_buffer, buffer_size);
        bit_rnd.Reset(random_seed);
-        for (int i = 0; i < kBitsToTest; ++i) {
+        for (int i = 0; i < bits_to_test; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
-              << "pos: " << i << " / " << kBitsToTest
+              << "pos: " << i << " / " << bits_to_test
              << " bit_method: " << bit_method
              << " method: " << method;
        }
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -1,75 +0,0 @@
-/*
-  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-
-  Use of this source code is governed by a BSD-style license
-  that can be found in the LICENSE file in the root of the source
-  tree. An additional intellectual property rights grant can be found
-  in the file PATENTS.  All contributing project authors may
-  be found in the AUTHORS file in the root of the source tree.
-*/
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/i420_video_source.h"
-#include "test/util.h"
-
-namespace {
-
-const int kMaxPsnr = 100;
-
-class LossLessTest : public ::libvpx_test::EncoderTest,
-    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
- protected:
-  LossLessTest() : EncoderTest(GET_PARAM(0)),
-                   psnr_(kMaxPsnr),
-                   nframes_(0),
-                   encoding_mode_(GET_PARAM(1)) {
-  }
-
-  virtual ~LossLessTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
-  }
-
-  virtual void BeginPassHook(unsigned int /*pass*/) {
-    psnr_ = 0.0;
-    nframes_ = 0;
-  }
-
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
-    if (pkt->data.psnr.psnr[0] < psnr_)
-      psnr_= pkt->data.psnr.psnr[0];
-  }
-
-  double GetMinPsnr() const {
-      return psnr_;
-  }
-
- private:
-  double psnr_;
-  unsigned int nframes_;
-  libvpx_test::TestMode encoding_mode_;
-};
-
-TEST_P(LossLessTest, TestLossLessEncoding) {
-  const vpx_rational timebase = { 33333333, 1000000000 };
-  cfg_.g_timebase = timebase;
-  cfg_.rc_target_bitrate = 2000;
-  cfg_.g_lag_in_frames = 25;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 0;
-
-  init_flags_ = VPX_CODEC_USE_PSNR;
-
-  // intentionally changed the dimension for better testing coverage
-  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284,
-                                     timebase.den, timebase.num, 0, 30);
-
-  const double psnr_lossless = GetMinPsnr();
-  EXPECT_GE(psnr_lossless, kMaxPsnr);
-}
-VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES);
-}  // namespace
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -39,8 +39,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  // FIXME(rbultje) split in its own file
-  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
-       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
+  for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES;
+       bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
    const int block_width  = 4 << b_width_log2(bsize);
    const int block_height = 4 << b_height_log2(bsize);
    int16_t *diff = reinterpret_cast<int16_t *>(
@@ -93,8 +93,9 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
 INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
                        ::testing::Values(vp9_subtract_block_c));

-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
                        ::testing::Values(vp9_subtract_block_sse2));
 #endif
+
 }  // namespace vp9
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -1,109 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/decoder/vp9_thread.h"
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/md5_helper.h"
-#include "test/webm_video_source.h"
-
-namespace {
-
-class VP9WorkerThreadTest : public ::testing::Test {
- protected:
-  virtual ~VP9WorkerThreadTest() {}
-  virtual void SetUp() {
-    vp9_worker_init(&worker_);
-  }
-
-  virtual void TearDown() {
-    vp9_worker_end(&worker_);
-  }
-
-  VP9Worker worker_;
-};
-
-int ThreadHook(void* data, void* return_value) {
-  int* const hook_data = reinterpret_cast<int*>(data);
-  *hook_data = 5;
-  return *reinterpret_cast<int*>(return_value);
-}
-
-TEST_F(VP9WorkerThreadTest, HookSuccess) {
-  EXPECT_TRUE(vp9_worker_sync(&worker_));  // should be a no-op.
-
-  for (int i = 0; i < 2; ++i) {
-    EXPECT_TRUE(vp9_worker_reset(&worker_));
-
-    int hook_data = 0;
-    int return_value = 1;  // return successfully from the hook
-    worker_.hook = ThreadHook;
-    worker_.data1 = &hook_data;
-    worker_.data2 = &return_value;
-
-    vp9_worker_launch(&worker_);
-    EXPECT_TRUE(vp9_worker_sync(&worker_));
-    EXPECT_FALSE(worker_.had_error);
-    EXPECT_EQ(5, hook_data);
-
-    EXPECT_TRUE(vp9_worker_sync(&worker_));  // should be a no-op.
-  }
-}
-
-TEST_F(VP9WorkerThreadTest, HookFailure) {
-  EXPECT_TRUE(vp9_worker_reset(&worker_));
-
-  int hook_data = 0;
-  int return_value = 0;  // return failure from the hook
-  worker_.hook = ThreadHook;
-  worker_.data1 = &hook_data;
-  worker_.data2 = &return_value;
-
-  vp9_worker_launch(&worker_);
-  EXPECT_FALSE(vp9_worker_sync(&worker_));
-  EXPECT_TRUE(worker_.had_error);
-
-  // Ensure _reset() clears the error and _launch() can be called again.
-  return_value = 1;
-  EXPECT_TRUE(vp9_worker_reset(&worker_));
-  EXPECT_FALSE(worker_.had_error);
-  vp9_worker_launch(&worker_);
-  EXPECT_TRUE(vp9_worker_sync(&worker_));
-  EXPECT_FALSE(worker_.had_error);
-}
-
-TEST(VP9DecodeMTTest, MTDecode) {
-  libvpx_test::WebMVideoSource video("vp90-2-03-size-226x226.webm");
-  video.Init();
-
-  vpx_codec_dec_cfg_t cfg = {0};
-  cfg.threads = 2;
-  libvpx_test::VP9Decoder decoder(cfg, 0);
-
-  libvpx_test::MD5 md5;
-  for (video.Begin(); video.cxdata(); video.Next()) {
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img = NULL;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      md5.Add(img);
-    }
-  }
-  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", md5.Get());
-}
-
-}  // namespace
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -99,7 +99,7 @@ class WebMVideoSource : public CompressedVideoSource {

  virtual void Begin() {
    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
        << file_name_;

    nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb,
@@ -130,7 +130,6 @@ class WebMVideoSource : public CompressedVideoSource {
  }

  void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
    if (chunk_ >= chunks_) {
      unsigned int track;

--- a/third_party/libyuv/source/scale.c
+++ b/third_party/libyuv/source/scale.c
@@ -1370,12 +1370,12 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    shr        eax, 1
    cmp        eax, 0
    je         xloop1
-    cmp        eax, 64
+    cmp        eax, 128
    je         xloop2

+    shr        eax, 1
    mov        ah,al
    neg        al
    add        al, 128
@@ -2132,11 +2132,11 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr,
    "mov    0x14(%esp),%edx                    \n"
    "mov    0x18(%esp),%ecx                    \n"
    "mov    0x1c(%esp),%eax                    \n"
-    "shr    %eax                               \n"
    "cmp    $0x0,%eax                          \n"
    "je     2f                                 \n"
-    "cmp    $0x40,%eax                         \n"
+    "cmp    $0x80,%eax                         \n"
    "je     3f                                 \n"
+    "shr    %eax                               \n"
    "mov    %al,%ah                            \n"
    "neg    %al                                \n"
    "add    $0x80,%al                          \n"
@@ -2662,7 +2662,6 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                  const uint8* src_ptr, int src_stride,
                                  int dst_width, int source_y_fraction) {
-  source_y_fraction >>= 1;
  if (source_y_fraction == 0) {
    asm volatile (
   "1:"
@@ -2681,7 +2680,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
      : "memory", "cc", "rax"
    );
    return;
-  } else if (source_y_fraction == 64) {
+  } else if (source_y_fraction == 128) {
    asm volatile (
    "1:"
      "movdqa     (%1),%%xmm0                  \n"
@@ -2704,6 +2703,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  } else {
    asm volatile (
      "mov        %3,%%eax                     \n"
+      "shr        %%eax                        \n"
      "mov        %%al,%%ah                    \n"
      "neg        %%al                         \n"
      "add        $0x80,%%al                   \n"
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -173,6 +173,7 @@ void vp8_create_common(VP8_COMMON *oci)
    oci->use_bilinear_mc_filter = 0;
    oci->full_pixel = 0;
    oci->multi_token_partition = ONE_PARTITION;
+    oci->clr_type = REG_YUV;
    oci->clamp_type = RECON_CLAMP_REQUIRED;

    /* Initialize reference frame sign bias structure to defaults */
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -41,8 +41,7 @@ extern "C"
    {
        USAGE_STREAM_FROM_SERVER    = 0x0,
        USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
-        USAGE_CONSTRAINED_QUALITY   = 0x2,
-        USAGE_CONSTANT_QUALITY      = 0x3
+        USAGE_CONSTRAINED_QUALITY   = 0x2
    } END_USAGE;


--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -72,6 +72,7 @@ typedef struct VP8Common
    int horiz_scale;
    int vert_scale;

+    YUV_TYPE clr_type;
    CLAMP_TYPE  clamp_type;

    YV12_BUFFER_CONFIG *frame_to_show;
@@ -114,6 +115,9 @@ typedef struct VP8Common
    int uvdc_delta_q;
    int uvac_delta_q;

+    unsigned int frames_since_golden;
+    unsigned int frames_till_alt_ref_frame;
+
    /* We allocate a MODE_INFO struct for each macroblock, together with
       an extra row on top and column on the left to simplify prediction. */

@@ -153,6 +157,7 @@ typedef struct VP8Common

    unsigned int current_video_frame;

+    int near_boffset[3];
    int version;

    TOKEN_PARTITION multi_token_partition;
@@ -160,10 +165,8 @@ typedef struct VP8Common
 #ifdef PACKET_TESTING
    VP8_HEADER oh;
 #endif
-#if CONFIG_POSTPROC_VISUALIZER
    double bitrate;
    double framerate;
-#endif

 #if CONFIG_MULTITHREAD
    int processor_core_count;
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -923,7 +923,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
    if (flags & VP8D_DEBUG_TXT_RATE_INFO)
    {
        char message[512];
-        sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate);
+        sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
    }

--- a/vp8/common/vp8_asm_com_offsets.c
+++ b/vp8/common/vp8_asm_com_offsets.c
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Constants and compile-time checks for vp8 common code (presumably consumed by assembly). */
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/asm_offsets.h"
+#include "vp8/common/blockd.h"
+
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif /* CONFIG_POSTPROC */
+
+BEGIN
+
+#if CONFIG_POSTPROC
+/* mfqe.c / filter_by_weight: export MFQE_PRECISION under the name MFQE_PRECISION_VAL */
+DEFINE(MFQE_PRECISION_VAL,                      MFQE_PRECISION);
+#endif /* CONFIG_POSTPROC */
+
+END
+
+/* Add a ct_assert below for any offset that is not supported by assembly code. */
+/* Add a ct_assert below for any size that is not supported by assembly code. */
+
+#if HAVE_MEDIA
+/* The switch in vp8_intra4x4_predict_armv6 is based on these enumerated values. */
+ct_assert(B_DC_PRED, B_DC_PRED == 0);
+ct_assert(B_TM_PRED, B_TM_PRED == 1);
+ct_assert(B_VE_PRED, B_VE_PRED == 2);
+ct_assert(B_HE_PRED, B_HE_PRED == 3);
+ct_assert(B_LD_PRED, B_LD_PRED == 4);
+ct_assert(B_RD_PRED, B_RD_PRED == 5);
+ct_assert(B_VR_PRED, B_VR_PRED == 6);
+ct_assert(B_VL_PRED, B_VL_PRED == 7);
+ct_assert(B_HD_PRED, B_HD_PRED == 8);
+ct_assert(B_HU_PRED, B_HU_PRED == 9);
+#endif
+
+#if HAVE_SSE2
+#if CONFIG_POSTPROC
+/* vp8_filter_by_weight16x16 and 8x8 (presumably hard-code MFQE_PRECISION == 4) */
+ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
+#endif /* CONFIG_POSTPROC */
+#endif /* HAVE_SSE2 */
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -1095,7 +1095,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                           "Failed to allocate bool decoder 0");
    if (pc->frame_type == KEY_FRAME) {
-        (void)vp8_read_bit(bc);  // colorspace
+        pc->clr_type    = (YUV_TYPE)vp8_read_bit(bc);
        pc->clamp_type  = (CLAMP_TYPE)vp8_read_bit(bc);
    }

--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -430,6 +430,7 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_st
    *time_stamp = pbi->last_time_stamp;
    *time_end_stamp = 0;

+    sd->clrtype = pbi->common.clr_type;
 #if CONFIG_POSTPROC
    ret = vp8_post_proc_frame(&pbi->common, sd, flags);
 #else
--- a/vp8/decoder/vp8_asm_dec_offsets.c
+++ b/vp8/decoder/vp8_asm_dec_offsets.c
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Byte offsets of BOOL_DECODER fields (presumably for use by decoder assembly code). */
+#include "vpx_ports/asm_offsets.h"
+#include "onyxd_int.h"
+
+BEGIN
+
+DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
+DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
+DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
+DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
+DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
+
+END
+
+/* None yet: add asserts here for any offset that is not supported by assembly code. */
+/* None yet: add asserts here for any size that is not supported by assembly code. */
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1322,7 +1322,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
        vp8_start_encode(bc, cx_data, cx_data_end);

        /* signal clr type */
-        vp8_write_bit(bc, 0);
+        vp8_write_bit(bc, pc->clr_type);
        vp8_write_bit(bc, pc->clamp_type);

    }
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -1325,7 +1325,7 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta
    return Q;
 }

-extern void vp8_new_framerate(VP8_COMP *cpi, double framerate);
+extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);

 void vp8_init_second_pass(VP8_COMP *cpi)
 {
@@ -1349,9 +1349,9 @@ void vp8_init_second_pass(VP8_COMP *cpi)
     * sum duration is not. Its calculated based on the actual durations of
     * all frames from the first pass.
     */
-    vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
+    vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);

-    cpi->output_framerate = cpi->framerate;
+    cpi->output_frame_rate = cpi->frame_rate;
    cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
    cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);

@@ -2398,7 +2398,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    target_frame_size += cpi->min_frame_bandwidth;

    /* Every other frame gets a few extra bits */
-    if ( (cpi->frames_since_golden & 0x01) &&
+    if ( (cpi->common.frames_since_golden & 0x01) &&
         (cpi->frames_till_gf_update_due > 0) )
    {
        target_frame_size += cpi->twopass.alt_extra_bits;
@@ -2529,7 +2529,7 @@ void vp8_second_pass(VP8_COMP *cpi)

    /* Set nominal per second bandwidth for this frame */
    cpi->target_bandwidth = (int)
-    (cpi->per_frame_bandwidth * cpi->output_framerate);
+    (cpi->per_frame_bandwidth * cpi->output_frame_rate);
    if (cpi->target_bandwidth < 0)
        cpi->target_bandwidth = 0;

@@ -3185,7 +3185,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        /* Convert to a per second bitrate */
        cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
-                                      cpi->output_framerate);
+                                      cpi->output_frame_rate);
    }

    /* Note the total error score of the kf group minus the key frame itself */
@@ -3224,7 +3224,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        cpi->common.vert_scale = NORMAL;

        /* Calculate Average bits per frame. */
-        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate);
+        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);

        /* CBR... Use the clip average as the target for deciding resample */
        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
@@ -3299,7 +3299,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        }
        else
        {
-            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate));
+            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
            int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;

            /* If triggered last time the threshold for triggering again is
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -301,11 +301,11 @@ static int rescale(int val, int num, int denom)
 static void init_temporal_layer_context(VP8_COMP *cpi,
                                        VP8_CONFIG *oxcf,
                                        const int layer,
-                                        double prev_layer_framerate)
+                                        double prev_layer_frame_rate)
 {
    LAYER_CONTEXT *lc = &cpi->layer_context[layer];

-    lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
+    lc->frame_rate = cpi->output_frame_rate / cpi->oxcf.rate_decimator[layer];
    lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;

    lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
@@ -335,7 +335,7 @@ static void init_temporal_layer_context(VP8_COMP *cpi,
      lc->avg_frame_size_for_layer =
          (int)((cpi->oxcf.target_bitrate[layer] -
                cpi->oxcf.target_bitrate[layer-1]) * 1000 /
-                (lc->framerate - prev_layer_framerate));
+                (lc->frame_rate - prev_layer_frame_rate));

     lc->active_worst_quality         = cpi->oxcf.worst_allowed_q;
     lc->active_best_quality          = cpi->oxcf.best_allowed_q;
@@ -363,7 +363,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
                                        const int prev_num_layers)
 {
    int i;
-    double prev_layer_framerate = 0;
+    double prev_layer_frame_rate = 0;
    const int curr_num_layers = cpi->oxcf.number_of_layers;
    // If the previous state was 1 layer, get current layer context from cpi.
    // We need this to set the layer context for the new layers below.
@@ -377,7 +377,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
        LAYER_CONTEXT *lc = &cpi->layer_context[i];
        if (i >= prev_num_layers)
        {
-           init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+           init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate);
        }
        // The initial buffer levels are set based on their starting levels.
        // We could set the buffer levels based on the previous state (normalized
@@ -403,8 +403,8 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
            lc->bits_off_target = lc->buffer_level;
            restore_layer_context(cpi, 0);
        }
-        prev_layer_framerate = cpi->output_framerate /
-                               cpi->oxcf.rate_decimator[i];
+        prev_layer_frame_rate =  cpi->output_frame_rate /
+                                 cpi->oxcf.rate_decimator[i];
    }
 }

@@ -1282,21 +1282,21 @@ int vp8_reverse_trans(int x)

    return 63;
 }
-void vp8_new_framerate(VP8_COMP *cpi, double framerate)
+void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
 {
    if(framerate < .1)
        framerate = 30;

-    cpi->framerate              = framerate;
-    cpi->output_framerate       = framerate;
+    cpi->frame_rate             = framerate;
+    cpi->output_frame_rate      = framerate;
    cpi->per_frame_bandwidth    = (int)(cpi->oxcf.target_bandwidth /
-                                  cpi->output_framerate);
+                                  cpi->output_frame_rate);
    cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
    cpi->min_frame_bandwidth    = (int)(cpi->av_per_frame_bandwidth *
                                  cpi->oxcf.two_pass_vbrmin_section / 100);

    /* Set Maximum gf/arf interval */
-    cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2);
+    cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);

    if(cpi->max_gf_interval < 12)
        cpi->max_gf_interval = 12;
@@ -1337,13 +1337,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
     * seems like a reasonable framerate, then use that as a guess, otherwise
     * use 30.
     */
-    cpi->framerate = (double)(oxcf->timebase.den) /
-                     (double)(oxcf->timebase.num);
+    cpi->frame_rate = (double)(oxcf->timebase.den) /
+                      (double)(oxcf->timebase.num);

-    if (cpi->framerate > 180)
-        cpi->framerate = 30;
+    if (cpi->frame_rate > 180)
+        cpi->frame_rate = 30;

-    cpi->ref_framerate = cpi->framerate;
+    cpi->ref_frame_rate = cpi->frame_rate;

    /* change includes all joint functionality */
    vp8_change_config(cpi, oxcf);
@@ -1369,13 +1369,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
    if (cpi->oxcf.number_of_layers > 1)
    {
        unsigned int i;
-        double prev_layer_framerate=0;
+        double prev_layer_frame_rate=0;

        for (i=0; i<cpi->oxcf.number_of_layers; i++)
        {
-            init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
-            prev_layer_framerate = cpi->output_framerate /
-                                   cpi->oxcf.rate_decimator[i];
+            init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate);
+            prev_layer_frame_rate = cpi->output_frame_rate /
+                                    cpi->oxcf.rate_decimator[i];
        }
    }

@@ -1399,14 +1399,14 @@ static void update_layer_contexts (VP8_COMP *cpi)
    if (oxcf->number_of_layers > 1)
    {
        unsigned int i;
-        double prev_layer_framerate=0;
+        double prev_layer_frame_rate=0;

        for (i=0; i<oxcf->number_of_layers; i++)
        {
            LAYER_CONTEXT *lc = &cpi->layer_context[i];

-            lc->framerate =
-                cpi->ref_framerate / oxcf->rate_decimator[i];
+            lc->frame_rate =
+                cpi->ref_frame_rate / oxcf->rate_decimator[i];
            lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;

            lc->starting_buffer_level = rescale(
@@ -1432,9 +1432,9 @@ static void update_layer_contexts (VP8_COMP *cpi)
                lc->avg_frame_size_for_layer =
                   (int)((oxcf->target_bitrate[i] -
                          oxcf->target_bitrate[i-1]) * 1000 /
-                          (lc->framerate - prev_layer_framerate));
+                          (lc->frame_rate - prev_layer_frame_rate));

-            prev_layer_framerate = lc->framerate;
+            prev_layer_frame_rate = lc->frame_rate;
        }
    }
 }
@@ -1625,7 +1625,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
                    cpi->oxcf.target_bandwidth, 1000);

    /* Set up frame rate and related parameters rate control values. */
-    vp8_new_framerate(cpi, cpi->framerate);
+    vp8_new_frame_rate(cpi, cpi->frame_rate);

    /* Set absolute upper and lower quality limits */
    cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
@@ -1945,7 +1945,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)

    for (i = 0; i < KEY_FRAME_CONTEXT; i++)
    {
-        cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate;
+        cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
    }

 #ifdef OUTPUT_YUV_SRC
@@ -2273,7 +2273,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
        {
            extern int count_mb_seg[4];
            FILE *f = fopen("modes.stt", "a");
-            double dr = (double)cpi->framerate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+            double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
            fprintf(f, "intra_mode in Intra Frames:\n");
            fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
            fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
@@ -2750,7 +2750,7 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi)
    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;

    /* this frame refreshes means next frames don't unless specified by user */
-    cpi->frames_since_golden = 0;
+    cpi->common.frames_since_golden = 0;

    /* Clear the alternate reference update pending flag. */
    cpi->source_alt_ref_pending = 0;
@@ -2802,7 +2802,7 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
         * user
         */
        cm->refresh_golden_frame = 0;
-        cpi->frames_since_golden = 0;
+        cpi->common.frames_since_golden = 0;

        cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
        cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
@@ -2834,12 +2834,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
        if (cpi->frames_till_gf_update_due > 0)
            cpi->frames_till_gf_update_due--;

-        if (cpi->frames_till_alt_ref_frame)
-            cpi->frames_till_alt_ref_frame --;
+        if (cpi->common.frames_till_alt_ref_frame)
+            cpi->common.frames_till_alt_ref_frame --;

-        cpi->frames_since_golden ++;
+        cpi->common.frames_since_golden ++;

-        if (cpi->frames_since_golden > 1)
+        if (cpi->common.frames_since_golden > 1)
        {
            cpi->recent_ref_frame_usage[INTRA_FRAME] +=
                cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME];
@@ -2890,11 +2890,11 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
            cpi->prob_last_coded = 200;
            cpi->prob_gf_coded = 1;
        }
-        else if (cpi->frames_since_golden == 0)
+        else if (cpi->common.frames_since_golden == 0)
        {
            cpi->prob_last_coded = 214;
        }
-        else if (cpi->frames_since_golden == 1)
+        else if (cpi->common.frames_since_golden == 1)
        {
            cpi->prob_last_coded = 192;
            cpi->prob_gf_coded = 220;
@@ -3368,12 +3368,12 @@ static void encode_frame_to_data_rate
            cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
            /* per second target bitrate */
            cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
-                                          cpi->output_framerate);
+                                          cpi->output_frame_rate);
        }
    }
    else
 #endif
-        cpi->per_frame_bandwidth  = (int)(cpi->target_bandwidth / cpi->output_framerate);
+        cpi->per_frame_bandwidth  = (int)(cpi->target_bandwidth / cpi->output_frame_rate);

    /* Default turn off buffer to buffer copying */
    cm->copy_buffer_to_gf = 0;
@@ -4557,7 +4557,7 @@ static void encode_frame_to_data_rate
        {
            LAYER_CONTEXT *lc = &cpi->layer_context[i];
            int bits_off_for_this_layer =
-               (int)(lc->target_bandwidth / lc->framerate -
+               (int)(lc->target_bandwidth / lc->frame_rate -
                     cpi->projected_frame_size);

            lc->bits_off_target += bits_off_for_this_layer;
@@ -4805,7 +4805,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
    {
        double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
            *cpi->oxcf.two_pass_vbrmin_section / 100);
-        cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->framerate);
+        cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate);
    }
 }
 #endif
@@ -4821,10 +4821,8 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
 {
 #if HAVE_NEON
    int64_t store_reg[8];
-#if CONFIG_RUNTIME_CPU_DETECT
+#endif
    VP8_COMMON            *cm = &cpi->common;
-#endif
-#endif
    struct vpx_usec_timer  timer;
    int                    res = 0;

@@ -4850,6 +4848,7 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
    if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
                          frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL))
        res = -1;
+    cm->clr_type = sd->clrtype;
    vpx_usec_timer_mark(&timer);
    cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);

@@ -4934,7 +4933,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                                              cpi->frames_till_gf_update_due);
                force_src_buffer = &cpi->alt_ref_buffer;
            }
-            cpi->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+            cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
            cm->refresh_alt_ref_frame = 1;
            cm->refresh_golden_frame = 0;
            cm->refresh_last_frame = 0;
@@ -5039,7 +5038,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
        if (this_duration)
        {
            if (step)
-                cpi->ref_framerate = 10000000.0 / this_duration;
+                cpi->ref_frame_rate = 10000000.0 / this_duration;
            else
            {
                double avg_duration, interval;
@@ -5053,11 +5052,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                if(interval > 10000000.0)
                    interval = 10000000;

-                avg_duration = 10000000.0 / cpi->ref_framerate;
+                avg_duration = 10000000.0 / cpi->ref_frame_rate;
                avg_duration *= (interval - avg_duration + this_duration);
                avg_duration /= interval;

-                cpi->ref_framerate = 10000000.0 / avg_duration;
+                cpi->ref_frame_rate = 10000000.0 / avg_duration;
            }

            if (cpi->oxcf.number_of_layers > 1)
@@ -5068,12 +5067,12 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                for (i=0; i<cpi->oxcf.number_of_layers; i++)
                {
                    LAYER_CONTEXT *lc = &cpi->layer_context[i];
-                    lc->framerate = cpi->ref_framerate /
-                                    cpi->oxcf.rate_decimator[i];
+                    lc->frame_rate = cpi->ref_frame_rate /
+                                  cpi->oxcf.rate_decimator[i];
                }
            }
            else
-                vp8_new_framerate(cpi, cpi->ref_framerate);
+                vp8_new_frame_rate(cpi, cpi->ref_frame_rate);
        }

        cpi->last_time_stamp_seen = cpi->source->ts_start;
@@ -5090,7 +5089,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
        layer = cpi->oxcf.layer_id[
                cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
        restore_layer_context (cpi, layer);
-        vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
+        vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate);
    }

    if (cpi->compressor_speed == 2)
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -232,7 +232,7 @@ enum
 typedef struct
 {
    /* Layer configuration */
-    double framerate;
+    double frame_rate;
    int target_bandwidth;

    /* Layer specific coding parameters */
@@ -320,7 +320,6 @@ typedef struct VP8_COMP
    YV12_BUFFER_CONFIG scaled_source;
    YV12_BUFFER_CONFIG *last_frame_unscaled_source;

-    unsigned int frames_till_alt_ref_frame;
    /* frame in src_buffers has been identified to be encoded as an alt ref */
    int source_alt_ref_pending;
    /* an alt ref frame has been encoded and is usable */
@@ -370,7 +369,6 @@ typedef struct VP8_COMP
    double key_frame_rate_correction_factor;
    double gf_rate_correction_factor;

-    unsigned int frames_since_golden;
    /* Count down till next GF */
    int frames_till_gf_update_due;

@@ -403,7 +401,7 @@ typedef struct VP8_COMP
    /* Minimum allocation that should be used for any frame */
    int min_frame_bandwidth;
    int inter_frame_target;
-    double output_framerate;
+    double output_frame_rate;
    int64_t last_time_stamp_seen;
    int64_t last_end_time_stamp_seen;
    int64_t first_time_stamp_ever;
@@ -417,8 +415,8 @@ typedef struct VP8_COMP

    int buffered_mode;

-    double framerate;
-    double ref_framerate;
+    double frame_rate;
+    double ref_frame_rate;
    int64_t buffer_level;
    int64_t bits_off_target;

--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -313,7 +313,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    /* Get baseline error score */

    /* Copy the unfiltered / processed recon buffer to the new buffer */
-    vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
+    vp8_yv12_copy_y(saved_frame, cm->frame_to_show);

    vp8cx_set_alt_lf_level(cpi, filt_mid);
    vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
@@ -339,7 +339,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
            if(ss_err[filt_low] == 0)
            {
                /* Get Low filter error score */
-                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
+                vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
                vp8cx_set_alt_lf_level(cpi, filt_low);
                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);

@@ -367,7 +367,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
        {
            if(ss_err[filt_high] == 0)
            {
-                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
+                vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
                vp8cx_set_alt_lf_level(cpi, filt_high);
                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);

--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -234,7 +234,7 @@ void vp8_save_coding_context(VP8_COMP *cpi)
    cc->frames_since_key          = cpi->frames_since_key;
    cc->filter_level             = cpi->common.filter_level;
    cc->frames_till_gf_update_due   = cpi->frames_till_gf_update_due;
-    cc->frames_since_golden       = cpi->frames_since_golden;
+    cc->frames_since_golden       = cpi->common.frames_since_golden;

    vp8_copy(cc->mvc,      cpi->common.fc.mvc);
    vp8_copy(cc->mvcosts,  cpi->rd_costs.mvcosts);
@@ -271,7 +271,7 @@ void vp8_restore_coding_context(VP8_COMP *cpi)
    cpi->frames_since_key         =   cc->frames_since_key;
    cpi->common.filter_level     =   cc->filter_level;
    cpi->frames_till_gf_update_due  =   cc->frames_till_gf_update_due;
-    cpi->frames_since_golden       =   cc->frames_since_golden;
+    cpi->common.frames_since_golden       =   cc->frames_since_golden;

    vp8_copy(cpi->common.fc.mvc, cc->mvc);

@@ -388,7 +388,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
        int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
        /* Boost depends somewhat on frame rate: only used for 1 layer case. */
        if (cpi->oxcf.number_of_layers == 1) {
-          kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
+          kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16));
        }
        else {
          /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
@@ -399,9 +399,9 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
        kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;

        /* frame separation adjustment ( down) */
-        if (cpi->frames_since_key  < cpi->output_framerate / 2)
+        if (cpi->frames_since_key  < cpi->output_frame_rate / 2)
            kf_boost = (int)(kf_boost
-                       * cpi->frames_since_key / (cpi->output_framerate / 2));
+                       * cpi->frames_since_key / (cpi->output_frame_rate / 2));

        /* Minimal target size is |2* per_frame_bandwidth|. */
        if (kf_boost < 16)
@@ -715,7 +715,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
                if (Adjustment > (cpi->this_frame_target - min_frame_target))
                    Adjustment = (cpi->this_frame_target - min_frame_target);

-                if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1))
+                if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1))
                    cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
                else
                    cpi->this_frame_target -= Adjustment;
@@ -1360,7 +1360,7 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi)
         * whichever is smaller.
         */
        int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
-        av_key_frame_frequency = 1 + (int)cpi->output_framerate * 2;
+        av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2;

        if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
            av_key_frame_frequency = key_freq;
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -341,7 +341,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue)

 void vp8_auto_select_speed(VP8_COMP *cpi)
 {
-    int milliseconds_for_compress = (int)(1000000 / cpi->framerate);
+    int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);

    milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;

--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -66,6 +66,7 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
 VP8_COMMON_SRCS-yes += common/variance_c.c
 VP8_COMMON_SRCS-yes += common/variance.h
+VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c
 VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h


@@ -191,4 +192,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance8x8_neon$(A
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)

+$(eval $(call asm_offsets_template,\
+         vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c))
+
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -153,7 +153,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 #else
    RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
 #endif
-    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
+    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
    RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
    RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
@@ -204,7 +204,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
    RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
    RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
    RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
-    if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
+    if(finalize && cfg->rc_end_usage == VPX_CQ)
        RANGE_CHECK(vp8_cfg, cq_level,
                    cfg->rc_min_quantizer, cfg->rc_max_quantizer);

@@ -327,14 +327,17 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    oxcf->resample_up_water_mark   = cfg.rc_resize_up_thresh;
    oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;

-    if (cfg.rc_end_usage == VPX_VBR) {
-      oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
-    } else if (cfg.rc_end_usage == VPX_CBR) {
-      oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
-    } else if (cfg.rc_end_usage == VPX_CQ) {
-      oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
-    } else if (cfg.rc_end_usage == VPX_Q) {
-      oxcf->end_usage = USAGE_CONSTANT_QUALITY;
+    if (cfg.rc_end_usage == VPX_VBR)
+    {
+        oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+    }
+    else if (cfg.rc_end_usage == VPX_CBR)
+    {
+        oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+    }
+    else if (cfg.rc_end_usage == VPX_CQ)
+    {
+        oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
    }

    oxcf->target_bandwidth         = cfg.rc_target_bitrate;
@@ -692,6 +695,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
    yv12->uv_stride = img->stride[VPX_PLANE_U];

    yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+    yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
    return res;
 }

@@ -1075,7 +1079,11 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
        ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
        ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;

-        ctx->preview_img.fmt = VPX_IMG_FMT_I420;
+        if (sd.clrtype == REG_YUV)
+            ctx->preview_img.fmt = VPX_IMG_FMT_I420;
+        else
+            ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
+
        ctx->preview_img.x_chroma_shift = 1;
        ctx->preview_img.y_chroma_shift = 1;

@@ -1269,7 +1277,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
        1,                  /* g_delete_first_pass_file */
        "vp8.fpf"           /* first pass filename */
 #endif
-        VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
+
        1,                  /* ts_number_layers */
        {0},                /* ts_target_bitrate */
        {0},                /* ts_rate_decimator */
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -41,6 +41,15 @@ typedef enum

 static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);

+typedef struct /* one row of the vp8_mem_req_segs[] memory-requirement table */
+{
+    unsigned int   id;    /* segment identifier, e.g. VP8_SEG_ALG_PRIV */
+    unsigned long  sz;    /* static size; presumably ignored when calc_sz is set (table row uses 0) - confirm */
+    unsigned int   align; /* requested alignment, copied into vpx_codec_mmap_t.align by vp8_init() */
+    unsigned int   flags; /* VPX_CODEC_MEM_* flags, e.g. VPX_CODEC_MEM_ZERO */
+    unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); /* optional size callback; callers test it for NULL */
+} mem_req_t;
+
 static const mem_req_t vp8_mem_req_segs[] =
 {
    {VP8_SEG_ALG_PRIV,    0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
@@ -84,6 +93,65 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_
    return sizeof(vpx_codec_alg_priv_t);
 }

+
+static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) /* destructor installed by vp8_mmap_alloc() */
+{
+    free(mmap->priv); /* priv is the raw malloc/calloc pointer; base is the aligned view and must not be freed */
+}
+
+static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) /* allocate mmap->sz bytes per mmap->align / mmap->flags */
+{
+    vpx_codec_err_t  res;
+    unsigned int   align;
+
+    align = mmap->align ? mmap->align - 1 : 0; /* low-bit mask; assumes align is a power of two - TODO confirm */
+
+    if (mmap->flags & VPX_CODEC_MEM_ZERO)
+        mmap->priv = calloc(1, mmap->sz + align); /* zero-filled; over-allocated by the mask so base can be rounded up */
+    else
+        mmap->priv = malloc(mmap->sz + align); /* same over-allocation, contents uninitialized */
+
+    res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR; /* NULL priv means the allocation failed */
+    mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); /* round priv up to the boundary; a zero priv yields a zero base */
+    mmap->dtor = vp8_mmap_dtor; /* set unconditionally; free(NULL) in the dtor is a no-op */
+    return res;
+}
+
+static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si, /* supplies the stream's w and h */
+        const vpx_codec_mmap_t        *mmaps, /* segments to check, indexed in step with vp8_mem_req_segs[] */
+        vpx_codec_flags_t              init_flags) /* forwarded unchanged to each calc_sz callback */
+{
+    int i;
+    vpx_codec_err_t res = VPX_CODEC_OK;
+
+    for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) /* "- 1" skips the table's last row - presumably a terminator entry; confirm */
+    {
+        /* Ensure the segment has been allocated */
+        if (!mmaps[i].base)
+        {
+            res = VPX_CODEC_MEM_ERROR;
+            break; /* stop at the first bad segment */
+        }
+
+        /* Verify variable size segment is big enough for the current si. */
+        if (vp8_mem_req_segs[i].calc_sz)
+        {
+            vpx_codec_dec_cfg_t cfg; /* only w and h are assigned below; other fields stay uninitialized */
+
+            cfg.w = si->w;
+            cfg.h = si->h;
+
+            if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) /* segment smaller than this stream requires */
+            {
+                res = VPX_CODEC_MEM_ERROR;
+                break;
+            }
+        }
+    }
+
+    return res; /* VPX_CODEC_OK if every checked segment passed, else VPX_CODEC_MEM_ERROR */
+}
+
 static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
 {
    int i;
@@ -110,6 +178,16 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
    }
 }

+static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) /* look up a segment's base address by id */
+{
+    int i;
+
+    for (i = 0; i < NELEMENTS(ctx->mmaps); i++) /* linear scan over every slot of ctx->mmaps */
+        if (ctx->mmaps[i].id == id)
+            return ctx->mmaps[i].base; /* first match wins */
+
+    return NULL; /* no segment carries this id */
+}
 static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
 {
    /* nothing to clean up */
@@ -136,7 +214,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
        mmap.align = vp8_mem_req_segs[0].align;
        mmap.flags = vp8_mem_req_segs[0].flags;

-        res = vpx_mmap_alloc(&mmap);
+        res = vp8_mmap_alloc(&mmap);
        if (res != VPX_CODEC_OK) return res;

        vp8_init_ctx(ctx, &mmap);
@@ -288,7 +366,8 @@ static void yuvconfig2image(vpx_image_t               *img,
      * the Y, U, and V planes, nor other alignment adjustments that
      * might be representable by a YV12_BUFFER_CONFIG, so we just
      * initialize all the fields.*/
-    img->fmt = VPX_IMG_FMT_I420;
+    img->fmt = yv12->clrtype == REG_YUV ?
+        VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
    img->w = yv12->y_stride;
    img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
    img->d_w = yv12->y_width;
@@ -409,7 +488,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
                ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
                                   ctx->base.init_flags);

-            res = vpx_mmap_alloc(&ctx->mmaps[i]);
+            res = vp8_mmap_alloc(&ctx->mmaps[i]);
        }

        if (!res)
@@ -421,9 +500,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
    /* Initialize the decoder instance on the first frame*/
    if (!res && !ctx->decoder_init)
    {
-        res = vpx_validate_mmaps(&ctx->si, ctx->mmaps,
-                                 vp8_mem_req_segs, NELEMENTS(vp8_mem_req_segs),
-                                 ctx->base.init_flags);
+        res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);

        if (!res)
        {
@@ -720,6 +797,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
    yv12->uv_stride = img->stride[VPX_PLANE_U];

    yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
+    yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
+
    return res;
 }

--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -35,5 +35,9 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h
 VP8_DX_SRCS-yes += decoder/treereader.h
 VP8_DX_SRCS-yes += decoder/onyxd_if.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
+VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c

 VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
+
+$(eval $(call asm_offsets_template,\
+         vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c))
--- a/vp9/common/arm/neon/vp9_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_avg_neon.asm
@@ -1,116 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_convolve_avg_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp9_convolve_avg_neon| PROC
-    push                {r4-r6, lr}
-    ldrd                r4, r5, [sp, #32]
-    mov                 r6, r2
-
-    cmp                 r4, #32
-    bgt                 avg64
-    beq                 avg32
-    cmp                 r4, #8
-    bgt                 avg16
-    beq                 avg8
-    b                   avg4
-
-avg64
-    sub                 lr, r1, #32
-    sub                 r4, r3, #32
-avg64_h
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0]!
-    vld1.8              {q2-q3}, [r0], lr
-    pld                 [r2, r3]
-    vld1.8              {q8-q9},   [r6@128]!
-    vld1.8              {q10-q11}, [r6@128], r4
-    vrhadd.u8           q0, q0, q8
-    vrhadd.u8           q1, q1, q9
-    vrhadd.u8           q2, q2, q10
-    vrhadd.u8           q3, q3, q11
-    vst1.8              {q0-q1}, [r2@128]!
-    vst1.8              {q2-q3}, [r2@128], r4
-    subs                r5, r5, #1
-    bgt                 avg64_h
-    pop                 {r4-r6, pc}
-
-avg32
-    vld1.8              {q0-q1}, [r0], r1
-    vld1.8              {q2-q3}, [r0], r1
-    vld1.8              {q8-q9},   [r6@128], r3
-    vld1.8              {q10-q11}, [r6@128], r3
-    pld                 [r0]
-    vrhadd.u8           q0, q0, q8
-    pld                 [r0, r1]
-    vrhadd.u8           q1, q1, q9
-    pld                 [r6]
-    vrhadd.u8           q2, q2, q10
-    pld                 [r6, r3]
-    vrhadd.u8           q3, q3, q11
-    vst1.8              {q0-q1}, [r2@128], r3
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 avg32
-    pop                 {r4-r6, pc}
-
-avg16
-    vld1.8              {q0}, [r0], r1
-    vld1.8              {q1}, [r0], r1
-    vld1.8              {q2}, [r6@128], r3
-    vld1.8              {q3}, [r6@128], r3
-    pld                 [r0]
-    pld                 [r0, r1]
-    vrhadd.u8           q0, q0, q2
-    pld                 [r6]
-    pld                 [r6, r3]
-    vrhadd.u8           q1, q1, q3
-    vst1.8              {q0}, [r2@128], r3
-    vst1.8              {q1}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 avg16
-    pop                 {r4-r6, pc}
-
-avg8
-    vld1.8              {d0}, [r0], r1
-    vld1.8              {d1}, [r0], r1
-    vld1.8              {d2}, [r6@64], r3
-    vld1.8              {d3}, [r6@64], r3
-    pld                 [r0]
-    pld                 [r0, r1]
-    vrhadd.u8           q0, q0, q1
-    pld                 [r6]
-    pld                 [r6, r3]
-    vst1.8              {d0}, [r2@64], r3
-    vst1.8              {d1}, [r2@64], r3
-    subs                r5, r5, #2
-    bgt                 avg8
-    pop                 {r4-r6, pc}
-
-avg4
-    vld1.32             {d0[0]}, [r0], r1
-    vld1.32             {d0[1]}, [r0], r1
-    vld1.32             {d2[0]}, [r6@32], r3
-    vld1.32             {d2[1]}, [r6@32], r3
-    vrhadd.u8           d0, d0, d2
-    vst1.32             {d0[0]}, [r2@32], r3
-    vst1.32             {d0[1]}, [r2@32], r3
-    subs                r5, r5, #2
-    bgt                 avg4
-    pop                 {r4-r6, pc}
-    ENDP
-
-    END
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -1,302 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
-
-    EXPORT  |vp9_convolve8_avg_horiz_neon|
-    EXPORT  |vp9_convolve8_avg_vert_neon|
-    IMPORT  |vp9_convolve8_avg_horiz_c|
-    IMPORT  |vp9_convolve8_avg_vert_c|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
-; sp[]int w
-; sp[]int h
-
-|vp9_convolve8_avg_horiz_neon| PROC
-    ldr             r12, [sp, #4]           ; x_step_q4
-    cmp             r12, #16
-    bne             vp9_convolve8_avg_horiz_c
-
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
-
-    vld1.s16        {q0}, [r5]              ; filter_x
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; slightly out of order load to match the existing data
-    vld1.u32        {d6[0]}, [r2], r3
-    vld1.u32        {d7[0]}, [r2], r3
-    vld1.u32        {d6[1]}, [r2], r3
-    vld1.u32        {d7[1]}, [r2], r3
-
-    sub             r2, r2, r3, lsl #2      ; reset for store
-
-    ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|vp9_convolve8_avg_vert_neon| PROC
-    ldr             r12, [sp, #12]
-    cmp             r12, #16
-    bne             vp9_convolve8_avg_vert_c
-
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    vld1.u32        {d6[0]}, [r5@32], r3
-    vld1.u32        {d6[1]}, [r8@32], r3
-    vld1.u32        {d7[0]}, [r5@32], r3
-    vld1.u32        {d7[1]}, [r8@32], r3
-
-    pld             [r7]
-    pld             [r4]
-
-    ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    sub             r5, r5, r3, lsl #1      ; reset for store
-    sub             r8, r8, r3, lsl #1
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
--- a/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_neon.asm
@@ -1,280 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
-
-    EXPORT  |vp9_convolve8_horiz_neon|
-    EXPORT  |vp9_convolve8_vert_neon|
-    IMPORT  |vp9_convolve8_horiz_c|
-    IMPORT  |vp9_convolve8_vert_c|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
-; sp[]int w
-; sp[]int h
-
-|vp9_convolve8_horiz_neon| PROC
-    ldr             r12, [sp, #4]           ; x_step_q4
-    cmp             r12, #16
-    bne             vp9_convolve8_horiz_c
-
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
-
-    vld1.s16        {q0}, [r5]              ; filter_x
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|vp9_convolve8_vert_neon| PROC
-    ldr             r12, [sp, #12]
-    cmp             r12, #16
-    bne             vp9_convolve8_vert_c
-
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r7]
-    pld             [r4]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -1,78 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx_ports/mem.h"
-
-void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int x_step_q4,
-                        const int16_t *filter_y, int y_step_q4,
-                        int w, int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
-   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
-   */
-  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
-
-  // Account for the vertical phase needing 3 lines prior and 4 lines post
-  int intermediate_height = h + 7;
-
-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_c(src, src_stride,
-                           dst, dst_stride,
-                           filter_x, x_step_q4,
-                           filter_y, y_step_q4,
-                           w, h);
-
-  /* Filter starting 3 lines back. The neon implementation will ignore the
-   * given height and filter a multiple of 4 lines. Since this goes in to
-   * the temp buffer which has lots of extra room and is subsequently discarded
-   * this is safe if somewhat less than ideal.
-   */
-  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
-                           temp, 64,
-                           filter_x, x_step_q4, filter_y, y_step_q4,
-                           w, intermediate_height);
-
-  /* Step into the temp buffer 3 lines to get the actual frame data */
-  vp9_convolve8_vert_neon(temp + 64 * 3, 64,
-                          dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-}
-
-void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4,
-                            int w, int h) {
-  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
-  int intermediate_height = h + 7;
-
-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_avg_c(src, src_stride,
-                               dst, dst_stride,
-                               filter_x, x_step_q4,
-                               filter_y, y_step_q4,
-                               w, h);
-
-  /* This implementation has the same issues as above. In addition, we only want
-   * to average the values after both passes.
-   */
-  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
-                           temp, 64,
-                           filter_x, x_step_q4, filter_y, y_step_q4,
-                           w, intermediate_height);
-  vp9_convolve8_avg_vert_neon(temp + 64 * 3,
-                              64, dst, dst_stride,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h);
-}
--- a/vp9/common/arm/neon/vp9_copy_neon.asm
+++ b/vp9/common/arm/neon/vp9_copy_neon.asm
@@ -1,84 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_convolve_copy_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp9_convolve_copy_neon| PROC
-    push                {r4-r5, lr}
-    ldrd                r4, r5, [sp, #28]
-
-    cmp                 r4, #32
-    bgt                 copy64
-    beq                 copy32
-    cmp                 r4, #8
-    bgt                 copy16
-    beq                 copy8
-    b                   copy4
-
-copy64
-    sub                 lr, r1, #32
-    sub                 r3, r3, #32
-copy64_h
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0]!
-    vld1.8              {q2-q3}, [r0], lr
-    vst1.8              {q0-q1}, [r2@128]!
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #1
-    bgt                 copy64_h
-    pop                 {r4-r5, pc}
-
-copy32
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q2-q3}, [r0], r1
-    vst1.8              {q0-q1}, [r2@128], r3
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 copy32
-    pop                 {r4-r5, pc}
-
-copy16
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q1}, [r0], r1
-    vst1.8              {q0}, [r2@128], r3
-    vst1.8              {q1}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 copy16
-    pop                 {r4-r5, pc}
-
-copy8
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {d0}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {d2}, [r0], r1
-    vst1.8              {d0}, [r2@64], r3
-    vst1.8              {d2}, [r2@64], r3
-    subs                r5, r5, #2
-    bgt                 copy8
-    pop                 {r4-r5, pc}
-
-copy4
-    ldr                 r12, [r0], r1
-    str                 r12, [r2], r3
-    subs                r5, r5, #1
-    bgt                 copy4
-    pop                 {r4-r5, pc}
-    ENDP
-
-    END
--- a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
@@ -1,69 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_dc_only_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
-;                            uint8_t *dst_ptr, int pitch, int stride)
-;
-; r0  int input_dc
-; r1  uint8_t *pred_ptr
-; r2  uint8_t *dst_ptr
-; r3  int pitch
-; sp  int stride
-
-|vp9_dc_only_idct_add_neon| PROC
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    mul              r0, r0, r12               ; input_dc * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; ROUND_POWER_OF_TWO(out, 4)
-    add              r0, r0, #8                ; + (1 <<((4) - 1))
-    asr              r0, r0, #4                ; >> 4
-
-    vdup.16         q0, r0;                   ; duplicate a1
-    ldr              r12, [sp]                 ; load stride
-
-    vld1.32         {d2[0]}, [r1], r3
-    vld1.32         {d2[1]}, [r1], r3
-    vld1.32         {d4[0]}, [r1], r3
-    vld1.32         {d4[1]}, [r1]
-
-    vaddw.u8        q1, q0, d2                ; a1 + pred_ptr[c]
-    vaddw.u8        q2, q0, d4
-
-    vqmovun.s16     d2, q1                    ; clip_pixel
-    vqmovun.s16     d4, q2
-
-    vst1.32         {d2[0]}, [r2], r12
-    vst1.32         {d2[1]}, [r2], r12
-    vst1.32         {d4[0]}, [r2], r12
-    vst1.32         {d4[1]}, [r2]
-
-    bx               lr
-    ENDP             ; |vp9_dc_only_idct_add_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -1,169 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-
-extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
-                                               int16_t *output,
-                                               int output_stride);
-extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1Output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest,
-                                               int dest_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
-                                               int16_t *output,
-                                               int output_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1Output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest,
-                                               int dest_stride);
-extern void save_neon_registers();
-extern void restore_neon_registers();
-
-
-void vp9_short_idct16x16_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {
-  int16_t pass1_output[16*16] = {0};
-  int16_t row_idct_output[16*16] = {0};
-
-  // save d8-d15 register values.
-  save_neon_registers();
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  vp9_short_idct16x16_add_neon_pass2(input+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     0,
-                                     dest,
-                                     dest_stride);
-
-  /* Parallel idct on the lower 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     0,
-                                     dest,
-                                     dest_stride);
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     1,
-                                     dest,
-                                     dest_stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     1,
-                                     dest+8,
-                                     dest_stride);
-
-  // restore d8-d15 register values.
-  restore_neon_registers();
-
-  return;
-}
-
-void vp9_short_idct10_16x16_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {
-  int16_t pass1_output[16*16] = {0};
-  int16_t row_idct_output[16*16] = {0};
-
-  // save d8-d15 register values.
-  save_neon_registers();
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  vp9_short_idct10_16x16_add_neon_pass2(input+1,
-                                        row_idct_output,
-                                        pass1_output,
-                                        0,
-                                        dest,
-                                        dest_stride);
-
-  /* Skip Parallel idct on the lower 8 rows as they are all 0s */
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     1,
-                                     dest,
-                                     dest_stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     1,
-                                     dest+8,
-                                     dest_stride);
-
-  // restore d8-d15 register values.
-  restore_neon_registers();
-
-  return;
-}
--- a/vp9/common/arm/neon/vp9_idct32x32_neon.c
+++ b/vp9/common/arm/neon/vp9_idct32x32_neon.c
@@ -1,47 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_common.h"
-
-// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
-extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
-                                           int16_t *output, int16_t *input);
-extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
-
-
-// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
-extern void save_neon_registers();
-extern void restore_neon_registers();
-
-void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
-                                  int dest_stride) {
-  // TODO(cd): move the creation of these buffers within the ASM file
-  // internal buffer used to transpose 8 lines into before transforming them
-  int16_t transpose_buffer[32 * 8];
-  // results of the first pass (transpose and transform rows)
-  int16_t pass1[32 * 32];
-  // results of the second pass (transpose and transform columns)
-  int16_t pass2[32 * 32];
-
-  // save register we need to preserve
-  save_neon_registers();
-  // process rows
-  idct32_transpose_and_transform(transpose_buffer, pass1, input);
-  // process columns
-  // TODO(cd): do these two steps/passes within the ASM file
-  idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
-  // combine and add to dest
-  // TODO(cd): integrate this within the last storage step of the second pass
-  idct32_combine_add(dest, pass2, dest_stride);
-  // restore register we need to preserve
-  restore_neon_registers();
-}
-
-// TODO(cd): Eliminate this file altogether when everything is in ASM file
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.asm
@@ -1,708 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_loop_filter_horizontal_edge_neon|
-    EXPORT  |vp9_loop_filter_vertical_edge_neon|
-    EXPORT  |vp9_mbloop_filter_horizontal_edge_neon|
-    EXPORT  |vp9_mbloop_filter_vertical_edge_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; Currently the vp9 loop filter processes 8 pixels per iteration. The vp8 loop
-; filter processes 16 pixels per iteration.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
-;
-; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s,
-;                                           int p /* pitch */,
-;                                           const uint8_t *blimit,
-;                                           const uint8_t *limit,
-;                                           const uint8_t *thresh,
-;                                           int count)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; sp+4  int count
-|vp9_loop_filter_horizontal_edge_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r12, [sp, #8]              ; load count
-    ldr         r2, [sp, #4]               ; load thresh
-    add         r1, r1, r1                 ; double pitch
-
-    cmp         r12, #0
-    beq         end_vp9_lf_h_edge
-
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
-
-count_lf_h_loop
-    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 lines (p3 row)
-    add         r3, r2, r1, lsr #1         ; r3 = s - 3 lines (p2 row)
-
-    vld1.u8     {d3}, [r2@64], r1          ; p3
-    vld1.u8     {d4}, [r3@64], r1          ; p2
-    vld1.u8     {d5}, [r2@64], r1          ; p1
-    vld1.u8     {d6}, [r3@64], r1          ; p0
-    vld1.u8     {d7}, [r2@64], r1          ; q0
-    vld1.u8     {d16}, [r3@64], r1         ; q1
-    vld1.u8     {d17}, [r2@64]             ; q2
-    vld1.u8     {d18}, [r3@64]             ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r3, r3, r1, lsl #1
-
-    bl          vp9_loop_filter_neon
-
-    vst1.u8     {d4}, [r2@64], r1          ; store op1
-    vst1.u8     {d5}, [r3@64], r1          ; store op0
-    vst1.u8     {d6}, [r2@64], r1          ; store oq0
-    vst1.u8     {d7}, [r3@64], r1          ; store oq1
-
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         count_lf_h_loop
-
-end_vp9_lf_h_edge
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_horizontal_edge_neon|
-
-; Currently the vp9 loop filter processes 8 pixels per iteration. The vp8 loop
-; filter processes 16 pixels per iteration.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
-;
-; void vp9_loop_filter_vertical_edge_neon(uint8_t *s,
-;                                         int p /* pitch */,
-;                                         const uint8_t *blimit,
-;                                         const uint8_t *limit,
-;                                         const uint8_t *thresh,
-;                                         int count)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; sp+4  int count
-|vp9_loop_filter_vertical_edge_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    ldr         r12, [sp, #8]             ; load count
-    vld1.8      {d1[]}, [r3]              ; duplicate *limit
-
-    ldr         r3, [sp, #4]              ; load thresh
-    sub         r2, r0, #4                ; move s pointer down by 4 columns
-    cmp         r12, #0
-    beq         end_vp9_lf_v_edge
-
-    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
-
-count_lf_v_loop
-    vld1.u8     {d3}, [r2], r1             ; load s data
-    vld1.u8     {d4}, [r2], r1
-    vld1.u8     {d5}, [r2], r1
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d7}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d18}, [r2]
-
-    ;transpose the 8x8 matrix
-    vtrn.32     d3, d7
-    vtrn.32     d4, d16
-    vtrn.32     d5, d17
-    vtrn.32     d6, d18
-
-    vtrn.16     d3, d5
-    vtrn.16     d4, d6
-    vtrn.16     d7, d17
-    vtrn.16     d16, d18
-
-    vtrn.8      d3, d4
-    vtrn.8      d5, d6
-    vtrn.8      d7, d16
-    vtrn.8      d17, d18
-
-    bl          vp9_loop_filter_neon
-
-    sub         r0, r0, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
-    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
-    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
-    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
-    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
-    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
-    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
-    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
-    add         r0, r0, r1, lsl #3         ; s += pitch * 8
-    subs        r12, r12, #1
-    subne       r2, r0, #4                 ; move s pointer down by 4 columns
-    bne         count_lf_v_loop
-
-end_vp9_lf_v_edge
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_vertical_edge_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0    blimit
-; d1    limit
-; d2    thresh
-; d3    p3
-; d4    p2
-; d5    p1
-; d6    p0
-; d7    q0
-; d16   q1
-; d17   q2
-; d18   q3
-;
-; Outputs:
-; d4    op1
-; d5    op0
-; d6    oq0
-; d7    oq1
-|vp9_loop_filter_neon| PROC
-    ; filter_mask
-    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
-    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
-    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
-    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
-    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
-    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
-    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
-
-    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
-
-    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
-
-    vmov.u8     d18, #0x80
-
-    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
-
-    ; hevmask
-    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
-
-    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
-    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
-
-    veor        d7, d7, d18                 ; qs0
-
-    vcge.u8     d23, d1, d23                ; (limit >= m1)*-1
-
-    ; filter() function
-    ; convert to signed
-
-    vshr.u8     d28, d28, #1                ; a = a / 2
-    veor        d6, d6, d18                 ; ps0
-
-    veor        d5, d5, d18                 ; ps1
-    vqadd.u8    d17, d17, d28               ; a = b + a
-
-    veor        d16, d16, d18               ; qs1
-
-    vmov.u8     d19, #3
-
-    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
-
-    vcge.u8     d17, d0, d17                ; (blimit >= a)*-1
-
-    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
-    vorr        d22, d21, d22               ; hevmask
-
-    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
-
-    vand        d27, d27, d22               ; filter &= hev
-    vand        d23, d23, d17               ; filter_mask
-
-    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
-
-    vmov.u8     d17, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d27, q12
-
-    vand        d27, d27, d23               ; filter &= mask
-
-    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
-    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
-    vshr.s8     d28, d28, #3                ; filter2 >>= 3
-    vshr.s8     d27, d27, #3                ; filter1 >>= 3
-
-    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
-    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
-
-    ; outer tap adjustments
-    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
-
-    veor        d6, d26, d18                ; *oq0 = u^0x80
-
-    vbic        d27, d27, d22               ; filter &= ~hev
-
-    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
-    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
-
-    veor        d5, d19, d18                ; *op0 = u^0x80
-    veor        d4, d21, d18                ; *op1 = u^0x80
-    veor        d7, d20, d18                ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_neon|
-
-; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p,
-;                                             const uint8_t *blimit,
-;                                             const uint8_t *limit,
-;                                             const uint8_t *thresh,
-;                                             int count)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; sp+4  int count
-|vp9_mbloop_filter_horizontal_edge_neon| PROC
-    push        {r4-r5, lr}
-
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r12, [sp, #16]             ; load count
-    ldr         r2, [sp, #12]              ; load thresh
-    add         r1, r1, r1                 ; double pitch
-
-    cmp         r12, #0
-    beq         end_vp9_mblf_h_edge
-
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
-
-count_mblf_h_loop
-    sub         r3, r0, r1, lsl #1         ; r3 = s - 4 lines (p3 row)
-    add         r2, r3, r1, lsr #1         ; r2 = s - 3 lines (p2 row)
-
-    vld1.u8     {d3}, [r3@64], r1          ; p3
-    vld1.u8     {d4}, [r2@64], r1          ; p2
-    vld1.u8     {d5}, [r3@64], r1          ; p1
-    vld1.u8     {d6}, [r2@64], r1          ; p0
-    vld1.u8     {d7}, [r3@64], r1          ; q0
-    vld1.u8     {d16}, [r2@64], r1         ; q1
-    vld1.u8     {d17}, [r3@64]             ; q2
-    vld1.u8     {d18}, [r2@64], r1         ; q3
-
-    sub         r3, r3, r1, lsl #1
-    sub         r2, r2, r1, lsl #2
-
-    bl          vp9_mbloop_filter_neon
-
-    vst1.u8     {d0}, [r2@64], r1          ; store op2
-    vst1.u8     {d1}, [r3@64], r1          ; store op1
-    vst1.u8     {d2}, [r2@64], r1          ; store op0
-    vst1.u8     {d3}, [r3@64], r1          ; store oq0
-    vst1.u8     {d4}, [r2@64], r1          ; store oq1
-    vst1.u8     {d5}, [r3@64], r1          ; store oq2
-
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         count_mblf_h_loop
-
-end_vp9_mblf_h_edge
-    pop         {r4-r5, pc}
-
-    ENDP        ; |vp9_mbloop_filter_horizontal_edge_neon|
-
-; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s,
-;                                           int pitch,
-;                                           const uint8_t *blimit,
-;                                           const uint8_t *limit,
-;                                           const uint8_t *thresh,
-;                                           int count)
-;
-; r0    uint8_t *s,
-; r1    int pitch,
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; sp+4  int count
-|vp9_mbloop_filter_vertical_edge_neon| PROC
-    push        {r4-r5, lr}
-
-    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    ldr         r12, [sp, #16]            ; load count
-    vld1.8      {d1[]}, [r3]              ; duplicate *limit
-
-    ldr         r3, [sp, #12]             ; load thresh
-    sub         r2, r0, #4                ; move s pointer down by 4 columns
-    cmp         r12, #0
-    beq         end_vp9_mblf_v_edge
-
-    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
-
-count_mblf_v_loop
-    vld1.u8     {d3}, [r2], r1             ; load s data
-    vld1.u8     {d4}, [r2], r1
-    vld1.u8     {d5}, [r2], r1
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d7}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d18}, [r2]
-
-    ;transpose the 8x8 matrix
-    vtrn.32     d3, d7
-    vtrn.32     d4, d16
-    vtrn.32     d5, d17
-    vtrn.32     d6, d18
-
-    vtrn.16     d3, d5
-    vtrn.16     d4, d6
-    vtrn.16     d7, d17
-    vtrn.16     d16, d18
-
-    vtrn.8      d3, d4
-    vtrn.8      d5, d6
-    vtrn.8      d7, d16
-    vtrn.8      d17, d18
-
-    sub         r2, r0, #3
-    add         r3, r0, #1
-
-    bl          vp9_mbloop_filter_neon
-
-    ;store op2, op1, op0, oq0
-    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
-    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
-    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
-    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
-    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
-    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
-    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
-    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
-
-    ;store oq1, oq2
-    vst2.8      {d4[0], d5[0]}, [r3], r1
-    vst2.8      {d4[1], d5[1]}, [r3], r1
-    vst2.8      {d4[2], d5[2]}, [r3], r1
-    vst2.8      {d4[3], d5[3]}, [r3], r1
-    vst2.8      {d4[4], d5[4]}, [r3], r1
-    vst2.8      {d4[5], d5[5]}, [r3], r1
-    vst2.8      {d4[6], d5[6]}, [r3], r1
-    vst2.8      {d4[7], d5[7]}, [r3]
-
-    add         r0, r0, r1, lsl #3         ; s += pitch * 8
-    subs        r12, r12, #1
-    subne       r2, r0, #4                 ; move s pointer down by 4 columns
-    bne         count_mblf_v_loop
-
-end_vp9_mblf_v_edge
-    pop         {r4-r5, pc}
-    ENDP        ; |vp9_mbloop_filter_vertical_edge_neon|
-
-; void vp9_mbloop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0    blimit
-; d1    limit
-; d2    thresh
-; d3    p3
-; d4    p2
-; d5    p1
-; d6    p0
-; d7    q0
-; d16   q1
-; d17   q2
-; d18   q3
-;
-; Outputs:
-; d0    op2
-; d1    op1
-; d2    op0
-; d3    oq0
-; d4    oq1
-; d5    oq2
-|vp9_mbloop_filter_neon| PROC
-    ; filter_mask
-    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
-    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
-    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
-    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
-    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
-    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
-    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
-
-    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
-
-    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
-
-    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
-
-    vmax.u8     d19, d19, d20
-
-    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
-    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
-    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
-
-    vmax.u8     d19, d19, d23
-
-    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
-    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
-
-    ; (limit >= max of m1..m6)*-1
-    vcge.u8     d19, d1, d19
-
-    ; only compare the largest value to thresh
-    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
-    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
-
-    vshr.u8     d23, d23, #1               ; a = a / 2
-
-    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
-
-    vqadd.u8    d24, d24, d23              ; a = b + a
-
-    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
-
-    vmov.u8     d23, #1
-    vcge.u8     d24, d0, d24               ; a > blimit
-
-    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
-
-    vcge.u8     d20, d23, d20              ; flat
-
-    vand        d19, d19, d24              ; mask
-
-    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
-
-    vand        d20, d20, d19              ; flat & mask
-
-    vmov.u8     d22, #0x80
-
-    vorr        d23, d21, d23              ; hev
-
-    ; This instruction will truncate the "flat & mask" masks down to 4 bits
-    ; each to fit into one 32 bit arm register. The values are stored in
-    ; q10.64[0].
-    vshrn.u16   d30, q10, #4
-    vmov.u32    r4, d30[0]                 ; flat & mask 4bits
-
-    adds        r5, r4, #1                 ; Check for all 1's
-
-    ; If mask and flat are 1's for all vectors, then we only need to execute
-    ; the power branch for all vectors.
-    beq         power_branch_only
-
-    cmp         r4, #0                     ; Check for 0, set flag for later
-
-    ; mbfilter() function
-    ; filter() function
-    ; convert to signed
-    veor        d21, d7, d22               ; qs0
-    veor        d24, d6, d22               ; ps0
-    veor        d25, d5, d22               ; ps1
-    veor        d26, d16, d22              ; qs1
-
-    vmov.u8     d27, #3
-
-    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
-
-    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
-
-    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
-
-    vand        d29, d29, d23              ; filter &= hev
-
-    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
-
-    vmov.u8     d29, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d28, q15
-
-    vand        d28, d28, d19              ; filter &= mask
-
-    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
-    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
-    vshr.s8     d30, d30, #3               ; filter2 >>= 3
-    vshr.s8     d29, d29, #3               ; filter1 >>= 3
-
-    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
-    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
-
-    ; outer tap adjustments: ++filter1 >> 1
-    vrshr.s8    d29, d29, #1
-    vbic        d29, d29, d23              ; filter &= ~hev
-
-    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
-    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
-
-    ; If mask and flat are 0's for all vectors, then we only need to execute
-    ; the filter branch for all vectors.
-    beq         filter_branch_only
-
-    ; If mask and flat are mixed then we must perform both branches and
-    ; combine the data.
-    veor        d24, d24, d22              ; *f_op0 = u^0x80
-    veor        d21, d21, d22              ; *f_oq0 = u^0x80
-    veor        d25, d25, d22              ; *f_op1 = u^0x80
-    veor        d26, d26, d22              ; *f_oq1 = u^0x80
-
-    ; At this point we have already executed the filter branch. The filter
-    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
-    ; branch and combine the data.
-    vmov.u8     d23, #2
-    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
-    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
-    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
-
-    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
-
-    vaddw.u8    q14, d5                    ; r_op2 += p1
-
-    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
-
-    vqrshrn.u16 d30, q14, #3               ; r_op2
-
-    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
-    vsubw.u8    q14, d4                    ; r_op1 -= p2
-    vaddw.u8    q14, d5                    ; r_op1 += p1
-    vaddw.u8    q14, d16                   ; r_op1 += q1
-
-    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
-
-    vqrshrn.u16 d31, q14, #3               ; r_op1
-
-    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
-    vsubw.u8    q14, d5                    ; r_op0 -= p1
-    vaddw.u8    q14, d6                    ; r_op0 += p0
-    vaddw.u8    q14, d17                   ; r_op0 += q2
-
-    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
-
-    vqrshrn.u16 d23, q14, #3               ; r_op0
-
-    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
-    vsubw.u8    q14, d6                    ; r_oq0 -= p0
-    vaddw.u8    q14, d7                    ; r_oq0 += q0
-
-    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
-
-    vaddw.u8    q14, d18                   ; oq0 += q3
-
-    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
-
-    vqrshrn.u16 d22, q14, #3               ; r_oq0
-
-    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
-    vsubw.u8    q14, d7                    ; r_oq1 -= q0
-    vaddw.u8    q14, d16                   ; r_oq1 += q1
-
-    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
-
-    vaddw.u8    q14, d18                   ; r_oq1 += q3
-
-    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
-
-    vqrshrn.u16 d6, q14, #3                ; r_oq1
-
-    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
-    vsubw.u8    q14, d16                   ; r_oq2 -= q1
-    vaddw.u8    q14, d17                   ; r_oq2 += q2
-    vaddw.u8    q14, d18                   ; r_oq2 += q3
-
-    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
-
-    vqrshrn.u16 d7, q14, #3                ; r_oq2
-
-    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
-    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
-    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
-
-    bx          lr
-
-power_branch_only
-    vmov.u8     d27, #3
-    vmov.u8     d21, #2
-    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
-    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
-    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
-    vaddw.u8    q14, d5                    ; op2 += p1
-    vqrshrn.u16 d0, q14, #3                ; op2
-
-    vsubw.u8    q14, d3                    ; op1 = op2 - p3
-    vsubw.u8    q14, d4                    ; op1 -= p2
-    vaddw.u8    q14, d5                    ; op1 += p1
-    vaddw.u8    q14, d16                   ; op1 += q1
-    vqrshrn.u16 d1, q14, #3                ; op1
-
-    vsubw.u8    q14, d3                    ; op0 = op1 - p3
-    vsubw.u8    q14, d5                    ; op0 -= p1
-    vaddw.u8    q14, d6                    ; op0 += p0
-    vaddw.u8    q14, d17                   ; op0 += q2
-    vqrshrn.u16 d2, q14, #3                ; op0
-
-    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
-    vsubw.u8    q14, d6                    ; oq0 -= p0
-    vaddw.u8    q14, d7                    ; oq0 += q0
-    vaddw.u8    q14, d18                   ; oq0 += q3
-    vqrshrn.u16 d3, q14, #3                ; oq0
-
-    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
-    vsubw.u8    q14, d7                    ; oq1 -= q0
-    vaddw.u8    q14, d16                   ; oq1 += q1
-    vaddw.u8    q14, d18                   ; oq1 += q3
-    vqrshrn.u16 d4, q14, #3                ; oq1
-
-    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
-    vsubw.u8    q14, d16                   ; oq2 -= q1
-    vaddw.u8    q14, d17                   ; oq2 += q2
-    vaddw.u8    q14, d18                   ; oq2 += q3
-    vqrshrn.u16 d5, q14, #3                ; oq2
-
-    bx          lr
-
-filter_branch_only
-    ; TODO(fgalligan): See if we can rearange registers so we do not need to
-    ; do the 2 vswp.
-    vswp        d0, d4                      ; op2
-    vswp        d5, d17                     ; oq2
-    veor        d2, d24, d22                ; *op0 = u^0x80
-    veor        d3, d21, d22                ; *oq0 = u^0x80
-    veor        d1, d25, d22                ; *op1 = u^0x80
-    veor        d4, d26, d22                ; *oq1 = u^0x80
-
-    bx          lr
-
-    ENDP        ; |vp9_mbloop_filter_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
@@ -1,603 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_mb_lpf_horizontal_edge_w_neon|
-    EXPORT  |vp9_mb_lpf_vertical_edge_w_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p,
-;                                        const uint8_t *blimit,
-;                                        const uint8_t *limit,
-;                                        const uint8_t *thresh,
-;                                        int count)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh, followed on the stack by int count
-|vp9_mb_lpf_horizontal_edge_w_neon| PROC
-    push        {r4-r8, lr}
-    vpush       {d8-d15}
-    ldr         r4, [sp, #88]              ; load thresh (24 bytes pushed + 64 bytes vpushed)
-    ldr         r12, [sp, #92]             ; load count
-
-h_count
-    vld1.8      {d16[]}, [r2]              ; load *blimit (reloaded per pass: helper clobbers d16-d18)
-    vld1.8      {d17[]}, [r3]              ; load *limit
-    vld1.8      {d18[]}, [r4]              ; load *thresh
-
-    sub         r8, r0, r1, lsl #3         ; r8 = s - 8 * p (row p7)
-
-    vld1.u8     {d0}, [r8@64], r1          ; p7
-    vld1.u8     {d1}, [r8@64], r1          ; p6
-    vld1.u8     {d2}, [r8@64], r1          ; p5
-    vld1.u8     {d3}, [r8@64], r1          ; p4
-    vld1.u8     {d4}, [r8@64], r1          ; p3
-    vld1.u8     {d5}, [r8@64], r1          ; p2
-    vld1.u8     {d6}, [r8@64], r1          ; p1
-    vld1.u8     {d7}, [r8@64], r1          ; p0
-    vld1.u8     {d8}, [r8@64], r1          ; q0
-    vld1.u8     {d9}, [r8@64], r1          ; q1
-    vld1.u8     {d10}, [r8@64], r1         ; q2
-    vld1.u8     {d11}, [r8@64], r1         ; q3
-    vld1.u8     {d12}, [r8@64], r1         ; q4
-    vld1.u8     {d13}, [r8@64], r1         ; q5
-    vld1.u8     {d14}, [r8@64], r1         ; q6
-    vld1.u8     {d15}, [r8@64], r1         ; q7
-
-    bl          vp9_wide_mbfilter_neon     ; results in NEON regs, path flags in r7
-
-    tst         r7, #1                     ; bit 0 set: only the filter() outputs are valid
-    beq         h_mbfilter
-
-    ; flat && mask were not set for any of the channels. Just store the values
-    ; from filter.
-    sub         r8, r0, r1, lsl #1         ; r8 = s - 2 * p (row p1)
-
-    vst1.u8     {d25}, [r8@64], r1         ; store op1
-    vst1.u8     {d24}, [r8@64], r1         ; store op0
-    vst1.u8     {d23}, [r8@64], r1         ; store oq0
-    vst1.u8     {d26}, [r8@64], r1         ; store oq1
-
-    b           h_next
-
-h_mbfilter
-    tst         r7, #2                     ; bit 1 set: flat2 clear in every lane
-    beq         h_wide_mbfilter            ; NOTE(review): the visible helper never sets bit 1 - confirm
-
-    ; flat2 was not set for any of the channels. Just store the values from
-    ; mbfilter.
-    sub         r8, r0, r1, lsl #1
-    sub         r8, r8, r1                 ; r8 = s - 3 * p (row p2)
-
-    vst1.u8     {d18}, [r8@64], r1         ; store op2
-    vst1.u8     {d19}, [r8@64], r1         ; store op1
-    vst1.u8     {d20}, [r8@64], r1         ; store op0
-    vst1.u8     {d21}, [r8@64], r1         ; store oq0
-    vst1.u8     {d22}, [r8@64], r1         ; store oq1
-    vst1.u8     {d23}, [r8@64], r1         ; store oq2
-
-    b           h_next
-
-h_wide_mbfilter
-    sub         r8, r0, r1, lsl #3
-    add         r8, r8, r1                 ; r8 = s - 7 * p (row p6; p7 is never written)
-
-    vst1.u8     {d16}, [r8@64], r1         ; store op6
-    vst1.u8     {d24}, [r8@64], r1         ; store op5
-    vst1.u8     {d25}, [r8@64], r1         ; store op4
-    vst1.u8     {d26}, [r8@64], r1         ; store op3
-    vst1.u8     {d27}, [r8@64], r1         ; store op2
-    vst1.u8     {d18}, [r8@64], r1         ; store op1
-    vst1.u8     {d19}, [r8@64], r1         ; store op0
-    vst1.u8     {d20}, [r8@64], r1         ; store oq0
-    vst1.u8     {d21}, [r8@64], r1         ; store oq1
-    vst1.u8     {d22}, [r8@64], r1         ; store oq2
-    vst1.u8     {d23}, [r8@64], r1         ; store oq3
-    vst1.u8     {d1}, [r8@64], r1          ; store oq4
-    vst1.u8     {d2}, [r8@64], r1          ; store oq5
-    vst1.u8     {d3}, [r8@64], r1          ; store oq6
-
-h_next
-    add         r0, r0, #8                 ; advance s to the next 8 columns
-    subs        r12, r12, #1               ; --count (bottom-tested: assumes count >= 1 on entry)
-    bne         h_count
-
-    vpop        {d8-d15}
-    pop         {r4-r8, pc}
-
-    ENDP        ; |vp9_mb_lpf_horizontal_edge_w_neon|
-
-; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p,
-;                                        const uint8_t *blimit,
-;                                        const uint8_t *limit,
-;                                        const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh (first stack argument),
-|vp9_mb_lpf_vertical_edge_w_neon| PROC
-    push        {r4-r8, lr}
-    vpush       {d8-d15}
-    ldr         r4, [sp, #88]              ; load thresh (24 bytes pushed + 64 bytes vpushed)
-
-    vld1.8      {d16[]}, [r2]              ; load *blimit
-    vld1.8      {d17[]}, [r3]              ; load *limit
-    vld1.8      {d18[]}, [r4]              ; load *thresh
-
-    sub         r8, r0, #8                 ; r8 = s - 8 (the 8 pixels left of the edge)
-
-    vld1.8      {d0}, [r8@64], r1          ; row 0, left half
-    vld1.8      {d8}, [r0@64], r1          ; row 0, right half
-    vld1.8      {d1}, [r8@64], r1          ; row 1
-    vld1.8      {d9}, [r0@64], r1
-    vld1.8      {d2}, [r8@64], r1          ; row 2
-    vld1.8      {d10}, [r0@64], r1
-    vld1.8      {d3}, [r8@64], r1          ; row 3
-    vld1.8      {d11}, [r0@64], r1
-    vld1.8      {d4}, [r8@64], r1          ; row 4
-    vld1.8      {d12}, [r0@64], r1
-    vld1.8      {d5}, [r8@64], r1          ; row 5
-    vld1.8      {d13}, [r0@64], r1
-    vld1.8      {d6}, [r8@64], r1          ; row 6
-    vld1.8      {d14}, [r0@64], r1
-    vld1.8      {d7}, [r8@64], r1          ; row 7
-    vld1.8      {d15}, [r0@64], r1
-
-    sub         r0, r0, r1, lsl #3         ; undo the 8 post-increments: r0 = s again
-
-    vtrn.32     q0, q2                     ; transpose both 8x8 byte blocks so that each
-    vtrn.32     q1, q3                     ; d register holds one column, giving the
-    vtrn.32     q4, q6                     ; d0-d7 = p7-p0, d8-d15 = q0-q7 layout
-    vtrn.32     q5, q7                     ; that vp9_wide_mbfilter_neon expects
-
-    vtrn.16     q0, q1
-    vtrn.16     q2, q3
-    vtrn.16     q4, q5
-    vtrn.16     q6, q7
-
-    vtrn.8      d0, d1
-    vtrn.8      d2, d3
-    vtrn.8      d4, d5
-    vtrn.8      d6, d7
-
-    vtrn.8      d8, d9
-    vtrn.8      d10, d11
-    vtrn.8      d12, d13
-    vtrn.8      d14, d15
-
-    bl          vp9_wide_mbfilter_neon     ; results in NEON regs, path flags in r7
-
-    tst         r7, #1                     ; bit 0 set: only the filter() outputs are valid
-    beq         v_mbfilter
-
-    ; flat && mask were not set for any of the channels. Just store the values
-    ; from filter.
-    sub         r8, r0, #2                 ; r8 = s - 2 (column p1)
-
-    vswp        d23, d25                   ; d23-d26 now hold op1, op0, oq0, oq1 in order
-
-    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
-    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
-    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
-    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
-    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
-    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
-    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
-    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
-
-    b           v_end
-
-v_mbfilter
-    tst         r7, #2                     ; bit 1 set: flat2 clear in every lane
-    beq         v_wide_mbfilter            ; NOTE(review): the visible helper never sets bit 1 - confirm
-
-    ; flat2 was not set for any of the channels. Just store the values from
-    ; mbfilter.
-    sub         r8, r0, #3                 ; r8 = s - 3 (column p2)
-
-    vst3.8      {d18[0], d19[0], d20[0]}, [r8], r1    ; op2, op1, op0
-    vst3.8      {d21[0], d22[0], d23[0]}, [r0], r1    ; oq0, oq1, oq2
-    vst3.8      {d18[1], d19[1], d20[1]}, [r8], r1
-    vst3.8      {d21[1], d22[1], d23[1]}, [r0], r1
-    vst3.8      {d18[2], d19[2], d20[2]}, [r8], r1
-    vst3.8      {d21[2], d22[2], d23[2]}, [r0], r1
-    vst3.8      {d18[3], d19[3], d20[3]}, [r8], r1
-    vst3.8      {d21[3], d22[3], d23[3]}, [r0], r1
-    vst3.8      {d18[4], d19[4], d20[4]}, [r8], r1
-    vst3.8      {d21[4], d22[4], d23[4]}, [r0], r1
-    vst3.8      {d18[5], d19[5], d20[5]}, [r8], r1
-    vst3.8      {d21[5], d22[5], d23[5]}, [r0], r1
-    vst3.8      {d18[6], d19[6], d20[6]}, [r8], r1
-    vst3.8      {d21[6], d22[6], d23[6]}, [r0], r1
-    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
-    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1
-
-    b           v_end
-
-v_wide_mbfilter
-    sub         r8, r0, #8                 ; r8 = s - 8 (column p7)
-
-    vtrn.32     d0,  d26                   ; transpose p7, op6 .. op0 back to rows
-    vtrn.32     d16, d27
-    vtrn.32     d24, d18
-    vtrn.32     d25, d19
-
-    vtrn.16     d0,  d24
-    vtrn.16     d16, d25
-    vtrn.16     d26, d18
-    vtrn.16     d27, d19
-
-    vtrn.8      d0,  d16
-    vtrn.8      d24, d25
-    vtrn.8      d26, d27
-    vtrn.8      d18, d19
-
-    vtrn.32     d20, d1                    ; transpose oq0 .. oq6, q7 back to rows
-    vtrn.32     d21, d2
-    vtrn.32     d22, d3
-    vtrn.32     d23, d15
-
-    vtrn.16     d20, d22
-    vtrn.16     d21, d23
-    vtrn.16     d1,  d3
-    vtrn.16     d2,  d15
-
-    vtrn.8      d20, d21
-    vtrn.8      d22, d23
-    vtrn.8      d1,  d2
-    vtrn.8      d3,  d15
-
-    vst1.8      {d0}, [r8@64], r1          ; row 0, left half
-    vst1.8      {d20}, [r0@64], r1         ; row 0, right half
-    vst1.8      {d16}, [r8@64], r1         ; row 1
-    vst1.8      {d21}, [r0@64], r1
-    vst1.8      {d24}, [r8@64], r1         ; row 2
-    vst1.8      {d22}, [r0@64], r1
-    vst1.8      {d25}, [r8@64], r1         ; row 3
-    vst1.8      {d23}, [r0@64], r1
-    vst1.8      {d26}, [r8@64], r1         ; row 4
-    vst1.8      {d1}, [r0@64], r1
-    vst1.8      {d27}, [r8@64], r1         ; row 5
-    vst1.8      {d2}, [r0@64], r1
-    vst1.8      {d18}, [r8@64], r1         ; row 6
-    vst1.8      {d3}, [r0@64], r1
-    vst1.8      {d19}, [r8@64], r1         ; row 7
-    vst1.8      {d15}, [r0@64], r1
-
-v_end
-    vpop        {d8-d15}
-    pop         {r4-r8, pc}
-
-    ENDP        ; |vp9_mb_lpf_vertical_edge_w_neon|
-
-; void vp9_wide_mbfilter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store.
-; r7 returns path flags: bit 0 set = only the filter() outputs are valid.
-; r0-r3 PRESERVE (r5, r6 and r7 are overwritten)
-; d16    blimit
-; d17    limit
-; d18    thresh
-; d0    p7
-; d1    p6
-; d2    p5
-; d3    p4
-; d4    p3
-; d5    p2
-; d6    p1
-; d7    p0
-; d8    q0
-; d9    q1
-; d10   q2
-; d11   q3
-; d12   q4
-; d13   q5
-; d14   q6
-; d15   q7
-|vp9_wide_mbfilter_neon| PROC
-    mov         r7, #0                     ; clear the path flags
-
-    ; filter_mask
-    vabd.u8     d19, d4, d5                ; abs(p3 - p2)
-    vabd.u8     d20, d5, d6                ; abs(p2 - p1)
-    vabd.u8     d21, d6, d7                ; abs(p1 - p0)
-    vabd.u8     d22, d9, d8                ; abs(q1 - q0)
-    vabd.u8     d23, d10, d9               ; abs(q2 - q1)
-    vabd.u8     d24, d11, d10              ; abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
-    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
-    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
-    vmax.u8     d19, d19, d20
-
-    vabd.u8     d24, d7, d8                ; abs(p0 - q0)
-
-    vmax.u8     d19, d19, d23              ; largest of the six neighbour differences
-
-    vabd.u8     d23, d6, d9                ; a = abs(p1 - q1)
-    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
-
-    ; limit >= largest difference, i.e. no neighbour difference exceeds limit
-    vcge.u8     d19, d17, d19
-
-    ; flatmask4
-    vabd.u8     d25, d7, d5                ; abs(p0 - p2)
-    vabd.u8     d26, d8, d10               ; abs(q0 - q2)
-    vabd.u8     d27, d4, d7                ; abs(p3 - p0)
-    vabd.u8     d28, d11, d8               ; abs(q3 - q0)
-
-    ; only the largest of these differences is compared to 1 (the flat test)
-    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
-    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
-    vmax.u8     d25, d25, d26
-    vmax.u8     d20, d20, d25
-
-    vshr.u8     d23, d23, #1               ; a = a / 2
-    vqadd.u8    d24, d24, d23              ; a = b + a (saturating)
-
-    vmov.u8     d30, #1                    ; threshold for flat and flat2
-    vcge.u8     d24, d16, d24              ; (blimit >= a) * -1
-
-    vcge.u8     d20, d30, d20              ; flat
-
-    vand        d19, d19, d24              ; mask
-
-    ; hevmask
-    vcgt.u8     d21, d21, d18              ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d18              ; (abs(q1 - q0) > thresh)*-1
-    vorr        d21, d21, d22              ; hev
-
-    vand        d16, d20, d19              ; flat && mask
-    vmov        r5, r6, d16                ; copy to r5:r6 for the zero test below
-
-    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
-    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
-    vabd.u8     d23, d12, d8               ; abs(q4 - q0)
-    vabd.u8     d24, d7, d2                ; abs(p0 - p5)
-    vabd.u8     d25, d8, d13               ; abs(q0 - q5)
-    vabd.u8     d26, d1, d7                ; abs(p6 - p0)
-    vabd.u8     d27, d14, d8               ; abs(q6 - q0)
-    vabd.u8     d28, d0, d7                ; abs(p7 - p0)
-    vabd.u8     d29, d15, d8               ; abs(q7 - q0)
-
-    ; only the largest of these differences is compared to 1 (the flat2 test)
-    vmax.u8     d22, d22, d23              ; max(abs(p4 - p0), abs(q4 - q0))
-    vmax.u8     d23, d24, d25              ; max(abs(p0 - p5), abs(q0 - q5))
-    vmax.u8     d24, d26, d27              ; max(abs(p6 - p0), abs(q6 - q0))
-    vmax.u8     d25, d28, d29              ; max(abs(p7 - p0), abs(q7 - q0))
-
-    vmax.u8     d26, d22, d23
-    vmax.u8     d27, d24, d25
-    vmax.u8     d23, d26, d27
-
-    vcge.u8     d18, d30, d23              ; flat2
-
-    vmov.u8     d22, #0x80                 ; sign bias for the u8 <-> s8 flips below
-
-    orrs        r5, r5, r6                 ; Check for 0
-    orreq       r7, r7, #1                 ; Only do filter branch
-
-    vand        d17, d18, d16              ; flat2 && flat && mask
-    vmov        r5, r6, d17                ; NOTE(review): r5/r6 are never read after this - confirm
-
-    ; mbfilter() function
-
-    ; filter() function
-    ; convert to signed
-    veor        d23, d8, d22               ; qs0
-    veor        d24, d7, d22               ; ps0
-    veor        d25, d6, d22               ; ps1
-    veor        d26, d9, d22               ; qs1
-
-    vmov.u8     d27, #3
-
-    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
-    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
-    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
-    vand        d29, d29, d21              ; filter &= hev
-    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
-    vmov.u8     d29, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d28, q15
-
-    vand        d28, d28, d19              ; filter &= mask
-
-    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
-    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
-    vshr.s8     d30, d30, #3               ; filter2 >>= 3
-    vshr.s8     d29, d29, #3               ; filter1 >>= 3
-
-
-    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
-    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
-
-    ; outer tap adjustments: ++filter1 >> 1
-    vrshr.s8    d29, d29, #1
-    vbic        d29, d29, d21              ; filter &= ~hev
-
-    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
-    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
-
-    veor        d24, d24, d22              ; *f_op0 = u^0x80
-    veor        d23, d23, d22              ; *f_oq0 = u^0x80
-    veor        d25, d25, d22              ; *f_op1 = u^0x80
-    veor        d26, d26, d22              ; *f_oq1 = u^0x80
-
-    tst         r7, #1                     ; filter only: return d25/d24/d23/d26 = op1/op0/oq0/oq1
-    bxne        lr
-
-    ; mbfilter flat && mask branch
-    ; TODO(fgalligan): Can I decrease the cycles shifting to consecutive d's
-    ; and using vbit on the q's?
-    vmov.u8     d29, #2
-    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
-    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
-    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
-    vaddl.u8    q10, d4, d5                ; p3 + p2 (leaves the sum for op1)
-    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
-    vaddl.u8    q14, d6, d9                ; p1 + q1 (enters the sum for op1)
-    vqrshrn.u16 d18, q15, #3               ; r_op2
-
-    vsub.i16    q15, q10                   ; op1 = op2 - p3 - p2
-    vaddl.u8    q10, d4, d6                ; p3 + p1 (leaves the sum for op0)
-    vadd.i16    q15, q14                   ; op1 += p1 + q1
-    vaddl.u8    q14, d7, d10               ; p0 + q2 (enters the sum for op0)
-    vqrshrn.u16 d19, q15, #3               ; r_op1
-
-    vsub.i16    q15, q10                   ; op0 = op1 - p3 - p1
-    vadd.i16    q15, q14                   ; op0 += p0 + q2
-    vaddl.u8    q14, d8, d11               ; q0 + q3 (enters the sum for oq0)
-    vqrshrn.u16 d20, q15, #3               ; r_op0
-
-    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
-    vsubw.u8    q15, d7                    ; oq0 -= p0
-    vadd.i16    q15, q14                   ; oq0 += q0 + q3
-    vaddl.u8    q14, d9, d11               ; q1 + q3 (enters the sum for oq1)
-    vqrshrn.u16 d21, q15, #3               ; r_oq0
-
-    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
-    vsubw.u8    q15, d8                    ; oq1 -= q0
-    vadd.i16    q15, q14                   ; oq1 += q1 + q3
-    vaddl.u8    q14, d10, d11              ; q2 + q3 (enters the sum for oq2)
-    vqrshrn.u16 d22, q15, #3               ; r_oq1
-
-    vsubw.u8    q15, d6                    ; oq2 = oq1 - p1
-    vsubw.u8    q15, d9                    ; oq2 -= q1
-    vadd.i16    q15, q14                   ; oq2 += q2 + q3
-    vqrshrn.u16 d27, q15, #3               ; r_oq2
-
-    ; Filter does not set op2 or oq2, so use p2 and q2.
-    vbif        d18, d5, d16               ; t_op2 |= p2 & ~(flat & mask)
-    vbif        d19, d25, d16              ; t_op1 |= f_op1 & ~(flat & mask)
-    vbif        d20, d24, d16              ; t_op0 |= f_op0 & ~(flat & mask)
-    vbif        d21, d23, d16              ; t_oq0 |= f_oq0 & ~(flat & mask)
-    vbif        d22, d26, d16              ; t_oq1 |= f_oq1 & ~(flat & mask)
-
-    vbit        d23, d27, d16              ; t_oq2 |= r_oq2 & (flat & mask)
-    vbif        d23, d10, d16              ; t_oq2 |= q2 & ~(flat & mask)
-
-    tst         r7, #2                     ; NOTE(review): bit 1 of r7 is never set in this function,
-    bxne        lr                         ; so this return looks unreachable - confirm
-
-    ; wide_mbfilter flat2 && flat && mask branch
-    vmov.u8     d16, #7
-    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
-    vaddl.u8    q12, d2, d3                ; p5 + p4
-    vaddl.u8    q13, d4, d5                ; p3 + p2
-    vaddl.u8    q14, d1, d6                ; p6 + p1
-    vmlal.u8    q15, d0, d16               ; op6 += p7 * 7
-    vadd.i16    q12, q13                   ; p5 + p4 + p3 + p2
-    vadd.i16    q15, q14                   ; op6 += p6 + p1
-    vaddl.u8    q14, d2, d9                ; p5 + q1 (enters the sum for op5)
-    vadd.i16    q15, q12                   ; op6 += p5 + p4 + p3 + p2
-    vaddl.u8    q12, d0, d1                ; p7 + p6 (leaves the sum for op5)
-    vaddw.u8    q15, d1                    ; op6 += p6 (second copy)
-    vaddl.u8    q13, d0, d2                ; p7 + p5 (leaves the sum for op4)
-    vadd.i16    q14, q15, q14              ; op6 + p5 + q1
-    vqrshrn.u16 d16, q15, #4               ; w_op6
-
-    vsub.i16    q15, q14, q12              ; op5 = op6 + p5 + q1 - p7 - p6
-    vaddl.u8    q14, d3, d10               ; p4 + q2 (enters the sum for op4)
-    vqrshrn.u16 d24, q15, #4               ; w_op5
-
-    vsub.i16    q15, q13                   ; op4 = op5 - p7 - p5
-    vaddl.u8    q13, d0, d3                ; p7 + p4 (leaves the sum for op3)
-    vadd.i16    q15, q14                   ; op4 += p4 + q2
-    vaddl.u8    q14, d4, d11               ; p3 + q3 (enters the sum for op3)
-    vqrshrn.u16 d25, q15, #4               ; w_op4
-
-    vadd.i16    q15, q14                   ; op3 = op4 + p3 + q3
-    vaddl.u8    q14, d0, d4                ; p7 + p3 (leaves the sum for op2)
-    vsub.i16    q15, q13                   ; op3 -= p7 + p4
-    vsub.i16    q14, q15, q14              ; op3 - p7 - p3
-    vqrshrn.u16 d26, q15, #4               ; w_op3
-
-    vaddw.u8    q15, q14, d5               ; op2 += p2
-    vaddl.u8    q14, d0, d5                ; p7 + p2 (leaves the sum for op1)
-    vaddw.u8    q15, d12                   ; op2 += q4
-    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
-    vqrshrn.u16 d27, q15, #4               ; w_op2
-
-    vsub.i16    q15, q14                   ; op1 = op2 - p7 - p2
-    vaddl.u8    q14, d0, d6                ; p7 + p1 (leaves the sum for op0)
-    vaddw.u8    q15, d6                    ; op1 += p1
-    vaddw.u8    q15, d13                   ; op1 += q5
-    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
-    vqrshrn.u16 d18, q15, #4               ; w_op1
-
-    vsub.i16    q15, q14                   ; op0 = op1 - p7 - p1
-    vaddl.u8    q14, d0, d7                ; p7 + p0 (leaves the sum for oq0)
-    vaddw.u8    q15, d7                    ; op0 += p0
-    vaddw.u8    q15, d14                   ; op0 += q6
-    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
-    vqrshrn.u16 d19, q15, #4               ; w_op0
-
-    vsub.i16    q15, q14                   ; oq0 = op0 - p7 - p0
-    vaddl.u8    q14, d1, d8                ; p6 + q0 (leaves the sum for oq1)
-    vaddw.u8    q15, d8                    ; oq0 += q0
-    vaddw.u8    q15, d15                   ; oq0 += q7
-    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
-    vqrshrn.u16 d20, q15, #4               ; w_oq0
-
-    vsub.i16    q15, q14                   ; oq1 = oq0 - p6 - q0
-    vaddl.u8    q14, d2, d9                ; p5 + q1 (leaves the sum for oq2)
-    vaddw.u8    q15, d9                    ; oq1 += q1
-    vaddl.u8    q4, d10, d15               ; q2 + q7; q4 (d8/d9 = q0/q1) is scratch from here on
-    vaddw.u8    q15, d15                   ; oq1 += q7
-    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
-    vqrshrn.u16 d21, q15, #4               ; w_oq1
-
-    vsub.i16    q15, q14                   ; oq2 = oq1 - p5 - q1
-    vaddl.u8    q14, d3, d10               ; p4 + q2 (leaves the sum for oq3)
-    vadd.i16    q15, q4                    ; oq2 += q2 + q7
-    vaddl.u8    q4, d11, d15               ; q3 + q7
-    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
-    vqrshrn.u16 d22, q15, #4               ; w_oq2
-
-    vsub.i16    q15, q14                   ; oq3 = oq2 - p4 - q2
-    vaddl.u8    q14, d4, d11               ; p3 + q3 (leaves the sum for oq4)
-    vadd.i16    q15, q4                    ; oq3 += q3 + q7
-    vaddl.u8    q4, d12, d15               ; q4 + q7
-    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
-    vqrshrn.u16 d23, q15, #4               ; w_oq3
-
-    vsub.i16    q15, q14                   ; oq4 = oq3 - p3 - q3
-    vaddl.u8    q14, d5, d12               ; p2 + q4 (leaves the sum for oq5)
-    vadd.i16    q15, q4                    ; oq4 += q4 + q7
-    vaddl.u8    q4, d13, d15               ; q5 + q7
-    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
-    vqrshrn.u16 d1, q15, #4                ; w_oq4
-
-    vsub.i16    q15, q14                   ; oq5 = oq4 - p2 - q4
-    vaddl.u8    q14, d6, d13               ; p1 + q5 (leaves the sum for oq6)
-    vadd.i16    q15, q4                    ; oq5 += q5 + q7
-    vaddl.u8    q4, d14, d15               ; q6 + q7
-    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
-    vqrshrn.u16 d2, q15, #4                ; w_oq5
-
-    vsub.i16    q15, q14                   ; oq6 = oq5 - p1 - q5
-    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
-    vadd.i16    q15, q4                    ; oq6 += q6 + q7
-    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
-    vqrshrn.u16 d3, q15, #4                ; w_oq6
-    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
-    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
-    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
-
-    bx          lr
-    ENDP        ; |vp9_wide_mbfilter_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -1,198 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_short_idct16x16_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
-;                                    int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|vp9_short_idct16x16_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 6)
-    add              r0, r0, #32               ; + (1 <<((6) - 1))
-    asr              r0, r0, #6                ; >> 6
-
-    vdup.s16         q0, r0                    ; duplicate a1
-    mov              r0, #8
-    sub              r2, #8
-
-    ; load destination data row0 - row3
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row4 - row7
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row8 - row11
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row12 - row15
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |vp9_short_idct16x16_1_add_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
--- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
--- a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
@@ -1,68 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_short_idct4x4_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
-;                                  int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|vp9_short_idct4x4_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 4)
-    add              r0, r0, #8                ; + (1 <<((4) - 1))
-    asr              r0, r0, #4                ; >> 4
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    vld1.32          {d2[0]}, [r1], r2
-    vld1.32          {d2[1]}, [r1], r2
-    vld1.32          {d4[0]}, [r1], r2
-    vld1.32          {d4[1]}, [r1]
-
-    vaddw.u8         q8, q0, d2                ; dest[x] + a1
-    vaddw.u8         q9, q0, d4
-
-    vqmovun.s16      d6, q8                    ; clip_pixel
-    vqmovun.s16      d7, q9
-
-    vst1.32          {d6[0]}, [r12], r2
-    vst1.32          {d6[1]}, [r12], r2
-    vst1.32          {d7[0]}, [r12], r2
-    vst1.32          {d7[1]}, [r12]
-
-    bx               lr
-    ENDP             ; |vp9_short_idct4x4_1_add_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
@@ -1,190 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_short_idct4x4_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|vp9_short_idct4x4_add_neon| PROC
-
-    ; The 2D transform is done with two passes which are actually pretty
-    ; similar. We first transform the rows. This is done by transposing
-    ; the inputs, doing an SIMD column transform (the columns are the
-    ; transposed rows) and then transpose the results (so that it goes back
-    ; in normal/row positions). Then, we transform the columns by doing
-    ; another SIMD column transform.
-    ; So, two passes of a transpose followed by a column transform.
-
-    ; load the inputs into q8-q9, d16-d19
-    vld1.s16        {q8,q9}, [r0]!
-
-    ; generate scalar constants
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov             r0, #0x3b00
-    add             r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov             r3, #0x2d00
-    add             r3, #0x41
-    ; cospi_24_64 = 6270 = 0x 187e
-    mov             r12, #0x1800
-    add             r12, #0x7e
-
-    ; transpose the input data
-    ; 00 01 02 03   d16
-    ; 10 11 12 13   d17
-    ; 20 21 22 23   d18
-    ; 30 31 32 33   d19
-    vtrn.16         d16, d17
-    vtrn.16         d18, d19
-
-    ; generate constant vectors
-    vdup.16         d20, r0         ; replicate cospi_8_64
-    vdup.16         d21, r3         ; replicate cospi_16_64
-
-    ; 00 10 02 12   d16
-    ; 01 11 03 13   d17
-    ; 20 30 22 32   d18
-    ; 21 31 23 33   d19
-    vtrn.32         q8, q9
-    ; 00 10 20 30   d16
-    ; 01 11 21 31   d17
-    ; 02 12 22 32   d18
-    ; 03 13 23 33   d19
-
-    vdup.16         d22, r12        ; replicate cospi_24_64
-
-    ; do the transform on transposed rows
-
-    ; stage 1
-    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
-
-    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
-    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
-
-    ; (input[0] + input[2]) * cospi_16_64;
-    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
-    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
-    vmlsl.s16 q15, d19, d20
-    vmlal.s16 q1,  d19, d22
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q1,  #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16 q8,  q13, q14
-    vsub.s16 q9,  q13, q14
-    vswp     d18, d19
-
-    ; transpose the results
-    ; 00 01 02 03   d16
-    ; 10 11 12 13   d17
-    ; 20 21 22 23   d18
-    ; 30 31 32 33   d19
-    vtrn.16         d16, d17
-    vtrn.16         d18, d19
-    ; 00 10 02 12   d16
-    ; 01 11 03 13   d17
-    ; 20 30 22 32   d18
-    ; 21 31 23 33   d19
-    vtrn.32         q8, q9
-    ; 00 10 20 30   d16
-    ; 01 11 21 31   d17
-    ; 02 12 22 32   d18
-    ; 03 13 23 33   d19
-
-    ; do the transform on columns
-
-    ; stage 1
-    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
-
-    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
-    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
-
-    ; (input[0] + input[2]) * cospi_16_64;
-    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
-    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
-    vmlsl.s16 q15, d19, d20
-    vmlal.s16 q1,  d19, d22
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q1,  #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16 q8,  q13, q14
-    vsub.s16 q9,  q13, q14
-
-    ; The results are in two registers, one of them being swapped. This will
-    ; be taken care of by loading the 'dest' value in a swapped fashion and
-    ; also storing them in the same swapped fashion.
-    ; temp_out[0, 1] = d16, d17 = q8
-    ; temp_out[2, 3] = d19, d18 = q9 swapped
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
-    vrshr.s16 q8, q8, #4
-    vrshr.s16 q9, q9, #4
-
-    vld1.32 {d26[0]}, [r1], r2
-    vld1.32 {d26[1]}, [r1], r2
-    vld1.32 {d27[1]}, [r1], r2
-    vld1.32 {d27[0]}, [r1]  ; no post-increment
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
-    vaddw.u8 q8, q8, d26
-    vaddw.u8 q9, q9, d27
-
-    ; clip_pixel
-    vqmovun.s16 d26, q8
-    vqmovun.s16 d27, q9
-
-    ; do the stores in reverse order with negative post-increment, by changing
-    ; the sign of the stride
-    rsb r2, r2, #0
-    vst1.32 {d27[0]}, [r1], r2
-    vst1.32 {d27[1]}, [r1], r2
-    vst1.32 {d26[1]}, [r1], r2
-    vst1.32 {d26[0]}, [r1]  ; no post-increment
-    bx              lr
-    ENDP  ; |vp9_short_idct4x4_add_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -1,88 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_short_idct8x8_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
-;                                  int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|vp9_short_idct8x8_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 5)
-    add              r0, r0, #16               ; + (1 <<((5) - 1))
-    asr              r0, r0, #5                ; >> 5
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    ; load destination data
-    vld1.64          {d2}, [r1], r2
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r2
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r2
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r2
-    vld1.64          {d17}, [r1]
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |vp9_short_idct8x8_1_add_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -1,519 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_short_idct8x8_add_neon|
-    EXPORT  |vp9_short_idct10_8x8_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
-    ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
-    ; This macro will touch q0-q7 registers and use them as buffer during
-    ; calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d14, q2, #14              ; >> 14
-    vqrshrn.s32     d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q13, #14              ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[2]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d18, q2, #14              ; >> 14
-    vqrshrn.s32     d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q13, #14              ; >> 14
-    vqrshrn.s32     d23, q15, #14              ; >> 14
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    ; input[1] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[1] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d26, q2, #14              ; >> 14
-    vqrshrn.s32     d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q8, #14              ; >> 14
-    vqrshrn.s32     d31, q12, #14              ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14              ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|vp9_short_idct8x8_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-
-    ; First transform rows
-    IDCT8x8_1D
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |vp9_short_idct8x8_add_neon|
-
-;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-; The row pass below reads only q8-q11 after the transpose; columns use IDCT8x8_1D.
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|vp9_short_idct10_8x8_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; generate cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-
-    ; First transform rows
-    ; stage 1
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
-    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
-    ; to double the constants before multiplying to compensate for this.
-    mov             r12, r3, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
-    mov             r12, r4, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_28_64)
-    vqrdmulh.s16    q4, q9, q0
-
-    mov             r12, r6, lsl #1
-    rsb             r12, #0
-    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_4_64)
-    vqrdmulh.s16    q7, q9, q1
-
-    mov             r12, r5, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
-
-    ; dct_const_round_shift(- input[3] * cospi_20_64)
-    vqrdmulh.s16    q5, q11, q0
-
-    mov             r12, r7, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
-
-    ; dct_const_round_shift(input[3] * cospi_12_64)
-    vqrdmulh.s16    q6, q11, q1
-
-    ; stage 2 & stage 3 - even half
-    mov             r12, r8, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrdmulh.s16    q9, q8, q0
-
-    mov             r12, r9, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
-
-    ; dct_const_round_shift(input[2] * cospi_24_64)
-    vqrdmulh.s16    q13, q10, q1
-
-    ; dct_const_round_shift(input[2] * cospi_8_64)
-    vqrdmulh.s16    q15, q10, q0
-
-    ; stage 3 - odd half (d16 is free: q8 was consumed above)
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64 (accumulator for the difference)
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64 (accumulator for the sum)
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14              ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer (r1 is advanced by the loads, r0 is used for the stores)
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel (saturating narrow from s16 to u8)
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |vp9_short_idct10_8x8_add_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -1,237 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_short_iht4x4_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which is
-    ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
-    ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
-    ; into d16-d19 registers. This macro will touch q10-q15 registers and use
-    ; them as buffer during calculation.
-    MACRO
-    IDCT4x4_1D
-    ; stage 1
-    vadd.s16    d23, d16, d18   ; (input[0] + input[2])
-    vsub.s16    d24, d16, d18   ; (input[0] - input[2])
-
-    vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
-    vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
-    vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
-    vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14   ; step[0]
-    vqrshrn.s32 d27, q14, #14   ; step[1]
-    vqrshrn.s32 d29, q15, #14   ; step[2]
-    vqrshrn.s32 d28, q10, #14   ; step[3]
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16    q8,  q13, q14
-    vsub.s16    q9,  q13, q14
-    vswp        d18, d19        ; reorder so d18 = output[2], d19 = output[3]
-    MEND
-
-    ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which is
-    ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
-    ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9; r0 must also hold
-    ; sinpi_3_9. The output will be stored back into d16-d19 registers. This
-    ; macro will use q10-q15 registers as buffer during calculation.
-    MACRO
-    IADST4x4_1D
-    vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
-    vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
-    vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
-    vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
-    vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
-    vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
-    vaddw.s16   q15, q15, d19   ; x0 + x3
-    vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3
-    vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
-    vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3
-
-    vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
-    vadd.s32    q10, q10, q8
-    vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
-    vdup.32     q8, r0          ; duplicate sinpi_3_9 (32-bit lanes, from r0)
-    vsub.s32    q11, q11, q9
-    vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
-
-    vadd.s32    q13, q10, q12   ; s0 = x0 + x3
-    vadd.s32    q10, q10, q11   ; x0 + x1
-    vadd.s32    q14, q11, q12   ; s1 = x1 + x3
-    vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d16, q13, #14
-    vqrshrn.s32 d17, q14, #14
-    vqrshrn.s32 d18, q15, #14
-    vqrshrn.s32 d19, q10, #14
-    MEND
-
-    ; Generate cosine constants in d0 - d2 for the IDCT (uses r0, r3, r12).
-    MACRO
-    GENERATE_COSINE_CONSTANTS
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov         r0, #0x3b00
-    add         r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov         r3, #0x2d00
-    add         r3, #0x41
-    ; cospi_24_64 = 6270 = 0x187e
-    mov         r12, #0x1800
-    add         r12, #0x7e
-
-    ; generate constant vectors
-    vdup.16     d0, r0          ; duplicate cospi_8_64
-    vdup.16     d1, r3          ; duplicate cospi_16_64
-    vdup.16     d2, r12         ; duplicate cospi_24_64
-    MEND
-
-    ; Generate sine constants in d3 - d5 and q3 for the IADST; r0 ends as sinpi_3_9.
-    MACRO
-    GENERATE_SINE_CONSTANTS
-    ; sinpi_1_9 = 5283 = 0x14A3
-    mov         r0, #0x1400
-    add         r0, #0xa3
-    ; sinpi_2_9 = 9929 = 0x26C9
-    mov         r3, #0x2600
-    add         r3, #0xc9
-    ; sinpi_4_9 = 15212 = 0x3B6C
-    mov         r12, #0x3b00
-    add         r12, #0x6c
-
-    ; generate constant vectors
-    vdup.16     d3, r0          ; duplicate sinpi_1_9
-
-    ; sinpi_3_9 = 13377 = 0x3441 (r0 is reused now that d3 has been filled)
-    mov         r0, #0x3400
-    add         r0, #0x41
-
-    vdup.16     d4, r3          ; duplicate sinpi_2_9
-    vdup.16     d5, r12         ; duplicate sinpi_4_9
-    vdup.16     q3, r0          ; duplicate sinpi_3_9 (fills d6 and d7)
-    MEND
-
-    ; Transpose a 4x4 16bits data matrix in place. Data is held in d16-d19.
-    MACRO
-    TRANSPOSE4X4
-    vtrn.16     d16, d17
-    vtrn.16     d18, d19
-    vtrn.32     q8, q9
-    MEND
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
-;                               int dest_stride, int tx_type)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-; r3  int tx_type
-; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht4x4_add_neon| PROC
-
-    ; load the inputs into d16-d19
-    vld1.s16    {q8,q9}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE4X4
-
-    ; decide the type of transform (any value other than 2 or 3 falls through)
-    cmp         r3, #2
-    beq         idct_iadst
-    cmp         r3, #3
-    beq         iadst_iadst
-
-iadst_idct
-    ; generate constants (sine last, so r0 is left holding sinpi_3_9)
-    GENERATE_COSINE_CONSTANTS
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IDCT4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IADST4x4_1D
-
-    b end_vp9_short_iht4x4_add_neon
-
-idct_iadst
-    ; generate constants (sine last, so r0 is left holding sinpi_3_9)
-    GENERATE_COSINE_CONSTANTS
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IADST4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IDCT4x4_1D
-
-    b end_vp9_short_iht4x4_add_neon
-
-iadst_iadst
-    ; generate constants
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IADST4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IADST4x4_1D
-
-end_vp9_short_iht4x4_add_neon
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
-    vrshr.s16   q8, q8, #4
-    vrshr.s16   q9, q9, #4
-
-    ; load four 4-byte destination rows; r1 is left pointing at the last row
-    vld1.32     {d26[0]}, [r1], r2
-    vld1.32     {d26[1]}, [r1], r2
-    vld1.32     {d27[0]}, [r1], r2
-    vld1.32     {d27[1]}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
-    vaddw.u8    q8, q8, d26
-    vaddw.u8    q9, q9, d27
-
-    ; clip_pixel
-    vqmovun.s16 d26, q8
-    vqmovun.s16 d27, q9
-
-    ; do the stores in reverse order with negative post-increment, by changing
-    ; the sign of the stride
-    rsb         r2, r2, #0
-    vst1.32     {d27[1]}, [r1], r2
-    vst1.32     {d27[0]}, [r1], r2
-    vst1.32     {d26[1]}, [r1], r2
-    vst1.32     {d26[0]}, [r1]  ; no post-increment
-    bx          lr
-    ENDP  ; |vp9_short_iht4x4_add_neon|
-
-    END
--- a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -1,696 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_short_iht8x8_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Generate IADST constants in r0 - r10 and r12 for the IADST.
-    MACRO
-    GENERATE_IADST_CONSTANTS
-    ; generate cospi_2_64  = 16305 = 0x3fb1
-    mov             r0, #0x3f00
-    add             r0, #0xb1
-
-    ; generate cospi_30_64 = 1606 = 0x0646
-    mov             r1, #0x600
-    add             r1, #0x46
-
-    ; generate cospi_10_64 = 14449 = 0x3871
-    mov             r2, #0x3800
-    add             r2, #0x71
-
-    ; generate cospi_22_64 = 7723 = 0x1e2b
-    mov             r3, #0x1e00
-    add             r3, #0x2b
-
-    ; generate cospi_18_64 = 10394 = 0x289a
-    mov             r4, #0x2800
-    add             r4, #0x9a
-
-    ; generate cospi_14_64 = 12665 = 0x3179
-    mov             r5, #0x3100
-    add             r5, #0x79
-
-    ; generate cospi_26_64 = 4756 = 0x1294
-    mov             r6, #0x1200
-    add             r6, #0x94
-
-    ; generate cospi_6_64  = 15679 = 0x3d3f
-    mov             r7, #0x3d00
-    add             r7, #0x3f
-
-    ; generate cospi_8_64  = 15137 = 0x3b21
-    mov             r8, #0x3b00
-    add             r8, #0x21
-
-    ; generate cospi_24_64 = 6270 = 0x187e
-    mov             r9, #0x1800
-    add             r9, #0x7e
-
-    ; generate 0
-    mov             r10, #0
-
-    ; generate cospi_16_64 = 11585 = 0x2d41
-    mov             r12, #0x2d00
-    add             r12, #0x41
-    MEND
-
-    ; Generate IDCT constants in r3 - r9 for the IDCT.
-    MACRO
-    GENERATE_IDCT_CONSTANTS
-    ; generate cospi_28_64 = 3196 = 0x0c7c
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069 = 0x3ec5
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623 = 0x3537
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102 = 0x238e
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585 = 0x2d41
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270 = 0x187e
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137 = 0x3b21
-    mov             r9, #0x3b00
-    add             r9, #0x21
-    MEND
-
-    ; Transpose an 8x8 16bits data matrix in place. Data is held in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which is
-    ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
-    ; will be stored back into q8-q15 registers. This macro will touch q0-q7
-    ; registers and use them as buffer during calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input[1]*cospi_28_64-input[7]*cospi_4_64)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input[5] * cospi_12_64 - input[3] * cospi_20_64)
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input[1]*cospi_4_64+input[7]*cospi_28_64)
-    vqrshrn.s32     d14, q2, #14              ; >> 14
-    vqrshrn.s32     d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input[5] * cospi_20_64 + input[3] * cospi_12_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q13, #14             ; >> 14
-
-    ; input[0] * cospi_16_64 (accumulator for the sum)
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64 (accumulator for the difference)
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[4]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[4]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift((input[0] + input[4]) * cospi_16_64)
-    vqrshrn.s32     d18, q2, #14              ; >> 14
-    vqrshrn.s32     d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift((input[0] - input[4]) * cospi_16_64)
-    vqrshrn.s32     d22, q13, #14             ; >> 14
-    vqrshrn.s32     d23, q15, #14             ; >> 14
-
-    ; input[2] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[2] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[2] * cospi_24_64 - input[6] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[2] * cospi_8_64 + input[6] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input[2] * cospi_24_64 - input[6] * cospi_8_64)
-    vqrshrn.s32     d26, q2, #14              ; >> 14
-    vqrshrn.s32     d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input[2] * cospi_8_64 + input[6] * cospi_24_64)
-    vqrshrn.s32     d30, q8, #14              ; >> 14
-    vqrshrn.s32     d31, q12, #14             ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 - odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64 (accumulator for the difference)
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64 (accumulator for the sum)
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14             ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which is
-    ; loaded in q8-q15. IADST constants are loaded in r0 - r10 and r12. The
-    ; output will be stored back into q8-q15 registers. This macro will touch
-    ; q0 - q7 registers and use them as buffer during calculation.
-    MACRO
-    IADST8X8_1D
-    vdup.16         d14, r0                   ; duplicate cospi_2_64
-    vdup.16         d15, r1                   ; duplicate cospi_30_64
-
-    ; cospi_2_64  * x0
-    vmull.s16       q1, d30, d14
-    vmull.s16       q2, d31, d14
-
-    ; cospi_30_64 * x0
-    vmull.s16       q3, d30, d15
-    vmull.s16       q4, d31, d15
-
-    vdup.16         d30, r4                   ; duplicate cospi_18_64
-    vdup.16         d31, r5                   ; duplicate cospi_14_64
-
-    ; s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-    vmlal.s16       q1, d16, d15
-    vmlal.s16       q2, d17, d15
-
-    ; s1 = cospi_30_64 * x0 - cospi_2_64  * x1
-    vmlsl.s16       q3, d16, d14
-    vmlsl.s16       q4, d17, d14
-
-    ; cospi_18_64 * x4
-    vmull.s16       q5, d22, d30
-    vmull.s16       q6, d23, d30
-
-    ; cospi_14_64 * x4
-    vmull.s16       q7, d22, d31
-    vmull.s16       q8, d23, d31
-
-    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-    vmlal.s16       q5, d24, d31
-    vmlal.s16       q6, d25, d31
-
-    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
-    vmlsl.s16       q7, d24, d30
-    vmlsl.s16       q8, d25, d30
-
-    ; (s0 + s4)
-    vadd.s32        q11, q1, q5
-    vadd.s32        q12, q2, q6
-
-    vdup.16         d0, r2                   ; duplicate cospi_10_64
-    vdup.16         d1, r3                   ; duplicate cospi_22_64
-
-    ; (s0 - s4)
-    vsub.s32        q1, q1, q5
-    vsub.s32        q2, q2, q6
-
-    ; x0 = dct_const_round_shift(s0 + s4);
-    vqrshrn.s32     d22, q11, #14             ; >> 14
-    vqrshrn.s32     d23, q12, #14             ; >> 14
-
-    ; (s1 + s5)
-    vadd.s32        q12, q3, q7
-    vadd.s32        q15, q4, q8
-
-    ; (s1 - s5)
-    vsub.s32        q3, q3, q7
-    vsub.s32        q4, q4, q8
-
-    ; x4 = dct_const_round_shift(s0 - s4);
-    vqrshrn.s32     d2, q1, #14               ; >> 14
-    vqrshrn.s32     d3, q2, #14               ; >> 14
-
-    ; x1 = dct_const_round_shift(s1 + s5);
-    vqrshrn.s32     d24, q12, #14             ; >> 14
-    vqrshrn.s32     d25, q15, #14             ; >> 14
-
-    ; x5 = dct_const_round_shift(s1 - s5);
-    vqrshrn.s32     d6, q3, #14               ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; cospi_10_64 * x2
-    vmull.s16       q4, d26, d0
-    vmull.s16       q5, d27, d0
-
-    ; cospi_22_64 * x2
-    vmull.s16       q2, d26, d1
-    vmull.s16       q6, d27, d1
-
-    vdup.16         d30, r6                   ; duplicate cospi_26_64
-    vdup.16         d31, r7                   ; duplicate cospi_6_64
-
-    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-    vmlal.s16       q4, d20, d1
-    vmlal.s16       q5, d21, d1
-
-    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-    vmlsl.s16       q2, d20, d0
-    vmlsl.s16       q6, d21, d0
-
-    ; cospi_26_64 * x6
-    vmull.s16       q0, d18, d30
-    vmull.s16       q13, d19, d30
-
-    ; s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-    vmlal.s16       q0, d28, d31
-    vmlal.s16       q13, d29, d31
-
-    ; cospi_6_64  * x6
-    vmull.s16       q10, d18, d31
-    vmull.s16       q9, d19, d31
-
-    ; s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-    vmlsl.s16       q10, d28, d30
-    vmlsl.s16       q9, d29, d30
-
-    ; (s3 + s7)
-    vadd.s32        q14, q2, q10
-    vadd.s32        q15, q6, q9
-
-    ; (s3 - s7)
-    vsub.s32        q2, q2, q10
-    vsub.s32        q6, q6, q9
-
-    ; x3 = dct_const_round_shift(s3 + s7);
-    vqrshrn.s32     d28, q14, #14             ; >> 14
-    vqrshrn.s32     d29, q15, #14             ; >> 14
-
-    ; x7 = dct_const_round_shift(s3 - s7);
-    vqrshrn.s32     d4, q2, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; (s2 + s6)
-    vadd.s32        q9, q4, q0
-    vadd.s32        q10, q5, q13
-
-    ; (s2 - s6)
-    vsub.s32        q4, q4, q0
-    vsub.s32        q5, q5, q13
-
-    vdup.16         d30, r8                   ; duplicate cospi_8_64
-    vdup.16         d31, r9                   ; duplicate cospi_24_64
-
-    ; x2 = dct_const_round_shift(s2 + s6);
-    vqrshrn.s32     d18, q9, #14              ; >> 14
-    vqrshrn.s32     d19, q10, #14             ; >> 14
-
-    ; x6 = dct_const_round_shift(s2 - s6);
-    vqrshrn.s32     d8, q4, #14               ; >> 14
-    vqrshrn.s32     d9, q5, #14               ; >> 14
-
-    ; cospi_8_64  * x4
-    vmull.s16       q5, d2, d30
-    vmull.s16       q6, d3, d30
-
-    ; cospi_24_64 * x4
-    vmull.s16       q7, d2, d31
-    vmull.s16       q0, d3, d31
-
-    ; s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-    vmlal.s16       q5, d6, d31
-    vmlal.s16       q6, d7, d31
-
-    ; s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-    vmlsl.s16       q7, d6, d30
-    vmlsl.s16       q0, d7, d30
-
-    ; cospi_8_64  * x7
-    vmull.s16       q1, d4, d30
-    vmull.s16       q3, d5, d30
-
-    ; cospi_24_64 * x7
-    vmull.s16       q10, d4, d31
-    vmull.s16       q2, d5, d31
-
-    ; s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-    vmlsl.s16       q1, d8, d31
-    vmlsl.s16       q3, d9, d31
-
-    ; s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-    vmlal.s16       q10, d8, d30
-    vmlal.s16       q2, d9, d30
-
-    vadd.s16        q8, q11, q9               ; x0 = s0 + s2;
-
-    vsub.s16        q11, q11, q9              ; x2 = s0 - s2;
-
-    vadd.s16        q4, q12, q14              ; x1 = s1 + s3;
-
-    vsub.s16        q12, q12, q14             ; x3 = s1 - s3;
-
-    ; (s4 + s6)
-    vadd.s32        q14, q5, q1
-    vadd.s32        q15, q6, q3
-
-    ; (s4 - s6)
-    vsub.s32        q5, q5, q1
-    vsub.s32        q6, q6, q3
-
-    ; x4 = dct_const_round_shift(s4 + s6);
-    vqrshrn.s32     d18, q14, #14             ; >> 14
-    vqrshrn.s32     d19, q15, #14             ; >> 14
-
-    ; x6 = dct_const_round_shift(s4 - s6);
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; (s5 + s7)
-    vadd.s32        q1, q7, q10
-    vadd.s32        q3, q0, q2
-
-    ; (s5 - s7)
-    vsub.s32        q7, q7, q10
-    vsub.s32        q0, q0, q2
-
-    ; x5 = dct_const_round_shift(s5 + s7);
-    vqrshrn.s32     d28, q1, #14               ; >> 14
-    vqrshrn.s32     d29, q3, #14               ; >> 14
-
-    ; x7 = dct_const_round_shift(s5 - s7);
-    vqrshrn.s32     d14, q7, #14              ; >> 14
-    vqrshrn.s32     d15, q0, #14              ; >> 14
-
-    vdup.16         d30, r12                  ; duplicate cospi_16_64
-
-    ; cospi_16_64 * x2
-    vmull.s16       q2, d22, d30
-    vmull.s16       q3, d23, d30
-
-    ; cospi_16_64 * x2
-    vmull.s16       q13, d22, d30
-    vmull.s16       q1, d23, d30
-
-    ; cospi_16_64 * x2 + cospi_16_64  * x3;
-    vmlal.s16       q2, d24, d30
-    vmlal.s16       q3, d25, d30
-
-    ; cospi_16_64 * x2 - cospi_16_64  * x3;
-    vmlsl.s16       q13, d24, d30
-    vmlsl.s16       q1, d25, d30
-
-    ; x2 = dct_const_round_shift(s2);
-    vqrshrn.s32     d4, q2, #14               ; >> 14
-    vqrshrn.s32     d5, q3, #14               ; >> 14
-
-    ;x3 = dct_const_round_shift(s3);
-    vqrshrn.s32     d24, q13, #14             ; >> 14
-    vqrshrn.s32     d25, q1, #14              ; >> 14
-
-    ; cospi_16_64 * x6
-    vmull.s16       q13, d10, d30
-    vmull.s16       q1, d11, d30
-
-    ; cospi_16_64 * x6
-    vmull.s16       q11, d10, d30
-    vmull.s16       q0, d11, d30
-
-    ; cospi_16_64 * x6 + cospi_16_64  * x7;
-    vmlal.s16       q13, d14, d30
-    vmlal.s16       q1, d15, d30
-
-    ; cospi_16_64 * x6 - cospi_16_64  * x7;
-    vmlsl.s16       q11, d14, d30
-    vmlsl.s16       q0, d15, d30
-
-    ; x6 = dct_const_round_shift(s6);
-    vqrshrn.s32     d20, q13, #14             ; >> 14
-    vqrshrn.s32     d21, q1, #14              ; >> 14
-
-    ;x7 = dct_const_round_shift(s7);
-    vqrshrn.s32     d12, q11, #14             ; >> 14
-    vqrshrn.s32     d13, q0, #14              ; >> 14
-
-    vdup.16         q5, r10                   ; duplicate 0
-
-    vsub.s16        q9, q5, q9                ; output[1] = -x4;
-    vsub.s16        q11, q5, q2               ; output[3] = -x2;
-    vsub.s16        q13, q5, q6               ; output[5] = -x7;
-    vsub.s16        q15, q5, q4               ; output[7] = -x1;
-    MEND
-
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
-;                               int dest_stride, int tx_type)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-; r3  int tx_type
-; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht8x8_add_neon| PROC
-
-    ; load the 8x8 inputs into q8-q15
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    push            {r0-r10}                  ; the constant macros overwrite r0-r10
-    vpush           {d8-d15}                  ; callee-saved; q4-q7 are scratch in the 1D macros
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; decide the type of transform (any value other than 2 or 3 falls through)
-    cmp         r3, #2
-    beq         idct_iadst
-    cmp         r3, #3
-    beq         iadst_iadst
-
-iadst_idct
-    ; generate IDCT constants
-    GENERATE_IDCT_CONSTANTS
-
-    ; first transform rows
-    IDCT8x8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; then transform columns
-    IADST8X8_1D
-
-    b end_vp9_short_iht8x8_add_neon
-
-idct_iadst
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; first transform rows
-    IADST8X8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; generate IDCT constants
-    GENERATE_IDCT_CONSTANTS
-
-    ; then transform columns
-    IDCT8x8_1D
-
-    b end_vp9_short_iht8x8_add_neon
-
-iadst_iadst
-    ; generate IADST constants (still valid for the second pass)
-    GENERATE_IADST_CONSTANTS
-
-    ; first transform rows
-    IADST8X8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; then transform columns
-    IADST8X8_1D
-
-end_vp9_short_iht8x8_add_neon
-    vpop           {d8-d15}                   ; restore in reverse order of the saves
-    pop            {r0-r10}
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer (r1 is advanced by the loads, r0 is used for the stores)
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel (saturating narrow from s16 to u8)
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-    bx          lr
-    ENDP  ; |vp9_short_iht8x8_add_neon|
-
-    END
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@@ -13,7 +13,6 @@
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_onyxc_int.h"

-void vp9_machine_specific_config(VP9_COMMON *cm) {
-  (void)cm;
+void vp9_machine_specific_config(VP9_COMMON *ctx) {
  vp9_rtcd();
 }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ami Fischman	28147a449a	libvpx: enable building for iOS devices (armv7) Allow output of gas syntax assembly directly from obj_int_extract Change-Id: I33a747e87ef1c590a8766dea17f8cb2497e54591	2013-07-19 14:05:59 -07:00
Ronald S. Bultje	33149cbb4c	Replace generated quant tables with static lookup tables. This prevents possible float rounding issues between architectures. Change-Id: I6ed260aebd49feb4cfb5596a5370c44be5f72167	2013-07-16 14:04:41 -07:00
John Koleszar	3f454060bb	Fix above context pointers In the prior code, the above context pointers used for entropy decoding were initialized on the first frame, and not updated when the frame size changed. The per-frame code which initializes the contexts assumes that the contexts are contiguous, leading to an incomplete initialization when the frame is smaller. This commit updates the pointers so that the context is contiguous whenever the frame size changes. Conflicts: vp9/common/vp9_alloccommon.c Change-Id: I08b53e3a30c8289491212311682ff1b8028cff6c	2013-07-16 14:04:41 -07:00
Yaowu Xu	d19ed5f249	Change to extend full border only when needed This is a short term optimization till we work out a decoder implementation requiring no frame border extension. Change-Id: I02d15bfde4d926b50a4e58b393d8c4062d1be70f	2013-07-16 14:04:39 -07:00
Ronald S. Bultje	a801f7a295	Increase border size from 96 to 160. This is required because upon downscaling, if a motion vector points partially into the UMV (e.g. all minus 1 of 64+7 pixels, i.e. 70), then we can point up to 140 pixels into the larger-resolution (2x) reference buffer UMV, which means the UMV for reference buffers in downscaling needs to be 140 rounded up to the nearest multiple of 32, i.e. 160. Longer-term, we should probably handle the UMV differently by detecting edge coverage on-the-fly and using a temporary buffer for edge extensions instead of adding 160 pixels on all sides of the image (which means a CIF image uses 3x its own area size for borders). Change-Id: I5184443e6731cd6721fc6a5d430a53e7d91b4f7e	2013-07-16 12:41:10 -07:00
Dmitry Kovalev	e39bd6407f	Fixing vp9_get_pred_context_comp_ref_p function. Adding missed parenthesis around boolean expressions. Bitstream is changed. Regenerating test vectors. Conflicts: vp9/common/vp9_pred_common.c Change-Id: I4cc00b761e9473f92f180a9fc3a0c607f0aaae56	2013-07-16 12:40:48 -07:00