BITSTREAM - CLARIFICATION OF MV SIZE RANGE

The codec should effectively run with motion vector of range (-2048, 2047) in full pixels, for sequences of 1080p and below. Add assertions to clarify this behavior. Change-Id: Ia0cac28249f587d8f8882205228fa480263ab313
Merge "Adding read_intra_mode_{y, uv} functions for clarity."
2013-10-02 10:29:45 -07:00 · 2013-10-02 09:17:10 -07:00 · 2013-10-01 18:34:36 -07:00 · 2013-10-01 17:55:48 -07:00 · 2013-10-01 16:15:03 -07:00 · 2013-10-01 16:14:58 -07:00
210 changed files with 16844 additions and 9658 deletions
--- a/build/make/armlink_adapter.sh
+++ b/build/make/armlink_adapter.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
@@ -13,20 +13,20 @@
 verbose=0
 set -- $*
 for i; do
-    if [ "$i" == "-o" ]; then
+    if [ "$i" = "-o" ]; then
        on_of=1
-    elif [ "$i" == "-v" ]; then
+    elif [ "$i" = "-v" ]; then
        verbose=1
-    elif [ "$i" == "-g" ]; then
+    elif [ "$i" = "-g" ]; then
        args="${args} --debug"
-    elif [ "$on_of" == "1" ]; then
+    elif [ "$on_of" = "1" ]; then
        outfile=$i
        on_of=0
    elif [ -f "$i" ]; then
        infiles="$infiles $i"
-    elif [ "${i:0:2}" == "-l" ]; then
+    elif [ "${i#-l}" != "$i" ]; then
        libs="$libs ${i#-l}"
-    elif [ "${i:0:2}" == "-L" ]; then
+    elif [ "${i#-L}" != "$i" ]; then
        libpaths="${libpaths} ${i#-L}"
    else
        args="${args} ${i}"
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  configure.sh
 ##
@@ -198,11 +198,11 @@ add_extralibs() {
 #
 # Boolean Manipulation Functions
 #
-enable(){
+enable_feature(){
    set_all yes $*
 }

-disable(){
+disable_feature(){
    set_all no $*
 }

@@ -219,7 +219,7 @@ soft_enable() {
    for var in $*; do
        if ! disabled $var; then
            log_echo "  enabling $var"
-            enable $var
+            enable_feature $var
        fi
    done
 }
@@ -228,7 +228,7 @@ soft_disable() {
    for var in $*; do
        if ! enabled $var; then
            log_echo "  disabling $var"
-            disable $var
+            disable_feature $var
        fi
    done
 }
@@ -251,10 +251,10 @@ tolower(){
 # Temporary File Functions
 #
 source_path=${0%/*}
-enable source_path_used
+enable_feature source_path_used
 if test -z "$source_path" -o "$source_path" = "." ; then
    source_path="`pwd`"
-    disable source_path_used
+    disable_feature source_path_used
 fi

 if test ! -z "$TMPDIR" ; then
@@ -264,12 +264,13 @@ elif test ! -z "$TEMPDIR" ; then
 else
    TMPDIRx="/tmp"
 fi
-TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
-TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
-TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
-TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
-TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
-TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"
+RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
+TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
+TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc"
+TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o"
+TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
+TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"

 clean_temp_files() {
    rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
@@ -316,8 +317,8 @@ check_header(){
    header=$1
    shift
    var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-    disable $var
-    check_cpp "$@" <<EOF && enable $var
+    disable_feature $var
+    check_cpp "$@" <<EOF && enable_feature $var
 #include "$header"
 int x;
 EOF
@@ -479,7 +480,7 @@ process_common_cmdline() {
    for opt in "$@"; do
        optval="${opt#*=}"
        case "$opt" in
-        --child) enable child
+        --child) enable_feature child
        ;;
        --log*)
        logging="$optval"
@@ -491,7 +492,7 @@ process_common_cmdline() {
        ;;
        --target=*) toolchain="${toolchain:-${optval}}"
        ;;
-        --force-target=*) toolchain="${toolchain:-${optval}}"; enable force_toolchain
+        --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
        ;;
        --cpu)
        ;;
@@ -511,7 +512,7 @@ process_common_cmdline() {
          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
            die_unknown $opt
        fi
-        $action $option
+        ${action}_feature $option
        ;;
        --require-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
@@ -523,11 +524,11 @@ process_common_cmdline() {
        ;;
        --force-enable-?*|--force-disable-?*)
        eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
-        $action $option
+        ${action}_feature $option
        ;;
        --libc=*)
        [ -d "${optval}" ] || die "Not a directory: ${optval}"
-        disable builtin_libc
+        disable_feature builtin_libc
        alt_libc="${optval}"
        ;;
        --as=*)
@@ -696,13 +697,13 @@ process_common_toolchain() {

    # Mark the specific ISA requested as enabled
    soft_enable ${tgt_isa}
-    enable ${tgt_os}
-    enable ${tgt_cc}
+    enable_feature ${tgt_os}
+    enable_feature ${tgt_cc}

    # Enable the architecture family
    case ${tgt_isa} in
-        arm*) enable arm;;
-        mips*) enable mips;;
+        arm*) enable_feature arm;;
+        mips*) enable_feature mips;;
    esac

    # PIC is probably what we want when building shared libs
@@ -765,7 +766,7 @@ process_common_toolchain() {
    case ${toolchain} in
        sparc-solaris-*)
            add_extralibs -lposix4
-            disable fast_unaligned
+            disable_feature fast_unaligned
            ;;
        *-solaris-*)
            add_extralibs -lposix4
@@ -790,7 +791,7 @@ process_common_toolchain() {
            ;;
        armv5te)
            soft_enable edsp
-            disable fast_unaligned
+            disable_feature fast_unaligned
            ;;
        esac

@@ -805,7 +806,7 @@ process_common_toolchain() {
            arch_int=${arch_int%%te}
            check_add_asflags --defsym ARCHITECTURE=${arch_int}
            tune_cflags="-mtune="
-            if [ ${tgt_isa} == "armv7" ]; then
+            if [ ${tgt_isa} = "armv7" ]; then
                if [ -z "${float_abi}" ]; then
                    check_cpp <<EOF && float_abi=hard || float_abi=softfp
 #ifndef __ARM_PCS_VFP
@@ -842,8 +843,8 @@ EOF
            asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
            AS_SFX=.s
            msvs_arch_dir=arm-msvs
-            disable multithread
-            disable unit_tests
+            disable_feature multithread
+            disable_feature unit_tests
            ;;
        rvct)
            CC=armcc
@@ -855,7 +856,7 @@ EOF
            tune_cflags="--cpu="
            tune_asflags="--cpu="
            if [ -z "${tune_cpu}" ]; then
-                if [ ${tgt_isa} == "armv7" ]; then
+                if [ ${tgt_isa} = "armv7" ]; then
                    if enabled neon
                    then
                        check_add_cflags --fpu=softvfp+vfpv3
@@ -880,8 +881,8 @@ EOF

        case ${tgt_os} in
        none*)
-            disable multithread
-            disable os_support
+            disable_feature multithread
+            disable_feature os_support
            ;;

        android*)
@@ -913,9 +914,9 @@ EOF
            # Cortex-A8 implementations (NDK Dev Guide)
            add_ldflags "-Wl,--fix-cortex-a8"

-            enable pic
+            enable_feature pic
            soft_enable realtime_only
-            if [ ${tgt_isa} == "armv7" ]; then
+            if [ ${tgt_isa} = "armv7" ]; then
                soft_enable runtime_cpu_detect
            fi
            if enabled runtime_cpu_detect; then
@@ -969,7 +970,7 @@ EOF
         ;;

        linux*)
-            enable linux
+            enable_feature linux
            if enabled rvct; then
                # Check if we have CodeSourcery GCC in PATH. Needed for
                # libraries
@@ -1000,14 +1001,14 @@ EOF
        tune_cflags="-mtune="
        if enabled dspr2; then
            check_add_cflags -mips32r2 -mdspr2
-            disable fast_unaligned
+            disable_feature fast_unaligned
        fi
        check_add_cflags -march=${tgt_isa}
        check_add_asflags -march=${tgt_isa}
        check_add_asflags -KPIC
    ;;
    ppc*)
-        enable ppc
+        enable_feature ppc
        bits=${tgt_isa##ppc}
        link_with_cc=gcc
        setup_gnu_toolchain
@@ -1061,7 +1062,7 @@ EOF
                setup_gnu_toolchain
                add_cflags -use-msasm -use-asm
                add_ldflags -i-static
-                enabled x86_64 && add_cflags -ipo -no-prec-div -static -xSSE2 -axSSE2
+                enabled x86_64 && add_cflags -ipo -static -O3
                enabled x86_64 && AR=xiar
                case ${tune_cpu} in
                    atom*)
@@ -1155,7 +1156,7 @@ EOF
    ;;
    universal*|*-gcc|generic-gnu)
        link_with_cc=gcc
-        enable gcc
+        enable_feature gcc
    setup_gnu_toolchain
    ;;
    esac
@@ -1191,7 +1192,7 @@ EOF

    # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
    echo "  checking here for x86inc \"${tgt_isa}\" \"$pic\" "
-    if [ ${tgt_isa} = x86_64 -o ! "$pic" == "yes" -o ! ${tgt_os:0:6} = darwin ]; then
+    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}"  ]; then
      soft_enable use_x86inc
    fi

@@ -1204,14 +1205,14 @@ EOF
    enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0

    # Check for strip utility variant
-    ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip
+    ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip

    # Try to determine target endianness
    check_cc <<EOF
    unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
 EOF
    [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
-        grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian
+        grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian

    # Try to find which inline keywords are supported
    check_cc <<EOF && INLINE="inline"
@@ -1236,7 +1237,7 @@ EOF
            if enabled dspr2; then
                if enabled big_endian; then
                    echo "dspr2 optimizations are available only for little endian platforms"
-                    disable dspr2
+                    disable_feature dspr2
                fi
            fi
        ;;
@@ -1287,8 +1288,8 @@ print_config_h() {

 print_webm_license() {
    local destination=$1
-    local prefix=$2
-    local suffix=$3
+    local prefix="$2"
+    local suffix="$3"
    shift 3
    cat <<EOF > ${destination}
 ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
@@ -1309,7 +1310,7 @@ process_detect() {
    true;
 }

-enable logging
+enable_feature logging
 logfile="config.log"
 self=$0
 process() {
--- a/build/make/gen_asm_deps.sh
+++ b/build/make/gen_asm_deps.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -290,9 +290,11 @@ static void setup_rtcd_internal(void)
 {
 $(set_function_pointers c $ALL_ARCHS)
 #if HAVE_DSPR2
+#if CONFIG_VP8
 void dsputil_static_init();
 dsputil_static_init();
 #endif
+#endif
 }
 #endif
 $(common_bottom)
--- a/build/make/thumb.pm
+++ b/build/make/thumb.pm
@@ -47,7 +47,7 @@ sub FixThumbInstructions($$)
    # this is used, it's used for two subsequent load instructions,
    # where a hand-written version of it could merge two subsequent
    # add and sub instructions.
-    s/^(\s*)((ldr|str)(ne)?)(\s+)(r\d+),\s*\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6, [$7]\n$1add$4$5$7, $7, $8/g;
+    s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,\s*)?\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g;

    # Convert register post indexing to a separate add instruction.
    # This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
--- a/build/make/version.sh
+++ b/build/make/version.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##
--- a/85
+++ b/85
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 ##
 ##  configure
 ##
@@ -38,6 +38,7 @@ Advanced options:
  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
  ${toggle_mem_tracker}           track memory usage
  ${toggle_postproc}              postprocessing
+  ${toggle_vp9_postproc}          vp9 specific postprocessing
  ${toggle_multithread}           multithreaded encoding and decoding
  ${toggle_spatial_resampling}    spatial sampling (scaling) support
  ${toggle_realtime_only}         enable this option while building for real-time encoding
@@ -153,7 +154,7 @@ all_targets="libs examples docs"

 # all targets available are enabled, by default.
 for t in ${all_targets}; do
-    [ -f ${source_path}/${t}.mk ] && enable ${t}
+    [ -f ${source_path}/${t}.mk ] && enable_feature ${t}
 done

 # check installed doxygen version
@@ -164,30 +165,30 @@ if [ ${doxy_major:-0} -ge 1 ]; then
    doxy_minor=${doxy_version%%.*}
    doxy_patch=${doxy_version##*.}

-    [ $doxy_major -gt 1 ] && enable doxygen
-    [ $doxy_minor -gt 5 ] && enable doxygen
-    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable doxygen
+    [ $doxy_major -gt 1 ] && enable_feature doxygen
+    [ $doxy_minor -gt 5 ] && enable_feature doxygen
+    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
 fi

 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
-enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs
-enable install_bins
-enable install_libs
+enabled doxygen && php -v >/dev/null 2>&1 && enable_feature install_docs
+enable_feature install_bins
+enable_feature install_libs

-enable static
-enable optimizations
-enable fast_unaligned #allow unaligned accesses, if supported by hw
-enable md5
-enable spatial_resampling
-enable multithread
-enable os_support
-enable temporal_denoising
+enable_feature static
+enable_feature optimizations
+enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
+enable_feature md5
+enable_feature spatial_resampling
+enable_feature multithread
+enable_feature os_support
+enable_feature temporal_denoising

-[ -d ${source_path}/../include ] && enable alt_tree_layout
+[ -d ${source_path}/../include ] && enable_feature alt_tree_layout
 for d in vp8 vp9; do
-    [ -d ${source_path}/${d} ] && disable alt_tree_layout;
+    [ -d ${source_path}/${d} ] && disable_feature alt_tree_layout;
 done

 if ! enabled alt_tree_layout; then
@@ -200,10 +201,10 @@ else
 [ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
 [ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder"
 [ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder"
-[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder
-[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder
-[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder
-[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder
+[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable_feature vp8_encoder
+[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable_feature vp8_decoder
+[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable_feature vp9_encoder
+[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable_feature vp9_decoder

 [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
 fi
@@ -279,6 +280,7 @@ CONFIG_LIST="
    dc_recon
    runtime_cpu_detect
    postproc
+    vp9_postproc
    multithread
    internal_stats
    ${CODECS}
@@ -333,6 +335,7 @@ CMDLINE_SELECT="
    dequant_tokens
    dc_recon
    postproc
+    vp9_postproc
    multithread
    internal_stats
    ${CODECS}
@@ -358,12 +361,12 @@ process_cmdline() {
    for opt do
        optval="${opt#*=}"
        case "$opt" in
-        --disable-codecs) for c in ${CODECS}; do disable $c; done ;;
+        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
            if enabled experimental; then
-                $action $option
+                ${action}_feature $option
            else
                log_echo "Ignoring $opt -- not in experimental mode."
            fi
@@ -384,8 +387,8 @@ post_process_cmdline() {
    # If the codec family is enabled, enable all components of that family.
    log_echo "Configuring selected codecs"
    for c in ${CODECS}; do
-        disabled ${c%%_*} && disable ${c}
-        enabled ${c%%_*} && enable ${c}
+        disabled ${c%%_*} && disable_feature ${c}
+        enabled ${c%%_*} && enable_feature ${c}
    done

    # Enable all detected codecs, if they haven't been disabled
@@ -393,12 +396,12 @@ post_process_cmdline() {

    # Enable the codec family if any component of that family is enabled
    for c in ${CODECS}; do
-        enabled $c && enable ${c%_*}
+        enabled $c && enable_feature ${c%_*}
    done

    # Set the {en,de}coders variable if any algorithm in that class is enabled
    for c in ${CODECS}; do
-        enabled ${c} && enable ${c##*_}s
+        enabled ${c} && enable_feature ${c##*_}s
    done
 }

@@ -438,7 +441,7 @@ process_targets() {
    done
    enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
    enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
-    ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
+    ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
    ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
    ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
    DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
@@ -508,13 +511,13 @@ process_detect() {
    fi
    if [ -z "$CC" ] || enabled external_build; then
        echo "Bypassing toolchain for environment detection."
-        enable external_build
+        enable_feature external_build
        check_header() {
            log fake_check_header "$@"
            header=$1
            shift
            var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-            disable $var
+            disable_feature $var
            # Headers common to all environments
            case $header in
                stdio.h)
@@ -526,7 +529,7 @@ process_detect() {
                        [ -f "${d##-I}/$header" ] && result=true && break
                    done
                    ${result:-true}
-            esac && enable $var
+            esac && enable_feature $var

            # Specialize windows and POSIX environments.
            case $toolchain in
@@ -534,7 +537,7 @@ process_detect() {
                    case $header-$toolchain in
                        stdint*-gcc) true;;
                        *) false;;
-                    esac && enable $var
+                    esac && enable_feature $var
                    ;;
                *)
                    case $header in
@@ -543,7 +546,7 @@ process_detect() {
                        sys/mman.h) true;;
                        unistd.h) true;;
                        *) false;;
-                    esac && enable $var
+                    esac && enable_feature $var
            esac
            enabled $var
        }
@@ -561,7 +564,7 @@ EOF
    check_header sys/mman.h
    check_header unistd.h # for sysconf(3) and friends.

-    check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports
+    check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
 }

 process_toolchain() {
@@ -643,14 +646,18 @@ process_toolchain() {
    # ccache only really works on gcc toolchains
    enabled gcc || soft_disable ccache
    if enabled mips; then
-        enable dequant_tokens
-        enable dc_recon
+        enable_feature dequant_tokens
+        enable_feature dc_recon
+    fi
+
+    if enabled internal_stats; then
+        enable_feature vp9_postproc
    fi

    # Enable the postbuild target if building for visual studio.
    case "$tgt_cc" in
-        vs*) enable msvs
-             enable solution
+        vs*) enable_feature msvs
+             enable_feature solution
             vs_version=${tgt_cc##vs}
             case $vs_version in
             [789])
--- a/examples.mk
+++ b/examples.mk
@@ -49,6 +49,9 @@ vpxenc.DESCRIPTION           = Full featured encoder
 UTILS-$(CONFIG_VP8_ENCODER)    += vp8_scalable_patterns.c
 vp8_scalable_patterns.GUID   = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
 vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder
+UTILS-$(CONFIG_VP9_ENCODER)    += vp9_spatial_scalable_encoder.c
+vp9_spatial_scalable_encoder.GUID   = 4A38598D-627D-4505-9C7B-D4020C84100D
+vp9_spatial_scalable_encoder.DESCRIPTION = Spatial Scalable Encoder

 # Clean up old ivfenc, ivfdec binaries.
 ifeq ($(CONFIG_MSVS),yes)
--- a/libmkv/EbmlWriter.c
+++ b/libmkv/EbmlWriter.c
@@ -105,7 +105,7 @@ void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned l
 void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) {
  int size;
  for (size = 4; size > 1; size--) {
-    if (bin & 0x000000ff << ((size - 1) * 8))
+    if (bin & (unsigned int)0x000000ff << ((size - 1) * 8))
      break;
  }
  Ebml_WriteID(glob, class_id);
--- a/libs.mk
+++ b/libs.mk
@@ -395,7 +395,7 @@ libvpx_test_srcs.txt:
 	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
 CLEAN-OBJS += libvpx_test_srcs.txt

-$(LIBVPX_TEST_DATA):
+$(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
 	@echo "    [DOWNLOAD] $@"
 	$(qexec)trap 'rm -f $@' INT TERM &&\
            curl -L -o $@ $(call libvpx_test_data_url,$(@F))
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef LIBVPX_TEST_ACM_RANDOM_H_
-#define LIBVPX_TEST_ACM_RANDOM_H_
+#ifndef TEST_ACM_RANDOM_H_
+#define TEST_ACM_RANDOM_H_

 #include "third_party/googletest/src/include/gtest/gtest.h"

@@ -59,4 +59,4 @@ class ACMRandom {

 }  // namespace libvpx_test

-#endif  // LIBVPX_TEST_ACM_RANDOM_H_
+#endif  // TEST_ACM_RANDOM_H_
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -29,8 +29,8 @@ class BordersTest : public ::libvpx_test::EncoderTest,

  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
-    if ( video->frame() == 1) {
-      encoder->Control(VP8E_SET_CPUUSED, 0);
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, 1);
      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
--- a/test/clear_system_state.h
+++ b/test/clear_system_state.h
@@ -10,7 +10,7 @@
 #ifndef TEST_CLEAR_SYSTEM_STATE_H_
 #define TEST_CLEAR_SYSTEM_STATE_H_

-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 # include "vpx_ports/x86.h"
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#include <string.h>
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -187,7 +188,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {

 protected:
  static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = 128;
+  static const int kOuterBlockSize = 256;
  static const int kInputStride = kOuterBlockSize;
  static const int kOutputStride = kOuterBlockSize;
  static const int kMaxDimension = 64;
@@ -224,6 +225,10 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
      input_[i] = prng.Rand8Extremes();
  }

+  void SetConstantInput(int value) {
+    memset(input_, value, kInputBufferSize);
+  }
+
  void CheckGuardBlocks() {
    for (int i = 0; i < kOutputBufferSize; ++i) {
      if (IsIndexInBorder(i))
@@ -456,45 +461,86 @@ DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
    { 128}
 };

+/* This test exercises the horizontal and vertical filter functions. */
 TEST_P(ConvolveTest, ChangeFilterWorks) {
  uint8_t* const in = input();
  uint8_t* const out = output();
+
+  /* Assume that the first input sample is at the 8/16th position. */
+  const int kInitialSubPelOffset = 8;
+
+  /* Filters are 8-tap, so the first filter tap will be applied to the pixel
+   * at position -3 with respect to the current filtering position. Since
+   * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8,
+   * which is non-zero only in the last tap. So, applying the filter at the
+   * current input position will result in an output equal to the pixel at
+   * offset +4 (-3 + 7) with respect to the current filtering position.
+   */
  const int kPixelSelected = 4;

+  /* Assume that each output pixel requires us to step on by 17/16th pixels in
+   * the input.
+   */
+  const int kInputPixelStep = 17;
+
+  /* The filters are setup in such a way that the expected output produces
+   * sets of 8 identical output samples. As the filter position moves to the
+   * next 1/16th pixel position the only active (=128) filter tap moves one
+   * position to the left, resulting in the same input pixel being replicated
+   * in to the output for 8 consecutive samples. After each set of 8 positions
+   * the filters select a different input pixel. kFilterPeriodAdjust below
+   * computes which input pixel is written to the output for a specified
+   * x or y position.
+   */
+
+  /* Test the horizontal filter. */
  REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[8], 17, kChangeFilters[4], 16,
-                                 Width(), Height()));
+                                 kChangeFilters[kInitialSubPelOffset],
+                                 kInputPixelStep, NULL, 0, Width(), Height()));

  for (int x = 0; x < Width(); ++x) {
-    const int kQ4StepAdjust = x >> 4;
    const int kFilterPeriodAdjust = (x >> 3) << 3;
-    const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
-    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x;
+    const int ref_x =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjust * kInputPixelStep)
+                          >> SUBPEL_BITS);
+    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width();
  }

+  /* Test the vertical filter. */
  REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[4], 16, kChangeFilters[8], 17,
-                                 Width(), Height()));
+                                 NULL, 0, kChangeFilters[kInitialSubPelOffset],
+                                 kInputPixelStep, Width(), Height()));

  for (int y = 0; y < Height(); ++y) {
-    const int kQ4StepAdjust = y >> 4;
    const int kFilterPeriodAdjust = (y >> 3) << 3;
-    const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    const int ref_y =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjust * kInputPixelStep)
+                          >> SUBPEL_BITS);
    ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
  }

+  /* Test the horizontal and vertical filters in combination. */
  REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                  kChangeFilters[8], 17, kChangeFilters[8], 17,
+                                  kChangeFilters[kInitialSubPelOffset],
+                                  kInputPixelStep,
+                                  kChangeFilters[kInitialSubPelOffset],
+                                  kInputPixelStep,
                                  Width(), Height()));

  for (int y = 0; y < Height(); ++y) {
-    const int kQ4StepAdjustY = y >> 4;
    const int kFilterPeriodAdjustY = (y >> 3) << 3;
-    const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected;
+    const int ref_y =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjustY * kInputPixelStep)
+                          >> SUBPEL_BITS);
    for (int x = 0; x < Width(); ++x) {
-      const int kQ4StepAdjustX = x >> 4;
      const int kFilterPeriodAdjustX = (x >> 3) << 3;
-      const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected;
+      const int ref_x =
+          kPixelSelected + ((kInitialSubPelOffset
+              + kFilterPeriodAdjustX * kInputPixelStep)
+                            >> SUBPEL_BITS);

      ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
          << "x == " << x << ", y == " << y;
@@ -502,6 +548,34 @@ TEST_P(ConvolveTest, ChangeFilterWorks) {
  }
 }

+/* This test exercises that enough rows and columns are filtered with every
+   possible initial fractional positions and scaling steps. */
+TEST_P(ConvolveTest, CheckScalingFiltering) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+
+  SetConstantInput(127);
+
+  for (int frac = 0; frac < 16; ++frac) {
+    for (int step = 1; step <= 32; ++step) {
+      /* Test the horizontal and vertical filters in combination. */
+      REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                                      vp9_sub_pel_filters_8[frac], step,
+                                      vp9_sub_pel_filters_8[frac], step,
+                                      Width(), Height()));
+
+      CheckGuardBlocks();
+
+      for (int y = 0; y < Height(); ++y) {
+        for (int x = 0; x < Width(); ++x) {
+          ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x])
+              << "x == " << x << ", y == " << y
+              << ", frac == " << frac << ", step == " << step;
+        }
+      }
+    }
+  }
+}

 using std::tr1::make_tuple;

--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -108,5 +108,5 @@ using std::tr1::make_tuple;
 VP9_INSTANTIATE_TEST_CASE(
    CpuSpeedTest,
    ::testing::Values(::libvpx_test::kTwoPassGood),
-    ::testing::Range(0, 3));
+    ::testing::Range(0, 5));
 }  // namespace
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -75,7 +75,7 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
    bits_in_buffer_model_ -= frame_size_in_bits;

    // Update the running total of bits for end of test datarate checks.
-    bits_total_ += frame_size_in_bits ;
+    bits_total_ += frame_size_in_bits;

    // If first drop not set and we have a drop set it to this time.
    if (!first_drop_ && duration > 1)
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -13,15 +13,16 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "vpx_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"

 extern "C" {
 #include "vp9/common/vp9_entropy.h"
-#include "vp9_rtcd.h"
-void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
+#include "./vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);
 }
-
-#include "acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -31,12 +32,13 @@ namespace {
 #ifdef _MSC_VER
 static int round(double x) {
  if (x < 0)
-    return (int)ceil(x - 0.5);
+    return static_cast<int>(ceil(x - 0.5));
  else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif

+const int kNumCoeffs = 256;
 const double PI = 3.1415926535898;
 void reference2_16x16_idct_2d(double *input, double *output) {
  double x;
@@ -45,7 +47,9 @@ void reference2_16x16_idct_2d(double *input, double *output) {
      double s = 0;
      for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
-          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256;
+          x = cos(PI * j * (l + 0.5) / 16.0) *
+              cos(PI * i * (k + 0.5) / 16.0) *
+              input[i * 16 + j] / 256;
          if (i != 0)
            x *= sqrt(2.0);
          if (j != 0)
@@ -59,23 +63,23 @@ void reference2_16x16_idct_2d(double *input, double *output) {
 }


-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
+const double C1 = 0.995184726672197;
+const double C2 = 0.98078528040323;
+const double C3 = 0.956940335732209;
+const double C4 = 0.923879532511287;
+const double C5 = 0.881921264348355;
+const double C6 = 0.831469612302545;
+const double C7 = 0.773010453362737;
+const double C8 = 0.707106781186548;
+const double C9 = 0.634393284163646;
+const double C10 = 0.555570233019602;
+const double C11 = 0.471396736825998;
+const double C12 = 0.38268343236509;
+const double C13 = 0.290284677254462;
+const double C14 = 0.195090322016128;
+const double C15 = 0.098017140329561;

-static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
+void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  double step[16];
  double intermediate[16];
  double temp1, temp2;
@@ -108,36 +112,36 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[6] = step[1] - step[6];
  output[7] = step[0] - step[7];

-  temp1 = step[ 8]*C7;
-  temp2 = step[15]*C9;
+  temp1 = step[ 8] * C7;
+  temp2 = step[15] * C9;
  output[ 8] = temp1 + temp2;

-  temp1 = step[ 9]*C11;
-  temp2 = step[14]*C5;
+  temp1 = step[ 9] * C11;
+  temp2 = step[14] * C5;
  output[ 9] = temp1 - temp2;

-  temp1 = step[10]*C3;
-  temp2 = step[13]*C13;
+  temp1 = step[10] * C3;
+  temp2 = step[13] * C13;
  output[10] = temp1 + temp2;

-  temp1 = step[11]*C15;
-  temp2 = step[12]*C1;
+  temp1 = step[11] * C15;
+  temp2 = step[12] * C1;
  output[11] = temp1 - temp2;

-  temp1 = step[11]*C1;
-  temp2 = step[12]*C15;
+  temp1 = step[11] * C1;
+  temp2 = step[12] * C15;
  output[12] = temp2 + temp1;

-  temp1 = step[10]*C13;
-  temp2 = step[13]*C3;
+  temp1 = step[10] * C13;
+  temp2 = step[13] * C3;
  output[13] = temp2 - temp1;

-  temp1 = step[ 9]*C5;
-  temp2 = step[14]*C11;
+  temp1 = step[ 9] * C5;
+  temp2 = step[14] * C11;
  output[14] = temp2 + temp1;

-  temp1 = step[ 8]*C9;
-  temp2 = step[15]*C7;
+  temp1 = step[ 8] * C9;
+  temp2 = step[15] * C7;
  output[15] = temp2 - temp1;

  // step 3
@@ -146,20 +150,20 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  step[ 2] = output[1] - output[2];
  step[ 3] = output[0] - output[3];

-  temp1 = output[4]*C14;
-  temp2 = output[7]*C2;
+  temp1 = output[4] * C14;
+  temp2 = output[7] * C2;
  step[ 4] = temp1 + temp2;

-  temp1 = output[5]*C10;
-  temp2 = output[6]*C6;
+  temp1 = output[5] * C10;
+  temp2 = output[6] * C6;
  step[ 5] = temp1 + temp2;

-  temp1 = output[5]*C6;
-  temp2 = output[6]*C10;
+  temp1 = output[5] * C6;
+  temp2 = output[6] * C10;
  step[ 6] = temp2 - temp1;

-  temp1 = output[4]*C2;
-  temp2 = output[7]*C14;
+  temp1 = output[4] * C2;
+  temp2 = output[7] * C14;
  step[ 7] = temp2 - temp1;

  step[ 8] = output[ 8] + output[11];
@@ -176,18 +180,18 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[ 0] = (step[ 0] + step[ 1]);
  output[ 8] = (step[ 0] - step[ 1]);

-  temp1 = step[2]*C12;
-  temp2 = step[3]*C4;
+  temp1 = step[2] * C12;
+  temp2 = step[3] * C4;
  temp1 = temp1 + temp2;
-  output[ 4] = 2*(temp1*C8);
+  output[ 4] = 2*(temp1 * C8);

-  temp1 = step[2]*C4;
-  temp2 = step[3]*C12;
+  temp1 = step[2] * C4;
+  temp2 = step[3] * C12;
  temp1 = temp2 - temp1;
-  output[12] = 2*(temp1*C8);
+  output[12] = 2 * (temp1 * C8);

-  output[ 2] = 2*((step[4] + step[ 5])*C8);
-  output[14] = 2*((step[7] - step[ 6])*C8);
+  output[ 2] = 2 * ((step[4] + step[ 5]) * C8);
+  output[14] = 2 * ((step[7] - step[ 6]) * C8);

  temp1 = step[4] - step[5];
  temp2 = step[6] + step[7];
@@ -197,17 +201,17 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  intermediate[8] = step[8] + step[14];
  intermediate[9] = step[9] + step[15];

-  temp1 = intermediate[8]*C12;
-  temp2 = intermediate[9]*C4;
+  temp1 = intermediate[8] * C12;
+  temp2 = intermediate[9] * C4;
  temp1 = temp1 - temp2;
-  output[3] = 2*(temp1*C8);
+  output[3] = 2 * (temp1 * C8);

-  temp1 = intermediate[8]*C4;
-  temp2 = intermediate[9]*C12;
+  temp1 = intermediate[8] * C4;
+  temp2 = intermediate[9] * C12;
  temp1 = temp2 + temp1;
-  output[13] = 2*(temp1*C8);
+  output[13] = 2 * (temp1 * C8);

-  output[ 9] = 2*((step[10] + step[11])*C8);
+  output[ 9] = 2 * ((step[10] + step[11]) * C8);

  intermediate[11] = step[10] - step[11];
  intermediate[12] = step[12] + step[13];
@@ -218,207 +222,301 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
  output[15] = (intermediate[11] + intermediate[12]);
  output[ 1] = -(intermediate[11] - intermediate[12]);

-  output[ 7] = 2*(intermediate[13]*C8);
+  output[ 7] = 2 * (intermediate[13] * C8);

-  temp1 = intermediate[14]*C12;
-  temp2 = intermediate[15]*C4;
+  temp1 = intermediate[14] * C12;
+  temp2 = intermediate[15] * C4;
  temp1 = temp1 - temp2;
-  output[11] = -2*(temp1*C8);
+  output[11] = -2 * (temp1 * C8);

-  temp1 = intermediate[14]*C4;
-  temp2 = intermediate[15]*C12;
+  temp1 = intermediate[14] * C4;
+  temp2 = intermediate[15] * C12;
  temp1 = temp2 + temp1;
-  output[ 5] = 2*(temp1*C8);
+  output[ 5] = 2 * (temp1 * C8);
 }

-static void reference_16x16_dct_1d(double in[16], double out[16]) {
-  const double kPi = 3.141592653589793238462643383279502884;
-  const double kInvSqrt2 = 0.707106781186547524400844362104;
-  for (int k = 0; k < 16; k++) {
-    out[k] = 0.0;
-    for (int n = 0; n < 16; n++)
-      out[k] += in[n]*cos(kPi*(2*n+1)*k/32.0);
-    if (k == 0)
-      out[k] = out[k]*kInvSqrt2;
-  }
-}
-
-void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) {
+void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
  // First transform columns
  for (int i = 0; i < 16; ++i) {
    double temp_in[16], temp_out[16];
    for (int j = 0; j < 16; ++j)
-      temp_in[j] = input[j*16 + i];
+      temp_in[j] = input[j * 16 + i];
    butterfly_16x16_dct_1d(temp_in, temp_out);
    for (int j = 0; j < 16; ++j)
-      output[j*16 + i] = temp_out[j];
+      output[j * 16 + i] = temp_out[j];
  }
  // Then transform rows
  for (int i = 0; i < 16; ++i) {
    double temp_in[16], temp_out[16];
    for (int j = 0; j < 16; ++j)
-      temp_in[j] = output[j + i*16];
+      temp_in[j] = output[j + i * 16];
    butterfly_16x16_dct_1d(temp_in, temp_out);
    // Scale by some magic number
    for (int j = 0; j < 16; ++j)
-      output[j + i*16] = temp_out[j]/2;
+      output[j + i * 16] = temp_out[j]/2;
  }
 }

-void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-               int stride, int /*tx_type*/) {
+typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
+typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
+typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
+
+void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
  vp9_short_fdct16x16_c(in, out, stride);
 }
-void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                   int stride, int /*tx_type*/) {
-  vp9_short_idct16x16_add_c(out, dst, stride >> 1);
-}
-void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-              int stride, int tx_type) {
-  // FIXME(jingning): need to test both SSE2 and c
-#if HAVE_SSE2
-  vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type);
-#else
-  vp9_short_fht16x16_c(in, out, stride >> 1, tx_type);
-#endif
-}
-void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-  vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type);
+
+void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
+  vp9_short_fht16x16_c(in, out, stride, tx_type);
 }

-class FwdTrans16x16Test : public ::testing::TestWithParam<int> {
+class Trans16x16TestBase {
 public:
-  virtual ~FwdTrans16x16Test() {}
-
-  virtual void SetUp() {
-    tx_type_ = GetParam();
-    if (tx_type_ == 0) {
-      fwd_txfm = fdct16x16;
-      inv_txfm = idct16x16_add;
-    } else {
-      fwd_txfm = fht16x16;
-      inv_txfm = iht16x16_add;
-    }
-  }
+  virtual ~Trans16x16TestBase() {}

 protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-    (*fwd_txfm)(in, out, dst, stride, tx_type);
-  }
-  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-    (*inv_txfm)(in, out, dst, stride, tx_type);
+  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
+
+  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+
+  void RunAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+    for (int i = 0; i < count_test_block; ++i) {
+      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        test_input_block[j] = src[j] - dst[j];
+      }
+
+      REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                      test_temp_block, pitch_));
+      REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = dst[j] - src[j];
+        const uint32_t error = diff * diff;
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
+    }
+
+    EXPECT_GE(1u, max_error)
+        << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
+
+    EXPECT_GE(count_test_block , total_error)
+        << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
  }

+  void RunCoeffCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+      REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+
+      // The minimum quant value is 4.
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+    }
+  }
+
+  void RunMemCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+      }
+      if (i == 0)
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = 255;
+      if (i == 1)
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = -255;
+
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+      REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                      output_block, pitch_));
+
+      // The minimum quant value is 4.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      }
+    }
+  }
+
+  void RunInvAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        in[j] = src[j] - dst[j];
+      }
+
+      reference_16x16_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff[j] = round(out_r[j]);
+
+      const int pitch = 32;
+      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch));
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = dst[j] - src[j];
+        const uint32_t error = diff * diff;
+        EXPECT_GE(1u, error)
+            << "Error: 16x16 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+  int pitch_;
  int tx_type_;
-  void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
-  void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+  fht_t fwd_txfm_ref;
 };

-TEST_P(FwdTrans16x16Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int max_error = 0;
-  double total_error = 0;
-  const int count_test_block = 10000;
-  for (int i = 0; i < count_test_block; ++i) {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256);
+class Trans16x16DCT : public Trans16x16TestBase,
+                      public PARAMS(fdct_t, idct_t, int) {
+ public:
+  virtual ~Trans16x16DCT() {}

-    for (int j = 0; j < 256; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-      // Initialize a test block with input range [-255, 255].
-      test_input_block[j] = src[j] - dst[j];
-    }
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 32;
+    fwd_txfm_ref = fdct16x16_ref;
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }

-    const int pitch = 32;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-
-    for (int j = 0; j < 256; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride >> 1);
  }

-  EXPECT_GE(1, max_error)
-      << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
+  fdct_t fwd_txfm_;
+  idct_t inv_txfm_;
+};

-  EXPECT_GE(count_test_block , total_error)
-      << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
+TEST_P(Trans16x16DCT, AccuracyCheck) {
+  RunAccuracyCheck();
 }

-TEST_P(FwdTrans16x16Test, CoeffSizeCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
+TEST_P(Trans16x16DCT, CoeffCheck) {
+  RunCoeffCheck();
+}

-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 256; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-    }
-    if (i == 0)
-      for (int j = 0; j < 256; ++j)
-        input_extreme_block[j] = 255;
+TEST_P(Trans16x16DCT, MemCheck) {
+  RunMemCheck();
+}

-    const int pitch = 32;
-    RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_);
-    RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_);
+TEST_P(Trans16x16DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}

-    // The minimum quant value is 4.
-    for (int j = 0; j < 256; ++j) {
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
-          << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j]))
-          << "Error: 16x16 FDCT extreme has coefficient larger "
-          << "than 4*DCT_MAX_VALUE";
-    }
+class Trans16x16HT : public Trans16x16TestBase,
+                     public PARAMS(fht_t, iht_t, int) {
+ public:
+  virtual ~Trans16x16HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 16;
+    fwd_txfm_ref = fht16x16_ref;
  }
-}
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }

-INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4));
-
-TEST(VP9Idct16x16Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t in[256], coeff[256];
-    uint8_t dst[256], src[256];
-    double out_r[256];
-
-    for (int j = 0; j < 256; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 256; ++j)
-      in[j] = src[j] - dst[j];
-
-    reference_16x16_dct_2d(in, out_r);
-    for (int j = 0; j < 256; j++)
-      coeff[j] = round(out_r[j]);
-    vp9_short_idct16x16_add_c(coeff, dst, 16);
-    for (int j = 0; j < 256; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      EXPECT_GE(1, error)
-          << "Error: 16x16 IDCT has error " << error
-          << " at index " << j;
-    }
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
  }
+  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  fht_t fwd_txfm_;
+  iht_t inv_txfm_;
+};
+
+TEST_P(Trans16x16HT, AccuracyCheck) {
+  RunAccuracyCheck();
 }

+TEST_P(Trans16x16HT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+TEST_P(Trans16x16HT, MemCheck) {
+  RunMemCheck();
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct16x16_sse2,
+                   &vp9_short_idct16x16_add_sse2, 0)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3)));
+#endif
 }  // namespace
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -13,15 +13,17 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"

 extern "C" {
+#include "./vpx_config.h"
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
-  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }

-#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -30,35 +32,15 @@ namespace {
 #ifdef _MSC_VER
 static int round(double x) {
  if (x < 0)
-    return (int)ceil(x - 0.5);
+    return static_cast<int>(ceil(x - 0.5));
  else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif

-static const double kPi = 3.141592653589793238462643383279502884;
-static void reference2_32x32_idct_2d(double *input, double *output) {
-  double x;
-  for (int l = 0; l < 32; ++l) {
-    for (int k = 0; k < 32; ++k) {
-      double s = 0;
-      for (int i = 0; i < 32; ++i) {
-        for (int j = 0; j < 32; ++j) {
-          x = cos(kPi * j * (l + 0.5) / 32.0) *
-              cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
-          if (i != 0)
-            x *= sqrt(2.0);
-          if (j != 0)
-            x *= sqrt(2.0);
-          s += x;
-        }
-      }
-      output[k * 32 + l] = s / 4;
-    }
-  }
-}
-
-static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
+const int kNumCoeffs = 1024;
+const double kPi = 3.141592653589793238462643383279502884;
+void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 32; k++) {
    out[k] = 0.0;
@@ -69,7 +51,8 @@ static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
  }
 }

-static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
+void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
+                            double output[kNumCoeffs]) {
  // First transform columns
  for (int i = 0; i < 32; ++i) {
    double temp_in[32], temp_out[32];
@@ -91,27 +74,165 @@ static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
  }
 }

-TEST(VP9Idct32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t in[1024], coeff[1024];
-    uint8_t dst[1024], src[1024];
-    double out_r[1024];
+typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
+typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);

-    for (int j = 0; j < 1024; ++j) {
+class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
+ public:
+  virtual ~Trans32x32Test() {}
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_  = GET_PARAM(2);  // 0: high precision forward transform
+                               // 1: low precision version for rd loop
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int version_;
+  fwd_txfm_t fwd_txfm_;
+  inv_txfm_t inv_txfm_;
+};
+
+TEST_P(Trans32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  uint32_t max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < kNumCoeffs; ++j) {
      src[j] = rnd.Rand8();
      dst[j] = rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
    }
+
+    const int pitch = 64;
+    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
+    REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      const uint32_t diff = dst[j] - src[j];
+      const uint32_t error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  if (version_ == 1) {
+    max_error /= 2;
+    total_error /= 45;
+  }
+
+  EXPECT_GE(1u, max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+      << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
+}
+
+TEST_P(Trans32x32Test, CoeffCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    for (int j = 0; j < kNumCoeffs; ++j)
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
+
+    if (version_ == 0) {
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+    } else {
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, MemCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 2000;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j)
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = 255;
+    if (i == 1)
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = -255;
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (version_ == 0) {
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+      } else {
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+      }
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than "
+          << "4*DCT_MAX_VALUE";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, InverseAccuracy) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    double out_r[kNumCoeffs];
+
+    // Initialize a test block with input range [-255, 255]
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
      in[j] = src[j] - dst[j];
+    }

    reference_32x32_dct_2d(in, out_r);
-    for (int j = 0; j < 1024; j++)
+    for (int j = 0; j < kNumCoeffs; ++j)
      coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_add_c(coeff, dst, 32);
-    for (int j = 0; j < 1024; ++j) {
+    REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+    for (int j = 0; j < kNumCoeffs; ++j) {
      const int diff = dst[j] - src[j];
      const int error = diff * diff;
      EXPECT_GE(1, error)
@@ -121,72 +242,21 @@ TEST(VP9Idct32x32Test, AccuracyCheck) {
  }
 }

-TEST(VP9Fdct32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  unsigned int max_error = 0;
-  int64_t total_error = 0;
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[1024];
-    int16_t test_temp_block[1024];
-    uint8_t dst[1024], src[1024];
+using std::tr1::make_tuple;

-    for (int j = 0; j < 1024; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = src[j] - dst[j];
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));

-    const int pitch = 64;
-    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
-
-    for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = dst[j] - src[j];
-      const unsigned error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
-  }
-
-  EXPECT_GE(1u, max_error)
-      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
-
-  EXPECT_GE(count_test_block, total_error)
-      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
-}
-
-TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t input_block[1024], input_extreme_block[1024];
-    int16_t output_block[1024], output_extreme_block[1024];
-
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-    }
-    if (i == 0)
-      for (int j = 0; j < 1024; ++j)
-        input_extreme_block[j] = 255;
-
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_block, output_block, pitch);
-    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < 1024; ++j) {
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
-          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
-          << "Error: 32x32 FDCT extreme has coefficient larger than "
-             "4*DCT_MAX_VALUE";
-    }
-  }
-}
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct32x32_sse2,
+                   &vp9_short_idct32x32_add_sse2, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_sse2,
+                   &vp9_short_idct32x32_add_sse2, 1)));
+#endif
 }  // namespace
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -12,7 +12,7 @@
 #define TEST_DECODE_TEST_DRIVER_H_
 #include <cstring>
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"

 namespace libvpx_test {
@@ -36,9 +36,8 @@ class DxDataIterator {
 };

 // Provides a simplified interface to manage one video decoding.
-//
-// TODO: similar to Encoder class, the exact services should be
-// added as more tests are added.
+// Similar to Encoder class, the exact services should be added
+// as more tests are added.
 class Decoder {
 public:
  Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/decode_test_driver.h"
@@ -114,19 +114,19 @@ static bool compare_img(const vpx_image_t *img1,
  const unsigned int height_y = img1->d_h;
  unsigned int i;
  for (i = 0; i < height_y; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
-                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                     width_y) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                    img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                    width_y) == 0) && match;
  const unsigned int width_uv  = (img1->d_w + 1) >> 1;
  const unsigned int height_uv = (img1->d_h + 1) >> 1;
  for (i = 0; i <  height_uv; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
-                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
-                     width_uv) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                    img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                    width_uv) == 0) && match;
  for (i = 0; i < height_uv; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
-                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
-                     width_uv) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                    img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                    width_uv) == 0) && match;
  return match;
 }

@@ -158,7 +158,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
    bool again;
    for (again = true, video->Begin(); again; video->Next()) {
-      again = video->img() != NULL;
+      again = (video->img() != NULL);

      PreEncodeFrameHook(video);
      PreEncodeFrameHook(video, encoder);
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -62,7 +62,7 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
    if (droppable_nframes_ > 0 &&
        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
      for (unsigned int i = 0; i < droppable_nframes_; ++i) {
-        if (droppable_frames_[i] == nframes_) {
+        if (droppable_frames_[i] == video->frame()) {
          std::cout << "             Encoding droppable frame: "
                    << droppable_frames_[i] << "\n";
          frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
@@ -148,7 +148,7 @@ TEST_P(ErrorResilienceTest, OnVersusOff) {
  const vpx_rational timebase = { 33333333, 1000000000 };
  cfg_.g_timebase = timebase;
  cfg_.rc_target_bitrate = 2000;
-  cfg_.g_lag_in_frames = 25;
+  cfg_.g_lag_in_frames = 10;

  init_flags_ = VPX_CODEC_USE_PSNR;

@@ -179,6 +179,9 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
  const vpx_rational timebase = { 33333333, 1000000000 };
  cfg_.g_timebase = timebase;
  cfg_.rc_target_bitrate = 500;
+  // FIXME(debargha): Fix this to work for any lag.
+  // Currently this test only works for lag = 0
+  cfg_.g_lag_in_frames = 0;

  init_flags_ = VPX_CODEC_USE_PSNR;

--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -15,10 +15,10 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 extern "C" {
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 }

-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"

@@ -136,7 +136,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  int max_error = 0;
-  double total_error = 0;
+  int total_error = 0;
  const int count_test_block = 1000000;
  for (int i = 0; i < count_test_block; ++i) {
    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
@@ -156,7 +156,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);

    for (int j = 0; j < 16; ++j) {
-        if(test_temp_block[j] > 0) {
+        if (test_temp_block[j] > 0) {
          test_temp_block[j] += 2;
          test_temp_block[j] /= 4;
          test_temp_block[j] *= 4;
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -13,229 +13,309 @@
 #include <string.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "vpx_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"

 extern "C" {
-#include "vp9_rtcd.h"
-void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
+#include "vp9/common/vp9_entropy.h"
+#include "./vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
 }
-
-#include "acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;

 namespace {
-void fdct8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-             int stride, int /*tx_type*/) {
+typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
+typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
+typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
+
+void fdct8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
  vp9_short_fdct8x8_c(in, out, stride);
 }
-void idct8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                 int stride, int /*tx_type*/) {
-  vp9_short_idct8x8_add_c(out, dst, stride >> 1);
-}
-void fht8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-            int stride, int tx_type) {
-  // TODO(jingning): need to refactor this to test both _c and _sse2 functions,
-  // when we have all inverse dct functions done sse2.
-#if HAVE_SSE2
-  vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type);
-#else
-  vp9_short_fht8x8_c(in, out, stride >> 1, tx_type);
-#endif
-}
-void iht8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                int stride, int tx_type) {
-  vp9_short_iht8x8_add_c(out, dst, stride >> 1, tx_type);
+
+void fht8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
+  vp9_short_fht8x8_c(in, out, stride, tx_type);
 }

-class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
+class FwdTrans8x8TestBase {
 public:
-  virtual ~FwdTrans8x8Test() {}
-  virtual void SetUp() {
-    tx_type_ = GetParam();
-    if (tx_type_ == 0) {
-      fwd_txfm = fdct8x8;
-      inv_txfm = idct8x8_add;
-    } else {
-      fwd_txfm = fht8x8;
-      inv_txfm = iht8x8_add;
-    }
-  }
+  virtual ~FwdTrans8x8TestBase() {}

 protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-    (*fwd_txfm)(in, out, dst, stride, tx_type);
-  }
-  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-    (*inv_txfm)(in, out, dst, stride, tx_type);
-  }
+  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
+  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;

-  int tx_type_;
-  void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
-  void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
-};
+  void RunSignBiasCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
+    int count_sign_block[64][2];
+    const int count_test_block = 100000;

-TEST_P(FwdTrans8x8Test, SignBiasCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
-  const int pitch = 16;
-  int count_sign_block[64][2];
-  const int count_test_block = 100000;
+    memset(count_sign_block, 0, sizeof(count_sign_block));

-  memset(count_sign_block, 0, sizeof(count_sign_block));
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < 64; ++j)
+        test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      REGISTER_STATE_CHECK(
+          RunFwdTxfm(test_input_block, test_output_block, pitch_));

-  for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 64; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-
-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+      for (int j = 0; j < 64; ++j) {
+        if (test_output_block[j] < 0)
+          ++count_sign_block[j][0];
+        else if (test_output_block[j] > 0)
+          ++count_sign_block[j][1];
+      }
+    }

    for (int j = 0; j < 64; ++j) {
-      if (test_output_block[j] < 0)
-        ++count_sign_block[j][0];
-      else if (test_output_block[j] > 0)
-        ++count_sign_block[j][1];
+      const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
+      const int max_diff = 1125;
+      EXPECT_LT(diff, max_diff)
+          << "Error: 8x8 FDCT/FHT has a sign bias > "
+          << 1. * max_diff / count_test_block * 100 << "%"
+          << " for input range [-255, 255] at index " << j
+          << " count0: " << count_sign_block[j][0]
+          << " count1: " << count_sign_block[j][1]
+          << " diff: " << diff;
+    }
+
+    memset(count_sign_block, 0, sizeof(count_sign_block));
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-15, 15].
+      for (int j = 0; j < 64; ++j)
+        test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+      REGISTER_STATE_CHECK(
+          RunFwdTxfm(test_input_block, test_output_block, pitch_));
+
+      for (int j = 0; j < 64; ++j) {
+        if (test_output_block[j] < 0)
+          ++count_sign_block[j][0];
+        else if (test_output_block[j] > 0)
+          ++count_sign_block[j][1];
+      }
+    }
+
+    for (int j = 0; j < 64; ++j) {
+      const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
+      const int max_diff = 10000;
+      EXPECT_LT(diff, max_diff)
+          << "Error: 4x4 FDCT/FHT has a sign bias > "
+          << 1. * max_diff / count_test_block * 100 << "%"
+          << " for input range [-15, 15] at index " << j
+          << " count0: " << count_sign_block[j][0]
+          << " count1: " << count_sign_block[j][1]
+          << " diff: " << diff;
    }
  }

-  for (int j = 0; j < 64; ++j) {
-    const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
-    const int max_diff = 1125;
-    EXPECT_LT(diff, max_diff)
-        << "Error: 8x8 FDCT/FHT has a sign bias > "
-        << 1. * max_diff / count_test_block * 100 << "%"
-        << " for input range [-255, 255] at index " << j
-        << " count0: " << count_sign_block[j][0]
-        << " count1: " << count_sign_block[j][1]
-        << " diff: " << diff;
-  }
-
-  memset(count_sign_block, 0, sizeof(count_sign_block));
-
-  for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-15, 15].
-    for (int j = 0; j < 64; ++j)
-      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
-
-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
-
-    for (int j = 0; j < 64; ++j) {
-      if (test_output_block[j] < 0)
-        ++count_sign_block[j][0];
-      else if (test_output_block[j] > 0)
-        ++count_sign_block[j][1];
-    }
-  }
-
-  for (int j = 0; j < 64; ++j) {
-    const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
-    const int max_diff = 10000;
-    EXPECT_LT(diff, max_diff)
-        << "Error: 4x4 FDCT/FHT has a sign bias > "
-        << 1. * max_diff / count_test_block * 100 << "%"
-        << " for input range [-15, 15] at index " << j
-        << " count0: " << count_sign_block[j][0]
-        << " count1: " << count_sign_block[j][1]
-        << " diff: " << diff;
-  }
-}
-
-TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int max_error = 0;
-  double total_error = 0;
-  const int count_test_block = 100000;
-  for (int i = 0; i < count_test_block; ++i) {
+  void RunRoundTripErrorCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    int max_error = 0;
+    int total_error = 0;
+    const int count_test_block = 100000;
    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);

-    for (int j = 0; j < 64; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 64; ++j)
-      test_input_block[j] = src[j] - dst[j];
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < 64; ++j) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        test_input_block[j] = src[j] - dst[j];
+      }

-    const int pitch = 16;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-    for (int j = 0; j < 64; ++j){
-        if(test_temp_block[j] > 0) {
-          test_temp_block[j] += 2;
-          test_temp_block[j] /= 4;
-          test_temp_block[j] *= 4;
-        } else {
-          test_temp_block[j] -= 2;
-          test_temp_block[j] /= 4;
-          test_temp_block[j] *= 4;
-        }
-    }
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+      REGISTER_STATE_CHECK(
+          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
+      for (int j = 0; j < 64; ++j) {
+          if (test_temp_block[j] > 0) {
+            test_temp_block[j] += 2;
+            test_temp_block[j] /= 4;
+            test_temp_block[j] *= 4;
+          } else {
+            test_temp_block[j] -= 2;
+            test_temp_block[j] /= 4;
+            test_temp_block[j] *= 4;
+          }
+      }
+      REGISTER_STATE_CHECK(
+          RunInvTxfm(test_temp_block, dst, pitch_));

-    for (int j = 0; j < 64; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
-  }
-
-  EXPECT_GE(1, max_error)
-    << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
-
-  EXPECT_GE(count_test_block/5, total_error)
-    << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
-        "error > 1/5 per block";
-}
-
-TEST_P(FwdTrans8x8Test, ExtremalCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int max_error = 0;
-  double total_error = 0;
-  const int count_test_block = 100000;
-  for (int i = 0; i < count_test_block; ++i) {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
-
-    for (int j = 0; j < 64; ++j) {
-      src[j] = rnd.Rand8() % 2 ? 255 : 0;
-      dst[j] = src[j] > 0 ? 0 : 255;
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 64; ++j)
-      test_input_block[j] = src[j] - dst[j];
-
-    const int pitch = 16;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-
-    for (int j = 0; j < 64; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
+      for (int j = 0; j < 64; ++j) {
+        const int diff = dst[j] - src[j];
+        const int error = diff * diff;
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
    }

    EXPECT_GE(1, max_error)
-        << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has an"
-        << " individual roundtrip error > 1";
+      << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
+      << " roundtrip error > 1";

    EXPECT_GE(count_test_block/5, total_error)
-        << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
-        << " roundtrip error > 1/5 per block";
+      << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
+      << "error > 1/5 per block";
  }
+
+  void RunExtremalCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    int max_error = 0;
+    int total_error = 0;
+    const int count_test_block = 100000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < 64; ++j) {
+        src[j] = rnd.Rand8() % 2 ? 255 : 0;
+        dst[j] = src[j] > 0 ? 0 : 255;
+        test_input_block[j] = src[j] - dst[j];
+      }
+
+      REGISTER_STATE_CHECK(
+          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
+      REGISTER_STATE_CHECK(
+          RunInvTxfm(test_temp_block, dst, pitch_));
+
+      for (int j = 0; j < 64; ++j) {
+        const int diff = dst[j] - src[j];
+        const int error = diff * diff;
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
+
+      EXPECT_GE(1, max_error)
+          << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
+          << "an individual roundtrip error > 1";
+
+      EXPECT_GE(count_test_block/5, total_error)
+          << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
+          << " roundtrip error > 1/5 per block";
+    }
+  }
+
+  int pitch_;
+  int tx_type_;
+  fht_t fwd_txfm_ref;
+};
+
+class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
+                       public PARAMS(fdct_t, idct_t, int) {
+ public:
+  virtual ~FwdTrans8x8DCT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 16;
+    fwd_txfm_ref = fdct8x8_ref;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride >> 1);
+  }
+
+  fdct_t fwd_txfm_;
+  idct_t inv_txfm_;
+};
+
+TEST_P(FwdTrans8x8DCT, SignBiasCheck) {
+  RunSignBiasCheck();
 }

-INSTANTIATE_TEST_CASE_P(VP9, FwdTrans8x8Test, ::testing::Range(0, 4));
+TEST_P(FwdTrans8x8DCT, RoundTripErrorCheck) {
+  RunRoundTripErrorCheck();
+}
+
+TEST_P(FwdTrans8x8DCT, ExtremalCheck) {
+  RunExtremalCheck();
+}
+
+class FwdTrans8x8HT : public FwdTrans8x8TestBase,
+                      public PARAMS(fht_t, iht_t, int) {
+ public:
+  virtual ~FwdTrans8x8HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 8;
+    fwd_txfm_ref = fht8x8_ref;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  fht_t fwd_txfm_;
+  iht_t inv_txfm_;
+};
+
+TEST_P(FwdTrans8x8HT, SignBiasCheck) {
+  RunSignBiasCheck();
+}
+
+TEST_P(FwdTrans8x8HT, RoundTripErrorCheck) {
+  RunRoundTripErrorCheck();
+}
+
+TEST_P(FwdTrans8x8HT, ExtremalCheck) {
+  RunExtremalCheck();
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct8x8_c, &vp9_short_idct8x8_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 0),
+        make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 1),
+        make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 2),
+        make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 3)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct8x8_sse2, &vp9_short_idct8x8_add_sse2, 0)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 0),
+        make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 1),
+        make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 2),
+        make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 3)));
+#endif
 }  // namespace
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@@ -11,6 +11,7 @@
 #define TEST_I420_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <cstdlib>
+#include <string>

 #include "test/video_source.h"

@@ -34,7 +35,6 @@ class I420VideoSource : public VideoSource {
        height_(0),
        framerate_numerator_(rate_numerator),
        framerate_denominator_(rate_denominator) {
-
    // This initializes raw_sz_, width_, height_ and allocates an img.
    SetSize(width, height);
  }
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -15,10 +15,10 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 extern "C" {
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 }

-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -27,10 +27,10 @@ namespace {

 #ifdef _MSC_VER
 static int round(double x) {
-  if(x < 0)
-    return (int)ceil(x - 0.5);
+  if (x < 0)
+    return static_cast<int>(ceil(x - 0.5));
  else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif

--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -16,7 +16,9 @@ extern "C" {
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"

-typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
+#include "vpx/vpx_integer.h"
+
+typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr,
                          int pred_stride, unsigned char *dst_ptr,
                          int dst_stride);
 namespace {
@@ -34,7 +36,7 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
  virtual void TearDown() { libvpx_test::ClearSystemState(); }

  idct_fn_t UUT;
-  short input[16];
+  int16_t input[16];
  unsigned char output[256];
  unsigned char predict[256];
 };
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -15,8 +15,8 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"
 }
@@ -34,13 +34,17 @@ class IntraPredBase {
  }

 protected:
-  void SetupMacroblock(uint8_t *data, int block_size, int stride,
+  void SetupMacroblock(MACROBLOCKD *mbptr,
+                       MODE_INFO *miptr,
+                       uint8_t *data,
+                       int block_size,
+                       int stride,
                       int num_planes) {
-    memset(&mb_, 0, sizeof(mb_));
-    memset(&mi_, 0, sizeof(mi_));
-    mb_.up_available = 1;
-    mb_.left_available = 1;
-    mb_.mode_info_context = &mi_;
+    mbptr_ = mbptr;
+    miptr_ = miptr;
+    mbptr_->up_available = 1;
+    mbptr_->left_available = 1;
+    mbptr_->mode_info_context = miptr_;
    stride_ = stride;
    block_size_ = block_size;
    num_planes_ = num_planes;
@@ -63,14 +67,14 @@ class IntraPredBase {
  virtual void Predict(MB_PREDICTION_MODE mode) = 0;

  void SetLeftUnavailable() {
-    mb_.left_available = 0;
+    mbptr_->left_available = 0;
    for (int p = 0; p < num_planes_; p++)
      for (int i = -1; i < block_size_; ++i)
        data_ptr_[p][stride_ * i - 1] = 129;
  }

  void SetTopUnavailable() {
-    mb_.up_available = 0;
+    mbptr_->up_available = 0;
    for (int p = 0; p < num_planes_; p++)
      memset(&data_ptr_[p][-1 - stride_], 127, block_size_ + 2);
  }
@@ -96,19 +100,19 @@ class IntraPredBase {
    for (int p = 0; p < num_planes_; p++) {
      // calculate expected DC
      int expected;
-      if (mb_.up_available || mb_.left_available) {
-        int sum = 0, shift = BlockSizeLog2Min1() + mb_.up_available +
-                             mb_.left_available;
-        if (mb_.up_available)
+      if (mbptr_->up_available || mbptr_->left_available) {
+        int sum = 0, shift = BlockSizeLog2Min1() + mbptr_->up_available +
+                             mbptr_->left_available;
+        if (mbptr_->up_available)
          for (int x = 0; x < block_size_; x++)
            sum += data_ptr_[p][x - stride_];
-        if (mb_.left_available)
+        if (mbptr_->left_available)
          for (int y = 0; y < block_size_; y++)
            sum += data_ptr_[p][y * stride_ - 1];
        expected = (sum + (1 << (shift - 1))) >> shift;
-      } else
+      } else {
        expected = 0x80;
-
+      }
      // check that all subsequent lines are equal to the first
      for (int y = 1; y < block_size_; ++y)
        ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],
@@ -209,8 +213,8 @@ class IntraPredBase {
    }
  }

-  MACROBLOCKD mb_;
-  MODE_INFO mi_;
+  MACROBLOCKD *mbptr_;
+  MODE_INFO *miptr_;
  uint8_t *data_ptr_[2];  // in the case of Y, only [0] is used
  int stride_;
  int block_size_;
@@ -228,12 +232,18 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>,
    protected IntraPredBase {
 public:
  static void SetUpTestCase() {
+    mb_ = reinterpret_cast<MACROBLOCKD*>(
+        vpx_memalign(32, sizeof(MACROBLOCKD)));
+    mi_ = reinterpret_cast<MODE_INFO*>(
+        vpx_memalign(32, sizeof(MODE_INFO)));
    data_array_ = reinterpret_cast<uint8_t*>(
        vpx_memalign(kDataAlignment, kDataBufferSize));
  }

  static void TearDownTestCase() {
    vpx_free(data_array_);
+    vpx_free(mi_);
+    vpx_free(mb_);
    data_array_ = NULL;
  }

@@ -250,12 +260,12 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>,

  virtual void SetUp() {
    pred_fn_ = GetParam();
-    SetupMacroblock(data_array_, kBlockSize, kStride, 1);
+    SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 1);
  }

  virtual void Predict(MB_PREDICTION_MODE mode) {
-    mb_.mode_info_context->mbmi.mode = mode;
-    REGISTER_STATE_CHECK(pred_fn_(&mb_,
+    mbptr_->mode_info_context->mbmi.mode = mode;
+    REGISTER_STATE_CHECK(pred_fn_(mbptr_,
                                  data_ptr_[0] - kStride,
                                  data_ptr_[0] - 1, kStride,
                                  data_ptr_[0], kStride));
@@ -263,8 +273,12 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>,

  intra_pred_y_fn_t pred_fn_;
  static uint8_t* data_array_;
+  static MACROBLOCKD * mb_;
+  static MODE_INFO *mi_;
 };

+MACROBLOCKD* IntraPredYTest::mb_ = NULL;
+MODE_INFO* IntraPredYTest::mi_ = NULL;
 uint8_t* IntraPredYTest::data_array_ = NULL;

 TEST_P(IntraPredYTest, IntraPredTests) {
@@ -299,12 +313,18 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>,
    protected IntraPredBase {
 public:
  static void SetUpTestCase() {
+    mb_ = reinterpret_cast<MACROBLOCKD*>(
+        vpx_memalign(32, sizeof(MACROBLOCKD)));
+    mi_ = reinterpret_cast<MODE_INFO*>(
+        vpx_memalign(32, sizeof(MODE_INFO)));
    data_array_ = reinterpret_cast<uint8_t*>(
        vpx_memalign(kDataAlignment, kDataBufferSize));
  }

  static void TearDownTestCase() {
    vpx_free(data_array_);
+    vpx_free(mi_);
+    vpx_free(mb_);
    data_array_ = NULL;
  }

@@ -322,12 +342,12 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>,

  virtual void SetUp() {
    pred_fn_ = GetParam();
-    SetupMacroblock(data_array_, kBlockSize, kStride, 2);
+    SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 2);
  }

  virtual void Predict(MB_PREDICTION_MODE mode) {
-    mb_.mode_info_context->mbmi.uv_mode = mode;
-    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[1] - kStride,
+    mbptr_->mode_info_context->mbmi.uv_mode = mode;
+    pred_fn_(mbptr_, data_ptr_[0] - kStride, data_ptr_[1] - kStride,
             data_ptr_[0] - 1, data_ptr_[1] - 1, kStride,
             data_ptr_[0], data_ptr_[1], kStride);
  }
@@ -340,8 +360,12 @@ class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>,
  // We use 9 lines so we have one line above us for top-prediction.
  // [0] = U, [1] = V
  static uint8_t* data_array_;
+  static MACROBLOCKD* mb_;
+  static MODE_INFO* mi_;
 };

+MACROBLOCKD* IntraPredUVTest::mb_ = NULL;
+MODE_INFO* IntraPredUVTest::mi_ = NULL;
 uint8_t* IntraPredUVTest::data_array_ = NULL;

 TEST_P(IntraPredUVTest, IntraPredTests) {
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -28,7 +28,7 @@ static unsigned int MemGetLe32(const uint8_t *mem) {
 // so that we can do actual file decodes.
 class IVFVideoSource : public CompressedVideoSource {
 public:
-  IVFVideoSource(const std::string &file_name)
+  explicit IVFVideoSource(const std::string &file_name)
      : file_name_(file_name),
        input_file_(NULL),
        compressed_frame_buf_(NULL),
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@@ -132,7 +132,6 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
  // Verify that keyframes match the file keyframes in the file.
  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
       iter != kf_pts_list_.end(); ++iter) {
-
    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
        << *iter;
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef LIBVPX_TEST_MD5_HELPER_H_
-#define LIBVPX_TEST_MD5_HELPER_H_
+#ifndef TEST_MD5_HELPER_H_
+#define TEST_MD5_HELPER_H_

 extern "C" {
 #include "./md5_utils.h"
@@ -25,9 +25,15 @@ class MD5 {

  void Add(const vpx_image_t *img) {
    for (int plane = 0; plane < 3; ++plane) {
-      uint8_t *buf = img->planes[plane];
-      const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
-      const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+      const uint8_t *buf = img->planes[plane];
+      // Calculate the width and height to do the md5 check. For the chroma
+      // plane, we never want to round down and thus skip a pixel so if
+      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
+      // This works only for chroma_shift of 0 and 1.
+      const int h = plane ? (img->d_h + img->y_chroma_shift) >>
+                    img->y_chroma_shift : img->d_h;
+      const int w = plane ? (img->d_w + img->x_chroma_shift) >>
+                    img->x_chroma_shift : img->d_w;

      for (int y = 0; y < h; ++y) {
        MD5Update(&md5_, buf, w);
@@ -61,4 +67,4 @@ class MD5 {

 }  // namespace libvpx_test

-#endif  // LIBVPX_TEST_MD5_HELPER_H_
+#endif  // TEST_MD5_HELPER_H_
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -11,8 +11,8 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 }
@@ -63,7 +63,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
  // Pointers to top-left pixel of block in the input and output images.
  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
  uint8_t *const dst_image_ptr = dst_image + 8;
-  uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  uint8_t *const flimits =
+      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
  (void)vpx_memset(flimits, 255, block_width);

  // Initialize pixels in the input:
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
-#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#ifndef TEST_REGISTER_STATE_CHECK_H_
+#define TEST_REGISTER_STATE_CHECK_H_

 #ifdef _WIN64

@@ -92,4 +92,4 @@ class RegisterStateCheck {};

 #endif  // _WIN64

-#endif  // LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#endif  // TEST_REGISTER_STATE_CHECK_H_
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -16,8 +16,68 @@
 #include "test/video_source.h"
 #include "test/util.h"

+// Enable(1) or Disable(0) writing of the compressed bitstream.
+#define WRITE_COMPRESSED_STREAM 0
+
 namespace {

+#if WRITE_COMPRESSED_STREAM
+static void mem_put_le16(char *const mem, const unsigned int val) {
+  mem[0] = val;
+  mem[1] = val >> 8;
+}
+
+static void mem_put_le32(char *const mem, const unsigned int val) {
+  mem[0] = val;
+  mem[1] = val >> 8;
+  mem[2] = val >> 16;
+  mem[3] = val >> 24;
+}
+
+static void write_ivf_file_header(const vpx_codec_enc_cfg_t *const cfg,
+                                  int frame_cnt, FILE *const outfile) {
+  char header[32];
+
+  header[0] = 'D';
+  header[1] = 'K';
+  header[2] = 'I';
+  header[3] = 'F';
+  mem_put_le16(header + 4,  0);                   /* version */
+  mem_put_le16(header + 6,  32);                  /* headersize */
+  mem_put_le32(header + 8,  0x30395056);          /* fourcc (vp9) */
+  mem_put_le16(header + 12, cfg->g_w);            /* width */
+  mem_put_le16(header + 14, cfg->g_h);            /* height */
+  mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
+  mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
+  mem_put_le32(header + 24, frame_cnt);           /* length */
+  mem_put_le32(header + 28, 0);                   /* unused */
+
+  (void)fwrite(header, 1, 32, outfile);
+}
+
+static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
+  char header[4];
+  mem_put_le32(header, static_cast<unsigned int>(size));
+  (void)fwrite(header, 1, 4, outfile);
+}
+
+static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
+                                   FILE *const outfile) {
+  char header[12];
+  vpx_codec_pts_t pts;
+
+  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+    return;
+
+  pts = pkt->data.frame.pts;
+  mem_put_le32(header, static_cast<unsigned int>(pkt->data.frame.sz));
+  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
+  mem_put_le32(header + 8, pts >> 32);
+
+  (void)fwrite(header, 1, 12, outfile);
+}
+#endif  // WRITE_COMPRESSED_STREAM
+
 const unsigned int kInitialWidth = 320;
 const unsigned int kInitialHeight = 240;

@@ -42,6 +102,8 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
    limit_ = 60;
  }

+  virtual ~ResizingVideoSource() {}
+
 protected:
  virtual void Next() {
    ++frame_;
@@ -56,13 +118,15 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
 protected:
  ResizeTest() : EncoderTest(GET_PARAM(0)) {}

+  virtual ~ResizeTest() {}
+
  struct FrameInfo {
    FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
        : pts(_pts), w(_w), h(_h) {}

    vpx_codec_pts_t pts;
-    unsigned int    w;
-    unsigned int    h;
+    unsigned int w;
+    unsigned int h;
  };

  virtual void SetUp() {
@@ -95,17 +159,47 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
  }
 }

+const unsigned int kStepDownFrame = 3;
+const unsigned int kStepUpFrame = 6;
+
 class ResizeInternalTest : public ResizeTest {
 protected:
+#if WRITE_COMPRESSED_STREAM
+  ResizeInternalTest()
+      : ResizeTest(),
+        frame0_psnr_(0.0),
+        outfile_(NULL),
+        out_frames_(0) {}
+#else
  ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
+#endif
+
+  virtual ~ResizeInternalTest() {}
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+#if WRITE_COMPRESSED_STREAM
+    outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
+#endif
+  }
+
+  virtual void EndPassHook() {
+#if WRITE_COMPRESSED_STREAM
+    if (outfile_) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        write_ivf_file_header(&cfg_, out_frames_, outfile_);
+      fclose(outfile_);
+      outfile_ = NULL;
+    }
+#endif
+  }

  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                  libvpx_test::Encoder *encoder) {
-    if (video->frame() == 3) {
+    if (video->frame() == kStepDownFrame) {
      struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
      encoder->Control(VP8E_SET_SCALEMODE, &mode);
    }
-    if (video->frame() == 6) {
+    if (video->frame() == kStepUpFrame) {
      struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
      encoder->Control(VP8E_SET_SCALEMODE, &mode);
    }
@@ -117,21 +211,46 @@ class ResizeInternalTest : public ResizeTest {
    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0);
  }

+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+#if WRITE_COMPRESSED_STREAM
+    ++out_frames_;
+
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0)
+      write_ivf_file_header(&cfg_, 0, outfile_);
+
+    // Write frame header and data.
+    write_ivf_frame_header(pkt, outfile_);
+    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+#endif
+  }
+
  double frame0_psnr_;
+#if WRITE_COMPRESSED_STREAM
+  FILE *outfile_;
+  unsigned int out_frames_;
+#endif
 };

 TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                       30, 1, 0, 10);
  init_flags_ = VPX_CODEC_USE_PSNR;
+
  // q picked such that initial keyframe on this clip is ~30dB PSNR
  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+
+  // If the number of frames being encoded is smaller than g_lag_in_frames
+  // the encoded frame is unavailable using the current API. Comparing
+  // frames to detect mismatch would then not be possible. Set
+  // g_lag_in_frames = 0 to get around this.
+  cfg_.g_lag_in_frames = 0;
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

  for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin();
       info != frame_info_list_.end(); ++info) {
    const vpx_codec_pts_t pts = info->pts;
-    if (pts >= 3 && pts < 6) {
+    if (pts >= kStepDownFrame && pts < kStepUpFrame) {
      ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
      ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";
    } else {
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -17,7 +17,6 @@ extern "C" {
 #include "./vpx_config.h"
 #if CONFIG_VP8_ENCODER
 #include "./vp8_rtcd.h"
-//#include "vp8/common/blockd.h"
 #endif
 #if CONFIG_VP9_ENCODER
 #include "./vp9_rtcd.h"
--- a/test/set_roi.cc
+++ b/test/set_roi.cc
@@ -17,15 +17,19 @@
 #include <sys/types.h>

 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 extern "C" {
 #include "vp8/encoder/onyx_int.h"
 }

+using libvpx_test::ACMRandom;
+
 namespace {

 TEST(Vp8RoiMapTest, ParameterCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
  int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
  unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
@@ -121,10 +125,10 @@ TEST(Vp8RoiMapTest, ParameterCheck) {
    for (int i = 0; i < 1000; ++i) {
      int rand_deltas[4];
      int deltas_valid;
-      rand_deltas[0] = (rand() % 160) - 80;
-      rand_deltas[1] = (rand() % 160) - 80;
-      rand_deltas[2] = (rand() % 160) - 80;
-      rand_deltas[3] = (rand() % 160) - 80;
+      rand_deltas[0] = rnd(160) - 80;
+      rand_deltas[1] = rnd(160) - 80;
+      rand_deltas[2] = rnd(160) - 80;
+      rand_deltas[3] = rnd(160) - 80;

      deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
                      (abs(rand_deltas[1]) <= 63) &&
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -13,8 +13,8 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vp8/encoder/block.h"
 #include "vpx_mem/vpx_mem.h"
@@ -51,7 +51,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
  bd.predictor = reinterpret_cast<unsigned char*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));

-  for(int i = 0; kSrcStride[i] > 0; ++i) {
+  for (int i = 0; kSrcStride[i] > 0; ++i) {
    // start at block0
    be.src = 0;
    be.base_src = &source;
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -520,3 +520,10 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f  vp90-2-03-size-226x202.webm
 83c6d8f2969b759e10e5c6542baca1265c874c29  vp90-2-03-size-226x224.webm.md5
 fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce  vp90-2-03-size-226x226.webm
 94ad19b8b699cea105e2ff18f0df2afd7242bcf7  vp90-2-03-size-226x226.webm.md5
+b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
+65f45ec9a55537aac76104818278e0978f94a678  vp90-2-03-deltaq.webm.md5
+4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba  vp90-2-05-resize.ivf
+7f6d8879336239a43dbb6c9f13178cb11cf7ed09  vp90-2-05-resize.ivf.md5
+495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
+65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
+
--- a/test/test.mk
+++ b/test/test.mk
@@ -24,7 +24,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc

@@ -629,3 +629,9 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <string>
-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 #include "vpx_ports/x86.h"
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
 #endif

 #if !CONFIG_SHARED
-  /* Shared library builds don't support whitebox tests that exercise internal symbols. */
+// Shared library builds don't support whitebox tests
+// that exercise internal symbols.
+
 #if CONFIG_VP8
  vp8_rtcd();
 #endif
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -159,7 +159,11 @@ const char *kVP9TestVectors[] = {
  "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
  "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
  "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
-  "vp90-2-03-size-226x226.webm"
+  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
+  "vp90-2-05-resize.ivf",
+#if CONFIG_NON420
+  "vp91-2-04-yv444.webm"
+#endif
 };
 #endif

--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -16,16 +16,16 @@
 #include "test/register_state_check.h"

 #include "vpx/vpx_integer.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_VP8_ENCODER
 # include "vp8/common/variance.h"
-# include "vp8_rtcd.h"
+# include "./vp8_rtcd.h"
 #endif
 #if CONFIG_VP9_ENCODER
 # include "vp9/encoder/vp9_variance.h"
-# include "vp9_rtcd.h"
+# include "./vp9_rtcd.h"
 #endif
 }
 #include "test/acm_random.h"
@@ -78,37 +78,9 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
  return sse - (((int64_t) se * se) >> (l2w + l2h));
 }

-static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
-                                            const uint8_t *src,
-                                            const uint8_t *second_pred,
-                                            int l2w, int l2h,
-                                            int xoff, int yoff,
-                                            unsigned int *sse_ptr) {
-  int se = 0;
-  unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x < w; x++) {
-      // bilinear interpolation at a 16th pel step
-      const int a1 = ref[(w + 1) * (y + 0) + x + 0];
-      const int a2 = ref[(w + 1) * (y + 0) + x + 1];
-      const int b1 = ref[(w + 1) * (y + 1) + x + 0];
-      const int b2 = ref[(w + 1) * (y + 1) + x + 1];
-      const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-      const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-      const int r = a + (((b - a) * yoff + 8) >> 4);
-      int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
-      se += diff;
-      sse += diff * diff;
-    }
-  }
-  *sse_ptr = sse;
-  return sse - (((int64_t) se * se) >> (l2w + l2h));
-}
-
 template<typename VarianceFunctionType>
-class VarianceTest :
-    public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
+class VarianceTest
+    : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
 public:
  virtual void SetUp() {
    const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
@@ -190,10 +162,40 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
  EXPECT_EQ(expected, var);
 }

+#if CONFIG_VP9_ENCODER
+
+unsigned int subpel_avg_variance_ref(const uint8_t *ref,
+                                     const uint8_t *src,
+                                     const uint8_t *second_pred,
+                                     int l2w, int l2h,
+                                     int xoff, int yoff,
+                                     unsigned int *sse_ptr) {
+  int se = 0;
+  unsigned int sse = 0;
+  const int w = 1 << l2w, h = 1 << l2h;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // bilinear interpolation at a 16th pel step
+      const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+      const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+      const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+      const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+      const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+      const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+      const int r = a + (((b - a) * yoff + 8) >> 4);
+      int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+      se += diff;
+      sse += diff * diff;
+    }
+  }
+  *sse_ptr = sse;
+  return sse - (((int64_t) se * se) >> (l2w + l2h));
+}
+
 template<typename SubpelVarianceFunctionType>
-class SubpelVarianceTest :
-    public ::testing::TestWithParam<tuple<int, int,
-                                          SubpelVarianceFunctionType> > {
+class SubpelVarianceTest
+    : public ::testing::TestWithParam<tuple<int, int,
+                                            SubpelVarianceFunctionType> > {
 public:
  virtual void SetUp() {
    const tuple<int, int, SubpelVarianceFunctionType>& params =
@@ -280,6 +282,8 @@ void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
  }
 }

+#endif  // CONFIG_VP9_ENCODER
+
 // -----------------------------------------------------------------------------
 // VP8 test cases.

--- a/test/vp8_boolcoder_test.cc
+++ b/test/vp8_boolcoder_test.cc
@@ -8,10 +8,6 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-extern "C" {
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/decoder/dboolhuff.h"
-}

 #include <math.h>
 #include <stddef.h>
@@ -24,6 +20,11 @@ extern "C" {
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_integer.h"

+extern "C" {
+#include "vp8/encoder/boolhuff.h"
+#include "vp8/decoder/dboolhuff.h"
+}
+
 namespace {
 const int num_tests = 10;

@@ -44,7 +45,7 @@ void encrypt_buffer(uint8_t *buffer, int size) {

 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                           uint8_t *output, int count) {
-  int offset = input - (uint8_t *)decrypt_state;
+  int offset = input - reinterpret_cast<uint8_t *>(decrypt_state);
  for (int i = 0; i < count; i++) {
    output[i] = input[i] ^ secret_key[(offset + i) & 15];
  }
@@ -58,10 +59,10 @@ TEST(VP8, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
    for (int method = 0; method <= 7; ++method) {   // we generate various proba
-      const int bits_to_test = 1000;
-      uint8_t probas[bits_to_test];
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];

-      for (int i = 0; i < bits_to_test; ++i) {
+      for (int i = 0; i < kBitsToTest; ++i) {
        const int parity = i & 1;
        probas[i] =
            (method == 0) ? 0 : (method == 1) ? 255 :
@@ -76,14 +77,14 @@ TEST(VP8, TestBitIO) {
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
-        const int buffer_size = 10000;
+        const int kBufferSize = 10000;
        ACMRandom bit_rnd(random_seed);
        BOOL_CODER bw;
-        uint8_t bw_buffer[buffer_size];
-        vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size);
+        uint8_t bw_buffer[kBufferSize];
+        vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
@@ -98,19 +99,20 @@ TEST(VP8, TestBitIO) {
 #if CONFIG_DECRYPT
        encrypt_buffer(bw_buffer, buffer_size);
        vp8dx_start_decode(&br, bw_buffer, buffer_size,
-                           test_decrypt_cb, (void *)bw_buffer);
+                           test_decrypt_cb,
+                           reinterpret_cast<void *>(bw_buffer));
 #else
-        vp8dx_start_decode(&br, bw_buffer, buffer_size, NULL, NULL);
+        vp8dx_start_decode(&br, bw_buffer, kBufferSize, NULL, NULL);
 #endif
        bit_rnd.Reset(random_seed);
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit)
-              << "pos: "<< i << " / " << bits_to_test
+              << "pos: "<< i << " / " << kBitsToTest
              << " bit_method: " << bit_method
              << " method: " << method;
        }
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc
@@ -26,7 +26,8 @@ const uint8_t test_key[16] = {
  0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
 };

-void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) {
+void encrypt_buffer(const uint8_t *src, uint8_t *dst,
+                    int size, int offset = 0) {
  for (int i = 0; i < size; ++i) {
    dst[i] = src[i] ^ test_key[(offset + i) & 15];
  }
@@ -34,10 +35,11 @@ void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0)

 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                     uint8_t *output, int count) {
-  encrypt_buffer(input, output, count, input - (uint8_t *)decrypt_state);
+  encrypt_buffer(input, output, count,
+                 input - reinterpret_cast<uint8_t *>(decrypt_state));
 }

-} // namespace
+}  // namespace

 namespace libvpx_test {

--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -18,7 +18,7 @@


 extern "C" {
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
 }

 #include "test/acm_random.h"
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc
@@ -19,7 +19,7 @@ extern "C" {
 #include "vp9/decoder/vp9_dboolhuff.h"
 }

-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;
@@ -32,10 +32,10 @@ TEST(VP9, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
    for (int method = 0; method <= 7; ++method) {   // we generate various proba
-      const int bits_to_test = 1000;
-      uint8_t probas[bits_to_test];
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];

-      for (int i = 0; i < bits_to_test; ++i) {
+      for (int i = 0; i < kBitsToTest; ++i) {
        const int parity = i & 1;
        probas[i] =
          (method == 0) ? 0 : (method == 1) ? 255 :
@@ -50,14 +50,14 @@ TEST(VP9, TestBitIO) {
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
-        const int buffer_size = 10000;
+        const int kBufferSize = 10000;
        ACMRandom bit_rnd(random_seed);
        vp9_writer bw;
-        uint8_t bw_buffer[buffer_size];
+        uint8_t bw_buffer[kBufferSize];
        vp9_start_encode(&bw, bw_buffer);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
@@ -72,16 +72,16 @@ TEST(VP9, TestBitIO) {
        GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);

        vp9_reader br;
-        vp9_reader_init(&br, bw_buffer, buffer_size);
+        vp9_reader_init(&br, bw_buffer, kBufferSize);
        bit_rnd.Reset(random_seed);
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
          if (bit_method == 2) {
            bit = (i & 1);
          } else if (bit_method == 3) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
-              << "pos: " << i << " / " << bits_to_test
+              << "pos: " << i << " / " << kBitsToTest
              << " bit_method: " << bit_method
              << " method: " << method;
        }
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -39,8 +39,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  // FIXME(rbultje) split in its own file
-  for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZE_TYPES;
-       bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
    const int block_width  = 4 << b_width_log2(bsize);
    const int block_height = 4 << b_height_log2(bsize);
    int16_t *diff = reinterpret_cast<int16_t *>(
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -97,21 +97,91 @@
    %endif
 %endmacro

-%if WIN64
-    %define PIC
-%elifidn __OUTPUT_FORMAT__,macho64
-    %define PIC
-%elif ARCH_X86_64 == 0
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-    %undef PIC
-%elif CONFIG_PIC
-    %define PIC
+; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
+; from original code is added in for 64bit.
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
 %endif
+
+%if ABI_IS_32BIT
+  %if CONFIG_PIC=1
+  %ifidn __OUTPUT_FORMAT__,elf32
+    %define GET_GOT_SAVE_ARG 1
+    %define WRT_PLT wrt ..plt
+    %macro GET_GOT 1
+      extern _GLOBAL_OFFSET_TABLE_
+      push %1
+      call %%get_got
+      %%sub_offset:
+      jmp %%exitGG
+      %%get_got:
+      mov %1, [esp]
+      add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+      ret
+      %%exitGG:
+      %undef GLOBAL
+      %define GLOBAL(x) x + %1 wrt ..gotoff
+      %undef RESTORE_GOT
+      %define RESTORE_GOT pop %1
+    %endmacro
+  %elifidn __OUTPUT_FORMAT__,macho32
+    %define GET_GOT_SAVE_ARG 1
+    %macro GET_GOT 1
+      push %1
+      call %%get_got
+      %%get_got:
+      pop  %1
+      %undef GLOBAL
+      %define GLOBAL(x) x + %1 - %%get_got
+      %undef RESTORE_GOT
+      %define RESTORE_GOT pop %1
+    %endmacro
+  %endif
+  %endif
+
+  %if ARCH_X86_64 == 0
+    %undef PIC
+  %endif
+
+%else
+  %macro GET_GOT 1
+  %endmacro
+  %define GLOBAL(x) rel x
+  %define WRT_PLT wrt ..plt
+
+  %if WIN64
+    %define PIC
+  %elifidn __OUTPUT_FORMAT__,macho64
+    %define PIC
+  %elif CONFIG_PIC
+    %define PIC
+  %endif
+%endif
+
+%ifnmacro GET_GOT
+    %macro GET_GOT 1
+    %endmacro
+    %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT
+%define RESTORE_GOT
+%endif
+%ifndef WRT_PLT
+%define WRT_PLT
+%endif
+
 %ifdef PIC
    default rel
 %endif
+; Done with PIC macros

 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
 %ifndef __NASM_VER__
@@ -528,6 +598,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
        global %1:function hidden
    %elifidn __OUTPUT_FORMAT__,elf64
        global %1:function hidden
+    %elifidn __OUTPUT_FORMAT__,macho32
+        global %1:private_extern
+    %elifidn __OUTPUT_FORMAT__,macho64
+        global %1:private_extern
    %else
        global %1
    %endif
--- a/vp8/common/filter.c
+++ b/vp8/common/filter.c
@@ -9,9 +9,7 @@
 */


-#include <stdlib.h>
 #include "filter.h"
-#include "vpx_ports/mem.h"

 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
--- a/vp8/common/filter.h
+++ b/vp8/common/filter.h
@@ -12,11 +12,13 @@
 #ifndef FILTER_H
 #define FILTER_H

+#include "vpx_ports/mem.h"
+
 #define BLOCK_HEIGHT_WIDTH 4
 #define VP8_FILTER_WEIGHT 128
 #define VP8_FILTER_SHIFT  7

-extern const short vp8_bilinear_filters[8][2];
-extern const short vp8_sub_pel_filters[8][6];
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
+extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);

 #endif
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -124,7 +124,7 @@ static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
        b += 16;
    }

-    return (cur_mb->bmi + b - 4)->mv.as_int;
+    return (cur_mb->bmi + (b - 4))->mv.as_int;
 }
 static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
 {
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -41,7 +41,8 @@ extern "C"
    {
        USAGE_STREAM_FROM_SERVER    = 0x0,
        USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
-        USAGE_CONSTRAINED_QUALITY   = 0x2
+        USAGE_CONSTRAINED_QUALITY   = 0x2,
+        USAGE_CONSTANT_QUALITY      = 0x3
    } END_USAGE;


--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -138,14 +138,10 @@ void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre,
    {
        for (r = 0; r < 4; r++)
        {
-#if !(CONFIG_FAST_UNALIGNED)
            pred_ptr[0]  = ptr[0];
            pred_ptr[1]  = ptr[1];
            pred_ptr[2]  = ptr[2];
            pred_ptr[3]  = ptr[3];
-#else
-            *(uint32_t *)pred_ptr = *(uint32_t *)ptr ;
-#endif
            pred_ptr     += pitch;
            ptr         += pre_stride;
        }
@@ -196,16 +192,12 @@ static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stri
    {
        for (r = 0; r < 4; r++)
        {
-#if !(CONFIG_FAST_UNALIGNED)
          dst[0]  = ptr[0];
          dst[1]  = ptr[1];
          dst[2]  = ptr[2];
          dst[3]  = ptr[3];
-#else
-            *(uint32_t *)dst = *(uint32_t *)ptr ;
-#endif
-            dst     += dst_stride;
-            ptr     += pre_stride;
+          dst     += dst_stride;
+          ptr     += pre_stride;
        }
    }
 }
@@ -270,7 +262,7 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
                   + x->block[yoffset+4].bmi.mv.as_mv.row
                   + x->block[yoffset+5].bmi.mv.as_mv.row;

-            temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);

            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;

@@ -279,7 +271,7 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
                   + x->block[yoffset+4].bmi.mv.as_mv.col
                   + x->block[yoffset+5].bmi.mv.as_mv.col;

-            temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);

            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;

@@ -558,7 +550,7 @@ void build_4x4uvmvs(MACROBLOCKD *x)
                 + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row
                 + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;

-            temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);

            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;

@@ -567,7 +559,7 @@ void build_4x4uvmvs(MACROBLOCKD *x)
                 + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col
                 + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;

-            temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);

            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;

--- a/vp8/common/x86/filter_x86.c
+++ b/vp8/common/x86/filter_x86.c
@@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"

 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
 {
--- a/vp8/common/x86/filter_x86.h
+++ b/vp8/common/x86/filter_x86.h
@@ -11,9 +11,15 @@
 #ifndef FILTER_X86_H
 #define FILTER_X86_H

+#include "vpx_ports/mem.h"
+
 /* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
 * duplicated values */
-extern const short vp8_bilinear_filters_x86_4[8][8];  /* duplicated 4x */
-extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */
+
+/* duplicated 4x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
+
+/* duplicated 8x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);

 #endif /* FILTER_X86_H */
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -611,16 +611,12 @@ void vp8_sixtap_predict4x4_ssse3

          for (r = 0; r < 4; r++)
          {
-  #if !(CONFIG_FAST_UNALIGNED)
            dst_ptr[0]  = src_ptr[0];
            dst_ptr[1]  = src_ptr[1];
            dst_ptr[2]  = src_ptr[2];
            dst_ptr[3]  = src_ptr[3];
-  #else
-              *(uint32_t *)dst_ptr = *(uint32_t *)src_ptr ;
-  #endif
-              dst_ptr     += dst_pitch;
-              src_ptr     += src_pixels_per_line;
+            dst_ptr     += dst_pitch;
+            src_ptr     += src_pixels_per_line;
          }
      }
  }
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -110,8 +110,8 @@ static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc)

 static void read_mv(vp8_reader *r, MV *mv, const MV_CONTEXT *mvc)
 {
-    mv->row = (short)(read_mvcomponent(r,   mvc) << 1);
-    mv->col = (short)(read_mvcomponent(r, ++mvc) << 1);
+    mv->row = (short)(read_mvcomponent(r,   mvc) * 2);
+    mv->col = (short)(read_mvcomponent(r, ++mvc) * 2);
 }


@@ -292,9 +292,9 @@ static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
                blockmv.as_int = 0;
                if( vp8_read(bc, prob[2]) )
                {
-                    blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) << 1;
+                    blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) * 2;
                    blockmv.as_mv.row += best_mv.as_mv.row;
-                    blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) << 1;
+                    blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) * 2;
                    blockmv.as_mv.col += best_mv.as_mv.col;
                }
            }
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -576,7 +576,7 @@ static void decode_mb_rows(VP8D_COMP *pbi)

        xd->left_available = 0;

-        xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+        xd->mb_to_top_edge = -((mb_row * 16) << 3);
        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;

        xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
@@ -1026,7 +1026,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        const unsigned char *clear = data;
        if (pbi->decrypt_cb)
        {
-            int n = data_end - data;
+            int n = (int)(data_end - data);
            if (n > 10) n = 10;
            pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
            clear = clear_buffer;
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -432,7 +432,7 @@ static void write_mv_ref
    assert(NEARESTMV <= m  &&  m <= SPLITMV);
 #endif
    vp8_write_token(w, vp8_mv_ref_tree, p,
-                    vp8_mv_ref_encoding_array - NEARESTMV + m);
+                    vp8_mv_ref_encoding_array + (m - NEARESTMV));
 }

 static void write_sub_mv_ref
@@ -444,7 +444,7 @@ static void write_sub_mv_ref
    assert(LEFT4X4 <= m  &&  m <= NEW4X4);
 #endif
    vp8_write_token(w, vp8_sub_mv_ref_tree, p,
-                    vp8_sub_mv_ref_encoding_array - LEFT4X4 + m);
+                    vp8_sub_mv_ref_encoding_array + (m - LEFT4X4));
 }

 static void write_mv
@@ -577,7 +577,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
             */
            xd->mb_to_left_edge = -((mb_col * 16) << 3);
            xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-            xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+            xd->mb_to_top_edge = -((mb_row * 16) << 3);
            xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;

 #ifdef VP8_ENTROPY_STATS
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -20,10 +20,10 @@ void vp8_short_fdct4x4_c(short *input, short *output, int pitch)

    for (i = 0; i < 4; i++)
    {
-        a1 = ((ip[0] + ip[3])<<3);
-        b1 = ((ip[1] + ip[2])<<3);
-        c1 = ((ip[1] - ip[2])<<3);
-        d1 = ((ip[0] - ip[3])<<3);
+        a1 = ((ip[0] + ip[3]) * 8);
+        b1 = ((ip[1] + ip[2]) * 8);
+        c1 = ((ip[1] - ip[2]) * 8);
+        d1 = ((ip[0] - ip[3]) * 8);

        op[0] = a1 + b1;
        op[2] = a1 - b1;
@@ -72,10 +72,10 @@ void vp8_short_walsh4x4_c(short *input, short *output, int pitch)

    for (i = 0; i < 4; i++)
    {
-        a1 = ((ip[0] + ip[2])<<2);
-        d1 = ((ip[1] + ip[3])<<2);
-        c1 = ((ip[1] - ip[3])<<2);
-        b1 = ((ip[0] - ip[2])<<2);
+        a1 = ((ip[0] + ip[2]) * 4);
+        d1 = ((ip[1] + ip[3]) * 4);
+        c1 = ((ip[1] - ip[3]) * 4);
+        b1 = ((ip[0] - ip[2]) * 4);

        op[0] = a1 + d1 + (a1!=0);
        op[1] = b1 + c1;
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -711,8 +711,8 @@ skip_motion_search:
                        neutral_count++;
                    }

-                    d->bmi.mv.as_mv.row <<= 3;
-                    d->bmi.mv.as_mv.col <<= 3;
+                    d->bmi.mv.as_mv.row *= 8;
+                    d->bmi.mv.as_mv.col *= 8;
                    this_error = motion_error;
                    vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv);
                    vp8_encode_inter16x16y(x);
@@ -909,13 +909,16 @@ extern const int vp8_bits_per_mb[2][QINDEX_RANGE];

 static double bitcost( double prob )
 {
-    return -(log( prob ) / log( 2.0 ));
+  if (prob > 0.000122)
+    return -log(prob) / log(2.0);
+  else
+    return 13.0;
 }
 static int64_t estimate_modemvcost(VP8_COMP *cpi,
                                     FIRSTPASS_STATS * fpstats)
 {
    int mv_cost;
-    int mode_cost;
+    int64_t mode_cost;

    double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
    double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
@@ -937,10 +940,9 @@ static int64_t estimate_modemvcost(VP8_COMP *cpi,
    /* Crude estimate of overhead cost from modes
     * << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb
     */
-    mode_cost =
-        (int)( ( ((av_pct_inter - av_pct_motion) * zz_cost) +
-                 (av_pct_motion * motion_cost) +
-                 (av_intra * intra_cost) ) * cpi->common.MBs ) << 9;
+    mode_cost =((((av_pct_inter - av_pct_motion) * zz_cost) +
+                (av_pct_motion * motion_cost) +
+                (av_intra * intra_cost)) * cpi->common.MBs) * 512;

    return mv_cost + mode_cost;
 }
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -210,7 +210,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
    unsigned char *z = (*(b->base_src) + b->src);

    int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1;
-    int br = bestmv->as_mv.row << 2, bc = bestmv->as_mv.col << 2;
+    int br = bestmv->as_mv.row * 4, bc = bestmv->as_mv.col * 4;
    int tr = br, tc = bc;
    unsigned int besterr;
    unsigned int left, right, up, down, diag;
@@ -220,10 +220,14 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
    unsigned int quarteriters = 4;
    int thismse;

-    int minc = MAX(x->mv_col_min << 2, (ref_mv->as_mv.col >> 1) - ((1 << mvlong_width) - 1));
-    int maxc = MIN(x->mv_col_max << 2, (ref_mv->as_mv.col >> 1) + ((1 << mvlong_width) - 1));
-    int minr = MAX(x->mv_row_min << 2, (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
-    int maxr = MIN(x->mv_row_max << 2, (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));
+    int minc = MAX(x->mv_col_min * 4,
+                   (ref_mv->as_mv.col >> 1) - ((1 << mvlong_width) - 1));
+    int maxc = MIN(x->mv_col_max * 4,
+                   (ref_mv->as_mv.col >> 1) + ((1 << mvlong_width) - 1));
+    int minr = MAX(x->mv_row_min * 4,
+                   (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
+    int maxr = MIN(x->mv_row_max * 4,
+                   (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));

    int y_stride;
    int offset;
@@ -254,8 +258,8 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
    offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;

    /* central mv */
-    bestmv->as_mv.row <<= 3;
-    bestmv->as_mv.col <<= 3;
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;

    /* calculate central point error */
    besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
@@ -337,8 +341,8 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
        tc = bc;
    }

-    bestmv->as_mv.row = br << 1;
-    bestmv->as_mv.col = bc << 1;
+    bestmv->as_mv.row = br * 2;
+    bestmv->as_mv.col = bc * 2;

    if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL<<3)) ||
        (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL<<3)))
@@ -699,8 +703,8 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 #endif

    /* central mv */
-    bestmv->as_mv.row <<= 3;
-    bestmv->as_mv.col <<= 3;
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
    startmv = *bestmv;

    /* calculate central point error */
@@ -1315,8 +1319,8 @@ int vp8_diamond_search_sadx4
            (*num00)++;
    }

-    this_mv.as_mv.row = best_mv->as_mv.row << 3;
-    this_mv.as_mv.col = best_mv->as_mv.col << 3;
+    this_mv.as_mv.row = best_mv->as_mv.row * 8;
+    this_mv.as_mv.col = best_mv->as_mv.col * 8;

    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
@@ -1709,8 +1713,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
        }
    }

-    this_mv.as_mv.row = best_mv->as_mv.row << 3;
-    this_mv.as_mv.col = best_mv->as_mv.col << 3;
+    this_mv.as_mv.row = best_mv->as_mv.row * 8;
+    this_mv.as_mv.col = best_mv->as_mv.col * 8;

    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
@@ -1905,8 +1909,8 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
        }
    }

-    this_mv.as_mv.row = ref_mv->as_mv.row << 3;
-    this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+    this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+    this_mv.as_mv.col = ref_mv->as_mv.col * 8;

    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -313,7 +313,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    /* Get baseline error score */

    /* Copy the unfiltered / processed recon buffer to the new buffer */
-    vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+    vpx_yv12_copy_y(saved_frame, cm->frame_to_show);

    vp8cx_set_alt_lf_level(cpi, filt_mid);
    vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
@@ -339,7 +339,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
            if(ss_err[filt_low] == 0)
            {
                /* Get Low filter error score */
-                vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
                vp8cx_set_alt_lf_level(cpi, filt_low);
                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);

@@ -367,7 +367,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
        {
            if(ss_err[filt_high] == 0)
            {
-                vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
                vp8cx_set_alt_lf_level(cpi, filt_high);
                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);

--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -935,7 +935,7 @@ int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4])
    assert(NEARESTMV <= m  &&  m <= SPLITMV);
    vp8_mv_ref_probs(p, near_mv_ref_ct);
    return vp8_cost_token(vp8_mv_ref_tree, p,
-                          vp8_mv_ref_encoding_array - NEARESTMV + m);
+                          vp8_mv_ref_encoding_array + (m - NEARESTMV));
 }

 void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv)
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -153,7 +153,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 #else
    RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
 #endif
-    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
+    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
    RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
    RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
@@ -204,7 +204,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
    RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
    RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
    RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
-    if(finalize && cfg->rc_end_usage == VPX_CQ)
+    if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
        RANGE_CHECK(vp8_cfg, cq_level,
                    cfg->rc_min_quantizer, cfg->rc_max_quantizer);

@@ -327,17 +327,14 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    oxcf->resample_up_water_mark   = cfg.rc_resize_up_thresh;
    oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;

-    if (cfg.rc_end_usage == VPX_VBR)
-    {
-        oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
-    }
-    else if (cfg.rc_end_usage == VPX_CBR)
-    {
-        oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
-    }
-    else if (cfg.rc_end_usage == VPX_CQ)
-    {
-        oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+    if (cfg.rc_end_usage == VPX_VBR) {
+      oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+    } else if (cfg.rc_end_usage == VPX_CBR) {
+      oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+    } else if (cfg.rc_end_usage == VPX_CQ) {
+      oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+    } else if (cfg.rc_end_usage == VPX_Q) {
+      oxcf->end_usage = USAGE_CONSTANT_QUALITY;
    }

    oxcf->target_bandwidth         = cfg.rc_target_bitrate;
@@ -1272,7 +1269,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
        1,                  /* g_delete_first_pass_file */
        "vp8.fpf"           /* first pass filename */
 #endif
-
+        VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
        1,                  /* ts_number_layers */
        {0},                /* ts_target_bitrate */
        {0},                /* ts_rate_decimator */
--- a/vp9/common/arm/neon/vp9_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_avg_neon.asm
@@ -0,0 +1,116 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_convolve_avg_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_convolve_avg_neon| PROC
+    push                {r4-r6, lr}
+    ldrd                r4, r5, [sp, #32]
+    mov                 r6, r2
+
+    cmp                 r4, #32
+    bgt                 avg64
+    beq                 avg32
+    cmp                 r4, #8
+    bgt                 avg16
+    beq                 avg8
+    b                   avg4
+
+avg64
+    sub                 lr, r1, #32
+    sub                 r4, r3, #32
+avg64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    pld                 [r2, r3]
+    vld1.8              {q8-q9},   [r6@128]!
+    vld1.8              {q10-q11}, [r6@128], r4
+    vrhadd.u8           q0, q0, q8
+    vrhadd.u8           q1, q1, q9
+    vrhadd.u8           q2, q2, q10
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r4
+    subs                r5, r5, #1
+    bgt                 avg64_h
+    pop                 {r4-r6, pc}
+
+avg32
+    vld1.8              {q0-q1}, [r0], r1
+    vld1.8              {q2-q3}, [r0], r1
+    vld1.8              {q8-q9},   [r6@128], r3
+    vld1.8              {q10-q11}, [r6@128], r3
+    pld                 [r0]
+    vrhadd.u8           q0, q0, q8
+    pld                 [r0, r1]
+    vrhadd.u8           q1, q1, q9
+    pld                 [r6]
+    vrhadd.u8           q2, q2, q10
+    pld                 [r6, r3]
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 avg32
+    pop                 {r4-r6, pc}
+
+avg16
+    vld1.8              {q0}, [r0], r1
+    vld1.8              {q1}, [r0], r1
+    vld1.8              {q2}, [r6@128], r3
+    vld1.8              {q3}, [r6@128], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q2
+    pld                 [r6]
+    pld                 [r6, r3]
+    vrhadd.u8           q1, q1, q3
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 avg16
+    pop                 {r4-r6, pc}
+
+avg8
+    vld1.8              {d0}, [r0], r1
+    vld1.8              {d1}, [r0], r1
+    vld1.8              {d2}, [r6@64], r3
+    vld1.8              {d3}, [r6@64], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q1
+    pld                 [r6]
+    pld                 [r6, r3]
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d1}, [r2@64], r3
+    subs                r5, r5, #2
+    bgt                 avg8
+    pop                 {r4-r6, pc}
+
+avg4
+    vld1.32             {d0[0]}, [r0], r1
+    vld1.32             {d0[1]}, [r0], r1
+    vld1.32             {d2[0]}, [r6@32], r3
+    vld1.32             {d2[1]}, [r6@32], r3
+    vrhadd.u8           d0, d0, d2
+    vst1.32             {d0[0]}, [r2@32], r3
+    vst1.32             {d0[1]}, [r2@32], r3
+    subs                r5, r5, #2
+    bgt                 avg4
+    pop                 {r4-r6, pc}
+    ENDP
+
+    END
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -66,46 +66,64 @@

    vld1.s16        {q0}, [r5]              ; filter_x

-    add             r8, r1, r1, lsl #1      ; src_stride * 3
-    add             r8, r8, #4              ; src_stride * 3 + 4
-    rsb             r8, r8, #0              ; reset for src
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4

-    add             r4, r3, r3, lsl #1      ; dst_stride * 3
-    sub             r4, r4, #4              ; dst_stride * 3 - 4
-    rsb             r4, r4, #0              ; reset for dst
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4

-    sub             r9, r1, #8              ; post increment for src load
-
-    rsb             r1, r6, r1, lsl #2      ; reset src for outer loop
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop

    mov             r10, r6                 ; w loop counter

-loop_horiz
-    vld1.8          {d24}, [r0]!
-    vld3.u8         {d28[0], d29[0], d30[0]}, [r0], r9
-
-    vld1.8          {d25}, [r0]!
-    vld3.u8         {d28[1], d29[1], d30[1]}, [r0], r9
-
-    vld1.8          {d26}, [r0]!
-    vld3.u8         {d28[2], d29[2], d30[2]}, [r0], r9
-
-    vld1.8          {d27}, [r0]!
-    vld3.u8         {d28[3], d29[3], d30[3]}, [r0], r8
+loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8

    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

-    ; extract to s16
+    pld             [r0, r1, lsl #2]
+
    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27
-    vtrn.32         d28, d29 ; only the first half is populated
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+loop_horiz
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
    vmovl.u8        q12, d28
-    vmovl.u8        q13, d30
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]

    ; slightly out of order load to match the existing data
    vld1.u32        {d6[0]}, [r2], r3
@@ -116,10 +134,12 @@ loop_horiz
    sub             r2, r2, r3, lsl #2      ; reset for store

    ; src[] * filter_x
-    MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
-    MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
-    MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
-    MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]

    ; += 64 >> 7
    vqrshrun.s32    d2, q1, #7
@@ -139,20 +159,25 @@ loop_horiz
    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

-    vst1.u32        {d2[0]}, [r2], r3
-    vst1.u32        {d3[0]}, [r2], r3
-    vst1.u32        {d2[1]}, [r2], r3
-    vst1.u32        {d3[1]}, [r2], r4
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
    bgt             loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r1              ; src += src_stride * 4 - w
+    add             r0, r0, r9              ; src += src_stride * 4 - w
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
-    bgt loop_horiz
+    bgt loop_horiz_v

    pop             {r4-r10, pc}

@@ -163,66 +188,77 @@ loop_horiz
    cmp             r12, #16
    bne             vp9_convolve8_avg_vert_c

-    push            {r4-r10, lr}
+    push            {r4-r8, lr}

    ; adjust for taps
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

-    ldr             r7, [sp, #40]           ; filter_y
-    ldr             r8, [sp, #48]           ; w
-    ldr             r9, [sp, #52]           ; h
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h

-    vld1.s16        {q0}, [r7]              ; filter_y
+    vld1.s16        {q0}, [r4]              ; filter_y

-    mov             r5, r1, lsl #1          ; src_stride * 2
-    add             r5, r5, r1, lsl #3      ; src_stride * 10
-    sub             r5, r5, #4              ; src_stride * 10 + 4
-    rsb             r5, r5, #0              ; reset for src
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1

-    add             r6, r3, r3, lsl #1      ; dst_stride * 3
-    sub             r6, r6, #4              ; dst_stride * 3 - 4
-    rsb             r6, r6, #0              ; reset for dst
+loop_vert_h
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 ; h loop counter

-    rsb             r7, r8, r1, lsl #2      ; reset src for outer loop
-    rsb             r12, r8, r3, lsl #2     ; reset dst for outer loop
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1

-    mov             r10, r8                 ; w loop counter
-
-loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d16[0]}, [r0], r1
-    vld1.u32        {d16[1]}, [r0], r1
-    vld1.u32        {d18[0]}, [r0], r1
-    vld1.u32        {d18[1]}, [r0], r1
-    vld1.u32        {d20[0]}, [r0], r1
-    vld1.u32        {d20[1]}, [r0], r1
-    vld1.u32        {d22[0]}, [r0], r1
-    vld1.u32        {d22[1]}, [r0], r1
-    vld1.u32        {d24[0]}, [r0], r1
-    vld1.u32        {d24[1]}, [r0], r1
-    vld1.u32        {d26[0]}, [r0], r5
-
-    ; extract to s16
    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22
+
+loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

-    vld1.u32        {d6[0]}, [r2], r3
-    vld1.u32        {d6[1]}, [r2], r3
-    vld1.u32        {d7[0]}, [r2], r3
-    vld1.u32        {d7[1]}, [r2], r3
+    vld1.u32        {d6[0]}, [r5@32], r3
+    vld1.u32        {d6[1]}, [r8@32], r3
+    vld1.u32        {d7[0]}, [r5@32], r3
+    vld1.u32        {d7[1]}, [r8@32], r3

-    sub             r2, r2, r3, lsl #2      ; reset for store
+    pld             [r7]
+    pld             [r4]

    ; src[] * filter_y
-    MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
-    MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
-    MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
-    MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; += 64 >> 7
    vqrshrun.s32    d2, q1, #7
@@ -237,22 +273,30 @@ loop_vert
    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

-    vst1.u32        {d2[0]}, [r2], r3
-    vst1.u32        {d2[1]}, [r2], r3
-    vst1.u32        {d3[0]}, [r2], r3
-    vst1.u32        {d3[1]}, [r2], r6
+    sub             r5, r5, r3, lsl #1      ; reset for store
+    sub             r8, r8, r3, lsl #1

-    subs            r8, r8, #4              ; w -= 4
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
    bgt             loop_vert

    ; outer loop
-    mov             r8, r10                 ; restore w counter
-    add             r0, r0, r7              ; src += 4 * src_stride - w
-    add             r2, r2, r12             ; dst += 4 * dst_stride - w
-    subs            r9, r9, #4              ; h -= 4
-    bgt             loop_vert
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_vert_h

-    pop             {r4-r10, pc}
+    pop             {r4-r8, pc}

    ENDP
    END
--- a/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/vp9/common/arm/neon/vp9_convolve8_neon.asm
@@ -66,52 +66,72 @@

    vld1.s16        {q0}, [r5]              ; filter_x

-    add             r8, r1, r1, lsl #1      ; src_stride * 3
-    add             r8, r8, #4              ; src_stride * 3 + 4
-    rsb             r8, r8, #0              ; reset for src
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4

-    add             r4, r3, r3, lsl #1      ; dst_stride * 3
-    sub             r4, r4, #4              ; dst_stride * 3 - 4
-    rsb             r4, r4, #0              ; reset for dst
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4

-    sub             r9, r1, #8              ; post increment for src load
-
-    rsb             r1, r6, r1, lsl #2      ; reset src for outer loop
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop

    mov             r10, r6                 ; w loop counter

-loop_horiz
-    vld1.8          {d24}, [r0]!
-    vld3.u8         {d28[0], d29[0], d30[0]}, [r0], r9
-
-    vld1.8          {d25}, [r0]!
-    vld3.u8         {d28[1], d29[1], d30[1]}, [r0], r9
-
-    vld1.8          {d26}, [r0]!
-    vld3.u8         {d28[2], d29[2], d30[2]}, [r0], r9
-
-    vld1.8          {d27}, [r0]!
-    vld3.u8         {d28[3], d29[3], d30[3]}, [r0], r8
+loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8

    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

-    ; extract to s16
+    pld             [r0, r1, lsl #2]
+
    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27
-    vtrn.32         d28, d29 ; only the first half is populated
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+loop_horiz
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
    vmovl.u8        q12, d28
-    vmovl.u8        q13, d30
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]

    ; src[] * filter_x
-    MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
-    MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
-    MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
-    MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]

    ; += 64 >> 7
    vqrshrun.s32    d2, q1, #7
@@ -128,20 +148,25 @@ loop_horiz
    vtrn.32         d2, d3
    vtrn.8          d2, d3

-    vst1.u32        {d2[0]}, [r2], r3
-    vst1.u32        {d3[0]}, [r2], r3
-    vst1.u32        {d2[1]}, [r2], r3
-    vst1.u32        {d3[1]}, [r2], r4
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
    bgt             loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r1              ; src += src_stride * 4 - w
+    add             r0, r0, r9              ; src += src_stride * 4 - w
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
-    bgt loop_horiz
+    bgt loop_horiz_v

    pop             {r4-r10, pc}

@@ -152,59 +177,72 @@ loop_horiz
    cmp             r12, #16
    bne             vp9_convolve8_vert_c

-    push            {r4-r10, lr}
+    push            {r4-r8, lr}

    ; adjust for taps
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

-    ldr             r7, [sp, #40]           ; filter_y
-    ldr             r8, [sp, #48]           ; w
-    ldr             r9, [sp, #52]           ; h
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h

-    vld1.s16        {q0}, [r7]              ; filter_y
+    vld1.s16        {q0}, [r4]              ; filter_y

-    mov             r5, r1, lsl #1          ; src_stride * 2
-    add             r5, r5, r1, lsl #3      ; src_stride * 10
-    sub             r5, r5, #4              ; src_stride * 10 + 4
-    rsb             r5, r5, #0              ; reset for src
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1

-    add             r6, r3, r3, lsl #1      ; dst_stride * 3
-    sub             r6, r6, #4              ; dst_stride * 3 - 4
-    rsb             r6, r6, #0              ; reset for dst
+loop_vert_h
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 ; h loop counter

-    rsb             r7, r8, r1, lsl #2      ; reset src for outer loop
-    rsb             r12, r8, r3, lsl #2     ; reset dst for outer loop
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1

-    mov             r10, r8                 ; w loop counter
-
-loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d16[0]}, [r0], r1
-    vld1.u32        {d16[1]}, [r0], r1
-    vld1.u32        {d18[0]}, [r0], r1
-    vld1.u32        {d18[1]}, [r0], r1
-    vld1.u32        {d20[0]}, [r0], r1
-    vld1.u32        {d20[1]}, [r0], r1
-    vld1.u32        {d22[0]}, [r0], r1
-    vld1.u32        {d22[1]}, [r0], r1
-    vld1.u32        {d24[0]}, [r0], r1
-    vld1.u32        {d24[1]}, [r0], r1
-    vld1.u32        {d26[0]}, [r0], r5
-
-    ; extract to s16
    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22
+
+loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

+    pld             [r5]
+    pld             [r8]
+
    ; src[] * filter_y
-    MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
-    MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
-    MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
-    MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r7]
+    pld             [r4]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; += 64 >> 7
    vqrshrun.s32    d2, q1, #7
@@ -216,22 +254,27 @@ loop_vert
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

-    vst1.u32        {d2[0]}, [r2], r3
-    vst1.u32        {d2[1]}, [r2], r3
-    vst1.u32        {d3[0]}, [r2], r3
-    vst1.u32        {d3[1]}, [r2], r6
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3

-    subs            r8, r8, #4              ; w -= 4
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
    bgt             loop_vert

    ; outer loop
-    mov             r8, r10                 ; restore w counter
-    add             r0, r0, r7              ; src += 4 * src_stride - w
-    add             r2, r2, r12             ; dst += 4 * dst_stride - w
-    subs            r9, r9, #4              ; h -= 4
-    bgt             loop_vert
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_vert_h

-    pop             {r4-r10, pc}
+    pop             {r4-r8, pc}

    ENDP
    END
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -10,6 +10,7 @@

 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
+#include "vpx_ports/mem.h"

 void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
@@ -19,7 +20,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
   */
-  uint8_t temp[64 * 72];
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);

  // Account for the vertical phase needing 3 lines prior and 4 lines post
  int intermediate_height = h + 7;
@@ -53,7 +54,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
-  uint8_t temp[64 * 72];
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
  int intermediate_height = h + 7;

  if (x_step_q4 != 16 || y_step_q4 != 16)
--- a/vp9/common/arm/neon/vp9_copy_neon.asm
+++ b/vp9/common/arm/neon/vp9_copy_neon.asm
@@ -0,0 +1,84 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_convolve_copy_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_convolve_copy_neon| PROC
+    push                {r4-r5, lr}
+    ldrd                r4, r5, [sp, #28]
+
+    cmp                 r4, #32
+    bgt                 copy64
+    beq                 copy32
+    cmp                 r4, #8
+    bgt                 copy16
+    beq                 copy8
+    b                   copy4
+
+copy64
+    sub                 lr, r1, #32
+    sub                 r3, r3, #32
+copy64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #1
+    bgt                 copy64_h
+    pop                 {r4-r5, pc}
+
+copy32
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q2-q3}, [r0], r1
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 copy32
+    pop                 {r4-r5, pc}
+
+copy16
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q1}, [r0], r1
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2
+    bgt                 copy16
+    pop                 {r4-r5, pc}
+
+copy8
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d2}, [r0], r1
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d2}, [r2@64], r3
+    subs                r5, r5, #2
+    bgt                 copy8
+    pop                 {r4-r5, pc}
+
+copy4
+    ldr                 r12, [r0], r1
+    str                 r12, [r2], r3
+    subs                r5, r5, #1
+    bgt                 copy4
+    pop                 {r4-r5, pc}
+    ENDP
+
+    END
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -0,0 +1,172 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
+                                               int16_t *output,
+                                               int output_stride);
+extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
+                                               int16_t *output,
+                                               int16_t *pass1Output,
+                                               int16_t skip_adding,
+                                               uint8_t *dest,
+                                               int dest_stride);
+extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input,
+                                               int16_t *output,
+                                               int output_stride);
+extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,
+                                               int16_t *output,
+                                               int16_t *pass1Output,
+                                               int16_t skip_adding,
+                                               uint8_t *dest,
+                                               int dest_stride);
+
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+
+void vp9_short_idct16x16_add_neon(int16_t *input,
+                                  uint8_t *dest, int dest_stride) {
+  int64_t store_reg[8];
+  int16_t pass1_output[16*16] = {0};
+  int16_t row_idct_output[16*16] = {0};
+
+  // save d8-d15 register values.
+  vp9_push_neon(store_reg);
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vp9_short_idct16x16_add_neon_pass2(input+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     0,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the lower 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     0,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,
+                                     dest+8,
+                                     dest_stride);
+
+  // restore d8-d15 register values.
+  vp9_pop_neon(store_reg);
+
+  return;
+}
+
+void vp9_short_idct16x16_10_add_neon(int16_t *input,
+                                  uint8_t *dest, int dest_stride) {
+  int64_t store_reg[8];
+  int16_t pass1_output[16*16] = {0};
+  int16_t row_idct_output[16*16] = {0};
+
+  // save d8-d15 register values.
+  vp9_push_neon(store_reg);
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vp9_short_idct16x16_10_add_neon_pass2(input+1,
+                                        row_idct_output,
+                                        pass1_output,
+                                        0,
+                                        dest,
+                                        dest_stride);
+
+  /* Skip Parallel idct on the lower 8 rows as they are all 0s */
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
+  // stage 6 result in pass1_output.
+  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,
+                                     dest+8,
+                                     dest_stride);
+
+  // restore d8-d15 register values.
+  vp9_pop_neon(store_reg);
+
+  return;
+}
--- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
@@ -361,8 +361,6 @@ v_end

    vand        d16, d20, d19              ; flat && mask
    vmov        r5, r6, d16
-    orrs        r5, r5, r6                 ; Check for 0
-    orreq       r7, r7, #1                 ; Only do filter branch

    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
@@ -388,10 +386,11 @@ v_end

    vmov.u8     d22, #0x80

+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #1                 ; Only do filter branch
+
    vand        d17, d18, d16              ; flat2 && flat && mask
    vmov        r5, r6, d17
-    orrs        r5, r5, r6                 ; Check for 0
-    orreq       r7, r7, #2                 ; Only do mbfilter branch

    ; mbfilter() function

@@ -405,15 +404,10 @@ v_end
    vmov.u8     d27, #3

    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
-
    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
-
    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
-
    vand        d29, d29, d21              ; filter &= hev
-
    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
-
    vmov.u8     d29, #4

    ; filter = clamp(filter + 3 * ( qs0 - ps0))
@@ -452,37 +446,37 @@ v_end
    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+    vaddl.u8    q10, d4, d5
    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+    vaddl.u8    q14, d6, d9
    vqrshrn.u16 d18, q15, #3               ; r_op2

-    vsubw.u8    q15, d4                    ; op1 = op2 - p3
-    vsubw.u8    q15, d5                    ; op1 -= p2
-    vaddw.u8    q15, d6                    ; op1 += p1
-    vaddw.u8    q15, d9                    ; op1 += q1
+    vsub.i16    q15, q10
+    vaddl.u8    q10, d4, d6
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d7, d10
    vqrshrn.u16 d19, q15, #3               ; r_op1

-    vsubw.u8    q15, d4                    ; op0 = op1 - p3
-    vsubw.u8    q15, d6                    ; op0 -= p1
-    vaddw.u8    q15, d7                    ; op0 += p0
-    vaddw.u8    q15, d10                   ; op0 += q2
+    vsub.i16    q15, q10
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d8, d11
    vqrshrn.u16 d20, q15, #3               ; r_op0

    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
    vsubw.u8    q15, d7                    ; oq0 -= p0
-    vaddw.u8    q15, d8                    ; oq0 += q0
-    vaddw.u8    q15, d11                   ; oq0 += q3
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d9, d11
    vqrshrn.u16 d21, q15, #3               ; r_oq0

    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
    vsubw.u8    q15, d8                    ; oq1 -= q0
-    vaddw.u8    q15, d9                    ; oq1 += q1
-    vaddw.u8    q15, d11                   ; oq1 += q3
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d10, d11
    vqrshrn.u16 d22, q15, #3               ; r_oq1

    vsubw.u8    q15, d6                    ; oq2 = oq0 - p1
    vsubw.u8    q15, d9                    ; oq2 -= q1
-    vaddw.u8    q15, d10                   ; oq2 += q2
-    vaddw.u8    q15, d11                   ; oq2 += q3
+    vadd.i16    q15, q14
    vqrshrn.u16 d27, q15, #3               ; r_oq2

    ; Filter does not set op2 or oq2, so use p2 and q2.
@@ -501,113 +495,104 @@ v_end
    ; wide_mbfilter flat2 && flat && mask branch
    vmov.u8     d16, #7
    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
+    vaddl.u8    q12, d2, d3
+    vaddl.u8    q13, d4, d5
+    vaddl.u8    q14, d1, d6
    vmlal.u8    q15, d0, d16               ; op6 += p7 * 3
-    vmlal.u8    q15, d1, d29               ; op6 += p6 * 2
-    vaddw.u8    q15, d2                    ; op6 += p5
-    vaddw.u8    q15, d3                    ; op6 += p4
-    vaddw.u8    q15, d4                    ; op6 += p3
-    vaddw.u8    q15, d5                    ; op6 += p2
-    vaddw.u8    q15, d6                    ; op6 += p1
+    vadd.i16    q12, q13
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d2, d9
+    vadd.i16    q15, q12
+    vaddl.u8    q12, d0, d1
+    vaddw.u8    q15, d1
+    vaddl.u8    q13, d0, d2
+    vadd.i16    q14, q15, q14
    vqrshrn.u16 d16, q15, #4               ; w_op6

-    vsubw.u8    q15, d0                    ; op5 = op6 - p7
-    vsubw.u8    q15, d1                    ; op5 -= p6
-    vaddw.u8    q15, d2                    ; op5 += p5
-    vaddw.u8    q15, d9                    ; op5 += q1
+    vsub.i16    q15, q14, q12
+    vaddl.u8    q14, d3, d10
    vqrshrn.u16 d24, q15, #4               ; w_op5

-    vsubw.u8    q15, d0                    ; op4 = op5 - p7
-    vsubw.u8    q15, d2                    ; op4 -= p5
-    vaddw.u8    q15, d3                    ; op4 += p4
-    vaddw.u8    q15, d10                   ; op4 += q2
+    vsub.i16    q15, q13
+    vaddl.u8    q13, d0, d3
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d4, d11
    vqrshrn.u16 d25, q15, #4               ; w_op4

-    vsubw.u8    q15, d0                    ; op3 = op4 - p7
-    vsubw.u8    q15, d3                    ; op3 -= p4
-    vaddw.u8    q15, d4                    ; op3 += p3
-    vaddw.u8    q15, d11                   ; op3 += q3
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d0, d4
+    vsub.i16    q15, q13
+    vsub.i16    q14, q15, q14
    vqrshrn.u16 d26, q15, #4               ; w_op3

-    vsubw.u8    q15, d0                    ; op2 = op3 - p7
-    vsubw.u8    q15, d4                    ; op2 -= p3
-    vaddw.u8    q15, d5                    ; op2 += p2
+    vaddw.u8    q15, q14, d5               ; op2 += p2
+    vaddl.u8    q14, d0, d5
    vaddw.u8    q15, d12                   ; op2 += q4
+    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
    vqrshrn.u16 d27, q15, #4               ; w_op2

-    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d0                    ; op1 = op2 - p7
-    vsubw.u8    q15, d5                    ; op1 -= p2
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d0, d6
    vaddw.u8    q15, d6                    ; op1 += p1
    vaddw.u8    q15, d13                   ; op1 += q5
+    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
    vqrshrn.u16 d18, q15, #4               ; w_op1

-    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d0                    ; op0 = op1 - p7
-    vsubw.u8    q15, d6                    ; op0 -= p1
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d0, d7
    vaddw.u8    q15, d7                    ; op0 += p0
    vaddw.u8    q15, d14                   ; op0 += q6
+    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
    vqrshrn.u16 d19, q15, #4               ; w_op0

-    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d0                    ; oq0 = op0 - p7
-    vsubw.u8    q15, d7                    ; oq0 -= p0
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d1, d8
    vaddw.u8    q15, d8                    ; oq0 += q0
    vaddw.u8    q15, d15                   ; oq0 += q7
+    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
    vqrshrn.u16 d20, q15, #4               ; w_oq0

-    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d1                    ; oq1 = oq0 - p6
-    vsubw.u8    q15, d8                    ; oq1 -= q0
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d2, d9
    vaddw.u8    q15, d9                    ; oq1 += q1
+    vaddl.u8    q4, d10, d15
    vaddw.u8    q15, d15                   ; oq1 += q7
+    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
    vqrshrn.u16 d21, q15, #4               ; w_oq1

+    vsub.i16    q15, q14
+    vaddl.u8    q14, d3, d10
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d11, d15
    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d2                    ; oq2 = oq1 - p5
-    vsubw.u8    q15, d9                    ; oq2 -= q1
-    vaddw.u8    q15, d10                   ; oq2 += q2
-    vaddw.u8    q15, d15                   ; oq2 += q7
    vqrshrn.u16 d22, q15, #4               ; w_oq2

+    vsub.i16    q15, q14
+    vaddl.u8    q14, d4, d11
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d12, d15
    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d3                    ; oq3 = oq2 - p4
-    vsubw.u8    q15, d10                   ; oq3 -= q2
-    vaddw.u8    q15, d11                   ; oq3 += q3
-    vaddw.u8    q15, d15                   ; oq3 += q7
    vqrshrn.u16 d23, q15, #4               ; w_oq3

+    vsub.i16    q15, q14
+    vaddl.u8    q14, d5, d12
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d13, d15
    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d4                    ; oq4 = oq3 - p3
-    vsubw.u8    q15, d11                   ; oq4 -= q3
-    vaddw.u8    q15, d12                   ; oq4 += q4
-    vaddw.u8    q15, d15                   ; oq4 += q7
    vqrshrn.u16 d1, q15, #4                ; w_oq4

+    vsub.i16    q15, q14
+    vaddl.u8    q14, d6, d13
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d14, d15
    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d5                    ; oq5 = oq4 - p2
-    vsubw.u8    q15, d12                   ; oq5 -= q4
-    vaddw.u8    q15, d13                   ; oq5 += q5
-    vaddw.u8    q15, d15                   ; oq5 += q7
    vqrshrn.u16 d2, q15, #4                ; w_oq5

+    vsub.i16    q15, q14
    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
-
-    vsubw.u8    q15, d6                    ; oq6 = oq5 - p1
-    vsubw.u8    q15, d13                   ; oq6 -= q5
-    vaddw.u8    q15, d14                   ; oq6 += q6
-    vaddw.u8    q15, d15                   ; oq6 += q7
-    vqrshrn.u16 d3, q15, #4                ; w_oq6
-
-    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
+    vadd.i16    q15, q4
    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
+    vqrshrn.u16 d3, q15, #4                ; w_oq6
    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
--- a/vp9/common/arm/neon/vp9_save_reg_neon.asm
+++ b/vp9/common/arm/neon/vp9_save_reg_neon.asm
@@ -0,0 +1,36 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_push_neon|
+    EXPORT  |vp9_pop_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_push_neon| PROC
+    vst1.i64            {d8, d9, d10, d11}, [r0]!
+    vst1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+|vp9_pop_neon| PROC
+    vld1.i64            {d8, d9, d10, d11}, [r0]!
+    vld1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+    END
+
--- a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -0,0 +1,198 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct16x16_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;                                    int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct16x16_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asr              r0, r0, #6                ; >> 6
+
+    vdup.s16         q0, r0                    ; duplicate a1
+    mov              r0, #8
+    sub              r2, #8
+
+    ; load destination data row0 - row3
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row4 - row7
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row8 - row11
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row12 - row15
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vp9_short_idct16x16_1_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
--- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
--- a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
@@ -0,0 +1,68 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct4x4_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct4x4_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 4)
+    add              r0, r0, #8                ; + (1 <<((4) - 1))
+    asr              r0, r0, #4                ; >> 4
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    vld1.32          {d2[0]}, [r1], r2
+    vld1.32          {d2[1]}, [r1], r2
+    vld1.32          {d4[0]}, [r1], r2
+    vld1.32          {d4[1]}, [r1]
+
+    vaddw.u8         q8, q0, d2                ; dest[x] + a1
+    vaddw.u8         q9, q0, d4
+
+    vqmovun.s16      d6, q8                    ; clip_pixel
+    vqmovun.s16      d7, q9
+
+    vst1.32          {d6[0]}, [r12], r2
+    vst1.32          {d6[1]}, [r12], r2
+    vst1.32          {d7[0]}, [r12], r2
+    vst1.32          {d7[1]}, [r12]
+
+    bx               lr
+    ENDP             ; |vp9_short_idct4x4_1_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
@@ -0,0 +1,190 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_idct4x4_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct4x4_add_neon| PROC
+
+    ; The 2D transform is done with two passes which are actually pretty
+    ; similar. We first transform the rows. This is done by transposing
+    ; the inputs, doing an SIMD column transform (the columns are the
+    ; transposed rows) and then transpose the results (so that it goes back
+    ; in normal/row positions). Then, we transform the columns by doing
+    ; another SIMD column transform.
+    ; So, two passes of a transpose followed by a column transform.
+
+    ; load the inputs into q8-q9, d16-d19
+    vld1.s16        {q8,q9}, [r0]!
+
+    ; generate scalar constants
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov             r0, #0x3b00
+    add             r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov             r3, #0x2d00
+    add             r3, #0x41
+    ; cospi_24_64 = 6270 = 0x 187e
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; transpose the input data
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+
+    ; generate constant vectors
+    vdup.16         d20, r0         ; replicate cospi_8_64
+    vdup.16         d21, r3         ; replicate cospi_16_64
+
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    vdup.16         d22, r12        ; replicate cospi_24_64
+
+    ; do the transform on transposed rows
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+    vswp     d18, d19
+
+    ; transpose the results
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    ; do the transform on columns
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+
+    ; The results are in two registers, one of them being swapped. This will
+    ; be taken care of by loading the 'dest' value in a swapped fashion and
+    ; also storing them in the same swapped fashion.
+    ; temp_out[0, 1] = d16, d17 = q8
+    ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16 q8, q8, #4
+    vrshr.s16 q9, q9, #4
+
+    vld1.32 {d26[0]}, [r1], r2
+    vld1.32 {d26[1]}, [r1], r2
+    vld1.32 {d27[1]}, [r1], r2
+    vld1.32 {d27[0]}, [r1]  ; no post-increment
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8 q8, q8, d26
+    vaddw.u8 q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb r2, r2, #0
+    vst1.32 {d27[0]}, [r1], r2
+    vst1.32 {d27[1]}, [r1], r2
+    vst1.32 {d26[1]}, [r1], r2
+    vst1.32 {d26[0]}, [r1]  ; no post-increment
+    bx              lr
+    ENDP  ; |vp9_short_idct4x4_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -0,0 +1,88 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct8x8_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct8x8_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 5)
+    add              r0, r0, #16               ; + (1 <<((5) - 1))
+    asr              r0, r0, #5                ; >> 5
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    ; load destination data
+    vld1.64          {d2}, [r1], r2
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r2
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r2
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r2
+    vld1.64          {d17}, [r1]
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vp9_short_idct8x8_1_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -9,6 +9,7 @@
 ;

    EXPORT  |vp9_short_idct8x8_add_neon|
+    EXPORT  |vp9_short_idct8x8_10_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -24,191 +25,149 @@
    ; stage 1
    vdup.16         d0, r3                    ; duplicate cospi_28_64
    vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64

    ; input[1] * cospi_28_64
    vmull.s16       q2, d18, d0
    vmull.s16       q3, d19, d0

-    ; input[7] * cospi_4_64
-    vmull.s16       q4, d30, d1
-    vmull.s16       q5, d31, d1
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2

    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vsub.s32        q6, q2, q4
-    vsub.s32        q7, q3, q5
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3

    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d8, q6, #14               ; >> 14
-    vqrshrn.s32     d9, q7, #14               ; >> 14
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14

    ; input[1] * cospi_4_64
    vmull.s16       q2, d18, d1
    vmull.s16       q3, d19, d1

-    ; input[7] * cospi_28_64
-    vmull.s16       q1, d30, d0
-    vmull.s16       q5, d31, d0
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3

    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vadd.s32        q2, q2, q1
-    vadd.s32        q3, q3, q5
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32     d14, q2, #14              ; >> 14
    vqrshrn.s32     d15, q3, #14              ; >> 14

-    vdup.16         d0, r5                    ; duplicate cospi_12_64
-    vdup.16         d1, r6                    ; duplicate cospi_20_64
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q2, d26, d0
-    vmull.s16       q3, d27, d0
-
-    ; input[3] * cospi_20_64
-    vmull.s16       q5, d22, d1
-    vmull.s16       q6, d23, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vsub.s32        q2, q2, q5
-    vsub.s32        q3, q3, q6
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q2, #14              ; >> 14
-    vqrshrn.s32     d11, q3, #14              ; >> 14
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q2, d26, d1
-    vmull.s16       q3, d27, d1
-
-    ; input[3] * cospi_12_64
-    vmull.s16       q9, d22, d0
-    vmull.s16       q15, d23, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vadd.s32        q0, q2, q9
-    vadd.s32        q1, q3, q15
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q0, #14              ; >> 14
-    vqrshrn.s32     d13, q1, #14              ; >> 14
-
    ; stage 2 & stage 3 - even half
    vdup.16         d0, r7                    ; duplicate cospi_16_64

+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14              ; >> 14
+
    ; input[0] * cospi_16_64
    vmull.s16       q2, d16, d0
    vmull.s16       q3, d17, d0

-    ; input[2] * cospi_16_64
-    vmull.s16       q9,  d24, d0
-    vmull.s16       q11, d25, d0
+    ; input[0] * cospi_16_64
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0

    ; (input[0] + input[2]) * cospi_16_64
-    vadd.s32        q9, q2, q9
-    vadd.s32        q11, q3, q11
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d18, q9, #14              ; >> 14
-    vqrshrn.s32     d19, q11, #14             ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[2] * cospi_16_64
-    vmull.s16       q13,  d24, d0
-    vmull.s16       q15, d25, d0
+    vmlal.s16       q2,  d24, d0
+    vmlal.s16       q3, d25, d0

    ; (input[0] - input[2]) * cospi_16_64
-    vsub.s32        q2, q2, q13
-    vsub.s32        q3, q3, q15
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0

-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q2, #14              ; >> 14
-    vqrshrn.s32     d23, q3, #14              ; >> 14
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
    vdup.16         d0, r8                    ; duplicate cospi_24_64
    vdup.16         d1, r9                    ; duplicate cospi_8_64

+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d18, q2, #14              ; >> 14
+    vqrshrn.s32     d19, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d22, q13, #14              ; >> 14
+    vqrshrn.s32     d23, q15, #14              ; >> 14
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
    ; input[1] * cospi_24_64
    vmull.s16       q2, d20, d0
    vmull.s16       q3, d21, d0

-    ; input[3] * cospi_8_64
-    vmull.s16       q13, d28, d1
-    vmull.s16       q15, d29, d1
+    ; input[1] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vsub.s32        q2, q2, q13
-    vsub.s32        q3, q3, q15
+    vmlsl.s16       q2, d28, d1
+    vmlsl.s16       q3, d29, d1
+
+    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32     d26, q2, #14              ; >> 14
    vqrshrn.s32     d27, q3, #14              ; >> 14

-    ; input[1] * cospi_8_64
-    vmull.s16       q2, d20, d1
-    vmull.s16       q3, d21, d1
-
-    ; input[3] * cospi_24_64
-    vmull.s16       q8, d28, d0
-    vmull.s16       q10, d29, d0
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vadd.s32        q0, q2, q8
-    vadd.s32        q1, q3, q10
-
    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q0, #14              ; >> 14
-    vqrshrn.s32     d31, q1, #14              ; >> 14
-
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14              ; >> 14

    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]

+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
    ; stage 2 - odd half
    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]

-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
    ; step2[6] * cospi_16_64
    vmull.s16       q9, d28, d16
    vmull.s16       q10, d29, d16

-    ; step2[5] * cospi_16_64
-    vmull.s16       q11, d26, d16
-    vmull.s16       q12, d27, d16
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16

    ; (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q9, q9, q11
-    vsub.s32        q10, q10, q12
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16

    ; dct_const_round_shift(input_dc * cospi_16_64)
    vqrshrn.s32     d10, q9, #14              ; >> 14
    vqrshrn.s32     d11, q10, #14             ; >> 14

-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q11, d26, d16
-    vmull.s16       q12, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14

    ; stage 4
    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
@@ -247,14 +206,11 @@

 |vp9_short_idct8x8_add_neon| PROC
    push            {r4-r9}
-    vld1.s16        {q8}, [r0]!
-    vld1.s16        {q9}, [r0]!
-    vld1.s16        {q10}, [r0]!
-    vld1.s16        {q11}, [r0]!
-    vld1.s16        {q12}, [r0]!
-    vld1.s16        {q13}, [r0]!
-    vld1.s16        {q14}, [r0]!
-    vld1.s16        {q15}, [r0]!
+    vpush           {d8-d15}
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!

    ; transpose the input data
    TRANSPOSE8X8
@@ -349,8 +305,215 @@
    vst1.64         {d6}, [r0], r2
    vst1.64         {d7}, [r0], r2

+    vpop            {d8-d15}
    pop             {r4-r9}
    bx              lr
    ENDP  ; |vp9_short_idct8x8_add_neon|

+;void vp9_short_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride)
+
+|vp9_short_idct8x8_10_add_neon| PROC
+    push            {r4-r9}
+    vpush           {d8-d15}
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    ; stage 1
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
+    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
+    ; to double the constants before multiplying to compensate this.
+    mov             r12, r3, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
+    mov             r12, r4, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_28_64)
+    vqrdmulh.s16    q4, q9, q0
+
+    mov             r12, r6, lsl #1
+    rsb             r12, #0
+    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_4_64)
+    vqrdmulh.s16    q7, q9, q1
+
+    mov             r12, r5, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
+
+    ; dct_const_round_shift(- input[3] * cospi_20_64)
+    vqrdmulh.s16    q5, q11, q0
+
+    mov             r12, r7, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
+
+    ; dct_const_round_shift(input[3] * cospi_12_64)
+    vqrdmulh.s16    q6, q11, q1
+
+    ; stage 2 & stage 3 - even half
+    mov             r12, r8, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrdmulh.s16    q9, q8, q0
+
+    mov             r12, r9, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_24_64)
+    vqrdmulh.s16    q13, q10, q1
+
+    ; dct_const_round_shift(input[1] * cospi_8_64)
+    vqrdmulh.s16    q15, q10, q0
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vp9_short_idct8x8_10_add_neon|
+
    END
--- a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_iht4x4_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
+    ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
+    ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
+    ; into d16-d19 registers. This macro will touch q10- q15 registers and use
+    ; them as buffer during calculation.
+    MACRO
+    IDCT4x4_1D
+    ; stage 1
+    vadd.s16    d23, d16, d18   ; (input[0] + input[2])
+    vsub.s16    d24, d16, d18   ; (input[0] - input[2])
+
+    vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
+    vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
+    vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
+    vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q10, #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16    q8,  q13, q14
+    vsub.s16    q9,  q13, q14
+    vswp        d18, d19
+    MEND
+
+    ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
+    ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
+    ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
+    ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
+    ; q14,q15 registers and use them as buffer during calculation.
+    MACRO
+    IADST4x4_1D
+    vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
+    vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
+    vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
+    vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
+    vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
+    vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
+    vaddw.s16   q15, q15, d19   ; x0 + x3
+    vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3
+    vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
+    vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3
+
+    vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
+    vadd.s32    q10, q10, q8
+    vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
+    vdup.32     q8, r0          ; duplicate sinpi_3_9
+    vsub.s32    q11, q11, q9
+    vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
+
+    vadd.s32    q13, q10, q12   ; s0 = x0 + x3
+    vadd.s32    q10, q10, q11   ; x0 + x1
+    vadd.s32    q14, q11, q12   ; s1 = x1 + x3
+    vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d16, q13, #14
+    vqrshrn.s32 d17, q14, #14
+    vqrshrn.s32 d18, q15, #14
+    vqrshrn.s32 d19, q10, #14
+    MEND
+
+    ; Generate cosine constants in d6 - d8 for the IDCT
+    MACRO
+    GENERATE_COSINE_CONSTANTS
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov         r0, #0x3b00
+    add         r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov         r3, #0x2d00
+    add         r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+    mov         r12, #0x1800
+    add         r12, #0x7e
+
+    ; generate constant vectors
+    vdup.16     d0, r0          ; duplicate cospi_8_64
+    vdup.16     d1, r3          ; duplicate cospi_16_64
+    vdup.16     d2, r12         ; duplicate cospi_24_64
+    MEND
+
+    ; Generate sine constants in d1 - d4 for the IADST.
+    MACRO
+    GENERATE_SINE_CONSTANTS
+    ; sinpi_1_9 = 5283 = 0x14A3
+    mov         r0, #0x1400
+    add         r0, #0xa3
+    ; sinpi_2_9 = 9929 = 0x26C9
+    mov         r3, #0x2600
+    add         r3, #0xc9
+    ; sinpi_4_9 = 15212 = 0x3B6C
+    mov         r12, #0x3b00
+    add         r12, #0x6c
+
+    ; generate constant vectors
+    vdup.16     d3, r0          ; duplicate sinpi_1_9
+
+    ; sinpi_3_9 = 13377 = 0x3441
+    mov         r0, #0x3400
+    add         r0, #0x41
+
+    vdup.16     d4, r3          ; duplicate sinpi_2_9
+    vdup.16     d5, r12         ; duplicate sinpi_4_9
+    vdup.16     q3, r0          ; duplicate sinpi_3_9
+    MEND
+
+    ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
+    MACRO
+    TRANSPOSE4X4
+    vtrn.16     d16, d17
+    vtrn.16     d18, d19
+    vtrn.32     q8, q9
+    MEND
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;                               int dest_stride, int tx_type)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride
+; r3  int tx_type)
+; This function will only handle tx_type of 1,2,3.
+|vp9_short_iht4x4_add_neon| PROC
+
+    ; load the inputs into d16-d19
+    vld1.s16    {q8,q9}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE4X4
+
+    ; decide the type of transform
+    cmp         r3, #2
+    beq         idct_iadst
+    cmp         r3, #3
+    beq         iadst_iadst
+
+iadst_idct
+    ; generate constants
+    GENERATE_COSINE_CONSTANTS
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IDCT4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IADST4x4_1D
+
+    b end_vp9_short_iht4x4_add_neon
+
+idct_iadst
+    ; generate constants
+    GENERATE_COSINE_CONSTANTS
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IADST4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IDCT4x4_1D
+
+    b end_vp9_short_iht4x4_add_neon
+
+iadst_iadst
+    ; generate constants
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IADST4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IADST4x4_1D
+
+end_vp9_short_iht4x4_add_neon
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16   q8, q8, #4
+    vrshr.s16   q9, q9, #4
+
+    vld1.32     {d26[0]}, [r1], r2
+    vld1.32     {d26[1]}, [r1], r2
+    vld1.32     {d27[0]}, [r1], r2
+    vld1.32     {d27[1]}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8    q8, q8, d26
+    vaddw.u8    q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb         r2, r2, #0
+    vst1.32     {d27[1]}, [r1], r2
+    vst1.32     {d27[0]}, [r1], r2
+    vst1.32     {d26[1]}, [r1], r2
+    vst1.32     {d26[0]}, [r1]  ; no post-increment
+    bx          lr
+    ENDP  ; |vp9_short_iht4x4_add_neon|
+
+    END
--- a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -0,0 +1,696 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_iht8x8_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Generate IADST constants in r0 - r12 for the IADST.
+    MACRO
+    GENERATE_IADST_CONSTANTS
+    ; generate  cospi_2_64  = 16305
+    mov             r0, #0x3f00
+    add             r0, #0xb1
+
+    ; generate cospi_30_64 = 1606
+    mov             r1, #0x600
+    add             r1, #0x46
+
+    ; generate cospi_10_64 = 14449
+    mov             r2, #0x3800
+    add             r2, #0x71
+
+    ; generate cospi_22_64 = 7723
+    mov             r3, #0x1e00
+    add             r3, #0x2b
+
+    ; generate cospi_18_64 = 10394
+    mov             r4, #0x2800
+    add             r4, #0x9a
+
+    ; generate cospi_14_64 = 12665
+    mov             r5, #0x3100
+    add             r5, #0x79
+
+    ; generate cospi_26_64 = 4756
+    mov             r6, #0x1200
+    add             r6, #0x94
+
+    ; generate cospi_6_64  = 15679
+    mov             r7, #0x3d00
+    add             r7, #0x3f
+
+    ; generate cospi_8_64  = 15137
+    mov             r8, #0x3b00
+    add             r8, #0x21
+
+    ; generate cospi_24_64 = 6270
+    mov             r9, #0x1800
+    add             r9, #0x7e
+
+    ; generate 0
+    mov             r10, #0
+
+    ; generate  cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+    MEND
+
+    ; Generate IDCT constants in r3 - r9 for the IDCT.
+    MACRO
+    GENERATE_IDCT_CONSTANTS
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+    MEND
+
+    ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
+    ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
+    ; will be stored back into q8-q15 registers. This macro will touch q0-q7
+    ; registers and use them as buffer during calculation.
+    MACRO
+    IDCT8x8_1D
+    ; stage 1
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64
+
+    ; input[1] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2
+
+    ; input[1]*cospi_28_64-input[7]*cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; input[1] * cospi_4_64
+    vmull.s16       q2, d18, d1
+    vmull.s16       q3, d19, d1
+
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3
+
+    ; input[1]*cospi_4_64+input[7]*cospi_28_64
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    ; stage 2 & stage 3 - even half
+    vdup.16         d0, r7                    ; duplicate cospi_16_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14             ; >> 14
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q2, d16, d0
+    vmull.s16       q3, d17, d0
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0
+
+    ; (input[0] + input[2]) * cospi_16_64
+    vmlal.s16       q2,  d24, d0
+    vmlal.s16       q3, d25, d0
+
+    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0
+
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d18, q2, #14              ; >> 14
+    vqrshrn.s32     d19, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d22, q13, #14             ; >> 14
+    vqrshrn.s32     d23, q15, #14             ; >> 14
+
+    ; input[1] * cospi_24_64
+    vmull.s16       q2, d20, d0
+    vmull.s16       q3, d21, d0
+
+    ; input[1] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlsl.s16       q2, d28, d1
+    vmlsl.s16       q3, d29, d1
+
+    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d26, q2, #14              ; >> 14
+    vqrshrn.s32     d27, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14             ; >> 14
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14             ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+    MEND
+
+    ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
+    ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
+    ; output will be stored back into q8-q15 registers. This macro will touch
+    ; q0 - q7 registers and use them as buffer during calculation.
+    MACRO
+    IADST8X8_1D
+    vdup.16         d14, r0                   ; duplicate cospi_2_64
+    vdup.16         d15, r1                   ; duplicate cospi_30_64
+
+    ; cospi_2_64  * x0
+    vmull.s16       q1, d30, d14
+    vmull.s16       q2, d31, d14
+
+    ; cospi_30_64 * x0
+    vmull.s16       q3, d30, d15
+    vmull.s16       q4, d31, d15
+
+    vdup.16         d30, r4                   ; duplicate cospi_18_64
+    vdup.16         d31, r5                   ; duplicate cospi_14_64
+
+    ; s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+    vmlal.s16       q1, d16, d15
+    vmlal.s16       q2, d17, d15
+
+    ; s1 = cospi_30_64 * x0 - cospi_2_64  * x1
+    vmlsl.s16       q3, d16, d14
+    vmlsl.s16       q4, d17, d14
+
+    ; cospi_18_64 * x4
+    vmull.s16       q5, d22, d30
+    vmull.s16       q6, d23, d30
+
+    ; cospi_14_64 * x4
+    vmull.s16       q7, d22, d31
+    vmull.s16       q8, d23, d31
+
+    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+    vmlal.s16       q5, d24, d31
+    vmlal.s16       q6, d25, d31
+
+    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
+    vmlsl.s16       q7, d24, d30
+    vmlsl.s16       q8, d25, d30
+
+    ; (s0 + s4)
+    vadd.s32        q11, q1, q5
+    vadd.s32        q12, q2, q6
+
+    vdup.16         d0, r2                   ; duplicate cospi_10_64
+    vdup.16         d1, r3                   ; duplicate cospi_22_64
+
+    ; (s0 - s4)
+    vsub.s32        q1, q1, q5
+    vsub.s32        q2, q2, q6
+
+    ; x0 = dct_const_round_shift(s0 + s4);
+    vqrshrn.s32     d22, q11, #14             ; >> 14
+    vqrshrn.s32     d23, q12, #14             ; >> 14
+
+    ; (s1 + s5)
+    vadd.s32        q12, q3, q7
+    vadd.s32        q15, q4, q8
+
+    ; (s1 - s5)
+    vsub.s32        q3, q3, q7
+    vsub.s32        q4, q4, q8
+
+    ; x4 = dct_const_round_shift(s0 - s4);
+    vqrshrn.s32     d2, q1, #14               ; >> 14
+    vqrshrn.s32     d3, q2, #14               ; >> 14
+
+    ; x1 = dct_const_round_shift(s1 + s5);
+    vqrshrn.s32     d24, q12, #14             ; >> 14
+    vqrshrn.s32     d25, q15, #14             ; >> 14
+
+    ; x5 = dct_const_round_shift(s1 - s5);
+    vqrshrn.s32     d6, q3, #14               ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    ; cospi_10_64 * x2
+    vmull.s16       q4, d26, d0
+    vmull.s16       q5, d27, d0
+
+    ; cospi_22_64 * x2
+    vmull.s16       q2, d26, d1
+    vmull.s16       q6, d27, d1
+
+    vdup.16         d30, r6                   ; duplicate cospi_26_64
+    vdup.16         d31, r7                   ; duplicate cospi_6_64
+
+    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+    vmlal.s16       q4, d20, d1
+    vmlal.s16       q5, d21, d1
+
+    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+    vmlsl.s16       q2, d20, d0
+    vmlsl.s16       q6, d21, d0
+
+    ; cospi_26_64 * x6
+    vmull.s16       q0, d18, d30
+    vmull.s16       q13, d19, d30
+
+    ; s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+    vmlal.s16       q0, d28, d31
+    vmlal.s16       q13, d29, d31
+
+    ; cospi_6_64  * x6
+    vmull.s16       q10, d18, d31
+    vmull.s16       q9, d19, d31
+
+    ; s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+    vmlsl.s16       q10, d28, d30
+    vmlsl.s16       q9, d29, d30
+
+    ; (s3 + s7)
+    vadd.s32        q14, q2, q10
+    vadd.s32        q15, q6, q9
+
+    ; (s3 - s7)
+    vsub.s32        q2, q2, q10
+    vsub.s32        q6, q6, q9
+
+    ; x3 = dct_const_round_shift(s3 + s7);
+    vqrshrn.s32     d28, q14, #14             ; >> 14
+    vqrshrn.s32     d29, q15, #14             ; >> 14
+
+    ; x7 = dct_const_round_shift(s3 - s7);
+    vqrshrn.s32     d4, q2, #14               ; >> 14
+    vqrshrn.s32     d5, q6, #14               ; >> 14
+
+    ; (s2 + s6)
+    vadd.s32        q9, q4, q0
+    vadd.s32        q10, q5, q13
+
+    ; (s2 - s6)
+    vsub.s32        q4, q4, q0
+    vsub.s32        q5, q5, q13
+
+    vdup.16         d30, r8                   ; duplicate cospi_8_64
+    vdup.16         d31, r9                   ; duplicate cospi_24_64
+
+    ; x2 = dct_const_round_shift(s2 + s6);
+    vqrshrn.s32     d18, q9, #14              ; >> 14
+    vqrshrn.s32     d19, q10, #14             ; >> 14
+
+    ; x6 = dct_const_round_shift(s2 - s6);
+    vqrshrn.s32     d8, q4, #14               ; >> 14
+    vqrshrn.s32     d9, q5, #14               ; >> 14
+
+    ; cospi_8_64  * x4
+    vmull.s16       q5, d2, d30
+    vmull.s16       q6, d3, d30
+
+    ; cospi_24_64 * x4
+    vmull.s16       q7, d2, d31
+    vmull.s16       q0, d3, d31
+
+    ; s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+    vmlal.s16       q5, d6, d31
+    vmlal.s16       q6, d7, d31
+
+    ; s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+    vmlsl.s16       q7, d6, d30
+    vmlsl.s16       q0, d7, d30
+
+    ; cospi_8_64  * x7
+    vmull.s16       q1, d4, d30
+    vmull.s16       q3, d5, d30
+
+    ; cospi_24_64 * x7
+    vmull.s16       q10, d4, d31
+    vmull.s16       q2, d5, d31
+
+    ; s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+    vmlsl.s16       q1, d8, d31
+    vmlsl.s16       q3, d9, d31
+
+    ; s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+    vmlal.s16       q10, d8, d30
+    vmlal.s16       q2, d9, d30
+
+    vadd.s16        q8, q11, q9               ; x0 = s0 + s2;
+
+    vsub.s16        q11, q11, q9              ; x2 = s0 - s2;
+
+    vadd.s16        q4, q12, q14              ; x1 = s1 + s3;
+
+    vsub.s16        q12, q12, q14             ; x3 = s1 - s3;
+
+    ; (s4 + s6)
+    vadd.s32        q14, q5, q1
+    vadd.s32        q15, q6, q3
+
+    ; (s4 - s6)
+    vsub.s32        q5, q5, q1
+    vsub.s32        q6, q6, q3
+
+    ; x4 = dct_const_round_shift(s4 + s6);
+    vqrshrn.s32     d18, q14, #14             ; >> 14
+    vqrshrn.s32     d19, q15, #14             ; >> 14
+
+    ; x6 = dct_const_round_shift(s4 - s6);
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; (s5 + s7)
+    vadd.s32        q1, q7, q10
+    vadd.s32        q3, q0, q2
+
+    ; (s5 - s7))
+    vsub.s32        q7, q7, q10
+    vsub.s32        q0, q0, q2
+
+    ; x5 = dct_const_round_shift(s5 + s7);
+    vqrshrn.s32     d28, q1, #14               ; >> 14
+    vqrshrn.s32     d29, q3, #14               ; >> 14
+
+    ; x7 = dct_const_round_shift(s5 - s7);
+    vqrshrn.s32     d14, q7, #14              ; >> 14
+    vqrshrn.s32     d15, q0, #14              ; >> 14
+
+    vdup.16         d30, r12                  ; duplicate cospi_16_64
+
+    ; cospi_16_64 * x2
+    vmull.s16       q2, d22, d30
+    vmull.s16       q3, d23, d30
+
+    ; cospi_6_64  * x6
+    vmull.s16       q13, d22, d30
+    vmull.s16       q1, d23, d30
+
+    ; cospi_16_64 * x2 + cospi_16_64  * x3;
+    vmlal.s16       q2, d24, d30
+    vmlal.s16       q3, d25, d30
+
+    ; cospi_16_64 * x2 - cospi_16_64  * x3;
+    vmlsl.s16       q13, d24, d30
+    vmlsl.s16       q1, d25, d30
+
+    ; x2 = dct_const_round_shift(s2);
+    vqrshrn.s32     d4, q2, #14               ; >> 14
+    vqrshrn.s32     d5, q3, #14               ; >> 14
+
+    ;x3 = dct_const_round_shift(s3);
+    vqrshrn.s32     d24, q13, #14             ; >> 14
+    vqrshrn.s32     d25, q1, #14              ; >> 14
+
+    ; cospi_16_64 * x6
+    vmull.s16       q13, d10, d30
+    vmull.s16       q1, d11, d30
+
+    ; cospi_6_64  * x6
+    vmull.s16       q11, d10, d30
+    vmull.s16       q0, d11, d30
+
+    ; cospi_16_64 * x6 + cospi_16_64  * x7;
+    vmlal.s16       q13, d14, d30
+    vmlal.s16       q1, d15, d30
+
+    ; cospi_16_64 * x6 - cospi_16_64  * x7;
+    vmlsl.s16       q11, d14, d30
+    vmlsl.s16       q0, d15, d30
+
+    ; x6 = dct_const_round_shift(s6);
+    vqrshrn.s32     d20, q13, #14             ; >> 14
+    vqrshrn.s32     d21, q1, #14              ; >> 14
+
+    ;x7 = dct_const_round_shift(s7);
+    vqrshrn.s32     d12, q11, #14             ; >> 14
+    vqrshrn.s32     d13, q0, #14              ; >> 14
+
+    vdup.16         q5, r10                   ; duplicate 0
+
+    vsub.s16        q9, q5, q9                ; output[1] = -x4;
+    vsub.s16        q11, q5, q2               ; output[3] = -x2;
+    vsub.s16        q13, q5, q6               ; output[5] = -x7;
+    vsub.s16        q15, q5, q4               ; output[7] = -x1;
+    MEND
+
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
+;                               int dest_stride, int tx_type)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride
+; r3  int tx_type)
+; This function will only handle tx_type of 1,2,3.
+|vp9_short_iht8x8_add_neon| PROC
+
+    ; load the inputs into d16-d19
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    push            {r0-r10}
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; decide the type of transform
+    cmp         r3, #2
+    beq         idct_iadst
+    cmp         r3, #3
+    beq         iadst_iadst
+
+iadst_idct
+    ; generate IDCT constants
+    GENERATE_IDCT_CONSTANTS
+
+    ; first transform rows
+    IDCT8x8_1D
+
+    ; transpose the matrix
+    TRANSPOSE8X8
+
+    ; generate IADST constants
+    GENERATE_IADST_CONSTANTS
+
+    ; then transform columns
+    IADST8X8_1D
+
+    b end_vp9_short_iht8x8_add_neon
+
+idct_iadst
+    ; generate IADST constants
+    GENERATE_IADST_CONSTANTS
+
+    ; first transform rows
+    IADST8X8_1D
+
+    ; transpose the matrix
+    TRANSPOSE8X8
+
+    ; generate IDCT constants
+    GENERATE_IDCT_CONSTANTS
+
+    ; then transform columns
+    IDCT8x8_1D
+
+    b end_vp9_short_iht8x8_add_neon
+
+iadst_iadst
+    ; generate IADST constants
+    GENERATE_IADST_CONSTANTS
+
+    ; first transform rows
+    IADST8X8_1D
+
+    ; transpose the matrix
+    TRANSPOSE8X8
+
+    ; then transform columns
+    IADST8X8_1D
+
+end_vp9_short_iht8x8_add_neon
+    pop            {r0-r10}
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+    bx          lr
+    ENDP  ; |vp9_short_iht8x8_add_neon|
+
+    END
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@@ -10,9 +10,10 @@


 #include "./vpx_config.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_onyxc_int.h"

-void vp9_machine_specific_config(VP9_COMMON *ctx) {
+void vp9_machine_specific_config(VP9_COMMON *cm) {
+  (void)cm;
  vp9_rtcd();
 }
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -31,40 +31,30 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
    vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO));
 }

-void vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi) {
-  int i, j;
-
-  // For each in image mode_info element set the in image flag to 1
-  for (i = 0; i < cm->mi_rows; i++) {
-    MODE_INFO *ptr = mi;
-    for (j = 0; j < cm->mi_cols; j++) {
-      ptr->mbmi.mb_in_image = 1;
-      ptr++;  // Next element in the row
-    }
-
-    // Step over border element at start of next row
-    mi += cm->mode_info_stride;
-  }
-}
-
-void vp9_free_frame_buffers(VP9_COMMON *oci) {
+void vp9_free_frame_buffers(VP9_COMMON *cm) {
  int i;

  for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    vp9_free_frame_buffer(&oci->yv12_fb[i]);
+    vp9_free_frame_buffer(&cm->yv12_fb[i]);

-  vp9_free_frame_buffer(&oci->post_proc_buffer);
+  vp9_free_frame_buffer(&cm->post_proc_buffer);

-  vpx_free(oci->mip);
-  vpx_free(oci->prev_mip);
-  vpx_free(oci->above_seg_context);
+  vpx_free(cm->mip);
+  vpx_free(cm->prev_mip);
+  vpx_free(cm->above_seg_context);
+  vpx_free(cm->last_frame_seg_map);
+  vpx_free(cm->mi_grid_base);
+  vpx_free(cm->prev_mi_grid_base);

-  vpx_free(oci->above_context[0]);
+  vpx_free(cm->above_context[0]);
  for (i = 0; i < MAX_MB_PLANE; i++)
-    oci->above_context[i] = 0;
-  oci->mip = NULL;
-  oci->prev_mip = NULL;
-  oci->above_seg_context = NULL;
+    cm->above_context[i] = 0;
+  cm->mip = NULL;
+  cm->prev_mip = NULL;
+  cm->above_seg_context = NULL;
+  cm->last_frame_seg_map = NULL;
+  cm->mi_grid_base = NULL;
+  cm->prev_mi_grid_base = NULL;
 }

 static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
@@ -72,112 +62,120 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
  cm->mb_rows = (aligned_height + 8) >> 4;
  cm->MBs = cm->mb_rows * cm->mb_cols;

-  cm->mi_cols = aligned_width >> LOG2_MI_SIZE;
-  cm->mi_rows = aligned_height >> LOG2_MI_SIZE;
+  cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
+  cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
  cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
 }

 static void setup_mi(VP9_COMMON *cm) {
  cm->mi = cm->mip + cm->mode_info_stride + 1;
  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;

  vpx_memset(cm->mip, 0,
             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));

-  vp9_update_mode_info_border(cm, cm->mip);
-  vp9_update_mode_info_in_image(cm, cm->mi);
+  vpx_memset(cm->mi_grid_base, 0,
+             cm->mode_info_stride * (cm->mi_rows + 1) *
+             sizeof(*cm->mi_grid_base));

+  vp9_update_mode_info_border(cm, cm->mip);
  vp9_update_mode_info_border(cm, cm->prev_mip);
-  vp9_update_mode_info_in_image(cm, cm->prev_mi);
 }

-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
+int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
  int i, mi_cols;

-  const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE);
-  const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE);
-  const int ss_x = oci->subsampling_x;
-  const int ss_y = oci->subsampling_y;
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+  const int ss_x = cm->subsampling_x;
+  const int ss_y = cm->subsampling_y;
  int mi_size;

-  vp9_free_frame_buffers(oci);
+  vp9_free_frame_buffers(cm);

  for (i = 0; i < NUM_YV12_BUFFERS; i++) {
-    oci->fb_idx_ref_cnt[i] = 0;
-    if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y,
+    cm->fb_idx_ref_cnt[i] = 0;
+    if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
                               VP9BORDERINPIXELS) < 0)
      goto fail;
  }

-  oci->new_fb_idx = NUM_YV12_BUFFERS - 1;
-  oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;
+  cm->new_fb_idx = NUM_YV12_BUFFERS - 1;
+  cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;

  for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++)
-    oci->active_ref_idx[i] = i;
+    cm->active_ref_idx[i] = i;

  for (i = 0; i < NUM_REF_FRAMES; i++) {
-    oci->ref_frame_map[i] = i;
-    oci->fb_idx_ref_cnt[i] = 1;
+    cm->ref_frame_map[i] = i;
+    cm->fb_idx_ref_cnt[i] = 1;
  }

-  if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
+  if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
                             VP9BORDERINPIXELS) < 0)
    goto fail;

-  set_mb_mi(oci, aligned_width, aligned_height);
+  set_mb_mi(cm, aligned_width, aligned_height);

  // Allocation
-  mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE);
+  mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE);

-  oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
-  if (!oci->mip)
+  cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+  if (!cm->mip)
    goto fail;

-  oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
-  if (!oci->prev_mip)
+  cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+  if (!cm->prev_mip)
    goto fail;

-  setup_mi(oci);
+  cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
+  if (!cm->mi_grid_base)
+    goto fail;
+
+  cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
+  if (!cm->prev_mi_grid_base)
+    goto fail;
+
+  setup_mi(cm);

  // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
  // information is exposed at this level
-  mi_cols = mi_cols_aligned_to_sb(oci->mi_cols);
+  mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);

  // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
  // block where mi unit size is 8x8.
-# if CONFIG_ALPHA
-  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1);
-#else
-  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1);
-#endif
-  if (!oci->above_context[0])
+  cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
+                                         (2 * mi_cols), 1);
+  if (!cm->above_context[0])
    goto fail;

-  oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
-  if (!oci->above_seg_context)
+  cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
+  if (!cm->above_seg_context)
+    goto fail;
+
+  // Create the segmentation map structure and set to 0.
+  cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
+  if (!cm->last_frame_seg_map)
    goto fail;

  return 0;

 fail:
-  vp9_free_frame_buffers(oci);
+  vp9_free_frame_buffers(cm);
  return 1;
 }

-void vp9_create_common(VP9_COMMON *oci) {
-  vp9_machine_specific_config(oci);
+void vp9_create_common(VP9_COMMON *cm) {
+  vp9_machine_specific_config(cm);

-  vp9_init_mbmode_probs(oci);
-
-  oci->tx_mode = ONLY_4X4;
-  oci->comp_pred_mode = HYBRID_PREDICTION;
-
-  // Initialize reference frame sign bias structure to defaults
-  vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+  cm->tx_mode = ONLY_4X4;
+  cm->comp_pred_mode = HYBRID_PREDICTION;
 }

-void vp9_remove_common(VP9_COMMON *oci) {
-  vp9_free_frame_buffers(oci);
+void vp9_remove_common(VP9_COMMON *cm) {
+  vp9_free_frame_buffers(cm);
 }

 void vp9_initialize_common() {
@@ -188,8 +186,8 @@ void vp9_initialize_common() {

 void vp9_update_frame_size(VP9_COMMON *cm) {
  int i, mi_cols;
-  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE);
-  const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE);
+  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2);
+  const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2);

  set_mb_mi(cm, aligned_width, aligned_height);
  setup_mi(cm);
@@ -198,4 +196,8 @@ void vp9_update_frame_size(VP9_COMMON *cm) {
  for (i = 1; i < MAX_MB_PLANE; i++)
    cm->above_context[i] =
        cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
+
+  // Initialize the previous frame segment map to 0.
+  if (cm->last_frame_seg_map)
+    vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
 }
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -16,14 +16,13 @@

 void vp9_initialize_common();

-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi);
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);

-void vp9_create_common(VP9_COMMON *oci);
-void vp9_remove_common(VP9_COMMON *oci);
+void vp9_create_common(VP9_COMMON *cm);
+void vp9_remove_common(VP9_COMMON *cm);

-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
-void vp9_free_frame_buffers(VP9_COMMON *oci);
+int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height);
+void vp9_free_frame_buffers(VP9_COMMON *cm);


 void vp9_update_frame_size(VP9_COMMON *cm);
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -19,9 +19,9 @@

 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_common_data.h"
-#include "vp9/common/vp9_convolve.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_mv.h"
+#include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_treecoder.h"

@@ -71,7 +71,7 @@ typedef enum {
  D135_PRED,       // Directional 135 deg = 180 - 45
  D117_PRED,       // Directional 117 deg = 180 - 63
  D153_PRED,       // Directional 153 deg = 180 - 27
-  D27_PRED,        // Directional 27  deg = round(arctan(1/2) * 180/pi)
+  D207_PRED,       // Directional 207 deg = 180 + 27
  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
  TM_PRED,         // True-motion
  NEARESTMV,
@@ -89,9 +89,9 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
  return mode >= NEARESTMV && mode <= NEWMV;
 }

-#define VP9_INTRA_MODES (TM_PRED + 1)
+#define INTRA_MODES (TM_PRED + 1)

-#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV)
+#define INTER_MODES (1 + NEWMV - NEARESTMV)

 static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
  return (mode - NEARESTMV);
@@ -115,45 +115,41 @@ typedef enum {
  MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;

-static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int b_width_log2(BLOCK_SIZE sb_type) {
  return b_width_log2_lookup[sb_type];
 }
-static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int b_height_log2(BLOCK_SIZE sb_type) {
  return b_height_log2_lookup[sb_type];
 }

-static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
  return mi_width_log2_lookup[sb_type];
 }

-static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int mi_height_log2(BLOCK_SIZE sb_type) {
  return mi_height_log2_lookup[sb_type];
 }

+// This structure now relates to 8x8 block regions.
 typedef struct {
  MB_PREDICTION_MODE mode, uv_mode;
  MV_REFERENCE_FRAME ref_frame[2];
-  TX_SIZE txfm_size;
-  int_mv mv[2]; // for each reference frame used
+  TX_SIZE tx_size;
+  int_mv mv[2];                // for each reference frame used
  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-  int_mv best_mv, best_second_mv;
+  int_mv best_mv[2];

-  uint8_t mb_mode_context[MAX_REF_FRAMES];
+  uint8_t mode_context[MAX_REF_FRAMES];

-  unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
-  unsigned char segment_id;           // Segment id for current frame
+  unsigned char skip_coeff;    // 0=need to decode coeffs, 1=no coefficients
+  unsigned char segment_id;    // Segment id for this block.

-  // Flags used for prediction status of various bistream signals
+  // Flags used for prediction status of various bit-stream signals
  unsigned char seg_id_predicted;

-  // Indicates if the mb is part of the image (1) vs border (0)
-  // This can be useful in determining whether the MB provides
-  // a valid predictor
-  unsigned char mb_in_image;
-
  INTERPOLATIONFILTERTYPE interp_filter;

-  BLOCK_SIZE_TYPE sb_type;
+  BLOCK_SIZE sb_type;
 } MB_MODE_INFO;

 typedef struct {
@@ -161,36 +157,19 @@ typedef struct {
  union b_mode_info bmi[4];
 } MODE_INFO;

-static int is_inter_block(const MB_MODE_INFO *mbmi) {
+static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
  return mbmi->ref_frame[0] > INTRA_FRAME;
 }

+static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
+  return mbmi->ref_frame[1] > INTRA_FRAME;
+}

 enum mv_precision {
  MV_PRECISION_Q3,
  MV_PRECISION_Q4
 };

-#define VP9_REF_SCALE_SHIFT 14
-#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT)
-
-struct scale_factors {
-  int x_scale_fp;   // horizontal fixed point scale factor
-  int y_scale_fp;   // vertical fixed point scale factor
-  int x_offset_q4;
-  int x_step_q4;
-  int y_offset_q4;
-  int y_step_q4;
-
-  int (*scale_value_x)(int val, const struct scale_factors *scale);
-  int (*scale_value_y)(int val, const struct scale_factors *scale);
-  void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
-  MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale);
-  MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale);
-
-  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
-};
-
 #if CONFIG_ALPHA
 enum { MAX_MB_PLANE = 4 };
 #else
@@ -216,45 +195,27 @@ struct macroblockd_plane {
  ENTROPY_CONTEXT *left_context;
 };

-#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))
-
-#define MAX_REF_LF_DELTAS       4
-#define MAX_MODE_LF_DELTAS      2
-
-struct loopfilter {
-  int filter_level;
-
-  int sharpness_level;
-  int last_sharpness_level;
-
-  uint8_t mode_ref_delta_enabled;
-  uint8_t mode_ref_delta_update;
-
-  // 0 = Intra, Last, GF, ARF
-  signed char ref_deltas[MAX_REF_LF_DELTAS];
-  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
-
-  // 0 = ZERO_MV, MV
-  signed char mode_deltas[MAX_MODE_LF_DELTAS];
-  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
-};
+#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)

 typedef struct macroblockd {
  struct macroblockd_plane plane[MAX_MB_PLANE];

  struct scale_factors scale_factor[2];

-  MODE_INFO *prev_mode_info_context;
-  MODE_INFO *mode_info_context;
+  MODE_INFO *last_mi;
+  MODE_INFO *this_mi;
  int mode_info_stride;

+  MODE_INFO *mic_stream_ptr;
+
+  // A NULL indicates that the 8x8 is not part of the image
+  MODE_INFO **mi_8x8;
+  MODE_INFO **prev_mi_8x8;
+
  int up_available;
  int left_available;
  int right_available;

-  struct segmentation seg;
-  struct loopfilter lf;
-
  // partition contexts
  PARTITION_CONTEXT *above_seg_context;
  PARTITION_CONTEXT *left_seg_context;
@@ -283,10 +244,9 @@ typedef struct macroblockd {
  unsigned char ab_index;   // index of 4x4 block inside the 8x8 block

  int q_index;
-
 } MACROBLOCKD;

-static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
  switch (subsize) {
    case BLOCK_64X64:
    case BLOCK_64X32:
@@ -311,9 +271,8 @@ static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsi
  }
 }

-static INLINE void update_partition_context(MACROBLOCKD *xd,
-                                            BLOCK_SIZE_TYPE sb_type,
-                                            BLOCK_SIZE_TYPE sb_size) {
+static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
+                                            BLOCK_SIZE sb_size) {
  const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
  const int bwl = b_width_log2(sb_type);
  const int bhl = b_height_log2(sb_type);
@@ -331,8 +290,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
  vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
 }

-static INLINE int partition_plane_context(MACROBLOCKD *xd,
-                                          BLOCK_SIZE_TYPE sb_type) {
+static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) {
  int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
  int above = 0, left = 0, i;
  int boffset = mi_width_log2(BLOCK_64X64) - bsl;
@@ -352,10 +310,9 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd,
  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }

-static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
-                                   PARTITION_TYPE partition) {
-  BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize];
-  assert(subsize != BLOCK_SIZE_TYPES);
+static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+  const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
+  assert(subsize < BLOCK_SIZES);
  return subsize;
 }

@@ -363,7 +320,7 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];

 static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
                                      const MACROBLOCKD *xd, int ib) {
-  const MODE_INFO *const mi = xd->mode_info_context;
+  const MODE_INFO *const mi = xd->this_mi;
  const MB_MODE_INFO *const mbmi = &mi->mbmi;

  if (plane_type != PLANE_TYPE_Y_WITH_DC ||
@@ -378,13 +335,13 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
 static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
                                      const MACROBLOCKD *xd) {
  return plane_type == PLANE_TYPE_Y_WITH_DC ?
-             mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
+             mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
 }

 static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
                                        const MACROBLOCKD *xd) {
  return plane_type == PLANE_TYPE_Y_WITH_DC ?
-             mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
+             mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
 }

 static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
@@ -404,259 +361,147 @@ static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {


 static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
-  return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
+  return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]);
 }

-struct plane_block_idx {
-  int plane;
-  int block;
-};
-
-// TODO(jkoleszar): returning a struct so it can be used in a const context,
-// expect to refactor this further later.
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
-                                                     int b_idx) {
-  const int v_offset = y_blocks * 5 / 4;
-  struct plane_block_idx res;
-
-  if (b_idx < y_blocks) {
-    res.plane = 0;
-    res.block = b_idx;
-  } else if (b_idx < v_offset) {
-    res.plane = 1;
-    res.block = b_idx - y_blocks;
-  } else {
-    assert(b_idx < y_blocks * 3 / 2);
-    res.plane = 2;
-    res.block = b_idx - v_offset;
-  }
-  return res;
+static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+                                       const struct macroblockd_plane *pd) {
+  BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
+  assert(bs < BLOCK_SIZES);
+  return bs;
 }

-static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize,
+static INLINE int plane_block_width(BLOCK_SIZE bsize,
                                    const struct macroblockd_plane* plane) {
  return 4 << (b_width_log2(bsize) - plane->subsampling_x);
 }

-static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
+static INLINE int plane_block_height(BLOCK_SIZE bsize,
                                     const struct macroblockd_plane* plane) {
  return 4 << (b_height_log2(bsize) - plane->subsampling_y);
 }

-static INLINE int plane_block_width_log2by4(
-    BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
-  return (b_width_log2(bsize) - plane->subsampling_x);
-}
-
-static INLINE int plane_block_height_log2by4(
-    BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
-  return (b_height_log2(bsize) - plane->subsampling_y);
-}
-
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
-                                                  BLOCK_SIZE_TYPE bsize,
-                                                  int ss_txfrm_size,
+                                                  BLOCK_SIZE plane_bsize,
+                                                  TX_SIZE tx_size,
                                                  void *arg);

 static INLINE void foreach_transformed_block_in_plane(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
    foreach_transformed_block_visitor visit, void *arg) {
-  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
-
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO* mbmi = &xd->this_mi->mbmi;
  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
  // transform size varies per plane, look it up in a common way.
-  const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi;
  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
-                                : mbmi->txfm_size;
-  const int block_size_b = bw + bh;
-  const int txfrm_size_b = tx_size * 2;
-
-  // subsampled size of the block
-  const int ss_sum = xd->plane[plane].subsampling_x
-      + xd->plane[plane].subsampling_y;
-  const int ss_block_size = block_size_b - ss_sum;
-
-  const int step = 1 << txfrm_size_b;
-
+                                : mbmi->tx_size;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const int step = 1 << (tx_size << 1);
  int i;

-  assert(txfrm_size_b <= block_size_b);
-  assert(txfrm_size_b <= ss_block_size);
-
  // If mb_to_right_edge is < 0 we are in a situation in which
  // the current block size extends into the UMV and we won't
  // visit the sub blocks that are wholly within the UMV.
  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
    int r, c;
-    const int sw = bw - xd->plane[plane].subsampling_x;
-    const int sh = bh - xd->plane[plane].subsampling_y;
-    int max_blocks_wide = 1 << sw;
-    int max_blocks_high = 1 << sh;
+
+    int max_blocks_wide = num_4x4_w;
+    int max_blocks_high = num_4x4_h;

    // xd->mb_to_right_edge is in units of pixels * 8.  This converts
    // it to 4x4 block sizes.
    if (xd->mb_to_right_edge < 0)
-      max_blocks_wide +=
-          (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
+      max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));

    if (xd->mb_to_bottom_edge < 0)
-      max_blocks_high +=
-          (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
+      max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));

    i = 0;
    // Unlike the normal case - in here we have to keep track of the
    // row and column of the blocks we use so that we know if we are in
    // the unrestricted motion border.
-    for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
-      for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
+    for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
+      for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
        if (r < max_blocks_high && c < max_blocks_wide)
-          visit(plane, i, bsize, txfrm_size_b, arg);
+          visit(plane, i, plane_bsize, tx_size, arg);
        i += step;
      }
    }
  } else {
-    for (i = 0; i < (1 << ss_block_size); i += step) {
-      visit(plane, i, bsize, txfrm_size_b, arg);
-    }
+    for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
+      visit(plane, i, plane_bsize, tx_size, arg);
  }
 }

 static INLINE void foreach_transformed_block(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
    foreach_transformed_block_visitor visit, void *arg) {
  int plane;

-  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
-    foreach_transformed_block_in_plane(xd, bsize, plane,
-                                       visit, arg);
-  }
+  for (plane = 0; plane < MAX_MB_PLANE; plane++)
+    foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }

 static INLINE void foreach_transformed_block_uv(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
    foreach_transformed_block_visitor visit, void *arg) {
  int plane;

-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    foreach_transformed_block_in_plane(xd, bsize, plane,
-                                       visit, arg);
-  }
+  for (plane = 1; plane < MAX_MB_PLANE; plane++)
+    foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }

-// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for
-// sizes smaller than 16x16 yet.
-typedef void (*foreach_predicted_block_visitor)(int plane, int block,
-                                                BLOCK_SIZE_TYPE bsize,
-                                                int pred_w, int pred_h,
-                                                void *arg);
-static INLINE void foreach_predicted_block_in_plane(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
-    foreach_predicted_block_visitor visit, void *arg) {
-  int i, x, y;
-
-  // block sizes in number of 4x4 blocks log 2 ("*_b")
-  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
-  // subsampled size of the block
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-
-  // size of the predictor to use.
-  int pred_w, pred_h;
-
-  if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) {
-    assert(bsize == BLOCK_8X8);
-    pred_w = 0;
-    pred_h = 0;
-  } else {
-    pred_w = bwl;
-    pred_h = bhl;
-  }
-  assert(pred_w <= bwl);
-  assert(pred_h <= bhl);
-
-  // visit each subblock in raster order
-  i = 0;
-  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
-    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
-      visit(plane, i, bsize, pred_w, pred_h, arg);
-      i += 1 << pred_w;
-    }
-    i += (1 << (bwl + pred_h)) - (1 << bwl);
-  }
-}
-static INLINE void foreach_predicted_block(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
-    foreach_predicted_block_visitor visit, void *arg) {
-  int plane;
-
-  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
-    foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
-  }
-}
-static INLINE void foreach_predicted_block_uv(
-    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
-    foreach_predicted_block_visitor visit, void *arg) {
-  int plane;
-
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
-  }
-}
-static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
-                               int plane, int block, int stride) {
-  const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1));
+static int raster_block_offset(BLOCK_SIZE plane_bsize,
+                               int raster_block, int stride) {
+  const int bw = b_width_log2(plane_bsize);
+  const int y = 4 * (raster_block >> bw);
+  const int x = 4 * (raster_block & ((1 << bw) - 1));
  return y * stride + x;
 }
-static int16_t* raster_block_offset_int16(MACROBLOCKD *xd,
-                                         BLOCK_SIZE_TYPE bsize,
-                                         int plane, int block, int16_t *base) {
-  const int stride = plane_block_width(bsize, &xd->plane[plane]);
-  return base + raster_block_offset(xd, bsize, plane, block, stride);
+static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                          int raster_block, int16_t *base) {
+  const int stride = 4 << b_width_log2(plane_bsize);
+  return base + raster_block_offset(plane_bsize, raster_block, stride);
 }
-static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd,
-                                         BLOCK_SIZE_TYPE bsize,
-                                         int plane, int block,
-                                         uint8_t *base, int stride) {
-  return base + raster_block_offset(xd, bsize, plane, block, stride);
+static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize,
+                                          int raster_block, uint8_t *base,
+                                          int stride) {
+  return base + raster_block_offset(plane_bsize, raster_block, stride);
 }

-static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
-                                       BLOCK_SIZE_TYPE bsize,
-                                       int plane, int block,
-                                       int ss_txfrm_size) {
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int txwl = ss_txfrm_size / 2;
-  const int tx_cols_log2 = bwl - txwl;
+static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize,
+                                       TX_SIZE tx_size, int block) {
+  const int bwl = b_width_log2(plane_bsize);
+  const int tx_cols_log2 = bwl - tx_size;
  const int tx_cols = 1 << tx_cols_log2;
-  const int raster_mb = block >> ss_txfrm_size;
-  const int x = (raster_mb & (tx_cols - 1)) << (txwl);
-  const int y = raster_mb >> tx_cols_log2 << (txwl);
+  const int raster_mb = block >> (tx_size << 1);
+  const int x = (raster_mb & (tx_cols - 1)) << tx_size;
+  const int y = (raster_mb >> tx_cols_log2) << tx_size;
  return x + (y << bwl);
 }

-static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
-                                     BLOCK_SIZE_TYPE bsize,
-                                     int plane, int block,
-                                     int ss_txfrm_size,
+static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
+                                     TX_SIZE tx_size, int block,
                                     int *x, int *y) {
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int txwl = ss_txfrm_size / 2;
-  const int tx_cols_log2 = bwl - txwl;
+  const int bwl = b_width_log2(plane_bsize);
+  const int tx_cols_log2 = bwl - tx_size;
  const int tx_cols = 1 << tx_cols_log2;
-  const int raster_mb = block >> ss_txfrm_size;
-  *x = (raster_mb & (tx_cols - 1)) << (txwl);
-  *y = raster_mb >> tx_cols_log2 << (txwl);
+  const int raster_mb = block >> (tx_size << 1);
+  *x = (raster_mb & (tx_cols - 1)) << tx_size;
+  *y = (raster_mb >> tx_cols_log2) << tx_size;
 }

-static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
-                             BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) {
-  const int bw = plane_block_width(bsize, &xd->plane[plane]);
-  const int bh = plane_block_height(bsize, &xd->plane[plane]);
+static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
+                             int plane, int block, TX_SIZE tx_size) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  uint8_t *const buf = pd->dst.buf;
+  const int stride = pd->dst.stride;
+
  int x, y;
-  txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
  x = x * 4 - 1;
  y = y * 4 - 1;
  // Copy a pixel into the umv if we are in a situation where the block size
@@ -664,41 +509,38 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
  // TODO(JBB): Should be able to do the full extend in place so we don't have
  // to do this multiple times.
  if (xd->mb_to_right_edge < 0) {
-    int umv_border_start = bw
-        + (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x));
+    const int bw = 4 << b_width_log2(plane_bsize);
+    const int umv_border_start = bw + (xd->mb_to_right_edge >>
+                                       (3 + pd->subsampling_x));

    if (x + bw > umv_border_start)
-      vpx_memset(
-          xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
-              + umv_border_start,
-          *(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
-              + umv_border_start - 1),
-          bw);
+      vpx_memset(&buf[y * stride + umv_border_start],
+                 buf[y * stride + umv_border_start - 1], bw);
  }
-  if (xd->mb_to_bottom_edge < 0) {
-    int umv_border_start = bh
-        + (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y));
-    int i;
-    uint8_t c = *(xd->plane[plane].dst.buf
-        + (umv_border_start - 1) * xd->plane[plane].dst.stride + x);

-    uint8_t *d = xd->plane[plane].dst.buf
-        + umv_border_start * xd->plane[plane].dst.stride + x;
+  if (xd->mb_to_bottom_edge < 0) {
+    const int bh = 4 << b_height_log2(plane_bsize);
+    const int umv_border_start = bh + (xd->mb_to_bottom_edge >>
+                                       (3 + pd->subsampling_y));
+    int i;
+    const uint8_t c = buf[(umv_border_start - 1) * stride + x];
+    uint8_t *d = &buf[umv_border_start * stride + x];

    if (y + bh > umv_border_start)
-      for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride)
+      for (i = 0; i < bh; ++i, d += stride)
        *d = c;
  }
 }
-static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
-                                   int plane, int tx_size_in_blocks,
-                                   int eob, int aoff, int loff,
+static void set_contexts_on_border(MACROBLOCKD *xd,
+                                   struct macroblockd_plane *pd,
+                                   BLOCK_SIZE plane_bsize,
+                                   int tx_size_in_blocks, int has_eob,
+                                   int aoff, int loff,
                                   ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
-  struct macroblockd_plane *pd = &xd->plane[plane];
+  int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
  int above_contexts = tx_size_in_blocks;
  int left_contexts = tx_size_in_blocks;
-  int mi_blocks_wide = 1 << plane_block_width_log2by4(bsize, pd);
-  int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd);
  int pt;

  // xd->mb_to_right_edge is in units of pixels * 8.  This converts
@@ -706,26 +548,47 @@ static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
  if (xd->mb_to_right_edge < 0)
    mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));

+  if (xd->mb_to_bottom_edge < 0)
+    mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
  // this code attempts to avoid copying into contexts that are outside
  // our border.  Any blocks that do are set to 0...
  if (above_contexts + aoff > mi_blocks_wide)
    above_contexts = mi_blocks_wide - aoff;

-  if (xd->mb_to_bottom_edge < 0)
-    mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
  if (left_contexts + loff > mi_blocks_high)
    left_contexts = mi_blocks_high - loff;

  for (pt = 0; pt < above_contexts; pt++)
-    A[pt] = eob > 0;
+    A[pt] = has_eob;
  for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
    A[pt] = 0;
  for (pt = 0; pt < left_contexts; pt++)
-    L[pt] = eob > 0;
+    L[pt] = has_eob;
  for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
    L[pt] = 0;
 }

+static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                         int has_eob, int aoff, int loff) {
+  ENTROPY_CONTEXT *const A = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const L = pd->left_context + loff;
+  const int tx_size_in_blocks = 1 << tx_size;
+
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob,
+                           aoff, loff, A, L);
+  } else {
+    vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  }
+}
+
+static int get_tx_eob(struct segmentation *seg, int segment_id,
+                      TX_SIZE tx_size) {
+  const int eob_max = 16 << (tx_size << 1);
+  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}

 #endif  // VP9_COMMON_VP9_BLOCKD_H_
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -13,33 +13,33 @@
 #include "vp9/common/vp9_common_data.h"

 // Log 2 conversion lookup tables for block width and height
-const int b_width_log2_lookup[BLOCK_SIZE_TYPES] =
+const int b_width_log2_lookup[BLOCK_SIZES] =
  {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
-const int b_height_log2_lookup[BLOCK_SIZE_TYPES] =
+const int b_height_log2_lookup[BLOCK_SIZES] =
  {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
-const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
+const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
  {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
-const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] =
+const int num_4x4_blocks_high_lookup[BLOCK_SIZES] =
  {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
 // Log 2 conversion lookup tables for modeinfo width and height
-const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] =
+const int mi_width_log2_lookup[BLOCK_SIZES] =
  {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
-const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
+const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
  {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
-const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] =
+const int mi_height_log2_lookup[BLOCK_SIZES] =
  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
-const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] =
+const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
  {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};

 // MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
-const int size_group_lookup[BLOCK_SIZE_TYPES] =
+const int size_group_lookup[BLOCK_SIZES] =
  {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};

-const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] =
+const int num_pels_log2_lookup[BLOCK_SIZES] =
  {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};


-const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
+const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
  {  // 4X4
    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
    PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
@@ -74,51 +74,62 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
  }
 };

-const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = {
+const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
  {     // PARTITION_NONE
-    BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
-    BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
+    BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
+    BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
    BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
    BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
    BLOCK_64X64,
  }, {  // PARTITION_HORZ
-    BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X8,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X16,   BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_64X32,
  }, {  // PARTITION_VERT
-    BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X16,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X32,   BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_32X64,
  }, {  // PARTITION_SPLIT
-    BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X16,   BLOCK_INVALID, BLOCK_INVALID,
    BLOCK_32X32,
  }
 };

-const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = {
-  TX_4X4, TX_4X4, TX_4X4,
-  TX_8X8, TX_8X8, TX_8X8,
+const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
+  TX_4X4,   TX_4X4,   TX_4X4,
+  TX_8X8,   TX_8X8,   TX_8X8,
  TX_16X16, TX_16X16, TX_16X16,
  TX_32X32, TX_32X32, TX_32X32, TX_32X32
 };
-const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = {
-  TX_4X4, TX_4X4, TX_4X4,
-  TX_4X4, TX_4X4, TX_4X4,
-  TX_8X8, TX_8X8, TX_8X8,
+const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = {
+  TX_4X4,   TX_4X4,   TX_4X4,
+  TX_4X4,   TX_4X4,   TX_4X4,
+  TX_8X8,   TX_8X8,   TX_8X8,
  TX_16X16, TX_16X16, TX_16X16, TX_32X32
 };

-const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = {
-  { BLOCK_4X4,   BLOCK_4X8,   BLOCK_4X8,   BLOCK_4X8,   BLOCK_4X8 },
-  { BLOCK_8X4,   BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X16,  BLOCK_8X16 },
-  { BLOCK_16X8,  BLOCK_16X8,  BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 },
-  { BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 },
-  { BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 }
+const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
+//  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
+//  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
+  {{BLOCK_4X4,   BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}},
+  {{BLOCK_4X8,   BLOCK_4X4},     {BLOCK_INVALID, BLOCK_INVALID}},
+  {{BLOCK_8X4,   BLOCK_INVALID}, {BLOCK_4X4,     BLOCK_INVALID}},
+  {{BLOCK_8X8,   BLOCK_8X4},     {BLOCK_4X8,     BLOCK_4X4}},
+  {{BLOCK_8X16,  BLOCK_8X8},     {BLOCK_INVALID, BLOCK_4X8}},
+  {{BLOCK_16X8,  BLOCK_INVALID}, {BLOCK_8X8,     BLOCK_8X4}},
+  {{BLOCK_16X16, BLOCK_16X8},    {BLOCK_8X16,    BLOCK_8X8}},
+  {{BLOCK_16X32, BLOCK_16X16},   {BLOCK_INVALID, BLOCK_8X16}},
+  {{BLOCK_32X16, BLOCK_INVALID}, {BLOCK_16X16,   BLOCK_16X8}},
+  {{BLOCK_32X32, BLOCK_32X16},   {BLOCK_16X32,   BLOCK_16X16}},
+  {{BLOCK_32X64, BLOCK_32X32},   {BLOCK_INVALID, BLOCK_16X32}},
+  {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32,   BLOCK_32X16}},
+  {{BLOCK_64X64, BLOCK_64X32},   {BLOCK_32X64,   BLOCK_32X32}},
 };
+
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -13,20 +13,20 @@

 #include "vp9/common/vp9_enums.h"

-extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES];
-extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES];
-extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES];
-extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES];
-extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES];
-extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES];
-extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES];
-extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES];
-extern const int size_group_lookup[BLOCK_SIZE_TYPES];
-extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES];
-extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES];
-extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES];
-extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES];
-extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES];
-extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5];
+extern const int b_width_log2_lookup[BLOCK_SIZES];
+extern const int b_height_log2_lookup[BLOCK_SIZES];
+extern const int mi_width_log2_lookup[BLOCK_SIZES];
+extern const int mi_height_log2_lookup[BLOCK_SIZES];
+extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES];
+extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES];
+extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES];
+extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES];
+extern const int size_group_lookup[BLOCK_SIZES];
+extern const int num_pels_log2_lookup[BLOCK_SIZES];
+extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
+extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
+extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
+extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
+extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];

-#endif    // VP9_COMMON_VP9_COMMON_DATA_H
+#endif  // VP9_COMMON_VP9_COMMON_DATA_H
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -14,66 +14,45 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"

-#define VP9_FILTER_WEIGHT 128
-#define VP9_FILTER_SHIFT  7
-
-/* Assume a bank of 16 filters to choose from. There are two implementations
- * for filter wrapping behavior, since we want to be able to pick which filter
- * to start with. We could either:
- *
- * 1) make filter_ a pointer to the base of the filter array, and then add an
- *    additional offset parameter, to choose the starting filter.
- * 2) use a pointer to 2 periods worth of filters, so that even if the original
- *    phase offset is at 15/16, we'll have valid data to read. The filter
- *    tables become [32][8], and the second half is duplicated.
- * 3) fix the alignment of the filter tables, so that we know the 0/16 is
- *    always 256 byte aligned.
- *
- * Implementations 2 and 3 are likely preferable, as they avoid an extra 2
- * parameters, and switching between them is trivial, with the
- * ALIGN_FILTERS_256 macro, below.
- */
- #define ALIGN_FILTERS_256 1
-
 static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x0, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h, int taps) {
-  int x, y, k, sum;
-  const int16_t *filter_x_base = filter_x0;
+  int x, y, k;

-#if ALIGN_FILTERS_256
-  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-#endif
+  /* NOTE: This assumes that the filter table is 256-byte aligned. */
+  /* TODO(agrange) Modify to make independent of table alignment. */
+  const int16_t *const filter_x_base =
+      (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);

  /* Adjust base pointer address for this source line */
  src -= taps / 2 - 1;

  for (y = 0; y < h; ++y) {
-    /* Pointer to filter to use */
-    const int16_t *filter_x = filter_x0;
-
    /* Initial phase offset */
-    int x0_q4 = (filter_x - filter_x_base) / taps;
-    int x_q4 = x0_q4;
+    int x_q4 = (int)(filter_x0 - filter_x_base) / taps;

    for (x = 0; x < w; ++x) {
      /* Per-pixel src offset */
-      int src_x = (x_q4 - x0_q4) >> 4;
+      const int src_x = x_q4 >> SUBPEL_BITS;
+      int sum = 0;

-      for (sum = 0, k = 0; k < taps; ++k) {
+      /* Pointer to filter to use */
+      const int16_t *const filter_x = filter_x_base +
+          (x_q4 & SUBPEL_MASK) * taps;
+
+      for (k = 0; k < taps; ++k)
        sum += src[src_x + k] * filter_x[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);

-      /* Adjust source and filter to use for the next pixel */
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+
+      /* Move to the next source pixel */
      x_q4 += x_step_q4;
-      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
    }
    src += src_stride;
    dst += dst_stride;
@@ -85,37 +64,37 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                 const int16_t *filter_x0, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int taps) {
-  int x, y, k, sum;
-  const int16_t *filter_x_base = filter_x0;
+  int x, y, k;

-#if ALIGN_FILTERS_256
-  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-#endif
+  /* NOTE: This assumes that the filter table is 256-byte aligned. */
+  /* TODO(agrange) Modify to make independent of table alignment. */
+  const int16_t *const filter_x_base =
+      (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);

  /* Adjust base pointer address for this source line */
  src -= taps / 2 - 1;

  for (y = 0; y < h; ++y) {
-    /* Pointer to filter to use */
-    const int16_t *filter_x = filter_x0;
-
    /* Initial phase offset */
-    int x0_q4 = (filter_x - filter_x_base) / taps;
-    int x_q4 = x0_q4;
+    int x_q4 = (int)(filter_x0 - filter_x_base) / taps;

    for (x = 0; x < w; ++x) {
      /* Per-pixel src offset */
-      int src_x = (x_q4 - x0_q4) >> 4;
+      const int src_x = x_q4 >> SUBPEL_BITS;
+      int sum = 0;

-      for (sum = 0, k = 0; k < taps; ++k) {
+      /* Pointer to filter to use */
+      const int16_t *const filter_x = filter_x_base +
+          (x_q4 & SUBPEL_MASK) * taps;
+
+      for (k = 0; k < taps; ++k)
        sum += src[src_x + k] * filter_x[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;

-      /* Adjust source and filter to use for the next pixel */
+      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+                   clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+
+      /* Move to the next source pixel */
      x_q4 += x_step_q4;
-      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
    }
    src += src_stride;
    dst += dst_stride;
@@ -127,37 +106,37 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y0, int y_step_q4,
                            int w, int h, int taps) {
-  int x, y, k, sum;
+  int x, y, k;

-  const int16_t *filter_y_base = filter_y0;
-
-#if ALIGN_FILTERS_256
-  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-#endif
+  /* NOTE: This assumes that the filter table is 256-byte aligned. */
+  /* TODO(agrange) Modify to make independent of table alignment. */
+  const int16_t *const filter_y_base =
+      (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);

  /* Adjust base pointer address for this source column */
  src -= src_stride * (taps / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    /* Pointer to filter to use */
-    const int16_t *filter_y = filter_y0;

+  for (x = 0; x < w; ++x) {
    /* Initial phase offset */
-    int y0_q4 = (filter_y - filter_y_base) / taps;
-    int y_q4 = y0_q4;
+    int y_q4 = (int)(filter_y0 - filter_y_base) / taps;

    for (y = 0; y < h; ++y) {
      /* Per-pixel src offset */
-      int src_y = (y_q4 - y0_q4) >> 4;
+      const int src_y = y_q4 >> SUBPEL_BITS;
+      int sum = 0;

-      for (sum = 0, k = 0; k < taps; ++k) {
+      /* Pointer to filter to use */
+      const int16_t *const filter_y = filter_y_base +
+          (y_q4 & SUBPEL_MASK) * taps;
+
+      for (k = 0; k < taps; ++k)
        sum += src[(src_y + k) * src_stride] * filter_y[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT);

-      /* Adjust source and filter to use for the next pixel */
+      dst[y * dst_stride] =
+          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+
+      /* Move to the next source pixel */
      y_q4 += y_step_q4;
-      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
    }
    ++src;
    ++dst;
@@ -169,38 +148,37 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y0, int y_step_q4,
                                int w, int h, int taps) {
-  int x, y, k, sum;
+  int x, y, k;

-  const int16_t *filter_y_base = filter_y0;
-
-#if ALIGN_FILTERS_256
-  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-#endif
+  /* NOTE: This assumes that the filter table is 256-byte aligned. */
+  /* TODO(agrange) Modify to make independent of table alignment. */
+  const int16_t *const filter_y_base =
+      (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);

  /* Adjust base pointer address for this source column */
  src -= src_stride * (taps / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    /* Pointer to filter to use */
-    const int16_t *filter_y = filter_y0;

+  for (x = 0; x < w; ++x) {
    /* Initial phase offset */
-    int y0_q4 = (filter_y - filter_y_base) / taps;
-    int y_q4 = y0_q4;
+    int y_q4 = (int)(filter_y0 - filter_y_base) / taps;

    for (y = 0; y < h; ++y) {
      /* Per-pixel src offset */
-      int src_y = (y_q4 - y0_q4) >> 4;
+      const int src_y = y_q4 >> SUBPEL_BITS;
+      int sum = 0;

-      for (sum = 0, k = 0; k < taps; ++k) {
+      /* Pointer to filter to use */
+      const int16_t *const filter_y = filter_y_base +
+          (y_q4 & SUBPEL_MASK) * taps;
+
+      for (k = 0; k < taps; ++k)
        sum += src[(src_y + k) * src_stride] * filter_y[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[y * dst_stride] =
-          (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;

-      /* Adjust source and filter to use for the next pixel */
+      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+           clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+
+      /* Move to the next source pixel */
      y_q4 += y_step_q4;
-      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
    }
    ++src;
    ++dst;
@@ -213,58 +191,27 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h, int taps) {
  /* Fixed size intermediate buffer places limits on parameters.
-   * Maximum intermediate_height is 135, for y_step_q4 == 32,
+   * Maximum intermediate_height is 324, for y_step_q4 == 80,
   * h == 64, taps == 8.
+   * y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
   */
-  uint8_t temp[64 * 135];
-  int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
+  uint8_t temp[64 * 324];
+  int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps;

  assert(w <= 64);
  assert(h <= 64);
  assert(taps <= 8);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
+  assert(y_step_q4 <= 80);
+  assert(x_step_q4 <= 80);

  if (intermediate_height < h)
    intermediate_height = h;

-  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
-                   temp, 64,
-                   filter_x, x_step_q4, filter_y, y_step_q4,
-                   w, intermediate_height, taps);
-  convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
-                  filter_x, x_step_q4, filter_y, y_step_q4,
-                  w, h, taps);
-}
-
-static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4,
-                           int w, int h, int taps) {
-  /* Fixed size intermediate buffer places limits on parameters.
-   * Maximum intermediate_height is 135, for y_step_q4 == 32,
-   * h == 64, taps == 8.
-   */
-  uint8_t temp[64 * 135];
-  int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
-
-  assert(w <= 64);
-  assert(h <= 64);
-  assert(taps <= 8);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  if (intermediate_height < h)
-    intermediate_height = h;
-
-  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
-                   temp, 64,
-                   filter_x, x_step_q4, filter_y, y_step_q4,
-                   w, intermediate_height, taps);
-  convolve_avg_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, taps);
+  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64,
+                   filter_x, x_step_q4, filter_y, y_step_q4, w,
+                   intermediate_height, taps);
+  convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x,
+                  x_step_q4, filter_y, y_step_q4, w, h, taps);
 }

 void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -273,8 +220,7 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  convolve_horiz_c(src, src_stride, dst, dst_stride,
-                   filter_x, x_step_q4, filter_y, y_step_q4,
-                   w, h, 8);
+                   filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
 }

 void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -283,8 +229,7 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  convolve_avg_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8);
+                       filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
 }

 void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -293,8 +238,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
  convolve_vert_c(src, src_stride, dst, dst_stride,
-                  filter_x, x_step_q4, filter_y, y_step_q4,
-                  w, h, 8);
+                  filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
 }

 void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -303,8 +247,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  convolve_avg_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8);
+                      filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
 }

 void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -313,8 +256,7 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  convolve_c(src, src_stride, dst, dst_stride,
-             filter_x, x_step_q4, filter_y, y_step_q4,
-             w, h, 8);
+             filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
 }

 void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -327,16 +269,9 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
  assert(w <= 64);
  assert(h <= 64);

-  vp9_convolve8(src, src_stride,
-                temp, 64,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_avg(temp, 64,
-                   dst, dst_stride,
-                   NULL, 0, /* These unused parameter should be removed! */
-                   NULL, 0, /* These unused parameter should be removed! */
-                   w, h);
+  vp9_convolve8(src, src_stride, temp, 64,
+               filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
 }

 void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -347,7 +282,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
  int r;

  for (r = h; r > 0; --r) {
-    memcpy(dst, src, w);
+    vpx_memcpy(dst, src, w);
    src += src_stride;
    dst += dst_stride;
  }
@@ -361,9 +296,9 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
  int x, y;

  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = (dst[x] + src[x] + 1) >> 1;
-    }
+    for (x = 0; x < w; ++x)
+      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+
    src += src_stride;
    dst += dst_stride;
  }
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -7,12 +7,14 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
-#ifndef VP9_COMMON_CONVOLVE_H_
-#define VP9_COMMON_CONVOLVE_H_
+#ifndef VP9_COMMON_VP9_CONVOLVE_H_
+#define VP9_COMMON_VP9_CONVOLVE_H_

 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"

+#define FILTER_BITS 7
+
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
@@ -24,4 +26,4 @@ struct subpix_fn_table {
  const int16_t (*filter_y)[8];
 };

-#endif  // VP9_COMMON_CONVOLVE_H_
+#endif  // VP9_COMMON_VP9_CONVOLVE_H_
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -22,23 +22,24 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
 * and uses the passed in member offset to print out the value of an integer
 * for each mbmi member value in the mi structure.
 */
-static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor,
+static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor,
                          size_t member_offset) {
  int mi_row;
  int mi_col;
  int mi_index = 0;
-  MODE_INFO *mi = common->mi;
-  int rows = common->mi_rows;
-  int cols = common->mi_cols;
+  MODE_INFO **mi_8x8 = cm->mi_grid_visible;
+  int rows = cm->mi_rows;
+  int cols = cm->mi_cols;
  char prefix = descriptor[0];

-  log_frame_info(common, descriptor, file);
+  log_frame_info(cm, descriptor, file);
  mi_index = 0;
  for (mi_row = 0; mi_row < rows; mi_row++) {
    fprintf(file, "%c ", prefix);
    for (mi_col = 0; mi_col < cols; mi_col++) {
      fprintf(file, "%2d ",
-              *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset)));
+              *((int*) ((char *) (&mi_8x8[mi_index]->mbmi) +
+                        member_offset)));
      mi_index++;
    }
    fprintf(file, "\n");
@@ -51,23 +52,23 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
  int mi_col;
  int mi_index = 0;
  FILE *mvs = fopen(file, "a");
-  MODE_INFO *mi = cm->mi;
+  MODE_INFO **mi_8x8 = cm->mi_grid_visible;
  int rows = cm->mi_rows;
  int cols = cm->mi_cols;

  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
-  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff));
+  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff));
  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
-  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));

-  log_frame_info(cm, "Vectors ",mvs);
+  log_frame_info(cm, "Vectors ", mvs);
  for (mi_row = 0; mi_row < rows; mi_row++) {
-    fprintf(mvs,"V ");
+    fprintf(mvs, "V ");
    for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row,
-              mi[mi_index].mbmi.mv[0].as_mv.col);
+      fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row,
+                               mi_8x8[mi_index]->mbmi.mv[0].as_mv.col);
      mi_index++;
    }
    fprintf(mvs, "\n");
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -7,6 +7,8 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
+#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_
+#define VP9_COMMON_DEFAULT_COEF_PROBS_H_

 /*Generated file, included by vp9_entropy.c*/
 static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
@@ -694,3 +696,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
  }
 };

+#endif  // VP9_COMMON_DEFAULT_COEF_PROBS_H_
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -107,101 +107,171 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
 };

 DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
-  0,  16,   1,  32,  17,   2,  48,  33,  18,   3,  64,  34,  49,  19,  65,  80,
-  50,   4,  35,  66,  20,  81,  96,  51,   5,  36,  82,  97,  67, 112,  21,  52,
-  98,  37,  83, 113,   6,  68, 128,  53,  22,  99, 114,  84,   7, 129,  38,  69,
-  100, 115, 144, 130,  85,  54,  23,   8, 145,  39,  70, 116, 101, 131, 160, 146,
-  55,  86,  24,  71, 132, 117, 161,  40,   9, 102, 147, 176, 162,  87,  56,  25,
-  133, 118, 177, 148,  72, 103,  41, 163,  10, 192, 178,  88,  57, 134, 149, 119,
-  26, 164,  73, 104, 193,  42, 179, 208,  11, 135,  89, 165, 120, 150,  58, 194,
-  180,  27,  74, 209, 105, 151, 136,  43,  90, 224, 166, 195, 181, 121, 210,  59,
-  12, 152, 106, 167, 196,  75, 137, 225, 211, 240, 182, 122,  91,  28, 197,  13,
-  226, 168, 183, 153,  44, 212, 138, 107, 241,  60,  29, 123, 198, 184, 227, 169,
-  242,  76, 213, 154,  45,  92,  14, 199, 139,  61, 228, 214, 170, 185, 243, 108,
-  77, 155,  30,  15, 200, 229, 124, 215, 244,  93,  46, 186, 171, 201, 109, 140,
-  230,  62, 216, 245,  31, 125,  78, 156, 231,  47, 187, 202, 217,  94, 246, 141,
-  63, 232, 172, 110, 247, 157,  79, 218, 203, 126, 233, 188, 248,  95, 173, 142,
-  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251,
-  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255,
+  0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
+  50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
+  98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
+  100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
+  55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
+  133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
+  26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
+  180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
+  12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
+  226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
+  242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
+  77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
+  230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
+  63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
+  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
+  251,
+  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+  255,
 };

 DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
-  0,  16,  32,  48,   1,  64,  17,  80,  33,  96,  49,   2,  65, 112,  18,  81,
-  34, 128,  50,  97,   3,  66, 144,  19, 113,  35,  82, 160,  98,  51, 129,   4,
-  67, 176,  20, 114, 145,  83,  36,  99, 130,  52, 192,   5, 161,  68, 115,  21,
-  146,  84, 208, 177,  37, 131, 100,  53, 162, 224,  69,   6, 116, 193, 147,  85,
-  22, 240, 132,  38, 178, 101, 163,  54, 209, 117,  70,   7, 148, 194,  86, 179,
-  225,  23, 133,  39, 164,   8, 102, 210, 241,  55, 195, 118, 149,  71, 180,  24,
-  87, 226, 134, 165, 211,  40, 103,  56,  72, 150, 196, 242, 119,   9, 181, 227,
-  88, 166,  25, 135,  41, 104, 212,  57, 151, 197, 120,  73, 243, 182, 136, 167,
-  213,  89,  10, 228, 105, 152, 198,  26,  42, 121, 183, 244, 168,  58, 137, 229,
-  74, 214,  90, 153, 199, 184,  11, 106, 245,  27, 122, 230, 169,  43, 215,  59,
-  200, 138, 185, 246,  75,  12,  91, 154, 216, 231, 107,  28,  44, 201, 123, 170,
-  60, 247, 232,  76, 139,  13,  92, 217, 186, 248, 155, 108,  29, 124,  45, 202,
-  233, 171,  61,  14,  77, 140,  15, 249,  93,  30, 187, 156, 218,  46, 109, 125,
-  62, 172,  78, 203,  31, 141, 234,  94,  47, 188,  63, 157, 110, 250, 219,  79,
-  126, 204, 173, 142,  95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,
-  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,
+  0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
+  34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
+  67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
+  146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
+  22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
+  225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
+  87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
+  88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
+  213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
+  74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
+  200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
+  60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
+  233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
+  62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
+  126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
+  236,
+  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+  255,
 };

 DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
-  0,   1,   2,  16,   3,  17,   4,  18,  32,   5,  33,  19,   6,  34,  48,  20,
-  49,   7,  35,  21,  50,  64,   8,  36,  65,  22,  51,  37,  80,   9,  66,  52,
-  23,  38,  81,  67,  10,  53,  24,  82,  68,  96,  39,  11,  54,  83,  97,  69,
-  25,  98,  84,  40, 112,  55,  12,  70,  99, 113,  85,  26,  41,  56, 114, 100,
-  13,  71, 128,  86,  27, 115, 101, 129,  42,  57,  72, 116,  14,  87, 130, 102,
-  144,  73, 131, 117,  28,  58,  15,  88,  43, 145, 103, 132, 146, 118,  74, 160,
-  89, 133, 104,  29,  59, 147, 119,  44, 161, 148,  90, 105, 134, 162, 120, 176,
-  75, 135, 149,  30,  60, 163, 177,  45, 121,  91, 106, 164, 178, 150, 192, 136,
-  165, 179,  31, 151, 193,  76, 122,  61, 137, 194, 107, 152, 180, 208,  46, 166,
-  167, 195,  92, 181, 138, 209, 123, 153, 224, 196,  77, 168, 210, 182, 240, 108,
-  197,  62, 154, 225, 183, 169, 211,  47, 139,  93, 184, 226, 212, 241, 198, 170,
-  124, 155, 199,  78, 213, 185, 109, 227, 200,  63, 228, 242, 140, 214, 171, 186,
-  156, 229, 243, 125,  94, 201, 244, 215, 216, 230, 141, 187, 202,  79, 172, 110,
-  157, 245, 217, 231,  95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158,
-  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,
-  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
+  0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
+  49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
+  23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
+  25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
+  13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
+  144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
+  89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
+  75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
+  165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
+  167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
+  197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
+  124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
+  156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
+  157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
+  158,
+  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
+  175,
+  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+  255,
 };

 DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
-  0,   32,    1,   64,   33,    2,   96,   65,   34,  128,    3,   97,   66,  160,  129,   35,   98,    4,   67,  130,  161,  192,   36,   99,  224,    5,  162,  193,   68,  131,   37,  100,
-  225,  194,  256,  163,   69,  132,    6,  226,  257,  288,  195,  101,  164,   38,  258,    7,  227,  289,  133,  320,   70,  196,  165,  290,  259,  228,   39,  321,  102,  352,    8,  197,
-  71,  134,  322,  291,  260,  353,  384,  229,  166,  103,   40,  354,  323,  292,  135,  385,  198,  261,   72,    9,  416,  167,  386,  355,  230,  324,  104,  293,   41,  417,  199,  136,
-  262,  387,  448,  325,  356,   10,   73,  418,  231,  168,  449,  294,  388,  105,  419,  263,   42,  200,  357,  450,  137,  480,   74,  326,  232,   11,  389,  169,  295,  420,  106,  451,
-  481,  358,  264,  327,  201,   43,  138,  512,  482,  390,  296,  233,  170,  421,   75,  452,  359,   12,  513,  265,  483,  328,  107,  202,  514,  544,  422,  391,  453,  139,   44,  234,
-  484,  297,  360,  171,   76,  515,  545,  266,  329,  454,   13,  423,  203,  108,  546,  485,  576,  298,  235,  140,  361,  330,  172,  547,   45,  455,  267,  577,  486,   77,  204,  362,
-  608,   14,  299,  578,  109,  236,  487,  609,  331,  141,  579,   46,   15,  173,  610,  363,   78,  205,   16,  110,  237,  611,  142,   47,  174,   79,  206,   17,  111,  238,   48,  143,
-  80,  175,  112,  207,   49,   18,  239,   81,  113,   19,   50,   82,  114,   51,   83,  115,  640,  516,  392,  268,  144,   20,  672,  641,  548,  517,  424,  393,  300,  269,  176,  145,
-  52,   21,  704,  673,  642,  580,  549,  518,  456,  425,  394,  332,  301,  270,  208,  177,  146,   84,   53,   22,  736,  705,  674,  643,  612,  581,  550,  519,  488,  457,  426,  395,
-  364,  333,  302,  271,  240,  209,  178,  147,  116,   85,   54,   23,  737,  706,  675,  613,  582,  551,  489,  458,  427,  365,  334,  303,  241,  210,  179,  117,   86,   55,  738,  707,
-  614,  583,  490,  459,  366,  335,  242,  211,  118,   87,  739,  615,  491,  367,  243,  119,  768,  644,  520,  396,  272,  148,   24,  800,  769,  676,  645,  552,  521,  428,  397,  304,
-  273,  180,  149,   56,   25,  832,  801,  770,  708,  677,  646,  584,  553,  522,  460,  429,  398,  336,  305,  274,  212,  181,  150,   88,   57,   26,  864,  833,  802,  771,  740,  709,
-  678,  647,  616,  585,  554,  523,  492,  461,  430,  399,  368,  337,  306,  275,  244,  213,  182,  151,  120,   89,   58,   27,  865,  834,  803,  741,  710,  679,  617,  586,  555,  493,
-  462,  431,  369,  338,  307,  245,  214,  183,  121,   90,   59,  866,  835,  742,  711,  618,  587,  494,  463,  370,  339,  246,  215,  122,   91,  867,  743,  619,  495,  371,  247,  123,
-  896,  772,  648,  524,  400,  276,  152,   28,  928,  897,  804,  773,  680,  649,  556,  525,  432,  401,  308,  277,  184,  153,   60,   29,  960,  929,  898,  836,  805,  774,  712,  681,
-  650,  588,  557,  526,  464,  433,  402,  340,  309,  278,  216,  185,  154,   92,   61,   30,  992,  961,  930,  899,  868,  837,  806,  775,  744,  713,  682,  651,  620,  589,  558,  527,
-  496,  465,  434,  403,  372,  341,  310,  279,  248,  217,  186,  155,  124,   93,   62,   31,  993,  962,  931,  869,  838,  807,  745,  714,  683,  621,  590,  559,  497,  466,  435,  373,
-  342,  311,  249,  218,  187,  125,   94,   63,  994,  963,  870,  839,  746,  715,  622,  591,  498,  467,  374,  343,  250,  219,  126,   95,  995,  871,  747,  623,  499,  375,  251,  127,
-  900,  776,  652,  528,  404,  280,  156,  932,  901,  808,  777,  684,  653,  560,  529,  436,  405,  312,  281,  188,  157,  964,  933,  902,  840,  809,  778,  716,  685,  654,  592,  561,
-  530,  468,  437,  406,  344,  313,  282,  220,  189,  158,  996,  965,  934,  903,  872,  841,  810,  779,  748,  717,  686,  655,  624,  593,  562,  531,  500,  469,  438,  407,  376,  345,
-  314,  283,  252,  221,  190,  159,  997,  966,  935,  873,  842,  811,  749,  718,  687,  625,  594,  563,  501,  470,  439,  377,  346,  315,  253,  222,  191,  998,  967,  874,  843,  750,
-  719,  626,  595,  502,  471,  378,  347,  254,  223,  999,  875,  751,  627,  503,  379,  255,  904,  780,  656,  532,  408,  284,  936,  905,  812,  781,  688,  657,  564,  533,  440,  409,
-  316,  285,  968,  937,  906,  844,  813,  782,  720,  689,  658,  596,  565,  534,  472,  441,  410,  348,  317,  286, 1000,  969,  938,  907,  876,  845,  814,  783,  752,  721,  690,  659,
-  628,  597,  566,  535,  504,  473,  442,  411,  380,  349,  318,  287, 1001,  970,  939,  877,  846,  815,  753,  722,  691,  629,  598,  567,  505,  474,  443,  381,  350,  319, 1002,  971,
-  878,  847,  754,  723,  630,  599,  506,  475,  382,  351, 1003,  879,  755,  631,  507,  383,  908,  784,  660,  536,  412,  940,  909,  816,  785,  692,  661,  568,  537,  444,  413,  972,
-  941,  910,  848,  817,  786,  724,  693,  662,  600,  569,  538,  476,  445,  414, 1004,  973,  942,  911,  880,  849,  818,  787,  756,  725,  694,  663,  632,  601,  570,  539,  508,  477,
-  446,  415, 1005,  974,  943,  881,  850,  819,  757,  726,  695,  633,  602,  571,  509,  478,  447, 1006,  975,  882,  851,  758,  727,  634,  603,  510,  479, 1007,  883,  759,  635,  511,
-  912,  788,  664,  540,  944,  913,  820,  789,  696,  665,  572,  541,  976,  945,  914,  852,  821,  790,  728,  697,  666,  604,  573,  542, 1008,  977,  946,  915,  884,  853,  822,  791,
-  760,  729,  698,  667,  636,  605,  574,  543, 1009,  978,  947,  885,  854,  823,  761,  730,  699,  637,  606,  575, 1010,  979,  886,  855,  762,  731,  638,  607, 1011,  887,  763,  639,
-  916,  792,  668,  948,  917,  824,  793,  700,  669,  980,  949,  918,  856,  825,  794,  732,  701,  670, 1012,  981,  950,  919,  888,  857,  826,  795,  764,  733,  702,  671, 1013,  982,
-  951,  889,  858,  827,  765,  734,  703, 1014,  983,  890,  859,  766,  735, 1015,  891,  767,  920,  796,  952,  921,  828,  797,  984,  953,  922,  860,  829,  798, 1016,  985,  954,  923,
-  892,  861,  830,  799, 1017,  986,  955,  893,  862,  831, 1018,  987,  894,  863, 1019,  895,  924,  956,  925,  988,  957,  926, 1020,  989,  958,  927, 1021,  990,  959, 1022,  991, 1023,
+  0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
+  129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
+  68, 131, 37, 100,
+  225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
+  258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
+  102, 352, 8, 197,
+  71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
+  135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
+  41, 417, 199, 136,
+  262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
+  419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
+  295, 420, 106, 451,
+  481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
+  75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
+  453, 139, 44, 234,
+  484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
+  546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
+  486, 77, 204, 362,
+  608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
+  610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
+  111, 238, 48, 143,
+  80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
+  83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
+  393, 300, 269, 176, 145,
+  52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
+  270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
+  550, 519, 488, 457, 426, 395,
+  364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
+  706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
+  210, 179, 117, 86, 55, 738, 707,
+  614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
+  367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
+  645, 552, 521, 428, 397, 304,
+  273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
+  522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
+  864, 833, 802, 771, 740, 709,
+  678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
+  275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
+  710, 679, 617, 586, 555, 493,
+  462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
+  742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
+  743, 619, 495, 371, 247, 123,
+  896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
+  649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
+  898, 836, 805, 774, 712, 681,
+  650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
+  92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+  651, 620, 589, 558, 527,
+  496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
+  93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
+  559, 497, 466, 435, 373,
+  342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
+  622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
+  499, 375, 251, 127,
+  900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
+  529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
+  685, 654, 592, 561,
+  530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
+  872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
+  438, 407, 376, 345,
+  314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
+  687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
+  967, 874, 843, 750,
+  719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
+  379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
+  564, 533, 440, 409,
+  316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
+  472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
+  752, 721, 690, 659,
+  628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
+  939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
+  350, 319, 1002, 971,
+  878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
+  507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
+  537, 444, 413, 972,
+  941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
+  1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
+  570, 539, 508, 477,
+  446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
+  509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
+  1007, 883, 759, 635, 511,
+  912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
+  914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
+  884, 853, 822, 791,
+  760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
+  761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
+  1011, 887, 763, 639,
+  916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
+  794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
+  702, 671, 1013, 982,
+  951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
+  891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
+  1016, 985, 954, 923,
+  892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
+  1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
+  990, 959, 1022, 991, 1023,
 };

 /* Array indices are identical to previously-existing CONTEXT_NODE indices */

-const vp9_tree_index vp9_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
-{
+const vp9_tree_index vp9_coef_tree[ 22] = {
  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
  -ONE_TOKEN, 6,                              /* 2 = ONE */
@@ -377,7 +447,7 @@ static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {

 static void extend_model_to_full_distribution(vp9_prob p,
                                              vp9_prob *tree_probs) {
-  const int l = ((p - 1) / 2);
+  const int l = (p - 1) / 2;
  const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8;
  if (p & 1) {
    vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
@@ -436,11 +506,11 @@ const vp9_extra_bit vp9_extra_bits[12] = {

 #include "vp9/common/vp9_default_coef_probs.h"

-void vp9_default_coef_probs(VP9_COMMON *pc) {
-  vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
-  vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
-  vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
-  vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
+void vp9_default_coef_probs(VP9_COMMON *cm) {
+  vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
+  vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
+  vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
+  vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
 }

 // Neighborhood 5-tuples for various scans and blocksizes,
@@ -569,31 +639,6 @@ void vp9_init_neighbors() {
                      vp9_default_scan_32x32_neighbors);
 }

-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
-  if (scan == vp9_default_scan_4x4) {
-    return vp9_default_scan_4x4_neighbors;
-  } else if (scan == vp9_row_scan_4x4) {
-    return vp9_row_scan_4x4_neighbors;
-  } else if (scan == vp9_col_scan_4x4) {
-    return vp9_col_scan_4x4_neighbors;
-  } else if (scan == vp9_default_scan_8x8) {
-    return vp9_default_scan_8x8_neighbors;
-  } else if (scan == vp9_row_scan_8x8) {
-    return vp9_row_scan_8x8_neighbors;
-  } else if (scan == vp9_col_scan_8x8) {
-    return vp9_col_scan_8x8_neighbors;
-  } else if (scan == vp9_default_scan_16x16) {
-    return vp9_default_scan_16x16_neighbors;
-  } else if (scan == vp9_row_scan_16x16) {
-    return vp9_row_scan_16x16_neighbors;
-  } else if (scan == vp9_col_scan_16x16) {
-    return vp9_col_scan_16x16_neighbors;
-  } else {
-    assert(scan == vp9_default_scan_32x32);
-    return vp9_default_scan_32x32_neighbors;
-  }
-}
-
 void vp9_coef_tree_initialize() {
  vp9_init_neighbors();
  init_bit_trees();
@@ -622,7 +667,6 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
  int t, i, j, k, l;
  unsigned int branch_ct[UNCONSTRAINED_NODES][2];
  vp9_prob coef_probs[UNCONSTRAINED_NODES];
-  int entropy_nodes_adapt = UNCONSTRAINED_NODES;

  for (i = 0; i < BLOCK_TYPES; ++i)
    for (j = 0; j < REF_TYPES; ++j)
@@ -635,7 +679,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
                                           0);
          branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
          coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
-          for (t = 0; t < entropy_nodes_adapt; ++t)
+          for (t = 0; t < UNCONSTRAINED_NODES; ++t)
            dst_coef_probs[i][j][k][l][t] = merge_probs(
                pre_coef_probs[i][j][k][l][t], coef_probs[t],
                branch_ct[t], count_sat, update_factor);
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -95,7 +95,7 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
 #define MODULUS_PARAM               13  /* Modulus parameter */

 struct VP9Common;
-void vp9_default_coef_probs(struct VP9Common *);
+void vp9_default_coef_probs(struct VP9Common *cm);
 extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);

 extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
@@ -154,19 +154,17 @@ extern DECLARE_ALIGNED(16, int16_t,
                       vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);

 void vp9_coef_tree_initialize(void);
-void vp9_adapt_coef_probs(struct VP9Common *);
+void vp9_adapt_coef_probs(struct VP9Common *cm);

-static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd,
-                                               BLOCK_SIZE_TYPE bsize) {
-  /* Clear entropy contexts */
-  const int bw = 1 << b_width_log2(bsize);
-  const int bh = 1 << b_height_log2(bsize);
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
  int i;
  for (i = 0; i < MAX_MB_PLANE; i++) {
-    vpx_memset(xd->plane[i].above_context, 0,
-               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[i].subsampling_x);
-    vpx_memset(xd->plane[i].left_context, 0,
-               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[i].subsampling_y);
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) *
+                   num_4x4_blocks_wide_lookup[plane_bsize]);
+    vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) *
+                   num_4x4_blocks_high_lookup[plane_bsize]);
  }
 }

@@ -192,9 +190,6 @@ static INLINE int get_coef_context(const int16_t *neighbors,
          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
 }

-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
-
-
 // 128 lists of probabilities are stored for the following ONE node probs:
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
@@ -338,6 +333,63 @@ static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
  }
 }

+static int get_entropy_context(TX_SIZE tx_size,
+                               ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
+
+  switch (tx_size) {
+    case TX_4X4:
+      above_ec = a[0] != 0;
+      left_ec = l[0] != 0;
+      break;
+    case TX_8X8:
+      above_ec = !!*(uint16_t *)a;
+      left_ec  = !!*(uint16_t *)l;
+      break;
+    case TX_16X16:
+      above_ec = !!*(uint32_t *)a;
+      left_ec  = !!*(uint32_t *)l;
+      break;
+    case TX_32X32:
+      above_ec = !!*(uint64_t *)a;
+      left_ec  = !!*(uint64_t *)l;
+      break;
+    default:
+      assert(!"Invalid transform size.");
+  }
+
+  return combine_entropy_contexts(above_ec, left_ec);
+}
+
+static void get_scan_and_band(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                              PLANE_TYPE type, int block_idx,
+                              const int16_t **scan,
+                              const int16_t **scan_nb,
+                              const uint8_t **band_translate) {
+  switch (tx_size) {
+    case TX_4X4:
+      get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb);
+      *band_translate = vp9_coefband_trans_4x4;
+      break;
+    case TX_8X8:
+      get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb);
+      *band_translate = vp9_coefband_trans_8x8plus;
+      break;
+    case TX_16X16:
+      get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb);
+      *band_translate = vp9_coefband_trans_8x8plus;
+      break;
+    case TX_32X32:
+      *scan = vp9_default_scan_32x32;
+      *scan_nb = vp9_default_scan_32x32_neighbors;
+      *band_translate = vp9_coefband_trans_8x8plus;
+      break;
+    default:
+      assert(!"Invalid transform size.");
+  }
+}
+
+
 enum { VP9_COEF_UPDATE_PROB = 252 };

 #endif  // VP9_COMMON_VP9_ENTROPY_H_
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -14,8 +14,8 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_seg_common.h"

-const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES]
-                                  [VP9_INTRA_MODES - 1] = {
+const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES]
+                                  [INTRA_MODES - 1] = {
  { 144,  11,  54, 157, 195, 130,  46,  58, 108 } /* y = dc */,
  { 118,  15, 123, 148, 131, 101,  44,  93, 131 } /* y = v */,
  { 113,  12,  23, 188, 226, 142,  26,  32, 125 } /* y = h */,
@@ -23,21 +23,21 @@ const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES]
  { 113,   9,  36, 155, 111, 157,  32,  44, 161 } /* y = d135 */,
  { 116,   9,  55, 176,  76,  96,  37,  61, 149 } /* y = d117 */,
  { 115,   9,  28, 141, 161, 167,  21,  25, 193 } /* y = d153 */,
-  { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d27 */,
+  { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d207 */,
  { 116,  12,  64, 120, 140, 125,  49, 115, 121 } /* y = d63 */,
  { 102,  19,  66, 162, 182, 122,  35,  59, 128 } /* y = tm */
 };

 static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS]
-                                        [VP9_INTRA_MODES - 1] = {
+                                        [INTRA_MODES - 1] = {
  {  65,  32,  18, 144, 162, 194,  41,  51,  98 } /* block_size < 8x8 */,
  { 132,  68,  18, 165, 217, 196,  45,  40,  78 } /* block_size < 16x16 */,
  { 173,  80,  19, 176, 240, 193,  64,  35,  46 } /* block_size < 32x32 */,
  { 221, 135,  38, 194, 248, 121,  96,  85,  29 } /* block_size >= 32x32 */
 };

-static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
-                                         [VP9_INTRA_MODES - 1] = {
+static const vp9_prob default_if_uv_probs[INTRA_MODES]
+                                         [INTRA_MODES - 1] = {
  { 120,   7,  76, 176, 208, 126,  28,  54, 103 } /* y = dc */,
  {  48,  12, 154, 155, 139,  90,  34, 117, 119 } /* y = v */,
  {  67,   6,  25, 204, 243, 158,  13,  21,  96 } /* y = h */,
@@ -45,7 +45,7 @@ static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
  {  83,   5,  42, 156, 111, 152,  26,  49, 152 } /* y = d135 */,
  {  80,   5,  58, 178,  74,  83,  33,  62, 145 } /* y = d117 */,
  {  86,   5,  32, 154, 192, 168,  14,  22, 163 } /* y = d153 */,
-  {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d27 */,
+  {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d207 */,
  {  77,   7,  64, 116, 132, 122,  37, 126, 120 } /* y = d63 */,
  { 101,  21, 107, 181, 192, 103,  19,  67, 125 } /* y = tm */
 };
@@ -98,9 +98,9 @@ static const vp9_prob default_partition_probs[NUM_FRAME_TYPES]
  }
 };

-const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
-                                 [VP9_INTRA_MODES]
-                                 [VP9_INTRA_MODES - 1] = {
+const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
+                                 [INTRA_MODES]
+                                 [INTRA_MODES - 1] = {
  { /* above = dc */
    { 137,  30,  42, 148, 151, 207,  70,  52,  91 } /* left = dc */,
    {  92,  45, 102, 136, 116, 180,  74,  90, 100 } /* left = v */,
@@ -109,7 +109,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  72,  35,  36, 149,  68, 206,  68,  63, 105 } /* left = d135 */,
    {  73,  31,  28, 138,  57, 124,  55, 122, 151 } /* left = d117 */,
    {  67,  23,  21, 140, 126, 197,  40,  37, 171 } /* left = d153 */,
-    {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d27 */,
+    {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d207 */,
    {  74,  32,  27, 107,  86, 160,  63, 134, 102 } /* left = d63 */,
    {  59,  67,  44, 140, 161, 202,  78,  67, 119 } /* left = tm */
  }, { /* above = v */
@@ -120,7 +120,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  46,  41,  76, 140,  63, 184,  69, 112,  57 } /* left = d135 */,
    {  38,  32,  85, 140,  46, 112,  54, 151, 133 } /* left = d117 */,
    {  39,  27,  61, 131, 110, 175,  44,  75, 136 } /* left = d153 */,
-    {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d27 */,
+    {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d207 */,
    {  47,  35,  80, 100,  74, 143,  64, 163,  74 } /* left = d63 */,
    {  36,  61, 116, 114, 128, 162,  80, 125,  82 } /* left = tm */
  }, { /* above = h */
@@ -131,7 +131,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  58,  50,  25, 139, 115, 232,  39,  52, 118 } /* left = d135 */,
    {  50,  35,  33, 153, 104, 162,  64,  59, 131 } /* left = d117 */,
    {  44,  24,  16, 150, 177, 202,  33,  19, 156 } /* left = d153 */,
-    {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d27 */,
+    {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d207 */,
    {  53,  49,  21, 110, 116, 168,  59,  80,  76 } /* left = d63 */,
    {  38,  72,  19, 168, 203, 212,  50,  50, 107 } /* left = tm */
  }, { /* above = d45 */
@@ -142,7 +142,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  60,  32,  33, 112,  71, 220,  64,  89, 104 } /* left = d135 */,
    {  53,  26,  34, 130,  56, 149,  84, 120, 103 } /* left = d117 */,
    {  53,  21,  23, 133, 109, 210,  56,  77, 172 } /* left = d153 */,
-    {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d27 */,
+    {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d207 */,
    {  61,  29,  29,  93,  97, 165,  83, 175, 162 } /* left = d63 */,
    {  47,  47,  43, 114, 137, 181, 100,  99,  95 } /* left = tm */
  }, { /* above = d135 */
@@ -153,7 +153,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  52,  31,  22, 158,  40, 209,  58,  62,  89 } /* left = d135 */,
    {  44,  31,  29, 147,  46, 158,  56, 102, 198 } /* left = d117 */,
    {  35,  19,  12, 135,  87, 209,  41,  45, 167 } /* left = d153 */,
-    {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d27 */,
+    {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d207 */,
    {  51,  38,  25, 113,  58, 164,  70,  93,  97 } /* left = d63 */,
    {  47,  54,  34, 146, 108, 203,  72, 103, 151 } /* left = tm */
  }, { /* above = d117 */
@@ -164,7 +164,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  40,  26,  35, 154,  40, 185,  51,  97, 123 } /* left = d135 */,
    {  35,  19,  34, 179,  19,  97,  48, 129, 124 } /* left = d117 */,
    {  36,  20,  26, 136,  62, 164,  33,  77, 154 } /* left = d153 */,
-    {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d27 */,
+    {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d207 */,
    {  45,  26,  28, 129,  45, 129,  49, 147, 123 } /* left = d63 */,
    {  38,  44,  51, 136,  74, 162,  57,  97, 121 } /* left = tm */
  }, { /* above = d153 */
@@ -175,10 +175,10 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  47,  29,  17, 153,  64, 220,  59,  51, 114 } /* left = d135 */,
    {  46,  16,  24, 136,  76, 147,  41,  64, 172 } /* left = d117 */,
    {  34,  17,  11, 108, 152, 187,  13,  15, 209 } /* left = d153 */,
-    {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d27 */,
+    {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d207 */,
    {  55,  30,  18, 122,  79, 179,  44,  88, 116 } /* left = d63 */,
    {  37,  49,  25, 129, 168, 164,  41,  54, 148 } /* left = tm */
-  }, { /* above = d27 */
+  }, { /* above = d207 */
    {  82,  22,  32, 127, 143, 213,  39,  41,  70 } /* left = dc */,
    {  62,  44,  61, 123, 105, 189,  48,  57,  64 } /* left = v */,
    {  47,  25,  17, 175, 222, 220,  24,  30,  86 } /* left = h */,
@@ -186,7 +186,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  57,  39,  23, 151,  68, 216,  55,  63,  58 } /* left = d135 */,
    {  49,  30,  35, 141,  70, 168,  82,  40, 115 } /* left = d117 */,
    {  51,  25,  15, 136, 129, 202,  38,  35, 139 } /* left = d153 */,
-    {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d27 */,
+    {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d207 */,
    {  59,  39,  19, 114,  75, 180,  77, 104,  42 } /* left = d63 */,
    {  40,  61,  26, 126, 152, 206,  61,  59,  93 } /* left = tm */
  }, { /* above = d63 */
@@ -197,7 +197,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  48,  31,  27, 114,  63, 183,  82, 116,  56 } /* left = d135 */,
    {  43,  28,  37, 121,  63, 123,  61, 192, 169 } /* left = d117 */,
    {  42,  17,  24, 109,  97, 177,  56,  76, 122 } /* left = d153 */,
-    {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d27 */,
+    {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d207 */,
    {  46,  23,  32,  74,  86, 150,  67, 183,  88 } /* left = d63 */,
    {  36,  38,  48,  92, 122, 165,  88, 137,  91 } /* left = tm */
  }, { /* above = tm */
@@ -208,14 +208,14 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
    {  49,  50,  35, 144,  95, 205,  63,  78,  59 } /* left = d135 */,
    {  41,  53,  52, 148,  71, 142,  65, 128,  51 } /* left = d117 */,
    {  40,  36,  28, 143, 143, 202,  40,  55, 137 } /* left = d153 */,
-    {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d27 */,
+    {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d207 */,
    {  42,  44,  44, 104, 105, 164,  64, 130,  80 } /* left = d63 */,
    {  43,  81,  53, 140, 169, 204,  68,  84,  72 } /* left = tm */
  }
 };

 static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
-                                              [VP9_INTER_MODES - 1] = {
+                                              [INTER_MODES - 1] = {
  {2,       173,   34},  // 0 = both zero mv
  {7,       145,   85},  // 1 = one zero mv + one a predicted mv
  {7,       166,   63},  // 2 = two predicted mvs
@@ -226,7 +226,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
 };

 /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
+const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
  -DC_PRED, 2,                      /* 0 = DC_NODE */
  -TM_PRED, 4,                      /* 1 = TM_NODE */
  -V_PRED, 6,                       /* 2 = V_NODE */
@@ -235,7 +235,7 @@ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
  -D135_PRED, -D117_PRED,           /* 5 = D135_NODE */
  -D45_PRED, 14,                    /* 6 = D45_NODE */
  -D63_PRED, 16,                    /* 7 = D63_NODE */
-  -D153_PRED, -D27_PRED             /* 8 = D153_NODE */
+  -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };

 const vp9_tree_index vp9_inter_mode_tree[6] = {
@@ -250,8 +250,8 @@ const vp9_tree_index vp9_partition_tree[6] = {
  -PARTITION_VERT, -PARTITION_SPLIT
 };

-struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
-struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];
+struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
+struct vp9_token vp9_inter_mode_encodings[INTER_MODES];

 struct vp9_token vp9_partition_encodings[PARTITION_TYPES];

@@ -317,8 +317,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
  192, 128, 64
 };

-static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1]
-                                                  [VP9_SWITCHABLE_FILTERS-1] = {
+static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
+                                                  [SWITCHABLE_FILTERS-1] = {
  { 235, 162, },
  { 36, 255, },
  { 34, 3, },
@@ -338,11 +338,11 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
  vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
 }

-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
+const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = {
  -EIGHTTAP, 2,
  -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
-struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];

 void vp9_entropy_mode_init() {
  vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
@@ -400,17 +400,17 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
                                             counts->single_ref[i][j]);

  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
-    update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree,
+    update_mode_probs(INTER_MODES, vp9_inter_mode_tree,
                      counts->inter_mode[i], pre_fc->inter_mode_probs[i],
                      fc->inter_mode_probs[i], NEARESTMV);

  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
-    update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
+    update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
                      counts->y_mode[i], pre_fc->y_mode_prob[i],
                      fc->y_mode_prob[i], 0);

-  for (i = 0; i < VP9_INTRA_MODES; ++i)
-    update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
+  for (i = 0; i < INTRA_MODES; ++i)
+    update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
                      counts->uv_mode[i], pre_fc->uv_mode_prob[i],
                      fc->uv_mode_prob[i], 0);

@@ -421,8 +421,8 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
                      fc->partition_prob[INTER_FRAME][i], 0);

  if (cm->mcomp_filter_type == SWITCHABLE) {
-    for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
-      update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
+    for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+      update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
                        counts->switchable_interp[i],
                        pre_fc->switchable_interp_prob[i],
                        fc->switchable_interp_prob[i], 0);
@@ -440,14 +440,12 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
        fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
                                             branch_ct_8x8p[j]);

-      tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
-                                       branch_ct_16x16p);
+      tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
      for (j = 0; j < TX_SIZES - 2; ++j)
        fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
                                               branch_ct_16x16p[j]);

-      tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
-                                       branch_ct_32x32p);
+      tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
      for (j = 0; j < TX_SIZES - 1; ++j)
        fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
                                               branch_ct_32x32p[j]);
@@ -472,14 +470,14 @@ static void set_default_lf_deltas(struct loopfilter *lf) {
  lf->mode_deltas[1] = 0;
 }

-void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
+void vp9_setup_past_independence(VP9_COMMON *cm) {
  // Reset the segment feature data to the default stats:
  // Features disabled, 0, with delta coding (Default state).
-  struct loopfilter *const lf = &xd->lf;
+  struct loopfilter *const lf = &cm->lf;

  int i;
-  vp9_clearall_segfeatures(&xd->seg);
-  xd->seg.abs_delta = SEGMENT_DELTADATA;
+  vp9_clearall_segfeatures(&cm->seg);
+  cm->seg.abs_delta = SEGMENT_DELTADATA;
  if (cm->last_frame_seg_map)
    vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));

@@ -512,10 +510,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));

  vp9_update_mode_info_border(cm, cm->mip);
-  vp9_update_mode_info_in_image(cm, cm->mi);
-
  vp9_update_mode_info_border(cm, cm->prev_mip);
-  vp9_update_mode_info_in_image(cm, cm->prev_mi);

  vp9_zero(cm->ref_frame_sign_bias);

--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -14,10 +14,9 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_treecoder.h"

-#define SUBMVREF_COUNT 5
 #define TX_SIZE_CONTEXTS 2
-#define VP9_MODE_UPDATE_PROB  252
-#define VP9_SWITCHABLE_FILTERS 3   // number of switchable filters
+#define MODE_UPDATE_PROB  252
+#define SWITCHABLE_FILTERS 3   // number of switchable filters

 // #define MODE_STATS

@@ -35,32 +34,32 @@ struct tx_counts {
  unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
 };

-extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
-extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES]
-                                        [VP9_INTRA_MODES - 1];
+extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
+                                        [INTRA_MODES - 1];

 extern const vp9_tree_index vp9_intra_mode_tree[];
 extern const vp9_tree_index vp9_inter_mode_tree[];

-extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
-extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];
+extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
+extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];

 // probability models for partition information
 extern const vp9_tree_index vp9_partition_tree[];
 extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];

 extern const vp9_tree_index vp9_switchable_interp_tree
-                 [2 * (VP9_SWITCHABLE_FILTERS - 1)];
+                 [2 * (SWITCHABLE_FILTERS - 1)];

-extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];

 void vp9_entropy_mode_init();

-void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);
+void vp9_setup_past_independence(struct VP9Common *cm);

-void vp9_init_mbmode_probs(struct VP9Common *x);
+void vp9_init_mbmode_probs(struct VP9Common *cm);

-void vp9_adapt_mode_probs(struct VP9Common *);
+void vp9_adapt_mode_probs(struct VP9Common *cm);

 void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
                                      unsigned int (*ct_32x32p)[2]);
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -39,12 +39,12 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
 };
 struct vp9_token vp9_mv_class_encodings[MV_CLASSES];

-const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
+const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2] = {
  -0, -1,
 };
 struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];

-const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
+const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2] = {
  -0, 2,
  -1, 4,
  -2, -3
@@ -53,8 +53,8 @@ struct vp9_token vp9_mv_fp_encodings[4];

 static const nmv_context default_nmv_context = {
  {32, 64, 96},
-  {
-    { /* vert component */
+  { // NOLINT
+    { /* vert component */ // NOLINT
      128,                                                  /* sign */
      {224, 144, 192, 168, 192, 176, 192, 198, 198, 245},   /* class */
      {216},                                                /* class0 */
@@ -64,7 +64,7 @@ static const nmv_context default_nmv_context = {
      160,                                                  /* class0_hp bit */
      128,                                                  /* hp */
    },
-    { /* hor component */
+    { /* hor component */ // NOLINT
      128,                                                  /* sign */
      {216, 128, 176, 160, 176, 176, 192, 198, 198, 208},   /* class */
      {208},                                                /* class0 */
@@ -79,20 +79,59 @@ static const nmv_context default_nmv_context = {

 #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)

+static const uint8_t log_in_base_2[] = {
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
+};
+
 MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
  MV_CLASS_TYPE c = MV_CLASS_0;
-  if      (z < CLASS0_SIZE * 8)    c = MV_CLASS_0;
-  else if (z < CLASS0_SIZE * 16)   c = MV_CLASS_1;
-  else if (z < CLASS0_SIZE * 32)   c = MV_CLASS_2;
-  else if (z < CLASS0_SIZE * 64)   c = MV_CLASS_3;
-  else if (z < CLASS0_SIZE * 128)  c = MV_CLASS_4;
-  else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;
-  else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;
-  else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
-  else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;
-  else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;
-  else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10;
-  else assert(0);
+  if (z >= CLASS0_SIZE * 4096)
+    c = MV_CLASS_10;
+  else
+    c = log_in_base_2[z >> 3];
+
  if (offset)
    *offset = z - mv_class_base(c);
  return c;
@@ -110,9 +149,7 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
                             int incr, int usehp) {
  int s, z, c, o, d, e, f;
-  if (!incr)
-    return;
-  assert (v != 0);            /* should not be zero */
+  assert(v != 0);            /* should not be zero */
  s = v < 0;
  comp_counts->sign[s] += incr;
  z = (s ? -v : v) - 1;       /* magnitude - 1 */
@@ -123,68 +160,44 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
  d = (o >> 3);               /* int mv data */
  f = (o >> 1) & 3;           /* fractional pel mv data */
  e = (o & 1);                /* high precision mv data */
+
  if (c == MV_CLASS_0) {
    comp_counts->class0[d] += incr;
+    comp_counts->class0_fp[d][f] += incr;
+    comp_counts->class0_hp[e] += usehp * incr;
  } else {
    int i;
    int b = c + CLASS0_BITS - 1;  // number of bits
    for (i = 0; i < b; ++i)
      comp_counts->bits[i][((d >> i) & 1)] += incr;
-  }
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    comp_counts->class0_fp[d][f] += incr;
-  } else {
    comp_counts->fp[f] += incr;
-  }
-
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      comp_counts->class0_hp[e] += incr;
-    } else {
-      comp_counts->hp[e] += incr;
-    }
+    comp_counts->hp[e] += usehp * incr;
  }
 }

-static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
-  int v;
-  vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
-  for (v = 1; v <= MV_MAX; v++) {
-    inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
-    inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
-  }
-}

 void vp9_inc_mv(const MV *mv,  nmv_context_counts *counts) {
  const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
  ++counts->joints[j];

-  if (mv_joint_vertical(j))
-    ++counts->comps[0].mvcount[MV_MAX + mv->row];
+  if (mv_joint_vertical(j)) {
+    inc_mv_component(mv->row, &counts->comps[0], 1, 1);
+  }

-  if (mv_joint_horizontal(j))
-    ++counts->comps[1].mvcount[MV_MAX + mv->col];
+  if (mv_joint_horizontal(j)) {
+    inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+  }
 }

 static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
  return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
 }

-void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
-  counts_to_context(&nmv_count->comps[0], usehp);
-  counts_to_context(&nmv_count->comps[1], usehp);
-}
-
 static unsigned int adapt_probs(unsigned int i,
                                vp9_tree tree,
                                vp9_prob this_probs[],
                                const vp9_prob last_probs[],
                                const unsigned int num_events[]) {
-
-
  const unsigned int left = tree[i] <= 0
          ? num_events[-tree[i]]
          : adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
@@ -207,8 +220,6 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
  nmv_context *pre_ctx = &pre_fc->nmvc;
  nmv_context_counts *cts = &cm->counts.mv;

-  vp9_counts_process(cts, allow_hp);
-
  adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);

  for (i = 0; i < 2; ++i) {
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -13,7 +13,7 @@
 #define VP9_COMMON_VP9_ENTROPYMV_H_

 #include "vp9/common/vp9_treecoder.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"

 struct VP9Common;
@@ -24,7 +24,7 @@ void vp9_init_mv_probs(struct VP9Common *cm);
 void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
 int vp9_use_mv_hp(const MV *ref);

-#define VP9_NMV_UPDATE_PROB  252
+#define NMV_UPDATE_PROB  252

 /* Symbols for coding which components are zero jointly */
 #define MV_JOINTS     4
@@ -73,6 +73,10 @@ extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 #define MV_MAX         ((1 << MV_MAX_BITS) - 1)
 #define MV_VALS        ((MV_MAX << 1) + 1)

+#define MV_IN_USE_BITS 14
+#define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
+#define MV_LOW   (-(1 << MV_IN_USE_BITS))
+
 extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
 extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];

@@ -126,6 +130,4 @@ typedef struct {

 void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);

-void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
-
 #endif  // VP9_COMMON_VP9_ENTROPYMV_H_
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -13,15 +13,16 @@

 #include "./vpx_config.h"

-#define LOG2_MI_SIZE 3
-#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE)  // 64 = 2^6
+#define MI_SIZE_LOG2 3
+#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2)  // 64 = 2^6

-#define MI_SIZE (1 << LOG2_MI_SIZE)  // pixels per mi-unit
-#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE)  // mi-units per max block
+#define MI_SIZE (1 << MI_SIZE_LOG2)  // pixels per mi-unit
+#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2)  // mi-units per max block

 #define MI_MASK (MI_BLOCK_SIZE - 1)

-typedef enum BLOCK_SIZE_TYPE {
+
+typedef enum BLOCK_SIZE {
  BLOCK_4X4,
  BLOCK_4X8,
  BLOCK_8X4,
@@ -35,15 +36,17 @@ typedef enum BLOCK_SIZE_TYPE {
  BLOCK_32X64,
  BLOCK_64X32,
  BLOCK_64X64,
-  BLOCK_SIZE_TYPES
-} BLOCK_SIZE_TYPE;
+  BLOCK_SIZES,
+  BLOCK_INVALID = BLOCK_SIZES
+} BLOCK_SIZE;

 typedef enum PARTITION_TYPE {
  PARTITION_NONE,
  PARTITION_HORZ,
  PARTITION_VERT,
  PARTITION_SPLIT,
-  PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES
+  PARTITION_TYPES,
+  PARTITION_INVALID = PARTITION_TYPES
 } PARTITION_TYPE;

 #define PARTITION_PLOFFSET   4  // number of probability models per block size
--- a/vp9/common/vp9_extend.c
+++ b/vp9/common/vp9_extend.c
@@ -57,15 +57,23 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,

 void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                               YV12_BUFFER_CONFIG *dst) {
-  const int et_y = dst->border;
-  const int el_y = dst->border;
-  const int eb_y = dst->border + dst->y_height - src->y_height;
-  const int er_y = dst->border + dst->y_width - src->y_width;
-
-  const int et_uv = dst->border >> (dst->uv_height != dst->y_height);
-  const int el_uv = dst->border >> (dst->uv_width != dst->y_width);
-  const int eb_uv = et_uv + dst->uv_height - src->uv_height;
-  const int er_uv = el_uv + dst->uv_width - src->uv_width;
+  // Extend src frame in buffer
+  // Altref filtering assumes 16 pixel extension
+  const int et_y = 16;
+  const int el_y = 16;
+  // Motion estimation may use src block variance with the block size up
+  // to 64x64, so the right and bottom need to be extended to 64 mulitple
+  // or up to 16, whichever is greater.
+  const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
+                       16);
+  const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height,
+                       16);
+  const int uv_width_subsampling = (src->uv_width != src->y_width);
+  const int uv_height_subsampling = (src->uv_height != src->y_height);
+  const int et_uv = et_y >> uv_height_subsampling;
+  const int el_uv = el_y >> uv_width_subsampling;
+  const int eb_uv = eb_y >> uv_height_subsampling;
+  const int er_uv = er_y >> uv_width_subsampling;

 #if CONFIG_ALPHA
  const int et_a = dst->border >> (dst->alpha_height != dst->y_height);
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -8,14 +8,12 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
-#include <stdlib.h>
-#include "vp9/common/vp9_filter.h"
 #include "vpx_ports/mem.h"
-#include "vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"

-DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
+#include "vp9/common/vp9_filter.h"
+
+DECLARE_ALIGNED(256, const int16_t,
+                vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
  { 0, 0, 0, 128,   0, 0, 0, 0 },
  { 0, 0, 0, 120,   8, 0, 0, 0 },
  { 0, 0, 0, 112,  16, 0, 0, 0 },
@@ -34,8 +32,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
  { 0, 0, 0,   8, 120, 0, 0, 0 }
 };

-DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
-  /* Lagrangian interpolation filter */
+// Lagrangian interpolation filter
+DECLARE_ALIGNED(256, const int16_t,
+                vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
  { 0,   0,   0, 128,   0,   0,   0,  0},
  { 0,   1,  -5, 126,   8,  -3,   1,  0},
  { -1,   3, -10, 122,  18,  -6,   2,  0},
@@ -54,9 +53,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
  { 0,   1,  -3,   8, 126,  -5,   1,  0}
 };

-DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
-    = {
-  /* dct based filter */
+// DCT based filter
+DECLARE_ALIGNED(256, const int16_t,
+                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
  {0,   0,   0, 128,   0,   0,   0, 0},
  {-1,   3,  -7, 127,   8,  -3,   1, 0},
  {-2,   5, -13, 125,  17,  -6,   3, -1},
@@ -75,9 +74,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
  {0,   1,  -3,   8, 127,  -7,   3, -1}
 };

+// freqmultiplier = 0.5
 DECLARE_ALIGNED(256, const int16_t,
-                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {
-  /* freqmultiplier = 0.5 */
+                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
  { 0,  0,  0, 128,  0,  0,  0,  0},
  {-3, -1, 32,  64, 38,  1, -3,  0},
  {-2, -2, 29,  63, 41,  2, -3,  0},
--- a/Show More
+++ b/Show More